def analyse(factory):
    """Fit a predictor on the corpus, then visualise and print its word analysis.

    The corpus is loaded, a predictor built by ``factory`` is fitted on it,
    the texts are encoded against the predictor's vocabulary, and the
    result of ``predictor.classifier.analyse`` is passed to ``visualise``
    and printed class by class.

    Args:
        factory: zero-argument callable returning an object that provides
            ``fit(data)``, a ``vocabulary`` attribute and a
            ``classifier.analyse(X, vocabulary)`` method.
    """
    # The corpus is traversed three times below (fit, transform, ratings);
    # materialise it so a one-shot iterator is not exhausted by the first pass.
    data = list(iter_corpus())
    predictor = factory()
    predictor.fit(data)

    vocabulary = predictor.vocabulary
    texts = ExtractText().transform(data)
    encoder = EncodingText(vocabulary)
    encoder.fit(texts)
    X = encoder.transform(texts)
    y = [a.rating for a in data]
    (v1, v2, score, words) = predictor.classifier.analyse(X, vocabulary)

    labels = ["neg", "pos", "mix", "other"]
    n_classes = len(set(y))

    # counter[c][w] counts how often encoded word w occurs in phrases whose
    # rating is c + 1.
    # NOTE(review): assumes ratings are the integers 1..n_classes - confirm.
    counter = [[0.0] * len(vocabulary) for _ in range(n_classes)]
    for i in range(np.size(X, 0)):
        row = counter[y[i] - 1]
        for w in X[i]:
            row[w] += 1.0
    counter = np.array(counter)

    # Exactly one class index per vocabulary entry.  argmax keeps the first
    # class on ties; appending every tied class would make ``cl`` longer than
    # the vocabulary and shift all later entries out of alignment.
    cl = np.argmax(counter, axis=0).tolist() if counter.size else []

    visualise(v1, vocabulary, cl)
    visualise(v2, vocabulary, cl)
    for i, class_scores in enumerate(score):
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print("sentiment - " + str(labels[i]))
        for j, value in enumerate(class_scores):
            print(words[i][j] + " : " + str(value))
Example #2
0
def analyse(factory):
    """Fit a predictor on the corpus, then visualise and print its word analysis.

    Same flow as a plain corpus analysis, except that the text extractor is
    built as ``ExtractText(True)``.

    Args:
        factory: zero-argument callable returning an object that provides
            ``fit(data)``, a ``vocabulary`` attribute and a
            ``classifier.analyse(X, vocabulary)`` method.
    """
    # Keep the corpus in a list: it is consumed by fit(), by the text
    # extraction and by the rating extraction, so a one-shot iterator would
    # be empty after the first of those.
    data = list(iter_corpus())
    predictor = factory()
    predictor.fit(data)

    vocabulary = predictor.vocabulary
    vocab_size = len(vocabulary)

    X1 = ExtractText(True).transform(data)
    encoder = EncodingText(vocabulary)
    encoder.fit(X1)
    X = encoder.transform(X1)
    y = [a.rating for a in data]
    (v1, v2, score, words) = predictor.classifier.analyse(X, vocabulary)

    labels = ['neg', 'pos', 'mix', 'other']

    # One row of per-word occurrence counts for each distinct rating.
    # NOTE(review): indexing with ``label - 1`` presumes ratings start at 1
    # and are consecutive - confirm against the corpus.
    counter = [[0.0] * vocab_size for _ in range(len(set(y)))]
    for i in range(np.size(X, 0)):
        label = y[i]
        for w in X[i]:
            counter[label - 1][w] += 1.0
    counter = np.array(counter)

    # Dominant class per word.  The list must stay aligned with the
    # vocabulary, so a tie yields a single entry (the first tied class, which
    # is what argmax returns) rather than one entry per tied class.
    if counter.size:
        cl = np.argmax(counter, axis=0).tolist()
    else:
        cl = []

    visualise(v1, vocabulary, cl)
    visualise(v2, vocabulary, cl)
    for i, class_scores in enumerate(score):
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print('sentiment - ' + str(labels[i]))
        for j, value in enumerate(class_scores):
            print(words[i][j] + ' : ' + str(value))
Example #3
0
                    value = float(value)
                except ValueError:
                    pass
        new[key] = value
    return new


if __name__ == "__main__":
    # Script entry point: load a JSON config, fit a PhraseSentimentPredictor
    # on the training corpus, and report its score on the test corpus.
    import argparse
    import json
    import csv
    import sys
    # Imported here so the timing below works even if the module header
    # does not import ``time``; a repeated import is harmless.
    import time

    from corpus import iter_corpus, iter_test_corpus
    from predictor import PhraseSentimentPredictor

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("filename")
    args = parser.parse_args()

    # Read the predictor settings; the context manager closes the file
    # instead of leaving the handle open until garbage collection.
    with open(args.filename) as config_file:
        config = json.load(config_file)

    start = time.time()
    # The JSON object's keys are passed as keyword arguments to the predictor.
    predictor = PhraseSentimentPredictor(**config)
    predictor.fit(list(iter_corpus()))
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("fitting takes " + str(time.time() - start))

    test = list(iter_test_corpus())
    score = predictor.score(test, 'test')
    print("test score {}%".format(score * 100))
    print('programme finished!')
Example #4
0
"""
Created on Mon Sep  7 15:36:55 2015

@author: VAIO
"""
from collections import defaultdict
from sklearn.pipeline import make_pipeline, make_union
from corpus import iter_corpus, iter_test_corpus
from transformations import (ExtractText, ExtractAuthor,ExtractDate,EncodingText)
import csv
from settings import DATA_PATH

def target(phrases):
    """Return the ``rating`` of every datapoint in *phrases*, in input order."""
    ratings = []
    for datapoint in phrases:
        ratings.append(datapoint.rating)
    return ratings
    
# Load the whole training corpus into memory.
phrases = list(iter_corpus())

# Read the vocabulary file: one term per CSV row, taken from the first column.
vocabulary = []
# newline='' leaves newline handling to the csv module, as its documentation
# requires for files handed to csv.reader.
with open(DATA_PATH + '/vocabulary', encoding='utf-8', newline='') as f:
    rd = csv.reader(f)
    for line in rd:
        # csv.reader yields an empty list for a blank line; skip it rather
        # than fail with IndexError on line[0].
        if line:
            vocabulary.append(line[0])

# Text extraction followed by encoding against the loaded vocabulary.
pipeline1 = [ExtractText(), EncodingText(vocabulary)]
pipeline = make_pipeline(*pipeline1)


# Ratings of the corpus phrases, in corpus order.
y = target(phrases)

Example #5
0
        self.last = new


if __name__ == "__main__":
    import argparse
    import json
    from evaluation import analyse
    from predictor import PhraseSentimentPredictor

    # get vocabulary
    from corpus import iter_corpus
    import csv,os
    from transformations import ExtractText

    # Build the vocabulary file only when it does not exist yet.
    if not os.path.exists('./data/vocabulary'):
        X = ExtractText().transform(list(iter_corpus()))
        # Distinct whitespace-separated tokens over all extracted texts, sorted.
        vocabulary = sorted({w for datap in X for w in datap.split()})
        with open('./data/vocabulary', 'wb') as f:
            # One single-column row per vocabulary entry.
            csv.writer(f).writerows([voc] for voc in vocabulary)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("filename")
Example #6
0
        self.last = new


if __name__ == "__main__":
    # Script entry point: make sure the vocabulary file exists, then start
    # building the command-line parser.
    import argparse
    import json
    from evaluation import analyse
    from predictor import PhraseSentimentPredictor

    # Imports needed only for building the vocabulary file.
    from corpus import iter_corpus
    import csv, os
    from transformations import ExtractText

    # Build the vocabulary file only when it does not exist yet.
    if not os.path.exists('./data/vocabulary'):
        datapoints = list(iter_corpus())
        vocabulary = set()
        et = ExtractText()
        X = et.transform(datapoints)
        # Collect every whitespace-separated token, lower-cased, so that
        # case variants of a word collapse into one entry.
        for datap in X:
            for w in datap.split():
                vocabulary.add(w.lower())
        # Sort so the file has a stable order.
        vocabulary = list(vocabulary)
        vocabulary.sort()
        # NOTE(review): binary mode suits the Python 2 csv writer; under
        # Python 3 csv.writer needs a text-mode file - confirm the target version.
        with open('./data/vocabulary', 'wb') as f:
            wr = csv.writer(f)
            # One single-column row per vocabulary entry.
            for voc in vocabulary:
                wr.writerow([voc])

    # The script takes one positional argument named "filename".
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("filename")