def analyse(factory):
    # Fit a predictor on the corpus, then re-encode the corpus with the
    # predictor's vocabulary so the classifier internals can be inspected.
    data = iter_corpus()
    predictor = factory()
    predictor.fit(data)
    p1 = ExtractText()
    X1 = p1.transform(data)
    p2 = EncodingText(predictor.vocabulary)
    p2.fit(X1)
    X = p2.transform(X1)
    y = [a.rating for a in data]
    (v1, v2, score, words) = predictor.classifier.analyse(X, predictor.vocabulary)

    labels = ["neg", "pos", "mix", "other"]
    counter = []

    for i in range(len(set(y))):
        counter.append([0.0] * len(predictor.vocabulary))

    for i in range(np.size(X, 0)):
        x = X[i]
        label = y[i]
        for w in x:
            counter[label - 1][w] += 1.0

    counter = np.array(counter)
    # Assign each vocabulary word to the class in which it occurs most often;
    # breaking on the first maximum keeps exactly one class label per word.
    cl = []
    for i in range(len(predictor.vocabulary)):
        cl_max = max(counter[:, i])
        for j in range(len(set(y))):
            if counter[j, i] == cl_max:
                cl.append(j)
                break

    visualise(v1, predictor.vocabulary, cl)
    visualise(v2, predictor.vocabulary, cl)
    for i in range(len(score)):
        print "sentiment - " + str(labels[i])
        for j in range(len(score[i])):
            print words[i][j] + " : " + str(score[i][j])
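A minimal usage sketch, assuming the predictor module from the later examples is available: analyse() expects a zero-argument factory returning an unfitted predictor that exposes .vocabulary and a classifier with an .analyse() method, as the RNN-based variant further below does.

from predictor import PhraseSentimentPredictor

def rnn_factory():
    # Hypothetical factory; any callable returning a fresh predictor works.
    return PhraseSentimentPredictor(classifier="rnn")

analyse(rnn_factory)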
Example 2
def analyse(factory):
    # Same as the previous example, except ExtractText(True) is used here,
    # presumably lowercasing the extracted text (see the `lowercase` argument
    # of the constructors below).
    data = iter_corpus()
    predictor = factory()
    predictor.fit(data)
    p1 = ExtractText(True)
    X1 = p1.transform(data)
    p2 = EncodingText(predictor.vocabulary)
    p2.fit(X1)
    X = p2.transform(X1)
    y = [a.rating for a in data]
    (v1, v2, score, words) = predictor.classifier.analyse(X, predictor.vocabulary)

    labels = ['neg', 'pos', 'mix', 'other']
    counter = []

    for i in range(len(set(y))):
        counter.append([0.0] * len(predictor.vocabulary))

    for i in range(np.size(X, 0)):
        x = X[i]
        label = y[i]
        for w in x:
            counter[label - 1][w] += 1.0

    counter = np.array(counter)
    cl = []
    for i in range(len(predictor.vocabulary)):
        cl_max = max(counter[:, i])
        for j in range(len(set(y))):
            if counter[j, i] == cl_max:
                cl.append(j)
                break

    visualise(v1, predictor.vocabulary, cl)
    visualise(v2, predictor.vocabulary, cl)
    for i in range(len(score)):
        print 'sentiment - ' + str(labels[i])
        for j in range(len(score[i])):
            print words[i][j] + ' : ' + str(score[i][j])
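The per-word class assignment in both variants is just a column-wise argmax over the count matrix; a small vectorised sketch of the same idea (the toy numbers are illustrative only):

import numpy as np

# Toy count matrix: 2 classes x 3 vocabulary words.
counter = np.array([[3., 0., 1.],
                    [1., 2., 1.]])
cl = np.argmax(counter, axis=0).tolist()  # ties go to the lowest class index
# cl == [0, 1, 0]

Example 3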
    def __init__(self, classifier="sgd", classifier_args=None, lowercase=True,
                 text_replacements=None, map_to_synsets=False, binary=False,
                 min_df=0, ngram=1, stopwords=None, limit_train=None,
                 map_to_lex=False, duplicates=False):
        """
        Parameter description:
            - `classifier`: The type of classifier used as main classifier,
              valid values are "sgd", "knn", "svc", "randomforest".
            - `classifier_args`: A dict to be passed as arguments to the main
              classifier.
            - `lowercase`: whether or not all words are lowercased at the start of
              the pipeline.
            - `text_replacements`: A list of tuples `(from, to)` specifying
              string replacements to be made at the start of the pipeline (after
              lowercasing).
            - `map_to_synsets`: Whether or not to use the Wordnet synsets
              feature set.
            - `binary`: Whether or not to count words in the bag-of-words
              representation as 0 or 1.
            - `min_df`: Minimum frequency a word needs to have to be included
              in the bag-of-words representation.
            - `ngram`: The maximum size of ngrams to be considered in the
              bag-of-words representation.
            - `stopwords`: A list of words to filter out of the bag-of-words
              representation. Can also be the string "english", in which case
              a default list of English stopwords will be used.
            - `limit_train`: The maximum number of training samples to give to
              the main classifier. This can be useful for some slow main
              classifiers (e.g. svc) that converge to an optimum with fewer
              samples.
            - `map_to_lex`: Whether or not to use the Harvard Inquirer lexicon
              features.
            - `duplicates`: Whether or not to check for identical phrases between
              train and prediction.
        """
        self.limit_train = limit_train
        self.duplicates = duplicates

        # Build pre-processing common to every extraction
        pipeline = [ExtractText(lowercase)]
        if text_replacements:
            pipeline.append(ReplaceText(text_replacements))

        # Build feature extraction schemes
        ext = [build_text_extraction(binary=binary, min_df=min_df,
                                     ngram=ngram, stopwords=stopwords)]
        if map_to_synsets:
            ext.append(build_synset_extraction(binary=binary, min_df=min_df,
                                               ngram=ngram))
        if map_to_lex:
            ext.append(build_lex_extraction(binary=binary, min_df=min_df,
                                            ngram=ngram))
        ext = make_union(*ext)
        pipeline.append(ext)

        # Build classifier and put everything together
        if classifier_args is None:
            classifier_args = {}
        classifier = _valid_classifiers[classifier](**classifier_args)
        self.pipeline = make_pipeline(*pipeline)
        self.classifier = classifier
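A hedged construction sketch, assuming this constructor belongs to the PhraseSentimentPredictor class imported in the later examples; the argument values are illustrative, not recommendations:

clf = PhraseSentimentPredictor(
    classifier="svc",
    classifier_args={"C": 1.0},   # forwarded verbatim to the main classifier
    lowercase=True,
    ngram=2,                      # use unigrams and bigrams
    stopwords="english",
    map_to_synsets=True,          # add the WordNet synset feature set
    limit_train=10000,
)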
Example 4
    def __init__(self, classifier="rnn", classifier_args=None, lowercase=False,
                 text_replacements=None, map_to_synsets=False, binary=False,
                 min_df=0, ngram=1, stopwords=None, limit_train=None,
                 map_to_lex=False, duplicates=False):
        """
        Parameter description:
            - `classifier`: The type of classifier used as main classifier,
              valid values are "sgd", "knn", "svc", "randomforest" and "rnn".
            - `classifier_args`: A dict to be passed as arguments to the main
              classifier.
            - `lowercase`: whether or not all words are lowercased at the start of
              the pipeline.
            - `text_replacements`: A list of tuples `(from, to)` specifying
              string replacements to be made at the start of the pipeline (after
              lowercasing).
            - `map_to_synsets`: Whether or not to use the Wordnet synsets
              feature set.
            - `binary`: Whether or not to count words in the bag-of-words
              representation as 0 or 1.
            - `min_df`: Minimum frequency a word needs to have to be included
              in the bag-of-words representation.
            - `ngram`: The maximum size of ngrams to be considered in the
              bag-of-words representation.
            - `stopwords`: A list of words to filter out of the bag-of-words
              representation. Can also be the string "english", in which case
              a default list of English stopwords will be used.
            - `limit_train`: The maximum number of training samples to give to
              the main classifier. This can be useful for some slow main
              classifiers (e.g. svc) that converge to an optimum with fewer
              samples.
            - `map_to_lex`: Whether or not to use the Harvard Inquirer lexicon
              features.
            - `duplicates`: Whether or not to check for identical phrases between
              train and prediction.
        """
        self.limit_train = limit_train
        self.duplicates = duplicates

        # Load the fixed vocabulary written by the vocabulary-building
        # __main__ blocks below.
        self.vocabulary = []
        import csv
        with open('./data/vocabulary', 'rb') as f:
            rd = csv.reader(f)
            for line in rd:
                self.vocabulary.append(line[0])

        # Build pre-processing common to every extraction
        pipeline1 = [ExtractText()]
        pipeline1.append(EncodingText(self.vocabulary))
        pipeline = make_pipeline(*pipeline1)

        # Build classifier and put everything together
        if classifier_args is None:
            classifier_args = {'lambdaL': 0.0001, 'd': 50, 'cat': 4,
                               'lambdaCat': 1e-07, 'alpha': 0.2,
                               'lambdaW': 1e-05, 'iter': 70}
        if 'd' in classifier_args:
            d = classifier_args['d']
        else:
            d = 50
        # Random word-embedding matrix of shape (d, vocabulary size),
        # uniformly initialised in [-0.05, 0.05).
        words_vectors = np.random.rand(d, len(self.vocabulary)) * 2 * 0.05 - 0.05
        classifier = _valid_classifiers[classifier](vocab=len(self.vocabulary),
                                                    words_vectors=words_vectors,
                                                    **classifier_args)

        #classifier=rnn.RNN(d=50,cat=4,vocab=len(self.vocabulary),alpha=0.2,words_vectors=words_vectors,lambdaW=10**(-5),lambdaCat=10**(-7),lambdaL=10**(-4))

        self.pipeline = pipeline
        self.classifier = classifier
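A similar sketch for this RNN-backed variant, assuming _valid_classifiers maps "rnn" to the recursive network used here; every overridden hyperparameter below is illustrative:

clf = PhraseSentimentPredictor(
    classifier="rnn",
    classifier_args={'d': 100, 'cat': 4, 'alpha': 0.2, 'iter': 70,
                     'lambdaW': 1e-05, 'lambdaCat': 1e-07, 'lambdaL': 0.0001},
)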
Example 5
if __name__ == "__main__":
    import argparse
    import json
    from evaluation import analyse
    from predictor import PhraseSentimentPredictor

    # get vocabulary
    from corpus import iter_corpus
    import csv, os
    from transformations import ExtractText

    if not os.path.exists('./data/vocabulary'):
        # Build the vocabulary once from the raw corpus and cache it on disk.
        datapoints = list(iter_corpus())
        vocabulary = set()
        et = ExtractText()
        X = et.transform(datapoints)
        for datap in X:
            for w in datap.split():
                vocabulary.add(w)
        vocabulary = list(vocabulary)
        vocabulary.sort()
        with open('./data/vocabulary', 'wb') as f:
            wr = csv.writer(f)
            for voc in vocabulary:
                wr.writerow([voc])

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("filename")
    config = parser.parse_args()
    config = json.load(open(config.filename))
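The script stops after loading the JSON config. A hedged continuation sketch, assuming the config keys mirror the constructor parameters documented above and that analyse() takes a zero-argument factory:

def factory():
    # Assumes config is a dict of PhraseSentimentPredictor keyword arguments.
    return PhraseSentimentPredictor(**config)

analyse(factory)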
Example 6
if __name__ == "__main__":
    import argparse
    import json
    from evaluation import analyse
    from predictor import PhraseSentimentPredictor

    # get vocabulary
    from corpus import iter_corpus
    import csv, os
    from transformations import ExtractText

    if not os.path.exists('./data/vocabulary'):
        datapoints = list(iter_corpus())
        vocabulary = set()
        et = ExtractText()
        X = et.transform(datapoints)
        for datap in X:
            for w in datap.split():
                vocabulary.add(w.lower())
        vocabulary = list(vocabulary)
        vocabulary.sort()
        with open('./data/vocabulary', 'wb') as f:
            wr = csv.writer(f)
            for voc in vocabulary:
                wr.writerow([voc])

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("filename")
    config = parser.parse_args()
    config = json.load(open(config.filename))