import numpy as np

# iter_corpus, ExtractText, EncodingText and visualise are assumed to be
# imported elsewhere in this module.


def analyse(factory):
    data = iter_corpus()
    predictor = factory()
    predictor.fit(data)

    # Re-extract and re-encode the corpus with the predictor's vocabulary.
    p1 = ExtractText()
    X1 = p1.transform(data)
    p2 = EncodingText(predictor.vocabulary)
    p2.fit(X1)
    X = p2.transform(X1)
    y = [a.rating for a in data]

    v1, v2, score, words = predictor.classifier.analyse(X, predictor.vocabulary)
    labels = ["neg", "pos", "mix", "other"]

    # Count how often each vocabulary word occurs under each rating.
    counter = []
    for i in range(len(set(y))):
        counter.append([0.0] * len(predictor.vocabulary))
    for i in range(np.size(X, 0)):
        x = X[i]
        label = y[i]
        for w in x:
            counter[label - 1][w] += 1.0
    counter = np.array(counter)

    # Assign each word the class under which it occurs most often.
    cl = []
    for i in range(len(predictor.vocabulary)):
        cl_max = max(counter[:, i])
        for j in range(len(set(y))):
            if counter[j, i] == cl_max:
                cl.append(j)
                break  # stop at the first maximum so cl stays aligned with the vocabulary

    visualise(v1, predictor.vocabulary, cl)
    visualise(v2, predictor.vocabulary, cl)

    for i in range(len(score)):
        print("sentiment - " + str(labels[i]))
        for j in range(len(score[i])):
            print(words[i][j] + " : " + str(score[i][j]))
def analyse(factory):
    data = iter_corpus()
    predictor = factory()
    predictor.fit(data)

    p1 = ExtractText(True)  # this variant lowercases the extracted text
    X1 = p1.transform(data)
    p2 = EncodingText(predictor.vocabulary)
    p2.fit(X1)
    X = p2.transform(X1)
    y = [a.rating for a in data]

    v1, v2, score, words = predictor.classifier.analyse(X, predictor.vocabulary)
    labels = ['neg', 'pos', 'mix', 'other']

    # Count how often each vocabulary word occurs under each rating.
    counter = []
    for i in range(len(set(y))):
        counter.append([0.0] * len(predictor.vocabulary))
    for i in range(np.size(X, 0)):
        x = X[i]
        label = y[i]
        for w in x:
            counter[label - 1][w] += 1.0
    counter = np.array(counter)

    # Assign each word the class under which it occurs most often.
    cl = []
    for i in range(len(predictor.vocabulary)):
        cl_max = max(counter[:, i])
        for j in range(len(set(y))):
            if counter[j, i] == cl_max:
                cl.append(j)
                break  # keep exactly one class per vocabulary word

    visualise(v1, predictor.vocabulary, cl)
    visualise(v2, predictor.vocabulary, cl)

    for i in range(len(score)):
        print('sentiment - ' + str(labels[i]))
        for j in range(len(score[i])):
            print(words[i][j] + ' : ' + str(score[i][j]))
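# Usage sketch (not part of the original source): analyse() expects a
# zero-argument factory that builds an unfitted predictor. Because it reads
# predictor.vocabulary and calls predictor.classifier.analyse(), the factory
# should build the RNN-style PhraseSentimentPredictor defined below; the
# classifier choice here is an illustrative assumption.
from predictor import PhraseSentimentPredictor

analyse(lambda: PhraseSentimentPredictor(classifier="rnn"))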
def __init__(self, classifier="sgd", classifier_args=None, lowercase=True, text_replacements=None, map_to_synsets=False, binary=False, min_df=0, ngram=1, stopwords=None, limit_train=None, map_to_lex=False, duplicates=False): """ Parameter description: - `classifier`: The type of classifier used as main classifier, valid values are "sgd", "knn", "svc", "randomforest". - `classifier_args`: A dict to be passed as arguments to the main classifier. - `lowercase`: wheter or not all words are lowercased at the start of the pipeline. - `text_replacements`: A list of tuples `(from, to)` specifying string replacements to be made at the start of the pipeline (after lowercasing). - `map_to_synsets`: Whether or not to use the Wordnet synsets feature set. - `binary`: Whether or not to count words in the bag-of-words representation as 0 or 1. - `min_df`: Minumim frequency a word needs to have to be included in the bag-of-word representation. - `ngram`: The maximum size of ngrams to be considered in the bag-of-words representation. - `stopwords`: A list of words to filter out of the bag-of-words representation. Can also be the string "english", in which case a default list of english stopwords will be used. - `limit_train`: The maximum amount of training samples to give to the main classifier. This can be useful for some slow main classifiers (ex: svc) that converge with less samples to an optimum. - `max_to_lex`: Whether or not to use the Harvard Inquirer lexicon features. - `duplicates`: Whether or not to check for identical phrases between train and prediction. """ self.limit_train = limit_train self.duplicates = duplicates # Build pre-processing common to every extraction pipeline = [ExtractText(lowercase)] if text_replacements: pipeline.append(ReplaceText(text_replacements)) # Build feature extraction schemes ext = [build_text_extraction(binary=binary, min_df=min_df, ngram=ngram, stopwords=stopwords)] if map_to_synsets: ext.append(build_synset_extraction(binary=binary, min_df=min_df, ngram=ngram)) if map_to_lex: ext.append(build_lex_extraction(binary=binary, min_df=min_df, ngram=ngram)) ext = make_union(*ext) pipeline.append(ext) # Build classifier and put everything togheter if classifier_args is None: classifier_args = {} classifier = _valid_classifiers[classifier](**classifier_args) self.pipeline = make_pipeline(*pipeline) self.classifier = classifier
def __init__(self, classifier="rnn", classifier_args=None, lowercase=False, text_replacements=None, map_to_synsets=False, binary=False, min_df=0, ngram=1, stopwords=None, limit_train=None, map_to_lex=False, duplicates=False): """ Parameter description: - `classifier`: The type of classifier used as main classifier, valid values are "sgd", "knn", "svc", "randomforest". - `classifier_args`: A dict to be passed as arguments to the main classifier. - `lowercase`: wheter or not all words are lowercased at the start of the pipeline. - `text_replacements`: A list of tuples `(from, to)` specifying string replacements to be made at the start of the pipeline (after lowercasing). - `map_to_synsets`: Whether or not to use the Wordnet synsets feature set. - `binary`: Whether or not to count words in the bag-of-words representation as 0 or 1. - `min_df`: Minumim frequency a word needs to have to be included in the bag-of-word representation. - `ngram`: The maximum size of ngrams to be considered in the bag-of-words representation. - `stopwords`: A list of words to filter out of the bag-of-words representation. Can also be the string "english", in which case a default list of english stopwords will be used. - `limit_train`: The maximum amount of training samples to give to the main classifier. This can be useful for some slow main classifiers (ex: svc) that converge with less samples to an optimum. - `max_to_lex`: Whether or not to use the Harvard Inquirer lexicon features. - `duplicates`: Whether or not to check for identical phrases between train and prediction. """ self.limit_train = limit_train self.duplicates = duplicates self.vocabulary=[] import csv with open('./data/vocabulary','rb') as f: rd=csv.reader(f) for line in rd: self.vocabulary.append(line[0]) # Build pre-processing common to every extraction pipeline1 = [ExtractText()] pipeline1.append(EncodingText(self.vocabulary)) pipeline=make_pipeline(*pipeline1) # Build classifier and put everything togheter if classifier_args is None: classifier_args = {'lambdaL': 0.0001, 'd': 50, 'cat': 4, 'lambdaCat': 1e-07, 'alpha': 0.2, 'lambdaW': 1e-05,'iter':70} if 'd' in classifier_args: d=classifier_args['d'] else: d=50 words_vectors=np.random.rand(d,len(self.vocabulary))*2*0.05-0.05 classifier = _valid_classifiers[classifier](vocab=len(self.vocabulary),words_vectors=words_vectors,**classifier_args) #classifier=rnn.RNN(d=50,cat=4,vocab=len(self.vocabulary),alpha=0.2,words_vectors=words_vectors,lambdaW=10**(-5),lambdaCat=10**(-7),lambdaL=10**(-4)) self.pipeline = pipeline self.classifier = classifier
if __name__ == "__main__": import argparse import json from evaluation import analyse from predictor import PhraseSentimentPredictor # get vocabulary from corpus import iter_corpus import csv,os from transformations import ExtractText if not os.path.exists('./data/vocabulary'): datapoints=list(iter_corpus()) vocabulary=set() et=ExtractText() X=et.transform(datapoints) for datap in X: for w in datap.split(): vocabulary.add(w) vocabulary=list(vocabulary) vocabulary.sort() with open('./data/vocabulary','wb') as f: wr=csv.writer(f) for voc in vocabulary: wr.writerow([voc]) parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("filename") config = parser.parse_args() config = json.load(open(config.filename))
if __name__ == "__main__": import argparse import json from evaluation import analyse from predictor import PhraseSentimentPredictor # get vocabulary from corpus import iter_corpus import csv, os from transformations import ExtractText if not os.path.exists('./data/vocabulary'): datapoints = list(iter_corpus()) vocabulary = set() et = ExtractText() X = et.transform(datapoints) for datap in X: for w in datap.split(): vocabulary.add(w.lower()) vocabulary = list(vocabulary) vocabulary.sort() with open('./data/vocabulary', 'wb') as f: wr = csv.writer(f) for voc in vocabulary: wr.writerow([voc]) parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("filename") config = parser.parse_args() config = json.load(open(config.filename))