class SentimentAnalyzer():
    """Train, persist and apply a bag-of-words sentiment classifier.

    Model artifacts (classifier, feature set, class labels, most common
    words) are stored under ``{dir}/training/model`` and reloaded by the
    constructor when present, so an already trained model is not rebuilt.
    """

    # model names accepted by train()
    _SUPPORTED_MODELS = ("NB", "MNB", "SVM", "LR")

    # @param dir folder where the training set resides, and where to put the test result
    # @param model algorithm used for sentiment analysis
    def __init__(self, dir, model):
        self.dir = dir
        self.model = model
        self.preprocessor = Preprocessor(dir)
        model_dir = f"{self.dir}/training/model"
        # open existing files if a model has been built before. no need to reprocess
        self.classifier = self._load_pickle(f"{model_dir}/{model}_clf.pickle", None)
        self.features = self._load_pickle(f"{model_dir}/features.pickle", [])
        self.classes = self._load_pickle(f"{model_dir}/classes.pickle", [])
        words_path = f"{model_dir}/most_common_words.txt"
        if os.path.isfile(words_path):
            with open(words_path, "r", encoding="utf8") as reader:
                self.most_common_words = [w.strip() for w in reader.readlines()]
        else:
            self.most_common_words = []

    @staticmethod
    def _load_pickle(path, default):
        """Return the object pickled at *path*, or *default* when the file is absent."""
        if not os.path.isfile(path):
            return default
        # NOTE: unpickling runs arbitrary code; these files must only ever be
        # artifacts written by this class, never externally supplied data.
        with open(path, "rb") as reader:
            return pickle.load(reader)

    @staticmethod
    def _dump_pickle(obj, path):
        """Pickle *obj* to *path*, closing the file even if dumping fails."""
        with open(path, "wb") as writer:
            pickle.dump(obj, writer)

    # remove most common words (top 1%) that appear in both positive and negative documents
    def _remove_most_common_words(self, documents):
        """Strip the words that are among the top 1% of every class.

        The resulting word set is stored in ``self.most_common_words`` and
        written to ``most_common_words.txt`` in the model folder.

        @param documents iterable of Document objects (name, content, sentiment, location)
        @return new list of Document objects with the common words removed
        """
        print("Define most common words...")
        common = None  # None = no class seen yet; an empty set is a valid intersection
        for cls in self.classes:
            doc_words = [
                w for d in documents if d.sentiment == cls
                for w in word_tokenize(d.content.replace(".", ""))
            ]
            fdist = nltk.FreqDist(doc_words)
            top_words = {
                w[0] for w in fdist.most_common(int(0.01 * len(doc_words)))
            }
            # Intersect across classes. Testing for an empty set here would
            # restart the intersection whenever it legitimately became empty.
            common = top_words if common is None else common & top_words
        most_common_words = common if common is not None else set()
        self.most_common_words = most_common_words
        with open(f"{self.dir}/training/model/most_common_words.txt",
                  "w", encoding="utf8") as writer:
            writer.writelines([f"{w}\n" for w in self.most_common_words])

        ndocs = []
        for doc_count, d in enumerate(documents, 1):
            ncontent = " ".join([
                w for w in word_tokenize(d.content)
                if w not in most_common_words
            ])
            ndocs.append(Document(d.name, ncontent, d.sentiment, d.location))
            print("\r", end="")
            print("Removing most common words progress",
                  int(doc_count / len(documents) * 100), "%",
                  end="", flush=True)
        print("")
        return ndocs

    # only keep adjectives, adverbs, and nouns
    def _reduce_dimension_by_postag(self, documents):
        """Keep only tokens whose POS tag is listed in preprocess.ADJ/ADV/NOUN.

        Documents that end up with no tokens are dropped.

        @param documents iterable of Document objects
        @return list of reduced Document objects
        """
        reduced_documents = []
        for doc_count, doc in enumerate(documents, 1):
            tagged = nltk.pos_tag(word_tokenize(doc.content.replace(".", "")))
            reduced_sentence = " ".join([
                p[0] for p in tagged
                if p[1] in preprocess.ADJ or p[1] in preprocess.ADV
                or p[1] in preprocess.NOUN
            ])
            # "".isspace() is False, so `not s.isspace()` let empty documents
            # through; strip() rejects both empty and whitespace-only text.
            if reduced_sentence.strip():
                reduced_documents.append(
                    Document(doc.name, reduced_sentence, doc.sentiment,
                             doc.location))
            print("\r", end="")
            print("Reducing dimension in progress",
                  int(doc_count * 100 / len(documents)), "%",
                  end="", flush=True)
        print("")
        return reduced_documents

    def create_frequency_plot(self, words, top_k):
        """Plot the frequency distribution of the *top_k* most frequent *words*."""
        p = nltk.FreqDist(words)
        p.plot(top_k)

    def _undersample(self, documents):
        """Randomly cut every class down to the size of the smallest class.

        @param documents list of objects exposing a ``sentiment`` attribute
        @return list holding the same number of documents for each class in self.classes
        """
        # find the minimum number of documents in a class
        docs_by_class = [
            [d for d in documents if d.sentiment == cls]
            for cls in self.classes
        ]
        minclass_length = min(
            [len(documents)] + [len(docs) for docs in docs_by_class])
        # sample all classes based on the minimum number of documents
        undersampled_docs = []
        for docs in docs_by_class:
            random.shuffle(docs)
            undersampled_docs.extend(docs[:minclass_length])
        return undersampled_docs

    # preprocessing
    def prepare_documents(self):
        """Load every pickled document list from ``training/data`` and preprocess it.

        Class labels are derived from the documents (and persisted) when none
        were loaded; the documents are then undersampled, reduced by POS tag
        and stripped of the most common words.

        @return list of preprocessed Document objects
        """
        documents = []
        data_dir = f"{self.dir}/training/data"
        for file in os.listdir(data_dir):
            documents.extend(self._load_pickle(f"{data_dir}/{file}", []))
        if not self.classes:
            self.classes = set([doc.sentiment for doc in documents])
            self._dump_pickle(self.classes,
                              f"{self.dir}/training/model/classes.pickle")
        print("Perform undersampling...")
        documents = self._undersample(documents)
        documents = self._reduce_dimension_by_postag(documents)
        documents = self._remove_most_common_words(documents)
        return documents

    def transform_into_featuresets(self, documents):
        """Turn documents into ``({word: True, ...}, sentiment)`` pairs.

        The vocabulary of all documents becomes ``self.features`` and is
        pickled to the model folder.

        @param documents list of Document objects
        @return list of (feature dict, sentiment) tuples
        """
        # tokenize each document once; the tokens feed both the vocabulary
        # and the per-document feature dicts
        tokenized = [word_tokenize(d.content) for d in documents]
        self.features = set(w for tokens in tokenized for w in tokens)
        self._dump_pickle(self.features,
                          f"{self.dir}/training/model/features.pickle")
        print("Features length:", len(self.features))
        featuresets = []
        print("Transforming into featuresets....")
        for doc_count, (doc, tokens) in enumerate(zip(documents, tokenized), 1):
            # self.features is a set, so this membership test is O(1)
            featuresets.append(
                ({w: True for w in tokens if w in self.features},
                 doc.sentiment))
            print("\r", end='')
            print("Preparing featureset in progress",
                  int(doc_count * 100 / len(documents)), "%",
                  end='', flush=True)
        print("")
        return featuresets

    def get_training_validation_set(self, featuresets, valid_ratio):
        """Split featuresets per class into training and validation parts.

        @param featuresets list of (feature dict, label) tuples
        @param valid_ratio fraction of each class reserved for validation
        @return (trainingset, validset)
        """
        if not self.classes:
            # The labels must be stored on the instance: assigning them to a
            # local left self.classes empty, so nothing was split and an
            # empty class list was pickled.
            self.classes = set([f[1] for f in featuresets])
            self._dump_pickle(self.classes,
                              f"{self.dir}/training/model/classes.pickle")
        trainingset = []
        validset = []
        for c in self.classes:
            subfeat = [f for f in featuresets if f[1] == c]
            random.shuffle(subfeat)
            trainct = int((1 - valid_ratio) * len(subfeat))
            trainingset.extend(subfeat[:trainct])
            validset.extend(subfeat[trainct:])
        return trainingset, validset

    def train(self, validation_ratio):
        """Build, evaluate and pickle the classifier selected by ``self.model``.

        @param validation_ratio fraction of each class held out for validation
        @raises ValueError if ``self.model`` is not one of NB, MNB, SVM, LR
        """
        if self.model not in self._SUPPORTED_MODELS:
            # fail before the expensive preprocessing instead of crashing on
            # a None classifier afterwards
            raise ValueError(
                f"unsupported model {self.model!r}; "
                f"expected one of {self._SUPPORTED_MODELS}")
        os.makedirs(f"{self.dir}/training/model", exist_ok=True)
        documents = self.prepare_documents()
        featuresets = self.transform_into_featuresets(documents)
        trainset, validset = self.get_training_validation_set(
            featuresets, validation_ratio)
        print("Building classifier...")
        if self.model == "NB":
            self.classifier = nltk.NaiveBayesClassifier.train(trainset)
            self.classifier.show_most_informative_features(15)
        elif self.model == "MNB":
            self.classifier = SklearnClassifier(
                MultinomialNB()).train(trainset)
        elif self.model == "SVM":
            self.classifier = SklearnClassifier(SVC()).train(trainset)
        elif self.model == "LR":
            self.classifier = SklearnClassifier(
                LogisticRegression()).train(trainset)
        print("Accuracy per class")
        for cls in self.classes:
            print(f"{cls} accuracy:", (nltk.classify.accuracy(
                self.classifier,
                [v for v in validset if v[1] == cls])) * 100)
        print("Classifier accuracy percent:",
              (nltk.classify.accuracy(self.classifier, validset)) * 100)
        self._dump_pickle(
            self.classifier,
            f"{self.dir}/training/model/{self.model}_clf.pickle")

    def show_most_informative_features(self, n):
        """Delegate to the classifier's own ``show_most_informative_features``."""
        self.classifier.show_most_informative_features(n)

    def sentiment(self, text):
        """Classify *text*.

        @param text raw text to classify
        @return (label, probability of that label)
        """
        # to ensure that the word is lemmatized properly so it is detected in self.features
        cleaned_text = self.preprocessor.basic_preprocess(text).replace(
            ".", "")
        # no need advanced self processing because the features have been determined
        feature = {
            w: True
            for w in word_tokenize(cleaned_text) if w in self.features
        }
        prob_dict = self.classifier.prob_classify(feature)
        cls = prob_dict.max()
        prob = prob_dict.prob(cls)
        return cls, prob

    def classify(self, test_dir):
        """Classify every unprocessed CSV in ``{dir}/{test_dir}/data``.

        A result CSV with the same name is written to
        ``{dir}/{test_dir}/results``; finished file names are appended to
        ``classify_done.txt`` so that a rerun skips them. The classifier is
        trained first (validation ratio 0.2) when none was loaded.

        @param test_dir sub-folder of ``dir`` that holds ``data/*.csv``
        """
        print("Start classifying...")
        if self.classifier is None:
            self.train(0.2)
        elif hasattr(self.classifier, "show_most_informative_features"):
            # train() only calls this for the NB model; a cached classifier of
            # another type may not provide it, so check before calling
            self.classifier.show_most_informative_features(15)

        data_dir = f"{self.dir}/{test_dir}/data"
        results_dir = f"{self.dir}/{test_dir}/results"
        # NOTE(review): the progress file lives under "testing" regardless of
        # test_dir -- confirm this is intended before changing it.
        done_path = f"{self.dir}/testing/classify_done.txt"

        files = [os.path.basename(x) for x in glob.glob(f"{data_dir}/*.csv")]
        done_files = set()
        if os.path.isfile(done_path):
            with open(done_path, "r", encoding="utf8") as reader:
                done_files = {line.strip() for line in reader}
        tbp_files = [f for f in files if f not in done_files]

        headers = [
            "review_page", "review_title", "review_content", "review_star",
            "reviewer_location", "review_date", "crawled_date"
        ]
        os.makedirs(results_dir, exist_ok=True)
        for file in tbp_files:
            rownum = self.preprocessor.count_lines(f"{data_dir}/{file}")
            with open(f"{data_dir}/{file}", "r", encoding="utf8") as f, \
                    open(f"{results_dir}/{file}", "w", encoding="utf8",
                         newline="") as w:
                csvreader = csv.DictReader(f)
                csvwriter = csv.writer(w)
                csvwriter.writerow(headers)
                for rowid, row in enumerate(csvreader, 1):
                    cat = self.sentiment(
                        f"{row['review_title']}. {row['review_content']}")
                    # the predicted label is stored as a synthetic star rating
                    review_star = "45" if cat[0] == "pos" else "20"
                    csvwriter.writerow([
                        row["review_page"], row["review_title"],
                        row["review_content"], review_star,
                        row["user_location"], row["review_date"], "00000000"
                    ])
                    w.flush()
                    print("\r", end='')
                    print("Classifying in progress",
                          int(rowid * 100 / max(rownum, 1)), "% for", file,
                          end='', flush=True)
            with open(done_path, "a", encoding="utf8") as writer:
                writer.write(f"{file}\n")
# Experiment script: build labelled feature sets from the tweet test file,
# train a logistic-regression classifier on one split and report its accuracy
# on the other. print statements were converted to print() calls because the
# statement form is a SyntaxError on Python 3.
print("creating feature sets...")
tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/testdata.csv')
labeld_features = label_feats_from_tweets(tweetlist)
#labeld_features = label_feats_from_corpus(movie_reviews)
training_set, test_set = split_label_feats(labeld_features)
# Alternative setup: train on the large tweet corpus instead.
# tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv')
# training_set = label_feats_from_tweets(tweetlist)
# training_set, garbage = split_label_feats(training_set, 1.0)
# test_set, garbage = split_label_feats(labeld_features, 1.0)
print("training set length: %i  test set length: %i" % (len(training_set), len(test_set)))
print(prettifyFeatureSet(test_set))

print("training classifier...")
# Other classifiers that were tried:
#classifier = NaiveBayesClassifier.train(training_set)
#classifier = MaxentClassifier.train(training_set, algorithm='iis', max_iter=99, min_lldelta=0.01)
#classifier = MaxentClassifier.train(training_set)
classifier = SklearnClassifier(LogisticRegression()).train(training_set)

print("calculating accuracy...")
print('accuracy:', nltk.classify.util.accuracy(classifier, test_set))
#classifier.show_most_informative_features(30)

# Smoke test on a hand-written negative sentence.
negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
print(classifier.classify(negfeat))
probdist = classifier.prob_classify(negfeat)
print("pos: ", probdist.prob('pos'), " neg: ", probdist.prob('neg'))
print(classifier.labels())

classify_tweet(classifier, "I love this movie!", True)
classify_tweet(classifier, "!!!", True)
# Report the accuracy of every trained classifier on the held-out features.
for _label, _clf in (
        ('BNB_classifier', BNB_classifier),
        ('LGR_classifier', LGR_classifier),
        ('SDGC_classifier', SDGC_classifier),
        ('SVC_classifier', SVC_classifier),
        ('LSVC_classifier', LSVC_classifier),
        ('NuSVC_classifier', NuSVC_classifier),
):
    print(_label + ' accuracy: ', nltk_accuracy(_clf, features_test))

# Test input movie reviews
# The with-statement closes the input file, so no explicit close() is needed.
with open('text.txt', 'r', encoding='utf-8') as f1:
    input_reviews = sent_tokenize(f1.read())

# Write one tab-separated line per sentence. The context manager guarantees
# result.txt is closed even if classification raises part-way through.
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write("Review\tPredicted sentiment\tProbability\n")
    for review in input_reviews:
        review = review.replace('\n', ' ')
        f.write(review + '\t')
        # Compute the probabilities
        probabilities = LGR_classifier.prob_classify(extract_features(review.split()))
        # Pick the maximum value
        predicted_sentiment = probabilities.max()
        # Print outputs
        f.write(predicted_sentiment + '\t')
        f.write('{}'.format(round(probabilities.prob(predicted_sentiment), 2)) + '\n')
class RForests(text_classifier.TextClassifier):
    """Random-forest text classifier wrapped in NLTK's SklearnClassifier.

    Relies on ``getFeatures()`` and ``gene_features()``, which are not defined
    in this class (presumably supplied by the base class -- confirm).
    """

    def __init__(self,trainDir,labelFile,numTrees=10,numJobs=1):
        """Store the configuration and create an untrained classifier.

        Training is not started here; call ``train()`` explicitly.

        trainDir  -- stored as ``self.trainingDir``
        labelFile -- stored as ``self.labelFile``
        numTrees  -- ``n_estimators`` of the random forest
        numJobs   -- ``n_jobs`` of the random forest
        """
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.numJobs = numJobs
        self.classifier = self._new_classifier()

    def _new_classifier(self):
        """Return a fresh, untrained forest using the configured tree/job counts."""
        return SklearnClassifier(RandomForestClassifier(
            n_estimators=self.numTrees,
            n_jobs=self.numJobs),sparse=False)

    def train(self):
        """Train the current classifier on all feature sets."""
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)

    def trainingError(self):
        """Return the accuracy of the current classifier on its training data."""
        feature_sets = self.getFeatures()
        return nltk.classify.accuracy(self.classifier,feature_sets)

    def kfoldCrossValidation(self,k):
        """Return the mean accuracy over a k-fold cross validation.

        Each fold is held out once while a fresh classifier is trained on the
        remaining folds; the last fold absorbs the remainder when the data
        does not divide evenly. ``self.classifier`` is left holding the model
        of the last fold.

        Raises ValueError unless 2 <= k <= number of feature sets.
        """
        feature_sets = self.getFeatures()
        total = len(feature_sets)
        if k < 2 or k > total:
            raise ValueError(
                "k must be between 2 and the number of feature sets "
                "(%d), got %r" % (total, k))
        fold_size = total // k  # integer division: used as a slice index
        accuracy_sum = 0.0
        for i in range(k):
            start = fold_size * i
            stop = total if i == k - 1 else start + fold_size
            test_set = feature_sets[start:stop]
            train_set = feature_sets[:start] + feature_sets[stop:]
            self.classifier = self._new_classifier()
            # train on the other folds only; training on everything would
            # leak the test fold into the model
            self.classifier.train(train_set)
            accuracy_sum += nltk.classify.accuracy(self.classifier,test_set)
        return accuracy_sum / k

    def leave1OutCrossValidation(self):
        """Return the mean accuracy of a leave-one-out cross validation.

        Raises ValueError when there are no feature sets.
        """
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        if N == 0:
            raise ValueError("no feature sets to validate on")
        accuracy_sum = 0.0
        for i in range(N):
            train_set = feature_sets[:i] + feature_sets[i+1:]
            test_set = [feature_sets[i]]
            self.classifier = self._new_classifier()
            # the held-out item must not be part of the training data
            self.classifier.train(train_set)
            accuracy_sum += nltk.classify.accuracy(self.classifier,test_set)
        return accuracy_sum / N

    def learningCurve(self,numTrials=4):
        """Construct a learning curve to see if there is overfitting.

        For every training size k in ``1 .. len(feature_sets) - 2`` the feature
        sets are shuffled ``numTrials`` times, a fresh classifier is trained on
        the first k items and scored on the rest.

        Returns the list of mean accuracies, one per training size.
        """
        accuracies = []
        feature_sets = self.getFeatures()
        for k in range(1,len(feature_sets)-1):
            total = 0
            for _ in range(numTrials):
                self.classifier = self._new_classifier()
                random.shuffle(feature_sets)
                train_set,test_set = feature_sets[:k],feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier,test_set)
                print(len(train_set),len(test_set),p)
                total+=p
            accuracies.append(total/numTrials)
        return accuracies

    def testClassify(self,k):
        """Hold out k shuffled feature sets, train on the rest, predict the k.

        Returns ``(ref_labels, pred_labels)`` for the held-out items.
        """
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = self._new_classifier()
        self.classifier.train(feature_sets[k:])
        features,ref_labels = zip(*feature_sets[:k])
        # classify_many is the current NLTK name of the former batch_classify
        pred_labels = self.classifier.classify_many(features)
        return ref_labels,pred_labels

    def confusionMatrix(self,ref,test):
        """Return an nltk ConfusionMatrix for two sequences of (id, label) pairs.

        Both sequences are ordered by id first so that labels line up. Sorted
        copies are used, so the arguments are not modified and any iterable
        of pairs is accepted.
        """
        ref_sorted = sorted(ref, key=lambda x: x[0])
        test_sorted = sorted(test, key=lambda x: x[0])
        _,ref_labels = zip(*ref_sorted)
        _,test_labels = zip(*test_sorted)
        return ConfusionMatrix(ref_labels, test_labels)

    def _label_for_text(self,text):
        """Return the predicted label for *text*, or 'na' when it is empty."""
        if text=='':
            return 'na'
        featureset = self.gene_features(word_reg.findall(text))
        return self.classifier.classify(featureset)

    def prob_classify(self,db,fastain):
        """Classify every FASTA record using text fetched via genbank.proteinQuery.

        The protein id is the sixth '|'-separated field of the record id.
        Records without text get the label 'na' and ``None`` as distribution.
        The earlier debugging asserts that rejected two consecutive records
        with identical text were dropped, matching the sibling methods.

        Returns ``(proIDs, labels, pds)``, three lists of equal length.
        """
        proIDs,pds,labels = [],[],[]
        for seq_record in SeqIO.parse(fastain, "fasta"):
            proteinID = seq_record.id.split("|")[5]
            query_rows = genbank.proteinQuery(proteinID,db)
            # rows are (id, text) pairs; an empty result yields '' and so
            # 'na' instead of an unpacking error
            text = ''.join(str(t) for _,t in query_rows)
            if text=='':
                label,dist = 'na',None
            else:
                featureset = self.gene_features(word_reg.findall(text))
                # both calls take a single featureset, not a list
                label = self.classifier.classify(featureset)
                dist = self.classifier.prob_classify(featureset)
            proIDs.append(proteinID)
            pds.append(dist)
            labels.append(label)
        return proIDs,labels,pds

    def classifyPickle(self,pickle,fastain):
        """Classify every FASTA record using text from a pickled GenBankTable.

        pickle  -- path/handle passed to ``GenBankTable.load`` (note: the
                   parameter name shadows the ``pickle`` module in this method)
        fastain -- FASTA input whose record ids carry the locus tag as the
                   sixth '|'-separated field

        Returns a list of ``(locus_tag, label)`` pairs.
        """
        proIDs,labels = [],[]
        gbkTable = genbank.GenBankTable()
        gbkTable.load(pickle)
        for seq_record in SeqIO.parse(fastain, "fasta"):
            locus_tag = seq_record.id.split("|")[5]
            text = gbkTable.getLocusText(locus_tag)
            proIDs.append(locus_tag)
            labels.append(self._label_for_text(text))
        # materialize the pairs: on Python 3 zip() is a one-shot iterator
        return list(zip(proIDs,labels))

    def classifyDB(self,db,fastain):
        """Classifies proteins based on its text from sqlite3 database.

        Returns a list of ``(locus_tag, label)`` pairs; loci without protein
        rows or without text are labelled 'na'.
        """
        proIDs,labels = [],[]
        for seq_record in SeqIO.parse(fastain, "fasta"):
            locus_tag = seq_record.id.split("|")[5]
            locus_rows = genbank_sqlite3.locusQuery(locus_tag,db)
            protein_rows = []
            for _locus,proteinID in locus_rows:
                protein_rows+=genbank_sqlite3.proteinQuery(proteinID,db)
            # rows are (id, text) pairs; no rows yields '' and so 'na'
            text = ''.join(str(t) for _,t in protein_rows)
            proIDs.append(locus_tag)
            labels.append(self._label_for_text(text))
        # materialize the pairs: on Python 3 zip() is a one-shot iterator
        return list(zip(proIDs,labels))

    def classify(self,dbin,fastain,type='sqlite3'):
        """Dispatch to classifyDB (type 'sqlite3') or classifyPickle (anything else)."""
        if type=='sqlite3':
            return self.classifyDB(dbin,fastain)
        else:
            return self.classifyPickle(dbin,fastain)
def main():
    """Train a per-word classifier and write its answers for every test file.

    For each ``*data`` file under ``args.testset`` the WSD problems are
    extracted, training data for the head word is loaded, a logistic
    regression classifier is trained once, and for every problem the single
    best answer and the top-five answers are appended to
    ``<outputdir>/<word>.es.best`` and ``<outputdir>/<word>.es.oof``.
    A word with no training samples or fewer than two labels is skipped.
    """
    args = get_argparser().parse_args()

    util.DPRINT = args.dprint
    featureset_name = os.path.basename(args.featurefn).split('.')[0]
    features.load_featurefile(args.featurefn)

    ## default is 1e-4.
    THETOL = 1e-3

    mfs = learn.MFSClassifier()
    classifier = SklearnClassifier(LogisticRegression(C=1,
                                                      penalty='l2',
                                                      tol=THETOL))
    classifier_pairs = [("MFS", mfs), ("maxent-l2-c1", classifier)]
    stamp = util.timestamp()

    for testfile in glob(args.testset + "/*data"):
        problems = semeval_testset.extract_wsd_problems(testfile)

        headword = problems[0][0]
        assert headword.endswith(".n")
        headword = headword[:-2]
        load_training_for_word(headword, args.bitextfn, args.alignfn,
                               args.annotatedfn)

        # start from clean output files: answers are appended per problem
        bestoutfn = args.outputdir + "/{0}.{1}.best".format(headword, "es")
        oofoutfn = args.outputdir + "/{0}.{1}.oof".format(headword, "es")
        for stale in (bestoutfn, oofoutfn):
            if os.path.exists(stale):
                os.remove(stale)

        training = None
        for problem in problems:
            word = problem[0]
            assert word.endswith(".n")
            word = word[:-2]
            print(problem)

            # training data is fetched and the classifier fitted only for
            # the first problem of a file
            if training is None:
                training = trainingdata.trainingdata_for(word, nonnull=True)
                print("got {0} instances for {1}".format(len(training), word))
                labels = set(label for (feat,label) in training)

                if len(training) == 0:
                    print("no samples for", word)
                    break
                if len(labels) < 2:
                    print("there's only one sense for", word, " and it is ",
                          labels)
                    break
                classifier.train(training)

            rawtext = problem[2]
            surface, index = semeval_testset.head_surface_and_index(rawtext)
            replaced = re.sub(r"<head>(.*)</head>", " \\1 ", rawtext)
            annotated = preprocessing.preprocess(replaced, "en")
            sentence = [token.lemma for token in annotated]

            focus_index = find_head_token_index(annotated, surface, index)
            feats = features.extract_untagged(sentence, annotated, focus_index)

            bestoutfn = args.outputdir + "/{0}.{1}.best".format(word, "es")
            oofoutfn = args.outputdir + "/{0}.{1}.oof".format(word, "es")
            with open(bestoutfn, "a") as bestoutfile, \
                 open(oofoutfn, "a") as oofoutfile:
                answer = classifier.classify(feats)
                print(answer)
                dist = classifier.prob_classify(feats)
                oof_answers = topfive(dist)
                print(output_one_best(problem, "es", answer),
                      file=bestoutfile)
                print(output_five_best(problem, "es", oof_answers),
                      file=oofoutfile)