def evaluate_classifier_Decision(featx):
    """Train an NLTK decision tree on 1% of each training class and score it.

    Reads the module-level ``train`` and ``test`` corpus readers.

    :param featx: callable turning a sequence of words into a featureset.
    :return: list ``['DecisionTree', accuracy, pos_precision, pos_recall,
        neg_precision, neg_recall]`` measured on the full test corpus.
    """
    def labeled_feats(corpus, fileids, label):
        # One (featureset, label) pair per corpus file.
        return [(featx(corpus.words(fileids=[f])), label) for f in fileids]

    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')

    # Floor division keeps the cutoffs integers on Python 2 and 3 alike;
    # the previous "*1/100" produced a float on Python 3, which cannot be
    # used as a slice bound.
    train_negcutoff = len(train_negids) // 100
    train_poscutoff = len(train_posids) // 100

    # Only the first 1% of each class is used for training, so features are
    # extracted for that subset only instead of for every training file.
    trainfeats_Decision = (
        labeled_feats(train, train_negids[:train_negcutoff], 'neg') +
        labeled_feats(train, train_posids[:train_poscutoff], 'pos')
    )
    testfeats = (
        labeled_feats(test, test.fileids('neg'), 'neg') +
        labeled_feats(test, test.fileids('pos'), 'pos')
    )

    DecisionTree_classifier = DecisionTreeClassifier.train(trainfeats_Decision)

    # refsets: gold label -> test indices; testsets_Decision: predicted
    # label -> test indices.
    refsets = collections.defaultdict(set)
    testsets_Decision = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed_Decision = DecisionTree_classifier.classify(feats)
        testsets_Decision[observed_Decision].add(i)

    accuracy3 = nltk.classify.util.accuracy(DecisionTree_classifier, testfeats)
    pos_precision3 = nltk.metrics.precision(refsets['pos'], testsets_Decision['pos'])
    pos_recall3 = nltk.metrics.recall(refsets['pos'], testsets_Decision['pos'])
    neg_precision3 = nltk.metrics.precision(refsets['neg'], testsets_Decision['neg'])
    neg_recall3 = nltk.metrics.recall(refsets['neg'], testsets_Decision['neg'])
    return ['DecisionTree', accuracy3, pos_precision3, pos_recall3,
            neg_precision3, neg_recall3]
def main_function(): conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter_analysis") hq_conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter") training_tweets = get_test_tweets(conn) training_feature_set = process_tweets(training_tweets) classifier = DecisionTreeClassifier.train(training_feature_set) test_tweets = get_training_tweets(conn) test_feature_set = process_tweets(test_tweets) classifier_accuracy = accuracy(classifier, test_feature_set) alt_full_matrix = {'+':{'+':0, '-':0, 'E':0}, '-':{'+':0, '-':0, 'E':0}, 'E':{'+':0, '-':0, 'E':0}} #for f in test_tweets: #f = test_tweets[0] #print f #guess = classifier.classify(process_tweet(f[1])) #print guess # update_tweet_polarity(f[0], guess, conn) ## pl = classifier.prob_classify(process_tweet(f[1])) # idx = f[2] # if idx == 'I' or idx == 'O': # idx = 'E' # alt_full_matrix[idx][guess] += 1 #print alt_full_matrix print "classifier accuracy: " + repr(classifier_accuracy)
def decisionTree(features_train, features_test): print 'train on %d instances, test on %d instances' % (len(features_train), len(features_test)) classifier = DecisionTreeClassifier.train(features_train, binary=True, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30) print 'accuracy:', nltk.classify.util.accuracy(classifier, features_test) precisions, recalls = precision_recall(classifier, features_test) print "accuracy: ", precisions, "fitness: ", recalls
def classify(inputdir): #filenames = os.listdir('d:\\shir\\') filenames = os.listdir(inputdir) feat_set = [] sets = [] for name in filenames: # print name lineno=0 path = os.path.join(inputdir, name) sense = name.split('\\')[-1].split('.')[0] print 'training', sense file = codecs.open(path, 'r', 'utf-8') allwords = [] for line in file: if len(line.split())>2: lineno+=1 line = line.strip() words=[] tags=[] tokens = line.split() for item in tokens: if len(item.split('\\'))==2: word=item.split('\\')[0] tag= item.split('\\')[1] words.append(word) tags.append(tag) allwords.append(word) feat_set.append((bag_of_words(line),sense)) #feat_set.append((get_feature2(line),sense)) else: words=[] tags=[] file.close() random.shuffle(feat_set) random.shuffle(feat_set) #random.shuffle(feat_set) train_data = train_feats(feat_set) test_data = test_feats(feat_set) #classifier= MaxentClassifier.train(train_data) nb_classifier = NaiveBayesClassifier.train(train_data) dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30) # pickle.dump(classifier, classifier_save_file) entropy_classifier = MaxentClassifier.train(train_data,algorithm='iis', trace=0, max_iter=1, min_lldelta=0.5) print "nb accuracy "+ str(accuracy(nb_classifier, test_data) * 100) print "dt accuracy "+ str(accuracy(dt_classifier, test_data) * 100) print "entropy accuracy "+ str(accuracy(entropy_classifier, test_data) * 100) mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, entropy_classifier) print "max vote accuracy "+ str(accuracy(mv_classifier, test_data) * 100)
def run(training): """ To create and train a DecisionTreeClassifier :return: a trained Classifier """ print "Training DT Classifier..." # feats = label_feat_from_corps(movie_reviews) # training, testing = split_label_feats(feats) dt_classifier = DecisionTreeClassifier.train(training, binary=True, entropy_cutoff=0.8, depth_cutoff=10, support_cutoff=30) print "DT Classifier trained..." return save_classifier(dt_classifier)
def trainDT(featuresets):
    """Return a DecisionTreeClassifier trained on all of ``featuresets``.

    No hold-out split is made here; every example is used for training.
    """
    return DecisionTreeClassifier.train(featuresets)
def classify_decision_tree(self): print "training decision tree" classifier = DecisionTreeClassifier.train(self.feature_vectors_tuples_for_train, depth_cutoff=200, entropy_cutoff=0.1) print "testing classifier" classified_labels = classifier.batch_classify([feature_set_tuple[0] for feature_set_tuple in self.feature_vectors_tuples_for_test]) correct = 0 wrong = 0 for i in range(0, len(classified_labels)): if classified_labels[i] is self.feature_vectors_tuples_for_test[i][1]: correct += 1 else: wrong += 1 print correct, wrong
def decision_tree_classifier(feature_vector_train, feature_vector_test): features_train, topics_train = zip(*feature_vector_train) features_test, topics_test = zip(*feature_vector_test) # training classifier2 = DecisionTreeClassifier.train(features_train, depth_cutoff=250, entropy_cutoff=0.1) # Kept an entropy cutoff in order to improve the training time (this might lead to loss in accuracy though) # Same goes for depth cutoff (for refining the tree). Kept it as 250. # testing predicted_topics = classifier2.classify_many(features_test) print classification_report(topics_test, predicted_topics, target_names=set(topics_test))
def dt_classify(filename): raw_sample_stream = get_samples_stream(filename) all_samples = list( binary_bow_feature(raw_sample_stream) ) # filter out two classes of outliers # these two categories contain too few examples, so the word frequency in these two categories # cannot reflect the true probability # all_samples = [(features,aspect) for features,aspect in all_samples if aspect != common.AspectNothing and aspect != common.AspectBusiness] test_sample_ratio = 0.25 train_samples,test_samples = split_samples(all_samples,test_sample_ratio) print "training set has {} samples, test set has {} samples".format(len(train_samples),len(test_samples)) classifier = DecisionTreeClassifier.train(train_samples,binary=True, depth_cutoff=15,verbose=True) print "training completes" print "training accuracy: {}".format(accuracy(classifier,train_samples)) print "test accuracy: {}".format(accuracy(classifier,test_samples)) return classifier
def decision_tree(train_data):
    """Preprocess each ``(data, label)`` item and train a decision tree on it.

    :param train_data: iterable whose items expose the raw data at index 0
        and the label at index 1.
    :return: the trained DecisionTreeClassifier.
    """
    prepared = [preprocess(item[0], label=item[1]) for item in train_data]
    return DecisionTreeClassifier.train(prepared)
def classify(inputdir):
    """Train a decision tree and a maxent classifier on files in ``inputdir``.

    Each file is one sense; its label is the file name up to the first dot.
    Lines with more than two whitespace-separated tokens are parsed as
    ``word\\tag`` items and turned into one featureset per line via
    ``bag_of_bigrams_words``. Only the maxent accuracy is printed; the
    naive Bayes and decision-tree reports are commented out.

    :param inputdir: directory holding the per-sense UTF-8 text files.
    """
    #filenames = os.listdir('d:\\shir\\')
    filenames = os.listdir(inputdir)
    feat_set = []
    sets = []
    for name in filenames:
        # print name
        # NOTE(review): labeledlist is reset for every file, so after the
        # loop it only holds the last file's (sense, words) pair -- confirm
        # this is intended.
        labeledlist = []
        lineno=0
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print 'training', sense
        file = codecs.open(path, 'r', 'utf-8')
        allwords = []
        for line in file:
            if len(line.split())>2:
                lineno+=1
                line = line.strip()
                words=[]
                tags=[]
                tokens = line.split()
                for item in tokens:
                    # Keep only well-formed "word\tag" items.
                    if len(item.split('\\'))==2:
                        word=item.split('\\')[0]
                        tag= item.split('\\')[1]
                        words.append(word)
                        tags.append(tag)
                        allwords.append(word)
                feat_set.append((bag_of_bigrams_words(words),sense))
                # feat_set.append((context_feature(line),sense))
            else:
                words=[]
                tags=[]
        # Number of usable lines found in this file.
        print lineno
        labeledlist.append((sense,allwords))
        # feat_set.append((bigram_feature(allwords),sense))
        file.close()
    # NOTE(review): the source formatting does not show whether this section
    # ran once per file or once after all files; it is placed after the loop
    # here -- verify against the original layout.
    high_info_words = set(high_information_words(labeledlist))
    for item in high_info_words:
        print item
    random.shuffle(feat_set)
    random.shuffle(feat_set)
    random.shuffle(feat_set)
    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)
    print "training on "+str(len(train_data))+" instances"
    print "testting on "+str(len(test_data))+" instances"
    #classifier= MaxentClassifier.train(train_data)
    # nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8, depth_cutoff=7, support_cutoff=10)
    # print dt_classifier.pp()
    # pickle.dump(classifier, classifier_save_file)
    entropy_classifier = MaxentClassifier.train(train_data,algorithm='iis', trace=0, max_iter=2, min_lldelta=0.5)
    # Only the heading is printed; the naive Bayes figures are disabled.
    print "nb accuracy "
    # print accuracy(nb_classifier, test_data) * 100
    # print "nb precision and recall"
    # print precision_recall(nb_classifier,test_data)
    # print nb_classifier.show_most_informative_features()
    # for item in nb_classifier.most_informative_features():
    #     print item
    # print "dt accuracy "+ str(accuracy(dt_classifier, test_data) * 100)
    print "entropy accuracy "+ str(accuracy(entropy_classifier, test_data) * 100)
def getClassifier(tweetfile,cfg): degreesToUse = cfg['NLPnGrams'] print "DEBOOOOO", degreesToUse, type(degreesToUse) classMode = cfg['NLPMode'].replace('-',' ').replace('_',' ') shortClass = classMode.replace(' ','').lower() loadNeeded = True if 'NLPTEST' not in cfg.keys(): degreeString = '-'.join([str(degree) for degree in degreesToUse]) pickleFile = 'nlpTrainers/'+tweetfile.replace('.csv','.'+shortClass+degreeString+'.pickle') if isfile(pickleFile): print "Loading pickled", shortClass, "classifier" fileIn = open(pickleFile) classifier = cPickle.load(fileIn) fileIn.close() loadNeeded = False if loadNeeded: if 'NLPTEST'in cfg.keys(): content = prepText(tweetfile) categorized = prepClassifications(content) NGrammized = collectNGrams(categorized,degreesToUse,cfg) else: print "Loading content & preparing text" content = prepText(loadFile(tweetfile)) print "Categorizing contents" categorized = prepClassifications(content) print "Deriving NGrams of length(s)", degreesToUse NGrammized = collectNGrams(categorized,degreesToUse,cfg) print "Compiling Results" readyToSend = [] allCats = [str(key) for key in NGrammized.keys()] for category in allCats: readyToSend += NGrammized[category] print "Attempting Classification by mode", classMode, degreesToUse if classMode == 'naive bayes': from nltk.classify import NaiveBayesClassifier classifier = {'class':NaiveBayesClassifier.train(readyToSend),'mode':'nb'} elif classMode == 'positive naive bayes': from nltk.classify import PositiveNaiveBayesClassifier classifier = {'class':PositiveNaiveBayesClassifier.train(readyToSend),'mode':'pnb'} elif classMode == 'max ent': #import nltk.classify #from sklearn.linear_model import LogisticRegression #from nltk.classify import SklearnClassifier #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'} from nltk.classify import MaxentClassifier classifier = {'class':MaxentClassifier.train(readyToSend,algorithm='iis'),'mode':'me'} elif classMode == 'decision tree': from 
nltk.classify import DecisionTreeClassifier classifier = {'class':DecisionTreeClassifier.train(readyToSend),'mode':'dt'} elif classMode == 'svm': if "SVMOrder" in cfg.keys(): priority = cfg['SVMOrder'] else: priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210" if type(priority) is str: priority = list(priority) priority = [entry for entry in priority if entry in allCats] preppedSVM = prepSVMAll(readyToSend,priority,allCats,cfg) classifier = {'class':preppedSVM,'mode':'svm','priority':priority} else: from nltk.classify import NaiveBayesClassifier classifier = {'class':NaiveBayesClassifier.train(readyToSend),'mode':'nb'} if 'NLPTEST' not in cfg.keys(): print "Pickling Classifier" fileOut = open(pickleFile, 'wb') cPickle.dump(classifier, fileOut) fileOut.close() if 'NLPTEST' not in cfg.keys(): if classMode != 'svm': classifier['class'].show_most_informative_features(n=150) """else: for key in classifier['class'].keys(): print classifier print classifier.keys() classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))""" return classifier
def train(self, features_label):
    """Fit a decision tree on ``features_label`` and keep it on the instance.

    The trained tree is stored in ``self._classifier``; nothing is returned.
    """
    cutoffs = {
        'entropy_cutoff': 0.05,
        'depth_cutoff': 200,
        'support_cutoff': 20,
    }
    fitted_tree = DecisionTreeClassifier.train(features_label, **cutoffs)
    self._classifier = fitted_tree
    return None
''' Created on Apr 25, 2010 @author: Ben ''' from nltk import classify from nltk.classify import DecisionTreeClassifier from edu.zoller.nlp import common print 'Reading feature words...' feature_words = common.read_tf_feature_words() print 'Assembling training feature sets...' train_set = [] for filename in common.train: year_class = common.get_40_year_class(filename) features = common.get_tf_features(filename, feature_words) train_set.append((features, year_class)) print 'Training classifier...' classifier = DecisionTreeClassifier.train(train_set) print 'Assembling test feature sets...' test_set = [] for filename in common.test: year_class = common.get_40_year_class(filename) features = common.get_tf_features(filename, feature_words) test_set.append((features, year_class)) print 'Classifying test accuracy' print classify.accuracy(classifier, test_set)
#print(len(X_test))

# Feature extraction: either rebuild the bag-of-words features and cache
# them on disk, or reuse the cached transformer and matrices.
if preprocess_flag:
    bow_transformer = CountVectorizer(analyzer=format_sentence).fit(X_train)
    X_train = bow_transformer.transform(X_train)
    X_test = bow_transformer.transform(X_test)
    for cached_object, cache_path in (
            (bow_transformer, 'FeatTransformer.pkl'),
            (X_train, 'TrainFeatures.pkl'),
            (X_test, 'TestFeatures.pkl')):
        joblib.dump(cached_object, cache_path)
else:
    bow_transformer = joblib.load('FeatTransformer.pkl')
    X_train = joblib.load('TrainFeatures.pkl')
    X_test = joblib.load('TestFeatures.pkl')

# Decision tree: dt_flag == 1 trains from scratch and dumps the model,
# dt_flag == 0 loads the previously dumped model.
dt_flag = 0
dt = DecisionTreeClassifier()
if not dt_flag:
    dt_clf = joblib.load('DTmodel.pkl')
else:
    dt_clf = dt.fit(X_train, Y_train)
    joblib.dump(dt_clf, 'DTmodel.pkl')

# Evaluate on the held-out set.
preds = dt_clf.predict(X_test)
cm = confusion_matrix(Y_test, preds)
print(cm)
print('\n')
print(classification_report(Y_test, preds))
#plot_roc_curve(dt_clf,X_test,Y_test)

plt.figure()
plot_confusion_matrix(
    cm,
    classes=['negative', 'positive'],
    normalize=True,
    title='Normalized confusion matrix - Decision Tree',
)
plt.show()
def getClassifier(tweetfile, cfg): degreesToUse = cfg['NLPnGrams'] print "DEBOOOOO", degreesToUse, type(degreesToUse) classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ') shortClass = classMode.replace(' ', '').lower() loadNeeded = True if 'NLPTEST' not in cfg.keys(): degreeString = '-'.join([str(degree) for degree in degreesToUse]) pickleFile = 'nlpTrainers/' + tweetfile.replace( '.csv', '.' + shortClass + degreeString + '.pickle') if isfile(pickleFile): print "Loading pickled", shortClass, "classifier" fileIn = open(pickleFile) classifier = cPickle.load(fileIn) fileIn.close() loadNeeded = False if loadNeeded: if 'NLPTEST' in cfg.keys(): content = prepText(tweetfile) categorized = prepClassifications(content) NGrammized = collectNGrams(categorized, degreesToUse, cfg) else: print "Loading content & preparing text" content = prepText(loadFile(tweetfile)) print "Categorizing contents" categorized = prepClassifications(content) print "Deriving NGrams of length(s)", degreesToUse NGrammized = collectNGrams(categorized, degreesToUse, cfg) print "Compiling Results" readyToSend = [] allCats = [str(key) for key in NGrammized.keys()] for category in allCats: readyToSend += NGrammized[category] print "Attempting Classification by mode", classMode, degreesToUse if classMode == 'naive bayes': from nltk.classify import NaiveBayesClassifier classifier = { 'class': NaiveBayesClassifier.train(readyToSend), 'mode': 'nb' } elif classMode == 'positive naive bayes': from nltk.classify import PositiveNaiveBayesClassifier classifier = { 'class': PositiveNaiveBayesClassifier.train(readyToSend), 'mode': 'pnb' } elif classMode == 'max ent': #import nltk.classify #from sklearn.linear_model import LogisticRegression #from nltk.classify import SklearnClassifier #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'} from nltk.classify import MaxentClassifier classifier = { 'class': MaxentClassifier.train(readyToSend, algorithm='iis'), 'mode': 'me' } elif classMode 
== 'decision tree': from nltk.classify import DecisionTreeClassifier classifier = { 'class': DecisionTreeClassifier.train(readyToSend), 'mode': 'dt' } elif classMode == 'svm': if "SVMOrder" in cfg.keys(): priority = cfg['SVMOrder'] else: priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210" if type(priority) is str: priority = list(priority) priority = [entry for entry in priority if entry in allCats] preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg) classifier = { 'class': preppedSVM, 'mode': 'svm', 'priority': priority } else: from nltk.classify import NaiveBayesClassifier classifier = { 'class': NaiveBayesClassifier.train(readyToSend), 'mode': 'nb' } if 'NLPTEST' not in cfg.keys(): print "Pickling Classifier" fileOut = open(pickleFile, 'wb') cPickle.dump(classifier, fileOut) fileOut.close() if 'NLPTEST' not in cfg.keys(): if classMode != 'svm': classifier['class'].show_most_informative_features(n=150) """else: for key in classifier['class'].keys(): print classifier print classifier.keys() classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))""" return classifier
print('neg recall:', recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))


# Model #2: **UNIGRAMS** & Decision Tree

# In[26]:

# Decision tree model, trained on the same split so it can be compared with
# the previous model.
import collections
from nltk import metrics
from nltk.metrics.scores import (accuracy, precision, recall, f_measure)
from nltk.classify import DecisionTreeClassifier
# Imported after nltk.metrics.scores on purpose: the classifier-level
# accuracy(classifier, gold) must be the one bound to the name "accuracy".
from nltk.classify.util import accuracy

dt_classifier = DecisionTreeClassifier.train(train_set,
                                             binary=True,
                                             entropy_cutoff=0.8,
                                             depth_cutoff=5,
                                             support_cutoff=30)
print(accuracy(dt_classifier, test_set))

# Start from empty index sets. Reusing the sets filled for the previous
# model would mix its predictions into this model's precision and recall.
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (uni_featureset, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = dt_classifier.classify(uni_featureset)
    testsets[observed].add(i)

print('pos precision:', precision(refsets['pos'], testsets['pos']))
print('pos recall:', recall(refsets['pos'], testsets['pos']))
print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', precision(refsets['neg'], testsets['neg']))
print('neg recall:', recall(refsets['neg'], testsets['neg']))
# save_train_feats is opened earlier in the script; finish writing it here.
pickle.dump(train_feats, save_train_feats)
save_train_feats.close()

# Each artefact below is written inside a "with" block so the file handle is
# closed even if pickling fails part-way.
with open("pickled_algos/test_feats", "wb") as save_test_feats:
    pickle.dump(test_feats, save_test_feats)

nb_classifier = NaiveBayesClassifier.train(train_feats)
print(accuracy(nb_classifier, test_feats))
with open("pickled_algos/nb_classifier", "wb") as save_nb_classifier:
    pickle.dump(nb_classifier, save_nb_classifier)

dt_classifier = DecisionTreeClassifier.train(train_feats)
print(accuracy(dt_classifier, test_feats))
with open("pickled_algos/dt_classifier", "wb") as save_dt_classifier:
    pickle.dump(dt_classifier, save_dt_classifier)

sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats)
print(accuracy(sk_classifier, test_feats))
with open("pickled_algos/sk_classifier", "wb") as save_sk_classifier:
    pickle.dump(sk_classifier, save_sk_classifier)
def words_bag(words): return dict([(word,True) for word in words]) neg_list = movie_reviews.fileids('neg') pos_list = movie_reviews.fileids('pos') negfeats = [(words_bag(movie_reviews.words(fileids=[f])), 'neg') for f in neg_list] posfeats = [(words_bag(movie_reviews.words(fileids=[f])), 'pos') for f in pos_list] '''gathering training and test data for decision trees''' negcutoff_train_dt = len(negfeats) poscutoff_train_dt = len(posfeats) training_data_dt = negfeats[:negcutoff_train_dt] + posfeats[:poscutoff_train_dt] classifier_dt = DecisionTreeClassifier.train(training_data_dt) print "Decision Trees" print 'train on %d instances:' % (len(training_data_dt)) sentence_list = [] #comments = "first half was good but oh boy the second was shit, overall good movie. no matter how many times I watch this, I still like it. must watch movie, i just want to touch the sweet panda" while 1: comments = raw_input("Enter a review comment ending with a dot :") sentence_list = sent_tokenize(comments) for sentence in sentence_list: word_punct = wordpunct_tokenize(sentence) for words in word_punct: input_cl = words_bag(words) print sentence + "--->" + classifier_dt.classify(input_cl)
def main():
    """Run one sentiment experiment as configured in ``constants``.

    Loads the corpus selected by ``constants.corpus``, optionally marks
    negation, splits each polarity 3:1 into train and test documents,
    installs the feature extractor selected by
    ``constants.feature_extractor``, trains the classifier selected by
    ``constants.classifier`` and prints its evaluation.

    :raises ValueError: if ``constants.corpus`` is not a supported corpus.
    """
    global tagger

    # sklearn.cross_validation was removed from scikit-learn; prefer its
    # replacement and fall back only on old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    if constants.corpus == constants.Corpus.movie_review:
        neg_docs, pos_docs = get_movie_corpus()
    elif constants.corpus == constants.Corpus.pol_debates:
        neg_docs, pos_docs = get_political_debates()
    else:
        # Previously this surfaced later as an unbound-variable error.
        raise ValueError("Unsupported corpus: %r" % (constants.corpus,))

    if constants.mark_negation:
        neg_docs = [nltk.sentiment.util.mark_negation(doc) for doc in neg_docs]
        pos_docs = [nltk.sentiment.util.mark_negation(doc) for doc in pos_docs]

    # Split between the training set and the testing set (3/4 vs 1/4).
    num_train_neg = len(neg_docs) * 3 // 4
    num_test_neg = len(neg_docs) - num_train_neg
    num_train_pos = len(pos_docs) * 3 // 4
    num_test_pos = len(pos_docs) - num_train_pos

    train_neg, test_neg = train_test_split(
        neg_docs, train_size=num_train_neg, test_size=num_test_neg)
    train_pos, test_pos = train_test_split(
        pos_docs, train_size=num_train_pos, test_size=num_test_pos)

    # Make the final train set and test set
    train_docs = train_pos + train_neg
    test_docs = test_pos + test_neg

    # Set up the Sentiment Analyzer
    analyzer = SentimentAnalyzer()

    extractor = constants.feature_extractor
    if extractor == constants.FeatureExtractor.bag_of_words:
        analyzer.add_feat_extractor(extract_bag_of_words_feats)
    elif extractor == constants.FeatureExtractor.freq_dist:
        analyzer.add_feat_extractor(extract_freq_dist)
    elif extractor == constants.FeatureExtractor.unigram:
        all_words = analyzer.all_words(train_docs, labeled=True)
        unigram_features = analyzer.unigram_word_feats(all_words, min_freq=1000)
        print("Length of unigram features: %d" % len(unigram_features))
        analyzer.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats,
                                    unigrams=unigram_features)
    elif extractor == constants.FeatureExtractor.bigram_bag_of_words:
        analyzer.add_feat_extractor(extract_sig_bigram_feats)
    elif extractor == constants.FeatureExtractor.adjective_bag_of_words:
        tagger = nltk.tag.HunposTagger(constants.hunpos_english_model)
        analyzer.add_feat_extractor(adjective_bag_of_words)
    elif extractor == constants.FeatureExtractor.pos_bag_of_words:
        tagger = nltk.tag.HunposTagger(constants.hunpos_english_model)
        # NOTE(review): this branch installs the adjective extractor, the
        # same as the branch above -- confirm that no POS-specific extractor
        # was intended.
        analyzer.add_feat_extractor(adjective_bag_of_words)

    train_feat = list(analyzer.apply_features(train_docs, labeled=True))
    test_feat = list(analyzer.apply_features(test_docs, labeled=True))

    print('train on %d instances, test on %d instances' % (len(train_feat), len(test_feat)))

    # A MaxentClassifier option existed only as commented-out code and is
    # not offered here.
    selected = constants.classifier
    if selected == constants.Classifier.naive_bays:
        classifier = NaiveBayesClassifier.train(train_feat)
    elif selected == constants.Classifier.decision_tree:
        classifier = SklearnClassifier(DecisionTreeClassifier()).train(train_feat)
    elif selected == constants.Classifier.linear_svm:
        classifier = SklearnClassifier(LinearSVC()).train(train_feat)
    elif selected == constants.Classifier.random_forest:
        classifier = SklearnClassifier(RandomForestClassifier()).train(train_feat)
    elif selected == constants.Classifier.logistic:
        classifier = SklearnClassifier(LogisticRegression()).train(train_feat)
    else:
        # Unknown classifier selections are ignored, as before.
        return

    analyzer.evaluate(test_feat, classifier, accuracy=True, f_measure=True,
                      precision=True, recall=True, verbose=True)
    if selected == constants.Classifier.naive_bays:
        classifier.show_most_informative_features()
document_words = set(document) features = {} for word in word_features: features['contains({})'.format(word)] = (word in document_words) return features featuresets = [(document_features(d), c) for (d, c) in documents] train_set = featuresets[:1000] test_set = featuresets[1000:] classifier = nltk.NaiveBayesClassifier.train(train_set) print("NaiveBayesClassifier Accuracy =>" + str(nltk.classify.accuracy(classifier, test_set) * 100)) classifier.show_most_informative_features(5) classifier = DecisionTreeClassifier.train(train_set, binary=False, entropy_cutoff=0.4, depth_cutoff=20, support_cutoff=50) print("DecisionTreeClassifier Accuracy =>" + str(nltk.classify.accuracy(classifier, test_set) * 100)) # To Test This Application Put in File 1.txt and try to make the text large as possible because the features not large (small data set) #InputList=[] #with open("1.txt", 'r') as f: # for line in f: # for word in line.split(): # InputList.append(word) # words.append(word) #print(classifier.classify(document_features(InputList)))
neg_features.append(k) negcutoff = len(neg_features)*3//4 poscutoff = len(pos_features)*3//4 trainfeats = neg_features[:negcutoff] + pos_features[:poscutoff] testfeats = neg_features[negcutoff:] + pos_features[poscutoff:] print ('\n') print('Total Training Instances - '+ str(len(trainfeats))) print( 'Total Testing Instances - ' + str(len(testfeats))) classifier = NaiveBayesClassifier.train(trainfeats) print ('\n') print('NaiveBayesClassifier accuracy:', nltk.classify.util.accuracy(classifier, testfeats)) classifier1 = DecisionTreeClassifier.train(trainfeats,entropy_cutoff=0) print ('\n') print('DecisionTreeClassifier accuracy:', nltk.classify.util.accuracy(classifier1, testfeats)) feature_names = ["polarity_nature","polarity_value"] X = df[feature_names] X.polarity_nature = X.polarity_nature.apply(lambda i: 0.0 if i=="neutral" else ( 1.0 if i=="postive" else -1.0)) df["status1"] = df.status.apply(lambda i: 0.0 if i==({u'fair': u'neutral'}, 1) else ( 1.0 if i==({u'fair': u'positive'}, 1) else -1.0)) y = df.status1 print (y.head()) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,test_size=0.2) print (len(X_train), len(X_test)) linreg = LinearRegression()
tgd=brown.tagged_words(categories="news") ''' print tgd[:3] [(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL')] ''' feats=[(pos_feats(w),c) for (w,c) in tgd] lens=int(len(feats)*0.2) tain,test=feats[lens:],feats[:lens] ''' print tain[:10] [({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': 
False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'AT'), ({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith 
aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'CD'), ({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, 
u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'NN'), ({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': 
False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'NN'), ({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 
343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': 
False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'VBG'), ({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 
271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'IN'), ({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, 
u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'NN-TL'), ({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': 
False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'NP'), ({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith 
xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'CS'), 
({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, 
u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'NN')] ''' clf=DecisionTreeClassifier.train(tain) print accuracy(clf,test) print clf.classify(pos_feats("dogs")) #0.144952759821 #NN ''' for w in brown.words()[:10]: print w print w[-2:] The he Fulton on County
def train(self, reviews_file):
    """
    Trains a classifier based on drug reviews with ratings and reports its
    stratified k-fold cross-validation accuracy.

    A review is labelled 'neg' when its rating is at or below
    ``self.negative_threshold`` and 'pos' when it is at or above
    ``self.positive_threshold``. Each fold trains a fresh model into
    ``self.model``, so after the call ``self.model`` holds the model trained
    on the last fold.

    Args:
        reviews_file: Reviews file to use for training.

    Raises:
        ValueError: If ``self.classifier_type`` is neither 'nb' nor 'dt'.
    """
    # Fail fast: otherwise the fold loop would score a missing or stale
    # ``self.model`` without ever training one.
    if self.classifier_type not in ('nb', 'dt'):
        raise ValueError(
            "Unsupported classifier_type: %r (expected 'nb' or 'dt')"
            % (self.classifier_type,))

    ## Parse data from files
    reviews = self.parse_reviews(reviews_file)
    with open('stopwords.txt') as stop_words_file:
        text = self.clean_text(stop_words_file.read())
        stop_words = text.splitlines()

    ## Parse and convert positive and negative examples
    positive_comments = []
    negative_comments = []
    for review in reviews:
        comment = self.format_text(review['comment'], stop_words)
        rating = float(review['rating'])
        # Two independent checks on purpose: if the thresholds overlap, a
        # review can land in both lists (same as the original behaviour).
        if rating <= self.negative_threshold:
            negative_comments.append((comment, 'neg'))
        if rating >= self.positive_threshold:
            positive_comments.append((comment, 'pos'))

    seed = 123
    numpy.random.seed(seed)
    # The subsampling below uses the stdlib ``random`` API, which
    # ``numpy.random.seed`` does not affect; use a dedicated seeded
    # generator so the sample is reproducible.
    rng = random.Random(seed)

    print("Total Negative Instances:" + str(len(negative_comments)))
    print("Total Positive Instances:" + str(len(positive_comments)))

    # Fraction of each class kept for cross-validation (1 keeps everything).
    sample_fraction = 1

    def _subsample(examples):
        # Pick a random subset, then restore the original ordering.
        keep = math.floor(len(examples) * sample_fraction)
        chosen = sorted(rng.sample(range(len(examples)), keep))
        return [examples[i] for i in chosen]

    neg_train = _subsample(negative_comments)
    pos_train = _subsample(positive_comments)

    dataset = neg_train + pos_train
    comments = [example[0] for example in dataset]
    ratings = [example[1] for example in dataset]

    kfold = StratifiedKFold(n_splits=self.iterations,
                            shuffle=True, random_state=seed)

    cvscores = []
    for train_idx, test_idx in kfold.split(comments, ratings):
        train_data = [dataset[i] for i in train_idx]
        test_data = [dataset[i] for i in test_idx]

        if self.classifier_type == 'nb':
            self.model = NaiveBayesClassifier.train(train_data)
        else:  # 'dt' (validated above)
            self.model = DecisionTreeClassifier.train(train_data)

        scores = nltk.classify.util.accuracy(self.model, test_data)
        print("{}%".format(scores * 100))
        cvscores.append(scores * 100)

    # Feature report for the model trained on the final fold.
    if self.classifier_type == 'nb':
        self.model.show_most_informative_features()
    print("%.2f%% (+/- %.2f%%)" %
          (numpy.mean(cvscores), numpy.std(cvscores)))