def evaluate_classifier_Decision(featx):

    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')
    test_negids = test.fileids('neg')
    test_posids = test.fileids('pos')
    train_negfeats = [(featx(train.words(fileids=[f])), 'neg') for f in train_negids]
    train_posfeats = [(featx(train.words(fileids=[f])), 'pos') for f in train_posids]
    test_negfeats = [(featx(test.words(fileids=[f])), 'neg') for f in test_negids]
    test_posfeats = [(featx(test.words(fileids=[f])), 'pos') for f in test_posids]
    trainfeats = train_negfeats + train_posfeats
    testfeats = test_negfeats + test_posfeats

    # use only 1% of the training data (DecisionTreeClassifier trains slowly);
    # integer division keeps the slice indices as ints
    train_negcutoff = len(train_negfeats) * 1 // 100
    train_poscutoff = len(train_posfeats) * 1 // 100
    trainfeats_Decision = train_negfeats[:train_negcutoff] + train_posfeats[:train_poscutoff]
    DecisionTree_classifier = DecisionTreeClassifier.train(trainfeats_Decision)
    refsets = collections.defaultdict(set)
    testsets_Decision = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed_Decision = DecisionTree_classifier.classify(feats)
        testsets_Decision[observed_Decision].add(i)

    accuracy3 = nltk.classify.util.accuracy(DecisionTree_classifier, testfeats)  
    pos_precision3 = nltk.metrics.precision(refsets['pos'], testsets_Decision['pos'])
    pos_recall3 = nltk.metrics.recall(refsets['pos'], testsets_Decision['pos'])
    neg_precision3 = nltk.metrics.precision(refsets['neg'], testsets_Decision['neg'])
    neg_recall3 = nltk.metrics.recall(refsets['neg'], testsets_Decision['neg'])

    return ['DecisionTree', accuracy3, pos_precision3, pos_recall3, neg_precision3, neg_recall3]
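
A minimal driver sketch for the helper above, assuming `train` and `test` are NLTK categorized corpus readers with 'neg'/'pos' categories and `featx` is a simple bag-of-words extractor (all names here are illustrative):

def bag_of_words(words):
    # mark each word as present; NLTK featuresets are plain dicts
    return {word: True for word in words}

print(evaluate_classifier_Decision(bag_of_words))
# -> ['DecisionTree', accuracy, pos_precision, pos_recall, neg_precision, neg_recall]
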
def main_function():
	conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter_analysis")
	hq_conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter")

	training_tweets = get_test_tweets(conn)
	training_feature_set = process_tweets(training_tweets)

	classifier = DecisionTreeClassifier.train(training_feature_set)

	test_tweets = get_training_tweets(conn)
	test_feature_set = process_tweets(test_tweets)

	classifier_accuracy = accuracy(classifier, test_feature_set)

	alt_full_matrix = {'+':{'+':0, '-':0, 'E':0}, 
				'-':{'+':0, '-':0, 'E':0}, 
				'E':{'+':0, '-':0, 'E':0}}

	#for f in test_tweets:
	#f = test_tweets[0]

	#print f
	#guess = classifier.classify(process_tweet(f[1]))
	#print guess
	#	update_tweet_polarity(f[0], guess, conn)
	##	pl = classifier.prob_classify(process_tweet(f[1]))
	#	idx = f[2]
	#	if idx == 'I' or idx == 'O':
	#		idx = 'E'
	#	alt_full_matrix[idx][guess] += 1

	#print alt_full_matrix

	print "classifier accuracy: " + repr(classifier_accuracy)
def decisionTree(features_train, features_test):
	print 'train on %d instances, test on %d instances' % (len(features_train), len(features_test))
	classifier = DecisionTreeClassifier.train(features_train,
												binary=True,
												entropy_cutoff=0.8,
												depth_cutoff=5,
												support_cutoff=30)
	print 'accuracy:', nltk.classify.util.accuracy(classifier, features_test)
	precisions, recalls = precision_recall(classifier, features_test)
	print "accuracy: ", precisions, "fitness: ", recalls
Example #4
def classify(inputdir):
    # filenames = os.listdir('d:\\shir\\')
    filenames = os.listdir(inputdir)

    feat_set = []
    sets = []
    for name in filenames:
        # print name
        lineno = 0
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print 'training', sense

        file = codecs.open(path, 'r', 'utf-8')
        allwords = []
        for line in file:
            if len(line.split()) > 2:
                lineno += 1
                line = line.strip()
                words = []
                tags = []
                tokens = line.split()

                for item in tokens:
                    if len(item.split('\\')) == 2:
                        word = item.split('\\')[0]
                        tag = item.split('\\')[1]
                        words.append(word)
                        tags.append(tag)
                        allwords.append(word)
                feat_set.append((bag_of_words(line), sense))
                # feat_set.append((get_feature2(line), sense))
            else:
                words = []
                tags = []
        file.close()

    # a single shuffle is enough to randomize the feature set
    random.shuffle(feat_set)

    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)
    # classifier = MaxentClassifier.train(train_data)
    nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30)
    # pickle.dump(classifier, classifier_save_file)
    entropy_classifier = MaxentClassifier.train(train_data, algorithm='iis', trace=0, max_iter=1, min_lldelta=0.5)
    print "nb accuracy " + str(accuracy(nb_classifier, test_data) * 100)
    print "dt accuracy " + str(accuracy(dt_classifier, test_data) * 100)
    print "entropy accuracy " + str(accuracy(entropy_classifier, test_data) * 100)
    mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, entropy_classifier)
    print "max vote accuracy " + str(accuracy(mv_classifier, test_data) * 100)
Example #5
def run(training):
    """
    To create and train a DecisionTreeClassifier
    :return: a trained Classifier
    """
    print "Training DT Classifier..."
    # feats = label_feat_from_corps(movie_reviews)
    # training, testing = split_label_feats(feats)

    dt_classifier = DecisionTreeClassifier.train(training, binary=True, entropy_cutoff=0.8, depth_cutoff=10, support_cutoff=30)
    print "DT Classifier trained..."
    return save_classifier(dt_classifier)
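
save_classifier is not shown; a minimal sketch, assuming it simply pickles the model to disk and hands it back (the filename is an assumption):

import pickle

def save_classifier(classifier, path='dt_classifier.pickle'):
    # persist the trained model so later runs can skip training
    with open(path, 'wb') as f:
        pickle.dump(classifier, f)
    return classifier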
Example #6
def trainDT(featuresets):
    #idx = 2*len(featuresets) / ratio
    #train_set, test_set = featuresets[idx:], featuresets[:idx]
    train_set = featuresets
    #max_iter=20
    classifier = DecisionTreeClassifier.train(train_set)
    #print accuracy(classifier, test_set)
    #classifier.show_most_informative_features(100)
    #train_set, test_set = featuresets[idx:], featuresets[:idx]
    #classifier.train(train_set, algo, max_iter=20)
    #print accuracy(classifier, test_set)
    #classifier.show_most_informative_features(100)
    return classifier
Example #7
    def classify_decision_tree(self):

        print "training decision tree"
        classifier = DecisionTreeClassifier.train(self.feature_vectors_tuples_for_train, depth_cutoff=200, entropy_cutoff=0.1)
        print "testing classifier"
        # batch_classify was renamed classify_many in NLTK 3
        classified_labels = classifier.classify_many(
            [feature_set_tuple[0] for feature_set_tuple in self.feature_vectors_tuples_for_test])
        correct = 0
        wrong = 0
        for i in range(len(classified_labels)):
            # compare labels with ==; `is` checks object identity and is
            # unreliable for strings
            if classified_labels[i] == self.feature_vectors_tuples_for_test[i][1]:
                correct += 1
            else:
                wrong += 1
        print correct, wrong
def decision_tree_classifier(feature_vector_train, feature_vector_test):
    features_train, topics_train = zip(*feature_vector_train)
    features_test, topics_test = zip(*feature_vector_test)

    # training: NLTK classifiers expect a list of (featureset, label) pairs,
    # so train on the labeled list rather than the bare feature dicts

    classifier2 = DecisionTreeClassifier.train(feature_vector_train, depth_cutoff=250, entropy_cutoff=0.1)

    # The entropy cutoff is kept to shorten training time (possibly at some
    # cost in accuracy); the depth cutoff of 250 likewise bounds the tree.

    # testing

    predicted_topics = classifier2.classify_many(features_test)

    # target_names must be an ordered sequence, so sort the label set
    print classification_report(topics_test, predicted_topics, target_names=sorted(set(topics_test)))
def dt_classify(filename):
    raw_sample_stream = get_samples_stream(filename)
    all_samples = list( binary_bow_feature(raw_sample_stream) )

    # filter out two classes of outliers
    # these two categories contain too few examples, so the word frequency in these two categories
    # cannot reflect the true probability
    # all_samples = [(features,aspect) for features,aspect in all_samples if aspect != common.AspectNothing and aspect != common.AspectBusiness]

    test_sample_ratio = 0.25
    train_samples,test_samples = split_samples(all_samples,test_sample_ratio)
    print "training set has {} samples, test set has {} samples".format(len(train_samples),len(test_samples))

    classifier = DecisionTreeClassifier.train(train_samples, binary=True, depth_cutoff=15, verbose=True)
    print "training completes"

    print "training accuracy: {}".format(accuracy(classifier,train_samples))
    print "test accuracy: {}".format(accuracy(classifier,test_samples))

    return classifier
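
split_samples and binary_bow_feature are defined elsewhere; a plausible sketch of split_samples, assuming it shuffles and holds out a fraction of the data for testing:

import random

def split_samples(samples, test_ratio, seed=None):
    # shuffle, then carve off the first test_ratio fraction as the test set
    samples = list(samples)
    random.Random(seed).shuffle(samples)
    n_test = int(len(samples) * test_ratio)
    return samples[n_test:], samples[:n_test]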
Example #10
def decision_tree(train_data):
    training_data = []
    for data in train_data:
        training_data.append(preprocess(data[0],label=data[1]))
    cl = DecisionTreeClassifier.train(training_data)
    return cl
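
preprocess is not shown; a hypothetical version consistent with the call above, returning an NLTK-style (featureset, label) pair from raw text:

from nltk.tokenize import word_tokenize

def preprocess(text, label=None):
    # bag-of-words featureset over lowercased tokens
    features = {token: True for token in word_tokenize(text.lower())}
    return (features, label)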
Example #11
def classify(inputdir):
    # filenames = os.listdir('d:\\shir\\')
    filenames = os.listdir(inputdir)

    feat_set = []
    sets = []
    # collect (sense, words) per file across the whole directory; initializing
    # this inside the loop would discard all but the last file's words
    labeledlist = []

    for name in filenames:
        # print name
        lineno = 0
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print 'training', sense

        file = codecs.open(path, 'r', 'utf-8')
        allwords = []
        for line in file:
            if len(line.split()) > 2:
                lineno += 1
                line = line.strip()
                words = []
                tags = []
                tokens = line.split()

                for item in tokens:
                    if len(item.split('\\')) == 2:
                        word = item.split('\\')[0]
                        tag = item.split('\\')[1]
                        words.append(word)
                        tags.append(tag)
                        allwords.append(word)
                feat_set.append((bag_of_bigrams_words(words), sense))
                # feat_set.append((context_feature(line), sense))
            else:
                words = []
                tags = []
        print lineno
        labeledlist.append((sense, allwords))

        # feat_set.append((bigram_feature(allwords), sense))
        file.close()

    high_info_words = set(high_information_words(labeledlist))
    for item in high_info_words:
        print item

    # a single shuffle is enough to randomize the feature set
    random.shuffle(feat_set)

    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)
    print "training on " + str(len(train_data)) + " instances"
    print "testing on " + str(len(test_data)) + " instances"
    # classifier = MaxentClassifier.train(train_data)
    # nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8, depth_cutoff=7, support_cutoff=10)
    # print dt_classifier.pp()
    # pickle.dump(classifier, classifier_save_file)
    entropy_classifier = MaxentClassifier.train(train_data, algorithm='iis', trace=0, max_iter=2, min_lldelta=0.5)
    print "nb accuracy "
    # print accuracy(nb_classifier, test_data) * 100
    # print "nb precision and recall"
    # print precision_recall(nb_classifier, test_data)

    # print nb_classifier.show_most_informative_features()
    # for item in nb_classifier.most_informative_features():
    #     print item
    # print "dt accuracy " + str(accuracy(dt_classifier, test_data) * 100)
    print "entropy accuracy " + str(accuracy(entropy_classifier, test_data) * 100)
Example #13
    def train(self, features_label):
        self._classifier = DecisionTreeClassifier.train(
            features_label, entropy_cutoff=0.05, depth_cutoff=200, support_cutoff=20
        )
        return None
'''
Created on Apr 25, 2010

@author: Ben
'''
from nltk import classify
from nltk.classify import DecisionTreeClassifier
from edu.zoller.nlp import common      

print 'Reading feature words...'
feature_words = common.read_tf_feature_words()

print 'Assembling training feature sets...'
train_set = []
for filename in common.train:
    year_class = common.get_40_year_class(filename)
    features = common.get_tf_features(filename, feature_words)
    train_set.append((features, year_class))
    
print 'Training classifier...'
classifier = DecisionTreeClassifier.train(train_set)
    
print 'Assembling test feature sets...'
test_set = []
for filename in common.test:
    year_class = common.get_40_year_class(filename)
    features = common.get_tf_features(filename, feature_words)
    test_set.append((features, year_class))

print 'Computing test accuracy...'
print classify.accuracy(classifier, test_set)
#print(len(X_test))
if not preprocess_flag:
    bow_transformer = joblib.load('FeatTransformer.pkl')
    X_train = joblib.load('TrainFeatures.pkl')
    X_test = joblib.load('TestFeatures.pkl')
else:
    bow_transformer = CountVectorizer(analyzer=format_sentence).fit(X_train)
    X_train = bow_transformer.transform(X_train)
    X_test = bow_transformer.transform(X_test)
    joblib.dump(bow_transformer, 'FeatTransformer.pkl')
    joblib.dump(X_train, 'TrainFeatures.pkl')
    joblib.dump(X_test, 'TestFeatures.pkl')

#train decision tree classifier
dt_flag = 0 #if 1, train model from scratch and dump - if 0, load dumped model
dt = DecisionTreeClassifier()  # sklearn.tree.DecisionTreeClassifier (fit/predict API), not the NLTK classifier
if dt_flag:
    dt_clf = dt.fit(X_train, Y_train)
    joblib.dump(dt_clf, 'DTmodel.pkl') 
else:
    dt_clf = joblib.load('DTmodel.pkl') 
#test dt classifier
preds = dt_clf.predict(X_test)
cm = confusion_matrix(Y_test, preds)
print(cm)
print('\n')
print(classification_report(Y_test, preds))
#plot_roc_curve(dt_clf,X_test,Y_test)
plt.figure()
plot_confusion_matrix(cm, classes=['negative', 'positive'], normalize=True, title='Normalized confusion matrix - Decision Tree')
plt.show()
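
plot_confusion_matrix is not defined in this snippet; a minimal sketch in the spirit of the scikit-learn docs helper, matching the call above (normalize, title):

import itertools
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    # optionally normalize each row to show per-class rates
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=45)
    plt.yticks(ticks, classes)
    # annotate each cell with its count or rate
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f' if normalize else 'd'),
                 horizontalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')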
Example #16
def getClassifier(tweetfile, cfg):
    degreesToUse = cfg['NLPnGrams']
    print "DEBOOOOO", degreesToUse, type(degreesToUse)
    classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ')
    shortClass = classMode.replace(' ', '').lower()
    loadNeeded = True

    if 'NLPTEST' not in cfg.keys():
        degreeString = '-'.join([str(degree) for degree in degreesToUse])
        pickleFile = 'nlpTrainers/' + tweetfile.replace(
            '.csv', '.' + shortClass + degreeString + '.pickle')
        if isfile(pickleFile):
            print "Loading pickled", shortClass, "classifier"
            fileIn = open(pickleFile)
            classifier = cPickle.load(fileIn)
            fileIn.close()
            loadNeeded = False

    if loadNeeded:
        if 'NLPTEST' in cfg.keys():
            content = prepText(tweetfile)
            categorized = prepClassifications(content)
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
        else:
            print "Loading content & preparing text"
            content = prepText(loadFile(tweetfile))
            print "Categorizing contents"
            categorized = prepClassifications(content)
            print "Deriving NGrams of length(s)", degreesToUse
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
            print "Compiling Results"
        readyToSend = []
        allCats = [str(key) for key in NGrammized.keys()]
        for category in allCats:
            readyToSend += NGrammized[category]

        print "Attempting Classification by mode", classMode, degreesToUse
        if classMode == 'naive bayes':
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }
        elif classMode == 'positive naive bayes':
            from nltk.classify import PositiveNaiveBayesClassifier
            classifier = {
                'class': PositiveNaiveBayesClassifier.train(readyToSend),
                'mode': 'pnb'
            }
        elif classMode == 'max ent':
            #import nltk.classify
            #from sklearn.linear_model import LogisticRegression
            #from nltk.classify import SklearnClassifier
            #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'}
            from nltk.classify import MaxentClassifier
            classifier = {
                'class': MaxentClassifier.train(readyToSend, algorithm='iis'),
                'mode': 'me'
            }
        elif classMode == 'decision tree':
            from nltk.classify import DecisionTreeClassifier
            classifier = {
                'class': DecisionTreeClassifier.train(readyToSend),
                'mode': 'dt'
            }
        elif classMode == 'svm':
            if "SVMOrder" in cfg.keys():
                priority = cfg['SVMOrder']
            else:
                priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210"
            if type(priority) is str:
                priority = list(priority)
            priority = [entry for entry in priority if entry in allCats]
            preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg)
            classifier = {
                'class': preppedSVM,
                'mode': 'svm',
                'priority': priority
            }
        else:
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }

        if 'NLPTEST' not in cfg.keys():
            print "Pickling Classifier"
            fileOut = open(pickleFile, 'wb')
            cPickle.dump(classifier, fileOut)
            fileOut.close()

    if 'NLPTEST' not in cfg.keys():
        if classMode != 'svm':
            classifier['class'].show_most_informative_features(n=150)
        """else:
		for key in classifier['class'].keys():
			print classifier		
			print classifier.keys()
			classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))"""

    return classifier
print('neg recall:', recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))

# Model #2: **UNIGRAMS** & Decision Tree

# In[26]:

#Making a decision tree model to compare which is the better performing model
import collections
from nltk import metrics
from nltk.metrics.scores import (accuracy, precision, recall, f_measure)
from nltk.classify import DecisionTreeClassifier
from nltk.classify.util import accuracy
dt_classifier = DecisionTreeClassifier.train(train_set,
                                             binary=True,
                                             entropy_cutoff=0.8,
                                             depth_cutoff=5,
                                             support_cutoff=30)
print(accuracy(dt_classifier, test_set))

# reset the reference/test sets so Model #1's predictions don't carry over
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (uni_featureset, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = dt_classifier.classify(uni_featureset)
    testsets[observed].add(i)

print('pos precision:', precision(refsets['pos'], testsets['pos']))
print('pos recall:', recall(refsets['pos'], testsets['pos']))
print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', precision(refsets['neg'], testsets['neg']))
print('neg recall:', recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))
Example #18
pickle.dump(train_feats, save_train_feats)
save_train_feats.close()

save_test_feats = open("pickled_algos/test_feats", "wb")
pickle.dump(test_feats, save_test_feats)
save_test_feats.close()

nb_classifier = NaiveBayesClassifier.train(train_feats)

print(accuracy(nb_classifier, test_feats))

save_nb_classifier = open("pickled_algos/nb_classifier", "wb")
pickle.dump(nb_classifier, save_nb_classifier)
save_nb_classifier.close()

dt_classifier = DecisionTreeClassifier.train(train_feats)

print(accuracy(dt_classifier, test_feats))

save_dt_classifier = open("pickled_algos/dt_classifier", "wb")
pickle.dump(dt_classifier, save_dt_classifier)
save_dt_classifier.close()

sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats)

print(accuracy(sk_classifier, test_feats))

save_sk_classifier = open("pickled_algos/sk_classifier", "wb")
pickle.dump(sk_classifier, save_sk_classifier)
save_sk_classifier.close()
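
A matching load path for these pickles, as a sketch (paths as above):

import pickle

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

test_feats = load_pickle('pickled_algos/test_feats')
nb_classifier = load_pickle('pickled_algos/nb_classifier')
dt_classifier = load_pickle('pickled_algos/dt_classifier')
sk_classifier = load_pickle('pickled_algos/sk_classifier')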
def words_bag(words):
    return dict([(word,True) for word in words])

neg_list = movie_reviews.fileids('neg')
pos_list = movie_reviews.fileids('pos')
 
negfeats = [(words_bag(movie_reviews.words(fileids=[f])), 'neg') for f in neg_list]
posfeats = [(words_bag(movie_reviews.words(fileids=[f])), 'pos') for f in pos_list]

'''gathering training and test data for decision trees''' 
negcutoff_train_dt = len(negfeats)
poscutoff_train_dt = len(posfeats)
training_data_dt = negfeats[:negcutoff_train_dt] + posfeats[:poscutoff_train_dt]


classifier_dt = DecisionTreeClassifier.train(training_data_dt)

print "Decision Trees"    
print 'train on %d instances:' % (len(training_data_dt))

sentence_list = []

#comments = "first half was good but oh boy the second was shit, overall good movie. no matter how many times I watch this, I still like it. must watch movie, i just want to touch the sweet panda"
while 1:
    comments = raw_input("Enter a review comment ending with a dot :")
    sentence_list = sent_tokenize(comments)
    for sentence in sentence_list:
        # build the feature dict from the sentence's tokens; words_bag expects
        # a word list, not the characters of a single word
        input_cl = words_bag(wordpunct_tokenize(sentence))
        print sentence + "--->" + classifier_dt.classify(input_cl)
def main():
    global tagger

    if constants.corpus == constants.Corpus.movie_review:
        neg_docs, pos_docs = get_movie_corpus()
    if constants.corpus == constants.Corpus.pol_debates:
        neg_docs, pos_docs = get_political_debates()

    if constants.mark_negation:
        neg_docs = [nltk.sentiment.util.mark_negation(doc) for doc in neg_docs]
        pos_docs = [nltk.sentiment.util.mark_negation(doc) for doc in pos_docs]

    # Split between the training set and the testing set
    num_train_neg = int(3 / 4 * len(neg_docs))
    num_test_neg = len(neg_docs) - num_train_neg
    num_train_pos = int(3 / 4 * len(pos_docs))
    num_test_pos = len(pos_docs) - num_train_pos

    # note: sklearn.cross_validation moved to sklearn.model_selection in newer scikit-learn
    train_neg, test_neg = sklearn.cross_validation.train_test_split(
        neg_docs, train_size=num_train_neg, test_size=num_test_neg)
    train_pos, test_pos = sklearn.cross_validation.train_test_split(
        pos_docs, train_size=num_train_pos, test_size=num_test_pos)

    # Make the final train set and test set
    train_docs = train_pos + train_neg
    test_docs = test_pos + test_neg

    # Set up the Sentiment Analyzer
    analyzer = SentimentAnalyzer()

    if constants.feature_extractor == constants.FeatureExtractor.bag_of_words:
        analyzer.add_feat_extractor(extract_bag_of_words_feats)
    if constants.feature_extractor == constants.FeatureExtractor.freq_dist:
        analyzer.add_feat_extractor(extract_freq_dist)
    elif constants.feature_extractor == constants.FeatureExtractor.unigram:
        all_words = analyzer.all_words(train_docs, labeled=True)
        unigram_features = analyzer.unigram_word_feats(all_words,
                                                       min_freq=1000)
        print("Length of unigram features: %d" % len(unigram_features))
        analyzer.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats,
                                    unigrams=unigram_features)
    elif constants.feature_extractor == constants.FeatureExtractor.bigram_bag_of_words:
        analyzer.add_feat_extractor(extract_sig_bigram_feats)
    elif constants.feature_extractor == constants.FeatureExtractor.adjective_bag_of_words:
        tagger = nltk.tag.HunposTagger(constants.hunpos_english_model)
        analyzer.add_feat_extractor(adjective_bag_of_words)
    elif constants.feature_extractor == constants.FeatureExtractor.pos_bag_of_words:
        tagger = nltk.tag.HunposTagger(constants.hunpos_english_model)
        analyzer.add_feat_extractor(adjective_bag_of_words)

    train_feat = list(analyzer.apply_features(train_docs, labeled=True))
    test_feat = list(analyzer.apply_features(test_docs, labeled=True))

    print('train on %d instances, test on %d instances' %
          (len(train_feat), len(test_feat)))

    if constants.classifier == constants.Classifier.naive_bays:
        classifier = NaiveBayesClassifier.train(train_feat)
        analyzer.evaluate(test_feat,
                          classifier,
                          accuracy=True,
                          f_measure=True,
                          precision=True,
                          recall=True,
                          verbose=True)
        classifier.show_most_informative_features()
    # elif constants.classifier == constants.Classifier.maxent:
    #     classifier = MaxentClassifier.train(train_feat)
    #     analyzer.evaluate(test_feat, classifier, accuracy=True, f_measure=True, precision=True, recall=True,
    #                       verbose=True)
    #     classifier.show_most_informative_features()
    elif constants.classifier == constants.Classifier.decision_tree:
        classifier = SklearnClassifier(
            DecisionTreeClassifier()).train(train_feat)  # sklearn's tree wrapped for NLTK
        analyzer.evaluate(test_feat,
                          classifier,
                          accuracy=True,
                          f_measure=True,
                          precision=True,
                          recall=True,
                          verbose=True)
    elif constants.classifier == constants.Classifier.linear_svm:
        classifier = SklearnClassifier(LinearSVC()).train(train_feat)
        analyzer.evaluate(test_feat,
                          classifier,
                          accuracy=True,
                          f_measure=True,
                          precision=True,
                          recall=True,
                          verbose=True)
    elif constants.classifier == constants.Classifier.random_forest:
        classifier = SklearnClassifier(
            RandomForestClassifier()).train(train_feat)
        analyzer.evaluate(test_feat,
                          classifier,
                          accuracy=True,
                          f_measure=True,
                          precision=True,
                          recall=True,
                          verbose=True)
    elif constants.classifier == constants.Classifier.logistic:
        classifier = SklearnClassifier(LogisticRegression()).train(train_feat)
        analyzer.evaluate(test_feat,
                          classifier,
                          accuracy=True,
                          f_measure=True,
                          precision=True,
                          recall=True,
                          verbose=True)
Example #22
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


featuresets = [(document_features(d), c) for (d, c) in documents]
train_set = featuresets[:1000]
test_set = featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("NaiveBayesClassifier Accuracy     =>" +
      str(nltk.classify.accuracy(classifier, test_set) * 100))
classifier.show_most_informative_features(5)

classifier = DecisionTreeClassifier.train(train_set,
                                          binary=False,
                                          entropy_cutoff=0.4,
                                          depth_cutoff=20,
                                          support_cutoff=50)
print("DecisionTreeClassifier Accuracy     =>" +
      str(nltk.classify.accuracy(classifier, test_set) * 100))
# To test this application, put text in 1.txt; make the text as large as possible, since the feature set is small (small dataset).
#InputList=[]
#with open("1.txt", 'r') as f:
#    for line in f:
#        for word in line.split():
#            InputList.append(word)
#            words.append(word)
#print(classifier.classify(document_features(InputList)))
Example #23
        neg_features.append(k)

negcutoff = len(neg_features)*3//4
poscutoff = len(pos_features)*3//4
trainfeats = neg_features[:negcutoff] + pos_features[:poscutoff]
testfeats = neg_features[negcutoff:] + pos_features[poscutoff:]
print ('\n')
print('Total Training Instances - '+ str(len(trainfeats)))
print( 'Total Testing Instances - ' + str(len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print ('\n')
print('NaiveBayesClassifier accuracy:', nltk.classify.util.accuracy(classifier, testfeats))


classifier1 = DecisionTreeClassifier.train(trainfeats,entropy_cutoff=0)
print ('\n')
print('DecisionTreeClassifier accuracy:', nltk.classify.util.accuracy(classifier1, testfeats))

feature_names = ["polarity_nature","polarity_value"]
X = df[feature_names]
X.loc[:, "polarity_nature"] = X.polarity_nature.apply(lambda i: 0.0 if i == "neutral" else (1.0 if i == "positive" else -1.0))
df["status1"] = df.status.apply(lambda i: 0.0 if i == ({u'fair': u'neutral'}, 1) else (1.0 if i == ({u'fair': u'positive'}, 1) else -1.0))
y = df.status1
print (y.head())

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,test_size=0.2)
print (len(X_train), len(X_test))


linreg = LinearRegression()
tgd=brown.tagged_words(categories="news")

'''
print tgd[:3]
[(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL')]
'''
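
pos_feats is not defined in this snippet; a hypothetical version consistent with the "endswith ..." features visible in the sample output below (the suffix inventory here is illustrative):

from nltk import FreqDist
from nltk.corpus import brown

suffix_fdist = FreqDist(w.lower()[-3:] for w in brown.words())
common_suffixes = [s for s, _ in suffix_fdist.most_common(100)]

def pos_feats(word):
    # boolean "endswith <suffix>" features for each common suffix
    return dict(('endswith %s' % suffix, word.lower().endswith(suffix))
                for suffix in common_suffixes)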
feats = [(pos_feats(w), c) for (w, c) in tgd]
lens = int(len(feats) * 0.2)
train_set, test_set = feats[lens:], feats[:lens]
'''
print train_set[:10] shows (featureset, tag) pairs, where each featureset maps
"endswith ..." suffix features to booleans, e.g.:

[({u'endswith med': False, u'endswith mee': False, ..., u'endswith kup': False}, u'AT'),
 ({...}, u'CD'), ({...}, u'NN'), ({...}, u'NN'), ({...}, u'VBG'),
 ({...}, u'IN'), ({...}, u'NN-TL'), ({...}, u'NP'), ({...}, u'CS'), ({...}, u'NN')]
'''
clf = DecisionTreeClassifier.train(train_set)
print accuracy(clf, test_set)
print clf.classify(pos_feats("dogs"))
#0.144952759821
#NN

'''
for w in brown.words()[:10]:
    print w
    print w[-2:]

The
he
Fulton
on
County
...
'''
Example #25
    def train(self, reviews_file):
        """ Trains a classifier based on drug reviews with ratings

        Args:
            reviews_file: Reviews file to use for training.
        """
        ## Parse data from files
        reviews = self.parse_reviews(reviews_file)

        with open('stopwords.txt') as stop_words_file:
            text = self.clean_text(stop_words_file.read())
            stop_words = text.splitlines()

        ## Parse and convert positive and negative examples
        positive_comments = []
        negative_comments = []

        for review in reviews:
            comment = review['comment']
            rating = review['rating']

            comment = self.format_text(comment, stop_words)

            if float(rating) <= self.negative_threshold:
                negative_comments.append((comment, 'neg'))
            if float(rating) >= self.positive_threshold:
                positive_comments.append((comment, 'pos'))

        seed = 123
        numpy.random.seed(seed)

        print("Total Negative Instances:" + str(len(negative_comments)))
        print("Total Positive Instances:" + str(len(positive_comments)))

        # fraction 1.0: use every comment (kept as a tunable split ratio)
        negcutoff = math.floor(len(negative_comments) * 1)
        poscutoff = math.floor(len(positive_comments) * 1)

        neg_idx_train = sorted(
            random.sample(range(len(negative_comments)), negcutoff))
        neg_train = [negative_comments[i] for i in neg_idx_train]

        pos_idx_train = sorted(
            random.sample(range(len(positive_comments)), poscutoff))
        pos_train = [positive_comments[i] for i in pos_idx_train]

        dataset = neg_train + pos_train

        comments = [x[0] for x in dataset]
        ratings = [x[1] for x in dataset]
        kfold = StratifiedKFold(n_splits=self.iterations,
                                shuffle=True,
                                random_state=seed)
        cvscores = []
        for train, test in kfold.split(comments, ratings):
            train_data = []
            for item in train:
                train_data.append(dataset[item])
            test_data = []
            for item in test:
                test_data.append(dataset[item])

            if self.classifier_type == 'nb':
                self.model = NaiveBayesClassifier.train(train_data)
            elif self.classifier_type == 'dt':
                self.model = DecisionTreeClassifier.train(train_data)

            scores = nltk.classify.util.accuracy(self.model, test_data)
            print("{}%".format(scores * 100))
            cvscores.append(scores * 100)
            # plot_model(model, to_file='model.png')

            if self.classifier_type == 'nb':
                self.model.show_most_informative_features()

        print("%.2f%% (+/- %.2f%%)" %
              (numpy.mean(cvscores), numpy.std(cvscores)))