Example #1
def train(test=False):

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')


    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]


    if test:
        negcutoff = len(negfeats)*3/4
        poscutoff = len(posfeats)*3/4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

        classifier = NaiveBayesClassifier.train(trainfeats)
        print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)

        classifier.show_most_informative_features()

    else:
        return NaiveBayesClassifier.train(negfeats+posfeats)
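Example #1 relies on a word_feats helper and on imports that are not shown in the snippet. A minimal sketch of what they typically look like, assuming the standard bag-of-words form used elsewhere on this page (the original project's definitions may differ):

import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

def word_feats(words):
    # Bag-of-words features: map every token to True.
    return dict((word, True) for word in words)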
def train_and_show_results(pos, neg, pos_bigrams, neg_bigrams, pos_control, neg_control, pos_control_bigrams, neg_control_bigrams):
    if pos_control is None or neg_control is None or pos_control_bigrams is None or neg_control_bigrams is None:
        negcutoff = len(neg)*3/4
        poscutoff = len(pos)*3/4
        neg_bigrams_cutoff = len(neg_bigrams)*3/4
        pos_bigrams_cutoff = len(pos_bigrams)*3/4
        test_bag_of_words = neg[negcutoff:] + pos[poscutoff:]
        test_bigrams = neg_bigrams[neg_bigrams_cutoff:] + pos_bigrams[pos_bigrams_cutoff:]
        train_corpora_bag_of_words = neg[:negcutoff] + pos[:poscutoff]
        train_corpora_bigrams = neg_bigrams[:neg_bigrams_cutoff] + pos_bigrams[:pos_bigrams_cutoff]
    else:
        test_bag_of_words = neg_control + pos_control
        test_bigrams = neg_control_bigrams + pos_control_bigrams
        train_corpora_bag_of_words = neg+pos
        train_corpora_bigrams = neg_bigrams + pos_bigrams
    
    print "negative corpus: ", len(neg) 
    print "positive corpus: ", len(pos)

    if neg_control is not None:
        print "negative test corpus: ", len(neg_control) 
        print "positive test corpus: ", len(pos_control)

    print 'bag of words and bigrams - Naive Bayes' 
    naive_bayes = NaiveBayesClassifier.train(train_corpora_bag_of_words)
    naive_bayes_bigrams = NaiveBayesClassifier.train(train_corpora_bigrams)
   
    save_dataset('naive_bayes.dat', naive_bayes)
    save_dataset('naive_bayes_bigrams.dat', naive_bayes_bigrams)
    
    print 'bag of words and bigrams - Maximum Entropy' 
    maximum_entropy = nltk.MaxentClassifier.train(train_corpora_bag_of_words, max_iter=2)
    maximum_entropy_bigrams = nltk.MaxentClassifier.train(train_corpora_bigrams, max_iter=2)
    
    save_dataset('maximum_entropy.dat', maximum_entropy)
    save_dataset('maximum_entropy_bigrams.dat', maximum_entropy_bigrams)

    print 'Naive Bayesian results'
    print 'bag of words' 
    print 'Accuracy:', nltk.classify.util.accuracy(naive_bayes, test_bag_of_words)
    naive_bayes.show_most_informative_features()  
    print_precision_recall(naive_bayes, test_bag_of_words) 


    print '\nbigrams'
    print 'Accuracy:', nltk.classify.util.accuracy(naive_bayes_bigrams, test_bigrams)
    naive_bayes_bigrams.show_most_informative_features()  
    print_precision_recall(naive_bayes_bigrams, test_bigrams) 

    print 'Maximum Entropy results'
    print 'bag of words' 
    print 'Accuracy:', nltk.classify.util.accuracy(maximum_entropy, test_bag_of_words)
    maximum_entropy.show_most_informative_features()  
    print_precision_recall(maximum_entropy, test_bag_of_words) 


    print '\nbigrams'
    print 'Accuracy:', nltk.classify.util.accuracy(maximum_entropy_bigrams, test_bigrams)
    maximum_entropy_bigrams.show_most_informative_features()  
    print_precision_recall(maximum_entropy_bigrams, test_bigrams) 
    def cross_validation(self):
        #10 fold cross validation is performed
        train_feats_count = int(len(self.training_feats))
        fold_size = int(train_feats_count / self.k_fold)
        nb_accuracy_list = []
        svm_accuracy_list = []
        nb_f_val_list = []
        svm_f_val_list = []

        for a in range(self.k_fold):
            start_index = a * fold_size
            end_index = start_index + fold_size

            train_features = self.training_feats[:start_index] + self.training_feats[end_index:]
            test_features  = self.training_feats[start_index:end_index] 
            
            self.nb_classifier = NaiveBayesClassifier.train(train_features)         
            nb_acc = nltk.classify.util.accuracy(self.nb_classifier, test_features) 
            nb_accuracy_list.append(nb_acc)
       
            self.svm_classifier = SklearnClassifier(LinearSVC()) 
            self.svm_classifier.train(train_features)
            svm_acc = nltk.classify.util.accuracy(self.svm_classifier, test_features) 
            svm_accuracy_list.append(svm_acc)

            #Find F-Measure
            nb_f_val = self.compute_measures(test_features, self.nb_classifier)
            nb_f_val_list.append(nb_f_val)
            svm_f_val = self.compute_measures(test_features, self.svm_classifier)
            svm_f_val_list.append(svm_f_val)

        self.logging.info('Average accuracy of Naive Bayes Classifier %s\n' % (float(sum(nb_accuracy_list)/len(nb_accuracy_list))))
        self.logging.info('Average accuracy of SVM Classifier %s\n' % (float(sum(svm_accuracy_list)/len(svm_accuracy_list))))
        self.logging.info('Average F measure of Naive Bayes Classifier %s\n' % (float(sum(nb_f_val_list)/len(nb_f_val_list))))
        self.logging.info('Average F measure of SVM Classifier %s\n' % (float(sum(svm_f_val_list)/len(svm_f_val_list))))
    def classification(self):
        # Training NB classifier
        self.nb_classifier = NaiveBayesClassifier.train(self.training_feats)

        # Training SVM classifier
        self.svm_classifier = SklearnClassifier(LinearSVC())
        self.svm_classifier.train(self.training_feats)
Example #5
def evaluate_classifier(featx):
    sportsfeats = [(featx(tweet[0]), tweet[1]) for tweet in Sports_Tweet]
    politicsfeats = [(featx(tweet[0]), tweet[1]) for tweet in Politics_Tweet]
    
    sportscutoff = len(sportsfeats)*3/4
    politicscutoff = len(politicsfeats)*3/4
 
    trainfeats = sportsfeats[:sportscutoff] + politicsfeats[:politicscutoff]
    testfeats = sportsfeats[sportscutoff:] + politicsfeats[politicscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'sports precision:', nltk.metrics.precision(refsets['Sports'], testsets['Sports'])
    print 'sports recall:', nltk.metrics.recall(refsets['Sports'], testsets['Sports'])
    print 'politics precision:', nltk.metrics.precision(refsets['Politics'], testsets['Politics'])
    print 'politics recall:', nltk.metrics.recall(refsets['Politics'], testsets['Politics'])
    classifier.show_most_informative_features()
    return classifier
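evaluate_classifier above takes the feature extractor as a parameter (featx). Besides a plain bag-of-words extractor, a bigram-augmented extractor is a common choice to pass in; a hedged sketch of one possible version (the function name and the n=200 cutoff are illustrative, not taken from the original project):

import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # Keep the top-n scoring bigrams alongside the individual words.
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict((ngram, True) for ngram in itertools.chain(words, bigrams))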
Example #6
def main():
 
#    vote_file = '/Users/nasrallah/Desktop/Insight/courtcast/data/supreme_court_dialogs_corpus_v1.01/supreme.votes.txt'
#    votes = get_justice_votes(vote_file)
#    for v in votes: print(v, votes[v])
    
#    win_file = '/Users/nasrallah/Desktop/Insight/courtcast/data/supreme_court_dialogs_corpus_v1.01/supreme.outcome.txt'
#    winners = get_winners(win_file)
#    for w in winners: print(w, winners[w])

    text_file = '/Users/nasrallah/Desktop/Insight/courtcast/data/supreme_court_dialogs_corpus_v1.01/supreme.conversations.txt'
    #text_file = '/Users/nasrallah/Desktop/some_text.txt'
    
    ## Extract the feature sets
    feature_sets = get_training_features(text_file)
    
    ## Shuffle the features to mix up pos and neg
    #random.shuffle(feature_sets)
    
    ## Separate into train and test sets 
    cutoff = int(len(feature_sets)*3/4)
    train_feature_sets = feature_sets[:cutoff]
    test_feature_sets = feature_sets[cutoff:]
    print('train on %d instances, test on %d instances' % (len(train_feature_sets), len(test_feature_sets)))
 
    classifier = NaiveBayesClassifier.train(train_feature_sets)
    print('accuracy:', nltk.classify.util.accuracy(classifier, test_feature_sets))
    classifier.show_most_informative_features()  
Example #7
def evaluate_classifier(featx):
    #negids = movie_reviews.fileids('neg')
    #posids = movie_reviews.fileids('pos')
    
    ##For Movie Review train:
    #negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    #posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
     
    ##For product reviews train:
    negfeats = [(featx([wrd for wrd in nltk.word_tokenize(con) if wrd not in stpwrds]), 'neg') for con in traincons]
    posfeats = [(featx([wrd for wrd in nltk.word_tokenize(pro) if wrd not in stpwrds]), 'pos') for pro in trainpros]
    
    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4
 
    # NOTE: training uses all features, so testfeats overlaps the training data
    # and the reported accuracy is optimistic.
    trainfeats = negfeats[:] + posfeats[:]
    #trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()
    return classifier
    def __init__(self):
        # neg_phrases = filter_negative_phrases(load_csv_sentences('thoughtsandfeelings.csv'))
        # pos_phrases = filter_positive_phrases(load_csv_sentences('spiritualforums.csv'))
        neg_file = open("neg_phrases.txt", "r")
        pos_file = open("pos_phrases.txt", "r")
        neg_phrases = neg_file.readlines()
        pos_phrases = pos_file.readlines()

        neg_phrases_tagged = []
        pos_phrases_tagged = []
        for phrase in neg_phrases:
            neg_phrases_tagged.append((word_feats(phrase.split()), 'suicidal'))
        for phrase in pos_phrases:
            pos_phrases_tagged.append((word_feats(phrase.split()), 'alright'))

        negcutoff = int(len(neg_phrases_tagged) * .8)
        poscutoff = int(len(pos_phrases_tagged) * .8)

        trainfeats = neg_phrases_tagged[:negcutoff] + pos_phrases_tagged[:poscutoff]
        testfeats = neg_phrases_tagged[negcutoff:] + pos_phrases_tagged[poscutoff:]
        print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

        self.classifier = NaiveBayesClassifier.train(trainfeats)
        print 'accuracy:', nltk.classify.util.accuracy(self.classifier, testfeats)
        self.classifier.show_most_informative_features()
def naiveBayes(features_train, features_test):
	print 'train on %d instances, test on %d instances' % (len(features_train), len(features_test))
	classifier = NaiveBayesClassifier.train(features_train)
	print 'accuracy:', nltk.classify.util.accuracy(classifier, features_test)
	classifier.show_most_informative_features()	
	precisions, recalls = precision_recall(classifier, features_test)
	print "precision: ", precisions, "recall: ", recalls
Example #10
    def classify(self):
        # Classify

        articles = Article.objects.filter(entity=self.entity)

        def word_feats(body):
            words = body.split(" ")
            return dict([(word, True) for word in words])

        negids = articles.filter(score__lt=0)
        posids = articles.filter(score__gt=0)

        negfeats = [(word_feats(a.body), "neg") for a in negids]
        posfeats = [(word_feats(a.body), "pos") for a in posids]

        negcutoff = len(negfeats) * 3 / 4
        poscutoff = len(posfeats) * 3 / 4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
        print "train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats))

        classifier = NaiveBayesClassifier.train(trainfeats)
        print "accuracy:", nltk.classify.util.accuracy(classifier, testfeats)
        classifier.show_most_informative_features()
	def __init_naive_bayes(self):
		"""
		__init_naive_bayes(self):
		Gets the data from the positive, negative and neutral text files.
		Creates and trains the Naive Bayes classifier, using the data, so 
		that it can learn what constitutes a positive, negative or neutral tweet.
		"""
		
		try:
			pos_file = pjoin(sys.path[0], "sentiment_word_files", "tweets_positive.txt")
			f = codecs.open(pos_file, mode="rU", encoding='utf-8')
			positive = [line.lower().replace("\n" , " ") for line in f]
			positive = "".join(word[:] for word in positive).split()
			f.close()
		
			neu_file = pjoin(sys.path[0], "sentiment_word_files", "tweets_neutral.txt")
			f = codecs.open(neu_file, mode="rU", encoding='utf-8')
			neutral = [line.lower().replace("\n" , " ") for line in f]
			neutral = "".join(word[:] for word in neutral).split()
			f.close()
		
			neg_file = pjoin(sys.path[0], "sentiment_word_files", "tweets_negative.txt")
			f = codecs.open(neg_file, mode="rU", encoding='utf-8')
			negative = [line.lower().replace("\n" , " ") for line in f]
			negative = "".join(word[:] for word in negative).split()
			f.close()
		
			posfeats = [(dict({word.lower() : True}), 'pos') for word in positive if self.__check_word(word)]
			neufeats = [(dict({word.lower() : True}), 'neu') for word in neutral if self.__check_word(word)]
			negfeats = [(dict({word.lower() : True}), 'neg') for word in negative if self.__check_word(word)]
		
			self.classifier = NaiveBayesClassifier.train( posfeats + neufeats + negfeats )
		
		except:
			raise Exception ("Unknown error in SentimentAnalyzer::__init_naive_bayes")
Example #12
def generate_sentiment_classifier(corpus, word_feats):
    negids = corpus.fileids('neg')
    posids = corpus.fileids('pos')
    negfeats = [(word_feats(corpus.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(corpus.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 

    classifier = NaiveBayesClassifier.train(trainfeats)

    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)



    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()

    return classifier
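A possible way to call generate_sentiment_classifier, assuming the NLTK movie_reviews corpus and a simple bag-of-words word_feats helper; both are assumptions, since neither appears in the snippet itself:

from nltk.corpus import movie_reviews

def word_feats(words):
    return dict((word, True) for word in words)

classifier = generate_sentiment_classifier(movie_reviews, word_feats)
label = classifier.classify(word_feats('a gripping and beautifully acted film'.split()))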
Example #13
def main():
    org_names = Org.objects.values_list('name', flat=True)

    users = User.objects.filter(likely_org=False)
    user_names = [user.get_name for user in users]
    # Exclude the users we know are orgs (exact same name). This mostly gets run the first time and for new users with org names
    non_org_user_names = set(user_names) - set(org_names)

    org_features = [(word_features(name), 'org') for name in org_names]
    user_features = [(word_features(name), 'user') for name in non_org_user_names]

    classifier = NaiveBayesClassifier.train(user_features + org_features)

    counter = 0

    likely_orgs = []

    for user in users:
        prediction = classifier.prob_classify(word_features(user.get_name))
        if prediction.max() == 'org':
            # Log probability ratio, so if P(org) == 2.4 and P(user) == 0.3 then log2(P(org)/P(user)) = log2(8.0) = 3.0
            ratio = math.log(((float(prediction.prob('org')) + NORMALIZING_CONST) / (float(prediction.prob('user')) + NORMALIZING_CONST)), 2)
            if ratio >= MIN_RATIO and user.likely_org == False and user.admin_classification != 'user':
                log.info('User ID %d with name "%s" is probably an org. Saving.' % (user.id, user.get_name))
                user.likely_org = True
                user.org_probability = ratio
                user.save()
                counter += 1

    log.info("Processed %d users with org-like names" % counter)
def create_train_classifier():
    print "Recreating training classifier"
    corpus_dir = nltk.data.find(TRAIN_DATASET_LOC)
    train_data = nltk.corpus.CategorizedPlaintextCorpusReader(corpus_dir, fileids='.*\.txt',cat_pattern="(pos|neg)")
        

    negids_train = train_data.fileids('neg')
    posids_train = train_data.fileids('pos')
        
    # negids_movies = movie_reviews.fileids('neg')
    # posids_movies = movie_reviews.fileids('pos')

    negfeats = [(__word_feats_neg(train_data.words(fileids=[f])), 'neg') for f in negids_train]
    posfeats = [(__word_feats_pos(train_data.words(fileids=[f])), 'pos') for f in posids_train]

    # negfeats.extend([(__word_feats_neg(movie_reviews.words(fileids=[f])), 'neg') for f in negids_movies])
    # posfeats.extend([(__word_feats_pos(movie_reviews.words(fileids=[f])), 'pos') for f in posids_movies])

    trainfeats = negfeats + posfeats

    classifier = NaiveBayesClassifier.train(trainfeats)
    
    pos_file_name = 'pickles'+os.sep+'positive_train.pickle'
    neg_file_name = 'pickles'+os.sep+'negative_train.pickle'
    class_file_name = 'pickles'+os.sep+'nbClassifier.pickle'
    
    __write_file(pos_file_name,cPickle.dumps(posfeats))
    __write_file(neg_file_name,cPickle.dumps(negfeats))
    __write_file(class_file_name,cPickle.dumps(classifier))
    print "Done!"
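The pickles written by create_train_classifier can be loaded back later so the classifier does not have to be retrained. A small sketch under the same file-name assumptions; it presumes __write_file stored the cPickle.dumps output verbatim (on Python 3, pickle replaces cPickle):

import os
import cPickle

def load_train_classifier():
    # Load the NaiveBayesClassifier serialized by create_train_classifier().
    with open('pickles' + os.sep + 'nbClassifier.pickle', 'rb') as f:
        return cPickle.loads(f.read())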
Example #15
def naivebayes(trainfeats, testfeats):
	classifier = NaiveBayesClassifier.train(trainfeats)
	print "NaiveBayes output"
	print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

	print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
	classifier.show_most_informative_features()
Example #16
def evaluateFeatures(featureSelect):
    posFeatures = []
    negFeatures = []
   
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [featureSelect(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [featureSelect(negWords), 'neg']
            negFeatures.append(negWords)

    
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    #trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    trainFeatures = posFeatures + negFeatures
    print testFeatures[0]
    classifier = NaiveBayesClassifier.train(trainFeatures)  

    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set) 

    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        #print features
        #print predicted
        testSets[predicted].add(i)  
def evaluate_classifier_Naive(featx):
    
    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')
    test_negids = test.fileids('neg')
    test_posids = test.fileids('pos')
    train_negfeats = [(featx(train.words(fileids=[f])), 'neg') for f in train_negids]
    train_posfeats = [(featx(train.words(fileids=[f])), 'pos') for f in train_posids]
    test_negfeats = [(featx(test.words(fileids=[f])), 'neg') for f in test_negids]
    test_posfeats = [(featx(test.words(fileids=[f])), 'pos') for f in test_posids]
    trainfeats = train_negfeats + train_posfeats
    testfeats = test_negfeats + test_posfeats

    Naive_classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets_Naive = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)           
            observed_Naive = Naive_classifier.classify(feats)
            testsets_Naive[observed_Naive].add(i)
            
    accuracy1 = nltk.classify.util.accuracy(Naive_classifier, testfeats)  
    pos_precision1 = nltk.metrics.precision(refsets['pos'], testsets_Naive['pos'])
    pos_recall1 = nltk.metrics.recall(refsets['pos'], testsets_Naive['pos'])
    neg_precision1 = nltk.metrics.precision(refsets['neg'], testsets_Naive['neg'])
    neg_recall1 = nltk.metrics.recall(refsets['neg'], testsets_Naive['neg'])

    Naive_classifier.show_most_informative_features(50)

    return(['NaiveBayes',accuracy1,pos_precision1,pos_recall1,neg_precision1,neg_recall1])
Example #18
def classify():
    #corpus = 'Cornell_text_polarity'
    #corpus = 'BingLiu_selected_sentences'
    corpus = 'Cornell_sentence_polarity'
    cases = load_corpus(corpus)
    features = get_word_features(cases)

    train_feats = []
    test_feats = []
    for polarity, feats in features.items():
        #cutoff = len(feats) * 1 / 4
        cutoff = 1000
        print polarity, 'number of train:', cutoff
        #train_feats += feats[:cutoff]
        #test_feats += feats[cutoff:]
        temp_feats = feats[:]
        random.shuffle(temp_feats)
        train_feats += temp_feats[:cutoff]
        test_feats += temp_feats[cutoff:]

    print 'train on %d instances, test on %d instances' % (len(train_feats), len(test_feats))

    classifier = NaiveBayesClassifier.train(train_feats)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test_feats)
    classifier.show_most_informative_features()
    def classification(self):

        fstruct = FeatStruct(self.train_reviews)
        classifier = NaiveBayesClassifier.train(fstruct)

        print 'accuracy:', nltk.classify.util.accuracy(classifier, self.test_reviews)
        classifier.show_most_informative_features() 
def main_function():
	conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'], 
			user=DATABASES['ensemble']['USER'], 
			passwd=DATABASES['ensemble']['PASSWORD'], 
			db=DATABASES['ensemble']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = classify.process_tweets(training_tweets)
	classifier = NaiveBayesClassifier.train(training_feature_set)

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	
	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	tweets = classify.get_tweets_to_classify(conn);

	for tweet in tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn)
		count_table[guess] += 1

	#fix_manual_tweets(conn_analysis)
	classify.run_sql(conn, classify.Statements.UPDATE_MANUAL_CLASSIFIED)

	print count_table
	# print full_matrix  # 'full_matrix' is not defined in this function and would raise a NameError
Example #21
    def train_with_movie_db(self):
        """
        Training possible with movie reviews
        - this does not yield particularly good results
        """
        self.use_movie_reviews = True

        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        negfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "negative") for f in negids]
        posfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "positive") for f in posids]

        negcutoff = len(negfeats) * 3 / 4
        poscutoff = len(posfeats) * 3 / 4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

        self.classifier = NaiveBayesClassifier.train(trainfeats)

        DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
        DLOG(self.classifier.show_most_informative_features())
Example #22
def main():
    articles = CategorizedPlaintextCorpusReader(corpusdir, '.*', cat_pattern = r'(.*)[/]')
    feats = {}
    trainfeats = []
    testfeats = []
    for cat in articles.categories():
        wow = len([f for f in articles.fileids(cat)]) # such variable name
        print "for category", cat, ":", wow
        feats[cat] = [(word_feats(articles.words(fileids = [f])), cat) for f in articles.fileids(cat)]
        cutoff = wow - hold_back(wow)
        trainfeats.append(feats[cat][:cutoff])
        testfeats.append(feats[cat][cutoff:])

    train = [item for sublist in trainfeats for item in sublist]
    test = [item for sublist in testfeats for item in sublist]

    print 'train on %d instances, test on %d instances' % (len(train), len(test))

    classifier = NaiveBayesClassifier.train(train)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test)
    classifier.show_most_informative_features() # I don't understand the output for more than 2 categories :(

    # load with:
    # import pickle
    # f = open('my_classifier.pickle')
    # classifier = pickle.load(f)
    # f.close()
    with open('../data/classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
Example #23
def main():
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    classifier = NaiveBayesClassifier.train(trainfeats)

    with open("output.json") as fin:
        sid = SentimentIntensityAnalyzer()
        data = json.load(fin)
    for key in data:
        reviews = data[key]["reviews"]
        for i in range(len(reviews)):
            text = reviews[i]["review"]
            sentiment_dict = {'positive_probability':0, 'label':'', 'negative_probability':0}
            prob = classifier.prob_classify(word_feats(text.split(" ")))
            classification = classifier.classify(word_feats(text.split(" ")))
            sentiment_dict['positive_probability'] = prob.prob('pos')
            sentiment_dict['negative_probability'] = prob.prob('neg')
            sentiment_dict['label'] = classification
            reviews[i]["sentiment"] = sentiment_dict
        data[key]["reviews"] = reviews
    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
Example #24
def main(argv):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    #print negids
 
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive') for f in posids]

    trainfeats =  posfeats+negfeats
    #print trainfeats
    #    break
    classifier = NaiveBayesClassifier.train(trainfeats)

    #classifier = pickle.load(open("classifier.p", "rb"))
    topicList = ["media", "sports", "news", "fashion", "finance", "politics"]
    for line in sys.stdin:
        try:
            tolk_posset = word_tokenize(line.rstrip())
            d = word_feats(tolk_posset)
            for topic in topicList:
                subjectFull = subj(line, topic)
                if not subjectFull == "No match":
                    #print d
                    print "LongValueSum:" + "" + str(line.split(":")[0])+","+subjectFull + "," + classifier.classify(d) + "\t" + "1"                    
        except:
                #print "Error"
                continue
Example #25
def init():
    # create our dict of training data
    texts = {}
    texts['traffic'] = 'traffic-corpus.txt'
    texts['useless'] = 'useless-corpus.txt'

    #holds a dict of features for training our classifier
    train_set = []

    # loop through each item, grab the text, tokenize it and create a training feature with it
    for sense, file in texts.iteritems():
        print "training %s " % sense
        text = open(file, 'r').read()
        features = extract_words(text)
        train_set = train_set + [(get_feature(word), sense) for word in features]

    classifier = NaiveBayesClassifier.train(train_set)

    # uncomment out this line to see the most informative words the classifier will use
    classifier.show_most_informative_features(20)

    # uncomment out this line to see how well our accuracy is using some hand curated tweets
    # run_classifier_tests(classifier)

    return classifier
Example #26
def train():
    # get impact for documents for which it has not been computed yet
    for document in Document.objects.filter(sentiment__isnull=True):
        get_impact(document, settings.TIME)

    known_data = Document.objects.filter(sentiment__isnull=False)
    known_data_count = known_data.count()
    if known_data_count == 0:
        print('known_data_count == 0')
        return None, 0

    # 2/3 training data
    num_training_data = int(round(2 * known_data_count / 3))
    training_feats = []
    for document in known_data.order_by('id')[:num_training_data]:
        text = get_nltktext(document.text)
        training_feats.append((word_feats(text), document.sentiment))

    classifier = NaiveBayesClassifier.train(training_feats)

    # 1/3 test_data
    num_testing_data = int(round(known_data_count / 3))
    testing_feats = []
    for document in known_data.order_by('-id')[:num_testing_data]:
        text = get_nltktext(document.text)
        testing_feats.append((word_feats(text), document.sentiment))

    print('train on %d instances, test on %d instances' % (len(training_feats), len(testing_feats)))
    accuracy = nltk.classify.util.accuracy(classifier, testing_feats)
    return classifier, accuracy
    def train(self, graphs):
        """
        Trains a ``NaiveBayesClassifier`` using the edges present in
        graphs list as positive examples, the edges not present as
        negative examples.  Uses a feature vector of head-word,
        head-tag, child-word, and child-tag.

        :type graphs: list(DependencyGraph)
        :param graphs: A list of dependency graphs to train the scorer.
        """

        from nltk.classify import NaiveBayesClassifier

        # Create training labeled training examples
        labeled_examples = []
        for graph in graphs:
            for head_node in graph.nodes.values():
                for child_index, child_node in graph.nodes.items():
                    if child_index in head_node['deps']:
                        label = "T"
                    else:
                        label = "F"
                    labeled_examples.append(
                        (
                            dict(
                                a=head_node['word'],
                                b=head_node['tag'],
                                c=child_node['word'],
                                d=child_node['tag'],
                            ),
                            label,
                        )
                    )

        self.classifier = NaiveBayesClassifier.train(labeled_examples)
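Each labeled example built in the loop above is a four-feature dict plus a 'T'/'F' label. For reference, an illustration of one such pair (the word and tag values here are made up):

example = (
    dict(a='saw',   # head word
         b='VBD',   # head tag
         c='dog',   # child word
         d='NN'),   # child tag
    'T',            # 'T' if the head-child edge exists in the graph, else 'F'
)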
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
 
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    inposFeatures = []
    innegFeatures = []
	#http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
	#breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)
    """
    with open(RT_INPUT_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            inposWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            inposWords = [feature_select(inposWords), 'pos']
            inposFeatures.append(inposWords)
    """
    with open(RT_INPUT_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            innegWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            innegWords = [feature_select(innegWords), 'neg']
            innegFeatures.append(innegWords)
   
	#selects 3/4 of the features to be used for training and 1/4 to be used for testing
	#posCutoff = int(math.floor(len(posFeatures)*3/4))
	#negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures + negFeatures
    testFeatures = innegFeatures #+ inposFeatures
      
    	#trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)	
    
    	#initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)	

    fileOutput ={'key':[],'pos':[],'neg':[]}
	#puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        #print features , label
        referenceSets[label].add(i)
        predicted = classifier.prob_classify(features)
        print "\n"
        fileOutput['key'].append(i)
        fileOutput['pos'].append(predicted.prob("pos"))
        fileOutput['neg'].append(predicted.prob("neg"))
        #posValues =  predicted.prob("pos") 
        #negValues = predicted.prob("neg") 
        fileOutput.values()
        testSets[predicted.max()].add(i)
        #print i
        #print testSets[predicted]
    return fileOutput
Example #30
def classify_and_evaluate(reviews, feature_extractor=word_feats):
    random.shuffle(reviews)

    pos_reviews = filter(lambda x: x['class'] == 'POSITIVE', reviews)
    neg_reviews = filter(lambda x: x['class'] == 'NEGATIVE', reviews)

    # get unique features
    pos_features = []
    neg_features = []
    for review in pos_reviews:
        split_reviews = review['text'].split(' ')
        split_reviews = [x for x in split_reviews if x]
        pos_features.append((feature_extractor(split_reviews), 'pos'))

    for review in neg_reviews:
        split_reviews = review['text'].split(' ')
        split_reviews = [x for x in split_reviews if x]
        neg_features.append((feature_extractor(split_reviews), 'neg'))

    # divide groups
    pos_offset = int(math.floor(len(pos_reviews) * 3 / 4))
    neg_offset = int(math.floor(len(neg_reviews) * 3 / 4))

    training = pos_features[:pos_offset] + neg_features[:neg_offset]
    testing = pos_features[pos_offset:] + neg_features[neg_offset:]

    # train classifier
    classifier = NaiveBayesClassifier.train(training)

    print 'trained on %d reviews, tested on %d reviews' % (len(training), len(testing))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testing)
    classifier.show_most_informative_features()
Example #31
    def train_classifier(self,
                         dataset,
                         feature_fn_name='word',
                         train_ratio=0.8,
                         verbose=False,
                         token_column='text',
                         target_column='category',
                         best_ratio=0.8,
                         pos_target_val=1,
                         neg_target_val=-1):
        def word_feats(words):
            return dict([(word, True) for word in words])

        def best_word_feats(words):
            return dict([(word, True) for word in words if word in bestwords])

        def best_bigram_word_feats(words,
                                   score_fn=BigramAssocMeasures.chi_sq,
                                   n=200):
            bigram_finder = BigramCollocationFinder.from_words(words)
            bigrams = bigram_finder.nbest(score_fn, n)
            d = dict([(bigram, True) for bigram in bigrams])
            d.update(best_word_feats(words))
            return d

        def best_trigram_word_feats(words,
                                    score_fn=TrigramAssocMeasures.chi_sq,
                                    n=200):
            tcf = TrigramCollocationFinder.from_words(words)
            trigrams = tcf.nbest(score_fn, n)
            d = dict([(trigram, True) for trigram in trigrams])
            d.update(best_bigram_word_feats(words))
            d.update(best_word_feats(words))
            return d

        if verbose:
            print(
                '\nSelected feature function: {}, token column: {}, train ratio: {}'
                .format(feature_fn_name, token_column, train_ratio))
        df = dataset.sample(frac=1).reset_index(drop=True)
        negids = df[df[target_column] == neg_target_val].index
        posids = df[df[target_column] == pos_target_val].index
        feats = df[token_column]

        if feature_fn_name in ['best_word', 'best_bigram', 'best_trigram']:
            word_fd = FreqDist()
            label_word_fd = ConditionalFreqDist()
            for tokens in df[df[target_column] ==
                             pos_target_val][token_column]:
                for word in tokens.split():
                    word_fd[word] += 1
                    label_word_fd[self._positive_label][word] += 1

            for tokens in df[df[target_column] ==
                             neg_target_val][token_column]:
                for word in tokens.split():
                    word_fd[word] += 1
                    label_word_fd[self._negative_label][word] += 1

            pos_word_count = label_word_fd[self._positive_label].N()
            neg_word_count = label_word_fd[self._negative_label].N()
            total_word_count = pos_word_count + neg_word_count
            word_scores = {}
            for word, freq in word_fd.items():
                pos_score = BigramAssocMeasures.chi_sq(
                    label_word_fd[self._positive_label][word],
                    (freq, pos_word_count), total_word_count)
                neg_score = BigramAssocMeasures.chi_sq(
                    label_word_fd[self._negative_label][word],
                    (freq, neg_word_count), total_word_count)
                word_scores[word] = pos_score + neg_score

            best_cnt = int(len(word_scores) * best_ratio)
            best = sorted(word_scores.items(),
                          key=lambda item: item[1],
                          reverse=True)[:best_cnt]
            bestwords = set([w for w, s in best])
            if feature_fn_name == 'best_trigram':
                feat_fn = best_trigram_word_feats
            elif feature_fn_name == 'best_bigram':
                feat_fn = best_bigram_word_feats
            else:
                feat_fn = best_word_feats

        else:
            feat_fn = word_feats

        negfeats = [(feat_fn(feats[i].split()), self._negative_label)
                    for i in negids]
        posfeats = [(feat_fn(feats[i].split()), self._positive_label)
                    for i in posids]
        if verbose:
            print('No. of samples: {}, Pos: {}, Neg: {}'.format(
                len(feats), len(posfeats), len(negfeats)))

        negcutoff = int(len(negfeats) * train_ratio)
        poscutoff = int(len(posfeats) * train_ratio)

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        classifier = NaiveBayesClassifier.train(trainfeats)
        refsets = defaultdict(set)
        testsets = defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        metrics = {
            'Accuracy':
            nltk.classify.util.accuracy(classifier, testfeats),
            'Pos precision':
            precision(refsets[self._positive_label],
                      testsets[self._positive_label]),
            'Pos recall':
            recall(refsets[self._positive_label],
                   testsets[self._positive_label]),
            'Neg precision':
            precision(refsets[self._negative_label],
                      testsets[self._negative_label]),
            'Neg recall':
            recall(refsets[self._negative_label],
                   testsets[self._negative_label])
        }
        if verbose:
            print(metrics)

        return classifier, metrics
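A hedged usage sketch for train_classifier above, assuming it is a method on an instance (called analyzer here, a hypothetical name) whose _positive_label and _negative_label attributes are already set, and a pandas DataFrame using the default 'text' and 'category' columns:

import pandas as pd

df = pd.DataFrame({
    'text': ['great service and fast delivery', 'terrible quality, broke in a week',
             'love it, works perfectly', 'awful experience, never again'],
    'category': [1, -1, 1, -1],
})
clf, metrics = analyzer.train_classifier(df, feature_fn_name='word',
                                         train_ratio=0.75, verbose=True)
print(metrics)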
Example #32
training_pre = pos[:len(pos)-5] + neg[:len(neg)-5]
test_pre = pos[len(pos)-5:] + neg[len(neg)-5:]

training = []
test = []
for k, line in enumerate(training_pre):
    training.append([format_sentence(line[0]), line[1]])

for k, line in enumerate(test_pre):
    test.append([format_sentence(line[0]), line[1]])

# Build classifier
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training)

classifier.show_most_informative_features()

# Neg example
example1 = "XBox Live still down "
result = classifier.classify(format_sentence(example1))
print(result)


example1 = "wow... all i wanted to do is see taylor swift but of course shes sold out! "
result = classifier.classify(format_sentence(example1))
print(result)


example1 = "very sad after cavs loss "
result = classifier.classify(format_sentence(example1))
print(result)
        for i in range(int(Sum_Line * Devide_Part)):
            training_data.append([
                preprocess1(
                    line_clean(linecache.getline('data\\rt-polarity.pos', i))),
                'pos'
            ])

        for i in range(int(Sum_Line * Devide_Part)):
            training_data.append([
                preprocess1(
                    line_clean(linecache.getline('data\\rt-polarity.neg', i))),
                'neg'
            ])

        random.shuffle(training_data)
        model = NaiveBayesClassifier.train(training_data)

        Cor_Num_Pos = 0.0
        for i in range(int(Sum_Line * Devide_Part), Sum_Line):
            Cor_Num_Pos += (model.classify(
                preprocess2(
                    line_clean(linecache.getline('data\\rt-polarity.pos',
                                                 i)))) == 'pos')

        Cor_Num_Neg = 0.0
        for i in range(int(Sum_Line * Devide_Part), Sum_Line):
            Cor_Num_Neg += (model.classify(
                preprocess2(
                    line_clean(linecache.getline('data\\rt-polarity.neg',
                                                 i)))) == 'neg')
features_pos = [(extract_features(movie_reviews.words(fileids=[f])),
                 'Positive') for f in fileids_pos]
features_neg = [(extract_features(movie_reviews.words(fileids=[f])),
                 'Negative') for f in fileids_neg]

threshold = 0.8
num_pos = int(threshold * len(features_pos))
num_neg = int(threshold * len(features_neg))

features_train = features_pos[:num_pos] + features_neg[:num_neg]
features_test = features_pos[num_pos:] + features_neg[num_neg:]

print('\nNumber of training datapoints:', len(features_train))
print('Number of test datapoints:', len(features_test))

classifier = NaiveBayesClassifier.train(features_train)
print('\nAccuracy of the classifier:', nltk_accuracy(classifier,
                                                     features_test))

N = 15
print('\nTop ' + str(N) + ' most informative words:')
for i, item in enumerate(classifier.most_informative_features()):
    print(str(i + 1) + '. ' + item[0])
    if i == N - 1:
        break

input_reviews = [
    "Everything about this movie is outstanding -- the performances, the way the true events are handled, the cinematography. ",
    "In this day of digital news, this movie makes us stand back and realize what we may lose in the way of investigative journalism as we slowly kill off print media.",
    "The lengths the directors go to to achieve a sense of authenticity is remarkable. ",
    "We are there in Boston in 2001-2002. We get to know enough about each character to make him or her real, but not enough to create side dramas. ",
def main_function():
    conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'],
                           user=DATABASES['ensemble']['USER'],
                           passwd=DATABASES['ensemble']['PASSWORD'],
                           db=DATABASES['ensemble']['NAME'])

    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)

    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    test_tweets = classify.get_test_tweets(conn)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Naive Bayes"
    print count_table

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam",
                                                trace=0)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_tweet_polarity_ensemble(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Maximum Entropy"
    print count_table

    #generate the accuracy matrix
    full_matrix = {
        '+': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        '-': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'I': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'O': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        }
    }

    for tweet in test_tweets:
        result = classify.run_sql(
            conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
        guess = result[0][0]

        actual_result = classify.run_sql(
            conn, classify.Statements.CHECK_MAJORITY % tweet[0])
        actual = actual_result[0][0]

        if guess is not None:
            if actual is not None:
                full_matrix[actual][guess] += 1

    print full_matrix
Example #36
def code():
    possitive_fileid = reviews.fileids('pos')  #pos/cv000 to cv999
    negative_fileid = reviews.fileids('neg')  #neg/cv000 to cv999

    print("Total number of reviews in the dataset: " +
          str(len(reviews.fileids())))
    print("Total number of positive Reviews: " + str(len(possitive_fileid)))
    print("Total number of negative Reviews: " + str(len(negative_fileid)))

    possitive_features = [(extract(reviews.words(fileids=[fd])), 'Good')
                          for fd in possitive_fileid]
    negative_features = [(extract(reviews.words(fileids=[fd])), 'Bad')
                         for fd in negative_fileid]

    # NOTE: train and test sets are identical here, so the accuracy below is optimistic.
    train_features = possitive_features[:] + negative_features[:]
    test_features = possitive_features[:] + negative_features[:]

    print('\nTotal number of trained datapoints:', len(train_features))
    print('Total number of tested datapoints:', len(test_features))

    # Train a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(train_features)
    acc = classifier_accuracy(classifier, test_features) * 100
    print('\nAccuracy of the system: ' + str(acc) + ' %')

    n = 20
    print('\nTop ' + str(n) + ' most informative words:')
    for i, item in enumerate(classifier.most_informative_features()):
        print(str(i + 1) + '. ' + item[0])
        if i == n - 1:
            break

    default_reviews = [
        'The costumes in this movie were great',
        'I think the story was terrible and the characters were very weak',
        'People say that the director of the movie is amazing',
        'This is such an idiotic movie, i will not recommend it to anyone',
        'This is not the movie i recommend'
    ]

    print("\nCurrent Reviews:\n")
    for i in default_reviews:
        print(i + '.\n\n')

    while (True):
        a = int(
            input(
                "Do you want to add more reviews, if yes press 1 else press 0\n"
            ))
        if (a == 0):
            break
        elif (a == 1):
            b = input("Please enter the review\n")
            default_reviews.append(b)

    total_rating = 0

    print("\nMovie Review Predictions:")
    for review in default_reviews:
        print("\nReview:", review)

        probabilities = classifier.prob_classify(extract(review.split()))

        predicted_sentiment = probabilities.max()

        print("Predicted sentiment:", predicted_sentiment)
        print("Probability of correct sentiment:",
              format(round(probabilities.prob(predicted_sentiment), 2), '.2f'))

        if (predicted_sentiment == 'Good'):
            total_rating += round(probabilities.prob(predicted_sentiment), 2)
        else:
            if predicted_sentiment == 'Bad':
                total_rating -= round(probabilities.prob(predicted_sentiment),
                                      2)

    if (-0.25 <= total_rating <= 0.25):
        print("\n\nOverall Rating: Average")
    elif total_rating < -0.25:
        print("\n\nOverall Rating: Very Bad")
    else:
        print("\n\nOverall Rating: Very good")
Example #37
File: app.py Project: shntm/twenty
def trainAndPrintAccuracy(trainingSet, testSet):
    print("Training data...")
    classifier = NaiveBayesClassifier.train(trainingSet)
    accuracy = nltk.classify.util.accuracy(classifier, testSet)
    print(accuracy)
    return classifier
Example #38
def cross_validate(iterations, haiku_labeled, short_labeled, corpus_length, journal):
    accuracy_scores = []
    most_informative = []
    mis_classified = []
    h_precision_scores = []
    h_recall_scores = []
    nh_precision_scores = []
    nh_recall_scores = []
    response = raw_input("Do you want to reduce dimensionality (y/n)?")
    if response == "y":
        min_df = raw_input("Include words occurring more than how many times?")
    #do normal cross-validation
    for i in range(iterations):
        haiku_random = []
        haiku_random = random.sample(haiku_labeled, corpus_length) #pick poems at random -- choose number based on smaller corpus size
        #Create 4 folds for validation testing
        cut_point = int((len(haiku_random))/4)
        hfold1 = haiku_random[0:cut_point]
        hfold2 = haiku_random[cut_point:(cut_point*2)]
        hfold3 = haiku_random[(cut_point*2):(cut_point*3)]
        hfold4 = haiku_random[(cut_point*3):]
        poetry_random = []
        poetry_random = random.sample(short_labeled, corpus_length) #draws this number of samples randomly from the feature-set; will have to adjust number according to corpus size
        cut_point2 = int((len(poetry_random))/4)
        pfold1 = poetry_random[0:cut_point2]
        pfold2 = poetry_random[cut_point2:(cut_point2*2)]
        pfold3 = poetry_random[(cut_point2*2):(cut_point2*3)]
        pfold4 = poetry_random[(cut_point2*3):]
        #build training and test-sets
        train_set = hfold1 + hfold2 + pfold1 + pfold2 + hfold3 + pfold3
        test_set =  hfold4 + pfold4 #+ hfold3 + pfold3
        #dimensionality reduction; shouldn't you be doing this for both train and test sets?
        if response =="y":
            doc_terms = documents_per_word(train_set + test_set)  #count how many documents each word appears in
            train_set = reduce_word_features((train_set + test_set), doc_terms, int(min_df))    #exclude terms that appear in min_df documents
        #train the classifier
        nb_classifier = NaiveBayesClassifier.train([e[1] for e in train_set])
        nb_classifier.labels()
        #check accuracy of classifier and store accuracy measure
        accuracy_scores.append(accuracy(nb_classifier, [el[1] for el in test_set]))
        #obtain the 30 most informative features for each iteration
        most_informative.append(nb_classifier.show_most_informative_features(n=30))
        #get haiku precision and recall measures from the test and store in list
        h_precision, h_recall, nh_precision, nh_recall = get_precision_recall(test_set, nb_classifier)
        h_precision_scores.append(h_precision)
        h_recall_scores.append (h_recall)
        nh_precision_scores.append(nh_precision)
        nh_recall_scores.append (nh_recall)
        #store list of mis-classified files from the journal corpus (i.e., files misclassified as haiku)
        for el in test_set:
            guess = nb_classifier.classify(el[1][0])
            if guess != el[1][1] and (re.findall(r'[a-z]', el[0][0]) or len(el[0]) > 8):  #this will exclude the haiku mis-classified as not-haiku
                mis_classified.append(el[0])
    #write the mis_classified texts to a file and print out the most-commonly mis-classified texts
    counter = collections.Counter(mis_classified)
    #prepare to print mis-classified files to .csv
    import csv
    filename = r'c:\Users\Public\Documents\MyData\HaikuArticle\errors.csv'
    mis_classified_texts = open(filename, 'a')
    wr = csv.writer(mis_classified_texts, quoting=csv.QUOTE_ALL)
    wr.writerow(counter.values())   #write the frequency
    wr.writerow(counter.keys())     #write the filenames
    #print the files to a document along with metadata
    print_misclassified_haiku(counter, journal)
    print("\nThe most commonly mis-classified files in this test were the following:")
    print counter.most_common()
    return accuracy_scores, most_informative, h_precision_scores, h_recall_scores, nh_precision_scores, nh_recall_scores 
Example #39
def evaluate_features(feature_select):
    #reading pre-labeled input and splitting into lines
    with open(os.path.join(__location__, 'rt-polarity-neg.txt'),
              'r',
              encoding='utf8') as f:
        negSentences = re.split(r'\n', f.read())
    with open(os.path.join(__location__, 'rt-polarity-pos.txt'),
              'r',
              encoding='utf8') as f:
        posSentences = re.split(r'\n', f.read())

    #read the stopword list, one word per line
    stopwords = []
    with open('stopwords.txt', 'r') as f:
        for line in f:
            stopwords.append(line.rstrip())
    # print(stopwords)

    posFeatures = []
    negFeatures = []
    # breaks up the sentences into lists of individual words
    # creates instance structures for classifier
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i)
        posWords = [word for word in posWords if word not in stopwords]
        posWords = [feature_select(posWords), 'pos']
        posFeatures.append(posWords)
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i)
        negWords = [word for word in negWords if word not in stopwords]
        negWords = [feature_select(negWords), 'neg']
        negFeatures.append(negWords)
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    #Runs the classifier on the testFeatures
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #Sets up labels to look at output
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(
            testFeatures):  # enumerate adds number-count to each item
        referenceSets[label].add(
            i)  # recorded polarity for these test sentences
        predicted = classifier.classify(
            features)  # classifiers' proposed polarity for tests
        testSets[predicted].add(i)

    #Outputs
    print('train on %s instances, test on %s instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:',
          scores.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', scores.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:',
          scores.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', scores.recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
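# evaluate_features() above expects a feature-extraction callable; the one used by
# the original author is not shown in this fragment. A bag-of-words selector in the
# style used elsewhere in these examples would fit the signature, e.g.:
def make_full_dict(words):
    return dict([(word, True) for word in words])

# evaluate_features(make_full_dict)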
Example #40
    X = [x[0] for x in dataset]
    Y = [x[1] for x in dataset]
    kfold = StratifiedKFold(n_splits=int(args.z),
                            shuffle=True,
                            random_state=seed)
    cvscores = []
    for train, test in kfold.split(X, Y):
        # print(dataset[train[0]])
        train_data = []
        for i in range(len(train)):
            train_data.append(dataset[train[i]])
        test_data = []
        for i in range(len(test)):
            test_data.append(dataset[test[i]])
        model = NaiveBayesClassifier.train(train_data)
        scores = nltk.classify.util.accuracy(model, test_data)
        print("{}%".format(scores * 100))
        cvscores.append(scores * 100)
        # plot_model(model, to_file='model.png')
        model.show_most_informative_features()

    print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

def train_classifier(training, test):
    classifier = NaiveBayesClassifier.train(training)
    print('Classifier Accuracy => ', accuracy(classifier, test))


def pro_text(text):
    # tokenize the raw text and drop English stopwords
    words = nltk.word_tokenize(text)
    filtered_words = [
        word for word in words if word not in stopwords.words('english')
    ]

    # mark each remaining word as present, the feature format NLTK's classifiers expect
    return {word: True for word in filtered_words}


# Build the labeled training samples
train_data = [[pro_text(text1), 1], [pro_text(text2), 1], [pro_text(text3), 1],
              [pro_text(text4), 0], [pro_text(text5), 0]]

print('train_data', train_data)

# Train the model
nb_model = NaiveBayesClassifier.train(train_data)

# Test the model
text6 = 'that is a bad one'
print('Classification result:', nb_model.classify(pro_text(text6)))

print('\n===================== 2. Text similarity =====================')
# 2. Text similarity
import nltk
from nltk import FreqDist

text1 = 'I like the movie so much '
text2 = 'That is a good movie '
text3 = 'This is a great one '
text4 = 'That is a really bad movie '
text5 = 'This is a terrible movie'
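# The text-similarity code that this section header announces is cut off in this
# excerpt. A minimal sketch of frequency-based features built with FreqDist, which
# the snippet imports above (an illustration only, not the original code):
def freq_vector(text, vocabulary):
    # count how often each vocabulary word occurs in the text
    fdist = FreqDist(nltk.word_tokenize(text.lower()))
    return [fdist[word] for word in vocabulary]

# Example: vocab = sorted(set(nltk.word_tokenize((text1 + text2).lower())))
#          freq_vector(text1, vocab) and freq_vector(text2, vocab) can then be
#          compared with any vector similarity measure (e.g. cosine similarity).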
def evaluate_classifier(featx):

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats,
                                                'GIS',
                                                trace=0,
                                                encoding=None,
                                                labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision = precision(refsets['pos'], testsets['pos'])
        pos_recall = recall(refsets['pos'], testsets['pos'])
        pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
        neg_precision = precision(refsets['neg'], testsets['neg'])
        neg_recall = recall(refsets['neg'], testsets['neg'])
        neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', (pos_precision + neg_precision) / 2)
        print('recall', (pos_recall + neg_recall) / 2)
        print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)

        #classifier.show_most_informative_features()

    print('')

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # SHUFFLE TRAIN SET
    # As in cross validation, the test chunk might have only negative or only positive data
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation

    for cl in classifier_list:

        subset_size = int(len(trainfeats) / n)
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1
        for i in range(n):
            testing_this_round = trainfeats[i * subset_size:][:subset_size]
            training_this_round = trainfeats[:i * subset_size] + trainfeats[
                (i + 1) * subset_size:]

            if cl == 'maxent':
                classifierName = 'Maximum Entropy'
                classifier = MaxentClassifier.train(training_this_round,
                                                    'GIS',
                                                    trace=0,
                                                    encoding=None,
                                                    labels=None,
                                                    gaussian_prior_sigma=0,
                                                    max_iter=1)
            elif cl == 'svm':
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)
            else:
                classifierName = 'Naive Bayes'
                classifier = NaiveBayesClassifier.train(training_this_round)

            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)
            for i, (feats, label) in enumerate(testing_this_round):
                refsets[label].add(i)
                observed = classifier.classify(feats)
                testsets[observed].add(i)

            cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                      testing_this_round)
            cv_pos_precision = precision(refsets['pos'], testsets['pos'])
            cv_pos_recall = recall(refsets['pos'], testsets['pos'])
            cv_pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
            cv_neg_precision = precision(refsets['neg'], testsets['neg'])
            cv_neg_recall = recall(refsets['neg'], testsets['neg'])
            cv_neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)

            cv_count += 1

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', sum(accuracy) / n)
        print('precision',
              (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
        print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
        print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
        print('')
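# word_split() and the featx feature extractors passed into evaluate_classifier()
# are not shown in this fragment. A minimal word_split consistent with how it is
# used above (an assumption, not the original helper) might be:
def word_split(data):
    # data is an iterable of raw review strings; return one token list per review
    return [line.split() for line in data]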
Example #44
news_words = sents2words(news_sents)
news_feats = [(word_feats(wordlist), 'news') for wordlist in news_words]
news_cutoff = len(news_feats) * 3 / 4
other_cutoff = len(other_feats) * 3 / 4
others_cutoff = len(others_feats) * 3 / 4

train_feats_other = news_feats[:news_cutoff] + other_feats[:other_cutoff]
train_feats_others = news_feats[:news_cutoff] + others_feats[:others_cutoff]
test_feats_other = news_feats[news_cutoff:] + other_feats[other_cutoff:]
test_feats_others = news_feats[news_cutoff:] + others_feats[others_cutoff:]
test_sents_other = news_sents[news_cutoff:] + other_sents[other_cutoff:]
print 'train on %d instances, test on %d instances' % (len(train_feats_other),
                                                       len(test_feats_other))

classifier_other = NaiveBayesClassifier.train(train_feats_other)
classifier_others = NaiveBayesClassifier.train(train_feats_others)
print 'accuracy:', nltk.classify.util.accuracy(classifier_other,
                                               test_feats_other)
print 'accuracy:', nltk.classify.util.accuracy(classifier_others,
                                               test_feats_others)
classifier_other.show_most_informative_features(n=100)

filename = filebase + "/fnielsen/data/Hansen2010Diffusion_news.txt"
tweet_news = [
    re.findall('^(-?\d) (.+)$', line, re.UNICODE)[0]
    for line in open(filename).readlines()[:1000]
]
tweet_words = sents2words(map(lambda (v, s): s, tweet_news))
tweet_feats = []
for n in range(len(tweet_news)):
Example #45
def evaluate_features(feature_select):
    posFeatures_train = []
    negFeatures_train = []
    posFeatures_test = []
    negFeatures_test = []
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    with open(POS_FILE_TRAIN, "rb") as f:
        posSentences_train = f.readlines()
        #posSentences = pos_data.split('\n')
    random.shuffle(posSentences_train)

    with open(POS_FILE_TEST, "rb") as f:
        posSentences_test = f.readlines()
        #posSentences = pos_data.split('\n')
    random.shuffle(posSentences_test)

    with open(NEG_FILE_TRAIN, "rb") as f:
        #negSentences = f.read().split('\n')
        negSentences_train = f.readlines()
    random.shuffle(negSentences_train)

    with open(NEG_FILE_TEST, "rb") as f:
        #negSentences = f.read().split('\n')
        negSentences_test = f.readlines()
    random.shuffle(negSentences_test)

    #with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
    for i in posSentences_train:
        posWords = re.findall(r"[\w']+|[.,!?;@#]", str(i).rstrip())
        posWords = [feature_select(posWords), 'pos']  #pos = contains location
        posFeatures_train.append(posWords)
        str_i = (i.decode("utf-8")).strip()
        Train_twit_Dic[frozenset(posWords[0].items())] = str_i

    for i in posSentences_test:
        posWords_test = re.findall(r"[\w']+|[.,!?;@#]", str(i).rstrip())
        posWords_test = [feature_select(posWords_test),
                         'pos']  #pos = contains location
        posFeatures_test.append(posWords_test)
        str1 = (i.decode("utf-8")).strip()
        Test_twit_Dic[frozenset(posWords_test[0].items())] = str1

    #with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
    for i in negSentences_train:
        negWords = re.findall(r"[\w']+|[.,!?;@#]", str(i).rstrip())
        negWords = [feature_select(negWords),
                    'neg']  #neg = doesn't contain location
        negFeatures_train.append(negWords)
        str2 = (i.decode("utf-8")).strip()
        Train_twit_Dic[frozenset(negWords[0].items())] = str2

    for i in negSentences_test:
        negWords_test = re.findall(r"[\w']+|[.,!?;@#]", str(i).rstrip())
        negWords_test = [feature_select(negWords_test),
                         'neg']  #neg = doesn't contain location
        negFeatures_test.append(negWords_test)
        str3 = (i.decode("utf-8")).strip()
        Test_twit_Dic[frozenset(negWords_test[0].items())] = str3
    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    #posCutoff = int(math.floor(len(posFeatures)*3/4))
    #negCutoff = int(math.floor(len(negFeatures)*3/4))
    #trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff] ###need to understand what is test here
    #testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    trainFeatures = posFeatures_train + negFeatures_train  ###need to understand what is test here
    testFeatures = posFeatures_test + negFeatures_test

    ##############################################################################
    #trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)
        ####################   MINE   ####################################
        if predicted == "pos":  ##the twit according to the classifier contains a location
            twiit = Test_twit_Dic[frozenset(features.items())]
            list_close_twits = Close_Twt_Dic[twiit]
            words = nltk.word_tokenize(twiit)
            tagged_words = ner_tagger.tag(words)
            lbl = ""
            for tag_w in tagged_words:
                if tag_w[1] == "LOCATION":
                    lbl = lbl + tag_w[0] + " "  #found a location label for this tweet
                final_lbl = lbl
            ### run the Stanford NER tagger over all of the physically close tweets
            if lbl == "":  #couldn't find a location label for this tweet, so try to find one within its physical-neighbourhood tweets
                lbl_list = []
                for s in list_close_twits:
                    words = nltk.word_tokenize(s)
                    tagged_words = ner_tagger.tag(words)
                    lbl = ""
                    for tag_w in tagged_words:
                        if tag_w[1] == "LOCATION":
                            lbl = lbl + tag_w[0] + " "
                    if lbl != "":
                        lbl_list.append(lbl)
                ## find most common str (label) in lbl_list
                c = Counter(lbl_list)
                final_lbl = c.most_common(1)[0][0]  #most common location label among the neighbouring tweets

    #prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    #print 'pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    #print 'neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
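# ner_tagger used above is assumed to be a Stanford NER tagger; it is never
# constructed in this fragment. A typical setup (the model and jar paths below are
# placeholders, not the author's actual paths) would be:
from nltk.tag import StanfordNERTagger

ner_tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                               'stanford-ner.jar')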
    def do_validation(self):
        # each fold is a list of body ids.
        folds, hold_out = kfold_split(self.dataset, n_folds=10)
        #  fold_stances is a dict. keys are fold number (e.g. 0-9). hold_out_stances is list
        fold_stances, hold_out_stances = get_stances_for_folds(
            self.dataset, folds, hold_out)
        # https://cs.fit.edu/~mmahoney/compression/textdata.html
        sentences = word2vec.Text8Corpus('text8')
        model = word2vec.Word2Vec(sentences, size=200)

        labeled_feat_dict = {}

        print "Generating features for each fold"
        for fold_id in fold_stances:
            print "Generating features for fold ", fold_id
            bodies = folds[fold_id]
            stances = fold_stances[fold_id]

            fold_avg_sims, fold_max_sims = JaccardGenerator().gen_jaccard_sims(
                self.dataset, bodies, stances)
            common_ngrams = NgramsGenerator().gen_common_ngrams(
                self.dataset, bodies, stances, self._ngram_len)
            wordvectors = WordVector().gen_wordvectors(self.dataset, bodies,
                                                       stances, model)

            labeled_feature_set = []
            for i in range(len(stances)):
                features = {
                    'avg_sims': fold_avg_sims[i],
                    'max_sims': fold_max_sims[i],
                    'common_ngrams': common_ngrams[i],
                    'word_vectors': wordvectors[i]
                }
                label = self._process_stance(stances[i]['Stance'])
                labeled_feature = (features, label)
                labeled_feature_set.append(labeled_feature)

            labeled_feat_dict[fold_id] = labeled_feature_set

        print "Generating features for hold out fold"
        holdout_avg_sims, holdout_max_sims = JaccardGenerator(
        ).gen_jaccard_sims(self.dataset, hold_out, hold_out_stances)
        holdout_common_ngrams = NgramsGenerator().gen_common_ngrams(
            self.dataset, hold_out, hold_out_stances, self._ngram_len)
        holdout_wordvectors = WordVector().gen_wordvectors(
            self.dataset, hold_out, hold_out_stances, model)

        h_unlabeled_features = []
        h_labels = []
        for i in range(len(hold_out_stances)):
            unlabeled_feature = {
                'avg_sims': holdout_avg_sims[i],
                'max_sims': holdout_max_sims[i],
                'common_ngrams': holdout_common_ngrams[i],
                'word_vectors': holdout_wordvectors[i]
            }
            label = self._process_stance(hold_out_stances[i]['Stance'])

            h_unlabeled_features.append(unlabeled_feature)
            h_labels.append(label)

        fold_accuracy = {}
        best_fold_accuracy = 0.0
        classifiers = []

        print "Validating using each fold as testing set"
        for fold_id in fold_stances:
            fold_ids = list(range(len(folds)))
            del fold_ids[fold_id]  # deleted fold is test set for this run

            training_set = [
                feat for fid in fold_ids for feat in labeled_feat_dict[fid]
            ]

            testing_set = []
            testing_labels = []

            for feat, label in labeled_feat_dict[fold_id]:
                testing_set.append(feat)
                testing_labels.append(label)

            classifier = NaiveBayesClassifier.train(training_set)
            classifiers.append(classifier)
            pred = classifier.classify_many(testing_set)

            accuracy = self._score(pred, testing_labels)
            print "Fold ", fold_id, "accuracy: ", accuracy
            if accuracy > best_fold_accuracy:
                best_fold_accuracy = accuracy
                best_fold_cls = classifier

        h_res = best_fold_cls.classify_many(h_unlabeled_features)
        print 'holdout score:', self._score(h_res, h_labels)
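# self._score() and self._process_stance() belong to the surrounding class and are
# not included in this excerpt. Judging from how it is used above, _score behaves
# like an accuracy-style scorer; a standalone sketch under that assumption:
def score_accuracy(predicted, gold):
    # fraction of predicted stance labels that match the gold labels
    hits = sum(1 for p, g in zip(predicted, gold) if p == g)
    return float(hits) / len(gold) if gold else 0.0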
def word_feats(words):
    return dict([(word, True) for word in words])


negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
            for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
            for f in posids]

trainfeats = negfeats + posfeats

classifier = NaiveBayesClassifier.train(trainfeats)
print('train on %d instances' % (len(trainfeats)))
classifier.show_most_informative_features()


def process():
    print('input some text')
    while True:
        line = input()
        if line.strip() != '':
            words = word_tokenize(line)
            feats = [word_feats(words)]
            print(feats)
            print(classifier.classify_many(feats))
            for pdist in classifier.prob_classify_many(feats):
                print('pos: %.4f neg: %.4f' %
                      (pdist.prob('pos'), pdist.prob('neg')))
    'movie', 'sound', 'was', 'words', 'the', 'actors', 'is', 'did', 'know',
    'not'
]

pos_counti_features1 = [(word_feats(pos_count), 'pos_count')
                        for pos_count in words_pos_counti]
neg_counta_features2 = [(word_feats(neg_count), 'neg_count')
                        for neg_count in words_neg_counta]
neutral_features3 = [(word_feats(neu), 'neu') for neu in words_neutral]

print("pos_counti_features1: ", pos_counti_features1)
print("neg_counta_features2: ", neg_counta_features2)
print("neutral_features3: ", neutral_features3)

train_set_va = neg_counta_features2 + pos_counti_features1 + neutral_features3
classifier_va = NaiveBayesClassifier.train(train_set_va)

print("train_set_va: ", train_set_va)
print("classifier_va: ", classifier_va)

# Predict
neg_count = 0
pos_count = 0
sentence = "Awesome movie, I liked it"
sentence = sentence.lower()
key_words = sentence.split(' ')
for word in key_words:
    classResult = classifier_va.classify(word_feats(word))
    if classResult == 'neg_count':
        neg_count = neg_count + 1
    if classResult == 'pos_count':
        pos_count = pos_count + 1
consumer_key = "YOUR_CONSUMER_KEY"
consumer_secret = "YOUR_CONSUMER_SECRET"
access_token = "YOUR_ACCESS_TOKEN"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"
     
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
auth_api = API(auth)

target = username



rel = ['hate', 'left', 'bf','gf','girlfriend','boyfriend','heartbreak','alone', 'love','abuse']
edu = ['exam','test','assignment','school','studying']
mon = ['broke','money','cash','economy', 'finance','crisis']

cleantweet = []
clist = []
text = []
mylist = []
words = []
global dcount,Money
dcount = 0
Money = 0
global ccount,Academic
ccount = 0
Academic = 0
global tcount,Relationship
tcount = 0
Relationship = 0
long_stop_list = ['a','are','be','an','and','at','by','the','is','this','that','to','for','it','in','on']

def create_word_features(clist):
    useful_words = [word for word in clist if word not in long_stop_list]
    my_dict = dict([(word, True) for word in useful_words])
    return my_dict

corpus_root = 'C:/Users/Bindu/Desktop/samp1/neg'
wordlists=PlaintextCorpusReader(corpus_root,'.*')
neg_re=[]
mylist=[]
x=[]
for fileids in wordlists.fileids():
    words = wordlists.words(fileids)
    neg_re.append((create_word_features(words),"Depressed"))

corpus_root='C:/Users/Bindu/Desktop/samp1/pos'  
wordlists=PlaintextCorpusReader(corpus_root,'.*')
pos_re=[]
mylist=[]
x=[]
for fileids in wordlists.fileids():
    words = wordlists.words(fileids)
    pos_re.append((create_word_features(words),"Not depressed"))

train_set = neg_re[:45] + pos_re[:45]
random.shuffle(train_set)
test_set =  neg_re[45:] + pos_re[45:]
random.shuffle(test_set)

#print(len(train_set),len(test_set))
classifier = NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.util.accuracy(classifier, test_set)
#print(accuracy * 100)

class Helpline(App):
    def build(self):
        #Window.clearcolor = (0.50,0.50,0.50,1)
        return  Label(text="Need to talk to someone? \n\nNational Suicide Helpline \n\nVisit: http://www.aasra.info/ \nCall: +912227546669 \n\nWant to chat with a counsellor? \nVisit:https://yourdost.com/")
hp=Helpline()
def get_info(target):
    try:
        item = auth_api.get_user(target)
        print("Name: " + item.name)
        print("Twitter name: " + item.screen_name)
        print("Total number of times tweeted: " + str(item.statuses_count))
        if (item.statuses_count < 50):
            print("Insufficient Data to analyse")
            exit()
        print("Following: " + str(item.friends_count))
        print("Followers: " + str(item.followers_count))
        tweets = item.statuses_count
        account_created_date = item.created_at
        delta = datetime.utcnow() - account_created_date
        account_age_days = delta.days
        print("Account age (in days): " + str(account_age_days))
        if account_age_days > 0:
            print("Average tweets per day: " + "%.2f" % (float(tweets)/float(account_age_days)))
    except:
        print("Invalid Username")
        exit()



def get_tweets(target):
    print("Collecting user %s's tweets" % target)
    for status in Cursor(auth_api.user_timeline, screen_name='@%s' % target).items():
        mylist.append(json.dumps(status._json['text']))

def get_cause(clist):
    global Relationship,Academic,Money
    for w in clist:
        if w in rel:
            Relationship+=1
        elif w in edu:
            Academic+=1
        elif w in mon:
            Money+=1
   

def get_classify(tweet):
    clist = []
    global dcount
    global ccount
    words = word_tokenize(tweet)
    clist = words[:]
    get_cause(clist)
    words = create_word_features(words)
    x=classifier.classify(words)
    if x == "Depressed":
        dcount+=1
    else:
        ccount+=1
  

     
get_info(target)
try:
    get_tweets(target)
except:
    print("%s's account is set to private" % target)
    exit()
for tweet in mylist:
    
    tcount+=1
    tweet = tweet.replace('"','')
    tweet = tweet.lower()
    tweet = re.sub(r'^rt', '', tweet)  #remove RT if it appears at the beginning of a tweet
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)  #remove www.* or https?://*
    tweet = re.sub(r'@[^\s]+', '', tweet)  #remove @username
    tweet = " ".join(word.strip() for word in re.split('#|_', tweet))  #convert #word into word
    tweet = re.sub(r'[0-9]+', '', tweet)  #remove numbers
    tweet = re.sub(r'[^\w\s]', '', tweet)  #remove punctuation
    tweet = re.sub(r'[\s]+', ' ', tweet)  #remove additional white spaces
    cleantweet.append(tweet)
    get_classify(tweet)


def biggest(a, y, z):
    #return the largest of the three cause counters
    return max(a, y, z)

    
tot = dcount+ccount
#print(dcount,ccount)
rat = dcount/tot
print (rat)
if (rat < 0.25 ):
    print (" %s is unlikely to be depressed" % target)
elif (rat > 0.25 and rat < 0.4):
    print (" %s is likely to be moderately depressed" % target)
    cause = biggest(Relationship,Money,Academic)
    if (cause == Relationship):
     print("Cause for depression is likely to be relationship troubles")
    elif (cause == Money):
     print("Cause for depression is likely to be money troubles")
    elif (cause == Academic):
     print ("Cause for depression is likely to be academic troubles")
else:
    hp.run()
    print ("%s is likely to be severely depressed" % target)
    cause = biggest(Relationship,Money,Academic)
    if (cause == Relationship):
     print("Cause for depression is likely to be relationship troubles")
    elif (cause == Money):
     print("Cause for depression is likely to be money troubles")
    elif (cause == Academic):
     print ("Cause for depression is likely to be academic troubles")
    

with open("%s.txt" % target,'w',encoding = 'utf-8') as f:
    f.write(str(cleantweet))
f.close()    

def random_color_func(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None):
    h = int(360.0 * 45.0 / 255.0)
    s = int(100.0 * 255.0 / 255.0)
    l = int(100.0 * float(random_state.randint(60, 120)) / 255.0)

    return "hsl({}, {}%, {}%)".format(h, s, l)   
file_content = open("%s.txt" % target).read()
wordcloud = WordCloud(font_path=r'C:\Windows\WinSxS\amd64_microsoft-windows-font-truetype-verdana_31bf3856ad364e35_10.0.16299.15_none_e1654f127052576a\verdana.ttf',
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=1200,
                      height=1000,
                      color_func=random_color_func,
                      collocations=False
                      ).generate(file_content)

plt.imshow(wordcloud)
plt.axis('off')
plt.show()

class LoginApp(App):
    username = StringProperty(None)

    def build(self):
        manager = ScreenManager()

        manager.add_widget(Login(name='login'))
        manager.add_widget(Connected(name='connected'))

        return manager

    def get_application_config(self):
        if(not self.username):
            return super(LoginApp, self).get_application_config()

        conf_directory = self.user_data_dir + '/' + self.username

        if(not os.path.exists(conf_directory)):
            os.makedirs(conf_directory)

        return super(LoginApp, self).get_application_config(
            '%s/config.cfg' % (conf_directory)
        )

if __name__ == '__main__':
    LoginApp().run()
for i in posSentences:
    posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
    posWords = [make_full_dict(posWords), 'pos']
    posFeatures.append(posWords)

# Label the negative words
for i in negSentences:
    negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
    negWords = [make_full_dict(negWords), 'neg']
    negFeatures.append(negWords)

# Create the training set
trainFeatures = posFeatures + negFeatures

# Train the Classifier
classifier = NaiveBayesClassifier.train(trainFeatures)
'''Save Pickled Data'''
# Save the trained Classifier for future use
save_trained_data = open(
    'C:/Users/animi/Documents/Python Codes/NLP/PythonProgrammingDotNet/Pickle_Files/Trained_NBC.pickle',
    'wb')
pickle.dump(classifier, save_trained_data)
save_trained_data.close()
'''Load Pickled Data'''
saved_trained_data = open(
    'C:/Users/animi/Documents/Pickle_Files/Trained_NBC.pickle', 'rb')
classifier = pickle.load(saved_trained_data)
saved_trained_data.close()
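# The custom-input test in the original script is truncated here. A minimal sketch
# of how the reloaded classifier could be used on new text, reusing the
# make_full_dict feature extractor used above (the sample sentence is made up):
sample = "this movie was surprisingly good"
sampleWords = re.findall(r"[\w']+|[.,!?;]", sample)
print(classifier.classify(make_full_dict(sampleWords)))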

def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    return train_feats, test_feats


train_feats, test_feats = split_label_feats(lfeats)
train_feats[0]

print(len(train_feats))
print(len(test_feats))
#%%
# ______________  NLTK Naive Bayes  _______________-

nb = NaiveBayesClassifier.train(train_feats)
nb.labels()
acc = accuracy(nb, test_feats)
print("NLTK Naive Bayes Accuracy: ", acc)

#%%
# ______________  sklearn Multinomial NB  _______________-

sk = SklearnClassifier(MultinomialNB())
sk.train(train_feats)
acc_Naive_Bayes = accuracy(sk, test_feats)
print("sklearn MultinomialNB Accuracy: ", acc_Naive_Bayes)

# ______________  K-Neighbors  _______________-

sk_knn = SklearnClassifier(KNeighborsClassifier())
Example #52
def test(trainfeats, testfeats, source, type):
    # print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    my_classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = my_classifier.classify(feats)
        testsets[observed].add(i)

    # precision and recall
    accuracy = nltk.classify.util.accuracy(my_classifier, testfeats) * 100
    pos_prec = precision(refsets['pos'], testsets['pos']) * 100
    pos_rec = recall(refsets['pos'], testsets['pos']) * 100
    neg_prec = precision(refsets['neg'], testsets['neg']) * 100
    neg_rec = recall(refsets['neg'], testsets['neg']) * 100

    # round
    accuracy = round(accuracy, 1)
    pos_prec = round(pos_prec, 1)
    pos_rec = round(pos_rec, 1)
    neg_prec = round(neg_prec, 1)
    neg_rec = round(neg_rec, 1)

    # print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
    # print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))
    my_classifier.show_most_informative_features(50)

    dir_path = os.path.dirname(__file__)

    file_path = os.path.join(
        dir_path, source + '/pickled/' + type + '/MNB_classifier.pickle')
    open_file = open(file_path, "rb")
    MNB_classifier = pickle.load(open_file)
    open_file.close()
    mnb = (nltk.classify.accuracy(MNB_classifier, testfeats)) * 100
    print(mnb)
    mnb = round(mnb, 1)

    file_path = os.path.join(
        dir_path,
        source + '/pickled/' + type + '/BernoulliNB_classifier.pickle')
    open_file = open(file_path, "rb")
    BernoulliNB_classifier = pickle.load(open_file)
    open_file.close()
    bnb = (nltk.classify.accuracy(BernoulliNB_classifier, testfeats)) * 100
    print(bnb)
    bnb = round(bnb, 1)

    file_path = os.path.join(
        dir_path,
        source + '/pickled/' + type + '/LogisticRegression_classifier.pickle')
    open_file = open(file_path, "rb")
    LogisticRegression_classifier = pickle.load(open_file)
    open_file.close()
    lr = (nltk.classify.accuracy(LogisticRegression_classifier,
                                 testfeats)) * 100
    print(lr)
    lr = round(lr, 1)

    file_path = os.path.join(
        dir_path, source + '/pickled/' + type + '/LinearSVC_classifier.pickle')
    open_file = open(file_path, "rb")
    LinearSVC_classifier = pickle.load(open_file)
    open_file.close()
    lsvc = (nltk.classify.accuracy(LinearSVC_classifier, testfeats)) * 100
    print(lsvc)
    lsvc = round(lsvc, 1)

    file_path = os.path.join(
        dir_path, source + '/pickled/' + type + '/NuSVC_classifier.pickle')
    open_file = open(file_path, "rb")
    NuSVC_classifier = pickle.load(open_file)
    open_file.close()
    nsvc = (nltk.classify.accuracy(NuSVC_classifier, testfeats)) * 100
    print(nsvc)
    nsvc = round(nsvc, 1)

    voted_classifier = VoteClassifier(NuSVC_classifier, LinearSVC_classifier,
                                      MNB_classifier, BernoulliNB_classifier,
                                      LogisticRegression_classifier)

    voted = (nltk.classify.accuracy(voted_classifier, testfeats)) * 100
    print(voted)
    voted = round(voted, 1)

    nltk_output = "nlt, " + str(accuracy) + ", " + str(pos_prec) + ", " + str(
        neg_prec) + ", " + str(pos_rec) + ", " + str(neg_rec) + "\n"
    sklearn_output = "skl, " + str(mnb) + ", " + str(bnb) + ", " + str(
        lr) + ", " + str(lsvc) + ", " + str(nsvc) + ", " + str(voted) + "\n"

    return (nltk_output, sklearn_output)
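# VoteClassifier is used in several of these examples but its definition is not
# included. A minimal majority-vote wrapper in the usual NLTK ClassifierI style
# (a sketch of the idea, not the original class):
from statistics import mode
from nltk.classify import ClassifierI

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def labels(self):
        return self._classifiers[0].labels()

    def classify(self, featureset):
        # each wrapped classifier votes; return the most common label
        votes = [c.classify(featureset) for c in self._classifiers]
        return mode(votes)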
Example #53
import string
from itertools import chain
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'

mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*',
                                      encoding='ascii')
stop = stopwords.words('english')
documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]  #keep the 100 most frequent words

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens)
               for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens)
              for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
Example #54
def feature_classification(feature_select):
    global testSentences, wordScores, number_of_features, stoplist, bestWords

    positiveFeatures = []
    negativeFeatures = []
    testFeatures = []
    testLines = []
    tempArray = []
    probability = []
    testSets = collections.defaultdict(set)
    count = 0
    linesCount = 0
    positiveScore = 0.0

    #to create positive training features
    with open(positive_file, 'r') as positiveLines:
        for i in positiveLines:
            positiveWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            positiveWords = [k for k in positiveWords if not k in stoplist]
            positiveWords = [feature_select(positiveWords), 'pos']
            positiveFeatures.append(positiveWords)

#to create negative training features
    with open(negative_file, 'r') as negativeLines:
        for i in negativeLines:
            negativeWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negativeWords = [l for l in negativeWords if not l in stoplist]
            negativeWords = [feature_select(negativeWords), 'neg']
            negativeFeatures.append(negativeWords)

#to create testing features
    for i in testSentences:
        testLines = i.split('.')
        lines = len(testLines)
        count = 0
        for j in testLines:
            if not j:
                count += 1

        lines -= count
        for j in testLines:
            testWords = re.findall(r"[\w']+|[.,!?;]", j)
            testWords = [k.lower() for k in testWords]  #lower-case every token
            testWords = [feature_select(testWords), lines]
            testFeatures.append(testWords)

#the whole training features to be provided to the classifier
    trainFeatures = positiveFeatures + negativeFeatures

    #creating a classifier object and performing the training process
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #to perform testing process
    for i, (features, lines) in enumerate(testFeatures):
        predicted = classifier.classify(features)

        if predicted == 'pos':
            count += 1

        linesCount += 1
        testSets[predicted].add(i)

        if linesCount == lines:
            positiveScore = float(count) / lines
            probability.append(positiveScore)
            count = 0
            linesCount = 0

##	print 'Probability:'
##	print probability
    score = (sum(probability)) / len(probability)
    ##	print 'Score:'
    ##	print score
    return score
Example #55
def train_classifiers(posFeatures, negFeatures):

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    #trains a Naive Bayes Classifier
    print("----------------Naive Bayes Classifier-----------")
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    #prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('Original Naive Bayes Accuracy:',
          (nltk.classify.util.accuracy(classifier, testFeatures)) * 100)
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)

    #Pickle the algorithm for future use
    save_classifier = open("pickled_algos/originalnaivebayes.pickle", "wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(trainFeatures)
    print("MNB_classifier accuracy percent:",
          (nltk.classify.accuracy(MNB_classifier, testFeatures)) * 100)

    #Pickle the algorithm for future use
    save_classifier = open("pickled_algos/MNB_classifier.pickle", "wb")
    pickle.dump(MNB_classifier, save_classifier)
    save_classifier.close()

    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(trainFeatures)
    print("BernoulliNB_classifier accuracy percent:",
          (nltk.classify.accuracy(BernoulliNB_classifier, testFeatures)) * 100)

    #Pickle the algorithm for future use
    save_classifier = open("pickled_algos/BernoulliNB_classifier.pickle", "wb")
    pickle.dump(BernoulliNB_classifier, save_classifier)
    save_classifier.close()

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(trainFeatures)
    print(
        "LogisticRegression_classifier accuracy percent:",
        (nltk.classify.accuracy(LogisticRegression_classifier, testFeatures)) *
        100)

    #Pickle the algorithm for future use
    save_classifier = open(
        "pickled_algos/LogisticRegression_classifier.pickle", "wb")
    pickle.dump(LogisticRegression_classifier, save_classifier)
    save_classifier.close()

    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(trainFeatures)
    print("LinearSVC_classifier accuracy percent:",
          (nltk.classify.accuracy(LinearSVC_classifier, testFeatures)) * 100)

    #Pickle the algorithm for future use
    save_classifier = open("pickled_algos/LinearSVC_classifier.pickle", "wb")
    pickle.dump(LinearSVC_classifier, save_classifier)
    save_classifier.close()

    SGDC_classifier = SklearnClassifier(SGDClassifier())
    SGDC_classifier.train(trainFeatures)
    print("SGDClassifier accuracy percent:",
          nltk.classify.accuracy(SGDC_classifier, testFeatures) * 100)

    #Pickle the algorithm for future use
    save_classifier = open("pickled_algos/SGDC_classifier.pickle", "wb")
    pickle.dump(SGDC_classifier, save_classifier)
    save_classifier.close()

    Dec_Tree_Classifier = SklearnClassifier(DecisionTreeClassifier())
    Dec_Tree_Classifier.train(trainFeatures)
    print("DecisionTreeClassifier Accuracy:",
          (nltk.classify.accuracy(Dec_Tree_Classifier, testFeatures)) * 100)

    #Pickle the algorithm for future use
    save_classifier = open("pickled_algos/decision_tree.pickle", "wb")
    pickle.dump(Dec_Tree_Classifier, save_classifier)
    save_classifier.close()
    """
    
#    Grad_Boost_Classifier = SklearnClassifier(GradientBoostingClassifier())
#    Grad_Boost_Classifier.train(trainFeatures)
#    print("Gradient Boosting Classifier Accuracy:", (nltk.classify.accuracy(Grad_Boost_Classifier,testFeatures))*100)    
    """

    Random_Forest_Classifier = SklearnClassifier(RandomForestClassifier())
    Random_Forest_Classifier.train(trainFeatures)
    print("Random Forest Classifier Accuracy:",
          (nltk.classify.accuracy(Random_Forest_Classifier, testFeatures)) *
          100)

    #Pickle the algorithm for future use
    save_classifier = open("pickled_algos/random_forest.pickle", "wb")
    pickle.dump(Random_Forest_Classifier, save_classifier)
    save_classifier.close()

    Ada_Boost_Classifier = SklearnClassifier(AdaBoostClassifier())
    Ada_Boost_Classifier.train(trainFeatures)
    print("Ada Boost Classifier Accuracy:",
          (nltk.classify.accuracy(Ada_Boost_Classifier, testFeatures)) * 100)

    #Pickle the algorithm for future use
    save_classifier = open("pickled_algos/Ada_Boost.pickle", "wb")
    pickle.dump(Ada_Boost_Classifier, save_classifier)
    save_classifier.close()

    voted_classifier = VoteClassifier(classifier, LinearSVC_classifier,
                                      MNB_classifier, BernoulliNB_classifier,
                                      LogisticRegression_classifier,
                                      Random_Forest_Classifier,
                                      Ada_Boost_Classifier)

    print("Voted classifier accuracy percent:",
          (nltk.classify.accuracy(voted_classifier, testFeatures)) * 100)

    # The voted classifier could not be pickled. Check this later!

    return trainFeatures, testFeatures
                         'Negative') for f in minus_filenum]

    threshold_fact = 0.8
    threshold_pluspts = int(threshold_fact * len(feature_pluspts))
    threshold_minuspts = int(threshold_fact * len(feature_minuspts))

    feature_training = feature_pluspts[:threshold_pluspts] + feature_minuspts[:threshold_minuspts]
    feature_testing = feature_pluspts[threshold_pluspts:] + feature_minuspts[threshold_minuspts:]
    print "\nNumber of training datapoints:", len(feature_training)
    print "Number of test datapoints:", len(feature_testing)

    # Train a Naive Bayes classifiers
    classifiers = NaiveBayesClassifier.train(feature_training)
    print "\nAccuracy of the classifiers:", nltk.classify.util.accuracy(
        classifiers, feature_testing)

    print "\nTop 10 most informative words:"
    for item in classifiers.most_informative_features()[:10]:
        print item[0]

    # Sample input reviews
    in_reviews = [
        "The Movie was amazing",
        "the movie was dull. I would never recommend it to anyone.",
        "The cinematography is pretty great in the movie",
        "The direction was horrible and the story was all over the place"
    ]
Example #57
    'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great',
    ':)'
]
negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']
neutral_vocab = [
    'movie', 'the', 'sound', 'was', 'is', 'actors', 'did', 'know', 'words',
    'not'
]

positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]

train_set = negative_features + positive_features + neutral_features

classifier = NaiveBayesClassifier.train(train_set)

# Predict
neg = 0
pos = 0
sentence = "Movie is not good, i hate this movie"
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
    classResult = classifier.classify(word_feats(word))
    print(classResult)
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1
Example #58
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews


def word_features(words):
    return dict([(word, True) for word in words])


neg_ids = movie_reviews.fileids('neg')
pos_ids = movie_reviews.fileids('pos')

neg_features = [(word_features(movie_reviews.words(fileids=[f])), 'neg')
                for f in neg_ids]
pos_features = [(word_features(movie_reviews.words(fileids=[f])), 'pos')
                for f in pos_ids]

neg_cutoff = len(neg_features) * 3 / 4
pos_cutoff = len(pos_features) * 3 / 4

training_features = neg_features[:neg_cutoff] + pos_features[:pos_cutoff]
test_features = neg_features[neg_cutoff:] + pos_features[pos_cutoff:]

print 'Training on %d instances, testing on %d instances' % (
    len(training_features), len(test_features))

classifier = NaiveBayesClassifier.train(training_features)
print 'Accuracy: ', nltk.classify.util.accuracy(classifier, test_features)
classifier.show_most_informative_features()
Example #59
def evaluate_features(feature_select):

    #All variables
    tagged_Sentences = []
    untagged_Sentences = []
    neg_sentence = []
    pos_sentence = []
    mixed_sentence = []
    neutral_sentence = []
    neg_Feautures = []
    pos_Feautures = []
    mixed_Feautures = []
    neutral_Feautures = []
    test_sentence = []
    test_Feautures = []
    allwords = []

    tempPos = []

    stopWords = stopwords.words("english")

    # Reading positive words from txt file
    fileInput = open('positive-words.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()

    for i in sentences:
        posWords = re.findall(r"^[\w']+", i)
        if posWords:
            posWords = [feature_select(posWords), '+']
            POS_Words.append(posWords)
            pos_Feautures.append(posWords)

    # Reading negative words from txt file
    fileInput = open('negative-words.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()
    for i in sentences:
        negWords = re.findall(r"^[\w']+", i)
        if negWords:
            negWords = [feature_select(negWords), '-']
            NEG_Words.append(negWords)
            neg_Feautures.append(negWords)

    #reading pre-labeled input and splitting into lines
    fileInput = open('All_Classified.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()

    for i in sentences:
        #tagged = re.findall(r"^[-=+\*]|[\w']+[/]?[\w']+[/]+[\w']+ [.,!?;]*", i)
        tagged = re.findall(
            r"^[-=+\*]|[\w']+[/]?[\w']+[/]+[^(NN|NNS|NNP|PRP)]+ [.,!?;]*", i)
        untagged = re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*|', '', ' '.join(tagged))

        untagged_Words = re.findall(r"[\w']+|[.,!?;]", untagged)
        filtered_Words = [
            w for w in untagged_Words if not w.lower() in stopWords
        ]

        if untagged and tagged:
            if untagged[0] == '-':
                neg_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '-']
                NEG_Words.append(filtered_Words)
                neg_Feautures.append(filtered_Words)

                tagged_Words = [feature_select(tagged), '-']
                NEG_Words.append(tagged_Words)
                neg_Feautures.append(tagged_Words)
                """                
                for word in filtered_Words:
                    a=dict([(word, True)])
                    Word= [a, '-'] 
                    NEG_Words.append(Word)
                    neg_Feautures.append(Word)            
                 """
            if untagged[0] == '+':
                pos_sentence.append(untagged)

                filtered_Words = [feature_select(filtered_Words), '+']
                POS_Words.append(filtered_Words)
                pos_Feautures.append(filtered_Words)

                tagged_Words = [feature_select(tagged), '+']
                POS_Words.append(tagged_Words)
                pos_Feautures.append(tagged_Words)
                """
                for word in filtered_Words:
                    a=dict([(word, True)])
                    Word= [a, '+'] 
                    #Word= [dict([(word, True)]), '+'] 
                    POS_Words.append(Word)
                    pos_Feautures.append(Word)    
                """
            if untagged[0] == '*':
                mixed_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '*']
                MIX_Words.append(filtered_Words)
                mixed_Feautures.append(filtered_Words)

            if untagged[0] == '=':
                neutral_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '=']
                NEUTRAL_Words.append(filtered_Words)
                neutral_Feautures.append(filtered_Words)
                """
                tagged_Words = [feature_select(tagged), '=']
                NEUTRAL_Words.append(tagged_Words)
                neutral_Feautures.append(tagged_Words)
                
                for word in filtered_Words:
                    a=dict([(word, True)])
                    Word= [a, '='] 
                    #Word= [dict([(word, True)]), '='] 
                    NEUTRAL_Words.append(Word)
                    neutral_Feautures.append(Word) 
               """
        tagged_Sentences.append(tagged)
        untagged_Sentences.append(untagged)

    #Read a test file and create test feutures

    fileInput = open('test_dummy.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()

    for i in sentences:
        tagged = re.findall(r"^[-=+\*]|[\w']+[/]?[\w']+[/]+[\w']+ [.,!?;]*", i)
        untagged = re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*', '', i)

        untagged_Words = re.findall(r"[\w']+|[.,!?;]", untagged)
        filtered_Words = [
            w for w in untagged_Words if not w.lower() in stopWords
        ]

        if untagged and tagged:
            if untagged[0] == '-':
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '-']
                test_Feautures.append(filtered_Words)
            if untagged[0] == '+':
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '+']
                test_Feautures.append(filtered_Words)
            if untagged[0] == '=':
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '=']
                test_Feautures.append(filtered_Words)

    #posCutoff = int(math.floor(len(pos_Feautures)*3/4))
    #negCutoff = int(math.floor(len(neg_Feautures)*3/4))
    neutralCutoff = int(math.floor(len(neutral_Feautures) * 1 / 20))

    trainFeatures = pos_Feautures + neg_Feautures + neutral_Feautures[:
                                                                      neutralCutoff]

    #test_Feautures= pos_Feautures[posCutoff:] + neg_Feautures[negCutoff:] + neutral_Feautures[neutralCutoff: 2*neutralCutoff]
    #trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(test_Feautures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    #prints metrics to show how well the feature selection did
    print 'train on %d instances, test on %d instances' % (
        len(tagged_Sentences), len(test_sentence))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test_Feautures)

    print 'pos precision:', nltk.metrics.precision(referenceSets['+'],
                                                   testSets['+'])
    print 'pos recall:', nltk.metrics.recall(referenceSets['+'], testSets['+'])
    print 'pos f-measure:', nltk.metrics.f_measure(referenceSets['+'],
                                                   testSets['+'])

    print 'neg precision:', nltk.metrics.precision(referenceSets['-'],
                                                   testSets['-'])
    print 'neg recall:', nltk.metrics.recall(referenceSets['-'], testSets['-'])
    print 'neg f-measure:', nltk.metrics.f_measure(referenceSets['-'],
                                                   testSets['-'])
    def NaiveBayesClassification(self, train_features, test_features):
        # Training and finding accuracy of NaiveBayes Classifier

        #Training
        self.nb_classifier = NaiveBayesClassifier.train(train_features)