Beispiel #1
0
def train(test=False):

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')


    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]


    if(test):
        negcutoff = len(negfeats)*3/4
        poscutoff = len(posfeats)*3/4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

        classifier = NaiveBayesClassifier.train(trainfeats)
        print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)

        classifier.show_most_informative_features()

    else:
        return NaiveBayesClassifier.train(negfeats+posfeats)
def train_and_show_results(pos, neg, pos_bigrams, neg_bigrams, pos_control, neg_control, pos_control_bigrams, neg_control_bigrams):
    if pos_control == None or neg_control == None or pos_control_bigrams == None or neg_control_bigrams == None:
        negcutoff = len(neg)*3/4
        poscutoff = len(pos)*3/4
        neg_bigrams_cutoff = len(neg_bigrams)*3/4
        pos_bigrams_cutoff = len(pos_bigrams)*3/4
        test_bag_of_words = neg[negcutoff:] + pos[poscutoff:]
        test_bigrams = neg_bigrams[neg_bigrams_cutoff:] + pos_bigrams[pos_bigrams_cutoff:]
        train_corpora_bag_of_words = neg[:negcutoff] + pos[:poscutoff]
        train_corpora_bigrams = neg_bigrams[:neg_bigrams_cutoff] + pos_bigrams[:pos_bigrams_cutoff]
    else:
        test_bag_of_words = neg_control + pos_control
        test_bigrams = neg_control_bigrams + pos_control_bigrams
        train_corpora_bag_of_words = neg+pos
        train_corpora_bigrams = neg_bigrams + pos_bigrams
    
    print "negative corpus: ", len(neg) 
    print "positive corpus: ", len(pos)

    if neg_control != None:
        print "negative test corpus: ", len(neg_control) 
        print "positive test corpus: ", len(pos_control)

    print 'bag of words and bigrams - Naive Bayes' 
    naive_bayes = NaiveBayesClassifier.train(train_corpora_bag_of_words)
    naive_bayes_bigrams = NaiveBayesClassifier.train(train_corpora_bigrams)
   
    save_dataset('naive_bayes.dat', naive_bayes)
    save_dataset('naive_bayes_bigrams.dat', naive_bayes_bigrams)
    
    print 'bag of words and bigrams - Maximum Entropy' 
    maximum_entropy = nltk.MaxentClassifier.train(train_corpora_bag_of_words, max_iter=2)
    maximum_entropy_bigrams = nltk.MaxentClassifier.train(train_corpora_bigrams, max_iter=2)
    
    save_dataset('maximum_entropy.dat', maximum_entropy)
    save_dataset('maximum_entropy_bigrams.dat', maximum_entropy_bigrams)

    print 'Naive Bayesian results'
    print 'bag of words' 
    print 'Accuracy:', nltk.classify.util.accuracy(naive_bayes, test_bag_of_words)
    naive_bayes.show_most_informative_features()  
    print_precision_recall(naive_bayes, test_bag_of_words) 


    print '\nbigrams'
    print 'Accuracy:', nltk.classify.util.accuracy(naive_bayes_bigrams, test_bigrams)
    naive_bayes_bigrams.show_most_informative_features()  
    print_precision_recall(naive_bayes_bigrams, test_bigrams) 

    print 'Maximum Entropy results'
    print 'bag of words' 
    print 'Accuracy:', nltk.classify.util.accuracy(maximum_entropy, test_bag_of_words)
    maximum_entropy.show_most_informative_features()  
    print_precision_recall(maximum_entropy, test_bag_of_words) 


    print '\nbigrams'
    print 'Accuracy:', nltk.classify.util.accuracy(maximum_entropy_bigrams, test_bigrams)
    maximum_entropy_bigrams.show_most_informative_features()  
    print_precision_recall(maximum_entropy_bigrams, test_bigrams) 
def main(argv):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    #print negids
 
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive') for f in posids]

    trainfeats =  posfeats+negfeats
    #print trainfeats
    #    break
    classifier = NaiveBayesClassifier.train(trainfeats)

    #classifier = pickle.load(open("classifier.p", "rb"))
    topicList = ["media", "sports", "news", "fashion", "finance", "politics"]
    for line in sys.stdin:
        try:
            tolk_posset = word_tokenize(line.rstrip())
            d = word_feats(tolk_posset)
            for topic in topicList:
                subjectFull = subj(line, topic)
                if not subjectFull == "No match":
                    #print d
                    print "LongValueSum:" + "" + str(line.split(":")[0])+","+subjectFull + "," + classifier.classify(d) + "\t" + "1"                    
        except:
                #print "Error"
                continue
def init():
    # create our dict of training data
    texts = {}
    texts['traffic'] = 'traffic-corpus.txt'
    texts['useless'] = 'useless-corpus.txt'

    #holds a dict of features for training our classifier
    train_set = []

    # loop through each item, grab the text, tokenize it and create a training feature with it
    for sense, file in texts.iteritems():
        print "training %s " % sense
	text = open(file, 'r').read()
        features = extract_words(text)
        train_set = train_set + [(get_feature(word), sense) for word in features]

    classifier = NaiveBayesClassifier.train(train_set)

    # uncomment out this line to see the most informative words the classifier will use
    classifier.show_most_informative_features(20)

    # uncomment out this line to see how well our accuracy is using some hand curated tweets
    # run_classifier_tests(classifier)

    return classifier
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    inposFeatures = []
    innegFeatures = []
	#http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
	#breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)
    """
    with open(RT_INPUT_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            inposWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            inposWords = [feature_select(inposWords), 'pos']
            inposFeatures.append(inposWords)
    """
    with open(RT_INPUT_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            innegWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            innegWords = [feature_select(innegWords), 'neg']
            innegFeatures.append(innegWords)
   
	#selects 3/4 of the features to be used for training and 1/4 to be used for testing
	#posCutoff = int(math.floor(len(posFeatures)*3/4))
	#negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures + negFeatures
    testFeatures = innegFeatures #+ inposFeatures
      
    	#trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)	
    
    	#initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)	

    fileOutput ={'key':[],'pos':[],'neg':[]}
	#puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        #print features , label
        referenceSets[label].add(i)
        predicted = classifier.prob_classify(features)
        print "\n"
        fileOutput['key'].append(i)
        fileOutput['pos'].append(predicted.prob("pos"))
        fileOutput['neg'].append(predicted.prob("neg"))
        #posValues =  predicted.prob("pos") 
        #negValues = predicted.prob("neg") 
        fileOutput.values()
        testSets[predicted].add(i)
        #print i
        #print testSets[predicted]
    return fileOutput
Beispiel #6
0
def train():
    """Train a Naive Bayes classifier on documents with a known sentiment.

    Documents whose sentiment is still null are first passed to
    ``get_impact``.  The documents with a sentiment are then split by id:
    roughly the first two thirds (ascending id) are used for training and
    roughly the last third (descending id) for testing.

    Returns:
        ``(classifier, accuracy)``, or ``(None, 0)`` when no document has a
        sentiment.
    """
    # Compute the impact for documents that do not have it yet.
    for pending in Document.objects.filter(sentiment__isnull=True):
        get_impact(pending, settings.TIME)

    known_data = Document.objects.filter(sentiment__isnull=False)
    known_data_count = known_data.count()
    if known_data_count == 0:
        print('known_data_count == 0')
        return None, 0

    def featurize(documents):
        # One (features, sentiment) pair per document.
        return [(word_feats(get_nltktext(doc.text)), doc.sentiment)
                for doc in documents]

    # 2/3 of the data, lowest ids first, for training.
    train_size = int(round(2 * known_data_count / 3))
    training_feats = featurize(known_data.order_by('id')[:train_size])

    classifier = NaiveBayesClassifier.train(training_feats)

    # 1/3 of the data, highest ids first, for testing.
    test_size = int(round(known_data_count / 3))
    testing_feats = featurize(known_data.order_by('-id')[:test_size])

    print('train on %d instances, test on %d instances' % (len(training_feats), len(testing_feats)))
    return classifier, nltk.classify.util.accuracy(classifier, testing_feats)
Beispiel #7
0
def classify_and_evaluate(reviews, feature_extractor=word_feats):
    random.shuffle(reviews)

    pos_reviews = filter(lambda x: x['class'] == 'POSITIVE', reviews)
    neg_reviews = filter(lambda x: x['class'] == 'NEGATIVE', reviews)

    # get unique features
    pos_features = []
    neg_features = []
    for review in pos_reviews:
        split_reviews = review['text'].split(' ')
        split_reviews = [x for x in split_reviews if x]
        pos_features.append((feature_extractor(split_reviews), 'pos'))

    for review in neg_reviews:
        split_reviews = review['text'].split(' ')
        split_reviews = [x for x in split_reviews if x]
        neg_features.append((feature_extractor(split_reviews), 'neg'))

    # divide groups
    pos_offset = int(math.floor(len(pos_reviews) * 3 / 4))
    neg_offset = int(math.floor(len(neg_reviews) * 3 / 4))

    training = pos_features[:pos_offset] + neg_features[:neg_offset]
    testing = pos_features[pos_offset:] + neg_features[neg_offset:]

    # train classifier
    classifier = NaiveBayesClassifier.train(training)

    print 'treinada em %d reviews, testada em %d reviews' % (len(training), len(testing))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testing)
    classifier.show_most_informative_features()
Beispiel #8
0
def main():
    """Annotate every review in ``output.json`` with a Naive Bayes sentiment.

    The classifier is trained on the first three quarters of each class of
    the ``movie_reviews`` corpus.  Each review gets a ``"sentiment"`` dict
    holding the positive and negative probabilities and the predicted
    label; the result is written to ``out_with_sentiment.json``.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    classifier = NaiveBayesClassifier.train(negfeats[:negcutoff] + posfeats[:poscutoff])

    with open("output.json") as fin:
        data = json.load(fin)

    for key in data:
        for review in data[key]["reviews"]:
            # Extract the features once and reuse them for both calls.
            features = word_feats(review["review"].split(" "))
            prob = classifier.prob_classify(features)
            review["sentiment"] = {
                'positive_probability': prob.prob('pos'),
                'label': classifier.classify(features),
                'negative_probability': prob.prob('neg'),
            }

    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
 
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()
Beispiel #10
0
    def train_with_movie_db(self):
        """
        Train ``self.classifier`` on the NLTK movie-review corpus.

        Sets ``self.use_movie_reviews`` to True, trains on the first three
        quarters of each class and logs the split sizes and the accuracy on
        the remaining quarter through ``DLOG``.

        - this does not yield particularly good results
        """
        self.use_movie_reviews = True

        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        negfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "negative") for f in negids]
        posfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "positive") for f in posids]

        # Floor division keeps the cutoffs integral (and usable as slice
        # bounds) on Python 3 as well as Python 2.
        negcutoff = len(negfeats) * 3 // 4
        poscutoff = len(posfeats) * 3 // 4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

        self.classifier = NaiveBayesClassifier.train(trainfeats)

        DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
        # show_most_informative_features() prints its table itself and
        # returns None, so passing its result to DLOG only logged "None".
        self.classifier.show_most_informative_features()
Beispiel #11
0
def main():
    """Train and evaluate a Naive Bayes classifier on the conversations file.

    Feature sets come from ``get_training_features``; the first three
    quarters train the classifier and the rest is used to print the
    accuracy, followed by the most informative features.
    """
    text_file = '/Users/nasrallah/Desktop/Insight/courtcast/data/supreme_court_dialogs_corpus_v1.01/supreme.conversations.txt'

    feature_sets = get_training_features(text_file)

    # NOTE: the feature sets are split in the order they are returned; no
    # shuffling is applied before the split.
    cutoff = int(len(feature_sets) * 3 / 4)
    train_feature_sets, test_feature_sets = feature_sets[:cutoff], feature_sets[cutoff:]
    split_sizes = (len(train_feature_sets), len(test_feature_sets))
    print('train on %d instances, test on %d instances' % split_sizes)

    classifier = NaiveBayesClassifier.train(train_feature_sets)
    print('accuracy:', nltk.classify.util.accuracy(classifier, test_feature_sets))
    classifier.show_most_informative_features()
    def classification(self):

        fstruct = FeatStruct(self.train_reviews)
        classifier = NaiveBayesClassifier.train(fstruct)

        print 'accuracy:', nltk.classify.util.accuracy(classifier, self.test_reviews)
        classifier.show_most_informative_features() 
Beispiel #13
0
def evaluate_classifier(featx):
    #negids = movie_reviews.fileids('neg')
    #posids = movie_reviews.fileids('pos')
    
    ##For Movie Review train:
    #negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    #posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
     
    ##For product reviews train:
    negfeats = [(featx([wrd for wrd in nltk.word_tokenize(con) if wrd not in stpwrds]), 'neg') for con in traincons]
    posfeats = [(featx([wrd for wrd in nltk.word_tokenize(pro) if wrd not in stpwrds]), 'pos') for pro in trainpros]
    
    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4
 
    trainfeats = negfeats[:] + posfeats[:]
    #trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()
    return classifier
def evaluate_classifier_Naive(featx):
    """Train Naive Bayes on the ``train`` corpus and score it on ``test``.

    ``featx`` turns the words of one file into a feature set.  The 50 most
    informative features are printed.

    Returns:
        ``['NaiveBayes', accuracy, pos precision, pos recall,
        neg precision, neg recall]``.
    """
    def labelled(corpus, label):
        # One (features, label) pair per file of the given category.
        return [(featx(corpus.words(fileids=[fid])), label)
                for fid in corpus.fileids(label)]

    training = labelled(train, 'neg') + labelled(train, 'pos')
    held_out = labelled(test, 'neg') + labelled(test, 'pos')

    model = NaiveBayesClassifier.train(training)

    # expected[label] / predicted[label]: indices of the test examples that
    # carry / were assigned that label.
    expected = collections.defaultdict(set)
    predicted = collections.defaultdict(set)
    for index, (features, gold) in enumerate(held_out):
        expected[gold].add(index)
        predicted[model.classify(features)].add(index)

    summary = [
        'NaiveBayes',
        nltk.classify.util.accuracy(model, held_out),
        nltk.metrics.precision(expected['pos'], predicted['pos']),
        nltk.metrics.recall(expected['pos'], predicted['pos']),
        nltk.metrics.precision(expected['neg'], predicted['neg']),
        nltk.metrics.recall(expected['neg'], predicted['neg']),
    ]

    model.show_most_informative_features(50)

    return summary
    def __init__(self):
        # neg_phrases = filter_negative_phrases(load_csv_sentences('thoughtsandfeelings.csv'))
        # pos_phrases = filter_positive_phrases(load_csv_sentences('spiritualforums.csv'))
        neg_file = open("neg_phrases.txt", "r")
        pos_file = open("pos_phrases.txt", "r")
        neg_phrases = neg_file.readlines()
        pos_phrases = pos_file.readlines()

        neg_phrases_tagged = []
        pos_phrases_tagged = []
        for phrase in neg_phrases:
            neg_phrases_tagged.append((word_feats(phrase.split()), 'suicidal'))
        for phrase in pos_phrases:
            pos_phrases_tagged.append((word_feats(phrase.split()), 'alright'))

        negcutoff = int(len(neg_phrases_tagged) * .8)
        poscutoff = int(len(pos_phrases_tagged) * .8)

        trainfeats = neg_phrases_tagged[:negcutoff] + pos_phrases_tagged[:poscutoff]
        testfeats = neg_phrases_tagged[negcutoff:] + pos_phrases_tagged[poscutoff:]
        print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

        self.classifier = NaiveBayesClassifier.train(trainfeats)
        print 'accuracy:', nltk.classify.util.accuracy(self.classifier, testfeats)
        self.classifier.show_most_informative_features()
    def cross_validation(self):
        """Run k-fold cross validation for the Naive Bayes and SVM classifiers.

        ``self.training_feats`` is cut into ``self.k_fold`` consecutive folds
        of equal size.  For each fold both classifiers are trained on the
        other folds and scored on it, using accuracy and the value returned
        by ``self.compute_measures``.  The four averages are logged through
        ``self.logging``.  ``self.nb_classifier`` and ``self.svm_classifier``
        are left holding the models of the last fold.
        """
        fold_size = int(int(len(self.training_feats)) / self.k_fold)
        nb_accuracies, svm_accuracies = [], []
        nb_f_values, svm_f_values = [], []

        for fold in range(self.k_fold):
            lo = fold * fold_size
            hi = lo + fold_size

            # Everything outside [lo, hi) trains; the fold itself tests.
            held_out = self.training_feats[lo:hi]
            remainder = self.training_feats[:lo] + self.training_feats[hi:]

            self.nb_classifier = NaiveBayesClassifier.train(remainder)
            nb_accuracies.append(nltk.classify.util.accuracy(self.nb_classifier, held_out))

            self.svm_classifier = SklearnClassifier(LinearSVC())
            self.svm_classifier.train(remainder)
            svm_accuracies.append(nltk.classify.util.accuracy(self.svm_classifier, held_out))

            # F-measure of each model on the held-out fold.
            nb_f_values.append(self.compute_measures(held_out, self.nb_classifier))
            svm_f_values.append(self.compute_measures(held_out, self.svm_classifier))

        def average(values):
            return float(sum(values)/len(values))

        self.logging.info('Average accuracy of Naive Bayes Classifier %s\n' % (average(nb_accuracies)))
        self.logging.info('Average accuracy of SVM Classifier %s\n' % (average(svm_accuracies)))
        self.logging.info('Average F measure of Naive Bayes Classifier %s\n' % (average(nb_f_values)))
        self.logging.info('Average F measure of SVM Classifier %s\n' % (average(svm_f_values)))
Beispiel #17
0
def naivebayes(trainfeats, testfeats):
	classifier = NaiveBayesClassifier.train(trainfeats)
	print "NaiveBayes output"
	print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

	print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
	print classifier.show_most_informative_features()
Beispiel #18
0
def evaluate_classifier(featx):
    sportsfeats = [(featx(tweet[0]), tweet[1]) for tweet in Sports_Tweet]
    politicsfeats = [(featx(tweet[0]), tweet[1]) for tweet in Politics_Tweet]
    
    sportscutoff = len(sportsfeats)*3/4
    politicscutoff = len(politicsfeats)*3/4
 
    trainfeats = sportsfeats[:sportscutoff] + politicsfeats[:politicscutoff]
    testfeats = sportsfeats[sportscutoff:] + politicsfeats[politicscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['Sports'], testsets['Sports'])
    print 'pos recall:', nltk.metrics.recall(refsets['Politics'], testsets['Politics'])
    print 'neg precision:', nltk.metrics.precision(refsets['Sports'], testsets['Sports'])
    print 'neg recall:', nltk.metrics.recall(refsets['Politics'], testsets['Politics'])
    classifier.show_most_informative_features()
    return classifier
 def classification(self):
     """Train the Naive Bayes and SVM classifiers on ``self.training_feats``.

     The trained models are stored in ``self.nb_classifier`` and
     ``self.svm_classifier``; nothing is returned.
     """
     #Training NB classifier
     self.nb_classifier = NaiveBayesClassifier.train(self.training_feats)         
     
     #Training SVM classifier (LinearSVC wrapped in SklearnClassifier)
     self.svm_classifier = SklearnClassifier(LinearSVC()) 
     self.svm_classifier.train(self.training_feats)
def main_function():
	conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'], 
			user=DATABASES['ensemble']['USER'], 
			passwd=DATABASES['ensemble']['PASSWORD'], 
			db=DATABASES['ensemble']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = classify.process_tweets(training_tweets)
	classifier = NaiveBayesClassifier.train(training_feature_set)

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	
	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	tweets = classify.get_tweets_to_classify(conn);

	for tweet in tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn)
		count_table[guess] += 1

	#fix_manual_tweets(conn_analysis)
	classify.run_sql(conn, classify.Statements.UPDATE_MANUAL_CLASSIFIED)

	print count_table
	print full_matrix
def main():
    articles = CategorizedPlaintextCorpusReader(corpusdir, '.*', cat_pattern = r'(.*)[/]')
    feats = {}
    trainfeats = []
    testfeats = []
    for cat in articles.categories():
        wow = len([f for f in articles.fileids(cat)]) # such variable name
        print "for category", cat, ":", wow
        feats[cat] = [(word_feats(articles.words(fileids = [f])), cat) for f in articles.fileids(cat)]
        cutoff = wow - hold_back(wow)
        trainfeats.append(feats[cat][:cutoff])
        testfeats.append(feats[cat][cutoff:])

    train = [item for sublist in trainfeats for item in sublist]
    test = [item for sublist in testfeats for item in sublist]

    print 'train on %d instances, test on %d instances' % (len(train), len(test))

    classifier = NaiveBayesClassifier.train(train)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test)
    classifier.show_most_informative_features() # I don't understand the output for more than 2 categories :(

    # load with:
    # import pickle
    # f = open('my_classifier.pickle')
    # classifier = pickle.load(f)
    # f.close()
    with open('../data/classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
Beispiel #22
0
def evaluateFeatures(featureSelect):
    posFeatures = []
    negFeatures = []
   
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [featureSelect(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [featureSelect(negWords), 'neg']
            negFeatures.append(negWords)

    
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    #trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    trainFeatures = posFeatures + negFeatures
    print testFeatures[0]
    classifier = NaiveBayesClassifier.train(trainFeatures)  

    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set) 

    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        #print features
        #print predicted
        testSets[predicted].add(i)  
Beispiel #23
0
    def classify(self):
        # Classify

        articles = Article.objects.filter(entity=self.entity)

        def word_feats(body):
            words = body.split(" ")
            return dict([(word, True) for word in words])

        negids = articles.filter(score__lt=0)
        posids = articles.filter(score__gt=0)

        negfeats = [(word_feats(a.body), "neg") for a in negids]
        posfeats = [(word_feats(a.body), "pos") for a in posids]

        negcutoff = len(negfeats) * 3 / 4
        poscutoff = len(posfeats) * 3 / 4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
        print "train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats))

        classifier = NaiveBayesClassifier.train(trainfeats)
        print "accuracy:", nltk.classify.util.accuracy(classifier, testfeats)
        classifier.show_most_informative_features()
Beispiel #24
0
def generate_sentiment_classifier(corpus, word_feats):
    negids = corpus.fileids('neg')
    posids = corpus.fileids('pos')
    negfeats = [(word_feats(corpus.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(corpus.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 

    classifier = NaiveBayesClassifier.train(trainfeats)

    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)



    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()

    return classifier
	def __init_naive_bayes(self):
		"""
		Train ``self.classifier`` from the positive, neutral and negative word files.

		The files ``tweets_positive.txt``, ``tweets_neutral.txt`` and
		``tweets_negative.txt`` are read from the ``sentiment_word_files``
		directory under ``sys.path[0]``.  Every lower-cased word accepted by
		``self.__check_word`` becomes one single-feature training example
		labelled 'pos', 'neu' or 'neg'.

		Raises:
			Exception: if reading the files or training fails; the message
			includes the original error.
		"""
		def load_words(filename):
			# Lower-cased, whitespace-separated words of one data file.
			path = pjoin(sys.path[0], "sentiment_word_files", filename)
			words = []
			# ``with`` closes the file; the previous ``f.close`` (no call
			# parentheses) never did.
			with codecs.open(path, mode="rU", encoding='utf-8') as f:
				for line in f:
					words.extend(line.lower().split())
			return words

		try:
			positive = load_words("tweets_positive.txt")
			neutral = load_words("tweets_neutral.txt")
			negative = load_words("tweets_negative.txt")

			posfeats = [(dict({word.lower() : True}), 'pos') for word in positive if self.__check_word(word)]
			neufeats = [(dict({word.lower() : True}), 'neu') for word in neutral if self.__check_word(word)]
			negfeats = [(dict({word.lower() : True}), 'neg') for word in negative if self.__check_word(word)]

			self.classifier = NaiveBayesClassifier.train( posfeats + neufeats + negfeats )

		except Exception as e:
			# Keep the original exception type and message prefix, but do not
			# hide the cause, and let KeyboardInterrupt/SystemExit propagate.
			raise Exception ("Unknown error in SentimentAnalyzer::__init_naive_bayes: %r" % (e,))
def create_train_classifier():
    print "Recreating training classifier"
    corpus_dir = nltk.data.find(TRAIN_DATASET_LOC)
    train_data = nltk.corpus.CategorizedPlaintextCorpusReader(corpus_dir, fileids='.*\.txt',cat_pattern="(pos|neg)")
        

    negids_train = train_data.fileids('neg')
    posids_train = train_data.fileids('pos')
        
    # negids_movies = movie_reviews.fileids('neg')
    # posids_movies = movie_reviews.fileids('pos')

    negfeats = [(__word_feats_neg(train_data.words(fileids=[f])), 'neg') for f in negids_train]
    posfeats = [(__word_feats_pos(train_data.words(fileids=[f])), 'pos') for f in posids_train]

    # negfeats.extend([(__word_feats_neg(movie_reviews.words(fileids=[f])), 'neg') for f in negids_movies])
    # posfeats.extend([(__word_feats_pos(movie_reviews.words(fileids=[f])), 'pos') for f in posids_movies])

    trainfeats = negfeats + posfeats

    classifier = NaiveBayesClassifier.train(trainfeats)
    
    pos_file_name = 'pickles'+os.sep+'positive_train.pickle'
    neg_file_name = 'pickles'+os.sep+'negative_train.pickle'
    class_file_name = 'pickles'+os.sep+'nbClassifier.pickle'
    
    __write_file(pos_file_name,cPickle.dumps(posfeats))
    __write_file(neg_file_name,cPickle.dumps(negfeats))
    __write_file(class_file_name,cPickle.dumps(classifier))
    print "Done!"
Beispiel #27
0
def main():
    """Flag users whose names look like organisation names.

    A Naive Bayes classifier is trained on the names of known orgs (label
    'org') and on the names of users not yet marked as likely orgs (label
    'user'). Every such user is then scored; users classified as 'org' with
    a log2 probability ratio of at least MIN_RATIO are saved with
    likely_org=True, unless an admin classified them as 'user'.
    """
    org_names = Org.objects.values_list('name', flat=True)
    users = User.objects.filter(likely_org=False)

    # Names that exactly match a known org are kept out of the 'user'
    # examples. This mostly matters on the first run and for new users with
    # org names.
    non_org_user_names = set(user.get_name for user in users) - set(org_names)

    org_features = [(word_features(name), 'org') for name in org_names]
    user_features = [(word_features(name), 'user') for name in non_org_user_names]
    classifier = NaiveBayesClassifier.train(user_features + org_features)

    counter = 0
    for user in users:
        prediction = classifier.prob_classify(word_features(user.get_name))
        if prediction.max() != 'org':
            continue

        # Log probability ratio, e.g. P(org) = 0.8 and P(user) = 0.1 gives
        # log2(0.8 / 0.1) = 3.0. NORMALIZING_CONST is added to both terms.
        p_org = float(prediction.prob('org')) + NORMALIZING_CONST
        p_user = float(prediction.prob('user')) + NORMALIZING_CONST
        ratio = math.log((p_org / p_user), 2)

        if ratio >= MIN_RATIO and user.likely_org == False and user.admin_classification != 'user':
            log.info('User ID %d with name "%s" is probably an org. Saving.' % (user.id, user.get_name))
            user.likely_org = True
            user.org_probability = ratio
            user.save()
            counter += 1

    log.info("Processed %d users with org-like names" % counter)
Beispiel #28
0
def classify():
    #corpus = 'Cornell_text_polarity'
    #corpus = 'BingLiu_selected_sentences'
    corpus = 'Cornell_sentence_polarity'
    cases = load_corpus(corpus)
    features = get_word_features(cases)

    train_feats = []
    test_feats = []
    for polarity, feats in features.items():
        #cutoff = len(feats) * 1 / 4
        cutoff = 1000
        print polarity, 'number of train:', cutoff
        #train_feats += feats[:cutoff]
        #test_feats += feats[cutoff:]
        temp_feats = feats[:]
        random.shuffle(temp_feats)
        train_feats += temp_feats[:cutoff]
        test_feats += temp_feats[cutoff:]

    print 'train on %d instances, test on %d instances' % (len(train_feats), len(test_feats))

    classifier = NaiveBayesClassifier.train(train_feats)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test_feats)
    classifier.show_most_informative_features()
def naiveBayes(features_train, features_test):
	print 'train on %d instances, test on %d instances' % (len(features_train), len(features_test))
	classifier = NaiveBayesClassifier.train(features_train)
	print 'accuracy:', nltk.classify.util.accuracy(classifier, features_test)
	classifier.show_most_informative_features()	
	precisions, recalls = precision_recall(classifier, features_test)
	print "accuracy: ", precisions, "fitness: ", recalls
    def train(self, graphs):
        """
        Fit a ``NaiveBayesClassifier`` on every (head, child) node pair of
        the given graphs.  A pair is labeled "T" when the child's index is
        listed in the head's ``deps`` and "F" otherwise.  The feature
        vector holds head-word, head-tag, child-word and child-tag.

        :type graphs: list(DependencyGraph)
        :param graphs: A list of dependency graphs to train the scorer.
        """

        from nltk.classify import NaiveBayesClassifier

        def edge_examples():
            # Yield one labeled example per ordered node pair of each graph.
            for graph in graphs:
                for head in graph.nodes.values():
                    for index, child in graph.nodes.items():
                        label = "T" if index in head['deps'] else "F"
                        features = {
                            'a': head['word'],
                            'b': head['tag'],
                            'c': child['word'],
                            'd': child['tag'],
                        }
                        yield features, label

        self.classifier = NaiveBayesClassifier.train(list(edge_examples()))
    neg_words.append(({neg_word.rstrip(): True}, 'negative'))

print "First 5 positive words %s " % pos_words[:5]
print "First 5 negative words %s" % neg_words[:5]

print "Number of positive words %d" % len(pos_words)

print "Number of negative words %d" % len(neg_words)

all_words_with_sentiment = pos_words + neg_words

print "Total number of words %d" % len(all_words_with_sentiment)

from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(all_words_with_sentiment)


def to_dictionary(words):
    """Return a featureset dict mapping every word in *words* to True."""
    return {word: True for word in words}


test_data = []


def predict_sentiment(text, expected_sentiment=None):
    """Classify *text* and record it as a test case.

    The text is split on whitespace and converted to a featureset. The
    featureset and *expected_sentiment* are appended to the module-level
    ``test_data`` list, and the label chosen by the module-level
    ``classifier`` is returned.
    """
    features = to_dictionary(text.split())
    predicted = classifier.classify(features)
    # Record the case only after classification succeeded.
    test_data.append([features, expected_sentiment])
    return predicted
    testFeats = None

    for category in categories:
        instancesOfEntityTrain = getInstancesOfEntity(
            category, completeTaggedSentencesTrain)
        instancesOfEntityTest = getInstancesOfEntity(
            category, completeTaggedSentencesTest)

        entityFeatsTrain = train_feats(category, instancesOfEntityTrain)
        entityFeatsTest = train_feats(category, instancesOfEntityTrain)

        if trainFeats == None:
            trainFeats = entityFeatsTrain
            testFeats = entityFeatsTest
        else:
            trainFeats += entityFeatsTrain
            testFeats += entityFeatsTest

    features = prev_next_pos_iob

    #naiveBayers
    naiveBayers = NaiveBayesClassifier.train(trainFeats)
    naiveBayersTagger = ClassifierBasedTagger(
        train=completeTaggedSentencesTrain,
        feature_detector=features,
        classifier_builder=naiveBayers)
    nerChunkerNaiveBayers = ClassifierChunker(completeTaggedSentencesTrain,
                                              naiveBayersTagger)
    evalNaiveBayers = nerChunkerNaiveBayers.evaluate2(testFeats)
    print(evalNaiveBayers)
Beispiel #33
0
 def build_classifier(self):
     """Return a NaiveBayesClassifier trained on ``self.training``."""
     return NaiveBayesClassifier.train(self.training)
Beispiel #34
0
# Share of each class used for training; the rest is held out for testing.
threshold_factor = 0.8
threshold_positive = int(len(features_positive) * threshold_factor)
threshold_negative = int(len(features_negative) * threshold_factor)

# Leading part of each class goes to training, the tail to testing.
features_train = (features_positive[:threshold_positive] +
                  features_negative[:threshold_negative])
features_test = (features_positive[threshold_positive:] +
                 features_negative[threshold_negative:])

print("Number of training datapoints: ", len(features_train))
print("Number of test datapoints: ", len(features_test))

# Train a Naive Bayes classifier and report its accuracy on the held-out data.
classifier = NaiveBayesClassifier.train(features_train)
held_out_accuracy = nltk.classify.util.accuracy(classifier, features_test)
print("\nAccuracy of NBC: ", held_out_accuracy)

print("\nTop 10 most informative words: ")
top_features = classifier.most_informative_features()[:10]
for item in top_features:
    # Each entry is indexable; position 0 is what gets printed.
    print(item[0])

# Sample input reviews
input_reviews = [
    "It is an amazing movie",
    "This is a dull movie. I would never recommend it to anyone.",
    "The cinematography is pretty great in this movie",
    "The direction was terrible and the story was all over the place",
]
    )  #Using only the contents in HTML <body> tag, avoides Javascript from being treated as text.
    words = html_data.findAll(
        text=True
    )  #setting text to True to extract only the text in the <body>
    word_list = []  #Stores the list of words
    for word in words[
            30:]:  #Removing redundant content from Instapaper Mobilizer headers
        for w in word.split(" "):  #splitting on spcae for multiword strings
            wd = (pattern.sub('', w.lower())
                  )  #substituing non alphanumeric characters with ''
            if len(wd) > 1 and not wd.isdigit():
                word_list.append(
                    wd)  #exclude strings of less than 2 characters
    filtered_words = [
        w for w in word_list if not w in nltk.corpus.stopwords.words('english')
    ]
    return filtered_words


positive_examples = [
    'http://www.engadget.com/2012/11/16/htc-droid-dna-review/',
    'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/',
    'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
    'http://www.engadget.com/2012/11/16/htc-desire-x-review/'
]
train_set = [(list(get_list_of_words_in_url), True)
             for link in positive_examples]
classifier = NaiveBayesClassifier.train(train_set)
print get_list_of_words_in_url(
    'http://www.theverge.com/2012/11/28/3699112/the-verge-year-one-our-big-stories-august-2012-through-november-2012'
)
Beispiel #36
0
def evaluate_features(feature_select):
    """Train a Naive Bayes sentiment classifier and print its test metrics.

    Training examples come from 'positive-words.txt', 'negative-words.txt'
    and the pre-labelled 'All_Classified.txt'; test examples come from
    'cs583_test_data.txt'.  Labels are the strings '1' (positive),
    '-1' (negative), '2' (mixed) and '0' (neutral).

    feature_select is called with a list of tokens; whatever it returns is
    used as the featureset of a [featureset, label] pair.

    Besides its local lists, the function appends to POS_Words, NEG_Words,
    MIX_Words and NEUTRAL_Words, which are not defined here and must exist
    at module scope.  Nothing is returned; accuracy, precision, recall and
    f-measure for the positive and negative classes are printed.
    """

    #All variables
    tagged_Sentences = []
    untagged_Sentences = []
    neg_sentence = []
    pos_sentence = []
    mixed_sentence = []
    neutral_sentence = []
    neg_Feautures = []
    pos_Feautures = []
    mixed_Feautures = []
    neutral_Feautures = []
    test_sentence = []
    test_Feautures = []
    allwords = []

    # Not used anywhere below.
    tempPos = []

    stopWords = stopwords.words("english")

    # Reading positive words from txt file
    fileInput = open('positive-words.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()

    for i in sentences:
        # Only the leading run of word characters/apostrophes of each line is used.
        posWords = re.findall(r"^[\w']+", i)
        if posWords:
            posWords = [feature_select(posWords), '1']
            POS_Words.append(posWords)
            pos_Feautures.append(posWords)

    # Reading negative words from txt file
    fileInput = open('negative-words.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()
    for i in sentences:
        negWords = re.findall(r"^[\w']+", i)
        if negWords:
            negWords = [feature_select(negWords), '-1']
            NEG_Words.append(negWords)
            neg_Feautures.append(negWords)

    # Reading pre-labeled input and splitting into lines
    fileInput = open('All_Classified.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()

    for i in sentences:
        # tagged: the leading label character plus the word/tag tokens.
        # NOTE(review): the leading [...] is a character class, so it yields
        # a single character; a line starting with "-1" gives '-' here.
        tagged = re.findall(r"^[012\(-1)]|[\w']+[/]?[\w']+[/]+[\w']+ [.,!?;]*",
                            i)
        # untagged: the line with /tag suffixes, digits and punctuation removed.
        untagged = re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*|', '', i)

        untagged_Words = re.findall(r"[\w']+|[.,!?;]", untagged)
        filtered_Words = [
            w for w in untagged_Words if not w.lower() in stopWords
        ]
        #allwords.append(', '.join(filtered_Words))

        if untagged and tagged:
            # NOTE(review): given the single-character match noted above,
            # tagged[0] == '-1' looks like it can never be true -- confirm.
            if tagged[0] == '-1':
                neg_sentence.append(untagged)
                # filtered_Words is rebound to the [featureset, label] pair.
                filtered_Words = [feature_select(filtered_Words), '-1']
                NEG_Words.append(filtered_Words)
                neg_Feautures.append(filtered_Words)

                # The raw tagged tokens are added as a second example.
                tagged_Words = [feature_select(tagged), '-1']
                NEG_Words.append(tagged_Words)
                neg_Feautures.append(tagged_Words)

                allword = ['-1', ', '.join(untagged_Words)]
                allwords.append(', '.join(allword))

            if tagged[0] == '1':
                pos_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '1']
                POS_Words.append(filtered_Words)
                pos_Feautures.append(filtered_Words)

                # The raw tagged tokens are added as a second example.
                tagged_Words = [feature_select(tagged), '1']
                POS_Words.append(tagged_Words)
                pos_Feautures.append(tagged_Words)

                allword = ['1', ', '.join(untagged_Words)]
                allwords.append(', '.join(allword))

            if tagged[0] == '2':
                mixed_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '2']
                MIX_Words.append(filtered_Words)
                # Collected, but not part of trainFeatures below.
                mixed_Feautures.append(filtered_Words)

                allword = ['2', ', '.join(untagged_Words)]
                allwords.append(', '.join(allword))

            if tagged[0] == '0':
                neutral_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '0']
                NEUTRAL_Words.append(filtered_Words)
                neutral_Feautures.append(filtered_Words)

                allword = ['0', ', '.join(untagged_Words)]
                allwords.append(', '.join(allword))

        tagged_Sentences.append(tagged)
        untagged_Sentences.append(untagged)

    # Read a test file and create test features
    # Reading pre-labeled input and splitting into lines
    fileInput = open('cs583_test_data.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()
    # Not used anywhere below.
    temp = 0
    for i in sentences:
        tagged = re.findall(
            r"^[\"012\(-1)]|[\w']+[/]?[\w']+[/]+[\w']+[.,!?;]*", i)
        #tagged = re.findall(r"^[-=+\*]|[\w']+[/]?[\w']+[/]+[^(NN|NNS|NNP|PRP)]+ [.,!?;]*", i)
        # Unlike the training loop, untagged is rebuilt from the tagged tokens.
        untagged = re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*|', '', ' '.join(tagged))
        #untagged =re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*|','',i)

        untagged_Words = re.findall(r"[\w']+|[.,!?;]", untagged)
        filtered_Words = [
            w for w in untagged_Words if not w.lower() in stopWords
        ]

        if untagged and tagged and i:
            # The label is read from the END of the line: "-1", "1", "2" or "0".
            # NOTE(review): i[-2] raises IndexError for a one-character line,
            # and these ifs are not exclusive (e.g. a line ending in "-2"
            # matches two of them) -- confirm the input never does this.
            if i[-2] == '-':
                c = '-1'
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), c]
                #NEUTRAL_Words.append(filtered_Words)
                test_Feautures.append(filtered_Words)
            if i[-1] == '1' and i[-2] != '-':
                c = '1'
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), c]
                #NEUTRAL_Words.append(filtered_Words)
                test_Feautures.append(filtered_Words)

            if i[-1] == '2':
                c = '2'
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), c]
                #NEUTRAL_Words.append(filtered_Words)
                test_Feautures.append(filtered_Words)

            if i[-1] == '0':
                c = '0'
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), c]
                #NEUTRAL_Words.append(filtered_Words)
                test_Feautures.append(filtered_Words)
    """          
    posCutoff = int(math.floor(len(pos_Feautures)*3/4))
    negCutoff = int(math.floor(len(neg_Feautures)*3/4))
    """
    # Only the first 1/20 of the neutral examples is used for training.
    neutralCutoff = int(math.floor(len(neutral_Feautures) * 1 / 20))

    # All positive and negative examples plus the neutral slice; mixed
    # examples are not included.
    trainFeatures = pos_Feautures + neg_Feautures + neutral_Feautures[:
                                                                      neutralCutoff]

    #test_Feautures= pos_Feautures[posCutoff:] + neg_Feautures[negCutoff:] + neutral_Feautures[neutralCutoff: 2*neutralCutoff]
    # Trains a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # Initiates referenceSets and testSets (label -> set of example indices)
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    # Puts each test example's index under its true label in referenceSets and under its predicted label in testSets
    for i, (features, label) in enumerate(test_Feautures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # Prints metrics to show how well the feature selection did
    # NOTE(review): the "train on" figure is the number of lines read from
    # All_Classified.txt, not len(trainFeatures).
    print 'train on %d instances, test on %d instances' % (
        len(tagged_Sentences), len(test_sentence))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test_Feautures)

    print 'pos precision:', nltk.metrics.precision(referenceSets['1'],
                                                   testSets['1'])
    print 'pos recall:', nltk.metrics.recall(referenceSets['1'], testSets['1'])
    print 'pos f-measure:', nltk.metrics.f_measure(referenceSets['1'],
                                                   testSets['1'])

    print 'neg precision:', nltk.metrics.precision(referenceSets['-1'],
                                                   testSets['-1'])
    print 'neg recall:', nltk.metrics.recall(referenceSets['-1'],
                                             testSets['-1'])
    print 'neg f-measure:', nltk.metrics.f_measure(referenceSets['-1'],
                                                   testSets['-1'])
Beispiel #37
0
# Use the first 90% of each corpus for training and the last 10% for testing.
print("Generiere Cutoff")
ingCutoff = int(len(ingFeats) * 0.9)
neutCutoff = int(len(neutFeats) * 0.9)

print(
    f'Sätze Ingvar-Korpus {len(ingFeats)}, Sätze neutraler Korpus {len(neutFeats)}'
)

# First pass: train on the 90% split to measure accuracy on held-out data.
print("Trainiere Classifier mit Kontrollmenge")
trainfeats = ingFeats[:ingCutoff] + neutFeats[:neutCutoff]
testfeats = ingFeats[ingCutoff:] + neutFeats[neutCutoff:]
print('Trainiere mit %d Features, Teste mit %d Features' %
      (len(trainfeats), len(testfeats)))

classifierTrain = NaiveBayesClassifier.train(trainfeats)
print('Genauigkeit:', nltk.classify.util.accuracy(classifierTrain, testfeats))
classifierTrain.show_most_informative_features()

# Second pass: train on all data; this is the classifier that gets saved.
print("Trainiere Classifier zum Weiterverwenden")
mainFeats = ingFeats + neutFeats
classifier = NaiveBayesClassifier.train(mainFeats)

# The with-block closes the file on exit, so no explicit close() is needed.
with open('SentimentAnalysisClassifier.pickle', 'wb') as f:
    pickle.dump(classifier, f, protocol=2)
"""
test_sentence = ''

while test_sentence.lower() is not 'stop':
Beispiel #38
0
    positive_ids = movie_reviews.fileids('pos')
""" 
Separate positive features from negative
"""
negative_features = [(extract(movie_reviews.words(fileids=[f])), 'neg')
                     for f in negative_ids]
positive_features = [(extract(movie_reviews.words(fileids=[f])), 'pos')
                     for f in positive_ids]

# Train on the first 3/4 of each class and test on the remaining 1/4.
negative_cutoff = int(len(negative_features) * 3 / 4)
positive_cutoff = int(len(positive_features) * 3 / 4)

train_features = (negative_features[:negative_cutoff] +
                  positive_features[:positive_cutoff])
test_features = (negative_features[negative_cutoff:] +
                 positive_features[positive_cutoff:])

print('Training on %d instances, testing on %d instances' %
      (len(train_features), len(test_features)))
classifier = NaiveBayesClassifier.train(train_features)
print('Training complete')
print('accuracy:', nltk.classify.util.accuracy(classifier, test_features))
classifier.show_most_informative_features()

# Save the classifier. The with-block closes the file even if pickling
# fails, which the previous open()/close() pair did not.
with open('classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)
Beispiel #39
0
# Collect the lower-cased words of both corpora, positive first.
for word_source in (short_pos_words, short_neg_words):
    for w in word_source:
        all_words.append(w.lower())

print('3' * 80)
# Replace the raw word list by its frequency distribution; the distinct
# words become the feature vocabulary.
all_words = nltk.FreqDist(all_words)
word_features = list(all_words)

print('4' * 80)


def find_features(document):
    """Return the presence features of *document*.

    The document is tokenized with word_tokenize; the result maps every
    word in the module-level word_features list to True if that word is
    one of the document's tokens and to False otherwise.
    """
    # A set gives constant-time membership tests; with a token list every
    # vocabulary word triggered a scan over the whole document.
    words = set(word_tokenize(document))
    return {w: (w in words) for w in word_features}


# One (featureset, category) pair per labelled document; all of them are
# used for training.
featuresets = [
    (find_features(rev), category)
    for rev, category in documents
]
training_set = featuresets
print('5' * 80)
model = NaiveBayesClassifier.train(training_set)
print('6' * 80)

# Persist the trained model.
Pkl_Filename = "Pickle_RL_Model.pkl"
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(model, file)
def classifyReviews():
    """Perform sentiment classification on movie reviews.

    Reads data/movieReviews.csv, keeps the reviews rated 4 (positive) and
    0 (negative), trains a Naive Bayes classifier on 80% of each group and
    prints: the accuracy on the remaining 20%, the most informative
    features, the share of all positive / all negative reviews that are
    classified correctly, and the misclassified reviews of each group.
    """
    data = pd.read_csv("data/movieReviews.csv")

    # Lists of review strings; only the very positive and very negative
    # reviews are used.
    positive = getReviews(data, 4)
    negative = getReviews(data, 0)

    # Split each group into training and testing texts.
    (posTrainText, posTestText) = splitTrainTest(positive, 0.8)
    (negTrainText, negTestText) = splitTrainTest(negative, 0.8)

    # Format the texts for the classifier and build both sets.
    training = (formatForClassifier(posTrainText, 'pos') +
                formatForClassifier(negTrainText, 'neg'))
    test = (formatForClassifier(posTestText, 'pos') +
            formatForClassifier(negTestText, 'neg'))

    classifier = NaiveBayesClassifier.train(training)

    print("Accuracy of the classifier is: " + str(accuracy(classifier, test)))
    classifier.show_most_informative_features()

    def tally(reviews, expected, opposite):
        # Classify each review exactly once; count the correct ones and
        # collect the ones that received the opposite label.
        hits = 0
        misses = []
        for review in reviews:
            predicted = classifier.classify(format_sentence(review))
            if predicted == expected:
                hits += 1
            elif predicted == opposite:
                misses.append(review)
        return hits, misses

    def share(hits, total):
        # An empty group has no defined accuracy; report nan instead of
        # raising ZeroDivisionError.
        return hits / total if total else float("nan")

    # Note: these figures cover ALL reviews of a group, including the ones
    # the classifier was trained on.
    numPos, wrongPosList = tally(positive, "pos", "neg")
    numNeg, wrongNegList = tally(negative, "neg", "pos")

    print("Accuracy of Positive: " + str(share(numPos, len(positive))))
    print("Accuracy of Negative: " + str(share(numNeg, len(negative))))

    print("Misclassified Positive Reviews: " + str(wrongPosList))
    print("Misclassified Negative Reviews: " + str(wrongNegList))
def train(trainfeats, testfeats, dataset, nlt=True, skl=True, most=0):
    """Train and evaluate NLTK and/or scikit-learn classifiers.

    Args:
        trainfeats: labelled featuresets used for training.
        testfeats: labelled featuresets used for evaluation.
        dataset: passed through unchanged to get_precision.
        nlt: when true, train NLTK's NaiveBayesClassifier and report its
            accuracy plus precision/recall for the 'pos' and 'neg' labels.
        skl: when true, train five scikit-learn classifiers, combine them
            in a VoteClassifier and hand that to get_precision.
        most: number of features shown by show_most_informative_features.

    Returns:
        (nltk_output, sklearn_output).  nltk_output is "none" when nlt is
        false, otherwise the line
        "nlt, <accuracy>, <pos prec>, <neg prec>, <pos rec>, <neg rec>\\n"
        with percentages rounded to one decimal; a metric NLTK could not
        compute appears as "None".  sklearn_output is "none" when skl is
        false and "" otherwise.
    """
    nltk_output = "none"
    sklearn_output = "none"

    if nlt:
        my_classifier = NaiveBayesClassifier.train(trainfeats)

        # label -> indices of test examples with that true / predicted label
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            testsets[my_classifier.classify(feats)].add(i)

        def as_percent(score):
            # nltk's precision()/recall() return None when the relevant set
            # is empty; keep None instead of crashing on `None * 100`.
            return None if score is None else round(score * 100, 1)

        accuracy = as_percent(
            nltk.classify.util.accuracy(my_classifier, testfeats))
        pos_prec = as_percent(precision(refsets['pos'], testsets['pos']))
        pos_rec = as_percent(recall(refsets['pos'], testsets['pos']))
        neg_prec = as_percent(precision(refsets['neg'], testsets['neg']))
        neg_rec = as_percent(recall(refsets['neg'], testsets['neg']))

        my_classifier.show_most_informative_features(most)

        # Column order: accuracy, both precisions, then both recalls.
        columns = (accuracy, pos_prec, neg_prec, pos_rec, neg_rec)
        nltk_output = "nlt, " + ", ".join(str(c) for c in columns) + "\n"

    if skl:

        def fit(estimator):
            # Wrap a scikit-learn estimator for NLTK, turn off feature
            # sorting in its vectorizer and train it.
            wrapped = SklearnClassifier(estimator)
            wrapped._vectorizer.sort = False
            wrapped.train(trainfeats)
            return wrapped

        MNB_classifier = fit(MultinomialNB())
        BernoulliNB_classifier = fit(BernoulliNB())
        LogisticRegression_classifier = fit(LogisticRegression())
        LinearSVC_classifier = fit(LinearSVC())
        NuSVC_classifier = fit(NuSVC())

        # Only the combined voting classifier is evaluated.
        voted_classifier = VoteClassifier(NuSVC_classifier,
                                          LinearSVC_classifier, MNB_classifier,
                                          BernoulliNB_classifier,
                                          LogisticRegression_classifier)
        get_precision(trainfeats, testfeats, voted_classifier, dataset,
                      "voted")

        # No per-classifier summary line is produced for the sklearn run.
        sklearn_output = ""

    return (nltk_output, sklearn_output)
Beispiel #42
0
        pos.append([format_sentence(i), 'pos'])

# Negative examples: one formatted sentence per line of rt.neg.
with open("rt.neg", encoding="utf8") as f:
    neg = [[format_sentence(line), 'neg'] for line in f]

# Next, split the labeled data: the first 80% of each class is used for
# training, the rest for testing.
pos_cutoff = int((.8) * len(pos))
neg_cutoff = int((.8) * len(neg))
training = pos[:pos_cutoff] + neg[:neg_cutoff]
test = pos[pos_cutoff:] + neg[neg_cutoff:]

# A second test data set based on tweets.
test_2 = []
with open("pos_tweets.txt", encoding="utf8") as f:
    test_2.extend([format_sentence(line), 'pos'] for line in f)

with open("neg_tweets.txt", encoding="utf8") as f:
    test_2.extend([format_sentence(line), 'neg'] for line in f)

classifier = NaiveBayesClassifier.train(training)
classifier.show_most_informative_features()
print("accuracy = " + str(accuracy(classifier, test)))
print("accuracy 2 = " + str(accuracy(classifier, test_2)))

# Training a model every time we run the program is inefficient, so the
# model is pickled for future use. The with-block closes the file even if
# pickling fails.
with open("classifier.pickle", "wb") as classifier_file:
    pickle.dump(classifier, classifier_file)
def main(twtInfo: object):
    """Label the tweets in ``twtInfo`` as "pos" or "neg" and return them as JSON.

    ``twtInfo`` is passed to ``pd.read_json(..., orient="records")``; the
    resulting frame's "text" and "id" columns are used.  A Naive Bayes
    classifier is trained on NLTK's ``twitter_samples`` positive/negative
    tweets (cleaned, tokenized, stemmed and turned into bag-of-words
    features) and then applied to every incoming tweet.

    Returns a JSON string (``orient="records"``) holding one object per
    tweet with the keys "text", "id" and "sentiment".
    """
    incoming = pd.read_json(twtInfo, orient="records")
    tweets = incoming["text"]
    data_id = incoming["id"]

    # Labeled sample tweets that serve as training data.
    nltk.download("twitter_samples")
    raw_positive = twitter_samples.strings("positive_tweets.json")
    raw_negative = twitter_samples.strings("negative_tweets.json")
    clean_pos_tweets = [cleanTweet(raw) for raw in raw_positive]
    clean_neg_tweets = [cleanTweet(raw) for raw in raw_negative]

    # Stopword corpus (i.e. "the", "did", "?").
    # TODO: check if nltk.stopwords is already downloaded and if it is, then skip
    nltk.download("stopwords")
    # Word tokenizer trained on English.
    # TODO: check if nltk.punkt is already downloaded and if it is, then skip
    nltk.download("punkt")

    # tc = tokenized & cleaned
    pos_tc_tweets = tokenizeTweets(clean_pos_tweets)
    neg_tc_tweets = tokenizeTweets(clean_neg_tweets)

    # tcs = tokenized, cleaned & stemmed; stemming normalizes text,
    # i.e. "waited", "waits", "waiting" -> "wait".
    pos_tcs_tweets = stemTweets(pos_tc_tweets)
    neg_tcs_tweets = stemTweets(neg_tc_tweets)

    # TODO: possible bias location, we are only separating into
    # positive/negative sentiment and not neutral
    # Bag-of-words features paired with the sentiment label of their corpus.
    pos_bow = [(buildBowFeatures(stemmed), "pos") for stemmed in pos_tcs_tweets]
    neg_bow = [(buildBowFeatures(stemmed), "neg") for stemmed in neg_tcs_tweets]
    data_bow = [buildBowFeatures(text) for text in tweets]

    # TODO: potential new tool would involve different ML classifier
    # All labeled samples are used for training; nothing is held out.
    train_bow = pos_bow + neg_bow
    shuffle(train_bow)
    sentiment_classifier = NaiveBayesClassifier.train(train_bow)

    preds = [sentiment_classifier.classify(bow) for bow in data_bow]

    # TODO: figure out what to return
    ret = [
        {"text": tweets[i], "id": data_id[i], "sentiment": label}
        for i, label in enumerate(preds)
    ]
    return pd.Series(ret).to_json(orient="records")
Beispiel #44
0
            # count term frequency
            terms_all = [term for term in words if term not in stop]
            word_tokens = word_feats(terms_all)
            bi_tokens = bigram_word_feats(terms_all)
            best_tokens200 = best_feats(terms_all,bestwords)
            count_all.update(terms_all)
            sentence = " ".join(str(term) for term in terms_all)
            #print ('label the features')
            with open('train-labels.txt','r') as f2:
                for line2 in f2:
                    line2 = line2.strip('\n')
                    if id == line2.split('\t')[0]:
                        train_data.append([bi_tokens, line2.split('\t')[1]])

        print ('train the model')
        classifier = NaiveBayesClassifier.train(train_data)
#       classifier.show_most_informative_features(20)
    with open("dev-tweets.txt",'r')as dev:
        print ('preprocess the dev ')
        test_data = []
        for line in dev:
            line = line.strip('\n')
            id2 = line.split('\t')[0]
            text = line.split('\t')[1]
            text = preprocess.processAll(text)

            words = [word if (word[0:2] == '__') else word.lower() for word in text.split() if len(word) >= 3]
            words = [word for word in words if word[0:2] != '__']
            words = [stemmer.stem(w) for w in words]  
            words = [lemmatizer.lemmatize(w) for w in words]
Beispiel #45
0
def _tokenize_vocab_file(path):
    """Read ``path``, replace commas with spaces and return its NLTK word tokens."""
    with open(path, 'r') as handle:
        return nltk.word_tokenize(handle.read().replace(',', ' '))


def returnLastValue():
    """Train five classifiers on four vocabulary files and print their verdicts.

    Every token of Actor.txt, Plot.txt, Theme.txt and Other.txt becomes one
    training sample labeled 'actor', 'plot', 'theme' or 'other'.  Naive
    Bayes, logistic regression, linear SVC, random forest and decision tree
    classifiers are trained on that set, then each classifies every word of
    the hard-coded sentence "good actor".  Per classifier, the label with
    the largest share of words wins (ties go to the earlier label in the
    order actor, plot, theme, other).

    Prints the per-classifier winners, the overall "Normal Result" (the
    mode of the winners, falling back to the entry with the highest value in
    the dict filled by ``addToVotes``) and the "Hybrid Result" of a
    ``VoteClassifier`` built from the same five classifiers.  Returns None.
    """
    labels = ('actor', 'plot', 'theme', 'other')
    sources = (
        ('actor', 'Actor.txt'),
        ('plot', 'Plot.txt'),
        ('theme', 'Theme.txt'),
        ('other', 'Other.txt'),
    )

    # Read every vocabulary first, then build features, in label order.
    vocabularies = [(label, _tokenize_vocab_file(path))
                    for label, path in sources]
    train_set = [(word_feats(token), label)
                 for label, tokens in vocabularies
                 for token in tokens]

    # Classifier order matters: it is the order of classifierResult and of
    # the VoteClassifier arguments below.
    classifiers = [NaiveBayesClassifier.train(train_set)]
    for estimator in (LogisticRegression, LinearSVC,
                      RandomForestClassifier, DecisionTreeClassifier):
        wrapped = SklearnClassifier(estimator())
        wrapped.train(train_set)
        classifiers.append(wrapped)

    line = "good actor"
    sentence = "".join(line).lower()
    words = sentence.split(' ')

    # One label -> word-count tally per classifier; labels outside the four
    # known ones are ignored.
    tallies = [dict.fromkeys(labels, 0) for _ in classifiers]
    for word in words:
        for classifier, tally in zip(classifiers, tallies):
            predicted = classifier.classify(word_feats(word))
            if predicted in tally:
                tally[predicted] += 1

    classifierReswithVotes = {}
    classifierResult = []
    for tally in tallies:
        stats = {label: float(count) / len(words)
                 for label, count in tally.items()}
        winner = max(stats.items(), key=operator.itemgetter(1))[0]
        classifierResult.append(winner)
        addToVotes(winner, stats.pop(winner), classifierReswithVotes)

    print(str(classifierResult))

    try:
        normalRes = mode(classifierResult)
    except Exception:
        # No usable mode: fall back to the highest-valued entry recorded by
        # addToVotes.
        normalRes = max(classifierReswithVotes.items(),
                        key=operator.itemgetter(1))[0]

    hybrid_classifier = VoteClassifier(*classifiers)
    print("sentence :" + sentence)
    hybridRes = hybrid_classifier.classifyAll(sentence)

    print("Normal Result", normalRes)
    print("Hybrid Result", hybridRes)

    print(
        "------------------------------------------------------------------------------"
    )
Beispiel #46
0
pos = []

with open("basic_positive.csv", "r") as reader:
    for line in reader:
        pos.append(line)
neg = []
with open("basic_negative.csv", "r") as reader:
    for line in reader:
        neg.append(line)

positive_feature = [(format_sentence(pos_term), "pos") for pos_term in pos]
negative_feature = [(format_sentence(neg_term), "neg") for neg_term in neg]

train_test = positive_feature + negative_feature
classifier = NaiveBayesClassifier.train(train_test)

pos = 0
neg = 0
pos_line = 0
neg_line = 0
counter = 0
count_line = 0
#try to save the pos and neg comments
#this can be helpful to perform a further machine learning test
pos_file = open("positive_comments.csv", "w")
neg_file = open("negative_comments.csv", "w")

with open("dataset.csv", "r") as reader:
    for line in reader:
        count_line += 1  #this is the single comment
Beispiel #47
0
           df.iloc[idx_test:,:]

# Split the frame; the first element returned is used as the test part
# and the second as the training part.
test_df, training_df = pd_train_test_split(
    data_df, test_size=0.2, randomstate=123)

# (feature dict, sentiment) pairs in the shape the NLTK classifiers expect.
train_Xy = list(zip(training_df['dict_features'].tolist(),
                    training_df['sentiment'].tolist()))

test_Xy = list(zip(test_df['dict_features'].tolist(),
                   test_df['sentiment'].tolist()))


# training
print(" ** training ")

NB_nltk_clf = NaiveBayesClassifier.train(train_Xy)

# The MaxEnt model is reachable under both names.
MaxEnt_nltk_clf = MaxentClassifier.train(train_Xy, max_iter=10)
classifier = MaxEnt_nltk_clf

NB_nltk_clf.show_most_informative_features(10)
MaxEnt_nltk_clf.show_most_informative_features(10)

# test and report

# support functions
def classifier_predict(clf, testXy):
    """Classify every sample of ``testXy`` with ``clf``.

    Each sample is indexable: position 0 holds the features handed to
    ``clf.classify`` and position 1 the reference label.

    Returns a pair of numpy arrays ``(predictions, labels)`` in sample order.
    """
    predicted = []
    expected = []
    for sample in testXy:
        predicted.append(clf.classify(sample[0]))
        expected.append(sample[1])
    return np.array(predicted), np.array(expected)
Beispiel #48
0
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
import pickle


def formatar_sentenca(sentenca):
    """Return a bag-of-words feature dict for ``sentenca``.

    Every distinct token produced by ``word_tokenize`` becomes a key whose
    value is ``True``.
    """
    palavras = word_tokenize(sentenca)
    return dict.fromkeys(palavras, True)


# Read both corpora as raw bytes, one sentence per line.  The context
# managers close the files even if reading fails.
with open('corpus_positivo.txt', 'rb') as f_pos:
    positivos = f_pos.read().splitlines()
with open('corpus_negativo.txt', 'rb') as f_neg:
    negativos = f_neg.read().splitlines()

# Training samples are [features, label] pairs: positives first, then
# negatives.  Each line is decoded as UTF-8 and lower-cased before
# tokenization.
dados_treinamento = [
    [formatar_sentenca(positivo.decode("utf8").lower()), "positivo"]
    for positivo in positivos
]
dados_treinamento += [
    [formatar_sentenca(negativo.decode("utf8").lower()), "negativo"]
    for negativo in negativos
]

modelo = NaiveBayesClassifier.train(dados_treinamento)

# Persist the trained model; pickle.dump returns None, so there is nothing
# worth keeping from the call itself.
with open('modelo.obj', 'wb') as f:
    pickle.dump(modelo, f)
    print('Modelo classificador treinado e armazenado em modelo.obj')
    def do_validation(self):
        # each fold is a list of body ids.
        folds, hold_out = kfold_split(self.dataset, n_folds=10)
        #  fold_stances is a dict. keys are fold number (e.g. 0-9). hold_out_stances is list
        fold_stances, hold_out_stances = get_stances_for_folds(
            self.dataset, folds, hold_out)

        labeled_feat_dict = {}

        print "Generating features for each fold"
        for fold_id in fold_stances:
            print "Generating features for fold ", fold_id
            bodies = folds[fold_id]
            stances = fold_stances[fold_id]

            fold_avg_sims, fold_max_sims = JaccardGenerator().gen_jaccard_sims(
                self.dataset, bodies, stances)

            labeled_feature_set = []
            for i in range(len(stances)):
                labeled_feature = ({
                    'avg_sims': fold_avg_sims[i],
                    'max_sims': fold_max_sims[i]
                }, self._process_stance(stances[i]['Stance']))
                labeled_feature_set.append(labeled_feature)

            labeled_feat_dict[fold_id] = labeled_feature_set

        print "Generating features for hold out fold"
        holdout_avg_sims, holdout_max_sims = JaccardGenerator(
        ).gen_jaccard_sims(self.dataset, hold_out, hold_out_stances)

        h_unlabeled_features = []
        h_labels = []
        for i in range(len(hold_out_stances)):
            unlabeled_feature = {
                'avg_sims': holdout_avg_sims[i],
                'max_sims': holdout_max_sims[i]
            }
            label = self._process_stance(hold_out_stances[i]['Stance'])

            h_unlabeled_features.append(unlabeled_feature)
            h_labels.append(label)

        fold_accuracy = {}
        best_fold_accuracy = 0.0
        classifiers = []

        print "Validating using each fold as testing set"
        for fold_id in fold_stances:
            fold_ids = list(range(len(folds)))
            del fold_ids[fold_id]  # deleted fold is test set for this run

            # training set is every fold except for the testing fold (fold_id)
            training_set = [
                feat for fid in fold_ids for feat in labeled_feat_dict[fid]
            ]

            testing_set = []
            testing_labels = []

            # testing set is just the testing fold (fold_id)
            for feat, label in labeled_feat_dict[fold_id]:
                testing_set.append(feat)
                testing_labels.append(label)

            classifier = NaiveBayesClassifier.train(training_set)
            classifiers.append(classifier)
            pred = classifier.classify_many(testing_set)

            accuracy = self._score(pred, testing_labels)
            print "Fold ", fold_id, "accuracy: ", accuracy
            if accuracy > best_fold_accuracy:
                best_fold_accuracy = accuracy
                best_fold_cls = classifier

        h_res = best_fold_cls.classify_many(h_unlabeled_features)
        print 'holdout score:', self._score(h_res, h_labels)
Beispiel #50
0
def getClassifier(tweetfile, cfg):
    degreesToUse = cfg['NLPnGrams']
    print "DEBOOOOO", degreesToUse, type(degreesToUse)
    classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ')
    shortClass = classMode.replace(' ', '').lower()
    loadNeeded = True

    if 'NLPTEST' not in cfg.keys():
        degreeString = '-'.join([str(degree) for degree in degreesToUse])
        pickleFile = 'nlpTrainers/' + tweetfile.replace(
            '.csv', '.' + shortClass + degreeString + '.pickle')
        if isfile(pickleFile):
            print "Loading pickled", shortClass, "classifier"
            fileIn = open(pickleFile)
            classifier = cPickle.load(fileIn)
            fileIn.close()
            loadNeeded = False

    if loadNeeded:
        if 'NLPTEST' in cfg.keys():
            content = prepText(tweetfile)
            categorized = prepClassifications(content)
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
        else:
            print "Loading content & preparing text"
            content = prepText(loadFile(tweetfile))
            print "Categorizing contents"
            categorized = prepClassifications(content)
            print "Deriving NGrams of length(s)", degreesToUse
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
            print "Compiling Results"
        readyToSend = []
        allCats = [str(key) for key in NGrammized.keys()]
        for category in allCats:
            readyToSend += NGrammized[category]

        print "Attempting Classification by mode", classMode, degreesToUse
        if classMode == 'naive bayes':
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }
        elif classMode == 'positive naive bayes':
            from nltk.classify import PositiveNaiveBayesClassifier
            classifier = {
                'class': PositiveNaiveBayesClassifier.train(readyToSend),
                'mode': 'pnb'
            }
        elif classMode == 'max ent':
            #import nltk.classify
            #from sklearn.linear_model import LogisticRegression
            #from nltk.classify import SklearnClassifier
            #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'}
            from nltk.classify import MaxentClassifier
            classifier = {
                'class': MaxentClassifier.train(readyToSend, algorithm='iis'),
                'mode': 'me'
            }
        elif classMode == 'decision tree':
            from nltk.classify import DecisionTreeClassifier
            classifier = {
                'class': DecisionTreeClassifier.train(readyToSend),
                'mode': 'dt'
            }
        elif classMode == 'svm':
            if "SVMOrder" in cfg.keys():
                priority = cfg['SVMOrder']
            else:
                priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210"
            if type(priority) is str:
                priority = list(priority)
            priority = [entry for entry in priority if entry in allCats]
            preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg)
            classifier = {
                'class': preppedSVM,
                'mode': 'svm',
                'priority': priority
            }
        else:
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }

        if 'NLPTEST' not in cfg.keys():
            print "Pickling Classifier"
            fileOut = open(pickleFile, 'wb')
            cPickle.dump(classifier, fileOut)
            fileOut.close()

    if 'NLPTEST' not in cfg.keys():
        if classMode != 'svm':
            classifier['class'].show_most_informative_features(n=150)
        """else:
		for key in classifier['class'].keys():
			print classifier		
			print classifier.keys()
			classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))"""

    return classifier
Beispiel #51
0
    fileName = sys.argv[1]
    # Loads training data from input file
    load_data(fileName)

    # Then build feature vectors for both negative and positive tweets
    negfeats = get_feature_vec(negtweets, 'neg')
    posfeats = get_feature_vec(postweets, 'pos')

    all_feats = negfeats + posfeats
    random.shuffle(all_feats)

    print 'TextBlog accuracy on training data:', textblob_acc(
        postweets, negtweets)
    result = cross_validation(all_feats)
    print 'Naiave Bayes cross validation accuracy:', result[0]
    print 'MaxEnt cross validation accuracy:', result[1]

    pos1feats = get_feature_vec(pos1, 'pos')
    neg1feats = get_feature_vec(neg1, 'neg')

    all1_feats = neg1feats + pos1feats

    print 'TextBlog accuracy on manual dataset:', textblob_acc(pos1, neg1)
    bayes = NaiveBayesClassifier.train(all_feats)
    print 'Naiave Bayes accuracy on manual dataset:', nltk.classify.util.accuracy(
        bayes, all1_feats)
    maxent = nltk.MaxentClassifier.train(
        all_feats, nltk.classify.MaxentClassifier.ALGORITHMS[0], max_iter=1)
    print 'MaxEnt accuracy on manual dataset:', nltk.classify.util.accuracy(
        maxent, all1_feats)
def _load_or_build_pickle(path, build):
    """Return the object pickled at ``path``; when the file is missing, call
    ``build()``, pickle its result to ``path`` and return it."""
    if os.path.exists(path):
        # NOTE: unpickling runs whatever the file encodes; these cache files
        # must only ever be ones written by this program.
        with open(path, 'rb') as f:
            return pickle.load(f)
    value = build()
    with open(path, 'wb') as f:
        pickle.dump(value, f, protocol=pickle.HIGHEST_PROTOCOL)
    return value


def _read_latin1_lines(path):
    """Return the content of ``path`` (latin-1) split on newline characters."""
    with open(path, 'r', encoding='latin-1') as f:
        return f.read().split('\n')


def training():
    """Load or train the comment-sentiment classifier and print its accuracy.

    Three artefacts are cached as pickles under ``CommentSentimentData/``
    and only rebuilt when their file is missing:

    * ``dataset.pkl``    - positive/negative sentences taken from
      ``training.1600000.csv`` (column 0 equal to '4' or '0', text in
      column 5),
    * ``bestwords.pkl``  - result of ``find_best_words(pos, neg, 2000)``,
    * ``classifier.pkl`` - Naive Bayes classifier trained on all sentences.

    The classifier is then evaluated on ``positive.txt`` / ``negative.txt``
    (one sample per line) and the accuracy, as a percentage, is printed.
    Returns None.
    """

    def build_dataset():
        rows = fileload('CommentSentimentData/training.1600000.csv')
        return {
            'pos': [sen[5] for sen in rows if sen[0] == '4'],
            'neg': [sen[5] for sen in rows if sen[0] == '0'],
        }

    dataset = _load_or_build_pickle('CommentSentimentData/dataset.pkl',
                                    build_dataset)
    pos_sen = dataset['pos']
    neg_sen = dataset['neg']

    best_words = _load_or_build_pickle(
        'CommentSentimentData/bestwords.pkl',
        lambda: find_best_words(pos_sen, neg_sen, 2000))

    def build_classifier():
        # Extracting features for the whole corpus is the expensive step,
        # so it only happens when a classifier really has to be trained.
        prev = [(features(words, best_words), 'positive')
                for words in pos_sen]
        nrev = [(features(words, best_words), 'negative')
                for words in neg_sen]
        return NaiveBayesClassifier.train(prev + nrev)

    real_classifier = _load_or_build_pickle(
        'CommentSentimentData/classifier.pkl', build_classifier)

    # TO TEST ACCURACY OF CLASSIFIER UNCOMMENT THE CODE BELOW
    # ACCURACY : 78.1695423855964

    # ncutoff = int(len(nrev) * 3 / 4)
    # pcutoff = int(len(prev) * 3 / 4)
    # train_set = nrev[:ncutoff] + prev[:pcutoff]
    # test_set = nrev[ncutoff:] + prev[pcutoff:]
    # # test_classifier = NaiveBayesClassifier.train(train_set)
    # test_classifier = SklearnClassifier(BernoulliNB()).train(train_set)

    # Evaluation samples: one per line of each file.
    pos_lines = _read_latin1_lines("CommentSentimentData/positive.txt")
    neg_lines = _read_latin1_lines("CommentSentimentData/negative.txt")
    prev = [(features(words, best_words), 'positive') for words in pos_lines]
    nrev = [(features(words, best_words), 'negative') for words in neg_lines]
    test_set = nrev + prev

    print("Accuracy is : ", util.accuracy(real_classifier, test_set) * 100)
def author_beng_nbc():
    #1st Set
    bankc = open("/python27/Bankim500_1.txt", "r").read()
    bankw = bankc.split()
    bankz = reduce(concat, [['bankim', x] for x in bankw[1:]], bankw[0:1])
    #print a3
    it = iter(bankz)
    bankt = zip(it, it)
    #print a4
    #2nd Set
    bibhuc = open("/python27/Bibhuti500_1.txt", "r").read()
    bibhuw = bibhuc.split()
    bibhuz = reduce(concat, [['bibhuti', x] for x in bibhuw[1:]], bibhuw[0:1])
    #print b3
    it1 = iter(bibhuz)
    bibhut = zip(it1, it1)
    #print b4
    #3rd Set
    rabindrac = open("/python27/Rabindra500_1.txt", "r").read()
    rabindraw = rabindrac.split()
    rabindraz = reduce(concat, [['rabindra', x] for x in rabindraw[1:]],
                       rabindraw[0:1])
    #print a3
    it2 = iter(rabindraz)
    rabindrat = zip(it2, it2)
    #4th Set
    saratc = open("/python27/Sarat500_1.txt", "r").read()
    saratw = saratc.split()
    saratz = reduce(concat, [['sarat', x] for x in saratw[1:]], saratw[0:1])
    #print a3
    it3 = iter(saratz)
    saratt = zip(it3, it3)
    add1 = bankt + bibhut + rabindrat + saratt
    #print c1
    training_data = add1
    vocabulary = set(
        chain(*[word_tokenize(i[0].lower()) for i in training_data]))
    feature_set = [
        ({i: (i in word_tokenize(sentence.lower()))
          for i in vocabulary}, tag) for sentence, tag in training_data
    ]
    #print "###",feature_set
    from nltk.classify import NaiveBayesClassifier as nbc
    train_set, test_set = feature_set[:300], feature_set[300:]
    print len(train_set)
    print len(test_set)
    classifier = nbc.train(train_set)
    test_sentence = "আলীপুরের উকিল বিশেষ কিছু হয় বলিয়া মনে হয় না বালিগঞ্জের ওদিকে কোথায় একটা টিউশনি আছে"
    featurized_test_sentence = {
        i: (i in word_tokenize(test_sentence.lower()))
        for i in vocabulary
    }
    print "test_sent:", test_sentence
    print "tag:", classifier.classify(featurized_test_sentence)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print 'bankim precision:', nltk.precision(refsets['bankim'],
                                              testsets['bankim'])
    print 'bankim recall:', nltk.recall(refsets['bankim'], testsets['bankim'])
    print 'bankim F-measure:', nltk.f_measure(refsets['bankim'],
                                              testsets['bankim'])
    print 'bibhuti precision:', nltk.precision(refsets['bibhuti'],
                                               testsets['bibhuti'])
    print 'bibhuti recall:', nltk.recall(refsets['bibhuti'],
                                         testsets['bibhuti'])
    print 'bibhuti F-measure:', nltk.f_measure(refsets['bibhuti'],
                                               testsets['bibhuti'])
    print 'bankim precision:', nltk.precision(refsets['rabindra'],
                                              testsets['rabindra'])
    print 'bankim recall:', nltk.recall(refsets['rabindra'],
                                        testsets['rabindra'])
    print 'bankim F-measure:', nltk.f_measure(refsets['rabindra'],
                                              testsets['rabindra'])
    print 'bibhuti precision:', nltk.precision(refsets['sarat'],
                                               testsets['sarat'])
    print 'bibhuti recall:', nltk.recall(refsets['sarat'], testsets['sarat'])
    print 'bibhuti F-measure:', nltk.f_measure(refsets['sarat'],
                                               testsets['sarat'])
    def unigramAnalysis(self, word_extract_feature):

        #Dataset on Anger and Trust is extremely poor
        #it ruined my existing dataset as well , so I will avoid them as of now
        datafiles = [
            {
                'emo': "Sad",
                'name': "/negative.csv"
            }, {
                'emo': "Happy",
                'name': "/positive.csv"
            }
            # ,{'emo': 'Happy', 'name': "/trust.csv"}, {'emo': 'Sad', 'name': "/anger.csv"}
        ]

        trainfeats = []
        testfeats = []
        dataset = []
        for value in datafiles:
            emo = value['emo']
            name = value['name']
            read = self.readFile(name)
            read['emo'] = emo
            features = [(word_extract_feature(statement.split()), emo)
                        for statement in read['tweets']]
            dataset.append(features)

        for data in dataset:
            cutoff = len(data) * 3 / 4
            trainfeats = trainfeats + data[:cutoff]
            testfeats = testfeats + data[cutoff:]

        try:
            classifier = NaiveBayesClassifier.train(trainfeats)
            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)

            #K-Fold classification test
            #average the result of number of tests
            shuffle(trainfeats)
            X_folds = np.array_split(trainfeats, K_FOLDS)

            scores = list()
            for k in range(K_FOLDS):
                X_train = list(X_folds)
                X_test = X_train.pop(k)
                X_train = np.concatenate(X_train)
                classifier = NaiveBayesClassifier.train(X_train)
                scores.append(nltk.classify.util.accuracy(classifier, X_test))

            for i, (feats, label) in enumerate(testfeats):
                refsets[label].add(i)
                observed = classifier.classify(feats)
                testsets[observed].add(i)

            print 'Average accuracy K-Fold ', sum(scores) / float(len(scores))
            print 'accuracy:', nltk.classify.util.accuracy(
                classifier, testfeats)
            print 'Happy precision:', nltk.metrics.precision(
                refsets['Happy'], testsets['Happy'])
            print 'Happy recall:', nltk.metrics.recall(refsets['Happy'],
                                                       testsets['Happy'])
            print 'Sad precision:', nltk.metrics.precision(
                refsets['Sad'], testsets['Sad'])
            print 'Sad recall:', nltk.metrics.recall(refsets['Sad'],
                                                     testsets['Sad'])
            # print 'Output:',nltk.classify.util.accuracy(classifier, ['He is Our To be Hanged'])
            # print 'Trust precision:', nltk.metrics.precision(refsets['Trust'], testsets['Trust'])
            # print 'Trust recall:', nltk.metrics.recall(refsets['Trust'], testsets['Trust'])
            # print 'Sad precision:', nltk.metrics.precision(refsets['Angry'], testsets['Angry'])
            # print 'Sad recall:', nltk.metrics.recall(refsets['Angry'], testsets['Angry'])
            classifier.show_most_informative_features()

        except AttributeError, err:
            print Exception, err
Beispiel #55
0
def word_feats(words):
    """Build a bag-of-words feature set from an iterable of tokens.

    Every distinct item yielded by ``words`` becomes a key mapped to ``True``.
    Note that a plain string is itself an iterable of characters, so passing
    one yields per-character features.
    """
    return dict.fromkeys(words, True)


# Seed vocabularies: words treated as positive / negative training examples.
voc_p = ['great', 'fun', 'epic', 'good', 'happy', 'safe', 'normal','amazing' ]
voc_n = ['bad', 'terrible', 'help', 'danger', 'trouble']


# One labelled (feature set, label) pair per vocabulary word.
# NOTE(review): each vocabulary entry is passed to word_feats as a bare string,
# and word_feats iterates its argument, so the feature keys are the individual
# characters of the word rather than the word itself. Wrapping the word in a
# list would give word-level features, but any code that classifies with
# class_fy must then build its features the same way -- confirm before changing.
ft_pos = [(word_feats(pos), 'pos') for pos in voc_p]
ft_ng = [(word_feats(neg), 'neg') for neg in voc_n]


# Full training set: negative examples followed by positive ones.
tr_set = ft_ng + ft_pos

# Classifier trained once, at import time, on the seed vocabulary.
class_fy = NaiveBayesClassifier.train(tr_set)
# Predict
def predictNegPos(sentence):
    ng = 0
    ps = 0

    sentence = sentence.lower()
    words = sentence.split(' ')
    for word in words:
        classResult = class_fy.classify(word_feats(word))
        if classResult == 'neg':
            ng+=1
        if classResult == 'pos':
            ps+=1
    if ng > ps:
        result = firebase.put(
    filtered_from_stopWords=''
    counter = 0
    for j in range(len(illegal_chars)) :
        if counter == 0:
            counter+=1
            filtered = i[0].replace(illegal_chars[j], '')
        else :
            filtered=filtered.replace(illegal_chars[j],'')
    counter=0
    filteredArr = filtered.split(' ')
    for x in filteredArr :
        if x not in stopWords :
            filtered_from_stopWords+=x+' '
    bb=[]
    filtered_from_stopWords_ARRAY=filtered_from_stopWords.split(' ')
    features = {w.lower(): (w  in most_cm_1) for w in filtered_from_stopWords_ARRAY}
    bb.append(features)
    bb.append(i[1])
    sentences.append(bb)
    remarks.append(i[1])

count =0
# Dump the collected labels and labelled feature sets before training.
print(remarks)
print(sentences)
# Train on the [features, label] pairs accumulated in `sentences`.
classifier = NaiveBayesClassifier.train(sentences)
# Read one comment from the user and split it on single spaces.
inputs = input('Enter a comment ')
words_entered=inputs.split(' ')
# Every entered word becomes a feature with value True.
# NOTE(review): the training feature keys are built with w.lower(), while the
# keys here keep the user's original casing, so capitalised input words will
# not match any trained feature -- confirm whether lower-casing is intended.
entry = {w: ( True) for w in words_entered}

# Print the label predicted for the entered comment.
print(classifier.classify(entry))
Beispiel #57
0
def train_model(train_features):
    """Return a NaiveBayesClassifier trained on ``train_features``.

    ``train_features`` is passed unchanged to ``NaiveBayesClassifier.train``.
    """
    return NaiveBayesClassifier.train(train_features)
Beispiel #58
0
    testfeats = negtest_feats[:test_neg] + postest_feats[:test_pos]

    while (flag == 1
           ):  #this flag is set so that user gets option to pass another query
        i = 0  #i is set for indexing the 10 extracted videos
        result = [
        ]  #double dimensional array to store 100 comments of each video
        pos_neg_list = [
        ]  #this list stores the positive and negative counts of all 10 videos as a tuple
        query = raw_input('enter a query word:')
        (vid_ids,
         vid_titles, vid_likes, vid_dislikes, comment_count) = youtube_search(
             query)  #youtube_search function returns five parameters
        print("no. of videos", len(vid_ids))

        nb_classifier = NaiveBayesClassifier.train(trainfeats)
        #nb_precisions, nb_recalls= precision_recall(nb_classifier, testfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = nb_classifier.classify(feats)
            testsets[observed].add(i)

        print("Accuracy:", nltk.classify.accuracy(nb_classifier, testfeats))
        print("Positive Precision:", precision(refsets['pos'],
                                               testsets['pos']))
        print('Positive Recall:', recall(refsets['pos'], testsets['pos']))
        print('Positive F-measure:', f_measure(refsets['pos'],
                                               testsets['pos']))
def evaluate_features(feature_select):
    """Train a Naive Bayes classifier and report how well the features work.

    Each line of the 'pos', 'neg' and 'con' sentence files is tokenized and
    passed through ``feature_select`` to obtain a feature set.  The first
    three quarters of every class are used for training and the remaining
    quarter is held out for testing.  After training, every non-empty line of
    the news file is tokenized, printed and classified; then accuracy and
    per-class precision/recall on the held-out data are printed, followed by
    the ten most informative features.

    Args:
        feature_select: callable that takes a list of tokens and returns the
            feature set given to the classifier (NLTK expects a dict mapping
            feature names to values).

    Returns:
        None; all results are printed.
    """
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # Splits a sentence into words (apostrophes kept) and punctuation marks.
    token_pattern = re.compile(r"[\w']+|[.,!?;]")

    labeled_files = (
        (RT_POLARITY_POS_FILE, 'pos'),
        (RT_POLARITY_NEG_FILE, 'neg'),
        (RT_POLARITY_CON_FILE, 'con'),
    )

    trainFeatures = []
    testFeatures = []
    for path, label in labeled_files:
        with open(path, 'r') as sentences:
            labeled = [
                (feature_select(token_pattern.findall(line.rstrip())), label)
                for line in sentences
            ]
        # 3/4 of each class goes to training, the final 1/4 is held out, so
        # the metrics below are computed on data the classifier has not seen.
        cutoff = len(labeled) * 3 // 4
        trainFeatures.extend(labeled[:cutoff])
        testFeatures.extend(labeled[cutoff:])

    # trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # The news sentences carry no gold label, so they are only tagged, not
    # scored.  Features are built with the same feature_select used for
    # training so the classifier sees feature sets of the same form.
    with open(RT_POLARITY_NEWS_FILE, 'r') as newsSentences:
        for test_sentence in newsSentences:
            # Tokenize the line.
            doc = nltk.word_tokenize(test_sentence.lower())
            if not doc:
                continue  # blank line: nothing to classify
            print(doc)
            print('predicted:', classifier.classify(feature_select(doc)))

    # referenceSets holds the actual labels of the held-out data,
    # testSets holds the labels predicted by the classifier.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        testSets[classifier.classify(features)].add(i)

    # prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    print('con precision:', precision(referenceSets['con'], testSets['con']))
    print('con recall:', recall(referenceSets['con'], testSets['con']))
    classifier.show_most_informative_features(10)
def main():

    all_words = []

    list_of_tokens_of_each_file = []
    #---read all files to extract all word lists---#
    for root, directories, filenames in os.walk(sys.argv[1]):

        for each_filename in filenames:
            if each_filename.endswith(".txt"):
                input = open(os.path.join(root, each_filename),
                             "r",
                             encoding="latin1").read()
                tokens = input.split()
                list_of_tokens_of_each_file.append(tokens)
                for each_token in tokens:
                    if each_token not in all_words:
                        all_words.append(each_token)

    test_set_list = []
    #---- fetch all dev data set ---#
    for root, directories, filenames in os.walk(sys.argv[2]):

        for each_filename in filenames:
            if each_filename.endswith(".txt"):
                test_input = open(os.path.join(root, each_filename),
                                  "r",
                                  encoding="latin1").read()
                test_tokens = test_input.split()
                test_set_list.append(test_tokens)

    total_list = []
    X_list = []

    #--- extract the features for each of the training file ----#
    for root, directories, filenames in os.walk(sys.argv[1]):

        if "positive" in root or "negative" in root or "neutral" in root:
            for i in range(len(filenames)):

                X_list.append(
                    word2features(list_of_tokens_of_each_file[i], all_words,
                                  root))

    result_list = []

    #-- applying naive bayes NLTK classification ---#
    classifier = NaiveBayesClassifier.train(X_list)
    f = open('nboutput.txt', 'w')

    actual_positive = 0
    actual_negative = 0
    actual_neutral = 0
    positive_counter = 0
    negative_counter = 0
    neutral_counter = 0
    classified_positive = 0
    classified_negative = 0
    classified_neutral = 0
    for root, directories, filenames in os.walk(sys.argv[2]):

        print("root  is ", root)
        for i in range(len(filenames)):
            path = root + '/' + filenames[i]
            if "positive" in path:
                actual_positive = actual_positive + 1
            elif "negative" in path:
                actual_negative = actual_negative + 1
            elif "neutral" in path:
                actual_neutral = actual_neutral + 1

            if "positive" in root or "negative" in root or "neutral" in root:
                result_list.append(
                    word2features_test(test_set_list[i], all_words))

                #--- classify each of the test file to respective category---#
                output = classifier.classify(
                    word2features_test(test_set_list[i], all_words))

                if output == "positive":
                    classified_positive = classified_positive + 1
                elif output == "negative":
                    classified_negative = classified_negative + 1
                elif output == "neutral":
                    classified_neutral = classified_neutral + 1
                f.write(output + " " + root + '/' + filenames[i] + "\n")

    print('accuracy:', nltk.classify.util.accuracy(classifier, X_list))