Code example #1
File: filters.py Project: gleicon/sentiment_analysis
def print_precision_recall(classifier, test_dict):
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    for i, (feats, label) in enumerate(test_dict):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    print 'pos precision:', precision(refsets['positive'], testsets['positive'])
    print 'pos recall:', recall(refsets['positive'], testsets['positive'])
    print 'pos F-measure:', f_measure(refsets['positive'], testsets['positive'])
    print 'neg precision:', precision(refsets['negative'], testsets['negative'])
    print 'neg recall:', recall(refsets['negative'], testsets['negative'])
    print 'neg F-measure:', f_measure(refsets['negative'], testsets['negative'])
Code example #2
def get_performance(clf_sel, train_features, test_features):
    ref_set = collections.defaultdict(set)
    test_set = collections.defaultdict(set)
    classification_error = False

    clf = SklearnClassifier(clf_sel)
    try:
        classifier = clf.train(train_features)
    except:
        classification_error = True
        # print (str(clf_sel.__class__),'NA')

    if not classification_error and str(clf_sel.__class__) == "<class 'sklearn.naive_bayes.MultinomialNB'>":
        pickle_cls(classifier, 'MultinomialNB')

    # print(str(clf_sel), 'accuracy:'(nltk.classify.accuracy(classifier, test_features)) * 100)

    if not classification_error:
        clf_acc = nltk.classify.accuracy(classifier, test_features)

        for i, (features, label) in enumerate(test_features):
            ref_set[label].add(i)
            predicted = classifier.classify(features)
            test_set[predicted].add(i)

        pos_precision = precision(ref_set['pos'], test_set['pos'])
        pos_recall = recall(ref_set['pos'], test_set['pos'])
        neg_precision = precision(ref_set['neg'], test_set['neg'])
        neg_recall = recall(ref_set['neg'], test_set['neg'])

        print(
            "{0},{1},{2},{3},{4},{5}".format(clf_sel.__class__, clf_acc, pos_precision, pos_recall, neg_precision,
                                             neg_recall))
Code example #3
File: procesador.py Project: cristianocca/pln_tarea1
    def GetEvaluacion(self):
        '''
            Returns the precision and recall measures and the confusion matrix for the classifier.
            For this we use nltk's precision, recall and confusion matrix functions together with the
            test set.

            Returns a tuple (positivos, negativos, matriz) where positivos and negativos are each a tuple of the values (precision, recall).

            Precision: fraction of the instances assigned to this class that were classified correctly:
                TP / (TP + FP)
                The higher this is, the lower the number of false positives; that is, it gives the percentage of the elements
                that were classified into this class correctly.

            Recall: fraction of the instances that really belong to this class that were classified into it:
                TP / (TP + FN)
                The higher this is, the lower the number of false negatives; that is, of all the elements that really exist
                in the class, how many were classified as such.


            Both precision and recall are per class, not overall (that is, there is one precision/recall value for the class of positive
            comments and another for the class of negative comments).

            Worth reading: http://streamhacker.com/2010/05/17/text-classification-sentiment-analysis-precision-recall/
        '''

        clasificador = self.GetClasificador()
        corpus = self.DatosTesteo        

        #NLTK's functions work on sets.
        #Build reference and test sets for the positives and the negatives.
        refSet = {CLASE_POSITIVO:set(), CLASE_NEGATIVO:set()}   #Holds the real values
        testSet = {CLASE_POSITIVO:set(), CLASE_NEGATIVO:set()}  #Holds the values after classifying.
       
        #Values for the confusion matrix.
        refList = []
        testList = []
        
        #Reference and test sets have to be built from the test data in order to use the nltk functions
        for i, c in enumerate(corpus):
            refSet[c[1]].add(i) #Adds it to the positives or negatives according to its class.
            clasificado = clasificador.classify(c[0])
            testSet[clasificado].add(i)
        
            refList.append(c[1])
            testList.append(clasificado)


        positivos = ( precision(refSet[CLASE_POSITIVO], testSet[CLASE_POSITIVO]), recall(refSet[CLASE_POSITIVO], testSet[CLASE_POSITIVO]) )
        negativos = ( precision(refSet[CLASE_NEGATIVO], testSet[CLASE_NEGATIVO]), recall(refSet[CLASE_NEGATIVO], testSet[CLASE_NEGATIVO]) )

        return (positivos, negativos, ConfusionMatrix(refList, testList))
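The docstring above maps directly onto nltk's set-based metrics: precision(reference, test) is |reference ∩ test| / |test| (i.e. TP / (TP + FP)) and recall(reference, test) is |reference ∩ test| / |reference| (i.e. TP / (TP + FN)). A minimal, self-contained sketch with hand-built index sets (hypothetical data, not from the project) to check those readings:

from nltk.metrics import precision, recall

reference = {0, 1, 2, 3}      # items that truly belong to the class
test = {1, 2, 3, 4, 5}        # items the classifier assigned to the class

# TP = |reference & test| = 3, FP = 2, FN = 1
print(precision(reference, test))  # 3 / (3 + 2) = 0.6
print(recall(reference, test))     # 3 / (3 + 1) = 0.75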
Code example #4
 def benchmarking(self, classifier,_test_set,all_f_measure=[],all_precision=[],all_recall=[]):
     from nltk import classify
     accuracy = classify.accuracy(classifier, _test_set)
     
     print("accuracy:",accuracy)
     
     from nltk.metrics import precision
     from nltk.metrics import recall
     from nltk.metrics import f_measure
     
     import collections
     refsets = collections.defaultdict(set)
     testsets = collections.defaultdict(set)
     for i, (feats, label) in enumerate(_test_set):
         refsets[label].add(i)
         observed = classifier.classify(feats)
         testsets[observed].add(i)
         
     prec=precision(refsets['class'], testsets['class'])
     rec=recall(refsets['class'], testsets['class'])
     f1=f_measure(refsets['class'], testsets['class'])
     print('precision:', prec)
     print('recall:', rec)
     print('F-measure:', f1)
             
     all_f_measure.append(f1)
     all_precision.append(prec)
     all_recall.append(rec)
     print('========Show top 10 most informative features========')
     classifier.show_most_informative_features(10)
Code example #5
def precision_and_recall(classifier, testfeats):
    #Finds precision and recall for the given classifier.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    
    
    #Feats is the dictionary of words
    #label is the label, pos or neg
    for i, (feats, label) in enumerate(testfeats):
        
        #a mapping of which entries are pos and negative
        #ex refsets[pos] = {1,2,3,4,6,7,11,78}
        refsets[label].add(i)
        
        #Classifies something as pos or neg given its feats
        observed = classifier.classify(feats)
        
        #a mapping of entries and their classifications
        #ex testsets[pos] = {1,2,3,4,5,8,11}
        testsets[observed].add(i)
        
    prec = {}
    rec = {}
        
    for label in classifier.labels():
        prec[label] = precision(refsets[label], testsets[label])
        rec[label] = recall(refsets[label], testsets[label])
    
    return prec, rec
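As a quick, hedged check of the example sets quoted in the comments above (hypothetical index sets, not project data), nltk's metrics reduce to simple set arithmetic:

from nltk.metrics import precision, recall

refsets_pos = {1, 2, 3, 4, 6, 7, 11, 78}   # truly positive entries
testsets_pos = {1, 2, 3, 4, 5, 8, 11}      # entries classified as positive

# intersection = {1, 2, 3, 4, 11} -> 5 correct positives
print(precision(refsets_pos, testsets_pos))  # 5 / 7 ~ 0.714
print(recall(refsets_pos, testsets_pos))     # 5 / 8 = 0.625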
Code example #6
def multi_metrics(multi_classifier, test_feats):
	mds = []
	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)
	
	for i, (feat, labels) in enumerate(test_feats):
		for label in labels:
			refsets[label].add(i)
		
		guessed = multi_classifier.classify(feat)
		
		for label in guessed:
			testsets[label].add(i)
		
		mds.append(metrics.masi_distance(set(labels), guessed))
	
	avg_md = sum(mds) / float(len(mds))
	precisions = {}
	recalls = {}
	
	for label in multi_classifier.labels():
		precisions[label] = metrics.precision(refsets[label], testsets[label])
		recalls[label] = metrics.recall(refsets[label], testsets[label])
	
	return precisions, recalls, avg_md
Code example #7
File: Estimator.py Project: Bakuchi/naivebayes
def calculate(classifier, feature_set):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    print("Calculating refsets for precision and recall")
    for i, (feats, label) in enumerate(feature_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('country precision:', metrics.precision(refsets['country'], testsets['country']))
    print('country recall:', metrics.recall(refsets['country'], testsets['country']))

    print('religion precision:', metrics.precision(refsets['religion'], testsets['religion']))
    print('religion recall:', metrics.recall(refsets['religion'], testsets['religion']))

    print('astronomy precision:', metrics.precision(refsets['astronomy'], testsets['astronomy']))
    print('astronomy recall:', metrics.recall(refsets['astronomy'], testsets['astronomy']))
Code example #8
File: Driver.py Project: urtonj/PA4
def print_results(classifier, featureset, results, name):
    print '''
    %s classifier results:
    Classifier accuracy: %s
    B Precision: %s
    B Recall: %s
    I Precision: %s
    I Recall: %s
    O Precision: %s
    O Recall: %s
    ''' % (name, 
       accuracy(classifier, featureset), 
       precision(results[0]['B-SNP'], results[1]['B-SNP']), 
       recall(results[0]['B-SNP'], results[1]['B-SNP']), 
       precision(results[0]['I-SNP'], results[1]['I-SNP']),
       recall(results[0]['I-SNP'], results[1]['I-SNP']),
       precision(results[0]['O'], results[1]['O']),
       recall(results[0]['O'], results[1]['O']))
Code example #9
def eval_stats(results):
	'''
	Compute recall, precision, and f-measure from passed results.
	The expected format for results is a dictionary whose keys=<name of article>
	and values=tuple (<test category>, <reference category>, <scores>), where:
	test=category suggested by classifier, reference=pre-classified gold
	category, scores=can be None or dictionary whose keys=category names and
	values=matching score for this article.
	'''
	# Calculate number of correct matches
	correct = 0
	missed = defaultdict(tuple)
	for article_name, (suggested, real, scores) in results.iteritems():
		if suggested==real:
			correct += 1
		else:
			missed[article_name] = (suggested, real)
	success_ratio = correct / float(len(results))
	print "Ratio: %0.3f" % success_ratio
	
	# Print wrong matches
	for name, (suggested, real) in missed.iteritems():
		print "%s\t%s\t%s" % (name, suggested, real)
	
	# Create sets of references / test classification for evaluation
	cat_ref = defaultdict(set)
	cat_test= defaultdict(set)
	for name, (test_category, ref_category, scores) in results.iteritems():
		cat_ref[ref_category].add(name) 		# gold-tagged categories
		cat_test[test_category].add(name) 	# suggested categories

	# Precision, recall, f-measure, support (num of reference articles in
	# each category) for each category
	print "\nCategory\tPrecision\tRecall\tF-measure\tSupport" 
	measures = defaultdict(tuple)
	for category in cat_ref.keys():
		cat_prec = metrics.precision(cat_ref[category], cat_test[category])
		cat_rec = metrics.recall(cat_ref[category], cat_test[category])
		cat_f = metrics.f_measure(cat_ref[category], cat_test[category])
		cat_support = len(cat_ref[category])
		measures[category] = (cat_prec, cat_rec, cat_f, cat_support)
		print "%s\t%0.3f\t%0.3f\t%0.3f\t%d" % \
		(category, cat_prec, cat_rec, cat_f, cat_support)
	
	# Calculate precision, recall, f-measure for entire corpus:
	# This is a weighted average of the values of separate categories
	# i.e. sum over categories of (value * support) / sum of all supports
	avg_prec = weighted_average([(cat_measure[0], cat_measure[3]) for \
		cat_measure in measures.values()])
	avg_rec = weighted_average([(cat_measure[1], cat_measure[3]) for \
		cat_measure in measures.values()])
	avg_f = weighted_average([(cat_measure[2], cat_measure[3]) for \
		cat_measure in measures.values()])
	total_support = sum([cat_support[3] for cat_support in measures.values()])
	
	print "%s\t%0.3f\t%0.3f\t%0.3f\t%d" % ("Total", avg_prec, avg_rec, avg_f, total_support)
Code example #10
File: hw2.py Project: lxmonk/nlg12_hw2
def evaluate_features(feature_extractor, N, only_acc=False):
    from nltk.corpus import movie_reviews
    from nltk.classify import NaiveBayesClassifier as naive
    from nltk.classify.util import accuracy
    from nltk.metrics import precision, recall, f_measure
    from sys import stdout
    
    negative = movie_reviews.fileids('neg')
    positive = movie_reviews.fileids('pos')
    negfeats = [(feature_extractor(movie_reviews.sents(fileids=[f])),
                 'neg') for f in negative]

    posfeats = [(feature_extractor(movie_reviews.sents(fileids=[f])),
                 'pos') for f in positive]
    negtrain, negtest = stratifiedSamples(negfeats, N)
    postrain, postest = stratifiedSamples(posfeats, N)

    trainfeats = negtrain + postrain
    testfeats = negtest + postest
    classifier = naive.train(trainfeats)
    if only_acc: return accuracy(classifier, testfeats)
    print 'accuracy: {}'.format(accuracy(classifier, testfeats))

    # Precision, Recall, F-measure
    from collections import defaultdict
    refsets = defaultdict(set)
    testsets = defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
        
    print 'pos precision:', precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', recall(refsets['pos'], testsets['pos'])
    print 'pos F-measure:', f_measure(refsets['pos'], testsets['pos'])
    print 'neg precision:', precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', recall(refsets['neg'], testsets['neg'])
    print 'neg F-measure:', f_measure(refsets['neg'], testsets['neg'])
    stdout.flush()
    classifier.show_most_informative_features()
    return classifier
Code example #11
 def evaluate_features(self, feature_select):
     #reading pre-labeled input and splitting into lines
     posSentences = open('rt-polarity-pos.txt', 'r')
     negSentences = open('rt-polarity-neg.txt', 'r')
     posSentences = re.split(r'\n', posSentences.read())
     negSentences = re.split(r'\n', negSentences.read())
   
     posFeatures = []
     negFeatures = []
     #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
     for i in posSentences:
         posWords = re.findall(r"[\w']+|[.,!?;]", i)
         posWords = [feature_select(posWords), 'pos']
         posFeatures.append(posWords)
     for i in negSentences:
         negWords = re.findall(r"[\w']+|[.,!?;]", i)
         negWords = [feature_select(negWords), 'neg']
         negFeatures.append(negWords)
      
     #selects 3/4 of the features to be used for training and 1/4 to be used for testing
     posCutoff = int(math.floor(len(posFeatures)*3/4))
     negCutoff = int(math.floor(len(negFeatures)*3/4))
     trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
     testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
     #Training Phase: 
     classifier = NaiveBayesClassifier.train(trainFeatures)
      
     referenceSets = collections.defaultdict(set)
     testSets = collections.defaultdict(set)    
      
     #Testing Phase:
     for i, (features, label) in enumerate(testFeatures):
         referenceSets[label].add(i)
         predicted = classifier.classify(features)
         testSets[predicted].add(i)
          
     print 'Trained on %d instances, Tested on %d instances' % (len(trainFeatures), len(testFeatures))
     print 'Accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
     print 'Positive Precision:', precision(referenceSets['pos'], testSets['pos'])
     print 'Positive Recall:', recall(referenceSets['pos'], testSets['pos'])
     print 'Negative Precision:', precision(referenceSets['neg'], testSets['neg'])
     print 'Negative Recall:', recall(referenceSets['neg'], testSets['neg'])
Code example #12
def calcAllClassesRecall(classSet, refsets, testsets):
    rSum = 0.0
    denominator = 0
    for category in classSet:
        num = recall(refsets[category], testsets[category])
        if num is None:
            continue
        rSum += num
        denominator += 1
    
    return rSum/denominator
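A small, self-contained check of the averaging behaviour above (hand-built sets with hypothetical class names): classes whose recall is undefined (None, because their reference set is empty) are skipped from both the sum and the denominator.

from collections import defaultdict
from nltk.metrics import recall

refsets = {'a': {0, 1}, 'b': {2, 3}, 'c': set()}
testsets = defaultdict(set, {'a': {0}, 'b': {2, 3, 4}})

# recall: a = 1/2, b = 2/2, c = None -> average over the 2 defined classes = 0.75
print(calcAllClassesRecall(['a', 'b', 'c'], refsets, testsets))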
Code example #13
def main():
    global best_words
    tweets = get_tweets_from_db()
    tweet_list = tweets[1000:1599000]
    test_list = tweets[:1000]+ tweets[1599000:]
    word_scores = create_word_scores()
    best_words = find_best_words(word_scores, 500000)
    f = open('bestwords.pickle', 'wb')
    pickle.dump(best_words, f)
    f.close()
    training_set = classify.apply_features(best_word_features, tweet_list)
    print "extracted features"
    # train the classifier with the training set
    classifier = NaiveBayesClassifier.train(training_set)
    print "trained classifier"
    # create the pickle file
    f = open('NBclassifier_new.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()
    print "created pickle"
    # test for precision and recall
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    test_set = classify.apply_features(best_word_features, test_list)
 
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
     
    print 'neg precision:', metrics.precision(refsets['0'], testsets['0'])
    print 'neg recall:', metrics.recall(refsets['0'], testsets['0'])
    print 'pos precision:', metrics.precision(refsets['4'], testsets['4'])
    print 'pos recall:', metrics.recall(refsets['4'], testsets['4'])
    # test_set = classify.apply_features(extract_features, test_list)
    # print "extracted features"
    print classify.accuracy(classifier, test_set)
    print classifier.show_most_informative_features(30)
Code example #14
File: classification.py Project: shingjay/dealchan
def precision_recall(classifier, testfeats):
	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)
	for i, (feats, label) in enumerate(testfeats):
		refsets[label].add(i)
		observed = classifier.classify(feats)
		testsets[observed].add(i)
	precisions = {}
	recalls = {}
	for label in classifier.labels():
		precisions[label] = metrics.precision(refsets[label], testsets[label])
		recalls[label] = metrics.recall(refsets[label], testsets[label])
	return precisions, recalls
Code example #15
def precision_recall(classifier, testfeats):
    #gives precision and recall of classifiers
    #precision = lack of false positives
    #recall = lack of false negatives
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    precisions = {}
    recalls = {}
    for label in classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label])
        recalls[label] = recall(refsets[label], testsets[label])
    return precisions, recalls
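The comments above read precision as "lack of false positives" and recall as "lack of false negatives". A self-contained toy run of the helper (synthetic featuresets, not project data, assuming nltk is installed) makes that concrete: the third test item is deliberately mislabelled, which costs 'pos' precision and 'neg' recall but leaves 'pos' recall and 'neg' precision at 1.0.

import collections
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import precision, recall

train = [({'word': 'great'}, 'pos'), ({'word': 'awful'}, 'neg')] * 10
test = [({'word': 'great'}, 'pos'), ({'word': 'awful'}, 'neg'),
        ({'word': 'great'}, 'neg')]   # gold label 'neg', but the classifier will say 'pos'

classifier = NaiveBayesClassifier.train(train)
precisions, recalls = precision_recall(classifier, test)
for label in classifier.labels():
    print(label, precisions[label], recalls[label])
# expected: pos precision 0.5, pos recall 1.0, neg precision 1.0, neg recall 0.5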
Code example #16
File: nltk_align.py Project: avadnal/CLIR--Project-1
    def recall(self, reference):
        """
        Return the recall of an aligned sentence with respect to a
        "gold standard" reference ``AlignedSent``.

        :type reference: AlignedSent or Alignment
        :param reference: A "gold standard" reference aligned sentence.
        :rtype: float or None
        """
        # Get alignments in set of 2-tuples form
        # The "sure" recall is used so we don't penalize for missing an
        # alignment that was only marked as "possible".

        align = self.alignment
        if isinstance(reference, AlignedSent):
            sure = reference.alignment
        else:
            sure  = Alignment(reference)

        # Call NLTKs existing functions for recall
        return recall(sure, align)
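Since the method above just delegates to nltk's recall() on sets of alignment pairs, the same computation can be sketched directly with hand-built (source, target) index tuples (hypothetical data):

from nltk.metrics import recall

sure = {(0, 0), (1, 1), (2, 2), (3, 3)}   # "gold standard" sure alignments
align = {(0, 0), (1, 1), (3, 2)}          # alignments proposed by a model

# recall(reference, test) = |reference & test| / |reference| = 2 / 4
print(recall(sure, align))  # 0.5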
Code example #17
def train_classifiers(posFeatures,negFeatures):
    
    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    
    #trains a Naive Bayes Classifier
    print ("----------------Naive Bayes Classifier-----------")
    classifier = NaiveBayesClassifier.train(trainFeatures)	
    
    #initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)	
    
    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
    	referenceSets[label].add(i)
    	predicted = classifier.classify(features)
    	testSets[predicted].add(i)	
    
    #prints metrics to show how well the feature selection did
    print ('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print ('Original Naive Bayes Accuracy:', (nltk.classify.util.accuracy(classifier, testFeatures))*100)
    print ('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print ('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print ('neg precision:',precision(referenceSets['neg'], testSets['neg']))
    print ('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)

    #Pickle the algorithm for future use
    save_classifier = open("pickled_algos/originalnaivebayes.pickle","wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()    
     
    
    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(trainFeatures)
    print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testFeatures))*100)

    #Pickle the algorithm for future use    
    save_classifier = open("pickled_algos/MNB_classifier.pickle","wb")
    pickle.dump(MNB_classifier, save_classifier)
    save_classifier.close()   

    
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(trainFeatures)
    print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testFeatures))*100)
    
    #Pickle the algorithm for future use     
    save_classifier = open("pickled_algos/BernoulliNB_classifier.pickle","wb")
    pickle.dump(BernoulliNB_classifier, save_classifier)
    save_classifier.close()    
    
    
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(trainFeatures)
    print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testFeatures))*100)
    
    #Pickle the algorithm for future use 
    save_classifier = open("pickled_algos/LogisticRegression_classifier.pickle","wb")
    pickle.dump(LogisticRegression_classifier, save_classifier)
    save_classifier.close()

    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(trainFeatures)
    print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testFeatures))*100)
    
    #Pickle the algorithm for future use    
    save_classifier = open("pickled_algos/LinearSVC_classifier.pickle","wb")
    pickle.dump(LinearSVC_classifier, save_classifier)
    save_classifier.close()

    
    SGDC_classifier = SklearnClassifier(SGDClassifier())
    SGDC_classifier.train(trainFeatures)
    print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier, testFeatures)*100)
    
    #Pickle the algorithm for future use 
    save_classifier = open("pickled_algos/SGDC_classifier.pickle","wb")
    pickle.dump(SGDC_classifier, save_classifier)
    save_classifier.close()
    
    Dec_Tree_Classifier = SklearnClassifier(DecisionTreeClassifier())    
    Dec_Tree_Classifier.train(trainFeatures)
    print("DecisionTreeClassifier Accuracy:",(nltk.classify.accuracy(Dec_Tree_Classifier,testFeatures))*100)
    
    
    #Pickle the algorithm for future use 
    save_classifier = open("pickled_algos/decision_tree.pickle","wb")
    pickle.dump(Dec_Tree_Classifier, save_classifier)
    save_classifier.close()    
    
    """
    
#    Grad_Boost_Classifier = SklearnClassifier(GradientBoostingClassifier())
#    Grad_Boost_Classifier.train(trainFeatures)
#    print("Gradient Boosting Classifier Accuracy:", (nltk.classify.accuracy(Grad_Boost_Classifier,testFeatures))*100)    
    """    
    
    Random_Forest_Classifier = SklearnClassifier(RandomForestClassifier())
    Random_Forest_Classifier.train(trainFeatures)
    print("Random Forest Classifier Accuracy:",(nltk.classify.accuracy(Random_Forest_Classifier,testFeatures
    ))*100)
    
    #Pickle the algorithm for future use 
    save_classifier = open("pickled_algos/random_forest.pickle","wb")
    pickle.dump(Random_Forest_Classifier, save_classifier)
    save_classifier.close()
    
    Ada_Boost_Classifier = SklearnClassifier(AdaBoostClassifier())
    Ada_Boost_Classifier.train(trainFeatures)
    print("Ada Boost Classifier Accuracy:",(nltk.classify.accuracy(Ada_Boost_Classifier,testFeatures))*100) 
    
    #Pickle the algorithm for future use 
    save_classifier = open("pickled_algos/Ada_Boost.pickle","wb")
    pickle.dump(Ada_Boost_Classifier, save_classifier)
    save_classifier.close()
    
    
    voted_classifier = VoteClassifier(classifier,
                                  LinearSVC_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier,
                                  Random_Forest_Classifier,
                                  Ada_Boost_Classifier
                                  )
                                                    
    print("Voted classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testFeatures))*100) 
    
    # The voted classifier could not be pickled. Check this later!    
    
    
    return trainFeatures,testFeatures
Code example #18
    trainfeats = negfeats[:4000] + posfeats[:4000]
    testfeats = negfeats[4000:] + posfeats[4000:]
    print("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    # cross validation  3-fold
    feats = negfeats + posfeats
    M = math.floor(len(feats) / 3)
    result = []
    for n in range(3):
        val_set = feats[n * M :][:M]
        train_set = feats[(n + 1) * M :] + feats[: n * M]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        result.append("{:.4f}".format(round(nltk.classify.accuracy(classifier, val_set) * 100, 4)))

    print("cross_validation:", result)

    print("pos precision:", precision(refsets["pos"], testsets["pos"]))
    print("pos recall:", recall(refsets["pos"], testsets["pos"]))
    print("pos F-measure:", f_measure(refsets["pos"], testsets["pos"]))
    print("neg precision:", precision(refsets["neg"], testsets["neg"]))
    print("neg recall:", recall(refsets["neg"], testsets["neg"]))
    print("neg F-measure:", f_measure(refsets["neg"], testsets["neg"]))
    classifier.show_most_informative_features()
Code example #19
File: spam-02.py Project: altonga/pydata
# Now create the data structure for model evaluation
#
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)
#print len(refsets)
#print len(testsets)
#print refsets
precisions = {}
recalls = {}
for label in classifier.labels():
    precisions[label] = metrics.precision(refsets[label],testsets[label])
    recalls[label] = metrics.recall(refsets[label], testsets[label])
#
# Let us calculate Precision & Recall and compare with nltk
#
# Luckily the data structures are symmetric
#
c_00=len(refsets[labels[0]].intersection(testsets[labels[0]]))
c_01=len(refsets[labels[0]].intersection(testsets[labels[1]]))
c_10=len(refsets[labels[1]].intersection(testsets[labels[0]]))
c_11=len(refsets[labels[1]].intersection(testsets[labels[1]]))
#
print '  |   H   |   S   |'
print '--|-------|-------|'
print 'H | %5d | %5d |' % (c_00,c_01)
print '--|-------|-------|'
print 'S | %5d | %5d |' % (c_10,c_11)
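To finish the comparison the comments describe, the per-label values can be recomputed from the four cell counts and checked against the nltk numbers already stored in precisions and recalls. A hedged continuation (it assumes exactly the two labels and the c_00..c_11 counts defined above):

# manual precision/recall for the first label, derived from the confusion cells
manual_precision = c_00 / float(c_00 + c_10) if (c_00 + c_10) else None
manual_recall = c_00 / float(c_00 + c_01) if (c_00 + c_01) else None
print('manual %s precision: %s recall: %s' % (labels[0], manual_precision, manual_recall))
print('nltk   %s precision: %s recall: %s' % (labels[0], precisions[labels[0]], recalls[labels[0]]))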
Code example #20
	
	if not args.no_precision or not args.no_recall or not args.no_fmeasure:
		if args.multi and args.binary:
			refsets, testsets = scoring.multi_ref_test_sets(classifier, test_feats)
		else:
			refsets, testsets = scoring.ref_test_sets(classifier, test_feats)
		
		for label in labels:
			ref = refsets[label]
			test = testsets[label]
			
			if not args.no_precision:
				print '%s precision: %f' % (label, precision(ref, test) or 0)
			
			if not args.no_recall:
				print '%s recall: %f' % (label, recall(ref, test) or 0)
			
			if not args.no_fmeasure:
				print '%s f-measure: %f' % (label, f_measure(ref, test) or 0)

if args.show_most_informative and args.algorithm != 'DecisionTree' and not (args.multi and args.binary):
	print '%d most informative features' % args.show_most_informative
	classifier.show_most_informative_features(args.show_most_informative)

##############
## pickling ##
##############

if not args.no_pickle:
	if args.filename:
		fname = os.path.expanduser(args.filename)
Code example #21
File: validate.py Project: anov/honors
#script to validate coding
import cPickle as pickle
import sys
from nltk.metrics import accuracy, ConfusionMatrix, precision, recall, f_measure
from collections import defaultdict
import classifier

if __name__=='__main__':
	validation_pickle=sys.argv[1]
	classifier_pickle=sys.argv[2]
	validation_set=pickle.load(open(validation_pickle, 'rb'))
	c=pickle.load(open(classifier_pickle, 'rb'))
	
	reference=defaultdict(set)
	observed=defaultdict(set)
	reference_labels=[]
	observed_labels=[]
	for i, (tweet, label) in enumerate(validation_set):
		reference[label].add(i)
		reference_labels.append(label)
		observation=c.classify(tweet)
		observed[observation].add(i)
		observed_labels.append(observation)
	
	print "accuracy: %s" % accuracy(reference_labels, observed_labels)
	print "pos precision: %s" % precision(reference['positive'], observed['positive'])
	print "pos recall: %s" % recall(reference['positive'], observed['positive'])
	print "pos f-measure: %s" % f_measure(reference['positive'], observed['positive'])
	print "neg precision: %s" % precision(reference['negative'], observed['negative'])
	print "neg recall: %s" % recall(reference['negative'], observed['negative'])
	print "neg f-measure: %s" % f_measure(reference['negative'], observed['negative'])
	
Code example #22
def avaliate_new_classifier(featureSet):
	print("Vamos treinar o classificador agora!")
	print("\n")
	#random.shuffle(featureSet)

	#Each one has 197
	positive_tweets = featureSet[:196]

	#Shuffle things so we do not keep testing only the same last ones
	random.shuffle(positive_tweets)

	#print(featureSet[7185])
	#To take 7185 from pos and 7185 from neg, but neg has 7213
	negative_tweets = featureSet[196:293]
	random.shuffle(negative_tweets)

	neutral_tweets = featureSet[293:]
	random.shuffle(neutral_tweets)

	#Now split each class into a reference set and a test set
	pos_cutoff = len(positive_tweets)*3/4
	neg_cutoff = len(negative_tweets)*3/4
	neu_cutoff = len(neutral_tweets)*3/4

	# 75% of the tweets go to the reference (training) set and the rest to testing
	pos_references = positive_tweets[:pos_cutoff]
	pos_tests = positive_tweets[pos_cutoff:]

	neg_references = negative_tweets[:neg_cutoff]
	neg_tests = negative_tweets[neg_cutoff:]

	neu_references = neutral_tweets[:neu_cutoff]
	neu_tests = neutral_tweets[neu_cutoff:]

	#Training and test sets to compute the accuracy
	training_set = pos_references + neg_references + neu_references
	testing_set = pos_tests + neg_tests + neu_tests

	start_time = time.time()

	global classifier
	print("Comecou a treina-lo agora!")

	#training_set2 = [(t,l) for (t,l,twe) in training_set]

	classifier = nltk.NaiveBayesClassifier.train(training_set)
	#testing_set2 = [(t,l) for (t,l,twe) in testing_set]
	print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set)) * 100)
	classifier.show_most_informative_features(30)

	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)

	for i, (feats, label) in enumerate(testing_set):
	    refsets[label].add(i)
	    observed = classifier.classify(feats)
	    testsets[observed].add(i)
	 
	print 'pos precision:', precision(refsets['pos'], testsets['pos'])
	print 'pos recall:', recall(refsets['pos'], testsets['pos'])
	print 'pos F-measure:', f_measure(refsets['pos'], testsets['pos'])

	print 'neg precision:', precision(refsets['neg'], testsets['neg'])
	print 'neg recall:', recall(refsets['neg'], testsets['neg'])
	print 'neg F-measure:', f_measure(refsets['neg'], testsets['neg'])

	print 'neutral precision:', precision(refsets['neutral'], testsets['neutral'])
	print 'neutral recall:', recall(refsets['neutral'], testsets['neutral'])
	print 'neutral F-measure:', f_measure(refsets['neutral'], testsets['neutral'])


	print("--- Classifier executed in %s seconds ---" % (time.time() - start_time))
Code example #23
File: showscores.py Project: chloebt/educe
 def recall(self):
     return recall(self._reference, self._test)
Code example #24
        texts = list(lif(categorized_corpus, label))
        stop = int(len(texts) * args.fraction)

        for t in texts[:stop]:
            feat = bag_of_words(norm_words(t))
            feats.append(feat)
            test_feats.append((feat, label))

    print "accuracy:", accuracy(classifier, test_feats)
    refsets, testsets = scoring.ref_test_sets(classifier, test_feats)

    for label in labels:
        ref = refsets[label]
        test = testsets[label]
        print "%s precision: %f" % (label, precision(ref, test) or 0)
        print "%s recall: %f" % (label, recall(ref, test) or 0)
        print "%s f-measure: %f" % (label, f_measure(ref, test) or 0)
else:
    if args.instances == "sents":
        texts = categorized_corpus.sents()
        total = len(texts)
    elif args.instances == "paras":
        texts = (itertools.chain(*para) for para in categorized_corpus.paras())
        total = len(categorized_corpus.paras)
    elif args.instances == "files":
        texts = (categorized_corpus.words(fileids=[fid]) for fid in categorized_corpus.fileids())
        total = len(categorized_corpus.fileids())

    stop = int(total * args.fraction)
    feats = (bag_of_words(norm_words(i)) for i in itertools.islice(texts, stop))
Code example #25
def avaliate_classifiers(featureSet):
	print("Vamos treinar o classificador agora!")
	print("\n")
	#random.shuffle(featureSet)

	#Computes recall and precision
	# You need to build 2 sets for each classification label:
	# a reference set of correct values, and a test set of observed values.

	#The first 6686 + 500 (day 14) tweets are positive and the rest (6757 + 500 (day 14)) are negative
	positive_tweets = featureSet[:7185]

	#Shuffle things so we do not keep testing only the same last ones
	random.shuffle(positive_tweets)

	#print(featureSet[7185])
	#To take 7185 from pos and 7185 from neg, but neg has 7213
	negative_tweets = featureSet[7185:14372]
	random.shuffle(negative_tweets)

	#Now split each class into a reference set and a test set
	pos_cutoff = len(positive_tweets)*3/4
	neg_cutoff = len(negative_tweets)*3/4

	# 75% of the tweets go to the reference (training) set and the rest to testing
	pos_references = positive_tweets[:pos_cutoff]
	pos_tests = positive_tweets[pos_cutoff:]

	neg_references = negative_tweets[:neg_cutoff]
	neg_tests = negative_tweets[neg_cutoff:]

	#Training and test sets to compute the accuracy
	training_set = pos_references + neg_references
	testing_set = pos_tests + neg_tests

	start_time = time.time()

	global classifier
	print("Comecou a treina-lo agora!")

	#training_set2 = [(t,l) for (t,l,twe) in training_set]

	classifier = nltk.NaiveBayesClassifier.train(training_set)
	#testing_set2 = [(t,l) for (t,l,twe) in testing_set]
	print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set)) * 100)
	classifier.show_most_informative_features(30)

	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)

	for i, (feats, label) in enumerate(testing_set):
	    refsets[label].add(i)
	    observed = classifier.classify(feats)
	    testsets[observed].add(i)
	 
	print 'pos precision:', precision(refsets['pos'], testsets['pos'])
	print 'pos recall:', recall(refsets['pos'], testsets['pos'])
	print 'pos F-measure:', f_measure(refsets['pos'], testsets['pos'])
	print 'neg precision:', precision(refsets['neg'], testsets['neg'])
	print 'neg recall:', recall(refsets['neg'], testsets['neg'])
	print 'neg F-measure:', f_measure(refsets['neg'], testsets['neg'])


	print("--- Classifier executed in %s seconds ---" % (time.time() - start_time))
Code example #26
File: scoring.py Project: Herka/nltk-trainer
def cross_fold(instances, trainf, testf, folds=10, trace=1, metrics=True, informative=0):
	if folds < 2:
		raise ValueError('must have at least 2 folds')
	# ensure isn't an exhaustible iterable
	instances = list(instances)
	# randomize so get an even distribution, in case labeled instances are
	# ordered by label
	random.shuffle(instances)
	l = len(instances)
	step = l / folds
	
	if trace:
		print('step %d over %d folds of %d instances' % (step, folds, l))
	
	accuracies = []
	precisions = collections.defaultdict(list)
	recalls = collections.defaultdict(list)
	f_measures = collections.defaultdict(list)
	
	for f in range(folds):
		if trace:
			print('\nfold %d' % (f+1))
			print('-----%s' % ('-'*len('%s' % (f+1))))
		
		start = f * step
		end = start + step
		train_instances = instances[:start] + instances[end:]
		test_instances = instances[start:end]
		
		if trace:
			print('training on %d:%d + %d:%d' % (0, start, end, l))
		
		obj = trainf(train_instances)
		
		if trace:
			print('testing on %d:%d' % (start, end))
		
		if metrics:
			refsets, testsets = ref_test_sets(obj, test_instances)
			
			for key in set(refsets.keys() + testsets.keys()):
				ref = refsets[key]
				test = testsets[key]
				p = precision(ref, test) or 0
				r = recall(ref, test) or 0
				f = f_measure(ref, test) or 0
				precisions[key].append(p)
				recalls[key].append(r)
				f_measures[key].append(f)
				
				if trace:
					print('%s precision: %f' % (key, p))
					print('%s recall: %f' % (key, r))
					print('%s f-measure: %f' % (key, f))
		
		accuracy = testf(obj, test_instances)
		
		if trace:
			print('accuracy: %f' % accuracy)
		
		accuracies.append(accuracy)
		
		if trace and informative and hasattr(obj, 'show_most_informative_features'):
			obj.show_most_informative_features(informative)
	
	if trace:
		print('\nmean and variance across folds')
		print('------------------------------')
		print('accuracy mean: %f' % (sum(accuracies) / folds))
		print('accuracy variance: %f' % array(accuracies).var())
		
		for key, ps in iteritems(precisions):
			print('%s precision mean: %f' % (key, sum(ps) / folds))
			print('%s precision variance: %f' % (key, array(ps).var()))
		
		for key, rs in iteritems(recalls):
			print('%s recall mean: %f' % (key, sum(rs) / folds))
			print('%s recall variance: %f' % (key, array(rs).var()))
		
		for key, fs in iteritems(f_measures):
			print('%s f_measure mean: %f' % (key, sum(fs) / folds))
			print('%s f_measure variance: %f' % (key, array(fs).var()))
	
	return accuracies, precisions, recalls, f_measures
Code example #27
    # print the classification results
    print 'Dictionary : ', dictionary.get_name(), '\n'
    print ConfusionMatrix(gold_standard,results).pp()
    print 'Accuracy: ', accuracy(gold_standard,results)
    for c in [0,1,-1]:
        print 'Metrics for class ', c
        gold = set()
        test = set()
        for i,x in enumerate(gold_standard):
            if x == c:
                gold.add(i)
        for i,x in enumerate(results):
            if x == c:
                test.add(i)
        print 'Precision: ', precision(gold, test)
        print 'Recall   : ', recall(gold, test)
        print 'F_measure: ', f_measure(gold, test)
    print '\n\n'


#################### Sentences classification ##########################

# Not reported in the paper because LIWC doesn't have neutral class

positive_sents = [reli.words_sentence_pos(s) for s in reli.sents(polarity='positive')]
negative_sents = [reli.words_sentence_pos(s) for s in reli.sents(polarity='negative')]
neutral_sents = [reli.words_sentence_pos(s) for s in reli.sents(polarity='neutral')]


print '#########################################################################'
print '###################### Sentences classification #########################'
Code example #28
def evaluate_features(feature_select, classifier_sel):
    posFeatures = []
    negFeatures = []
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list

    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            i = clean_text(i)
            posWords = [w for w in i.lower().split() if w not in stopWords]

            # posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            i = clean_text(i)
            negWords = [w for w in i.lower().split() if w not in stopWords]

            # negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)

            # selects 3/4 of the features to be used for training and 1/4 to be used for testing
    if (DATA_FLAG == 'kooshas_data'):

        trainFeatures = posFeatures + negFeatures

        neg_test, pos_test, neut_test = get_features(testing_data, no_classes=2, feat_select=feature_select)
        testFeatures = pos_test + neg_test  # for three class data add neutFeat

    else:
        posCutoff = int(math.floor(len(posFeatures) * 4 / 5))
        negCutoff = int(math.floor(len(negFeatures) * 4 / 5))
        trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
        testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

        aa, bb, cc = get_features(testing_data, no_classes=2, feat_select=feature_select)
        testFeatures = aa + bb

    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    # trains a Classifier
    if classifier_sel == 'NB':
        classifier = NaiveBayesClassifier.train(trainFeatures)
        # puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
        for i, (features, label) in enumerate(testFeatures):
            referenceSets[label].add(i)
            predicted = classifier.classify(features)
            testSets[predicted].add(i)

            # prints metrics to show how well the feature selection did
        print ('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
        print ('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))

        for i, (features, label) in enumerate(testFeatures):
            referenceSets[label].add(i)
            predicted = classifier.classify(features)
            testSets[predicted].add(i)

        print ('pos precision:', precision(referenceSets['pos'], testSets['pos']))
        print ('pos recall:', recall(referenceSets['pos'], testSets['pos']))
        print ('neg precision:', precision(referenceSets['neg'], testSets['neg']))
        print ('neg recall:', recall(referenceSets['neg'], testSets['neg']))

        classifier.show_most_informative_features(10)

    elif classifier_sel == 'MaxEnt':
        get_performance(LogisticRegression(), trainFeatures, testFeatures)

    elif classifier_sel == 'all_classifiers':
        get_performance(MultinomialNB(), trainFeatures, testFeatures)
        get_performance(BernoulliNB(), trainFeatures, testFeatures)
        get_performance(LogisticRegression(), trainFeatures, testFeatures)
        get_performance(SGDClassifier(), trainFeatures, testFeatures)
        get_performance(SVC(), trainFeatures, testFeatures)
        get_performance(LinearSVC(), trainFeatures, testFeatures)
        get_performance(NuSVC(kernel='rbf', nu=1), trainFeatures, testFeatures)



    elif classifier_sel == 'SVM':  # use SVM
        SVC_classifier = SklearnClassifier(SVC())
        classifier = SVC_classifier.train(trainFeatures)
        print("SVC_classifier accuracy:",
              (nltk.classify.accuracy(classifier, testFeatures)) * 100)

        for i, (features, label) in enumerate(testFeatures):
            referenceSets[label].add(i)
            predicted = classifier.classify(features)
            testSets[predicted].add(i)

        get_performance(classifier, referenceSets, testSets)
Code example #29
File: jason.py Project: joshsilverman/114-pa4
##pprint.pprint(results)
#pprint(results)
import sys
#sys.exit()

print '''
Classifier accuracy (Bayes): %s
B Precision (Bayes): %s
B Recall (Bayes): %s
I Precision (Bayes): %s
I Recall (Bayes): %s
O Precision (Bayes): %s
O Recall (Bayes): %s
''' % (accuracy(bayes_classifier, test_featureset), 
       precision(results[0]['B-SNP'], results[1]['B-SNP']), 
       recall(results[0]['B-SNP'], results[1]['B-SNP']), 
       precision(results[0]['I-SNP'], results[1]['I-SNP']),
       recall(results[0]['I-SNP'], results[1]['I-SNP']),
       precision(results[0]['O'], results[1]['O']),
       recall(results[0]['O'], results[1]['O']))

#bayes_classifier.show_most_informative_features(10)

sys.exit()

maxent_classifier = nltk.classify.MaxentClassifier.train(training_featureset)
maxent_results = get_results(maxent_classifier)

print '''
Classifier accuracy (MaxEnt): %s
B Precision (MaxEnt): %s
Code example #30
File: classifier.py Project: anov/honors
	def validate(self, validation_set):
		if self.classifier is None:
			raise Exception("self.classifier is None")
		reference=defaultdict(set)
		observed=defaultdict(set)
		observed['neutral']=set()

		for i, (tweet, label) in enumerate(validation_set):
			reference[label].add(i)
			observation=self.classify(tweet)
			observed[observation].add(i)
		acc=classify.accuracy(self.classifier, validation_set)
		posp=precision(reference['positive'],observed['positive'])
		posr=recall(reference['positive'], observed['positive'])
		posf=f_measure(reference['positive'], observed['positive'])
		negp=precision(reference['negative'],observed['negative'])
		negr=recall(reference['negative'], observed['negative'])
		negf=f_measure(reference['negative'], observed['negative'])
		
		print "accuracy: %s" % acc
		print "pos precision: %s" % posp
		print "pos recall: %s" % posr
		print "pos f-measure: %s" % posf
		print "neg precision: %s" % negp
		print "neg recall: %s" % negr
		print "neg f-measure: %s" % negf
		return (acc, posp, posr, posf, negp, negr, negf)