def evaluate_model(NBClassifier):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    accuracy = classify.accuracy(NBClassifier, validation_features) * 100
    for i, (feats, label) in enumerate(validation_features):
        refsets[label].add(i)
        observed = NBClassifier.classify(feats)
        testsets[observed].add(i)
    negative_precision = precision(refsets['negative'], testsets['negative'])
    neutral_precision = precision(refsets['neutral'], testsets['neutral'])
    positive_precision = precision(refsets['positive'], testsets['positive'])
    positive_recall = recall(refsets['positive'], testsets['positive'])
    neutral_recall = recall(refsets['neutral'], testsets['neutral'])
    negative_recall = recall(refsets['negative'], testsets['negative'])
    try:
        avg_recall = (1 / 3) * (negative_recall + positive_recall + neutral_recall)
        avg_precision = (1 / 3) * (negative_precision + positive_precision + neutral_precision)
        print(accuracy, avg_recall, avg_precision)
    except TypeError:
        pass

def evaluate_features(feature_select):
    # reading pre-labeled input and splitting it into lines
    negSentences = open(os.path.join(__location__, 'rt-polarity-neg.txt'), 'r', encoding='utf8')
    posSentences = open(os.path.join(__location__, 'rt-polarity-pos.txt'), 'r', encoding='utf8')
    negSentences = re.split(r'\n', negSentences.read())
    posSentences = re.split(r'\n', posSentences.read())
    stop = stopwords.words('english')
    posFeatures = []
    negFeatures = []

    # breaks up the sentences into lists of individual words
    # and creates instance structures for the classifier
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i)
        posWords = [j for j in posWords if j not in stop]
        posWords = [feature_select(posWords), 'pos']
        posFeatures.append(posWords)
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i)
        negWords = [j for j in negWords if j not in stop]
        negWords = [feature_select(negWords), 'neg']
        negFeatures.append(negWords)

    # uses 3/4 of the features as the training set
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    # trains the classifier and runs it on the testFeatures
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # sets up labels to look at the output
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):  # enumerate adds an index to each item
        referenceSets[label].add(i)  # recorded polarity for these test sentences
        predicted = classifier.classify(features)  # classifier's proposed polarity for the tests
        testSets[predicted].add(i)

    # outputs
    print('train on %s instances, test on %s instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', scores.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', scores.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', scores.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', scores.recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)

def evaluate_features(feature_select):
    ## Label all Trump tweets with 'pos' and all other tweets with 'neg'
    ## Divide them into Train and Test subsets
    posFeatures_train = []
    negFeatures_train = []
    for i in Trump_train:
        posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords = [feature_select(posWords), 'pos']
        posFeatures_train.append(posWords)
    for i in Adele_train + Clinton_train:
        negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords = [feature_select(negWords), 'neg']
        negFeatures_train.append(negWords)

    posFeatures_test = []
    negFeatures_test = []
    for i in Trump_test:
        posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords = [feature_select(posWords), 'pos']
        posFeatures_test.append(posWords)
    for i in Adele_test + Clinton_test:
        negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords = [feature_select(negWords), 'neg']
        negFeatures_test.append(negWords)

    trainFeatures = posFeatures_train + negFeatures_train
    testFeatures = posFeatures_test + negFeatures_test

    ## Trains a Naive Bayes classifier
    ## Read more here: https://en.wikipedia.org/wiki/Naive_Bayes_classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    ## Initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    ## Puts correctly labeled sentences in referenceSets and the predictively labeled version in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    ## Prints metrics to show how well the feature selection did
    ## Accuracy: percentage of items in the test set that the classifier labeled correctly
    ## Precision: True_Positive / (True_Positive + False_Positive)
    ## Recall: True_Positive / (True_Positive + False_Negative)
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)

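# --- Illustrative sketch, not taken from any snippet above: a minimal,
# self-contained example of the set-based metrics these functions rely on.
# nltk.metrics precision/recall/f_measure compare a reference set and a
# test set of item indices; the toy index sets below are invented.
import collections

from nltk.metrics import f_measure, precision, recall

referenceSets = collections.defaultdict(set)
testSets = collections.defaultdict(set)
referenceSets['pos'] = {0, 1, 2}       # items 0-2 are truly 'pos'
referenceSets['neg'] = {3, 4, 5}
testSets['pos'] = {0, 1, 3}            # the classifier called items 0, 1 and 3 'pos'
testSets['neg'] = {2, 4, 5}

# precision = |ref & test| / |test|, recall = |ref & test| / |ref|
print('pos precision:', precision(referenceSets['pos'], testSets['pos']))   # 2/3
print('pos recall:', recall(referenceSets['pos'], testSets['pos']))         # 2/3
print('pos F-measure:', f_measure(referenceSets['pos'], testSets['pos']))   # harmonic mean of the two
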
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    global cnt
    cnt += 1
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # breaks up the sentences into lists of individual words (as selected by the input mechanism)
    # and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']  # calls make_full_dict and returns a dict of {word: True}
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)

    # selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    # trains a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    # puts correctly labeled sentences in referenceSets and the predictively labeled version in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('pos f1-score:', f_measure(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    print('neg f1-score:', f_measure(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
    print('=================================================')

def accuracy_measure(classifier, cross_valid_set):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(cross_valid_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    print('pos Precision:', precision(refsets[1], testsets[1]))
    print('pos Recall:', recall(refsets[1], testsets[1]))
    print('pos F-measure:', f_measure(refsets[1], testsets[1]))
    print('neg Precision:', precision(refsets[0], testsets[0]))
    print('neg Recall:', recall(refsets[0], testsets[0]))
    print('neg F-measure:', f_measure(refsets[0], testsets[0]))

def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    global cnt
    cnt += 1
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # breaks up the sentences into lists of individual words (as selected by the input mechanism)
    # and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, "r") as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), "pos"]  # calls make_full_dict and returns a dict of {word: True}
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, "r") as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), "neg"]
            negFeatures.append(negWords)

    # selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    # trains a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    # puts correctly labeled sentences in referenceSets and the predictively labeled version in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # prints metrics to show how well the feature selection did
    print("train on %d instances, test on %d instances" % (len(trainFeatures), len(testFeatures)))
    print("accuracy:", nltk.classify.util.accuracy(classifier, testFeatures))
    print("pos precision:", precision(referenceSets["pos"], testSets["pos"]))
    print("pos recall:", recall(referenceSets["pos"], testSets["pos"]))
    print("pos f1-score:", f_measure(referenceSets["pos"], testSets["pos"]))
    print("neg precision:", precision(referenceSets["neg"], testSets["neg"]))
    print("neg recall:", recall(referenceSets["neg"], testSets["neg"]))
    print("neg f1-score:", f_measure(referenceSets["neg"], testSets["neg"]))
    classifier.show_most_informative_features(10)
    print("=================================================")

def getMetrics(fileName, resultPath, expectedPath, enc):
    output = open(resultPath, 'r', encoding=enc)
    outputResult = output.read()
    outputSentences = set(sent_tokenize(outputResult))
    output.close()

    expected = open(expectedPath, 'r', encoding=enc)
    expectedResult = expected.readlines()
    # removing newlines for better results
    expectedSentences = []
    for line in expectedResult:
        expectedSentences += [line[:-1]]
    expectedSentences = set(expectedSentences)
    expected.close()

    recallResult = recall(expectedSentences, outputSentences)
    precisionResult = precision(expectedSentences, outputSentences)
    f1Score = calculateF1score(precisionResult, recallResult) if recallResult != 0 and precisionResult != 0 else 0
    resultString = ("File: " + fileName + " Recall: " + str(recallResult) +
                    " Precision: " + str(precisionResult) + " F1 Score: " + str(f1Score))
    print(resultString)
    return {
        "recall": recallResult,
        "precision": precisionResult,
        "relevance": 1 if precisionResult != 0 else 0,
        "f1": f1Score
    }

def evaluate(ref_tags, hyp_tags):
    if len(ref_tags) != len(hyp_tags):
        raise ValueError('reference and hypothesis have different numbers of lines')
    n = len(ref_tags)
    counter = Counter(ref_tags)
    unique_tags = set(ref_tags)
    prec_dict, rec_dict, f_dict = defaultdict(float), defaultdict(float), defaultdict(float)
    for tag in sorted(unique_tags):
        ref_ids = {i for i, ref_tag in enumerate(ref_tags) if ref_tag == tag}
        hyp_ids = {i for i, hyp_tag in enumerate(hyp_tags) if hyp_tag == tag}
        prec_dict[tag] = precision(ref_ids, hyp_ids)
        rec_dict[tag] = recall(ref_ids, hyp_ids)
        f_dict[tag] = f_measure(ref_ids, hyp_ids)
        if prec_dict[tag] is None:
            warn(f'Undefined precision for {tag}; converting to 0.0')
            prec_dict[tag] = 0.
        if rec_dict[tag] is None:
            warn(f'Undefined recall for {tag}; converting to 0.0')
            rec_dict[tag] = 0.
        if f_dict[tag] is None:
            warn(f'Undefined F-score for {tag}; converting to 0.0')
            f_dict[tag] = 0.
        # frequency-weighted overall averages
        prec_dict[OVERALL_KEY] += counter[tag] * prec_dict[tag] / n
        rec_dict[OVERALL_KEY] += counter[tag] * rec_dict[tag] / n
        f_dict[OVERALL_KEY] += counter[tag] * f_dict[tag] / n
    return EvalResult(precision=prec_dict,
                      recall=rec_dict,
                      f1=f_dict,
                      conf_matrix=ConfusionMatrix(ref_tags, hyp_tags, sort_by_count=True))

def me_classifier(exclude_list):
    me_classifier = 0
    with open(train_data, 'r', encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile)
        feature_set = [(feature_set_generator(text, length, label, exclude_list), label)
                       for text, length, label in reader]
        # print(feature_set)
        me_classifier = MaxentClassifier.train(feature_set, "megam")

    accuracy = 0.0
    with open(test_data, 'r', encoding='utf-8', errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        test_feature_set = [(feature_set_generator(text, length, label, exclude_list), label)
                            for text, length, label in test_reader]
        accuracy = classify.accuracy(me_classifier, test_feature_set)

    classified = collections.defaultdict(set)
    observed = collections.defaultdict(set)
    i = 1
    with open(test_data, 'r', encoding='utf-8', errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        for text, length, label in test_reader:
            observed[label].add(i)
            classified[me_classifier.classify(
                feature_set_generator(text, length, label, exclude_list))].add(i)
            i += 1

    return (accuracy,
            precision(observed['1'], classified['1']),
            recall(observed['1'], classified['1']),
            f_measure(observed['1'], classified['1']),
            precision(observed['0'], classified['0']),
            recall(observed['0'], classified['0']),
            f_measure(observed['0'], classified['0']))

def multi_metrics(multi_classifier, test_feats):
    mds = []
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feat, labels) in enumerate(test_feats):
        for label in labels:
            refsets[label].add(i)
        guessed = multi_classifier.classify(feat)
        for label in guessed:
            testsets[label].add(i)
        mds.append(masi_distance(set(labels), guessed))

    avg_md = sum(mds) / float(len(mds))
    precisions = {}
    recalls = {}

    for label in multi_classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label])
        recalls[label] = recall(refsets[label], testsets[label])

    return precisions, recalls, avg_md

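# --- Illustrative sketch with invented label sets: the multi-label helper
# above also averages the MASI distance between the reference label set and
# the guessed label set; nltk.metrics.distance.masi_distance penalises
# partial overlap and is 0.0 only for identical sets.
from nltk.metrics.distance import masi_distance

reference_labels = {'politics', 'economy'}
guessed_labels = {'politics'}
print(masi_distance(reference_labels, guessed_labels))      # partial overlap -> distance > 0
print(masi_distance(reference_labels, reference_labels))    # identical sets -> 0.0
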
def calculate_metrics(self):
    included_logs = 0
    metrics = {}
    cc = SmoothingFunction()
    for identifier in self._values:
        if self._values[identifier].get('target_text', None) is not None:
            included_logs += 1
            target_text = self._values[identifier]['target_text']
            output_text = self._values[identifier]['output_text']
            metrics['BLEU'] = metrics.get('BLEU', 0) + sentence_bleu(
                [target_text], output_text, smoothing_function=cc.method4)
            metrics['accuracy'] = metrics.get('accuracy', 0) + accuracy(target_text, output_text)
            target_text = set(target_text)
            output_text = set(output_text)
            metrics['precision'] = metrics.get('precision', 0) + precision(target_text, output_text)
            metrics['recall'] = metrics.get('recall', 0) + recall(target_text, output_text)
            metrics['f_measure'] = metrics.get('f_measure', 0) + f_measure(target_text, output_text)
    if included_logs != 0:
        for metric in metrics:
            metrics[metric] /= included_logs
    return metrics, included_logs

def precision_recall_2way_with_threshold(classifier, testFeatures, threshold):
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    probs = classifier.prob_classify_many([feats for (feats, label) in testFeatures])
    trues = 0
    for i, (feats, label) in enumerate(testFeatures):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        prob = probs[i]
        if prob.prob(observed) < threshold:
            observed = 'neu'
        testsets[observed].add(i)
        if observed == label:
            trues += 1
    precisions = {}
    recalls = {}
    for label in classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label])
        recalls[label] = recall(refsets[label], testsets[label])
    accuracy = float(trues) / len(testFeatures)
    return precisions, recalls, accuracy

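# --- Illustrative sketch (toy training data and threshold invented here):
# the pattern used above relies on prob_classify to read the classifier's
# confidence so that low-confidence predictions can be diverted to 'neu'.
from nltk.classify import NaiveBayesClassifier

train = [({'great': True}, 'pos'), ({'good': True}, 'pos'),
         ({'awful': True}, 'neg'), ({'bad': True}, 'neg')]
toy_classifier = NaiveBayesClassifier.train(train)

feats = {'good': True, 'bad': True}      # mixed signals
dist = toy_classifier.prob_classify(feats)
observed = dist.max()
if dist.prob(observed) < 0.7:            # hypothetical confidence threshold
    observed = 'neu'
print(observed, dist.prob(dist.max()))
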
def calculate_topk_recall(reference, test, k):
    substitute_pairs = set()
    for food in reference:
        substitutes = reference[food][:k]
        substitute_pairs.update({tuple([food, substitute]) for substitute in substitutes})
    topk_rec = scores.recall(reference=substitute_pairs, test=test)
    return topk_rec

def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    # breaks the sentences into lists of individual words (as selected by the input mechanism)
    # and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)

    # selects 3/4 of the features for training and 1/4 for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    # trains a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # initializes referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    # puts correctly labeled sentences in referenceSets and the predicted labels in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('pos F1:', f_measure(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    print('neg F1:', f_measure(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)

def testing(sent_classifier):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, category) in enumerate(testing_set):
        refsets[category].add(i)
        observed = sent_classifier.classify(feats)
        testsets[observed].add(i)

    print('Classifier Accuracy: ', (nltk.classify.accuracy(sent_classifier, testing_set)) * 100, "%")
    print('Classifier pos Precision:', scores.precision(refsets['pos'], testsets['pos']) * 100, "%")
    print('Classifier pos Recall:', scores.recall(refsets['pos'], testsets['pos']) * 100, "%")
    print('Classifier pos F-measure:', scores.f_measure(refsets['pos'], testsets['pos']) * 100, "%")
    print('Classifier neg Precision:', scores.precision(refsets['neg'], testsets['neg']) * 100, "%")
    print('Classifier neg Recall:', scores.recall(refsets['neg'], testsets['neg']) * 100, "%")
    print('Classifier neg F-measure:', scores.f_measure(refsets['neg'], testsets['neg']) * 100, "%")
    print('\n')

def kset_stat(silvs, golds):
    s1 = set(map(to_root, golds))
    s2 = set(map(to_root, silvs))
    # print(s1, s2)
    p = precision(s1, s2)
    r = recall(s1, s2)
    f = f_measure(s1, s2)
    if not (p and r and f):
        return {'p': 0, 'r': 0, 'f': 0}
    return {'p': p, 'r': r, 'f': f}

def test_trained_classifier(classifier, test_samples):
    """Prints precision/recall statistics of an NLTK classifier"""
    import collections
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (sample, label) in enumerate(test_samples):
        refsets[label].add(i)
        observed = classifier.classify(sample)
        testsets[observed].add(i)
    print("pos precision:", scores.precision(refsets["pos"], testsets["pos"]))
    print("pos recall:", scores.recall(refsets["pos"], testsets["pos"]))
    print("pos F-measure:", scores.f_measure(refsets["pos"], testsets["pos"]))
    print("neg precision:", scores.precision(refsets["neg"], testsets["neg"]))
    print("neg recall:", scores.recall(refsets["neg"], testsets["neg"]))
    print("neg F-measure:", scores.f_measure(refsets["neg"], testsets["neg"]))

def classification_result(classifier, test_set):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    reflist = []
    testlist = []
    for i, (tweet, label) in enumerate(test_set):
        refsets[label].add(i)
        reflist.append(label)
        observed = classifier.classify(tweet)
        testsets[observed].add(i)
        testlist.append(observed)
    print(len(refsets['1']))
    print("Accuracy : ", nltk.classify.accuracy(classifier, test_set) * 100)
    print("Precision Pos: ", precision(refsets['1'], testsets['1']) * 100)
    print("Recall Pos: ", recall(refsets['1'], testsets['1']) * 100)
    print("F Measure Pos: ", f_measure(refsets['1'], testsets['1']) * 100)
    print("Precision Neg: ", precision(refsets['0'], testsets['0']) * 100)
    print("Recall Neg: ", recall(refsets['0'], testsets['0']) * 100)
    print("F Measure Neg: ", f_measure(refsets['0'], testsets['0']) * 100)
    print("Confusion Matrix : \n", ConfusionMatrix(reflist, testlist))

def prec_rec(test_users, exact_neighbors, apx_neighbors):
    pr = []
    rc = []
    for uid in test_users:
        p = precision(set(exact_neighbors[uid]), set(apx_neighbors[uid]))
        r = recall(set(exact_neighbors[uid]), set(apx_neighbors[uid]))
        if p is not None:
            pr.append(p)
        if r is not None:
            rc.append(r)
    return (np.mean(pr), np.mean(rc))

def avgOffEval(inpath1, inpath2):
    print('\n=============================')
    print('NER evaluation (single entity class/mention-level, full/offsets, avg. of abstract-level)')
    print('=============================')
    print('==> gold', inpath1)
    print('==> pred', inpath2)
    print('=============================')
    recs = []
    pres = []
    fscs = []
    for filename1 in glob.glob(inpath1 + "/*ann"):
        filen1 = filename1.split('/')[len(filename1.split('/')) - 1]
        for filename2 in glob.glob(inpath2 + "/*ann"):
            filen2 = filename2.split('/')[len(filename2.split('/')) - 1]
            if filen1 == filen2:
                preds = set([])
                refrs = set([])
                file1 = codecs.open(filename1, 'r', encoding='utf-8')
                file2 = codecs.open(filename2, 'r', encoding='utf-8')
                for line1 in file1.readlines():
                    if len(line1.split('\t')) > 1:
                        men1 = line1.split('\t')[2].strip()
                        off1 = '-'.join([w.strip() for w in line1.split('\t')[1].split(' ')])
                        gold = men1 + '_' + off1
                        refrs.add(gold)
                for line2 in file2.readlines():
                    if len(line2.split('\t')) > 1:
                        men2 = line2.split('\t')[2].strip()
                        off2 = '-'.join([w.strip() for w in line2.split('\t')[1].split(' ')])
                        pred = men2 + '_' + off2
                        preds.add(pred)
                if len(preds) > 0 and len(refrs) > 0:
                    rec = scores.recall(refrs, preds)
                    pre = scores.precision(refrs, preds)
                    fsc = scores.f_measure(refrs, preds)
                else:
                    rec = 0
                    pre = 0
                    fsc = 0
                recs.append(rec)
                pres.append(pre)
                fscs.append(fsc)
    print('average \t R={R} \t P={P} \t F1={F}'.format(R=str(np.mean(recs)),
                                                       P=str(np.mean(pres)),
                                                       F=str(np.mean(fscs))))
    print('=============================\n')

def get_results(self, classifier, test_set, target):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    target_precision = precision(refsets[target], testsets[target])
    target_recall = recall(refsets[target], testsets[target])
    target_f_measure = f_measure(refsets[target], testsets[target])
    results = (target_precision, target_recall, target_f_measure)
    return results

def printEval(realSet, testSet):
    precisionPos = precision(realSet['pos'], testSet['pos'])
    precisionNeg = precision(realSet['neg'], testSet['neg'])
    precisionNeutre = precision(realSet['neutre'], testSet['neutre'])
    recallPos = recall(realSet['pos'], testSet['pos'])
    recallNeg = recall(realSet['neg'], testSet['neg'])
    fmesurePos = f_measure(realSet['pos'], testSet['pos'])
    fmesureNeg = f_measure(realSet['neg'], testSet['neg'])
    print("Precision Pos: %f - Neg: %f " % (float(precisionPos), float(precisionNeg)))
    print("Recall Pos: %f - Neg: %f " % (float(recallPos), float(recallNeg)))
    print("F-Mesure Pos: %f - Neg: %f " % (float(fmesurePos), float(fmesureNeg)))

def print_precision_recall(self):
    refset = collections.defaultdict(set)
    testset = collections.defaultdict(set)
    for i, (ft, label) in enumerate(self.test_set):
        refset[label].add(i)
        predicted = self.classifier.classify(ft)
        testset[predicted].add(i)
    for tag in refset.keys():
        prc = precision(refset[tag], testset[tag])
        rec = recall(refset[tag], testset[tag])
        print('{}: precision={:4.2f} recall={:4.2f}'.format(tag, prc, rec))

def main(command, classifier_type):
    feature_functions = [unigram_freqs]
    corpus_file = open('ratings_corpus.json')
    corpus = json.load(corpus_file)
    corpus_file.close()
    feature_representation = [(extract_features(document, feature_functions), label)
                              for document, label in corpus]
    train_set, test_set = split_data(feature_representation)

    classifier = ''
    if command == 'new':
        if classifier_type == 'decision_tree':
            classifier = nltk.classify.DecisionTreeClassifier.train(train_set)
        elif classifier_type == 'maxent':
            classifier = nltk.classify.maxent.MaxentClassifier.train(train_set)
    elif command == 'load':
        if classifier_type == 'decision_tree':
            classifier_file = open('decisiontree_classifier.pickle', 'rb')
            classifier = pickle.load(classifier_file)
            classifier_file.close()
        elif classifier_type == 'maxent':
            classifier_file = open('maxent_classifier.pickle', 'rb')
            classifier = pickle.load(classifier_file)
            classifier_file.close()

    predictions = []
    golds = []
    for test_doc, rating in test_set:
        predictions.append(classifier.classify(test_doc))
        golds.append(rating)

    pred_sets = initialize_sets(ALL_RATINGS)
    gold_sets = initialize_sets(ALL_RATINGS)
    for doc_id, rating in enumerate(predictions):
        pred_sets[rating].add(doc_id)
    for doc_id, rating in enumerate(golds):
        gold_sets[rating].add(doc_id)

    for label in ALL_RATINGS:
        r = scores.recall(gold_sets[label], pred_sets[label])
        p = scores.precision(gold_sets[label], pred_sets[label])
        f = scores.f_measure(gold_sets[label], pred_sets[label])
        if not (r is None or p is None or f is None):
            f = float(f)
            print('<{}> P: {:.2}, R: {:.2}, F: {:.2}'.format(label, p, r, f))

def evaluate_features(feature_select, classify_method):
    posFeatures = []
    negFeatures = []
    with open(RT_POLARITY_POS_FILE, 'r', encoding='utf8') as posSentences:
        posWords = []
        for i in posSentences:
            if "<review" in i:
                continue
            if "</review" in i:
                posWords = [feature_select(posWords), 'pos']
                posFeatures.append(posWords)
                posWords = []
                continue
            line = re.sub(r'[{}]+'.format(PUNCTUATION), "", i)
            posWords += jieba.cut(line, cut_all=False)
    with open(RT_POLARITY_NEG_FILE, 'r', encoding='utf8') as negSentences:
        negWords = []
        for i in negSentences:
            if "<review" in i:
                continue
            if "</review" in i:
                negWords = [feature_select(negWords), 'neg']
                negFeatures.append(negWords)
                negWords = []
                continue
            line = re.sub(r'[{}]+'.format(PUNCTUATION), "", i)
            negWords += jieba.cut(line, cut_all=False)

    # get trainFeatures and testFeatures
    trainFeatures = posFeatures + negFeatures
    testFeatures = getTestFeatures(feature_select)

    classifier = nltk.classify.SklearnClassifier(classify_method)
    classifier.train(trainFeatures)

    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    # puts correctly labeled sentences in referenceSets and the predictively labeled version in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    pos_precision = scores.precision(referenceSets['pos'], testSets['pos'])
    pos_recall = scores.recall(referenceSets['pos'], testSets['pos'])
    neg_precision = scores.precision(referenceSets['neg'], testSets['neg'])
    neg_recall = scores.recall(referenceSets['neg'], testSets['neg'])
    print('pos precision:', pos_precision)
    print('pos recall:', pos_recall)
    print('neg precision:', neg_precision)
    print('neg recall:', neg_recall)
    print('F1 pos:', 2 * pos_precision * pos_recall / (pos_precision + pos_recall))
    print('F1 neg:', 2 * neg_precision * neg_recall / (neg_precision + neg_recall))

def assess_classifier(classifier, test_set):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    count = 0
    print('Precision = ' + str(precision(refsets['spam'], testsets['spam'])))
    print('Recall = ' + str(recall(refsets['spam'], testsets['spam'])))
    print('F measure = ' + str(f_measure(refsets['spam'], testsets['spam'], alpha=0.5)))
    print('FP rate = ' + str(abs((len(refsets['ham']) - len(testsets['ham'])) /
                                 (len(refsets['spam']) + len(refsets['ham'])))))

def main():
    brown_tagged_sents = brown.tagged_sents(categories='news')
    size = int(len(brown_tagged_sents) * 0.8)
    train_data = brown_tagged_sents[:size]
    test_data = brown_tagged_sents[size:]

    # store pickle files
    if not (os.path.isfile('UnigramTagger.pkl') and os.path.isfile('Tnt_Tagger.pkl')
            and os.path.isfile('PerceptronTagger.pkl')):
        unigram_tagger = unigram_tag(train_data)
        tnt_tagger = tnt_tag(train_data)
        perc_tagger = perceptron_tag(train_data)
        [store_pickle(each_) for each_ in [unigram_tagger, tnt_tagger, perc_tagger]]

    # load pickle files and get each model as a (name, model) tuple
    models_files_tuple = [(each_.split('.')[0], retrieve_pickle(each_))
                          for each_ in ['UnigramTagger.pkl', 'PerceptronTagger.pkl', 'Tnt_Tagger.pkl']]

    # test the loaded models on test data
    print("TESTING LOADED MODELS")
    for tagg_name, tagg_mode in models_files_tuple:
        print("Loaded {tag_name} evaluation results: {evaluate_res}".format(
            tag_name=tagg_name, evaluate_res=tagg_mode.evaluate(test_data)))

    # Tabulate and calculate accuracies, choose the best one based on F1 value
    reference_sentences_lists = [list(map(lambda pair_: pair_[1], each)) for each in test_data]
    test_sentences_lists = [list(map(lambda pair_: pair_[0], each)) for each in test_data]
    reference_lst = list()
    test_lst = list()
    [reference_lst.extend(each_lst) for each_lst in reference_sentences_lists[:1000]]
    [test_lst.extend(each_lst) for each_lst in test_sentences_lists[:1000]]

    for tagg_name, tagger_mod in models_files_tuple:
        if tagg_name == "Tnt_Tagger":
            reference_lst = reference_lst[:700]
            test_lst = test_lst[:700]
        result_tokens = tagger_mod.tag(test_lst)
        result_tokens__ = list(map(lambda pair: 'UNKNOWN' if pair[1] is None else pair[1], result_tokens))
        print("{} Evaluation Results".format(tagg_name))
        print("Precision: ", precision(set(reference_lst), set(result_tokens__)))
        print("Recall: ", recall(set(reference_lst), set(result_tokens__)))
        print("F measure: ", f_measure(set(reference_lst), set(result_tokens__)))

def precision_recall(classifier, testfeats):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    precisions = {}
    recalls = {}
    for label in classifier.labels():
        precisions[label] = scores.precision(refsets[label], testsets[label])
        recalls[label] = scores.recall(refsets[label], testsets[label])
    return precisions, recalls

def precision_recall(classifier, testFeatures):
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    for i, (feats, label) in enumerate(testFeatures):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    precisions = {}
    recalls = {}
    for label in classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label])
        recalls[label] = recall(refsets[label], testsets[label])
    return precisions, recalls

def compute_evaluation_scores(classifier: ClassifierBase,
                              data_set: List[Tuple[Dict, str]],
                              evaluated_class: LikeTypeEnum) \
        -> Dict[str, float]:
    """Evaluate classifier on dataset with common metrics.

    Namely calculates: precision, recall, accuracy, f-measure.
    And adds: tp, fp, fn, tn (true/false positives/negatives)."""
    clas_scores: dict = {}
    correctly_classified: int = 0

    # metrics
    refsets: DefaultDict[str, set] = defaultdict(set)
    testsets: DefaultDict[str, set] = defaultdict(set)

    for i, (fs, label) in enumerate(data_set):
        refsets[label].add(i)
        classified = classifier.classify(fs)
        testsets[classified].add(i)
        if label == classified:
            correctly_classified += 1

    # we don't know how many negative classes there are or what their values are,
    # so we take the union of all sets and subtract the positive elements
    negative_test: set = reduce(lambda a, b: a.union(b), testsets.values()) \
        - testsets[evaluated_class.value]
    negative_ref: set = reduce(lambda a, b: a.union(b), refsets.values()) \
        - refsets[evaluated_class.value]
    positive_test: set = testsets[evaluated_class.value]
    positive_ref: set = refsets[evaluated_class.value]

    clas_scores['tp'] = len(positive_test & positive_ref) / len(data_set)
    clas_scores['fp'] = len(positive_test & negative_ref) / len(data_set)
    clas_scores['tn'] = len(negative_test & negative_ref) / len(data_set)
    clas_scores['fn'] = len(negative_test & positive_ref) / len(data_set)

    clas_scores['precision'] = scores.precision(positive_ref, positive_test)
    clas_scores['recall'] = scores.recall(positive_ref, positive_test)
    clas_scores['f_measure'] = scores.f_measure(positive_ref, positive_test)
    # accuracy is true positives and true negatives over all instances
    clas_scores['accuracy'] = correctly_classified / len(data_set)

    return clas_scores

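# --- Illustrative sketch with toy index sets: checks that the tp/fp/fn set
# arithmetic used above reproduces the values returned by the NLTK metrics.
from nltk.metrics import precision, recall

positive_ref = {0, 1, 2, 3}    # items truly in the evaluated class
positive_test = {2, 3, 4}      # items the classifier assigned to it

tp = len(positive_test & positive_ref)    # 2
fp = len(positive_test - positive_ref)    # 1
fn = len(positive_ref - positive_test)    # 2

assert precision(positive_ref, positive_test) == tp / (tp + fp)   # 2/3
assert recall(positive_ref, positive_test) == tp / (tp + fn)      # 1/2
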
def macroOffEval(inpath1, inpath2):
    print('\n=============================')
    print('NER evaluation (single entity class/mention-level, full/offsets, corpus-level)')
    print('=============================')
    print('==> gold', inpath1)
    print('==> pred', inpath2)
    print('=============================')
    preds = set([])
    refrs = set([])
    for filename1 in glob.glob(inpath1 + "/*ann"):
        filen1 = filename1.split('/')[len(filename1.split('/')) - 1]
        for filename2 in glob.glob(inpath2 + "/*ann"):
            filen2 = filename2.split('/')[len(filename2.split('/')) - 1]
            if filen1 == filen2:
                file1 = codecs.open(filename1, 'r', encoding='utf-8')
                file2 = codecs.open(filename2, 'r', encoding='utf-8')
                for line1 in file1.readlines():
                    if len(line1.split('\t')) > 1:
                        men1 = line1.split('\t')[2].strip()
                        off1 = '-'.join([w.strip() for w in line1.split('\t')[1].split(' ')])
                        gold = men1 + '_' + off1
                        refrs.add(gold)
                for line2 in file2.readlines():
                    if len(line2.split('\t')) > 1:
                        men2 = line2.split('\t')[2].strip()
                        off2 = '-'.join([w.strip() for w in line2.split('\t')[1].split(' ')])
                        pred = men2 + '_' + off2
                        preds.add(pred)
    rec = scores.recall(refrs, preds)
    pre = scores.precision(refrs, preds)
    fsc = scores.f_measure(refrs, preds)
    print('macro \t R={R} \t P={P} \t F1={F}'.format(R=str(rec), P=str(pre), F=str(fsc)))
    print('=============================\n')

def show_metrics(classifier, test_set):
    description = ""
    # Given a classifier and a test set, print metrics for the classifier
    description = description + "\n" + "Accuracy: " + str(nltk.classify.accuracy(classifier, test_set))

    # Creates two sets: one with references (correct results) and the other with tests (classifier predictions).
    # These sets are divided into fact-checkable and non-fact-checkable sets that contain a unique id (integer)
    # for each sentence.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)  # 1, neg
        observed = classifier.classify(feats)  # neg
        testsets[observed].add(i)  # 1, neg

    model_precision = int(precision(refsets['fact-checkable'], testsets['fact-checkable']) * 100)
    model_recall = int(recall(refsets['fact-checkable'], testsets['fact-checkable']) * 100)
    model_f_measure = int(f_measure(refsets['fact-checkable'], testsets['fact-checkable'], 0.3) * 100)

    description += "\n" + "PRECISION: Of the sentences predicted fact-checkable, " + str(
        model_precision) + "% were actually fact-checkable"
    description += "\n" + "RECALL: Of the sentences that were fact-checkable, " + str(
        model_recall) + "% were predicted correctly"
    description += "\n" + "F-MEASURE (balance between precision and recall): " + str(
        model_f_measure) + "%"

    # Same for non-fact-checkables
    # print('non-fact-checkable precision:', precision(refsets['non-fact-checkable'], testsets['non-fact-checkable']))
    # print('non-fact-checkable recall:', recall(refsets['non-fact-checkable'], testsets['non-fact-checkable']))
    # print('non-fact-checkable F-measure:', f_measure(refsets['non-fact-checkable'], testsets['non-fact-checkable']))

    print(description)
    # informative
    classifier.show_most_informative_features(25)
    return description

def get_measures(reference, test):
    tp = tn = fp = fn = 0
    for ((_, r), (_, t)) in zip(reference, test):
        if r == t == "O":
            tn += 1
        elif r == t == "ORG":
            tp += 1
        elif r == "O" and t == "ORG":
            fp += 1
        elif r == "ORG" and t == "O":
            fn += 1
    matrix = [tp, tn, fp, fn]
    acc = accuracy(reference, test)
    reference_set = set(reference)
    test_set = set(test)
    pre = precision(reference_set, test_set)
    rec = recall(reference_set, test_set)
    f = f_measure(reference_set, test_set)
    return acc, pre, rec, f, matrix

def get_performance_dataframe(tagger, test_tag_list):
    """Returns DataFrame with metrics for individual tag combinations. For NLTK taggers."""
    truth_sets = defaultdict(set)
    test_sets = defaultdict(set)
    for n, (w, label) in enumerate(test_tag_list):
        observed = tagger.tag([w])[0][1]
        truth_sets[label].add(n)
        test_sets[observed].add(n)

    performance_dict = dict()
    for key in test_sets.keys():
        performance_dict.setdefault(
            key,
            {
                'Precision': precision(truth_sets[key], test_sets[key]),
                'Recall': recall(truth_sets[key], test_sets[key]),
                'F1': f_measure(truth_sets[key], test_sets[key])
            }
        )
    df = pd.DataFrame(performance_dict).T
    return df

def compute_pairwise(hashed_er_anns_df):
    """
    Returns pairwise comparisons between users (user_a & user_b)
    that have completed similar documents
    """
    # Make user_pks unique
    userset = set(hashed_er_anns_df.user_id)

    inter_annotator_arr = []
    # For each unique user comparison, compute
    for user_a, user_b in itertools.combinations(userset, 2):
        # The list of document_pks that each user had completed
        user_a_set = set(hashed_er_anns_df[hashed_er_anns_df['user_id'] == user_a].document_pk)
        user_b_set = set(hashed_er_anns_df[hashed_er_anns_df['user_id'] == user_b].document_pk)

        # Only compare documents both users have completed
        pmid_set = user_a_set.intersection(user_b_set)

        # If user_a and user_b have completed shared PMIDs, compute comparisons
        if len(pmid_set) != 0:
            pmid_df = hashed_er_anns_df[hashed_er_anns_df['document_pk'].isin(pmid_set)]
            ref_set = set(pmid_df[pmid_df['user_id'] == user_a].hash)
            test_set = set(pmid_df[pmid_df['user_id'] == user_b].hash)

            # Compute the precision, recall and F-measure based on the unique hashes
            inter_annotator_arr.append(
                (user_a, user_b, len(pmid_set),
                 nltk_scoring.precision(ref_set, test_set),
                 nltk_scoring.recall(ref_set, test_set),
                 nltk_scoring.f_measure(ref_set, test_set)))

    return pd.DataFrame(inter_annotator_arr,
                        columns=('user_a', 'user_b', 'docs_compared',
                                 'precision', 'recall', 'f-score'))

def compute_pairwise(hashed_annotations_df):
    '''
    Returns pairwise comparisons between users (user_a & user_b)
    that have completed similar documents
    '''
    # Make user_pks unique
    userset = set(hashed_annotations_df.user)

    inter_annotator_arr = []
    # For each unique user comparison, compute
    for user_a, user_b in itertools.combinations(userset, 2):
        # The list of document_ids that each user had completed
        user_a_set = set(hashed_annotations_df[hashed_annotations_df['user'] == user_a].document_id)
        user_b_set = set(hashed_annotations_df[hashed_annotations_df['user'] == user_b].document_id)

        # Only compare documents both users have completed
        pmid_set = user_a_set.intersection(user_b_set)

        # If user_a and user_b have completed shared PMIDs, compute comparisons
        if len(pmid_set) != 0:
            pmid_df = hashed_annotations_df[hashed_annotations_df['document_id'].isin(pmid_set)]
            ref_set = set(pmid_df[pmid_df['user'] == user_a].hash)
            test_set = set(pmid_df[pmid_df['user'] == user_b].hash)

            # Compute the precision, recall and F-measure based on the unique hashes
            inter_annotator_arr.append((
                user_a, user_b, len(pmid_set),
                nltk_scoring.precision(ref_set, test_set),
                nltk_scoring.recall(ref_set, test_set),
                nltk_scoring.f_measure(ref_set, test_set)
            ))

    return pd.DataFrame(inter_annotator_arr,
                        columns=('user_a', 'user_b', 'docs_compared',
                                 'precision', 'recall', 'f-score'))

def scores(classifier, test, ids):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.accuracy(classifier, test)
    print("accuracy: " + str(accuracy))

    # drop undefined (None) scores before averaging over the senses
    p = list(filter(partial(is_not, None), [precision(refsets[sense], testsets[sense]) for sense in ids]))
    p = sum(p) / len(p)
    print("precision: " + str(p))
    r = list(filter(partial(is_not, None), [recall(refsets[sense], testsets[sense]) for sense in ids]))
    r = sum(r) / len(r)
    print("recall: " + str(r))
    f_1 = list(filter(partial(is_not, None), [f_measure(refsets[sense], testsets[sense]) for sense in ids]))
    f_1 = sum(f_1) / len(f_1)
    print("f-1 score: " + str(f_1))

    return {"precision": p, "recall": r, "f_1": f_1, "accuracy": accuracy}

if flip:
    model.class_prior = [1 - categorized_proportion, categorized_proportion]
else:
    model.class_prior = [categorized_proportion, 1 - categorized_proportion]
classifier.train(train_set)

# test classifier
test_results = classifier.classify_many([feat for (feat, label) in test_set])
pos_test_set = set(i for i, result in enumerate(test_results) if result == category)
reference_values = [label for (feat, label) in test_set]
pos_ref_set = set(i for i, (feat, label) in enumerate(test_set) if label == category)

accuracy = scores.accuracy(reference_values, test_results)
accuracies.append(accuracy)
precision = scores.precision(pos_ref_set, pos_test_set)
recall = scores.recall(pos_ref_set, pos_test_set)
f1 = scores.f_measure(pos_ref_set, pos_test_set)
f1_scores.append(f1)

print("%s: accuracy %s, precision %s, recall %s, F1 %s" % (colored(category, "blue"),
                                                           colored(accuracy, "yellow"),
                                                           colored(precision, "yellow"),
                                                           colored(recall, "yellow"),
                                                           colored(f1, "yellow")))
# print(nltk.classify.accuracy(classifier, test_set))
# classifier.show_most_informative_features(5)

# save trained classifier and word features to file
dump_file = open("classifiers/%s.pickle" % category, "wb")
pickle.dump({
    "classifier": classifier,
    "word_features": word_features
}, dump_file)
dump_file.close()

def getRecall(self):
    return recall(self._refsets['POS'], self._testsets['POS'])

# ----------------------------- classifier -------------------------------------------
posTweets = int(math.floor(len(preprocess_pos_tweets) * 3 / 4))
negTweets = int(math.floor(len(preprocess_neg_tweets) * 3 / 4))

trainFeatures = preprocess_pos_tweets[:posTweets] + preprocess_neg_tweets[:negTweets]
testFeatures = preprocess_pos_tweets[posTweets:] + preprocess_neg_tweets[negTweets:]

classifier = NaiveBayesClassifier.train(trainFeatures)

referenceSets = {'positive': set(), 'negative': set()}
testSets = {'positive': set(), 'negative': set()}

for i, (features, label) in enumerate(testFeatures):
    referenceSets[label].add(i)
    predicted = classifier.classify(features)
    testSets[predicted].add(i)

print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
print('pos precision:', precision(referenceSets['positive'], testSets['positive']))
print('pos recall:', recall(referenceSets['positive'], testSets['positive']))
print('neg precision:', precision(referenceSets['negative'], testSets['negative']))
print('neg recall:', recall(referenceSets['negative'], testSets['negative']))
classifier.show_most_informative_features(10)

def test_iteration(i, train_set, test_dict, feature_sets_by_match, classifier_type='decision_tree'):
    """Performs one iteration of the k-fold cross validation, returning a dict containing
    overall micro and macro score averages, in addition to scores for each label.

    Args:
        i: the iteration of the k-fold cross validation.
        train_set: a list containing feature, rating pairs.
        test_dict: a dictionary containing feature and rating information for the test set.
        feature_sets_by_match: feature representations of documents organized by match.
        classifier_type: the type of classifier to use.

    Returns:
        A dict containing overall micro and macro score averages, in addition to scores
        for each label.
    """
    classifier = ''
    if classifier_type == 'decision_tree':
        # classifier = nltk.classify.DecisionTreeClassifier.train(train_set)
        classifier = nltk.classify.scikitlearn.SklearnClassifier(
            tree.DecisionTreeClassifier(random_state=8246)).train(train_set)
    elif classifier_type == 'maxent':
        # classifier = nltk.classify.maxent.MaxentClassifier.train(train_set)
        classifier = nltk.classify.scikitlearn.SklearnClassifier(
            linear_model.LogisticRegression()).train(train_set)
    elif classifier_type == 'svr':
        classifier = nltk.classify.scikitlearn.SklearnClassifier(svm.SVR()).train(train_set)

    pred_sets = initialize_sets(ALL_RATINGS)
    gold_sets = initialize_sets(ALL_RATINGS)
    pred_list = []
    gold_list = []

    # Classify predictions and add them to the relevant dicts and lists.
    for match in test_dict:
        for doc_id in test_dict[match]:
            test_doc = test_dict[match][doc_id]['features']
            pred = classifier.classify(test_doc)
            gold = test_dict[match][doc_id]['gold']
            test_dict[match][doc_id]['pred'] = pred
            gold_list.append(str(gold))
            pred_list.append(str(pred))
            gold_sets[gold].add(doc_id)
            pred_sets[pred].add(doc_id)

    # Calculate pairwise ranking accuracy.
    correct = 0
    total = 0
    for match in test_dict:
        for pl1, pl2 in combinations(test_dict[match].keys(), 2):
            p1 = test_dict[match][pl1]
            p2 = test_dict[match][pl2]
            if p1['gold'] > p2['gold'] and p1['pred'] > p2['pred']:
                correct += 1
            elif p1['gold'] < p2['gold'] and p1['pred'] < p2['pred']:
                correct += 1
            elif p1['gold'] == p2['gold'] and p1['pred'] == p2['pred']:
                correct += 1
            total += 1
    print('Pairwise ranking accuracy: ' + str(correct / total))

    fold_scores = {'micro': '',
                   'macro': '',
                   'by_label': {rating: {'p': 0, 'r': 0, 'f': 0} for rating in ALL_RATINGS}}

    prf_micro = precision_recall_fscore_support(gold_list, pred_list, average='micro')
    print(prf_micro)
    fold_scores['micro'] = prf_micro
    prf_macro = precision_recall_fscore_support(gold_list, pred_list, average='macro')
    print(prf_macro)
    fold_scores['macro'] = prf_macro

    for label in ALL_RATINGS:
        r = scores.recall(gold_sets[label], pred_sets[label])
        p = scores.precision(gold_sets[label], pred_sets[label])
        f = scores.f_measure(gold_sets[label], pred_sets[label])
        if r is None:
            r = 0.0
        if p is None:
            p = 0.0
        if f is None:
            f = 0.0
        fold_scores['by_label'][label]['p'] = p
        fold_scores['by_label'][label]['r'] = r
        fold_scores['by_label'][label]['f'] = f
        f = float(f)
        print('<{}> P: {:.3}, R: {:.3}, F: {:.3}'.format(label, p, r, f))

    return fold_scores

def recall(self, label):
    return scores.recall(self._referenceSets[label],
                         self._testSets[label])