def cross_validation(featureset,k,tamanhos,dataAugmentation):
    soma = 0
    nome_do_dataset = dataAugmentation['nome_do_dataset']
    oversampling = dataAugmentation['oversampling']
    ind = data_augmentation(nome_do_dataset)
    qtdNegativos = 0
    kf = KFold(n_splits=k)
    tam = 10
    fmeasureDesvio = [] 
    fmeasurePonderadoMedia = []
    random.Random().shuffle(featureset)
    if(oversampling):
        featuresetComIndice = featureset
        indicesDoBalanceamento = []
        featureset = []
        for features in featuresetComIndice:
            indicesDoBalanceamento.append(features[2])
            featureset.append(features[:2])
    cont = 0
    for train, test in kf.split(featureset):
        qtdNegativos = tamanhos['qtdNegativos']
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        train_data = np.array(featureset)[train]
        test_data = np.array(featureset)[test]
        if(oversampling):
            train_augmentation = aumenta_dados(indicesDoBalanceamento,ind,dataAugmentation,train_data)
            qtdNegativos += len(train_augmentation)
            train_data = np.concatenate((train_data,train_augmentation),axis=0)
            random.Random().shuffle(train_data)
        classifier = nltk.classify.SklearnClassifier(LinearSVC())
        # classifier = nltk.NaiveBayesClassifier.train(train_data)
        classifier.train(train_data)

        soma += nltk.classify.accuracy(classifier, test_data)
        for i, (feats, label) in enumerate(test_data):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
        fmeasurePositivo = f_measure(refsets['no'], testsets['no'])
        fmeasureNegativo = f_measure(refsets['yes'], testsets['yes'])
        # print("Positivo",fmeasurePositivo,tamanhos['qtdPositivos'])
        # print("Negativo",fmeasureNegativo,qtdNegativos)
        # print("prox")
        if(not fmeasurePositivo or not fmeasureNegativo):
            continue
        fmeasurePonderadoMedia.append(((fmeasurePositivo*tamanhos['qtdPositivos'])+(fmeasureNegativo*qtdNegativos))/(qtdNegativos+tamanhos['qtdPositivos']))
        cont += 1
    # average = soma/10
    # print(average)
        # fmeasurePositivoMedia = 2 * (precisionPositivoMedia*recallPositivoMedia)/(precisionPositivoMedia+recallPositivoMedia)
        # fmeasureNegativoMedia = 2 * (precisionNegativoMedia*recallNegativoMedia)/(precisionNegativoMedia+recallNegativoMedia)
    # print(cont)
    fmeasurePonderado = np.mean(fmeasurePonderadoMedia)
    print(fmeasurePonderado)
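# Most snippets in this collection share the same evaluation pattern: collect the
# indices of each gold label into refsets and the indices of each predicted label
# into testsets, then compare the two index sets per label with nltk's set-based
# metrics. A minimal, self-contained sketch of that pattern (hypothetical data and
# helper name, not taken from any snippet here):
def _refsets_testsets_sketch():
    import collections
    from nltk.metrics.scores import precision, recall, f_measure
    gold = ['pos', 'neg', 'pos', 'neg']
    pred = ['pos', 'pos', 'pos', 'neg']
    refsets, testsets = collections.defaultdict(set), collections.defaultdict(set)
    for i, (g, p) in enumerate(zip(gold, pred)):
        refsets[g].add(i)    # indices grouped by gold label
        testsets[p].add(i)   # indices grouped by predicted label
    return (precision(refsets['pos'], testsets['pos']),   # 2/3
            recall(refsets['pos'], testsets['pos']),      # 1.0
            f_measure(refsets['pos'], testsets['pos']))   # 0.8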
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    global cnt
    cnt += 1
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [
                feature_select(posWords), 'pos'
            ]  #calls make_full_dict and returns a dict with [word,'True']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    #trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    #prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('pos f1-score:', f_measure(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    print('neg f1-score:', f_measure(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
    print('=================================================')
# Example 3
def accuracy_measure(classifier, cross_valid_set):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(cross_valid_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('pos Precision:', precision(refsets[1], testsets[1]))
    print('pos Recall:', recall(refsets[1], testsets[1]))
    print('pos F-measure:', f_measure(refsets[1], testsets[1]))
    print('neg Precision:', precision(refsets[0], testsets[0]))
    print('neg Recall:', recall(refsets[0], testsets[0]))
    print('neg F-measure:', f_measure(refsets[0], testsets[0]))
# Example 5
    def calculate_metrics(self):
        included_logs = 0
        metrics = {}
        cc = SmoothingFunction()
        for identifier in self._values:
            if self._values[identifier].get('target_text', None) is not None:
                included_logs += 1
                target_text = self._values[identifier]['target_text']
                output_text = self._values[identifier]['output_text']
                metrics['BLEU'] = metrics.get('BLEU', 0) + sentence_bleu(
                    [target_text], output_text, smoothing_function=cc.method4)
                metrics['accuracy'] = metrics.get('accuracy', 0) + accuracy(
                    target_text, output_text)
                target_text = set(target_text)
                output_text = set(output_text)
                metrics['precision'] = metrics.get('precision', 0) + precision(
                    target_text, output_text)
                metrics['recall'] = metrics.get('recall', 0) + recall(
                    target_text, output_text)
                metrics['f_measure'] = metrics.get('f_measure', 0) + f_measure(
                    target_text, output_text)

        if included_logs != 0:
            for metric in metrics:
                metrics[metric] /= included_logs

        return metrics, included_logs
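# calculate_metrics above turns each target/output pair into token sets before
# calling precision/recall/f_measure. The per-pair computation in isolation
# (hypothetical token sequences; the helper name is illustrative only):
def _token_set_prf_sketch():
    from nltk.metrics.scores import precision, recall, f_measure
    target = set("the cat sat on the mat".split())
    output = set("the cat sat on a mat".split())
    # precision counts the overlap relative to the output set, recall relative
    # to the target set, and f_measure is their harmonic mean
    return precision(target, output), recall(target, output), f_measure(target, output)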
def me_classifier(exclude_list):
    me_classifier = 0

    with open(train_data, 'r', encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile)
        feature_set = [(feature_set_generator(text, length, label,
                                              exclude_list), label)
                       for text, length, label in reader]
        #print(feature_set)
        me_classifier = MaxentClassifier.train(feature_set, "megam")

    accuracy = 0.0
    with open(test_data, 'r', encoding='utf-8',
              errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        test_feature_set = [(feature_set_generator(text, length, label,
                                                   exclude_list), label)
                            for text, length, label in test_reader]
        accuracy = classify.accuracy(me_classifier, test_feature_set)

    classified = collections.defaultdict(set)
    observed = collections.defaultdict(set)
    i = 1
    with open(test_data, 'r', encoding='utf-8',
              errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        for text, length, label in test_reader:
            observed[label].add(i)
            classified[me_classifier.classify(
                feature_set_generator(text, length, label,
                                      exclude_list))].add(i)
            i += 1

    return accuracy, precision(observed['1'], classified['1']), recall(observed['1'], classified['1']),\
           f_measure(observed['1'], classified['1']), precision(observed['0'], classified['0']), recall(observed['0'], classified['0']), f_measure(observed['0'], classified['0'])
# Example 7
def evaluate(ref_tags, hyp_tags):
    if len(ref_tags) != len(hyp_tags):
        raise ValueError(
            'reference and hypothesis has different number of lines')

    n = len(ref_tags)
    counter = Counter(ref_tags)
    unique_tags = set(ref_tags)
    prec_dict, rec_dict, f_dict = defaultdict(float), defaultdict(
        float), defaultdict(float)
    for tag in sorted(unique_tags):
        ref_ids = {i for i, ref_tag in enumerate(ref_tags) if ref_tag == tag}
        hyp_ids = {i for i, hyp_tag in enumerate(hyp_tags) if hyp_tag == tag}
        prec_dict[tag] = precision(ref_ids, hyp_ids)
        rec_dict[tag] = recall(ref_ids, hyp_ids)
        f_dict[tag] = f_measure(ref_ids, hyp_ids)
        if prec_dict[tag] is None:
            warn(f'Undefined precision for {tag}; converting to 0.0')
            prec_dict[tag] = 0.
        if rec_dict[tag] is None:
            warn(f'Undefined recall for {tag}; converting to 0.0')
            rec_dict[tag] = 0.
        if f_dict[tag] is None:
            warn(f'Undefined F-score for {tag}; converting to 0.0')
            f_dict[tag] = 0.
        prec_dict[OVERALL_KEY] += counter[tag] * prec_dict[tag] / n
        rec_dict[OVERALL_KEY] += counter[tag] * rec_dict[tag] / n
        f_dict[OVERALL_KEY] += counter[tag] * f_dict[tag] / n

    return EvalResult(precision=prec_dict,
                      recall=rec_dict,
                      f1=f_dict,
                      conf_matrix=ConfusionMatrix(ref_tags,
                                                  hyp_tags,
                                                  sort_by_count=True))
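# The OVERALL_KEY entries above accumulate a support-weighted average: each tag's
# score is weighted by its relative frequency in the reference. The same
# computation in isolation (hypothetical per-tag F1 values and counts):
def _support_weighted_overall_sketch():
    per_tag_f1 = {'ORG': 0.8, 'O': 0.95}
    support = {'ORG': 20, 'O': 80}
    n = sum(support.values())
    # (20 * 0.8 + 80 * 0.95) / 100 == 0.92
    return sum(support[tag] * per_tag_f1[tag] / n for tag in per_tag_f1)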
# Example 8
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    # breaks the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' to each list
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)

    # selects 3/4 of the features for training and 1/4 for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    # trains a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # initializes referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    # puts the correctly labeled sentences in referenceSets and the predicted labels in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('pos F1:', f_measure(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    print('neg F1:', f_measure(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
# Example 9
def kset_stat(silvs,golds) :
  s1 = set(map(to_root,golds))
  s2 = set(map(to_root,silvs))
  #print(s1,s2)
  p=precision(s1,s2)
  r=recall(s1,s2)
  f=f_measure(s1,s2)
  if not (p and r and f) : return {'p':0,'r':0,'f':0}
  return {'p':p,'r':r,'f':f}
def test_trained_classifier(classifier, test_samples):
    """Prints precision/recall statistics of a NLTK classifier"""
    import collections

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (sample, label) in enumerate(test_samples):
        refsets[label].add(i)
        observed = classifier.classify(sample)
        testsets[observed].add(i)

    print("pos precision:", scores.precision(refsets["pos"], testsets["pos"]))
    print("pos recall:", scores.recall(refsets["pos"], testsets["pos"]))
    print("pos F-measure:", scores.f_measure(refsets["pos"], testsets["pos"]))
    print("neg precision:", scores.precision(refsets["neg"], testsets["neg"]))
    print("neg recall:", scores.recall(refsets["neg"], testsets["neg"]))
    print("neg F-measure:", scores.f_measure(refsets["neg"], testsets["neg"]))
# Example 11
def testing(sent_classifier):

	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)
	 
	for i, (feats, category) in enumerate(testing_set):
	    refsets[category].add(i)
	    observed = sent_classifier.classify(feats)
	    testsets[observed].add(i)

	print ('Classifier Accuracy: ', (nltk.classify.accuracy(sent_classifier, testing_set))*100, "%")
	print ('Classifier pos Precision:', scores.precision(refsets['pos'], testsets['pos'])*100, "%")
	print ('Classifier pos Recall:', scores.recall(refsets['pos'], testsets['pos'])*100, "%")
	print ('Classifier pos F-measure:', scores.f_measure(refsets['pos'], testsets['pos'])*100, "%")
	print ('Classifier neg Precision:', scores.precision(refsets['neg'], testsets['neg'])*100, "%")
	print ('Classifier neg Recall:', scores.recall(refsets['neg'], testsets['neg'])*100, "%")
	print ('Classifier neg F-measure:', scores.f_measure(refsets['neg'], testsets['neg'])*100, "%")
	print ('\n')
# Example 12
 def showResults(self, classif, clasificador):
     refsets = collections.defaultdict(set)
     testsets = collections.defaultdict(set)
     for i, (feats, label) in enumerate(self.test_data):
         refsets[label].add(i)
         observed = classif.classify(feats)
         testsets[observed].add(i)
     print("F1 Score del clasificador:", clasificador,
           f_measure(refsets['"positive"'], testsets['"positive"']))
def fmeasure(questions_list, batch):
    f_scores = []
    for i, question in enumerate(questions_list):
        original_quest = batch['question_text'][i]
        ref_quest = set(original_quest[:-1].split(' '))
        gen_quest = set(question.split(' '))
        score = f_measure(ref_quest, gen_quest)
        f_scores.append(float(score))

    return f_scores
def classification_result(classifier, test_set):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    reflist = []
    testlist = []
    for i, (tweet, label) in enumerate(test_set):
        refsets[label].add(i)
        reflist.append(label)
        observed = classifier.classify(tweet)
        testsets[observed].add(i)
        testlist.append(observed)
    print(len(refsets['1']))
    print("Accuracy : ", nltk.classify.accuracy(classifier, test_set) * 100)
    print("Precision Pos: ", precision(refsets['1'], testsets['1']) * 100)
    print("Recall Pos: ", recall(refsets['1'], testsets['1']) * 100)
    print("F Measure Pos: ", f_measure(refsets['1'], testsets['1']) * 100)
    print("Precision Neg: ", precision(refsets['0'], testsets['0']) * 100)
    print("Recall Neg: ", recall(refsets['0'], testsets['0']) * 100)
    print("F Measure Neg: ", f_measure(refsets['0'], testsets['0']) * 100)
    print("Confusion Metrics : \n", ConfusionMatrix(reflist, testlist))
# Example 15
 def sentence_fmeasure(self, references, hypothesis):
     fmeasure_scores = []
     hypothesis_set = set(hypothesis)
     for reference in references:
         reference_set = set(reference)
         fmeasure_score = f_measure(reference_set, hypothesis_set)
         # we calculate f_measure(set(each_reference), set(hypothesis)) score
         fmeasure_scores.append(fmeasure_score)
     fmeasure_final_score = max(fmeasure_scores)
     # we calculate f_measure(set(closest_reference), set(hypothesis)) score
     return fmeasure_final_score
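# sentence_fmeasure above keeps the best score over all references. The same idea
# in isolation (hypothetical tokenized references and hypothesis; the helper name
# is illustrative only):
def _best_reference_f_sketch():
    from nltk.metrics.scores import f_measure
    references = [['the', 'cat', 'sat'], ['a', 'cat', 'slept']]
    hypothesis = ['the', 'cat', 'sits']
    # each reference is compared as a set; the closest reference wins
    return max(f_measure(set(ref), set(hypothesis)) for ref in references)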
# Example 16
 def get_results(self, classifier, test_set, target):
     refsets = collections.defaultdict(set)
     testsets = collections.defaultdict(set)
     for i, (feats, label) in enumerate(test_set):
         refsets[label].add(i)
         observed = classifier.classify(feats)
         testsets[observed].add(i)
     target_precision = precision(refsets[target], testsets[target])
     target_recall = recall(refsets[target], testsets[target])
     target_f_measure = f_measure(refsets[target], testsets[target])
     results = (target_precision, target_recall, target_f_measure)
     return (results)
# Example 17
def avgOffEval(inpath1, inpath2):

    print('\n=============================')
    print(
        'NER evaluation (single entity class/mention-level, full/offsets, avg. of abstract-level)'
    )
    print('=============================')
    print('==> gold', inpath1)
    print('==> pred', inpath2)
    print('=============================')
    recs = []
    pres = []
    fscs = []
    for filename1 in glob.glob(inpath1 + "/*ann"):
        filen1 = filename1.split('/')[len(filename1.split('/')) - 1]
        for filename2 in glob.glob(inpath2 + "/*ann"):
            filen2 = filename2.split('/')[len(filename2.split('/')) - 1]
            if filen1 == filen2:
                preds = set([])
                refrs = set([])
                file1 = codecs.open(filename1, 'r', encoding='utf-8')
                file2 = codecs.open(filename2, 'r', encoding='utf-8')
                for line1 in file1.readlines():
                    if len(line1.split('\t')) > 1:
                        men1 = line1.split('\t')[2].strip()
                        off1 = '-'.join([
                            w.strip() for w in line1.split('\t')[1].split(' ')
                        ])
                        gold = men1 + '_' + off1
                        refrs.add(gold)
                for line2 in file2.readlines():
                    if len(line2.split('\t')) > 1:
                        men2 = line2.split('\t')[2].strip()
                        off2 = '-'.join([
                            w.strip() for w in line2.split('\t')[1].split(' ')
                        ])
                        pred = men2 + '_' + off2
                        preds.add(pred)
                if len(preds) > 0 and len(refrs) > 0:
                    rec = scores.recall(refrs, preds)
                    pre = scores.precision(refrs, preds)
                    fsc = scores.f_measure(refrs, preds)
                else:
                    rec = 0
                    pre = 0
                    fsc = 0
                recs.append(rec)
                pres.append(pre)
                fscs.append(fsc)
    print('average \t R={R} \t P={P} \t F1={F}'.format(R=str(np.mean(recs)),
                                                       P=str(np.mean(pres)),
                                                       F=str(np.mean(fscs))))
    print('=============================\n')
# Example 18
def printEval(realSet, testSet):

    precisionPos = precision(realSet['pos'], testSet['pos'])
    precisionNeg = precision(realSet['neg'], testSet['neg'])
    precisionNeutre = precision(realSet['neutre'], testSet['neutre'])


    recallPos = recall(realSet['pos'], testSet['pos'])
    recallNeg = recall(realSet['neg'], testSet['neg'])


    fmesurePos = f_measure(realSet['pos'], testSet['pos'])
    fmesureNeg = f_measure(realSet['neg'], testSet['neg'])


    # print("Precision    Pos: " + precisionPos + " - Neg: " + float(precisionNeg)
    # # print("Recall   Pos: %f - Neg: %f - Neutral: %f" %(recallPos, recallNeg, recallNeutre))
    # # print("F-Mesure Pos: %f - Neg: %f - Neutral: %f" %(fmesurePos, fmesureNeg, fmesureNeutre))

    print("Precision    Pos: %f - Neg: %f " %(float(precisionPos), float(precisionNeg)))
    print("Recall   Pos: %f - Neg: %f " %(float(recallPos), float(recallNeg)))
    print("F-Mesure Pos: %f - Neg: %f " %(float(fmesurePos), float(fmesureNeg)))
def main(command, classifier_type):
    feature_functions = [unigram_freqs]

    corpus_file = open('ratings_corpus.json')
    corpus = json.load(corpus_file)
    corpus_file.close()

    feature_representation = [(extract_features(document, feature_functions), label)
                              for document, label in corpus]

    train_set, test_set = split_data(feature_representation)

    classifier = ''
    if command == 'new':
        if classifier_type == 'decision_tree':
            classifier = nltk.classify.DecisionTreeClassifier.train(train_set)
        elif classifier_type == 'maxent':
            classifier = nltk.classify.maxent.MaxentClassifier.train(train_set)
    elif command == 'load':
        if classifier_type == 'decision_tree':
            classifier_file = open('decisiontree_classifier.pickle', 'rb')
            classifier = pickle.load(classifier_file)
            classifier_file.close()
        elif classifier_type == 'maxent':
            classifier_file = open('maxent_classifier.pickle', 'rb')
            classifier = pickle.load(classifier_file)
            classifier_file.close()

    predictions = []
    golds = []

    for test_doc, rating in test_set:
        predictions.append(classifier.classify(test_doc))
        golds.append(rating)

    pred_sets = initialize_sets(ALL_RATINGS)
    gold_sets = initialize_sets(ALL_RATINGS)

    for doc_id, rating in enumerate(predictions):
        pred_sets[rating].add(doc_id)
    for doc_id, rating in enumerate(golds):
        gold_sets[rating].add(doc_id)

    for label in ALL_RATINGS:
        r = scores.recall(gold_sets[label], pred_sets[label])
        p = scores.precision(gold_sets[label], pred_sets[label])
        f = scores.f_measure(gold_sets[label], pred_sets[label])
        
        if not (r==None or p==None or f==None):
            f = float(f)
            print('<{}> P: {:.2}, R: {:.2}, F: {:.2}'.format(label, p, r, f))
# Example 20
def assign_clusters_to_works(trials):
    dest_file = "results/mca_assignments.csv"
    similarity = wup
    T = 0.76

    cluster_types = ['clusters']
    for (tag_col_prefix, use_only_n_tags) in [('user', 6), ('machine', 25)]:
        for cluster_type in cluster_types:
            print('  *', 'processing type: {} for {}'.format(tag_col_prefix, cluster_type))
            with codecs.open(f'data/{cluster_type}.json', 'rb', 'utf-8') as f_clusters:
                clusters = preprocess_clusters(json.loads(f_clusters.read()))
                tags_to_clusters(
                    clusters,
                    trials,
                    t=T,
                    similarity=similarity,
                    tag_col_prefix=tag_col_prefix,
                    cluster_type=cluster_type,
                    use_only_n_tags=use_only_n_tags)

    #
    # score results
    #
    for work in trials:
        for cluster_type in cluster_types:
            machine = "{}_{}_{}".format('machine', cluster_type, "no_scores")
            human = "{}_{}_{}".format('user', cluster_type, "no_scores")
            work["{}_fmeasure".format(cluster_type)] = f_measure(set(work['user_aggrement']), set(work[machine]))


    df = pd.DataFrame(trials)
    df.drop(columns=['user_tags_synsets', 'machine_tags_synsets'], inplace=True)

    # move some columns to front
    cols = df.columns.tolist()

    for col in ['user_tags',  'machine_tags',  'title',  'description', 'artist_name']:
        cols.insert(0, cols.pop(cols.index(col)))
    df = df.reindex(columns=cols)

    df.to_csv(dest_file, index=False)
    print(' *', 'written file: {}'.format(dest_file))

    for j, cluster_type in enumerate(cluster_types):
        s = df['{}_fmeasure'.format(cluster_type)]
        print(cluster_type,
              'mean f-measure:',
              s.mean(),
              'hit percentage:',
              100 * s.where(s > 0).count() / len(s)
              )
def assess_classifier(classifier, test_set):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    count = 0
    print('Precision = ' + str(precision(refsets['spam'], testsets['spam'])))
    print('Recall = ' + str(recall(refsets['spam'], testsets['spam'])))
    print('F measure = ' +
          str(f_measure(refsets['spam'], testsets['spam'], alpha=0.5)))
    print('FP rate = ' + str(
        abs((len(refsets['ham']) - len(testsets['ham'])) /
            (len(refsets['spam']) + len(refsets['ham'])))))
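# nltk's f_measure accepts an alpha argument (used with alpha=0.5 above): the
# score is 1 / (alpha/precision + (1-alpha)/recall), so alpha=0.5 gives the
# balanced F1 and a larger alpha weights precision more heavily. A quick
# illustration with hypothetical index sets:
def _f_measure_alpha_sketch():
    from nltk.metrics.scores import f_measure
    ref, test = {1, 2, 3, 4}, {3, 4, 5}   # precision 2/3, recall 1/2
    return f_measure(ref, test, alpha=0.5), f_measure(ref, test, alpha=0.9)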
def main():
    brown_tagged_sents = brown.tagged_sents(categories='news')
    size = int(len(brown_tagged_sents) * 0.8)
    train_data = brown_tagged_sents[:size]
    test_data = brown_tagged_sents[size:]

    # store pickle file
    if not (os.path.isfile('UnigramTagger.pkl') and os.path.isfile('Tnt_Tagger.pkl')
            and os.path.isfile('PerceptronTagger.pkl')):
        unigram_tagger = unigram_tag(train_data)
        tnt_tagger = tnt_tag(train_data)
        perc_tagger = perceptron_tag(train_data)

        [store_pickle(each_) for each_ in [unigram_tagger, tnt_tagger, perc_tagger]]

    # load pickle file and get each model file with a tuple
    models_files_tuple = [(each_.split('.')[0], retrieve_pickle(each_)) for each_ in
                    ['UnigramTagger.pkl', 'PerceptronTagger.pkl', 'Tnt_Tagger.pkl']]

    # test the loaded models on test data
    print("TESTING LOADED MODELS")
    for tagg_name, tagg_mode in models_files_tuple:
        print("Loaded {tag_name} evaluation results: {evaluate_res}".format(tag_name=tagg_name,
                                                                            evaluate_res=tagg_mode.evaluate(test_data)))

    # Tabulate and calculate accuracies, choose best one based on F1 value
    reference_sentences_lists = [list(map(lambda pair_: pair_[1], each)) for each in test_data]
    test_sentences_lists = [list(map(lambda pair_: pair_[0], each)) for each in test_data]

    reference_lst = list()
    test_lst = list()
    [reference_lst.extend(each_lst) for each_lst in reference_sentences_lists[:1000]]
    [test_lst.extend(each_lst) for each_lst in test_sentences_lists[:1000]]

    for tagg_name, tagger_mod in models_files_tuple:

        if tagg_name == "Tnt_Tagger":
            reference_lst = reference_lst[:700]
            test_lst = test_lst[:700]
        result_tokens = tagger_mod.tag(test_lst)

        result_tokens__ = list(map(lambda pair: 'UNKNOWN' if pair[1] is None else pair[1], result_tokens))

        print("{} Evaluation Results".format(tagg_name))
        print("Precision: ", precision(set(reference_lst), set(result_tokens__)))
        print("Recall: ", recall(set(reference_lst), set(result_tokens__)))
        print("F measure: ", f_measure(set(reference_lst), set(result_tokens__)))
# Example 23
def compute_evaluation_scores(classifier: ClassifierBase,
                              data_set: List[Tuple[Dict, str]],
                              evaluated_class: LikeTypeEnum) \
        -> Dict[str, float]:
    """Evaluate classifier on dataset with common metrics.

    Namely calculates:
    precision, recall, accuracy, f-measure.

    And adds:
    tp, fp, np, tn (true/false positives/negatives)."""
    clas_scores: dict = {}
    correctly_classified: int = 0

    # metrics
    refsets: DefaultDict[str, set] = defaultdict(set)
    testsets: DefaultDict[str, set] = defaultdict(set)
    for i, (fs, label) in enumerate(data_set):
        refsets[label].add(i)
        classified = classifier.classify(fs)
        testsets[classified].add(i)

        if label == classified:
            correctly_classified += 1

    # we don't know how many and what are the values of negative classes
    # therefore we compute union of all and subtract positive elements
    negative_test: set = reduce(lambda a, b: a.union(b), testsets.values()) \
                         - testsets[evaluated_class.value]
    negative_ref: set = reduce(lambda a, b: a.union(b), refsets.values()) \
                        - refsets[evaluated_class.value]
    positive_test: set = testsets[evaluated_class.value]
    positive_ref: set = refsets[evaluated_class.value]

    clas_scores['tp'] = len(positive_test & positive_ref) / len(data_set)
    clas_scores['fp'] = len(positive_test & negative_ref) / len(data_set)
    clas_scores['tn'] = len(negative_test & negative_ref) / len(data_set)
    clas_scores['fn'] = len(negative_test & positive_ref) / len(data_set)

    clas_scores['precision'] = scores.precision(positive_ref, positive_test)
    clas_scores['recall'] = scores.recall(positive_ref, positive_test)
    clas_scores['f_measure'] = scores.f_measure(positive_ref, positive_test)
    # accuracy is true positives and true negatives over all instances
    clas_scores['accuracy'] = correctly_classified / len(data_set)

    return clas_scores
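# compute_evaluation_scores above derives the "negative" sets by taking the union
# of all predicted/reference sets and subtracting the positive class. That trick
# in isolation (hypothetical label sets; the helper name is illustrative only):
def _negatives_by_union_sketch():
    from functools import reduce
    testsets = {'like': {0, 3}, 'dislike': {1}, 'neutral': {2, 4}}
    positive = testsets['like']
    negative = reduce(lambda a, b: a.union(b), testsets.values()) - positive
    return negative  # {1, 2, 4}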
def run_baseline():
    gold_filter = []
    with open(FEATURE_CSV, "r") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            gold_filter += [int(row["book_id"])]
    clusters = [[], []]
    for id in gold_filter:
        clusters[random.randint(0, len(clusters) - 1)] += [id]
    f_score, recall, precision = score_clusters(clusters, get_gold_standard(gold_filter))
    print "%s,%s,%s" % (f_score, recall, precision)
    f_score, recall, precision = score_clusters([clusters[0] + clusters[1], [2098]], get_gold_standard(gold_filter))
    gold_standard = get_gold_standard(gold_filter)
    print "f-score:", f_measure(
        [set(clusters[0] + clusters[1]), set([])], [set(gold_standard[0]), set(gold_standard[1])]
    )
    print "%s,%s,%s" % (f_score, recall, precision)
# Example 25
def find_scores():
    #Text formatting to classify
    def format_text(text):
        return ({word: True for word in nltk.word_tokenize(text)})

    #Load positive categorized text
    pos = []
    with open("./pos.txt", encoding='ISO-8859-1') as f:
        for i in f:
            pos.append([
                format_text(i.encode("utf-8").decode("unicode-escape")),
                'positive'
            ])
    #Load negative categorized text
    neg = []
    with open("./neg.txt", encoding='ISO-8859-1') as f:
        for i in f:
            neg.append([
                format_text(i.encode("utf-8").decode("unicode-escape")),
                'negative'
            ])
    #Load negative categorized text
    neu = []
    with open("./neu.txt", encoding='ISO-8859-1') as f:
        for i in f:
            neu.append([
                format_text(i.encode("utf-8").decode("unicode-escape")),
                'neutre'
            ])
    #Split data into training(80%) and testing(20%) sets
    training_set = pos[:int((.80) * len(pos))] + neg[:int(
        (.80) * len(neg))] + neu[:int((.80) * len(neu))]
    test_set = pos[int((.80) * len(pos)):] + neg[int(
        (.80) * len(neg)):] + neu[int((.80) * len(neu)):]
    #Training classifier
    classifier = NaiveBayesClassifier.train(training_set)
    #Calculate scores
    trueset = collections.defaultdict(set)
    testset = collections.defaultdict(set)
    #Test all test-set items using defined classifier
    for i, (text, label) in enumerate(test_set):
        trueset[label].add(i)
        result = classifier.classify(text)
        testset[result].add(i)
    # return accuracy followed by F-measure scores
    return accuracy(classifier, test_set), f_measure(
        trueset['positive'], testset['negative']), f_measure(
            testset['negative'], trueset['positive']), f_measure(
                testset['neutre'], trueset['positive']), f_measure(
                    testset['positive'], trueset['neutre']), f_measure(
                        testset['negative'],
                        trueset['neutre']), f_measure(testset['neutre'],
                                                      trueset['negative'])
# Example 26
def precision_recall_F_Measure(classifier, testfeats):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    precisions = {}
    recalls = {}
    f = {}

    for label in classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label])
        recalls[label] = recall(refsets[label], testsets[label])
        f[label] = f_measure(refsets[label], testsets[label])

    return precisions, recalls, f
# Example 27
def macroOffEval(inpath1, inpath2):

    print('\n=============================')
    print(
        'NER evaluation (single entity class/mention-level, full/offsets, corpus-level)'
    )
    print('=============================')
    print('==> gold', inpath1)
    print('==> pred', inpath2)
    print('=============================')
    preds = set([])
    refrs = set([])
    for filename1 in glob.glob(inpath1 + "/*ann"):
        filen1 = filename1.split('/')[len(filename1.split('/')) - 1]
        for filename2 in glob.glob(inpath2 + "/*ann"):
            filen2 = filename2.split('/')[len(filename2.split('/')) - 1]
            if filen1 == filen2:
                file1 = codecs.open(filename1, 'r', encoding='utf-8')
                file2 = codecs.open(filename2, 'r', encoding='utf-8')
                for line1 in file1.readlines():
                    if len(line1.split('\t')) > 1:
                        men1 = line1.split('\t')[2].strip()
                        off1 = '-'.join([
                            w.strip() for w in line1.split('\t')[1].split(' ')
                        ])
                        gold = men1 + '_' + off1
                        refrs.add(gold)
                for line2 in file2.readlines():
                    if len(line2.split('\t')) > 1:
                        men2 = line2.split('\t')[2].strip()
                        off2 = '-'.join([
                            w.strip() for w in line2.split('\t')[1].split(' ')
                        ])
                        pred = men2 + '_' + off2
                        preds.add(pred)
    rec = scores.recall(refrs, preds)
    pre = scores.precision(refrs, preds)
    fsc = scores.f_measure(refrs, preds)
    print('macro \t R={R} \t P={P} \t F1={F}'.format(R=str(rec),
                                                     P=str(pre),
                                                     F=str(fsc)))
    print('=============================\n')
# Example 28
def show_metrics(classifier, test_set):
    description = ""
    # Given a classifier and a set to test it, it will print metrics for the classifier
    description = description + "\n" + "Accuracy: " + str(
        nltk.classify.accuracy(classifier, test_set))

    # Creates two sets: one with references (correct results) and other with tests (classifier predictions)
    # This sets are divided in fact-checkable and non-fact-checkable sets that contain a unique id (integer)
    # for each sentence
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)  # 1, neg
        observed = classifier.classify(feats)  #neg
        testsets[observed].add(i)  #1, neg

    model_precision = int(
        precision(refsets['fact-checkable'], testsets['fact-checkable']) * 100)
    model_recall = int(
        recall(refsets['fact-checkable'], testsets['fact-checkable']) * 100)
    model_f_measure = int(
        f_measure(refsets['fact-checkable'], testsets['fact-checkable'], 0.3) *
        100)

    description += "\n" + "PRECISION: Of the sentences predicted fact-checkable, " + str(
        model_precision) + "% were actually fact-checkable"
    description += "\n" + "RECALL: Of the sentences that were fact-checkable, " + str(
        model_recall) + "% were predicted correctly"
    description += "\n" + "F-MEASURE (balance between precission and recall): " + str(
        model_f_measure) + "%"

    # Same for non fact-checkables
    #print('non-fact-checkable precision:', precision(refsets['non-fact-checkable'], testsets['non-fact-checkable']))
    #print('non-fact-checkable recall:', recall(refsets['non-fact-checkable'], testsets['non-fact-checkable']))
    #print('non-fact-checkable F-measure:', f_measure(refsets['non-fact-checkable'], testsets['non-fact-checkable']))

    print(description)

    # informative
    classifier.show_most_informative_features(25)

    return description
# Example 29
def get_measures(reference, test):
    tp = tn = fp = fn = 0

    for ((_, r), (_, t)) in zip(reference, test):
        if r == t == "O":
            tn += 1
        elif r == t == "ORG":
            tp += 1
        elif r == "O" and t == "ORG":
            fp += 1
        elif r == "ORG" and t == "O":
            fn += 1
    matrix = [tp, tn, fp, fn]
    acc = accuracy(reference, test)
    reference_set = set(reference)
    test_set = set(test)
    pre = precision(reference_set, test_set)
    rec = recall(reference_set, test_set)
    f = f_measure(reference_set, test_set)
    return acc, pre, rec, f, matrix
# Example 31
def get_performance_dataframe(tagger, test_tag_list):
    """Returns DataFrame with metrics for individual tag combinations. For NLTK taggers."""
    truth_sets = defaultdict(set)
    test_sets = defaultdict(set)
    
    for n, (w, label) in enumerate(test_tag_list):
        observed = tagger.tag([w])[0][1]
        truth_sets[label].add(n)
        test_sets[observed].add(n)

    performance_dict = dict()
    for key in test_sets.keys():
        performance_dict.setdefault(
            key,
            {
                'Precision': precision(truth_sets[key], test_sets[key]),
                'Recall': recall(truth_sets[key], test_sets[key]),
                'F1': f_measure(truth_sets[key], test_sets[key])
            }
        )
    df = pd.DataFrame(performance_dict).T
    return df
# Example 32
def compute_pairwise(hashed_er_anns_df):
    """
        Returns pairwise comparision between users (uesr_a & user_b)
        that have completed similar documents
    """
    # Make user_pks unique
    userset = set(hashed_er_anns_df.user_id)

    inter_annotator_arr = []
    # For each unique user comparision, compute
    for user_a, user_b in itertools.combinations(userset, 2):
        # The list of document_pks that each user had completed
        user_a_set = set(hashed_er_anns_df[hashed_er_anns_df['user_id'] ==
                                           user_a].document_pk)
        user_b_set = set(hashed_er_anns_df[hashed_er_anns_df['user_id'] ==
                                           user_b].document_pk)

        # Only compare documents both users have completed
        pmid_set = user_a_set.intersection(user_b_set)

        # If user_a and user_b have completed shared PMID, compute comparisions
        if len(pmid_set) != 0:
            pmid_df = hashed_er_anns_df[hashed_er_anns_df['document_pk'].isin(
                pmid_set)]
            ref_set = set(pmid_df[pmid_df['user_id'] == user_a].hash)
            test_set = set(pmid_df[pmid_df['user_id'] == user_b].hash)

            # Compute the precision, recall and F-measure based on
            # the unique hashes
            inter_annotator_arr.append(
                (user_a, user_b, len(pmid_set),
                 nltk_scoring.precision(ref_set, test_set),
                 nltk_scoring.recall(ref_set, test_set),
                 nltk_scoring.f_measure(ref_set, test_set)))

    return pd.DataFrame(inter_annotator_arr,
                        columns=('user_a', 'user_b', 'docs_compared',
                                 'precision', 'recall', 'f-score'))
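# compute_pairwise above scores agreement between two annotators as the overlap of
# their annotation hashes. The core comparison in isolation (hypothetical hash
# values; the helper name is illustrative only):
def _annotator_overlap_sketch():
    from nltk.metrics import scores as nltk_scoring
    user_a_hashes = {'h1', 'h2', 'h3', 'h4'}
    user_b_hashes = {'h2', 'h3', 'h5'}
    return (nltk_scoring.precision(user_a_hashes, user_b_hashes),   # 2/3
            nltk_scoring.recall(user_a_hashes, user_b_hashes),      # 1/2
            nltk_scoring.f_measure(user_a_hashes, user_b_hashes))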
# Example 33
def scores(classifier, test, ids):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.accuracy(classifier, test)
    print("accuracy: " + str(accuracy))
    p = list(filter(partial(is_not, None),
                    [precision(refsets[sense], testsets[sense]) for sense in ids]))
    p = sum(p) / len(p)
    print("precision: " + str(p))
    r = list(filter(partial(is_not, None),
                    [recall(refsets[sense], testsets[sense]) for sense in ids]))
    r = sum(r) / len(r)
    print("recall: " + str(r))
    f_1 = list(filter(partial(is_not, None),
                      [f_measure(refsets[sense], testsets[sense]) for sense in ids]))
    f_1 = sum(f_1) / len(f_1)
    print("f-1 score: " + str(f_1))

    return ({"precision": p, "recall": r, "f_1": f_1, "accuracy": accuracy})
def compute_pairwise(hashed_annotations_df):
    '''
        Returns pairwise comparision between users (uesr_a & user_b)
        that have completed similar documents
    '''
    # Make user_pks unique
    userset = set(hashed_annotations_df.user)

    inter_annotator_arr = []
    # For each unique user comparision, compute
    for user_a, user_b in itertools.combinations(userset, 2):
        # The list of document_ids that each user had completed
        user_a_set = set(hashed_annotations_df[hashed_annotations_df['user'] == user_a].document_id)
        user_b_set = set(hashed_annotations_df[hashed_annotations_df['user'] == user_b].document_id)

        # Only compare documents both users have completed
        pmid_set = user_a_set.intersection(user_b_set)

        # If user_a and user_b have completed shared PMID, compute comparisions
        if len(pmid_set) != 0:
            pmid_df = hashed_annotations_df[hashed_annotations_df['document_id'].isin(pmid_set)]
            ref_set = set(pmid_df[pmid_df['user'] == user_a].hash)
            test_set = set(pmid_df[pmid_df['user'] == user_b].hash)

            # Compute the precision, recall and F-measure based on
            # the unique hashes
            inter_annotator_arr.append((
                user_a,
                user_b,
                len(pmid_set),
                nltk_scoring.precision(ref_set, test_set),
                nltk_scoring.recall(ref_set, test_set),
                nltk_scoring.f_measure(ref_set, test_set)
            ))

    return pd.DataFrame(inter_annotator_arr, columns=('user_a', 'user_b', 'docs_compared', 'precision', 'recall', 'f-score'))
 def f_measure(self, label, alpha=0.5):
     return scores.f_measure(self._referenceSets[label], \
                             self._testSets[label], alpha)
# Example 36
        model.class_prior = [1-categorized_proportion, categorized_proportion]
    else:
        model.class_prior = [categorized_proportion, 1-categorized_proportion]

    classifier.train(train_set)

    # test classifier
    test_results = classifier.classify_many([feat for (feat, label) in test_set])
    pos_test_set = set(i for i, result in enumerate(test_results) if result == category)
    reference_values = [label for (feat, label) in test_set]
    pos_ref_set = set(i for i, (feat, label) in enumerate(test_set) if label == category)
    accuracy = scores.accuracy(reference_values, test_results)
    accuracies.append(accuracy)
    precision = scores.precision(pos_ref_set, pos_test_set)
    recall = scores.recall(pos_ref_set, pos_test_set)
    f1 = scores.f_measure(pos_ref_set, pos_test_set)
    f1_scores.append(f1)

    print "%s: accuracy %s, precision %s, recall %s, F1 %s" % (colored(category, "blue"), colored(accuracy, "yellow"), colored(precision, "yellow"), colored(recall, "yellow"), colored(f1, "yellow"))
    ## print(nltk.classify.accuracy(classifier, test_set))
    # classifier.show_most_informative_features(5)
    # print ""

    # save trained classifier and word features to file
    dump_file = open("classifiers/%s.pickle" % category, "wb")
    pickle.dump({
        "classifier": classifier,
        "word_features": word_features
    }, dump_file)
    dump_file.close()
# Example 37
 def getFMeasure(self):
     return f_measure(self._refsets['POS'], self._testsets['POS'])
def test_iteration(i, train_set, test_dict, feature_sets_by_match,
                   classifier_type='decision_tree'):
    """Performs one iteration of the k-fold cross validation, returing a dict
    containing overall micro and macro score averages, in addition to scores for
    each label.

    Args:
        i: the iteration of the k-fold cross validation.
        train_set: a list containing feature, rating pairs
        test_dict: a dicitonary containing feature and rating information for
            the test set.
        feature_sets_by_match: feature respresentations of documents organized
            by match.
        classifier_type: the type of classifier to use.
    Returns:
        A dict containing overall micro and macro score averages, in addition
        to scores for each label.
    """
    classifier = ''
    if classifier_type == 'decision_tree':
        #classifier = nltk.classify.DecisionTreeClassifier.train(train_set)
        classifier = nltk.classify.scikitlearn.SklearnClassifier(tree.DecisionTreeClassifier(random_state=8246)).train(train_set)
    elif classifier_type == 'maxent':
        #classifier = nltk.classify.maxent.MaxentClassifier.train(train_set)
        classifier = nltk.classify.scikitlearn.SklearnClassifier(linear_model.LogisticRegression()).train(train_set)
    elif classifier_type == 'svr':
        classifier = nltk.classify.scikitlearn.SklearnClassifier(svm.SVR()).train(train_set)
    
    pred_sets = initialize_sets(ALL_RATINGS)
    gold_sets = initialize_sets(ALL_RATINGS)
    pred_list = []
    gold_list = []

    # Classify predictions and add them to relevant dicts and lists.
    for match in test_dict:
        for doc_id in test_dict[match]:
            test_doc = test_dict[match][doc_id]['features']
            pred = classifier.classify(test_doc)
            gold = test_dict[match][doc_id]['gold']
            test_dict[match][doc_id]['pred'] = pred

            gold_list.append(str(gold))
            pred_list.append(str(pred))
            gold_sets[gold].add(doc_id)
            pred_sets[pred].add(doc_id)

    # Calculate pairwise ranking accuracy
    correct= 0
    total = 0
    for match in test_dict:
        for pl1, pl2 in combinations(test_dict[match].keys(), 2):
            p1 = test_dict[match][pl1]
            p2 = test_dict[match][pl2]
            if p1['gold'] > p2['gold'] and p1['pred'] > p2['pred']:
                correct += 1
            elif p1['gold'] < p2['gold'] and p1['pred'] < p2['pred']:
                correct += 1
            elif p1['gold'] == p2['gold'] and p1['pred'] == p2['pred']:
                correct += 1
            total += 1

    print('Pairwise ranking accuracy: ' + str(correct/total))
    
    fold_scores = {'micro': '',
                   'macro': '',
                   'by_label': {rating: {'p': 0, 'r': 0, 'f': 0}
                                for rating in ALL_RATINGS}
                   }
    prf_micro = precision_recall_fscore_support(gold_list, pred_list, average='micro')
    print(prf_micro)
    fold_scores['micro'] = prf_micro

    prf_macro = precision_recall_fscore_support(gold_list, pred_list, average='macro')
    print(prf_macro)
    fold_scores['macro'] = prf_macro

    for label in ALL_RATINGS:
        r = scores.recall(gold_sets[label], pred_sets[label])
        p = scores.precision(gold_sets[label], pred_sets[label])
        f = scores.f_measure(gold_sets[label], pred_sets[label])
        
        if r == None:
            r = 0.0
        if p == None:
            p = 0.0
        if f == None:
            f = 0.0
            
        fold_scores['by_label'][label]['p'] = p
        fold_scores['by_label'][label]['r'] = r
        fold_scores['by_label'][label]['f'] = f
        f = float(f)
        print('<{}> P: {:.3}, R: {:.3}, F: {:.3}'.format(label, p, r, f))

    return fold_scores