Example #1
0
def calc_agreements(nr_of_abstracts=150):
    # Loop over the abstracts and calculate the kappa and alpha per abstract
    aggregate = []
    for i in range(0, nr_of_abstracts):
        # try:
            annotators = round_robin(i)
            annotations_A = flatten(get_annotations(i, annotators[0]))
            annotations_B = flatten(get_annotations(i, annotators[1]))
            annotations = __str_combine_annotations(annotations_A, annotations_B)
            a = AnnotationTask(annotations, agreement_fn)
            aggregate.append({
                "kappa" : a.kappa(),
                "alpha" : a.alpha(),
                "annotator_A" : annotators[0],
                "annotator_B" : annotators[1] })
        # except:
        #     print("Could not calculate kappa for abstract %i" % (i + 1))
        #     pass

    # Summary statistics
    kappa = describe([a['kappa'] for a in aggregate])
    print("number of abstracts %i" % kappa[0])
    print("[kappa] mean: " + str(kappa[2]))
    print("[kappa] variance: " + str(kappa[3]))
    alpha = describe([a['alpha'] for a in aggregate])
    print("[alpha] mean: " + str(alpha[2]))
    print("[alpha] variance: " + str(alpha[3]))
Example #2
0
    def alpha(self,
              ids=None,
              staff="upper",
              common_id=None,
              lib='nltk',
              label='bigram',
              distance=None):
        if ids is None:
            ids = []
        if staff not in ('upper', 'lower'):
            raise Exception(
                "Alpha measure only applicable one staff at a time.")

        data = self._staff_annotation_data(ids=ids,
                                           staff=staff,
                                           lib=lib,
                                           label=label,
                                           common_id=common_id)
        if distance is None and label == "bigram":
            distance = DScore.bigram_label_distance

        if lib == 'nltk':
            if distance is None:
                distance = binary_distance
            annot_task = AnnotationTask(data=data, distance=distance)
            krip = annot_task.alpha()
        else:
            if distance is None:
                distance = 'nominal'
            krip = alpha(reliability_data=data, level_of_measurement=distance)

        return krip
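A sketch of the two back-ends the method above dispatches between, on a tiny invented data set: nltk's AnnotationTask takes (coder, item, label) triples plus a callable distance, while the krippendorff package takes a coders-by-items reliability matrix plus a level_of_measurement string.

from nltk.metrics.agreement import AnnotationTask
from nltk.metrics.distance import binary_distance
import krippendorff

# The same four judgements expressed in both input formats.
triples = [("c1", "i1", 1), ("c2", "i1", 1),
           ("c1", "i2", 2), ("c2", "i2", 3)]
matrix = [[1, 2],   # coder c1 over items i1, i2
          [1, 3]]   # coder c2 over items i1, i2

print(AnnotationTask(data=triples, distance=binary_distance).alpha())
print(krippendorff.alpha(reliability_data=matrix, level_of_measurement="nominal"))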
Example #3
0
def __main__(argv):
	if len(argv) != 2:
		print("Specify cmd arg")
		sys.exit(2)
	else:
		arg = argv[1]
		if arg == 'img':
			reliability_mat = getReliabilityMatImg("../data/imageGID_job_map_expt2_corrected.csv")
		else:
			reliability_mat = getReliabilityMatTurker()

		
		t = AnnotationTask(data=reliability_mat)

		print("Calculating the agreement scores")
		
		alpha = t.alpha()
		print("Alpha = %f" %alpha)
		
		s = t.S()
		print("S = %f" %s)

		pi = t.pi()
		print("Pi = %f" %pi)

		kappa = t.kappa()
		print("kappa = %f" %kappa)
Example #4
0
def compute_krippendorff(sce_path,
                         output_path='',
                         wo_attention_check=False,
                         bad_annotators_path='',
                         dataset=''):
    """
    Compute Krippendorff's alpha with krippendorff library
    (https://github.com/pln-fing-udelar/fast-krippendorff/blob/master/sample.py)
    :param sce_path: csv file with columns UID, ANSWER, ANNOTATOR
    :param output_path: path of the output file where the results will be printed (if empty string, the results are
    printed to the standard output)
    :param wo_attention_check: if True, remove the attention checks when computing alpha
    :param bad_annotators_path: path of the pkl file containing, for each threshold, the list of 'bad' annotators.
    For each threshold, the annotations of the listed annotators are removed when computing alpha. If empty string, no
    annotator's annotations are removed.
    :param dataset: alphanumeric characters identifying the corpus for which to compute alpha (if empty string, alpha
    is computed with annotations from all corpora and from the attention checks)
    """

    if output_path:
        sys.stdout = open(output_path, "w")

    rows = read_csv(sce_path, dataset=dataset)

    bad_annotators_per_th = get_bad_annotators(bad_annotators_path)
    for th, bad_annotators in bad_annotators_per_th.items():
        print(f'--- Threshold {th} ---')
        annotations = get_annotations_per_annotators(
            rows,
            wo_attention_check=wo_attention_check,
            wo_annotator=bad_annotators)

        print('- After filtering: -')
        print_annotation_statistics(annotations)

        ratings_per_annotator = get_annotator_tab(annotations)

        data = [[np.nan if not r else int(r) for r in ratings]
                for ratings in ratings_per_annotator]

        print(
            "Krippendorff's alpha for nominal metric: ",
            krippendorff.alpha(reliability_data=data,
                               level_of_measurement='nominal'))
        print("Krippendorff's alpha for interval metric: ",
              krippendorff.alpha(reliability_data=data))
        print(
            "Krippendorff's alpha for ordinal metric: ",
            krippendorff.alpha(reliability_data=data,
                               level_of_measurement='ordinal'))

        # with nltk library
        task_data = annotations2task_data(annotations)
        rating_task = AnnotationTask(data=task_data, distance=ordinal)
        print("Krippendorff's alpha for ordinal metric (nltk): ",
              rating_task.alpha())
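A hypothetical call of compute_krippendorff; the csv/pkl paths and corpus name below are placeholders, not files from the repository:

compute_krippendorff('annotations.csv',
                     output_path='alpha_report.txt',
                     wo_attention_check=True,
                     bad_annotators_path='bad_annotators.pkl',
                     dataset='corpus1')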
Example #5
0
    def run_closed_class_jaccard_and_masi(cls, df: pd.DataFrame) -> Dict:
        iaa_by_column = {
            column: {
                "df": extract_iaa_df_by_column_name(df, column)
            }
            for column in cls.CLOSED_CLASS_COLUMNS
        }

        for column in iaa_by_column:
            task = AnnotationTask(distance=jaccard_distance)
            task.load_array(
                extract_records_for_nltk(iaa_by_column[column]['df']))
            iaa_by_column[column]['alpha_jaccard'] = task.alpha()

            task = AnnotationTask(distance=masi_distance)
            task.load_array(
                extract_records_for_nltk(iaa_by_column[column]['df']))
            iaa_by_column[column]['alpha_masi'] = task.alpha()
        return iaa_by_column
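For reference, a quick illustration of the two set-valued label distances used above (the label sets are invented):

from nltk.metrics.distance import jaccard_distance, masi_distance

a, b = frozenset({"PER", "LOC"}), frozenset({"PER"})
print(jaccard_distance(a, b))  # 0.5 = 1 - |A ∩ B| / |A ∪ B|
print(masi_distance(a, b))     # ~0.665: MASI treats partial (subset) overlap as more distant than Jaccard does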
Example #6
0
def nltk_with_kippendorff_data():
    # needs data to be shaped in triples: (coder,item,label)

    input_eval_dp = "../data/krippendorff-evaluation-dataset.csv"

    eval_df = pd.read_table(input_eval_dp, delimiter=',', index_col=0)
    print("\ninput data:\n", eval_df.head())

    # reshape rcsi data
    eval_nltk_df = pd.DataFrame()
    for index, row in eval_df.iterrows():
        eval_nltk_df = eval_nltk_df.append(
            {
                'coder': 'obs_1',
                'item': index,
                'label': row['obs1']
            },
            ignore_index=True)
        eval_nltk_df = eval_nltk_df.append(
            {
                'coder': 'obs_2',
                'item': index,
                'label': row['obs2']
            },
            ignore_index=True)
        eval_nltk_df = eval_nltk_df.append(
            {
                'coder': 'obs_3',
                'item': index,
                'label': row['obs3']
            },
            ignore_index=True)
        eval_nltk_df = eval_nltk_df.append(
            {
                'coder': 'obs_4',
                'item': index,
                'label': row['obs4']
            },
            ignore_index=True)
        eval_nltk_df = eval_nltk_df.append(
            {
                'coder': 'obs_5',
                'item': index,
                'label': row['obs5']
            },
            ignore_index=True)
    print("\nreshaped data:\n\n", eval_nltk_df.head())
    print(eval_nltk_df.tail())

    annotation_triples = eval_nltk_df.values.tolist()
    # print(annotation_triples)

    t = AnnotationTask(annotation_triples)
    print("\nKrippendorff alpha as per NLTK:\t", t.alpha(),
          "\n===========================================\n")
Example #7
0
 def test_easy(self):
     '''
     Simple test, based on
     https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf.
     '''
     data = [('coder1', 'dress1', 'YES'),
             ('coder2', 'dress1', 'NO'),
             ('coder3', 'dress1', 'NO'),
             ('coder1', 'dress2', 'YES'),
             ('coder2', 'dress2', 'NO'),
             ('coder3', 'dress3', 'NO'),
             ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
Example #8
0
 def test_easy2(self):
     '''
     Same simple test with 1 rating removed.
     Removal of that rating should not matter: K-Alpha ignores items with
     only 1 rating.
     '''
     data = [('coder1', 'dress1', 'YES'),
             ('coder2', 'dress1', 'NO'),
             ('coder3', 'dress1', 'NO'),
             ('coder1', 'dress2', 'YES'),
             ('coder2', 'dress2', 'NO'),
             ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
Example #9
0
 def test_advanced2(self):
     '''
     Same more advanced example, but with 1 rating removed.
     Again, removal of that 1 rating should not matter.
     '''
     data = [
         ('A', '1', '1'),
         ('B', '1', '1'),
         ('D', '1', '1'),
         ('A', '2', '2'),
         ('B', '2', '2'),
         ('C', '2', '3'),
         ('D', '2', '2'),
         ('A', '3', '3'),
         ('B', '3', '3'),
         ('C', '3', '3'),
         ('D', '3', '3'),
         ('A', '4', '3'),
         ('B', '4', '3'),
         ('C', '4', '3'),
         ('D', '4', '3'),
         ('A', '5', '2'),
         ('B', '5', '2'),
         ('C', '5', '2'),
         ('D', '5', '2'),
         ('A', '6', '1'),
         ('B', '6', '2'),
         ('C', '6', '3'),
         ('D', '6', '4'),
         ('A', '7', '4'),
         ('B', '7', '4'),
         ('C', '7', '4'),
         ('D', '7', '4'),
         ('A', '8', '1'),
         ('B', '8', '1'),
         ('C', '8', '2'),
         ('D', '8', '1'),
         ('A', '9', '2'),
         ('B', '9', '2'),
         ('C', '9', '2'),
         ('D', '9', '2'),
         ('B', '10', '5'),
         ('C', '10', '5'),
         ('D', '10', '5'),
         ('C', '11', '1'),
         ('D', '11', '1'),
         ('C', '12', '3'),
     ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
Example #10
0
 def test_easy(self):
     '''
     Simple test, based on
     https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf.
     '''
     data = [('coder1', 'dress1', 'YES'),
             ('coder2', 'dress1', 'NO'),
             ('coder3', 'dress1', 'NO'),
             ('coder1', 'dress2', 'YES'),
             ('coder2', 'dress2', 'NO'),
             ('coder3', 'dress3', 'NO'),
             ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
Example #11
0
 def test_advanced(self):
     '''
     More advanced test, based on 
     http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf
     '''
     data = [
         ('A', '1', '1'),
         ('B', '1', '1'),
         ('D', '1', '1'),
         ('A', '2', '2'),
         ('B', '2', '2'),
         ('C', '2', '3'),
         ('D', '2', '2'),
         ('A', '3', '3'),
         ('B', '3', '3'),
         ('C', '3', '3'),
         ('D', '3', '3'),
         ('A', '4', '3'),
         ('B', '4', '3'),
         ('C', '4', '3'),
         ('D', '4', '3'),
         ('A', '5', '2'),
         ('B', '5', '2'),
         ('C', '5', '2'),
         ('D', '5', '2'),
         ('A', '6', '1'),
         ('B', '6', '2'),
         ('C', '6', '3'),
         ('D', '6', '4'),
         ('A', '7', '4'),
         ('B', '7', '4'),
         ('C', '7', '4'),
         ('D', '7', '4'),
         ('A', '8', '1'),
         ('B', '8', '1'),
         ('C', '8', '2'),
         ('D', '8', '1'),
         ('A', '9', '2'),
         ('B', '9', '2'),
         ('C', '9', '2'),
         ('D', '9', '2'),
         ('B', '10', '5'),
         ('C', '10', '5'),
         ('D', '10', '5'),
         ('C', '11', '1'),
         ('D', '11', '1'),
         ('C', '12', '3'),
     ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
Example #12
0
 def test_easy2(self):
     '''
     Same simple test with 1 rating removed.
     Removal of that rating should not matter: K-Alpha ignores items with
     only 1 rating.
     '''
     data = [('coder1', 'dress1', 'YES'),
             ('coder2', 'dress1', 'NO'),
             ('coder3', 'dress1', 'NO'),
             ('coder1', 'dress2', 'YES'),
             ('coder2', 'dress2', 'NO'),
             ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
Example #13
0
 def test_advanced2(self):
     """
     Same more advanced example, but with 1 rating removed.
     Again, removal of that 1 rating should not matter.
     """
     data = [
         ("A", "1", "1"),
         ("B", "1", "1"),
         ("D", "1", "1"),
         ("A", "2", "2"),
         ("B", "2", "2"),
         ("C", "2", "3"),
         ("D", "2", "2"),
         ("A", "3", "3"),
         ("B", "3", "3"),
         ("C", "3", "3"),
         ("D", "3", "3"),
         ("A", "4", "3"),
         ("B", "4", "3"),
         ("C", "4", "3"),
         ("D", "4", "3"),
         ("A", "5", "2"),
         ("B", "5", "2"),
         ("C", "5", "2"),
         ("D", "5", "2"),
         ("A", "6", "1"),
         ("B", "6", "2"),
         ("C", "6", "3"),
         ("D", "6", "4"),
         ("A", "7", "4"),
         ("B", "7", "4"),
         ("C", "7", "4"),
         ("D", "7", "4"),
         ("A", "8", "1"),
         ("B", "8", "1"),
         ("C", "8", "2"),
         ("D", "8", "1"),
         ("A", "9", "2"),
         ("B", "9", "2"),
         ("C", "9", "2"),
         ("D", "9", "2"),
         ("B", "10", "5"),
         ("C", "10", "5"),
         ("D", "10", "5"),
         ("C", "11", "1"),
         ("D", "11", "1"),
         ("C", "12", "3"),
     ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
Example #14
0
 def test_advanced(self):
     """
     More advanced test, based on
     http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf
     """
     data = [
         ("A", "1", "1"),
         ("B", "1", "1"),
         ("D", "1", "1"),
         ("A", "2", "2"),
         ("B", "2", "2"),
         ("C", "2", "3"),
         ("D", "2", "2"),
         ("A", "3", "3"),
         ("B", "3", "3"),
         ("C", "3", "3"),
         ("D", "3", "3"),
         ("A", "4", "3"),
         ("B", "4", "3"),
         ("C", "4", "3"),
         ("D", "4", "3"),
         ("A", "5", "2"),
         ("B", "5", "2"),
         ("C", "5", "2"),
         ("D", "5", "2"),
         ("A", "6", "1"),
         ("B", "6", "2"),
         ("C", "6", "3"),
         ("D", "6", "4"),
         ("A", "7", "4"),
         ("B", "7", "4"),
         ("C", "7", "4"),
         ("D", "7", "4"),
         ("A", "8", "1"),
         ("B", "8", "1"),
         ("C", "8", "2"),
         ("D", "8", "1"),
         ("A", "9", "2"),
         ("B", "9", "2"),
         ("C", "9", "2"),
         ("D", "9", "2"),
         ("B", "10", "5"),
         ("C", "10", "5"),
         ("D", "10", "5"),
         ("C", "11", "1"),
         ("D", "11", "1"),
         ("C", "12", "3"),
     ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
Example #15
0
 def test_advanced(self):
     '''
     More advanced test, based on 
     http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf
     '''
     data = [('A', '1', '1'),
             ('B', '1', '1'),
             ('D', '1', '1'),
             ('A', '2', '2'),
             ('B', '2', '2'),
             ('C', '2', '3'),
             ('D', '2', '2'),
             ('A', '3', '3'),
             ('B', '3', '3'),
             ('C', '3', '3'),
             ('D', '3', '3'),
             ('A', '4', '3'),
             ('B', '4', '3'),
             ('C', '4', '3'),
             ('D', '4', '3'),
             ('A', '5', '2'),
             ('B', '5', '2'),
             ('C', '5', '2'),
             ('D', '5', '2'),
             ('A', '6', '1'),
             ('B', '6', '2'),
             ('C', '6', '3'),
             ('D', '6', '4'),
             ('A', '7', '4'),
             ('B', '7', '4'),
             ('C', '7', '4'),
             ('D', '7', '4'),
             ('A', '8', '1'),
             ('B', '8', '1'),
             ('C', '8', '2'),
             ('D', '8', '1'),
             ('A', '9', '2'),
             ('B', '9', '2'),
             ('C', '9', '2'),
             ('D', '9', '2'),
             ('B', '10', '5'),
             ('C', '10', '5'),
             ('D', '10', '5'),
             ('C', '11', '1'),
             ('D', '11', '1'),
             ('C', '12', '3'),
             ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
Example #16
0
 def test_advanced2(self):
     '''
     Same more advanced example, but with 1 rating removed.
     Again, removal of that 1 rating should not matter.
     '''
     data = [('A', '1', '1'),
             ('B', '1', '1'),
             ('D', '1', '1'),
             ('A', '2', '2'),
             ('B', '2', '2'),
             ('C', '2', '3'),
             ('D', '2', '2'),
             ('A', '3', '3'),
             ('B', '3', '3'),
             ('C', '3', '3'),
             ('D', '3', '3'),
             ('A', '4', '3'),
             ('B', '4', '3'),
             ('C', '4', '3'),
             ('D', '4', '3'),
             ('A', '5', '2'),
             ('B', '5', '2'),
             ('C', '5', '2'),
             ('D', '5', '2'),
             ('A', '6', '1'),
             ('B', '6', '2'),
             ('C', '6', '3'),
             ('D', '6', '4'),
             ('A', '7', '4'),
             ('B', '7', '4'),
             ('C', '7', '4'),
             ('D', '7', '4'),
             ('A', '8', '1'),
             ('B', '8', '1'),
             ('C', '8', '2'),
             ('D', '8', '1'),
             ('A', '9', '2'),
             ('B', '9', '2'),
             ('C', '9', '2'),
             ('D', '9', '2'),
             ('B', '10', '5'),
             ('C', '10', '5'),
             ('D', '10', '5'),
             ('C', '11', '1'),
             ('D', '11', '1'),
             ('C', '12', '3'),
             ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
Example #17
0
 def test_easy2(self):
     """
     Same simple test with 1 rating removed.
     Removal of that rating should not matter: K-Alpha ignores items with
     only 1 rating.
     """
     data = [
         ("coder1", "dress1", "YES"),
         ("coder2", "dress1", "NO"),
         ("coder3", "dress1", "NO"),
         ("coder1", "dress2", "YES"),
         ("coder2", "dress2", "NO"),
     ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
Example #18
0
 def test_easy(self):
     """
     Simple test, based on
     https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf.
     """
     data = [
         ("coder1", "dress1", "YES"),
         ("coder2", "dress1", "NO"),
         ("coder3", "dress1", "NO"),
         ("coder1", "dress2", "YES"),
         ("coder2", "dress2", "NO"),
         ("coder3", "dress3", "NO"),
     ]
     annotation_task = AnnotationTask(data)
     self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
Example #19
0
def agree_tags(delta, column):
    """
    computes agreement for single-token labelling tasks
    :param delta:  the compared annotation data
    :param column:  the column for which agreement should be computed
    :return:
    """
    by_field = reverse_tags(delta, column)

    task = AnnotationTask(data=by_field)

    oa = task.avg_Ao()      # observed agreement
    s = task.S()            # Bennett, Albert and Goldstein S (1954) all categories are equally likely
    pi = task.pi()          # Scott pi (1955) single distribution
    kappa = task.kappa()    # Cohen kappa (1960) individual coder distribution
    w_kappa = task.weighted_kappa()
    alpha = task.alpha()    # Krippendorff alpha (1980)

    return oa, s, pi, kappa, w_kappa, alpha
Example #20
0
def compute_annotator_agreement_nltkmetrics(data_array):
    ''' See http://nltk.org/api/nltk.metrics.html#nltk.metrics.agreement '''
    
    print "####### Agreement coefficients according to NLTK metrics.agreement #######"
    
    t = AnnotationTask(data=data_array)
    print "Average observed agreement across all coders and items: "+str(t.avg_Ao())
    
    print "Cohen's Kappa (Cohen 1960): "+str(t.kappa())
    print "Weighted kappa (Cohen 1968): "+str(t.weighted_kappa())
    
    print "Scott's pi (Scott 1955): "+str(t.pi())
    #print "pi_avg: "+str(t.pi_avg())
    
    print "alpha (Krippendorff 1980): "+str(t.alpha())
    
    print "Observed disagreement for the alpha coefficient: "+str(t.Do_alpha())
    print "S (Bennett, Albert and Goldstein 1954): "+str(t.S())
    #print "n-notation used in Artstein and Poesio (2007): "+str(t.N(k=, ic???))
    print "Observed disagreement for the weighted kappa coefficient averaged over all labelers: "+str(t.Do_Kw())
Example #21
0
def getagreement(tpl,datadir,task_type='all'):
    """Get agreement values for annotators in the :data:'tpl' list

    Args:
       tpl (list):  combination group of annotators
       datadir (str): Cache data directory used by joblib

    Returns:
       namedtuple defined as ``Agree = collections.namedtuple('Agree', ['kappa', 'alpha','avg_ao'], verbose=True)``
    """

    mem = Memory(cachedir=datadir)
    readjson=mem.cache(json2taskdata.readjson,mmap_mode='r')
    create_task_data= mem.cache(json2taskdata.create_task_data)
    count_occurrances=mem.cache(json2taskdata.count_occurrances)
    count_labels=mem.cache(json2taskdata.count_labels)

    annotators=set()
    lectask=[]
    #-------------------------------------------------------------------------------
    # for each annotator in group tpl
    #-------------------------------------------------------------------------------

    for stditem in tpl:
        aname=stditem.split('.')[0][3:][-2:]
        annotators.add(aname)
        lecdict=readjson(stditem)
        newlectask= create_task_data(lecdict,task_type=task_type,annotator=aname)
        label_data=json2taskdata.create_labels_list(newlectask)
        abscount=count_occurrances(str(label_data))
        yaml.dump(abscount,open(os.path.join( datadir,'abscount-'+aname+'.yaml'),'w'))

        setcount=count_labels(newlectask)
        yaml.dump(setcount,open(os.path.join( datadir,'setcount-'+aname+'.yaml'),'w'))

        lectask=lectask+newlectask

    task=AnnotationTask(data=lectask,distance=nltk.metrics.distance.masi_distance_mod)

    return  {frozenset(annotators): Agree(task.kappa(),task.alpha(),task.avg_Ao())}
Example #22
0
    data = []

    sentiment_r1_5_scale = []
    sentiment_r2_5_scale = []

    for r1, r2 in zip(sentences_r1, sentences_r2):

        sentiment_r1_5_scale.append(int(r1[5]))
        data.append((6, r1[0], r1[5]))

        sentiment_r2_5_scale.append(int(r2[5]))
        data.append((7, r2[0], r2[5]))

        if (r1[0] != r2[0]):
            print r1[0]

except Exception, e:
    print e

# disconnect from server
db.close()
print i

print skll.kappa(sentiment_r1_5_scale, sentiment_r2_5_scale)

annotation = AnnotationTask(data=data)

print annotation.kappa()
print annotation.alpha()
Example #23
0
                                    if (anonym_annot,edge2,dummy_label) not in iaa_data: # to avoid duplicates
                                        iaa_data.append((anonym_annot, single_edge2, dummy_label))
                                    
                        else:
                            # Disagreements on edge (and consequently also on label)
                            if edge2 not in summed_results[annotator][text]:
                                if (anonym_annot,edge2,dummy_label) not in iaa_data: 	 # to avoid duplicates
                                    iaa_data.append((anonym_annot, edge2, dummy_label))
                       
    return iaa_data

#text = "text3"
#annotators = ["beata", "elena", "julia"] # "text3"

text = "text6" 
annotators = ["beata", "julia","mats"] # "text6"

dummy_label = frozenset(["CORR"])
flexible = False
add_missing = False		# True = V1, False = V2
iaa_data = create_iaa_data(summed_results, text, annotators, dummy_label, flexible, add_missing)

#print iaa_data[:3]

task = AnnotationTask(data=iaa_data,distance=jaccard_distance)

print "**** Inter-annotator agreement for", text, "****"
print "Avg agreement:\t\t\t\t", round(task.avg_Ao(),3)    		# Average observed agreement across all coders and items.
print "Fleiss (multi_kappa):\t\t", round(task.multi_kappa(),3)  # (Davies and Fleiss 1982)
print "Krippendorff's alpha:\t\t", round(task.alpha(),3) 		# (Krippendorff 1980)
Example #24
0
def agreement_analysis(
    crowd_df,
    anno,
):
    assert crowd_df.groupby(
        ['method_i', 'method_j', 'topic', 'criterion',
         'annotator']).count().max().max() == 1
    crowd_df['i greater j?'] = crowd_df['i greater j?'].astype('int32')
    crowd_df.set_index(
        ['method_i', 'method_j', 'topic', 'criterion', 'annotator'])

    # build comparisonId. topic is already ordered. we have to unify H1_H2 == H2_H1
    def methods_ordered(x, all_methods_ordered):
        mi, mj = x['method_i'], x['method_j']
        if all_methods_ordered.index(mi) < all_methods_ordered.index(mj):
            return mi + '_' + mj
        else:
            return mj + '_' + mi

    crowd_df['comparisonId'] = crowd_df.apply(methods_ordered,
                                              args=[all_methods_ordered],
                                              axis=1)
    crowd_df['comparisonId'] = crowd_df['comparisonId'] + '_' \
                                      + crowd_df['topic'].map(str) + '_' \
                                      + crowd_df['criterion']
    # crowd_df['i greater j?'] = crowd_df['i greater j?'].apply(lambda x: bool(random.getrandbits(1)))
    # grouping by comparisonId, not aggregating [method_i, method_j, topic, criterion], they are the same in each group
    weighted_voting_df = crowd_df.groupby(['comparisonId']).agg(
        votes=pd.NamedAgg(column='i greater j?', aggfunc='sum'),
        total_votes=pd.NamedAgg(column='i greater j?', aggfunc='count'),
        # comparisonId=pd.NamedAgg(column='comparisonId', aggfunc='first'), #no aggregation
        method_i=pd.NamedAgg(column='method_i', aggfunc='first'),
        method_j=pd.NamedAgg(column='method_j', aggfunc='first'),
        topic=pd.NamedAgg(column='topic', aggfunc='first'),
        criterion=pd.NamedAgg(column='criterion', aggfunc='first'),
    )
    weighted_voting_df = weighted_voting_df[weighted_voting_df.total_votes > 1]

    def percentage_agreement(row):
        if row['total_votes'] == 0:
            return np.NaN
        return row['votes'] / row['total_votes']

    perc_df = weighted_voting_df.apply(
        percentage_agreement, axis=1).rename('percentage_agreement').dropna()
    weighted_voting_df = weighted_voting_df.join(perc_df)
    weighted_voting_df = weighted_voting_df.reset_index()

    def won_vote(x):
        ag = x['percentage_agreement']
        if ag > 0.5:
            return True
        elif ag == 0.5:
            # for purposes of the criteria agreement analysis, it's a 0.5 either way, so we just assign a random winner
            return bool(random.getrandbits(1))
        else:
            return False

    weighted_voting_df['left_won_vote?'] = weighted_voting_df.apply(won_vote,
                                                                    axis=1)
    comparisonId_df = crowd_df.copy(deep=True)
    method_i = weighted_voting_df.drop(columns='method_j').rename(
        columns={'method_i': 'method'})
    method_j = weighted_voting_df.drop(columns='method_i').rename(
        columns={'method_j': 'method'})
    method_j['left_won_vote?'] = ~method_j['left_won_vote?']
    method_j['percentage_agreement'] = method_j['percentage_agreement'].apply(
        lambda x: 1. - x)
    method_j['votes'] = method_j[['votes', 'total_votes'
                                  ]].apply(lambda x:
                                           (x['total_votes'] - x['votes']),
                                           axis=1)
    weighted_voting_df = pd.concat([method_i, method_j])
    weighted_voting_df.reset_index(drop=True)
    weighted_voting_df = weighted_voting_df.sort_values(['comparisonId'])
    assert 0.49 < weighted_voting_df['percentage_agreement'].mean() < 0.51

    #simple plots
    # by criterion (only take the winners, or the mean would be 0.5)
    critmeans = weighted_voting_df[weighted_voting_df[
        'left_won_vote?']].groupby('criterion')['percentage_agreement'].mean()
    if anno == 'likertanno':
        critmeans.reindex(likertanno_criteria, axis=0)
        ax = critmeans.plot(
            y='criterion',
            x='percentage_agreement',
            kind='barh',
        )
        ax.set_xlim([0.6, 1.0])
    if anno == 'pairanno':
        critmeans.reindex(pairanno_criteria, axis=0)
        ax = critmeans.plot(
            y='criterion',
            x='percentage_agreement',
            kind='barh',
        )
        # ax.set_xlim([0.6, 1.0])
    ax.yaxis.label.set_visible(False)
    plt.tight_layout()
    plt.savefig('../figures/{}_percentage_agreement_criteria.pdf'.format(anno),
                bbox_inches='tight',
                pad_inches=0)
    plt.show()
    #by methods
    sysmeans = weighted_voting_df.groupby(
        'method')['percentage_agreement'].mean()
    ax = sysmeans.plot(kind='bar')
    ax.set_ylabel('mean percentage_agreement')
    ax.xaxis.label.set_visible(False)

    plt.tight_layout()
    plt.savefig(
        '../figures/{}_percentage_agreement_by_method.pdf'.format(anno),
        bbox_inches='tight',
        pad_inches=0)
    plt.show()

    ####################### Krippendorff alpha ##########
    # filter out single comparisons (only one annotator votes for a given comparison)
    filtered_comparisons = comparisonId_df.groupby('comparisonId').filter(
        lambda x: len(x) > 1)
    three_cols = ['annotator', 'comparisonId', 'i greater j?']
    task = AnnotationTask(data=filtered_comparisons[three_cols].values)

    krippendorf = [('Whole Dataset', task.alpha())]
    criteria = {
        'likertanno': likertanno_criteria,
        'pairanno': pairanno_criteria
    }
    #by criteria:
    for criterion in criteria[anno][::-1]:  # [::-1] reverses the list
        task = AnnotationTask(data=filtered_comparisons[
            filtered_comparisons.criterion == criterion][three_cols].values)
        # print('{} Krippendorf alpha for {}: \t{}'.format(criterion, anno, task.alpha()))
        krippendorf.append((criterion, task.alpha()))
    krippendorf = pd.DataFrame(data=krippendorf,
                               columns=['criterion', 'krippendorf alpha'])
    ax = krippendorf.plot(kind='barh')
    ax.set_yticklabels(krippendorf.criterion)
    ax.set_xlabel('Krippendorf alpha')
    ax.get_legend().remove()
    plt.tight_layout()
    plt.savefig('../figures/{}_krippendorf_agreement.pdf'.format(anno),
                bbox_inches='tight',
                pad_inches=0)
    plt.show()
    return weighted_voting_df, crowd_df
Example #25
0
def krippendorf_alpha(annotations):
    t = AnnotationTask(annotations)  # distance=binary_distance per default
    return t.alpha()  # Krippendorff's alpha
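A minimal, invented call of the helper above; with the default binary_distance this amounts to nominal-level alpha:

sample = [("c1", "item1", "yes"), ("c2", "item1", "yes"),
          ("c1", "item2", "yes"), ("c2", "item2", "no")]
print(krippendorf_alpha(sample))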
Example #26
0
from nltk.metrics.agreement import AnnotationTask
from krippendorff import alpha
import numpy as np

# The following example is borrowed, via the krippendorff module's doctest, from
# https://www.statisticshowto.datasciencecentral.com/wp-content/uploads/2016/07/fulltext.pdf, page 8.
reliability_data = [[1, 2, 3, 3, 2, 1, 4, 1, 2, np.nan, np.nan, np.nan],
                    [1, 2, 3, 3, 2, 2, 4, 1, 2, 5, np.nan, 3],
                    [np.nan, 3, 3, 3, 2, 3, 4, 2, 2, 5, 1, np.nan],
                    [1, 2, 3, 3, 2, 4, 4, 1, 2, 5, 1, np.nan]]
print(round(alpha(reliability_data, level_of_measurement='nominal'), 3))
# 0.743

annotation_data = []
coder_id = 0
for r in reliability_data:
    item_id = 0
    for v in r:
        if not np.isnan(v):
            record = [coder_id, item_id, v]
            annotation_data.append(record)
        item_id += 1
    coder_id += 1
print(annotation_data)

annot_task = AnnotationTask(data=annotation_data)
print(round(annot_task.alpha(), 3))
# 0.705
Example #27
0
        writer = csv.writer(f)
        writer.writerows(rows)




#rows = read_csv('../format/030820_first_public_noatt.csv')
#rows = read_csv('../format/attention_check.csv')

def ordinal(a, b):
    if a > b:
        a, b = b, a
    return (sum([i for i in range(a, b+1)]) - ((a+b)/2))**2
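# Quick sanity check of the ordinal distance above (easy to verify by hand):
#   ordinal(2, 2) -> (2 - 2.0)**2 = 0.0    identical labels
#   ordinal(1, 2) -> (3 - 1.5)**2 = 2.25
#   ordinal(1, 3) -> (6 - 2.0)**2 = 16.0   larger rank gap, larger penalty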


ths = ['0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '1.0']

pts= ['2pt', '3pt', '4pt']

results = {}
to_csv = [['SCALE', 'TH', 'MAJORITY']]
for pt in pts:
    for th in ths:
        rows = read_csv(f'{pt}_{th}.csv')
        nrows = [(row['ANNOTATOR'], row['UID'], int(row['ANSWER'])) for row in rows]
        rating_task = AnnotationTask(data=nrows, distance=ordinal)
        to_csv.append([pt, th, rating_task.alpha()])


write_csv('alphas.csv', to_csv)
Example #28
0
def status_view(request, task_id=None):
    """
    Renders the evaluation tasks status page for staff users.
    """
    LOGGER.info('Rendering evaluation task overview for user "{0}".'.format(
        request.user.username))

    # Check if user is member in WMT13 group.  If so, redirect to wmt13 app.
    if request.user.groups.filter(name="WMT13").exists():
        LOGGER.info('Redirecting user "{0}" to WMT13 overview.'.format(
            request.user.username))
        return redirect('appraise.wmt13.views.overview')

    if task_id:
        task = get_object_or_404(EvaluationTask, task_id=task_id)

        headers = task.get_status_header()
        status = []

        for user in task.users.all():
            status.append((user.username, task.get_status_for_user(user)))

        scores = None
        result_data = []
        raw_result_data = Counter()
        users = list(task.users.all())

        for item in EvaluationItem.objects.filter(task=task):
            results = []
            for user in users:
                qset = EvaluationResult.objects.filter(user=user, item=item)
                if qset.exists():
                    category = str(qset[0].results)
                    results.append((user.id, item.id, category))
                    raw_result_data[qset[0].raw_result] += 1

            if len(results) == len(users):
                result_data.extend(results)

        _raw_results = []
        _keys = raw_result_data.keys()
        _total_results = float(sum(raw_result_data.values()))
        for key in sorted(_keys):
            value = raw_result_data[key]
            _raw_results.append((key, value, 100 * value / _total_results))

        try:
            # Computing inter-annotator agreement only makes sense for more
            # than one coder -- otherwise, we only display result_data...
            if len(users) > 1:
                # Check if we can safely use NLTK's AnnotationTask class.
                try:
                    from nltk.metrics.agreement import AnnotationTask
                    chk = AnnotationTask(data=[('b', '1', 'k'), ('a', '1',
                                                                 'k')])
                    assert (chk == 1.0)

                except AssertionError:
                    LOGGER.debug('Fixing outdated version of AnnotationTask.')
                    from appraise.utils import AnnotationTask

                # We have to sort annotation data to prevent StopIteration errors.
                result_data.sort()
                annotation_task = AnnotationTask(result_data)

                scores = (annotation_task.alpha(), annotation_task.kappa(),
                          annotation_task.S(), annotation_task.pi())

        except ZeroDivisionError:
            scores = None

        except ImportError:
            scores = None

        dictionary = {
            'combined': task.get_status_for_users(),
            'commit_tag': COMMIT_TAG,
            'headers': headers,
            'scores': scores,
            'raw_results': _raw_results,
            'status': status,
            'task_id': task.task_id,
            'task_name': task.task_name,
            'title': 'Evaluation Task Status',
        }

        return render(request, 'evaluation/status_task.html', dictionary)

    else:
        evaluation_tasks = {}
        for task_type_id, task_type in APPRAISE_TASK_TYPE_CHOICES:
            # We collect a list of task descriptions for this task_type.
            evaluation_tasks[task_type] = []

            # Super users see all EvaluationTask items, even non-active ones.
            if request.user.is_superuser:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id)

            else:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id,
                                                       active=True)

            # Loop over the QuerySet and compute task description data.
            for _task in _tasks:
                if not APPRAISE_TASK_CACHE.has_key(_task.task_id):
                    APPRAISE_TASK_CACHE[_task.task_id] = {}

                _cache = APPRAISE_TASK_CACHE[_task.task_id]
                if not _cache.has_key(request.user.username):
                    _update_task_cache(_task, request.user)

                _task_data = _cache[request.user.username]

                # Append new task description to current task_type list.
                evaluation_tasks[task_type].append(_task_data)

            # If there are no task descriptions for this task_type, we skip it.
            if len(evaluation_tasks[task_type]) == 0:
                evaluation_tasks.pop(task_type)

        dictionary = {
            'active_page': "STATUS",
            'commit_tag': COMMIT_TAG,
            'evaluation_tasks': evaluation_tasks,
            'title': 'Evaluation Task Status',
        }

        return render(request, 'evaluation/status.html', dictionary)
Example #29
0
def compute_overall_scores(coder_df, document_column, outcome_column,
                           coder_column):
    """
    Computes overall inter-rater reliability scores (Krippendorff's Alpha and Fleiss' Kappa). Allows for more than two \
    coders and code values. The input data must consist of a :py:class:`pandas.DataFrame` with the following columns:

        - A column with values that indicate the coder (like a name)
        - A column with values that indicate the document (like an ID)
        - A column with values that indicate the code value

    :param coder_df: A :py:class:`pandas.DataFrame` of codes
    :type coder_df: :py:class:`pandas.DataFrame`
    :param document_column: The column that contains IDs for the documents
    :type document_column: str
    :param outcome_column: The column that contains the codes
    :type outcome_column: str
    :param coder_column: The column containing values that indicate which coder assigned the code
    :type coder_column: str
    :return: A dictionary containing the scores
    :rtype: dict

    Usage::

        from pewanalytics.stats.irr import compute_overall_scores
        import pandas as pd

        df = pd.DataFrame([
            {"coder": "coder1", "document": 1, "code": "2"},
            {"coder": "coder2", "document": 1, "code": "2"},
            {"coder": "coder1", "document": 2, "code": "1"},
            {"coder": "coder2", "document": 2, "code": "2"},
            {"coder": "coder1", "document": 3, "code": "0"},
            {"coder": "coder2", "document": 3, "code": "0"},
        ])

        >>> compute_overall_scores(df, "document", "code", "coder")
        {'alpha': 0.5454545454545454, 'fleiss_kappa': 0.4545454545454544}

    """

    alpha = AnnotationTask(
        data=coder_df[[coder_column, document_column, outcome_column]].values)
    try:
        alpha = alpha.alpha()
    except (ZeroDivisionError, ValueError):
        alpha = None

    grouped = coder_df.groupby(document_column).count()
    complete_docs = grouped[grouped[coder_column] == len(
        coder_df[coder_column].unique())].index
    dataset = coder_df[coder_df[document_column].isin(complete_docs)]
    df = dataset.groupby([outcome_column,
                          document_column]).count()[[coder_column]]
    df = df.unstack(outcome_column).fillna(0)

    if len(df) > 0:
        kappa = fleiss_kappa(df)
    else:
        kappa = None

    return {"alpha": alpha, "fleiss_kappa": kappa}
Example #30
0
def compute_scores(
    coder_df,
    coder1,
    coder2,
    outcome_column,
    document_column,
    coder_column,
    weight_column=None,
    pos_label=None,
):
    """
    Computes a variety of inter-rater reliability scores, including Cohen's kappa, Krippendorff's alpha, precision,
    and recall. The input data must consist of a :py:class:`pandas.DataFrame` with the following columns:

        - A column with values that indicate the coder (like a name)
        - A column with values that indicate the document (like an ID)
        - A column with values that indicate the code value
        - (Optional) A column with document weights

    This function will return a :py:class:`pandas.DataFrame` with agreement scores between the two specified coders.

    :param coder_df: A :py:class:`pandas.DataFrame` of codes
    :type coder_df: :py:class:`pandas.DataFrame`
    :param coder1: The value in ``coder_column`` for rows corresponding to the first coder
    :type coder1: str or int
    :param coder2: The value in ``coder_column`` for rows corresponding to the second coder
    :type coder2: str or int
    :param outcome_column: The column that contains the codes
    :type outcome_column: str
    :param document_column: The column that contains IDs for the documents
    :type document_column: str
    :param coder_column: The column containing values that indicate which coder assigned the code
    :type coder_column: str
    :param weight_column: The column that contains sampling weights
    :type weight_column: str
    :param pos_label: The value indicating a positive label (optional)
    :type pos_label: str or int
    :return: A dictionary of scores
    :rtype: dict

    .. note:: If using a multi-class (non-binary) code, some scores may come back null or not compute as expected. \
        We recommend running the function separately for each specific code value as a binary flag by providing \
        each unique value to the ``pos_label`` argument. If ``pos_label`` is not provided for multi-class codes, \
        this function will attempt to compute scores based on support-weighted averages.

    Usage::

        from pewanalytics.stats.irr import compute_scores
        import pandas as pd

        df = pd.DataFrame([
            {"coder": "coder1", "document": 1, "code": "2"},
            {"coder": "coder2", "document": 1, "code": "2"},
            {"coder": "coder1", "document": 2, "code": "1"},
            {"coder": "coder2", "document": 2, "code": "2"},
            {"coder": "coder1", "document": 3, "code": "0"},
            {"coder": "coder2", "document": 3, "code": "0"},
        ])

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': None,
         'coder1_mean_unweighted': 1.0,
         'coder1_std_unweighted': 0.5773502691896257,
         'coder2_mean_unweighted': 1.3333333333333333,
         'coder2_std_unweighted': 0.6666666666666666,
         'alpha_unweighted': 0.5454545454545454,
         'accuracy': 0.6666666666666666,
         'f1': 0.5555555555555555,
         'precision': 0.5,
         'recall': 0.6666666666666666,
         'precision_recall_min': 0.5,
         'matthews_corrcoef': 0.6123724356957946,
         'roc_auc': None,
         'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="0")
         {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '0',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.3333333333333333,
         'coder2_std_unweighted': 0.3333333333333333,
         'alpha_unweighted': 1.0,
         'cohens_kappa': 1.0,
         'accuracy': 1.0,
         'f1': 1.0,
         'precision': 1.0,
         'recall': 1.0,
         'precision_recall_min': 1.0,
         'matthews_corrcoef': 1.0,
         'roc_auc': 1.0,
         'pct_agree_unweighted': 1.0}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="1")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '1',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.0,
         'coder2_std_unweighted': 0.0,
         'alpha_unweighted': 0.0,
         'cohens_kappa': 0.0,
         'accuracy': 0.6666666666666666,
         'f1': 0.0,
         'precision': 0.0,
         'recall': 0.0,
         'precision_recall_min': 0.0,
         'matthews_corrcoef': 1.0,
         'roc_auc': None,
         'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="2")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '2',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.6666666666666666,
         'coder2_std_unweighted': 0.3333333333333333,
         'alpha_unweighted': 0.4444444444444444,
         'cohens_kappa': 0.3999999999999999,
         'accuracy': 0.6666666666666666,
         'f1': 0.6666666666666666,
         'precision': 0.5,
         'recall': 1.0,
         'precision_recall_min': 0.5,
         'matthews_corrcoef': 0.5,
         'roc_auc': 0.75,
         'pct_agree_unweighted': 0.6666666666666666}


    """

    old_np_settings = np.seterr(all="raise")

    coder_df = copy.deepcopy(coder_df)
    if pos_label:
        coder_df[outcome_column] = (
            coder_df[outcome_column] == pos_label).astype(int)
    coder1_df = coder_df[coder_df[coder_column] == coder1]
    coder1_df.index = coder1_df[document_column]
    coder2_df = coder_df[coder_df[coder_column] == coder2]
    coder2_df.index = coder2_df[document_column]
    coder1_df = coder1_df[coder1_df.index.isin(coder2_df.index)]
    coder2_df = coder2_df[coder2_df.index.isin(
        coder1_df.index)].loc[coder1_df.index]

    row = {
        "coder1": coder1,
        "coder2": coder2,
        "n": len(coder1_df),
        "outcome_column": outcome_column,
        "pos_label": pos_label,
    }

    for labelsetname, labelset in [
        ("coder1", coder1_df[outcome_column]),
        ("coder2", coder2_df[outcome_column]),
    ]:

        if weight_column:
            try:
                weighted_stats = DescrStatsW(labelset,
                                             weights=coder1_df[weight_column])
                if weighted_stats:
                    row["{}_mean".format(labelsetname)] = weighted_stats.mean
                    row["{}_std".format(
                        labelsetname)] = weighted_stats.std_mean
            except (TypeError, ValueError):
                try:
                    weighted_stats = DescrStatsW(
                        labelset.astype(int), weights=coder1_df[weight_column])
                    if weighted_stats:
                        row["{}_mean".format(
                            labelsetname)] = weighted_stats.mean
                        row["{}_std".format(
                            labelsetname)] = weighted_stats.std_mean
                except (TypeError, ValueError):
                    pass

        try:
            unweighted_stats = DescrStatsW(labelset,
                                           weights=[1.0 for x in labelset])
            if unweighted_stats:
                row["{}_mean_unweighted".format(
                    labelsetname)] = unweighted_stats.mean
                row["{}_std_unweighted".format(
                    labelsetname)] = unweighted_stats.std_mean
        except (TypeError, ValueError):
            try:
                unweighted_stats = DescrStatsW(labelset.astype(int),
                                               weights=[1.0 for x in labelset])
                if unweighted_stats:
                    row["{}_mean_unweighted".format(
                        labelsetname)] = unweighted_stats.mean
                    row["{}_std_unweighted".format(
                        labelsetname)] = unweighted_stats.std_mean
            except (TypeError, ValueError):
                pass

    alpha = AnnotationTask(
        data=coder_df[[coder_column, document_column, outcome_column]].values)
    try:
        alpha = alpha.alpha()
    except (ZeroDivisionError, ValueError):
        alpha = None
    row["alpha_unweighted"] = alpha

    labels = np.unique(coder_df[outcome_column])
    if len(labels) <= 2:

        try:
            row["cohens_kappa"] = cohen_kappa_score(
                coder1_df[outcome_column],
                coder2_df[outcome_column],
                sample_weight=coder1_df[weight_column]
                if weight_column else None,
                labels=labels,
            )
        except FloatingPointError:
            row["cohens_kappa"] = 1.0

    try:
        row["accuracy"] = accuracy_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
        )
    except ValueError:
        row["accuracy"] = None

    try:
        row["f1"] = f1_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["f1"] = None

    try:
        row["precision"] = precision_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["precision"] = None

    try:
        row["recall"] = recall_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["recall"] = None

    if is_not_null(row["precision"]) and is_not_null(row["recall"]):
        row["precision_recall_min"] = min([row["precision"], row["recall"]])
    else:
        row["precision_recall_min"] = None

    try:
        row["matthews_corrcoef"] = matthews_corrcoef(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
        )
    except ValueError:
        row["matthews_corrcoef"] = None
    except FloatingPointError:
        row["matthews_corrcoef"] = 1.0

    try:

        row["roc_auc"] = (roc_auc_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else None,
        ) if len(np.unique(coder1_df[outcome_column])) > 1
                          and len(np.unique(coder2_df[outcome_column])) > 1
                          else None)
    except TypeError:
        try:
            row["roc_auc"] = (roc_auc_score(
                coder1_df[outcome_column],
                coder2_df[outcome_column],
                sample_weight=coder1_df[weight_column]
                if weight_column else None,
                average="weighted" if not pos_label else None,
            ) if len(np.unique(coder1_df[outcome_column])) > 1
                              and len(np.unique(coder2_df[outcome_column])) > 1
                              else None)
        except (ValueError, TypeError):
            row["roc_auc"] = None
    except (ValueError, TypeError):
        row["roc_auc"] = None

    row["pct_agree_unweighted"] = np.average([
        1 if c[0] == c[1] else 0
        for c in zip(coder1_df[outcome_column], coder2_df[outcome_column])
    ])

    for k, v in row.items():
        if type(v) == tuple:
            row[k] = v[0]
            # For some weird reason, some of the sklearn scorers return 1-tuples sometimes

    np.seterr(**old_np_settings)

    return row
Example #31
0
def status_view(request, task_id=None):
    """
    Renders the evaluation tasks status page for staff users.
    """
    LOGGER.info('Rendering evaluation task overview for user "{0}".'.format(
      request.user.username))
    
    # Check if user is member in WMT13 group.  If so, redirect to wmt13 app.
    if request.user.groups.filter(name="WMT13").exists():
        LOGGER.info('Redirecting user "{0}" to WMT13 overview.'.format(
          request.user.username))
        return redirect('appraise.wmt13.views.overview')
    
    if task_id:
        task = get_object_or_404(EvaluationTask, task_id=task_id)
        
        headers = task.get_status_header()
        status = []
        
        for user in task.users.all():
            status.append((user.username, task.get_status_for_user(user)))
        
        scores = None
        result_data = []
        raw_result_data = Counter()
        users = list(task.users.all())
        
        for item in EvaluationItem.objects.filter(task=task):
            results = []
            for user in users:
                qset = EvaluationResult.objects.filter(user=user, item=item)
                if qset.exists():
                    category = str(qset[0].results)
                    results.append((user.id, item.id, category))
                    raw_result_data[qset[0].raw_result] += 1
            
            if len(results) == len(users):
                result_data.extend(results)

        # todo for gisting, calculate - somehow - the percentage of answers against the number of different answers ->
        # in that same gap, and also regroup them for readability
        _raw_results = []
        _keys = raw_result_data.keys()
        _total_results = float(sum(raw_result_data.values()))
        for key in sorted(_keys):
            value = raw_result_data[key]
            _raw_results.append((key, value, 100 * value / _total_results))
        
        try:
            # Computing inter-annotator agreement only makes sense for more
            # than one coder -- otherwise, we only display result_data...
            if len(users) > 1:
                # Check if we can safely use NLTK's AnnotationTask class.
                try:
                    from nltk.metrics.agreement import AnnotationTask
                    chk = AnnotationTask(data=[('b', '1', 'k'),
                      ('a', '1', 'k')])
                    assert(chk == 1.0)
                
                except AssertionError:
                    LOGGER.debug('Fixing outdated version of AnnotationTask.')
                    from appraise.utils import AnnotationTask

                # We have to sort annotation data to prevent StopIteration errors.
                result_data.sort()
                annotation_task = AnnotationTask(result_data)
                
                scores = (
                  annotation_task.alpha(),
                  annotation_task.kappa(),
                  annotation_task.S(),
                  annotation_task.pi()
                )
        
        except ZeroDivisionError:
            scores = None
        
        except ImportError:
            scores = None
        
        dictionary = {
          'combined': task.get_status_for_users(),
          'commit_tag': COMMIT_TAG,
          'headers': headers,
          'scores': scores,
          'raw_results': _raw_results,
          'status': status,
          'task_id': task.task_id,
          'task_name': task.task_name,
          'title': 'Evaluation Task Status',
        }

        return render(request, 'evaluation/status_task.html', dictionary)
    
    else:
        evaluation_tasks = {}
        for task_type_id, task_type in APPRAISE_TASK_TYPE_CHOICES:
            # We collect a list of task descriptions for this task_type.
            evaluation_tasks[task_type] = []
        
            # Super users see all EvaluationTask items, even non-active ones.
            if request.user.is_superuser:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id)
        
            else:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id,
                  active=True)
        
            # Loop over the QuerySet and compute task description data.
            for _task in _tasks:
                if _task.task_id not in APPRAISE_TASK_CACHE:
                    APPRAISE_TASK_CACHE[_task.task_id] = {}
                
                _cache = APPRAISE_TASK_CACHE[_task.task_id]
                if request.user.username not in _cache:
                    _update_task_cache(_task, request.user)
                
                _task_data = _cache[request.user.username]
                
                # Append new task description to current task_type list.
                evaluation_tasks[task_type].append(_task_data)
            
            # If there are no tasks descriptions for this task_type, we skip it.
            if len(evaluation_tasks[task_type]) == 0:
                evaluation_tasks.pop(task_type)

        dictionary = {
          'active_page': "STATUS",
          'commit_tag': COMMIT_TAG,
          'evaluation_tasks': evaluation_tasks,
          'title': 'Evaluation Task Status',
        }

        return render(request, 'evaluation/status.html', dictionary)
        continue
    ata = align_annot_task(task)
    ata.sort(key=itemgetter(1))
    t = AnnotationTask(ata)
    same = 0
    diff = 0
    for key in set(a[1] for a in ata):
        r1, r2 = [a for a in ata if a[1] == key]
        if r1[2] == r2[2]:
            same += 1
        else:
            diff += 1
    print('- - - {} - - -'.format(label))
    print('Agreement on: {}/{}'.format(same, same + diff))
    print('Average observed agreement: {}'.format(t.avg_Ao()))
    print('Krippendorff\'s alpha: {}'.format(t.alpha()))

if len(set([t[0] for t in task])) == 2:
    # number of raters = 2
    type_arr1 = []
    type_arr2 = []
    att = align_annot_task(annot_task_type)
    att.sort(key=itemgetter(1))
    for key in set(a[1] for a in att):
        r1, r2 = [a for a in att if a[1] == key]
        type_arr1.append(r1[2])
        type_arr2.append(r2[2])
    cm = ConfusionMatrix(type_arr1, type_arr2)

    types = ['claim', 'ne', 'example', 'other']
    print()
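    # Sketch (assumption): the original fragment is cut off here; one plausible
    # continuation prints the confusion matrix over the four argument types and
    # a per-type agreement count for the two raters.
    print(cm.pretty_format(sort_by_count=True))
    for arg_type in types:
        agreed = sum(1 for a, b in zip(type_arr1, type_arr2)
                     if a == arg_type and b == arg_type)
        print('{}: both raters chose this type {} times'.format(arg_type, agreed))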
    def dispatch(self, request, *args, **kwargs):
        self.task = get_object_or_404(Task, pk=self.kwargs['pk'])
        self.array = []
        self.kappa = []
        self.kappa1 = []
        self.kappa_name = "/media/csvfileFinal.csv"
        self.eval_name = "/media/csvfileP.csv"
        self.kappa_nameLong = "/media/csvfileFinal.csv"
        self.lblr = []
        self.head = []
        self.coder_emails = PostResponse.objects.filter(
            task=self.task.pk).values_list(
                'responder__email',
                flat=True).distinct().order_by('responder__email')
        post_list = self.task.post_list.all()
        if os.path.exists(
                '/home/salae001/new/LabelingSystem-master/labelingsystem/media/csvfileFinal.csv'
        ):
            print('existsssss')
            os.remove(
                '/home/salae001/new/LabelingSystem-master/labelingsystem/media/csvfileFinal.csv'
            )
        if os.path.exists(
                '/home/salae001/new/LabelingSystem-master/labelingsystem/media/csvfileP.csv'
        ):
            os.remove(
                '/home/salae001/new/LabelingSystem-master/labelingsystem/media/csvfileP.csv'
            )
        if os.path.exists(
                '/home/salae001/new/LabelingSystem-master/labelingsystem/media/csvfile.csv'
        ):
            os.remove(
                '/home/salae001/new/LabelingSystem-master/labelingsystem/media/csvfile.csv'
            )
        name = "media/csvfileP.csv"  # + str(self.task.pk)
        self.eval_name = "/" + name

        #                    print (name)
        #dateVal =datetime.datetime.now()
        filepp = open(str(name), "w+")
        filepp.write(',')
        for coder_email in self.coder_emails:
            filepp.write(coder_email)
            filepp.write(',')
        filepp.write('Majority Vote')
        filepp.write('\n')
        voteList = {}
        listTemp = []
        cpr = 0
        for post in post_list:
            row = []
            cpr = cpr + 1
            #if cpr > 6:
            #	row.append('...')
            #	break
            row.append(post.content)
            filepp.write(post.content)
            filepp.write(',')
            i = 0
            if len(self.coder_emails) > 5:
                self.coder_emails_temp = self.coder_emails[0:5]
                temp_emails = self.coder_emails
                self.coder_emails_temp.append("(List continues...)")
                #for coder_email in temp_emails:
                #	filepp.write(coder_email)
                #	filepp.write(';')
                #filepp.write('\n')
                voteList = {}
                listTemp = []
                for coder_email in temp_emails:
                    #                                	if len(self.coder_emails) > 5  and coder_email == "(List continues...)":
                    #                                        	label = '...'
                    #	                                else :
                    #					print ('/....N?A////')
                    label = 'N/A'
                    try:
                        post_response = PostResponse.objects.filter(
                            task=self.task.pk,
                            post=post.pk,
                            responder__email=coder_email).last()
                        label = post_response.label
                        #print('label...',label)
                        #filepp.write(coder_email)
                        #		filepp.write(';')
                        filepp.write(str(label))
                        #						myMap = {}
                        listTemp.append(str(label))
                        #						maximum = ( '', 0 ) # (occurring element, occurrences)
                        #						for n in :
                        #							if n in voteList:voteList[n] += 1
                        #							else: voteList[n] = 1
                        #						        # Keep track of maximum on the go
                        #						        if voteList[n] > maximum[1]: maximum = (n,voteList[n])
                        filepp.write(',')
                    except:
                        filepp.write('N/A')
                        listTemp.append('N/A')
                        filepp.write(',')
                        pass
                    #if len(self.coder_emails) > 5:
                    #        label = '...'

            #		row.append(label)
            #	filepp
                maximum = ('', 0)  # (occurring element, occurrences)
                for n in listTemp:
                    if n in voteList:
                        voteList[n] += 1
                    else:
                        voteList[n] = 1
# Keep track of maximum on the go
                    if voteList[n] > maximum[1]:
                        maximum = (n, voteList[n])
            #	filepp.write(';')
            #print('maximum', maximum)
            #	filepp.write(maximum[0])
            #	filepp.write('\n')

            else:
                self.coder_emails_temp = self.coder_emails
                voteList = {}
                listTemp = []
            i = 0
            for coder_email in self.coder_emails_temp:
                #i = i+1

                #if i>6: #self.coder_emails) > 5 and coder_email == "(List continues...)":
                #   break
                if len(self.coder_emails
                       ) > 5 and coder_email == "(List continues...)":
                    # print ('coder email-----------')
                    label = '...'
                    #continue
                    #try:
                    # post_response = PostResponse.objects.filter(task=self.task.pk, post=post.pk, responder__email=coder_email).last()
                    #print (post_response)
                    #label = post_response.label
                    #filepp.write(str(label))
                    #filepp.write(';')
                #  listTemp.append(str(label))
                #except:
                # filepp.write('N/A')#listTemp.append('N/A')
                #filepp.write(';')
                # listTemp.append(str(label))
                #pass

                else:
                    label = 'N/A'
                    try:
                        post_response = PostResponse.objects.filter(
                            task=self.task.pk,
                            post=post.pk,
                            responder__email=coder_email).last()
                        print(post_response)
                        label = post_response.label
                        if len(self.coder_emails) <= 5:
                            filepp.write(str(label))
                            filepp.write(',')
                        listTemp.append(str(label))
                    except:
                        if len(self.coder_emails) <= 5:
                            filepp.write('N/A')  #listTemp.append('N/A')
                            filepp.write(',')
                        listTemp.append(str(label))
                        pass
                row.append(label)
            maximum = ('', 0)
            for n in listTemp:
                if n in voteList:
                    voteList[n] += 1
                else:
                    voteList[n] = 1
                    # Keep track of maximum on the go
                if voteList[n] > maximum[1]:
                    maximum = (n, voteList[n])
            #filepp.write(';')
            filepp.write(maximum[0])
            filepp.write('\n')
            maximum = ('', 0)  # (occurring element, occurrences)
            for n in listTemp:
                if n in voteList:
                    voteList[n] += 1
                else:
                    voteList[n] = 1
                    # Keep track of maximum on the go
                if voteList[n] > maximum[1]:
                    maximum = (n, voteList[n])
                #filepp.write(';')
            print('maximum', maximum)
            #filepp.write(maximum[0])
            #filepp.write('\n')

            row.append(maximum[0])
            self.array.append(row)

        try:
            annotation_triplet_list = []
            post_response_list = PostResponse.objects.filter(task=self.task.pk)
            #rint (post_response_list)
            post_response_t = [
                part.encode("utf8") for part in PostResponse.objects.filter(
                    task=self.task.pk).values_list('responder__email',
                                                   flat=True).distinct()
            ]
            lst_rp = []
            triple_list = []
            ctr = 0
            #			for post_response in post_response_t:
            #				post_response = str(post_response).replace('b\'', '')
            #				post_response = post_response.replace('\'', '')
            #				lst_rp.append(post_response)
            #				print (post_response)
            #triple_list.append([])
            #triple_list[ctr].append(post_response)
            #triple_list[ctr].append(post_response)
            #triple_list[ctr].append('0')
            #ctr = ctr + 1
            #print (triple_list)
            # Get all combinations of [1, 2, 3]
            # and length 2
            #	print (post_response)
            #post_response = post_response.replace('b', '\'')
            #	print(post_response)
            #print ('here')
            #print("post_reposne", post_response_t)
            if len(post_response_t) > 5:
                post_response_t_temp = post_response_t[0:5]
                for post_response in post_response_t_temp:
                    post_response = str(post_response).replace('b\'', '')
                    post_response = post_response.replace('\'', '')
                    lst_rp.append(post_response)
                lst_rp.append("(List continues...)")

                comb_temp = combinations(post_response_t, 2)
                for i in list(comb_temp):
                    #print ("in the comb")
                    annotation_triplet_list = []
                    ip = []
                    sp = ""
                    #       ct = ctr + 1
                    #print (ctr)
                    temp = str(i[0]).replace('b\'', '')
                    temp = temp.replace('\'', '')
                    if ([temp, temp, '0'] not in triple_list):
                        triple_list.append([])
                        triple_list[ctr].append(temp)
                        triple_list[ctr].append(temp)
                        triple_list[ctr].append('0')
                        ctr = ctr + 1
                    triple_list.append([])
                    for s in i:
                        st = str(s).replace('b\'', '')
                        st = st.replace('\'', '')
                        ip.append(st)
                        triple_list[ctr].append(st)
#triple_list[ctr].append(i[0])
#triple_list[ctr].append(i[0])
#triple_list[ctr].append(0)
                    print(triple_list[ctr])
                    for post_response in post_response_list:
                        #               print(post_response)
                        #               print(ip, post_response.responder.email)
                        if (post_response.responder.email in ip):

                            annotation_triplet = (
                                post_response.responder.email,
                                post_response.post.content,
                                post_response.label.content)
                            #       print (post_response.responder.email)
                            #       print(annotation_triplet)
                            annotation_triplet_list.append(annotation_triplet)

                            t = AnnotationTask(annotation_triplet_list)
#print("kappa " +str(t.kappa()))
                    triple_list[ctr].append(str(t.kappa()))
                    #str(t.kappa()))
                    self.lblr.append(triple_list)
                    ctr = ctr + 1
                self.alpha1 = t.alpha()
                #	                        print (triple_list)
                self.kappa1.append(triple_list)
                #print ('before EXPORT')
                #				exportCSV(triple_list, self.alpha1, self.coder_emails)
                #				 print ('in export CSV')
                #with open('result.csv','w') as file:
                #print(self.task)

                name = "media/csvfile.csv"  #+ str(self.task.pk)
                self.kappa_nameLong = "/" + name
                print(name)

                #dateVal =datetime.datetime.now()
                filep = open(str(name), "w+")
                #print ('here in csv')
                #print (filep)
                i = 0
                ct = 1
                filep.write(',')
                prev_email = 's'
                #if 's' is not '*****@*****.**':
                #	print (True)
                for email in triple_list:
                    #					print (email)
                    if email[0] != prev_email:
                        #print ('in email 0', email[0])
                        prev_email = email[0]
                        filep.write(email[0])
                        filep.write(',')
                filep.write(email[1])
                filep.write('\n')
                for row in triple_list:
                    #	print (row[0], row[1], row[2])
                    #filep.write(row[0])
                    #filep.write(row[0])
                    #				for i in range(0, len(self.coder_emails)):
                    #					filep.write(row[0])
                    #					filep.write(';')
                    #	print (i)
                    if i == 0 or i == ct - 1:
                        filep.write(row[0])
                        filep.write(',')
                        #		print('row 0', row[0])
                        for k in range(0, i):
                            filep.write('--,')
#						filep.write(row[0])
#						filep.write(';')

                    if i == len(self.coder_emails) - 1:
                        i = ct
                        filep.write(row[2])
                        #	filep.write(row[0])
                        #						print (ct)
                        #						print (range(0,ct))
                        #						for k in range(0,ct) :
                        #							filep.write('--;')
                        filep.write('\n')
                        ct = ct + 1
                    else:
                        i = i + 1
                        #						filep.write('--')
                        filep.write(row[2])
                        filep.write(',')
                    #filep.write('\n')
                filep.close()

            else:
                post_response_t_temp = post_response_t
                for post_response in post_response_t_temp:
                    post_response = str(post_response).replace('b\'', '')
                    post_response = post_response.replace('\'', '')
                    lst_rp.append(post_response)
                    print(post_response)

            self.head.append(lst_rp)
            comb = combinations(post_response_t_temp, 2)
            #	print (comb)
            #ctr = 0
            #triple_list = []
            ip = []
            lst_rp = []
            triple_list = []
            ctr = 0

            for i in list(comb):
                #				print ("in the comb")
                annotation_triplet_list = []
                ip = []
                sp = ""
                #	ct = ctr + 1
                print(ctr)
                temp = str(i[0]).replace('b\'', '')
                temp = temp.replace('\'', '')
                if ([temp, temp, '0'] not in triple_list):
                    triple_list.append([])
                    triple_list[ctr].append(temp)
                    triple_list[ctr].append(temp)
                    triple_list[ctr].append('0')
                    ctr = ctr + 1
                triple_list.append([])
                for s in i:
                    #	print (s)
                    #print (sp)
                    #p.join(s)
                    #p.join(" , ")
                    #					print ("sds"+s)
                    #print (s)
                    #					sp = sp+s+" , "
                    st = str(s).replace('b\'', '')
                    st = st.replace('\'', '')
                    print(st)
                    ip.append(st)
                    triple_list[ctr].append(st)
                #triple_list[ctr].append(i[0])
                #triple_list[ctr].append(i[0])
                #triple_list[ctr].append(0)

                print(triple_list[ctr])
                #triple_list.append(sp)
                #	print(triple_list)
                #print(post_response_list)
                for post_response in post_response_list:
                    #		print(post_response)
                    #		print(ip, post_response.responder.email)
                    if (post_response.responder.email in ip):

                        annotation_triplet = (post_response.responder.email,
                                              post_response.post.content,
                                              post_response.label.content)
                        #	print (post_response.responder.email)
                        #	print(annotation_triplet)
                        annotation_triplet_list.append(annotation_triplet)

                        t = AnnotationTask(annotation_triplet_list)
                #print("kappa " +str(t.kappa()))
                triple_list[ctr].append(str(t.kappa()))
                #str(t.kappa()))
                self.lblr.append(triple_list)

                ctr = ctr + 1
            if len(post_response_t) > 5:
                self.alpha = self.alpha1
            else:
                self.alpha = t.alpha()
            print(triple_list)
            self.kappa.append(triple_list)
            name = "media/csvfileFinal.csv"  #+ str(self.task.pk)
            self.kappa_name = "/" + name
            #print (name)
            #dateVal =datetime.datetime.now()
            filep = open(str(name), "w+")
            #print ('here in csv')
            #print (filep)
            i = 0
            ct = 1
            filep.write(',')
            prev_email = 's'
            #if 's' is not '*****@*****.**':
            #       print (True)
            for email in triple_list:
                #                       print (email)
                if email[0] != prev_email:
                    prev_email = email[0]
                    filep.write(email[0])
                    filep.write(',')
            filep.write(email[1])
            filep.write('\n')
            for row in triple_list:
                #       print (row[0], row[1], row[2])
                #filep.write(row[0])
                #filep.write(row[0])
                #                               for i in range(0, len(self.coder_emails)):
                #                                       filep.write(row[0])
                #                                       filep.write(';')
                #       print (i)
                if i == 0 or i == ct - 1:
                    filep.write(row[0])
                    filep.write(',')
                    #               print('row 0', row[0])
                    for k in range(0, i):
                        filep.write('--,')
#                                               filep.write(row[0])
#                                               filep.write(';')

                if i == len(self.coder_emails) - 1:
                    i = ct
                    filep.write(row[2])
                    #       filep.write(row[0])
                    #                                               print (ct)
                    #                                               print (range(0,ct))
                    #                                               for k in range(0,ct) :
                    #                                                       filep.write('--;')
                    filep.write('\n')
                    ct = ct + 1
                else:
                    i = i + 1
                    #                                               filep.write('--')
                    filep.write(row[2])
                    filep.write(',')
                #filep.write('\n')
            filep.close()
            #self.kappa.append(triple_list)

    #	print (self.kappa)
        except:
            self.alpha = 'N/A'
            name = "media/csvfileFinal.csv"  #+ str(self.task.pk)
            self.kappa_name = "/" + name
            filep = open(str(name), "w+")

        return super(TaskEvaluationDetailView,
                     self).dispatch(request, *args, **kwargs)
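# For reference, a compact, self-contained sketch (not the original view code) of the
# pairwise Cohen's kappa computation that dispatch() performs on PostResponse rows.
# The (coder, item, label) triples passed in are hypothetical plain-Python input.
from itertools import combinations
from nltk.metrics.agreement import AnnotationTask

def pairwise_kappa(responses):
    """responses: list of (coder, item, label) triples covering all coders."""
    coders = sorted({coder for coder, _, _ in responses})
    scores = {}
    for a, b in combinations(coders, 2):
        # Restrict the data to the two coders and let NLTK compute their kappa.
        pair = [r for r in responses if r[0] in (a, b)]
        scores[(a, b)] = AnnotationTask(data=pair).kappa()
    return scores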
Exemple #34
0
def test_agreement_statistics():
    """Tests agreement statistics functions against those found in NLTK:
        https://www.nltk.org/api/nltk.metrics.html#module-nltk.metrics.agreement

    Compares the values of agreement statistics with those found in:
        Artstein, R. and Poesio, M. (2005) Kappa 3 = Alpha (or Beta) University of Essex NLE Technote

    Data is in:
        artstein_poesio_example.txt
    """

    file_path = os.path.join("label_data", "artstein_poesio_example.txt")

    # Distance function for weighted agreement stats
    def test_distance_func(label_a, label_b):
        if label_a == label_b:
            return 0
        elif (label_a == 'ireq'
              and label_b == 'stat') or (label_b == 'ireq'
                                         and label_a == 'stat'):
            return 1
        else:
            return 0.5

    # Gets individual user labels
    def get_user_labels(path):
        with open(path, 'r') as file:
            a_stat = [0] * 100
            a_ireq = [0] * 100
            a_chck = [0] * 100

            b_stat = [0] * 100
            b_ireq = [0] * 100
            b_chck = [0] * 100

            for line in file:
                usr = line.split()[0]
                ind = int(line.split()[1])
                lbl = line.split()[2]
                if usr == 'a':
                    if lbl == 'chck':
                        a_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        a_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        a_ireq[ind - 1] += 1

                elif usr == 'b':
                    if lbl == 'chck':
                        b_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        b_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        b_ireq[ind - 1] += 1

            a_data = {'stat': a_stat, 'ireq': a_ireq, 'chck': a_chck}
            a_frame = pd.DataFrame(a_data)
            b_data = {'stat': b_stat, 'ireq': b_ireq, 'chck': b_chck}
            b_frame = pd.DataFrame(b_data)
            example_users_dict = {'a': a_frame, 'b': b_frame}
        return example_users_dict

    # NLTK stats
    nltk_stats = AnnotationTask(data=[x.split() for x in open(file_path)])
    print("nltk:")
    print("multi-Pi - " + str(nltk_stats.pi()))
    print("multi-kappa - " + str(nltk_stats.multi_kappa()))
    print("alpha - " + str(nltk_stats.alpha()))

    # Stats from my functions
    example_users = get_user_labels(file_path)
    print("Mine:")
    print("Multi-Pi - {0:.4f}".format(multi_pi(example_users)))
    print("multi-kappa - {0:.4f}".format(multi_kappa(example_users)))
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("alpha prime - {0:.4f}".format(
        alpha_prime(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))

    # Expected values from Artstein and Poesio
    print("Expected:")
    print("mulit-Pi - " + str(0.7995))
    print("mulit-kappa - " + str(0.8013))
    print("alpha - " + str(0.8156))
    print("alpha prime - " + str(0.8146))
    print("beta - " + str(0.8163))

    # Test bias
    uniform_path = os.path.join("label_data", "bias_uniform.txt")
    unequal_path = os.path.join("label_data", "bias_unequal.txt")
    b_uniform = get_user_labels(uniform_path)
    b_unequal = get_user_labels(unequal_path)

    print("Bias with example_users:")
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(example_users, test_distance_func)))

    # Test uniform first
    print("Bias with uniform:")
    print("alpha - {0:.4f}".format(alpha(b_uniform, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_uniform, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_uniform, test_distance_func)))

    print("Bias with unequal:")
    print("alpha - {0:.4f}".format(alpha(b_unequal, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_unequal, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_unequal, test_distance_func)))
Exemple #35
0
from nltk.metrics import ConfusionMatrix
from nltk.metrics.agreement import AnnotationTask
# Here we have four items, each labeled by two different annotators. In two cases, the annotators agree. In two cases they don't.
toy_data = [
    # annotators, element, label
    ['1', 5723, 'ORG'],
    ['2', 5723, 'ORG'],
    ['1', 55829, 'LOC'],
    ['2', 55829, 'LOC'],
    ['1', 259742, 'PER'],
    ['2', 259742, 'LOC'],
    ['1', 269340, 'PER'],
    ['2', 269340, 'LOC']
]
task = AnnotationTask(data=toy_data)
print(task.kappa())
print(task.alpha())
# 16:52: Yes! It works!

# The annotator is replaced by splitting the data into two variables
# The element is replaced by its position in the list
toy1 = ['ORG', 'LOC', 'PER', 'PER']
toy2 = ['ORG', 'LOC', 'LOC', 'LOC']
cm = ConfusionMatrix(toy1, toy2)
print(cm)

# multilabel for a single class (one goal)
# only 2 raters

rater1 = ['no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no']
rater2 = ['yes', 'no', 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes']
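
# A minimal sketch (not part of the original snippet): feed the two raters' labels
# to NLTK's AnnotationTask, using the list position as the item id as noted above.
rater_data = [('rater1', i, label) for i, label in enumerate(rater1)]
rater_data += [('rater2', i, label) for i, label in enumerate(rater2)]
rater_task = AnnotationTask(data=rater_data)
print(rater_task.kappa())   # Cohen's kappa for the two raters
print(rater_task.avg_Ao())  # average observed agreement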
Exemple #36
0
import itertools

from nltk.metrics.agreement import AnnotationTask

# `data` is assumed to be a pandas DataFrame with one column of labels per coder.
allcoders = data.columns
experts = ['KEY', 'MG', 'MS', 'TM']
novices = ['KEY', 'CK', 'GK', 'RM']

cols = novices

# Total values
taskdata = []
for coder in cols:
    for i in data[coder].index:
        taskdata.append([coder, i, data[coder][i]])

ratingtask = AnnotationTask(data=taskdata)
print("kappa " + str(ratingtask.kappa()))
print("fleiss " + str(ratingtask.multi_kappa()))
print("alpha " + str(ratingtask.alpha()))
print("scotts " + str(ratingtask.pi()))

# Pairwise values
similarities = []
for coders in itertools.product(cols, repeat=2):
    if coders[0] == coders[1]:
        similarities.append(1)
    else:
        taskdata = []
        for coder in coders:
            for i in data[coder].index:
                taskdata.append([coder, i, data[coder][i]])

        ratingtask = AnnotationTask(data=taskdata)
        k = ratingtask.kappa()
        similarities.append(k)
    return (description_triples, action_triples, relation_triples)

def xmls_to_triples(filenames):
    total_annotators = len(filenames)
    description_triples = []
    action_triples = []
    relation_triples = []
    tagged_lines_count = get_tagged_lines(filenames)
    def tag_filter(tag):
        tagged_start_line_count = tagged_lines_count[tag.attrib["span_start_line"]]
        tagged_end_line_count = tagged_lines_count[tag.attrib["span_end_line"]]
        return tagged_start_line_count == total_annotators and tagged_end_line_count == total_annotators
    for filename in filenames:
        triples = xml_to_triples(filename, tag_filter)
        description_triples.extend(triples[0])
        action_triples.extend(triples[1])
        relation_triples.extend(triples[2])
    return (description_triples, action_triples, relation_triples)

filenames = ["xml_by_annotater/keren.xml",
             "xml_by_annotater/kristen.xml",
             "xml_by_annotater/jingdi.xml"]

triples = xmls_to_triples(filenames)

for n in range(3):
    task = AnnotationTask(data=triples[n])
    print(task.C)
    print("kappa:", task.multi_kappa())
    print("alpha:", task.alpha())