import sys

from nltk.metrics.agreement import AnnotationTask

# getReliabilityMatImg and getReliabilityMatTurker are project-local helpers (not shown here).


def __main__(argv):
	if len(argv) != 2:
		print("Specify cmd arg")
		sys.exit(2)
	else:
		arg = argv[1]
		if arg == 'img':
			reliability_mat = getReliabilityMatImg("../data/imageGID_job_map_expt2_corrected.csv")
		else:
			reliability_mat = getReliabilityMatTurker()

		
		t = AnnotationTask(data=reliability_mat)

		print("Calculating the agreement scores")
		
		alpha = t.alpha()
		print("Alpha = %f" %alpha)
		
		s = t.S()
		print("S = %f" %s)

		pi = t.pi()
		print("Pi = %f" %pi)

		kappa = t.kappa()
		print("kappa = %f" %kappa)
Example #2
from nltk.metrics.agreement import AnnotationTask

# reverse_tags is a project-local helper (not shown here).


def agree_tags(delta, column):
    """
    Computes agreement for single-token labelling tasks.
    :param delta: the collated annotation data being compared
    :param column: the column for which agreement should be computed
    :return:
    """
    by_field = reverse_tags(delta, column)

    task = AnnotationTask(data=by_field)

    oa = task.avg_Ao()      # observed agreement
    s = task.S()            # S (Bennett, Albert and Goldstein, 1954): assumes all categories are equally likely
    pi = task.pi()          # Scott's pi (1955): a single, pooled label distribution
    kappa = task.kappa()    # Cohen's kappa (1960): per-coder label distributions
    w_kappa = task.weighted_kappa()  # weighted kappa (Cohen, 1968)
    alpha = task.alpha()    # Krippendorff's alpha (1980)

    return oa, s, pi, kappa, w_kappa, alpha
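AnnotationTask consumes (coder, item, label) triples, which is presumably what reverse_tags returns. A minimal, self-contained sketch of that input shape, with made-up coder, item and label values purely for illustration:

from nltk.metrics.agreement import AnnotationTask

# Hypothetical triples: (coder, item, label)
toy_data = [
    ('c1', 'tok1', 'NOUN'), ('c2', 'tok1', 'NOUN'),
    ('c1', 'tok2', 'VERB'), ('c2', 'tok2', 'NOUN'),
]
toy_task = AnnotationTask(data=toy_data)
print(toy_task.avg_Ao(), toy_task.S(), toy_task.pi(), toy_task.kappa(), toy_task.alpha())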
Example #3
from nltk.metrics.agreement import AnnotationTask


def compute_annotator_agreement_nltkmetrics(data_array):
    ''' See http://nltk.org/api/nltk.metrics.html#nltk.metrics.agreement '''

    print("####### Agreement coefficients according to NLTK metrics.agreement #######")

    t = AnnotationTask(data=data_array)
    print("Average observed agreement across all coders and items: " + str(t.avg_Ao()))

    print("Cohen's Kappa (Cohen 1960): " + str(t.kappa()))
    print("Weighted kappa (Cohen 1968): " + str(t.weighted_kappa()))

    print("Scott's pi (Scott 1955): " + str(t.pi()))
    #print("pi_avg: " + str(t.pi_avg()))

    print("alpha (Krippendorff 1980): " + str(t.alpha()))

    print("Observed disagreement for the alpha coefficient: " + str(t.Do_alpha()))
    print("S (Bennett, Albert and Goldstein 1954): " + str(t.S()))
    #print("n-notation used in Artstein and Poesio (2007): " + str(t.N(k=, ic???)))
    print("Observed disagreement for the weighted kappa coefficient averaged over all labelers: " + str(t.Do_Kw()))
Example #4
import itertools

from nltk.metrics.agreement import AnnotationTask

# `data` is assumed to map each coder ID to a pandas Series of labels (not shown here).
experts = ['KEY', 'MG', 'MS', 'TM']
novices = ['KEY', 'CK', 'GK', 'RM']

cols = novices

# Total values
taskdata = []
for coder in cols:
    for i in data[coder].index:
        taskdata.append([coder, i, data[coder][i]])

ratingtask = AnnotationTask(data=taskdata)
print("kappa " + str(ratingtask.kappa()))
print("fleiss " + str(ratingtask.multi_kappa()))
print("alpha " + str(ratingtask.alpha()))
print("scotts " + str(ratingtask.pi()))

# Pairwise values
similarities = []
for coders in itertools.product(cols, repeat=2):
    if coders[0] == coders[1]:
        similarities.append(1)
    else:
        taskdata = []
        for coder in coders:
            for i in data[coder].index:
                taskdata.append([coder, i, data[coder][i]])

        ratingtask = AnnotationTask(data=taskdata)
        k = ratingtask.kappa()
        f = ratingtask.multi_kappa()
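The snippet breaks off right after k and f are computed. A hedged sketch of one way the pairwise collection could be finished (this is not the original code; itertools.combinations is used so each symmetric pair is computed only once):

pairwise_scores = {}
for coder_a, coder_b in itertools.combinations(cols, 2):
    taskdata = [[coder, i, data[coder][i]]
                for coder in (coder_a, coder_b)
                for i in data[coder].index]
    ratingtask = AnnotationTask(data=taskdata)
    pairwise_scores[(coder_a, coder_b)] = {
        'kappa': ratingtask.kappa(),
        'multi_kappa': ratingtask.multi_kappa(),
    }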
Example #5
def status_view(request, task_id=None):
    """
    Renders the evaluation tasks status page for staff users.
    """
    LOGGER.info('Rendering evaluation task overview for user "{0}".'.format(
      request.user.username))
    
    # Check if the user is a member of the WMT13 group.  If so, redirect to the wmt13 app.
    if request.user.groups.filter(name="WMT13").exists():
        LOGGER.info('Redirecting user "{0}" to WMT13 overview.'.format(
          request.user.username))
        return redirect('appraise.wmt13.views.overview')
    
    if task_id:
        task = get_object_or_404(EvaluationTask, task_id=task_id)
        
        headers = task.get_status_header()
        status = []
        
        for user in task.users.all():
            status.append((user.username, task.get_status_for_user(user)))
        
        scores = None
        result_data = []
        raw_result_data = Counter()
        users = list(task.users.all())
        
        for item in EvaluationItem.objects.filter(task=task):
            results = []
            for user in users:
                qset = EvaluationResult.objects.filter(user=user, item=item)
                if qset.exists():
                    category = str(qset[0].results)
                    results.append((user.id, item.id, category))
                    raw_result_data[qset[0].raw_result] += 1
            
            if len(results) == len(users):
                result_data.extend(results)

        # TODO (gisting): compute the percentage of answers relative to the number of
        # distinct answers in that same gap, and regroup them for readability.
        _raw_results = []
        _keys = raw_result_data.keys()
        _total_results = float(sum(raw_result_data.values()))
        for key in sorted(_keys):
            value = raw_result_data[key]
            _raw_results.append((key, value, 100 * value / _total_results))
        
        try:
            # Computing inter-annotator agreement only makes sense for more
            # than one coder -- otherwise, we only display result_data...
            if len(users) > 1:
                # Check if we can safely use NLTK's AnnotationTask class.
                try:
                    from nltk.metrics.agreement import AnnotationTask
                    chk = AnnotationTask(data=[('b', '1', 'k'),
                      ('a', '1', 'k')])
                    assert(chk == 1.0)
                
                except AssertionError:
                    LOGGER.debug('Fixing outdated version of AnnotationTask.')
                    from appraise.utils import AnnotationTask

                # We have to sort annotation data to prevent StopIteration errors.
                result_data.sort()
                annotation_task = AnnotationTask(result_data)
                
                scores = (
                  annotation_task.alpha(),
                  annotation_task.kappa(),
                  annotation_task.S(),
                  annotation_task.pi()
                )
        
        except ZeroDivisionError:
            scores = None
        
        except ImportError:
            scores = None
        
        dictionary = {
          'combined': task.get_status_for_users(),
          'commit_tag': COMMIT_TAG,
          'headers': headers,
          'scores': scores,
          'raw_results': _raw_results,
          'status': status,
          'task_id': task.task_id,
          'task_name': task.task_name,
          'title': 'Evaluation Task Status',
        }

        return render(request, 'evaluation/status_task.html', dictionary)
    
    else:
        evaluation_tasks = {}
        for task_type_id, task_type in APPRAISE_TASK_TYPE_CHOICES:
            # We collect a list of task descriptions for this task_type.
            evaluation_tasks[task_type] = []
        
            # Super users see all EvaluationTask items, even non-active ones.
            if request.user.is_superuser:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id)
        
            else:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id,
                  active=True)
        
            # Loop over the QuerySet and compute task description data.
            for _task in _tasks:
                if _task.task_id not in APPRAISE_TASK_CACHE:
                    APPRAISE_TASK_CACHE[_task.task_id] = {}
                
                _cache = APPRAISE_TASK_CACHE[_task.task_id]
                if request.user.username not in _cache:
                    _update_task_cache(_task, request.user)
                
                _task_data = _cache[request.user.username]
                
                # Append new task description to current task_type list.
                evaluation_tasks[task_type].append(_task_data)
            
            # If there are no task descriptions for this task_type, we skip it.
            if len(evaluation_tasks[task_type]) == 0:
                evaluation_tasks.pop(task_type)

        dictionary = {
          'active_page': "STATUS",
          'commit_tag': COMMIT_TAG,
          'evaluation_tasks': evaluation_tasks,
          'title': 'Evaluation Task Status',
        }

        return render(request, 'evaluation/status.html', dictionary)
Example #7
import matplotlib.pyplot as plt
from nltk.metrics.agreement import AnnotationTask
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def calculate_iaa_label(number, data_dict):
    data = []
    y_true = []
    y_pred = []
    i = 0

    if number == 1:
        for key, value in data_dict.items():
            i += 1
            if value['label1'] in ['', ' ']:
                data.append(('Annotator1', str(i), '0'))
                y_pred.append('0')
            else:
                data.append(('Annotator1', str(i), value['label1']))
                y_pred.append(value['label1'])

            if value['label1_2'] in ['', ' ']:
                data.append(('Annotator2', str(i), '0'))
                y_true.append('0')
            else:
                data.append(('Annotator2', str(i), value['label1_2']))
                y_true.append(value['label1_2'])

        t = AnnotationTask(data)
        print("Scott's Pi for Label {}: {}".format(number, t.pi()))

        matrix = confusion_matrix(y_true, y_pred)
        disp = ConfusionMatrixDisplay(matrix,
                                      display_labels=[
                                          "0", "1", "1+", "1-", "2", "2+",
                                          "2-", "3", "3+", "3-", "4", "4+",
                                          "4-", "5", "5+", "5-"
                                      ])
        disp = disp.plot(include_values=True, values_format="d")

        fig = plt.gcf()
        fig.set_size_inches(6.5, 6.5)
        plt.xlabel('Annotator2')
        plt.ylabel('Annotator1')
        plt.title('Agreement Label 1')

        # Save before show(): once the window is closed, savefig would write a blank figure.
        plt.savefig('agreement_label1')
        plt.show()

    else:
        for key, value in data_dict.items():
            i += 1
            if value['label2'] in ['', ' ']:
                data.append(('Annotator1', str(i), '0'))
                y_pred.append('0')
            else:
                data.append(('Annotator1', str(i), value['label2']))
                y_pred.append(value['label2'])

            if value['label2_2'] in ['', ' ']:
                data.append(('Annotator2', str(i), '0'))
                y_true.append('0')
            else:
                data.append(('Annotator2', str(i), value['label2_2']))
                y_true.append(value['label2_2'])

        t = AnnotationTask(data)
        print("Scott's Pi for Label {}: {}".format(number, t.pi()))

        matrix = confusion_matrix(y_true, y_pred)
        disp = ConfusionMatrixDisplay(matrix,
                                      display_labels=[
                                          "0", "1", "1+", "1-", "2", "2+",
                                          "2-", "3", "3+", "3-", "4", "4+",
                                          "4-", "5", "5+", "5-"
                                      ])
        disp = disp.plot(include_values=True, values_format="d")

        fig = plt.gcf()
        fig.set_size_inches(6.5, 6.5)
        plt.xlabel('Annotator2')
        plt.ylabel('Annotator1')
        plt.title('Agreement Label 2')

        # Save before show(): once the window is closed, savefig would write a blank figure.
        plt.savefig('agreement_label2')
        plt.show()
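The two-annotator setting here is exactly where Cohen's kappa applies; if kappa is wanted in addition to Scott's pi, the same AnnotationTask exposes it directly. A minimal sketch, reusing the data triples and number built inside the function above:

t = AnnotationTask(data)
print("Scott's Pi for Label {}: {}".format(number, t.pi()))
print("Cohen's Kappa for Label {}: {}".format(number, t.kappa()))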
Example #8
import os

import pandas as pd
from nltk.metrics.agreement import AnnotationTask

# multi_pi, multi_kappa, alpha, alpha_prime, beta and bias are this module's own
# implementations under test (defined elsewhere, not shown here).


def test_agreement_statistics():
    """Tests agreement statistics functions against those found in NLTK:
        https://www.nltk.org/api/nltk.metrics.html#module-nltk.metrics.agreement

    Compares the values of agreement statistics with those found in:
        Artstein, R. and Poesio, M. (2005) Kappa 3 = Alpha (or Beta) University of Essex NLE Technote

    Data is in:
        artstein_poesio_example.txt
    """

    file_path = os.path.join("label_data", "artstein_poesio_example.txt")

    # Distance function for weighted agreement stats
    def test_distance_func(label_a, label_b):
        if label_a == label_b:
            return 0
        elif ((label_a == 'ireq' and label_b == 'stat')
              or (label_a == 'stat' and label_b == 'ireq')):
            return 1
        else:
            return 0.5

    # Gets individual user labels
    def get_user_labels(path):
        with open(path, 'r') as file:
            a_stat = [0] * 100
            a_ireq = [0] * 100
            a_chck = [0] * 100

            b_stat = [0] * 100
            b_ireq = [0] * 100
            b_chck = [0] * 100

            for line in file:
                usr = line.split()[0]
                ind = int(line.split()[1])
                lbl = line.split()[2]
                if usr == 'a':
                    if lbl == 'chck':
                        a_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        a_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        a_ireq[ind - 1] += 1

                elif usr == 'b':
                    if lbl == 'chck':
                        b_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        b_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        b_ireq[ind - 1] += 1

            a_data = {'stat': a_stat, 'ireq': a_ireq, 'chck': a_chck}
            a_frame = pd.DataFrame(a_data)
            b_data = {'stat': b_stat, 'ireq': b_ireq, 'chck': b_chck}
            b_frame = pd.DataFrame(b_data)
            example_users_dict = {'a': a_frame, 'b': b_frame}
        return example_users_dict

    # NLTK stats
    nltk_stats = AnnotationTask(data=[x.split() for x in open(file_path)])
    print("nltk:")
    print("multi-Pi - " + str(nltk_stats.pi()))
    print("multi-kappa - " + str(nltk_stats.multi_kappa()))
    print("alpha - " + str(nltk_stats.alpha()))

    # Stats from my functions
    example_users = get_user_labels(file_path)
    print("Mine:")
    print("Multi-Pi - {0:.4f}".format(multi_pi(example_users)))
    print("multi-kappa - {0:.4f}".format(multi_kappa(example_users)))
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("alpha prime - {0:.4f}".format(
        alpha_prime(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))

    # Expected values from Artstein and Poesio
    print("Expected:")
    print("mulit-Pi - " + str(0.7995))
    print("mulit-kappa - " + str(0.8013))
    print("alpha - " + str(0.8156))
    print("alpha prime - " + str(0.8146))
    print("beta - " + str(0.8163))

    # Test bias
    uniform_path = os.path.join("label_data", "bias_uniform.txt")
    unequal_path = os.path.join("label_data", "bias_unequal.txt")
    b_uniform = get_user_labels(uniform_path)
    b_unequal = get_user_labels(unequal_path)

    print("Bias with example_users:")
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(example_users, test_distance_func)))

    # Test uniform first
    print("Bias with uniform:")
    print("alpha - {0:.4f}".format(alpha(b_uniform, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_uniform, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_uniform, test_distance_func)))

    print("Bias with unequal:")
    print("alpha - {0:.4f}".format(alpha(b_unequal, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_unequal, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_unequal, test_distance_func)))