# NOTE(review): the statements below were collapsed onto a single physical
# line; the original line breaks are lost, so everything after the first '#'
# on that line is lexically a comment. Restore the layout before running.
#
# preprocessing(raw_document): lower-cases the text, replaces every URL
# ('http' up to the next whitespace or end of string) and every character
# that is not an ASCII letter or whitespace with a space, then returns the
# tokens produced by word_tokenize.
#
# Script part: loads 'data/exportMedium.json' and, for each annotation of each
# document, builds the tuple (token count of the document, proposalFlag,
# duration). `blob` groups these tuples by proposalFlag (sorted first, as
# groupby requires), so the flag strings are the dictionary keys.
#
# NOTE(review): the final `plotContent = dict((...` statement is cut off in
# this view (its closing part is missing), and its tuple-parameter lambda is
# Python 2-only syntax.
import seaborn as sns sns.set(style="white", color_codes=True) def preprocessing(raw_document): urls = r'(http.+?(\s|$))' specialchar = r'|[^A-Za-z\s]' doc = raw_document.lower() tokens = word_tokenize(re.sub(urls + specialchar, ' ', doc)) return tokens A = Annotation('data/exportMedium.json') docAnnos = A.perDocument(1, ['document', 'annotations']) # This blob produces a dictionary with the values ['proposal', 'no # proposal', 'wrong proposal']. Values are tuples (number of word in document, proposalFlag, duration) blob = dict((k, list(v)) for k, v in groupby(sorted([(len(preprocessing(document)), annotation['proposalFlag'], annotation['duration']) for (document, annotations) in docAnnos for annotation in annotations], key=itemgetter(1)), key=itemgetter(1))) plotContent = dict(( f, map(lambda (count, flag, duration): (count, A.durationToSec(duration)), v))
# Counts, per number of annotations on a document, how many documents carry
# 1, 2, ... distinct first labels, and prepares the values used for plotting.
from __future__ import division
from Annotation import Annotation as Annotation
from operator import itemgetter
from itertools import groupby
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

sns.set(color_codes=True, style="whitegrid")

A = Annotation('data/exportMedium.json')
# NOTE(review): the meaning of the first perDocument argument (2) is defined
# in the Annotation module -- presumably a minimum annotation count; confirm.
documents, allAnnotations = zip(*A.perDocument(2, ['document', 'annotations']))

# One (number of annotations, number of distinct first labels) pair per
# document.
totalByAggrement = []
for annotations in allAnnotations:
    firstLabels = {a['labels'][0] for a in annotations}
    totalByAggrement.append((len(annotations), len(firstLabels)))

# For every annotation count: a list of
# (number of distinct labels, number of documents). The pairs are sorted
# first because groupby only merges adjacent items.
rlt = []
for length, v1 in groupby(sorted(totalByAggrement), key=itemgetter(0)):
    perDistinct = [(num, sum(1 for _ in v2))
                   for num, v2 in groupby(v1, key=itemgetter(1))]
    rlt.append((length, perDistinct))

# upper limit for the number of annotations per document
# rlt = rlt[:2]
numOfAnno, x = zip(*rlt)
# Largest number of distinct labels observed on any document.
maxDiff = max(distinct for groups in x for distinct, _count in groups)
# NOTE(review): plt.Figure() instantiates the Figure class directly rather
# than calling plt.figure(); kept as in the original -- confirm it is intended.
fig = plt.Figure()
index = np.arange(len(rlt))
# NOTE(review): the statements below were collapsed onto a single physical
# line, and the span starts inside a function whose header is not visible
# here; its last two statements compute kappa = (Pd - Pe) / (1 - Pe) and
# return it. Restore the original line breaks/indentation before running.
#
# duplicates(lst): returns (item, count) pairs, via collections.Counter, for
# every item that occurs more than once in lst.
#
# Script part: for each document whose annotation list has exactly maxAnno (3)
# entries, counts the first label of every annotation with FreqDist, lays the
# counts out as one row per document over the six entries of `labels`
# (0 where a label is absent), passes that matrix to fleissKappa and prints
# the result. The value rounded to three decimals is written to
# fleissKappa.tex.
#
# NOTE(review): `print "..."` is Python 2 syntax, the matrix rows rely on
# map() returning a list (Python 2), and `file` shadows the Python 2 builtin.
kappa = (Pd - Pe) / (1 - Pe) return kappa def duplicates(lst): return [(item, count) for item, count in collections.Counter(lst).items() if count > 1] A = Annotation('data/exportMedium.json') maxAnno = 3 labelsPerDocDist = [ FreqDist(map(lambda x: x['labels'][0], pD[0])) for pD in A.perDocument(3, ['annotations']) if len(pD[0]) == maxAnno ] labels = ['Neg', 'Neut', 'Pos', 'No Sent', 'Undecided', 'Irrelevant'] matrix = np.array([ map(lambda l: fd[l] if fd.get(l) else 0, labels) for fd in labelsPerDocDist ]) fKappa = fleissKappa(matrix, maxAnno) print " Fleiss' Kappa: {}".format(fKappa) path = '/home/kai/Dropbox/MA/thesis/const/fleissKappa.tex' with open(path, 'w+') as file: file.write(str(round(fKappa * 1000) / 1000))
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

sns.set(color_codes=True, style="whitegrid")


def save(name, value):
    """Write *value* to the thesis constant file ``<name>.tex``.

    The file lives in the hard-coded thesis ``const`` directory and is
    overwritten on every call.

    name  -- base name of the target file, without the ``.tex`` suffix
    value -- string to store as the file's entire content
    """
    path = '/home/kai/Dropbox/MA/thesis/const/{}.tex'.format(name)
    # `handle` rather than `file`, which shadows the Python 2 builtin.
    with open(path, 'w+') as handle:
        handle.write(value)


A = Annotation('data/exportMedium.json')
# presumably each entry is a (labels, proposalFlag) pair in the requested
# order -- the grouping below relies on the flag being at index 1.
annotations = A.ofAll(['labels', 'proposalFlag'])
perDocument = A.perDocument(3, ['dateTime'])

save('numberOfAnnotations', str(len(annotations)))
# save('numberOfDocuments', str(len(perDocument)))
#
# Group the annotations by proposal flag; groupby only merges adjacent items,
# so the list is sorted by the same key first.
byFlag = itemgetter(1)
flags = dict((k, list(v))
             for k, v in groupby(sorted(annotations, key=byFlag), key=byFlag))

# A flag value that never occurs in the data has no entry in `flags`; count it
# as zero instead of failing with a KeyError.
for constant, flag in (('numberOfProposals', 'proposal'),
                       ('numberOfNoProposals', 'no proposal'),
                       ('numberOfWrongProposals', 'wrong proposal')):
    save(constant, str(len(flags.get(flag, []))))
# NOTE(review): the statements below were collapsed onto a single physical
# line; restore the original line breaks/indentation before running.
#
# groupAnnotatorAgreement(allAnnotations): for every annotation list, counts
# the distinct first labels (a['labels'][0]) and returns a dict mapping that
# count to the annotation lists that have it. The name `k` is reused as the
# loop variable of the generator expression; the groupby(...) argument is
# evaluated in the enclosing scope, so `key=k` there still refers to
# itemgetter(0).
#
# Script part: keeps documents whose annotation list has exactly 3 entries,
# orders each list with sortByTime, keeps those accepted by lastHasProposal
# (both helpers are defined outside this view) and groups the remainder with
# groupAnnotatorAgreement.
#
# NOTE(review): the triple-quoted string passed to the Python 2 `print`
# statement is cut off in this view (no closing quotes are visible).
def groupAnnotatorAgreement(allAnnotations): numDiffLabels = [(len(set(map(lambda a: a['labels'][0], annotations))), annotations) for annotations in allAnnotations] k = itemgetter(0) return dict((k, map(itemgetter(1), v)) for k, v in groupby(sorted(numDiffLabels, key=k), key=k)) A = Annotation('data/exportMedium.json') sortedAnnotations = [ sortByTime(annotations[0]) for annotations in A.perDocument(3, ['annotations']) if len(annotations[0]) == 3 ] lastPropAnnotations = filter(lastHasProposal, sortedAnnotations) annotatorAgreement = groupAnnotatorAgreement(lastPropAnnotations) print """ The goal is to predict the third annotation based on the two previously seen annotations. This might enable to reduce the number of annotations from 3 to 2. In case all annotators agree, this might indicate that the annotation for this document is obvious. Let's say we want to reduce the number of annotations from 3 to 2. Then, if the first two annotators agree
import numpy as np
import math
import seaborn as sns

sns.set(color_codes=True, style="whitegrid")


def autolabel(rect, label):
    """Draw *label* centred horizontally on *rect*, slightly above its top.

    The text is placed at 1.05 times the rectangle's height on the
    module-level axes object `ax`, which is defined outside this block.
    """
    height = rect.get_height()
    xCenter = rect.get_x() + rect.get_width()/2.
    ax.text(xCenter, 1.05*height, label, ha='center', va='bottom')


def _majorityLabel(docLabels):
    """Return the most common label, or None with three or more distinct ones."""
    if len(set(docLabels)) < 3:
        return Counter(docLabels).most_common()[0][0]
    return None


A = Annotation('data/exportMedium.json')

# First label of every annotation, one inner list per document.
annoPerDocument = [[a['labels'][0] for a in annos[0]]
                   for annos in A.perDocument(3, ['annotations'])]

keys = [u'Neg', u'Neut', u'Pos', u'No Sent', u'Undecided', u'Irrelevant']

# Distribution over all individual annotations.
annotations = list(chain.from_iterable(annoPerDocument))
annoDist = FreqDist(annotations)

# Distribution over the per-document majority label (None when undecidable).
labelDist = FreqDist([_majorityLabel(docLabels)
                      for docLabels in annoPerDocument])

# Counts in the fixed order given by `keys`.
annoCount = np.array([annoDist[key] for key in keys])
labelCount = np.array([labelDist[key] for key in keys])
index = np.arange(len(keys))
# Exports example documents that have exactly three annotations, split by the
# number of distinct first labels (1, 2 or 3), to 1.tsv, 2.tsv and 3.tsv.
from __future__ import division
from Annotation import Annotation as Annotation
from operator import itemgetter
from itertools import groupby

A = Annotation('data/exportMedium.json')

# Per document: (number of annotations, number of distinct first labels,
# tab-separated row of the document text followed by its first labels).
totalByAggrement = []
for document, annotations in A.perDocument(2, ['document', 'annotations']):
    firstLabels = [a['labels'][0] for a in annotations]
    totalByAggrement.append((
        len(annotations),
        len(set(firstLabels)),
        '\t'.join([document] + firstLabels),
    ))

# rlt[number of annotations][number of distinct labels] -> list of rows.
# The tuples are sorted first because groupby only merges adjacent items.
rlt = {}
for length, v1 in groupby(sorted(totalByAggrement), key=itemgetter(0)):
    rlt[length] = dict(
        (num, [entry[2] for entry in v2])
        for num, v2 in groupby(v1, key=itemgetter(1)))

# At most 20 rows for each agreement level of the three-annotation documents.
c = [rlt[3][distinct][:20] for distinct in (1, 2, 3)]

for i, rows in enumerate(c, 1):
    with open(str(i) + '.tsv', 'w') as f:
        f.write('\n'.join(rows).encode('utf-8'))
from itertools import groupby
import matplotlib.pyplot as plt
import numpy as np

A = Annotation('data/exportMedium.json')
numAnnos = 3

# Per document: a list of (first label, proposalFlag, user) triples, kept only
# when the document has exactly `numAnnos` annotations.
threeAnnoPerDoc = []
for annos in A.perDocument(numAnnos, ['annotations']):
    triples = [(a['labels'][0], a['proposalFlag'], a['user'])
               for a in annos[0]]
    if len(triples) == numAnnos:
        threeAnnoPerDoc.append(triples)


def wrongProposal(pD):
    """Return True when only the third entry of *pD* is a 'wrong proposal'.

    *pD* is indexed as three (label, proposalFlag, user) triples; the flag at
    index 1 of the first two must differ from 'wrong proposal' and the flag of
    the third must equal it.
    """
    return (pD[0][1] != 'wrong proposal'
            and pD[1][1] != 'wrong proposal'
            and pD[2][1] == 'wrong proposal')


# Each document's triples are sorted by flag before the check, so a single
# 'wrong proposal' ends up last ('no proposal' < 'proposal' < 'wrong proposal').
wP = [ranked
      for ranked in (sorted(pD, key=itemgetter(1)) for pD in threeAnnoPerDoc)
      if wrongProposal(ranked)]
# def match(pD):
#     match = []
#     annotations = pD[0]
#     for annotation in annotations: