def generateBayes(w, data, senseLocation, tokenLocation, contextSize = 2):
    allSenses = indexBy(senseLocation, data)
    #get word bags: for each sense, collect the context windows around w
    wordBags = dict([(s, []) for s in allSenses])
    for each in allSenses:
        sentences = [x[tokenLocation] for x in allSenses[each]]
        sentences = filter(lambda x: w in x, sentences)
        contexts = [extract(prepSentence(x), contextSize, w, contextSize)\
                    for x in sentences]
        wordBags[each] = contexts
    #bind words to their relative positions to w
    # originally, C(v_j, s_k) -- markLocations factors in locational information
    locWords = dict([(s, []) for s in allSenses])
    for each in wordBags:
        wbs = wordBags[each]
        if len(wbs) < 1:
            #drop senses with no training contexts
            del locWords[each]
            continue
        locWords[each] = markLocations(wbs)
    #first loop from training algorithm
    #calculates P(v_j|s_k)
    # represented in python as Pbayes[s_k][v_j]
    # ie, the bayesian probability, given sense k, of vocab item j
    rawWords = reduce(lambda x, y: x + locWords[y], [[]] + list(locWords))
    words = histogram(rawWords)
    smooth = 0.5
    Pbayes = dict([(s, dict([(v, smooth) for v in words])) for s in locWords])
    for s_k in locWords:
        localCounts = histogram(locWords[s_k])
        for each in localCounts:
            Pbayes[s_k][each] += localCounts[each]
            Pbayes[s_k][each] = Pbayes[s_k][each]/words[each]
    #second loop from training algorithm: the sense priors P(s_k)
    Psense = {}
    totalSenstances = reduce(lambda x, y: x + len(locWords[y]), [0] + list(locWords))
    for s_k in locWords:
        Psense[s_k] = float(len(locWords[s_k]))/totalSenstances
    return lambda c: bayesDisambiguator(Pbayes, Psense, smooth, c)
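#bayesDisambiguator itself is defined elsewhere; the sketch below is an
#assumption about what the returned closure does, shown only to make the
#decision step concrete: the standard Naive Bayes rule
#  argmax over s_k of  log P(s_k) + sum over v_j in c of log P(v_j|s_k)
#with `smooth` as the fallback probability for vocabulary unseen with a sense
import math
def bayesDisambiguatorSketch(Pbayes, Psense, smooth, context):
    best = None
    for s_k in Pbayes:
        score = math.log(Psense[s_k])
        for v_j in context:
            score += math.log(Pbayes[s_k].get(v_j, smooth))
        if best is None or score > best[1]:
            best = (s_k, score)
    return best[0]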
def confMatrix(data, parm, target):
    #build a confusion matrix for the two values of parm (e.g. two annotators):
    # keys are (target value chosen by one, target value chosen by the other),
    # values are counts of items
    dataPairs = []
    dataByPos = indexBy("corpus_pos", data)
    for each in dataByPos:
        dataPairs.append(dict(map(lambda d: (d[parm], d[target]),\
                                  dataByPos[each])))
    #grab the possible values for parm
    parms = list(dataPairs[0])
    if len(parms) != 2:
        return "Fail--parm doesn't have two values"
    #grab all target values
    targetValues = map(lambda d: d[target], data)
    targetValues = list(set(targetValues))
    targetValues.sort()
    #total up
    matrix = dict([((v, w), 0) for v in targetValues for w in targetValues])
    for each in dataPairs:
        vals = tuple(map(lambda p: each[p], parms))
        matrix[vals] += 1
    return matrix
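#usage sketch for confMatrix (the rows, annotator names, and labels below are
#hypothetical -- they only illustrate the expected shape of the result):
#  rows = [{"corpus_pos": 1, "annotator": "A", "value": "NN"},
#          {"corpus_pos": 1, "annotator": "B", "value": "NN"},
#          {"corpus_pos": 2, "annotator": "A", "value": "JJ"},
#          {"corpus_pos": 2, "annotator": "B", "value": "NN"}]
#  confMatrix(rows, "annotator", "value")
#  -> {("NN", "NN"): 1, ("JJ", "NN"): 1, ("NN", "JJ"): 0, ("JJ", "JJ"): 0}
#keys pair one annotator's value with the other's; which annotator supplies the
#first coordinate depends on the (arbitrary) ordering of parms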
###################
data = loadTurkData(["partyNN.csv"])
cleanData = []
for each in data:
    each["token"] = each["token"].strip("<> \'\"")
    cleanData.append(each)

#Fleiss's kappa over the full pivot table
pivot = pivotize("corpus_pos", "value", cleanData)
print fKappa(pivot)

################
#Cohen's kappas#
################
items = indexBy("corpus_pos", cleanData)

#find pairs of annotators and pool the items each pair annotated
annotatorPairs = {}
for each in items:
    annotatorPair = map(lambda x: x["annotator"], items[each])
    annotatorPair.sort()
    if tuple(annotatorPair) not in annotatorPairs:
        annotatorPairs[tuple(annotatorPair)] = items[each]
    else:
        annotatorPairs[tuple(annotatorPair)] += items[each]

confusionMatrices = dict(map(lambda p: (p, confMatrix(annotatorPairs[p],\
                                                      "annotator", "value")),\
                             annotatorPairs))
#name the result cKappas so it doesn't shadow the cKappa function
cKappas = [(p, cKappa(confusionMatrices[p])) for p in confusionMatrices]
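#for reference, a sketch of Cohen's kappa computed directly from a matrix of
#the shape confMatrix returns (this is an assumption about what cKappa, defined
#elsewhere, computes): kappa = (P_o - P_e)/(1 - P_e), where P_o is observed
#agreement and P_e is the chance agreement implied by the annotators' marginals
def cohensKappaSketch(matrix):
    values = sorted(set([k[0] for k in matrix] + [k[1] for k in matrix]))
    total = float(sum(matrix.values()))
    #observed agreement: proportion of items on which the annotators agree
    observed = sum([matrix[(v, v)] for v in values])/total
    #chance agreement expected from each annotator's marginal distribution
    rowMarg = dict([(v, sum([matrix[(v, u)] for u in values])/total) for v in values])
    colMarg = dict([(v, sum([matrix[(u, v)] for u in values])/total) for v in values])
    expected = sum([rowMarg[v]*colMarg[v] for v in values])
    return (observed - expected)/(1 - expected)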