def loadPairs(queryId, fileName, labeledPoints, pairLabels):
    """Read a tab-separated label file and record the id pairs found on each line.

    Every tab-separated entry of a line is normalized with ``filterWords`` and
    looked up in ``queryId``; entries without an id are ignored.  The ids found
    on one line are sorted and handed to ``generatePairsFromList``, and every
    pair it yields is added to ``pairLabels``.

    Args:
        queryId: dict mapping a normalized query string to its id.
        fileName: path of the tab-separated file to read.
        labeledPoints: set, updated in place with every id seen in the file.
        pairLabels: set, updated in place with every generated pair.

    Returns:
        None; results are delivered through ``labeledPoints`` and
        ``pairLabels``.
    """
    # Use a context manager so the handle is closed even if a helper raises.
    with open(fileName, 'r') as labelFile:
        for line in labelFile:
            cpoints = set()
            for entry in line.split('\t'):
                normQuery = filterWords(entry.strip())
                if normQuery in queryId:
                    qid = queryId[normQuery]
                    labeledPoints.add(qid)
                    cpoints.add(qid)
            # Sorting makes the pairs built for the same ids come out the same
            # way regardless of the order the entries appear on the line.
            pairLabels.update(generatePairsFromList(sorted(cpoints)))
def _buildQueryIds(queryList):
    """Assign a sequential string id ('1', '2', ...) to each distinct query."""
    queryId = {}
    nextId = 1
    with open(queryList, 'r') as queryFile:
        for line in queryFile:
            split = line.split('\t')
            # Column 7 holds a Python literal; rows where it is empty are skipped.
            typeList = ast.literal_eval(split[7])
            if len(typeList) > 0:
                query = filterWords(split[0].strip())
                if query not in queryId:
                    queryId[query] = str(nextId)
                    nextId += 1
    return queryId


def _loadPredictedPairs(queryId, predictedLabelFile, labeledPoints, knownPairs):
    """Collect predicted pairs, restricted to pairs present in ``knownPairs``."""
    predictedPairs = set()
    with open(predictedLabelFile, 'r') as predictedFile:
        for line in predictedFile:
            line = line.strip()
            cpoints = set()
            if len(line) > 0 and 'NO CLUST' not in line:
                for entry in line.split('\t'):
                    # Entries without an id are skipped.  ``get`` replaces the
                    # former bare ``except`` so that unrelated errors are no
                    # longer silently swallowed.
                    qid = queryId.get(filterWords(entry.strip()))
                    if qid in labeledPoints:
                        cpoints.add(qid)
            for pair in generatePairsFromList(sorted(cpoints)):
                if pair in knownPairs:
                    predictedPairs.add(pair)
    return predictedPairs


def calculateIndiciesFromFiles(trueLabelFile, differentPairFile, predictedLabelFile, queryList):
    """Compute pair-counting confusion values for a predicted grouping.

    Query ids are built from ``queryList``.  Pairs loaded from
    ``trueLabelFile`` are treated as "same" pairs and pairs loaded from
    ``differentPairFile`` as "different" pairs (both via ``loadPairs``).
    Pairs derived from each line of ``predictedLabelFile`` are kept only when
    they occur in one of those two labeled sets.

    Args:
        trueLabelFile: path of the tab-separated file of "same" groups.
        differentPairFile: path of the tab-separated file of "different" groups.
        predictedLabelFile: path of the tab-separated predicted groups; blank
            lines and lines containing 'NO CLUST' contribute no pairs.
        queryList: path of the tab-separated query file; column 0 is the query
            text and column 7 a Python literal parsed with
            ``ast.literal_eval``.

    Returns:
        Tuple ``(tp, fp, fn, tn, total_pairs)`` where ``total_pairs`` is the
        number of "same" pairs plus the number of "different" pairs.

    Side effects:
        Prints two summary lines of counts to standard output.
    """
    queryId = _buildQueryIds(queryList)

    l_samePairs = set()
    l_points = set()
    l_diffPairs = set()
    loadPairs(queryId, trueLabelFile, l_points, l_samePairs)
    loadPairs(queryId, differentPairFile, l_points, l_diffPairs)
    total_pairs = len(l_samePairs) + len(l_diffPairs)

    p_samePairs = _loadPredictedPairs(
        queryId, predictedLabelFile, l_points, l_samePairs | l_diffPairs)

    # Single-argument print(...) emits the same text under Python 2 and 3.
    print('%d %d %d' % (len(l_samePairs), len(p_samePairs), total_pairs))

    tp = len(l_samePairs & p_samePairs)
    fp = len(p_samePairs & l_diffPairs)
    fn = len(l_samePairs) - tp
    tn = len(l_diffPairs) - fp
    print('%d %d %d %d %d' % (tp, fp, fn, tn, total_pairs))
    return tp, fp, fn, tn, total_pairs