def __init__(self, catVect, catDir, cclass):
    """Set up category state for this object.

    catVect -- pre-loaded category vectors, stored as-is on catVectors
    catDir  -- directory handed to getCats() to build catFileDict
    cclass  -- category classifier/class object, stored on catClass
    """
    # Map of category -> file, discovered from the category directory.
    self.catFileDict = getCats(catDir)
    self.catVectors = catVect
    # Cache for per-category objects, filled lazily elsewhere.
    self.catObjDict = {}
    self.catClass = cclass
def getStatsPerQuery(argv): tagURL = 'http://localhost:8080/rest/annotate' catURL = 'http://localhost:8080/rest/graph/get-entity-categories' catVector = loadCategoryVector(argv[3]) f1Dict = getCats(argv[2]) sFound = 0.0 sTotal = 0.0 eTotal = set() eRemov = set() catFoundNoTerm = set() catNotFound = set() catTermFound = set() catEntity = set() outfile = open('match_session_dom.txt', 'w') #categoryVectors = {} for session in getSessionWithNL(argv[1]): catCount = {} entCount = {} querySpotList = {} for query in session: #find the entities in query try: spotDict = None #tagQueryWithDexter(query, tagURL,catURL) querySpotList[query] = spotDict for text in spotDict.keys(): for entry in spotDict[text]['cat'].split(): catCount[entry] = catCount.setdefault(entry, 1) + 1 entCount[text] = entCount.setdefault(text, 1) + 1 except Exception as err: print err #print 'SESSION', session, 'CATCOUNT', catCount, 'ENTCOUNT',entCount found = False if len(catCount) > 0: #find the dominant entity maxEnt = max(entCount.values()) #sessionQueryMapping = {} for query, spotList in querySpotList.iteritems(): matchl = spotList.keys() for entry in matchl: eTotal.add(entry) if entCount[entry] < maxEnt: spotList.pop(entry, None) print 'Removing spot', query, entry eRemov.add(entry) else: #get the categories #catTermMatch = {} rquery = query.replace(entry, '') queryTerms = set(rquery.split()) for cat in spotList[entry]['cat'].lower().split(): catEntity.add(entry + '_' + cat) if cat in f1Dict: phrase1 = loadPhrasesWithScore(argv[2] + '/' + f1Dict[cat]) pVector = catVector[cat] queryDict = getDictFromSet(queryTerms) pTotal = sum(phrase1.values()) pset = set(phrase1.keys()) sint = pset & queryTerms score = 0.0 cscore = get_cosine(queryDict, pVector) for iphrase in sint: score += phrase1[iphrase] / pTotal if len(queryTerms) > 0: score *= (1.0 * len(sint)) / len(queryTerms) if sint: outfile.write(query + '\t' + entry + '\t' + cat + '\t' + str(cscore) + '\t' + ', '.join(sint) + '\n') found = True 
catTermFound.add(entry + '_' + cat) else: outfile.write(query + '\t' + entry + '\t' + cat + '\t0\t0\n') catFoundNoTerm.add(cat + '_' + entry) else: outfile.write( query + '\t' + entry + '\t' + cat + '\t0\tNOT FOUND\n') catNotFound.add(cat + '_' + entry) #load the terms for category #check if these terms match if found: sFound += 1 sTotal += 1 outfile.write('\n') print 'Total Sessions ', sTotal print 'Sessions with dominant entity in AOL', sFound print '# Unique Entities', len(eTotal) print '# Removed Entities (non dominant)', len(eRemov) print '# no of entity types', len(catEntity) print '# no of entity types with terms match ', len(catTermFound) print '# no of entity types with no term match', len(catFoundNoTerm) print '# no of entity types with no match in AOL', len(catNotFound)