def main(argv):
    ttCoOc = CoOccurrence()
    teCoOc = ObjCoOccurrence()
    ecCoOc = ObjCoOccurrence()
    termEntMan = CoOcManager(argv[1], teCoOc, '\t')
    entCatMan = CoOcManager(argv[2], ecCoOc, '\t')
    termTermMan = CoOcManager(argv[3], ttCoOc, ' ')
    ipaddress = 'localhost'
    # Dexter object: REST endpoints for entity tagging and category lookup
    tagURL = 'http://' + ipaddress + ':8080/rest/annotate'
    catURL = 'http://' + ipaddress + ':8080/rest/graph/get-entity-categories'
    dexter = Dexter(tagURL, catURL)
    et = ProbExpansion(termTermMan, termEntMan, entCatMan, dexter, Ranker())
    # expand the first query of every session and print the top 50 expansion terms
    for i, session in enumerate(getSessionWithNL(argv[4])):
        query = session[0]
        terms = et.expandText(query, 50)
        print i, '\t', query, '\t', terms


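# Illustrative entry point (an assumption; the original invocation is not shown here).
# main() reads the term-entity, entity-category and term-term co-occurrence files from
# argv[1]-argv[3] and the session file from argv[4], so passing sys.argv through
# directly matches that layout.
if __name__ == '__main__':
    import sys
    main(sys.argv)

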
def getStatsPerSession(catVector, f1Dict, argv):
    tagURL = 'http://localhost:8080/rest/annotate'
    catURL = 'http://localhost:8080/rest/graph/get-entity-categories'
    print 'Cats ', len(f1Dict)
    # per-session and per-category statistics
    sStat = {'ef': 0, 'total': 0, 'aTerms': 0}
    #eStat = {'total': set(), 'remov': set()}
    catStat = {'nfTerm': set(), 'nf': set(), 'tf': set(), 'total': set()}
    outfile = open('match_session_' + str(argv[4]) + '.txt', 'w')
    #categoryVectors = {}
    # load the sessions
    arTotal = 0.0
    apTotal = 0.0
    for session in getSessionWithNL(argv[1]):
        bQuery = session[0].lower()
        bQuery = re.sub(SYMB, ' ', bQuery)
        bQuery = re.sub('\s+', ' ', bQuery).strip()
        aTerms, rTerms = addedAndRemovedTerms(bQuery, session[1:])
        arMax = 0.0
        apMax = 0.0
        try:
            # tag the base query with Dexter (throttled to one request per second)
            spotDict = tagQueryWithDexter(bQuery, tagURL, catURL)
            time.sleep(1)
            if aTerms:
                sStat['aTerms'] += 1.0
                if len(spotDict) > 0:
                    sStat['ef'] += 1.0
                    print 'Found Entity \t', '\t'.join(session)
                    for entry in spotDict.keys():
                        rquery = bQuery.replace(entry, '')
                        queryTerms = set(rquery.split())
                        catList = spotDict[entry]['cat'].lower().split()
                        #notFound, maxCat, rDict = getPrecRecall('avg', catList, f1Dict, catVector, queryTerms, argv[2])
                        #print 'Avg', notFound, rDict
                        notFound, maxCat, rDict = getPrecRecall(
                            'max', catList, f1Dict, catVector, queryTerms,
                            aTerms, int(argv[4]))
                        print 'Max', bQuery, 'Ent', entry, 'Cat', maxCat, 'NFC', notFound, rDict
                        nf = 0
                        for centry in catList:
                            catStat['total'].add(centry + '_' + entry)
                            if centry in notFound:
                                catStat['nf'].add(centry + '_' + entry)
                                nf += 1.0
                            elif rDict and len(rDict['qInt']) == 0:
                                catStat['nfTerm'].add(centry + '_' + entry)
                        if nf == len(catList):
                            print 'For Query', bQuery, 'With ent list', spotDict.keys(), \
                                'for ENT', entry, 'No cat found'
                        if rDict:
                            # keep the category with the highest precision/recall for this session
                            if arMax < rDict['aR']:
                                arMax = rDict['aR']
                            if apMax < rDict['aP']:
                                apMax = rDict['aP']
                            outfile.write(bQuery + '\t' + entry + '\t' + str(rDict['qS']) +
                                          '\t' + ', '.join(rDict['qInt']) +
                                          '\t' + ', '.join(rDict['aInt']) +
                                          '\t' + str(rDict['aR']) +
                                          '\t' + str(rDict['aP']) + '\n')
                        #else:
                        #    outfile.write(bQuery + '\tNOT\tNOT\tNOT\tNO TERMS\n')
        except Exception as err:
            print 'SESSION WITH ERR', session, err, err.args
        if aTerms:
            print 'Prec ', argv[4], bQuery, '\t', apMax
        for query in session[1:]:
            outfile.write(query + '\tNIL\t0.0\tNIL\tNIL\t0.0\t0.0\n')
        sStat['total'] += 1
        outfile.write('\n')
        apTotal += apMax
        arTotal += arMax
    print 'Total Sessions ', sStat['total']
    print 'Sessions with entity in AOL', sStat['ef']
    print '# no of entity types', len(catStat['total'])
    #print '# no of entity types with terms match ', len(catStat['tf'])
    print '# no of entity types present but no qterm match', len(catStat['nfTerm'])
    print '# no of entity types not present in AOL', len(catStat['nf'])
    if sStat['ef'] > 0:
        # macro-averages over sessions with a tagged entity and over all sessions with added terms
        print argv[4], '\t', 'Prec', apTotal / sStat['ef'], 'Recall', arTotal / sStat['ef']
        print argv[4], '\t', 'Prec', apTotal / sStat['aTerms'], 'Recall', arTotal / sStat['aTerms']


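# Hypothetical wiring sketch (not part of the original module): getStatsPerSession()
# expects the category vectors and the category-to-file index to be loaded up front,
# the same way getStatsPerQuery() below builds them from argv[3] and argv[2]. The
# helper name runSessionStats is made up for illustration.
def runSessionStats(argv):
    catVector = loadCategoryVector(argv[3])   # category -> term-weight vector
    f1Dict = getCats(argv[2])                 # category -> phrase-file name
    getStatsPerSession(catVector, f1Dict, argv)

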
def findMarkovStats(argv):
    iFile = argv[1]
    wikiIndexDir = argv[2]
    queryIndexDir = argv[3]
    wIndex, wsearcher = loadIndex(wikiIndexDir, wikiIndexDir)
    qIndex, qsearcher = loadIndex(queryIndexDir, queryIndexDir)
    wtlc = loadCollector(wsearcher, 2000, 20)
    qtlc = loadCollector(qsearcher, 2000, 20)
    qqp = loadQueryParser(qIndex, 'session')
    wqp = loadQueryParser(wIndex, 'content')
    prec = {}
    recall = {}
    count = 0.0
    for session in getSessionWithNL(iFile):
        # normalise the first query of the session
        query = session[0].lower()
        query = re.sub(SYMB, ' ', query)
        query = re.sub('\d+', ' ', query)
        query = re.sub('\s+', ' ', query).strip()
        aTerms, bTerms = addedAndRemovedTerms(query, session)
        if aTerms:
            count += 1.0
            totalNetwork = {}
            #stemNetwork = {}
            #queryNetwork = {}
            #wikiNetwork = {}
            # build the term network from the Wikipedia index and the query-log index
            terms = updateNetwork(query, totalNetwork, wqp, wsearcher, wtlc, 'content', 'wiki')
            terms2 = updateNetwork(query, totalNetwork, qqp, qsearcher, qtlc, 'session', 'query')
            print len(terms), len(terms2)
            #updateStemNetwork(queryNetwork, stemNetwork, porter)
            #updateStemNetwork(wikiNetwork, stemNetwork, porter)
            updateStemNetwork(totalNetwork)
            #normalizeNetworks(queryNetwork)
            #normalizeNetworks(stemNetwork)
            #normalizeNetworks(wikiNetwork)
            # calculate the mixtures at two stages
            stage1 = {}
            stage2 = {}
            combineNetwork(1.0, stage1, totalNetwork, 'stem')
            combineNetwork(0.5, stage2, totalNetwork, 'query')
            combineNetwork(0.5, stage2, totalNetwork, 'wiki')
            # convert into matrices for multiplication
            totalDim = sorted(list(set(stage1.keys()) | set(stage2.keys())))
            dim = len(totalDim)
            if dim > 0:
                stage1Matrix = toMatrix(totalDim, stage1)
                print 'STAGE1', stage1Matrix[0], stage1Matrix.shape
                stage2Matrix = toMatrix(totalDim, stage2)
                print 'STAGE2', stage2Matrix[0], stage2Matrix.shape
                # uniform background matrix for smoothing
                backSmooth = 1.0 / len(totalDim)
                stage3Matrix = numpy.zeros((dim, dim))
                stage3Matrix.fill(backSmooth)
                print 'STAGE3', stage3Matrix[0], stage3Matrix.shape
                alpha = 0.80
                #matrix = ['stage2', 'stage2', 'stage2', 'stage2', 'stage2', 'stage2', 'stage2', 'stage2', 'stage3']
                matrix = ['stage1', 'stage2', 'stage2', 'stage2', 'stage3']
                totalSum = numpy.zeros((dim, dim))
                cK = numpy.ones((dim, dim))
                # start the walk: totalSum = (1 - alpha) * sum_k alpha^k * C_k
                for k in range(len(matrix)):
                    print k, matrix[k]
                    if matrix[k] == 'stage1':
                        cK = numpy.dot(stage1Matrix, cK)
                    elif matrix[k] == 'stage2':
                        cK = numpy.dot(stage2Matrix, cK)
                    else:
                        cK = numpy.dot(cK, stage3Matrix)
                    print 'CK', cK[0]
                    totalSum = totalSum + (math.pow(alpha, k) * cK)
                totalSum = totalSum * (1 - alpha)
                # rank terms by their walk score towards the query terms
                qList = []
                terms = query.split()  #getQueryTerms(query)
                for term in terms:
                    if term in totalDim:
                        qList.append(totalDim.index(term))
                    else:
                        print 'ERROR did not find ', query, '\t', term, len(totalDim)
                termScore = {}
                for i in range(len(totalDim)):
                    termScore[totalDim[i]] = 0.0
                    for j in qList:
                        if totalSum[i][j] > 0.0:
                            termScore[totalDim[i]] += math.log(totalSum[i][j])
                # find precision and recall of the top-k candidate terms against the added terms
                sortTerms = sorted(termScore.iteritems(), reverse=True, key=lambda x: x[1])
                for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, '10000']:
                    try:
                        # the final entry '10000' forces the except branch, i.e. all candidate terms
                        cTerms = set([x[0] for x in sortTerms[:i]])
                        print 'CTERMS ', sortTerms[0:10], len(cTerms), 'ATERMS', aTerms
                        p = len(aTerms & cTerms) / (len(cTerms) * 1.0)  # fraction of candidates actually added
                        r = len(aTerms & cTerms) / (len(aTerms) * 1.0)  # fraction of added terms recovered
                        prec[i] = prec.setdefault(i, 0.0) + p
                        recall[i] = recall.setdefault(i, 0.0) + r
                        print 'Prec', i, '\t', query, '\t', p
                    except Exception as err:
                        cTerms = set([x[0] for x in sortTerms])
                        p = len(aTerms & cTerms) / (len(cTerms) * 1.0)
                        r = len(aTerms & cTerms) / (len(aTerms) * 1.0)
                        prec[i] = prec.setdefault(i, 0.0) + p
                        recall[i] = recall.setdefault(i, 0.0) + r
                        print 'Prec', i, '\t', query, '\t', p
        else:
            for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, '10000']:
                print 'Prec', i, '\t', query, '\t', 0.0
    # average and print precision and recall over the counted sessions
    print 'Printing Precision'
    for entry, value in prec.iteritems():
        print entry, value / count
    print 'Printing Recall'
    for entry, value in recall.iteritems():
        print entry, value / count
    wIndex.close()
    qIndex.close()


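# Toy illustration (made-up 2x2 matrix, not used by findMarkovStats) of the damped walk
# computed above: totalSum = (1 - alpha) * sum_k alpha^k * C_k, where C_k chains the
# stage transition matrices one step at a time.
def _toyDampedWalk():
    P = numpy.array([[0.9, 0.1], [0.2, 0.8]])    # hypothetical term-to-term transitions
    alpha = 0.80
    cK = numpy.ones((2, 2))
    totalSum = numpy.zeros((2, 2))
    for k in range(5):                           # five steps, mirroring the 'matrix' schedule
        cK = numpy.dot(P, cK)                    # advance the walk by one step
        totalSum = totalSum + (alpha ** k) * cK  # geometrically damp longer walks
    return totalSum * (1 - alpha)

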
def getStatsPerQuery(argv):
    tagURL = 'http://localhost:8080/rest/annotate'
    catURL = 'http://localhost:8080/rest/graph/get-entity-categories'
    catVector = loadCategoryVector(argv[3])
    f1Dict = getCats(argv[2])
    sFound = 0.0
    sTotal = 0.0
    eTotal = set()
    eRemov = set()
    catFoundNoTerm = set()
    catNotFound = set()
    catTermFound = set()
    catEntity = set()
    outfile = open('match_session_dom.txt', 'w')
    #categoryVectors = {}
    for session in getSessionWithNL(argv[1]):
        catCount = {}
        entCount = {}
        querySpotList = {}
        for query in session:
            # find the entities in the query
            try:
                spotDict = tagQueryWithDexter(query, tagURL, catURL)
                querySpotList[query] = spotDict
                for text in spotDict.keys():
                    for entry in spotDict[text]['cat'].split():
                        catCount[entry] = catCount.setdefault(entry, 1) + 1
                    entCount[text] = entCount.setdefault(text, 1) + 1
            except Exception as err:
                print err
        #print 'SESSION', session, 'CATCOUNT', catCount, 'ENTCOUNT', entCount
        found = False
        if len(catCount) > 0:
            # find the dominant entity of the session
            maxEnt = max(entCount.values())
            #sessionQueryMapping = {}
            for query, spotList in querySpotList.iteritems():
                matchl = spotList.keys()
                for entry in matchl:
                    eTotal.add(entry)
                    if entCount[entry] < maxEnt:
                        # drop spots that are not the dominant entity
                        spotList.pop(entry, None)
                        print 'Removing spot', query, entry
                        eRemov.add(entry)
                    else:
                        # score the query terms against each category of the dominant entity
                        #catTermMatch = {}
                        rquery = query.replace(entry, '')
                        queryTerms = set(rquery.split())
                        for cat in spotList[entry]['cat'].lower().split():
                            catEntity.add(entry + '_' + cat)
                            if cat in f1Dict:
                                phrase1 = loadPhrasesWithScore(argv[2] + '/' + f1Dict[cat])
                                pVector = catVector[cat]
                                queryDict = getDictFromSet(queryTerms)
                                pTotal = sum(phrase1.values())
                                pset = set(phrase1.keys())
                                sint = pset & queryTerms
                                score = 0.0
                                cscore = get_cosine(queryDict, pVector)
                                for iphrase in sint:
                                    score += phrase1[iphrase] / pTotal
                                if len(queryTerms) > 0:
                                    score *= (1.0 * len(sint)) / len(queryTerms)
                                if sint:
                                    outfile.write(query + '\t' + entry + '\t' + cat + '\t' +
                                                  str(cscore) + '\t' + ', '.join(sint) + '\n')
                                    found = True
                                    catTermFound.add(entry + '_' + cat)
                                else:
                                    outfile.write(query + '\t' + entry + '\t' + cat + '\t0\t0\n')
                                    catFoundNoTerm.add(cat + '_' + entry)
                            else:
                                outfile.write(query + '\t' + entry + '\t' + cat +
                                              '\t0\tNOT FOUND\n')
                                catNotFound.add(cat + '_' + entry)
                            #load the terms for the category
                            #check if these terms match
        if found:
            sFound += 1
        sTotal += 1
        outfile.write('\n')
    print 'Total Sessions ', sTotal
    print 'Sessions with dominant entity in AOL', sFound
    print '# Unique Entities', len(eTotal)
    print '# Removed Entities (non dominant)', len(eRemov)
    print '# no of entity types', len(catEntity)
    print '# no of entity types with terms match ', len(catTermFound)
    print '# no of entity types with no term match', len(catFoundNoTerm)
    print '# no of entity types with no match in AOL', len(catNotFound)


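# Minimal sketch (an assumption; not the module's actual get_cosine) of a dict-based
# cosine similarity between the query term counts and a category term-weight vector,
# which is how the cscore written by getStatsPerQuery above is interpreted here.
def _cosineSketch(vec1, vec2):
    common = set(vec1.keys()) & set(vec2.keys())
    num = sum(vec1[t] * vec2[t] for t in common)
    den = math.sqrt(sum(v * v for v in vec1.values())) * \
          math.sqrt(sum(v * v for v in vec2.values()))
    return num / den if den else 0.0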