def kFoldEvaluation(k, sessFile, featFile, weightFile, percent, typeFile): sessions = loadSessions(sessFile) #weightMatrix = readWeightMatrix(weightFile) # #p1 = {} #r1 = {} #p2 = {} #r2 = {} #p3 = {} #r3 = {} #p4 = {} #r4 = {} # amean = [] ymean = [] for i in range(k): x, y, uniqx, uniqy = sampleSessions(sessions, percent) acount = 0.0 ylen = 0.0 termList, termDict = getTermList(uniqx) for session in y: aTerms, rTerms = addedAndRemovedTerms(session[0], session[1:], termDict) acount += len(aTerms) ylen += len(session) print acount, ylen, acount / len(y), ylen / len(y) amean.append(acount / len(y)) ymean.append(ylen / len(y)) print np.mean(amean), np.mean(ymean)
def predictTerms(queryList, y, qclusters): termList, termDict = getTermList(queryList) oracle_prec = 0.0 oracle_mrr = 0.0 added = 0 cScorer = ScoreClusterTerms() for session in y: query = session[0] aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) if len(aTerms) > 0: prec1, mrr1 = getPrecRecall(termList, aTerms) added += 1.0 oracle_prec += prec1 oracle_mrr += mrr1 print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added, added #porter = stem.porter.PorterStemmer(); clusters, clusIndex = toTerms(qclusters) lim = 5 i = 0 prec = {} mrr = {} pf = 0.0 pr = 0.0 for session in y: query = session[0].strip() qSet = getQueryTerms(query) #getQueryTermsStemmed(query, porter); aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) if len(aTerms) > 0: terms = cScorer.scoreWithCosine(qSet, clusters, clusIndex, lim) if len(terms) > 0: #print len(aTerms), len(terms) prec1, mrr1 = getClustPrecRecall(terms, aTerms) # returns a list #print 'METRIC',i, prec1, mrr1 #print topk , prec1, mrr1 if sum(prec1) > 0: pf += 1.0 if sum(mrr1) > 0: pr += 1.0 for topk in range(len(prec1)): if topk not in prec: prec[topk] = [] mrr[topk] = [] prec[topk].append(prec1[topk]) mrr[topk].append(mrr1[topk]) i += 1 retPrec = {} retRecall = {} for entry, ls in prec.items(): print 'Prec @', entry, np.mean(ls) retPrec[entry] = np.mean(ls) for entry, ls in mrr.items(): print 'Recall @', entry, np.mean(ls) retRecall[entry] = np.mean(ls) print 'Percentage ', pf / i, pr / i return retPrec, retRecall
# NOTE(review): incomplete fragment — the enclosing function's `def` header is
# outside this view and the triple-quoted region opened near the end
# ('''cText = ...) never closes here, so this line cannot be safely
# reconstructed. Left byte-identical; appears to tally, per session, how often
# entity subcluster expansion fires versus how often terms were added.
entExp1 = CatThesExpansion(dexter, catManage1, ranker, catCoMan, None) entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, None) oFile1 = open(argv[1][:argv[1].rfind('.')] + '_ent.txt', 'w') oFile2 = open(argv[1][:argv[1].rfind('.')] + '_entSub.txt', 'w') i = 0 porter = stem.porter.PorterStemmer() totalVocab = loadFileInList(argv[7]) stats = {'sess': 0.0, 'aTerms': 0.0, 'ent': 0.0, 'enta': 0.0, 'acount': 0.0} for session, documents, clicks, cTitle, scontents in getSessionWithXML( argv[1]): query = session[0] cText = None aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab) entStatus2, entExpTerms2 = entExp2.getTopSubclusters(query, cText, 1, 5) if entStatus2: stats['ent'] += 1 if len(aTerms) > 0: stats['aTerms'] += 1 if entStatus2: stats['enta'] += 1.0 stats['sess'] += 1.0 stats['acount'] += len(aTerms) '''cText = normalize(' '.join(cTitle[0]),porter); i+=1
def main(argv): #Scorer coSessOccur = CoOccurrence() coSessOcMan = CoOcManager(argv[2], coSessOccur, ' ') tScorer = CoOccurSimScore(coSessOcMan) cScorer = ScoreClusterTerms() #vocab = set() i = 0 prec = {} mrr = {} lim = 55 queryList = loadFileInList(argv[5]) termList, termDict = getTermList(queryList) print len(termList) added = 0 oracle_prec = 0.0 oracle_mrr = 0.0 for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML( argv[1]): query = session[0].strip() aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) if len(aTerms) > 0: prec1, mrr1 = getPrecRecall(termList, aTerms) added += 1.0 oracle_prec += prec1 oracle_mrr += mrr1 print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added porter = stem.porter.PorterStemmer() ttype = argv[6] print ttype for iFile in os.listdir(argv[3]): qclusters = loadClusters(argv[3] + '/' + iFile) clusters, clusIndex = toTerms(qclusters) print iFile, len(clusters) prec[iFile] = {} mrr[iFile] = {} added = 0.0 i = 1 for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML( argv[1]): i += 1 query = session[0].strip() qSet = getQueryTermsStemmed(query, porter) print 'Query ', query, qSet if ttype == 'query': aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) elif ttype == 'title': aTerms = getTerms(cTitle, qSet, termDict, porter, range( 1, len(session) - 1)) else: aTerms = getTerms(cTitle, qSet, termDict, porter, range( 1, len(session) - 1)) bTerms = getTerms(cSummary, qSet, termDict, porter, range( 1, len(session) - 1)) aTerms = aTerms | bTerms #aTerms,rTerms = addedAndRemovedTerms(query, session[1:], None ) if len(aTerms) > 0: terms = cScorer.scoreWithIndex(qSet, clusters, clusIndex, tScorer, lim) #terms = cScorer.scoreWithClustPos(qSet, clusters,tScorer, lim) print 'TERMS', '\t', i, '\t', ttype, '\t', iFile, '\t', len( terms), terms #for topk in range(1,lim,5): prec1, mrr1 = getClustPrecMrr(terms, aTerms) # returns a list print 'METRIC', iFile, i, 
prec1, mrr1 #print topk , prec1, mrr1 for topk in prec1.keys(): if topk not in prec[iFile]: prec[iFile][topk] = [] mrr[iFile][topk] = [] prec[iFile][topk].append(prec1[topk]) mrr[iFile][topk].append(mrr1[topk]) #prec[iFile][topk] += prec1 #mrr[iFile][topk] += mrr1 added += 1.0 #if i == 3: # break for fName, scoreDict in prec.items(): for pos in scoreDict.keys(): print 'Prec all', fName, pos, len(scoreDict[pos]) total = sum(scoreDict[pos]) prec[fName][pos] = total / added #len(scoreDict[pos]) print 'Prec', fName, pos, prec[fName][pos], total for fName, scoreDict in mrr.items(): for pos in scoreDict.keys(): print 'Mrr all', fName, pos, len(scoreDict[pos]) total = sum(mrr[fName][pos]) mrr[fName][pos] = total / added #len(scoreDict[pos]) print 'MRR', fName, pos, mrr[fName][pos], total #for entry in prec.keys(): #for t in prec[entry].keys(): #print 'Prec',entry, t, prec[entry][t], prec[entry][t]/added #prec[entry][t]/=added #for entry in mrr.keys(): #for t in mrr[entry].keys(): #print 'Mrr',entry, t, mrr[entry][t], mrr[entry][t]/added #mrr[entry][t]/=added print 'Plotting Precision and MRR' plotMultipleSys(prec, 'No of Terms', 'Prec', argv[4] + 'prec.png', 'Term Prediction Prec Plot') plotMultipleSys(mrr, 'No of Terms', 'MRR', argv[4] + 'mrr.png', 'Term Prediction MRR Plot')
def getStatsPerSession(catVector, f1Dict, argv): tagURL = 'http://localhost:8080/rest/annotate' catURL = 'http://localhost:8080/rest/graph/get-entity-categories' print 'Cats ', len(f1Dict) #stats sStat = {'ef': 0, 'total': 0, 'aTerms': 0} #eStat = {'total':set(), 'remov':set()} catStat = {'nfTerm': set(), 'nf': set(), 'tf': set(), 'total': set()} outfile = open('match_session_' + str(argv[4]) + '.txt', 'w') #categoryVectors = {} #load the session arTotal = 0.0 apTotal = 0.0 for session in getSessionWithNL(argv[1]): bQuery = session[0].lower() bQuery = re.sub(SYMB, ' ', bQuery) bQuery = re.sub('\s+', ' ', bQuery).strip() aTerms, rTerms = addedAndRemovedTerms(bQuery, session[1:]) arMax = 0.0 apMax = 0.0 try: spotDict = None #tagQueryWithDexter(bQuery, tagURL,catURL) time.sleep(1) if aTerms: sStat['aTerms'] += 1.0 if len(spotDict) > 0: sStat['ef'] += 1.0 print 'Found Entity \t', '\t'.join(session) for entry in spotDict.keys(): rquery = bQuery.replace(entry, '') queryTerms = set(rquery.split()) catList = spotDict[entry]['cat'].lower().split() #notFound, maxCat, rDict = getPrecRecall('avg',catList,f1Dict,catVector, queryTerms, argv[2]) #print 'Avg', notFound, rDict notFound, maxCat, rDict = getPrecRecall( 'max', catList, f1Dict, catVector, queryTerms, aTerms, int( argv[4])) print 'Max', bQuery, 'Ent', entry, 'Cat', maxCat, 'NFC', notFound, rDict nf = 0 for centry in catList: catStat['total'].add(centry + '_' + entry) if centry in notFound: catStat['nf'].add(centry + '_' + entry) nf += 1.0 else: if rDict and len(rDict['qInt']) == 0: catStat['nfTerm'].add(centry + '_' + entry) if nf == len(catList): print 'For Query', bQuery, 'With ent list', spotDict.keys( ), 'for ENT', entry, 'No cat found' if rDict: #to choose the type with max values if arMax < rDict['aR']: arMax = rDict['aR'] if apMax < rDict['aP']: apMax = rDict['aP'] outfile.write( bQuery + '\t' + entry + '\t' + str(rDict['qS']) + '\t' + ', '.join(rDict['qInt']) + '\t' + ', '.join(rDict['aInt']) + '\t' + 
str(rDict['aR']) + '\t' + str(rDict['aP']) + '\n') #else: # outfile.write(bQuery+'\tNOT\tNOT\tNOT\tNO TERMS\n') except Exception as err: print 'SESSION WITH ERR', session, err, err.args if aTerms: print 'Prec ', argv[4], bQuery, '\t', apMax for query in session[1:]: outfile.write(query + '\tNIL\t0.0\tNIL\tNIL\t0.0\t0.0\n') sStat['total'] += 1 outfile.write('\n') apTotal += apMax arTotal += arMax print 'Total Sessions ', sStat['total'] print 'Sessions with entity in AOL', sStat['ef'] print '# no of entity types', len(catStat['total']) #print '# no of entity types with terms match ', len(catStat['tf']) print '# no of entity types present but no qterm match', len( catStat['nfTerm']) print '# no of entity types not present in AOL', len(catStat['nf']) if sStat['ef'] > 0: print argv[4], '\t', 'Prec', apTotal / sStat['ef' ], 'Recall', arTotal / sStat['ef'] print argv[4], '\t', 'Prec', apTotal / sStat[ 'aTerms' ], 'Recall', arTotal / sStat['aTerms']
def main(argv): ipaddress = "localhost" # dexter object tagURL = "http://" + ipaddress + ":8080/rest/annotate" catURL = "http://" + ipaddress + ":8080/rest/graph/get-entity-categories" dexter = Dexter(tagURL, catURL, argv[5]) # load the Category co-occurrence bit catCoMan = CoOcManager(argv[4], CoOccurrence(), " ") # category vector catVect = loadCategoryVector(argv[2]) catManage1 = CategoryManager(catVect, argv[3], Category) catManage2 = CategoryManager(catVect, argv[7], CategorySubcluster) # ranker ranker = Ranker() totalVocab = loadFileInList(argv[6]) # task extraction # htcTask = TaskExpansion('Indexes/htcIndex',ranker,3000); qccTask = TaskExpansion("Indexes/qccIndex", ranker, 3000, totalVocab) # taskK = argv[5][argv[5].rfind('/')+1:] wordFeatMan = None # WordManager(argv[8],False); # expansion # entExp1 = CatThesExpansion(dexter, catManage1, ranker,catCoMan,wordFeatMan); entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, wordFeatMan) # term expansion coOccExp = CoOccurExpansion(catCoMan, None, ranker) # randomWalk # randWalk = RandomWalk(argv[2],argv[3],ranker) prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} """ sess_prec = {}; sess_mrr = {}; """ covered = {} i = 0 porter = stem.porter.PorterStemmer() ttype = argv[10] for session, doc, click, cTitle, cSummary in getSessionWithXML(argv[1]): query = session[0] qSet = getQueryTerms(query) # print 'Title, Summary clicked ',cTitle[0], cSummary[0]; aTerms = None # cText = normalize(' '.join(cTitle[0]),porter); if ttype == "query": aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab) elif ttype == "title": aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1)) else: aTerms = getTerms(cTitle, qSet, totalVocab, porter, 
range(1, len(session) - 1)) bTerms = getTerms(cSummary, qSet, totalVocab, porter, range(1, len(session) - 1)) aTerms = aTerms | bTerms print i, "Query", query, aTerms, len(aTerms) if len(aTerms) > 0: # and query not in covered: covered[query] = 1 coExpTerms = coOccExp.expandTextWithStep(query, 0, 55, 5) # entStatus1, entExpTerms1 = entExp1.expandTextWithStep(query,'',1,0,55,5); entStatus1, entExpTerms2 = entExp2.expandTextWithStepAndSubcluster(query, "", 1, 0, 55, 5) qccTaskTerms = qccTask.expandTextWithStep(query, 0, 55, 5) # htcTaskTerms = htcTask.expandTextWithStep(query,0,55,5) # randExpTerms = randWalk.expandTextWithStep(query,55,105,5) if not entStatus1: print i, "Ent False", query # addLen = getBand(len(aTerms)); # if addLen not in sess_prec: # sess_prec[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # sess_mrr[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # for noTerms in entExpTerms1.keys(): # print 'ETerms\t',i,'\t',query,'\t',entExpTerms1[noTerms],'\t',noTerms; # prec1 , mrr1 = getPrecRecall(entExpTerms1[noTerms],aTerms); # prec = updateStats(noTerms, 'ent',prec1, prec); # mrr = updateStats(noTerms, 'ent',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'ent',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'ent',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); # print 'EMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1; # for noTerms in entExpTerms2.keys(): print "ESubTerms\t", i, "\t", query, "\t", entExpTerms2[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(entExpTerms2[noTerms], aTerms) prec = updateStats(noTerms, "entSub", prec1, prec) mrr = updateStats(noTerms, "entSub", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "entSub", prec1, ent_prec) ent_mrr = updateStats(noTerms, "entSub", mrr1, ent_mrr) # sess_prec[addLen] = updateStats(noTerms, 
'ent',prec1, sess_prec[addLen]) # sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); print "ESubMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 for noTerms in qccTaskTerms.keys(): print "qccTaskTerms\t", i, "\t", query, "\t", qccTaskTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(qccTaskTerms[noTerms], aTerms) prec = updateStats(noTerms, "qccTask", prec1, prec) mrr = updateStats(noTerms, "qccTask", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "qccTask", prec1, ent_prec) ent_mrr = updateStats(noTerms, "qccTask", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'qccTask',prec1, sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'qccTask',mrr1, sess_mrr[addLen]); """ print "qccTaskMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 # for noTerms in htcTaskTerms.keys(): # print 'htcTaskTerms\t',i,'\t',query,'\t',htcTaskTerms[noTerms],'\t',noTerms # prec1 , mrr1 = getPrecRecall(htcTaskTerms[noTerms],aTerms) # prec = updateStats(noTerms, 'htcTask',prec1, prec) # mrr = updateStats(noTerms, 'htcTask',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'htcTask',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'htcTask',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'htcTask',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'htcTask',mrr1, sess_mrr[addLen]); # # print 'htcTaskMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1 for noTerms in coExpTerms.keys(): print "CoTerms\t", i, "\t", query, "\t", coExpTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(coExpTerms[noTerms], aTerms) prec = updateStats(noTerms, "co", prec1, prec) mrr = updateStats(noTerms, "co", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "co", prec1, ent_prec) ent_mrr = updateStats(noTerms, "co", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'co',prec1, 
sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'co' ,mrr1, sess_mrr[addLen]); """ print "CoMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 else: pass # print 'NO ADDED TERMS in', i; i += 1 printMetric(prec, "entSub", "Prec") printMetric(mrr, "entSub", "Mrr") printMetric(prec, "ent", "Prec") printMetric(mrr, "ent", "Mrr") printMetric(prec, "htcTask", "Prec") printMetric(mrr, "htcTask", "Mrr") printMetric(prec, "qccTask", "Prec") printMetric(mrr, "qccTask", "Mrr") printMetric(prec, "co", "Prec") printMetric(mrr, "co", "Mrr") printMetric(ent_prec, "entSub", "EntPrec") printMetric(ent_mrr, "entSub", "EntMrr") printMetric(ent_prec, "ent", "EntPrec") printMetric(ent_mrr, "ent", "EntMrr") printMetric(ent_prec, "htcTask", "EntPrec") printMetric(ent_mrr, "htcTask", "EntMrr") printMetric(ent_prec, "qccTask", "EntPrec") printMetric(ent_mrr, "qccTask", "EntMrr") printMetric(ent_prec, "co", "EntPrec") printMetric(ent_mrr, "co", "EntMrr") plotMultipleSys( prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "prec.png", "Term Prediction Prec Plot", ) plotMultipleSys( mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "mrr.png", "Term Prediction MRR Plot", ) plotMultipleSys( ent_prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_prec.png", "Term Prediction Prec Plot (Ent queries)", ) plotMultipleSys( ent_mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_mrr.png", "Term Prediction MRR Plot (Ent queries)", ) # htcTask.closeIndex(); qccTask.closeIndex() """