def main(argv): #Scorer coSessOccur = CoOccurrence() coSessOcMan = CoOcManager(argv[2], coSessOccur, ' ') tScorer = CoOccurSimScore(coSessOcMan) cScorer = ScoreClusterTerms() #vocab = set() i = 0 prec = {} mrr = {} lim = 55 queryList = loadFileInList(argv[5]) termList, termDict = getTermList(queryList) print len(termList) added = 0 oracle_prec = 0.0 oracle_mrr = 0.0 for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML( argv[1]): query = session[0].strip() aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) if len(aTerms) > 0: prec1, mrr1 = getPrecRecall(termList, aTerms) added += 1.0 oracle_prec += prec1 oracle_mrr += mrr1 print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added porter = stem.porter.PorterStemmer() ttype = argv[6] print ttype for iFile in os.listdir(argv[3]): qclusters = loadClusters(argv[3] + '/' + iFile) clusters, clusIndex = toTerms(qclusters) print iFile, len(clusters) prec[iFile] = {} mrr[iFile] = {} added = 0.0 i = 1 for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML( argv[1]): i += 1 query = session[0].strip() qSet = getQueryTermsStemmed(query, porter) print 'Query ', query, qSet if ttype == 'query': aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) elif ttype == 'title': aTerms = getTerms(cTitle, qSet, termDict, porter, range( 1, len(session) - 1)) else: aTerms = getTerms(cTitle, qSet, termDict, porter, range( 1, len(session) - 1)) bTerms = getTerms(cSummary, qSet, termDict, porter, range( 1, len(session) - 1)) aTerms = aTerms | bTerms #aTerms,rTerms = addedAndRemovedTerms(query, session[1:], None ) if len(aTerms) > 0: terms = cScorer.scoreWithIndex(qSet, clusters, clusIndex, tScorer, lim) #terms = cScorer.scoreWithClustPos(qSet, clusters,tScorer, lim) print 'TERMS', '\t', i, '\t', ttype, '\t', iFile, '\t', len( terms), terms #for topk in range(1,lim,5): prec1, mrr1 = getClustPrecMrr(terms, aTerms) # returns a list print 'METRIC', iFile, i, prec1, mrr1 #print topk , prec1, mrr1 for topk in prec1.keys(): if topk not in prec[iFile]: prec[iFile][topk] = [] mrr[iFile][topk] = [] prec[iFile][topk].append(prec1[topk]) mrr[iFile][topk].append(mrr1[topk]) #prec[iFile][topk] += prec1 #mrr[iFile][topk] += mrr1 added += 1.0 #if i == 3: # break for fName, scoreDict in prec.items(): for pos in scoreDict.keys(): print 'Prec all', fName, pos, len(scoreDict[pos]) total = sum(scoreDict[pos]) prec[fName][pos] = total / added #len(scoreDict[pos]) print 'Prec', fName, pos, prec[fName][pos], total for fName, scoreDict in mrr.items(): for pos in scoreDict.keys(): print 'Mrr all', fName, pos, len(scoreDict[pos]) total = sum(mrr[fName][pos]) mrr[fName][pos] = total / added #len(scoreDict[pos]) print 'MRR', fName, pos, mrr[fName][pos], total #for entry in prec.keys(): #for t in prec[entry].keys(): #print 'Prec',entry, t, prec[entry][t], prec[entry][t]/added #prec[entry][t]/=added #for entry in mrr.keys(): #for t in mrr[entry].keys(): #print 'Mrr',entry, t, mrr[entry][t], mrr[entry][t]/added #mrr[entry][t]/=added print 'Plotting Precision and MRR' plotMultipleSys(prec, 'No of Terms', 'Prec', argv[4] + 'prec.png', 'Term Prediction Prec Plot') plotMultipleSys(mrr, 'No of Terms', 'MRR', argv[4] + 'mrr.png', 'Term Prediction MRR Plot')
def main(argv): rare_prec = {'ent': {}, 'qccTask': {}, 'htcTask': {}, 'co': {}, 'entSub': {}} rare_mrr = {'ent': {}, 'qccTask': {}, 'htcTask': {}, 'co': {}, 'entSub': {}} freq_prec = {'ent': {}, 'qccTask': {}, 'htcTask': {}, 'co': {}, 'entSub': {}} freq_mrr = {'ent': {}, 'qccTask': {}, 'htcTask': {}, 'co': {}, 'entSub': {}} freqList = set() rareList = set() for line in open(argv[1], 'r'): split = line.split('\t') qid = int(split[0]) freq = float(split[-1]) if freq > 0: freqList.add(qid) else: rareList.add(qid) entFalse = set() for line in open(argv[2], 'r'): if 'Ent False' in line: entFalse.add(int(line[:line.find(' ')])) print entFalse if 'Metrics' in line: osplit = line.split('\t') split = osplit[0].split() approach = split[0][:split[0].rfind('Metrics')].lower() if approach == 'e': approach = 'ent' if approach == 'esub': approach = 'entSub' if approach == 'qcctask': approach = 'qccTask' if approach == 'htctask': approach = 'htcTask' qid = int(split[1]) noTerms = int(osplit[1]) prec = float(osplit[-2]) mrr = float(osplit[-1]) if qid not in entFalse: #print qid; if qid in freqList: freq_prec = updateStats(noTerms, approach, prec, freq_prec) freq_mrr = updateStats(noTerms, approach, mrr, freq_mrr) else: rare_prec = updateStats(noTerms, approach, prec, rare_prec) rare_mrr = updateStats(noTerms, approach, mrr, rare_mrr) printMetric(freq_prec, 'entSub', 'FreqPrec') printMetric(freq_mrr, 'entSub', 'FreqMrr') printMetric(freq_prec, 'ent', 'FreqPrec') printMetric(freq_mrr, 'ent', 'FreqMrr') printMetric(freq_prec, 'htcTask', 'FreqPrec') printMetric(freq_mrr, 'htcTask', 'FreqMrr') printMetric(freq_prec, 'qccTask', 'FreqPrec') printMetric(freq_mrr, 'qccTask', 'FreqMrr') printMetric(freq_prec, 'co', 'FreqPrec') printMetric(freq_mrr, 'co', 'FreqMrr') printMetric(rare_prec, 'entSub', 'RarePrec') printMetric(rare_mrr, 'entSub', 'RareMrr') printMetric(rare_prec, 'ent', 'RarePrec') printMetric(rare_mrr, 'ent', 'RareMrr') printMetric(rare_prec, 'htcTask', 'RarePrec') printMetric(rare_mrr, 'htcTask', 'RareMrr') printMetric(rare_prec, 'qccTask', 'RarePrec') printMetric(rare_mrr, 'qccTask', 'RareMrr') printMetric(rare_prec, 'co', 'RarePrec') printMetric(rare_mrr, 'co', 'RareMrr') plotMultipleSys(freq_prec, 'No of Terms', 'Prec', argv[3] + '_freq_prec.png', 'Term Prediction Prec Plot') plotMultipleSys(freq_mrr, 'No of Terms', 'MRR', argv[3] + '_freq_mrr.png', 'Term Prediction MRR Plot') plotMultipleSys(rare_prec, 'No of Terms', 'Prec', argv[3] + '_rare_prec.png', 'Term Prediction Prec Plot ') plotMultipleSys(rare_mrr, 'No of Terms', 'MRR', argv[3] + '_rare_mrr.png', 'Term Prediction MRR Plot ') #completedDocs = {}; #for session, docs, clicks in getSessionWithXML(argv[1]): #for i , docList in docs.items(): #for docId in docList: #if docId not in completedDocs: #completedDocs[docId] = 1.0; # # #for entry in completedDocs.keys(): #print entry; # # print session #get the entries for a particular query #parts = int(argv[2]) #index = int(argv[3]) #files = os.listdir(argv[1]) #outFolder = argv[4] #oFile = open(outFolder+'/'+ifile,'w') #strt = index*(len(files)/parts) #end = (index+1)*(len(files)/parts) #for i in range(strt,end): #ifile = files[i] '''stemmer = porter.PorterStemmer(); queryFreq = {}; for line in open(argv[1],'r'): split = line.split('\t'); query = split[0].strip(); sQuery = normalize(query, stemmer); freq = float(split[1]); queryFreq[sQuery] = freq; toPrint = {}; qid = 1; for session, doc, click, cTitle, cSummary in getSessionWithXML(argv[2]): oquery = session[0]; query = normalize(oquery, stemmer); if query in queryFreq: toPrint[str(qid)+'\t'+query] = queryFreq[query]; else: toPrint[str(qid)+'\t'+query] = 0; qid+=1; sort = sorted(toPrint.items() , reverse = True , key = lambda x : x[1]); for entry in sort: print entry[0],'\t', entry[1]; # ##print getDocumentText('clueweb12-0817wb-00-27979','/media/Data/TREC_Session_Doc/cluewebdocs12/') ''' '''done = {}
def main(argv): ipaddress = "localhost" # dexter object tagURL = "http://" + ipaddress + ":8080/rest/annotate" catURL = "http://" + ipaddress + ":8080/rest/graph/get-entity-categories" dexter = Dexter(tagURL, catURL, argv[5]) # load the Category co-occurrence bit catCoMan = CoOcManager(argv[4], CoOccurrence(), " ") # category vector catVect = loadCategoryVector(argv[2]) catManage1 = CategoryManager(catVect, argv[3], Category) catManage2 = CategoryManager(catVect, argv[7], CategorySubcluster) # ranker ranker = Ranker() totalVocab = loadFileInList(argv[6]) # task extraction # htcTask = TaskExpansion('Indexes/htcIndex',ranker,3000); qccTask = TaskExpansion("Indexes/qccIndex", ranker, 3000, totalVocab) # taskK = argv[5][argv[5].rfind('/')+1:] wordFeatMan = None # WordManager(argv[8],False); # expansion # entExp1 = CatThesExpansion(dexter, catManage1, ranker,catCoMan,wordFeatMan); entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, wordFeatMan) # term expansion coOccExp = CoOccurExpansion(catCoMan, None, ranker) # randomWalk # randWalk = RandomWalk(argv[2],argv[3],ranker) prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} """ sess_prec = {}; sess_mrr = {}; """ covered = {} i = 0 porter = stem.porter.PorterStemmer() ttype = argv[10] for session, doc, click, cTitle, cSummary in getSessionWithXML(argv[1]): query = session[0] qSet = getQueryTerms(query) # print 'Title, Summary clicked ',cTitle[0], cSummary[0]; aTerms = None # cText = normalize(' '.join(cTitle[0]),porter); if ttype == "query": aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab) elif ttype == "title": aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1)) else: aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1)) bTerms = getTerms(cSummary, qSet, totalVocab, porter, range(1, len(session) - 1)) aTerms = aTerms | bTerms print i, "Query", query, aTerms, len(aTerms) if len(aTerms) > 0: # and query not in covered: covered[query] = 1 coExpTerms = coOccExp.expandTextWithStep(query, 0, 55, 5) # entStatus1, entExpTerms1 = entExp1.expandTextWithStep(query,'',1,0,55,5); entStatus1, entExpTerms2 = entExp2.expandTextWithStepAndSubcluster(query, "", 1, 0, 55, 5) qccTaskTerms = qccTask.expandTextWithStep(query, 0, 55, 5) # htcTaskTerms = htcTask.expandTextWithStep(query,0,55,5) # randExpTerms = randWalk.expandTextWithStep(query,55,105,5) if not entStatus1: print i, "Ent False", query # addLen = getBand(len(aTerms)); # if addLen not in sess_prec: # sess_prec[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # sess_mrr[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # for noTerms in entExpTerms1.keys(): # print 'ETerms\t',i,'\t',query,'\t',entExpTerms1[noTerms],'\t',noTerms; # prec1 , mrr1 = getPrecRecall(entExpTerms1[noTerms],aTerms); # prec = updateStats(noTerms, 'ent',prec1, prec); # mrr = updateStats(noTerms, 'ent',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'ent',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'ent',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); # print 'EMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1; # for noTerms in entExpTerms2.keys(): print "ESubTerms\t", i, "\t", query, "\t", entExpTerms2[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(entExpTerms2[noTerms], aTerms) prec = updateStats(noTerms, "entSub", prec1, prec) mrr = updateStats(noTerms, "entSub", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "entSub", prec1, ent_prec) ent_mrr = updateStats(noTerms, "entSub", mrr1, ent_mrr) # sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen]) # sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); print "ESubMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 for noTerms in qccTaskTerms.keys(): print "qccTaskTerms\t", i, "\t", query, "\t", qccTaskTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(qccTaskTerms[noTerms], aTerms) prec = updateStats(noTerms, "qccTask", prec1, prec) mrr = updateStats(noTerms, "qccTask", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "qccTask", prec1, ent_prec) ent_mrr = updateStats(noTerms, "qccTask", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'qccTask',prec1, sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'qccTask',mrr1, sess_mrr[addLen]); """ print "qccTaskMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 # for noTerms in htcTaskTerms.keys(): # print 'htcTaskTerms\t',i,'\t',query,'\t',htcTaskTerms[noTerms],'\t',noTerms # prec1 , mrr1 = getPrecRecall(htcTaskTerms[noTerms],aTerms) # prec = updateStats(noTerms, 'htcTask',prec1, prec) # mrr = updateStats(noTerms, 'htcTask',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'htcTask',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'htcTask',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'htcTask',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'htcTask',mrr1, sess_mrr[addLen]); # # print 'htcTaskMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1 for noTerms in coExpTerms.keys(): print "CoTerms\t", i, "\t", query, "\t", coExpTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(coExpTerms[noTerms], aTerms) prec = updateStats(noTerms, "co", prec1, prec) mrr = updateStats(noTerms, "co", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "co", prec1, ent_prec) ent_mrr = updateStats(noTerms, "co", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'co',prec1, sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'co' ,mrr1, sess_mrr[addLen]); """ print "CoMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 else: pass # print 'NO ADDED TERMS in', i; i += 1 printMetric(prec, "entSub", "Prec") printMetric(mrr, "entSub", "Mrr") printMetric(prec, "ent", "Prec") printMetric(mrr, "ent", "Mrr") printMetric(prec, "htcTask", "Prec") printMetric(mrr, "htcTask", "Mrr") printMetric(prec, "qccTask", "Prec") printMetric(mrr, "qccTask", "Mrr") printMetric(prec, "co", "Prec") printMetric(mrr, "co", "Mrr") printMetric(ent_prec, "entSub", "EntPrec") printMetric(ent_mrr, "entSub", "EntMrr") printMetric(ent_prec, "ent", "EntPrec") printMetric(ent_mrr, "ent", "EntMrr") printMetric(ent_prec, "htcTask", "EntPrec") printMetric(ent_mrr, "htcTask", "EntMrr") printMetric(ent_prec, "qccTask", "EntPrec") printMetric(ent_mrr, "qccTask", "EntMrr") printMetric(ent_prec, "co", "EntPrec") printMetric(ent_mrr, "co", "EntMrr") plotMultipleSys( prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "prec.png", "Term Prediction Prec Plot", ) plotMultipleSys( mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "mrr.png", "Term Prediction MRR Plot", ) plotMultipleSys( ent_prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_prec.png", "Term Prediction Prec Plot (Ent queries)", ) plotMultipleSys( ent_mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_mrr.png", "Term Prediction MRR Plot (Ent queries)", ) # htcTask.closeIndex(); qccTask.closeIndex() """