def filterQueries(queryCountFile, queryFile, trainFile, sessionFile,
                  minCount=15):
    """Print the normalized form of every query that passes the filter.

    A query read from ``queryFile`` is kept when it has an entry in the
    table loaded from ``queryCountFile`` and, in addition, at least one of
    the following holds: its value in that table exceeds ``minCount``, it
    appears in ``trainFile``, or it appears in ``sessionFile``.  Kept
    queries are passed through ``normalize`` with a Porter stemmer,
    de-duplicated, and printed one per line in arbitrary (set) order.

    Args:
        queryCountFile: path handed to ``loadFileInDict``.
        queryFile: path handed to ``loadFileInList``; the candidate queries.
        trainFile: path handed to ``loadFileInList``; queries kept
            regardless of their count.
        sessionFile: path handed to ``loadFileInList``; queries kept
            regardless of their count.
        minCount: value a query's count must exceed to be kept on count
            alone.  Defaults to 15, the previously hard-coded threshold.

    Returns:
        None.  The result is written to standard output.
    """
    queryCount = loadFileInDict(queryCountFile)
    toFilter = loadFileInList(queryFile)
    # Converted to sets once so the per-query membership tests below do not
    # rescan a list for every candidate.
    training = set(loadFileInList(trainFile))
    session = set(loadFileInList(sessionFile))
    stemmer = stem.porter.PorterStemmer()

    toPrint = set()
    for entry in toFilter:
        if entry not in queryCount:
            continue
        # NOTE(review): assumes loadFileInDict yields numeric counts; if the
        # values are strings this comparison does not mean what it looks
        # like -- confirm against the loader.
        if (queryCount[entry] > minCount or entry in training
                or entry in session):
            toPrint.add(normalize(entry, stemmer))

    for entry in toPrint:
        # Parenthesised single-argument form prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(entry)
def filterSessionWithQuery(fileName, queryFile):
    """Echo the lines of ``fileName`` whose query is listed in ``queryFile``.

    The query of a line is its first tab-separated field, stripped of
    surrounding whitespace.  A line is written to standard output unchanged
    when either that query, or the query with every one-character token
    removed, is among the entries loaded from ``queryFile``.

    Args:
        fileName: path of the tab-separated file to filter.
        queryFile: path handed to ``loadFileInList``; the accepted queries.

    Returns:
        None.  Matching lines are written to standard output.
    """
    # Function-scope import keeps this block self-contained.
    import sys

    # A set makes the two membership tests per line constant-time.
    queryList = set(loadFileInList(queryFile))

    # The context manager closes the file even if a line fails to process.
    with open(fileName, 'r') as sessionFile:
        for line in sessionFile:
            query = line.split('\t')[0].strip()
            # Same query with tokens of length <= 1 dropped.
            nQuery = ' '.join(
                token for token in query.split() if len(token) > 1)
            if nQuery in queryList or query in queryList:
                # The line still carries its own newline, so write it
                # verbatim instead of letting print append another one.
                sys.stdout.write(line)
def main(argv):
    """Link consecutive session queries in a ``SimpleWalk`` and run it.

    For every session produced by ``getSessionWithQuery(argv[1])`` the
    entries are first cleaned with ``removeWrongEntries`` against the list
    loaded from ``argv[2]``.  Each remaining entry is stemmed with
    ``stemQuery`` and an edge of weight 1.0 is added from every entry to
    the one that follows it.  Once all sessions are processed the walk is
    filtered with argument 2 and executed.

    Args:
        argv: argument vector; ``argv[1]`` is the session input and
            ``argv[2]`` the file loaded with ``loadFileInList``.

    Returns:
        None.
    """
    simpleWalk = SimpleWalk()
    top50 = loadFileInList(argv[2])
    porter = stem.porter.PorterStemmer()

    for rsession in getSessionWithQuery(argv[1]):
        session = removeWrongEntries(rsession, top50)
        if len(session) < 2:
            # Fewer than two entries: there is no consecutive pair to link.
            continue
        # Stem each entry once; the index-based loop this replaces stemmed
        # every interior entry twice (once as target, once as source).
        stemmed = [stemQuery(query, porter) for query in session]
        for source, target in zip(stemmed, stemmed[1:]):
            simpleWalk.addEdge(source, target, 1.0)

    simpleWalk.filter(2)
    simpleWalk.walk()
#category vector catVect = loadCategoryVector(argv[2]) catManage1 = CategoryManager(catVect, argv[3], Category) catManage2 = CategoryManager(catVect, argv[5], CategorySubcluster) #ranker ranker = Ranker() entExp1 = CatThesExpansion(dexter, catManage1, ranker, catCoMan, None) entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, None) oFile1 = open(argv[1][:argv[1].rfind('.')] + '_ent.txt', 'w') oFile2 = open(argv[1][:argv[1].rfind('.')] + '_entSub.txt', 'w') i = 0 porter = stem.porter.PorterStemmer() totalVocab = loadFileInList(argv[7]) stats = {'sess': 0.0, 'aTerms': 0.0, 'ent': 0.0, 'enta': 0.0, 'acount': 0.0} for session, documents, clicks, cTitle, scontents in getSessionWithXML( argv[1]): query = session[0] cText = None aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab) entStatus2, entExpTerms2 = entExp2.getTopSubclusters(query, cText, 1, 5) if entStatus2: stats['ent'] += 1 if len(aTerms) > 0: stats['aTerms'] += 1 if entStatus2:
def main(argv): #Scorer coSessOccur = CoOccurrence() coSessOcMan = CoOcManager(argv[2], coSessOccur, ' ') tScorer = CoOccurSimScore(coSessOcMan) cScorer = ScoreClusterTerms() #vocab = set() i = 0 prec = {} mrr = {} lim = 55 queryList = loadFileInList(argv[5]) termList, termDict = getTermList(queryList) print len(termList) added = 0 oracle_prec = 0.0 oracle_mrr = 0.0 for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML( argv[1]): query = session[0].strip() aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) if len(aTerms) > 0: prec1, mrr1 = getPrecRecall(termList, aTerms) added += 1.0 oracle_prec += prec1 oracle_mrr += mrr1 print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added porter = stem.porter.PorterStemmer() ttype = argv[6] print ttype for iFile in os.listdir(argv[3]): qclusters = loadClusters(argv[3] + '/' + iFile) clusters, clusIndex = toTerms(qclusters) print iFile, len(clusters) prec[iFile] = {} mrr[iFile] = {} added = 0.0 i = 1 for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML( argv[1]): i += 1 query = session[0].strip() qSet = getQueryTermsStemmed(query, porter) print 'Query ', query, qSet if ttype == 'query': aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) elif ttype == 'title': aTerms = getTerms(cTitle, qSet, termDict, porter, range( 1, len(session) - 1)) else: aTerms = getTerms(cTitle, qSet, termDict, porter, range( 1, len(session) - 1)) bTerms = getTerms(cSummary, qSet, termDict, porter, range( 1, len(session) - 1)) aTerms = aTerms | bTerms #aTerms,rTerms = addedAndRemovedTerms(query, session[1:], None ) if len(aTerms) > 0: terms = cScorer.scoreWithIndex(qSet, clusters, clusIndex, tScorer, lim) #terms = cScorer.scoreWithClustPos(qSet, clusters,tScorer, lim) print 'TERMS', '\t', i, '\t', ttype, '\t', iFile, '\t', len( terms), terms #for topk in range(1,lim,5): prec1, mrr1 = getClustPrecMrr(terms, aTerms) # returns a list print 'METRIC', iFile, i, 
prec1, mrr1 #print topk , prec1, mrr1 for topk in prec1.keys(): if topk not in prec[iFile]: prec[iFile][topk] = [] mrr[iFile][topk] = [] prec[iFile][topk].append(prec1[topk]) mrr[iFile][topk].append(mrr1[topk]) #prec[iFile][topk] += prec1 #mrr[iFile][topk] += mrr1 added += 1.0 #if i == 3: # break for fName, scoreDict in prec.items(): for pos in scoreDict.keys(): print 'Prec all', fName, pos, len(scoreDict[pos]) total = sum(scoreDict[pos]) prec[fName][pos] = total / added #len(scoreDict[pos]) print 'Prec', fName, pos, prec[fName][pos], total for fName, scoreDict in mrr.items(): for pos in scoreDict.keys(): print 'Mrr all', fName, pos, len(scoreDict[pos]) total = sum(mrr[fName][pos]) mrr[fName][pos] = total / added #len(scoreDict[pos]) print 'MRR', fName, pos, mrr[fName][pos], total #for entry in prec.keys(): #for t in prec[entry].keys(): #print 'Prec',entry, t, prec[entry][t], prec[entry][t]/added #prec[entry][t]/=added #for entry in mrr.keys(): #for t in mrr[entry].keys(): #print 'Mrr',entry, t, mrr[entry][t], mrr[entry][t]/added #mrr[entry][t]/=added print 'Plotting Precision and MRR' plotMultipleSys(prec, 'No of Terms', 'Prec', argv[4] + 'prec.png', 'Term Prediction Prec Plot') plotMultipleSys(mrr, 'No of Terms', 'MRR', argv[4] + 'mrr.png', 'Term Prediction MRR Plot')
def main(argv):
    """Load the queries to ignore, build user sessions, and write them out.

    ``argv[2]`` is loaded with ``loadFileInList`` (second argument 0),
    the result is passed with ``argv[1]`` to ``getSessionsByUsers``, and
    whatever that returns is handed to ``WriteCleanSessionsToFile``
    together with the constant 5 and ``argv[3]``.

    Args:
        argv: argument vector; indices 1, 2 and 3 are read.

    Returns:
        None.
    """
    # NOTE(review): the meaning of the literal 5 is not visible from here;
    # presumably a session-length limit -- confirm against the writer.
    ignored = loadFileInList(argv[2], 0)
    WriteCleanSessionsToFile(
        getSessionsByUsers(argv[1], ignored),
        5,
        argv[3],
    )
def main(argv): ipaddress = "localhost" # dexter object tagURL = "http://" + ipaddress + ":8080/rest/annotate" catURL = "http://" + ipaddress + ":8080/rest/graph/get-entity-categories" dexter = Dexter(tagURL, catURL, argv[5]) # load the Category co-occurrence bit catCoMan = CoOcManager(argv[4], CoOccurrence(), " ") # category vector catVect = loadCategoryVector(argv[2]) catManage1 = CategoryManager(catVect, argv[3], Category) catManage2 = CategoryManager(catVect, argv[7], CategorySubcluster) # ranker ranker = Ranker() totalVocab = loadFileInList(argv[6]) # task extraction # htcTask = TaskExpansion('Indexes/htcIndex',ranker,3000); qccTask = TaskExpansion("Indexes/qccIndex", ranker, 3000, totalVocab) # taskK = argv[5][argv[5].rfind('/')+1:] wordFeatMan = None # WordManager(argv[8],False); # expansion # entExp1 = CatThesExpansion(dexter, catManage1, ranker,catCoMan,wordFeatMan); entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, wordFeatMan) # term expansion coOccExp = CoOccurExpansion(catCoMan, None, ranker) # randomWalk # randWalk = RandomWalk(argv[2],argv[3],ranker) prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} """ sess_prec = {}; sess_mrr = {}; """ covered = {} i = 0 porter = stem.porter.PorterStemmer() ttype = argv[10] for session, doc, click, cTitle, cSummary in getSessionWithXML(argv[1]): query = session[0] qSet = getQueryTerms(query) # print 'Title, Summary clicked ',cTitle[0], cSummary[0]; aTerms = None # cText = normalize(' '.join(cTitle[0]),porter); if ttype == "query": aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab) elif ttype == "title": aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1)) else: aTerms = getTerms(cTitle, qSet, totalVocab, porter, 
range(1, len(session) - 1)) bTerms = getTerms(cSummary, qSet, totalVocab, porter, range(1, len(session) - 1)) aTerms = aTerms | bTerms print i, "Query", query, aTerms, len(aTerms) if len(aTerms) > 0: # and query not in covered: covered[query] = 1 coExpTerms = coOccExp.expandTextWithStep(query, 0, 55, 5) # entStatus1, entExpTerms1 = entExp1.expandTextWithStep(query,'',1,0,55,5); entStatus1, entExpTerms2 = entExp2.expandTextWithStepAndSubcluster(query, "", 1, 0, 55, 5) qccTaskTerms = qccTask.expandTextWithStep(query, 0, 55, 5) # htcTaskTerms = htcTask.expandTextWithStep(query,0,55,5) # randExpTerms = randWalk.expandTextWithStep(query,55,105,5) if not entStatus1: print i, "Ent False", query # addLen = getBand(len(aTerms)); # if addLen not in sess_prec: # sess_prec[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # sess_mrr[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # for noTerms in entExpTerms1.keys(): # print 'ETerms\t',i,'\t',query,'\t',entExpTerms1[noTerms],'\t',noTerms; # prec1 , mrr1 = getPrecRecall(entExpTerms1[noTerms],aTerms); # prec = updateStats(noTerms, 'ent',prec1, prec); # mrr = updateStats(noTerms, 'ent',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'ent',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'ent',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); # print 'EMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1; # for noTerms in entExpTerms2.keys(): print "ESubTerms\t", i, "\t", query, "\t", entExpTerms2[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(entExpTerms2[noTerms], aTerms) prec = updateStats(noTerms, "entSub", prec1, prec) mrr = updateStats(noTerms, "entSub", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "entSub", prec1, ent_prec) ent_mrr = updateStats(noTerms, "entSub", mrr1, ent_mrr) # sess_prec[addLen] = updateStats(noTerms, 
'ent',prec1, sess_prec[addLen]) # sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); print "ESubMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 for noTerms in qccTaskTerms.keys(): print "qccTaskTerms\t", i, "\t", query, "\t", qccTaskTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(qccTaskTerms[noTerms], aTerms) prec = updateStats(noTerms, "qccTask", prec1, prec) mrr = updateStats(noTerms, "qccTask", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "qccTask", prec1, ent_prec) ent_mrr = updateStats(noTerms, "qccTask", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'qccTask',prec1, sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'qccTask',mrr1, sess_mrr[addLen]); """ print "qccTaskMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 # for noTerms in htcTaskTerms.keys(): # print 'htcTaskTerms\t',i,'\t',query,'\t',htcTaskTerms[noTerms],'\t',noTerms # prec1 , mrr1 = getPrecRecall(htcTaskTerms[noTerms],aTerms) # prec = updateStats(noTerms, 'htcTask',prec1, prec) # mrr = updateStats(noTerms, 'htcTask',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'htcTask',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'htcTask',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'htcTask',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'htcTask',mrr1, sess_mrr[addLen]); # # print 'htcTaskMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1 for noTerms in coExpTerms.keys(): print "CoTerms\t", i, "\t", query, "\t", coExpTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(coExpTerms[noTerms], aTerms) prec = updateStats(noTerms, "co", prec1, prec) mrr = updateStats(noTerms, "co", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "co", prec1, ent_prec) ent_mrr = updateStats(noTerms, "co", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'co',prec1, 
sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'co' ,mrr1, sess_mrr[addLen]); """ print "CoMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 else: pass # print 'NO ADDED TERMS in', i; i += 1 printMetric(prec, "entSub", "Prec") printMetric(mrr, "entSub", "Mrr") printMetric(prec, "ent", "Prec") printMetric(mrr, "ent", "Mrr") printMetric(prec, "htcTask", "Prec") printMetric(mrr, "htcTask", "Mrr") printMetric(prec, "qccTask", "Prec") printMetric(mrr, "qccTask", "Mrr") printMetric(prec, "co", "Prec") printMetric(mrr, "co", "Mrr") printMetric(ent_prec, "entSub", "EntPrec") printMetric(ent_mrr, "entSub", "EntMrr") printMetric(ent_prec, "ent", "EntPrec") printMetric(ent_mrr, "ent", "EntMrr") printMetric(ent_prec, "htcTask", "EntPrec") printMetric(ent_mrr, "htcTask", "EntMrr") printMetric(ent_prec, "qccTask", "EntPrec") printMetric(ent_mrr, "qccTask", "EntMrr") printMetric(ent_prec, "co", "EntPrec") printMetric(ent_mrr, "co", "EntMrr") plotMultipleSys( prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "prec.png", "Term Prediction Prec Plot", ) plotMultipleSys( mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "mrr.png", "Term Prediction MRR Plot", ) plotMultipleSys( ent_prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_prec.png", "Term Prediction Prec Plot (Ent queries)", ) plotMultipleSys( ent_mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_mrr.png", "Term Prediction MRR Plot (Ent queries)", ) # htcTask.closeIndex(); qccTask.closeIndex() """