def main(argv):
    # For each query: get unigrams and update pairwise co-occurrence
    # statistics, weighted by the query frequency.
    coOccur = CoOccurrence()
    stemmer = stem.porter.PorterStemmer()
    for line in open(argv[1], 'r'):
        split = line.strip().split('\t')
        query = normalize(split[0].strip(), stemmer)
        freq = int(split[1].strip())
        # generate ngrams
        ngrams = getNGramsAsList(query, 1)
        # if it has more than one term
        lngrams = len(ngrams)
        if lngrams > 1:
            for i in range(lngrams - 1):
                if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
                    for j in range(i + 1, lngrams):
                        if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
                            coOccur.updateStats(ngrams[i], ngrams[j], freq)
    coOccur.setTermTotal()

    # For each query, find the terms it co-occurs with most strongly.
    for line in open(argv[2], 'r'):
        split = line.split('\t')
        query = normalize(split[1].lower().strip(), stemmer)
        nGrams = getNGramsAsList(query, 1)
        toScore = set()
        result = {}
        for entry in nGrams:
            elist = coOccur.getNeighbours(entry)
            if elist:
                toScore |= set(elist)
        for term1 in toScore:
            if term1 not in query:
                # average PMI of the candidate term against all query terms
                result[term1] = 0.0
                for term2 in nGrams:
                    pmi = coOccur.getPMI(term1, term2, 50)
                    result[term1] += pmi
                result[term1] /= len(nGrams)
        # drop candidates that never cleared the PMI threshold
        for entry in result.keys():
            if result[entry] == 0:
                del result[entry]
        sort = sorted(result.items(), reverse=True, key=lambda x: x[1])
        print query, '\t', '\t'.join('{0}:{1}'.format(x[0], round(x[1], 3))
                                     for x in sort[:50])
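# The CoOccurrence class used above comes from elsewhere in the project; the
# sketch below is a hypothetical stand-in (names and internals are assumptions,
# not the project's implementation) showing the interface main() relies on:
# updateStats() accumulates frequency-weighted pair counts, setTermTotal()
# fixes per-term and overall totals, getNeighbours() lists co-occurring terms,
# and getPMI(x, y, minCount) returns pointwise mutual information, or 0.0 when
# the pair was seen fewer than minCount times.
import math
from collections import defaultdict

class CoOccurrenceSketch(object):
    def __init__(self):
        self.pairCount = defaultdict(float)   # c(x, y)
        self.termCount = defaultdict(float)   # c(x)
        self.total = 0.0                      # N

    def updateStats(self, t1, t2, freq):
        self.pairCount[(min(t1, t2), max(t1, t2))] += freq

    def setTermTotal(self):
        for (t1, t2), c in self.pairCount.items():
            self.termCount[t1] += c
            self.termCount[t2] += c
            self.total += c

    def getNeighbours(self, term):
        return [b if a == term else a
                for (a, b) in self.pairCount if term in (a, b)]

    def getPMI(self, t1, t2, minCount):
        c = self.pairCount.get((min(t1, t2), max(t1, t2)), 0.0)
        if c < minCount or not self.termCount[t1] or not self.termCount[t2]:
            return 0.0
        return math.log((c * self.total) /
                        (self.termCount[t1] * self.termCount[t2]))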
def normalizeDict(idict, stemmer):
    returnDict = {}
    #tSum = sum(idict.values())*1.0
    for entry, count in idict.items():
        entry = normalize(entry, stemmer)
        if len(entry) > 2:
            try:
                returnDict[entry] += count  #/tSum
            except KeyError:
                returnDict[entry] = count  #/tSum
    return returnDict
def mergeFeatures(featFile, taggedFile, newFile):
    entFeatDict = {}
    stemmer = stem.porter.PorterStemmer()
    for line in open(taggedFile, 'r'):
        spotDict = ast.literal_eval(line.strip())
        normQuery = normalize(spotDict['text'], stemmer)
        if normQuery not in entFeatDict:
            entFeatDict[normQuery] = []
        # convert the entity and category annotations into dictionaries
        entVect = {}
        catVect = {}
        for entry in spotDict['spots']:
            entVect[entry['wikiname']] = 1.0
            cats = ast.literal_eval(entry['cat'])
            for cat in cats:
                if cat not in catVect:
                    catVect[cat] = 0.0
                catVect[cat] += 1.0
        entFeatDict[normQuery].append(entVect)
        entFeatDict[normQuery].append(catVect)
    print len(entFeatDict)

    featDict = {}
    outF = open(newFile, 'w')
    for line in open(featFile, 'r'):
        split = line.split('\t')
        query = split[0].strip()
        featDict[query] = []
        for entry in split[1:]:
            featDict[query].append(entry.strip())
        if query in entFeatDict:
            featDict[query] = featDict[query] + entFeatDict[query]
        else:
            featDict[query] = featDict[query] + [{}, {}]
            print 'Query not tagged! ', query
        outF.write(query)
        for entry in featDict[query]:
            outF.write('\t' + str(entry))
        outF.write('\n')
    outF.close()
def generatePhraseStats(labelFile):
    phrases = {}
    stemmer = stem.porter.PorterStemmer()
    entQuery = 0.0
    for line in open(labelFile, 'r'):
        try:
            entList = []
            spotDict = ast.literal_eval(line)
            text = spotDict['text']
            for entry in spotDict['spots']:
                mention = entry['mention']
                text = text.replace(mention, '<entity>').strip()
                entList.append(entry['wikiname'])
            if len(text) > 2:
                split = text.split('<entity>')
                if len(split) == 1:
                    entQuery += 1.0
                for entry in split:
                    entry = normalize(entry, stemmer).strip()
                    if entry not in phrases:
                        phrases[entry] = {}
                    for entity in entList:
                        if entity not in phrases[entry]:
                            phrases[entry][entity] = 0.0
                        phrases[entry][entity] += 1.0
                    #phrases[entry]+=1.0
        except:
            pass
            #print line
    print entQuery
    words = {}
    for entry in sorted(phrases.items(), reverse=True, key=lambda x: len(x[1])):
        split = entry[0].split()
        wlen = len(split)
        if wlen not in words:
            words[wlen] = 0.0
        words[wlen] += 1.0
        print entry
    for entry, count in words.items():
        print entry, count
def generatePhrases(labelFile):
    stemmer = stem.porter.PorterStemmer()
    for line in open(labelFile, 'r'):
        pList = []
        spotDict = ast.literal_eval(line)
        query = spotDict['text']
        text = spotDict['text']
        for entry in spotDict['spots']:
            mention = entry['mention']
            text = text.replace(mention, '<entity>').strip()
        if len(text) > 2:
            split = text.split('<entity>')
            for entry in split:
                entry = normalize(entry, stemmer).strip()
                if len(entry) > 1:
                    pList.append(entry)
        yield query, pList
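# Hypothetical usage sketch for generatePhrases(): the function name below is
# an illustrative assumption, not part of the original module.
def printPhrases(labelFile):
    # dump each raw query alongside the non-entity phrases extracted from it
    for query, phrases in generatePhrases(labelFile):
        print query, '\t', ' | '.join(phrases)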
def filterQueries(queryCountFile, queryFile, trainFile, sessionFile):
    queryCount = loadFileInDict(queryCountFile)
    #print len(queryCount)
    toPrint = set()
    toFilter = loadFileInList(queryFile)
    training = loadFileInList(trainFile)
    session = loadFileInList(sessionFile)
    stemmer = stem.porter.PorterStemmer()
    for entry in toFilter:
        if (entry in queryCount) and ((queryCount[entry] > 15) or
                                      (entry in training) or (entry in session)):
            entry1 = normalize(entry, stemmer)
            toPrint.add(entry1)
            #print entry, '\t', queryCount[entry]
    for entry in toPrint:
        print entry
def main(argv):
    # load the co-occurrence terms
    coOccurTermList = loadDictFromFile(argv[2], '\t', ':')
    # for each query: find the entities and score the categories
    ipaddress = 'localhost'
    # dexter object
    tagURL = 'http://' + ipaddress + ':8080/rest/annotate'
    catURL = 'http://' + ipaddress + ':8080/rest/graph/get-entity-categories'
    dexter = Dexter(tagURL, catURL)
    catManage = CategoryManager(argv[3], argv[4])
    catCoMan = CoOcManager(argv[5], CoOccurrence(), ' ')
    ranker = Ranker()
    entExp = CatThesExpansion(dexter, catManage, ranker, catCoMan)
    stemmer = stem.porter.PorterStemmer()
    done = set()
    noEnt = 0
    oFile = open(argv[6], 'w')
    for line in open(argv[1], 'r'):
        split = line.strip().split('\t')
        oquery = split[0].strip()
        query = normalize(oquery, stemmer)
        if query not in done and len(query) > 2:
            result = {'coTerms': {}, 'catTerms': None, 'freq': int(split[1])}
            #print query, query in coOccurTermList
            if query in coOccurTermList:
                result['coTerms'] = coOccurTermList[query]
            entCatTermDict = entExp.getTopEntityCategoryTerms(oquery, 1, 40)
            result['catTerms'] = entCatTermDict
            if len(result['catTerms']) > 0:
                oFile.write(query + '\t' + json.dumps(result) + '\n')
            else:
                noEnt += 1.0
            done.add(query)
    print 'No of queries with no Ent ', noEnt
    oFile.close()
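# loadDictFromFile(argv[2], '\t', ':') is assumed to read the co-occurrence
# term lists produced by the PMI script above, one query per line in the form
#   query<TAB>term1:score1<TAB>term2:score2 ...
# A minimal stand-in parser under that assumption (the function name is
# hypothetical, not the project's loader):
def loadCoOccurTermsSketch(path):
    termLists = {}
    for line in open(path, 'r'):
        parts = line.rstrip('\n').split('\t')
        terms = {}
        for pair in parts[1:]:
            if ':' in pair:
                term, score = pair.rsplit(':', 1)
                terms[term.strip()] = float(score)
        termLists[parts[0].strip()] = terms
    return termLists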
def populateDatasetWithBigrams(logFile, bigramSet, queryFile):
    sid = 0
    queryList = buildBigramSet(queryFile)
    stemmer = stem.porter.PorterStemmer()
    for session in getSessionWithQuery(logFile):
        sessionStr = ' '.join(session)
        sessionSet = set(getNGramsAsList(sessionStr, 2))
        inter = sessionSet & bigramSet
        #print len(sessionSet), len(bigramSet), inter
        if len(inter) > 0:
            lastq = None
            for q in session:
                if q in queryList:
                    q = normalize(q, stemmer)
                    if lastq != q and len(q) > 1:
                        print sid, '\t', q
                        lastq = q
            sid += 1
def findUniqueQueries(fileName, file2, index):
    toCheck = {}
    #for line in open(file2,'r'):
    #    ##split = line.split('\t')
    #    ##query = split[0].strip()
    #    spotDict = ast.literal_eval(line)
    #    query = spotDict['text']
    #    toCheck[query] = 1.0
    #
    #print len(toCheck)
    #queryList = {}
    for line in open(fileName, 'r'):
        split = line.strip().split('\t')
        query = split[0].strip()
        #if query in toCheck:
        #    print query
        toCheck[query] = 1.0
    print len(toCheck)
    #if query not in toCheck:
    #    rsplit = query.split()
    #    if not hasInapWords(rsplit):
    #        if query not in queryList:
    #            queryList[query] = 1.0
    #        else:
    #            queryList[query] += 1.0
    #    ##else:
    #    #    print query
    stemmer = stem.porter.PorterStemmer()
    for line in open(file2, 'r'):
        split = line.split('\t')
        entry = split[index].strip()
        norm = normalize(entry, stemmer)
        if norm in toCheck and len(norm) > 3:
            print line,
def main(argv):
    # open the index
    searcher = SearchIndex(argv[2])
    searcher.initializeAnalyzer()
    ipaddress = 'localhost'
    # dexter object
    tagURL = 'http://' + ipaddress + ':8080/rest/annotate'
    catURL = 'http://' + ipaddress + ':8080/rest/graph/get-entity-categories'
    dexter = Dexter(tagURL, catURL, argv[7])
    # category vector
    catVect = loadCategoryVector(argv[3])
    catManage1 = CategoryManager(catVect, argv[4], Category)
    catManage2 = CategoryManager(catVect, argv[5], CategorySubcluster)
    # load the category co-occurrence bit
    catCoMan = CoOcManager(argv[6], CoOccurrence(), ' ')
    # ranker
    ranker = Ranker()
    # task extraction
    htcTask = TaskExpansion('Indexes/htcIndex', ranker, 3000)
    qccTask = TaskExpansion('Indexes/qccIndex', ranker, 3000)
    #taskK = argv[5][argv[5].rfind('/')+1:]
    #totalVocab = loadFileInList(argv[6])
    # entity expansion
    entExp1 = CatThesExpansion(dexter, catManage1, ranker, catCoMan)
    entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan)
    # term expansion
    coOccExp = CoOccurExpansion(catCoMan, None, ranker)
    rel, noRel = loadRelJudgements(argv[8])
    outFolder = argv[9]
    # stemmer used when normalizing clicked-title text below
    # (previously only defined inside the disabled baseline block)
    porter = stem.porter.PorterStemmer()
    #randomWalk
    #randWalk = RandomWalk(argv[3],argv[4],ranker)
    #randWalk = RandomWalk(catManage,catCoMan,entTermVect, catTermVect,ranker)
    #queryList = loadQueryList(argv[4])
    #plotMap = {'baseline':{},'ent':{}, 'entSub':{}, 'qccTask':{}, 'htcTask':{},'co':{}}
    #plotNDCG = {'baseline':{},'ent':{}, 'entSub':{}, 'qccTask':{}, 'htcTask':{},'co':{}}
    #viewedFileFolder = argv[5]

    # (disabled) baseline run: retrieve the top 1000 documents for each
    # unexpanded query with searcher.getTopDocuments() and write them to
    # outFolder/baseline.RL1, plus optional MAP / NDCG@10 bookkeeping.
    #i = 0
    #meth = 'baseline'
    #oFile = open(outFolder + '/baseline.RL1', 'w')
    #covered = {}
    #for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(argv[1]):
    #    i += 1
    #    query = session[0].strip()
    #    if i in rel and query not in covered:
    #        covered[query] = 1.0
    #        docList = searcher.getTopDocuments(query, 1000, 'content', 'id')
    #        k = 1
    #        for dtuple in docList:
    #            oFile.write(str(i) + ' Q0 ' + dtuple[0] + ' ' + str(k) + ' ' +
    #                        str(round(dtuple[1], 2)) + ' baseline\n')
    #            k += 1
    #    else:
    #        print 'No rel ', i, session[0]
    #oFile.close()

    # co-occurrence term expansion ('co')
    i = 0
    oFile = {}
    meth = 'co'
    covered = {}
    for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(argv[1]):
        i += 1
        query = session[0].strip()
        if i in rel and query not in covered:
            covered[query] = 1.0
            coExpTerms = coOccExp.expandTextWithStep(query, 50, 55, 5)
            for noTerms, terms in coExpTerms.items():
                if noTerms not in oFile:
                    oFile[noTerms] = open(outFolder + '/' + meth + '_' +
                                          str(noTerms) + '.RL1', 'w')
                docList = searcher.getTopDocumentsWithExpansion(query, terms,
                                                                1000, 'content', 'id')
                # write TREC-style run lines: topic Q0 docid rank score tag
                # (note: the run tag is hard-coded to 'baseline' for every method)
                k = 1
                for dtuple in docList:
                    oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' +
                                         str(k) + ' ' + str(round(dtuple[1], 2)) +
                                         ' baseline\n')
                    k += 1
                # (disabled) per-query MAP / NDCG@10 bookkeeping:
                #qmap = findAvgPrec(docList, rel[i], noRel[i])
                #dcg10, idcg10 = findDCG(docList[:10], rel[i])
                #ndcg10 = dcg10 / idcg10 if idcg10 > 0 else 0.0
                #qMap[noTerms].append(qmap)
                #qNdcg[noTerms].append(ndcg10)
                #oFile[noTerms].write('ndcg10 ' + str(i) + ' ' + str(ndcg10) + '\n')
                #oFile[noTerms].write('map ' + str(i) + ' ' + str(qmap) + '\n')
    # (disabled) per-run averages: write 'all map' / 'all ndcg10' for each term
    # count and record them in plotMap / plotNDCG before closing each file.
    for entry in oFile.keys():
        oFile[entry].close()

    # entity-category expansion ('ent')
    i = 0
    oFile = {}
    meth = 'ent'
    covered = {}
    for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(argv[1]):
        i += 1
        query = session[0].strip()
        cText = normalize(' '.join(cTitle[0]), porter)
        if i in rel and query not in covered:
            covered[query] = 1.0
            entStatus1, entExpTerms1 = entExp1.expandTextWithStep(query, cText,
                                                                  1, 50, 55, 5)
            for noTerms, terms in entExpTerms1.items():
                if noTerms not in oFile:
                    oFile[noTerms] = open(outFolder + '/' + meth + '_' +
                                          str(noTerms) + '.RL1', 'w')
                docList = searcher.getTopDocumentsWithExpansion(session[0], terms,
                                                                1000, 'content', 'id')
                k = 1
                for dtuple in docList:
                    oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' +
                                         str(k) + ' ' + str(round(dtuple[1], 2)) +
                                         ' baseline\n')
                    k += 1
                # (disabled) MAP / NDCG@10 bookkeeping, same as in the 'co' loop
    for entry in oFile.keys():
        oFile[entry].close()

    # entity-category expansion with subclustering ('entSub')
    i = 0
    oFile = {}
    meth = 'entSub'
    covered = {}
    for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(argv[1]):
        i += 1
        query = session[0].strip()
        cText = normalize(' '.join(cTitle[0]), porter)
        if i in rel and query not in covered:
            covered[query] = 1.0
            entStatus2, entExpTerms2 = entExp2.expandTextWithStepAndSubcluster(
                query, cText, 1, 50, 55, 5)
            for noTerms, terms in entExpTerms2.items():
                if noTerms not in oFile:
                    oFile[noTerms] = open(outFolder + '/' + meth + '_' +
                                          str(noTerms) + '.RL1', 'w')
                docList = searcher.getTopDocumentsWithExpansion(session[0], terms,
                                                                1000, 'content', 'id')
                k = 1
                for dtuple in docList:
                    oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' +
                                         str(k) + ' ' + str(round(dtuple[1], 2)) +
                                         ' baseline\n')
                    k += 1
                # (disabled) MAP / NDCG@10 bookkeeping, same as in the 'co' loop
    for entry in oFile.keys():
        oFile[entry].close()

    # QCC task expansion ('qccTask')
    i = 0
    oFile = {}
    meth = 'qccTask'
    covered = {}
    for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(argv[1]):
        i += 1
        query = session[0].strip()
        if i in rel and query not in covered:
            covered[query] = 1.0
            qccTaskTerms = qccTask.expandTextWithStep(query, 50, 55, 5)
            for noTerms, terms in qccTaskTerms.items():
                if noTerms not in oFile:
                    oFile[noTerms] = open(outFolder + '/' + meth + '_' +
                                          str(noTerms) + '.RL1', 'w')
                docList = searcher.getTopDocumentsWithExpansion(session[0], terms,
                                                                1000, 'content', 'id')
                k = 1
                for dtuple in docList:
                    oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' +
                                         str(k) + ' ' + str(round(dtuple[1], 2)) +
                                         ' baseline\n')
                    k += 1
                # (disabled) MAP / NDCG@10 bookkeeping, same as in the 'co' loop
    for entry in oFile.keys():
        oFile[entry].close()

    # HTC task expansion ('htcTask')
    i = 0
    oFile = {}
    meth = 'htcTask'
    covered = {}
    for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(argv[1]):
        i += 1
        query = session[0].strip()
        if i in rel and query not in covered:
            covered[query] = 1.0
            htcTaskTerms = htcTask.expandTextWithStep(query, 50, 55, 5)
            for noTerms, terms in htcTaskTerms.items():
                if noTerms not in oFile:
                    oFile[noTerms] = open(outFolder + '/' + meth + '_' +
                                          str(noTerms) + '.RL1', 'w')
                docList = searcher.getTopDocumentsWithExpansion(session[0], terms,
                                                                1000, 'content', 'id')
                k = 1
                for dtuple in docList:
                    oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' +
                                         str(k) + ' ' + str(round(dtuple[1], 2)) +
                                         ' baseline\n')
                    k += 1
                # (disabled) MAP / NDCG@10 bookkeeping, same as in the 'co' loop
    for entry in oFile.keys():
        oFile[entry].close()

    #plotMultipleSys(plotMap, 'No of Terms', 'MAP', outFolder + '/map.png', 'Retrieval MAP Plot')
    #plotMultipleSys(plotNDCG, 'No of Terms', 'NDCG@10', outFolder + '/ndcg10.png', 'Retrieval NDCG Plot')
    searcher.close()