def findBigramCoOccurence(sessionFile, outFile):
    #generate all bigrams from session
    #for every bigram get term and co-occurrence
    bigramCo = {}
    for session in getSessionWithQuery(sessionFile):
        for i in range(len(session) - 1):
            b1 = getNGramsAsList(session[i], 2)
            b2 = getNGramsAsList(session[i + 1], 2)
            #the original body ends here; a minimal completion, assuming
            #bigram pairs from adjacent queries are counted and dumped
            for key in ((x, y) for x in b1 for y in b2):
                bigramCo[key] = bigramCo.get(key, 0.0) + 1.0
    outF = open(outFile, 'w')
    for (x, y), count in bigramCo.items():
        outF.write(x + '\t' + y + '\t' + str(count) + '\n')
    outF.close()
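#getSessionWithQuery is defined elsewhere in the repository. A minimal
#sketch of what its call sites imply, assuming the session file stores one
#query per line as "<session id>\t<query>" (the format sampleSessions
#reads below) and that a session is a contiguous run of the same id.
def getSessionWithQuery(sessionFile):
    session = []
    lastSes = None
    for line in open(sessionFile, 'r'):
        split = line.strip().split('\t')
        if len(split) < 2:
            continue
        sessNo, query = split[0], split[1].strip()
        if lastSes is not None and sessNo != lastSes and len(session) > 0:
            yield session
            session = []
        session.append(query)
        lastSes = sessNo
    if len(session) > 0:
        yield session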
def main(argv):
    #for each query get bi-grams, unigrams and update frequency
    coOccur = CoOccurrence()
    stemmer = stem.porter.PorterStemmer()
    for line in open(argv[1], 'r'):
        split = line.strip().split('\t')
        query = normalize(split[0].strip(), stemmer)
        freq = int(split[1].strip())
        #generate ngrams
        ngrams = getNGramsAsList(query, 1)
        #if it has more than one term
        lngrams = len(ngrams)
        if lngrams > 1:
            for i in range(lngrams - 1):
                if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
                    for j in range(i + 1, lngrams):
                        if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
                            coOccur.updateStats(ngrams[i], ngrams[j], freq)
    coOccur.setTermTotal()
    #for each query find the terms it highly co-occurs with
    for line in open(argv[2], 'r'):
        split = line.split('\t')
        query = normalize(split[1].lower().strip(), stemmer)
        nGrams = getNGramsAsList(query, 1)
        toScore = set()
        result = {}
        for entry in nGrams:
            elist = coOccur.getNeighbours(entry)
            if elist:
                toScore |= set(elist)
        for term1 in toScore:
            if term1 not in query:
                result[term1] = 0.0
                for term2 in nGrams:
                    pmi = coOccur.getPMI(term1, term2, 50)
                    result[term1] += pmi
                result[term1] /= len(nGrams)
        #drop terms with zero average PMI; keys() returns a copy in
        #Python 2, so deleting while iterating is safe here
        for entry in result.keys():
            if result[entry] == 0:
                del result[entry]
        sort = sorted(result.items(), reverse=True, key=lambda x: x[1])
        print query, '\t', '\t'.join('{0}:{1}'.format(x[0], round(x[1], 3))
                                     for x in sort[:50])
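#CoOccurrence is defined elsewhere in the repository. A minimal sketch of
#the interface main() relies on, assuming PMI is computed from the
#accumulated pair and term totals and that the third argument of getPMI is
#a minimum co-occurrence count; the real normalization may differ.
import math

class CoOccurrence(object):
    def __init__(self):
        self.pairCount = {}   #(t1, t2) -> weighted co-occurrence count
        self.termCount = {}   #term -> weighted count
        self.total = 0.0

    def updateStats(self, term1, term2, freq):
        key = (term1, term2)
        self.pairCount[key] = self.pairCount.get(key, 0.0) + freq
        self.termCount[term1] = self.termCount.get(term1, 0.0) + freq
        self.termCount[term2] = self.termCount.get(term2, 0.0) + freq

    def setTermTotal(self):
        self.total = sum(self.termCount.values())

    def getNeighbours(self, term):
        return [t2 for (t1, t2) in self.pairCount if t1 == term] + \
               [t1 for (t1, t2) in self.pairCount if t2 == term]

    def getPMI(self, term1, term2, minCount):
        joint = self.pairCount.get((term1, term2), 0.0) + \
                self.pairCount.get((term2, term1), 0.0)
        if joint < minCount or self.total == 0:
            return 0.0
        pxy = joint / self.total
        px = self.termCount.get(term1, 0.0) / self.total
        py = self.termCount.get(term2, 0.0) / self.total
        if px == 0 or py == 0:
            return 0.0
        return math.log(pxy / (px * py))

    def writeTermCo(self, outFile):
        outF = open(outFile, 'w')
        for (t1, t2), count in self.pairCount.items():
            outF.write(t1 + ' ' + t2 + '\t' + str(count) + '\n')
        outF.close()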
def generatePhraseFeatures(featureFile, spotFile, outFile):
    #load features for queries
    qfeatMan = FeatureManager()
    qfeatMan.readFeatures(featureFile)
    pid = 0
    pfeatMan = FeatureManager()
    #generate features for phrases
    for query, pList in generatePhrases(spotFile):
        qkey, qfeat = qfeatMan.returnFeature(query)
        #print query, qkey
        if qkey:
            #print query, pList
            for phrase in pList:
                qVect = getDictFromSet(phrase.split())
                ngrams = getNGramsAsList(phrase, 2)
                url = qfeat.returnUrl()
                user = qfeat.returnUsers()
                ent = qfeat.returnEntities()
                cat = qfeat.returnCategories()
                typ = qfeat.returnType()
                sess = qfeat.returnSessions()
                if 'tournament' in phrase:
                    print query, phrase
                    print sess
                    print typ
                    print ent
                nFeature = QueryFeature(phrase, ngrams, qVect, url, user,
                                        sess, ent, cat, typ)
                pfeatMan.addFeature(phrase, pid, nFeature)
                pid += 1
    pfeatMan.writeFeatures(outFile)
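#FeatureManager and QueryFeature are defined elsewhere. Sketches of the
#interface generatePhraseFeatures relies on; the tab-separated layout read
#by readFeatures mirrors the one main() writes below (query, ngrams, query
#vector, urls, users, entities, categories, types, sessions). The exact
#field parsing is an assumption.
import ast

class QueryFeature(object):
    def __init__(self, query, ngrams, qVect, url, user, sess, ent, cat, typ):
        self.query = query
        self.ngrams = ngrams
        self.qVect = qVect
        self.url = url
        self.user = user
        self.sess = sess
        self.ent = ent
        self.cat = cat
        self.typ = typ

    def returnUrl(self): return self.url
    def returnUsers(self): return self.user
    def returnSessions(self): return self.sess
    def returnEntities(self): return self.ent
    def returnCategories(self): return self.cat
    def returnType(self): return self.typ

class FeatureManager(object):
    def __init__(self):
        self.features = {}  #query -> (qid, QueryFeature)

    def addFeature(self, query, qid, feature):
        self.features[query] = (qid, feature)

    def returnFeature(self, query):
        return self.features.get(query, (None, None))

    def readFeatures(self, featureFile):
        qid = 1
        for line in open(featureFile, 'r'):
            split = line.rstrip('\n').split('\t')
            query = split[0]
            ngrams, qVect, url, user, ent, cat, typ, sess = \
                [ast.literal_eval(x) for x in split[1:9]]
            self.features[query] = (qid, QueryFeature(query, ngrams, qVect,
                                                      url, user, sess,
                                                      ent, cat, typ))
            qid += 1

    def writeFeatures(self, outFile):
        outF = open(outFile, 'w')
        for query, (qid, f) in self.features.items():
            outF.write('\t'.join(str(x) for x in
                                 (query, f.ngrams, f.qVect, f.url, f.user,
                                  f.ent, f.cat, f.typ, f.sess)) + '\n')
        outF.close()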
def findQueryCounts(queryFile):
    #coOccur = CoOccurrence()
    pairs = {}
    porter = stem.porter.PorterStemmer()
    for line in open(queryFile, 'r'):
        split = line.strip().lower().split('\t')
        query = split[0].strip()
        freq = float(split[1])
        #for each query get nonEntTerms and update co-occurrence stats
        qTerms = ' '.join(getQueryTerms(query))
        if len(qTerms) > 3:
            ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
            lngrams = len(ngrams)
            if lngrams > 1:
                for i in range(lngrams - 1):
                    if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
                        for j in range(i + 1, lngrams):
                            if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
                                stemd1 = porter.stem(ngrams[i])
                                stemd2 = porter.stem(ngrams[j])
                                key = stemd1 + ' ' + stemd2
                                if key not in pairs:
                                    pairs[key] = 0.0
                                pairs[key] += freq
                                #coOccur.updateStats(stemd1, stemd2, freq)
    #coOccur.setTermTotal()
    #coOccur.writeTermCo(outFile)
    return pairs
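#getQueryTerms is defined elsewhere. A plausible sketch given its call
#sites: it must return a set (findSessionCounts unions the result with |=)
#and, per the comments above, it yields the non-entity terms of a query;
#the entity filtering is omitted here and only tokenization, stop-word and
#short-token removal are shown.
def getQueryTerms(query):
    terms = set()
    for token in query.lower().strip().split():
        if token not in stopSet and len(token) > 1:
            terms.add(token)
    return terms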
def findSessionCountsOfNonEnt(netDict, queryFile, outFile):
    coOccur = CoOccurrence()
    for session in getSessionWithQuery(queryFile):
        #for each query get nonEntTerms and update co-occurrence stats;
        #collect the terms in a set: the original string-membership test
        #('entry not in qTerms') also matched substrings of other terms
        termSet = set()
        for query in session:
            query = (query.decode('utf-8')).encode('ascii', 'ignore')
            if query in netDict:
                for entry in netDict[query].getNonEntityTerms():
                    termSet.add(entry)
        qTerms = ' '.join(termSet)
        if len(qTerms) > 2:
            ngrams = getNGramsAsList(qTerms.strip(), 1)
            lngrams = len(ngrams)
            if lngrams > 1:
                for i in range(lngrams - 1):
                    if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
                        for j in range(i + 1, lngrams):
                            if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
                                coOccur.updateStats(ngrams[i], ngrams[j], 1.0)
    coOccur.setTermTotal()
    coOccur.writeTermCo(outFile)
def sampleSessions(sessiontrackFile, biGramFile, freqFile, sessionFileToPrune):
    #keep a session if it contains a session-track query, or one of the
    #top bigrams, or passes a random draw, provided it has more than 3
    #queries and an average query frequency above 90
    sessionTrackQueries = {}
    #load sessionTrack queries
    for line in open(sessiontrackFile, 'r'):
        query = normalizeWithoutStem(line.strip().lower())
        sessionTrackQueries[query] = 1.0
        sessionTrackQueries[line.strip().lower()] = 1.0
    biGrams = set()
    for line in open(biGramFile, 'r'):
        split = line.split('\t')
        biGrams.add(split[0])
        if len(biGrams) == 2500:
            break
    freq = {}
    for line in open(freqFile, 'r'):
        split = line.split('\t')
        freq[split[0].strip()] = float(split[1])
    avgFreq = 0.0
    lastSes = None
    session = []
    hasQuery = False
    hasBigram = False
    for line in open(sessionFileToPrune, 'r'):
        split = line.split('\t')
        sessNo = int(split[0])
        query = split[1].strip()
        if query in sessionTrackQueries:
            hasQuery = True
        nGrams = set(getNGramsAsList(query, 2))
        inter = nGrams & biGrams
        if len(inter) > 0:
            hasBigram = True
        avgFreq += freq.get(query, 0.0)
        if lastSes != sessNo:
            rnum = random.random()
            if len(session) > 0:
                avgFreq /= len(session)
                if (rnum > 0.80 or hasQuery or hasBigram) and \
                        len(session) > 3 and avgFreq > 90:
                    for entry in session:
                        print entry,
            session = []
            hasQuery = False
            hasBigram = False
            avgFreq = 0.0  #reset the accumulator for the new session
        session.append(line)
        lastSes = sessNo
    #flush the last session, which the loop above never reaches
    if len(session) > 0:
        avgFreq /= len(session)
        if (random.random() > 0.80 or hasQuery or hasBigram) and \
                len(session) > 3 and avgFreq > 90:
            for entry in session:
                print entry,
def getNString(string, glen):
    string = string.strip()
    #note: here the n-gram helper is consumed as a dict of ngram -> count,
    #unlike the call sites above that index the result as a list
    ngrams = getNGramsAsList(string, glen)
    gString = ' '.join('{0}:{1}'.format(x.replace(' ', '_'), y)
                       for x, y in ngrams.items())
    queryVect = getDictFromSet(string.split())
    qVectString = ' '.join('{0}:{1}'.format(x, y)
                           for x, y in queryVect.items())
    return gString + '\t' + qVectString
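#getDictFromSet is defined elsewhere. Every call site passes a token list
#and consumes a term -> count mapping, so a minimal sketch is a simple
#frequency counter.
def getDictFromSet(tokens):
    vect = {}
    for token in tokens:
        vect[token] = vect.get(token, 0) + 1
    return vect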
def findSessionCounts(queryFile, outFile, wordSet):
    coOccur = {}  #a plain dict instead of CoOccurrence() to bound memory
    sess = 0
    qid = 0
    qSet = set()
    for session in getSessionWithQuery(queryFile):
        qSet.clear()
        for query in session:
            qid += 1
            terms = getQueryTerms(query)
            if len(terms) > 0:
                qSet |= terms
            if qid % 1000000 == 0:
                print qid
                print len(coOccur)
        #for each query get nonEntTerms and update co-occurrence stats
        qTerms = ' '.join(qSet)
        if len(qTerms) > 3 and len(qSet) > 1:
            ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
            lngrams = len(ngrams)
            if lngrams > 1:
                for i in range(lngrams - 1):
                    if ngrams[i] not in stopSet and len(ngrams[i]) > 2 \
                            and ngrams[i] in wordSet:
                        for j in range(i + 1, lngrams):
                            if ngrams[j] not in stopSet and len(ngrams[j]) > 2 \
                                    and ngrams[j] in wordSet:
                                key = ngrams[i] + ' ' + ngrams[j]
                                if key not in coOccur:
                                    coOccur[key] = 0.0
                                coOccur[key] += 1.0
        #spill the counts to disk before the dict grows too large
        if len(coOccur) >= 9000000:
            writeDictToFile(outFile, coOccur, sess)
            coOccur = {}
            sess += 1
    #write whatever remains after the last spill
    if len(coOccur) > 0:
        writeDictToFile(outFile, coOccur, sess)
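#writeDictToFile is defined elsewhere. A minimal sketch, assuming the
#third argument is a shard index used to keep successive spills from
#findSessionCounts apart in the output file name.
def writeDictToFile(outFile, counts, shard):
    outF = open(outFile + '.' + str(shard), 'w')
    for key, value in counts.items():
        outF.write(key + '\t' + str(value) + '\n')
    outF.close()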
def populateDatasetWithBigrams(logFile, bigramSet, queryFile):
    sid = 0
    queryList = buildBigramSet(queryFile)
    stemmer = stem.porter.PorterStemmer()
    for session in getSessionWithQuery(logFile):
        sessionStr = ' '.join(session)
        sessionSet = set(getNGramsAsList(sessionStr, 2))
        inter = sessionSet & bigramSet
        if len(inter) > 0:
            lastq = None
            for q in session:
                if q in queryList:
                    q = normalize(q, stemmer)
                    if lastq != q and len(q) > 1:
                        print sid, '\t', q
                        lastq = q
            sid += 1
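#normalize and normalizeWithoutStem are defined elsewhere. Plausible
#sketches: lowercase, strip non-alphanumeric characters, collapse
#whitespace, and (for normalize) Porter-stem each token; the real
#normalization rules may differ.
import re

def normalizeWithoutStem(query):
    query = re.sub(r'[^a-z0-9 ]', ' ', query.lower())
    return ' '.join(query.split())

def normalize(query, stemmer):
    return ' '.join(stemmer.stem(token)
                    for token in normalizeWithoutStem(query).split())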
def main():
    parser = ap.ArgumentParser(description='Generate features for entity '
                                           'tagged queries')
    parser.add_argument('-i', '--iFile', help='Query log file', required=True)
    parser.add_argument('-o', '--oFile', help='Output feature file',
                        required=True)
    parser.add_argument('-t', '--typeFile', help='DBPedia type file',
                        required=True)
    parser.add_argument('-c', '--catFile', help='DBPedia cat file',
                        required=True)
    #argparse's type=bool treats any non-empty string as True, so read the
    #flag as an int (0/1) and cast it
    parser.add_argument('-u', '--uid', help='User id present (1) or not (0)',
                        required=True, type=int)
    parser.add_argument('-w', '--wtype',
                        help='Phrase (phrase) or query (query) features',
                        required=True)
    args = parser.parse_args()
    boolUid = bool(args.uid)
    #load the category list
    dbCatList = loadCategories(args.catFile)
    #load the type list
    dbTypeList = loadInstancesInList(args.typeFile)
    #feature dictionaries, all keyed by query id
    queryList = {}
    userList = {}
    urlList = {}
    sessionList = {}
    entityList = {}
    categoryList = {}
    typeList = {}
    ipaddress = 'localhost'
    tagURL = 'http://' + ipaddress + ':8080/dexter-webapp/api/rest/annotate'
    cqid = 1
    sid = 1
    qid = None
    for session in getSessionTuples(args.iFile, '\t', 1560):
        print 'Session id and length', sid, len(session)
        for entry in session:
            query = entry[QUERY]
            #tag the query with Dexter and get entity, cat and type info
            spotDict = tagQueryWithDexter(query, tagURL)
            if 'spots' in spotDict:
                updatedSpotDict = getCatAndTypeInfo(spotDict, dbCatList,
                                                    dbTypeList)
                if args.wtype == 'query':
                    if query not in queryList:
                        queryList[query] = cqid
                        qid = cqid
                        cqid += 1
                    else:
                        qid = queryList[query]
                    updateDict(sessionList, sid, qid)
                    if boolUid:
                        updateDict(userList, entry[USER], qid)
                    if CLICKU in entry:
                        updateDict(urlList, entry[CLICKU], qid)
                    if updatedSpotDict:
                        for spot in updatedSpotDict['spots']:
                            updateDict(categoryList, spot['cat'], qid)
                            updateDict(typeList, spot['type'], qid)
                            updateDict(entityList,
                                       encodeUTF(spot['wikiname'].lower()),
                                       qid)
                if args.wtype == 'phrase':
                    for spot in updatedSpotDict['spots']:
                        splits = query.split(spot['mention'])
                        for split in splits:
                            split = split.strip()
                            #remove stop words
                            split = filterStopWordsFromQuery(split)
                            if len(split) > 1:
                                if split not in queryList:
                                    queryList[split] = cqid
                                    qid = cqid
                                    cqid += 1
                                else:
                                    qid = queryList[split]
                                updateDict(sessionList, sid, qid)
                                if boolUid:
                                    updateDict(userList, entry[USER], qid)
                                if CLICKU in entry:
                                    updateDict(urlList, entry[CLICKU], qid)
                                if updatedSpotDict:
                                    updateDict(categoryList, spot['cat'], qid)
                                    updateDict(typeList, spot['type'], qid)
                                    updateDict(entityList,
                                               encodeUTF(spot['wikiname'].lower()),
                                               qid)
        sid += 1
    #write the features to the outfile
    outF = open(args.oFile, 'w')
    for query, qid in queryList.items():
        outF.write(query)
        queryVect = getDictFromSet(query.split())
        #ngrams = 1
        ngramString = getNGramsAsList(query, 3)
        outF.write('\t' + str(ngramString))
        #query vect = 2
        outF.write('\t' + str(queryVect))
        #write each feature dict (or an empty one) in a fixed column order
        for featDict in (urlList, userList, entityList, categoryList,
                         typeList, sessionList):
            if qid in featDict:
                outF.write('\t' + str(featDict[qid]))
            else:
                outF.write('\t{}')
        outF.write('\n')
    outF.close()
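#tagQueryWithDexter and updateDict are defined elsewhere. Sketches under
#two assumptions: the Dexter annotate endpoint accepts the query text as a
#GET parameter and returns JSON (the field names 'spots', 'mention' and
#'wikiname' come from the call sites above), and updateDict keeps, for
#each query id, a count of how often each feature value (session, user,
#url, entity, ...) was seen, which matches the dicts written per qid.
import json
import urllib
import urllib2

def tagQueryWithDexter(query, tagURL):
    params = urllib.urlencode({'text': query})
    try:
        response = urllib2.urlopen(tagURL + '?' + params)
        return json.loads(response.read())
    except Exception:
        return {}

def updateDict(featDict, key, qid):
    if qid not in featDict:
        featDict[qid] = {}
    featDict[qid][key] = featDict[qid].get(key, 0) + 1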
def combineQueryFeatures(queryFile, spotFile, featFile, newFile):
    #load features
    featDict = {}
    i = 1
    urlDict = {}
    for line in open(featFile, 'r'):
        split = line.strip().split('\t')
        featDict[split[0].strip()] = split[1:]
    querySpots = {}
    for line in open(spotFile, 'r'):
        spotDict = ast.literal_eval(line)
        querySpots[spotDict['text']] = spotDict
    outF = open(newFile, 'w')
    #all queries
    for line in open(queryFile, 'r'):
        query = line.strip()
        queryFeat = []
        triString = getNGramsAsList(query, 3)
        if len(triString) > 0:
            queryFeat.append(triString)
        else:
            queryFeat.append({})
        queryVect = getDictFromSet(query.split())
        if len(queryVect) > 0:
            queryFeat.append(queryVect)
        else:
            queryFeat.append({})
        if query in featDict:
            #normalize the users
            userString = getUserString(featDict[query][0])
            if len(userString) > 0:
                queryFeat.append(userString)
            else:
                queryFeat.append({})
            #normalize the urls
            i, urlDict, linkString = getUrlString(featDict[query][1],
                                                  urlDict, i)
            if len(linkString) > 0:
                queryFeat.append(linkString)
            else:
                queryFeat.append({})
        else:
            print 'Query not found ', query
            queryFeat += [{}, {}]  #keep the user and url columns aligned
        if query in querySpots:
            spotDict = querySpots[query]
            #cat, ent and type info
            result = getCatEntString(spotDict)
            for entry in result:
                if len(entry) > 0:
                    queryFeat.append(entry)
                else:
                    queryFeat.append({})
        else:
            queryFeat += [{}, {}, {}]
        try:
            outF.write(query)
            for entry in queryFeat:
                outF.write('\t' + str(entry))
            outF.write('\n')
        except:
            #malformed entries (e.g. encoding issues) are logged and skipped
            print 'ERROR ', queryFeat
    outF.close()
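#getUserString and getUrlString are defined elsewhere. A sketch of
#getUrlString under the assumption that the stored url field is a dict
#literal of url -> count (the format main() writes) and that urls are
#re-mapped to compact integer ids via urlDict, with i as the next free id;
#getUserString would parse its field analogously.
import ast

def getUrlString(urlField, urlDict, i):
    linkString = {}
    for url, count in ast.literal_eval(urlField).items():
        if url not in urlDict:
            urlDict[url] = i
            i += 1
        linkString[urlDict[url]] = count
    return i, urlDict, linkString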