# Presumed imports for the search and collocation calls below (nltk and
# Whoosh, inferred from the call signatures used in this module):
from nltk import word_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from whoosh.collectors import TimeLimit

biMeas = BigramAssocMeasures()  # module-level bigram scorer (assumed)


def updateNetwork(query, network, qp, searcher, tlc, field, ntype):
  # Find the top 50 documents for the query.
  q = qp.parse(unicode(query))
  totalText = ''
  total = 0.0
  tmin = float('inf')   # running minimum of the PMI scores
  tmax = float('-inf')  # running maximum of the PMI scores
  terms = set()
  try:
    searcher.search_with_collector(q, tlc)
  except TimeLimit:
    print '--LONG-- ', query
  results = tlc.results()
  for entry in results:
    totalText += entry[field] + ' '
  finder = BigramCollocationFinder.from_words(word_tokenize(totalText))
  # Score every bigram in the result text by PMI and track the score range.
  rList = finder.score_ngrams(biMeas.pmi)
  for rTuple in rList:
    total += rTuple[1]
    if tmin > rTuple[1]:
      tmin = rTuple[1]
    if tmax < rTuple[1]:
      tmax = rTuple[1]
  # Add the top-scoring bigrams to the network: frequent bigrams while under
  # 3000 terms, and bigrams that overlap the query under looser limits.
  for rTuple in sorted(rList, reverse=True, key=lambda x: x[1]):
    if (len(terms) < 3000 and finder.ngram_fd[rTuple[0]] > 2) or \
        (finder.ngram_fd[rTuple[0]] > 1.0 and rTuple[0][0] in query) or \
        (rTuple[0][1] in query and len(terms) < 4000):
      a = rTuple[0][0]
      if len(a) > 2 and hasAlpha(a) and a not in stopSet and not hasWebsite(a):
        if a not in network:
          network[a] = {}
        terms.add(a)
        b = rTuple[0][1]
        if len(b) > 2 and hasAlpha(b) and b not in stopSet and not hasWebsite(b):
          if b not in network[a]:
            network[a][b] = {}
          terms.add(b)
          # Accumulate the min-max-normalized PMI score on the (a, b) edge.
          network[a][b][ntype] = network[a][b].setdefault(ntype, 0.0) + \
              ((rTuple[1] - tmin) / (tmax - tmin))
  print query, ntype, len(terms)
  return terms
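# Hedged usage sketch for updateNetwork: the index directory, the 'content'
# field, the 2-second budget, and the 'web' network type below are all
# illustrative assumptions, not values from this repo.
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh.collectors import TimeLimitCollector

ix = index.open_dir('indexdir')  # hypothetical index location
with ix.searcher() as searcher:
  qp = QueryParser('content', schema=ix.schema)
  tlc = TimeLimitCollector(searcher.collector(limit=50), timelimit=2.0)
  network = {}
  terms = updateNetwork('jaguar speed', network, qp, searcher, tlc,
                        'content', 'web')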
def getUserVector(fileName, uIndex, qIndex):
  """Yield (userId, stemmed term-frequency vector) pairs; assumes the file is
  grouped by user id."""
  userVector = {}
  lastUser = None
  porter1 = porter.PorterStemmer()
  for line in open(fileName, 'r'):
    split = line.strip().split('\t')
    uId = split[uIndex]
    query = split[qIndex]
    if not lastUser:
      lastUser = uId
    raw_split = re.sub(SYMB, ' ', query.lower()).split(' ')
    query = filterStopWordsFromList(raw_split)
    # Emit the finished vector whenever a new user starts.
    if lastUser != uId:
      yield lastUser, userVector
      userVector = {}
    # Skip queries that are malformed, inappropriate, or too long.
    if (not (hasManyChars(query, raw_split, 1, 4, 70) or
             hasInapWords(raw_split) or
             hasManyWords(raw_split, 15, 40))) and hasAlpha(query):
      qDict = text_to_vector(query)
      for entry, val in qDict.iteritems():
        entry1 = porter1.stem(entry)
        userVector[entry1] = userVector.setdefault(entry1, 0.0) + val
    lastUser = uId
  yield lastUser, userVector
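# Hedged usage sketch for getUserVector: assumes a tab-separated log sorted
# by user id, with the user id in column 0 and the raw query in column 1
# (the file name and column indices are illustrative).
for userId, vector in getUserVector('user_queries.txt', 0, 1):
  top5 = sorted(vector.items(), reverse=True, key=lambda x: x[1])[:5]
  print userId, top5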
def createVector(self, fileName):
  # Count, for every stemmed query word, how often it co-occurs with each
  # entity category and with each entity.
  porter = stem.porter.PorterStemmer()
  word_catVector = {}
  word_entVector = {}
  for line in open(fileName, 'r'):
    split = line.strip().split('\t')
    query = split[0]
    qsplit = query.split()
    spotDict = ast.literal_eval(split[1])
    for entity, elist in spotDict.iteritems():
      for oword in qsplit:
        oword = oword.replace('\'', '')
        word = porter.stem(oword)
        if len(word) > 2 and hasAlpha(word) and word not in stopSet:
          if word not in word_entVector:
            word_catVector[word] = {}
            word_entVector[word] = {}
          for cat in elist['cat'].split():
            word_catVector[word][cat] = word_catVector[word].setdefault(
                cat, 0.0) + 1.0
          word_entVector[word][entity] = word_entVector[word].setdefault(
              entity, 0.0) + 1.0
  self.writeVector('ont/Word_catCount.txt', word_catVector)
  self.writeVector('ont/Word_entCount.txt', word_entVector)
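# Hedged sketch of the input line format createVector parses: a raw query, a
# tab, and a literal-eval'able spot dict with a space-separated 'cat' field
# (the field name comes from the code; the sample values are made up).
import ast
line = "jaguar price\t{'jaguar': {'cat': 'Animal Car'}}"
query, rawSpots = line.strip().split('\t')
spotDict = ast.literal_eval(rawSpots)
print spotDict['jaguar']['cat'].split()  # ['Animal', 'Car']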
def updateNetworkFromText(self, query, text, ntype):
  total = 0.0
  tmin = float('inf')   # running minimum of the PMI scores
  tmax = float('-inf')  # running maximum of the PMI scores
  # Seed the network with the stemmed query terms themselves.
  qsplit = query.split()
  for entry in qsplit:
    term = self.porter.stem(entry)
    self.network[term] = {}
    self.terms.add(term)
  finder = BigramCollocationFinder.from_words(word_tokenize(text))
  # Score every bigram in the text by PMI and track the score range.
  rList = finder.score_ngrams(self.biMeas.pmi)
  for rTuple in rList:
    total += rTuple[1]
    if tmin > rTuple[1]:
      tmin = rTuple[1]
    if tmax < rTuple[1]:
      tmax = rTuple[1]
  for rTuple in sorted(rList, reverse=True, key=lambda x: x[1]):
    if (len(self.terms) < 1000 and finder.ngram_fd[rTuple[0]] > 2) or \
        ((finder.ngram_fd[rTuple[0]] > 1.0 and rTuple[0][0] in query) or
         (rTuple[0][1] in query and len(self.terms) < 1500)):
      noSymbA = SYMBreg.sub('', rTuple[0][0])
      noSymbB = SYMBreg.sub('', rTuple[0][1])
      if noSymbA not in stopSet and noSymbB not in stopSet:
        a = self.porter.stem(noSymbA)
        b = self.porter.stem(noSymbB)
        if len(a) > 2 and hasAlpha(a) and a not in stopSet and not hasWebsite(a) \
            and len(b) > 2 and hasAlpha(b) and b not in stopSet and not hasWebsite(b):
          if a not in self.network:
            self.network[a] = {}
          self.terms.add(a)
          if b not in self.network[a]:
            self.network[a][b] = {}
          self.terms.add(b)
          # Accumulate the min-max-normalized PMI score on the (a, b) edge.
          self.network[a][b][ntype] = self.network[a][b].setdefault(
              ntype, 0.0) + ((rTuple[1] - tmin) / (tmax - tmin))
  print query, ntype, len(self.terms)
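# Standalone sketch of the min-max normalization used by both network update
# functions above (the PMI scores here are made up): each bigram score is
# rescaled into [0, 1] before being accumulated on its edge.
scores = [4.2, -1.3, 0.7]
tmin, tmax = min(scores), max(scores)
print [(s - tmin) / (tmax - tmin) for s in scores]  # [1.0, 0.0, ~0.36]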
def getVectorFromTuple(self, tTuple):
  # Build a stemmed term-frequency vector from a tuple of text fields.
  tDict = {}
  for entry in tTuple:
    split = entry.split()
    for unstem in split:
      word = self.porter.stem(unstem)
      if word not in stopSet and hasAlpha(word) and len(word) > 2 \
          and word not in ashleelString:
        tDict[word] = tDict.setdefault(word, 0.0) + 1.0
  return tDict
def expandText(self, query, limit):
  # Get the entities spotted in the query.
  spotDict = self.dexter.tagText(query)
  # P(c): uniform prior over categories.
  pC = 1.0 / self.entityCatManager.getUniqueTermCount()
  termList = self.termTermManager.getUniqueTerms()
  # P(e) = SUM_c P(e|c) * P(c) for every spotted entity.
  pEC = {}
  for entity, edict in spotDict.iteritems():
    catList = edict['cat'].split()
    pEC[entity] = 0.0
    for cat in catList:
      pEC[entity] += pC * self.entityCatManager.getProb(cat, entity)
  termScore = {}
  for term in termList:
    termScore[term] = 0.0
    for entity, score in pEC.iteritems():
      # P(t | q \ e): smoothed term-term probability over the query with the
      # entity mention removed.
      repQuery = query.replace(entity, '')
      pTT = 0.0000001
      qsplit = repQuery.split()
      for entry in qsplit:
        if len(entry) > 2 and entry not in stopSet and hasAlpha(entry):
          pTT += self.termTermManager.getProb(term, entry)
      # P(t|e), smoothed to avoid zeros.
      pTE = self.termEntityManager.getProb(entity, term)
      if pTE == 0.0:
        pTE = 0.0000001
      termScore[term] += pTE * score * pTT
    if termScore[term] > 0.0:  # guard against log(0)
      termScore[term] = math.log(termScore[term])
  # Keep the top-scoring candidates (a few extra beyond the requested limit).
  resultSet = {}
  for ttuple in sorted(termScore.items(), reverse=True, key=lambda x: x[1]):
    resultSet[ttuple[0]] = ttuple[1]
    if len(resultSet) == limit + 10:
      break
  return resultSet
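# In effect, expandText scores each candidate term t as (notation inferred
# from the code above, not from an external reference):
#   score(t) = log( SUM_e  P(t|e) * P(e) * P(t | q \ e) )
# where P(e) = SUM_c P(e|c) * P(c) with a uniform P(c) over categories, and
# P(t | q \ e) sums term-term probabilities over the query minus the entity
# mention. Hedged usage sketch ('expander' and the query are assumptions):
expanded = expander.expandText('jaguar car price', 10)
for term, logScore in sorted(expanded.items(), reverse=True, key=lambda x: x[1]):
  print term, logScore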
def getTaskTermSet(self, rSort, text):
  # Aggregate stemmed terms from the ranked entries, skipping terms that
  # already appear in the text and terms outside the vocabulary.
  termSet = {}
  for entry in rSort:
    tDict = text_to_vector(entry[0])
    for tentry, value in tDict.iteritems():
      stem = self.porter.stem(tentry)
      if tentry not in stopSet and len(tentry) > 2 and hasAlpha(tentry) \
          and (tentry not in text and stem not in text) and tentry in self.vocab:
        termSet[stem] = termSet.setdefault(stem, 0.0) + value
  return termSet
def getCoOcScore(self, stemSet, phrase):
  # Average co-occurrence count of the phrase with the filtered query stems.
  total = 0.0
  tCount = 0.0
  for qRep in stemSet:
    if len(qRep) > 2 and qRep not in stopSet and hasAlpha(qRep):
      c1, c2 = self.coMan.getCoOcCount(phrase, qRep)
      if c1 != c2:
        print ':O CoOcc count diff ', phrase, qRep, c1, c2
      total += c1
      tCount += 1.0
  if tCount > 0:
    return total / tCount
  return 0.0
def getCoOcScore(self, phrase, stemSet):
  # Average conditional probability of the phrase given the filtered query
  # stems (probability-based counterpart of the count-based variant above).
  total = 0.0
  tCount = 0.0
  for qRep in stemSet:
    if len(qRep) > 2 and qRep not in stopSet and hasAlpha(qRep):
      c1 = self.catCoMan.getProb(phrase, qRep, 50)
      if c1 > 0:
        total += c1
        tCount += 1.0
  if tCount > 0:
    return total / tCount
  return 0.0
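# Hedged sketch contrasting the two getCoOcScore variants above: the first,
# getCoOcScore(stemSet, phrase), averages raw co-occurrence counts from
# coMan; the second, getCoOcScore(phrase, stemSet), averages windowed
# conditional probabilities from catCoMan. The ranker objects below are
# assumptions for illustration.
stems = set(['jaguar', 'speed'])
countScore = countRanker.getCoOcScore(stems, 'top speed')  # count-based
probScore = probRanker.getCoOcScore('top speed', stems)    # probability-based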
def findClickQuery(fileName):
  """Load clicked queries: map each stemmed query term to its per-URL click
  counts."""
  porter1 = porter.PorterStemmer()
  clickQuery = {}
  for line in open(fileName, 'r'):
    entry = parseLine(line.strip())
    if len(entry) > 3:
      terms = entry[QUERY].split()
      for term in terms:
        nterm = porter1.stem(term)
        if len(term) > 2 and hasAlpha(term) and term not in ashleelString and \
            nterm not in stopSet and nterm not in ashleelString:
          if nterm not in clickQuery:
            clickQuery[nterm] = {}
          clickQuery[nterm][entry[CLICKU]] = clickQuery[nterm].setdefault(
              entry[CLICKU], 0.0) + 1.0
  for entry, cdict in clickQuery.iteritems():
    print entry, '\t', cdict
  return clickQuery
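# Hedged usage sketch for findClickQuery: parseLine, QUERY, and CLICKU come
# from the surrounding module; the file name is illustrative.
clickQuery = findClickQuery('click_log.txt')
print len(clickQuery), 'stemmed query terms with clicked-URL counts'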