def updateNetwork(query, network, qp, searcher, tlc, field, ntype): #find the top 50 documents q = qp.parse(unicode(query)) totalText = '' total = 0.0 tmin = -1000 tmax = 1000 terms = set() try: searcher.search_with_collector(q, tlc) except TimeLimit: print '--LONG-- ', query results = tlc.results() for entry in results: totalText += entry[field] + ' ' finder = BigramCollocationFinder.from_words(word_tokenize(totalText)) #update the network rList = finder.score_ngrams(biMeas.pmi) for rTuple in rList: total += rTuple[1] if tmin > rTuple[1]: tmin = rTuple[1] if tmax < rTuple[1]: tmax = rTuple[1] for rTuple in sorted(rList, reverse=True, key=lambda x: x[1]): if (len(terms) < 3000 and finder.ngram_fd[rTuple[0]] > 2 ) or (finder.ngram_fd[rTuple[0]] > 1.0 and rTuple[0][0] in query or rTuple[0][1] in query and len(terms) < 4000): #if (len(terms) < 3000 and finder.ngram_fd[rTuple[0]] > 2) or (rTuple[0][0] in query or rTuple[0][1] in query and len(terms) < 4000): a = rTuple[0][0] if len(a) > 2 and hasAlpha(a) and a not in stopSet and not hasWebsite(a): if a not in network: network[a] = {} terms.add(a) b = rTuple[0][1] if len(b) > 2 and hasAlpha(b) and b not in stopSet and not hasWebsite( b): if b not in network[a]: network[a][b] = {} terms.add(b) network[a][b][ntype] = network[a][b].setdefault(ntype, 0.0) + ( (rTuple[1] - tmin) / (tmax - tmin)) print query, ntype, len(terms) return terms
def updateNetworkFromText(self, query, text, ntype): total = 0.0 tmin = -1000 tmax = 1000 qsplit = query.split() for entry in qsplit: term = self.porter.stem(entry) self.network[term] = {} self.terms.add(term) finder = BigramCollocationFinder.from_words(word_tokenize(text)) #update the network rList = finder.score_ngrams(self.biMeas.pmi) for rTuple in rList: total += rTuple[1] if tmin > rTuple[1]: tmin = rTuple[1] if tmax < rTuple[1]: tmax = rTuple[1] for rTuple in sorted(rList, reverse=True, key=lambda x: x[1]): if (len(self.terms) < 1000 and finder.ngram_fd[rTuple[0]] > 2) or \ ((finder.ngram_fd[rTuple[0]] > 1.0 and rTuple[0][0] in query) or \ (rTuple[0][1] in query and len(self.terms) < 1500)): noSymbA = SYMBreg.sub('', rTuple[0][0]) noSymbB = SYMBreg.sub('', rTuple[0][1]) if noSymbA not in stopSet and noSymbB not in stopSet: a = self.porter.stem(noSymbA) b = self.porter.stem(noSymbB) if len(a) > 2 and hasAlpha(a) and a not in stopSet and not hasWebsite(a) \ and len(b) > 2 and hasAlpha(b) and b not in stopSet and not hasWebsite(b): if a not in self.network: self.network[a] = {} self.terms.add(a) if b not in self.network[a]: self.network[a][b] = {} self.terms.add(b) self.network[a][b][ntype] = self.network[a][b].setdefault( ntype, 0.0) + ((rTuple[1] - tmin) / (tmax - tmin)) print query, ntype, len(self.terms)