def daatQuery(self, words): queryResult = [] urlTable = URLTable() num = len(words) print "num of lists:", num lp = [] openListStart = time.clock() for i in range(num): lp.append(self.openList(words[i])) print "time to openlist:", str(time.clock() - openListStart) sortStart = time.clock() bm25Time = 0 lp.sort(key=attrgetter('size')) # for i in range(1,len(lp)): # if did = 0 print "time sorting list:", str(time.clock() - sortStart) nextGEQTime=0 while did < config.MAXDID: nGEQStart=time.clock() did = self.nextGEQ(lp[0], did) nextGEQTime+=time.clock()-nGEQStart if did == config.MAXDID: break d = None for i in range(1, num): nGEQStart=time.clock() d = self.nextGEQ(lp[i], did) nextGEQTime+=time.clock()-nGEQStart if d != did: break if d is not None and d > did: did = d else: resultItem = ResultItem() resultItem.docID = did resultItem.url = urlTable[did].url score = 0 bm25Start = time.clock() for i in range(num): freq = self.getFreq(lp[i], did) score += bm25.getBM25(freq, lp[i].size, urlTable.N, urlTable[did].dl, urlTable.avgdl) pos = self.getPos(lp[i], did) resultItem.pos.append(pos) bm25Time += time.clock() - bm25Start resultItem.bm25 = score resultItem.score = resultItem.bm25 queryResult.append(resultItem) did += 1 print "total time nextGEQ:",nextGEQTime print "total time bm25: ", bm25Time return queryResult
def queryWords(self, query, start, limit): lexiconTable = LexiconTable() urlTable = URLTable() words = self.parseQuery(query) begin = time.clock() sets = [] indexMap = {} for word in words: index = self.getIndex(word) docset = set([]) if index is not None: docset = set(index.keys()) indexMap[word] = index sets.append(docset) resultset = set.intersection(*sets) print "get result set time:",str(time.clock()-begin) #pagerank = PageRank() resultSize = len(resultset) print "result size:",resultSize begin = time.clock() if start > resultSize-1: return [] queryResult = [] for docID in resultset: resultItem = ResultItem() resultItem.docID = docID resultItem.url = urlTable[docID].url bm25Score = 0 for word in words: bm25Score += bm25.getBM25(indexMap[word][docID].occurence, lexiconTable[word.lower()].occurence, urlTable.N, urlTable[docID].pagesize, urlTable.avgdl) resultItem.bm25 = bm25Score resultItem.score = resultItem.bm25 queryResult.append(resultItem) print "BM25 time:",str(time.clock()-begin) begin = time.clock() queryResult = sorted(queryResult, key=attrgetter('score'), reverse=True) print "sort BM25 time:",str(time.clock()-begin) begin = time.clock() startIndex = 0 if start < 0 else start endIndex = startIndex + limit endIndex = resultSize if endIndex > resultSize else endIndex print "start index & end index:", startIndex, endIndex queryResult = queryResult[startIndex:endIndex] for item in queryResult: item.snippet = cgi.escape(self.getSnippet(item.docID, item.url, words)) print "snippet time:",str(time.clock()-begin) return queryResult, resultSize