def search(self, query, queryExpansionParameter): orderedQueryResults = self.tfidf(query); if orderedQueryResults is None: return None; mostRelevantDocumentId = orderedQueryResults.getHead().docId; mostRelevantDocumentIdDocumentCollection = int(self._documentIds[mostRelevantDocumentId]); orderedExpansionTerms = LinkedList(); orderedExpansionTerms.printSep = "\n"; uniqueTerms = {}; currentDoc = self._docCollect.nextDocument(); currentDocId = int(currentDoc.getName()); while currentDocId != mostRelevantDocumentIdDocumentCollection: currentDoc = self._docCollect.nextDocument(); currentDocId = int(currentDoc.getName()); # current doc now has the document that is the highest ranked tfid result self._tokenizer.loadDocument(currentDoc); # generate unique query term set token, position = self._tokenizer.nextToken() while token is not None: uniqueTerms[token] = position; token, position = self._tokenizer.nextToken(); # calculate tfidf scores for each token and insert into ordered linked list for token, pos in uniqueTerms.iteritems(): tfidf = self.tfidf_term(token, mostRelevantDocumentIdDocumentCollection); orderedExpansionTerms.insertSorted(QueryResultItem(token, tfidf)); # expand query with most relevant query terms: currentNode = orderedExpansionTerms._head; for i in xrange(0,queryExpansionParameter): currentData = currentNode.value.docId; query += " " + currentData; currentNode = currentNode.next; # perform search with appended query results = self.tfidf(query); current = results._head; rank = 0; print("\n\nComplete query:"); print(query) print("results:\n") while current is not None: print str(self._currentQueryId) + " 0 " + str(current.value.docId) + " " + str(rank) + " " + str(current.value.tfidf) + " 0"; current = current.next; rank += 1; self._currentQueryId += 1;
def tfidf(self, query):
    """Score every document in the query terms' posting lists with tf-idf.

    Tokenizes `query`, fetches one posting list per token, and sums a
    per-document score of (1 + log10(count / documentTermTotal)) * idf
    across all query terms.

    Parameters:
        query: the raw query string.

    Returns a LinkedList of QueryResultItem ordered by insertSorted
    (presumably descending score -- confirm against LinkedList), or
    None when any query term has no posting list.
    """
    queryTokens = self._queryTokenizer.tokenize(query)
    postingLists = self._getPostingLists(queryTokens)
    documentScores = {}

    # Accumulate per-document scores across all query terms.
    for term in postingLists:
        if term is None:
            # A term with no posting list aborts the search; callers
            # treat None as "no results".
            return None
        # Hoisted: the idf is invariant over one posting list, the
        # original recomputed it for every posting entry.
        idf = self._getIDF(term)
        for doc in term:
            # Log-normalized term frequency, relative to the document's
            # total term count.
            termFrequency = 1 + math.log10(float(doc.count) / float(self.termFrequencies[int(doc.doc) - 1]))
            docScore = termFrequency * idf
            key = str(doc.doc)
            # BUGFIX/idiom: dict.get replaces the O(n) Python 2
            # `in documentScores.keys()` list scan; also fixes the
            # `docSocre` typo.
            documentScores[key] = documentScores.get(key, 0.0) + docScore

    # Build the ordered result list.
    orderedQueryResults = LinkedList()
    orderedQueryResults.printSep = "\n"
    for docId, score in documentScores.iteritems():
        orderedQueryResults.insertSorted(QueryResultItem(docId, score))
    return orderedQueryResults