def getMatchesOld(self, terms): ''' returns a list of top-n matching pairs (doc-id, score) ranked by descending scores ''' matches = collections.defaultdict(float) for t in terms: if (self.index.hasTerm(t) is None): continue idf = self.index.getIdf(t) for entry in self.index.getEntries(t): docId = entry["doc-id"] tf = entry["tf"] matches[docId] += tf * idf return getTopMatches([(k, v) for k, v in matches.items()], self.n)
def getMatchesNew(self, listOfTermsAndScores): ''' returns a list of top-n matching pairs (doc-id, score) ranked by descending scores ''' matches = dict() numberOfOccurences = dict() for query in listOfTermsAndScores: for t in query: if self.index.hasTerm(t[0]): idf = self.index.getIdf(t[0]) for entry in self.index.getEntries(t[0]): docId = entry["doc-id"] tf = entry["tf"] d = self.index.getD(docId) w = (idf * tf * (1 + self.k)) / ( tf + self.k * (1 - self.b + self.b * d / self.avrgD)) matchScore = w * t[1] if docId in matches.keys(): totalScore = matches[docId] * numberOfOccurences[ docId] totalScore += matchScore numberOfOccurences[docId] += 1 newScore = totalScore / numberOfOccurences[docId] matches[docId] = newScore ''' totalScore = matches[docId][0]*numberOfOccurences[docId] totalScore += matchScore numberOfOccurences[docId] += 1 newScore = totalScore/numberOfOccurences[docId] matches[docId][0] = newScore if matches[docId][1] < t[1]: matches[docId][1] = t[1] ''' else: matches[docId] = matchScore numberOfOccurences[docId] = 1 ''' matches[docId] = list() matches[docId].append(matchScore) matches[docId].append(t[1]) numberOfOccurences[docId] = 1 ''' #print(matches) return getTopMatches([(k, v) for k, v in matches.items()], self.n)
def getMatchesNew(self, listOfTermsAndScores): ''' returns a list of top-n matching pairs (doc-id, score) ranked by descending scores ''' matches = dict() for query in listOfTermsAndScores: for t in query: if self.index.hasTerm(t[0]): idf = self.index.getIdf(t[0]) for entry in self.index.getEntries(t[0]): docId = entry["doc-id"] tf = entry["tf"] tf_idf = tf * idf matchScore = tf_idf * t[1] if docId in matches.keys(): matches[docId] += matchScore else: matches[docId] = matchScore return getTopMatches([(k, v) for k, v in matches.items()], self.n)