def rank(self, query):
    """Score and order the documents that share at least one term with *query*.

    The query is stemmed with ``StemmerHelper.stem_text`` and turned into a
    term -> (tf * idf) vector, using 0.0 as the idf of terms missing from
    ``self.__idfs``. Every document id listed in ``self.__inverted_file`` for
    any query term is scored with ``self.__similarity`` against its document
    vector.

    Side effects: prints one diagnostic line per query term, calls
    ``set_score`` on every matching document, and replaces
    ``self.__search_result``.

    Args:
        query: Raw query text (presumably a string; it is passed straight to
            ``StemmerHelper.stem_text``).

    Returns:
        The list stored in ``self.__search_result``: the matching entries of
        ``self.__documents`` ordered by descending score.
    """
    query = StemmerHelper.stem_text(query)
    term_freqs = self.__get_tf(query)

    # Weight each query term by tf * idf; unknown terms get an idf of 0.0.
    query_vector = {}
    for term, tf in term_freqs.items():
        idf = self.__idfs.get(term, 0.0)
        tfidf = tf * idf
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("{0} tf {1} : idf {2} : {3}".format(term, tf, idf, tfidf))
        query_vector[term] = tfidf

    # Candidate documents: union of the postings of every indexed query term.
    candidate_ids = set()
    for term in term_freqs:
        if term in self.__inverted_file:
            candidate_ids.update(self.__inverted_file[term])

    scored = [
        (doc_id,
         self.__similarity(query_vector, self.__get_document_vector(doc_id)))
        for doc_id in candidate_ids
    ]
    # Stable sort, highest score first.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    self.__search_result = []
    for doc_id, score in scored:
        document = self.__documents[doc_id]
        document.set_score(score)
        self.__search_result.append(document)
    return self.__search_result
def __init_keywords(self, keywords_filename):
    """Load stemmed keywords from a file into ``self.__keywords``.

    Each line of the file is passed to ``StemmerHelper.stem_text``; non-empty
    results are added to ``self.__keywords``, which must already exist. The
    file name is printed before loading.

    If the file cannot be opened or read, an error message is printed and
    the keywords collected so far are kept; no exception is propagated for
    ``IOError``.

    Args:
        keywords_filename: Path of the keywords file to read.
    """
    print(keywords_filename)
    # NOTE(review): the document list is reset here, as in the original code;
    # confirm that callers rely on this before moving it elsewhere.
    self.__documents = []
    try:
        # Opening inside the try block lets a missing file reach the handler
        # below; the with-statement closes the file on every exit path.
        with open(keywords_filename) as keywords_file:
            for line in keywords_file:
                stemmed_word = StemmerHelper.stem_text(line)
                if stemmed_word:
                    self.__keywords.add(stemmed_word)
    except IOError:
        print("Error: can't find keywords file or read data")
def __init__(self, title, text):
    """Store the title and text and initialise the derived fields.

    Args:
        title: Title kept on the instance unchanged.
        text: Original text; kept unchanged and also stored in the form
            returned by ``StemmerHelper.stem_text``.
    """
    # Values supplied by the caller.
    self.__title = title
    self.__text = text
    # Derived and default state: stemmed copy of the text, score of 0.0.
    self.__stemmed_text = StemmerHelper.stem_text(text)
    self.__score = 0.0