Ejemplo n.º 1
0
    def rank(self, query):
        """Rank the indexed documents against a free-text query.

        The query is stemmed with ``StemmerHelper.stem_text`` and each of its
        terms is weighted by ``tf * idf``. Every document id that the inverted
        file lists for at least one query term is scored with
        ``self.__similarity`` against that query vector. Each matching
        document receives its score through ``set_score`` and the documents
        are returned ordered from highest to lowest score.

        The returned list is also stored on the instance as the latest
        search result.

        :param query: raw query text.
        :return: list of matching documents, best score first.
        """
        query = StemmerHelper.stem_text(query)
        term_freqs = self.__get_tf(query)

        # Build the query vector; terms without a recorded idf weigh 0.0.
        query_vector = {}
        for term, tf in term_freqs.items():
            idf = self.__idfs.get(term, 0.0)
            tfidf = tf * idf
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("{0} tf {1} : idf {2} : {3}".format(term, tf, idf, tfidf))
            query_vector[term] = tfidf

        # Collect the ids of all documents containing at least one query term.
        # A new set is built on every step so the posting sets stored in the
        # inverted file are never mutated.
        candidate_ids = set()
        for term in term_freqs:
            if term in self.__inverted_file:
                candidate_ids = candidate_ids | self.__inverted_file[term]

        scored = [
            (doc_id,
             self.__similarity(query_vector,
                               self.__get_document_vector(doc_id)))
            for doc_id in candidate_ids
        ]
        # sorted() is stable, so documents with equal scores keep their
        # relative order.
        scored = sorted(scored, key=lambda pair: pair[1], reverse=True)

        self.__search_result = []
        for doc_id, score in scored:
            document = self.__documents[doc_id]
            document.set_score(score)
            self.__search_result.append(document)

        return self.__search_result
Ejemplo n.º 2
0
 def __init_keywords(self, keywords_filename):
     """Load keywords from a file, one per line, into ``self.__keywords``.

     Each line is passed through ``StemmerHelper.stem_text`` and the result
     is added to ``self.__keywords`` when it is non-empty. If the file
     cannot be opened or read, an error message is printed and loading
     stops; the ``IOError`` is not propagated.

     :param keywords_filename: path of the keywords file.
     """
     print(keywords_filename)
     # NOTE(review): this resets the document list but never creates
     # self.__keywords; presumably that collection is initialised elsewhere
     # before this method runs -- confirm against the constructor.
     self.__documents = []
     try:
         # open() lives inside the try so that a missing or unreadable file
         # reaches the handler below; the with-block closes the file on
         # every exit path.
         with open(keywords_filename) as keywords_file:
             for line in keywords_file:
                 stemmed_word = StemmerHelper.stem_text(line)
                 if len(stemmed_word) > 0:
                     self.__keywords.add(stemmed_word)
     except IOError:
         print("Error: can\'t find keywords file or read data")
Ejemplo n.º 3
0
 def __init__(self, title, text):
     """Initialise the instance from a title and its raw text.

     Stores the title and the text as given, the result of running the
     text through ``StemmerHelper.stem_text``, and a score that starts
     at 0.0.

     :param title: title to store.
     :param text: raw text to store and to stem.
     """
     self.__title, self.__text = title, text
     self.__score = 0.0
     self.__stemmed_text = StemmerHelper.stem_text(text)