class SearchEngine(object):
    """Rank indexed documents against a query by summed TF-IDF weight.

    ``documents`` is indexed by document id; the value stored there is what
    a search reports for a hit.  ``dictionary`` maps a normalized word to its
    postings: a list holding one document id per occurrence of the word, so
    the multiplicity of an id is that word's term frequency in the document.
    """

    # Number of best-scoring documents reported by a search.
    _TOP_SIZE = 10

    def __init__(self, mystem_path, documents, dictionary):
        """Store the index and create the lemmer used to normalize queries.

        Args:
            mystem_path: passed straight to ``Lemmer``.
            documents: sequence of documents, indexed by document id.
            dictionary: mapping of normalized word -> postings list.
        """
        self._documents = documents
        self._N = len(documents)
        self._dictionary = dictionary
        self._lemmer = Lemmer(mystem_path)

    def _get_df(self, postings):
        """Return the document frequency: the number of distinct ids."""
        return len(set(postings))

    def _get_top(self, scores):
        """Return up to ``_TOP_SIZE`` ``(document, score)`` pairs.

        The pairs are ordered by ascending score, i.e. the best match is the
        last element of the returned list.
        """
        ranked = sorted(scores.items(), key=lambda item: item[1])
        return [(self._documents[document_id], score)
                for document_id, score in ranked[-self._TOP_SIZE:]]

    def search(self, *args):
        """Score every document against the query words and print the top.

        Each argument is normalized with the lemmer.  A document's score is
        the sum, over the query words (repeats included), of ``tf * idf``
        with ``idf = log(N / df)``.  Words that are absent from the index, or
        whose postings are empty, contribute nothing instead of raising.

        The result of ``_get_top`` is pretty-printed; nothing is returned.
        """
        query = [self._lemmer.translate(word) for word in args]

        # word -> {document_id: tf * idf}; computed once per distinct word.
        weights = {}
        for word in set(query):
            postings = self._dictionary.get(word)
            if not postings:
                # Unknown word (None) or empty postings: df would be 0.
                weights[word] = {}
                continue
            # Count occurrences in one pass instead of calling
            # postings.count() once per document.
            term_counts = defaultdict(int)
            for document_id in postings:
                term_counts[document_id] += 1
            idf = math.log(self._N / float(self._get_df(postings)))
            weights[word] = dict(
                (document_id, idf * tf)
                for document_id, tf in term_counts.items())

        # Every document gets a score so that non-matching documents can
        # still fill the result when fewer than _TOP_SIZE documents match.
        scores = {}
        for document_id in range(self._N):
            score = 0
            for word in query:
                score += weights[word].get(document_id, 0.0)
            scores[document_id] = score
        pprint.pprint(self._get_top(scores))
if options.collection_dir is None: parser.error('Collection directory option is required!') if options.index_path is None: parser.error('Index file is not specified!') def walk(path): for root_dir, dirs, files in os.walk(path): for file in files: yield os.path.join(root_dir, file) dictionary = defaultdict(list) documents = [] lemmer = Lemmer(options.mystem_path) scanner = Scanner() for (document_id, document_path) in enumerate(walk(options.collection_dir)): with codecs.open(document_path, 'r', 'cp1251') as f: words = scanner.scan(f.read()) for word in words: if word: stem = lemmer.translate(word) dictionary[stem].append(document_id) documents.append(document_path) print '.', items = dictionary.items() items.sort(key=lambda (stem, postings): len(documents))
def __init__(self, mystem_path, documents, dictionary):
    """Keep the index structures and create the query lemmer.

    Args:
        mystem_path: handed unchanged to ``Lemmer``.
        documents: collection of documents; its length is cached as ``_N``.
        dictionary: index object stored as ``_dictionary``.
    """
    # Index data first, then the size derived from it.
    self._dictionary = dictionary
    self._documents = documents
    self._N = len(documents)
    # The lemmer is created last, once the plain attributes are in place.
    self._lemmer = Lemmer(mystem_path)