class SearchEngine(object):
    """Rank the documents of a collection against a query by summed tf-idf.

    The engine works on an inverted index (``dictionary``) that maps a word to
    its posting list: a sequence of document ids with one entry per
    occurrence, so an id repeats as many times as the word occurs in that
    document.
    """

    # How many of the best-scoring documents ``_get_top`` reports.
    _TOP_COUNT = 10

    def __init__(self, mystem_path, documents, dictionary):
        """Remember the collection and index and create the lemmatizer.

        :param mystem_path: value handed unchanged to ``Lemmer``.
        :param documents: sequence indexed by document id; each element is
            what gets reported for a matching document (used as ``path``).
        :param dictionary: mapping ``word -> posting list`` (see class doc).
        """
        self._documents = documents
        self._N = len(documents)
        self._dictionary = dictionary
        self._lemmer = Lemmer(mystem_path)

    def _get_df(self, postings):
        """Return the document frequency: number of distinct ids in *postings*."""
        return len(set(postings))

    def _get_top(self, scores):
        """Return ``(path, score)`` pairs for the best-scoring documents.

        :param scores: mapping ``document_id -> score``.
        :returns: at most ``_TOP_COUNT`` pairs in ascending score order, i.e.
            the best match is the last element.
        """
        # ``item`` is a ``(document_id, score)`` pair; indexing instead of a
        # tuple-parameter lambda keeps this valid on both Python 2 and 3.
        ranked = sorted(scores.items(), key=lambda item: item[1])
        return [(self._documents[document_id], score)
                for (document_id, score) in ranked[-self._TOP_COUNT:]]

    def _score_documents(self, query):
        """Return ``{document_id: score}`` for every document id in ``range(N)``.

        A document's score is the sum, over the words of *query* (repeated
        words count repeatedly), of ``tf * idf`` where ``tf`` is the number of
        occurrences of the word in the document and
        ``idf = log(N / df)``.

        :param query: list of already lemmatized words.
        """
        scores = dict.fromkeys(range(self._N), 0.0)

        for word in query:
            postings = self._dictionary.get(word)
            if not postings:
                # Word is absent from the index (``get`` returned None) or has
                # no occurrences: it cannot contribute to any score, and
                # computing idf for it would fail.
                continue

            # One pass over the posting list instead of ``list.count`` per
            # document.
            term_frequency = defaultdict(int)
            for document_id in postings:
                term_frequency[document_id] += 1

            idf = math.log(self._N / float(self._get_df(postings)))
            for (document_id, tf) in term_frequency.items():
                # Only ids inside the collection are scored; anything else in
                # a posting list is ignored.
                if document_id in scores:
                    scores[document_id] += idf * tf

        return scores

    def search(self, *args):
        """Lemmatize the query words and pretty-print the top matches.

        :param args: raw query words; each one is passed through
            ``Lemmer.translate`` before lookup.
        :returns: None. The ``(path, score)`` list from ``_get_top`` is
            printed with ``pprint``.
        """
        query = [self._lemmer.translate(word) for word in args]
        pprint.pprint(self._get_top(self._score_documents(query)))
# Both command-line options are mandatory; a missing one is reported through
# the option parser.
# NOTE(review): `parser.error` presumably prints the message and exits
# (optparse/argparse convention) -- confirm where `parser` is created.
if options.collection_dir is None:
    parser.error('Collection directory option is required!')

# The index file path has no fallback either.
if options.index_path is None:
    parser.error('Index file is not specified!')


def walk(path):
    """Lazily yield the path of every file found under *path*.

    The tree is traversed with ``os.walk``; each yielded value is the
    containing directory joined with the file name. Directories themselves
    are not yielded.
    """
    for root_dir, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            yield os.path.join(root_dir, filename)

dictionary = defaultdict(list)
documents = []

lemmer = Lemmer(options.mystem_path)
scanner = Scanner()

for (document_id, document_path) in enumerate(walk(options.collection_dir)):
    with codecs.open(document_path, 'r', 'cp1251') as f:
        words = scanner.scan(f.read())
        for word in words:
            if word:
                stem = lemmer.translate(word)
                dictionary[stem].append(document_id)
        documents.append(document_path)
        print '.',

items = dictionary.items()
items.sort(key=lambda (stem, postings): len(documents))
 def __init__(self, mystem_path, documents, dictionary):
     """Store the collection and the index, then build the lemmatizer.

     ``documents`` and ``dictionary`` are kept by reference, the length of
     ``documents`` is cached as ``_N``, and ``mystem_path`` is passed
     unchanged to ``Lemmer``.
     """
     self._documents, self._dictionary = documents, dictionary
     self._N = len(self._documents)
     self._lemmer = Lemmer(mystem_path)