Ejemplo n.º 1
0
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file.

    Every entry name returned by ``os.listdir(in_dir)`` is used as a
    document ID. Each document is tokenized with ``word_tokenize``; every
    token is stemmed with a ``PorterStemmer`` and lower-cased to form a term.

    Args:
        in_dir: Path of the directory containing the documents to index.
        out_dict: Passed to ``Dictionaries(...)``; presumably the path of
            the dictionary output file (confirm against ``Dictionaries``).
        out_postings: Passed to ``Postings(...)``; presumably the path of
            the postings output file (confirm against ``Postings``).

    Returns:
        None. Results are persisted through ``postings.save_to_file`` and
        ``dictionaries.save_to_file``.
    """
    print('Indexing...')

    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    # Identifier handed to the next previously unseen term; starts at 1 and
    # grows by one per new term.
    offset = 1

    for docID in os.listdir(in_dir):
        # Read the whole document inside a context manager so the file
        # handle is closed before moving on to the next document.
        with open(os.path.join(in_dir, docID), 'r') as f:
            content_tokens = word_tokenize(f.read())

        for word in content_tokens:
            term = stemmer.stem(word=word).lower()

            if dictionaries.has_term(term):
                # Known term: record this document under its existing slot.
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_to_offset(old_offset, docID)
            else:
                # New term: register it, create its postings slot, then
                # record this document as the first entry.
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_to_offset(offset, docID)
                offset += 1

            # Called once per token occurrence, for new and known terms alike.
            dictionaries.increment_frequency(term)

    # Postings are saved first and receive the dictionaries object;
    # NOTE(review): presumably so final on-disk offsets can be written back
    # before the dictionary itself is saved — confirm in Postings.save_to_file.
    postings.save_to_file(dictionaries)
    dictionaries.save_to_file()