예제 #1
0
def build_lexicon(
    docs_to_index: List[DocumentToIndex],
    lex_path: str,
) -> Lexicon:
    """Build a lexicon from the given documents and store it at lex_path.

    Word counts come from get_word_counts(docs_to_index). Tokens are sorted
    and each one receives its position in that ordering as its id.

    Args:
        docs_to_index: documents whose words are counted.
        lex_path: destination passed to Lexicon.store().

    Returns:
        The newly built Lexicon (already stored at lex_path).
    """
    print('Building lexicon: {}'.format(lex_path))
    counts_by_token = get_word_counts(docs_to_index)

    # Ids follow the sorted token order, starting at 0.
    ordered_tokens = sorted(counts_by_token.keys())
    entries = []
    for word_id, token in enumerate(ordered_tokens):
        entries.append(Lexicon.Word(word_id, token, counts_by_token[token]))
    result = Lexicon(entries)

    print('Storing lexicon: {}'.format(lex_path))
    result.store(lex_path)
    return result
예제 #2
0
def main(
        index_dir: str,
        new_doc_dir: Optional[str],
        chunk_size: Optional[int] = None,
        skip_existing_names: bool = False
):
    """Add new documents to the existing index stored under index_dir.

    Steps, in order:
      1. Load the lexicon ('lexicon.txt') and document list
         ('documents.txt') from index_dir.
      2. Collect candidate documents from new_doc_dir, or from stdin when
         new_doc_dir is falsy.
      3. Drop (or reject) candidates whose name is already in the document
         list.
      4. Merge the new word counts into the lexicon; unseen tokens are
         appended with ids continuing after the existing entries.
      5. If 'index.bin' is a regular file, turn it into a directory that
         holds the old file as its first chunk.
      6. Index the new documents, then write the updated document list
         (keeping the previous one as '.old') and the updated lexicon.

    Args:
        index_dir: directory containing documents.txt, lexicon.txt and
            index.bin.
        new_doc_dir: directory passed to list_docs(); when falsy, documents
            are obtained from read_docs_from_stdin() instead.
        chunk_size: forwarded to index_new_docs(); must be None or positive.
        skip_existing_names: when True, already-indexed names are skipped
            with a message; when False, the first such name raises.

    Raises:
        AssertionError: if chunk_size is not None and not positive, if no
            candidate documents were supplied, or if index.bin is not a
            directory after the conversion step.
        Exception: if a candidate name is already indexed and
            skip_existing_names is False.
    """
    assert chunk_size is None or chunk_size > 0
    doc_path = os.path.join(index_dir, 'documents.txt')
    lex_path = os.path.join(index_dir, 'lexicon.txt')
    index_path = os.path.join(index_dir, 'index.bin')

    old_lexicon = Lexicon.load(lex_path)
    documents = Documents.load(doc_path)

    candidates = (
        list_docs(new_doc_dir) if new_doc_dir else read_docs_from_stdin()
    )
    assert len(candidates) > 0

    # Keep only candidates whose name is not yet in the document list.
    new_docs_to_index = []
    for candidate in candidates:
        if candidate.name not in documents:
            new_docs_to_index.append(candidate)
            continue
        if not skip_existing_names:
            raise Exception(
                '{} is already indexed! Aborting.'.format(candidate.name))
        print('Skipping: {} is already indexed!'.format(candidate.name))

    if not new_docs_to_index:
        print('No new documents to index.')
        return

    # Update lexicon: existing entries keep their ids and gain the new
    # counts; tokens not present before are appended afterwards.
    new_word_counts = get_word_counts(new_docs_to_index)
    merged_words = []
    for entry in old_lexicon:
        if entry.token in new_word_counts:
            total = entry.count + new_word_counts[entry.token]
        else:
            total = entry.count
        merged_words.append(Lexicon.Word(entry.id, entry.token, total))
    for token in new_word_counts:
        if token in old_lexicon:
            continue
        # The next id is the current number of entries.
        next_id = len(merged_words)
        merged_words.append(
            Lexicon.Word(next_id, token, new_word_counts[token]))
    lexicon = Lexicon(merged_words)

    # New document ids continue after the existing documents.
    base_doc_id = len(documents)
    new_documents = []
    for doc_id, doc in enumerate(new_docs_to_index, start=base_doc_id):
        new_documents.append(Documents.Document(id=doc_id, name=doc.name))

    # Convert existing index.bin to a directory if needed: the old file is
    # moved aside, the directory is created in its place, and the old file
    # becomes the chunk named after the document id range 0..base_doc_id.
    if os.path.isfile(index_path):
        first_chunk_name = '{:07d}-{:07d}.bin'.format(0, base_doc_id)
        staging_path = index_path + '.tmp'
        shutil.move(index_path, staging_path)
        os.makedirs(index_path)
        shutil.move(staging_path, os.path.join(index_path, first_chunk_name))
    assert os.path.isdir(index_path)

    # Index the new documents
    data_dir = os.path.join(index_dir, 'data')
    index_new_docs(new_docs_to_index, new_documents, lexicon, index_path,
                   data_dir, chunk_size)

    # Write out the new documents file, keeping the previous one as '.old'
    shutil.move(doc_path, doc_path + '.old')
    combined_documents = list(documents) + new_documents
    Documents(combined_documents).store(doc_path)

    # Update to the new lexicon
    lexicon.store(lex_path)

    print('Done!')