Beispiel #1
0
def main(out_dir: str,
         doc_dir: Optional[str],
         chunk_size: Optional[int] = None):
    """Build a fresh inverted index from a set of documents.

    Args:
        out_dir: Directory where the index artifacts (lexicon.txt,
            documents.txt, index.bin, data/) are written. Created if absent.
        doc_dir: Directory to scan for documents; if falsy, document names
            are read from stdin instead.
        chunk_size: Optional positive chunk size passed through to
            index_all_docs; None means no chunking.
    """
    assert chunk_size is None or chunk_size > 0

    # Load document names (sorted for a deterministic document-id order)
    if doc_dir:
        # sorted() already returns a list; no extra list() needed
        docs_to_index = sorted(list_docs(doc_dir))
    else:
        docs_to_index = read_docs_from_stdin()

    os.makedirs(out_dir, exist_ok=True)

    # Load or build a lexicon
    lex_path = os.path.join(out_dir, 'lexicon.txt')
    if not os.path.exists(lex_path):
        # build_lexicon is expected to persist the lexicon to lex_path
        lexicon = build_lexicon(docs_to_index, lex_path)
        assert os.path.exists(lex_path), 'Missing: {}'.format(lex_path)
    else:
        lexicon = Lexicon.load(lex_path)

    # Build and store the document list; ids are assigned by position
    docs_path = os.path.join(out_dir, 'documents.txt')
    documents = Documents([
        Documents.Document(id=i, name=d.name)
        for i, d in enumerate(docs_to_index)
    ])
    print('Storing document list: {}'.format(docs_path))
    documents.store(docs_path)
    assert os.path.exists(docs_path), 'Missing: {}'.format(docs_path)

    # Build inverted index chunks and reencode the documents.
    # Any stale index/data from a previous run is removed first.
    index_path = os.path.join(out_dir, 'index.bin')
    data_dir = os.path.join(out_dir, 'data')
    remove_if_exists(index_path)
    remove_if_exists(data_dir)

    os.makedirs(data_dir)
    index_all_docs(docs_to_index, documents, lexicon, index_path, data_dir,
                   chunk_size)

    assert os.path.exists(index_path), 'Missing: {}'.format(index_path)
    print('Done!')
Beispiel #2
0
def main(
        index_dir: str,
        new_doc_dir: Optional[str],
        chunk_size: Optional[int] = None,
        skip_existing_names: bool = False
):
    """Incrementally add new documents to an existing index.

    Args:
        index_dir: Directory holding an existing index (documents.txt,
            lexicon.txt, index.bin, data/).
        new_doc_dir: Directory to scan for new documents; if falsy, document
            names are read from stdin instead.
        chunk_size: Optional positive chunk size passed to index_new_docs;
            None means no chunking.
        skip_existing_names: If True, silently skip documents whose names are
            already indexed; if False, raise on the first duplicate.

    Raises:
        Exception: If a document name is already indexed and
            skip_existing_names is False.
    """
    assert chunk_size is None or chunk_size > 0
    doc_path = os.path.join(index_dir, 'documents.txt')
    lex_path = os.path.join(index_dir, 'lexicon.txt')
    index_path = os.path.join(index_dir, 'index.bin')

    old_lexicon = Lexicon.load(lex_path)

    documents = Documents.load(doc_path)

    # Collect candidate documents to index
    if new_doc_dir:
        new_docs_to_index = list_docs(new_doc_dir)
    else:
        new_docs_to_index = read_docs_from_stdin()

    assert len(new_docs_to_index) > 0
    # Filter out documents whose names are already present in the index;
    # behavior on a duplicate depends on skip_existing_names.
    tmp_new_docs_to_index = []
    for new_doc in new_docs_to_index:
        if new_doc.name in documents:
            if skip_existing_names:
                print('Skipping: {} is already indexed!'.format(new_doc.name))
            else:
                raise Exception(
                    '{} is already indexed! Aborting.'.format(new_doc.name))
        else:
            tmp_new_docs_to_index.append(new_doc)
    new_docs_to_index = tmp_new_docs_to_index
    if len(new_docs_to_index) == 0:
        print('No new documents to index.')
        return

    # Update lexicon: bump counts for known tokens, then append brand-new
    # tokens with ids continuing after the existing words.
    new_word_counts = get_word_counts(new_docs_to_index)
    lexicon_words = [
        Lexicon.Word(w.id, w.token, w.count + new_word_counts[w.token]
                     if w.token in new_word_counts else w.count)
        for w in old_lexicon
    ]
    for w in new_word_counts:
        if w not in old_lexicon:
            lexicon_words.append(
                Lexicon.Word(len(lexicon_words), w, new_word_counts[w]))
    lexicon = Lexicon(lexicon_words)

    # New document ids continue after the existing ones
    base_doc_id = len(documents)
    new_documents = [Documents.Document(id=i + base_doc_id, name=d.name)
                     for i, d in enumerate(new_docs_to_index)]

    # Convert existing index.bin to a directory if needed: move the single
    # file aside, recreate index.bin as a directory, and place the old file
    # inside it named by its document-id range [0, base_doc_id).
    if os.path.isfile(index_path):
        tmp_index_path = index_path + '.tmp'
        shutil.move(index_path, tmp_index_path)
        os.makedirs(index_path)
        shutil.move(
            tmp_index_path,
            os.path.join(index_path, '{:07d}-{:07d}.bin'.format(
                0, base_doc_id)))
    assert os.path.isdir(index_path)

    # Index the new documents
    index_new_docs(new_docs_to_index, new_documents, lexicon, index_path,
                   os.path.join(index_dir, 'data'), chunk_size)

    # Write out the new documents file (old one is kept as a .old backup)
    shutil.move(doc_path, doc_path + '.old')
    all_documents = list(documents)
    all_documents.extend(new_documents)
    Documents(all_documents).store(doc_path)

    # Update to the new lexicon
    lexicon.store(lex_path)

    print('Done!')