import os
import shutil
from typing import Optional

# Helpers such as Lexicon, Documents, list_docs, read_docs_from_stdin,
# build_lexicon, get_word_counts, remove_if_exists, index_all_docs, and
# index_new_docs are assumed to be defined elsewhere in this package.


def main(out_dir: str, doc_dir: Optional[str], chunk_size: Optional[int] = None):
    assert chunk_size is None or chunk_size > 0

    # Load document names
    if doc_dir:
        docs_to_index = sorted(list_docs(doc_dir))
    else:
        docs_to_index = read_docs_from_stdin()

    os.makedirs(out_dir, exist_ok=True)

    # Load or build a lexicon
    lex_path = os.path.join(out_dir, 'lexicon.txt')
    if not os.path.exists(lex_path):
        lexicon = build_lexicon(docs_to_index, lex_path)
        assert os.path.exists(lex_path), 'Missing: {}'.format(lex_path)
    else:
        lexicon = Lexicon.load(lex_path)

    # Build and store the document list
    docs_path = os.path.join(out_dir, 'documents.txt')
    documents = Documents([
        Documents.Document(id=i, name=d.name)
        for i, d in enumerate(docs_to_index)
    ])
    print('Storing document list: {}'.format(docs_path))
    documents.store(docs_path)
    assert os.path.exists(docs_path), 'Missing: {}'.format(docs_path)

    # Build inverted index chunks and re-encode the documents
    index_path = os.path.join(out_dir, 'index.bin')
    data_dir = os.path.join(out_dir, 'data')
    remove_if_exists(index_path)
    remove_if_exists(data_dir)
    os.makedirs(data_dir)
    index_all_docs(docs_to_index, documents, lexicon, index_path, data_dir,
                   chunk_size)
    assert os.path.exists(index_path), 'Missing: {}'.format(index_path)
    print('Done!')
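
# Usage sketch for a fresh build (hypothetical paths; in the repository,
# `main` above would normally be wired to an argparse CLI entry point):
#
#     main(out_dir='index_out', doc_dir='transcripts', chunk_size=None)
#
# Afterwards, `index_out/` contains `lexicon.txt`, `documents.txt`,
# `index.bin`, and a `data/` directory holding the re-encoded documents.
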
def main(
    index_dir: str,
    new_doc_dir: Optional[str],
    chunk_size: Optional[int] = None,
    skip_existing_names: bool = False
):
    assert chunk_size is None or chunk_size > 0

    doc_path = os.path.join(index_dir, 'documents.txt')
    lex_path = os.path.join(index_dir, 'lexicon.txt')
    index_path = os.path.join(index_dir, 'index.bin')

    old_lexicon = Lexicon.load(lex_path)
    documents = Documents.load(doc_path)

    # Load the names of the documents to add
    if new_doc_dir:
        new_docs_to_index = list_docs(new_doc_dir)
    else:
        new_docs_to_index = read_docs_from_stdin()
    assert len(new_docs_to_index) > 0

    # Filter out documents whose names are already indexed
    tmp_new_docs_to_index = []
    for new_doc in new_docs_to_index:
        if new_doc.name in documents:
            if skip_existing_names:
                print('Skipping: {} is already indexed!'.format(new_doc.name))
            else:
                raise Exception(
                    '{} is already indexed! Aborting.'.format(new_doc.name))
        else:
            tmp_new_docs_to_index.append(new_doc)
    new_docs_to_index = tmp_new_docs_to_index
    if len(new_docs_to_index) == 0:
        print('No new documents to index.')
        return

    # Update the lexicon: bump counts for existing tokens, append new tokens
    new_word_counts = get_word_counts(new_docs_to_index)
    lexicon_words = [
        Lexicon.Word(w.id, w.token,
                     w.count + new_word_counts[w.token]
                     if w.token in new_word_counts else w.count)
        for w in old_lexicon
    ]
    for w in new_word_counts:
        if w not in old_lexicon:
            lexicon_words.append(
                Lexicon.Word(len(lexicon_words), w, new_word_counts[w]))
    lexicon = Lexicon(lexicon_words)

    # Assign ids to the new documents, continuing after the existing ones
    base_doc_id = len(documents)
    new_documents = [Documents.Document(id=i + base_doc_id, name=d.name)
                     for i, d in enumerate(new_docs_to_index)]

    # Convert the existing index.bin file to a directory if needed
    if os.path.isfile(index_path):
        tmp_index_path = index_path + '.tmp'
        shutil.move(index_path, tmp_index_path)
        os.makedirs(index_path)
        shutil.move(
            tmp_index_path,
            os.path.join(index_path,
                         '{:07d}-{:07d}.bin'.format(0, base_doc_id)))
    assert os.path.isdir(index_path)

    # Index the new documents
    index_new_docs(new_docs_to_index, new_documents, lexicon, index_path,
                   os.path.join(index_dir, 'data'), chunk_size)

    # Write out the new documents file, keeping the old one as a backup
    shutil.move(doc_path, doc_path + '.old')
    all_documents = list(documents)
    all_documents.extend(new_documents)
    Documents(all_documents).store(doc_path)

    # Update to the new lexicon
    lexicon.store(lex_path)
    print('Done!')
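
# Usage sketch for an incremental update (hypothetical paths): extends the
# index built above with the documents in `new_transcripts`, skipping any
# names that are already present.
#
#     main(index_dir='index_out', new_doc_dir='new_transcripts',
#          skip_existing_names=True)
#
# If the index was a single `index.bin` file, it is converted to a directory
# of chunk files named by document-id range (e.g. `0000000-{:07d}.bin` for
# the pre-existing documents), and the prior document list is kept as
# `documents.txt.old`.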