def test_load_dumped_corpus(mode: str, vectorized_corpus: VectorizedCorpus):
    """Round-trip a corpus through dump/load and check that nothing is lost.

    The corpus is dumped (compressed, using the given ``mode``) into a fresh,
    uniquely tagged folder below ``OUTPUT_FOLDER``. The test then verifies that:

    * the dump is discoverable via ``dump_exists`` and ``find_tags``,
    * the loaded corpus has the same term frequencies, document index and
      ``token2id`` mapping as the original,
    * options are empty until ``dump_options`` is called and are returned
      unchanged by ``load_options`` afterwards,
    * ``remove`` deletes the dump so it is no longer found.

    Args:
        mode: Forwarded as ``mode`` to ``VectorizedCorpus.dump``.
        vectorized_corpus: The corpus under test.
    """
    tag: str = str(uuid.uuid1())[:6]
    folder: str = jj(OUTPUT_FOLDER, tag)
    os.makedirs(folder, exist_ok=True)

    # Clean up in `finally` so a failing assertion does not leave the
    # per-run folder behind in OUTPUT_FOLDER.
    try:
        vectorized_corpus.dump(tag=tag, folder=folder, compressed=True, mode=mode)

        assert VectorizedCorpus.dump_exists(tag=tag, folder=folder)
        assert VectorizedCorpus.find_tags(folder) == [tag]

        loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(tag=tag, folder=folder)

        assert (vectorized_corpus.term_frequency == loaded_corpus.term_frequency).all()
        assert vectorized_corpus.document_index.to_dict() == loaded_corpus.document_index.to_dict()
        assert vectorized_corpus.token2id == loaded_corpus.token2id

        # No options have been stored yet for this tag.
        initial_options: dict = VectorizedCorpus.load_options(tag=tag, folder=folder)
        assert initial_options == {}

        VectorizedCorpus.dump_options(tag=tag, folder=folder, options={"apa": 1})
        stored_options: dict = VectorizedCorpus.load_options(tag=tag, folder=folder)
        assert stored_options == {"apa": 1}

        VectorizedCorpus.remove(tag=tag, folder=folder)

        assert not VectorizedCorpus.dump_exists(tag=tag, folder=folder)
        assert not VectorizedCorpus.find_tags(folder)
    finally:
        # ignore_errors: a cleanup problem must not mask the real test failure.
        shutil.rmtree(folder, ignore_errors=True)
def store_corpus_bundle(corpus: VectorizedCorpus, args: interface.ComputeOpts):
    """Persist ``corpus`` and its compute options under ``args.corpus_tag``.

    Any existing dump with the same tag in ``args.target_folder`` is removed
    first. When ``args.create_subfolder`` is set, the corpus is written to a
    subfolder named after the tag (unless the target folder already has that
    name), and that folder is created if missing. ``args.props`` is stored
    next to the corpus via ``VectorizedCorpus.dump_options``.

    Args:
        corpus: The corpus to dump.
        args: Supplies ``corpus_tag``, ``target_folder``, ``create_subfolder``
            and ``props``.
    """
    tag = args.corpus_tag

    # NOTE(review): the stale dump is looked up in args.target_folder, not in
    # the resolved subfolder computed below - confirm this is intended.
    stale_dump_present = VectorizedCorpus.dump_exists(tag=tag, folder=args.target_folder)
    if stale_dump_present:
        VectorizedCorpus.remove(tag=tag, folder=args.target_folder)

    destination = args.target_folder
    if args.create_subfolder:
        already_in_tag_folder = os.path.basename(destination) == tag
        if not already_in_tag_folder:
            destination = os.path.join(destination, tag)
        os.makedirs(destination, exist_ok=True)

    corpus.dump(tag=tag, folder=destination)
    VectorizedCorpus.dump_options(tag=tag, folder=destination, options=args.props)