コード例 #1
0
def test_load_dumped_corpus(mode: str, vectorized_corpus: VectorizedCorpus) -> None:
    """Round-trip a corpus through dump/load and exercise options and removal.

    The corpus is dumped (compressed, using ``mode``) into a freshly created,
    uniquely tagged folder under ``OUTPUT_FOLDER``. The test then checks that:

    * the dump is discoverable via ``dump_exists`` and ``find_tags``;
    * the loaded corpus has the same term frequencies, document index and
      ``token2id`` mapping as the original;
    * ``load_options`` yields an empty dict before any options are dumped and
      the stored dict after ``dump_options``;
    * ``remove`` makes the dump undiscoverable again.

    The scratch folder is deleted in a ``finally`` clause so that a failing
    assertion does not leave test artefacts behind.
    """
    # Short unique tag so each run works in its own folder.
    tag: str = str(uuid.uuid1())[:6]
    folder: str = jj(OUTPUT_FOLDER, tag)

    os.makedirs(folder, exist_ok=True)

    try:
        vectorized_corpus.dump(tag=tag, folder=folder, compressed=True, mode=mode)

        assert VectorizedCorpus.dump_exists(tag=tag, folder=folder)
        assert VectorizedCorpus.find_tags(folder) == [tag]

        loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(tag=tag, folder=folder)
        assert (vectorized_corpus.term_frequency == loaded_corpus.term_frequency).all()
        assert vectorized_corpus.document_index.to_dict() == loaded_corpus.document_index.to_dict()
        assert vectorized_corpus.token2id == loaded_corpus.token2id

        # No options have been dumped yet, so loading must give an empty dict.
        initial_options: dict = VectorizedCorpus.load_options(tag=tag, folder=folder)
        assert initial_options == {}

        VectorizedCorpus.dump_options(tag=tag, folder=folder, options=dict(apa=1))
        stored_options: dict = VectorizedCorpus.load_options(tag=tag, folder=folder)
        assert stored_options == dict(apa=1)

        VectorizedCorpus.remove(tag=tag, folder=folder)
        assert not VectorizedCorpus.dump_exists(tag=tag, folder=folder)
        assert not VectorizedCorpus.find_tags(folder)
    finally:
        # Always clean up the scratch folder, even when an assertion fails.
        shutil.rmtree(folder)
コード例 #2
0
ファイル: dtm.py プロジェクト: humlab/penelope
def store_corpus_bundle(corpus: VectorizedCorpus, args: interface.ComputeOpts) -> None:
    """Dump ``corpus`` and its options under ``args.corpus_tag``.

    Any existing dump with the same tag is removed first. When
    ``args.create_subfolder`` is set, the dump is written to a subfolder named
    after the tag (unless ``args.target_folder`` already ends with that name),
    and the folder is created if missing.

    Args:
        corpus: The corpus to persist via its ``dump`` method.
        args: Supplies ``corpus_tag``, ``target_folder``, ``create_subfolder``
            and ``props`` (the options stored alongside the dump).
    """
    tag = args.corpus_tag

    if VectorizedCorpus.dump_exists(tag=tag, folder=args.target_folder):
        VectorizedCorpus.remove(tag=tag, folder=args.target_folder)

    target_folder = args.target_folder

    if args.create_subfolder:
        # Avoid nesting "<tag>/<tag>" when the target already is the tag folder.
        if os.path.split(target_folder)[1] != tag:
            target_folder = os.path.join(target_folder, tag)
        os.makedirs(target_folder, exist_ok=True)

    # The dump is written to the resolved folder, which may differ from
    # args.target_folder; clear a stale dump there as well so files from an
    # earlier run do not linger next to the new ones.
    if target_folder != args.target_folder and VectorizedCorpus.dump_exists(
        tag=tag, folder=target_folder
    ):
        VectorizedCorpus.remove(tag=tag, folder=target_folder)

    corpus.dump(tag=tag, folder=target_folder)

    VectorizedCorpus.dump_options(
        tag=tag,
        folder=target_folder,
        options=args.props,
    )