def test_load_dumped_corpus(mode: str, vectorized_corpus: VectorizedCorpus):
    """Round-trip a corpus through dump/load and exercise tag/options/remove helpers."""
    tag: str = str(uuid.uuid1())[:6]
    folder: str = jj(OUTPUT_FOLDER, tag)
    os.makedirs(folder, exist_ok=True)

    vectorized_corpus.dump(tag=tag, folder=folder, compressed=True, mode=mode)

    assert VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert VectorizedCorpus.find_tags(folder) == [tag]

    restored: VectorizedCorpus = VectorizedCorpus.load(tag=tag, folder=folder)

    assert (vectorized_corpus.term_frequency == restored.term_frequency).all()
    assert vectorized_corpus.document_index.to_dict() == restored.document_index.to_dict()
    assert vectorized_corpus.token2id == restored.token2id

    # Options start out empty, then reflect whatever was last dumped.
    assert VectorizedCorpus.load_options(tag=tag, folder=folder) == dict()
    VectorizedCorpus.dump_options(tag=tag, folder=folder, options=dict(apa=1))
    assert VectorizedCorpus.load_options(tag=tag, folder=folder) == dict(apa=1)

    # Removing the dump clears both existence and tag discovery.
    VectorizedCorpus.remove(tag=tag, folder=folder)
    assert not VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert not VectorizedCorpus.find_tags(folder)

    shutil.rmtree(folder)
def test_dump_and_store_of_corpus_with_empty_trailing_row() -> None:
    """Dump/load must preserve matrix shape even when the last document row is all zeros.

    Fixes in this revision:
    - return annotation changed from ``VectorizedCorpus`` to ``None`` — the test
      returns nothing, and pytest warns on test functions that return non-None;
    - the dump/load folder now uses ``OUTPUT_FOLDER`` consistently instead of a
      hard-coded ``"./tests/output"`` alongside ``os.makedirs(OUTPUT_FOLDER)``.
    """
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Third row is deliberately all zeros to probe sparse round-trip behavior.
    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [0, 0, 0, 0]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({'year': [2013, 2013, 2014]})

    corpus: VectorizedCorpus = VectorizedCorpus(
        bag_term_matrix, token2id=token2id, document_index=document_index
    )

    corpus.dump(tag="ZERO", folder=OUTPUT_FOLDER)
    loaded_corpus = VectorizedCorpus.load(tag="ZERO", folder=OUTPUT_FOLDER)

    # A trailing all-zero row must not be silently dropped by serialization.
    assert corpus.data.shape == loaded_corpus.data.shape
def test_load_of_uncompressed_corpus(text_corpus):
    """An uncompressed dump must load back equal to the original corpus."""
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Arrange: vectorize the fixture corpus and dump it without compression.
    corpus: VectorizedCorpus = CorpusVectorizer().fit_transform(text_corpus, already_tokenized=True)
    corpus.dump(tag='dump_test', folder=OUTPUT_FOLDER, compressed=False)

    # Act
    loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(tag='dump_test', folder=OUTPUT_FOLDER)

    # Assert: term frequencies, document index and vocabulary all survive the round trip.
    assert (corpus.term_frequency == loaded_corpus.term_frequency).all()
    assert corpus.document_index.to_dict() == loaded_corpus.document_index.to_dict()
    assert corpus.token2id == loaded_corpus.token2id
"""Ad-hoc script: load a dumped VectorizedCorpus and compile top-token trend data."""
import os

from penelope.common.curve_fit import pchip_spline
from penelope.common.keyness.metrics import KeynessMetric  # , rolling_average_smoother
from penelope.corpus import VectorizedCorpus
from penelope.notebook.word_trends.displayers import TopTokensDisplayer
from penelope.notebook.word_trends.interface import TrendsComputeOpts

# pylint: disable=protected-access

DEFAULT_SMOOTHERS = [pchip_spline]

folder = "/path/to/data"
# By convention the dump tag equals the folder's basename.
tag = os.path.split(folder)[1]

corpus: VectorizedCorpus = VectorizedCorpus.load(folder=folder, tag=tag)
compute_opts: TrendsComputeOpts = TrendsComputeOpts(
    normalize=False, keyness=KeynessMetric.TF, temporal_key='year'
)

top_tokens = corpus.get_top_n_words(n=100000)

displayer: TopTokensDisplayer = TopTokensDisplayer()
displayer.setup()

# get_top_n_words yields (token, token_id) pairs; keep the ids.
indices = [x[1] for x in top_tokens]
smooth = False

# BUG FIX: DEFAULT_SMOOTHERS is already a list of smoother callables, so the
# original `[DEFAULT_SMOOTHERS]` would have passed a nested list to _compile
# whenever smooth was True (masked here because smooth is False).
plot_data = displayer._compile(
    corpus=corpus,
    compute_opts=compute_opts,
    indices=indices,
    smoothers=DEFAULT_SMOOTHERS if smooth else [],
)