# Example 1
def test_load_dumped_corpus(mode: str, vectorized_corpus: VectorizedCorpus):
    """Round-trip a corpus through dump/load under a unique tag, then verify
    options handling and removal."""
    tag: str = str(uuid.uuid1())[:6]
    folder: str = jj(OUTPUT_FOLDER, tag)
    os.makedirs(folder, exist_ok=True)

    vectorized_corpus.dump(tag=tag, folder=folder, compressed=True, mode=mode)

    assert VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert VectorizedCorpus.find_tags(folder) == [tag]

    reloaded: VectorizedCorpus = VectorizedCorpus.load(tag=tag, folder=folder)

    assert (vectorized_corpus.term_frequency == reloaded.term_frequency).all()
    assert vectorized_corpus.document_index.to_dict() == reloaded.document_index.to_dict()
    assert vectorized_corpus.token2id == reloaded.token2id

    # No options have been dumped yet, so an empty dict comes back.
    assert VectorizedCorpus.load_options(tag=tag, folder=folder) == dict()

    # Dumped options round-trip intact.
    VectorizedCorpus.dump_options(tag=tag, folder=folder, options=dict(apa=1))
    assert VectorizedCorpus.load_options(tag=tag, folder=folder) == dict(apa=1)

    # Removal clears both the dump and its tag.
    VectorizedCorpus.remove(tag=tag, folder=folder)
    assert not VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert not VectorizedCorpus.find_tags(folder)

    shutil.rmtree(folder)
def test_dump_and_store_of_corpus_with_empty_trailing_row() -> None:
    """Dump/load must preserve the matrix shape even when the last document
    row is all zeros (an empty trailing document).

    FIX: the original signature was annotated ``-> VectorizedCorpus`` but the
    function returns nothing; pytest tests should return ``None``.
    """
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Last row is all zeros: a document with no tokens at all.
    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [0, 0, 0, 0]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({'year': [2013, 2013, 2014]})
    corpus: VectorizedCorpus = VectorizedCorpus(bag_term_matrix,
                                                token2id=token2id,
                                                document_index=document_index)

    corpus.dump(tag="ZERO", folder="./tests/output")

    loaded_corpus = VectorizedCorpus.load(tag="ZERO", folder="./tests/output")

    # The empty trailing row must not be silently dropped on load.
    assert corpus.data.shape == loaded_corpus.data.shape
def test_load_of_uncompressed_corpus(text_corpus):
    """An uncompressed dump must load back to an equivalent corpus."""
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    original: VectorizedCorpus = CorpusVectorizer().fit_transform(
        text_corpus, already_tokenized=True)
    original.dump(tag='dump_test', folder=OUTPUT_FOLDER, compressed=False)

    restored: VectorizedCorpus = VectorizedCorpus.load(
        tag='dump_test', folder=OUTPUT_FOLDER)

    # Term frequencies, document index and vocabulary all survive the round trip.
    assert (original.term_frequency == restored.term_frequency).all()
    assert original.document_index.to_dict() == restored.document_index.to_dict()
    assert original.token2id == restored.token2id
# Example 4
import os

from penelope.common.curve_fit import pchip_spline
from penelope.common.keyness.metrics import KeynessMetric  # , rolling_average_smoother
from penelope.corpus import VectorizedCorpus
from penelope.notebook.word_trends.displayers import TopTokensDisplayer
from penelope.notebook.word_trends.interface import TrendsComputeOpts

# pylint: disable=protected-access

DEFAULT_SMOOTHERS = [pchip_spline]

folder = "/path/to/data"
# By convention the dump's tag is the basename of its folder.
tag = os.path.split(folder)[1]

corpus: VectorizedCorpus = VectorizedCorpus.load(folder=folder, tag=tag)
compute_opts: TrendsComputeOpts = TrendsComputeOpts(normalize=False,
                                                    keyness=KeynessMetric.TF,
                                                    temporal_key='year')

top_tokens = corpus.get_top_n_words(n=100000)
displayer: TopTokensDisplayer = TopTokensDisplayer()
displayer.setup()

# Element [1] of each top-token tuple is used as an index into the corpus
# vocabulary — presumably the token id; confirm against get_top_n_words.
indices = [x[1] for x in top_tokens]
smooth = False
# FIX: DEFAULT_SMOOTHERS is already a list of smoother callables; the original
# `[DEFAULT_SMOOTHERS]` would have passed a nested list ([[pchip_spline]]) had
# `smooth` been True. With smooth=False the observable behavior is unchanged.
plot_data = displayer._compile(corpus=corpus,
                               compute_opts=compute_opts,
                               indices=indices,
                               smoothers=DEFAULT_SMOOTHERS if smooth else [])