def test_load_dumped_corpus(mode: str, vectorized_corpus: VectorizedCorpus):
    """Round-trip a corpus through dump/load and exercise tag/options/remove helpers."""
    tag: str = str(uuid.uuid1())[:6]
    folder: str = jj(OUTPUT_FOLDER, tag)
    os.makedirs(folder, exist_ok=True)

    vectorized_corpus.dump(tag=tag, folder=folder, compressed=True, mode=mode)

    assert VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert VectorizedCorpus.find_tags(folder) == [tag]

    restored: VectorizedCorpus = VectorizedCorpus.load(tag=tag, folder=folder)

    assert (vectorized_corpus.term_frequency == restored.term_frequency).all()
    assert vectorized_corpus.document_index.to_dict() == restored.document_index.to_dict()
    assert vectorized_corpus.token2id == restored.token2id

    # Options start out empty, then reflect whatever was last dumped.
    assert VectorizedCorpus.load_options(tag=tag, folder=folder) == dict()
    VectorizedCorpus.dump_options(tag=tag, folder=folder, options=dict(apa=1))
    assert VectorizedCorpus.load_options(tag=tag, folder=folder) == dict(apa=1)

    # Removing the dump clears both existence and tag discovery.
    VectorizedCorpus.remove(tag=tag, folder=folder)
    assert not VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert not VectorizedCorpus.find_tags(folder)

    shutil.rmtree(folder)
def test_dump_and_store_of_corpus_with_empty_trailing_row() -> None:
    """Dump/load must preserve matrix shape even when the last document row is all zeros.

    Fixes in this revision:
    - return annotation changed from ``VectorizedCorpus`` to ``None`` — the test
      returns nothing, and pytest warns on test functions that return non-None;
    - the dump/load folder now uses ``OUTPUT_FOLDER`` consistently instead of a
      hard-coded ``"./tests/output"`` alongside ``os.makedirs(OUTPUT_FOLDER)``.
    """
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Third row is deliberately all zeros to probe sparse round-trip behavior.
    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [0, 0, 0, 0]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({'year': [2013, 2013, 2014]})

    corpus: VectorizedCorpus = VectorizedCorpus(
        bag_term_matrix, token2id=token2id, document_index=document_index
    )

    corpus.dump(tag="ZERO", folder=OUTPUT_FOLDER)
    loaded_corpus = VectorizedCorpus.load(tag="ZERO", folder=OUTPUT_FOLDER)

    # A trailing all-zero row must not be silently dropped by serialization.
    assert corpus.data.shape == loaded_corpus.data.shape
def test_load_of_uncompressed_corpus(text_corpus):
    """An uncompressed dump must load back equal to the original corpus."""
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Arrange: vectorize the fixture corpus and dump it without compression.
    corpus: VectorizedCorpus = CorpusVectorizer().fit_transform(text_corpus, already_tokenized=True)
    corpus.dump(tag='dump_test', folder=OUTPUT_FOLDER, compressed=False)

    # Act
    loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(tag='dump_test', folder=OUTPUT_FOLDER)

    # Assert: term frequencies, document index and vocabulary all survive the round trip.
    assert (corpus.term_frequency == loaded_corpus.term_frequency).all()
    assert corpus.document_index.to_dict() == loaded_corpus.document_index.to_dict()
    assert corpus.token2id == loaded_corpus.token2id
"""Ad-hoc script: load a dumped VectorizedCorpus and compile top-token trend data."""
import os

from penelope.common.curve_fit import pchip_spline
from penelope.common.keyness.metrics import KeynessMetric  # , rolling_average_smoother
from penelope.corpus import VectorizedCorpus
from penelope.notebook.word_trends.displayers import TopTokensDisplayer
from penelope.notebook.word_trends.interface import TrendsComputeOpts

# pylint: disable=protected-access

DEFAULT_SMOOTHERS = [pchip_spline]

folder = "/path/to/data"
# By convention the dump tag equals the folder's basename.
tag = os.path.split(folder)[1]

corpus: VectorizedCorpus = VectorizedCorpus.load(folder=folder, tag=tag)
compute_opts: TrendsComputeOpts = TrendsComputeOpts(
    normalize=False, keyness=KeynessMetric.TF, temporal_key='year'
)

top_tokens = corpus.get_top_n_words(n=100000)

displayer: TopTokensDisplayer = TopTokensDisplayer()
displayer.setup()

# get_top_n_words yields (token, token_id) pairs; keep the ids.
indices = [x[1] for x in top_tokens]
smooth = False

# BUG FIX: DEFAULT_SMOOTHERS is already a list of smoother callables, so the
# original `[DEFAULT_SMOOTHERS]` would have passed a nested list to _compile
# whenever smooth was True (masked here because smooth is False).
plot_data = displayer._compile(
    corpus=corpus,
    compute_opts=compute_opts,
    indices=indices,
    smoothers=DEFAULT_SMOOTHERS if smooth else [],
)