Python examples for tmtoolkit.corpus.Corpus

Example #1

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_from_tabular():
    """Load the 100-article table as CSV and XLSX, then add it a second time.

    The second load is labelled ``added-<id>`` and is passed
    ``prepend_columns=['title', 'subtitle']``; each such document is expected
    to be at least as long as its ``100NewsArticles-<id>`` counterpart.
    """
    for file_ext in ('csv', 'xlsx'):
        table_path = 'tests/data/100NewsArticles.' + file_ext

        corp = Corpus.from_tabular(table_path, 'article_id', 'text')
        assert len(corp.docs) == 100
        assert all(label.startswith('100NewsArticles')
                   for label in corp.doc_labels)

        corp.add_tabular(table_path, 'article_id', 'text',
                         prepend_columns=['title', 'subtitle'],
                         doc_label_fmt='added-{id}')
        assert len(corp.docs) == 200

        lengths = corp.doc_lengths
        added_lengths = {label: n_chars for label, n_chars in lengths.items()
                         if label.startswith('added')}
        for label, n_chars in added_lengths.items():
            _, doc_id = label.split('-')
            assert n_chars >= lengths['100NewsArticles-' + doc_id]

        assert len(added_lengths) == 100

    # columns may also be addressed by position instead of by name
    speeches = Corpus.from_tabular('tests/data/bt18_speeches_sample.csv', 0, 2)
    assert len(speeches) == 1000

Example #2

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_replace_characters_simple():
    """Apply a sequence of translation tables and check the docs after each.

    The tables are applied cumulatively to the same corpus, so every expected
    result builds on the previous step.
    """
    corp = Corpus({'doc1': 'ABC', 'doc2': 'abcDeF'})

    # (translation table, expected documents after applying it)
    steps = (
        ({'a': None, 'C': 'c', 'e': ord('X')},
         {'doc1': 'ABc', 'doc2': 'bcDXF'}),
        ({ord('A'): None},
         {'doc1': 'Bc', 'doc2': 'bcDXF'}),
        (str.maketrans('DXFY', '1234'),
         {'doc1': 'Bc', 'doc2': 'bc123'}),
        ({},  # an empty table must leave everything untouched
         {'doc1': 'Bc', 'doc2': 'bc123'}),
    )

    for table, expected_docs in steps:
        corp.replace_characters(table)
        assert corp.docs == expected_docs

Example #3

0

Show file

File: test_corpus.py Project: yushu-liu/tmtoolkit

def test_corpus_builtin_corpora():
    """Expect exactly two built-in corpora, each loading as non-empty."""
    corpus_names = Corpus.builtin_corpora()
    assert len(corpus_names) == 2

    for name in corpus_names:
        loaded = Corpus.from_builtin_corpus(name)
        assert len(loaded) > 0

Example #4

0

Show file

def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
    """Evaluate topic models for several topic counts and save the plot.

    Builds a corpus from ``file_name``, takes the document-term matrix from
    ``TMPreproc``, evaluates one model per candidate ``n_topics`` value and
    writes the resulting evaluation plot to the ``out/`` directory.

    Args:
        file_name: Passed to ``Corpus.add_files`` (read with ``utf8``
            encoding).
        date: Only used as part of the output file name.
        n_iter: Passed as the constant ``n_iter`` model parameter; also part
            of the output file name.
        scope: Only used as part of the output file name.
        lang: Only used as part of the output file name.
        n_eval: Step between candidate topic counts; candidates are
            ``range(5, int(n_eval * 10), n_eval)``.

    Returns:
        None. The plot is saved as
        ``out/evaluate_model_<date>_<n_iter>iter_<n_eval>eval_<scope>_<lang>.png``.
    """
    # Local import: the file's import block is not visible from this block.
    import os

    corpus = Corpus()
    corpus.add_files(file_name, encoding='utf8')

    preproc = TMPreproc(corpus)
    dtm_bg = preproc.dtm

    # One parameter set per candidate number of topics.
    var_params = [{'n_topics': k} for k in range(5, int(n_eval * 10), n_eval)]

    const_params = {
        'n_iter': n_iter,
        'random_state': 20200713,  # to make results reproducible
    }
    eval_results = evaluate_topic_models(
        dtm_bg,
        varying_parameters=var_params,
        constant_parameters=const_params,
        metric=['loglikelihood', 'cao_juan_2009', 'arun_2010'],
    )

    eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')

    name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(date, n_iter, n_eval, scope, lang)
    plot_eval_results(eval_results_by_topics, figsize=(8, 6), metric_direction_font_size='x-small', title_fontsize='small', axes_title_fontsize='x-small')
    plt.tight_layout()

    # savefig does not create missing directories; make sure 'out/' exists.
    os.makedirs('out', exist_ok=True)
    plt.savefig('out/' + name)
    return

Example #5

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_builtin_corpora():
    """Built-in corpus names must match the load-kwargs registry keys.

    Every listed corpus must also load as a non-empty ``Corpus``.
    """
    corpus_names = Corpus.builtin_corpora()
    registered_names = Corpus._BUILTIN_CORPORA_LOAD_KWARGS.keys()
    assert sorted(corpus_names) == sorted(registered_names)

    for name in corpus_names:
        loaded = Corpus.from_builtin_corpus(name)
        assert len(loaded) > 0

Example #6

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_from_pickle():
    """Round-trip a small corpus through a temporary pickle file."""
    original = Corpus({'a': '1', 'b': '22', 'c': '333'})

    with tempfile.TemporaryFile(suffix='.pickle') as handle:
        original.to_pickle(handle)
        handle.seek(0)  # rewind so reading starts at the beginning
        restored = Corpus.from_pickle(handle)

    assert original.docs == restored.docs

Example #7

0

Show file

def test_corpus_pass_tmpreproc():
    """Feed a three-document corpus to ``TMPreproc`` and check its tokens."""
    sample_texts = {
        'doc1': 'A simple example in simple English.',
        'doc2': 'It contains only three very simple documents.',
        'doc3': 'Simply written documents are very brief.',
    }

    corp = Corpus()
    for label, text in sample_texts.items():
        corp[label] = text

    tokens = TMPreproc(corp).tokenize().tokens
    assert set(tokens.keys()) == set(corp.keys())
    assert len(tokens['doc1']) == 7

Example #8

0

Show file

def test_corpus_from_folder_valid_ext():
    """Check the document count for several ``valid_extensions`` values."""
    # (valid_extensions argument, expected number of loaded documents)
    cases = (
        ('txt', 3),
        ('foo', 0),
        (('foo', 'txt'), 3),
    )

    for extensions, n_expected in cases:
        corp = Corpus.from_folder('examples/data/gutenberg',
                                  valid_extensions=extensions)
        assert len(corp.docs) == n_expected

Example #9

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_copy(texts):
    """A copy must be equal to the source but use distinct containers."""
    source = Corpus({str(idx): text for idx, text in enumerate(texts)})
    duplicate = source.copy()

    # equal content, but not the same underlying objects
    assert source.docs is not duplicate.docs
    assert source.docs == duplicate.docs

    assert source.doc_paths is not duplicate.doc_paths
    assert source.doc_paths == duplicate.doc_paths

    # derived properties must agree as well
    assert source.doc_labels == duplicate.doc_labels
    assert source.doc_lengths == duplicate.doc_lengths
    assert source.unique_characters == duplicate.unique_characters

Example #10

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_apply(texts):
    """``apply(str.upper)`` returns a Corpus and upper-cases every document.

    Labels and lengths are expected to be unchanged afterwards.
    """
    corp = Corpus({str(idx): text for idx, text in enumerate(texts)})
    snapshot = corp.copy()
    labels_before = corp.doc_labels
    lengths_before = corp.doc_lengths

    result = corp.apply(str.upper)
    assert isinstance(result, Corpus)

    assert corp.doc_labels == labels_before
    assert corp.doc_lengths == lengths_before

    # compare against the untouched snapshot taken before apply()
    for label, text in corp.items():
        assert snapshot[label].upper() == text

Example #11

0

Show file

def test_corpus_filter_by_min_length():
    """Filter the same corpus by successive minimum lengths.

    All calls operate on one corpus object; the final threshold of 1 is
    expected to find no documents because the preceding call left none.
    """
    corp = Corpus.from_folder('examples/data/gutenberg',
                              force_unix_linebreaks=False)

    # (minimum length, expected number of remaining documents), in order
    steps = ((1, 3), (142694, 1), (142695, 0), (1, 0))
    for min_length, n_expected in steps:
        assert len(corp.filter_by_min_length(min_length).docs) == n_expected

Example #12

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_unique_characters(texts):
    """``unique_characters`` is the set of all characters in all documents."""
    expected_chars = set(''.join(texts))

    corp = Corpus({str(idx): text for idx, text in enumerate(texts)})
    found_chars = corp.unique_characters

    assert isinstance(found_chars, set)
    assert found_chars == expected_chars

Example #13

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_filter_by_max_length():
    """Filter the same corpus by successive maximum lengths.

    All calls operate on one corpus object; the final, very large threshold
    is expected to find no documents because the preceding call left none.
    """
    corp = Corpus.from_folder('tests/data/gutenberg',
                              force_unix_linebreaks=False)

    # (maximum length, expected number of remaining documents), in order
    steps = ((999999, 3), (142694, 3), (142693, 2), (0, 0), (999999, 0))
    for max_length, n_expected in steps:
        assert len(corp.filter_by_max_length(max_length).docs) == n_expected

Example #14

0

Show file

File: test_corpus.py Project: petershan1119/tmtoolkit

def test_corpus_filter_by_max_length():
    """Filter the same corpus by successive maximum lengths.

    All calls operate on one corpus object; the final, very large threshold
    is expected to find no documents because the preceding call left none.
    """
    corp = Corpus.from_folder('examples/data/gutenberg')

    # (maximum length, expected number of remaining documents), in order
    steps = ((999999, 3), (142694, 3), (142693, 2), (0, 0), (999999, 0))
    for max_length, n_expected in steps:
        assert len(corp.filter_by_max_length(max_length).docs) == n_expected

Example #15

0

Show file

def test_corpus_from_files():
    """``from_files`` and ``add_files`` must load the same single document."""
    source_path = 'examples/data/gutenberg/kafka_verwandlung.txt'
    via_classmethod = Corpus.from_files([source_path])
    via_instance = Corpus().add_files([source_path])

    assert len(via_classmethod.docs) == len(via_classmethod.doc_paths) == 1
    assert len(via_instance.docs) == len(via_instance.doc_paths) == 1

    # both corpora must expose the same label for docs and for paths
    assert (via_classmethod.docs.keys()
            == via_instance.docs.keys()
            == via_classmethod.doc_paths.keys()
            == via_instance.doc_paths.keys())

    label = next(iter(via_classmethod.docs.keys()))
    assert label.endswith('kafka_verwandlung')

    text = via_classmethod.docs[label]
    assert len(text) > 0

    assert via_classmethod.doc_paths[label] == source_path

Example #16

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_split_by_paragraphs_rejoin():
    """Split into paragraphs with ``join_paragraphs=5`` and sanity-check it.

    Every original document must have at least one resulting document whose
    label starts with the original label.
    """
    # TODO: better tests here
    folder = 'tests/data/gutenberg'
    untouched = Corpus.from_folder(folder, doc_label_fmt='{basename}')
    to_split = Corpus.from_folder(folder, doc_label_fmt='{basename}')

    docs_before = untouched.docs
    joined_docs = to_split.split_by_paragraphs(join_paragraphs=5).docs

    assert len(joined_docs) >= len(docs_before)

    known_labels = ('goethe_werther1', 'goethe_werther2', 'kafka_verwandlung')
    joined_labels = sorted(joined_docs.keys())
    for label, _text in docs_before.items():
        assert label in known_labels
        matching = [joined_docs[par_label] for par_label in joined_labels
                    if par_label.startswith(label)]
        assert len(matching) > 0

Example #17

0

Show file

def test_corpus_from_files2():
    """Load both Werther files and check labels and non-empty content."""
    werther_paths = [
        'examples/data/gutenberg/werther/goethe_werther1.txt',
        'examples/data/gutenberg/werther/goethe_werther2.txt'
    ]
    corp = Corpus.from_files(werther_paths)
    assert len(corp.docs) == len(corp.doc_paths) == 2

    for label, text in corp.docs.items():
        # drop the trailing part number (1 or 2) before comparing
        assert label[:-1].endswith('goethe_werther')
        assert len(text) > 0

Example #18

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_sample():
    """Check ``sample`` as Corpus, as dict, and with ``inplace=True``.

    Only the in-place variant is expected to change the corpus size.
    """
    corp = Corpus.from_folder('tests/data/gutenberg')
    n_before = corp.n_docs

    # (extra keyword arguments, expected return type)
    variants = (
        ({}, Corpus),
        ({'as_corpus': False}, dict),
    )
    for extra_kwargs, expected_type in variants:
        drawn = corp.sample(2, **extra_kwargs)
        assert isinstance(drawn, expected_type)
        assert len(drawn) == 2
        assert corp.n_docs == n_before

    in_place_result = corp.sample(2, inplace=True)
    assert isinstance(in_place_result, Corpus)
    assert corp.n_docs == 2

Example #19

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_split_by_paragraphs():
    """Compare ``split_by_paragraphs`` with ``paragraphs_from_lines``.

    For each original document, the paragraph documents whose labels start
    with the original label must match the paragraphs computed directly from
    the original text (same count, same set).
    """
    corp = Corpus.from_folder('tests/data/gutenberg',
                              doc_label_fmt='{basename}')

    docs_before = corp.docs
    paths_before = corp.doc_paths
    corp.split_by_paragraphs()
    docs_after = corp.docs

    assert len(docs_after) >= len(docs_before)
    # the number of distinct source paths must not change
    n_paths_before = len(set(paths_before.values()))
    n_paths_after = len(set(corp.doc_paths.values()))
    assert n_paths_before == n_paths_after

    known_labels = ('goethe_werther1', 'goethe_werther2', 'kafka_verwandlung')
    par_labels = sorted(docs_after.keys())
    for label, text in docs_before.items():
        assert label in known_labels

        split_pars = [docs_after[par_label] for par_label in par_labels
                      if par_label.startswith(label)]
        assert len(split_pars) > 0

        expected_pars = paragraphs_from_lines(text)
        assert len(expected_pars) == len(split_pars)
        assert set(expected_pars) == set(split_pars)

Example #20

0

Show file

def test_corpus_dict_methods():
    """Exercise item access, assignment, iteration and deletion on a Corpus."""
    corp = Corpus()
    assert len(corp) == 0

    # reading a missing label
    with pytest.raises(KeyError):
        _ = corp['x']

    # non-string and empty labels are rejected on assignment
    for bad_label in (1, ''):
        with pytest.raises(KeyError):
            corp[bad_label] = 'abc'

    # a document text of None is rejected
    with pytest.raises(ValueError):
        corp['d1'] = None

    corp['d1'] = 'd1 text'
    assert len(corp) == 1
    assert 'd1' in corp
    assert set(corp.keys()) == {'d1'}
    assert corp['d1'] == 'd1 text'

    corp['d2'] = 'd2 text'
    both_labels = {'d1', 'd2'}
    assert len(corp) == 2
    for label in corp:
        assert label in both_labels
    assert set(corp.keys()) == both_labels

    for label, text in corp.items():
        assert label in both_labels
        assert corp[label] == text

    # deleting a missing label
    with pytest.raises(KeyError):
        del corp['d3']

    del corp['d1']
    assert len(corp) == 1
    assert set(corp.keys()) == {'d2'}

    del corp['d2']
    assert len(corp) == 0
    assert set(corp.keys()) == set()

Example #21

0

Show file

def test_corpus_from_folder_not_existent():
    """Loading from a missing folder must raise ``IOError``."""
    missing_folder = 'not_existent'
    with pytest.raises(IOError):
        Corpus.from_folder(missing_folder)

Example #22

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_filter_characters(texts):
    """Check ``filter_characters`` with full, disjoint, empty and small sets.

    Cases covered, in order: all characters of the corpus (no change
    expected), printable characters absent from the corpus (all documents
    emptied), the empty set (all documents emptied), and three randomly
    sampled characters passed once as a set and once as a string.
    """
    corp = Corpus({str(idx): text for idx, text in enumerate(texts)})
    pristine = corp.copy()

    labels_before = corp.doc_labels
    lengths_before = corp.doc_lengths
    chars_before = corp.unique_characters

    def assert_emptied(filtered):
        # labels survive, but every document has length zero
        assert filtered.doc_labels == labels_before
        assert filtered.doc_lengths == {dl: 0 for dl in filtered.doc_labels}
        assert filtered.unique_characters == set()

    # allowing every character that occurs must change nothing
    unchanged = corp.filter_characters(chars_before)
    assert isinstance(unchanged, Corpus)
    assert corp.doc_labels == labels_before
    assert corp.doc_lengths == lengths_before
    assert corp.unique_characters == chars_before

    absent_chars = set(string.printable) - chars_before
    if absent_chars:
        corp.filter_characters(absent_chars)
        assert_emptied(corp)

    corp = pristine.copy()
    corp.filter_characters(set())
    assert_emptied(corp)

    if len(chars_before) > 3:
        # first pass the allowed characters as a set, then as a char sequence
        for as_string in (False, True):
            corp = pristine.copy()
            chosen = set(sample(list(chars_before), 3))
            corp.filter_characters(''.join(chosen) if as_string else chosen)
            assert corp.doc_labels == labels_before
            assert corp.doc_lengths != lengths_before
            assert corp.unique_characters == chosen

Example #23

0

Show file

def test_corpus_from_files_not_existent():
    """A path list containing a missing file must raise ``IOError``."""
    paths = [
        'examples/data/gutenberg/werther/goethe_werther1.txt',
        'not_existent'
    ]
    with pytest.raises(IOError):
        Corpus.from_files(paths)

Example #24

0

Show file

def test_corpus_from_files_nonlist_arg():
    """Passing a plain string instead of a list must raise ``ValueError``."""
    not_a_list = 'wrong'
    with pytest.raises(ValueError):
        Corpus.from_files(not_a_list)

Example #25

0

Show file

def test_corpus_get_doc_labels():
    """``get_doc_labels`` must return exactly the keys of ``docs``."""
    corp = Corpus.from_folder('examples/data/gutenberg')
    doc_keys = set(corp.docs.keys())
    reported_labels = set(corp.get_doc_labels())
    assert doc_keys == reported_labels

Example #26

0

Show file

def test_corpus_add_doc():
    """``add_doc`` rejects bad labels, ``None`` texts and duplicate labels.

    An empty string is accepted as document text.
    """
    corp = Corpus()

    # empty label, non-string label, None as text
    invalid_calls = (('', 'x'), (123, 'x'), ('d1', None))
    for label, text in invalid_calls:
        with pytest.raises(ValueError):
            corp.add_doc(label, text)

    corp.add_doc('d1', 'd1 text')
    # adding the same label a second time is an error
    with pytest.raises(ValueError):
        corp.add_doc('d1', 'd1 text')

    corp.add_doc('d2', '')

    assert set(corp.keys()) == {'d1', 'd2'}

Example #27

0

Show file

def test_corpus_sample():
    """Sampling two documents must yield a result with two docs."""
    corp = Corpus.from_folder('examples/data/gutenberg')
    drawn = corp.sample(2)
    assert len(drawn.docs) == 2

Example #28

0

Show file

def test_empty_corpora():
    """Three ways of building an empty corpus must all give empty docs."""
    default_built = Corpus()
    from_no_files = Corpus.from_files([])
    extended_with_nothing = Corpus.from_files([]).add_files([])

    assert (default_built.docs
            == from_no_files.docs
            == extended_with_nothing.docs
            == {})

Example #29

0

Show file

File: test_corpus.py Project: petershan1119/tmtoolkit

def test_corpus_filter_by_min_length():
    """Filter the same corpus by successive minimum lengths.

    All calls operate on one corpus object; the final threshold of 1 is
    expected to find no documents because the preceding call left none.
    """
    corp = Corpus.from_folder('examples/data/gutenberg')

    # (minimum length, expected number of remaining documents), in order
    steps = ((1, 3), (142694, 1), (142695, 0), (1, 0))
    for min_length, n_expected in steps:
        assert len(corp.filter_by_min_length(min_length).docs) == n_expected

Example #30

0

Show file

File: test_corpus.py Project: ihavemanyquestions/tmtoolkit

def test_corpus_from_zip():
    """Load a ZIP archive and count the documents by label.

    Expects 100 labels starting with ``100NewsArticles-`` and exactly one
    label equal to ``german-goethe_werther1``.
    """
    corp = Corpus.from_zip('tests/data/zipdata.zip',
                           id_column='article_id',
                           text_column='text')

    n_articles = sum(label.startswith('100NewsArticles-')
                     for label in corp.doc_labels)
    assert n_articles == 100

    n_werther1 = sum(label == 'german-goethe_werther1'
                     for label in corp.doc_labels)
    assert n_werther1 == 1