def create_reader(self):
    filename_fields = dict(year=r".{5}(\d{4})_.*",
                           serial_no=r".{9}_(\d+).*")
    reader = create_tokens_reader(filename_fields=filename_fields,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    return reader
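For reference, the two regular expressions above capture the year and the serial number from filenames such as 'dikt_2019_01_test.txt' (the naming scheme used by the test corpus in the examples below). A minimal standalone check with the standard re module, shown here as an illustration only:

import re

# Illustrative only: apply the two capture patterns to one of the test
# corpus filenames used in the later examples.
filename = 'dikt_2019_01_test.txt'
year = re.match(r".{5}(\d{4})_.*", filename).group(1)      # -> '2019'
serial_no = re.match(r".{9}_(\d+).*", filename).group(1)   # -> '01'
assert (year, serial_no) == ('2019', '01')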
Example #2
def test_get_index_when_extractor_passed_returns_metadata2():
    filename_fields = "year:_:1#serial_no:_:2"
    reader: TextTokenizer = create_tokens_reader(
        filename_fields=filename_fields,
        fix_whitespaces=True,
        fix_hyphenation=True)
    result = reader.metadata
    expected = [
        dict(filename='dikt_2019_01_test.txt', serial_no=1, year=2019),
        dict(filename='dikt_2019_02_test.txt', serial_no=2, year=2019),
        dict(filename='dikt_2019_03_test.txt', serial_no=3, year=2019),
        dict(filename='dikt_2020_01_test.txt', serial_no=1, year=2020),
        dict(filename='dikt_2020_02_test.txt', serial_no=2, year=2020),
    ]

    assert len(expected) == len(result)
    for i in range(0, len(expected)):
        assert expected[i] == result[i]

    reader.apply_filter(['dikt_2019_01_test.txt', 'dikt_2019_02_test.txt'])

    result = reader.metadata
    expected = [
        dict(filename='dikt_2019_01_test.txt', serial_no=1, year=2019),
        dict(filename='dikt_2019_02_test.txt', serial_no=2, year=2019),
    ]

    assert len(expected) == len(result)
    for i in range(0, len(expected)):
        assert expected[i] == result[i]
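The compact string specification "year:_:1#serial_no:_:2" presumably encodes one field per '#'-separated part as name:separator:index, i.e. split the filename on the separator and take the part at the given index. A hypothetical re-implementation, matching the expected metadata above (note that the real reader evidently also casts the captured values to integers):

# Hypothetical sketch of the "name:sep:index" specification; only the
# resulting values are confirmed by the expected metadata above.
def extract_fields(filename, spec):
    fields = {}
    for part in spec.split('#'):
        name, sep, index = part.split(':')
        fields[name] = filename.split(sep)[int(index)]
    return fields

assert extract_fields('dikt_2019_01_test.txt', "year:_:1#serial_no:_:2") == {
    'year': '2019', 'serial_no': '01'
}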
Example #3
def test_get_index_when_no_extractor_passed_returns_not_none():
    reader = create_tokens_reader(filename_fields=None,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(reader)
    result = corpus.metadata
    assert result is not None
Example #4
def test_metadata_has_filename():
    tokens_reader = create_tokens_reader()
    assert tokens_reader is not None
    assert tokens_reader.filenames is not None
    assert len(tokens_reader.filenames) > 0
    assert len(tokens_reader.metadata) > 0
    assert len(tokens_reader.metadata[0].keys()) > 0
    assert 'filename' in tokens_reader.metadata[0]
Example #5
def test_next_document_when_new_corpus_returns_document():
    reader = create_tokens_reader(fix_whitespaces=True, fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(reader)
    result = next(corpus)
    expected = ("Tre svarta ekar ur snön . " +
                "Så grova , men fingerfärdiga . " +
                "Ur deras väldiga flaskor " + "ska grönskan skumma i vår .")
    assert expected == ' '.join(result[1])
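Note that the corpus iterator yields (document name, token list) pairs, which is why the tokens live in result[1] here; Example #7 below unpacks the same pair directly.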
Example #6
def test_reader_can_be_reiterated():

    reader: TextTokenizer = create_tokens_reader(filename_fields="year:_:1",
                                                 fix_whitespaces=True,
                                                 fix_hyphenation=True)
    for _ in range(0, 4):
        n_tokens = [len(x) for _, x in reader]
        expected = [22, 16, 26, 45, 21]
        assert expected == n_tokens
Example #7
def test_next_document_when_only_any_alphanumeric_true_skips_delimiters_using_defaults(
):
    reader = create_tokens_reader(filename_fields=None,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(
        reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True))
    _, tokens = next(corpus)
    expected = "Tre svarta ekar ur snön Så grova men fingerfärdiga Ur deras väldiga flaskor ska grönskan skumma i vår"
    assert expected.split() == tokens
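A minimal sketch (an assumption, not necessarily the library's implementation) of what only_any_alphanumeric=True does: keep a token only if at least one of its characters is alphanumeric, which is exactly what removes the '.' and ',' tokens from Example #5's output:

# Assumed behaviour of only_any_alphanumeric=True, illustrated on the first
# lines of the poem from Example #5.
raw = "Tre svarta ekar ur snön . Så grova , men fingerfärdiga .".split()
kept = [t for t in raw if any(c.isalnum() for c in t)]
assert kept == "Tre svarta ekar ur snön Så grova men fingerfärdiga".split()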
Example #8
def test_can_get_file_when_compress_whitespace_is_true_strips_whitespaces():
    filename = 'dikt_2019_01_test.txt'
    reader = create_tokens_reader(fix_whitespaces=True,
                                  fix_hyphenation=True,
                                  filename_filter=[filename])
    result = next(reader)
    expected = ("Tre svarta ekar ur snön . " +
                "Så grova , men fingerfärdiga . " +
                "Ur deras väldiga flaskor " + "ska grönskan skumma i vår .")
    assert filename == result[0]
    assert expected == ' '.join(result[1])
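A plausible reading of fix_whitespaces=True (an assumption based on the test name): collapse runs of whitespace, including newlines, into single spaces, e.g.:

# Assumed whitespace normalization; the raw text here is hypothetical.
text = "Tre  svarta\nekar   ur snön ."
assert ' '.join(text.split()) == "Tre svarta ekar ur snön ."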
Example #9
def test_get_file_when_fix_hyphenation_is_true_removes_hyphens():
    filename = 'dikt_2019_03_test.txt'
    reader = create_tokens_reader(fix_whitespaces=True,
                                  fix_hyphenation=True,
                                  filename_filter=[filename])
    result = next(reader)
    expected = ("Nordlig storm . Det är den i den tid när rönnbärsklasar "
                "mognar . Vaken i mörkret hör man stjärnbilderna stampa "
                "i sina spiltor högt över trädet")
    assert filename == result[0]
    assert expected == ' '.join(result[1])
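fix_hyphenation=True presumably rejoins words hyphenated across line breaks; a sketch of that kind of rule (the broken input below is hypothetical, but the rejoined word 'stjärnbilderna' appears in the expected output above):

import re

# Assumed dehyphenation rule: merge a word split over a line break.
text = "stjärn-\nbilderna stampa i sina spiltor"
fixed = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
assert fixed == "stjärnbilderna stampa i sina spiltor"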
Example #10
def test_n_tokens_when_exhausted_and_only_any_alphanumeric_is_true_returns_expected_count(
):
    reader = create_tokens_reader(filename_fields=None,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(
        reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True))
    _ = [x for x in corpus]
    n_tokens = list(corpus.document_index.n_tokens)
    expected = [18, 14, 24, 42, 18]
    assert expected == n_tokens
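The expected counts line up with the earlier examples: Example #6 reports 22 raw tokens for the first document, and dropping its four punctuation tokens leaves the 18 counted here:

# Cross-check against Example #5's expected text: 22 raw tokens, of which
# four are pure punctuation, leaving 18 alphanumeric tokens.
doc = ("Tre svarta ekar ur snön . Så grova , men fingerfärdiga . "
       "Ur deras väldiga flaskor ska grönskan skumma i vår .").split()
assert len(doc) == 22
assert len([t for t in doc if any(c.isalnum() for c in t)]) == 18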
Example #11
def test_get_file_when_default_returns_unmodified_content():
    filename = 'dikt_2019_01_test.txt'
    reader = create_tokens_reader(fix_whitespaces=False,
                                  fix_hyphenation=True,
                                  filename_filter=[filename])
    result = next(reader)
    expected = ("Tre svarta ekar ur snön . " +
                "Så grova , men fingerfärdiga . " +
                "Ur deras väldiga flaskor " + "ska grönskan skumma i vår .")
    assert filename == result[0]
    assert expected == ' '.join(result[1])
Example #12
def test_corpus_can_be_reiterated():

    reader = create_tokens_reader(filename_fields=None,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)

    corpus = corpora.TokenizedCorpus(
        reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True))
    for _ in range(0, 4):
        n_tokens = [len(x) for x in corpus.terms]
        expected = [18, 14, 24, 42, 18]
        assert expected == n_tokens
Example #13
def test_next_document_when_token_corpus_returns_tokenized_document():
    reader = create_tokens_reader(filename_fields=None,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(
        reader,
        transform_opts=TokensTransformOpts(only_any_alphanumeric=False))
    _, tokens = next(corpus)
    expected = (
        "Tre svarta ekar ur snön . Så grova , men fingerfärdiga . Ur deras väldiga flaskor ska grönskan skumma i vår ."
    )
    assert expected.split() == tokens
Example #14
def test_get_file_when_file_exists_and_extractor_specified_returns_content_and_metadata(
):
    filename = 'dikt_2019_03_test.txt'
    filename_fields = dict(year=r".{5}(\d{4})_.*", serial_no=r".{9}_(\d+).*")
    reader = create_tokens_reader(filename_fields=filename_fields,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True,
                                  filename_filter=[filename])
    result = next(reader)
    expected = ("Nordlig storm . Det är den i den tid när rönnbärsklasar "
                "mognar . Vaken i mörkret hör man stjärnbilderna stampa "
                "i sina spiltor högt över trädet")
    assert filename == result[0]
    assert expected == ' '.join(result[1])
    assert reader.metadata[0]['year'] > 0
Example #15
def text_corpus() -> TokenizedCorpus:
    filename_fields = dict(year=r".{5}(\d{4})_.*", serial_no=r".{9}_(\d+).*")
    reader = create_tokens_reader(filename_fields=filename_fields,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=True,
        to_lower=True,
        remove_accents=False,
        min_len=2,
        max_len=None,
        keep_numerals=False,
    )
    corpus = TokenizedCorpus(reader, transform_opts=transform_opts)
    return corpus
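A rough, hypothetical equivalent of these TokensTransformOpts as a plain list comprehension: lowercase every token, keep only tokens with at least one alphanumeric character, drop tokens shorter than two characters, and drop purely numeric tokens (the token '1931' below is made up for illustration):

# Assumed combined effect of the options above, not the library's code.
tokens = "Tre svarta ekar ur snön . 1931".split()
transformed = [
    t.lower() for t in tokens
    if any(c.isalnum() for c in t) and len(t) >= 2 and not t.isnumeric()
]
assert transformed == ['tre', 'svarta', 'ekar', 'ur', 'snön']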
Example #16
def test_get_index_when_extractor_passed_returns_metadata():
    filename_fields = dict(year=r".{5}(\d{4})_.*", serial_no=r".{9}_(\d+).*")
    reader = create_tokens_reader(filename_fields=filename_fields,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    result = reader.metadata
    expected = [
        dict(filename='dikt_2019_01_test.txt', serial_no=1, year=2019),
        dict(filename='dikt_2019_02_test.txt', serial_no=2, year=2019),
        dict(filename='dikt_2019_03_test.txt', serial_no=3, year=2019),
        dict(filename='dikt_2020_01_test.txt', serial_no=1, year=2020),
        dict(filename='dikt_2020_02_test.txt', serial_no=2, year=2020),
    ]

    assert len(expected) == len(result)
    for i in range(0, len(expected)):
        assert expected[i] == result[i]
Example #17
def test_archive_filenames_when_filter_txt_returns_txt_files():
    reader = create_tokens_reader(filename_pattern='*.txt')
    assert 5 == len(reader.filenames)
Example #18
def test_archive_filenames_when_filter_md_returns_md_files():
    reader = create_tokens_reader(filename_pattern='*.md')
    assert 1 == len(reader.filenames)
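The glob-style filename_pattern presumably behaves like fnmatch over the files in the test folder; with the five .txt poems from the metadata in Example #2 plus one assumed .md file, the counts match:

import fnmatch

# The .md filename is hypothetical; the .txt names come from the expected
# metadata in Example #2.
names = ['dikt_2019_01_test.txt', 'dikt_2019_02_test.txt',
         'dikt_2019_03_test.txt', 'dikt_2020_01_test.txt',
         'dikt_2020_02_test.txt', 'notes.md']
assert len(fnmatch.filter(names, '*.txt')) == 5
assert len(fnmatch.filter(names, '*.md')) == 1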
Example #19
def test_archive_filenames_when_filter_function_txt_returns_txt_files():
    def filename_filter(x):
        return x.endswith('txt')

    reader = create_tokens_reader(filename_filter=filename_filter)
    assert 5 == len(reader.filenames)
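Taken together with Example #8, filename_filter apparently accepts either an explicit list of names or a predicate; both forms below should select the same single file (a sketch, assuming the two forms are interchangeable):

# Assumed equivalence of the list and predicate filter forms.
by_list = create_tokens_reader(filename_filter=['dikt_2019_01_test.txt'])
by_predicate = create_tokens_reader(
    filename_filter=lambda x: x == 'dikt_2019_01_test.txt')
assert by_list.filenames == by_predicate.filenames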