Example 1
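Verifies that tagged_frame_to_tokens merges the multi-word phrase 'United Nations' into the single token 'United_Nations' when a phrase mapping is supplied.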
def test_phrased_tagged_frame():

    os.makedirs('./tests/output', exist_ok=True)

    tagged_corpus_source: str = "./tests/test_data/tranströmer_corpus_export.sparv4.csv.zip"
    checkpoint_opts: checkpoint.CheckpointOpts = None
    data = checkpoint.load_archive(source_name=tagged_corpus_source,
                                   checkpoint_opts=checkpoint_opts,
                                   reader_opts=None)
    payload = next(data.create_stream())

    tokens = tagged_frame_to_tokens(
        payload.content,
        ExtractTaggedTokensOpts(lemmatize=False, **SPARV_TAGGED_COLUMNS),
    )
    assert tokens is not None
    phrases = {'United Nations': 'United_Nations', 'United': 'United'}
    phrased_tokens = tagged_frame_to_tokens(
        payload.content,
        ExtractTaggedTokensOpts(lemmatize=False,
                                phrases=phrases,
                                **SPARV_TAGGED_COLUMNS),
    )
    assert phrased_tokens[:9] == [
        'Constitution',
        'of',
        'the',
        'United_Nations',
        'Educational',
        ',',
        'Scientific',
        'and',
        'Cultural',
    ]
Example 2
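Extracts nouns from a zipped Sparv v3 XML export, lower-cases and length-filters the tokens, stores the result as a zip archive, and checks the opening text of a stored document.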
def test_sparv_extract_and_store_when_only_nouns_and_source_is_sparv3_succeeds(
):

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    target_filename = os.path.join(OUTPUT_FOLDER, f'{uuid.uuid1()}.zip')

    sparv_corpus.sparv_xml_extract_and_store(
        SPARV3_ZIPPED_XML_EXPORT_FILENAME,
        target_filename,
        version=3,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='|NN|',
                                             pos_paddings=None,
                                             lemmatize=False),
        transform_opts=TokensTransformOpts(to_lower=True,
                                           min_len=2,
                                           stopwords=['<text>']),
    )

    expected_document_start = "utredningar justitiedepartementet förslag utlänningslag angående om- händertagande förläggning års gere ide to lm \rstatens utredningar förteckning betänkande förslag utlänningslag lag omhändertagande utlänning anstalt förläggning tryckort tryckorten bokstäverna fetstil begynnelse- bokstäverna departement"

    test_filename = "sou_1945_1.txt"

    content = zip_utils.read_file_content(zip_or_filename=target_filename,
                                          filename=test_filename,
                                          as_binary=False)

    assert content.startswith(expected_document_start)

    os.remove(target_filename)
Example 3
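Verifies that SparvXmlReader returns the source tokens unchanged when no POS filter or lemmatization is applied.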
def test_reader_when_no_transforms_returns_source_tokens():

    expected = [
        'Rödräven',
        'är',
        'ett',
        'hunddjur',
        'som',
        'har',
        'en',
        'mycket',
        'vidsträckt',
        'utbredning',
        'över',
        'norra',
        'halvklotet',
        '.',
    ]
    expected_name = "sparv_xml_export_small.txt"

    reader = readers.SparvXmlReader(
        SPARV_XML_EXPORT_FILENAME_SMALL,
        chunk_size=None,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='',
                                             pos_paddings=None,
                                             lemmatize=False,
                                             pos_excludes=None),
    )

    filename, tokens = next(iter(reader))

    assert expected == list(tokens)
    assert expected_name == filename
Example 4
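Verifies that punctuation is filtered out when the Sparv punctuation tags (MAD, MID, PAD) are excluded and tokens are lemmatized.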
def test_reader_when_ignore_puncts_returns_filter_outs_puncts():

    expected = [
        'rödräv',
        'vara',
        'en',
        'hunddjur',
        'som',
        'ha',
        'en',
        'mycken',
        'vidsträckt',
        'utbredning',
        'över',
        'norra',
        'halvklot',
    ]
    expected_name = "sparv_xml_export_small.txt"

    reader = readers.SparvXmlReader(
        SPARV_XML_EXPORT_FILENAME_SMALL,
        chunk_size=None,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='',
                                             pos_paddings=None,
                                             lemmatize=True,
                                             pos_excludes="|MAD|MID|PAD|"),
    )

    filename, tokens = next(iter(reader))

    assert expected == tokens
    assert expected_name == filename
Example 5
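Verifies that punctuation is retained when the is_punct filter is disabled (set to None).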
def test_extract_tokens_when_punct_filter_is_disabled_succeeds(
        df_doc: pd.DataFrame):
    pytest.importorskip("spacy")
    df_doc = df_doc.copy()

    extract_opts = ExtractTaggedTokensOpts(lemmatize=True,
                                           **SPACY_TAGGED_COLUMNS,
                                           filter_opts=dict(is_punct=None))
    tokens = tagged_frame_to_tokens(doc=df_doc, extract_opts=extract_opts)
    assert tokens == [
        'mars',
        'be',
        'once',
        'home',
        'to',
        'sea',
        'and',
        'ocean',
        ',',
        'and',
        'perhaps',
        'even',
        'life',
        '.',
    ]
Example 6
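Extracts lemmatized, lower-cased nouns from a zipped Sparv v4 XML export, stores them to a zip archive, and verifies the content of each stored document.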
def test_reader_store_result():

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    expected_documents = [
        ['rödräv', 'hunddjur', 'utbredning', 'halvklot'],
        [
            'fjällräv', 'fjällvärld', 'liv', 'fjällräv', 'vinter', 'men',
            'variant', 'år'
        ],
    ]
    expected_names = ["document_001.txt", "document_002.txt"]

    target_filename = os.path.join(OUTPUT_FOLDER,
                                   'test_reader_store_result.zip')

    sparv_corpus.sparv_xml_extract_and_store(
        SPARV_ZIPPED_XML_EXPORT_FILENAME,
        target_filename,
        version=4,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='|NN|',
                                             pos_paddings=None,
                                             lemmatize=True),
        transform_opts=TokensTransformOpts(to_lower=True),
    )

    for i, expected_name in enumerate(expected_names):

        content = zip_utils.read_file_content(zip_or_filename=target_filename,
                                              filename=expected_name,
                                              as_binary=False)

        assert ' '.join(expected_documents[i]) == content

    os.remove(target_filename)
Example 7
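Verifies that stop words and punctuation are removed when is_stop=False and is_punct=False are given as filter options.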
def test_extract_tokens_lemma_no_stops_succeeds(df_doc: pd.DataFrame):
    pytest.importorskip("spacy")
    df_doc = df_doc.copy()
    extract_opts = ExtractTaggedTokensOpts(lemmatize=True,
                                           **SPACY_TAGGED_COLUMNS,
                                           filter_opts=dict(is_stop=False,
                                                            is_punct=False))

    tokens = tagged_frame_to_tokens(doc=df_doc, extract_opts=extract_opts)
    assert tokens == ['mars', 'home', 'sea', 'ocean', 'life']
Example 8
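Factory helper that creates a lemmatizing SparvTokenizedCsvCorpus over a zipped Sparv CSV export, extracting the year from each filename. A minimal usage sketch follows the function.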
def create_test_corpus() -> SparvTokenizedCsvCorpus:

    corpus = SparvTokenizedCsvCorpus(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        reader_opts=TextReaderOpts(filename_fields="year:_:1"),
        extract_opts=ExtractTaggedTokensOpts(lemmatize=True,
                                             **SPARV_TAGGED_COLUMNS),
    )

    return corpus
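A minimal usage sketch; that the corpus iterates as (filename, tokens) pairs is an assumption, mirrored from the reader examples elsewhere in this section:

corpus = create_test_corpus()
for filename, tokens in corpus:
    # Each document is assumed to arrive as a (filename, token-list) pair.
    print(filename, len(tokens))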
Example 9
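Factory helper that assembles a complete ComputeOpts configuration for a Sparv CSV corpus, covering transform, reader, extract, context, and vectorize options.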
def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source: str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1', ),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag', ),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(already_tokenized=True,
                                     min_tf=1,
                                     max_tokens=None),
    )
Example 10
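Verifies that surface forms (the text column) are returned when lemmatize=False, with punctuation filtered out.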
def test_extract_tokens_target_text_succeeds(df_doc: pd.DataFrame):
    pytest.importorskip("spacy")
    df_doc = df_doc.copy()
    extract_opts = ExtractTaggedTokensOpts(lemmatize=False,
                                           **SPACY_TAGGED_COLUMNS,
                                           filter_opts=dict(is_punct=False))

    tokens = tagged_frame_to_tokens(doc=df_doc, extract_opts=extract_opts)
    assert tokens == [
        "Mars", "was", "once", "home", "to", "seas", "and", "oceans", "and",
        "perhaps", "even", "life"
    ]
Example 11
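Helper that extracts a training corpus from a Sparv XML archive, stores it as a zip archive, and writes the options used to a JSON file.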
def prepare_train_corpus(
    input_filename,
    pos_includes,
    pos_excludes,
    chunk_size,
    lemmatize,
    lower,
    remove_stopwords,
    min_word_length,
    keep_symbols,
    keep_numerals,
    version,
):
    """Prepares the a training corpus from Sparv XML archive"""
    transform_opts: TokensTransformOpts = TokensTransformOpts(
        to_lower=lower,
        remove_stopwords=remove_stopwords is not None,
        language=remove_stopwords,
        min_len=min_word_length,
        max_len=None,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
    )
    extract_opts = ExtractTaggedTokensOpts(
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        lemmatize=lemmatize,
    )
    output_filename = replace_extension(
        timestamp_filename(suffix_filename(input_filename, "text")), 'zip')

    reader_opts = {
        'chunk_size': chunk_size,
    }

    sparv_corpus.sparv_xml_extract_and_store(
        source=input_filename,
        target=output_filename,
        version=version,
        extract_opts=extract_opts,
        reader_opts=reader_opts,
        transform_opts=transform_opts,
    )

    store_options_to_json_file(
        input_filename,
        output_filename,
        transform_opts,
        dict(version=version,
             extract_tokens_opts=extract_opts,
             reader_opts=reader_opts),
    )
Example 12
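Verifies that only verbs and nouns are extracted when pos_includes='|VERB|NOUN|'.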
def test_extract_tokens_pos_verb_noun_text_succeeds(df_doc: pd.DataFrame):
    pytest.importorskip("spacy")
    df_doc = df_doc.copy()
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False,
        pos_includes='|VERB|NOUN|',
        pos_paddings=None,
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )

    tokens = tagged_frame_to_tokens(doc=df_doc, extract_opts=extract_opts)
    assert tokens == ['seas', 'oceans', 'life']
Example 13
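Verifies that lemmatized NN/VB tokens are returned, with padded POS classes (JJ) rendered as the '*' marker.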
def test_reader_when_lemmatized_nn_vb_returns_lemmatized_nn_vb():

    reader = SparvCsvToText(
        extract_tokens_opts=ExtractTaggedTokensOpts(
            pos_includes='NN|VB', pos_paddings='JJ', pos_excludes='', lemmatize=True, **SPARV_TAGGED_COLUMNS
        )
    )

    expected = "rödräv vara hunddjur ha * utbredning * halvklot"

    result = reader.transform(TEST_DATA)

    assert expected == result
Example 14
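Verifies that only NN tokens are returned in surface form, with padded POS classes (VB) rendered as '*'.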
def test_reader_when_only_nn_returns_only_nn():

    reader = SparvCsvToText(
        extract_tokens_opts=ExtractTaggedTokensOpts(
            pos_includes='NN', pos_paddings='VB', pos_excludes=None, lemmatize=False, **SPARV_TAGGED_COLUMNS
        )
    )

    expected = "Rödräven * hunddjur * utbredning halvklotet"

    result = reader.transform(TEST_DATA)

    assert expected == result
Example 15
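Verifies that SparvCsvToText reproduces the source text when no filtering or lemmatization is applied.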
def test_reader_when_no_transforms_returns_source_tokens():

    reader = SparvCsvToText(
        extract_tokens_opts=ExtractTaggedTokensOpts(
            pos_includes=None, pos_paddings=None, pos_excludes=None, lemmatize=False, **SPARV_TAGGED_COLUMNS
        )
    )

    expected = "Rödräven är ett hunddjur som har en mycket vidsträckt utbredning över norra halvklotet ."

    result = reader.transform(TEST_DATA)

    assert expected == result
Example 16
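Verifies that only proper nouns are extracted when pos_includes='|PROPN|'.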
def test_extract_tokens_pos_propn_succeeds(df_doc: pd.DataFrame):
    pytest.importorskip("spacy")
    df_doc = df_doc.copy()
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|PROPN|',
        pos_paddings=None,
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )

    tokens = tagged_frame_to_tokens(doc=df_doc, extract_opts=extract_opts)
    assert tokens == ['mars']
Example 17
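Smoke test: Sparv3XmlReader yields a non-empty token stream for every document in a zipped Sparv v3 export.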
def test_reader_when_source_is_sparv3_succeeds():

    sparv_zipped_xml_export_v3_filename = './tests/test_data/sou_test_sparv3_xml.zip'

    reader = readers.Sparv3XmlReader(
        sparv_zipped_xml_export_v3_filename,
        chunk_size=None,
        extract_tokens_opts=ExtractTaggedTokensOpts(pos_includes='|NN|',
                                                    pos_paddings=None,
                                                    lemmatize=True),
    )

    for _, tokens in reader:

        assert len(list(tokens)) > 0
Example 18
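Verifies that target_override='lemma_' selects the lemma column even though lemmatize=False.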
def test_extract_tokens_when_lemma_lacks_underscore_succeeds(
        df_doc: pd.DataFrame):
    pytest.importorskip("spacy")
    df_doc = df_doc.copy()
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False,
        target_override="lemma_",
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    tokens = tagged_frame_to_tokens(doc=df_doc, extract_opts=extract_opts)
    assert tokens == [
        'Mars', 'be', 'once', 'home', 'to', 'sea', 'and', 'ocean', 'and',
        'perhaps', 'even', 'life'
    ]
Example 19
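Verifies that the TaggedFrameToTokens pipeline task converts a tagged-frame payload into a payload of content type TOKENS.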
def test_tagged_frame_to_tokens_succeeds():
    pipeline = Mock(spec=CorpusPipeline,
                    payload=Mock(spec=PipelinePayload,
                                 tagged_columns_names={}))
    task = tasks.TaggedFrameToTokens(
        pipeline=pipeline,
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            **SPACY_TAGGED_COLUMNS,
            filter_opts=dict(is_punct=False),
        ),
    ).setup()
    current_payload = next(fake_data_frame_stream(1))
    next_payload = task.process(current_payload)
    assert next_payload.content_type == ContentType.TOKENS
Example 20
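Verifies that lemmatization returns each token in its base form.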
def test_extract_when_lemmatized_returns_baseform():

    expected = 'rödräv vara en hunddjur som ha en mycken vidsträckt utbredning över norra halvklot . '
    content = sparv_xml_test_file()
    parser = sparv.SparvXml2Text(
        delimiter=" ",
        extract_tokens_opts=ExtractTaggedTokensOpts(pos_includes='',
                                                    pos_paddings=None,
                                                    lemmatize=True,
                                                    append_pos=False,
                                                    pos_excludes=''),
    )

    result = parser.transform(content)

    assert result == expected
Example 21
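Verifies that excluding the MAD, MID and PAD tags filters punctuation out of the output text.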
def test_extract_when_ignore_punctuation_filters_out_punctuations():

    expected = "Rödräven är ett hunddjur som har en mycket vidsträckt utbredning över norra halvklotet "
    content = sparv_xml_test_file()
    parser = sparv.SparvXml2Text(
        delimiter=" ",
        extract_tokens_opts=ExtractTaggedTokensOpts(
            pos_includes='',
            lemmatize=False,
            append_pos=False,
            pos_excludes="|MAD|MID|PAD|"),
    )

    result = parser.transform(content)

    assert result == expected
Example 22
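Verifies that the parser reproduces the original text when neither filtering nor lemmatization is applied.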
def test_extract_when_no_filter_or_lemmatize_returns_original_text():

    expected = "Rödräven är ett hunddjur som har en mycket vidsträckt utbredning över norra halvklotet . "
    content = sparv_xml_test_file()
    parser = sparv.SparvXml2Text(
        delimiter=" ",
        extract_tokens_opts=ExtractTaggedTokensOpts(pos_includes='',
                                                    pos_paddings=None,
                                                    lemmatize=False,
                                                    append_pos=False,
                                                    pos_excludes=''),
    )

    result = parser.transform(content)

    assert result == expected
Example 23
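Verifies that combining a noun-only POS filter with lemmatization returns the nouns in base form.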
def test_extract_when_lemmatized_and_filter_nouns_returns_nouns_in_baseform():

    expected = 'rödräv hunddjur utbredning halvklot '
    content = sparv_xml_test_file()
    parser = sparv.SparvXml2Text(
        delimiter=" ",
        extract_tokens_opts=ExtractTaggedTokensOpts(
            pos_includes="|NN|",
            pos_paddings=None,
            lemmatize=True,
            append_pos=False,
            pos_excludes="|MAD|MID|PAD|"),
    )

    result = parser.transform(content)

    assert result == expected
Example 24
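Verifies that append_pos=True appends each token's POS tag in the form token@POS.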
def test_reader_when_lemmatized_nn_vb_pos_appended_returns_lemmatized_nn_vb_pos():

    reader = SparvCsvToText(
        extract_tokens_opts=ExtractTaggedTokensOpts(
            pos_includes='NN|VB',
            pos_paddings='JJ',
            pos_excludes='',
            lemmatize=True,
            append_pos=True,
            **SPARV_TAGGED_COLUMNS,
        )
    )

    expected = "rödräv@NN vara@VB hunddjur@NN ha@VB * utbredning@NN * halvklot@NN"

    result = reader.transform(TEST_DATA)

    assert expected == result
Example 25
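Verifies that SparvCsvTokenizer returns lemmatized NN/VB tokens and reports a bare filename without any directory prefix.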
def test_reader_when_lemmatized_nn_vb_returns_lemmatized_nn_vb():

    tokens_reader = readers.SparvCsvTokenizer(
        source=SPARV_CSV_EXPORT_FILENAME_SMALL,
        reader_opts=TextReaderOpts(),
        extract_opts=ExtractTaggedTokensOpts(pos_includes='NN|VB',
                                             pos_paddings=None,
                                             pos_excludes=None,
                                             lemmatize=True,
                                             **SPARV_TAGGED_COLUMNS),
    )

    expected = "rödräv vara hunddjur ha utbredning halvklot".split()

    filename, tokens = next(tokens_reader)

    assert filename == os.path.split(filename)[1]
    assert expected == tokens
Example 26
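Verifies that chunk_size=2 splits a document into two-token chunks with numbered filenames.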
def test_reader_when_chunk_size_specified_returns_chunked_text():

    expected_documents = [['rödräv', 'hunddjur'], ['utbredning', 'halvklot']]
    expected_names = [
        "sparv_xml_export_small_001.txt", "sparv_xml_export_small_002.txt"
    ]

    reader = readers.SparvXmlReader(
        SPARV_XML_EXPORT_FILENAME_SMALL,
        chunk_size=2,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='|NN|',
                                             lemmatize=True),
    )

    for i, (filename, tokens) in enumerate(reader):

        assert expected_documents[i] == list(tokens)
        assert expected_names[i] == filename
Example 27
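Verifies that the tokenizer returns the source tokens unchanged when no extract options are set.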
def test_reader_when_no_transforms_returns_source_tokens():

    tokens_reader = readers.SparvCsvTokenizer(
        source=SPARV_CSV_EXPORT_FILENAME_SMALL,
        reader_opts=TextReaderOpts(),
        extract_opts=ExtractTaggedTokensOpts(pos_includes=None,
                                             pos_paddings=None,
                                             pos_excludes=None,
                                             lemmatize=False,
                                             **SPARV_TAGGED_COLUMNS),
    )

    expected = "Rödräven är ett hunddjur som har en mycket vidsträckt utbredning över norra halvklotet .".split(
    )

    filename, tokens = next(tokens_reader)

    assert filename == os.path.split(filename)[1]
    assert expected == tokens
Example 28
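Verifies that restricting extraction to lemmatized nouns implicitly filters out punctuation.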
def test_reader_when_only_nouns_ignore_puncts_returns_filter_outs_puncts():

    expected = ['rödräv', 'hunddjur', 'utbredning', 'halvklot']
    expected_name = "sparv_xml_export_small.txt"

    reader = readers.SparvXmlReader(
        SPARV_XML_EXPORT_FILENAME_SMALL,
        chunk_size=None,
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes='|NN|',
            pos_paddings=None,
            lemmatize=True,
        ),
    )

    filename, tokens = next(iter(reader))

    assert expected == list(tokens)
    assert expected_name == filename
Example 29
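End-to-end pipeline test: load text, tag it with spaCy, convert the tagged frame to tokens, join back to text, and vectorize into a VectorizedCorpus.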
def test_spacy_pipeline_load_text_to_spacy_to_dataframe_to_tokens_to_text_to_dtm(
        en_nlp):
    pytest.importorskip("spacy")

    reader_opts = TextReaderOpts(filename_pattern="*.txt",
                                 filename_fields="year:_:1")
    text_transform_opts = TextTransformOpts()
    reader = TextReader.create(MARY_TEST_CORPUS,
                               reader_opts=reader_opts,
                               transform_opts=text_transform_opts)

    attributes = ['text', 'lemma_', 'pos_', 'is_punct']
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|VERB|NOUN|',
        pos_paddings=None,
        **SPACY_TAGGED_COLUMNS,
        filter_opts=dict(is_punct=False),
    )
    transform_opts = None

    vectorize_opts = VectorizeOpts()

    config = Mock(
        spec=CorpusConfig,
        pipeline_payload=PipelinePayload(source=reader).put2(
            **SPACY_TAGGED_COLUMNS),
    )

    pipeline = (
        CorpusPipeline(config=config)
        .load_text(reader_opts=reader_opts, transform_opts=text_transform_opts)
        .set_spacy_model(en_nlp)
        .text_to_spacy()
        .spacy_to_tagged_frame(attributes=attributes)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .tokens_to_text()
        .to_dtm(vectorize_opts)
    )

    corpus = pipeline.value()
    assert corpus is not None
    assert isinstance(corpus, VectorizedCorpus)
Example 30
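Verifies that SparvXmlReader reads a zipped archive and yields the expected tokens and document names.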
def test_reader_when_source_is_zipped_archive_succeeds():

    expected_documents = [
        ['rödräv', 'hunddjur', 'utbredning', 'halvklot'],
        [
            'fjällräv', 'fjällvärld', 'liv', 'fjällräv', 'vinter', 'men',
            'variant', 'år'
        ],
    ]
    expected_names = ["document_001.txt", "document_002.txt"]

    reader = readers.SparvXmlReader(
        SPARV_ZIPPED_XML_EXPORT_FILENAME,
        chunk_size=None,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='|NN|',
                                             pos_paddings=None,
                                             lemmatize=True),
    )

    for i, (filename, tokens) in enumerate(reader):

        assert expected_documents[i] == list(tokens)
        assert expected_names[i] == filename