Example #1
def test_pipeline_tagged_frame_to_text_succeeds(config: pipeline.CorpusConfig):

    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')

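    # extract lemmatized NOUN tokens only; drop punctuation, apply no PoS padding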
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings=None,
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    tagged_payload = next(
        pipeline.CorpusPipeline(config=config).checkpoint(tagged_corpus_source, force_checkpoint=False).resolve()
    )

    text_payload = next(
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_to_text()
        .resolve()
    )

    assert tagged_payload.filename == text_payload.filename
    assert len(tagged_payload.content[tagged_payload.content.pos_ == 'NOUN']) == len(text_payload.content.split())
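A variant sketch that also keeps verbs (hypothetical; the '|NOUN|VERB|' value follows the pos_includes syntax used in the other examples):

    extract_opts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|VERB|',
        pos_paddings=None,
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )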
Example #2
def test_pipeline_tagged_frame_to_vocabulary_succeeds(config: pipeline.CorpusConfig):

    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')

    pipe: pipeline.CorpusPipeline = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .vocabulary(lemmatize=True, progress=False)
        .exhaust()
    )

    assert pipe.payload.token2id is not None
    assert pipe.payload.token2id.tf is not None
    assert len(pipe.payload.token2id) == 1147
    # token2id maps token -> id; token2id.tf maps token id -> corpus-wide term frequency
    assert len(pipe.payload.token2id) == len(pipe.payload.token2id.tf)
    assert set(pipe.payload.token2id.data.keys()) == {x.lower() for x in pipe.payload.token2id.keys()}
    assert 'Cultural' not in pipe.payload.token2id
    assert 'wars' not in pipe.payload.token2id
    assert 'war' in pipe.payload.token2id

    pipe: pipeline.CorpusPipeline = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .vocabulary(lemmatize=False, progress=False)
        .exhaust()
    )

    assert len(pipe.payload.token2id) == 1478
    assert 'Cultural' in pipe.payload.token2id
    assert 'wars' in pipe.payload.token2id

    assert pipe.payload.token2id.tf[pipe.payload.token2id['the']] == 704
Example #3
def test_load_text_returns_payload_with_expected_document_index(config: pipeline.CorpusConfig):

    transform_opts = corpora.TextTransformOpts()

    pipe = pipeline.CorpusPipeline(config=config).load_text(
        reader_opts=config.text_reader_opts, transform_opts=transform_opts
    )
    assert pipe is not None

    payloads: List[pipeline.DocumentPayload] = list(pipe.resolve())

    assert len(payloads) == 5
    assert len(pipe.payload.document_index) == 5
    assert len(pipe.payload.metadata) == 5
    assert pipe.payload.pos_schema_name == "Universal"
    assert pipe.payload.get('text_reader_opts') == config.text_reader_opts.props
    assert isinstance(pipe.payload.document_index, pd.DataFrame)

    columns = pipe.payload.document_index.columns.tolist()
    assert columns == [
        'section_id',
        'unesco_id',
        'filename',
        'type',
        'href',
        'year',
        'date',
        'city',
        'x.title',
        'document_id',
        'document_name',
    ]
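    # each filename_fields entry has the form 'name:sep:index' (e.g. 'year:_:1'); the name becomes a document_index column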
    assert all(x.split(':')[0] in columns for x in config.text_reader_opts.filename_fields)
    assert pipe.payload.document_lookup('RECOMMENDATION_0201_049455_2017.txt')['unesco_id'] == 49455
Example #4
def test_pipeline_text_to_dtm_succeeds(config: pipeline.CorpusConfig):

    target_tag: str = str(uuid.uuid1())

    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')

    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings=None,
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    corpus: corpora.VectorizedCorpus = (
        (
            pipeline.CorpusPipeline(config=config)
            .checkpoint(tagged_corpus_source, force_checkpoint=False)
            .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
            .tokens_transform(transform_opts=corpora.TokensTransformOpts())
            .tokens_to_text()
            .tqdm()
            .to_dtm()
        )
        .single()
        .content
    )

    corpus.dump(tag=target_tag, folder=OUTPUT_FOLDER)

    assert isinstance(corpus, corpora.VectorizedCorpus)
    assert corpus.data.shape[0] == 5
    assert len(corpus.token2id) == corpus.data.shape[1]

    corpus.remove(tag=target_tag, folder=OUTPUT_FOLDER)
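Before the remove call above, the dumped corpus should be recoverable from the same tag and folder; a minimal round-trip sketch, assuming VectorizedCorpus exposes a load counterpart to dump:

    loaded: corpora.VectorizedCorpus = corpora.VectorizedCorpus.load(tag=target_tag, folder=OUTPUT_FOLDER)
    assert loaded.data.shape == corpus.data.shape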
Example #5
def test_workflow_to_dtm_step_by_step(config: pipeline.CorpusConfig):

    corpus_tag: str = str(uuid.uuid1())
    target_folder: str = "./tests/output"
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source: str = f"./tests/output/{uuid.uuid1()}_pos_csv.zip"

    args: ComputeOpts = ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder=target_folder,
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
    )
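    # the with-block below is an inlined, step-by-step copy of spacy_pipeline.to_tagged_frame_pipeline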
    with inline_code(spacy_pipeline.to_tagged_frame_pipeline):

        tagged_frame_filename: str = tagged_corpus_source or utility.path_add_suffix(
            config.pipeline_payload.source, '_pos_csv'
        )

        p: pipeline.CorpusPipeline = (
            pipeline.CorpusPipeline(config=config)
            .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
            .load_text(
                reader_opts=config.text_reader_opts,
                transform_opts=None,
                source=corpus_source,
            )
            .text_to_spacy()
            .spacy_to_pos_tagged_frame()
            .checkpoint(filename=tagged_frame_filename, force_checkpoint=args.force_checkpoint)
        )

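        # optionally cache the tagged frames as per-document Feather files as well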
        if args.enable_checkpoint:
            p = p.checkpoint_feather(folder=config.get_feather_folder(corpus_source), force=args.force_checkpoint)

        p.exhaust()
Example #6
def test_pipeline_can_load_pos_tagged_checkpoint(config: pipeline.CorpusConfig):

    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')

    pipe = pipeline.CorpusPipeline(config=config).checkpoint(tagged_corpus_source, force_checkpoint=False)

    payloads: List[pipeline.DocumentPayload] = pipe.to_list()

    assert len(payloads) == 5
    assert len(pipe.payload.document_index) == 5
    assert isinstance(pipe.payload.document_index, pd.DataFrame)
Example #7
def test_store_id_tagged_frame():
    config: pp.CorpusConfig = pp.CorpusConfig.load('./tests/test_data/tranströmer.yml')
    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    target_folder: str = './tests/test_data/tranströmer_id_tagged_frames'
    _: pp.CorpusPipeline = (
        pp.CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .to_id_tagged_frame(ingest_vocab_type=IngestVocabType.Incremental)
        .store_id_tagged_frame(target_folder)
    ).exhaust()
Example #8
def test_pipeline_take_succeeds(config: pipeline.CorpusConfig):
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')

    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True, **config.pipeline_payload.tagged_columns_names, filter_opts=dict(is_punct=False)
    )

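    # take(2) resolves the pipeline but keeps only the first two payloads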
    take_payloads = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_to_text()
        .take(2)
    )

    assert len(take_payloads) == 2
Example #9
def test_load_id_tagged_frame():
    config: pp.CorpusConfig = pp.CorpusConfig.load('./tests/test_data/tranströmer.yml')
    folder: str = './tests/test_data/tranströmer_id_tagged_frames'
    p: pp.CorpusPipeline = pp.CorpusPipeline(config=config).load_id_tagged_frame(
        folder=folder,
        file_pattern='**/tran*.feather',
        id_to_token=False,
    )

    payloads = p.to_list()

    assert len(payloads) == 5
    assert len(p.payload.document_index) == 5
    assert len(p.payload.token2id) == 341
Example #10
def noun_pipeline(id_to_token: bool) -> pp.CorpusPipeline:
    corpus_source: str = './tests/test_data/tranströmer_id_tagged_frames'
    file_pattern: str = '**/tran_*.feather'

    config_filename: str = jj(corpus_source, 'corpus.yml')
    corpus_config: pp.CorpusConfig = pp.CorpusConfig.load(path=config_filename).folders(corpus_source)
    extract_opts: pc.ExtractTaggedTokensOpts = pc.ExtractTaggedTokensOpts(
        lemmatize=False, pos_includes='NN', **corpus_config.pipeline_payload.tagged_columns_names
    )

    if not id_to_token:
        extract_opts.set_numeric_names()

    p: pp.CorpusPipeline = (
        pp.CorpusPipeline(config=corpus_config)
        .load_id_tagged_frame(folder=corpus_source, id_to_token=id_to_token, file_pattern=file_pattern)
        .filter_tagged_frame(extract_opts=extract_opts, pos_schema=utility.PoS_Tag_Schemes.SUC)
    )
    return p
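A minimal call-site sketch for the helper above (the expected document count of 5 is taken from the tranströmer examples earlier and is an assumption here):

    payloads = noun_pipeline(id_to_token=True).to_list()
    assert len(payloads) == 5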
Example #11
def test_pipeline_load_text_tag_checkpoint_stores_checkpoint(config: pipeline.CorpusConfig):

    tagged_corpus_source: str = os.path.join(OUTPUT_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')

    transform_opts = corpora.TextTransformOpts()

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    pathlib.Path(tagged_corpus_source).unlink(missing_ok=True)

    _ = (
        pipeline.CorpusPipeline(config=config)
        .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
        .load_text(reader_opts=config.text_reader_opts, transform_opts=transform_opts)
        .text_to_spacy()
        .tqdm()
        .spacy_to_pos_tagged_frame()
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
    ).exhaust()

    assert os.path.isfile(tagged_corpus_source)
    pathlib.Path(tagged_corpus_source).unlink(missing_ok=True)
Example #12
def test_pipeline_tagged_frame_to_tuple_succeeds(config: pipeline.CorpusConfig):

    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')

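    # pos_paddings='|VERB|' keeps verb positions as padding tokens instead of dropping them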
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings='|VERB|',
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    payloads = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_to_text()
        .to_list()
    )
    assert len(payloads) == 5

    assert all(isinstance(payload.content, str) for payload in payloads)
Example #13
def test_sparv_tagged_frame_to_tokens(corpus_config: pipeline.CorpusConfig):

    p = pipeline.CorpusPipeline(config=corpus_config)
    corpus_config.checkpoint_opts.feather_folder = None
    load = tasks.LoadTaggedCSV(
        filename=corpus_config.pipeline_payload.source,
        checkpoint_opts=corpus_config.checkpoint_opts,
        extra_reader_opts=corpus_config.text_reader_opts,
    )

    tagged_columns: dict = corpus_config.pipeline_payload.tagged_columns_names
    extract = tasks.TaggedFrameToTokens(
        extract_opts=ExtractTaggedTokensOpts(lemmatize=True, **tagged_columns, filter_opts=dict(is_punct=False)),
    )

    p.add([load, extract])

    payloads = list(p.resolve())

    assert [x.document_name for x in payloads] == ['prot_197677__25', 'prot_197677__26', 'prot_197677__27']
    assert all(x.content_type == pipeline.ContentType.TOKENS for x in payloads)
    assert all(isinstance(x.content, list) for x in payloads)
    assert len(payloads) == 3
Example #14
def test_pipeline_find_task(config: pipeline.CorpusConfig):
    p: pipeline.CorpusPipeline = (
        pipeline.CorpusPipeline(config=config).checkpoint("dummy_name", force_checkpoint=False).tqdm()
    )
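    # find(task_cls) returns the matching task instance from the pipeline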
    assert isinstance(p.find(tasks.Checkpoint), tasks.Checkpoint)
    assert isinstance(p.find(tasks.Tqdm), tasks.Tqdm)