Example #1
def test_workflow_to_dtm_step_by_step(config: pipeline.CorpusConfig):

    corpus_tag: str = f'{uuid.uuid1()}'  # uuid.uuid1() returns a UUID; format it to match the str annotation
    target_folder: str = "./tests/output"
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source: str = f"./tests/output/{uuid.uuid1()}_pos_csv.zip"

    args: ComputeOpts = ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder=target_folder,
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
    )
    with inline_code(spacy_pipeline.to_tagged_frame_pipeline):

        tagged_frame_filename: str = tagged_corpus_source or utility.path_add_suffix(
            config.pipeline_payload.source, '_pos_csv'
        )

        p: pipeline.CorpusPipeline = (
            pipeline.CorpusPipeline(config=config)
            .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
            .load_text(
                reader_opts=config.text_reader_opts,
                transform_opts=None,
                source=corpus_source,
            )
            .text_to_spacy()
            .spacy_to_pos_tagged_frame()
            .checkpoint(filename=tagged_frame_filename, force_checkpoint=args.force_checkpoint)
        )

        if args.enable_checkpoint:
            p = p.checkpoint_feather(folder=config.get_feather_folder(corpus_source), force=args.force_checkpoint)

        p.exhaust()
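
The inlined pipeline above is equivalent to the single workflow call used in Example #5; a minimal sketch reusing the args and config objects built in this test (no new names introduced):

# Sketch: same options, driven through the workflow entry point instead of
# assembling the pipeline by hand.
corpus = workflow.compute(args=args, corpus_config=config)
assert corpus is not None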
Example #2
def compute_callback(args: interface.ComputeOpts, corpus_config: pipeline.CorpusConfig) -> VectorizedCorpus:
    global LAST_ARGS, LAST_CORPUS_CONFIG
    LAST_ARGS = args
    LAST_CORPUS_CONFIG = corpus_config
    if args.dry_run:
        print(args.command_line("vectorize_corpus"))
        return None
    corpus: VectorizedCorpus = workflow.compute(args=args, corpus_config=corpus_config)
    return corpus
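
A minimal sketch of exercising this callback in dry-run mode, assuming the ComputeOptsSpacyCSV factory from Example #8 is importable, that dry_run is an ordinary mutable attribute, and that a corpus_config has been loaded as in Example #7 (all three are assumptions, not verified against the library):

# Sketch: dry-run call; prints the CLI line and returns None.
args = ComputeOptsSpacyCSV(corpus_tag='DRY_RUN')
args.dry_run = True  # assumed mutable flag, read by the guard above
assert compute_callback(args, corpus_config) is None
assert LAST_ARGS is args  # captured by the module-level globals above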
Example #3
def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source: str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1', ),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag', ),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            min_tf=1,
            max_tokens=None,
        ),
    )
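
A short usage sketch: build the options from the factory and apply the same is_satisfied guard that the compute functions assert in Examples #6 and #9 (expected values are read off the factory defaults above):

# Sketch: sanity-check factory output before handing it to a workflow.
opts = ComputeOptsSparvCSV()
assert opts.corpus_tag == 'TELLUS'
assert opts.corpus_type == CorpusType.SparvCSV
assert opts.is_satisfied()  # precondition asserted in Examples #6 and #9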
Example #4
def test_spaCy_co_occurrence_pipeline3(config):

    corpus_source = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source = f'./tests/output/{uuid.uuid1()}_pos.csv.zip'
    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source=corpus_source,
        target_folder=f'./tests/output/{uuid.uuid1()}',
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: utility.PoS_Tag_Scheme = utility.PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(
            language='english',
            remove_stopwords=True,
            to_lower=True,
        ),
        text_reader_opts=corpora.TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=['year:_:1'],
        ),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        context_opts=co_occurrence.ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
    )

    workflow.compute(
        args=args,
        corpus_config=config,
        tagged_corpus_source=tagged_corpus_source,
    )

    assert os.path.isfile(tagged_corpus_source)
    assert os.path.isdir(args.target_folder)

    shutil.rmtree(args.target_folder, ignore_errors=True)
    os.remove(tagged_corpus_source)
Example #5
def test_workflow_to_dtm(config: pipeline.CorpusConfig):

    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source='./tests/test_data/legal_instrument_five_docs_test.zip',
        corpus_type=pipeline.CorpusType.Text,
        target_folder='./tests/output/',
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        create_subfolder=False,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        tagged_corpus_source='./tests/output/legal_instrument_five_docs_test_pos_csv.zip',
    )

    corpus = workflow.compute(args=args, corpus_config=config)

    corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
    corpus.dump(tag=args.corpus_tag, folder=args.target_folder)

    assert corpora.VectorizedCorpus.dump_exists(tag=args.corpus_tag, folder=args.target_folder)

    corpus_loaded = corpora.VectorizedCorpus.load(tag=args.corpus_tag, folder=args.target_folder)

    assert corpus_loaded is not None

    y_corpus = corpus.group_by_year()

    assert y_corpus is not None

    with contextlib.suppress(Exception):
        corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
Example #6
def compute(
    args: interface.ComputeOpts,
    corpus_config: CorpusConfig,
    tagged_frame_pipeline: pipeline.CorpusPipeline = None,
) -> VectorizedCorpus:

    try:

        assert args.is_satisfied()

        if tagged_frame_pipeline is None:
            tagged_frame_pipeline = corpus_config.get_pipeline(
                "tagged_frame_pipeline",
                corpus_source=args.corpus_source,
                enable_checkpoint=args.enable_checkpoint,
                force_checkpoint=args.force_checkpoint,
                tagged_corpus_source=args.tagged_corpus_source,
            )
        corpus: VectorizedCorpus = (
            tagged_frame_pipeline
            + wildcard_to_DTM_pipeline(
                transform_opts=args.transform_opts,
                extract_opts=args.extract_opts,
                vectorize_opts=args.vectorize_opts,
            )
        ).value()

        if (args.tf_threshold or 1) > 1:
            corpus = corpus.slice_by_tf(args.tf_threshold)

        if args.persist:
            store_corpus_bundle(corpus, args)

        return corpus

    except Exception:
        raise  # re-raise unchanged; 'raise ex' would reset the traceback origin
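
Callers can also prebuild the tagged-frame pipeline and inject it, skipping the fallback branch; a sketch that reuses the exact get_pipeline call from above:

# Sketch: inject a prebuilt pipeline instead of letting compute() resolve one.
p = corpus_config.get_pipeline(
    "tagged_frame_pipeline",
    corpus_source=args.corpus_source,
    enable_checkpoint=args.enable_checkpoint,
    force_checkpoint=args.force_checkpoint,
    tagged_corpus_source=args.tagged_corpus_source,
)
corpus = compute(args=args, corpus_config=corpus_config, tagged_frame_pipeline=p)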
Example #7
def run_workflow():
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME).folders(DATA_FOLDER)
    corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    corpus_config.checkpoint_opts.deserialize_processes = 4

    compute_opts = ComputeOpts(
        corpus_type=pipeline.CorpusType.SparvCSV,
        corpus_source=CORPUS_FILENAME,
        target_folder=jj(OUTPUT_FOLDER, 'APA'),
        corpus_tag='APA',
        transform_opts=corpora.TokensTransformOpts(
            to_lower=True,
            to_upper=False,
            min_len=1,
            max_len=None,
            remove_accents=False,
            remove_stopwords=False,
            stopwords=None,
            extra_stopwords=None,
            language='swedish',
            keep_numerals=True,
            keep_symbols=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=corpora.TextReaderOpts(
            filename_pattern='*.csv',
            filename_filter=None,
            filename_fields=[
                'year:prot\\_(\\d{4}).*',
                'year2:prot_\\d{4}(\\d{2})__*',
                'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
            ],
            index_field=None,
            as_binary=False,
            sep='\t',
            quoting=3,
        ),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            pos_includes='NN|PM',
            pos_excludes='MAD|MID|PAD',
            pos_paddings='AB|DT|HA|HD|HP|HS|IE|IN|JJ|KN|PC|PL|PN|PP|PS|RG|RO|SN|UO|VB',
            lemmatize=True,
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **corpus_config.pipeline_payload.tagged_columns_names,
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            stop_words=None,
            max_df=1.0,
            min_df=1,
            min_tf=1,
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=ContextOpts(
            context_width=2,
            concept=set(['kammare']),
            ignore_concept=False,
            partition_keys=['document_name'],
            processes=4,
            chunksize=10,
        ),
        enable_checkpoint=False,
        force_checkpoint=False,
    )

    _ = workflow.compute(
        args=compute_opts,
        corpus_config=corpus_config,
        tagged_corpus_source=jj(OUTPUT_FOLDER, 'test.zip'),
    )
Example #8
def ComputeOptsSpacyCSV(
    *,
    corpus_tag: str = 'MARS',
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes

    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=TokensTransformOpts(
            extra_stopwords=[],
            keep_numerals=True,
            keep_symbols=True,
            language='english',
            max_len=None,
            min_len=1,
            only_alphabetic=False,
            only_any_alphanumeric=False,
            remove_accents=False,
            remove_stopwords=True,
            stopwords=None,
            to_lower=True,
            to_upper=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=['year:_:1'],
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            target_override=None,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_paddings=None,
            pos_excludes='|PUNCT|EOL|SPACE|',
            passthrough_tokens=[],
            block_tokens=[],
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **SPACY_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            max_df=1.0,
            min_df=1,
            min_tf=1,
            max_tokens=None,
        ),
    )
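
The factory is keyword-only, so overrides must be passed by name; option groups can also be adjusted after construction, as Example #9 does with extract_opts (mutability is assumed from that example, not verified):

# Sketch: per-call override plus a post-hoc tweak.
opts = ComputeOptsSpacyCSV(corpus_tag='MARS_RAW')
opts.extract_opts.lemmatize = False  # assumed mutable, as mutated in Example #9
opts.tf_threshold = 2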
Example #9
def compute(
    args: interface.ComputeOpts,
    corpus_config: pipeline.CorpusConfig,
    tagged_corpus_source: Optional[str] = None,
) -> co_occurrence.Bundle:
    """Creates and stores a concept co-occurrence bundle using specified options."""

    try:

        assert args.is_satisfied()

        target_filename = co_occurrence.to_filename(folder=args.target_folder, tag=args.corpus_tag)

        os.makedirs(args.target_folder, exist_ok=True)

        tagged_corpus_source = tagged_corpus_source or jj(
            dirname(args.corpus_source), f"{args.corpus_tag}{POS_TAGGED_FRAME_FILENAME_POSTFIX}"
        )

        tagged_frame_pipeline: pipeline.CorpusPipeline = corpus_config.get_pipeline(
            "tagged_frame_pipeline",
            corpus_source=args.corpus_source,
            tagged_corpus_source=tagged_corpus_source,
            enable_checkpoint=args.enable_checkpoint,
            force_checkpoint=args.force_checkpoint,
        )

        args.extract_opts.passthrough_tokens = args.context_opts.concept
        args.extract_opts.block_tokens = []
        # args.extract_opts.block_chars = ''
        args.extract_opts.global_tf_threshold = args.tf_threshold
        args.extract_opts.global_tf_threshold_mask = args.tf_threshold_mask

        p: pipeline.CorpusPipeline = (
            tagged_frame_pipeline
            + pipeline.wildcard_to_partition_by_document_co_occurrence_pipeline(
                transform_opts=args.transform_opts,
                extract_opts=args.extract_opts,
                context_opts=args.context_opts,
                global_tf_threshold=args.tf_threshold,
            )
        )

        bundle: co_occurrence.Bundle = p.value()

        if bundle.corpus is None:
            raise co_occurrence.ZeroComputeError()

        bundle.tag = args.corpus_tag
        bundle.folder = args.target_folder

        try:
            bundle.co_occurrences = bundle.corpus.to_co_occurrences(bundle.token2id)
        except ValueError as ex:
            logger.error("to_co_occurrences failed; bundle is stored without a co-occurrence frame")
            logger.exception(ex)

        bundle.compute_options = compile_compute_options(args, target_filename)

        bundle.store()

        return bundle

    except Exception as ex:
        # the previous handlers for ValueError/FileNotFoundError/PermissionError
        # did exactly the same thing, so one handler suffices
        logger.error(ex)
        raise
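
A sketch of invoking this workflow with an explicit tagged source; the ComputeOptsSparvCSV factory from Example #3 already supplies the context_opts (concept, width, partition keys) this pipeline needs, and corpus_config is assumed loaded as in Example #7:

# Sketch: run the co-occurrence workflow end to end.
args = ComputeOptsSparvCSV(corpus_tag='CONCEPT_JAG')
bundle = compute(
    args=args,
    corpus_config=corpus_config,
    tagged_corpus_source='./tests/output/concept_pos_csv.zip',  # hypothetical path
)
assert bundle.corpus is not None  # compute() raises ZeroComputeError otherwise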