Example #1
def test_pipeline_text_to_dtm_succeeds(config: pipeline.CorpusConfig):

    target_tag: str = f"{uuid.uuid1()}"

    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')

    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings=None,
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    corpus: corpora.VectorizedCorpus = (
        (
            pipeline.CorpusPipeline(config=config)
            .checkpoint(tagged_corpus_source, force_checkpoint=False)
            .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
            .tokens_transform(transform_opts=corpora.TokensTransformOpts())
            .tokens_to_text()
            .tqdm()
            .to_dtm()
        )
        .single()
        .content
    )

    corpus.dump(tag=target_tag, folder=OUTPUT_FOLDER)

    assert isinstance(corpus, corpora.VectorizedCorpus)
    assert corpus.data.shape[0] == 5
    assert len(corpus.token2id) == corpus.data.shape[1]

    corpus.remove(tag=target_tag, folder=OUTPUT_FOLDER)
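The dumped corpus can be read back in a later step; a minimal sketch, assuming the `dump_exists`/`load` class methods used by the workflow examples further down and that `penelope` exposes `corpora` at package level:

from penelope import corpora  # assumed import path, matching the aliases used in these examples

def reload_dumped_corpus(tag: str, folder: str) -> corpora.VectorizedCorpus:
    """Reload a DTM previously stored with corpus.dump(tag=..., folder=...)."""
    assert corpora.VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    return corpora.VectorizedCorpus.load(tag=tag, folder=folder)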
Example #2
def test_pipeline_tagged_frame_to_text_succeeds(config: pipeline.CorpusConfig):

    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')

    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings=None,
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    tagged_payload = next(
        pipeline.CorpusPipeline(config=config).checkpoint(tagged_corpus_source, force_checkpoint=False).resolve()
    )

    text_payload = next(
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_to_text()
        .resolve()
    )

    assert tagged_payload.filename == text_payload.filename
    assert len(tagged_payload.content[tagged_payload.content.pos_ == 'NOUN']) == len(text_payload.content.split())
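`resolve()` yields payloads lazily, so the stream can be sliced without deserializing the whole checkpoint; a small sketch under that assumption, reusing the module-level `pipeline` import these tests rely on and the payload `filename` attribute asserted above:

import itertools

def first_payload_filenames(config, tagged_corpus_source: str, n: int = 2) -> list:
    """Peek at the first n payload filenames produced by a checkpoint pipeline."""
    stream = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .resolve()
    )
    return [payload.filename for payload in itertools.islice(stream, n)]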
Example #3
def test_workflow_to_dtm_step_by_step(config: pipeline.CorpusConfig):

    corpus_tag: str = f"{uuid.uuid1()}"
    target_folder: str = "./tests/output"
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source: str = f"./tests/output/{uuid.uuid1()}_pos_csv.zip"

    args: ComputeOpts = ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder=target_folder,
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
    )
    with inline_code(spacy_pipeline.to_tagged_frame_pipeline):

        tagged_frame_filename: str = tagged_corpus_source or utility.path_add_suffix(
            config.pipeline_payload.source, '_pos_csv'
        )

        p: pipeline.CorpusPipeline = (
            pipeline.CorpusPipeline(config=config)
            .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
            .load_text(
                reader_opts=config.text_reader_opts,
                transform_opts=None,
                source=corpus_source,
            )
            .text_to_spacy()
            .spacy_to_pos_tagged_frame()
            .checkpoint(filename=tagged_frame_filename, force_checkpoint=args.force_checkpoint)
        )

        if args.enable_checkpoint:
            p = p.checkpoint_feather(folder=config.get_feather_folder(corpus_source), force=args.force_checkpoint)

        p.exhaust()
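From the checkpoint produced above, the remaining DTM step can be expressed with the same operators as in Example #1; a sketch, assuming `args` is the `ComputeOpts` instance built earlier in this test:

def checkpoint_to_dtm(config, tagged_frame_filename: str, args) -> corpora.VectorizedCorpus:
    """Sketch: turn an existing tagged-frame checkpoint into a vectorized corpus."""
    return (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_frame_filename, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=args.extract_opts, transform_opts=args.transform_opts)
        .to_dtm()
        .single()
        .content
    )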
Example #4
def debug_main(
    config_filename: str = None,
    corpus_source: str = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    min_tf: int = None,
):
    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    transform_opts: pc.TokensTransformOpts = pc.TokensTransformOpts(
        to_lower=to_lower,
        to_upper=False,
        min_len=min_word_length,
        max_len=max_word_length,
        remove_accents=False,
        remove_stopwords=(remove_stopwords is not None),
        stopwords=None,
        extra_stopwords=None,
        language=remove_stopwords,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
        only_alphabetic=only_alphabetic,
        only_any_alphanumeric=only_any_alphanumeric,
    )

    extract_opts = pc.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
    ).set_numeric_names()

    vectorize_opts: pc.VectorizeOpts = pc.VectorizeOpts(already_tokenized=True, min_tf=min_tf, max_tokens=100000)

    corpus_source: str = corpus_source or config.pipeline_payload.source

    corpus: pc.VectorizedCorpus = id_tagged_frame_to_DTM_pipeline(
        corpus_config=config,
        corpus_source=corpus_source,
        file_pattern='**/prot-*.feather',
        extract_opts=extract_opts,
        transform_opts=transform_opts,
        vectorize_opts=vectorize_opts,
    ).value()
    corpus = corpus.slice_by_tf(5)

    os.makedirs('./data/bogger', exist_ok=True)
    corpus.dump(tag='bogger', folder='./data/bogger', mode='files')

    print(f"Stored corpus of shape {corpus.data.shape}")
Example #5
def test_workflow_to_dtm():

    config: pipeline.CorpusConfig = pipeline.CorpusConfig.load('./tests/test_data/riksprot-kb-parlaclarin.yml')

    args: interface.ComputeOpts = interface.ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source='/data/westac/riksdagen_corpus_data/riksprot_parlaclarin_basic_protocol_stanza.csv.zip',
        corpus_type=pipeline.CorpusType.SparvCSV,
        target_folder='./data',
        transform_opts=corpora.TokensTransformOpts(to_lower=True,
                                                   only_alphabetic=True),
        # text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        text_reader_opts=config.text_reader_opts,
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='',
            pos_excludes='|MID|MAD|PAD|',
            **config.pipeline_payload.tagged_columns_names,
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        create_subfolder=True,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=5,
        tf_threshold_mask=True,
    )

    corpus = workflow.compute(args=args, corpus_config=config)

    corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
    corpus.dump(tag=args.corpus_tag, folder=args.target_folder)

    assert corpora.VectorizedCorpus.dump_exists(tag=args.corpus_tag,
                                                folder=args.target_folder)

    corpus_loaded = corpora.VectorizedCorpus.load(tag=args.corpus_tag,
                                                  folder=args.target_folder)

    assert corpus_loaded is not None

    y_corpus = corpus.group_by_year()

    assert y_corpus is not None

    with contextlib.suppress(Exception):
        corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
Example #6
def test_spaCy_co_occurrence_pipeline3(config):

    corpus_source = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source = f'./tests/output/{uuid.uuid1()}_pos.csv.zip'
    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source=corpus_source,
        target_folder=f'./tests/output/{uuid.uuid1()}',
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: utility.PoS_Tag_Scheme = utility.PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english',
                                                   remove_stopwords=True,
                                                   to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv',
                                                filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        context_opts=co_occurrence.ContextOpts(
            context_width=4,
            concept=set(),
            ignore_concept=False,
            partition_keys=['document_id'],
        ),
    )

    workflow.compute(
        args=args,
        corpus_config=config,
        tagged_corpus_source=tagged_corpus_source,
    )

    assert os.path.isfile(tagged_corpus_source)
    assert os.path.isdir(args.target_folder)

    shutil.rmtree(args.target_folder, ignore_errors=True)
    os.remove(tagged_corpus_source)
Example #7
def test_workflow_to_dtm(config: pipeline.CorpusConfig):

    args: ComputeOpts = ComputeOpts(
        corpus_tag=f'{uuid.uuid1()}',
        corpus_source='./tests/test_data/legal_instrument_five_docs_test.zip',
        corpus_type=pipeline.CorpusType.Text,
        target_folder='./tests/output/',
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        create_subfolder=False,
        persist=True,
        enable_checkpoint=True,
        force_checkpoint=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        tagged_corpus_source='./tests/output/legal_instrument_five_docs_test_pos_csv.zip',
    )

    corpus = workflow.compute(args=args, corpus_config=config)

    corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
    corpus.dump(tag=args.corpus_tag, folder=args.target_folder)

    assert corpora.VectorizedCorpus.dump_exists(tag=args.corpus_tag, folder=args.target_folder)

    corpus_loaded = corpora.VectorizedCorpus.load(tag=args.corpus_tag, folder=args.target_folder)

    assert corpus_loaded is not None

    y_corpus = corpus.group_by_year()

    assert y_corpus is not None

    with contextlib.suppress(Exception):
        corpus.remove(tag=args.corpus_tag, folder=args.target_folder)
Example #8
def test_pipeline_take_succeeds(config: pipeline.CorpusConfig):
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')

    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True, **config.pipeline_payload.tagged_columns_names, filter_opts=dict(is_punct=False)
    )

    take_payloads = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_to_text()
        .take(2)
    )

    assert len(take_payloads) == 2
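The terminal operations used across these tests differ only in how much of the stream they materialize; a sketch contrasting them, limited to methods that already appear in these examples:

def drain_pipeline_variants(config, tagged_corpus_source: str, extract_opts):
    """Sketch: the same pipeline drained with take(), to_list() and exhaust()."""
    def build() -> pipeline.CorpusPipeline:
        return (
            pipeline.CorpusPipeline(config=config)
            .checkpoint(tagged_corpus_source, force_checkpoint=False)
            .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
            .tokens_to_text()
        )

    first_two = build().take(2)     # first n payloads, as a list
    everything = build().to_list()  # all payloads, materialized
    build().exhaust()               # run for side effects only, discard payloads
    return first_two, everything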
Example #9
def test_spaCy_co_occurrence_workflow(config: pipeline.CorpusConfig):
    """Note: Use the output from this test case to update the tests/test_data/VENUS test data VENUS-TESTDATA"""

    os.makedirs('./tests/output', exist_ok=True)

    config.pipeline_payload.source = './tests/test_data/legal_instrument_five_docs_test.zip'
    config.pipeline_payload.document_index_source = './tests/test_data/legal_instrument_five_docs_test.csv'
    config.checkpoint_opts.feather_folder = f'tests/output/{uuid.uuid1()}'
    corpus_tag: str = 'VENUS'
    target_folder: str = f'./tests/output/{uuid.uuid1()}'

    tagged_corpus_source: str = "./tests/output/co_occurrence_test_pos_csv.zip"

    bundle: co_occurrence.Bundle = spaCy_co_occurrence_pipeline(
        corpus_config=config,
        corpus_source=None,
        transform_opts=corpora.TokensTransformOpts(language='english',
                                                   remove_stopwords=True,
                                                   to_lower=True),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        context_opts=co_occurrence.ContextOpts(context_width=4,
                                               ignore_concept=True,
                                               partition_keys=['document_id'],
                                               processes=None),
        global_threshold_count=1,
        tagged_corpus_source=tagged_corpus_source,
    ).value()

    assert bundle.corpus is not None
    assert bundle.token2id is not None
    assert bundle.document_index is not None

    bundle.tag = corpus_tag
    bundle.folder = target_folder
    bundle.co_occurrences = bundle.corpus.to_co_occurrences(bundle.token2id)

    bundle.store()

    shutil.rmtree(bundle.folder, ignore_errors=True)
    shutil.rmtree(tagged_corpus_source, ignore_errors=True)
    shutil.rmtree(config.checkpoint_opts.feather_folder, ignore_errors=True)
Example #10
def noun_pipeline(id_to_token: bool) -> pp.CorpusPipeline:
    corpus_source: str = './tests/test_data/tranströmer_id_tagged_frames'
    file_pattern: str = '**/tran_*.feather'

    config_filename: str = jj(corpus_source, 'corpus.yml')
    corpus_config: pp.CorpusConfig = pp.CorpusConfig.load(path=config_filename).folders(corpus_source)
    extract_opts: pc.ExtractTaggedTokensOpts = pc.ExtractTaggedTokensOpts(
        lemmatize=False, pos_includes='NN', **corpus_config.pipeline_payload.tagged_columns_names
    )

    if not id_to_token:
        extract_opts.set_numeric_names()

    p: pp.CorpusPipeline = (
        pp.CorpusPipeline(config=corpus_config)
        .load_id_tagged_frame(folder=corpus_source, id_to_token=id_to_token, file_pattern=file_pattern)
        .filter_tagged_frame(extract_opts=extract_opts, pos_schema=utility.PoS_Tag_Schemes.SUC)
    )
    return p
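A possible way to consume the returned pipeline, assuming payloads carry the `filename`/`content` attributes used elsewhere in these examples (after `filter_tagged_frame` the content is still a tagged frame):

def noun_counts_per_document() -> dict:
    """Sketch: run the noun pipeline and count remaining tagged-frame rows per document."""
    payloads = noun_pipeline(id_to_token=True).to_list()
    return {payload.filename: len(payload.content) for payload in payloads}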
Example #11
def test_spaCy_co_occurrence_pipeline(config: pipeline.CorpusConfig):

    os.makedirs('./tests/output', exist_ok=True)
    tagged_corpus_source: str = "./tests/test_data/legal_instrument_five_docs_test_pos_csv.zip"
    target_filename = './tests/output/SSI-co-occurrence-JJVBNN-window-9.csv'
    if os.path.isfile(target_filename):
        os.remove(target_filename)

    # .folder(folder='./tests/test_data')
    pos_scheme: utility.PoS_Tag_Scheme = utility.PoS_Tag_Schemes.Universal
    transform_opts: corpora.TokensTransformOpts = corpora.TokensTransformOpts()
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes=utility.pos_tags_to_str(pos_scheme.Adjective +
                                             pos_scheme.Verb +
                                             pos_scheme.Noun),
        pos_paddings=utility.pos_tags_to_str(pos_scheme.Conjunction),
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )
    context_opts: co_occurrence.ContextOpts = co_occurrence.ContextOpts(
        context_width=4,
        partition_keys=['document_id'],
    )
    global_threshold_count: int = 1

    value: co_occurrence.Bundle = spaCy_co_occurrence_pipeline(
        corpus_config=config,
        corpus_source=config.pipeline_payload.source,
        transform_opts=transform_opts,
        context_opts=context_opts,
        extract_opts=extract_opts,
        global_threshold_count=global_threshold_count,
        tagged_corpus_source=tagged_corpus_source,
    ).value()

    value.co_occurrences.to_csv(target_filename, sep='\t')

    assert os.path.isfile(target_filename)

    os.remove(target_filename)
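The result is written as a plain tab-separated file, so it can be inspected with pandas; a minimal sketch (the exact column layout is whatever `co_occurrences.to_csv` produced above):

import pandas as pd

def read_co_occurrences(filename: str) -> pd.DataFrame:
    """Read back a co-occurrence table stored as tab-separated values."""
    return pd.read_csv(filename, sep='\t', index_col=0)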
Example #12
def test_pipeline_tagged_frame_to_tuple_succeeds(config: pipeline.CorpusConfig):

    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')

    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings='|VERB|',
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    payloads = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_to_text()
        .to_list()
    )
    assert len(payloads) == 5

    assert all(isinstance(payload.content, str) for payload in payloads)
Example #13
def debug_main(
    config_filename: str = None,
    target_name: str = None,
    corpus_source: str = None,
    train_corpus_folder: str = None,
    target_folder: str = None,
    fix_hyphenation: bool = True,
    fix_accents: bool = True,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    max_tokens: int = None,
    alpha: str = 'asymmetric',
    chunk_size: int = 2000,
    engine: str = "gensim_lda-multicore",
    max_iter: int = None,
    minimum_probability: float = None,
    n_topics: int = 50,
    passes: int = None,
    per_word_topics: bool = False,
    random_seed: int = None,
    update_every: int = 1,
    workers: int = None,
    store_corpus: bool = True,
    store_compressed: bool = True,
    passthrough_column: str = None,
):
    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    if passthrough_column is None:

        text_transform_opts: pc.TextTransformOpts = pc.TextTransformOpts()

        if fix_accents:
            text_transform_opts.fix_accents = True

        if fix_hyphenation:
            """Replace default dehyphen function"""
            # fix_hyphens: Callable[[str], str] = (
            #     remove_hyphens_fx(config.text_reader_opts.dehyphen_expr)
            #     if config.text_reader_opts.dehyphen_expr is not None
            #     else remove_hyphens
            # )
            text_transform_opts.fix_hyphenation = False
            text_transform_opts.extra_transforms.append(pc.remove_hyphens)

        transform_opts: pc.TokensTransformOpts = pc.TokensTransformOpts(
            to_lower=to_lower,
            to_upper=False,
            min_len=min_word_length,
            max_len=max_word_length,
            remove_accents=False,
            remove_stopwords=(remove_stopwords is not None),
            stopwords=None,
            extra_stopwords=None,
            language=remove_stopwords,
            keep_numerals=keep_numerals,
            keep_symbols=keep_symbols,
            only_alphabetic=only_alphabetic,
            only_any_alphanumeric=only_any_alphanumeric,
        )

        extract_opts = pc.ExtractTaggedTokensOpts(
            lemmatize=lemmatize,
            pos_includes=pos_includes,
            pos_excludes=pos_excludes,
        ).set_numeric_names()

    else:
        # pass the tagged column straight through; without these assignments the
        # pipeline call below would reference undefined names
        extract_opts: str = passthrough_column
        text_transform_opts: pc.TextTransformOpts = None
        transform_opts: pc.TokensTransformOpts = None

    engine_args = remove_none(
        {
            'n_topics': n_topics,
            'passes': passes,
            'random_seed': random_seed,
            'alpha': alpha,
            'workers': workers,
            'max_iter': max_iter,
            'work_folder': os.path.join(target_folder, target_name),
            'chunk_size': chunk_size,
            'update_every': 2,
        }
    )
    vectorize_opts: VectorizeOpts = VectorizeOpts(
        already_tokenized=True,
        max_tokens=max_tokens,
        lowercase=False,
    )
    corpus_source: str = corpus_source or config.pipeline_payload.source

    _: dict = from_id_tagged_frame_pipeline(
        corpus_config=config,
        corpus_source=corpus_source,
        file_pattern='**/*.feather',
        extract_opts=extract_opts,
        transform_opts=transform_opts,
        vectorize_opts=vectorize_opts,
        target_name=target_name,
        train_corpus_folder=train_corpus_folder,
        target_folder=target_folder,
        engine=engine,
        engine_args=engine_args,
        store_corpus=store_corpus,
        store_compressed=store_compressed,
    ).value()
Example #14
def main(
    config_filename: Optional[str] = None,
    corpus_source: Optional[str] = None,
    filename_pattern: str = None,
    train_corpus_folder: Optional[str] = None,
    trained_model_folder: Optional[str] = None,
    target_mode: Literal['train', 'predict', 'both'] = 'both',
    target_folder: Optional[str] = None,
    target_name: Optional[str] = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    max_tokens: int = None,
    tf_threshold: int = None,
    # remove_stopwords: Optional[str] = None,
    # min_word_length: int = 2,
    # max_word_length: int = None,
    # keep_symbols: bool = False,
    # keep_numerals: bool = False,
    alpha: str = 'asymmetric',
    chunk_size: int = 2000,
    engine: str = "gensim_lda-multicore",
    max_iter: int = None,
    num_top_words: int = None,
    minimum_probability: float = None,
    n_topics: int = 50,
    passes: int = None,
    per_word_topics: bool = False,
    random_seed: int = None,
    update_every: int = 1,
    workers: int = None,
    store_corpus: bool = True,
    store_compressed: bool = True,
):
    to_lower = False  # for now...

    if not config_filename or not os.path.isfile(config_filename):
        click.echo("error: config file not specified/found")
        sys.exit(1)

    if target_name is None:
        click.echo("error: target_name not specified")
        sys.exit(1)

    if target_mode == 'predict' and not InferredModel.exists(trained_model_folder):
        click.echo("error: trained model folder not specified/found")
        sys.exit(1)

    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    if corpus_source is None and config.pipeline_payload.source is None:
        click.echo("usage: corpus source must be specified")
        sys.exit(1)

    if not config.pipeline_key_exists("topic_modeling_pipeline"):
        click.echo("config error: `topic_modeling_pipeline` not specified")
        sys.exit(1)

    # transform_opts: pc.TokensTransformOpts = None

    extract_opts: pc.ExtractTaggedTokensOpts = pc.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        pos_column='pos_id',
        lemma_column='lemma_id',
        text_column='token_id',
    )

    vectorize_opts: pc.VectorizeOpts = pc.VectorizeOpts(
        already_tokenized=True,
        lowercase=to_lower,
        max_tokens=max_tokens,
        min_tf=tf_threshold,
    )
    engine_args = remove_none(
        dict(
            alpha=alpha,
            chunk_size=chunk_size,
            max_iter=max_iter,
            num_top_words=num_top_words,
            minimum_probability=minimum_probability,
            n_topics=n_topics,
            passes=passes,
            per_word_topics=per_word_topics,
            random_seed=random_seed,
            update_every=update_every,
            work_folder=os.path.join(target_folder, target_name),
            workers=workers,
        ))
    # _: dict = config.get_pipeline(
    #     pipeline_key="topic_modeling_pipeline",

    value: dict = workflow.compute(
        corpus_config=config,
        corpus_source=corpus_source,
        filename_pattern=filename_pattern,
        train_corpus_folder=train_corpus_folder,
        trained_model_folder=trained_model_folder,
        target_mode=target_mode,
        target_folder=target_folder,
        target_name=target_name,
        extract_opts=extract_opts,
        vectorize_opts=vectorize_opts,
        engine=engine,
        engine_args=engine_args,
        store_corpus=store_corpus,
        store_compressed=store_compressed,
        # transform_opts=transform_opts,
    )

    logger.info(
        f"workflow completed: model {value.get('target_name')} stored in {value.get('target_folder')}"
    )
Example #15
def run_workflow():
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME).folders(DATA_FOLDER)
    corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    corpus_config.checkpoint_opts.deserialize_processes = 4

    compute_opts = ComputeOpts(
        corpus_type=pipeline.CorpusType.SparvCSV,
        corpus_source=CORPUS_FILENAME,
        target_folder=jj(OUTPUT_FOLDER, 'APA'),
        corpus_tag='APA',
        transform_opts=corpora.TokensTransformOpts(
            to_lower=True,
            to_upper=False,
            min_len=1,
            max_len=None,
            remove_accents=False,
            remove_stopwords=False,
            stopwords=None,
            extra_stopwords=None,
            language='swedish',
            keep_numerals=True,
            keep_symbols=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=corpora.TextReaderOpts(
            filename_pattern='*.csv',
            filename_filter=None,
            filename_fields=[
                'year:prot\\_(\\d{4}).*',
                'year2:prot_\\d{4}(\\d{2})__*',
                'number:prot_\\d+[afk_]{0,4}__(\\d+).*',
            ],
            index_field=None,
            as_binary=False,
            sep='\t',
            quoting=3,
        ),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            pos_includes='NN|PM',
            pos_excludes='MAD|MID|PAD',
            pos_paddings='AB|DT|HA|HD|HP|HS|IE|IN|JJ|KN|PC|PL|PN|PP|PS|RG|RO|SN|UO|VB',
            lemmatize=True,
            append_pos=False,
            global_tf_threshold=1,
            global_tf_threshold_mask=False,
            **corpus_config.pipeline_payload.tagged_columns_names,
        ),
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            stop_words=None,
            max_df=1.0,
            min_df=1,
            min_tf=1,
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        create_subfolder=True,
        persist=True,
        context_opts=ContextOpts(
            context_width=2,
            concept=set(['kammare']),
            ignore_concept=False,
            partition_keys=['document_name'],
            processes=4,
            chunksize=10,
        ),
        enable_checkpoint=False,
        force_checkpoint=False,
    )

    _ = workflow.compute(
        args=compute_opts,
        corpus_config=corpus_config,
        tagged_corpus_source=jj(OUTPUT_FOLDER, 'test.zip'),
    )
Example #16
def main(
    config_filename: Optional[str] = None,
    corpus_source: Optional[str] = None,
    train_corpus_folder: Optional[str] = None,
    trained_model_folder: Optional[str] = None,
    target_mode: Literal['train', 'predict', 'both'] = 'both',
    target_folder: Optional[str] = None,
    target_name: Optional[str] = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    max_tokens: int = None,
    tf_threshold: int = None,
    remove_stopwords: Optional[str] = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    alpha: str = 'asymmetric',
    chunk_size: int = 2000,
    engine: str = "gensim_lda-multicore",
    max_iter: int = None,
    minimum_probability: float = None,
    n_topics: int = 50,
    passes: int = None,
    per_word_topics: bool = False,
    random_seed: int = None,
    update_every: int = 1,
    workers: int = None,
    store_corpus: bool = True,
    store_compressed: bool = True,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    fix_hyphenation: bool = True,
    fix_accents: bool = True,
    enable_checkpoint: bool = True,
    force_checkpoint: bool = False,
    passthrough_column: Optional[str] = None,
):

    if not config_filename or not os.path.isfile(config_filename):
        click.echo("error: config file not specified/found")
        sys.exit(1)

    if target_name is None:
        click.echo("error: target_name not specified")
        sys.exit(1)

    if target_mode == 'predict' and not InferredModel.exists(trained_model_folder):
        click.echo("error: trained model folder not specified/found")
        sys.exit(1)

    config: pipeline.CorpusConfig = load_config(config_filename, corpus_source)

    if corpus_source is None and config.pipeline_payload.source is None:
        click.echo("usage: corpus source must be specified")
        sys.exit(1)

    if not config.pipeline_key_exists("topic_modeling_pipeline"):
        click.echo("config error: `topic_modeling_pipeline` not specified")
        sys.exit(1)

    text_transform_opts: pc.TextTransformOpts = pc.TextTransformOpts()

    if fix_accents:
        text_transform_opts.fix_accents = True

    if fix_hyphenation:
        """Replace default dehyphen function"""
        # fix_hyphens: Callable[[str], str] = (
        #     remove_hyphens_fx(config.text_reader_opts.dehyphen_expr)
        #     if config.text_reader_opts.dehyphen_expr is not None
        #     else remove_hyphens
        # )
        text_transform_opts.fix_hyphenation = False
        text_transform_opts.extra_transforms.append(pc.remove_hyphens)

    transform_opts: pc.TokensTransformOpts = pc.TokensTransformOpts(
        to_lower=to_lower,
        to_upper=False,
        min_len=min_word_length,
        max_len=max_word_length,
        remove_accents=False,
        remove_stopwords=(remove_stopwords is not None),
        stopwords=None,
        extra_stopwords=None,
        language=remove_stopwords,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
        only_alphabetic=only_alphabetic,
        only_any_alphanumeric=only_any_alphanumeric,
    )

    extract_opts: pc.ExtractTaggedTokensOpts = pc.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        **config.pipeline_payload.tagged_columns_names,
    )

    if passthrough_column is not None:

        extract_opts: str = passthrough_column
        text_transform_opts: pc.TextTransformOpts = None
        transform_opts: pc.TokensTransformOpts = None

    engine_args: dict = remove_none(
        dict(
            alpha=alpha,
            chunk_size=chunk_size,
            max_iter=max_iter,
            minimum_probability=minimum_probability,
            n_topics=n_topics,
            passes=passes,
            per_word_topics=per_word_topics,
            random_seed=random_seed,
            update_every=update_every,
            work_folder=os.path.join(target_folder, target_name),
            workers=workers,
        ))

    _: dict = config.get_pipeline(
        pipeline_key="topic_modeling_pipeline",
        config=config,
        corpus_source=corpus_source,
        train_corpus_folder=train_corpus_folder,
        trained_model_folder=trained_model_folder,
        target_mode=target_mode,
        target_folder=target_folder,
        target_name=target_name,
        text_transform_opts=text_transform_opts,
        extract_opts=extract_opts,
        transform_opts=transform_opts,
        engine=engine,
        engine_args=engine_args,
        store_corpus=store_corpus,
        store_compressed=store_compressed,
        enable_checkpoint=enable_checkpoint,
        force_checkpoint=force_checkpoint,
    ).value()
Example #17
def run_workflow():
    corpus_config = pipeline.CorpusConfig.load(CONFIG_FILENAME)  # .folders(DATA_FOLDER)
    #    corpus_config.pipeline_payload.files(source=CORPUS_FILENAME, document_index_source=None)
    # corpus_config.checkpoint_opts.deserialize_processes = 3

    transform_opts: corpora.TokensTransformOpts = corpora.TokensTransformOpts(
        to_lower=False,
        to_upper=False,
        min_len=1,
        max_len=None,
        remove_accents=False,
        remove_stopwords=False,
        stopwords=None,
        extra_stopwords=None,
        language='swedish',
        keep_numerals=True,
        keep_symbols=True,
        only_alphabetic=False,
        only_any_alphanumeric=False,
    )
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        pos_includes=None,
        pos_excludes=None,
        pos_paddings=None,
        lemmatize=True,
        append_pos=False,
        global_tf_threshold=1,
        global_tf_threshold_mask=False,
        **corpus_config.pipeline_payload.tagged_columns_names,
    )
    engine_args = {
        'n_topics': 4,
        'passes': 1,
        'random_seed': 42,
        'alpha': 'symmetric',
        'workers': 1,
        'max_iter': 500,
        'work_folder': './tests/output/',
    }
    extract_opts = "lemma"
    transform_opts = None
    _ = (
        CorpusPipeline(config=corpus_config)
        .load_id_tagged_frame(
            folder=CORPUS_FOLDER,
            file_pattern='**/prot-*.feather',
            id_to_token=True,
        )
        .tagged_frame_to_tokens(
            extract_opts=extract_opts,
            transform_opts=transform_opts,
        )
        .to_topic_model(
            target_mode="both",
            target_folder="./tests/output",
            target_name="APA",
            engine="gensim_lda-multicore",
            engine_args=engine_args,
            store_corpus=True,
            store_compressed=True,
        )
    ).value()
Example #18
def main(
    corpus_source: str = None,
    config_filename: str = None,
    model_folder: str = None,
    model_name: str = None,
    target_name: str = None,
    target_folder: str = None,
    lemmatize: bool = True,
    pos_includes: str = '',
    pos_excludes: str = '',
    to_lower: bool = True,
    remove_stopwords: str = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    minimum_probability: float = 0.001,
    n_tokens: int = 200,
    enable_checkpoint: bool = True,
    force_checkpoint: bool = False,
):

    config: pipeline.CorpusConfig = pipeline.CorpusConfig.load(
        path=config_filename)

    transform_opts: penelope.TokensTransformOpts = penelope.TokensTransformOpts(
        to_lower=to_lower,
        to_upper=False,
        min_len=min_word_length,
        max_len=max_word_length,
        remove_accents=False,
        remove_stopwords=(remove_stopwords is not None),
        stopwords=None,
        extra_stopwords=None,
        language=remove_stopwords,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
        only_alphabetic=only_alphabetic,
        only_any_alphanumeric=only_any_alphanumeric,
    )

    extract_opts = penelope.ExtractTaggedTokensOpts(
        lemmatize=lemmatize,
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        **config.pipeline_payload.tagged_columns_names,
    )

    tag, folder = workflow(
        config=config,
        model_name=model_name,
        model_folder=model_folder,
        target_name=target_name,
        target_folder=target_folder,
        corpus_source=corpus_source,
        extract_opts=extract_opts,
        transform_opts=transform_opts,
        minimum_probability=minimum_probability,
        n_tokens=n_tokens,
        enable_checkpoint=enable_checkpoint,
        force_checkpoint=force_checkpoint,
    )

    logger.info(f"Done! Model {tag} stored in {folder}")