コード例 #1
0
def test_to_spacy_doc(test_payload):
    """ToSpacyDoc should convert a TEXT payload into a SPACYDOC payload."""
    pytest.importorskip("spacy")
    to_doc_task = ToSpacyDoc()
    _ = (
        patch_spacy_pipeline(test_payload)
        .add(SetSpacyModel(name_or_nlp="en_core_web_sm"))
        .add(to_doc_task)
        .setup()
    )
    text_payload = DocumentPayload(content_type=ContentType.TEXT, filename='hello.txt', content="Hello world!")
    result = to_doc_task.process_payload(text_payload)
    assert result.content_type == ContentType.SPACYDOC
コード例 #2
0
def fake_spacy_doc_stream(n: int = 1):
    """Yield `n` SPACYDOC payloads, each wrapping the same mocked spaCy Doc."""
    mock_doc = MagicMock(spec=spacy_api.Doc)
    index = 1
    while index <= n:
        yield DocumentPayload(
            filename=f'dummy_{index}.txt',
            content_type=ContentType.SPACYDOC,
            content=mock_doc,
        )
        index += 1
コード例 #3
0
def test_spacy_doc_to_tagged_frame(looking_back, test_payload):
    """SpacyDocToTaggedFrame should convert a SPACYDOC payload into a TAGGED_FRAME payload."""
    pytest.importorskip("spacy")
    doc_payload = DocumentPayload(content_type=ContentType.SPACYDOC, filename='hello.txt', content=looking_back)
    mocked_prior = Mock(spec=ITask, outstream=lambda: [doc_payload])
    tagging_task = SpacyDocToTaggedFrame(prior=mocked_prior, attributes=POS_ATTRIBUTES)
    # Skip POS-count bookkeeping for this test: pass the payload straight through.
    tagging_task.register_pos_counts = lambda p: p
    _ = patch_spacy_pipeline(test_payload).add([SetSpacyModel(name_or_nlp="en_core_web_sm"), tagging_task]).setup()
    result = tagging_task.process_payload(doc_payload)
    assert result.content_type == ContentType.TAGGED_FRAME
コード例 #4
0
ファイル: pipeline_test.py プロジェクト: humlab/penelope
def test_topic_model_task_with_token_stream_and_document_index(method):
    """Verify that ToTopicModel trains a topic model from a token stream and
    stores the named model under the target folder.

    Fix: the literal './tests/output' was re-hardcoded in two places
    (ToTopicModel's `target_folder` argument and the `find_models` call)
    instead of reusing the `target_folder` variable defined at the top —
    they now all refer to the single variable.
    """
    target_folder: str = './tests/output'
    # Unique model name per run so repeated test runs don't collide on disk.
    target_name: str = str(uuid.uuid1())[:8]
    corpus = TranströmerCorpus()
    default_engine_args: dict = {
        'n_topics': 4,
        'passes': 1,
        'random_seed': 42,
        'workers': 1,
        'max_iter': 100,
        'work_folder': os.path.join(target_folder, target_name),
    }
    # Fresh payload list on every call: the task may consume the stream more than once.
    payload_stream = lambda: [
        DocumentPayload(content_type=ContentType.TOKENS, filename=filename, content=tokens)
        for filename, tokens in corpus
    ]

    pipeline = Mock(
        spec=CorpusPipeline,
        **{
            'payload.memory_store': SPARV_TAGGED_COLUMNS,
            'payload.document_index': corpus.document_index,
            'payload.token2id': None,
        },
    )

    prior = MagicMock(
        spec=ITask,
        outstream=payload_stream,
        content_stream=lambda: ContentStream(payload_stream),
        out_content_type=ContentType.TOKENS,
        filename_content_stream=lambda: [(p.filename, p.content) for p in payload_stream()],
    )

    task: ToTopicModel = ToTopicModel(
        pipeline=pipeline,
        prior=prior,
        target_folder=target_folder,
        target_name=target_name,
        engine=method,
        engine_args=default_engine_args,
        store_corpus=True,
        store_compressed=True,
    )
    # Stub the prior content-type resolution: this test always feeds TOKENS.
    task.resolved_prior_out_content_type = lambda: ContentType.TOKENS

    task.setup()
    task.enter()
    payload: DocumentPayload = next(task.process_stream())

    assert payload is not None
    assert payload.content_type == ContentType.TOPIC_MODEL
    assert isinstance(payload.content, dict)

    output_models = find_models(target_folder)
    assert any(m['name'] == target_name for m in output_models)
コード例 #5
0
def test_to_spacy_doc_to_tagged_frame(test_payload):
    """ToSpacyDocToTaggedFrame should turn a TEXT payload into a TAGGED_FRAME payload."""
    text_payload = DocumentPayload(content_type=ContentType.TEXT, filename='hello.txt', content=SAMPLE_TEXT)
    corpus_config: CorpusConfig = CorpusConfig.load('./tests/test_data/SSI.yml')
    corpus_pipeline: CorpusPipeline = CorpusPipeline(config=corpus_config, tasks=[], payload=text_payload).setup()
    mocked_prior = MagicMock(spec=ITask, outstream=lambda: [text_payload])
    tagging_task = ToSpacyDocToTaggedFrame(pipeline=corpus_pipeline, prior=mocked_prior, attributes=POS_ATTRIBUTES)
    # Skip POS-count bookkeeping for this test: pass the payload straight through.
    tagging_task.register_pos_counts = lambda p: p
    _ = patch_spacy_pipeline(test_payload).add([SetSpacyModel(name_or_nlp="en_core_web_sm"), tagging_task]).setup()
    result = tagging_task.process_payload(text_payload)
    assert result.content_type == ContentType.TAGGED_FRAME
コード例 #6
0
def fake_data_frame_stream(n: int = 1):
    """Yield `n` TAGGED_FRAME payloads, each carrying a one-row tagged DataFrame."""
    index = 1
    while index <= n:
        tagged_frame = pd.DataFrame(
            data={'text': ['bil'], 'pos_': ['NOUN'], 'lemma_': ['bil']}
        )
        yield DocumentPayload(
            filename=f'dummy_{index}.csv',
            content_type=ContentType.TAGGED_FRAME,
            content=tagged_frame,
        )
        index += 1
コード例 #7
0
def fake_token_stream(n: int = 1):
    """Yield `n` TOKENS payloads built from the first `n` TEST_CORPUS entries.

    Fix: the original iterated over the tuple ``(0, n)``, which always yielded
    exactly two payloads (indices 0 and n) regardless of `n`; the sibling fakes
    (fake_spacy_doc_stream, fake_data_frame_stream) yield `n` payloads, so this
    now uses ``range(n)`` for consistency.
    """
    for i in range(n):
        yield DocumentPayload(
            filename=TEST_CORPUS[i][0],
            content_type=ContentType.TOKENS,
            content=TEST_CORPUS[i][1].split(),
        )
コード例 #8
0
def fake_text_stream(n: int = 1):
    """Yield `n` TEXT payloads built from the first `n` TEST_CORPUS entries.

    Fix: the original iterated over the tuple ``(0, n)``, which always yielded
    exactly two payloads (indices 0 and n) regardless of `n`; this now uses
    ``range(n)`` to match the other fake payload streams.
    """
    for i in range(n):
        yield DocumentPayload(
            filename=TEST_CORPUS[i][0],
            content_type=ContentType.TEXT,
            content=TEST_CORPUS[i][1],
        )
コード例 #9
0
 def project(p: DocumentPayload):
     """Overwrite the payload's content with the literal "HELLO" and return the same payload."""
     p.content = "HELLO"
     return p
コード例 #10
0
def test_passthrough_process_succeeds():
    """A Passthrough task must return the payload it was given, unchanged."""
    mock_config = Mock(spec=CorpusConfig, pipeline_payload=PipelinePayload())
    passthrough = tasks.Passthrough(pipeline=CorpusPipeline(config=mock_config)).setup()
    payload_in = DocumentPayload()
    payload_out = passthrough.process(payload_in)
    assert payload_in == payload_out