def test_to_spacy_doc(test_payload):
    """ToSpacyDoc must convert a TEXT payload into a SPACYDOC payload."""
    pytest.importorskip("spacy")

    task = ToSpacyDoc()
    _ = (
        patch_spacy_pipeline(test_payload)
        .add(SetSpacyModel(name_or_nlp="en_core_web_sm"))
        .add(task)
        .setup()
    )

    source = DocumentPayload(content_type=ContentType.TEXT, filename='hello.txt', content="Hello world!")
    result = task.process_payload(source)

    assert result.content_type == ContentType.SPACYDOC
def fake_spacy_doc_stream(n: int = 1):
    """Yield *n* SPACYDOC payloads whose content is a single shared mocked spaCy Doc."""
    doc_mock = MagicMock(spec=spacy_api.Doc)
    for ordinal in range(1, n + 1):
        yield DocumentPayload(
            filename=f'dummy_{ordinal}.txt',
            content_type=ContentType.SPACYDOC,
            content=doc_mock,
        )
def test_spacy_doc_to_tagged_frame(looking_back, test_payload):
    """SpacyDocToTaggedFrame must convert a SPACYDOC payload into a TAGGED_FRAME payload."""
    pytest.importorskip("spacy")

    source = DocumentPayload(content_type=ContentType.SPACYDOC, filename='hello.txt', content=looking_back)
    upstream = Mock(spec=ITask, outstream=lambda: [source])

    task = SpacyDocToTaggedFrame(prior=upstream, attributes=POS_ATTRIBUTES)
    task.register_pos_counts = lambda p: p  # bypass POS-count bookkeeping for this test

    _ = (
        patch_spacy_pipeline(test_payload)
        .add([SetSpacyModel(name_or_nlp="en_core_web_sm"), task])
        .setup()
    )

    result = task.process_payload(source)

    assert result.content_type == ContentType.TAGGED_FRAME
def test_topic_model_task_with_token_stream_and_document_index(method):
    """End-to-end check that ToTopicModel trains a topic model from a TOKENS stream.

    Builds a mocked pipeline/prior task around the Tranströmer test corpus, runs
    ToTopicModel with the given engine *method*, and asserts that a TOPIC_MODEL
    payload is produced and that the trained model lands in the target folder.
    """
    # Fix: the output path was declared here but then hard-coded again in the
    # ToTopicModel(...) call and in find_models(...) — now declared once and reused.
    target_folder: str = './tests/output'
    target_name: str = str(uuid.uuid1())[:8]  # f-string wrapper around str(...) was redundant
    corpus = TranströmerCorpus()
    default_engine_args: dict = {
        'n_topics': 4,
        'passes': 1,
        'random_seed': 42,
        'workers': 1,
        'max_iter': 100,
        'work_folder': os.path.join(target_folder, target_name),
    }

    # Fresh payload list per call — the prior task may consume the stream more than once.
    payload_stream = lambda: [
        DocumentPayload(content_type=ContentType.TOKENS, filename=filename, content=tokens)
        for filename, tokens in corpus
    ]

    pipeline = Mock(
        spec=CorpusPipeline,
        **{
            'payload.memory_store': SPARV_TAGGED_COLUMNS,
            'payload.document_index': corpus.document_index,
            'payload.token2id': None,
        },
    )
    prior = MagicMock(
        spec=ITask,
        outstream=payload_stream,
        content_stream=lambda: ContentStream(payload_stream),
        out_content_type=ContentType.TOKENS,
        filename_content_stream=lambda: [(p.filename, p.content) for p in payload_stream()],
    )

    task: ToTopicModel = ToTopicModel(
        pipeline=pipeline,
        prior=prior,
        target_folder=target_folder,
        target_name=target_name,
        engine=method,
        engine_args=default_engine_args,
        store_corpus=True,
        store_compressed=True,
    )

    task.resolved_prior_out_content_type = lambda: ContentType.TOKENS
    task.setup()
    task.enter()
    payload: DocumentPayload = next(task.process_stream())

    assert payload is not None
    assert payload.content_type == ContentType.TOPIC_MODEL
    assert isinstance(payload.content, dict)

    output_models = find_models(target_folder)
    assert any(m['name'] == target_name for m in output_models)
def test_to_spacy_doc_to_tagged_frame(test_payload):
    """ToSpacyDocToTaggedFrame must convert a TEXT payload into a TAGGED_FRAME payload.

    Fix: added pytest.importorskip("spacy") — this test loads the
    en_core_web_sm model, so without the guard it errors instead of
    skipping when spaCy is not installed (the sibling spaCy tests
    already guard this way).
    """
    pytest.importorskip("spacy")

    payload = DocumentPayload(content_type=ContentType.TEXT, filename='hello.txt', content=SAMPLE_TEXT)
    config: CorpusConfig = CorpusConfig.load('./tests/test_data/SSI.yml')
    pipeline: CorpusPipeline = CorpusPipeline(config=config, tasks=[], payload=payload).setup()
    prior = MagicMock(spec=ITask, outstream=lambda: [payload])

    task = ToSpacyDocToTaggedFrame(pipeline=pipeline, prior=prior, attributes=POS_ATTRIBUTES)
    task.register_pos_counts = lambda p: p  # bypass POS-count bookkeeping for this test

    _ = (
        patch_spacy_pipeline(test_payload)
        .add([SetSpacyModel(name_or_nlp="en_core_web_sm"), task])
        .setup()
    )

    payload_next = task.process_payload(payload)

    assert payload_next.content_type == ContentType.TAGGED_FRAME
def fake_data_frame_stream(n: int = 1):
    """Yield *n* TAGGED_FRAME payloads, each wrapping a one-row tagged frame."""
    for ordinal in range(1, n + 1):
        tagged_frame = pd.DataFrame(
            data={
                'text': ['bil'],
                'pos_': ['NOUN'],
                'lemma_': ['bil'],
            }
        )
        yield DocumentPayload(
            filename=f'dummy_{ordinal}.csv',
            content_type=ContentType.TAGGED_FRAME,
            content=tagged_frame,
        )
def fake_token_stream(n: int = 1):
    """Yield TOKENS payloads built from TEST_CORPUS entries 0 and *n*.

    NOTE(review): `(0, n)` is a two-element tuple, so this always yields exactly
    two payloads (corpus items 0 and n) — unlike the other fake_* generators,
    which yield n items via range(). Confirm `range(0, n)` was not intended.
    """
    for index in (0, n):
        entry = TEST_CORPUS[index]
        yield DocumentPayload(
            filename=entry[0],
            content_type=ContentType.TOKENS,
            content=entry[1].split(),
        )
def fake_text_stream(n: int = 1):
    """Yield TEXT payloads built from TEST_CORPUS entries 0 and *n*.

    NOTE(review): `(0, n)` is a two-element tuple, so this always yields exactly
    two payloads (corpus items 0 and n) — unlike the other fake_* generators,
    which yield n items via range(). Confirm `range(0, n)` was not intended.
    """
    for index in (0, n):
        entry = TEST_CORPUS[index]
        yield DocumentPayload(
            filename=entry[0],
            content_type=ContentType.TEXT,
            content=entry[1],
        )
def project(p: DocumentPayload):
    """Overwrite the payload's content with the constant "HELLO" and return it.

    Mutates *p* in place; the return value is the same object, enabling
    use as a pass-through projection in pipeline tests.
    """
    p.content = "HELLO"
    return p
def test_passthrough_process_succeeds():
    """Passthrough must hand back the very payload it receives."""
    config_mock = Mock(spec=CorpusConfig, pipeline_payload=PipelinePayload())
    task = tasks.Passthrough(pipeline=CorpusPipeline(config=config_mock)).setup()

    incoming = DocumentPayload()
    outgoing = task.process(incoming)

    assert incoming == outgoing