def test_pipeline_tagged_frame_to_text_succeeds(config: pipeline.CorpusConfig):
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings=None,
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    tagged_payload = next(
        pipeline.CorpusPipeline(config=config).checkpoint(tagged_corpus_source, force_checkpoint=False).resolve()
    )

    text_payload = next(
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_to_text()
        .resolve()
    )

    assert tagged_payload.filename == text_payload.filename
    assert len(tagged_payload.content[tagged_payload.content.pos_ == 'NOUN']) == len(text_payload.content.split())

def test_pipeline_tagged_frame_to_vocabulary_succeeds(config: pipeline.CorpusConfig):
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')

    pipe: pipeline.CorpusPipeline = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .vocabulary(lemmatize=True, progress=False)
        .exhaust()
    )

    assert pipe.payload.token2id is not None
    assert pipe.payload.token2id.tf is not None
    assert len(pipe.payload.token2id) == 1147
    assert len(pipe.payload.token2id) == len(pipe.payload.token2id.tf)
    assert set(pipe.payload.token2id.data.keys()) == {x.lower() for x in pipe.payload.token2id.keys()}
    assert 'Cultural' not in pipe.payload.token2id
    assert 'wars' not in pipe.payload.token2id
    assert 'war' in pipe.payload.token2id

    pipe: pipeline.CorpusPipeline = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .vocabulary(lemmatize=False, progress=False)
        .exhaust()
    )

    assert len(pipe.payload.token2id) == 1478
    assert 'Cultural' in pipe.payload.token2id
    assert 'wars' in pipe.payload.token2id
    assert pipe.payload.token2id.tf[pipe.payload.token2id['the']] == 704

def test_load_text_returns_payload_with_expected_document_index(config: pipeline.CorpusConfig):
    transform_opts = corpora.TextTransformOpts()

    pipe = pipeline.CorpusPipeline(config=config).load_text(
        reader_opts=config.text_reader_opts, transform_opts=transform_opts
    )
    assert pipe is not None

    payloads: List[pipeline.DocumentPayload] = list(pipe.resolve())

    assert len(payloads) == 5
    assert len(pipe.payload.document_index) == 5
    assert len(pipe.payload.metadata) == 5
    assert pipe.payload.pos_schema_name == "Universal"
    assert pipe.payload.get('text_reader_opts') == config.text_reader_opts.props
    assert isinstance(pipe.payload.document_index, pd.DataFrame)

    columns = pipe.payload.document_index.columns.tolist()
    assert columns == [
        'section_id',
        'unesco_id',
        'filename',
        'type',
        'href',
        'year',
        'date',
        'city',
        'x.title',
        'document_id',
        'document_name',
    ]
    assert all(x.split(':')[0] in columns for x in config.text_reader_opts.filename_fields)
    assert pipe.payload.document_lookup('RECOMMENDATION_0201_049455_2017.txt')['unesco_id'] == 49455

def test_pipeline_text_to_dtm_succeeds(config: pipeline.CorpusConfig):
    # uuid.uuid1() returns a UUID object; convert to str to match the annotation and the dump/remove tag
    target_tag: str = str(uuid.uuid1())
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings=None,
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    corpus: corpora.VectorizedCorpus = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_transform(transform_opts=corpora.TokensTransformOpts())
        .tokens_to_text()
        .tqdm()
        .to_dtm()
        .single()
        .content
    )

    corpus.dump(tag=target_tag, folder=OUTPUT_FOLDER)

    assert isinstance(corpus, corpora.VectorizedCorpus)
    assert corpus.data.shape[0] == 5
    assert len(corpus.token2id) == corpus.data.shape[1]

    corpus.remove(tag=target_tag, folder=OUTPUT_FOLDER)

def test_workflow_to_dtm_step_by_step(config: pipeline.CorpusConfig):
    # uuid.uuid1() returns a UUID object; convert to str to match the annotation
    corpus_tag: str = str(uuid.uuid1())
    target_folder: str = "./tests/output"
    corpus_source: str = './tests/test_data/legal_instrument_five_docs_test.zip'
    tagged_corpus_source: str = f"./tests/output/{uuid.uuid1()}_pos_csv.zip"
    args: ComputeOpts = ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder=target_folder,
        corpus_type=pipeline.CorpusType.SpacyCSV,
        # pos_scheme: PoS_Tag_Scheme = PoS_Tag_Schemes.Universal
        transform_opts=corpora.TokensTransformOpts(language='english', remove_stopwords=True, to_lower=True),
        text_reader_opts=corpora.TextReaderOpts(filename_pattern='*.csv', filename_fields=['year:_:1']),
        extract_opts=corpora.ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NOUN|PROPN|VERB|',
            pos_excludes='|PUNCT|EOL|SPACE|',
            **config.pipeline_payload.tagged_columns_names,
            filter_opts=dict(is_alpha=False, is_punct=False, is_space=False),
        ),
        create_subfolder=False,
        persist=True,
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=corpora.VectorizeOpts(
            already_tokenized=True,
            lowercase=False,
            min_tf=1,
            max_tokens=None,
        ),
        enable_checkpoint=True,
        force_checkpoint=True,
    )

    with inline_code(spacy_pipeline.to_tagged_frame_pipeline):

        tagged_frame_filename: str = tagged_corpus_source or utility.path_add_suffix(
            config.pipeline_payload.source, '_pos_csv'
        )

        p: pipeline.CorpusPipeline = (
            pipeline.CorpusPipeline(config=config)
            .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
            .load_text(
                reader_opts=config.text_reader_opts,
                transform_opts=None,
                source=corpus_source,
            )
            .text_to_spacy()
            .spacy_to_pos_tagged_frame()
            .checkpoint(filename=tagged_frame_filename, force_checkpoint=args.force_checkpoint)
        )

        if args.enable_checkpoint:
            p = p.checkpoint_feather(folder=config.get_feather_folder(corpus_source), force=args.force_checkpoint)

        p.exhaust()

def test_pipeline_can_load_pos_tagged_checkpoint(config: pipeline.CorpusConfig):
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')

    pipe = pipeline.CorpusPipeline(config=config).checkpoint(tagged_corpus_source, force_checkpoint=False)

    payloads: List[pipeline.DocumentPayload] = pipe.to_list()

    assert len(payloads) == 5
    assert len(pipe.payload.document_index) == 5
    assert isinstance(pipe.payload.document_index, pd.DataFrame)

def test_store_id_tagged_frame():
    config: pp.CorpusConfig = pp.CorpusConfig.load('./tests/test_data/tranströmer.yml')

    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    target_folder: str = './tests/test_data/tranströmer_id_tagged_frames'

    _: pp.CorpusPipeline = (
        pp.CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .to_id_tagged_frame(ingest_vocab_type=IngestVocabType.Incremental)
        .store_id_tagged_frame(target_folder)
    ).exhaust()

def test_pipeline_take_succeeds(config: pipeline.CorpusConfig):
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    take_payloads = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_to_text()
        .take(2)
    )

    assert len(take_payloads) == 2

def test_load_id_tagged_frame():
    config: pp.CorpusConfig = pp.CorpusConfig.load('./tests/test_data/tranströmer.yml')
    folder: str = './tests/test_data/tranströmer_id_tagged_frames'

    p: pp.CorpusPipeline = pp.CorpusPipeline(config=config).load_id_tagged_frame(
        folder=folder,
        file_pattern='**/tran*.feather',
        id_to_token=False,
    )

    payloads = p.to_list()

    assert len(payloads) == 5
    assert len(p.payload.document_index) == 5
    assert len(p.payload.token2id) == 341

def noun_pipeline(id_to_token: bool) -> pp.CorpusPipeline:
    corpus_source: str = './tests/test_data/tranströmer_id_tagged_frames'
    file_pattern: str = '**/tran_*.feather'
    config_filename: str = jj(corpus_source, 'corpus.yml')
    corpus_config: pp.CorpusConfig = pp.CorpusConfig.load(path=config_filename).folders(corpus_source)
    extract_opts: pc.ExtractTaggedTokensOpts = pc.ExtractTaggedTokensOpts(
        lemmatize=False,
        pos_includes='NN',
        **corpus_config.pipeline_payload.tagged_columns_names,
    )

    if not id_to_token:
        extract_opts.set_numeric_names()

    p: pp.CorpusPipeline = (
        pp.CorpusPipeline(config=corpus_config)
        .load_id_tagged_frame(folder=corpus_source, id_to_token=id_to_token, file_pattern=file_pattern)
        .filter_tagged_frame(extract_opts=extract_opts, pos_schema=utility.PoS_Tag_Schemes.SUC)
    )
    return p

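# Illustrative usage of the noun_pipeline helper above. This is a sketch, not part of the original
# suite: it only exercises calls already used in this module (to_list, payload.document_index) and
# asserts a relational property rather than fixed counts, since exact sizes for the filtered frames
# are an assumption here.
def test_noun_pipeline_yields_one_payload_per_document():
    for id_to_token in (True, False):
        p: pp.CorpusPipeline = noun_pipeline(id_to_token=id_to_token)
        payloads = p.to_list()
        # one payload per document in the tranströmer test corpus, regardless of naming mode
        assert len(payloads) == len(p.payload.document_index)
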
def test_pipeline_load_text_tag_checkpoint_stores_checkpoint(config: pipeline.CorpusConfig):
    tagged_corpus_source: str = os.path.join(OUTPUT_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')

    transform_opts = corpora.TextTransformOpts()

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    pathlib.Path(tagged_corpus_source).unlink(missing_ok=True)

    _ = (
        pipeline.CorpusPipeline(config=config)
        .set_spacy_model(config.pipeline_payload.memory_store['spacy_model'])
        .load_text(reader_opts=config.text_reader_opts, transform_opts=transform_opts)
        .text_to_spacy()
        .tqdm()
        .spacy_to_pos_tagged_frame()
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
    ).exhaust()

    assert os.path.isfile(tagged_corpus_source)

    pathlib.Path(tagged_corpus_source).unlink(missing_ok=True)

def test_pipeline_tagged_frame_to_tuple_succeeds(config: pipeline.CorpusConfig):
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'checkpoint_pos_tagged_test.zip')
    extract_opts: corpora.ExtractTaggedTokensOpts = corpora.ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='|NOUN|',
        pos_paddings='|VERB|',
        **config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )

    payloads = (
        pipeline.CorpusPipeline(config=config)
        .checkpoint(tagged_corpus_source, force_checkpoint=False)
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
        .tokens_to_text()
        .to_list()
    )

    assert len(payloads) == 5
    assert all(isinstance(payload.content, str) for payload in payloads)

def test_sparv_tagged_frame_to_tokens(corpus_config: pipeline.CorpusConfig):
    p = pipeline.CorpusPipeline(config=corpus_config)

    corpus_config.checkpoint_opts.feather_folder = None

    load = tasks.LoadTaggedCSV(
        filename=corpus_config.pipeline_payload.source,
        checkpoint_opts=corpus_config.checkpoint_opts,
        extra_reader_opts=corpus_config.text_reader_opts,
    )

    tagged_columns: dict = corpus_config.pipeline_payload.tagged_columns_names
    extract = tasks.TaggedFrameToTokens(
        extract_opts=ExtractTaggedTokensOpts(lemmatize=True, **tagged_columns, filter_opts=dict(is_punct=False)),
    )

    p.add([load, extract])

    payloads = list(p.resolve())

    assert [x.document_name for x in payloads] == ['prot_197677__25', 'prot_197677__26', 'prot_197677__27']
    assert all(x.content_type == pipeline.ContentType.TOKENS for x in payloads)
    assert all(isinstance(x.content, list) for x in payloads)
    assert len(payloads) == 3

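# For comparison, the task-based pipeline above can presumably also be written with the fluent helpers
# used elsewhere in this module (load_tagged_frame / tagged_frame_to_tokens). This sketch assumes those
# helpers wrap LoadTaggedCSV and TaggedFrameToTokens with the same options; it is illustrative only and
# not asserted to be equivalent.
def sparv_tokens_pipeline_fluent_sketch(corpus_config: pipeline.CorpusConfig) -> pipeline.CorpusPipeline:
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        **corpus_config.pipeline_payload.tagged_columns_names,
        filter_opts=dict(is_punct=False),
    )
    return (
        pipeline.CorpusPipeline(config=corpus_config)
        .load_tagged_frame(
            filename=corpus_config.pipeline_payload.source,
            checkpoint_opts=corpus_config.checkpoint_opts,
            extra_reader_opts=corpus_config.text_reader_opts,
        )
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=None)
    )
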
def test_pipeline_find_task(config: pipeline.CorpusConfig):
    p: pipeline.CorpusPipeline = (
        pipeline.CorpusPipeline(config=config).checkpoint("dummy_name", force_checkpoint=False).tqdm()
    )
    assert isinstance(p.find(tasks.Checkpoint), tasks.Checkpoint)
    assert isinstance(p.find(tasks.Tqdm), tasks.Tqdm)