def test_sparv_extract_and_store_when_only_nouns_and_source_is_sparv3_succeeds():
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    target_filename = os.path.join(OUTPUT_FOLDER, f'{uuid.uuid1()}.zip')

    sparv_corpus.sparv_xml_extract_and_store(
        SPARV3_ZIPPED_XML_EXPORT_FILENAME,
        target_filename,
        version=3,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='|NN|', pos_paddings=None, lemmatize=False),
        transform_opts=TokensTransformOpts(to_lower=True, min_len=2, stopwords=['<text>']),
    )

    expected_document_start = "utredningar justitiedepartementet förslag utlänningslag angående om- händertagande förläggning års gere ide to lm \rstatens utredningar förteckning betänkande förslag utlänningslag lag omhändertagande utlänning anstalt förläggning tryckort tryckorten bokstäverna fetstil begynnelse- bokstäverna departement"
    test_filename = "sou_1945_1.txt"

    content = zip_utils.read_file_content(zip_or_filename=target_filename, filename=test_filename, as_binary=False)

    assert content.startswith(expected_document_start)

    os.remove(target_filename)

def test_to_dataframe_of_term_matrix_gives_expected_result(self):
    # Arrange
    reader = PandasCorpusReader(self.create_test_dataframe())
    corpus = TokenizedCorpus(
        reader,
        # Pre-compute transform options:
        transform_opts=TokensTransformOpts(
            only_any_alphanumeric=False,
            to_lower=False,
            remove_accents=False,
            min_len=1,
            max_len=None,
            keep_numerals=False,
        ),
    )
    term_term_matrix = CorpusVectorizer().fit_transform(corpus, already_tokenized=True).co_occurrence_matrix()

    # Act
    id2w = corpus.id2token.get
    co_occurrences = term_term_matrix_to_co_occurrences(term_term_matrix, threshold_count=1, ignore_ids=set())
    co_occurrences['w1'] = co_occurrences.w1_id.apply(id2w)
    co_occurrences['w2'] = co_occurrences.w2_id.apply(id2w)

    # Assert
    assert 2 == int(co_occurrences[((co_occurrences.w1 == 'A') & (co_occurrences.w2 == 'B'))].value)
    assert 0 == len(co_occurrences[((co_occurrences.w1 == 'C') & (co_occurrences.w2 == 'F'))])

def test_reader_store_result():
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    expected_documents = [
        ['rödräv', 'hunddjur', 'utbredning', 'halvklot'],
        ['fjällräv', 'fjällvärld', 'liv', 'fjällräv', 'vinter', 'men', 'variant', 'år'],
    ]
    expected_names = ["document_001.txt", "document_002.txt"]

    target_filename = os.path.join(OUTPUT_FOLDER, 'test_reader_store_result.zip')

    sparv_corpus.sparv_xml_extract_and_store(
        SPARV_ZIPPED_XML_EXPORT_FILENAME,
        target_filename,
        version=4,
        extract_opts=ExtractTaggedTokensOpts(pos_includes='|NN|', pos_paddings=None, lemmatize=True),
        transform_opts=TokensTransformOpts(to_lower=True),
    )

    for i in range(0, len(expected_names)):
        content = zip_utils.read_file_content(zip_or_filename=target_filename, filename=expected_names[i], as_binary=False)
        assert ' '.join(expected_documents[i]) == content

    os.remove(target_filename)

def test_n_tokens_when_exhausted_iterator_returns_expected_count():
    reader = create_reader()
    corpus = corpora.TokenizedCorpus(reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=False))
    _ = [x for x in corpus]
    n_tokens = list(corpus.document_index.n_tokens)
    expected = [22, 16, 26, 45, 21]
    assert expected == n_tokens

def test_next_document_when_only_any_alphanumeric_true_skips_delimiters_using_defaults():
    reader = create_tokens_reader(filename_fields=None, fix_whitespaces=True, fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True))
    _, tokens = next(corpus)
    expected = "Tre svarta ekar ur snön Så grova men fingerfärdiga Ur deras väldiga flaskor ska grönskan skumma i vår"
    assert expected.split() == tokens

def test_n_tokens_when_exhausted_and_only_any_alphanumeric_is_true_returns_expected_count():
    reader = create_tokens_reader(filename_fields=None, fix_whitespaces=True, fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True))
    _ = [x for x in corpus]
    n_tokens = list(corpus.document_index.n_tokens)
    expected = [18, 14, 24, 42, 18]
    assert expected == n_tokens

def test_n_tokens_when_exhausted_and_only_any_alphanumeric_min_len_two_returns_expected_count():
    reader = create_reader()
    corpus = corpora.TokenizedCorpus(reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True, min_len=2))
    n_expected = [17, 13, 21, 42, 18]
    _ = [x for x in corpus]
    n_tokens = list(corpus.document_index.n_tokens)
    assert n_expected == n_tokens

def ComputeOptsSparvCSV(
    *,
    corpus_tag: str = 'TELLUS',
    corpus_source: str = './tests/test_data/tranströmer_corpus_export.sparv4.csv.zip',
) -> ComputeOpts:  # pylint: disable=too-many-instance-attributes
    return ComputeOpts(
        corpus_tag=corpus_tag,
        corpus_source=corpus_source,
        target_folder="./tests/output",
        corpus_type=CorpusType.SparvCSV,
        transform_opts=TokensTransformOpts(
            to_lower=True,
            min_len=1,
            remove_stopwords=None,
            keep_symbols=True,
            keep_numerals=True,
            only_alphabetic=False,
            only_any_alphanumeric=False,
        ),
        text_reader_opts=TextReaderOpts(
            filename_pattern='*.csv',
            filename_fields=('year:_:1',),
            index_field=None,  # use filename
            as_binary=False,
        ),
        extract_opts=ExtractTaggedTokensOpts(
            pos_includes=None,
            pos_excludes='|MAD|MID|PAD|',
            pos_paddings=None,
            lemmatize=False,
            **SPARV_TAGGED_COLUMNS,
            filter_opts=dict(
                is_alpha=False,
                is_punct=False,
                is_digit=None,
                is_stop=None,
                is_space=False,
            ),
        ),
        create_subfolder=False,
        persist=True,
        context_opts=ContextOpts(
            concept=('jag',),
            context_width=2,
            partition_keys=['document_id'],
        ),
        tf_threshold=1,
        tf_threshold_mask=False,
        vectorize_opts=VectorizeOpts(already_tokenized=True, min_tf=1, max_tokens=None),
    )

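# A minimal sketch of how this factory might be exercised in a test; the
# asserted attributes mirror the ComputeOpts fields set above.
def test_compute_opts_sparv_csv_has_expected_defaults():
    opts: ComputeOpts = ComputeOptsSparvCSV()
    assert opts.corpus_type == CorpusType.SparvCSV
    assert opts.transform_opts.to_lower is True
    assert opts.context_opts.context_width == 2
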
def test_corpus_can_be_reiterated():
    reader = create_tokens_reader(filename_fields=None, fix_whitespaces=True, fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=True))
    expected = [18, 14, 24, 42, 18]
    for i in range(0, 4):
        n_tokens = [len(x) for x in corpus.terms]
        assert expected == n_tokens, f"iteration {i}"

def test_next_document_when_token_corpus_returns_tokenized_document():
    reader = create_tokens_reader(filename_fields=None, fix_whitespaces=True, fix_hyphenation=True)
    corpus = corpora.TokenizedCorpus(reader, transform_opts=TokensTransformOpts(only_any_alphanumeric=False))
    _, tokens = next(corpus)
    expected = (
        "Tre svarta ekar ur snön . Så grova , men fingerfärdiga . Ur deras väldiga flaskor ska grönskan skumma i vår ."
    )
    assert expected.split() == tokens

def test_next_document_when_max_len_is_six_filters_out_longer_words():
    reader = create_reader()
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=True,
        to_lower=True,
        remove_accents=False,
        min_len=2,
        max_len=6,
        keep_numerals=True,
    )
    corpus = corpora.TokenizedCorpus(reader, transform_opts=transform_opts)
    _, tokens = next(corpus)
    expected = "tre svarta ekar ur snön så grova men ur deras ska skumma vår"
    assert expected.split() == tokens

def create_corpus():
    reader = create_reader()
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=True,
        to_lower=True,
        remove_accents=False,
        min_len=2,
        max_len=None,
        keep_numerals=False,
    )
    corpus = TokenizedCorpus(reader, transform_opts=transform_opts)
    return corpus

def test_next_document_when_to_lower_is_true_returns_all_lowercase():
    reader = create_reader()
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=True,
        to_lower=True,
        remove_accents=False,
        min_len=1,
        max_len=None,
        keep_numerals=True,
    )
    corpus = corpora.TokenizedCorpus(reader, transform_opts=transform_opts)
    _, tokens = next(corpus)
    expected = "tre svarta ekar ur snön så grova men fingerfärdiga ur deras väldiga flaskor ska grönskan skumma i vår"
    assert expected.split() == tokens

def test_get_index_when_extract_passed_returns_expected_count():
    reader = create_reader()
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=False,
        to_lower=False,
        remove_accents=False,
        min_len=2,
        max_len=None,
        keep_numerals=True,
    )
    corpus = corpora.TokenizedCorpus(reader, transform_opts=transform_opts)
    result = corpus.metadata
    assert 5 == len(result)

def test_next_document_when_only_any_alphanumeric_true_skips_delimiters():
    reader = create_reader()
    corpus = corpora.TokenizedCorpus(
        reader,
        transform_opts=TokensTransformOpts(
            only_any_alphanumeric=True,
            to_lower=False,
            remove_accents=False,
            min_len=1,
            keep_numerals=True,
        ),
    )
    _, tokens = next(corpus)
    expected = "Tre svarta ekar ur snön Så grova men fingerfärdiga Ur deras väldiga flaskor ska grönskan skumma i vår"
    assert expected.split() == tokens

def test_processed_corpus_token_stream(self):
    df = self.create_test_dataframe()
    reader = PandasCorpusReader(df)
    corpus = TokenizedCorpus(reader, transform_opts=TokensTransformOpts())
    result = [x for x in corpus]
    expected = [
        ('document_0.txt', ['A', 'B', 'C']),
        ('document_1.txt', ['B', 'C', 'D']),
        ('document_2.txt', ['C', 'B']),
        ('document_3.txt', ['A', 'B', 'F']),
        ('document_4.txt', ['E', 'B']),
        ('document_5.txt', ['F', 'E', 'E']),
    ]
    self.assertEqual(expected, result)

def transform_opts(self) -> TokensTransformOpts:
    opts = TokensTransformOpts(
        keep_numerals=True,
        keep_symbols=True,
        language=self._config.language,
        max_len=None,
        min_len=1,
        only_alphabetic=self._only_alphabetic.value,
        only_any_alphanumeric=self._only_any_alphanumeric.value,
        remove_accents=False,
        remove_stopwords=self._remove_stopwords.value,
        stopwords=None,
        to_lower=self._to_lowercase.value,
        to_upper=False,
    )

    if self._extra_stopwords.value.strip() != '':
        _words = [x for x in map(str.strip, self._extra_stopwords.value.strip().split()) if x != '']
        if len(_words) > 0:
            opts.extra_stopwords = _words

    return opts

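# Hedged illustration of the extra-stopwords parsing above (hypothetical helper,
# same logic as in `transform_opts`): ' och  att\nmen ' -> ['och', 'att', 'men'].
# The widget value is stripped, split on whitespace, and blank entries dropped.
def _split_extra_stopwords(value: str) -> list:
    return [x for x in map(str.strip, value.strip().split()) if x != '']
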
def test_predict_topics(method: str):
    """Train a model that will be used in prediction."""
    target_folder: str = './tests/output'
    train_target_name: str = f'train_{str(uuid.uuid1())[:8]}'
    payload: DocumentPayload = tranströmer_topic_model_payload(
        method=method, target_folder=target_folder, target_name=train_target_name
    )
    model_folder: str = os.path.join(payload.content.get("target_folder"), payload.content.get("target_name"))

    # Predict using the trained model:
    config: CorpusConfig = CorpusConfig.load('./tests/test_data/tranströmer.yml')
    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    minimum_probability: float = 0.001
    n_tokens: int = 100
    predict_target_name: str = f'predict_{str(uuid.uuid1())[:8]}'
    transform_opts = TokensTransformOpts()
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='MAD|MID|PAD',
        **config.checkpoint_opts.tagged_columns,
    )
    vectorize_opts: VectorizeOpts = VectorizeOpts(already_tokenized=True)

    payload: DocumentPayload = (
        CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .to_dtm(vectorize_opts=vectorize_opts)
        .predict_topics(
            model_folder=model_folder,
            target_folder=target_folder,
            target_name=predict_target_name,
            minimum_probability=minimum_probability,
            n_tokens=n_tokens,
        )
    ).single()

    assert payload is not None

    model_infos = find_models('./tests/output')
    assert any(m['name'] == predict_target_name for m in model_infos)
    model_info = next(m for m in model_infos if m['name'] == predict_target_name)
    assert 'method' in model_info['options']

def test_tokenized_document_token_counts_is_empty_if_enumerable_not_exhausted(self):
    corpus = self.create_simple_test_corpus(
        transform_opts=TokensTransformOpts(
            keep_symbols=False,
            only_any_alphanumeric=True,
            to_lower=True,
            remove_accents=False,
            min_len=0,
            max_len=None,
            keep_numerals=True,
            stopwords=None,
        )
    )
    self.assertTrue('n_raw_tokens' not in corpus.document_index.columns)
    self.assertTrue('n_tokens' not in corpus.document_index.columns)

def text_corpus() -> TokenizedCorpus:
    filename_fields = dict(year=r".{5}(\d{4})_.*", serial_no=r".{9}_(\d+).*")
    reader = create_tokens_reader(filename_fields=filename_fields, fix_whitespaces=True, fix_hyphenation=True)
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=True,
        to_lower=True,
        remove_accents=False,
        min_len=2,
        max_len=None,
        keep_numerals=False,
    )
    corpus = TokenizedCorpus(reader, transform_opts=transform_opts)
    return corpus

def test_tokenized_document_token_counts_is_not_empty_if_enumerable_is_exhausted(self):
    # Note: symbols are always removed by the reader, hence the "keep_symbols" filter has no effect.
    corpus = self.create_simple_test_corpus(
        transform_opts=TokensTransformOpts(
            keep_symbols=False,
            only_any_alphanumeric=True,
            to_lower=True,
            remove_accents=False,
            min_len=0,
            max_len=None,
            keep_numerals=True,
            stopwords=None,
        )
    )
    for _ in corpus:
        pass
    self.assertTrue('n_raw_tokens' in corpus.document_index.columns)
    self.assertTrue('n_tokens' in corpus.document_index.columns)

def test_next_document_when_only_any_alphanumeric_is_false_returns_all_tokens():
    reader = create_reader()
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=False,
        to_lower=False,
        remove_accents=False,
        min_len=1,
        max_len=None,
        keep_numerals=True,
        only_alphabetic=False,
    )
    corpus = corpora.TokenizedCorpus(reader, transform_opts=transform_opts)
    _, tokens = next(corpus)
    expected = (
        "Tre svarta ekar ur snön . Så grova , men fingerfärdiga . Ur deras väldiga flaskor ska grönskan skumma i vår ."
    )
    assert expected.split() == tokens

def test_tokenized_document_where_symbols_and_numerals_are_filtered_out(self):
    corpus = self.create_simple_test_corpus(
        transform_opts=TokensTransformOpts(
            keep_symbols=False,
            only_any_alphanumeric=True,
            to_lower=False,
            remove_accents=False,
            min_len=0,
            max_len=None,
            keep_numerals=False,
            stopwords=None,
        )
    )
    result = [x for x in corpus]
    expected = [
        ('document_0.txt', ['Detta', 'är', 'en', 'mening', 'med', 'token', 'siffror', 'och', 'symboler']),
        ('document_1.txt', ['Är', 'det', 'i', 'denna', 'mening', 'en', 'mening']),
    ]
    self.assertEqual(expected, result)

def tranströmer_topic_model_payload(method: str, target_folder: str, target_name: str) -> DocumentPayload:
    transform_opts: TokensTransformOpts = TokensTransformOpts()
    extract_opts: ExtractTaggedTokensOpts = ExtractTaggedTokensOpts(
        lemmatize=True,
        pos_includes='',
        pos_excludes='MAD|MID|PAD',
        text_column='token',
        lemma_column='baseform',
        pos_column='pos',
    )
    default_engine_args: dict = {
        'n_topics': 4,
        'passes': 1,
        'random_seed': 42,
        'workers': 1,
        'max_iter': 100,
        'work_folder': os.path.join(target_folder, target_name),
    }
    config: CorpusConfig = CorpusConfig.load('./tests/test_data/tranströmer.yml')
    corpus_source: str = './tests/test_data/tranströmer_corpus_pos_csv.zip'
    p: CorpusPipeline = (
        CorpusPipeline(config=config)
        .load_tagged_frame(
            filename=corpus_source,
            checkpoint_opts=config.checkpoint_opts,
            extra_reader_opts=config.text_reader_opts,
        )
        .tagged_frame_to_tokens(extract_opts=extract_opts, transform_opts=transform_opts)
        .to_dtm(VectorizeOpts(already_tokenized=True))
        .to_topic_model(
            target_mode='both',
            target_folder=target_folder,
            target_name=target_name,
            engine=method,
            engine_args=default_engine_args,
            store_corpus=True,
            store_compressed=True,
        )
    )
    payload: DocumentPayload = p.single()
    return payload

def test_fit_transform_gives_document_term_matrix(self):
    # Arrange
    reader = PandasCorpusReader(self.create_test_dataframe())
    corpus = TokenizedCorpus(
        reader,
        transform_opts=TokensTransformOpts(
            only_any_alphanumeric=False,
            to_lower=False,
            remove_accents=False,
            min_len=1,
            max_len=None,
            keep_numerals=False,
        ),
    )
    v_corpus = CorpusVectorizer().fit_transform(corpus)
    term_term_matrix = v_corpus.co_occurrence_matrix()
    token2id = v_corpus.token2id

    assert 2 == term_term_matrix.todense()[token2id['A'], token2id['B']]
    assert 0 == term_term_matrix.todense()[token2id['C'], token2id['F']]

def test_fit_transform_when_given_a_vocabulary_returns_same_vocabulary():
    corpus = TokenizedCorpus(
        reader=create_reader(),
        transform_opts=TokensTransformOpts(to_lower=True, min_len=10),
    )

    vocabulary = CorpusVectorizer().fit_transform(corpus, already_tokenized=True).token2id
    assert corpus.token2id == vocabulary

    expected_vocabulary_reversed = {k: abs(v - 5) for k, v in corpus.token2id.items()}
    vocabulary = (
        CorpusVectorizer()
        .fit_transform(corpus, already_tokenized=True, vocabulary=expected_vocabulary_reversed)
        .token2id
    )
    assert expected_vocabulary_reversed == vocabulary

def test_corpus_apply_when_looping_through_partition_groups_filters_out_other_groups():
    expected_groups = {
        2019: ['tran_2019_01_test', 'tran_2019_02_test', 'tran_2019_03_test'],
        2020: ['tran_2020_01_test', 'tran_2020_02_test'],
    }
    expected_tokens = {
        2019: [
            [
                'KYRKA', 'TURIST', 'HALVMÖRKER', 'VALV', 'VALV', 'ÖVERBLICK', 'LJUSLÅGA', 'ÄNGEL',
                'ANSIKTE', 'KROPP', 'MÄNNISKA', 'VALV', 'VALV', 'TÅR', 'PIAZZA', 'MR', 'MRS',
                'HERR', 'SIGNORA', 'VALV', 'VALV',
            ],
            [
                'KÖR', 'NATT', 'HUS', 'STRÅLKASTARSKEN', 'HUS', 'LADA', 'FORDON', 'NU', 'LIV',
                'MÄNNISKA', 'DEL', 'ANLETSDRAG', 'TRÄNING', 'EVIGHET', 'ALLT', 'SÖMN', 'BOM',
                'MYSTERIUM',
            ],
            [
                'SKOG', 'GLÄNTA', 'GLÄNTA', 'OMSLUT', 'SKOG', 'SJÄLV', 'STAM', 'LAV', 'SKÄGGSTUBB',
                'TRÄD', 'TOPP', 'KVIST', 'LJUS', 'SKUGGA', 'SKUGGA', 'KÄRR', 'PLATS', 'GRÄS',
                'STEN', 'VARA', 'GRUNDSTEN', 'HUS', 'HÄR', 'UPPLYSNING', 'NAMN', 'ARKIV', 'ARKIV',
                'TRADITION', 'DÖD', 'MINNE', 'ZIGENARSTAMMEN', 'MEN', 'TORP', 'RÖST', 'VÄRLD',
                'CENTRUM', 'INVÅNARE', 'KRÖNIKA', 'ÖDE', 'ÅR', 'TORP', 'SFINX', 'GRUNDSTEN',
                'SÄTT', 'MÅSTE', 'NU', 'SNÅR', 'SIDA', 'STEG', 'GÅNGSTIG', 'KOMMUNIKATIONSNÄT',
                'KRAFTLEDNINGSSTOLPEN', 'SKALBAGGE', 'SOL', 'SKÖLD', 'FLYGVINGARNA', 'FALLSKÄRM',
                'EXPERT',
            ],
        ],
        2020: [
            [
                'VRAK', 'KRETSANDE', 'PUNKT', 'STILLHET', 'HAV', 'LJUS', 'BETSEL', 'TÅNG',
                'STRAND', 'JORD', 'MÖRKER', 'FLADDERMUS', 'VRAK', 'STJÄRNA',
            ],
            [
                'ÅR', 'STÖVEL', 'SOL', 'TRÄD', 'VIND', 'FRIHET', 'BERG', 'FOT',
                'BARRSKOGSBRÄNNINGEN', 'MEN', 'SOMMAR', 'DYNING', 'TRÄD', 'TOPP', 'ÖGONBLICK',
                'KUST',
            ],
        ],
    }
    corpus = SparvTokenizedCsvCorpus(
        SPARV_ZIPPED_CSV_EXPORT_FILENAME,
        reader_opts=TextReaderOpts(filename_fields="year:_:1"),
        extract_opts=ExtractTaggedTokensOpts(
            lemmatize=True,
            pos_includes='|NN|',
            pos_paddings=None,
            **SPARV_TAGGED_COLUMNS,
        ),
        transform_opts=TokensTransformOpts(
            min_len=2,
            to_upper=True,
        ),
    )
    partitions = corpus.partition_documents('year')
    for key in partitions:
        corpus.reader.apply_filter(partitions[key])
        assert expected_groups[key] == corpus.document_names
        tokens = [x for x in corpus.terms]
        assert expected_tokens[key] == tokens

def process(
    corpus_config: Optional[str] = None,
    input_filename: Optional[str] = None,
    output_folder: Optional[str] = None,
    output_tag: Optional[str] = None,
    filename_pattern: Optional[str] = None,
    phrase: Sequence[str] = None,
    phrase_file: Optional[str] = None,
    create_subfolder: bool = True,
    pos_includes: Optional[str] = None,
    pos_paddings: Optional[str] = None,
    pos_excludes: Optional[str] = None,
    append_pos: bool = False,
    to_lower: bool = True,
    lemmatize: bool = True,
    remove_stopwords: Optional[str] = None,
    min_word_length: int = 2,
    max_word_length: int = None,
    keep_symbols: bool = False,
    keep_numerals: bool = False,
    only_any_alphanumeric: bool = False,
    only_alphabetic: bool = False,
    tf_threshold: int = 1,
    tf_threshold_mask: bool = False,
    max_tokens: int = None,
    enable_checkpoint: bool = True,
    force_checkpoint: bool = False,
    deserialize_processes: int = 4,
):
    try:
        corpus_config: CorpusConfig = CorpusConfig.load(corpus_config)
        phrases = parse_phrases(phrase_file, phrase)

        if pos_excludes is None:
            pos_excludes = pos_tags_to_str(corpus_config.pos_schema.Delimiter)

        # Guard against None: `pos_paddings` is optional and only expanded when given.
        if pos_paddings is not None and pos_paddings.upper() in ["FULL", "ALL", "PASSTHROUGH"]:
            pos_paddings = pos_tags_to_str(corpus_config.pos_schema.all_types_except(pos_includes))
            logger.info(f"PoS paddings expanded to: {pos_paddings}")

        text_reader_opts: TextReaderOpts = corpus_config.text_reader_opts.copy()

        if filename_pattern is not None:
            text_reader_opts.filename_pattern = filename_pattern

        corpus_config.checkpoint_opts.deserialize_processes = max(1, deserialize_processes)

        tagged_columns: dict = corpus_config.pipeline_payload.tagged_columns_names

        args: interface.ComputeOpts = interface.ComputeOpts(
            corpus_type=corpus_config.corpus_type,
            corpus_source=input_filename,
            target_folder=output_folder,
            corpus_tag=output_tag,
            transform_opts=TokensTransformOpts(
                to_lower=to_lower,
                to_upper=False,
                min_len=min_word_length,
                max_len=max_word_length,
                remove_accents=False,
                remove_stopwords=(remove_stopwords is not None),
                stopwords=None,
                extra_stopwords=None,
                language=remove_stopwords,
                keep_numerals=keep_numerals,
                keep_symbols=keep_symbols,
                only_alphabetic=only_alphabetic,
                only_any_alphanumeric=only_any_alphanumeric,
            ),
            text_reader_opts=text_reader_opts,
            extract_opts=ExtractTaggedTokensOpts(
                pos_includes=pos_includes,
                pos_paddings=pos_paddings,
                pos_excludes=pos_excludes,
                lemmatize=lemmatize,
                phrases=phrases,
                append_pos=append_pos,
                global_tf_threshold=tf_threshold,
                global_tf_threshold_mask=tf_threshold_mask,
                **tagged_columns,
            ),
            vectorize_opts=VectorizeOpts(
                already_tokenized=True,
                min_tf=tf_threshold,
                max_tokens=max_tokens,
            ),
            tf_threshold=tf_threshold,
            tf_threshold_mask=tf_threshold_mask,
            create_subfolder=create_subfolder,
            persist=True,
            enable_checkpoint=enable_checkpoint,
            force_checkpoint=force_checkpoint,
        )

        workflow.compute(args=args, corpus_config=corpus_config)

        logger.info('Done!')

    except Exception as ex:
        logger.exception(ex)
        click.echo(ex)
        sys.exit(1)

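# A hypothetical direct invocation of `process` (a sketch, not CLI wiring):
# the paths below mirror the test fixtures used elsewhere in this repo and
# are assumptions, not guaranteed to exist in every checkout.
def example_process_run():
    process(
        corpus_config='./tests/test_data/tranströmer.yml',
        input_filename='./tests/test_data/tranströmer_corpus_pos_csv.zip',
        output_folder='./tests/output',
        output_tag='example_run',
        pos_includes='|NN|',
        pos_paddings=None,
        lemmatize=True,
    )
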
def filter_tagged_frame(
    tagged_frame: pd.DataFrame,
    *,
    extract_opts: ExtractTaggedTokensOpts,
    token2id: Token2Id = None,
    pos_schema: PoS_Tag_Scheme = None,
    normalize_column_names: bool = True,
    transform_opts: TokensTransformOpts = None,
) -> pd.DataFrame:
    """Filters a tagged frame (text or numeric). Returns the filtered tagged frame.

    Args:
        tagged_frame (pd.DataFrame): Document frame to be filtered, can be text or numeric.
        extract_opts (ExtractTaggedTokensOpts): PoS and lemma extract/filter opts.
        token2id (Token2Id, optional): Vocabulary. Defaults to None.
        pos_schema (PoS_Tag_Scheme, optional): PoS schema. Defaults to None.
        transform_opts (TokensTransformOpts, optional): Filters and transforms. Defaults to None.
        normalize_column_names (bool, optional): If text, rename columns to `token` and `pos`. Defaults to True.

    Raises:
        Token2IdMissingError: Token2Id is mandatory if frame is numeric.
        PoSTagSchemaMissingError: PoS schema is mandatory if frame is numeric.
        TaggedFrameColumnNameError: Missing target column (corrupt data).

    Returns:
        pd.DataFrame: Filtered and transformed document frame.
    """
    if len(tagged_frame) == 0:
        # Empty in, empty out (keeps the declared pd.DataFrame return type).
        return tagged_frame

    is_numeric_frame: bool = is_encoded_tagged_frame(tagged_frame)
    to_lower: bool = transform_opts and transform_opts.to_lower

    if is_numeric_frame:
        if token2id is None:
            raise Token2IdMissingError("filter_tagged_frame: cannot filter tagged id frame without vocabulary")

        if pos_schema is None:
            raise PoSTagSchemaMissingError("filter_tagged_frame: cannot filter tagged id frame without pos_schema")

        if to_lower:
            logger.warning("lowercasing not implemented for numeric tagged frames")
            to_lower = False

    if not is_numeric_frame and extract_opts.lemmatize is None and extract_opts.target_override is None:
        raise ValueError("a valid target not supplied (no lemmatize or target override)")

    target_column: str = extract_opts.target_column
    pos_column: str = extract_opts.pos_column

    if target_column not in tagged_frame.columns:
        raise TaggedFrameColumnNameError(f"{target_column} is not a valid target for given document (missing column)")

    if pos_column not in tagged_frame.columns:
        raise ValueError(f"configuration error: {pos_column} not in document")

    passthroughs: Set[str] = extract_opts.get_passthrough_tokens()
    blocks: Set[str] = extract_opts.get_block_tokens().union('')

    if is_numeric_frame:
        passthroughs = token2id.to_id_set(passthroughs)
        blocks = token2id.to_id_set(blocks)

    if not is_numeric_frame and (extract_opts.lemmatize or to_lower):
        tagged_frame[target_column] = tagged_frame[target_column].str.lower()
        # pd.Series([x.lower() for x in tagged_frame[target_column]])
        passthroughs = {x.lower() for x in passthroughs}

    # if extract_opts.block_chars:
    #     for char in extract_opts.block_chars:
    #         doc[target] = doc[target].str.replace(char, '', regex=False)

    """Phrase detection"""
    if extract_opts.phrases:
        if is_numeric_frame:
            logger.warning("phrase detection not implemented for numeric tagged frames")
            extract_opts.phrases = None
        else:
            found_phrases = detect_phrases(tagged_frame[target_column], extract_opts.phrases, ignore_case=to_lower)
            if found_phrases:
                tagged_frame = merge_phrases(tagged_frame, found_phrases, target_column=target_column, pad=PHRASE_PAD)
                passthroughs = passthroughs.union({'_'.join(x[1]) for x in found_phrases})

    mask = np.repeat(True, len(tagged_frame.index))

    if extract_opts.filter_opts and extract_opts.filter_opts.data:
        mask &= extract_opts.filter_opts.mask(tagged_frame)

    pos_includes: Set[str] = extract_opts.get_pos_includes()
    pos_excludes: Set[str] = extract_opts.get_pos_excludes()
    pos_paddings: Set[str] = extract_opts.get_pos_paddings()

    if is_numeric_frame:
        pg = pos_schema.pos_to_id.get
        pos_includes = {pg(x) for x in pos_includes}
        pos_excludes = {pg(x) for x in pos_excludes}
        pos_paddings = {pg(x) for x in pos_paddings}

    if pos_includes:
        """Don't filter if PoS-include is empty - and don't filter out PoS tokens that should be padded"""
        mask &= tagged_frame[pos_column].isin(pos_includes.union(pos_paddings))

    if pos_excludes:
        mask &= ~(tagged_frame[pos_column].isin(pos_excludes))

    if transform_opts and transform_opts.has_effect:
        mask &= transform_opts.mask(tagged_frame[target_column], token2id=token2id)

    if len(passthroughs) > 0:
        mask |= tagged_frame[target_column].isin(passthroughs)

    if len(blocks) > 0:
        mask &= ~tagged_frame[target_column].isin(blocks)

    filtered_data: pd.DataFrame = tagged_frame.loc[mask][[target_column, pos_column]]

    if extract_opts.global_tf_threshold > 1:
        if token2id is None or token2id.tf is None:
            logger.error("Cannot apply TF filter since token2id has no term frequencies")
            extract_opts.global_tf_threshold = 1
        else:
            filtered_data = filter_tagged_frame_by_term_frequency(
                tagged_frame=filtered_data,
                target_column=target_column,
                token2id=token2id,
                extract_opts=extract_opts,
                passthroughs=passthroughs,
            )

    if not is_numeric_frame and normalize_column_names:
        filtered_data.rename(columns={target_column: 'token', pos_column: 'pos'}, inplace=True)

    return filtered_data

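# A minimal usage sketch of `filter_tagged_frame`, assuming Sparv-style column
# names (token/baseform/pos) as used elsewhere in this repo; the sample frame
# and PoS tags below are hypothetical test data.
def test_filter_tagged_frame_keeps_only_included_pos():
    tagged_frame = pd.DataFrame(
        {
            'token': ['Detta', 'är', 'en', 'mening', '.'],
            'baseform': ['detta', 'vara', 'en', 'mening', '.'],
            'pos': ['PN', 'VB', 'DT', 'NN', 'MAD'],
        }
    )
    extract_opts = ExtractTaggedTokensOpts(
        lemmatize=False,
        pos_includes='|NN|',
        pos_paddings=None,
        text_column='token',
        lemma_column='baseform',
        pos_column='pos',
    )
    filtered = filter_tagged_frame(tagged_frame, extract_opts=extract_opts)
    # Only the noun ('NN') row survives; columns are normalized to token/pos.
    assert filtered['token'].tolist() == ['mening']
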
def test_transform_smoke_test():
    transformer = TokensTransformer(transform_opts=TokensTransformOpts())
    assert transformer is not None