def feature_data(column, metadata, preprocessing_parameters):
    char_data = build_sequence_matrix(
        sequences=column,
        inverse_vocabulary=metadata['char_str2idx'],
        tokenizer_type=preprocessing_parameters['char_tokenizer'],
        length_limit=metadata['char_max_sequence_length'],
        padding_symbol=preprocessing_parameters['padding_symbol'],
        padding=preprocessing_parameters['padding'],
        unknown_symbol=preprocessing_parameters['unknown_symbol'],
        lowercase=preprocessing_parameters['lowercase'],
        tokenizer_vocab_file=preprocessing_parameters['char_vocab_file'],
    )
    word_data = build_sequence_matrix(
        sequences=column,
        inverse_vocabulary=metadata['word_str2idx'],
        tokenizer_type=preprocessing_parameters['word_tokenizer'],
        length_limit=metadata['word_max_sequence_length'],
        padding_symbol=preprocessing_parameters['padding_symbol'],
        padding=preprocessing_parameters['padding'],
        unknown_symbol=preprocessing_parameters['unknown_symbol'],
        lowercase=preprocessing_parameters['lowercase'],
        tokenizer_vocab_file=preprocessing_parameters['word_vocab_file'],
    )
    return char_data, word_data

def feature_data(column, metadata, preprocessing_parameters, backend):
    char_data = build_sequence_matrix(
        sequences=column,
        inverse_vocabulary=metadata['char_str2idx'],
        tokenizer_type=preprocessing_parameters['char_tokenizer'],
        length_limit=metadata['char_max_sequence_length'],
        padding_symbol=metadata['char_pad_symbol'],
        padding=preprocessing_parameters['padding'],
        unknown_symbol=metadata['char_unk_symbol'],
        lowercase=preprocessing_parameters['lowercase'],
        tokenizer_vocab_file=preprocessing_parameters['char_vocab_file'],
        pretrained_model_name_or_path=preprocessing_parameters['pretrained_model_name_or_path'],
        processor=backend.df_engine,
    )
    word_data = build_sequence_matrix(
        sequences=column,
        inverse_vocabulary=metadata['word_str2idx'],
        tokenizer_type=preprocessing_parameters['word_tokenizer'],
        length_limit=metadata['word_max_sequence_length'],
        padding_symbol=metadata['word_pad_symbol'],
        padding=preprocessing_parameters['padding'],
        unknown_symbol=metadata['word_unk_symbol'],
        lowercase=preprocessing_parameters['lowercase'],
        tokenizer_vocab_file=preprocessing_parameters['word_vocab_file'],
        pretrained_model_name_or_path=preprocessing_parameters['pretrained_model_name_or_path'],
        processor=backend.df_engine,
    )
    return char_data, word_data

def feature_data(column, metadata, preprocessing_parameters):
    char_data = build_sequence_matrix(
        column,
        metadata['char_str2idx'],
        preprocessing_parameters['char_tokenizer'],
        metadata['char_max_sequence_length'],
        preprocessing_parameters['padding_symbol'],
        preprocessing_parameters['padding'],
        preprocessing_parameters['lowercase'],
        tokenizer_vocab_file=preprocessing_parameters['char_tokenizer_vocab_file'],
    )
    word_data = build_sequence_matrix(
        column,
        metadata['word_str2idx'],
        preprocessing_parameters['word_tokenizer'],
        metadata['word_max_sequence_length'],
        preprocessing_parameters['padding_symbol'],
        preprocessing_parameters['padding'],
        preprocessing_parameters['lowercase'],
        tokenizer_vocab_file=preprocessing_parameters['word_tokenizer_vocab_file'],
    )
    return char_data, word_data

def feature_data(column, metadata, preprocessing_parameters):
    sequence_data = build_sequence_matrix(
        column,
        metadata['str2idx'],
        preprocessing_parameters['format'],
        metadata['max_sequence_length'],
        preprocessing_parameters['padding_symbol'],
        preprocessing_parameters['padding'],
        preprocessing_parameters['lowercase'],
    )
    return sequence_data

def feature_data(column, metadata, preprocessing_parameters):
    char_data = build_sequence_matrix(
        column,
        metadata['char_str2idx'],
        preprocessing_parameters['char_format'],
        metadata['char_max_sequence_length'],
        preprocessing_parameters['padding_symbol'],
        preprocessing_parameters['padding'],
        preprocessing_parameters['lowercase'],
    )
    word_data = build_sequence_matrix(
        column,
        metadata['word_str2idx'],
        preprocessing_parameters['word_format'],
        metadata['word_max_sequence_length'],
        preprocessing_parameters['padding_symbol'],
        preprocessing_parameters['padding'],
        preprocessing_parameters['lowercase'],
    )
    return char_data, word_data

def feature_data(column, metadata, preprocessing_parameters, backend):
    sequence_data = build_sequence_matrix(
        sequences=column,
        inverse_vocabulary=metadata['str2idx'],
        tokenizer_type=preprocessing_parameters['tokenizer'],
        length_limit=metadata['max_sequence_length'],
        padding_symbol=preprocessing_parameters['padding_symbol'],
        padding=preprocessing_parameters['padding'],
        unknown_symbol=preprocessing_parameters['unknown_symbol'],
        lowercase=preprocessing_parameters['lowercase'],
        tokenizer_vocab_file=preprocessing_parameters['vocab_file'],
        processor=backend.df_engine,
    )
    return sequence_data

def feature_data(column, metadata, preprocessing_parameters, backend): sequence_data = build_sequence_matrix( sequences=column, inverse_vocabulary=metadata["str2idx"], tokenizer_type=preprocessing_parameters["tokenizer"], length_limit=metadata["max_sequence_length"], padding_symbol=preprocessing_parameters["padding_symbol"], padding=preprocessing_parameters["padding"], unknown_symbol=preprocessing_parameters["unknown_symbol"], lowercase=preprocessing_parameters["lowercase"], tokenizer_vocab_file=preprocessing_parameters["vocab_file"], processor=backend.df_engine, ) return sequence_data
import numpy as np
import pandas as pd

from ludwig.utils import strings_utils


def test_build_sequence_matrix():
    inverse_vocabulary = {
        "<EOS>": 0,
        "<SOS>": 1,
        "<PAD>": 2,
        "<UNK>": 3,
        "a": 4,
        "b": 5,
        "c": 6,
    }
    sequences = pd.core.series.Series(["a b c", "c b a"])
    sequence_matrix = strings_utils.build_sequence_matrix(
        sequences, inverse_vocabulary, tokenizer_type="space", length_limit=10
    )
    # Each row is padded to length_limit: <SOS>, tokens, <EOS>, then <PAD> (index 2).
    assert not (
        sequence_matrix.tolist()
        - np.array([[1, 4, 5, 6, 0, 2, 2, 2, 2, 2], [1, 6, 5, 4, 0, 2, 2, 2, 2, 2]])
    ).any()

def feature_data(column, metadata, preprocessing_parameters, backend):
    # TODO(1891): Remove backward compatibility hack once all models have been retrained with Ludwig after
    # https://github.com/ludwig-ai/ludwig/pull/1859.
    prefix = ""
    padding_symbol_metadata_key = "padding_symbol"
    unknown_symbol_metadata_key = "unknown_symbol"
    if "str2idx" not in metadata:
        prefix = "word_"
        padding_symbol_metadata_key = "word_pad_symbol"
        unknown_symbol_metadata_key = "word_unk_symbol"

    # ensure preprocessing param values match the metadata determined from dataset
    preprocessing_parameters["padding_symbol"] = metadata[padding_symbol_metadata_key]
    preprocessing_parameters["unknown_symbol"] = metadata[unknown_symbol_metadata_key]
    if preprocessing_parameters["fill_value"] == UNKNOWN_SYMBOL:
        preprocessing_parameters["fill_value"] = preprocessing_parameters["unknown_symbol"]
    if (
        "computed_fill_value" in preprocessing_parameters
        and preprocessing_parameters["computed_fill_value"] == UNKNOWN_SYMBOL
    ):
        preprocessing_parameters["computed_fill_value"] = preprocessing_parameters["unknown_symbol"]

    return build_sequence_matrix(
        sequences=column,
        inverse_vocabulary=metadata[f"{prefix}str2idx"],
        tokenizer_type=preprocessing_parameters[f"{prefix}tokenizer"],
        length_limit=metadata[f"{prefix}max_sequence_length"],
        padding_symbol=metadata[padding_symbol_metadata_key],
        padding=preprocessing_parameters["padding"],
        unknown_symbol=metadata[unknown_symbol_metadata_key],
        lowercase=preprocessing_parameters["lowercase"],
        tokenizer_vocab_file=preprocessing_parameters[f"{prefix}vocab_file"],
        pretrained_model_name_or_path=preprocessing_parameters["pretrained_model_name_or_path"],
        processor=backend.df_engine,
    )
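
# A minimal, self-contained sketch (not part of the Ludwig source above) of the
# backward-compatibility key selection used in the last variant: metadata written by
# older text features only carries "word_"-prefixed keys, so the lookup prefix and the
# padding/unknown symbol keys fall back to the legacy names. The helper and the two
# example dictionaries below are hypothetical and exist only for this illustration.
def _resolve_metadata_keys(metadata):
    # Prefer the new un-prefixed keys; otherwise fall back to the legacy "word_" keys.
    if "str2idx" in metadata:
        return "", "padding_symbol", "unknown_symbol"
    return "word_", "word_pad_symbol", "word_unk_symbol"


new_style_metadata = {"str2idx": {"<PAD>": 0}, "padding_symbol": "<PAD>", "unknown_symbol": "<UNK>"}
legacy_metadata = {"word_str2idx": {"<PAD>": 0}, "word_pad_symbol": "<PAD>", "word_unk_symbol": "<UNK>"}

assert _resolve_metadata_keys(new_style_metadata) == ("", "padding_symbol", "unknown_symbol")
assert _resolve_metadata_keys(legacy_metadata) == ("word_", "word_pad_symbol", "word_unk_symbol")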