def feature_meta(column, most_common_characters, most_common_words, lowercase):
    """Build character- and word-level vocabularies for a text column.

    Calls ``create_vocabulary`` twice — once with the ``'characters'``
    tokenizer and once with ``'english_tokenize'`` — and returns the two
    4-element results as one flat 8-tuple, character fields first:
    (idx2str, str2idx, str2freq, max_len) for characters, then the same
    four fields for words.
    """
    c_idx2str, c_str2idx, c_str2freq, c_max_len = create_vocabulary(
        column,
        'characters',
        num_most_frequent=most_common_characters,
        lowercase=lowercase,
    )
    w_idx2str, w_str2idx, w_str2freq, w_max_len = create_vocabulary(
        column,
        'english_tokenize',
        num_most_frequent=most_common_words,
        lowercase=lowercase,
    )
    return (
        c_idx2str, c_str2idx, c_str2freq, c_max_len,
        w_idx2str, w_str2idx, w_str2freq, w_max_len,
    )
def feature_meta(column, preprocessing_parameters, backend):
    """Build character- and word-level vocabulary metadata for a text column.

    Runs ``create_vocabulary`` twice — once with the character tokenizer and
    once with the configured word tokenizer — and returns the two 8-element
    results concatenated into a single 16-tuple, character fields first:
    (idx2str, str2idx, str2freq, max_len, 99th-percentile max_len, pad_idx,
    pad_symbol, unk_symbol) per vocabulary.
    """
    # Options identical across both tokenization passes.
    shared = {
        "lowercase": preprocessing_parameters["lowercase"],
        "unknown_symbol": preprocessing_parameters["unknown_symbol"],
        "padding_symbol": preprocessing_parameters["padding_symbol"],
        "pretrained_model_name_or_path": preprocessing_parameters["pretrained_model_name_or_path"],
        "processor": backend.df_engine,
    }
    (
        c_idx2str, c_str2idx, c_str2freq, c_max_len,
        c_99ptile_max_len, c_pad_idx, c_pad_symbol, c_unk_symbol,
    ) = create_vocabulary(
        column,
        tokenizer_type="characters",
        num_most_frequent=preprocessing_parameters["char_most_common"],
        **shared,
    )
    (
        w_idx2str, w_str2idx, w_str2freq, w_max_len,
        w_99ptile_max_len, w_pad_idx, w_pad_symbol, w_unk_symbol,
    ) = create_vocabulary(
        column,
        tokenizer_type=preprocessing_parameters["word_tokenizer"],
        num_most_frequent=preprocessing_parameters["word_most_common"],
        vocab_file=preprocessing_parameters["word_vocab_file"],
        **shared,
    )
    return (
        c_idx2str, c_str2idx, c_str2freq, c_max_len,
        c_99ptile_max_len, c_pad_idx, c_pad_symbol, c_unk_symbol,
        w_idx2str, w_str2idx, w_str2freq, w_max_len,
        w_99ptile_max_len, w_pad_idx, w_pad_symbol, w_unk_symbol,
    )
def feature_meta(column, preprocessing_parameters, backend):
    """Build character- and word-level vocabulary metadata for a text column.

    Invokes ``create_vocabulary`` with the character tokenizer and then with
    the configured word tokenizer; the two 7-element results are returned as
    one flat 14-tuple, character fields first: (idx2str, str2idx, str2freq,
    max_len, pad_idx, pad_symbol, unk_symbol) per vocabulary.
    """
    # Parameters that both tokenization passes share.
    common = {
        'lowercase': preprocessing_parameters['lowercase'],
        'unknown_symbol': preprocessing_parameters['unknown_symbol'],
        'padding_symbol': preprocessing_parameters['padding_symbol'],
        'pretrained_model_name_or_path': preprocessing_parameters['pretrained_model_name_or_path'],
        'processor': backend.df_engine,
    }
    (
        c_idx2str, c_str2idx, c_str2freq, c_max_len,
        c_pad_idx, c_pad_symbol, c_unk_symbol,
    ) = create_vocabulary(
        column,
        tokenizer_type='characters',
        num_most_frequent=preprocessing_parameters['char_most_common'],
        **common,
    )
    (
        w_idx2str, w_str2idx, w_str2freq, w_max_len,
        w_pad_idx, w_pad_symbol, w_unk_symbol,
    ) = create_vocabulary(
        column,
        tokenizer_type=preprocessing_parameters['word_tokenizer'],
        num_most_frequent=preprocessing_parameters['word_most_common'],
        vocab_file=preprocessing_parameters['word_vocab_file'],
        **common,
    )
    return (
        c_idx2str, c_str2idx, c_str2freq, c_max_len,
        c_pad_idx, c_pad_symbol, c_unk_symbol,
        w_idx2str, w_str2idx, w_str2freq, w_max_len,
        w_pad_idx, w_pad_symbol, w_unk_symbol,
    )
def feature_meta(column, preprocessing_parameters):
    """Build character- and word-level vocabularies for a text column.

    Tokenizer settings are read from ``preprocessing_parameters``; the word
    tokenizer name comes from its ``'word_format'`` entry.  Returns an
    8-tuple: (idx2str, str2idx, str2freq, max_len) for the character
    vocabulary followed by the same four fields for the word vocabulary.
    """
    c_idx2str, c_str2idx, c_str2freq, c_max_len = create_vocabulary(
        column,
        'characters',
        num_most_frequent=preprocessing_parameters['char_most_common'],
        lowercase=preprocessing_parameters['lowercase'],
    )
    w_idx2str, w_str2idx, w_str2freq, w_max_len = create_vocabulary(
        column,
        preprocessing_parameters['word_format'],
        num_most_frequent=preprocessing_parameters['word_most_common'],
        lowercase=preprocessing_parameters['lowercase'],
    )
    return (
        c_idx2str, c_str2idx, c_str2freq, c_max_len,
        w_idx2str, w_str2idx, w_str2freq, w_max_len,
    )
def test_create_vocabulary_word():
    """Word-tokenizer vocabulary covers all tokens plus the special symbols.

    The original version asserted UNKNOWN twice and never checked START;
    the duplicate is replaced with a START assertion so each of the four
    special symbols is verified exactly once at its reserved index.
    """
    data = pd.DataFrame([
        "Hello, I'm a single sentence!",
        "And another sentence",
        "And the very very last one",
    ])
    column = data[0]
    preprocessing_parameters = TextFeatureMixin.preprocessing_defaults()
    vocabulary_output = strings_utils.create_vocabulary(
        column,
        tokenizer_type=preprocessing_parameters["word_tokenizer"],
        num_most_frequent=preprocessing_parameters["word_most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        vocab_file=preprocessing_parameters["word_vocab_file"],
        unknown_symbol=preprocessing_parameters["unknown_symbol"],
        padding_symbol=preprocessing_parameters["padding_symbol"],
        pretrained_model_name_or_path=preprocessing_parameters["pretrained_model_name_or_path"],
    )
    vocab = vocabulary_output[0]
    # 15 distinct word tokens from the three sentences + 4 special symbols.
    assert len(vocab) == 19
    # Each special symbol must sit at the index its enum value reserves.
    # NOTE(review): START replaces the original duplicated UNKNOWN check —
    # confirm strings_utils defines SpecialSymbol.START / START_SYMBOL.
    assert vocab[strings_utils.SpecialSymbol.START.value] == strings_utils.START_SYMBOL
    assert vocab[strings_utils.SpecialSymbol.STOP.value] == strings_utils.STOP_SYMBOL
    assert vocab[strings_utils.SpecialSymbol.PADDING.value] == strings_utils.PADDING_SYMBOL
    assert vocab[strings_utils.SpecialSymbol.UNKNOWN.value] == strings_utils.UNKNOWN_SYMBOL
def feature_meta(column, preprocessing_parameters, backend):
    """Build vocabulary metadata for a column with a single tokenizer.

    All tokenizer options are read from ``preprocessing_parameters``.
    Returns the 8-tuple (idx2str, str2idx, str2freq, max_len,
    max_len_99ptile, pad_idx, padding_symbol, unknown_symbol).
    """
    vocab_info = create_vocabulary(
        column,
        tokenizer_type=preprocessing_parameters["tokenizer"],
        num_most_frequent=preprocessing_parameters["most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        vocab_file=preprocessing_parameters["vocab_file"],
        unknown_symbol=preprocessing_parameters["unknown_symbol"],
        padding_symbol=preprocessing_parameters["padding_symbol"],
        pretrained_model_name_or_path=preprocessing_parameters["pretrained_model_name_or_path"],
        processor=backend.df_engine,
    )
    (
        i2s, s2i, s2f, longest,
        longest_99ptile, pad_index, pad_sym, unk_sym,
    ) = vocab_info
    return (i2s, s2i, s2f, longest, longest_99ptile, pad_index, pad_sym, unk_sym)
def get_feature_meta(column, preprocessing_parameters, backend):
    """Compute category-style vocabulary metadata for a column.

    The column is coerced to string and tokenized with the ``"stripped"``
    tokenizer (no padding symbol added).  Returns a dict with the mappings
    ``idx2str``/``str2idx``/``str2freq`` and the resulting ``vocab_size``.
    """
    column = column.astype(str)
    vocab_info = create_vocabulary(
        column,
        "stripped",
        num_most_frequent=preprocessing_parameters["most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        add_padding=False,
        processor=backend.df_engine,
    )
    # Only the first three fields of the 7-tuple are needed here.
    i2s, s2i, s2f = vocab_info[0], vocab_info[1], vocab_info[2]
    return {
        "idx2str": i2s,
        "str2idx": s2i,
        "str2freq": s2f,
        "vocab_size": len(s2i),
    }
def feature_meta(column, preprocessing_parameters):
    """Build character- and word-level vocabularies for a text column.

    Both passes share the lowercase/unknown/padding settings from
    ``preprocessing_parameters``; the word pass additionally honours the
    configured tokenizer and optional vocabulary file.  Returns an 8-tuple:
    (idx2str, str2idx, str2freq, max_len) for characters, then for words.
    """
    c_i2s, c_s2i, c_s2f, c_len = create_vocabulary(
        column,
        tokenizer_type='characters',
        num_most_frequent=preprocessing_parameters['char_most_common'],
        lowercase=preprocessing_parameters['lowercase'],
        unknown_symbol=preprocessing_parameters['unknown_symbol'],
        padding_symbol=preprocessing_parameters['padding_symbol'],
    )
    w_i2s, w_s2i, w_s2f, w_len = create_vocabulary(
        column,
        tokenizer_type=preprocessing_parameters['word_tokenizer'],
        num_most_frequent=preprocessing_parameters['word_most_common'],
        lowercase=preprocessing_parameters['lowercase'],
        vocab_file=preprocessing_parameters['word_vocab_file'],
        unknown_symbol=preprocessing_parameters['unknown_symbol'],
        padding_symbol=preprocessing_parameters['padding_symbol'],
    )
    return (c_i2s, c_s2i, c_s2f, c_len, w_i2s, w_s2i, w_s2f, w_len)
def get_feature_meta(column, preprocessing_parameters):
    """Compute set-style vocabulary metadata for a column.

    Returns a dict with the ``idx2str``/``str2idx``/``str2freq`` mappings,
    the ``vocab_size``, and ``max_set_size`` (largest token set observed,
    the fourth value returned by ``create_vocabulary``).
    """
    i2s, s2i, s2f, largest_set = create_vocabulary(
        column,
        preprocessing_parameters['tokenizer'],
        num_most_frequent=preprocessing_parameters['most_common'],
        lowercase=preprocessing_parameters['lowercase'],
    )
    return {
        'idx2str': i2s,
        'str2idx': s2i,
        'str2freq': s2f,
        'vocab_size': len(s2i),
        'max_set_size': largest_set,
    }
def get_feature_meta(column, preprocessing_parameters):
    """Compute category-style vocabulary metadata for a column.

    Uses the ``'stripped'`` tokenizer without a padding symbol.  Returns a
    dict with ``idx2str``/``str2idx``/``str2freq`` and ``vocab_size``.
    """
    # The fourth return value (max length) is irrelevant for categories.
    i2s, s2i, s2f, _unused = create_vocabulary(
        column,
        'stripped',
        num_most_frequent=preprocessing_parameters['most_common'],
        lowercase=preprocessing_parameters['lowercase'],
        add_padding=False,
    )
    return {
        'idx2str': i2s,
        'str2idx': s2i,
        'str2freq': s2f,
        'vocab_size': len(s2i),
    }
def get_feature_meta(column, preprocessing_parameters, backend):
    """Compute category-style vocabulary metadata for a column.

    The column is coerced to string first, then tokenized with the
    ``'stripped'`` tokenizer (no padding) on the backend's dataframe engine.
    Returns a dict with ``idx2str``/``str2idx``/``str2freq`` and
    ``vocab_size``.
    """
    column = column.astype(str)
    vocab_info = create_vocabulary(
        column,
        'stripped',
        num_most_frequent=preprocessing_parameters['most_common'],
        lowercase=preprocessing_parameters['lowercase'],
        add_padding=False,
        processor=backend.df_engine,
    )
    # Only the three mapping fields of the 7-tuple matter here.
    i2s, s2i, s2f = vocab_info[:3]
    return {
        'idx2str': i2s,
        'str2idx': s2i,
        'str2freq': s2f,
        'vocab_size': len(s2i),
    }
def get_feature_meta(column, preprocessing_parameters, backend):
    """Compute set-style vocabulary metadata for a column.

    The column is coerced to string, then tokenized with the configured
    tokenizer on the backend's dataframe engine.  Returns a dict with the
    vocabulary mappings, ``vocab_size``, and ``max_set_size`` (the fourth
    field of the ``create_vocabulary`` result).
    """
    column = column.astype(str)
    vocab_info = create_vocabulary(
        column,
        preprocessing_parameters['tokenizer'],
        num_most_frequent=preprocessing_parameters['most_common'],
        lowercase=preprocessing_parameters['lowercase'],
        processor=backend.df_engine,
    )
    i2s, s2i, s2f, largest_set = vocab_info[:4]
    return {
        'idx2str': i2s,
        'str2idx': s2i,
        'str2freq': s2f,
        'vocab_size': len(s2i),
        'max_set_size': largest_set,
    }
def get_feature_meta(column, preprocessing_parameters):
    """Compute sequence-style vocabulary metadata for a column.

    Tokenizes with the configured ``'format'`` and clamps the observed
    maximum length to ``'sequence_length_limit'``.  Returns a dict with the
    vocabulary mappings, ``vocab_size``, and ``max_sequence_length``.
    """
    i2s, s2i, s2f, observed_len = create_vocabulary(
        column,
        preprocessing_parameters['format'],
        lowercase=preprocessing_parameters['lowercase'],
        num_most_frequent=preprocessing_parameters['most_common'],
    )
    # Never exceed the user-configured length limit.
    clamped_len = min(
        preprocessing_parameters['sequence_length_limit'],
        observed_len,
    )
    return {
        'idx2str': i2s,
        'str2idx': s2i,
        'str2freq': s2f,
        'vocab_size': len(i2s),
        'max_sequence_length': clamped_len,
    }
def get_feature_meta(column, preprocessing_parameters, backend):
    """Compute set-style vocabulary metadata for a column.

    Tokenizes with the configured tokenizer on the backend's dataframe
    engine.  Returns a dict with the vocabulary mappings, ``vocab_size``,
    and ``max_set_size`` (the fourth field of the 8-tuple returned by
    ``create_vocabulary``; the remaining fields are unused here).
    """
    vocab_info = create_vocabulary(
        column,
        preprocessing_parameters["tokenizer"],
        num_most_frequent=preprocessing_parameters["most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        processor=backend.df_engine,
    )
    i2s, s2i, s2f, largest_set = vocab_info[:4]
    return {
        "idx2str": i2s,
        "str2idx": s2i,
        "str2freq": s2f,
        "vocab_size": len(s2i),
        "max_set_size": largest_set,
    }
def test_create_vocabulary_from_hf():
    """A hf_tokenizer-built vocabulary should come from the pretrained model.

    albert-base-v2 ships a 30000-piece vocabulary, regardless of the input
    column's contents.
    """
    sentences = [
        "Hello, I'm a single sentence!",
        "And another sentence",
        "And the very very last one",
    ]
    column = pd.DataFrame(sentences)[0]
    preprocessing_parameters = TextFeatureMixin.preprocessing_defaults()
    vocabulary_output = strings_utils.create_vocabulary(
        column,
        tokenizer_type="hf_tokenizer",
        num_most_frequent=preprocessing_parameters["most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        unknown_symbol=preprocessing_parameters["unknown_symbol"],
        padding_symbol=preprocessing_parameters["padding_symbol"],
        pretrained_model_name_or_path="albert-base-v2",
    )
    # The vocabulary (first element of the result) is the model's own.
    assert len(vocabulary_output[0]) == 30000
def get_feature_meta(column, preprocessing_parameters):
    """Compute sequence-style vocabulary metadata for a column.

    Tokenizes with the configured tokenizer (optionally seeded from a
    vocabulary file) and clamps the observed maximum length to
    ``'sequence_length_limit'``.  Returns a dict with the vocabulary
    mappings, ``vocab_size``, and ``max_sequence_length``.
    """
    vocab_info = create_vocabulary(
        column,
        preprocessing_parameters['tokenizer'],
        lowercase=preprocessing_parameters['lowercase'],
        num_most_frequent=preprocessing_parameters['most_common'],
        vocab_file=preprocessing_parameters['vocab_file'],
        unknown_symbol=preprocessing_parameters['unknown_symbol'],
        padding_symbol=preprocessing_parameters['padding_symbol'],
    )
    i2s, s2i, s2f, observed_len = vocab_info[:4]
    # Never exceed the user-configured length limit.
    clamped_len = min(
        preprocessing_parameters['sequence_length_limit'],
        observed_len,
    )
    return {
        'idx2str': i2s,
        'str2idx': s2i,
        'str2freq': s2f,
        'vocab_size': len(i2s),
        'max_sequence_length': clamped_len,
    }
def get_feature_meta(column, preprocessing_parameters, backend):
    """Compute sequence-style vocabulary metadata for a column.

    Tokenizes with the configured tokenizer (optionally seeded from a
    vocabulary file) on the backend's dataframe engine, clamps the observed
    maximum length to ``'max_sequence_length'``, and reserves two extra
    positions for the start and stop symbols.  Returns a dict with the
    vocabulary mappings, ``vocab_size``, and ``max_sequence_length``.
    """
    vocab_info = create_vocabulary(
        column,
        preprocessing_parameters["tokenizer"],
        lowercase=preprocessing_parameters["lowercase"],
        num_most_frequent=preprocessing_parameters["most_common"],
        vocab_file=preprocessing_parameters["vocab_file"],
        unknown_symbol=preprocessing_parameters["unknown_symbol"],
        padding_symbol=preprocessing_parameters["padding_symbol"],
        processor=backend.df_engine,
    )
    i2s, s2i, s2f, observed_len = vocab_info[:4]
    # Never exceed the user-configured length limit.
    clamped_len = min(preprocessing_parameters["max_sequence_length"], observed_len)
    return {
        "idx2str": i2s,
        "str2idx": s2i,
        "str2freq": s2f,
        "vocab_size": len(i2s),
        # +2 reserves room for the start and end symbols.
        "max_sequence_length": clamped_len + 2,
    }