def test_create_vocabulary_word():
    """Build a word-level vocabulary from a tiny text column and verify its
    size and that every special symbol sits at its reserved index.

    Fix: the original asserted UNKNOWN twice and never asserted START; each of
    the four special symbols (START, STOP, PADDING, UNKNOWN) is now checked
    exactly once.
    """
    data = pd.DataFrame([
        "Hello, I'm a single sentence!",
        "And another sentence",
        "And the very very last one",
    ])
    column = data[0]
    preprocessing_parameters = TextFeatureMixin.preprocessing_defaults()
    vocabulary_output = strings_utils.create_vocabulary(
        column,
        tokenizer_type=preprocessing_parameters["word_tokenizer"],
        num_most_frequent=preprocessing_parameters["word_most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        vocab_file=preprocessing_parameters["word_vocab_file"],
        unknown_symbol=preprocessing_parameters["unknown_symbol"],
        padding_symbol=preprocessing_parameters["padding_symbol"],
        pretrained_model_name_or_path=preprocessing_parameters["pretrained_model_name_or_path"],
    )
    # First element of the returned tuple is the vocabulary list itself.
    vocab = vocabulary_output[0]
    assert len(vocab) == 19
    # Each special symbol must occupy the index its SpecialSymbol enum value
    # reserves for it.
    assert vocab[strings_utils.SpecialSymbol.START.value] == strings_utils.START_SYMBOL
    assert vocab[strings_utils.SpecialSymbol.STOP.value] == strings_utils.STOP_SYMBOL
    assert vocab[strings_utils.SpecialSymbol.PADDING.value] == strings_utils.PADDING_SYMBOL
    assert vocab[strings_utils.SpecialSymbol.UNKNOWN.value] == strings_utils.UNKNOWN_SYMBOL
def test_create_vocabulary_from_hf():
    """Create a vocabulary via a HuggingFace tokenizer ("albert-base-v2")
    and verify it has the pretrained model's full vocabulary size (30000).
    """
    sentences = [
        "Hello, I'm a single sentence!",
        "And another sentence",
        "And the very very last one",
    ]
    text_column = pd.DataFrame(sentences)[0]
    defaults = TextFeatureMixin.preprocessing_defaults()
    # Only the vocabulary list (first element of the result) is needed here.
    vocab, *_rest = strings_utils.create_vocabulary(
        text_column,
        tokenizer_type="hf_tokenizer",
        num_most_frequent=defaults["most_common"],
        lowercase=defaults["lowercase"],
        unknown_symbol=defaults["unknown_symbol"],
        padding_symbol=defaults["padding_symbol"],
        pretrained_model_name_or_path="albert-base-v2",
    )
    assert len(vocab) == 30000
def test_validate_with_preprocessing_defaults():
    """A config in which every input feature carries its type's preprocessing
    defaults must pass schema validation both as written and after being
    merged with the global defaults.
    """
    input_features = [
        audio_feature(
            "/tmp/destination_folder",
            preprocessing=AudioFeatureMixin.preprocessing_defaults(),
        ),
        bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults()),
        binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults()),
        category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults()),
        date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults()),
        h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults()),
        image_feature(
            "/tmp/destination_folder",
            preprocessing=ImageFeatureMixin.preprocessing_defaults(),
        ),
        number_feature(preprocessing=NumberFeatureMixin.preprocessing_defaults()),
        sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults()),
        set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults()),
        text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults()),
        timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults()),
        vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults()),
    ]
    config = {
        "input_features": input_features,
        "output_features": [{"name": "target", "type": "category"}],
        "trainer": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }
    # Must validate both before and after merging in the global defaults.
    validate_config(config)
    validate_config(merge_with_defaults(config))