Example #1
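Builds a word-level vocabulary with strings_utils.create_vocabulary, driven by the text feature's default preprocessing parameters, then checks the vocabulary size and that the special symbols sit at their reserved indices. The snippet is shown with the imports it appears to rely on in the Ludwig test suite.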
import pandas as pd

# Imports follow the Ludwig test-suite layout; exact module paths may differ
# between Ludwig versions.
from ludwig.features.text_feature import TextFeatureMixin
from ludwig.utils import strings_utils


def test_create_vocabulary_word():
    data = pd.DataFrame([
        "Hello, I'm a single sentence!",
        "And another sentence",
        "And the very very last one",
    ])
    column = data[0]
    preprocessing_parameters = TextFeatureMixin.preprocessing_defaults()

    # Build a word-level vocabulary from the column using the text feature's
    # default preprocessing parameters.
    vocabulary_output = strings_utils.create_vocabulary(
        column,
        tokenizer_type=preprocessing_parameters["word_tokenizer"],
        num_most_frequent=preprocessing_parameters["word_most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        vocab_file=preprocessing_parameters["word_vocab_file"],
        unknown_symbol=preprocessing_parameters["unknown_symbol"],
        padding_symbol=preprocessing_parameters["padding_symbol"],
        pretrained_model_name_or_path=preprocessing_parameters["pretrained_model_name_or_path"],
    )

    # The first element of the returned tuple is the vocabulary; the special
    # symbols must sit at their reserved indices.
    assert len(vocabulary_output[0]) == 19
    assert vocabulary_output[0][strings_utils.SpecialSymbol.UNKNOWN.value] == strings_utils.UNKNOWN_SYMBOL
    assert vocabulary_output[0][strings_utils.SpecialSymbol.STOP.value] == strings_utils.STOP_SYMBOL
    assert vocabulary_output[0][strings_utils.SpecialSymbol.PADDING.value] == strings_utils.PADDING_SYMBOL
    assert vocabulary_output[0][strings_utils.SpecialSymbol.UNKNOWN.value] == strings_utils.UNKNOWN_SYMBOL
Example #2
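Builds a vocabulary by delegating to a pretrained Hugging Face tokenizer (albert-base-v2) instead of Ludwig's word tokenizer, and checks that the result matches the tokenizer's 30,000-token vocabulary. Imports are added as in the previous example.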
import pandas as pd

# Same test-suite imports as above; paths may vary between Ludwig versions.
from ludwig.features.text_feature import TextFeatureMixin
from ludwig.utils import strings_utils


def test_create_vocabulary_from_hf():
    data = pd.DataFrame(["Hello, I'm a single sentence!", "And another sentence", "And the very very last one"])
    column = data[0]
    preprocessing_parameters = TextFeatureMixin.preprocessing_defaults()

    # Delegate tokenization to a pretrained Hugging Face tokenizer, so the
    # resulting vocabulary is the tokenizer's own rather than one built from
    # the data.
    vocabulary_output = strings_utils.create_vocabulary(
        column,
        tokenizer_type="hf_tokenizer",
        num_most_frequent=preprocessing_parameters["most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        unknown_symbol=preprocessing_parameters["unknown_symbol"],
        padding_symbol=preprocessing_parameters["padding_symbol"],
        pretrained_model_name_or_path="albert-base-v2",
    )

    # albert-base-v2 ships a 30,000-token vocabulary.
    assert len(vocabulary_output[0]) == 30000
Example #3
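Assembles a config in which every input feature type explicitly uses its preprocessing_defaults(), validates it, merges it with Ludwig's global defaults via merge_with_defaults, and validates the merged config again. The added imports follow the Ludwig test-suite layout and may differ between versions.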
# NOTE: imports follow the Ludwig test-suite layout (the *_feature helpers
# live in tests/integration_tests/utils.py); exact module paths may differ
# between Ludwig versions.
from ludwig.features.audio_feature import AudioFeatureMixin
from ludwig.features.bag_feature import BagFeatureMixin
from ludwig.features.binary_feature import BinaryFeatureMixin
from ludwig.features.category_feature import CategoryFeatureMixin
from ludwig.features.date_feature import DateFeatureMixin
from ludwig.features.h3_feature import H3FeatureMixin
from ludwig.features.image_feature import ImageFeatureMixin
from ludwig.features.number_feature import NumberFeatureMixin
from ludwig.features.sequence_feature import SequenceFeatureMixin
from ludwig.features.set_feature import SetFeatureMixin
from ludwig.features.text_feature import TextFeatureMixin
from ludwig.features.timeseries_feature import TimeseriesFeatureMixin
from ludwig.features.vector_feature import VectorFeatureMixin
from ludwig.schema import validate_config
from ludwig.utils.defaults import merge_with_defaults
from tests.integration_tests.utils import (audio_feature, bag_feature, binary_feature, category_feature,
                                            date_feature, h3_feature, image_feature, number_feature,
                                            sequence_feature, set_feature, text_feature, timeseries_feature,
                                            vector_feature)


def test_validate_with_preprocessing_defaults():
    # Build a config in which every input feature type explicitly uses its
    # preprocessing defaults.
    config = {
        "input_features": [
            audio_feature("/tmp/destination_folder", preprocessing=AudioFeatureMixin.preprocessing_defaults()),
            bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults()),
            binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults()),
            category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults()),
            date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults()),
            h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults()),
            image_feature("/tmp/destination_folder", preprocessing=ImageFeatureMixin.preprocessing_defaults()),
            number_feature(preprocessing=NumberFeatureMixin.preprocessing_defaults()),
            sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults()),
            set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults()),
            text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults()),
            timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults()),
            vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults()),
        ],
        "output_features": [{"name": "target", "type": "category"}],
        "trainer": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }

    # The config must validate both before and after the global defaults are
    # merged in.
    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)