Example no. 1
def audio_feature(folder, **kwargs):
    feature = {
        "name": "audio_" + random_string(),
        "type": "audio",
        "preprocessing": {
            "audio_feature": {
                "type": "fbank",
                "window_length_in_s": 0.04,
                "window_shift_in_s": 0.02,
                "num_filter_bands": 80,
            },
            "audio_file_length_limit_in_s": 3.0,
        },
        "encoder": "stacked_cnn",
        "should_embed": False,
        "conv_layers": [
            {"filter_size": 400, "pool_size": 16, "num_filters": 32, "regularize": "false"},
            {"filter_size": 40, "pool_size": 10, "num_filters": 64, "regularize": "false"},
        ],
        "fc_size": 256,
        "destination_folder": folder,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 2
def h3_feature(**kwargs):
    feature = {
        'name': 'h3_' + random_string(),
        'type': 'h3'
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 3
def numerical_feature(normalization=None, **kwargs):
    feature = {
        "name": "num_" + random_string(),
        "type": "numerical",
        "preprocessing": {"normalization": normalization},
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
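These generators are typically combined to assemble a minimal model config. A usage sketch, assuming the helpers above (and the constants they rely on) are in scope; the temporary folder and the "zscore" normalization value are illustrative assumptions, not taken from these examples:

# Sketch only: assumes audio_feature and numerical_feature from the examples above
# are importable; the destination folder and normalization value are illustrative.
import tempfile

audio_dir = tempfile.mkdtemp()  # throwaway folder for the synthetic audio files

config = {
    "input_features": [audio_feature(audio_dir)],
    "output_features": [numerical_feature(normalization="zscore")],
}
# Each feature dict already carries NAME, COLUMN and PROC_COLUMN, so the config
# can be handed to preprocessing as-is.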
Example no. 4
def vector_feature(**kwargs):
    feature = {
        'type': VECTOR,
        'vector_size': 5,
        'name': 'vector_' + random_string()
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 5
def timeseries_feature(**kwargs):
    feature = {
        'name': 'timeseries_' + random_string(),
        'type': 'timeseries',
        'max_len': 7
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 6
def date_feature(**kwargs):
    feature = {
        "name": "date_" + random_string(),
        "type": "date",
        "preprocessing": {"datetime_format": random.choice(list(DATETIME_FORMATS.keys()))},
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 7
def vector_feature(**kwargs):
    feature = {
        "type": VECTOR,
        "vector_size": 5,
        "name": "vector_" + random_string()
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 8
def timeseries_feature(**kwargs):
    feature = {
        "name": "timeseries_" + random_string(),
        "type": "timeseries",
        "max_len": 7,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 9
def category_feature(**kwargs):
    feature = {
        'type': 'category',
        'name': 'category_' + random_string(),
        'vocab_size': 10,
        'embedding_size': 5
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 10
def category_feature(**kwargs):
    feature = {
        "type": "category",
        "name": "category_" + random_string(),
        "vocab_size": 10,
        "embedding_size": 5,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 11
def numerical_feature(normalization=None, **kwargs):
    feature = {
        'name': 'num_' + random_string(),
        'type': 'numerical',
        'preprocessing': {
            'normalization': normalization
        }
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 12
def bag_feature(**kwargs):
    feature = {
        "name": "bag_" + random_string(),
        "type": "bag",
        "max_len": 5,
        "vocab_size": 10,
        "embedding_size": 5,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 13
def date_feature(**kwargs):
    feature = {
        'name': 'date_' + random_string(),
        'type': 'date',
        'preprocessing': {
            'datetime_format': random.choice(list(DATETIME_FORMATS.keys()))
        }
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 14
def bag_feature(**kwargs):
    feature = {
        'name': 'bag_' + random_string(),
        'type': 'bag',
        'max_len': 5,
        'vocab_size': 10,
        'embedding_size': 5
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 15
def text_feature(**kwargs):
    feature = {
        "name": "text_" + random_string(),
        "type": "text",
        "vocab_size": 5,
        "min_len": 7,
        "max_len": 7,
        "embedding_size": 8,
        "state_size": 8,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
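Because feature.update(kwargs) runs after the defaults are filled in, any keyword argument overrides or extends those defaults. A small sketch; the "parallel_cnn" encoder name is an assumption, not taken from these examples:

# Sketch: keyword arguments override or extend the helper's defaults.
feature = text_feature(encoder="parallel_cnn", max_len=20)  # encoder name assumed
assert feature["max_len"] == 20               # overridden default
assert feature["name"].startswith("text_")    # generated name is kept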
Example no. 16
    def __init__(self, feature, *args, **kwargs):
        super().__init__()

        if NAME not in feature:
            raise ValueError("Missing feature name")
        self.feature_name = feature[NAME]

        if COLUMN not in feature:
            feature[COLUMN] = self.feature_name
        self.column = feature[COLUMN]

        if PROC_COLUMN not in feature:
            feature[PROC_COLUMN] = compute_feature_hash(feature)
        self.proc_column = feature[PROC_COLUMN]
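To make the defaulting behaviour concrete, a hypothetical minimal subclass; the class names are illustrative assumptions, only the constructor above comes from the source:

# Hypothetical sketch: a subclass forwarding the feature dict gets the missing
# COLUMN and PROC_COLUMN keys filled in by the constructor shown above.
class DummyFeature(BaseFeature):  # "BaseFeature" is an assumed base-class name
    pass

feature_def = {NAME: "x", "type": "numerical"}
f = DummyFeature(feature_def)
assert f.column == "x"                              # defaulted from NAME
assert f.proc_column == feature_def[PROC_COLUMN]    # hash computed and written back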
Example no. 17
def image_feature(folder, **kwargs):
    feature = {
        "type": "image",
        "name": "image_" + random_string(),
        "encoder": "resnet",
        "preprocessing": {"in_memory": True, "height": 12, "width": 12, "num_channels": 3},
        "resnet_size": 8,
        "destination_folder": folder,
        "fc_size": 8,
        "num_filters": 8,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 18
def text_feature(**kwargs):
    feature = {
        'name': 'text_' + random_string(),
        'type': 'text',
        'reduce_input': None,
        'vocab_size': 5,
        'min_len': 7,
        'max_len': 7,
        'embedding_size': 8,
        'state_size': 8
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 19
def sequence_feature(**kwargs):
    feature = {
        'type': 'sequence',
        'name': 'sequence_' + random_string(),
        'vocab_size': 10,
        'max_len': 7,
        'encoder': 'embed',
        'embedding_size': 8,
        'fc_size': 8,
        'state_size': 8,
        'num_filters': 8,
        'hidden_size': 8
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 20
def sequence_feature(**kwargs):
    feature = {
        "type": "sequence",
        "name": "sequence_" + random_string(),
        "vocab_size": 10,
        "max_len": 7,
        "encoder": "embed",
        "embedding_size": 8,
        "fc_size": 8,
        "state_size": 8,
        "num_filters": 8,
        "hidden_size": 8,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 21
def image_feature(folder, **kwargs):
    feature = {
        'type': 'image',
        'name': 'image_' + random_string(),
        'encoder': 'resnet',
        'preprocessing': {
            'in_memory': True,
            'height': 12,
            'width': 12,
            'num_channels': 3
        },
        'resnet_size': 8,
        'destination_folder': folder,
        'fc_size': 8,
        'num_filters': 8
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 22
def audio_feature(folder, **kwargs):
    feature = {
        'name': 'audio_' + random_string(),
        'type': 'audio',
        'preprocessing': {
            'audio_feature': {
                'type': 'fbank',
                'window_length_in_s': 0.04,
                'window_shift_in_s': 0.02,
                'num_filter_bands': 80
            },
            'audio_file_length_limit_in_s': 3.0
        },
        'encoder': 'stacked_cnn',
        'should_embed': False,
        'conv_layers': [
            {'filter_size': 400, 'pool_size': 16, 'num_filters': 32, 'regularize': 'false'},
            {'filter_size': 40, 'pool_size': 10, 'num_filters': 64, 'regularize': 'false'}
        ],
        'fc_size': 256,
        'destination_folder': folder
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 23
def _set_proc_column(config: dict) -> None:
    for feature in config["input_features"] + config["output_features"]:
        if PROC_COLUMN not in feature:
            feature[PROC_COLUMN] = compute_feature_hash(feature)
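A quick sketch of the effect, assuming the constants and compute_feature_hash used above are in scope:

# Sketch: after _set_proc_column runs, every feature carries a PROC_COLUMN entry.
config = {
    "input_features": [{"name": "x", "type": "numerical"}],
    "output_features": [{"name": "y", "type": "category"}],
}
_set_proc_column(config)
assert all(PROC_COLUMN in f
           for f in config["input_features"] + config["output_features"])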
Example no. 24
def numerical_feature():
    feature = {NAME: 'x', COLUMN: 'x', 'type': 'numerical'}
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 25
def h3_feature(**kwargs):
    feature = {"name": "h3_" + random_string(), "type": "h3"}
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 26
def numerical_feature():
    feature = {NAME: "x", COLUMN: "x", "type": "numerical"}
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
Example no. 27
def test_encoder(test_case):
    # set up required directories for images if needed
    shutil.rmtree(IMAGE_DIR, ignore_errors=True)
    os.mkdir(IMAGE_DIR)

    # reproducible synthetic data set
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # create synthetic data for the test
    features = [
        test_case.syn_data.feature_generator(
            *test_case.syn_data.feature_generator_args,
            **test_case.syn_data.feature_generator_kwargs)
    ]
    name = features[0][NAME]
    proc_column = compute_feature_hash(features[0])
    features[0][PROC_COLUMN] = proc_column

    data_generator = build_synthetic_dataset(BATCH_SIZE, features)
    data_list = list(data_generator)
    raw_data = [x[0] for x in data_list[1:]]
    df = pd.DataFrame({data_list[0][0]: raw_data})

    # minimal config sufficient to create the input feature
    config = {'input_features': features, 'output_features': []}
    training_set, _, _, training_set_metadata = preprocess_for_training(
        config,
        training_set=df,
        skip_save_processed_input=True,
        random_seed=RANDOM_SEED)

    # run through each type of regularizer for the encoder
    regularizer_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        # start with clean slate and make reproducible
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # set up kwargs for the regularizer parameters
        x_coder_kwargs = dict(
            zip(test_case.regularizer_parm_names,
                len(test_case.regularizer_parm_names) * [regularizer]))

        # combine with the other keyword parameters
        x_coder_kwargs.update(test_case.XCoder_other_parms)
        features[0].update(x_coder_kwargs)

        # shim code to support sequence and sequence-like features
        if features[0]['type'] in SEQUENCE_TYPES.union({'category', 'set'}):
            features[0]['vocab'] = training_set_metadata[name]['idx2str']
            training_set.dataset[proc_column] = \
                training_set.dataset[proc_column].astype(np.int32)

        input_def_obj = build_single_input(features[0], None)

        inputs = training_set.dataset[proc_column]
        # make sure the input is at least a rank-2 tensor
        if len(inputs.shape) == 1:
            inputs = inputs.reshape(-1, 1)

        # special handling for image feature
        if features[0]['type'] == 'image':
            inputs = tf.cast(inputs, tf.float32) / 255

        input_def_obj.encoder_obj(inputs)
        regularizer_loss = tf.reduce_sum(input_def_obj.encoder_obj.losses)
        regularizer_losses.append(regularizer_loss)

    # check regularization loss values
    # None should be zero
    assert regularizer_losses[0] == 0

    # l1, l2 and l1_l2 should be greater than zero
    assert np.all([t > 0.0 for t in regularizer_losses[1:]])

    # with default settings, the l1 + l2 losses should equal the l1_l2 loss
    assert np.isclose(
        regularizer_losses[1].numpy() + regularizer_losses[2].numpy(),
        regularizer_losses[3].numpy())

    # cleanup
    shutil.rmtree(IMAGE_DIR, ignore_errors=True)
Example no. 28
def test_decoder(test_case):
    # reproducible synthetic data set
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # create synthetic data for the test
    features = [
        test_case.syn_data.feature_generator(
            *test_case.syn_data.feature_generator_args,
            **test_case.syn_data.feature_generator_kwargs)
    ]
    feature_name = features[0][NAME]
    proc_column = compute_feature_hash(features[0])
    features[0][PROC_COLUMN] = proc_column

    data_generator = build_synthetic_dataset(BATCH_SIZE, features)
    data_list = list(data_generator)
    raw_data = [x[0] for x in data_list[1:]]
    df = pd.DataFrame({data_list[0][0]: raw_data})

    # create synthetic combiner layer
    combiner_outputs_rank2 = {
        'combiner_output': tf.random.normal(
            [BATCH_SIZE, HIDDEN_SIZE], dtype=tf.float32)
    }

    combiner_outputs_rank3 = {
        'combiner_output': tf.random.normal(
            [BATCH_SIZE, SEQ_SIZE, HIDDEN_SIZE], dtype=tf.float32),
        'encoder_output_state': tf.random.normal(
            [BATCH_SIZE, HIDDEN_SIZE], dtype=tf.float32),
        'lengths': tf.convert_to_tensor(
            np.array(BATCH_SIZE * [SEQ_SIZE]), dtype=tf.int32)
    }

    # minimal config sufficient to create output feature
    config = {'input_features': [], 'output_features': features}
    training_set, _, _, training_set_metadata = preprocess_for_training(
        config,
        training_set=df,
        skip_save_processed_input=True,
        random_seed=RANDOM_SEED)

    # run through each type of regularizer
    regularizer_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        # start with clean slate and make reproducible
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # set up kwargs for the regularizer parameters
        x_coder_kwargs = dict(
            zip(test_case.regularizer_parm_names,
                len(test_case.regularizer_parm_names) * [regularizer]))

        # combine with the other keyword parameters
        x_coder_kwargs.update(test_case.XCoder_other_parms)

        features[0].update(x_coder_kwargs)
        if features[0]['type'] in SEQUENCE_TYPES:
            features[0]['num_classes'] = training_set_metadata[feature_name][
                'vocab_size'] + 1
            training_set.dataset[proc_column] = \
                training_set.dataset[proc_column].astype(np.int32)
            combiner_outputs = combiner_outputs_rank3
        else:
            combiner_outputs = combiner_outputs_rank2

        output_def_obj = build_single_output(features[0], None, None)

        targets = training_set.dataset[proc_column]
        if len(targets.shape) == 1:
            targets = targets.reshape(-1, 1)

        output_def_obj(((combiner_outputs, None), targets),
                       training=True,
                       mask=None)
        regularizer_loss = tf.reduce_sum(output_def_obj.decoder_obj.losses)
        regularizer_losses.append(regularizer_loss)

    # check regularization loss values
    # None should be zero
    assert regularizer_losses[0] == 0

    # l1, l2 and l1_l2 should be greater than zero
    assert np.all([t > 0.0 for t in regularizer_losses[1:]])

    # with default settings, the l1 + l2 losses should equal the l1_l2 loss
    assert np.isclose(
        regularizer_losses[1].numpy() + regularizer_losses[2].numpy(),
        regularizer_losses[3].numpy())