def generate_sequence_training_data():
    input_features = [
        sequence_feature(
            vocab_size=TEST_VOCAB_SIZE,
            embedding_size=TEST_EMBEDDING_SIZE,
            state_size=TEST_STATE_SIZE,
            hidden_size=TEST_HIDDEN_SIZE,
            num_filters=TEST_NUM_FILTERS,
            min_len=5,
            max_len=10,
            encoder="rnn",
            cell_type="lstm",
            reduce_output=None,
        )
    ]

    output_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         decoder="generator",
                         cell_type="lstm",
                         attention="bahdanau",
                         reduce_input=None)
    ]

    # generate a synthetic data set for testing
    dataset = build_synthetic_dataset(
        150,
        copy.deepcopy(input_features) + copy.deepcopy(output_features))
    raw_data = "\n".join([r[0] + "," + r[1] for r in dataset])
    df = pd.read_csv(StringIO(raw_data))

    return df, input_features, output_features
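
A minimal usage sketch for the helper above, assuming Ludwig's LudwigModel API is available (the "training" section and epoch count are illustrative, not taken from the original test):

# Hypothetical usage sketch (not part of the original test): feed the
# synthetic frame and feature specs into a short Ludwig training run.
from ludwig.api import LudwigModel

df, input_features, output_features = generate_sequence_training_data()
config = {
    "input_features": input_features,
    "output_features": output_features,
    "training": {"epochs": 2},
}
model = LudwigModel(config)
model.train(dataset=df)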
Example #2
def generate_sequence_training_data():
    input_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         encoder='rnn',
                         cell_type='lstm',
                         reduce_output=None)
    ]

    output_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         decoder='generator',
                         cell_type='lstm',
                         attention='bahdanau',
                         reduce_input=None)
    ]

    # generate a synthetic data set for testing
    dataset = build_synthetic_dataset(
        150,
        copy.deepcopy(input_features) + copy.deepcopy(output_features))
    raw_data = '\n'.join([r[0] + ',' + r[1] for r in dataset])
    df = pd.read_csv(StringIO(raw_data))

    return df, input_features, output_features
Example #3
def generate_data(
    input_features,
    output_features,
    filename="test_csv.csv",
    num_examples=25,
    nan_percent=0.0,
):
    """Helper method to generate synthetic data based on input, output feature specs.

    :param num_examples: number of examples to generate
    :param input_features: schema
    :param output_features: schema
    :param filename: path to the file where data is stored
    :return:
    """
    features = input_features + output_features
    generator = build_synthetic_dataset(num_examples, features)
    # +1 because the first item yielded is the header row
    data = [next(generator) for _ in range(num_examples + 1)]

    dataframe = pd.DataFrame(data[1:], columns=data[0])
    if nan_percent > 0:
        add_nans_to_df_in_place(dataframe, nan_percent)
    dataframe.to_csv(filename, index=False)

    return filename
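
A usage sketch for generate_data, assuming the sequence_feature and category_feature helpers from Ludwig's test utilities (and pandas as pd) are in scope; the file name, row count, and NaN fraction are illustrative:

# Hypothetical usage sketch (not part of the original helper): write 100
# synthetic rows with 10% NaNs to disk and read them back.
input_features = [sequence_feature(min_len=5, max_len=10, encoder="rnn")]
output_features = [category_feature(vocab_size=3)]
csv_path = generate_data(
    input_features,
    output_features,
    filename="synthetic_train.csv",
    num_examples=100,
    nan_percent=0.1,
)
df = pd.read_csv(csv_path)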
Example #4
def test_build_synthetic_dataset():
    features = [
        {"name": "text", "type": "text"},
        {"name": "category", "type": "category"},
        {"name": "number", "type": "number"},
        {"name": "binary", "type": "binary"},
        {"name": "set", "type": "set"},
        {"name": "bag", "type": "bag"},
        {"name": "sequence", "type": "sequence"},
        {"name": "timeseries", "type": "timeseries"},
        {"name": "date", "type": "date"},
        {"name": "h3", "type": "h3"},
        {"name": "vector", "type": "vector"},
        {"name": "audio", "type": "audio"},
        {"name": "image", "type": "image"},
    ]
    assert len(list(dataset_synthesizer.build_synthetic_dataset(
        100, features))) == 101  # Extra for the header.
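
The extra item counted in the assertion above is the header row, which the generator yields first. A small sketch of consuming it directly, assuming the same dataset_synthesizer import (the exact container type of each yielded item may vary by version):

# Hypothetical sketch (not part of the original test): the first yielded
# item holds the column names, the remaining items are synthetic rows.
gen = dataset_synthesizer.build_synthetic_dataset(
    3, [{"name": "num", "type": "number"}])
header = next(gen)  # column names, e.g. ["num"]
rows = list(gen)    # the 3 synthetic rows
assert len(rows) == 3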
Example #5
def test_decoder(test_case):
    # reproducible synthetic data set
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # create synthetic data for the test
    features = [
        test_case.syn_data.feature_generator(
            *test_case.syn_data.feature_generator_args,
            **test_case.syn_data.feature_generator_kwargs
        )
    ]
    feature_name = features[0]['name']
    data_generator = build_synthetic_dataset(BATCH_SIZE, features)
    data_list = list(data_generator)
    raw_data = [x[0] for x in data_list[1:]]
    df = pd.DataFrame({data_list[0][0]: raw_data})

    # create synthetic combiner outputs
    combiner_outputs_rank2 = {
        'combiner_output': tf.random.normal(
            [BATCH_SIZE, HIDDEN_SIZE],
            dtype=tf.float32
        )
    }

    combiner_outputs_rank3 = {
        'combiner_output': tf.random.normal(
            [BATCH_SIZE, SEQ_SIZE, HIDDEN_SIZE],
            dtype=tf.float32
        ),
        'encoder_output_state': tf.random.normal(
            [BATCH_SIZE, HIDDEN_SIZE],
            dtype=tf.float32
        ),
        'lengths': tf.convert_to_tensor(
            np.array(BATCH_SIZE * [SEQ_SIZE]),
            dtype=tf.int32
        )
    }

    # minimal config sufficient to create the output feature
    config = {'input_features': [], 'output_features': features}
    training_set, _, _, training_set_metadata = preprocess_for_training(
        config,
        training_set=df,
        skip_save_processed_input=True,
        random_seed=RANDOM_SEED
    )

    # run through each type of regularizer
    regularizer_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        # start with a clean slate and make results reproducible
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # set up kwargs for the regularizer parameters
        x_coder_kwargs = dict(
            zip(test_case.regularizer_parm_names,
                len(test_case.regularizer_parm_names) * [regularizer])
        )

        # combine with the other keyword parameters
        x_coder_kwargs.update(test_case.XCoder_other_parms)

        features[0].update(x_coder_kwargs)
        if features[0]['type'] in SEQUENCE_TYPES:
            features[0]['num_classes'] = training_set_metadata[feature_name][
                                             'vocab_size'] + 1
            training_set.dataset[feature_name] = \
                training_set.dataset[feature_name].astype(np.int32)
            combiner_outputs = combiner_outputs_rank3
        else:
            combiner_outputs = combiner_outputs_rank2

        output_def_obj = build_single_output(features[0], None, None)

        targets = training_set.dataset[feature_name]
        if len(targets.shape) == 1:
            targets = targets.reshape(-1, 1)

        output_def_obj(
            (
                (combiner_outputs, None),
                targets
            ),
            training=True,
            mask=None
        )
        regularizer_loss = tf.reduce_sum(output_def_obj.decoder_obj.losses)
        regularizer_losses.append(regularizer_loss)

    # check the regularization loss values
    # no regularizer (None) should yield zero loss
    assert regularizer_losses[0] == 0

    # l1, l2 and l1_l2 should be greater than zero
    assert np.all([t > 0.0 for t in regularizer_losses[1:]])

    # with default settings, the l1 + l2 losses should equal the l1_l2 loss
    assert np.isclose(
        regularizer_losses[1].numpy() + regularizer_losses[2].numpy(),
        regularizer_losses[3].numpy())
Example #6
def test_encoder(test_case):
    # set up a clean directory for image features
    shutil.rmtree(IMAGE_DIR, ignore_errors=True)
    os.mkdir(IMAGE_DIR)

    # reproducible synthetic data set
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # create synthetic data for the test
    features = [
        test_case.syn_data.feature_generator(
            *test_case.syn_data.feature_generator_args,
            **test_case.syn_data.feature_generator_kwargs
        )
    ]
    feature_name = features[0]['name']
    data_generator = build_synthetic_dataset(BATCH_SIZE, features)
    data_list = list(data_generator)
    raw_data = [x[0] for x in data_list[1:]]
    df = pd.DataFrame({data_list[0][0]: raw_data})

    # minimal config sufficient to create the input feature
    config = {'input_features': features, 'output_features': []}
    training_set, _, _, training_set_metadata = preprocess_for_training(
        config,
        training_set=df,
        skip_save_processed_input=True,
        random_seed=RANDOM_SEED
    )

    # run through each type of regularizer for the encoder
    regularizer_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        # start with a clean slate and make results reproducible
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # set up kwargs for the regularizer parameters
        x_coder_kwargs = dict(
            zip(test_case.regularizer_parm_names,
                len(test_case.regularizer_parm_names) * [regularizer])
        )

        # combine with the other keyword parameters
        x_coder_kwargs.update(test_case.XCoder_other_parms)
        features[0].update(x_coder_kwargs)

        # shim code to support sequence and sequence-like features
        if features[0]['type'] in SEQUENCE_TYPES.union({'category', 'set'}):
            features[0]['vocab'] = training_set_metadata[feature_name][
                'idx2str']
            training_set.dataset[feature_name] = \
                training_set.dataset[feature_name].astype(np.int32)

        input_def_obj = build_single_input(features[0], None)

        inputs = training_set.dataset[feature_name]
        # make sure we are at least rank 2 tensor
        if len(inputs.shape) == 1:
            inputs = inputs.reshape(-1, 1)

        # special handling for image feature
        if features[0]['type'] == 'image':
            inputs = tf.cast(inputs, tf.float32) / 255

        input_def_obj.encoder_obj(inputs)
        regularizer_loss = tf.reduce_sum(input_def_obj.encoder_obj.losses)
        regularizer_losses.append(regularizer_loss)

    # check the regularization loss values
    # no regularizer (None) should yield zero loss
    assert regularizer_losses[0] == 0

    # l1, l2 and l1_l2 should be greater than zero
    assert np.all([t > 0.0 for t in regularizer_losses[1:]])

    # with default settings, the l1 + l2 losses should equal the l1_l2 loss
    assert np.isclose(
        regularizer_losses[1].numpy() + regularizer_losses[2].numpy(),
        regularizer_losses[3].numpy())

    # cleanup
    shutil.rmtree(IMAGE_DIR, ignore_errors=True)