Example 1
def main():
    preprocess1 = PreprocessData(PreprocessType.STANDARDISATION_OVER_TENORS,
                                 short_end=True)
    preprocess2 = PreprocessData(PreprocessType.LOG_RETURNS_OVER_TENORS,
                                 short_end=True)

    # 1. get data and apply scaling
    sets_training, sets_test, sets_training_scaled, sets_test_scaled, training_dataset_names, test_dataset_names, maturities = preprocess1.get_data(
    )

    print("sets_test_scaled, sets_training_scaled:", sets_test_scaled[0].shape,
          sets_training_scaled[0].shape)

    # 2: log returns of the scaled data
    sets_encoded_log_training = preprocess2.scale_data(sets_training_scaled,
                                                       training_dataset_names,
                                                       should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_test_scaled,
                                                   test_dataset_names,
                                                   should_fit=True)

    layers = [
        35, 35
    ]  # Number of hidden neurons in each layer of the encoder and decoder

    learning_rate = 0.01
    decay = 0  # Learning rate decay

    num_input_features = 1  # The dimensionality of the input at each time step. In this case a 1D signal.
    num_output_features = 1  # The dimensionality of the output at each time step. In this case a 1D signal.
    # There is no reason for the input sequence to be of the same dimension as the output sequence.

    loss = "mse"  # Other loss functions are possible, see Keras documentation.

    # Regularisation isn't really needed for this application
    lambda_regulariser = 0.000001  # Will not be used if regulariser is None
    regulariser = None  # Possible regulariser: keras.regularizers.l2(lambda_regulariser)

    batch_size = 512
    steps_per_epoch = 200  # batch_size * steps_per_epoch = total number of training examples
    epochs = 10

    input_sequence_length = 42  # Length of the sequence used by the encoder
    target_sequence_length = 42  # Length of the sequence predicted by the decoder
    num_steps_to_predict = 42  # Length to use when testing the model

    model = Model(layers, learning_rate, decay, num_input_features,
                  num_output_features, loss, lambda_regulariser, regulariser,
                  batch_size, steps_per_epoch, epochs, input_sequence_length,
                  target_sequence_length, num_steps_to_predict)
    model.build()
    # model.load()
    model.train(sets_encoded_log_training)
    # model.predict_sequences_simple(np.vstack(sets_training_first_last_tenors))
    model.predict_sequences(sets_encoded_log_training)
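For context, a minimal, illustrative sketch of what the two chained preprocessing steps above are assumed to do (z-scoring each tenor column, then taking log-returns along time); this is not the PreprocessData implementation, and the tenor names and data are made up.

# Illustrative-only sketch (assumptions, not the PreprocessData implementation):
# STANDARDISATION_OVER_TENORS is assumed to z-score each tenor column, and
# LOG_RETURNS_OVER_TENORS is assumed to take log-returns along the time axis.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
curves = pd.DataFrame(rng.uniform(1.0, 3.0, size=(100, 4)),
                      columns=["1M", "3M", "6M", "1Y"])  # hypothetical tenors

standardised = (curves - curves.mean()) / curves.std()   # per-tenor z-score
log_returns = np.log(curves / curves.shift(1)).dropna()  # per-tenor log-returns

print(standardised.shape, log_returns.shape)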
Example 2

    def test_two_preprocessing_methods(self):
        preprocess = PreprocessData(PreprocessType.STANDARDISATION_OVER_TENORS,
                                    short_end=True)
        preprocess2 = PreprocessData(PreprocessType.LOG_RETURNS_OVER_TENORS,
                                     short_end=True)
        sets_training, sets_test, sets_training_scaled, sets_test_scaled, training_dataset_names, test_dataset_names, maturities = preprocess.get_data(
        )

        sets_encoded_log_test = preprocess2.scale_data(sets_test_scaled,
                                                       test_dataset_names,
                                                       should_fit=True)

        # here start_value is required; otherwise rescale_data would fall back to the start value of the original data
        standardised_test_prediction = preprocess2.rescale_data(
            sets_encoded_log_test[0],
            test_dataset_names[0],
            start_value=sets_test_scaled[0][0],
            index=sets_test_scaled[0].index.values)
        rescaled_test_prediction = preprocess.rescale_data(
            standardised_test_prediction, test_dataset_names[0])

        # plotting.plot_2d(sets_test[0], "gain_test_prediction_rescaled", curve2=rescaled_test_prediction, title=True)

        np.testing.assert_allclose(rescaled_test_prediction, sets_test[0])
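The start_value argument matters because log-returns discard the level of the series. A minimal sketch of the forward and inverse transform, assuming rescale_data compounds the returns from start_value:

# Sketch of why rescale_data needs a start_value for LOG_RETURNS_OVER_TENORS
# (an assumption about its behaviour, not the library code): log-returns drop the
# absolute level, so the inverse must compound them from a known starting point.
import numpy as np

series = np.array([1.00, 1.02, 0.99, 1.05, 1.07])
log_ret = np.log(series[1:] / series[:-1])          # forward transform loses series[0]

start_value = series[0]
reconstructed = start_value * np.exp(np.cumsum(log_ret))
reconstructed = np.insert(reconstructed, 0, start_value)

np.testing.assert_allclose(reconstructed, series)   # round trip recovers the original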
Example 3

def simulate(latent_dim=2,
             preprocess_type1=None,
             preprocess_type2=None,
             ae_model=None,
             gan_model=None,
             force_training=True,
             plot=False):
    preprocess1 = PreprocessData(preprocess_type1)
    preprocess2 = PreprocessData(preprocess_type2)

    # 1. get data and apply scaling
    sets_training, sets_test, sets_training_scaled, sets_test_scaled, training_dataset_names, test_dataset_names, maturities = preprocess1.get_data(
    )

    if ae_model is AEModel.AAE:
        ae_params = {
            'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
            'input_dim': sets_training_scaled[0].shape[1],  # 56
            'latent_dim': latent_dim,
            'hidden_layers': (
                56,
                40,
                28,
                12,
                4,
            ),
            'hidden_layers_discriminator': (
                2,
                2,
            ),
            'leaky_relu': 0.1,
            'last_activation': 'linear',
            'last_activation_discriminator': 'sigmoid',
            'loss_generator': 'mean_squared_error',
            'loss_discriminator': 'binary_crossentropy',
            'batch_size': 20,
            'epochs': 20000
        }
        ae_params_hash = hashlib.md5(
            json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()
        autoencoder = AdversarialAutoencoder(ae_params, plot=False)
    elif ae_model is AEModel.VAE:
        ae_params = {
            'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
            'input_dim': sets_training_scaled[0].shape[1],  # 56
            'latent_dim': latent_dim,
            'hidden_layers': (
                56,
                40,
                28,
                12,
                4,
            ),
            'leaky_relu': 0.1,
            'last_activation': 'linear',  # sigmoid or linear
            'loss': 'mean_square_error',  # binary_crossentropy or mean_square_error
            'epsilon_std': 1.0,
            'batch_size': 20,
            'epochs': 100,
            'steps_per_epoch': 500
        }
        ae_params_hash = hashlib.md5(
            json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()
        autoencoder = VariationalAutoencoder(ae_params, plot=False)
    elif ae_model is AEModel.AE:
        ae_params = {
            'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
            'input_dim': sets_training_scaled[0].shape[1],  # 56
            'latent_dim': latent_dim,
            'hidden_layers': (
                56,
                40,
                28,
                12,
                4,
            ),
            'leaky_relu': 0.1,
            'loss': 'mse',
            'last_activation': 'linear',
            'batch_size': 20,
            'epochs': 100,
            'steps_per_epoch': 500
        }
        ae_params_hash = hashlib.md5(
            json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()
        autoencoder = Autoencoder(ae_params, plot=False)
    else:  # elif ae_model is AEModel.PCA:
        ae_params = {
            'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
            'latent_dim': latent_dim
        }
        ae_params_hash = hashlib.md5(
            json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()
        autoencoder = PCAModel(ae_params, plot=False)

    # 2. train/load autoencoder
    autoencoder.load_else_train(np.vstack(sets_training_scaled),
                                sets_test_scaled, "ae_" + ae_params_hash)

    # 2: encode data using autoencoder
    sets_encoded_training = autoencoder.encode(sets_training_scaled)
    sets_encoded_test = autoencoder.encode(sets_test_scaled)

    # 3: log returns of encoded data
    sets_encoded_log_training = preprocess2.scale_data(sets_encoded_training,
                                                       training_dataset_names,
                                                       should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_encoded_test,
                                                   test_dataset_names,
                                                   should_fit=True)

    num_z = 6 * 7
    num_c = 6 * 7
    num_o = 6 * 7
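    # Note (assumption, not stated in the original): 6 * 7 = 42 observations each for
    # the conditioning window (num_c), the latent noise length (num_z) and the
    # generated output length (num_o).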
    if gan_model is GANModel.WGAN:
        gan_params = {
            'ae_params_hash': ae_params_hash,
            'num_tenors': sets_encoded_log_training[0].shape[1],
            'num_c': num_c,
            'num_z': num_z,
            'num_o': num_o,
            'gen_model_type': 'standard',  # conv
            'dis_model_type': 'standard',  # conv
            'gen_layers': (4 * (6 * 7 * 2), ),  # 4 * num_o * num_tenors
            'dis_layers': (4 * (6 * 7), ),  # 4 * num_o
            'gen_last_activation': 'tanh',
            'dis_last_activation': 'sigmoid',
            'loss': 'binary_crossentropy',
            'batch_size': 32,
            'epochs': 10000,
            'sample_interval': 1000
        }
        gan_params_hash = hashlib.md5(
            json.dumps(gan_params,
                       sort_keys=True).encode('utf-8')).hexdigest()
        gan = CWGANGP(gan_params, plot=False)
    else:
        if gan_model is GANModel.GAN_CONV:
            model_type = 'conv'
        else:  # if gan_model is GANModel.GAN:
            model_type = 'standard'

        gan_params = {
            'ae_params_hash': ae_params_hash,
            'num_tenors': sets_encoded_log_training[0].shape[1],
            'num_c': num_c,
            'num_z': num_z,
            'num_o': num_o,
            'gen_model_type': model_type,  # conv
            'dis_model_type': model_type,  # conv
            'gen_layers': (4 * (6 * 7 * 2), ),  # 4 * num_o * num_tenors
            'dis_layers': (4 * (6 * 7), ),  # 4 * num_o
            'gen_last_activation': 'tanh',
            'dis_last_activation': 'sigmoid',
            'loss': 'binary_crossentropy',
            'batch_size': 128,
            'epochs': 20000
        }
        gan_params_hash = hashlib.md5(
            json.dumps(gan_params,
                       sort_keys=True).encode('utf-8')).hexdigest()
        gan = GAN(gan_params,
                  plot=False)  # try training on larger input and output

    if force_training:
        gan.train(sets_encoded_log_training, "gan_" + gan_params_hash)
    else:
        gan.load_else_train(sets_encoded_log_training,
                            "gan_" + gan_params_hash)

    # 4: simulate on encoded log returns, conditioned on test dataset
    num_simulations = 100
    num_repeats = 1
    generated, _ = gan.generate(condition=sets_encoded_log_test[-1],
                                condition_on_end=False,
                                num_simulations=num_simulations,
                                repeat=num_repeats)

    # insert the last real futures curve in order to do rescaling
    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        generated = np.insert(generated,
                              0,
                              sets_encoded_log_test[-1].iloc[num_c],
                              axis=1)

    # 5: undo scaling
    encoded_generated = preprocess2.rescale_data(
        generated,
        start_value=sets_encoded_test[-1][num_c],
        dataset_name=test_dataset_names[-1])
    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        encoded_generated = encoded_generated[:, 1:]  # remove first curve again

    # 6: decode using autoencoder
    decoded_generated_segments = autoencoder.decode(encoded_generated)

    # 7: undo the first preprocessing step (which may itself be log-returns)
    simulated = preprocess1.rescale_data(decoded_generated_segments,
                                         start_value=sets_test[-1].iloc[num_c],
                                         dataset_name=test_dataset_names[-1])

    preprocess1.enable_curve_smoothing = True
    simulated_smooth = preprocess1.rescale_data(
        decoded_generated_segments,
        start_value=sets_test[-1].iloc[num_c],
        dataset_name=test_dataset_names[-1])

    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        real = sets_test[-1].iloc[num_c:num_c + num_o * num_repeats + 1]  # `+1` because the log-returns transform consumes one observation
    else:
        real = sets_test[-1].iloc[num_c:num_c + num_o * num_repeats + 1]

    print("simulated, real", simulated.shape, real.shape)

    smape_result = smape(simulated, real)
    smape_result_smooth = smape(simulated_smooth, real)

    print("smape_result_smooth mean and std:", np.mean(smape_result_smooth),
          np.std(smape_result_smooth))

    if plot:
        plotting = Plotting()
        plotting.plot_3d("real", real, show_title=False)

        cov_log_returns = cov_log_returns_over_tenors(real)
        plotting.plot_3d_cov("gan_real_cov", cov_log_returns, show_title=False)

        for i in np.arange(1, 11):
            # name =  '_' + preprocess_type1.name + '_' + preprocess_type2.name + '_' + str(latent_dim) + '_' + ae_model.name + '_'+ gan_model.name
            plotting.plot_3d("gan_simulated_" + str(i),
                             simulated_smooth[i],
                             maturities=maturities,
                             time=real.index.values,
                             show_title=False)
            smape_result = smape(simulated_smooth[i], real)
            print("simulated_smooth[i], real", simulated_smooth[i].shape,
                  real.shape)
            print("simulate rates", i)
            print("smape:", smape_result)
            print("=============\n")

            cov_log_returns = cov_log_returns_over_tenors(simulated_smooth[i])
            plotting.plot_3d_cov("gan_simulated_" + str(i) + "_cov",
                                 cov_log_returns,
                                 maturities=maturities,
                                 show_title=False)

    return smape_result_smooth
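A possible way to drive this simulate function, using only enum members that appear elsewhere in these examples; the sweep itself is illustrative and not part of the original code, and it assumes the same project imports as the example above (numpy as np, PreprocessType, AEModel, GANModel).

# Hypothetical driver for the simulate() defined above (an illustration, not the
# project's own entry point).
if __name__ == '__main__':
    for ae_model in (AEModel.PCA, AEModel.AE, AEModel.VAE, AEModel.AAE):
        for gan_model in (GANModel.GAN, GANModel.GAN_CONV, GANModel.WGAN):
            smape_smooth = simulate(latent_dim=2,
                                    preprocess_type1=PreprocessType.STANDARDISATION_OVER_TENORS,
                                    preprocess_type2=PreprocessType.LOG_RETURNS_OVER_TENORS,
                                    ae_model=ae_model,
                                    gan_model=gan_model,
                                    force_training=False,
                                    plot=False)
            print(ae_model.name, gan_model.name,
                  "SMAPE (smooth):", np.mean(smape_smooth), np.std(smape_smooth))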
Example 4
def simulate():
    plotting = Plotting()
    preprocess_minmax = PreprocessData()
    preprocess_logreturns = PreprocessData()
    preprocess_minmax.enable_min_max_scaler = True
    preprocess_logreturns.enable_log_returns = True

    # 1. get data and apply min-max scaling
    sets_training, sets_test, sets_training_scaled, sets_test_scaled, training_dataset_names, test_dataset_names, maturities = preprocess_minmax.get_data(
    )

    print("sets_training_scaled.shape", sets_training_scaled[0].shape)

    autoencoder = DeepAutoencoder(
        input_shape=(sets_training_scaled[0].shape[1], ), latent_dim=2)
    # autoencoder.train(np.vstack(sets_training_scaled), sets_test_scaled, epochs=100, batch_size=5)
    # autoencoder.save_model("deep_general_minimax")
    autoencoder.load_model("deep_general_minimax")

    # 2: encode data using autoencoder
    sets_encoded_training = []
    for set_training_scaled in sets_training_scaled:
        sets_encoded_training.append(autoencoder.encode(set_training_scaled))

    sets_encoded_test = []
    for set_test_scaled in sets_test_scaled:
        sets_encoded_test.append(autoencoder.encode(set_test_scaled))

    plotting.plot_2d(sets_encoded_test[0],
                     "encoded test data with deep autoencoder",
                     save=False)

    # 3: log returns of encoded data
    sets_encoded_log_training = []
    for index, set_encoded_training in enumerate(sets_encoded_training):
        sets_encoded_log_training.append(
            preprocess_logreturns.scale_data(set_encoded_training))

    sets_encoded_log_test = []
    for index, set_encoded_test in enumerate(sets_encoded_test):
        sets_encoded_log_test.append(
            preprocess_logreturns.scale_data(set_encoded_test))

    plotting.plot_2d(
        sets_encoded_log_test[0],
        "encoded test data with deep autoencoder, then log returns",
        save=False)

    num_tenors = sets_encoded_log_training[0].shape[1]
    gan = GAN(num_c=6 * 7, num_z=6 * 7, num_o=6 * 7,
              num_tenors=num_tenors)  # try training on larger input and output
    # gan.train(sets_encoded_log_training, epochs=20000, batch_size=100, sample_interval=200)
    # gan.save_model("general_ae")
    gan.load_model("general_ae")

    print("sets_encoded_log_test[0].shape", sets_encoded_log_test[0].shape)

    test_arr = np.full([1, 6 * 7 + 6 * 7, num_tenors], 10)

    validity = gan.discriminator.predict(test_arr)  # cf. np.array(sets_encoded_log_test[0])
    print(validity)

    rolled_encoded_log_test = rolling_windows(sets_encoded_log_test[0],
                                              6 * 7 + 6 * 7)

    validity = gan.discriminator.predict(rolled_encoded_log_test)
    print(validity)
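rolling_windows is called above but not defined in this snippet. A plausible stand-in, assuming it builds overlapping windows of the requested length along the time axis:

# Stand-in for the rolling_windows helper used above (an assumed implementation,
# not the project's code): overlapping windows of length `window` along axis 0.
import numpy as np

def rolling_windows(data, window):
    data = np.asarray(data)
    # result shape: (num_windows, window, num_tenors)
    return np.stack([data[i:i + window] for i in range(len(data) - window + 1)])

example = np.arange(10 * 3).reshape(10, 3)
print(rolling_windows(example, 4).shape)  # (7, 4, 3)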
Example 5

def simulate(latent_dim=2, preprocess_type1=None, preprocess_type2=None, ae_model=None, plot=False):
    preprocess1 = PreprocessData(preprocess_type1)
    preprocess2 = PreprocessData(preprocess_type2)

    # 1. get data and apply scaling
    sets_training, sets_test, sets_training_scaled, sets_test_scaled, training_dataset_names, test_dataset_names, maturities = preprocess1.get_data()

    if ae_model is AEModel.AAE:
        ae_params = {'preprocess_type': preprocess_type1.value, # only to make preprocess_type part of the hash
                     'input_dim': sets_training_scaled[0].shape[1],  # 56
                     'latent_dim': latent_dim,
                     'hidden_layers': (56, 40, 28, 12, 4,),
                     'hidden_layers_discriminator': (2, 2, ),
                     'leaky_relu': 0.1,
                     'last_activation': 'linear',
                     'last_activation_discriminator': 'sigmoid',
                     'loss_generator': 'mean_squared_error',
                     'loss_discriminator': 'binary_crossentropy',
                     'batch_size': 20,
                     'epochs': 20000}
        ae_params_hash = hashlib.md5(json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()
        autoencoder = AdversarialAutoencoder(ae_params, plot=False)
    elif ae_model is AEModel.VAE:
        ae_params = {'preprocess_type': preprocess_type1.value, # only to make preprocess_type part of the hash
                     'input_dim': sets_training_scaled[0].shape[1],  # 56
                     'latent_dim': latent_dim,
                     'hidden_layers': (56, 40, 28, 12, 4,),
                     'leaky_relu': 0.1,
                     'last_activation': 'linear',  # sigmoid or linear
                     'loss': 'mean_square_error',  # binary_crossentropy or mean_square_error
                     'epsilon_std': 1.0,
                     'batch_size': 20,
                     'epochs': 100,
                     'steps_per_epoch': 500}
        ae_params_hash = hashlib.md5(json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()
        autoencoder = VariationalAutoencoder(ae_params, plot=False)
    elif ae_model is AEModel.AE:
        ae_params = {'preprocess_type': preprocess_type1.value, # only to make preprocess_type part of the hash
                     'input_dim': sets_training_scaled[0].shape[1], # 56
                     'latent_dim': latent_dim,
                     'hidden_layers': (56, 40, 28, 12, 4,),
                     'leaky_relu': 0.1,
                     'loss': 'mse',
                     'last_activation': 'linear',
                     'batch_size': 20,
                     'epochs': 100,
                     'steps_per_epoch': 500}
        ae_params_hash = hashlib.md5(json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()
        autoencoder = Autoencoder(ae_params, plot=False)
    else: # elif ae_model is AEModel.PCA:
        ae_params = {'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
                     'latent_dim': latent_dim }
        ae_params_hash = hashlib.md5(json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()
        autoencoder = PCAModel(ae_params, plot=False)

    # 2. train/load autoencoder
    autoencoder.load_else_train(np.vstack(sets_training_scaled), sets_test_scaled, "ae_" + ae_params_hash)

    # 2: encode data using autoencoder
    sets_encoded_training = autoencoder.encode(sets_training_scaled)
    sets_encoded_test = autoencoder.encode(sets_test_scaled)

    # 3: log returns of encoded data
    sets_encoded_log_training = preprocess2.scale_data(sets_encoded_training, training_dataset_names, should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_encoded_test, test_dataset_names, should_fit=True)

    print("="*20)
    print(ae_model.name)
    print("\n")
    for set_encoded_log_training, training_dataset_name in zip(sets_encoded_log_training, training_dataset_names):
        print(training_dataset_name)
        print("min:", np.min(set_encoded_log_training.min()), "max:", np.max(set_encoded_log_training.max()))

    print("\n")

    for set_encoded_log_test, test_dataset_name in zip(sets_encoded_log_test, test_dataset_names):
        print(test_dataset_name)
        print("min:", np.min(set_encoded_log_test.min()), "max:", np.max(set_encoded_log_test.max()))

    print("\n")
    print("=" * 20)
Example 6

def simulate(latent_dim=2,
             preprocess_type1=None,
             preprocess_type2=None,
             ae_model=None,
             gan_model=None,
             force_training=True,
             plot=False):
    preprocess1 = PreprocessData(preprocess_type1, short_end=True)
    preprocess2 = PreprocessData(preprocess_type2, short_end=True)

    # 1. get data and apply scaling
    sets_training, sets_test, sets_training_scaled, sets_test_scaled, training_dataset_names, test_dataset_names, maturities = preprocess1.get_data(
    )

    print("sets_test_scaled, sets_training_scaled:", sets_test_scaled[0].shape,
          sets_training_scaled[0].shape)

    # 2: log returns of the scaled data
    sets_encoded_log_training = preprocess2.scale_data(sets_training_scaled,
                                                       training_dataset_names,
                                                       should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_test_scaled,
                                                   test_dataset_names,
                                                   should_fit=True)

    num_c = 6 * 7
    num_o = 6 * 7
    if gan_model is GANModel.WGAN:
        gan_params = {
            'short_end_encoding': preprocess_type1.name + "_" + preprocess_type2.name,
            'num_tenors': sets_encoded_log_training[0].shape[1],
            'num_c': 6 * 7,
            'num_z': 6 * 7,
            'num_o': 6 * 7,
            'gen_model_type': 'standard',  # conv
            'dis_model_type': 'standard',  # conv
            'gen_layers': (4 * (6 * 7 * 2), ),  # 4 * num_o * num_tenors
            'dis_layers': (4 * (6 * 7), ),  # 4 * num_o
            'gen_last_activation': 'tanh',
            'dis_last_activation': 'sigmoid',
            'loss': 'binary_crossentropy',
            'batch_size': 32,
            'epochs': 10000,
            'sample_interval': 1000
        }
        gan_params_hash = hashlib.md5(
            json.dumps(gan_params,
                       sort_keys=True).encode('utf-8')).hexdigest()
        gan = CWGANGP(gan_params, plot=False)
    else:
        if gan_model is GANModel.GAN_CONV:
            model_type = 'conv'
        else:  # if gan_model is GANModel.GAN:
            model_type = 'standard'

        print("num tenors:", sets_encoded_log_training[0].shape[1])

        gan_params = {
            'short_end_encoding': preprocess_type1.name + "_" + preprocess_type2.name,
            'num_tenors': sets_encoded_log_training[0].shape[1],
            'num_c': num_c,
            'num_z': 6 * 7,
            'num_o': num_o,
            'gen_model_type': model_type,  # conv
            'dis_model_type': model_type,  # conv
            'gen_layers': (4 * (6 * 7 * 2), ),  # 4 * num_o * num_tenors
            'dis_layers': (4 * (6 * 7), ),  # 4 * num_o
            'gen_last_activation': 'tanh',
            'dis_last_activation': 'sigmoid',
            'loss': 'binary_crossentropy',
            'batch_size': 128,
            'epochs': 20000
        }
        gan_params_hash = hashlib.md5(
            json.dumps(gan_params,
                       sort_keys=True).encode('utf-8')).hexdigest()
        gan = GAN(gan_params,
                  plot=False)  # try training on larger input and output

    if force_training:
        gan.train(sets_encoded_log_training, "gan_" + gan_params_hash)
    else:
        gan.load_else_train(sets_encoded_log_training,
                            "gan_" + gan_params_hash)

    # 4: simulate on encoded log returns, conditioned on test dataset
    num_simulations = 100
    num_repeats = 0

    print("sets_encoded_log_test[-1]", sets_encoded_log_test[-1].shape)

    generated, _ = gan.generate(condition=sets_encoded_log_test[-1],
                                condition_on_end=False,
                                num_simulations=num_simulations,
                                repeat=num_repeats)

    # insert the last real futures curve in order to do rescaling
    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        generated = np.insert(generated,
                              0,
                              sets_encoded_log_test[-1].iloc[num_c],
                              axis=1)

    print("sets_test_scaled[-1]", sets_test_scaled[-1].shape)
    print("sets_test_scaled[-1][num_c]", sets_test_scaled[-1].iloc[num_c])

    # 5: undo scaling
    encoded_generated = preprocess2.rescale_data(
        generated,
        start_value=sets_test_scaled[-1].iloc[num_c],
        dataset_name=test_dataset_names[-1])
    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        encoded_generated = encoded_generated[:, 1:]  # remove first curve again

    # 7: undo the first preprocessing step (which may itself be log-returns)
    simulated = preprocess1.rescale_data(encoded_generated,
                                         start_value=sets_test[-1].iloc[num_c],
                                         dataset_name=test_dataset_names[-1])

    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        real = np.array(sets_test[-1])[num_c:num_c + num_o + 1]  # `+1` because the log-returns transform consumes one observation
    else:
        real = np.array(sets_test[-1])[num_c:num_c + num_o + 1]

    sim = simulated.reshape(100, 43)

    print("sets_test[-1].iloc[num_c], sim[0][0]", sets_test[-1].iloc[num_c],
          sim[0][0], sim[1][0], sim[2][0])
    print("real, simulated", real.shape, sim.shape)

    smape_result = smape(sim, real, over_curves=True)

    if plot:
        condition_and_real = sets_test[-1].iloc[0:num_c + num_o + 1]
        plotting = Plotting()
        plotting.plot_training_sample("simulated_simple",
                                      sim,
                                      condition_and_real,
                                      num_c,
                                      after_real_data=True)

        # print("smape test:", smape(simulated[0], real), smape_result)

    return smape_result
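smape is used throughout these examples but never shown. For reference, a common definition of the symmetric mean absolute percentage error; whether the project's helper (and its over_curves flag, which appears to return one value per curve) uses exactly this form and scaling is an assumption.

# Reference sketch of SMAPE; the 100x scaling and the /2 in the denominator are one
# common convention, not necessarily the project's.
import numpy as np

def smape_sketch(forecast, actual):
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)
    denom = (np.abs(actual) + np.abs(forecast)) / 2.0
    return 100.0 * np.mean(np.abs(forecast - actual) / denom)

print(smape_sketch([1.0, 2.0, 3.0], [1.1, 1.9, 3.2]))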
Example 7
def simulate():
    plotting = Plotting()
    preprocess_normalisation = PreprocessData()
    preprocess_logreturns = PreprocessData()
    preprocess_normalisation.enable_normalisation_scaler = True
    preprocess_logreturns.enable_log_returns = True

    # 1. get data and apply pre-processing
    sets_training, sets_test, sets_training_scaled, sets_test_scaled, training_dataset_names, test_dataset_names, maturities = preprocess_normalisation.get_data()

    ae_params = { 'preprocess_type': PreprocessType.NORMALISATION_OVER_TENORS.value,
                  'input_dim': (10, sets_training_scaled[0].shape[1],), # 56
                  'latent_dim': 2*56,
                  'hidden_layers': (12*56, 4*56, ),
                  'leaky_relu': 0.1,
                  'loss': 'mse',
                  'last_activation': 'linear',
                  'batch_size': 5,
                  'epochs': 5,
                  'steps_per_epoch': 500}

    ae_params_hash = hashlib.md5(json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()

    autoencoder = Autoencoder(ae_params)
    # autoencoder.train(np.vstack(sets_training_scaled), sets_test_scaled)
    # autoencoder.save_model("ae_" + ae_params_hash)
    autoencoder.load_else_train(sets_training_scaled, sets_test_scaled, "ae_" + ae_params_hash)

    # 2: encode data using autoencoder
    sets_encoded_training = autoencoder.encode(sets_training_scaled)
    sets_encoded_test = autoencoder.encode(sets_test_scaled)

    print("sets_encoded_test", sets_encoded_test[0].shape)
    plotting.plot_2d(sets_encoded_test[0], "encoded test data with deep autoencoder", save=False)

    # 3: log returns of encoded data
    sets_encoded_log_training = preprocess_logreturns.scale_data(sets_encoded_training)
    sets_encoded_log_test = preprocess_logreturns.scale_data(sets_encoded_test)

    plotting.plot_2d(sets_encoded_log_test[0], "encoded test data with deep autoencoder, then log returns", save=False)

    num_c = 6*7
    num_o = 6*7
    gan_params = {'ae_params_hash': ae_params_hash,
                  'num_tenors': sets_encoded_log_training[0].shape[1],
                  'num_c': num_c,
                  'num_z': 6*7,
                  'num_o': num_o,
                  'gen_model_type': 'standard', # conv
                  'dis_model_type': 'standard', # conv
                  'gen_layers': (4*(6*7*2),), # 4 * num_o * num_tenors
                  'dis_layers': (4*(6*7),), # 4 * num_o
                  'gen_last_activation': 'tanh',
                  'dis_last_activation': 'sigmoid',
                  'loss': 'binary_crossentropy',
                  'batch_size': 128,
                  'epochs': 20000}
    gan_params_hash = hashlib.md5(json.dumps(gan_params, sort_keys=True).encode('utf-8')).hexdigest()

    gan = GAN(gan_params)  # try training on larger input and output
    # gan.train(sets_encoded_log_training, sample_interval=200)
    # gan.save_model("gan_" + gan_params_hash)
    gan.load_model("gan_" + gan_params_hash)

    # COV TEST, TEMPORARY
    # for name, set in zip(training_dataset_names, sets_training):
    #     print("name:", name)
    #     set_cov_log_returns_over_features = cov_log_returns_over_features(set)
    #     plotting.plot_3d_cov("covariance_time_series_" + name, set_cov_log_returns_over_features, show_title=False)
    #     plotting.plot_3d("time_series_" + name, set, maturities)
    # END COV TEST.

    # 4: simulate on encoded log returns, conditioned on test dataset
    num_simulations = 10
    num_repeats = 0
    generated, _ = gan.generate(condition=sets_encoded_log_test[-1], condition_on_end=False, num_simulations=num_simulations, repeat=num_repeats)

    # insert the last real futures curve in order to do rescaling
    print("sets_encoded_log_test[-1][num_c] shape", sets_encoded_log_test[-1].iloc[num_c].shape)
    print("generated_segments.shape", generated.shape)
    generated = np.insert(generated, 0, sets_encoded_log_test[-1].iloc[num_c], axis=0)

    # 5: undo log-returns # todo: this start_value is actually one off! Error still persists... autoencoder causing the difference?
    encoded_generated = preprocess_logreturns.rescale_data(generated, start_value=sets_encoded_test[-1][num_c])
    encoded_generated = encoded_generated[:, 1:] # remove first curve again
    # 6: decode using autoencoder
    decoded_generated_segments = autoencoder.decode(encoded_generated)

    # 7: undo the normalisation, for now only the first simulation
    simulated = preprocess_normalisation.rescale_data(decoded_generated_segments, dataset_name=test_dataset_names[-1])

    preprocess_normalisation.enable_curve_smoothing = True
    simulated_smooth = preprocess_normalisation.rescale_data(decoded_generated_segments, dataset_name=test_dataset_names[-1])

    real = np.array(sets_test[-1])[num_c:num_c + num_o]

    print("simulated, real", simulated.shape, real.shape)

    smape_result = smape(simulated, real)
    smape_result_smooth = smape(simulated_smooth, real)
    print("smape_result and smooth", smape_result, smape_result_smooth)
    print("smape_resul_smooth", smape_result_smooth)
Example 8
from helpers.plotting import Plotting
from imputance.gain_model import gain
import numpy as np
import matplotlib.pyplot as plt

if __name__ == '__main__':
    plotting = Plotting()
    preprocess = PreprocessData(PreprocessType.STANDARDISATION_OVER_TENORS,
                                short_end=True)
    preprocess2 = PreprocessData(PreprocessType.LOG_RETURNS_OVER_TENORS,
                                 short_end=True)
    sets_training, sets_test, sets_training_scaled, sets_test_scaled, training_dataset_names, test_dataset_names, maturities = preprocess.get_data(
    )

    sets_encoded_log_training = preprocess2.scale_data(sets_training_scaled,
                                                       training_dataset_names,
                                                       should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_test_scaled,
                                                   test_dataset_names,
                                                   should_fit=True)

    train = sets_encoded_log_training[0].copy()
    test = sets_encoded_log_test[0].copy()

    # print("train.shape[1]", train.shape[1])
    # print("sets_test_scaled[0]", sets_test_scaled[0].shape)
    # print("sets_encoded_log_test[0]", sets_encoded_log_test[0].shape)

    params = {
        'mb_size': 128,  # 'mb_size': 128,
        'p_miss': 0.5,  #  'p_miss': 0.5, doesn't do anything
Example 9

def simulate():
    plotting = Plotting()
    preprocess_type = PreprocessType.STANDARDISATION_OVER_TENORS
    preprocess = PreprocessData(preprocess_type)

    # 1. get data and apply standardisation
    sets_training, sets_test, sets_training_scaled, sets_test_scaled, training_dataset_names, test_dataset_names, maturities = preprocess.get_data(
    )
    all_training_scaled = np.vstack(sets_training_scaled)

    ae_params = {
        'preprocess_type': preprocess_type.value,  # only to make preprocess_type part of the hash
        'input_dim': sets_training_scaled[0].shape[1],  # 56
        'latent_dim': 2,
        'hidden_layers': (
            56,
            40,
            28,
            12,
            4,
        ),
        'leaky_relu': 0.1,
        'loss': 'mse',
        'last_activation': 'linear',
        'batch_size': 20,
        'epochs': 100,
        'steps_per_epoch': 500
    }
    ae_params_hash = hashlib.md5(
        json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()

    autoencoder = Autoencoder(ae_params)
    autoencoder.load_else_train(all_training_scaled, sets_test_scaled,
                                "ae_" + ae_params_hash)

    # 2: encode data using autoencoder

    encoded = autoencoder.encode(sets_test_scaled[0])
    decoded = autoencoder.decode(encoded)

    rescaled = preprocess.rescale_data(decoded,
                                       dataset_name=test_dataset_names[0])
    smape_result = smape(rescaled, np.array(sets_test[0]), over_curves=True)

    print("smape_result test set", np.mean(smape_result), np.std(smape_result),
          np.min(smape_result), np.max(smape_result))

    plotting.plot_2d(sets_test[0],
                     "evaluation of test curves",
                     timeseries=True,
                     evaluation=smape_result,
                     title=False)

    # for i in np.arange(len(test_eval)):
    #     if test_eval[i] > 4:
    #         plotting.plot_2d(sets_test_scaled[0][i], "Possible unrealistic curve" + str(i), save=False, title=True)

    # 3: let's see how well the autoencoder handles unrealistic curves
    # todo: generate random curves, THEN apply min-max feature scaling, THEN evaluate
    unrealistic_curves = []
    curve_shape = 56
    unrealistic_curves.append(np.full(curve_shape, 5))
    unrealistic_curves.append(np.full(curve_shape, 10))
    unrealistic_curves.append(np.full(curve_shape, 20))
    unrealistic_curves.append(np.full(curve_shape, 50))
    unrealistic_curves.append(np.full(curve_shape, 70))
    unrealistic_curves.append(np.full(curve_shape, 100))
    unrealistic_curves.append(np.full(curve_shape, 150))
    unrealistic_curves.append(np.full(curve_shape, 200))
    unrealistic_curves.append(np.full(curve_shape, 250))
    unrealistic_curves.append(np.full(curve_shape, 300))
    unrealistic_curves.append(
        np.hstack((np.full(int(curve_shape / 2),
                           50), np.full(int(curve_shape / 2), 150))))
    unrealistic_curves.append(
        np.hstack((np.full(int(curve_shape / 2),
                           100), np.full(int(curve_shape / 2), 150))))
    unrealistic_curves.append(
        np.hstack((np.full(int(curve_shape / 2),
                           100), np.full(int(curve_shape / 2), 200))))
    unrealistic_curves.append(np.random.uniform(0, 10, curve_shape))
    unrealistic_curves.append(np.random.uniform(10, 70, curve_shape))
    unrealistic_curves.append(np.random.uniform(0, 100, curve_shape))
    unrealistic_curves.append(np.random.uniform(100, 200, curve_shape))
    unrealistic_curves.append(np.random.uniform(200, 300, curve_shape))
    unrealistic_curves.append(np.random.uniform(0, 200, curve_shape))
    unrealistic_curves.append(np.random.uniform(0, 250, curve_shape))
    unrealistic_curves.append(np.random.uniform(0, 300, curve_shape))
    unrealistic_curves.append(np.linspace(0, 100, num=curve_shape))
    unrealistic_curves.append(np.linspace(50, 150, num=curve_shape))
    unrealistic_curves.append(np.linspace(100, 200, num=curve_shape))
    unrealistic_curves.append(np.linspace(150, 250, num=curve_shape))
    unrealistic_curves.append(np.linspace(200, 300, num=curve_shape))
    unrealistic_curves.append(np.linspace(0, 200, num=curve_shape))
    unrealistic_curves.append(np.linspace(0, 300, num=curve_shape))
    unrealistic_curves.append(np.linspace(100, 0, num=curve_shape))
    unrealistic_curves.append(np.linspace(150, 50, num=curve_shape))
    unrealistic_curves.append(np.linspace(200, 100, num=curve_shape))
    unrealistic_curves.append(np.linspace(250, 150, num=curve_shape))
    unrealistic_curves.append(np.linspace(300, 200, num=curve_shape))
    unrealistic_curves.append(np.linspace(200, 0, num=curve_shape))
    unrealistic_curves.append(np.linspace(300, 0, num=curve_shape))
    unrealistic_curves = np.array(unrealistic_curves)
    print("unrealistic_curves.shape", unrealistic_curves.shape)

    unrealistic_curves_scaled = preprocess.scale_data(
        unrealistic_curves,
        dataset_name=training_dataset_names[0],
        should_fit=True)

    encoded = autoencoder.encode(unrealistic_curves_scaled)
    decoded = autoencoder.decode(encoded)

    rescaled = preprocess.rescale_data(decoded,
                                       dataset_name=training_dataset_names[0])
    smape_result = smape(rescaled, unrealistic_curves, over_curves=True)

    round_to_n = lambda x, n: round(x, -int(np.floor(np.log10(x))) + (n - 1))

    print("smape results", smape_result)
    for a_smape_result in smape_result:
        print(round_to_n(a_smape_result, 2))

    plotting.plot_2d(smape_result,
                     "loss of unrealistic curves from autoencoder SMAPE",
                     save=False,
                     title=True)
    # plotting.plot_2d(unrealistic_eval_mse, "loss of unrealistic curves from autoencoder MSE", save=False, title=True)
    plotting.plot_unrealisticness(
        unrealistic_curves,
        "loss of unrealistic curves from autoencoder",
        timeseries=True,
        evaluation=smape_result,
        title=False,
        eval_label="SMAPE")