def integration_test_mnist():
    '''
    Integration test with unstructured data.

    A mixed model is used that has structured input (columns x1, x2, x3) and
    unstructured input (mnist images referenced through the column 'numbers').
    The target 'groundtruth' is synthesized from the structured columns plus
    three times the column y_true and is then mean-centred.

    After training, the structured inputs are set to zero and the test checks
    that the median predicted location per y_true value is a monotonically
    increasing function of y_true.

    Raises:
        AssertionError: if the scale check or the monotonicity check fails.
    '''

    #set seeds for reproducibility
    torch.manual_seed(1)
    np.random.seed(1)

    #load data; .loc slicing is label-based, so the row labelled 1000 is included.
    #The explicit copy keeps the column assignments below from acting on a slice
    #of the frame returned by read_csv.
    data_path = '../data/mnist_data/tab.csv'
    data = pd.read_csv(data_path, delimiter=',').loc[:1000, :].copy()

    #synthetic target, computed column-wise instead of row by row
    data['groundtruth'] = (np.sin(data['x1']) - 3 * data['x2'] +
                           data['x3']**4 + 3 * data['y_true'])
    data['groundtruth'] = data['groundtruth'] - data['groundtruth'].mean()

    output_dir = './outputs'

    unstructured_data = {
        'numbers': {
            'path': '../data/mnist_data/mnist_images',
            'datatype': 'image'
        }
    }

    #file name of the image belonging to each row, derived from the row label
    data['numbers'] = [f'img_{i}.jpg' for i in data.index]

    #define Sddr parameters
    formulas = {
        'loc':
        '~ -1 + spline(x1, bs="bs", df=10) + x2 + dnn(numbers) + spline(x3, bs="bs", df=10)',
        'scale': '~1'
    }
    distribution = 'Normal'

    deep_models_dict = {
        'dnn': {
            'model':
            nn.Sequential(nn.Flatten(1, -1), nn.Linear(28 * 28, 128),
                          nn.ReLU()),
            'output_shape':
            128
        },
    }

    train_parameters = {
        'batch_size': 100,
        'epochs': 200,
        'degrees_of_freedom': {
            'loc': 9.6,
            'scale': 9.6
        },
        'optimizer': optim.RMSprop
    }

    #initialize Sddr
    sddr = Sddr(
        output_dir=output_dir,
        distribution=distribution,
        formulas=formulas,
        deep_models_dict=deep_models_dict,
        train_parameters=train_parameters,
    )

    # train Sddr
    sddr.train(structured_data=data,
               target="groundtruth",
               unstructured_data=unstructured_data)

    #predict on the training data and check the estimated scale
    data_pred = data.loc[:, :]
    distribution_layer, partial_effect = sddr.predict(
        data_pred,
        clipping=True,
        plot=False,
        unstructured_data=unstructured_data)

    # NOTE(review): the condition requires the scale to be ABOVE 0.7 while the
    # message says "too large" - confirm which of the two is intended.
    assert distribution_layer.scale[0] > 0.7, "Scale too large in mnist test"

    #zero out the structured inputs so that only the image term varies
    data_pred2 = data.copy()

    data_pred2.loc[:, 'x1'] = 0
    data_pred2.loc[:, 'x2'] = 0
    data_pred2.loc[:, 'x3'] = 0

    distribution_layer, partial_effect = sddr.predict(
        data_pred2,
        clipping=True,
        plot=False,
        unstructured_data=unstructured_data)

    data_pred2['predicted_number'] = distribution_layer.loc[:, :].numpy(
    ).flatten()

    #select the numeric column before aggregating: the frame also holds the
    #string column 'numbers', for which a median cannot be computed
    predicted_numbers = data_pred2.groupby('y_true')['predicted_number'].median()
    print(predicted_numbers.loc[:].to_numpy())

    #difference between the medians of consecutive y_true labels; the smallest
    #one has to be positive for the sequence to be increasing
    maximum_deviation_mnist = ((predicted_numbers.loc[1:].to_numpy() -
                                predicted_numbers.loc[:8].to_numpy()) /
                               3).min()

    assert maximum_deviation_mnist > 0, "Predicted numbers for the mnist not monotonically increasing"
def integration_test_load_and_resume():
    '''
    Integration test of training and saving a model, then loading the same model
    and continuing the training.

    A Poisson model is trained for 100 epochs and saved. A second model is
    loaded from that checkpoint and trained with resume=True and 'epochs' set
    to 500. A third model is trained from scratch with 'epochs' set to 500
    after resetting the seeds. The final training losses of the resumed and of
    the continuously trained model must differ by less than 0.01.

    The temporary checkpoint is removed even when the test fails.

    Raises:
        AssertionError: if the two losses differ by 0.01 or more.
    '''

    def make_deep_models():
        # Fresh networks on every call. The layers are created in the same
        # order as before (d1, then d2) so seeded initialisation is unchanged.
        return {
            'd1': {
                'model': nn.Sequential(nn.Linear(1, 15)),
                'output_shape': 15
            },
            'd2': {
                'model': nn.Sequential(nn.Linear(1, 3), nn.ReLU(),
                                       nn.Linear(3, 8)),
                'output_shape': 8
            }
        }

    def make_train_parameters(epochs):
        # Training configuration shared by all three models; only the number
        # of epochs differs.
        return {
            'batch_size': 1000,
            'epochs': epochs,
            'degrees_of_freedom': {
                'rate': 6
            },
            'optimizer': optim.RMSprop,
            'val_split': 0
        }

    #set seeds for reproducibility
    torch.manual_seed(1)
    np.random.seed(1)

    #load data
    data_path = '../data/simple_gam/X.csv'
    target_path = '../data/simple_gam/Y.csv'

    data = pd.read_csv(data_path, delimiter=';')
    target = pd.read_csv(target_path)

    output_dir = './outputs'
    checkpoint_name = 'temp_simple_gam.pth'
    # same location the test loaded from before: './outputs/temp_simple_gam.pth'
    checkpoint_path = os.path.join(output_dir, checkpoint_name)

    #define Sddr parameters
    distribution = 'Poisson'

    formulas = {
        'rate':
        '~1 + spline(x1, bs="bs",df=9) + spline(x2, bs="bs",df=9) + d1(x1) + d2(x2)'
    }
    deep_models_dict = make_deep_models()
    train_parameters = make_train_parameters(epochs=100)

    #initialize Sddr
    sddr_100 = Sddr(output_dir=output_dir,
                    distribution=distribution,
                    formulas=formulas,
                    deep_models_dict=deep_models_dict,
                    train_parameters=train_parameters)

    # train Sddr for the first 100 epochs and store a checkpoint
    sddr_100.train(target=target, structured_data=data)
    sddr_100.save(checkpoint_name)

    try:
        # load trained Sddr and resume training with 'epochs' raised to 500
        train_parameters['epochs'] = 500
        sddr_resume = Sddr(output_dir=output_dir,
                           distribution=distribution,
                           formulas=formulas,
                           deep_models_dict=deep_models_dict,
                           train_parameters=train_parameters)
        sddr_resume.load(checkpoint_path, data)
        sddr_resume.train(target=target, structured_data=data, resume=True)
        loss_resume = sddr_resume.epoch_train_loss

        # train continuously: reset the seeds BEFORE building the networks so
        # they start from the same initialisation as the first run
        torch.manual_seed(1)
        np.random.seed(1)

        deep_models_dict = make_deep_models()
        train_parameters = make_train_parameters(epochs=500)

        sddr_500 = Sddr(output_dir=output_dir,
                        distribution=distribution,
                        formulas=formulas,
                        deep_models_dict=deep_models_dict,
                        train_parameters=train_parameters)
        sddr_500.train(target=target, structured_data=data)
        loss_500 = sddr_500.epoch_train_loss

        loss_dif = abs(loss_500 - loss_resume)
        assert loss_dif < 0.01, "Loss function not equal in two training methods"
    finally:
        # always clean up the temporary checkpoint, also when an assertion or
        # a training step above fails
        if os.path.exists(checkpoint_path):
            os.remove(checkpoint_path)
def integration_test_simple_gam():
    '''
    Integration test using a simple GAM with a Poisson distribution.

    The partial effects of the 'rate' parameter are estimated and compared with
    the ground truth. Only the functional form is compared: both the estimate
    and the ground truth are passed through normalize() first, and the measure
    is the standard deviation of their difference. The comparison is done once
    with the effects returned by eval() and once with the effects returned by
    predict() on the input data divided by two.

    Raises:
        AssertionError: if a measure is not below its threshold.
    '''
    #set seeds for reproducibility
    torch.manual_seed(1)
    np.random.seed(1)

    #load data
    features = pd.read_csv('../data/simple_gam/X.csv', delimiter=';')
    response = pd.read_csv('../data/simple_gam/Y.csv')

    #deep networks, one per deep term of the formula
    net_x1 = nn.Sequential(nn.Linear(1, 15))
    net_x2 = nn.Sequential(nn.Linear(1, 3), nn.ReLU(), nn.Linear(3, 8))

    #initialize Sddr
    sddr = Sddr(
        output_dir='./outputs',
        distribution='Poisson',
        formulas={
            'rate':
            '~1 + spline(x1, bs="bs",df=9) + spline(x2, bs="bs",df=9) + d1(x1) + d2(x2)'
        },
        deep_models_dict={
            'd1': {
                'model': net_x1,
                'output_shape': 15
            },
            'd2': {
                'model': net_x2,
                'output_shape': 8
            }
        },
        train_parameters={
            'batch_size': 1000,
            'epochs': 1000,
            'degrees_of_freedom': {
                'rate': 6
            },
            'optimizer': optim.RMSprop,
            'val_split': 0.15,
            'early_stop_epochs': 100,
            'early_stop_epsilon': 0.001
        })

    # train Sddr
    sddr.train(target=response, structured_data=features)

    #(index of the partial effect, ground-truth shape, threshold)
    expectations = (
        (0, lambda grid: grid**2, 0.1),  # ground truth: quadratic effect
        (1, lambda grid: -grid, 0.02),  # ground truth: linear effect
    )

    #partial effects on the training data
    fitted_effects = sddr.eval('rate', plot=False)

    for term, ground_truth, threshold in expectations:
        grid = fitted_effects[term][0]
        estimate = normalize(fitted_effects[term][1])
        reference = normalize(ground_truth(grid))
        RMSE = (estimate - reference).std()
        assert RMSE < threshold, "Partial effect not properly estimated in simple GAM."

    #partial effects on unseen data
    _, predicted_effects = sddr.predict(features / 2, clipping=True)

    for term, ground_truth, threshold in expectations:
        grid = predicted_effects['rate'][term][0]
        estimate = normalize(predicted_effects['rate'][term][1])
        reference = normalize(ground_truth(grid))
        RMSE = (estimate - reference).std()
        assert RMSE < threshold, "Partial effect not properly estimated on unseen data in simple GAM."
def integration_test_load_and_predict():
    '''
    Integration test of training and saving a GAMLSS model, then loading the same model.

    The model is trained on the first 800 rows and used to predict on the
    remaining rows, once right after training and once again after it has been
    saved and loaded into a second Sddr instance. The partial effects of the
    two predictions must match for the test to pass: for every partial effect
    the summed ABSOLUTE difference has to stay below 0.001.

    The temporary checkpoint is removed even when the test fails.

    Raises:
        AssertionError: if a partial effect differs between the two predictions.
    '''
    #set seeds for reproducibility
    torch.manual_seed(1)
    np.random.seed(1)

    #load data
    data_path = '../data/gamlss/X.csv'
    target_path = '../data/gamlss/Y.csv'

    data = pd.read_csv(data_path, delimiter=';')
    target = pd.read_csv(target_path)
    train_data = data.iloc[:800]
    train_target = target.iloc[:800]
    test_data = data.iloc[800:]

    output_dir = './outputs'
    checkpoint_name = 'temp_gamlss.pth'
    # same location the test loaded from before: './outputs/temp_gamlss.pth'
    checkpoint_path = os.path.join(output_dir, checkpoint_name)

    #define Sddr parameters
    distribution = 'Logistic'

    formulas = {
        'loc':
        '~1+spline(x1, bs="bs", df=4)+spline(x2, bs="bs",df=4) + d1(x1)+d2(x2)',
        'scale': '~1 + spline(x3, bs="bs",df=4) + spline(x4, bs="bs",df=4)'
    }

    deep_models_dict = {
        'd1': {
            'model': nn.Sequential(nn.Linear(1, 15)),
            'output_shape': 15
        },
        'd2': {
            'model': nn.Sequential(nn.Linear(1, 3), nn.ReLU(), nn.Linear(3,
                                                                         8)),
            'output_shape': 8
        }
    }

    train_parameters = {
        'batch_size': 1000,
        'epochs': 200,
        'degrees_of_freedom': {
            'loc': 4,
            'scale': 4
        },
        'optimizer': optim.RMSprop
    }
    #initialize Sddr
    sddr = Sddr(output_dir=output_dir,
                distribution=distribution,
                formulas=formulas,
                deep_models_dict=deep_models_dict,
                train_parameters=train_parameters)

    # train Sddr, predict and store a checkpoint
    sddr.train(target=train_target, structured_data=train_data)
    _, partial_effects = sddr.predict(test_data, clipping=True)
    sddr.save(checkpoint_name)

    try:
        # load trained Sddr and predict
        pred_sddr = Sddr(output_dir=output_dir,
                         distribution=distribution,
                         formulas=formulas,
                         deep_models_dict=deep_models_dict,
                         train_parameters=train_parameters)
        pred_sddr.load(checkpoint_path, train_data)
        _, partial_effects_loaded = pred_sddr.predict(test_data, clipping=True)

        # compare partial effects
        for param in partial_effects.keys():
            for partial_effect, partial_effect_loaded in zip(
                    partial_effects[param], partial_effects_loaded[param]):
                # take the absolute value before summing: with signed
                # differences, deviations of opposite sign cancel out and a
                # large negative sum would pass the check as well
                abs_err = abs(partial_effect[1] - partial_effect_loaded[1])
                assert sum(
                    abs_err
                ) < 0.001, "Partial effect not same with original prediction and prediction after load for param %s and partial effect %s" % (
                    param, partial_effect[0])
    finally:
        # always clean up the temporary checkpoint, also when an assertion or
        # the loading step above fails
        if os.path.exists(checkpoint_path):
            os.remove(checkpoint_path)
def integration_test_gamlss():
    '''
    Integration test using a GAMLSS - Logistic Distribution.

    The partial effects of the 'loc' and 'scale' parameters are estimated and
    compared with the ground truth. Only the functional form is compared: both
    the estimate and the ground truth are passed through normalize() first,
    and the measure is the standard deviation of their difference. The
    comparison is done once with the effects returned by eval() and once with
    the effects returned by predict() on the input data divided by two.

    Raises:
        AssertionError: if a measure is not below its threshold.
    '''

    def check_effects(effects, expectations, message):
        # Compare each listed partial effect with its ground-truth shape.
        # `expectations` holds (index, ground-truth function, threshold).
        for term, ground_truth, threshold in expectations:
            x = effects[term][0]
            y = normalize(effects[term][1])
            y_target = normalize(ground_truth(x))
            RMSE = (y - y_target).std()
            assert RMSE < threshold, message

    #set seeds for reproducibility
    torch.manual_seed(1)
    np.random.seed(1)

    #load data
    data_path = '../data/gamlss/X.csv'
    target_path = '../data/gamlss/Y.csv'

    data = pd.read_csv(data_path, delimiter=';')
    target = pd.read_csv(target_path)

    output_dir = './outputs'

    #define Sddr parameters
    distribution = 'Logistic'

    formulas = {
        'loc':
        '~1+spline(x1, bs="bs", df=4)+spline(x2, bs="bs",df=4) + d1(x1)+d2(x2)',
        'scale': '~1 + spline(x3, bs="bs",df=4) + spline(x4, bs="bs",df=4)'
    }

    deep_models_dict = {
        'd1': {
            'model': nn.Sequential(nn.Linear(1, 15)),
            'output_shape': 15
        },
        'd2': {
            'model': nn.Sequential(nn.Linear(1, 3), nn.ReLU(), nn.Linear(3,
                                                                         8)),
            'output_shape': 8
        }
    }

    train_parameters = {
        'batch_size': 1000,
        'epochs': 200,
        'degrees_of_freedom': {
            'loc': 4,
            'scale': 4
        },
        'optimizer': optim.RMSprop,
        'val_split': 0.01
    }

    #initialize Sddr
    sddr = Sddr(output_dir=output_dir,
                distribution=distribution,
                formulas=formulas,
                deep_models_dict=deep_models_dict,
                train_parameters=train_parameters)

    # train Sddr
    sddr.train(target=target, structured_data=data)

    #compute partial effects
    partial_effects_loc = sddr.eval('loc', plot=False)
    partial_effects_scale = sddr.eval('scale', plot=False)

    #normalize partial effects and compare with ground truth
    message = "Partial effect not properly estimated in GAMLSS."
    check_effects(
        partial_effects_loc,
        [
            (0, lambda x: x**2, 0.12),  # ground truth: quadratic effect
            (1, lambda x: -x, 0.1),  # ground truth: linear effect
        ],
        message)
    check_effects(
        partial_effects_scale,
        [
            (0, lambda x: x, 0.15),  # ground truth: linear effect
            (1, lambda x: np.sin(4 * x), 0.4),  # ground truth: sinusoidal effect
        ],
        message)

    #compute partial effects on unseen data
    _, partial_effects = sddr.predict(data / 2, clipping=True, plot=False)
    partial_effects_pred_loc = partial_effects['loc']
    partial_effects_pred_scale = partial_effects['scale']

    #normalize partial effects and compare with ground truth; the message names
    #the unseen-data stage so a failure can be told apart from the checks above
    message_unseen = "Partial effect not properly estimated on unseen data in GAMLSS."
    check_effects(
        partial_effects_pred_loc,
        [
            (0, lambda x: x**2, 0.35),  # ground truth: quadratic effect
            (1, lambda x: -x, 0.1),  # ground truth: linear effect
        ],
        message_unseen)
    check_effects(
        partial_effects_pred_scale,
        [
            (0, lambda x: x, 0.15),  # ground truth: linear effect
            (1, lambda x: np.sin(4 * x), 0.4),  # ground truth: sinusoidal effect
        ],
        message_unseen)