def RidgeRegressionEnsembleTest():
    #dataset = DatasetFactory.friedman1(n_samples=200200)
    #dataset = DatasetFactory.friedman2(n_samples=200200)
    dataset = DatasetFactory.friedman3(n_samples=200200)
    Xtrain, X, ytrain, y = model_selection.train_test_split(dataset.data,
                                                            dataset.target,
                                                            random_state=0,
                                                            train_size=200)
    ensemble = EnsembleRegressor(type='ridge')
    ensemble.fit(Xtrain,
                 ytrain,
                 samples_per_regressor=200,
                 regressor_overlap=200)
    # note: RidgeCV's 'normalize' argument was deprecated in scikit-learn 1.0
    # and removed in 1.2; on newer versions, standardize the features instead
    ridgecv = linear_model.RidgeCV(alphas=np.arange(.1, 1, .2),
                                   fit_intercept=True,
                                   normalize=True)
    ridgecv.fit(Xtrain, ytrain)
    y_ridgecv = ridgecv.predict(X)
    Z = ensemble.predict(X)

    sio.savemat(
        'RidgeRegression_Friedman3_200k.mat',
        {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y,
            # 'Ztrain': Z_train, 'ytrain': ytrain,
            'y_RidgeCV': y_ridgecv,
            'samples_per_regressor': 200,
            'regressor_samples_overlap': 200,
            'Ey': np.mean(y),
            'Ey2': np.mean(y**2),
            'Description': 'Ridge Regression (Friedman #3)'
        })
def UnequalMLPsEnsembleTest():
    #dataset = DatasetFactory.friedman1(n_samples=200200)
    #dataset = DatasetFactory.friedman2(n_samples=200200)
    dataset = DatasetFactory.friedman3(n_samples=200200)
    Xtrain, X, ytrain, y = model_selection.train_test_split(dataset.data,
                                                            dataset.target,
                                                            random_state=0,
                                                            train_size=200)
    ensemble = EnsembleRegressor(type='auto_large')
    ensemble.fit(Xtrain,
                 ytrain,
                 samples_per_regressor=200,
                 regressor_overlap=200)
    Ztrain = ensemble.predict(Xtrain)
    Z = ensemble.predict(X)

    sio.savemat(
        path.join('ManualEnsembleDatasets', 'DifferentRegressors_Friedman3.mat'), {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y,
            'Ztrain': Ztrain,
            'ytrain': ytrain,
            'samples_per_regressor': 200,
            'regressor_samples_overlap': 200,
            'Ey': np.mean(y),
            'Ey2': np.mean(y**2),
            'Description': 'Different Regressors (Friedman #3)'
        })
def RealDatasetsManualEnsembleTest():
    for name, func in dataset_list.items():
        print(name + ":", end="")
        dataset = func()
        print(" X.shape = " + str(dataset.data.shape))
        ensemble = EnsembleRegressor(type='auto', verbose=True)  #auto_large

        if name == 'blog_feedback':
            continue
            # samples_per_regressor = 2810
            # overlap = 2810
            # train_size = 2810
        else:
            samples_per_regressor = 200
            overlap = 0
            train_size = samples_per_regressor * ensemble.regressor_count

        if len(dataset.target) < train_size + 500:  # need at least 500 test samples beyond the training set
            continue
        # if dataset.data.shape[1] < 5:  # ignore datasets with less than 5 covariates
        #     continue

        Xtrain, X, ytrain, y = model_selection.train_test_split(
            dataset.data, dataset.target, random_state=0, train_size=train_size)

        ensemble.fit(Xtrain, ytrain, samples_per_regressor=samples_per_regressor, regressor_overlap=overlap)
        Ztrain = ensemble.predict(Xtrain)
        Z = ensemble.predict(X)

        sio.savemat(path.join('ManualEnsembleDatasets',name + '.mat'), {
            'names': ensemble.regressor_labels,
            'Z': Z, 'y': y,
            'Ztrain': Ztrain, 'ytrain': ytrain,
            'samples_per_regressor': samples_per_regressor,
            'regressor_samples_overlap': overlap,
            'Ey': np.mean(y),
            'Ey2': np.mean(y ** 2),
            'Description': ('Different Regressors (%s)' % name)
        })
def RealDatasetsLargeMLPEnsembleTest():
    for name, func in dataset_list.items():
        print(name)
        dataset = func()

        if len(dataset.target) < 5500:  # ignore datasets with fewer than 5500 samples
            continue
        if dataset.data.shape[1] < 5:  # ignore datasets with less than 5 covariates
            continue

        if name == 'blog_feedback':
            train_size = 10000
        else:
            train_size = 500

        Xtrain, X, ytrain, y = model_selection.train_test_split(
            dataset.data, dataset.target, random_state=0, train_size=train_size)

        if name == 'affairs':
            # ytrain, y = [np_utils.to_categorical(x) for x in (ytrain, y)]
            continue

        ensemble = EnsembleRegressor(type='mlp_large', verbose=True)
        ensemble.fit(Xtrain, ytrain, samples_per_regressor=train_size, regressor_overlap=train_size)
        Ztrain = ensemble.predict(Xtrain)
        Z = ensemble.predict(X)

        sio.savemat(path.join('ManualEnsembleDatasets',name + '_10mlp.mat'), {
            'names': ensemble.regressor_labels,
            'Z': Z, 'y': y,
            'Ztrain': Ztrain, 'ytrain': ytrain,
            'samples_per_regressor': train_size,
            'regressor_samples_overlap': train_size,
            'Ey': np.mean(y),
            'Ey2': np.mean(y ** 2),
            'Description': ('Different Regressors (%s)' % name)
        })
        est_y.extend(estimator.predict(X_test2).tolist())
        true_y.extend(y_test2.tolist())
    return bias_variance(est_y, true_y)


df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['target'] = boston.target

X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                    boston.target,
                                                    test_size=0.2)
print(X_train.shape, X_test.shape)

estimators = [
    {
        "context": EnsembleRegressor(),
        "tuned_parameters": [],
        "name": "EnsembleRegressor"
    },
    {
        "context":
        BaggingRegressor(tree.DecisionTreeRegressor(max_depth=12),
                         max_samples=0.9,
                         max_features=0.5,
                         n_estimators=50),
        "tuned_parameters": [],
        "name":
        "Bagging"
    },
]
def make_large_ensemble(dataset, mat_filename='large_ensemble.mat'):
    """
    construct_ensemble splits the dataset into train and 'test'. The ensemble regressors are trained on the training
    set. The test set is saved to the mat file to be used by matlab code.

    :param dataset: a dataset object created by DatasetFactory
    :param mat_filename: name of the mat file to save the results to
    # :param ensemble_type: 'auto' or 'mlp' for the choice of regressors in the ensemble
    # :param train_size: proportion or number of samples used for training (defaults to 25% for n>20,000, otherwise 50%)
    # :param test_size: proportion or number of samples used for testing (defaults to 75% for n>20,000, and 50% otherwise)
    # :param samples_per_regressor: Number of samples from X_train that each regressor will be trained on.
    #                               Default 'None' will cause all regressors to be trained on all samples.
    # :param overlap: this is the number of samples overlapping for every adjacent pair of regressors.
    #        Defaults to no overlap if there are at least 100 samples per regressor, else full overlap (overlap=n).
    # :param plotting: plots results
    # :param ensemble_train_size: The number of samples to output for training supervised ensemble methods
    # :param scale_data: boolean, if True the data will be scaled to mean-centered and variance 1.
    # :param Description: The text that will be written in the 'Description' field in the output file
    """

    # Init
    ensemble_type = 'auto_large'  # alternatives: 'mlp_different', 'mlp_large'
    (n_samples, n_features) = dataset.data.shape

    ensemble = EnsembleRegressor(verbose=False, type=ensemble_type)

    m = ensemble.regressor_count
    # at least 100 samples per dimension (n/p >= 100)
    train_size = np.max([200, 100 * dataset.data.shape[1]])
    n = n_samples - train_size
    n_train = 200  # taken out of the validation set
    samples_per_regressor = train_size  # alternatively: train_size // m
    overlap = samples_per_regressor  # full overlap (use 0 for disjoint subsets)

    # scale data
    dataset.data = preprocessing.scale(dataset.data)

    # split to train / validation
    X_train, X_val, y_train, y_val = model_selection.train_test_split(
        dataset.data,
        dataset.target,
        random_state=0,
        train_size=train_size,
        test_size=n)

    msg = "features=%s, n_tot=%d, training each regressors on %d, n=%d, n_train=%d, m=%d" % \
          (n_features, n_samples, samples_per_regressor, n, n_train, m)
    print(msg)

    # full overlap; alternatively pass samples_per_regressor=samples_per_regressor
    # and regressor_overlap=overlap to train each regressor on its own slice
    ensemble.fit(X_train, y_train)

    scores_train = ensemble.score(X_train, y_train)
    MSEs_train = ensemble.mean_squared_error(X_train,
                                             y_train) / np.var(y_train)
    scores_val = ensemble.score(X_val, y_val)
    MSEs_val = ensemble.mean_squared_error(X_val, y_val) / np.var(y_val)

    for i, regr in enumerate(ensemble.regressors):
        print('## ' + str(i) + '. ' + regr.__class__.__name__ + ':')
        print(regr)

        print('\tMSE/Var(Y): %.2f/%.2f' % (MSEs_train[i], MSEs_val[i]))
        print('\tVariance score (R^2): %.2f/%.2f\n' %
              (scores_train[i], scores_val[i]))

    # create predictions matrix on the test set
    Zval = ensemble.predict(X_val)

    # Set aside n_train samples as a training set for the supervised ensemble learners
    Z_train, Z, y_ensemble_train, y_ensemble_test = \
        model_selection.train_test_split(Zval.T, y_val, random_state=42, train_size=n_train)
    Z_train = Z_train.T
    Z = Z.T

    # Add Description if none
    Description = "%s was generated with %s regressors of type %s:\n%s" % \
                  (mat_filename, msg, ensemble_type, str(locals()))

    sio.savemat(
        mat_filename,
        {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y_ensemble_test,
            'Ztrain': Z_train,
            'ytrain': y_ensemble_train,
            'samples_per_regressor': samples_per_regressor,
            'regressor_samples_overlap': overlap,
            'Ey': np.mean(y_ensemble_test),  # np.mean(dataset.target),
            'Ey2': np.mean(y_ensemble_test**2),  # np.mean(dataset.target ** 2)
            'Description': Description
        })

    results_df = pd.DataFrame({
        'i': range(1, 1 + len(MSEs_train)),
        'MSE_train': MSEs_train,
        'MSE_val': MSEs_val,
        'R2_train': scores_train,
        'R2_val': scores_val
    })
    return results_df
def make_ensemble(dataset,
                  mat_filename='ensemble.mat',
                  ensemble_type='auto',
                  train_size=None,
                  test_size=None,
                  samples_per_regressor=None,
                  overlap=None,
                  plotting=True,
                  Description=None,
                  scale_data=False,
                  ensemble_train_size=200):
    """
    construct_ensemble splits the dataset into train and 'test'. The ensemble regressors are trained on the training
    set. The test set is saved to the mat file to be used by matlab code.

    :param dataset: a dataset object created by DatasetFactory
    :param mat_filename: name of the mat file to save the results to
    :param ensemble_type: 'auto' or 'mlp' for the choice of regressors in the ensemble
    :param train_size: proportion or number of samples used for training (defaults to 25% for n>20,000, otherwise 50%)
    :param test_size: proportion or number of samples used for testing (defaults to 75% for n>20,000, and 50% otherwise)
    :param samples_per_regressor: Number of samples from X_train that each regressor will be trained on.
                                  Default 'None' will cause all regressors to be trained on all samples.
    :param overlap: this is the number of samples overlapping for every adjacent pair of regressors.
           Defaults to no overlap if there are at least 100 samples per regressor, else full overlap (overlap=n).
    :param plotting: plots results
    :param ensemble_train_size: The number of samples to output for training supervised ensemble methods
    :param scale_data: boolean, if True the data will be scaled to mean-centered and variance 1.
    :param Description: The text that will be written in the 'Description' field in the output file
    """
    if scale_data:
        dataset.data = preprocessing.scale(dataset.data)

    if (train_size is None) and (test_size is None):
        if len(dataset.target) < 20000:
            (test_size, train_size) = (0.75, 0.25)
        else:
            (test_size, train_size) = (0.5, 0.5)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        dataset.data,  # preprocessing.scale(dataset.data)
        dataset.target,
        random_state=0,
        test_size=test_size,
        train_size=train_size)

    # Prepare ensemble regressors
    ensemble = EnsembleRegressor(verbose=False, type=ensemble_type)

    n = len(y_train)
    m = ensemble.regressor_count

    # decide on how many samples per regressor and what's the overlap between regressors
    if samples_per_regressor and (overlap is not None):
        pass  # both were defined by caller

    elif (overlap is not None) and (samples_per_regressor is None):
        samples_per_regressor = (n // m) + overlap  # '//' is floor division

    else:  # both are None or only samples_per_regressor was given
        if n < m * 100:  # reserve at least 100 samples for training the individual regressors
            overlap = n
            samples_per_regressor = (samples_per_regressor or n)
        else:  # we have enough samples to be training on different parts of the dataset
            overlap = 0
            samples_per_regressor = (samples_per_regressor or n // m)

    assert train_size == (samples_per_regressor *
                          m) - overlap * (m - 1), "inconsistent parameters"

    print("Training set size: %d with %d attributes" % X_train.shape)
    print("Each regressor is trained on %d samples" % samples_per_regressor)
    print("Test set size: %d" % len(y_test))

    ensemble.fit(X_train,
                 y_train,
                 samples_per_regressor=samples_per_regressor,
                 regressor_overlap=overlap)
    scores = ensemble.score(X_train, y_train)
    MSEs = ensemble.mean_squared_error(X_train, y_train)

    for i, regr in enumerate(ensemble.regressors):
        print('## ' + str(i) + '. ' + regr.__class__.__name__ + ':')
        print(regr)

        print('\tMSE: %.2f' % MSEs[i])
        print('\tVariance score (R^2): %.2f\n' % scores[i])

    # create predictions matrix on the test set
    Z = ensemble.predict(X_test)

    # Set aside 200 samples as a training set for the supervised ensemble learners
    Z_train, Z, y_ensemble_train, y_ensemble_test = \
        model_selection.train_test_split(Z.T, y_test, random_state=0, train_size=ensemble_train_size)
    Z_train = Z_train.T
    Z = Z.T

    # Add Description if none
    if not Description:
        Description = "%s was generated with %d samples and %d regressors of type %s:\n%s" % \
                      (mat_filename, n, m, ensemble_type, str(locals()))

    sio.savemat(
        mat_filename,
        {
            'names': ensemble.regressor_labels,
            'Z': Z,
            'y': y_ensemble_test,
            'Ztrain': Z_train,
            'ytrain': y_ensemble_train,
            'samples_per_regressor': samples_per_regressor,
            'regressor_samples_overlap': overlap,
            'Ey': np.mean(y_ensemble_test),  # np.mean(dataset.target),
            'Ey2': np.mean(y_ensemble_test**2),  # np.mean(dataset.target ** 2)
            'Description': Description
        })

    if plotting:
        plot_regression_results(ensemble, Z, y_ensemble_test)
        plot_y_e_correlation(ensemble, Z, y_ensemble_test)
    def run(self, train_data_path):
        """Takes argument 'train_data_path'.
        train_data_path: Training data path.

        Performs the model-selection process in the specified order.
        Any number of required models can be added to this method body and
        cross-validated; the results can be saved as-is, or ensembling can
        be applied.
        """
        #Loading training data
        dtrain = pd.read_csv(train_data_path)
        #gets predictors
        predictor_vars = self.get_predictors(dtrain)

        #Model I
        xgboost = XGBRegressor(learning_rate=0.06,
                               n_estimators=1000,
                               max_depth=2,
                               min_child_weight=2,
                               gamma=0,
                               subsample=0.4,
                               colsample_bytree=0.2,
                               objective='reg:linear',  # renamed 'reg:squarederror' in newer XGBoost
                               nthread=-1,
                               scale_pos_weight=1,
                               seed=27,
                               reg_alpha=77)

        #Model II
        xgboost2 = XGBRegressor(learning_rate=0.04,
                                n_estimators=1500,
                                max_depth=2,
                                min_child_weight=0,
                                gamma=0,
                                subsample=0.7,
                                colsample_bytree=0.2,
                                objective='reg:linear',
                                nthread=-1,
                                scale_pos_weight=1,
                                seed=99,
                                reg_alpha=1.7)

        #Model III
        xgboost3 = XGBRegressor(learning_rate=0.02,
                                n_estimators=1200,
                                max_depth=3,
                                min_child_weight=2,
                                gamma=0,
                                subsample=0.65,
                                colsample_bytree=0.2,
                                objective='reg:linear',
                                nthread=-1,
                                scale_pos_weight=1,
                                seed=585,
                                reg_alpha=5000)

        #Model IV
        lightgbm = LGBMRegressor(objective='regression',
                                 num_leaves=4,
                                 min_data_in_leaf=5,
                                 learning_rate=0.02,
                                 n_estimators=3000,
                                 max_bin=320,
                                 bagging_fraction=0.85,
                                 bagging_freq=10,
                                 bagging_seed=9,
                                 feature_fraction=0.2,
                                 feature_fraction_seed=9,
                                 data_random_seed=9,
                                 reg_alpha=0.55,
                                 reg_lambda=0.3,
                                 verbose=-1)

        #Model V
        lightgbm2 = LGBMRegressor(objective='regression',
                                  num_leaves=4,
                                  min_data_in_leaf=3,
                                  learning_rate=0.01,
                                  n_estimators=4000,
                                  max_bin=295,
                                  bagging_fraction=0.5,
                                  bagging_freq=10,
                                  bagging_seed=24,
                                  feature_fraction=0.2,
                                  feature_fraction_seed=24,
                                  data_random_seed=24,
                                  reg_alpha=10,
                                  reg_lambda=0.7,
                                  verbose=-1)

        #Ensembling all the five models
        ens_model = EnsembleRegressor(
            [xgboost, xgboost2, xgboost3, lightgbm, lightgbm2])

        #Performs cross validation on the ensembled model.
        self.cross_validate(cv=5,
                            model=ens_model,
                            X=dtrain[predictor_vars],
                            y=dtrain[self.target_var],
                            n_jobs=1)
        #CV Score is: 0.92528287952747 all predictors

        #Saving the final model.
        self.finalize_and_save(ens_model, self.output_file_path,
                               dtrain[predictor_vars], dtrain[self.target_var])
#######################################################################  

""" ENSEMBLING """    
                                                                 
xgb_ens = EnsembleRegressor([xgboost,xgboost2, xgboost3])
#CV Score is: 0.9246359450211432                                     
xgb_ens = EnsembleRegressor([xgboost,xgboost2, xgboost3, lightgbm])
#CV Score is: 0.9249748684043093                                   
xgb_ens = EnsembleRegressor([xgboost,xgboost2, xgboost3, lightgbm, lightgbm2])
#CV Score is: 0.92528287952747       
#CV Score is: 0.9253181909342896                             
###################################################################### 
                                     
""" CROSS VALIDATION""" 

ms =ModelSelector()
ms.cross_validate(cv=5,model=xgb_ens,X=dtrain.drop(['SalePrice'], axis=1)[predictor_vars], y=dtrain['SalePrice'], n_jobs = 1)
#CV Score is: 0.92528287952747 all predictor variables
                                    
#Using feature importance to check for improvement.