Beispiel #1
0
def evalOneMax_KNN(individual, df_test, base_estimator):
    """
    This will have the code for testing the model on unseen data.
    
    Parameters
    ----------
    individual : list, required
        List of dataframes with each dataframes last column the
        dependent column.
    
    df_test : DataFrame, required
        The test dataframe against which you are evaluating your
        model in the fitness function.

    model : Estimator, required
        The estimator to use in the model.


    Side note: We can even start with just one partial part of the dataset, keep trying to
    increase its cv score in fitness function. To get the optimal dataset. I am fearing, 
    it will make a dataset of exactly same values in the end. That will have the best cv error.
    Therefore need to add a component of unseen data performance.
    """
    total_mse = []
    ensembles = []
    centroids = []

    for df_ in individual:
        # clf = LinearRegression()
        clf = base_estimator()

        clf = clf.fit(df_.iloc[:, 0:-1], df_.iloc[:,-1])
        ensembles.append(clf)
        centroids.append(centroid_df(df_.iloc[:,0:-1]))
        ## for sum of mse return uncomment these
        #y_pred = clf.predict(df_test[x_columns])
        #mse = mean_squared_error(y_pred, df_test[y_column])
        #total_mse.append(mse)
    
    #print total_mse
    y_preds_ensemble = []
    for row in df_test.iloc[:, 0:-1].values:
        distances = [distance(row, centroid) for centroid in centroids]
        model_index = np.argmin(distances)
        #todo: optional use the average of the 2 min distances ka prediction.
        #todo: parameter.
        y_pred = ensembles[model_index].predict(row)[0]
        

        y_preds_ensemble.append(y_pred)

    rmse_ensemble = np.sqrt(mean_squared_error(y_preds_ensemble, df_test.iloc[:,-1]))
    return (rmse_ensemble),
Beispiel #2
0
def evalOneMax_KNN(individual, df_test, base_estimator):
    """
    This will have the code for testing the model on unseen data.
    
    Parameters
    ----------
    individual : list, required
        List of dataframes with each dataframes last column the
        dependent column.
    
    df_test : DataFrame, required
        The test dataframe against which you are evaluating your
        model in the fitness function.

    model : Estimator, required
        The estimator to use in the model.


    Side note: We can even start with just one partial part of the dataset, keep trying to
    increase its cv score in fitness function. To get the optimal dataset. I am fearing, 
    it will make a dataset of exactly same values in the end. That will have the best cv error.
    Therefore need to add a component of unseen data performance.
    """
    total_mse = []
    ensembles = []
    centroids = []

    for df_ in individual:
        # clf = LinearRegression()
        clf = base_estimator()

        clf = clf.fit(df_.iloc[:, 0:-1], df_.iloc[:, -1])
        ensembles.append(clf)
        centroids.append(centroid_df(df_.iloc[:, 0:-1]))
        ## for sum of mse return uncomment these
        #y_pred = clf.predict(df_test[x_columns])
        #mse = mean_squared_error(y_pred, df_test[y_column])
        #total_mse.append(mse)

    #print total_mse
    y_preds_ensemble = []
    for row in df_test.iloc[:, 0:-1].values:
        distances = [distance(row, centroid) for centroid in centroids]
        model_index = np.argmin(distances)
        #todo: optional use the average of the 2 min distances ka prediction.
        #todo: parameter.
        y_pred = ensembles[model_index].predict(row)[0]

        y_preds_ensemble.append(y_pred)

    rmse_ensemble = np.sqrt(
        mean_squared_error(y_preds_ensemble, df_test.iloc[:, -1]))
    return (rmse_ensemble),
Beispiel #3
0
def eval_ensemble_oob_KNN_EG(individual, df, base_estimator, n_votes):
    """
    Fitness is based on entire ensembles performance on Out of Bag samples
    calculated using the union of the all samples used while training each child model.

    Used in FEGO.

    Parameters
    ----------
    individual : list, required
        List of dataframes with each dataframes last column the
        dependent column.
    
    df: DataFrame, required
        The entire dataframe given to model to train - X and y combined.

    model : Estimator, required
        The estimator to use in the model.
        #todo: not really needed. double check.

    Side note: We can even start with just one partial part of the dataset, keep trying to
    increase its cv score in fitness function. To get the optimal dataset. I am fearing, 
    it will make a dataset of exactly same values in the end. That will have the best cv error.
    Therefore need to add a component of unseen data performance.
    """
    total_mse = []
    ensembles = []
    centroids = []

    # All the indices used in each child model.
    bag_idx = []

    for eg_ in individual:
        # clf = LinearRegression()
        
        clf = eg_.estimator

        ensembles.append(clf)
        
        idx = eg_.X.index.tolist()
        # print len(idx)
        bag_idx.append(idx)

        centroids.append(centroid_df(eg_.X))
        
        ## for sum of mse return uncomment these
        #y_pred = clf.predict(df_test[x_columns])
        #mse = mean_squared_error(y_pred, df_test[y_column])
        #total_mse.append(mse)
    
    #print total_mse
    # flattening and converting to set.
    
    # print bag_idx
    bag_idx = set(sum(bag_idx, []))
    # print len(bag_idx)
    out_bag_idx  = list(set(df.index.tolist()) - bag_idx)
    print 'Size of OOB', len(out_bag_idx)
    df_test = df.loc[out_bag_idx]

    y_preds_ensemble = []
    ensembles = np.array(ensembles)


    for row in df_test.iloc[:, 0:-1].values:
        distances = np.array([distance(row, centroid) for centroid in centroids])
        # model_index = np.argmin(distances)
        #todo: optional use the average of the 2 min distances ka prediction.
        #todo: parameter.
        
        
        model_ixs = distances.argsort()[:n_votes]
        

        models = ensembles[model_ixs]


        # mean of all predictions.
        y_pred = np.mean([mdl.predict(row)[0] for mdl in models])
            
        # y_pred = ensembles[model_index].predict(row)[0]
        
        y_preds_ensemble.append(y_pred)

    rmse_ensemble = np.sqrt(mean_squared_error(y_preds_ensemble, df_test.iloc[:,-1]))
    return (rmse_ensemble),
Beispiel #4
0
def eval_ensemble_oob_KNN_EG(individual, df, base_estimator, n_votes):
    """
    Fitness is based on entire ensembles performance on Out of Bag samples
    calculated using the union of the all samples used while training each child model.

    Used in FEGO.

    Parameters
    ----------
    individual : list, required
        List of dataframes with each dataframes last column the
        dependent column.
    
    df: DataFrame, required
        The entire dataframe given to model to train - X and y combined.

    model : Estimator, required
        The estimator to use in the model.
        #todo: not really needed. double check.

    Side note: We can even start with just one partial part of the dataset, keep trying to
    increase its cv score in fitness function. To get the optimal dataset. I am fearing, 
    it will make a dataset of exactly same values in the end. That will have the best cv error.
    Therefore need to add a component of unseen data performance.
    """
    total_mse = []
    ensembles = []
    centroids = []

    # All the indices used in each child model.
    bag_idx = []

    for eg_ in individual:
        # clf = LinearRegression()

        clf = eg_.estimator

        ensembles.append(clf)

        idx = eg_.X.index.tolist()
        # print len(idx)
        bag_idx.append(idx)

        centroids.append(centroid_df(eg_.X))

        ## for sum of mse return uncomment these
        #y_pred = clf.predict(df_test[x_columns])
        #mse = mean_squared_error(y_pred, df_test[y_column])
        #total_mse.append(mse)

    #print total_mse
    # flattening and converting to set.

    # print bag_idx
    bag_idx = set(sum(bag_idx, []))
    # print len(bag_idx)
    out_bag_idx = list(set(df.index.tolist()) - bag_idx)
    print 'Size of OOB', len(out_bag_idx)
    df_test = df.loc[out_bag_idx]

    y_preds_ensemble = []
    ensembles = np.array(ensembles)

    for row in df_test.iloc[:, 0:-1].values:
        distances = np.array(
            [distance(row, centroid) for centroid in centroids])
        # model_index = np.argmin(distances)
        #todo: optional use the average of the 2 min distances ka prediction.
        #todo: parameter.

        model_ixs = distances.argsort()[:n_votes]

        models = ensembles[model_ixs]

        # mean of all predictions.
        y_pred = np.mean([mdl.predict(row)[0] for mdl in models])

        # y_pred = ensembles[model_index].predict(row)[0]

        y_preds_ensemble.append(y_pred)

    rmse_ensemble = np.sqrt(
        mean_squared_error(y_preds_ensemble, df_test.iloc[:, -1]))
    return (rmse_ensemble),