Example #1
def pls_learning(num_components, num_vars):
    print(1)

    data_full = pd.read_csv('mango_flouro_rows.csv')
    data_full = data_full[data_full['position'] == 'pos 2']
    y_name = 'Total Chlorophyll (ug/ml)'
    # y_name = 'Chlorophyll b (ug/ml)'
    y_data = data_full[y_name]

    x_data = data_full[x_data_columns]

    x_data = get_best_pls_variables(x_data, y_data, num_components, num_vars)

    cv = GroupShuffleSplit(n_splits=100, test_size=0.3, random_state=0)
    group_splitter = data_full['Leaf number']
    estimator = PLS(num_components)
    title = "Learning curve {} components".format(num_components)

    plot_learning_curve(estimator,
                        title,
                        x_data,
                        y_data,
                        cv=cv,
                        group=group_splitter)
    plt.show()
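
pls_learning relies on several module-level names that the excerpt does not show (pandas as pd, matplotlib.pyplot as plt, PLSRegression imported as PLS, GroupShuffleSplit, the x_data_columns list, and the plot_learning_curve / get_best_pls_variables helpers shown in later examples). A minimal sketch of the assumed setup; the 'nm' column filter is borrowed from a later example in this collection:

# Assumed module-level setup (a sketch, not the original file's code).
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression as PLS
from sklearn.model_selection import GroupShuffleSplit

data = pd.read_csv('mango_flouro_rows.csv')
# hypothetical: pick the spectral columns by their 'nm' suffix
x_data_columns = [c for c in data.columns if 'nm' in c]
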
Example #2
def get_all_regrs():
    regrs = {
        "Linear regression":
        linear_model.LinearRegression(),
        # "Perceptron": linear_model.Perceptron(),
        "Lars":
        linear_model.Lars(),
        "Lasso":
        linear_model.LassoCV(max_iter=5000),
        # "Passive Aggressive": linear_model.PassiveAggressiveRegressor(),
        "PLS":
        PLS(n_components=3),
        "Random Forest":
        ensemble.RandomForestRegressor(),
        "Gradient Boost":
        ensemble.GradientBoostingRegressor(),
        "Extra Trees":
        ensemble.ExtraTreesRegressor(max_depth=2),
        "Ada Boost":
        ensemble.AdaBoostRegressor(
            base_estimator=tree.DecisionTreeRegressor(max_depth=2),
            n_estimators=250),
        "Gaussian Process":
        gaussian_process.GaussianProcessRegressor(),
        # "Isotonic": isotonic.IsotonicRegression(),
        "Kernel Ridge":
        kernel_ridge.KernelRidge(),
        "Ridge CV":
        linear_model.RidgeCV(),
        # "Exp tranform": TransformedTargetRegressor(regressor=PLS(n_components=3),
        #                                            func=np.exp,
        #                                            inverse_func=np.log),
        # "Log tranform": TransformedTargetRegressor(regressor=PLS(n_components=3),
        #                                            func=np.log,
        #                                            inverse_func=np.exp),
        # "Inv tranform": TransformedTargetRegressor(regressor=PLS(n_components=3),
        #                                            func=invert,
        #                                            inverse_func=invert),
        # "Log regressor": linear_model.LogisticRegressionCV(),
        "ML Perceptron":
        neural_network.MLPRegressor(max_iter=50000, hidden_layer_sizes=(5, 5)),
        "Linear SVR":
        linear_svc,
        "RBF SVR":
        svm.SVR(kernel='rbf'),
        "Poly SVR":
        svm.SVR(kernel='poly'),
        # "Sigmoid SVR": svm.SVR(kernel='sigmoid'),
        "Bayesian Ridge":
        linear_model.BayesianRidge(),
        "Huber":
        linear_model.HuberRegressor(),
        # "Poisson": linear_model.PoissonRegressor(),
        "K-neighbors":
        neighbors.KNeighborsRegressor()
    }
    # "Radius Neighbors": neighbors.RadiusNeighborsRegressor()}
    return regrs
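
One way to use the returned dictionary is a small cross-validated benchmark loop. A sketch under two assumptions: stand-in data for X and y, and linear_svc (referenced inside the dictionary) being a module-level svm.LinearSVR instance:

# Hypothetical benchmark over every regressor in the dictionary.
import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score

linear_svc = svm.LinearSVR()   # assumed: get_all_regrs references this global
X = np.random.rand(60, 6)      # stand-in data for illustration
y = np.random.rand(60)

for name, regr in get_all_regrs().items():
    scores = cross_val_score(regr, X, y, scoring='neg_mean_absolute_error', cv=5)
    print("{}: {:.3f} +/- {:.3f}".format(name, scores.mean(), scores.std()))
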
Example #3
def get_best_pls_variables(x, y, num_pls_components, num_variables):
    x_scaled_np = StandardScaler().fit_transform(x)
    x_scaled = pd.DataFrame(x_scaled_np, columns=x.columns)

    pls = PLS(num_pls_components)
    pls.fit(x_scaled, y)
    sorted_coeff = np.argsort(np.abs(pls.coef_[:, 0]))
    sorted_coeff = np.flip(sorted_coeff)
    columns_to_keep = x.columns[sorted_coeff[:num_variables]]
    print(columns_to_keep)
    return x_scaled[columns_to_keep]
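
The function standardizes X first so the PLS coefficient magnitudes are comparable across variables, then keeps the num_variables columns with the largest absolute coefficients and returns their standardized versions. A hypothetical call, reusing the names from Example #1:

# Keep the 6 bands with the largest |PLS coefficient| for predicting chlorophyll.
x_reduced = get_best_pls_variables(x_data, y_data,
                                   num_pls_components=3, num_variables=6)
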
Example #4
def transform(X, factors, get_model, method, y=None):
    if method == "raw" or method is None:
        return X
    if not factors or factors == "full":
        factors = np.prod(X.shape[1:])
        if method == "lda":
            factors -= 1

    if not isinstance(method, str):
        raise RuntimeError("Please supply a method name (pca, lda, ica, cca, pls)")
    method = method.lower()

    if method == "pca":
        from sklearn.decomposition import PCA
        model = PCA(n_components=factors, whiten=True)
    elif method == "lda":
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
        model = LDA(n_components=factors)
    elif method == "ica":
        from sklearn.decomposition import FastICA as ICA
        model = ICA(n_components=factors)
    elif method == "cca":
        from sklearn.cross_decomposition import CCA
        model = CCA(n_components=factors)
    elif method == "pls":
        from sklearn.cross_decomposition import PLSRegression as PLS
        model = PLS(n_components=factors)
        if y is not None and str(y.dtype)[:3] not in ("flo", "int"):
            y = dummycode(y, get_translator=False)
    else:
        raise ValueError("Method {} unrecognized!".format(method))

    X = rtm(X)
    if method in ("lda", "cca", "pls"):
        if y is None:
            raise RuntimeError("y must be supplied for {}!".format(method))
        latent = model.fit_transform(X, y)
    else:
        if y is not None:
            warnings.warn("y supplied for {}. Ignoring!".format(method))
        latent = model.fit_transform(X)

    if isinstance(latent, tuple):
        latent = latent[0]
    if get_model:
        return latent, model
    else:
        return latent
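
A minimal sketch of calling the dispatcher with PCA. The rtm and dummycode helpers come from the same module and are not shown; for an already 2-D float array, rtm is assumed to be a plain reshape-to-matrix:

import numpy as np

def rtm(a):
    # assumed stand-in for the module's helper: flatten trailing dimensions
    return a.reshape(len(a), -1)

X = np.random.rand(100, 8)
latent, model = transform(X, factors=3, get_model=True, method="pca")
print(latent.shape)  # -> (100, 3)
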
Example #5
def plot_param_learning_curves():
    x_data, _y, full_data = data_get.get_data('as7262 mango', average=False)
    pls = PLS(n_components=6)
    print(full_data.columns)
    currents = full_data['LED current'].unique()
    times = full_data['integration time'].unique()
    print(currents, times)
    print(full_data['saturation check'].unique())
    figure, axes = plt.subplots(len(currents),
                                len(times),
                                figsize=(9, 12),
                                constrained_layout=True)

    figure.suptitle("Parameter scan of new AS7262 Mango data")
    # figure.suptitle("Gradient Boosting Regressor fit\nAS7262 Betel data")
    # axes_ = [axes[0][0], axes[0][1], axes[0][2], axes[0][3],
    #          axes[1][0], axes[1][1], axes[1][2], axes[1][3],
    #          axes[2][0], axes[2][1], axes[2][2], axes[2][3],
    #          axes[3][0], axes[3][1], axes[3][2], axes[3][3],
    #          axes[4][0], axes[4][1], axes[4][2], axes[4][3],]

    for current_i, current in enumerate(currents):
        for time_i, time in enumerate(times):
            X, Y = data_get.get_data("as7262 mango",
                                     integration_time=time,
                                     led_current=current,
                                     return_type="XY")

            X = StandardScaler().fit_transform(X)
            X = PolynomialFeatures().fit_transform(X)

            Y = Y['Total Chlorophyll (µg/mg)']
            title = str(time * 2.8) + " ms " + current
            print(title)

            plot_learning_curve(pls,
                                title,
                                X,
                                Y,
                                cv=cv,
                                ax=axes[current_i][time_i],
                                ylim=[-0.3, -.1])

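
This function also leans on unshown module-level names (data_get, plot_learning_curve, and a cv splitter). Judging from Example #1, cv is plausibly a grouped shuffle split; that assumption as code:

# Assumed module-level splitter, mirroring Example #1 (not confirmed by this snippet).
from sklearn.model_selection import GroupShuffleSplit
cv = GroupShuffleSplit(n_splits=100, test_size=0.3, random_state=0)
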
Example #6
    def fit(self, data, resp):
        dof = data.shape[1]
        n = data.shape[0]
        ortho_comp = dof - self.comp

        W = np.ndarray(shape=(dof, ortho_comp))
        P = np.ndarray(shape=(dof, ortho_comp))
        T = np.ndarray(shape=(n, ortho_comp))

        # Start with an initial weight vector: the response projected onto the data
        w = np.transpose(np.matmul(np.transpose(resp), data) / lin.norm(resp))

        for i in range(ortho_comp):
            t = np.matmul(data, w) / np.matmul(np.transpose(w),
                                               w)  # get pls scores
            p = np.transpose(
                np.matmul(np.transpose(t), data) /
                np.matmul(np.transpose(t), t))  # pls loadings

            ## Get Orthogonal Components
            w_ortho = p - ((np.matmul(np.transpose(w), p) /
                            np.matmul(np.transpose(w), w)) * w)
            w_ortho = w_ortho / lin.norm(w_ortho)
            t_ortho = np.matmul(data, w_ortho) / np.matmul(
                np.transpose(w_ortho), w_ortho)
            p_ortho = np.transpose(
                np.matmul(np.transpose(t_ortho), data) /
                np.matmul(np.transpose(t_ortho), t_ortho))

            data = data - np.matmul(t_ortho, np.transpose(p_ortho))
            W[:, i] = np.reshape(w_ortho, (dof, ))
            P[:, i] = np.reshape(p_ortho, (dof, ))
            T[:, i] = np.reshape(t_ortho, (n, ))

        self.data_p = data
        self.data_o = np.matmul(T, np.transpose(P))
        self.W_o = W
        self.T_o = T
        self.P_o = P

        ## Build PLS Regression from data.P
        pls = PLS(n_components=self.comp).fit(self.data_p, resp)
        self.analysis = pls
        self.rotated_data = pls.transform(self.data_p)

        return self
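
This fit method is an orthogonal-projection (OPLS-style) filter: each pass builds a weight vector w_ortho that is orthogonal to the predictive direction w, strips the corresponding variation from the data (data - t_ortho p_ortho^T), and finally fits an ordinary PLS on the filtered matrix self.data_p. The orthogonalization step in isolation, as a runnable sketch:

# Standalone sketch of the orthogonalization used above: remove from p the
# component parallel to w, leaving a unit vector orthogonal to w.
import numpy as np
from numpy import linalg as lin

rng = np.random.default_rng(0)
w = rng.normal(size=(5, 1))
p = rng.normal(size=(5, 1))
w_ortho = p - (w.T @ p) / (w.T @ w) * w
w_ortho = w_ortho / lin.norm(w_ortho)
print((w.T @ w_ortho).item())  # ~0.0: orthogonal by construction
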
Example #7
def _pls_regression_train(table, feature_cols, label_cols, n_components=2, scale=True, max_iter=500, tol=1e-6):
    pls_model = PLS(n_components=n_components, scale=scale, max_iter=max_iter, tol=tol)
    _, features = check_col_type(table, feature_cols)
    _, labels = check_col_type(table, label_cols)
    pls_model.fit(features, labels)
    predict = pls_model.predict(features)
    _mean_absolute_error = mean_absolute_error(labels, predict)
    _mean_squared_error = mean_squared_error(labels, predict)
    _r2_score = r2_score(labels, predict)
    result_table = pd.DataFrame({
        'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
        'Score': [_mean_absolute_error, _mean_squared_error, _r2_score]
    })
    label_name = {
        'n_components': 'Number of components',
        'scale': "Scale",
        'max_iter': 'Max iteration',
        'tol': 'Tolerance'
    }
    get_param = pls_model.get_params()
    param_table = pd.DataFrame({
        'Parameter': list(label_name.values()),
        'Value': [get_param[x] for x in label_name]
    })
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ### PLS Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table), list_parameters=pandasDF2MD(param_table)
               )))
    model = _model_dict('pls_regression_model')
    model['feature_cols'] = feature_cols
    model['label'] = label_cols
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['max_iter'] = max_iter
    model['tol'] = tol
    model['pls_model'] = pls_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
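
A hypothetical call of the trainer above. The Brightics-style helpers it uses (check_col_type, BrtcReprBuilder, pandasDF2MD, _model_dict) are assumed to come from the surrounding package; the column names here are made up for illustration:

import pandas as pd

table = pd.DataFrame({'x1': [1.0, 2.0, 3.0, 4.0],
                      'x2': [0.5, 0.1, 0.9, 0.4],
                      'y':  [1.1, 2.2, 2.9, 4.1]})
out = _pls_regression_train(table, feature_cols=['x1', 'x2'],
                            label_cols=['y'], n_components=2)
print(out['model']['r2_score'])
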
Example #8
res_data = res_data[y_name]
x_data_column = data["550 nm"]


def model(x, a, b, c):
    return a * np.exp(-b * x) + c


fit_values, _ = curve_fit(model, x_data_column, y_data, maxfev=10**6)

y_fit_model = model(x_data_column, *fit_values)

res_model = y_data - y_fit_model

# compute residuals from PLS
pls = PLS(n_components=3)
pls_model = pls.fit(x_data, y_data)
y_fit_pls = pls.predict(x_data)
print(y_fit_pls)
print(y_data)
print(type(y_data), type(y_fit_pls), type(y_fit_model))
print(y_data.to_numpy())
print(y_fit_pls.T[0])
res_pls = y_data - y_fit_pls.T[0]

print(res_model)
print(type(res_model))
print(type(res_pls))
plt.hist(res_model,
         alpha=0.5,
Example #9
crossval = KFold(n_splits=5)
cv_2 = crossval.split(data_norm, resp)

cv_accuracy = np.ndarray(5)
models = list()
vip = np.ndarray((5, 19))
for i, (train, test) in enumerate(cv_2):
    train_data = data_norm[train, :]
    train_resp = resp[train, :]
    test_data = data_norm[test, :]
    test_resp = resp[test, :]

    models.append(PLS().fit(train_data, train_resp))
    pls_comps = models[i].transform(train_data)

    testing = models[i].predict(test_data)

    cv_accuracy[i] = np.sqrt(np.mean((testing - test_resp) ** 2))  # per-fold RMSE
    #print(testing)
    #print(test_resp)
    #print(cv_accuracy[i])

    vip[i, :] = getVIP(models[i])
    #print(vip)
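
getVIP is referenced but not defined in this excerpt. For a fitted sklearn PLSRegression, the standard variable-importance-in-projection formula can be sketched as follows (a common formulation, not necessarily the author's):

# Sketch of a VIP helper for a fitted PLSRegression.
import numpy as np

def getVIP(pls):
    t = pls.x_scores_    # (n_samples, n_components)
    w = pls.x_weights_   # (n_features, n_components)
    q = pls.y_loadings_  # (n_targets, n_components)
    p, a = w.shape
    # y-variance explained per component
    ss = np.sum(q ** 2, axis=0) * np.sum(t ** 2, axis=0)
    weights = (w / np.linalg.norm(w, axis=0)) ** 2
    return np.sqrt(p * (weights @ ss) / ss.sum())
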
Example #10
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)
print(data_columns)
# x_data = data[data_columns]

y_columns = [
    'Total Chlorophyll (ug/ml)', 'Chlorophyll a (ug/ml)',
    'Chlorophyll b (ug/ml)'
]

invert_y = False
# x_data = np.log(x_data)
conditions = "Partial Least Squared"

modeler = PLS(n_components=1)
modeler_name = "Partial Least Squared"

modeler = TransformedTargetRegressor(regressor=PLS(n_components=3),
                                     func=np.reciprocal,
                                     inverse_func=np.reciprocal)
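
np.reciprocal works as both func and inverse_func because the reciprocal is its own inverse (1/(1/x) = x), so one ufunc maps the target into the transformed space and back:

# Self-inverse check for the transform used above.
import numpy as np
x = np.array([0.5, 2.0, 4.0])
assert np.allclose(np.reciprocal(np.reciprocal(x)), x)
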


def plot_learning_curve(estimator,
                        axis,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        group=None,
                        scorer=make_scorer(mean_absolute_error),
Example #11
                pc1 = tmp[:,i]
                pc2 = tmp[:,j]
                plt.scatter(pc1, pc2)
            plt.xlabel("PLS Component "+str(i+1))
            plt.ylabel("PLS Component "+str(j+1))
            
    plt.show()
    


##################### MAIN CODE #####################
#### Load data into numpy array
# Keep pandas just for convenience right now
data = load_data.loadDataPandas('../data/SCLC_study_output_filtered_2.csv')
d = data.to_numpy()
var_index = data.columns.values.tolist()

# vector of class responses associated with data
resp = load_data.getResponseMatrix2D()

#### Create object to normalize and un-normalize data
norm_trans = pre.StandardScaler().fit(d)
data_norm = norm_trans.transform(d)
#data_norm, norm_trans = pre.mean_center(d) 
#In-built preprocessing method - TBD

#### Fit a Partial Least Squares model
pls = PLS().fit(data_norm, resp)
pls_trans = pls.transform(data_norm)

plotProjectionScatterMultiClass(pls_trans, resp, 2)
#                "Ridge alpha 0.01", "Ridge alpha 0.001",
#                "Ridge alpha 0.0001", "Ridge alpha 0.00001",
#                "Lars", "Guassian Regression",
#                "Gradient Boosting",
#                "SVR", "LinearSVR", "NuSVR",
#                "LogisticRegression", "LinearRegression", "SGDRegressor",
#                "ElasticNet", "ARDRegression", "BayesianRidge",
#                "HuberRegressor", "RANSACRegressor", "TheilSenRegressor",
#                "PassiveAggressiveRegressor",
#                "AdaBoostRegressor", "BaggingRegressor", "GradientBoostingRegressor",
#                "RandomForestRegressor", "ExtraTreesRegressor",
#                "Kernel Ridge"
#                ]

models_to_test = [
    PLS(n_components=1),
    PLS(n_components=2),
    PLS(n_components=3),
    PLS(n_components=4),
    Lasso(alpha=5),
    Lasso(alpha=2),
    Lasso(alpha=1),
    Lasso(alpha=0.2),
    LassoLars(alpha=1),
    LassoLars(alpha=0.1),
    LassoLars(alpha=0.01),
    LassoLars(alpha=0.001),
    LassoLars(alpha=0.0003),
    Ridge(alpha=0.01, max_iter=5000),
    Ridge(alpha=0.001, max_iter=5000),
    Ridge(alpha=0.0001, max_iter=5000),