def _bayesianridge(*, train, test, x_predict=None, metrics, n_iter=300, tol=0.001, 
        alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, alpha_init=None, 
        lambda_init=None, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, 
        verbose=False):
    """For more info visit : 
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.BayesianRidge.html#sklearn.linear_model.BayesianRidge
    """

    model = BayesianRidge(n_iter=n_iter, tol=tol, alpha_1=alpha_1, alpha_2=alpha_2, lambda_1=lambda_1,
        lambda_2=lambda_2, alpha_init=alpha_init, lambda_init=lambda_init, compute_score=compute_score,
        fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, verbose=verbose)
    model.fit(train[0], train[1])
    model_name = 'Bayesian Ridge'
    y_hat = model.predict(test[0])

    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    elif metrics == 'mae':
        accuracy = _mae(test[1], y_hat)
    else:
        raise ValueError("metrics must be one of 'mse', 'rmse' or 'mae'")

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
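
A minimal usage sketch for _bayesianridge (assumes the metric helpers _mse/_rmse/_mae and the BayesianRidge import used above are in scope; train and test are (X, y) tuples and metrics is one of 'mse', 'rmse', 'mae'):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(100)

train, test = (X[:80], y[:80]), (X[80:], y[80:])
name, score, preds = _bayesianridge(train=train, test=test,
                                    x_predict=X[80:], metrics='rmse')
print(name, score, preds[:3])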
    
Example #2
def Basyen_stacking(oof_lgb,oof_xgb,predictions_lgb,predictions_xgb,sub,target):
    res = sub.copy()
    # stack the lgb and xgb results
    train_stack = np.vstack([oof_lgb,oof_xgb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack,target)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target[trn_idx]
        val_data, val_y = train_stack[val_idx], target[val_idx]
        
        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)
        
        oof_stack[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 10
    
    cross_validation_loss = mean_squared_error(target, oof_stack)
    print(cross_validation_loss)

    res[1] = predictions
    mean = res[1].mean()
    print('mean:',mean)
    res.to_csv("./Basyen_stacking.csv",index=False,header = None)
    return cross_validation_loss
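
A self-contained sketch of the same out-of-fold stacking idea on synthetic data (the oof/prediction arrays are hypothetical stand-ins for the first-level lgb/xgb outputs); RepeatedKFold with 5 splits and 2 repeats gives 10 folds, which is why each fold's test prediction is divided by 10:

import numpy as np
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import RepeatedKFold

rng = np.random.RandomState(0)
target = rng.rand(200)
# stand-ins for the first-level out-of-fold and test-set predictions
oof_lgb = target + 0.05 * rng.randn(200)
oof_xgb = target + 0.10 * rng.randn(200)
predictions_lgb, predictions_xgb = rng.rand(50), rng.rand(50)

train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for trn_idx, val_idx in folds_stack.split(train_stack, target):
    clf = BayesianRidge()
    clf.fit(train_stack[trn_idx], target[trn_idx])
    oof_stack[val_idx] = clf.predict(train_stack[val_idx])
    predictions += clf.predict(test_stack) / 10  # 5 splits * 2 repeats = 10 folds

print(np.mean((target - oof_stack) ** 2))  # stacker's cross-validation MSE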
def fillna_knn_reg(df, base, target, fraction=1, threshold=10, n_neighbors=5):
    assert ((isinstance(base, list) or isinstance(base, np.ndarray))
            and isinstance(target, str))
    whole = [target] + base
    print(threshold, "\n", fraction, "\n", n_neighbors)
    miss = df[target].isnull()
    notmiss = ~miss

    X_target = df.loc[notmiss, whole]
    Y = X_target[target]
    X = X_target[base]
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=5)

    print('fitting')

    clf = BayesianRidge()
    clf.fit(X, Y)
    print('predicting')

    print("Fit a model X_test and claculate Mean Squared Error with Y_test:")
    print(np.mean((Y_test - clf.predict(X_test))**2))

    Z = clf.predict(df.loc[miss, base])
    print('writing result to df')
    df.loc[miss, target] = Z
Example #4
    def train_BayesianRidge(self, data):
        train, validacion = data
        x_tr, y_tr = train
        x_val, y_val = validacion
        #print("El set de train tiene {} filas y {} columnas".format(x_tr.shape[0],x_tr.shape[1]))
        #print("El set de validacion tiene {} filas y {} columnas".format(x_val.shape[0],x_val.shape[1]))

        print('Start training BayesianRidge...')
        start_time = self.timer()

        bayesr = BayesianRidge(normalize=True, n_iter=1000)
        bayesr.fit(x_tr, y_tr)
        print("The R2 is: {}".format(bayesr.score(x_tr, y_tr)))
        #print("The estimated alpha_ (noise precision) is: {}".format(bayesr.alpha_))
        self.timer(start_time)

        print("Making prediction on validation data")
        y_val = np.expm1(y_val)
        y_val_pred = np.expm1(bayesr.predict(x_val))
        mae = mean_absolute_error(y_val, y_val_pred)
        print("El mean absolute error de es {}".format(mae))

        print('Saving model into a pickle')
        os.makedirs('pickles', exist_ok=True)

        with open('pickles/bayesrCV.pkl', 'wb') as f:
            pickle.dump(bayesr, f)

        print('Making prediction and saving into a csv')
        y_test = bayesr.predict(self.x_test)

        return y_test
Example #5
def modelResultMerge(predictions_lgb, predictions_xgb, train_lgb, train_xgb,
                     target):
    train_stack = np.vstack([train_lgb, train_xgb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2012)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx,
                val_idx) in enumerate(folds_stack.split(train_stack, target)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)

        #     clf_3 = linear_model.LinearRegression()
        #     clf_3.fit(trn_data, trn_y)

        oof_stack[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 10

    print(mean_squared_error(target.values, oof_stack))
    return predictions
Example #6
def stacking_predict(oof_lgb,
                     oof_xgb,
                     predictions_lgb,
                     predictions_xgb,
                     y_train,
                     verbose_eval=1):
    # stacking
    train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])
    stack_models = []

    for fold_, (trn_idx,
                val_idx) in enumerate(folds_stack.split(train_stack, y_train)):
        if verbose_eval:
            print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
        val_data, val_y = train_stack[val_idx], y_train[val_idx]

        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)
        stack_models.append(clf_3)

        oof_stack[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 10

    final_score = mean_squared_error(y_train, oof_stack)
    if verbose_eval:
        print(final_score)
    return oof_stack, predictions, final_score, stack_models
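
The stack_models returned above could then be applied to new first-level predictions by averaging the ten fitted stackers; a small sketch (the new_oof_* arrays are hypothetical):

import numpy as np

# hypothetical first-level predictions for three unseen rows
new_oof_lgb = np.array([0.2, 0.5, 0.9])
new_oof_xgb = np.array([0.25, 0.45, 0.95])
new_stack = np.vstack([new_oof_lgb, new_oof_xgb]).transpose()

# average the predictions of the ten second-level models returned by stacking_predict
new_pred = np.mean([m.predict(new_stack) for m in stack_models], axis=0)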
Example #7
class MixedRegressor(skl.base.BaseEstimator, skl.base.TransformerMixin):
    """docstring"""
    def __init__(self, save_path=None):
        super(MixedRegressor, self).__init__()
        self.save_path = save_path
        self.regressor = None
        self.regressorlt40 = None
        self.regressorgt60 = None

    def fit(self, X, y, sample_weight=None):
        X, y = check_X_y(X, y)

        self.regressor = BayesianRidge()
        self.regressorlt40 = BayesianRidge()
        self.regressorgt60 = BayesianRidge()

        self.regressor.fit(X, y)

        lt40 = y < 40
        gt60 = y > 60

        Xlt40 = X[lt40]
        Ylt40 = y[lt40]

        Xgt60 = X[gt60]
        Ygt60 = y[gt60]

        self.regressorlt40.fit(Xlt40, Ylt40)
        self.regressorgt60.fit(Xgt60, Ygt60)

        return self

    def predict(self, X):
        check_is_fitted(self, ["regressor", "regressorlt40", "regressorgt60"])
        X = check_array(X)
        predictions = self.regressor.predict(X)

        lt18 = predictions < 18
        gt88 = predictions > 88

        if (len(predictions[lt18]) > 0):
            predlt18 = self.regressorlt40.predict(X[lt18])
            predictions[lt18] = predlt18

        if (len(predictions[gt88]) > 0):
            predgt88 = self.regressorgt60.predict(X[gt88])
            predictions[gt88] = predgt88
        return predictions

    def score(self, X, y, sample_weight=None):
        scores = -(self.predict(X) - y)**2 / len(y)
        score = np.sum(scores)

        return score

    def set_save_path(self, save_path):
        self.save_path = save_path
Example #8
def make_forward_model(data_ss, RDKit_FPs):
    # forward model library from scikit-learn
    from sklearn.linear_model import BayesianRidge
    # xenonpy library for data splitting (cross-validation)
    from xenonpy.datatools import Splitter

    # property name will be used as a reference for calling models
    prop = ['E', 'HOMO-LUMO gap']

    # prepare indices for cross-validation data sets
    sp = Splitter(data_ss.shape[0], test_size=0, cv=5)

    # initialize output variables
    y_trues, y_preds = [[] for i in range(len(prop))], [[] for i in range(len(prop))]
    y_trues_fit, y_preds_fit = [[] for i in range(len(prop))], [[] for i in range(len(prop))]
    y_preds_std, y_preds_std_fit = [[] for i in range(len(prop))], [[] for i in range(len(prop))]

    # cross-validation test
    for iTr, iTe in sp.cv():
        x_train = data_ss['SMILES'].iloc[iTr]
        x_test = data_ss['SMILES'].iloc[iTe]

        fps_train = RDKit_FPs.transform(x_train)
        fps_test = RDKit_FPs.transform(x_test)

        y_train = data_ss[prop].iloc[iTr]
        y_test = data_ss[prop].iloc[iTe]
        for i in range(len(prop)):
            mdl = BayesianRidge(compute_score=True)
            mdl.fit(fps_train, y_train.iloc[:, i])
            prd_train, std_train = mdl.predict(fps_train, return_std=True)
            prd_test, std_test = mdl.predict(fps_test, return_std=True)

            y_trues[i].append(y_test.iloc[:, i].values)
            y_trues_fit[i].append(y_train.iloc[:, i].values)
            y_preds[i].append(prd_test)
            y_preds_fit[i].append(prd_train)
            y_preds_std[i].append(std_test)
            y_preds_std_fit[i].append(std_train)

    # write down list of property name(s) for forward models
    prop = ['E', 'HOMO-LUMO gap']  # match with data table for convenience

    # calculate descriptor values for all SMILES in the data subset
    fps_train = RDKit_FPs.transform(data_ss['SMILES'])

    # initialize a dictionary for model storage
    mdls = {}

    # fill in and train the models
    for x in prop:
        mdls[x] = BayesianRidge()
        mdls[x].fit(fps_train, data_ss[x])

    # import descriptor calculator and forward model to iQSPR
    prd_mdls = BayesianRidgeEstimator(descriptor=RDKit_FPs, **mdls)
    return prd_mdls, mdls
Example #9
    def do_cv_pred(train, test, files, use_cols=10, verbose=False):
        print("------- do preds --------")
        ensemble_col = [
            f[1] for i, f in enumerate(files) if (i % 20) <= use_cols
        ]
        if use_cols == 2:
            print(ensemble_col)
        train_x = train[ensemble_col]
        test_x = test[ensemble_col]
        train_y = train["target"]

        submission = pd.DataFrame()
        submission["card_id"] = test["card_id"]
        submission["target"] = 0

        outliers = (train["target"] < -30).astype(int).values
        split_num = 5
        skf = model_selection.StratifiedKFold(n_splits=split_num,
                                              shuffle=True,
                                              random_state=4590)
        train_preds = []
        for idx, (train_index,
                  test_index) in enumerate(skf.split(train, outliers)):
            X_train, X_test = train_x.iloc[train_index], train_x.iloc[
                test_index]
            y_train, y_test = train_y.iloc[train_index], train_y.iloc[
                test_index]

            reg = BayesianRidge().fit(X_train, y_train)
            valid_set_pred = reg.predict(X_test)
            score = evaluator.rmse(y_test, valid_set_pred)
            if verbose:
                print(reg.coef_)
                print(score)

            y_pred = reg.predict(test_x)
            submission["target"] = submission["target"] + y_pred
            train_id = train.iloc[test_index]
            train_cv_prediction = pd.DataFrame()
            train_cv_prediction["card_id"] = train_id["card_id"]
            train_cv_prediction["cv_pred"] = valid_set_pred
            train_preds.append(train_cv_prediction)

        train_output = pd.concat(train_preds, axis=0)

        submission["target"] = submission["target"] / split_num
        submission.to_csv(path_const.OUTPUT_SUB, index=False)

        train_output["cv_pred"] = np.clip(train_output["cv_pred"], -33.219281,
                                          18.0)
        train_output.to_csv(path_const.OUTPUT_OOF, index=False)

        df_pred = pd.merge(train[["card_id", "target"]],
                           train_output,
                           on="card_id")
        rmse_score = evaluator.rmse(df_pred["target"], df_pred["cv_pred"])
        print(rmse_score)
Example #10
def time_period_model_predict(train, test):

    train_sr, test_sr = data_preprocessing(train, test)

    train_time = train_sr.copy()
    test_time = test_sr.copy()

    X_train, X_test, y_train = time_model.feature_engineering(
        train_time, test_time)

    predictions_time, oof_stack_time = time_model.model_predict(
        X_train, X_test, y_train)

    #train_sr,test_sr = data_preprocessing(train,test)
    train_period = train_sr.copy()
    test_period = test_sr.copy()

    X_train, X_test, y_train = period_model.feature_engineering(
        train_period, test_period)

    predictions_period, oof_stack_period = period_model.model_predict(
        X_train, X_test, y_train)

    y_train = train_sr['收率'].values
    # stack the time-point (time) and time-period (period) results
    train_stack = np.vstack(
        [np.round(oof_stack_time, 3),
         np.round(oof_stack_period, 3)]).transpose()
    test_stack = np.vstack(
        [np.round(predictions_time, 3),
         np.round(predictions_period, 3)]).transpose()

    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx,
                val_idx) in enumerate(folds_stack.split(train_stack, y_train)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
        val_data, val_y = train_stack[val_idx], y_train[val_idx]

        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)

        oof_stack[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 10

    print("time_period_stacking score: {:<8.8f}".format(
        mean_squared_error(y_train, oof_stack) / 2))

    return predictions
Example #11
    def get_stacking(self, oof_list, prediction_list, labels):
        '''
        :param oof_list: out-of-fold predictions
        :param prediction_list: test predictions
        :param labels: true labels of the training data set
        :return: stacking oof predictions of the training set and the testing set
        '''
        train_stack = np.vstack(oof_list).transpose()  # stack the oof columns vertically
        test_stack = np.vstack(prediction_list).transpose()

        repeats = len(oof_list)  # repeat the CV once per first-level model (could also be chosen freely)
        kfolder = RepeatedKFold(n_splits=self.n_fold,
                                n_repeats=repeats,
                                random_state=4590)
        kfold = kfolder.split(train_stack, labels)  # the stacking models are cross-validated as well
        preds_list = list()  # predictions of the testing data's labels
        stacking_oof = np.zeros(
            train_stack.shape[0]
        )  # predictions of the oof training data's labels

        for train_index, vali_index in kfold:
            k_x_train = train_stack[train_index]
            k_y_train = labels.loc[train_index]
            k_x_vali = train_stack[vali_index]

            assert self.stack_model in ['Ridge', 'Huber']
            if self.stack_model == 'Ridge':
                stacking_model = BayesianRidge()  # BayesianRidge
            if self.stack_model == 'Huber':
                stacking_model = HuberRegressor()
            stacking_model.fit(k_x_train, k_y_train)

            k_pred = stacking_model.predict(k_x_vali)
            stacking_oof[vali_index] = k_pred

            preds = stacking_model.predict(test_stack)
            preds_list.append(preds)

        fold_mae_error = mean_absolute_error(labels, stacking_oof)
        print(f'stacking fold mae training error is {fold_mae_error}')
        fold_score = 1 / (1 + fold_mae_error)
        print(f'fold score is {fold_score}')

        preds_columns = [
            'preds_{id}'.format(id=i) for i in range(self.n_fold * repeats)
        ]
        preds_df = pd.DataFrame(data=preds_list)
        preds_df = preds_df.T
        preds_df.columns = preds_columns
        stacking_prediction = list(preds_df.mean(axis=1))

        return stacking_oof, stacking_prediction
Example #12
    def runBayesianRidgeRegressor(self):
        lm = BayesianRidge(n_iter=300,
                           compute_score=True,
                           fit_intercept=True,
                           normalize=True)

        print("Ridge Regression")
        lm.fit(self.m_X_train, self.m_y_train)
        predictY = lm.predict(self.m_X_test)
        score = lm.score(self.m_X_test, self.m_y_test)
        predictTraingY = lm.predict(self.m_X_train)

        self.displayPredictPlot(predictY)
        self.displayResidualPlot(predictY, predictTraingY)
        self.dispalyModelResult(lm, predictY, score)
Example #13
def update_model(nr_exp, budget):
    global x_unlabeled, y_unlabeled, x_labeled, y_labeled, x_test, y_test, clf, server_buffer, errorHistoryUS

    if clf is None:
        clf = BayesianRidge()
        clf.fit(x_labeled, y_labeled)
        np.save('../outputs/model_size_' + str(x_unlabeled.shape[1]) + '.npy',
                np.int32(asizeof.asizeof(pickle.dumps(clf))))
        print(np.int32(asizeof.asizeof(pickle.dumps(clf))))
    host = 'localhost'
    port = 33333
    my_socket = -1
    while my_socket == -1:
        my_socket = utilities.create_socket('normal', host, port)

    print("Connection to edge for model sharing established")
    my_socket.send(pickle.dumps(clf))
    myfile = "../outputs/" + str(nr_exp) + "_budget_" + str(budget) + ".txt"
    with open(myfile, "w") as f:
        f.write("len_buffer_server\n")
    while True:
        time.sleep(budget)
        len_buffer = len(server_buffer)
        if len_buffer > 0:
            buffer_data = server_buffer[:len_buffer]
            del server_buffer[:len_buffer]
            x = [bd[1] for bd in buffer_data]
            idx = [bd[0] for bd in buffer_data]
            x = np.array(x)
            x = x.reshape((x.shape[0], x.shape[2]))
            _, std = clf.predict(x, return_std=True)
            most_uncertain_idx = np.argmax(std)
            x_labeled = np.append(x_labeled,
                                  x[most_uncertain_idx:most_uncertain_idx + 1],
                                  axis=0)
            y_labeled = np.append(y_labeled,
                                  y_unlabeled[idx[most_uncertain_idx]])
            clf = BayesianRidge()
            clf.fit(x_labeled, y_labeled)
            p = clf.predict(x_test)
            errorHistoryUS.append(
                np.sqrt(mean_squared_error(y_test.flatten(), p.flatten())))
            with open(myfile, "a") as f:
                f.write(str(len_buffer) + '\n')
        try:
            my_socket.send(pickle.dumps(clf))
        except ConnectionResetError:
            break
def stacking(train_df, test_df, save=True, verbose=True):
    folds = KFold(n_splits=11, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['target', 'card_id']]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['target'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]

        # clf = LinearRegression(n_jobs=-1)
        clf = BayesianRidge()
        # clf = Ridge()
        # clf = Lasso()
        # clf = ElasticNet()
        # clf = SGDRegressor()
        # clf = HuberRegressor()
        clf.fit(train_x.values, train_y.values)

        oof_preds[valid_idx] = clf.predict(valid_x.values)
        sub_preds += clf.predict(test_df[feats].values) / folds.n_splits

        if verbose:
            print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))

    score = rmse(train_df['target'], oof_preds)
    if verbose:
        print(f'ALL RMSE: {score}')

    if save:
        out_dir = ("../data/output/stacking")
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        with open(os.path.join(out_dir, f"params_{score:.5f}.txt"), "w") as fp:
            print(",".join(feats), file=fp)
        oof_df = train_df.copy().reset_index()
        oof_df['target'] = oof_preds
        oof_df[['card_id', 'target']].to_csv(os.path.join(out_dir, f"oof_{score:.5f}.csv"), index=False)
        submission = test_df.copy().reset_index()
        submission['target'] = sub_preds
        submission[['card_id', 'target']].to_csv(os.path.join(out_dir, f"stacking_{score:.5f}.csv"), index=False)

    return score
Example #15
class RegressionImputer(BaseEstimator, RegressorMixin):
    """Custom scikit-learn estimator for imputation with Bayesian regression"""
    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.regr_ = BayesianRidge()
        self.regr_.fit(X, y)

        return self

    def predict(self, X, y=None):
        try:
            getattr(self, "regr_")
        except AttributeError:
            raise RuntimeError("Imputer must be fitted before prediction.")

        # predict measures
        preds, stds = self.regr_.predict(X, return_std=True)

        return (pd.DataFrame({
            "pred_mean": preds,
            "pred_std": stds
        }).apply(lambda x: np.random.normal(x.pred_mean, x.pred_std),
                 axis=1).round())
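
A small usage sketch for RegressionImputer (assumes the class above and its imports, numpy, pandas and BayesianRidge, are in scope): fit on rows where the target is known, then draw imputations for the missing rows.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'x1': rng.rand(100), 'x2': rng.rand(100)})
df['y'] = 20 * df['x1'] + 5 * df['x2'] + rng.randn(100)
df.loc[df.sample(10, random_state=0).index, 'y'] = np.nan  # knock out a few targets

known = df['y'].notnull()
imputer = RegressionImputer().fit(df.loc[known, ['x1', 'x2']], df.loc[known, 'y'])
df.loc[~known, 'y'] = imputer.predict(df.loc[~known, ['x1', 'x2']]).values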
Example #16
 def fnBayesianRidge(self, year, avgTemp, predictYear):
     feature_train, feature_test, target_train, target_test = train_test_split(
         year, avgTemp, test_size=0.1, random_state=42)
     br = BayesianRidge(compute_score=True)
     br.fit(feature_train[:, np.newaxis], target_train)
     return (br.score(feature_test[:, np.newaxis],
                      target_test), br.predict(predictYear))
Example #17
 def bayes_ridge_reg(self):
     br = BayesianRidge()
     br.fit(self.x_data, self.y_data)
     adjusted_result = br.predict(self.x_data)
     print "bayes ridge params", br.coef_, br.intercept_
     print "bayes ridge accuracy", get_accuracy(adjusted_result, self.y_data)
     return map(int, list(adjusted_result))
 def bayes(self):
     clf = BayesianRidge(compute_score=True)
     clf.fit(self.X_train, self.y_train)
     y_pred = clf.predict(self.X_test)
     metrics_dict = {}
     set_metrics(y_pred, self.y_test, metrics_dict)
     return metrics_dict
def ridreg(df, test):
    clf = BayesianRidge()

    target = df['count']
    train = df[['time', 'temp']]
    test = test[['time', 'temp']]

    clf.fit(train, target)
    final = []
    print(test.head(3))
    for i, row in enumerate(test.values):
        y = []
        for x in row:
            y.append(float(x))
        final.append(y)
    predicted_probs = clf.predict(final)
    # print(predicted_probs.shape)
    # predicted_probs = pd.Series(predicted_probs)
    # predicted_probs = predicted_probs.map(lambda x: int(x))

    keep = pd.read_csv('data/test.csv')
    keep = keep['datetime']
    # #save to file
    predicted_probs = pd.DataFrame(predicted_probs)
    print(predicted_probs.head(3))
    predicted_probs.to_csv('data/submission3.csv', index=False)
Example #20
def impute_regression(data, target, n=1):
    """
    Perform multiple imputation by drawing from posterior distribution of Bayesian ridge regression model.
    
    Parameters
    ----------
    data : DataFrame
        Data to use for imputation.
    target : str
        Column to impute.
    n : int
        Number of multiple imputations to perform.
        
    Returns
    -------
    out : DataFrame
        One column per imputation, indices of original rows.
        
    """
    # fit regression model without nas
    y = data.dropna()[target]
    X = data.dropna().drop(target, axis=1)
    regr = BayesianRidge()
    regr.fit(X, y)

    # predict measures
    preds, stds = regr.predict(data[data[target].isnull()].drop(target,
                                                                axis=1),
                               return_std=True)

    # sample from distribution
    return samplePredictions_df(preds, stds, n=n, name="imputation")
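
samplePredictions_df is not shown in this snippet; a hypothetical sketch of such a helper, following the docstring (n columns, each a draw from Normal(pred, std)); aligning the result back to the original row index would be the caller's job here:

import numpy as np
import pandas as pd

def samplePredictions_df(preds, stds, n=1, name="imputation"):
    # hypothetical helper: one column per imputation, each value drawn from
    # the posterior predictive Normal(pred, std) of the Bayesian ridge model
    rng = np.random.default_rng()
    draws = {"{}_{}".format(name, i): rng.normal(preds, stds) for i in range(n)}
    return pd.DataFrame(draws)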
Example #21
    def train(self, X_train, y_train):
        # X_train is an ndarray, y_train is a pandas DataFrame
        folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
        oof_stack = np.zeros(X_train.shape[0])
        for fold_, (trn_idx,
                    val_idx) in enumerate(folds_stack.split(X_train, y_train)):
            print("fold {}".format(fold_))
            trn_data, trn_y = X_train[trn_idx], y_train.iloc[trn_idx].values
            val_data, val_y = X_train[val_idx], y_train.iloc[val_idx].values

            clf = BayesianRidge()
            clf.fit(trn_data, trn_y)

            oof_stack[val_idx] = clf.predict(val_data)
            # save this fold's model
            joblib.dump(clf, 'stack_model' + str(fold_) + '.pkl')

        print("stack score:{:<8.8f}".format(
            mean_squared_error(y_train.values, oof_stack) / 2))
        self.train_result = pd.DataFrame({
            'real':
            y_train.values,
            'pred':
            oof_stack,
            'error': (y_train.values - oof_stack)**2
        })
Example #22
class BayesianRidgeRegression(skl.base.BaseEstimator,
                              skl.base.TransformerMixin):
    def __init__(self, n_iter=300, save_path=None):
        super(BayesianRidgeRegression, self).__init__()
        self.save_path = save_path
        self.n_iter = n_iter
        self.model = None

    def fit(self, X, y):
        self.model = BayesianRidge(n_iter=self.n_iter, fit_intercept=True)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        X = check_array(X)
        prediction = self.model.predict(X)
        print("BayesianRidge predicted")
        return prediction

    def score(self, X, y, sample_weight=None):
        scores = (self.predict(X) - y)**2 / len(y)
        score = np.sum(scores)
        return -score

    def set_save_path(self, save_path):
        self.save_path = save_path
def test_save_load():
    clf = BayesianRidge(compute_score=True)
    X, y = get_traininig_data()
    clf.fit(X, y)
    y_hat = clf.predict(X)

    model = MyCustomModel(clf)
    conda = {
        "name": "test",
        "channels": ["defaults"],
        "dependencies": [{"pip": ["scipy", "sklearn"]}]
    }

    model_save_path = os.path.join(dirname(abspath(__file__)), "AzureMLModel")
    local_dependencies = [dirname(abspath(__file__))]

    save_generic_model(model, path=model_save_path, conda=conda, local_dependencies=local_dependencies)

    df = pd.DataFrame(data=X)
    df.columns = df.columns.astype(str)
    
    loaded_generic_model = load_generic_model(model_save_path)
    result_df = loaded_generic_model.predict(df)
    assert (result_df.to_numpy() == y_hat.reshape(-1, 1)).all()

    dfd_path = os.path.join(dirname((abspath(__file__))), "dfd")
    os.makedirs(dfd_path, exist_ok=True)
    data_save_path = os.path.join(dfd_path, "data.dataset.parquet")
    df.to_parquet(data_save_path, engine="pyarrow")
    meta_path = os.path.join(dfd_path, "_meta.yaml")
    with open(meta_path, "w") as fp:
        fp.write("type: DataFrameDirectory\nextension: {}\nformat: Parquet\ndata: data.dataset.parquet")
Example #25
def test_return_std():
    # Test return_std option for both Bayesian regressors
    def f(X):
        return np.dot(X, w) + b

    def f_noise(X, noise_mult):
        return f(X) + np.random.randn(X.shape[0]) * noise_mult

    d = 5
    n_train = 50
    n_test = 10

    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    b = 1.0

    X = np.random.random((n_train, d))
    X_test = np.random.random((n_test, d))

    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
        y = f_noise(X, noise_mult)

        m1 = BayesianRidge()
        m1.fit(X, y)
        y_mean1, y_std1 = m1.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std1, noise_mult, decimal=decimal)

        m2 = ARDRegression()
        m2.fit(X, y)
        y_mean2, y_std2 = m2.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std2, noise_mult, decimal=decimal)
Example #26
class BAYESIANRIDGE():
    """docstring for ClassName"""
    def __init__(self, BayesianRidge, N):
        self.cores_number = int(np.ceil(multiprocessing.cpu_count()/N))

        self.model = BayesianRidge(
                        alpha_1=1e-06, 
                        alpha_2=1e-06, 
                        compute_score=False, 
                        copy_X=True,
                        fit_intercept=True, 
                        lambda_1=1e-06, 
                        lambda_2=1e-06, 
                        n_iter=300,
                        normalize=False, 
                        tol=0.001, 
                        verbose=False)


        print("BayesianRidge Cores: ", np.nan)

    def fit(self, X_train, y_train, X_test, y_test, error_type = "MAE"):

        error_dict = {"MSE":"rmse", "R2":{"l1","l2"}, "MAE":"mae","LOGLOSS": "multi_logloss" }
        error_metric = error_dict[error_type]
        self.model.fit(X_train, y_train )

    def predict(self, X_test):
        prediction = self.model.predict(X_test)
        return prediction
Example #27
def BayesianRidgeRegression(data, label, pred_data, pred_last):
    '''
    Performs poorly in practice.
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    from sklearn.linear_model import BayesianRidge, LinearRegression
    clf = BayesianRidge(compute_score=True)
    clf.fit(data, label)
    print(clf.score(data, label))
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(clf.score(pred_data, pred_last))

    ols = LinearRegression()
    ols.fit(data, label)
    print(ols.score(data, label))
    pred_result = ols.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(ols.score(pred_data, pred_last))
    return pred_result
Example #28
def BayesianRidgeRegression(output, features, labels):
    X = features.values  #.reshape(-1,1) # features
    y = labels.values  #.reshape(-1,1) # labels
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    regressor = BayesianRidge()
    regressor.fit(X_train, y_train)  #training the algorithm
    y_pred = regressor.predict(X_test)  #predicting

    # visualiation of 10 vessels (predicted vs actual value)
    lr = np.round(
        pd.DataFrame({
            'Actual': y_test.flatten(),
            'Predicted': y_pred.flatten()
        }))
    lr = lr.head(10)
    lr.plot(kind='bar', figsize=(10, 6))
    plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
    plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
    plt.title(
        'Actual vs Predicted {0} (10 vessels), \n BayesianRidge Regression'.
        format(output),
        fontsize=12)
    plt.show()
    #evaluation metrics
    lrEvaluation = evaluation('BayesianRidge Regression {0}'.format(output),
                              y_pred, y_test)
    return (lrEvaluation, lr)
def Model_stack(df_train_x, df_train_y, df_test):
    # kernel can be 'linear'/'poly'/'rbf'/'sigmoid'/'precomputed' or a callable; defaults to 'rbf'; 'precomputed' expects a precomputed kernel matrix
    svr_ = SVR(kernel='linear', degree=3, coef0=0.0, tol=0.001,
               C=1.0, epsilon=0.1, shrinking=True, cache_size=20)
    lgb_ = lgb.LGBMModel(boosting_type='gbdt', num_leaves=35,
                         max_depth=20, max_bin=255, learning_rate=0.03, n_estimators=10, subsample_for_bin=2000,
                         objective='regression', min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20,
                         subsample=1.0, verbose=0, subsample_freq=1, colsample_bytree=1.0, reg_alpha=0.0,
                         reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True)
    RF_model = RandomForestRegressor(n_estimators=50, max_depth=25, min_samples_split=20, min_samples_leaf=10,
                                     max_features='sqrt', oob_score=True, random_state=10)
    # Bayesian ridge regression
    BR_model = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True, fit_intercept=True,
                             lambda_1=1e-06, lambda_2=1e-06, n_iter=300, normalize=False, tol=0.0000001, verbose=False)
    linear_model = LinearRegression()
    ls = Lasso(alpha=0.00375)
    x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y,
                                                        test_size=0.6)
    rg = RidgeCV(cv=5)
    stack = pd.DataFrame()
    stack_test = pd.DataFrame()

    ls.fit(x_train, y_train)
    lgb_.fit(x_train, y_train)
    RF_model.fit(x_train, y_train)
    svr_.fit(x_train, y_train)
    linear_model.fit(x_train, y_train)
    BR_model.fit(x_train, y_train)

    stack['lasso'] = ls.predict(x_test)
    stack['lightgbm'] = lgb_.predict(x_test)
    stack['rf'] = RF_model.predict(x_test)
    stack['svr'] = svr_.predict(x_test)
    stack['linear_model'] = linear_model.predict(x_test)
    stack['BR'] = BR_model.predict(x_test)
    # print('stacking_model: ',Cross_validation(stack, y_test, rg))

    rg.fit(stack, y_test)
    stack_test['lasso'] = ls.predict(df_test)
    stack_test['lightgbm'] = lgb_.predict(df_test)
    stack_test['rf'] = RF_model.predict(df_test)
    stack_test['svr'] = svr_.predict(df_test)
    stack_test['linear_model'] = linear_model.predict(df_test)
    stack_test['BR'] = BR_model.predict(df_test)

    final_ans = rg.predict(stack_test)
    pd.DataFrame(final_ans).to_csv('predict_drop+3.txt', index=False, header=False)
def get_opti_temp(crop_list=["Onions", "Tomatoes"]):
    clft = BayesianRidge()
    clfo = BayesianRidge()
    client = MongoClient()
    db = client.server_db
    cursor_tomato = db.corpusoptitemp.find({"Datatype": "Tomatoes"})
    cursor_onion = db.corpusoptitemp.find({"Datatype": "Onions"})
    xt = []
    yt = []
    xo = []
    yo = []
    for docs in cursor_tomato:
        item = []
        item.append(docs["Temperature"])
        item.append(docs["Humidity"])
        xt.append(item)
        yt.append(docs["Losses"])

    for docs in cursor_onion:
        item = []
        item.append(docs["Temperature"])
        item.append(docs["Humidity"])
        xo.append(item)
        yo.append(docs["Losses"])

    clft.fit(xt, yt)
    clfo.fit(xo, yo)

    final = []

    for temp in range(9, 40):
        for hum in range(60, 100):
            a = clft.predict([[temp, hum]])
            b = clfo.predict([[temp, hum]])
            item = []
            item.append((a[0] + b[0]) / 2)
            item.append(temp)
            item.append(hum)
            final.append(item)

    #print(final)
    final.sort(key=lambda x: x[0])
    print("Store the produce at", final[0][1], "C and at", final[0][2],
          "% relative humidity.")
    return final[0][1], final[0][2]
Example #31
def bayesRegr(source, target):
    # all columns except the last are features; the last column is the target
    clf = BayesianRidge()
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    return preds
def stacking_model(oof_list, prediction_list, labels, sample_ids):
    train_stack = np.vstack(oof_list).transpose()
    test_stack = np.vstack(prediction_list).transpose()

    kfolder = RepeatedKFold(n_splits=5, n_repeats=2, random_state=666)
    kfold = kfolder.split(train_stack, labels)
    preds_list = list()
    stacking_oof = np.zeros(train_stack.shape[0])

    for train_index, vali_index in kfold:
        k_x_train = train_stack[train_index]
        k_y_train = labels.loc[train_index]
        k_x_vali = train_stack[vali_index]

        gbm = BayesianRidge(normalize=True)
        gbm.fit(k_x_train, k_y_train)

        k_pred = gbm.predict(k_x_vali)
        stacking_oof[vali_index] = k_pred

        preds = gbm.predict(test_stack)
        preds_list.append(preds)

    fold_mse_error = mean_squared_error(labels, stacking_oof)
    print(f'stacking fold mse error is {fold_mse_error}')

    mse = make_scorer(mean_squared_error)
    gbm = BayesianRidge()
    cv_mse_error = cross_val_score(gbm,
                                   train_stack,
                                   labels,
                                   scoring=mse,
                                   cv=5,
                                   n_jobs=5)
    cv_mse_error = np.mean(cv_mse_error)
    print(f'stacking cv mse error is {cv_mse_error}')

    preds_columns = ['preds_{id}'.format(id=i) for i in range(10)]
    preds_df = pd.DataFrame(data=preds_list)
    preds_df = preds_df.T
    preds_df.columns = preds_columns
    preds_list = list(preds_df.mean(axis=1))

    sub_df = pd.DataFrame({'sample_id': sample_ids, 'rate': preds_list})
    sub_df.to_csv('submittion_tree.csv', index=False, header=False)
Example #33
def fit_polynomial_bayesian_skl(X, Y, degree,
                                lambda_shape=1.e-6, lambda_invscale=1.e-6,
                                padding=10, n=100,
                                X_unknown=None):
    X_v = pol.polyvander(X, degree)

    clf = BayesianRidge(lambda_1=lambda_shape, lambda_2=lambda_invscale)
    clf.fit(X_v, Y)

    coeff = np.copy(clf.coef_)

    # intercept handling: the Vandermonde matrix already has a column of ones,
    # so fold the fitted intercept into the first coefficient
    coeff[0] += clf.intercept_

    ret_ = [coeff]

    # generate the line
    x = np.linspace(X.min()-padding, X.max()+padding, n)
    x_v = pol.polyvander(x, degree)

    # using the provided predict method
    y_1 = clf.predict(x_v)

    # using np.dot() with coeff
    y_2 = np.dot(x_v, coeff)

    ret_.append(((x, y_1), (x, y_2)))

    if X_unknown is not None:
        xu_v = pol.polyvander(X_unknown, degree)

        # using the predict method
        yu_1 = clf.predict(xu_v)

        # using np.dot() with coeff
        yu_2 = np.dot(xu_v, coeff)

        ret_.append(((X_unknown, yu_1), (X_unknown, yu_2)))

    return ret_
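
A quick sanity check of the intercept handling above (a sketch on synthetic data): because the first Vandermonde column is all ones, clf.predict(X_v) and np.dot(X_v, coeff) agree once intercept_ has been folded into coeff[0].

import numpy as np
import numpy.polynomial.polynomial as pol
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, 40)
Y = 1.5 - 2.0 * X + 0.5 * X ** 2 + 0.1 * rng.randn(40)

X_v = pol.polyvander(X, 2)
clf = BayesianRidge().fit(X_v, Y)

coeff = np.copy(clf.coef_)
coeff[0] += clf.intercept_  # fold the intercept into the constant term
assert np.allclose(clf.predict(X_v), np.dot(X_v, coeff))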
Example #34
    def fit_model_10(self,toWrite=False):
        model = BayesianRidge(n_iter=5000)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 10 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            with open('model10/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Example #35
def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Bayesian ridge regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    clf = BayesianRidge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
    # Estimated noise precision alpha_ of the fitted model (BayesianRidge does not tune alpha by CV)
    ridge_alpha = clf.alpha_

    with open('../trained_networks/brr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
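
Since several objects were pickled into one file above, they have to be unpickled in the same order they were dumped; a sketch (assumes the same n_features and path):

import pickle

with open('../trained_networks/brr_%d_data.pkl' % n_features, 'rb') as results:
    clf = pickle.load(results)
    mean_abs = pickle.load(results)
    mean_sq = pickle.load(results)
    median_abs = pickle.load(results)
    r2 = pickle.load(results)
    exp_var_score = pickle.load(results)
    y_pred = pickle.load(results)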
def prediction_BayesianRidge(X_train, Y_train, X_test, Y_test, normalize):

    # Print shapes of the training and testing data sets
    #print ("Shapes of the training and testing data sets")
    #print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    #Create our regression object

    lreg = BayesianRidge(normalize=normalize)

    #fit the Bayesian ridge regression on the training data only
    lreg.fit(X_train,Y_train)

    #print("The estimated intercept coefficient is %.2f " %lreg.intercept_)
    #print("The number of coefficients used was %d " % len(lreg.coef_))



    # Set a DataFrame from the Facts
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]


    # Set a new column lining up the coefficients from the linear regression
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)


    # Show
    #coeff_df

    #highest correlation between a fact and fraction votes
    #print ("Highest correlation fact: %s is %.9f" % (cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"], coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]) )

    #sns_plot = sns.jointplot(coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"Fraction Votes",pd.merge(X_test,pd.DataFrame(Y_test), right_index=True, left_index=True),kind="scatter")


    #Predictions on training and testing sets
    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)

    # The mean square error
    #print("MSE with X_train and Y_train: %.6f"  % np.mean((Y_train - pred_train) ** 2))
    #print("MSE with X_test and Y_test: %.6f"  %np.mean((Y_test - pred_test) ** 2))

    #Explained variance score: 1 is perfect prediction
    #print("Variance score: %.2f" % lreg.score(X_test, Y_test))

    result={}
    result["method"]="BayesianRidge"
    if normalize :
        result["normalize"]="Y"
    else:
        result["normalize"]="N"
    result["X_train_shape"]=X_train.shape
    result["Y_train_shape"]=Y_train.shape
    result["X_test_shape"]=X_test.shape
    result["Y_test_shape"]=Y_test.shape
    result["intercept"]=lreg.intercept_
    result["num_coef"]=len(lreg.coef_)
    result["max_fact"]=cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"]
    result["max_fact_value"]=coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]
    result["MSE_train"]=np.mean((Y_train - pred_train) ** 2)
    result["MSE_test"]=np.mean((Y_test - pred_test) ** 2)
    result["variance"]=lreg.score(X_test, Y_test)
    return pred_test,coeff_df,pred_train,result

def sale(data):
	data = int(data) + 1
	return log(data)


dataset = pandas.read_csv("input/train2_.csv")
testset = pandas.read_csv("input/test2_.csv")

dataset['Sale'] = dataset['Sales'].apply(sale)

labelData = dataset['Sale'].values
myId = testset['Id'].values

testset.drop(['Id'], inplace=True, axis=1)
testData = testset.iloc[:, :].values
dataset.drop(['Sales', 'Sale'], inplace=True, axis=1)
dataData = dataset.iloc[:, :].values

BRModel = BayesianRidge(compute_score=True)
BRModel.fit(dataset.iloc[:, :].values, labelData)
preds = numpy.column_stack((myId, BRModel.predict(testData))).tolist()
preds = [[int(i[0])] + [exp(float(i[1])) - 1] for i in preds]

print(BRModel.scores_)
with open("result/sub_BayesRidge.csv", "w") as output:
	writer = csv.writer(output, lineterminator='\n')
	writer.writerow(["Id", "Sales"])
	writer.writerows(preds)
Example #38
def main():
    usage = 'usage: %prog [options] <repr_hdf5> <data_hdf5> <target_index>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_only', default=False, action='store_true', help='Use additional features only; no sequence features')
    parser.add_option('-b', dest='balance', default=False, action='store_true', help='Downsample the negative set to balance [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='postmodel', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='regression', default=False, action='store_true', help='Regression mode [Default: %default]')
    parser.add_option('-s', dest='seq_only', default=False, action='store_true', help='Use sequence features only; no additional features [Default: %default]')
    parser.add_option('--sample', dest='sample', default=None, type='int', help='Sample from the training set [Default: %default]')
    parser.add_option('-t', dest='target_hdf5', default=None, help='Extract targets from this HDF5 rather than data_hdf5 argument')
    parser.add_option('-x', dest='regex_add', default=None, help='Filter additional features using a comma-separated list of regular expressions')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide full data HDF5, representation HDF5, and target index or filename')
    else:
        repr_hdf5_file = args[0]
        data_hdf5_file = args[1]
        target_i = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    #######################################################
    # preprocessing
    #######################################################

    # load training targets
    data_hdf5_in = h5py.File(data_hdf5_file, 'r')
    if options.target_hdf5:
        target_hdf5_in = h5py.File(options.target_hdf5, 'r')
    else:
        target_hdf5_in = data_hdf5_in
    train_y = np.array(target_hdf5_in['train_out'])[:,target_i]
    test_y = np.array(target_hdf5_in['test_out'])[:,target_i]

    # load training representations
    if not options.add_only:
        repr_hdf5_in = h5py.File(repr_hdf5_file, 'r')
        train_x = np.array(repr_hdf5_in['train_repr'])
        test_x = np.array(repr_hdf5_in['test_repr'])
        repr_hdf5_in.close()

    if options.seq_only:
        add_labels = []

    else:
        # load additional features
        train_a = np.array(data_hdf5_in['train_add'])
        test_a = np.array(data_hdf5_in['test_add'])
        add_labels = np.array(data_hdf5_in['add_labels'])

        if options.regex_add:
            fi = filter_regex(options.regex_add, add_labels)
            train_a, test_a, add_labels = train_a[:,fi], test_a[:,fi], add_labels[fi]

        # append additional features
        if options.add_only:
            add_i = 0
            train_x, test_x = train_a, test_a
        else:
            add_i = train_x.shape[1]
            train_x = np.concatenate((train_x,train_a), axis=1)
            test_x = np.concatenate((test_x,test_a), axis=1)

    data_hdf5_in.close()
    if options.target_hdf5:
        target_hdf5_in.close()

    # balance
    if options.balance:
        train_x, train_y = balance(train_x, train_y)

    # sample
    if options.sample is not None and options.sample < train_x.shape[0]:
        sample_indexes = random.sample(range(train_x.shape[0]), options.sample)
        train_x = train_x[sample_indexes]
        train_y = train_y[sample_indexes]


    #######################################################
    # model
    #######################################################
    if options.regression:
        # fit
        model = BayesianRidge(fit_intercept=True)
        model.fit(train_x, train_y)

        # accuracy
        acc_out = open('%s/r2.txt' % options.out_dir, 'w')
        print(model.score(test_x, test_y), file=acc_out)
        acc_out.close()

        test_preds = model.predict(test_x)

        # plot a sample of predictions versus actual
        plt.figure()
        sns.jointplot(test_preds[:5000], test_y[:5000], joint_kws={'alpha':0.3})
        plt.savefig('%s/scatter.pdf' % options.out_dir)
        plt.close()

        # plot the distribution of residuals
        plt.figure()
        sns.distplot(test_y-test_preds)
        plt.savefig('%s/residuals.pdf' % options.out_dir)
        plt.close()

    else:
        # fit
        model = LogisticRegression(penalty='l2', C=1000)
        model.fit(train_x, train_y)

        # accuracy
        test_preds = model.predict_proba(test_x)[:,1].flatten()
        acc_out = open('%s/auc.txt' % options.out_dir, 'w')
        print(roc_auc_score(test_y, test_preds), file=acc_out)
        acc_out.close()

        # compute and print ROC curve
        fpr, tpr, thresholds = roc_curve(test_y, test_preds)

        roc_out = open('%s/roc.txt' % options.out_dir, 'w')
        for i in range(len(fpr)):
            print('%f\t%f\t%f' % (fpr[i], tpr[i], thresholds[i]), file=roc_out)
        roc_out.close()

        # compute and print precision-recall curve
        precision, recall, thresholds = precision_recall_curve(test_y, test_preds)

        prc_out = open('%s/prc.txt' % options.out_dir, 'w')
        for i in range(len(precision)):
            print('%f\t%f' % (precision[i], recall[i]), file=prc_out)
        prc_out.close()

    # save model
    joblib.dump(model, '%s/model.pkl' % options.out_dir)

    #######################################################
    # analyze
    #######################################################
    # print coefficients table
    coef_out = open('%s/add_coefs.txt' % options.out_dir, 'w')
    for ai in range(len(add_labels)):
        if options.regression:
            coefi = model.coef_[add_i+ai]
        else:
            coefi = model.coef_[0,add_i+ai]
        print(add_labels[ai], coefi, file=coef_out)
    coef_out.close()
def do_validation(data_path, steps=10):
    allfiles = initialize(data_path)
    gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, min_samples_leaf=5, subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    svr = SVR(kernel="linear", probability=True)
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)

    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    for i in range(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y        
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))

        train = df[:100]
        label = train['label']
        del train['label']

        test = df[100:400]
        Y = test['label']
        del test['label']

        #to_drop = ['driver', 'trip', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'speed6', 'speed7', 'speed8', 'speed9', 
        #        'speed10', 'speed11', 'speed12', 'speed13', 'speed14', 'speed15', 'speed16', 'speed17', 'speed18', 'speed19', 
        #        'speed20', 'speed21', 'speed22', 'speed23', 'speed24', 'speed25', 'speed26', 'speed27', 'speed28', 'speed29', 
        #        'speed30', 'speed31', 'speed32', 'speed33', 'speed34', 'speed35', 'speed36', 'speed37', 'speed38', 'speed39', 
        #        'speed40', 'speed41', 'speed42', 'speed43', 'speed44', 'speed45', 'speed46', 'speed47', 'speed48', 'speed49', 
        #        'speed50', 'speed51', 'speed52', 'speed53', 'speed54', 'speed55', 'speed56', 'speed57', 'speed58', 'speed59', 
        #        'speed60', 'speed61', 'speed62', 'speed63', 'speed64', 'speed65', 'speed66', 'speed67', 'speed68', 'speed69', 
        #        'speed70', 'speed71', 'speed72', 'speed73', 'speed74', 'speed75', 'speed76', 'speed77', 'speed78', 'speed79', 'speed80']
        to_drop = ['driver', 'trip']

        X_train = train.drop(to_drop, axis=1)
        X_test = test.drop(to_drop, axis=1)
        
        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr) 
        
        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)
    
        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)
        
        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)
        
        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)

        # Linear models.
        to_drop = ['driver', 'trip', 'distance', 'sd_acceleration', 'final_angle', 'mean_acceleration', 'mean_avg_speed', 'sd_inst_speed',
                'sd_avg_speed', 'mean_inst_speed', 'points']

        X_train = train.drop(to_drop, axis=1)
        X_test = test.drop(to_drop, axis=1)
        
        logit.fit(X_train, label)
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)

        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)
        
        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)

        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)

        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)

        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)

    print ""
    print "GBM:", gbm_metrics/steps
    print "AdaBoost:", ada_metrics/steps
    print "Extra Trees:", etree_metrics/steps
    print "RF:", rf_metrics/steps
    print "KN:", kn_metrics/steps
    print ""
    print "Logit:", logit_metrics/steps
    print "SVR:", svr_metrics/steps
    print "Ridge:", ridge_metrics/steps
    print "BayesianRidge:", bridge_metrics/steps
    print "Elastic Net:", enet_metrics/steps
    print "Neural Networks:", nnet_metrics/steps
    print ""
Beispiel #40
0
trainingcounts = counts[100:]
testcounts = counts[:100]

trainingrates = countrates[100:]
testrates = countrates[:100]

trainingtimes = times[100:]
testtimes = times[:100]

# using trainingcounts and training hists use log linear
#poisson_model = sm.GLM(trainingrates,
#						sm.tools.tools.add_constant(traininghists),
#						family =sm.families.Poisson(sm.genmod.families.links.log))
#results = poisson_model.fit()
#print(results.summary())

#x = results.predict(sm.tools.tools.add_constant(testhists))


clf = BayesianRidge(compute_score=True)
clf.fit(traininghists,trainingrates)
x = clf.predict(testhists)  

answer = testrates

plt.plot(bins,x)
plt.plot(bins,answer)
plt.show()
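# Hedged variant of the fit above: BayesianRidge.predict can also return the
# predictive standard deviation, which lets the same plot carry an uncertainty
# band. Assumes traininghists, trainingrates, testhists, bins and answer are
# the arrays used above.
clf_std = BayesianRidge(compute_score=True)
clf_std.fit(traininghists, trainingrates)
x_mean, x_std = clf_std.predict(testhists, return_std=True)

plt.plot(bins, x_mean, label="predicted rate")
plt.fill_between(bins, x_mean - x_std, x_mean + x_std, alpha=0.3, label="+/- 1 std")
plt.plot(bins, answer, label="observed rate")
plt.legend()
plt.show()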


Beispiel #41
0
def bayes_ridge_reg(x_data,y_data):
    br = BayesianRidge()
    br.fit(x_data,y_data)
    print 'br params',br.coef_,br.intercept_
    adjusted_result = br.predict(x_data)
    return map(int,list(adjusted_result))
Beispiel #42
0
#    print("-----------------------------------------------")
#    X_train, X_test = selectFeatures(X_train, X_test, y_train, k)

    print("-----------------------------------------------")
    print("SVM Classification of training set")   
    print("-----------------------------------------------")
    class_weight = {0:5}
    print("Class weight=", class_weight)
    clf = BayesianRidge(compute_score=True).fit(X_train, y_train)
    print("Test svm.SVC score=", clf.score(X_test, y_test))
    print("Train svm.SVC score=", clf.score(X_train, y_train))
    
    print("-----------------------------------------------")
    print("Metrics on TEST SET")   
    print("-----------------------------------------------")    
    # BayesianRidge returns continuous predictions; round them to integer labels
    # so the classification metrics below accept them.
    y_pred = clf.predict(X_test).round().astype(int)

    print(metrics.classification_report(y_test, y_pred, target_names=label_names))
    print(metrics.confusion_matrix(y_test, y_pred))
    
    print("-----------------------------------------------")
    print("Metrics on TRAIN SET")   
    print("-----------------------------------------------")    
    # Same rounding as for the test set so the report receives discrete labels.
    y_predTrain = clf.predict(X_train).round().astype(int)

    print(metrics.classification_report(y_train, y_predTrain, target_names=label_names))
    print(metrics.confusion_matrix(y_train, y_predTrain))

    #met.crossValidationScores(clf, X_train, y_train)
    
    # met.showRocAnalysis(X_bns, Y)       
plt.plot(clf.scores_, color='navy', linewidth=lw)
plt.ylabel("Score")
plt.xlabel("Iterations")


# Plotting some predictions for polynomial regression
def f(x, noise_amount):
    y = np.sqrt(x) * np.sin(x)
    noise = np.random.normal(0, 1, len(x))
    return y + noise_amount * noise


degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=0.1)
clf_poly = BayesianRidge()
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color='navy',
             label="Polynomial Bayesian Ridge Regression", linewidth=lw)
plt.plot(X_plot, y_plot, color='gold', linewidth=lw,
         label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
X_train = np.vander(x_train, n_order + 1, increasing=True)
X_test = np.vander(x_test, n_order + 1, increasing=True)

# #############################################################################
# Plot the true and predicted curves with log marginal likelihood (L)
reg = BayesianRidge(tol=1e-6, fit_intercept=False, compute_score=True)
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
for i, ax in enumerate(axes):
    # Bayesian ridge regression with different initial value pairs
    if i == 0:
        init = [1 / np.var(y_train), 1.]  # Default values
    elif i == 1:
        init = [1., 1e-3]
        reg.set_params(alpha_init=init[0], lambda_init=init[1])
    reg.fit(X_train, y_train)
    ymean, ystd = reg.predict(X_test, return_std=True)

    ax.plot(x_test, func(x_test), color="blue", label="sin($2\\pi x$)")
    ax.scatter(x_train, y_train, s=50, alpha=0.5, label="observation")
    ax.plot(x_test, ymean, color="red", label="predict mean")
    ax.fill_between(x_test, ymean-ystd, ymean+ystd,
                    color="pink", alpha=0.5, label="predict std")
    ax.set_ylim(-1.3, 1.3)
    ax.legend()
    title = "$\\alpha$_init$={:.2f},\\ \\lambda$_init$={}$".format(
            init[0], init[1])
    if i == 0:
        title += " (Default)"
    ax.set_title(title, fontsize=12)
    text = "$\\alpha={:.1f}$\n$\\lambda={:.3f}$\n$L={:.1f}$".format(
           reg.alpha_, reg.lambda_, reg.scores_[-1])
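    # Hedged completion (the snippet is cut off here): the sklearn example this
    # block follows places the estimated hyperparameters on each panel and then
    # renders the figure.
    ax.text(0.05, -1.0, text, fontsize=12)
plt.tight_layout()
plt.show()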
kr = KernelRidge(alpha=0.0001, coef0=1, degree=1, gamma=0.001, kernel='rbf',kernel_params=None)
kr.fit(x_train_scaled, y_train)
y3 = kr.predict(x_test_scaled)

lasso = Lasso(alpha=1e-09)
lasso.fit(x_train_scaled, y_train)
y4 = lasso.predict(x_test_scaled)

linear_ridge = Ridge(alpha=0.1)
linear_ridge.fit(x_train_scaled,y_train)
y5 = linear_ridge.predict(x_test_scaled)

bayesian_ridge = BayesianRidge(alpha_1=1e-05, alpha_2=10, lambda_1=10, lambda_2=1e-05)
bayesian_ridge.fit(x_train_scaled, y_train)
y6 = bayesian_ridge.predict(x_test_scaled)

sgd = SGDRegressor(alpha=0.1, epsilon=0.001, l1_ratio=0.2, loss='squared_loss', penalty='none', power_t=0.2)
sgd.fit(x_train_scaled, y_train)
y7 = sgd.predict(x_test_scaled)

###########################################
print '########## TESTING ERRORS ##########'

print "MAE for Linear Regression:", mean_absolute_error(y_test, y_predicted)
print "MAE for SVR:", mean_absolute_error(y_test, y2)
print "MAE for Kernel Ridge Regression:", mean_absolute_error(y_test, y3)
print "MAE for Lasso Regression:", mean_absolute_error(y_test, y4)
print "MAE for Linear Ridge Regression:", mean_absolute_error(y_test, y5)
print "MAE for Bayesian Ridge Regression:", mean_absolute_error(y_test, y6)
print "MAE for Stochastic Gradient Descent Regression:", mean_absolute_error(y_test, y7)
Beispiel #46
0
for _ in range(10):
    train_latent_matrix = get_latent_matrix(x,y,x)
    test_latent_matrix = get_latent_matrix(x,y,x_test)
    # Clean out rows with NaN.
    #mask = ~np.any(np.isnan(train_latent_matrix), axis=1)
    #newx = train_latent_matrix[mask]
    #newy = y[mask]
    
    newx = np.nan_to_num(train_latent_matrix)
    newy = y

    #last_layer = SVR(kernel='rbf', C=1e3, gamma=0.1)
    last_layer = BayesianRidge()
    last_layer.fit(newx, newy)

    output = last_layer.predict(test_latent_matrix)
    assert len(output) == 8500
    runs.append(output)

#for i in runs:
#print len(i)
   
fout = open('modelz.10.output', 'w')
for line in zip(*runs):
    avg =sum(line)/len(line)
    if avg > 5:
        avg = 5.0
    elif avg < 0:
        avg = 0.0
    fout.write(str(avg)[:6]+'\n')
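# A numpy sketch of the same averaging and clamping done with if/elif above;
# assumes `runs` holds the per-run prediction arrays. The output filename here
# is illustrative, not the original one.
avg_preds = np.clip(np.mean(np.array(runs), axis=0), 0.0, 5.0)
np.savetxt('modelz.10.clipped.output', avg_preds, fmt='%.4f')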
io.imsave("/Users/qcaudron/Desktop/charo/2_smoothed.jpg", ski.img_as_uint(surf))

# <codecell>

z1 = np.mean(surf, axis=0)
z2 = np.mean(surf, axis=1)

#for i in range(surf.shape[1]) : 
#    plt.plot(surf[:, i], "k")
#plt.plot(z2)
r = [BayesianRidge().fit(np.vander(np.arange(surf.shape[i]), 2), np.mean(surf, axis = 1-i)) for i in [0, 1]]
r1 = BayesianRidge().fit(np.arange(len(z1)).reshape(len(z1),1), z1)
r2 = BayesianRidge().fit(np.arange(len(z2[500:-500])).reshape(len(z2[500:-500]),1), z2[500:-500])

#plt.plot(r1.predict(np.arange(len(z1)).reshape(len(z1),1)), linewidth=5)
plt.plot(r2.predict(np.arange(len(z2)).reshape(len(z2),1)), linewidth=5)
plt.plot(z2, linewidth=5)
#plt.axhline(b[np.argmax(h)], c="r", linewidth=3)
#plt.plot(r[0].predict(np.vander(np.arange(surf.shape[0]), 2)), linewidth=3)

#plt.plot(r[0].predict(np.arange(len(z1)).reshape(len(z1),1)), linewidth=3)
#plt.plot(r[0].predict(np.expand_dims(np.arange(surf.shape[0]), axis=1)), linewidth=5)
#plt.axhline(np.mean(z1 / r1.predict(np.arange(len(z1)).reshape(len(z1),1))))

# <codecell>

lz = np.log(z2)
r3 = BayesianRidge().fit(np.arange(len(lz[500:-500])).reshape(len(lz[500:-500]),1), lz[500:-500])

plt.plot(np.exp(lz))
plt.plot(np.exp(r3.predict(np.arange(len(lz)).reshape(len(lz),1))))
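# <codecell>

# Hedged continuation (not in the original notebook): one common use of the
# fitted trend r3 is to flatten the intensity profile by dividing the observed
# profile by the back-transformed prediction. `flattened` is an illustrative name.
trend = np.exp(r3.predict(np.arange(len(lz)).reshape(len(lz), 1)))
flattened = z2 / trend
plt.plot(flattened)
plt.show()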
def main():
    parser = argparse.ArgumentParser(description="""Creates embeddings predictions.""")
    parser.add_argument('--train')
    parser.add_argument('--test')
    parser.add_argument('--embeddings')
    parser.add_argument('--cv',default=False)


    args = parser.parse_args()

    stoplist = stopwords.words("english")
    stoplist.extend("it's 've 's i'm he's she's you're we're they're i'll you'll he'll ".split(" "))


    embeddings={}
    for line in codecs.open(args.embeddings,encoding="utf-8").readlines():
        line = line.strip()
        if line:
            a= line.split(" ")
            embeddings[a[0]] = np.array([float(v) for v in a[1:]]) #cast to float, otherwise we cannot operate

    train_indices = []
    test_indices = []
    train_scores = []
    train_features = []
    test_features = []


    # if args.learner == "logisticregression":
    #     learner= LogisticRegression()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeclassification":
    #     learner = tree.DecisionTreeClassifier()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeregression":
    #     learner = tree.DecisionTreeRegressor()
    #     learner_type = "regression"
    # elif args.learner == "bayesianridge":
    #     learner = BayesianRidge()
    #     learner_type = "regression"
    # else:
    learner = BayesianRidge()
    learner_type = "regression"

    le = preprocessing.LabelEncoder()


    for line in open(args.train).readlines():
        (index, score, tweet) = line.strip().split("\t")
        train_indices.append(index)
        train_scores.append(float(score))
        tweet = tweet.split(" ")
        train_features.append(embedfeats(tweet,embeddings,stoplist))


    train_indices = np.array(train_indices)
    train_scores = np.array(train_scores)
    train_features = np.array(train_features)

    train_scores_int = [roundup(v) for v in train_scores]
    le.fit(train_scores_int)

    train_scores_int_transformed = le.transform(train_scores_int)


    if args.cv:
        train_cv={}
        cross=cross_validation.KFold(len(train_scores),n_folds=10)
        acc=[]
        for train_index, test_index in cross:
            #if args.debug:
            #    print("TRAIN:", len(train_index), "TEST:", len(test_index))
            X=train_features
            y=train_scores
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]


            learner.fit(X_train,y_train)

            y_pred= learner.predict(X_test)
            assert(len(y_pred)==len(test_index))
            tids=train_indices[test_index]
            for twid,pred in zip(tids,y_pred):
                train_cv[twid] =  pred

            acc.append(cosine_similarity(y_test,y_pred)[0][0])

        print >>sys.stderr, "Cosine of 10-folds:", acc
        print >>sys.stderr, "Macro average:", np.mean(np.array(acc)), np.std(np.array(acc))

        for twid in train_indices:
            print "{}\t{}".format(twid,train_cv[twid])
    else:

        for line in open(args.test).readlines():
            (index, score, tweet) = line.strip().split("\t")
            test_indices.append(index)
            #scores.append(score)
            tweet = tweet.split(" ")
            test_features.append(embedfeats(tweet,embeddings,stoplist))


        #print  np.array(train_features).shape
        # when features are generated, train and test

        if learner_type == "regression":
            learner.fit(train_features,train_scores)
        else:
                learner.fit(train_features,train_scores_int_transformed)

        predicted_scores= learner.predict(test_features)
        if learner_type != "regression":
            predicted_scores = le.inverse_transform(predicted_scores)
        for index, score in zip(test_indices,predicted_scores):
            print index+"\t"+str(score)
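# embedfeats is defined elsewhere in the original script; a minimal sketch of
# what such a helper usually does (average the embeddings of the non-stopword
# tokens) is shown here for context only -- it is an assumption, not the
# original implementation.
def embedfeats_sketch(tokens, embeddings, stoplist):
    dim = len(next(iter(embeddings.values())))
    vecs = [embeddings[t] for t in tokens if t in embeddings and t not in stoplist]
    if not vecs:
        return np.zeros(dim)
    return np.mean(vecs, axis=0)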
import time
import sys
import numpy
import vector
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.cross_validation import train_test_split

usage = "filename features_file labels_file output_file"

if __name__ == "__main__":

	if (len(sys.argv)!=5):
		print usage
	else:
		file_x = sys.argv[1]
		file_y = sys.argv[2]
		file_out = sys.argv[3]
		split_seed = int(sys.argv[4])  # random_state expects an integer

		X = numpy.genfromtxt(file_x, delimiter=' ')
		y = numpy.genfromtxt(file_y, delimiter=' ')

		# Split the data into training/testing sets
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
		
		# Bayesian Ridge Regression
		clf = BayesianRidge(compute_score=True)
		clf.fit(X_train, y_train)  # fit on the training split only; fitting on all of X would leak the test set
		y_predict = clf.predict(X_test)
		numpy.savetxt(file_out, y_predict)
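		# Hedged addition (not in the original script): score the held-out split so
		# the saved predictions come with a simple error estimate.
		from sklearn.metrics import mean_squared_error
		print "Held-out MSE:", mean_squared_error(y_test, y_predict)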
Beispiel #50
0
def nickmain1():

	train_all = pd.read_csv(trainloc)
	target_all = pd.read_csv(trainloc)
	test_all = pd.read_csv(testloc)
	targets = ['Ca','P','pH','SOC','Sand']
	train_cols_to_remove = ['PIDN']+targets
	train_all["Depth"] = train_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10])
	test_all["Depth"] = test_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10])
	common_features = ['BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI']
	feats_list = {}
	colnames_nums = []
	colnames = train_all.ix[:,'m7497.96':'m599.76'].columns.values
	for x in colnames:
		match = re.search(r'(?<=m)[0-9]*',x)
		if match: 
			colnames_nums.append(int(match.group()))
	
	print len(colnames)
	print len(colnames_nums)
	print len(train_all.ix[0,'m7497.96':'m599.76'].values)


	

	for target in targets:
		selector = SelectKBest(f_regression, k=200)
		selector.fit_transform(train_all.ix[:,'m7497.96':'m599.76'], train_all[target])
		selected = selector.get_support()
		feats = [col for (col,sel) in zip(list(train_all.ix[:,'m7497.96':'m599.76'].columns.values), selected) if sel]
		feats_list[target] = feats+common_features

		


	#pickTest = ['PIDN', 'BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI','Depth']#ORIGINAL10
	ids = np.genfromtxt(testloc, dtype=str, skip_header=1, delimiter=',', usecols=0)
	df = pd.DataFrame({"PIDN": ids, "Ca": test_all['PIDN'], "P": test_all['PIDN'], "pH": test_all['PIDN'], "SOC": test_all['PIDN'], "Sand": test_all['PIDN']})
	
	cv = cross_validation.KFold(len(train_all), n_folds=10, indices=False)
	subresults = {}
	results = []

	if issub == False:
		for train_sub, test_sub in cv:
			for target in targets:
				#clf = ensemble.GradientBoostingRegressor(n_estimators=6)
				#clf = RandomForestRegressor(n_estimators = 40)
				#clf = linear_model.Lasso(alpha=0.08)
				#clf = svm.SVC()
				#clf = tree.DecisionTreeRegressor(min_samples_leaf=20)
				#clf = Ridge(alpha=1.0)
				#clf = ElasticNet(alpha=0.1, l1_ratio=0.7)
				clf = BayesianRidge(compute_score=True)
				clf.fit(np.array(train_all[feats_list[target]])[train_sub], np.array(train_all[target])[train_sub])
				pred = clf.predict(np.array(train_all[feats_list[target]])[test_sub])
				subresults[target] = ev.rmse(np.array(train_all[target])[test_sub],np.array(pred))
				#df[target] = pred
			subtotal = 0 
			for x in subresults:
				subtotal = subtotal + subresults[x]
			print ("average for the run is ", subtotal/len(targets))
			results.append(subtotal/len(targets))
		print "Results: " + str( np.array(results).mean() )

	else:
		for target in targets:
			#clf = ensemble.GradientBoostingRegressor(n_estimators=6)
			#clf = RandomForestRegressor(n_estimators = 20)
			#clf = linear_model.Lasso(alpha=0.08)
			#clf = svm.SVC()
			#clf = tree.DecisionTreeRegressor(min_samples_leaf=20)
			#clf = Ridge(alpha=1.0)
			#clf = ElasticNet(alpha=0.1, l1_ratio=0.7)
			clf = BayesianRidge(compute_score=True)
			clf.fit(np.array(train_all[feats_list[target]]), np.array(train_all[target]))
			pred = clf.predict(np.array(test_all[feats_list[target]]))
			df[target] = pred
			df.to_csv(predloc, index=False, cols=["PIDN","Ca","P","pH","SOC","Sand"])
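# ev.rmse above comes from a project-local helper module; a minimal stand-in
# with the usual definition is sketched here for context (an assumption, not
# the original code).
def rmse_sketch(y_true, y_pred):
	return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))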
# Linear Regression
print 'linear'
lr = LinearRegression()
#lr.fit(x[:, np.newaxis], y)
#lr_sts_scores = lr.predict(xt[:, np.newaxis])
lr.fit(x, y)
lr_sts_scores = lr.predict(xt)


# Bayesian Ridge Regression
print 'bayesian ridge'
br = BayesianRidge(compute_score=True)
#br.fit(x[:, np.newaxis], y)
#br_sts_scores = br.predict(xt[:, np.newaxis])
br.fit(x, y)
br_sts_scores = br.predict(xt)


# Elastic Net
print 'elastic net'
enr = ElasticNet()
#enr.fit(x[:, np.newaxis], y)
#enr_sts_scores = enr.predict(xt[:, np.newaxis])
enr.fit(x, y)
enr_sts_scores = enr.predict(xt)


# Passive Aggressive Regression
print 'passive aggressive'
par = PassiveAggressiveRegressor()
par.fit(x, y)
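# (Completing the pattern used for the other regressors above; the original
# snippet is cut off at this point.)
par_sts_scores = par.predict(xt)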
Beispiel #52
0
def main():
    usage = "usage: %prog [options] <model_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="center_dist",
        default=10,
        type="int",
        help="Distance between the motifs and sequence center [Default: %default]",
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g", dest="cuda", default=False, action="store_true", help="Run on the GPGPU [Default: %default]"
    )
    parser.add_option("-l", dest="seq_length", default=600, type="int", help="Sequence length [Default: %default]")
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide Basset model file")
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(",")]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ""
    if options.cuda:
        cuda_str = "-cuda"

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    # num_filters = len(filter_consensus)
    num_filters = 20
    filter_len = filter_consensus[0].shape[1]

    # position the motifs
    left_i = options.seq_length / 2 - options.center_dist - filter_len
    right_i = options.seq_length / 2 + options.center_dist

    ns_1hot = np.zeros((4, options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:, left_i : left_i + filter_len] = filter_consensus[i]
            motifs_seq[:, right_i : right_i + filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, options.seq_length))

    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = "%s/motif_seqs.h5" % options.out_dir
    h5f = h5py.File(seqs_file, "w")
    h5f.create_dataset("test_in", data=seqs_1hot)
    h5f.close()

    # predict scores
    scores_file = "%s/motif_seqs_scores.h5" % options.out_dir
    torch_cmd = "th basset_place2_predict.lua %s %s %s %s" % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, "r")
    motif_seq_scores = np.array(hdf5_in["scores"])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        X = np.zeros((motif_seq_scores.shape[0], 2 * num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi, i] += 1
                X[xi, num_filters + j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:, ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print model.score(X, motif_seq_scores[:, ti])

        # print filter coefficients
        coef_out = open("%s/coefs_t%d.txt" % (options.out_dir, ti), "w")
        for i in range(num_filters):
            print >> coef_out, "%3d  %6.2f" % (i, model.coef_[i])
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters, num_filters))
        table_out = open("%s/table_t%d.txt" % (options.out_dir, ti), "w")

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i, j] = motif_seq_scores[si, ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si, ti], motif_seq_preds[si], filter_interaction[i, j])
                print >> table_out, "%3d  %3d  %6.3f  %6.3f  %6.3f" % cols
                si += 1

        table_out.close()

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction)
        plt.savefig("%s/heat_t%d.pdf" % (options.out_dir, ti))
Beispiel #53
0
### Imputing DYAR
train = df[(df.DYAR.isnull() ==False) & (df.pct_team_tgts.isnull() == False)]
train.reset_index(inplace=True, drop=True)
test = df[(df.DYAR.isnull() == True) & (df.pct_team_tgts.isnull() == False)]
test.reset_index(inplace= True, drop=True)

features = ['targets', 'receptions', 'rec_tds', 'start_ratio', 'pct_team_tgts', 'pct_team_receptions', 'pct_team_touchdowns',
            'rec_yards', 'dpi_yards', 'fumbles', 'first_down_ctchs', 'pct_of_team_passyards']
X = scale(train[features])
y = train.DYAR

# Our best model for predicting DYAR was a Bayesian Ridge Regressor
br = BayesianRidge()
br.fit(X,y)
dyar_predictions = pd.DataFrame(br.predict(scale(test[features])), columns = ['DYAR_predicts'])

test = test.join(dyar_predictions)
test['DYAR'] = test['DYAR_predicts']
test.drop('DYAR_predicts', inplace=True, axis=1)

frames = [train,test]
df = pd.concat(frames, axis=0, ignore_index=True)

### Imputing EYds
train = df[(df.EYds.isnull() ==False) & (df.pct_team_tgts.isnull() == False)]
train.reset_index(inplace=True, drop=True)
test = df[(df.EYds.isnull() == True) & (df.pct_team_tgts.isnull() == False)]
test.reset_index(inplace= True, drop=True)

# A Bayesian Ridge was also our best predictor for EYds. In general, we're able to most confidently predict EYds.
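# Hedged completion, mirroring the DYAR block above (the original snippet is
# truncated here): fit a Bayesian Ridge on the same features and fill in EYds.
X = scale(train[features])
y = train.EYds
br = BayesianRidge()
br.fit(X, y)
eyds_predictions = pd.DataFrame(br.predict(scale(test[features])), columns=['EYds_predicts'])

test = test.join(eyds_predictions)
test['EYds'] = test['EYds_predicts']
test.drop('EYds_predicts', inplace=True, axis=1)

df = pd.concat([train, test], axis=0, ignore_index=True)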
Beispiel #54
0
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file')
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(',')]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ''
    if options.cuda:
        cuda_str = '-cuda'

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    num_filters = len(filter_consensus)
    # num_filters = 40
    filter_len = filter_consensus[0].shape[1]

    # position the motifs
    left_i = options.seq_length/2 - options.center_dist - filter_len
    right_i = options.seq_length/2 + options.center_dist

    ns_1hot = np.zeros((4,options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i]
            motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length))


    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = '%s/motif_seqs.h5' % options.out_dir
    h5f = h5py.File(seqs_file, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()

    # predict scores
    scores_file = '%s/motif_seqs_scores.h5' % options.out_dir
    torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, 'r')
    motif_seq_scores = np.array(hdf5_in['scores'])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        X = np.zeros((motif_seq_scores.shape[0],2*num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi,i] += 1
                X[xi,num_filters+j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:,ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print model.score(X, motif_seq_scores[:,ti])

        # print filter coefficients
        coef_out = open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w')
        for i in range(num_filters):
            print >> coef_out, '%3d  %6.2f' % (i,model.coef_[i])
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters,num_filters))
        table_out = open('%s/table_t%d.txt' % (options.out_dir,ti), 'w')

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j])
                print >> table_out, '%3d  %3d  %6.3f  %6.3f  %6.3f' % cols
                si += 1

        table_out.close()

        scores_abs = abs(filter_interaction.flatten())
        max_score = stats.quantile(scores_abs, .999)
        print 'Limiting scores to +-%f' % max_score
        filter_interaction_max = np.zeros((num_filters, num_filters))
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction_max[i,j] = np.min([filter_interaction[i,j], max_score])
                filter_interaction_max[i,j] = np.max([filter_interaction_max[i,j], -max_score])

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False)
        plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
KNNmse = Model_two.predict(X_crosstest_scl)
print("KNN_RMSE:",np.sqrt(mean_squared_error(y_crosstest,KNNmse)))
print(datetime.now() - start)

start = datetime.now()   
from sklearn import ensemble
Model_three = ensemble.RandomForestRegressor(n_estimators = 500,verbose=1,n_jobs=-1,random_state = 120,max_depth=16)
Model_three.fit(X_crosstrain_svd,y_crosstrain)
RFmse = Model_three.predict(X_crosstest_svd)
print("RandomForest_RMSE:",np.sqrt(mean_squared_error(y_crosstest,RFmse)))
print(datetime.now() - start)

start = datetime.now()   
from sklearn.linear_model import BayesianRidge
BR = BayesianRidge(n_iter=500,tol= 0.001,normalize=True).fit(X_crosstrain_scl,y_crosstrain)
pred_BR = BR.predict(X_crosstest_scl)
print("BayesinRidge_RMSE:",np.sqrt(mean_squared_error(y_crosstest,pred_BR)))
print(datetime.now() - start)

start = datetime.now() 
from sklearn.linear_model import LinearRegression
LR = LinearRegression(fit_intercept = True,normalize = True,n_jobs=-1).fit(X_crosstrain_svd,y_crosstrain)
pred_LR = LR.predict(X_crosstest_svd)
print("LinearRegression_RMSE:",np.sqrt(mean_squared_error(y_crosstest,pred_LR)))

print(datetime.now() - start)

# Decision tree along with AdaBoost
start = datetime.now() 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
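# Hedged completion mirroring the blocks above (the snippet is truncated here):
# fit an AdaBoost-boosted decision tree and report its RMSE on the same split.
# The hyperparameters below are illustrative, not the original author's.
Model_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8), n_estimators=200, random_state=120)
Model_ada.fit(X_crosstrain_svd, y_crosstrain)
pred_ada = Model_ada.predict(X_crosstest_svd)
print("AdaBoost_DecisionTree_RMSE:", np.sqrt(mean_squared_error(y_crosstest, pred_ada)))
print(datetime.now() - start)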