import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import BayesianRidge, LinearRegression

def bayesian_ridge_regression(feature_array, label_array):
    clf = BayesianRidge(compute_score=True)
    clf.fit(feature_array, label_array)

    ols = LinearRegression()
    ols.fit(feature_array, label_array)


    n_features = feature_array.shape[1]  # infer the number of features from the data

    plt.figure(figsize=(6, 5))
    plt.title("Weights of the model")
    plt.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate")
    # Note: label_array holds the targets, not the true weights; it is plotted
    # only as a rough reference, following the upstream scikit-learn example.
    plt.plot(label_array, 'g-', label="Ground truth")
    plt.plot(ols.coef_, 'r--', label="OLS estimate")
    plt.xlabel("Features")
    plt.ylabel("Values of the weights")
    plt.legend(loc="best", prop=dict(size=12))

    plt.figure(figsize=(6, 5))
    plt.title("Histogram of the weights")
    plt.hist(clf.coef_, bins=n_features, log=True)
    # plt.plot(clf.coef_[feature_array], 5 * np.ones(len(feature_array)),
    #          'ro', label="Relevant features")
    plt.ylabel("Features")
    plt.xlabel("Values of the weights")
    plt.legend(loc="lower left")

    plt.figure(figsize=(6, 5))
    plt.title("Marginal log-likelihood")
    plt.plot(clf.scores_)
    plt.ylabel("Score")
    plt.xlabel("Iterations")
    plt.show()
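A quick way to exercise the helper above (a sketch with synthetic data; the sparse weight vector is illustrative, in the spirit of the upstream scikit-learn example):

rng = np.random.RandomState(0)
n_samples = 100
X_demo = rng.randn(n_samples, 9)
w_true = np.zeros(9)
w_true[:3] = [4.0, -2.0, 1.5]          # only three features carry signal
y_demo = X_demo @ w_true + 0.5 * rng.randn(n_samples)

bayesian_ridge_regression(X_demo, y_demo)
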
def ridreg(df, test):
    clf = BayesianRidge()

    target = df['count']
    train  = df[['time','temp']]
    test   = test[['time','temp']]

    clf.fit(train, target)
    final = []
    print(test.head(3))
    for i, row in enumerate(test.values):
        y = []
        for x in row:
            x = float(x)
            y.append(x)
            # print(x)
        final.append(y)
    predicted_probs= clf.predict(final)
    # print(predicted_probs.shape)
    # predicted_probs = pd.Series(predicted_probs)
    # predicted_probs = predicted_probs.map(lambda x: int(x))

    keep = pd.read_csv('data/test.csv')
    keep = keep['datetime']
    # save to file, keeping the datetime column alongside the predictions
    predicted_probs = pd.DataFrame({'datetime': keep, 'count': predicted_probs})
    print(predicted_probs.head(3))
    predicted_probs.to_csv('data/submission3.csv', index=False)
Example #3
    def bayes_ridge_reg(self):
        br = BayesianRidge()
        br.fit(self.x_data, self.y_data)
        adjusted_result = br.predict(self.x_data)
        print("bayes ridge params", br.coef_, br.intercept_)
        print("bayes ridge accuracy", get_accuracy(adjusted_result, self.y_data))
        return list(map(int, adjusted_result))
Example #4
def bayesRegr(source, target):
    # the last column of `source` is the target; the remaining columns are features
    clf = BayesianRidge()
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    return preds
Example #5
    def fit_model_10(self,toWrite=False):
        model = BayesianRidge(n_iter=5000)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 10 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            with open('model10/model.pkl', 'wb') as f2:  # pickle requires binary mode
                pickle.dump(model, f2)
def train_BayesianRegressionModel(
    X,
    y,
    n_iter=300,
    tol=0.001,
    alpha_1=1e-06,
    alpha_2=1e-06,
    lambda_1=1e-06,
    lambda_2=1e-06,
    compute_score=False,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    verbose=False,
):
    """
    Train a Bayesian regression model
    """
    model = BayesianRidge(
        n_iter=n_iter,
        tol=tol,
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        compute_score=compute_score,
        fit_intercept=fit_intercept,
        normalize=normalize,
        copy_X=copy_X,
        verbose=verbose,
    )
    model = model.fit(X, y)
    return model
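A minimal usage sketch (X_demo and y_demo are synthetic placeholders; note that the `normalize` argument requires a scikit-learn version that still supports it, as it was removed in recent releases):

from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=0)
model = train_BayesianRegressionModel(X_demo, y_demo, compute_score=True)
print(model.coef_, model.alpha_, model.lambda_)
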
Example #7
def br_modeling(data, y_name, candidates_location):
    from sklearn.linear_model import BayesianRidge
    temp = data.copy()
    candidates = get_variables("./%s" % candidates_location)
    temp = rf_trim(temp, y_name, candidates)
    model = BayesianRidge()
    res = model.fit(temp[candidates], temp[y_name])
    joblib.dump(res, "./%sbr_model%s.pkl" % (y_name, datetime.datetime.today()))
    return res
Example #8
class BayesianRRCalculator(EnergyCalculator):
    """Energy calculator using global feature vectors and Bayesian Ridge Regression."""

    def __init__(self, feature_key):
        EnergyCalculator.__init__(self)

        self.ridge = BayesianRidge(fit_intercept=False)
        self.energy_key = 'BRR'
        self.feature_key = feature_key

    def fit(self, training_set, energy_key):
        """Fit the BRR model.

        The feature vectors with key=self.feature_key will be used for feature vectors. The
        energy with the specified energy_key will be the target function.

        Parameters:
            training_set : list of Nanoparticles
            energy_key : str
        """
        feature_vectors = [p.get_feature_vector(self.feature_key) for p in training_set]
        energies = [p.get_energy(energy_key) for p in training_set]

        self.ridge.fit(feature_vectors, energies)

    def get_coefficients(self):
        return self.ridge.coef_

    def set_coefficients(self, new_coefficients):
        self.ridge.coef_ = new_coefficients

    def set_feature_key(self, feature_key):
        self.feature_key = feature_key

    def compute_energy(self, particle):
        """Compute the energy using BRR.

        Assumes that a feature vector with key=self.feature_key is present in the particle.

        Parameters:
            particle : Nanoparticle
        """
        brr_energy = np.dot(self.ridge.coef_, particle.get_feature_vector(self.feature_key))
        particle.set_energy(self.energy_key, brr_energy)
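Because the model is fit with fit_intercept=False, the dot product in compute_energy matches what self.ridge.predict would return. A sketch of that check (`calc` is a hypothetical fitted BayesianRRCalculator, and `fv` stands in for a particle's feature vector):

fv = np.ones_like(calc.get_coefficients())
assert np.isclose(
    calc.ridge.predict(fv.reshape(1, -1))[0],
    np.dot(calc.get_coefficients(), fv),
)
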
def ridge():
    ac = loadmat('./data/component_contribution_python.mat')

    S = ac['train_S']

    df_S = pd.DataFrame(ac['train_S'])
    df_S_unique = df_S.T.drop_duplicates().T
    unique_cols = df_S_unique.columns.values.tolist()
    S = S[:, unique_cols]

    G = ac['G']
    # b = ac['b']

    b_list = json.load(open('./data/median_b.json'))
    b = np.asarray(b_list)
    b = np.reshape(b, (-1, 1))

    # w = ac['w']

    # pdb.set_trace()

    m, n = S.shape
    assert G.shape[0] == m
    assert b.shape == (n, 1)

    STG = np.dot(S.T, G)

    X = STG
    y = b

    # clf = Ridge(alpha=0.1,fit_intercept=False)
    # clf.fit(X, y)
    # print('R2',clf.score(X, y))
    # print clf.coef_

    reg = BayesianRidge(tol=1e-6, fit_intercept=False, compute_score=True)
    reg.fit(X, y)
    # print reg.coef_
    cov = reg.sigma_  # posterior covariance matrix of the weights
    cov_diag = [cov[i][i] for i in range(len(cov))]

    for num in cov_diag[0:263]:
        if num < 500: print(num)
    pdb.set_trace()
Example #10
def baysian_curve():
    lw = 2  # line width for the error-bar plot (undefined in the original)
    for degree in range(1, 12):
        clf_poly = BayesianRidge(compute_score=True)   # BayesianRidge with evidence tracking
        clf_poly.fit(num.vander(X, degree), t)         # Bayesian fit with L2 (ridge) regularization
        #print(clf_poly.coef_[0:len(clf_poly.coef_)-1])
        X_plot = num.linspace(0, 1, 500)
        y_plot = f(X_plot)
        y_mean, y_std = clf_poly.predict(num.vander(X_plot, degree), return_std=True)  # predictive mean and standard deviation
        plot.figure(figsize=(6,5))
        plot.title('Bayesian Curve Fitting for polynomial of Degree = ' + str(degree-1) , color='black')
        plot.errorbar(X_plot, y_mean, y_std, color='red', label="Polynomial Bayesian Regression",  linewidth=lw)
        plot.plot(X_plot, y_plot, color='green', label="Sine Curve")
        plot.plot(X,t,'o')
        plot.ylabel("Output y")
        plot.xlabel("Feature X")
        plot.legend(loc="lower left")
        plot.subplots_adjust(hspace=2.0)   # spacing between subplots
        plot.subplots_adjust(wspace=0.2)
        plot.show()
Example #11
def stackModel(train_stack, test_stack, y_train, myMetrics_type, n_runs=3, n_folds=5, use_StratifiedKFold=True):

    predictions_stack_sum_of_runs = np.zeros(len(test_stack))
    oof_stack_sum_of_runs = np.zeros(len(train_stack))

    for run in range(n_runs):
        predictions = np.zeros(test_stack.shape[0])
        oof_stack_pre = np.zeros(train_stack.shape[0])
        random_seed = 2015 + 1000 * (run + 1)
        if use_StratifiedKFold:
            sfolder = StratifiedKFold(n_splits=n_folds, random_state=random_seed, shuffle=True)
        else:
            sfolder = KFold(n_splits=n_folds, random_state=random_seed, shuffle=True)
        skf_tmp = sfolder.split(train_stack, y_train)

        for fold_, (trn_idx, val_idx) in enumerate(skf_tmp):
            print("Stack Run: %d, Fold: %d" % (run+1, fold_+1))

            trn_data, trn_y = train_stack[trn_idx], y_train.iloc[trn_idx]
            val_data, val_y = train_stack[val_idx], y_train.iloc[val_idx]

            clf_3 = BayesianRidge()
            clf_3.fit(trn_data, trn_y)

            oof_stack_pre[val_idx] = clf_3.predict(val_data)
            predictions += clf_3.predict(test_stack) / n_folds

        score = MyMetrics(myMetrics_type).metricsFunc(oof_stack_pre, y_train)
        print("Stack Run: {}, CV val score: {:<8.5f}".format(run + 1, score))

        predictions_stack_sum_of_runs = predictions_stack_sum_of_runs + predictions
        oof_stack_sum_of_runs = oof_stack_sum_of_runs + oof_stack_pre

    predictions_stack_mean_of_runs = predictions_stack_sum_of_runs / n_runs
    oof_stack_mean_of_runs = oof_stack_sum_of_runs / n_runs

    finalScore = MyMetrics(myMetrics_type).metricsFunc(oof_stack_mean_of_runs, y_train)
    print("Final score: {}".format(finalScore))
    return predictions_stack_mean_of_runs, oof_stack_mean_of_runs, finalScore


# train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
# test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()
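Building on the commented hint above, a hedged end-to-end call (`oof_lgb`, `oof_xgb`, and the prediction arrays are assumed outputs of earlier base models; `y_train` must be a pandas Series since the function indexes it with .iloc, and `myMetrics_type` must be a value your MyMetrics class understands):

train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()
preds, oof, final_score = stackModel(train_stack, test_stack, y_train,
                                     myMetrics_type='mae',
                                     n_runs=3, n_folds=5,
                                     use_StratifiedKFold=False)
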
Example #12
def bayes_regression(x_train, x_test, y_train, y_test):
    model = BayesianRidge()

    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)

    print("score", score)
    y_pred = model.predict(x_test)

    print("mean_squared_error", mean_squared_error(y_test, y_pred))

    # save the results
    # result = model.predict(test_df)
    # print(result)
    # result_df = pd.DataFrame(result, columns=['target'])
    # result_df.to_csv("0.098.txt", index=False, header=False)

    # plot the learning curve
    plot_learning_curve(model, title="learn_rate", X=x_train, y=y_train, cv=10)
Example #13
    def bay_ridge_model(self, X_train, y_train, X_test, y_test):

        bay_ridge_model = BayesianRidge(alpha_1=1,
                                        alpha_2=1,
                                        lambda_1=900,
                                        lambda_2=100)

        bay_ridge_model.fit(X_train, y_train)

        y_train_pred = bay_ridge_model.predict(X_train)
        y_test_pred = bay_ridge_model.predict(X_test)

        # Scoring the model
        print(bay_ridge_model.score(X_train, y_train))
        print(bay_ridge_model.score(X_test, y_test))
        print('MSE train: %.6f, MSE test: %.6f' % (mean_squared_error(
            y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.6f, R^2 test: %.6f' %
              (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
Example #14
    def fit_predict(self, m_current):
        obs, _ = self.get_obs_ixs(m_current)
        Uo, mo, wo = self.U[obs], np.copy(m_current)[obs], self.weighting[obs]
        ridge = BayesianRidge(
            fit_intercept=self.add_bias,
            alpha_init=self.alpha_init,
            lambda_init=self.lambda_init,
            alpha_1=self.alpha_1,
            alpha_2=self.alpha_2,
            lambda_1=self.lambda_1,
            lambda_2=self.lambda_2,
        )
        ridge.fit(Uo, mo, sample_weight=wo)
        pred, pred_std = ridge.predict(self.U, return_std=True)
        pred[obs] = m_current[obs]
        # standard deviation of observed regions is 0
        pred_std[obs] = 0.0
        self.model = ridge
        return pred, pred_std
Example #15
def from_station(station, departing=False):
    journeys = at_station(station)
    livst = station == 'LIVST'

    X = [row[2:4] for row in journeys if None not in [row[3], row[4], row[5]]]
    Y = [
        row[-1 if departing else -2] for row in journeys
        if None not in [row[3], row[4], row[5]]
    ]

    if livst:
        X = [row[2:4] for row in journeys if None not in [row[3], row[4]]]
        Y = [row[-2] for row in journeys if None not in [row[3], row[4]]]

    reg = BayesianRidge()
    knn = KNeighborsRegressor(n_neighbors=2)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X,
        Y,
        test_size=0.2,
    )

    reg.fit(X_train, Y_train)
    knn.fit(X_train, Y_train)
    Y_pred = [int(round(num)) for num in reg.predict(X_test)]
    Y_pred2 = [int(round(num)) for num in knn.predict(X_test)]

    # df = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred, 'KNN': Y_pred2})
    # df1 = df.head(25)
    # print(df1)
    # print('Mean Absolute', metrics.mean_absolute_error(Y_test, Y_pred),
    #       metrics.mean_absolute_error(Y_test, Y_pred2))
    # print('Mean Squared', metrics.mean_squared_error(Y_test, Y_pred),
    #       metrics.mean_squared_error(Y_test, Y_pred2))
    # print('Root Mean Squared',
    #       np.sqrt((metrics.mean_squared_error(Y_test, Y_pred))),
    #       np.sqrt((metrics.mean_squared_error(Y_test, Y_pred2))))

    # print(reg.predict([[1, 15]]))
    # print(Y_pred)
    # format_results(Y_test, Y_pred, Y_pred2)
    return reg
Example #16
def fit_polynomial_bayesian_skl(X, Y, degree,
                                lambda_shape=1.e-6, lambda_invscale=1.e-6,
                                padding=10, n=100,
                                X_unknown=None):
    X_v = pol.polyvander(X, degree)

    clf = BayesianRidge(lambda_1=lambda_shape, lambda_2=lambda_invscale)
    clf.fit(X_v, Y)

    coeff = np.copy(clf.coef_)

    # the Vandermonde matrix already has a column of ones at the start, so the
    # fitted intercept can simply be folded into the first coefficient
    coeff[0] += clf.intercept_
    coeff[0] += clf.intercept_

    ret_ = [coeff]

    # generate the line
    x = np.linspace(X.min()-padding, X.max()+padding, n)
    x_v = pol.polyvander(x, degree)

    # using the provided predict method
    y_1 = clf.predict(x_v)

    # using np.dot() with coeff
    y_2 = np.dot(x_v, coeff)

    ret_.append(((x, y_1), (x, y_2)))

    if X_unknown is not None:
        xu_v = pol.polyvander(X_unknown, degree)

        # using the predict method
        yu_1 = clf.predict(xu_v)

        # using np.dot() with coeff
        yu_2 = np.dot(xu_v, coeff)

        ret_.append(((X_unknown, yu_1), (X_unknown, yu_2)))

    return ret_
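An illustrative call on noisy sine data (the names are placeholders; degree=3 fits a cubic):

rng = np.random.RandomState(42)
X_obs = np.sort(rng.uniform(0.0, 10.0, 30))
Y_obs = np.sin(X_obs) + 0.1 * rng.randn(30)

ret = fit_polynomial_bayesian_skl(X_obs, Y_obs, degree=3,
                                  X_unknown=np.array([11.0, 12.0]))
coeff = ret[0]                 # coefficients with the intercept folded in
(x_line, y_line), _ = ret[1]   # fitted line via clf.predict / np.dot
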
Example #17
    def fit(self, smiles, y=None, *, X_scaler=None, y_scaler=None, **kwargs):
        """
        Parameters
        ----------
        smiles: list[str]
            SMILES for training.
        y: pandas.DataFrame
            Target properties for training.
        X_scaler: Scaler (optional, not implemented)
            Scaler for transforming X.
        y_scaler: Scaler (optional, not implemented)
            Scaler for transforming y.
        kwargs: dict
            Parameters pass to BayesianRidge initialization.
        """

        if self._mdl:
            raise RuntimeError('estimators have been set. '
                               'If you want to re-train these estimators, '
                               'please use the `remove_estimator()` method first.')

        if not isinstance(y, pd.DataFrame):
            raise TypeError(
                'please package all properties into a pd.DataFrame')

        # remove NaN from X
        desc = self._descriptor.transform(
            smiles, return_type='df').reset_index(drop=True)
        y = y.reset_index(drop=True)
        desc.dropna(inplace=True)
        y = y.loc[desc.index]

        for c in y:
            y_ = y[c]  # get target property.
            # remove NaN from y_
            y_.dropna(inplace=True)
            desc_ = desc.loc[y_.index]
            desc_ = desc_.values

            mdl = BayesianRidge(compute_score=True, **kwargs)
            mdl.fit(desc_, y_)
            self._mdl[c] = mdl
def mr_link_ridge(outcome_geno,
                 r_sq_mat,
                 exposure_betas,
                 causal_exposure_indices,
                 outcome_phenotype,
                 upper_r_sq_threshold=0.99,
                 lower_r_sq_threshold=0.1,
                 prune_r_sq_threshold=0.95,
                 ):
    """

    Does MR-link solved by ridge regression.
    Please note that the p value and se is uncorrected. so these are usually _very_ conservative.
    See the MR-link manuscript for details.

    :param outcome_geno: outcome genotypes
    :param r_sq_mat: R^2 matrix in order of genotypes of outcome geno
    :param exposure_betas: beta estimates of the exposure instrumental variables.
    :param causal_exposure_indices:  indices of the exposure instrumental variables.
    :param outcome_phenotype: outcome phenotype vector
    :param upper_r_sq_threshold: the upper r_sq threshold for which the variants around the IVs are pruned.
    :return: beta, se and p value estimate of the MR-link estimate
    """

    design_mat = make_mr_link_design_matrix(outcome_geno,
                                            r_sq_mat,
                                            exposure_betas,
                                            causal_exposure_indices,
                                            upper_r_sq_threshold=upper_r_sq_threshold,
                                            lower_r_sq_threshold=lower_r_sq_threshold,
                                            prune_r_sq_threshold=prune_r_sq_threshold
                                            )

    ridge_fit = BayesianRidge(fit_intercept=False)

    ridge_fit.fit(design_mat, outcome_phenotype)

    t_stat = np.abs(ridge_fit.coef_[0] / np.sqrt(ridge_fit.sigma_[0, 0]))

    p_val = 2 * scipy.stats.norm.sf(t_stat)

    return ridge_fit.coef_[0], np.sqrt(ridge_fit.sigma_[0,0]), p_val
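The p value above is a plain Wald test on the first coefficient: sigma_ is the posterior covariance of the weights, so its diagonal gives squared standard errors. The same computation in isolation (a sketch, not part of the MR-link code; `ridge_fit` is any fitted BayesianRidge):

def wald_p_value(ridge_fit, coef_index=0):
    # |beta| / se(beta), treated as a standard-normal statistic
    beta = ridge_fit.coef_[coef_index]
    se = np.sqrt(ridge_fit.sigma_[coef_index, coef_index])
    return 2 * scipy.stats.norm.sf(np.abs(beta / se))
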
def rand_search_lm(funcs, df, var_y, var_media, var_nonmedia, n):
    
    clf = BayesianRidge(compute_score=True,fit_intercept=True)

    my_coefs=np.zeros((n,len(var_media)+ len(var_nonmedia)+1))
    my_scores=np.zeros((n,1))
 
    par_list=ram_par_create(funcs,df,var_media,n)
        
    df_var=df.loc[:,var_y+var_nonmedia+var_media]
    if funcs == "log_y":
        y=np.log(df[var_y]/(100-df[var_y])).values
    elif funcs == "Simple Power":
        y=df[var_y].values
    elif funcs == "S curves":
        y=df[var_y].values
    elif funcs == "log_log":
        y=np.log(df[var_y]).values       
    
    
    # modeling
    for iteration in range(par_list.shape[0]):
        
        X = df_var.iloc[:,1:].values        
        for j in range(len(var_media)):
           
            if funcs == "log_y":
                X[:,j+len(var_nonmedia)]=Adstock(df[var_media[j]],par_carryover=par_list[iteration][j])
            elif funcs == "Simple Power":
                X[:,j+len(var_nonmedia)]=Adstock(df[var_media[j]],par_carryover=par_list[iteration][j])**par_list[iteration][j+len(var_media)]
            elif funcs == "S curves":
                X[:,j+len(var_nonmedia)]=np.exp(par_list[iteration][j+len(var_media)]-par_list[iteration][j+2*len(var_media)]/(Adstock(df[var_media[j]],par_carryover=par_list[iteration][j])+0.00001))
            elif funcs == "log_log":
                X[:,j+len(var_nonmedia)]=np.log(Adstock(df[var_media[j]],par_carryover=par_list[iteration][j])+0.00001)
                        
        clf.fit(X, y)
        my_coefs[iteration,0]=clf.intercept_
        my_coefs[iteration,1:]=clf.coef_
        my_scores[iteration,0]=clf.score(X,y)
                               

    return my_coefs,my_scores,par_list
Example #20
def test_bayesian_ridge_score_values():
    """Check value of score on toy example.

    Compute log marginal likelihood with equation (36) in Sparse Bayesian
    Learning and the Relevance Vector Machine (Tipping, 2001):

    - 0.5 * (log |Id/alpha + X.X^T/lambda| +
             y^T.(Id/alpha + X.X^T/lambda).y + n * log(2 * pi))
    + lambda_1 * log(lambda) - lambda_2 * lambda
    + alpha_1 * log(alpha) - alpha_2 * alpha

    and check equality with the score computed during training.
    """

    X, y = diabetes.data, diabetes.target
    n_samples = X.shape[0]
    # check with initial values of alpha and lambda (see code for the values)
    eps = np.finfo(np.float64).eps
    alpha_ = 1. / (np.var(y) + eps)
    lambda_ = 1.

    # value of the parameters of the Gamma hyperpriors
    alpha_1 = 0.1
    alpha_2 = 0.1
    lambda_1 = 0.1
    lambda_2 = 0.1

    # compute score using formula of docstring
    score = lambda_1 * log(lambda_) - lambda_2 * lambda_
    score += alpha_1 * log(alpha_) - alpha_2 * alpha_
    M = 1. / alpha_ * np.eye(n_samples) + 1. / lambda_ * np.dot(X, X.T)
    M_inv = pinvh(M)
    score += - 0.5 * (fast_logdet(M) + np.dot(y.T, np.dot(M_inv, y)) +
                      n_samples * log(2 * np.pi))

    # compute score with BayesianRidge
    clf = BayesianRidge(alpha_1=alpha_1, alpha_2=alpha_2,
                        lambda_1=lambda_1, lambda_2=lambda_2,
                        n_iter=1, fit_intercept=False, compute_score=True)
    clf.fit(X, y)

    assert_almost_equal(clf.scores_[0], score, decimal=9)
Example #21
def stack(*avg):
    train_stack = np.vstack(avg[0]).transpose()
    test_stack = np.vstack(avg[1]).transpose()
    y_train = avg[2]
    folds_stack = StratifiedKFold(n_splits=10, shuffle=True, random_state=8888)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])
    for fold_, (trn_idx,
                val_idx) in enumerate(folds_stack.split(train_stack, y_train)):
        print("fold :", fold_ + 1)
        trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
        val_data, val_y = train_stack[val_idx], y_train[val_idx]
        stacking = BayesianRidge()
        stacking.fit(trn_data, trn_y)
        oof_stack[val_idx] = stacking.predict(val_data)
        predictions += stacking.predict(test_stack) / folds_stack.n_splits

    print("stacking auc score: {:<8.8f}".format(
        roc_auc_score(y_train, oof_stack)))
    return predictions
Example #22
def predict_price(area) -> float:
    """
    This method must accept as input an array `area` (represents a list of areas sizes in sq feet) and must return the respective predicted prices (price per sq foot) using the linear regression model that you build.

    You can run this program from the command line using `python3 regression.py`.
    """
    response = requests.get(TRAIN_DATA_URL)
    s = response.content
    d = pd.read_csv(io.StringIO(s.decode('utf-8')))
    d = d.T
    d.reset_index(level=0, inplace=True)
    d = d[1:]
    d.columns = ['area', 'price']
    model = BayesianRidge()
    model.fit(numpy.reshape(numpy.array(d['area']), (-1, 1)),
              numpy.array(d['price']))  # 1-D target avoids a shape warning
    area = area.reshape(-1, 1)
    print(model.coef_)
    print(model.intercept_)
    return model.predict(area)
Example #23
def Train_Model(X_Train, Y_Train, X_Test):
    # fitting Bayesian regression to the training dataset
    regressor = BayesianRidge()
    fitResult = regressor.fit(X_Train, Y_Train)

    #predicting the model for X_test
    b = fitResult.predict(X_Test)

    # apply the exponential function to the predictions and write them to out.csv
    b = np.exp(b)
    np.savetxt('out.csv', b)
Example #24
def test_bayesian_initial_params():
    # Test BayesianRidge with initial values (alpha_init, lambda_init)
    X = np.vander(np.linspace(0, 4, 5), 4)
    y = np.array([0., 1., 0., -1., 0.])    # y = (x^3 - 6x^2 + 8x) / 3

    # In this case, starting from the default initial values will increase
    # the bias of the fitted curve. So, lambda_init should be small.
    reg = BayesianRidge(alpha_init=1., lambda_init=1e-3)
    # Check the R2 score nearly equals to one.
    r2 = reg.fit(X, y).score(X, y)
    assert_almost_equal(r2, 1.)
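For comparison, fitting the same toy data with the default initial values tends to score noticeably below 1 here, which is what motivates the custom alpha_init/lambda_init (a sketch reusing X and y from the test above):

reg_default = BayesianRidge()                       # default initial values
r2_default = reg_default.fit(X, y).score(X, y)
reg_tuned = BayesianRidge(alpha_init=1., lambda_init=1e-3)
r2_tuned = reg_tuned.fit(X, y).score(X, y)
print(r2_default, r2_tuned)                         # the tuned init should be ~1
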
Example #25
    def __get_fitness(self, genes, x, y, initialFitness=None, runs=1):
        nog = np.count_nonzero(genes)
        if nog == 0:
            return self.__get_fitness_instance(0, 1000, 1, len(genes))

        hashedGenes = hash(genes.data.tobytes())
        self.FitnessCites += 1
        if initialFitness is not None and hashedGenes in self.Memo:  # tabu memory lookup
            self.SavedCalculationTimes += 1
            return self.__get_fitness_instance(initialFitness.Metric - 10,
                                               1000, initialFitness.Std + 0.1,
                                               initialFitness.NOG)

        clf = BayesianRidge(n_iter=100)
        clf.fit(x[:, np.where(genes == 1)[0]], y)
        scores2, bic = regression_accuracy_scorer(
            clf, x[:, np.where(genes == 1)[0]], y)
        fitness = self.__get_fitness_instance(scores2, bic, 0, nog)
        self.Memo[hashedGenes] = fitness
        return fitness
Example #26
def model():
    x_train, x_test, y_train, y_test = feature_sel_data_split()
    model_1 = LinearRegression()
    model_1.fit(x_train, y_train)
    model_1_result = model_1.predict(x_test)
    model_1_file = open("data/linear_reg.model", 'wb')
    pickle.dump(model_1, model_1_file)
    model_1_file.close()

    ridge = Ridge(alpha=0.5)
    ridge.fit(x_train, y_train)
    ridge_result = ridge.predict(x_test)
    ridge_file = open("data/ridge.model", 'wb')
    pickle.dump(ridge, ridge_file)
    ridge_file.close()

    lasso = Lasso(alpha=0.01)
    lasso.fit(x_train, y_train)
    lasso_result = lasso.predict(x_test)
    lasso_file = open("data/lasso.model", 'wb')
    pickle.dump(lasso, lasso_file)
    lasso_file.close()

    bayesian = BayesianRidge()
    bayesian.fit(x_train, y_train)
    bayesian_result = bayesian.predict(x_test)
    bayesian_file = open("data/bayesian.model", 'wb')
    pickle.dump(bayesian, bayesian_file)
    bayesian_file.close()

    elastic = ElasticNet(alpha=0.01)
    elastic.fit(x_train, y_train)
    elastic_result = elastic.predict(x_test)
    elastic_file = open("data/elastic.model", 'wb')
    pickle.dump(elastic, elastic_file)
    elastic_file.close()

    return y_test, [
        model_1_result, ridge_result, elastic_result, lasso_result,
        bayesian_result
    ]
    def get_stacking(self, oof_list, prediction_list, labels):
        train_stack = np.vstack(oof_list).transpose()
        test_stack = np.vstack(prediction_list).transpose()

        repeats = len(oof_list)
        # RepeatedKFold: k-fold cross-validation repeated p times
        kfolder = RepeatedKFold(n_splits=self.n_fold,
                                n_repeats=repeats,
                                random_state=4590)
        kfold = kfolder.split(train_stack, labels)
        preds_list = list()
        stacking_oof = np.zeros(train_stack.shape[0])

        for train_index, vali_index in kfold:
            k_x_train = train_stack[train_index]
            k_y_train = labels.loc[train_index]
            k_x_vali = train_stack[vali_index]

            gbm = BayesianRidge(normalize=True)
            gbm.fit(k_x_train, k_y_train)

            k_pred = gbm.predict(k_x_vali)
            stacking_oof[vali_index] = k_pred

            preds = gbm.predict(test_stack)
            preds_list.append(preds)

        fold_mae_error = mean_absolute_error(labels, stacking_oof)
        print(f'stacking fold mae error is {fold_mae_error}')
        fold_score = 1 / (1 + fold_mae_error)
        print(f'fold score is {fold_score}')

        preds_columns = [
            'preds_{id}'.format(id=i) for i in range(self.n_fold * repeats)
        ]
        preds_df = pd.DataFrame(data=preds_list)
        preds_df = preds_df.T
        preds_df.columns = preds_columns
        stacking_prediction = list(preds_df.mean(axis=1))

        return stacking_oof, stacking_prediction
Example #28
def main():
    # Multiple Regression using Backward Elimination and Cross_Val

    # df = pd.read_csv('train_adjusted.csv')
    df = pd.read_csv('train_adjusted.csv')
    columns = list(df.columns.values)

    # train all columns
    x = df[[
        'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1',
        'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath', 'BedroomAbvGr',
        'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
        'GarageCars', 'WoodDeckSF', 'ScreenPorch'
    ]]

    # y = df.iloc[:, lambda df: [38]].values
    y = df[['SalePrice']]

    poly = PolynomialFeatures(degree=2, include_bias=False)
    x = poly.fit_transform(x)
    sds = StandardScaler()
    x = sds.fit_transform(x)

    model = BayesianRidge()
    model.fit(x, y)

    scores = cross_val_score(model, x, y, cv=10)
    print(scores)

    predictions = cross_val_predict(model, x, y, cv=10)

    coeff = metrics.r2_score(y, predictions)
    print("R^2 Value:", coeff)

    rmse = np.sqrt(mean_squared_error(predictions, y))
    print('Root Mean Squared Error:', rmse)

    plt.scatter(y, predictions)
    plt.xlabel("Actual Sales Price")
    plt.ylabel("Predictions")
Example #29
def bayeImpute(data, target_col, verbose=0):
    '''
    Impute the missing values in target_col, currently with BayesianRidge.
    Return the imputed data and the fitted model.
    '''

    from sklearn.linear_model import BayesianRidge, LinearRegression
    from sklearn.ensemble import RandomForestRegressor
    model = BayesianRidge()
    # model = LinearRegression()
    # model = RandomForestRegressor()

    original_data = np.copy(data)

    target = data[:, target_col]
    data = np.delete(data, obj=target_col,
                     axis=1)  #remove the missing-value column
    mv_mask = pd.isnull(target)
    if verbose:
        print("number of imputated cells: {}".format(
            sum(pd.isnull(original_data[:, target_col]))))

    x_test = data[mv_mask]
    x_train = data[~mv_mask]
    y_train = target[~mv_mask]
    # check whether regression is valid: if only one value exists in the target,
    # fall back to the default "mean" method (every imputed cell gets that value)
    is_other_value = False in (y_train == y_train[0])
    if not is_other_value:
        model = "mean"
        original_data[mv_mask, target_col] = y_train[0]
        return original_data, model

    model.fit(x_train, y_train)
    result = model.predict(x_test)
    original_data[
        mv_mask,
        target_col] = result  #put the imputation result back to original data, following the index

    # print("coefficient: {}".format(model.coef_))
    return original_data, model
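An illustrative run on a small matrix with NaNs in column 2 (the demo data is made up; numpy and pandas are assumed imported as np/pd in the module):

import numpy as np
import pandas as pd

demo = np.array([[1.0, 2.0, 3.0],
                 [2.0, 4.0, np.nan],
                 [3.0, 6.0, 9.0],
                 [4.0, 8.0, np.nan]])
filled, fitted = bayeImpute(demo, target_col=2, verbose=1)
print(filled[:, 2])   # NaNs replaced by regression predictions
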
Example #30
def bayfit(data):  #Bayesian regression - very bad
    lw = 2
    x = np.arange(len(data) - 1)
    degree = 3
    clf_poly = BayesianRidge()
    clf_poly.fit(np.vander(x, degree), data[x])
    x_plot = np.arange(len(data))
    y_mean, y_std = clf_poly.predict(np.vander(x_plot, degree),
                                     return_std=True)
    plt.figure(figsize=(6, 5))
    plt.errorbar(x_plot,
                 y_mean,
                 y_std,
                 color='navy',
                 label="Polynomial Bayesian Ridge Regression",
                 linewidth=lw)
    plt.plot(x_plot, data, color='gold', linewidth=lw, label="Ground Truth")
    plt.ylabel("Output y")
    plt.xlabel("Feature X")
    plt.legend(loc="lower left")
    plt.show()
Example #31
def train_classiifer(X_train, y_train, to_tune, classifier):
    # Initialize classifier. Note that each second assignment below overrides
    # the first: the SVR replaces the BayesianRidge default, and the SVR
    # param_grid replaces the BayesianRidge one (only the last one is used).
    clf = BayesianRidge()
    clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    # clf = RandomForestRegressor()
    if classifier:
        clf = classifier
        to_tune = False
    if to_tune:
        # Randomized search: find optimal classifier parameters.
        param_grid = {'alpha_1': sp_rand(), 'alpha_2': sp_rand()}
        param_grid = {'C': sp_rand(), 'gamma': sp_rand()}
        rsearch = RandomizedSearchCV(estimator=clf, 
                                     param_distributions=param_grid, n_iter=5000)
        rsearch.fit(X_train, y_train)
        # Use tuned classifier.
        clf = rsearch.best_estimator_
          
    # Train the classifier.
    clf.fit(X_train, y_train)
    return clf
Example #32
def stacking_model(oof_lgb, oof_xgb, predictions_lgb, predictions_xgb):
    train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

    folds_stack = RepeatedKFold(n_splits=9, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y_train_)):
        trn_data, trn_y = train_stack[trn_idx], y_train_[trn_idx]
        val_data, val_y = train_stack[val_idx], y_train_[val_idx]

        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)

        oof_stack[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 18  # 9 splits x 2 repeats = 18 folds

    loss = mean_squared_error(y_train_, oof_stack)
    print('merge loss:', loss)
    return predictions
Example #33
def bayesianRidge(X_train, y_train, X_test, y_test, Identifier):
    '''
        Fits Bayesian Ridge model on the data provided after feature selection.
    :param X_train: the data frame containing the selected features for training
    :param y_train: the data frame of target variable used for training
    :param X_test: the data frame containing the selected features for testing
    :param y_test: the data frame of target variable for testing
    :param Identifier: whether called for time series prediction or news prediction
    :return: returns the error score and predicted values
    '''

    bayesianRidge = BayesianRidge()

    bayesianRidge.fit(X_train, y_train)

    # save the model to disk
    if Identifier != "News":
        filename = '../Models/BayesianRidge' + 'TimeSeries' + '.sav'
    else:
        filename = '../Models/BayesianRidge' + 'News' + '.sav'
    joblib.dump(bayesianRidge, filename)

    prediction = bayesianRidge.predict(X_test)

    prediction = pd.DataFrame(prediction, index=y_test.index)
    error = mean_absolute_error(y_test, prediction)

    if Identifier != "News":
        makeGraph(y_test,
                  valueFromTimeSeries=prediction,
                  name="Time Series - Bayesian Ridge")
    else:
        makeGraph(y_test,
                  valueFromNews=prediction,
                  name="News - Bayesian Ridge")

    #print(prediction)
    statistic, pvalue = mannwhitneyu(y_test, pd.Series(prediction[0]))

    return error, prediction, pvalue
Example #34
    def ApplyBayesianRidge(self, train, test, cross_validation, full_train,
                           config):
        BR = BayesianRidge(verbose=True, n_iter=1000, tol=0.00001)
        target_train = train[['Hazard']]
        cross_validation_test = cross_validation[['Hazard']]
        prepared_train = train[train.columns.difference(['Id', 'Hazard'])]

        print "prepared_train meta"
        print "shape", prepared_train.shape
        print prepared_train.head(3)

        BR.fit(prepared_train, target_train)
        dt = BR.predict(test[test.columns.difference(['Id'])])
        print "prediction score on cross validation"
        print BR.score(
            cross_validation[cross_validation.columns.difference(
                ['Id', 'Hazard'])], cross_validation_test)
        dt_cv = BR.predict(
            cross_validation[cross_validation.columns.difference(
                ['Id', 'Hazard'])])
        test['Hazard'] = self.clipForecastValue(dt)
        cross_validation['predicted_Hazard'] = self.clipForecastValue(dt_cv)

        # print "sorted feature importance"
        # print sorted(zip(map(lambda x: round(x, 4), BR.feature_importances_), names),
        #	     reverse=True)

        print "regression model coefficients"
        print BR.coef_

        print "estimated precision of the noise"
        print BR.alpha_

        print "estimated precision of the weights"
        print BR.lambda_

        print "value of the objective function"
        print BR.scores_

        return test, cross_validation
Example #35
def bayesian_ridge(X_train, y_train, X_test, y_test):
    '''
        Purpose: Use Bayesian Ridge to calculate accuracy
        Input: X_train, y_train, X_test, y_test
        Output: accuracy_score
    '''
    clf = BayesianRidge(compute_score=True)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred = y_pred.round()
    #ols = LinearRegression()
    #ols.fit(X, y)
    return metrics.accuracy_score(y_test, y_pred)
Example #36
def _fit_bayesian_ridge(X: np.ndarray, y: np.ndarray,
                        fit_intercept: bool = False,
                        **kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by using
    Bayesian ridge regression as implemented in scikit-learn, in the
    form of a dictionary with a key named `parameters`.

    Parameters
    -----------
    X
        fit matrix
    y
        target array
    fit_intercept
        center data or not, forwarded to sklearn
    """
    brr = BayesianRidge(fit_intercept=fit_intercept, **kwargs)
    brr.fit(X, y)
    results = dict()
    results['parameters'] = brr.coef_
    return results
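An illustrative call (the fit matrix and target are synthetic placeholders; np is numpy):

rng = np.random.RandomState(0)
X_demo = rng.randn(50, 4)
y_demo = X_demo @ np.array([1.0, -2.0, 0.5, 0.0])
print(_fit_bayesian_ridge(X_demo, y_demo)['parameters'])
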
Example #37
def MSE_Bay(train_data,lag,t_ahead,s_i):
    sample_x = np.transpose(train_data[:,:-lag])
    
    for i in range(1, lag):
        sample_x = np.hstack([sample_x, np.transpose(train_data[:,i:-(lag-i)])])
    
    sample_x = sample_x[:-t_ahead,:]
    
#    num_stream = 1
    slding_predict_t = 730
    landmark_win_ini_size = 367
#    for s_i in range(num_stream):
    sample_y_si = np.transpose(train_data[s_i,t_ahead+lag-1:])
#        print(sample_y_si[367])
    reg_si = BayesianRidge()
    pre_y = []
    act_y = []
    for landmark_win in range(slding_predict_t):
        train_x = sample_x[:landmark_win_ini_size+landmark_win,:]
        train_y = sample_y_si[:landmark_win_ini_size+landmark_win]
        reg_si.fit(train_x,train_y)
        y_hat = reg_si.predict(sample_x[landmark_win_ini_size+landmark_win:landmark_win_ini_size+landmark_win+1,:])
        pre_y.append(y_hat)
        act_y.append(sample_y_si[landmark_win_ini_size+landmark_win:landmark_win_ini_size+landmark_win+1])
        
#        plt.plot(range(landmark_win_ini_size+1,landmark_win_ini_size+landmark_win+2),pre_y,label='prediction s'+str(s_i))
#        plt.plot(range(landmark_win_ini_size+1,landmark_win_ini_size+landmark_win+2),act_y,label='actual')
#        plt.legend()
#        plt.show()
#        print(pre_y)
#        print(act_y)
    MSE = 0
    for i in range(len(pre_y)):
        if not np.isnan(pre_y[i]):
            MSE = MSE + (pre_y[i] - act_y[i]) ** 2
#        pre = np.array(pre_y)
#        act = np.array(act_y)
#        print(np.sum(pre-act))
    return MSE,pre_y
Example #38
def myRidgeHH(data, hh=1, dd=1):
    """Produce a forecast for half hour hh on day dd.

    The output format is a numpy array of shape (n_households,).
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.linear_model import BayesianRidge
    data1 = data[data.Day == dd]
    past1 = data1[data1.Week != 22]
    past = past1[past1.Week != 21]

    if hh == 1:
        h = [1, 2, 3]
    elif hh == 2:
        h = [1, 2, 3, 4]
    elif hh in range(3, 47):
        h = [hh - 2, hh - 1, hh, hh + 1, hh + 2]
    elif hh == 47:
        h = [45, 46, 47, 48]
    elif hh == 48:
        h = [46, 47, 48]

    XX = pd.DataFrame()
    YY = pd.DataFrame()
    for i in h:
        XX = pd.concat([XX, past[past.HH == i]], ignore_index=True)
        YY = pd.concat([YY, past1[past1.HH == i]], ignore_index=True)
    X = XX.transpose()[4:]

    Y = YY[(YY.Week == YY.Week.max()) & (YY.HH == hh)].transpose()[4:]

    BR = BayesianRidge()
    BR.fit(X, Y)

    XX_new = YY[YY.Week != YY.Week.min()]
    X_new = XX_new.transpose()[4:]
    forecast_hh_dd = BR.predict(X_new)
    return forecast_hh_dd
Example #39
        def Bayesian(
                self, X_train, X_test, y_train, y_test, date, q
        ):  # self, Xtrain, Xtest, ytrain, ytest, ['dd','mm','yyyy'], index of algo
            model = BayesianRidge(compute_score=True)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            df = pd.DataFrame({
                'Actual': y_test.flatten(),
                'Predicted': y_pred.flatten()
            })
            mse = mean_squared_error(y_test, y_pred)
            predictedval = (model.predict([date]))

            self.mse[q] = mse
            self.predicted_avgtemp[q] = predictedval[0]
            # Bayesian here predicts only a single value, so min/max are left blank
            self.predicted_mintemp[q] = '-'
            self.predicted_maxtemp[q] = '-'
Example #40
def br_modeling(data, y_name, candidates_location):
    from sklearn.linear_model import BayesianRidge
    temp = data.copy()
    print("made temp copy")
    candidates = get_variables("./%s" % candidates_location)
    print("got candidates for regressors")
    temp = rf_trim(temp, y_name, candidates)
    print("trimmed dataset")
    model = BayesianRidge()
    print("assigned model")
    res = model.fit(temp[candidates], temp[y_name])
    print("fit model")
    joblib.dump(res, "./%sbr_model%s.pkl" % (y_name, datetime.datetime.today()))
    print("saved model")
    return res
Example #41
def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Bayesian ridge regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    clf = BayesianRidge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
    # Estimated precision of the noise from the fitted model
    ridge_alpha = clf.alpha_

    with open('../trained_networks/brr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
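Since the objects are pickled sequentially into one file, they must be unpickled in the same order (a sketch mirroring the dump order above):

with open('../trained_networks/brr_%d_data.pkl' % n_features, 'rb') as results:
    clf = pickle.load(results)
    mean_abs = pickle.load(results)
    mean_sq = pickle.load(results)
    median_abs = pickle.load(results)
    r2 = pickle.load(results)
    exp_var_score = pickle.load(results)
    y_pred = pickle.load(results)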

def sale(data):
	data = int(data) + 1
	return log(data)


dataset = pandas.read_csv("input/train2_.csv")
testset = pandas.read_csv("input/test2_.csv")

dataset['Sale'] = dataset['Sales'].apply(sale)

labelData = dataset['Sale'].values
myId = testset['Id'].values

testset.drop(['Id'], inplace=True, axis=1)
testData = testset.iloc[:, :].values
dataset.drop(['Sales', 'Sale'], inplace=True, axis=1)
dataData = dataset.iloc[:, :].values

BRModel = BayesianRidge(compute_score=True)
BRModel.fit(dataset.iloc[:, :].values, labelData)
preds = numpy.column_stack((myId, BRModel.predict(testData))).tolist()
preds = [[int(i[0])] + [exp(float(i[1])) - 1] for i in preds]

print(BRModel.scores_)
with open("result/sub_BayesRidge.csv", "w") as output:
	writer = csv.writer(output, lineterminator='\n')
	writer.writerow(["Id", "Sales"])
	writer.writerows(preds)
Example #43
print((y_test[y_test == 1] == y_test_predictions[y_test == 1]).sum().astype(float) / y_test[y_test == 1].shape[0])
# 0.875
# But, at what expense do we do this? To find out, use the following command:
print((y_test_predictions == y_test).sum().astype(float) / y_test.shape[0])
# 0.967999



# Directly applying Bayesian ridge regression

from sklearn.datasets import make_regression
X, y = make_regression(1000, 10, n_informative=2, noise=20)
#We can just "throw" ridge regression at the problem with a few simple steps:
from sklearn.linear_model import BayesianRidge
br = BayesianRidge()
br.fit(X, y)
print(br.coef_)
#array([0.3000136 , -0.33023408, 68.166673, -0.63228159, 0.07350987,
#-0.90736606, 0.38851709, -0.8085291 , 0.97259451, 68.73538646])

br_alphas = BayesianRidge(alpha_1=10, lambda_1=10)
br_alphas.fit(X, y)
print(br_alphas.coef_)
#array([0.30054387, -0.33130025, 68.10432626, -0.63056712,
#0.07751436, -0.90919326, 0.39020878, -0.80822013,
#0.97497567, 68.67409658])


# Using boosting to learn from errors

#Gradient boosting regression is a technique that learns from its mistakes. 
Example #44
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file')
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(',')]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ''
    if options.cuda:
        cuda_str = '-cuda'

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    num_filters = len(filter_consensus)
    # num_filters = 40
    filter_len = filter_consensus[0].shape[1]

    # position the motifs
    left_i = options.seq_length // 2 - options.center_dist - filter_len
    right_i = options.seq_length // 2 + options.center_dist

    ns_1hot = np.zeros((4,options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i]
            motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length))


    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = '%s/motif_seqs.h5' % options.out_dir
    h5f = h5py.File(seqs_file, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()

    # predict scores
    scores_file = '%s/motif_seqs_scores.h5' % options.out_dir
    torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, 'r')
    motif_seq_scores = np.array(hdf5_in['scores'])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        X = np.zeros((motif_seq_scores.shape[0],2*num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi,i] += 1
                X[xi,num_filters+j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:,ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print(model.score(X, motif_seq_scores[:, ti]))

        # print filter coefficients
        coef_out = open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w')
        for i in range(num_filters):
            print('%3d  %6.2f' % (i, model.coef_[i]), file=coef_out)
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters,num_filters))
        table_out = open('%s/table_t%d.txt' % (options.out_dir,ti), 'w')

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j])
                print('%3d  %3d  %6.3f  %6.3f  %6.3f' % cols, file=table_out)
                si += 1

        table_out.close()

        scores_abs = abs(filter_interaction.flatten())
        max_score = stats.quantile(scores_abs, .999)
        print('Limiting scores to +-%f' % max_score)
        filter_interaction_max = np.zeros((num_filters, num_filters))
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction_max[i,j] = np.min([filter_interaction[i,j], max_score])
                filter_interaction_max[i,j] = np.max([filter_interaction_max[i,j], -max_score])

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False)
        plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
# Linear Regression
print('linear')
lr = LinearRegression()
#lr.fit(x[:, np.newaxis], y)
#lr_sts_scores = lr.predict(xt[:, np.newaxis])
lr.fit(x, y)
lr_sts_scores = lr.predict(xt)


# Bayesian Ridge Regression
print('bayesian ridge')
br = BayesianRidge(compute_score=True)
#br.fit(x[:, np.newaxis], y)
#br_sts_scores = br.predict(xt[:, np.newaxis])
br.fit(x, y)
br_sts_scores = br.predict(xt)


# Elastic Net
print('elastic net')
enr = ElasticNet()
#enr.fit(x[:, np.newaxis], y)
#enr_sts_scores = enr.predict(xt[:, np.newaxis])
enr.fit(x, y)
enr_sts_scores = enr.predict(xt)


# Passive Aggressive Regression
print('passive aggressive')
par = PassiveAggressiveRegressor()
def prediction_BayesianRidge(X_train, Y_train, X_test, Y_test, normalize):

    # Print shapes of the training and testing data sets
    #print ("Shapes of the training and testing data sets")
    #print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    #Create our regression object

    lreg = BayesianRidge(normalize=normalize)

    # fit the regression on the training data only
    lreg.fit(X_train,Y_train)

    #print("The estimated intercept coefficient is %.2f " %lreg.intercept_)
    #print("The number of coefficients used was %d " % len(lreg.coef_))



    # Set a DataFrame from the Facts
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]


    # Set a new column lining up the coefficients from the linear regression
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)


    # Show
    #coeff_df

    #highest correlation between a fact and fraction votes
    #print ("Highest correlation fact: %s is %.9f" % (cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"], coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]) )

    #sns_plot = sns.jointplot(coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"Fraction Votes",pd.merge(X_test,pd.DataFrame(Y_test), right_index=True, left_index=True),kind="scatter")


    #Predictions on training and testing sets
    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)

    # The mean square error
    #print("MSE with X_train and Y_train: %.6f"  % np.mean((Y_train - pred_train) ** 2))
    #print("MSE with X_test and Y_test: %.6f"  %np.mean((Y_test - pred_test) ** 2))

    #Explained variance score: 1 is perfect prediction
    #print("Variance score: %.2f" % lreg.score(X_test, Y_test))

    result={}
    result["method"]="BayesianRidge"
    if normalize :
        result["normalize"]="Y"
    else:
        result["normalize"]="N"
    result["X_train_shape"]=X_train.shape
    result["Y_train_shape"]=Y_train.shape
    result["X_test_shape"]=X_test.shape
    result["Y_test_shape"]=Y_test.shape
    result["intercept"]=lreg.intercept_
    result["num_coef"]=len(lreg.coef_)
    result["max_fact"]=cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"]
    result["max_fact_value"]=coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]
    result["MSE_train"]=np.mean((Y_train - pred_train) ** 2)
    result["MSE_test"]=np.mean((Y_test - pred_test) ** 2)
    result["variance"]=lreg.score(X_test, Y_test)
    return pred_test,coeff_df,pred_train,result
Example #47
runs = []

for _ in range(10):
    train_latent_matrix = get_latent_matrix(x,y,x)
    test_latent_matrix = get_latent_matrix(x,y,x_test)
    # Clean out rows with NaN.
    #mask = ~np.any(np.isnan(train_latent_matrix), axis=1)
    #newx = train_latent_matrix[mask]
    #newy = y[mask]
    
    newx = np.nan_to_num(train_latent_matrix)
    newy = y

    #last_layer = SVR(kernel='rbf', C=1e3, gamma=0.1)
    last_layer = BayesianRidge()
    last_layer.fit(newx, newy)

    output = last_layer.predict(test_latent_matrix)
    assert len(output) == 8500
    runs.append(output)

#for i in runs:
#print len(i)
   
fout = open('modelz.10.output', 'w')
for line in zip(*runs):
    avg = sum(line) / len(line)
    if avg > 5:
        avg = 5.0
    elif avg < 0:
        avg = 0.0
Example #48
def main():
    usage = "usage: %prog [options] <model_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="center_dist",
        default=10,
        type="int",
        help="Distance between the motifs and sequence center [Default: %default]",
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g", dest="cuda", default=False, action="store_true", help="Run on the GPGPU [Default: %default]"
    )
    parser.add_option("-l", dest="seq_length", default=600, type="int", help="Sequence length [Default: %default]")
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide Basset model file")
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(",")]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ""
    if options.cuda:
        cuda_str = "-cuda"

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    # num_filters = len(filter_consensus)
    num_filters = 20
    filter_len = filter_consensus[0].shape[1]

    # position the motifs
    # integer slot positions flanking the sequence center (// keeps them ints)
    left_i = options.seq_length // 2 - options.center_dist - filter_len
    right_i = options.seq_length // 2 + options.center_dist

    ns_1hot = np.zeros((4, options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:, left_i : left_i + filter_len] = filter_consensus[i]
            motifs_seq[:, right_i : right_i + filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, options.seq_length))

    #################################################################
    # save sequences and predict scores
    #################################################################
    # save to HDF5
    seqs_file = "%s/motif_seqs.h5" % options.out_dir
    h5f = h5py.File(seqs_file, "w")
    h5f.create_dataset("test_in", data=seqs_1hot)
    h5f.close()

    # predict scores
    scores_file = "%s/motif_seqs_scores.h5" % options.out_dir
    torch_cmd = "th basset_place2_predict.lua %s %s %s %s" % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, "r")
    motif_seq_scores = np.array(hdf5_in["scores"])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        X = np.zeros((motif_seq_scores.shape[0], 2 * num_filters))
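        # one-hot pair encoding: column i marks the left motif and column
        # num_filters + j marks the right motif, so the linear fit captures the
        # additive effect of each motif independent of its partner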
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi, i] += 1
                X[xi, num_filters + j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:, ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print model.score(X, motif_seq_scores[:, ti])

        # print filter coefficients
        coef_out = open("%s/coefs_t%d.txt" % (options.out_dir, ti), "w")
        for i in range(num_filters):
            print >> coef_out, "%3d  %6.2f" % (i, model.coef_[i])
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters, num_filters))
        table_out = open("%s/table_t%d.txt" % (options.out_dir, ti), "w")

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i, j] = motif_seq_scores[si, ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si, ti], motif_seq_preds[si], filter_interaction[i, j])
                print >> table_out, "%3d  %3d  %6.3f  %6.3f  %6.3f" % cols
                si += 1

        table_out.close()

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction)
        plt.savefig("%s/heat_t%d.pdf" % (options.out_dir, ti))
Example #49
lambda_ = 4.
w = np.zeros(n_features)
# Only keep 10 weights of interest
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
# Create noise with a precision alpha of 50.
alpha_ = 50.
noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
# Create the target
y = np.dot(X, w) + noise

###############################################################################
# Fit the Bayesian Ridge Regression and an OLS for comparison
clf = BayesianRidge(compute_score=True)
clf.fit(X, y)

ols = LinearRegression()
ols.fit(X, y)

###############################################################################
# Plot true weights, estimated weights, histogram of the weights, and
# predictions with standard deviations
lw = 2
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, color='lightgreen', linewidth=lw,
         label="Bayesian Ridge estimate")
plt.plot(w, color='gold', linewidth=lw, label="Ground truth")
plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate")
plt.xlabel("Features")
Example #50
def nickmain1():

	train_all = pd.read_csv(trainloc)
	target_all = pd.read_csv(trainloc)
	test_all = pd.read_csv(testloc)
	targets = ['Ca','P','pH','SOC','Sand']
	train_cols_to_remove = ['PIDN']+targets
	train_all["Depth"] = train_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10])
	test_all["Depth"] = test_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10])
	common_features = ['BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI']
	feats_list = {}
	colnames_nums = []
	colnames = train_all.ix[:,'m7497.96':'m599.76'].columns.values
	for x in colnames:
		match = re.search(r'(?<=m)[0-9]*',x)
		if match: 
			colnames_nums.append(int(match.group()))
	
	print len(colnames)
	print len(colnames_nums)
	print len(train_all.ix[0,'m7497.96':'m599.76'].values)


	

	for target in targets:
		selector = SelectKBest(f_regression, k=200)
		selector.fit_transform(train_all.ix[:,'m7497.96':'m599.76'], train_all[target])
		selected = selector.get_support()
		feats = [col for (col,sel) in zip(list(train_all.ix[:,'m7497.96':'m599.76'].columns.values), selected) if sel]
		feats_list[target] = feats+common_features
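	# feats_list now maps each target to its 200 selected spectral bands plus
	# the shared spatial features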

		


	#pickTest = ['PIDN', 'BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI','Depth']#ORIGINAL10
	ids = np.genfromtxt(testloc, dtype=str, skip_header=1, delimiter=',', usecols=0)
	df = pd.DataFrame({"PIDN": ids, "Ca": test_all['PIDN'], "P": test_all['PIDN'], "pH": test_all['PIDN'], "SOC": test_all['PIDN'], "Sand": test_all['PIDN']})
	
	cv = cross_validation.KFold(len(train_all), n_folds=10, indices=False)
	subresults = {}
	results = []

	if issub == False:
		for train_sub, test_sub in cv:
			for target in targets:
				#clf = ensemble.GradientBoostingRegressor(n_estimators=6)
				#clf = RandomForestRegressor(n_estimators = 40)
				#clf = linear_model.Lasso(alpha=0.08)
				#clf = svm.SVC()
				#clf = tree.DecisionTreeRegressor(min_samples_leaf=20)
				#clf = Ridge(alpha=1.0)
				#clf = ElasticNet(alpha=0.1, l1_ratio=0.7)
				clf = BayesianRidge(compute_score=True)
				clf.fit(np.array(train_all[feats_list[target]])[train_sub], np.array(train_all[target])[train_sub])
				pred = clf.predict(np.array(train_all[feats_list[target]])[test_sub])
				subresults[target] = ev.rmse(np.array(train_all[target])[test_sub],np.array(pred))
				#df[target] = pred
			subtotal = 0 
			for x in subresults:
				subtotal = subtotal + subresults[x]
			print ("average for the run is ", subtotal/len(targets))
			results.append(subtotal/len(targets))
		print "Results: " + str( np.array(results).mean() )

	else:
		for target in targets:
			#clf = ensemble.GradientBoostingRegressor(n_estimators=6)
			#clf = RandomForestRegressor(n_estimators = 20)
			#clf = linear_model.Lasso(alpha=0.08)
			#clf = svm.SVC()
			#clf = tree.DecisionTreeRegressor(min_samples_leaf=20)
			#clf = Ridge(alpha=1.0)
			#clf = ElasticNet(alpha=0.1, l1_ratio=0.7)
			clf = BayesianRidge(compute_score=True)
			clf.fit(np.array(train_all[feats_list[target]]), np.array(train_all[target]))
			pred = clf.predict(np.array(test_all[feats_list[target]]))
			df[target] = pred
			df.to_csv(predloc, index=False, cols=["PIDN","Ca","P","pH","SOC","Sand"])
Example #51
lambda_ = 4.
w = np.zeros(n_features)
# Only keep 10 weights of interest
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
# Create noise with a precision alpha of 50.
alpha_ = 50.
noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
# Create the target
y = np.dot(X, w) + noise

###############################################################################
# Fit the Bayesian Ridge Regression and an OLS for comparison
clf = BayesianRidge(compute_score=True)
clf.fit(X, y)

ols = LinearRegression()
ols.fit(X, y)

###############################################################################
# Plot true weights, estimated weights and histogram of the weights
pl.figure(figsize=(6, 5))
pl.title("Weights of the model")
pl.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate")
pl.plot(w, 'g-', label="Ground truth")
pl.plot(ols.coef_, 'r--', label="OLS estimate")
pl.xlabel("Features")
pl.ylabel("Values of the weights")
pl.legend(loc="best", prop=dict(size=12))
Example #52
df = pd.concat(frames, axis=0, ignore_index=True)

### Imputing DYAR
train = df[df.DYAR.notnull() & df.pct_team_tgts.notnull()]
train.reset_index(inplace=True, drop=True)
test = df[df.DYAR.isnull() & df.pct_team_tgts.notnull()]
test.reset_index(inplace=True, drop=True)

features = ['targets', 'receptions', 'rec_tds', 'start_ratio', 'pct_team_tgts', 'pct_team_receptions', 'pct_team_touchdowns',
            'rec_yards', 'dpi_yards', 'fumbles', 'first_down_ctchs', 'pct_of_team_passyards']
X = scale(train[features])
y = train.DYAR

# Our best model for predicting DYAR was a Bayesian Ridge Regressor
br = BayesianRidge()
br.fit(X,y)
dyar_predictions = pd.DataFrame(br.predict(scale(test[features])), columns = ['DYAR_predicts'])

test = test.join(dyar_predictions)
test['DYAR'] = test['DYAR_predicts']
test.drop('DYAR_predicts', inplace=True, axis=1)
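# Note (not in the original): scale() standardises train and test independently
# here; reusing the training statistics would be the more orthodox route, e.g.
# with sklearn's StandardScaler:
#   scaler = StandardScaler().fit(train[features])
#   br.fit(scaler.transform(train[features]), y)
#   preds = br.predict(scaler.transform(test[features]))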

frames = [train,test]
df = pd.concat(frames, axis=0, ignore_index=True)

### Imputing EYds
train = df[df.EYds.notnull() & df.pct_team_tgts.notnull()]
train.reset_index(inplace=True, drop=True)
test = df[df.EYds.isnull() & df.pct_team_tgts.notnull()]
test.reset_index(inplace=True, drop=True)
Example #53
trainingcounts = counts[100:]
testcounts = counts[:100]

trainingrates = countrates[100:]
testrates = countrates[:100]

trainingtimes = times[100:]
testtimes = times[:100]

# using trainingcounts and training hists use log linear
#poisson_model = sm.GLM(trainingrates,
#						sm.tools.tools.add_constant(traininghists),
#						family =sm.families.Poisson(sm.genmod.families.links.log))
#results = poisson_model.fit()
#print(results.summary())

#x = results.predict(sm.tools.tools.add_constant(testhists))


clf = BayesianRidge(compute_score=True)
clf.fit(traininghists,trainingrates)
x = clf.predict(testhists)  

answer = testrates

plt.plot(bins,x)
plt.plot(bins,answer)
plt.show()
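
# A follow-up sketch (assumes scikit-learn >= 0.18): BayesianRidge.predict can
# also return the predictive standard deviation, giving uncertainty bands
# around the fitted rates.
x_mean, x_std = clf.predict(testhists, return_std=True)
plt.errorbar(bins, x_mean, yerr=x_std, label="prediction +/- 1 std")
plt.plot(bins, answer, label="observed rates")
plt.legend(loc="best")
plt.show()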


Example #54
def do_validation(data_path, steps=10):
    allfiles = initialize(data_path)
    gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, min_samples_leaf=5, subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    svr = SVR(kernel="linear")  # SVR exposes no probability option; that parameter belongs to SVC
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)
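    # unlike Ridge(alpha=18) above, BayesianRidge estimates its regularisation
    # (the alpha/lambda precisions) from the data; n_iter only caps the
    # evidence-maximisation iterations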

    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    for i in xrange(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y        
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))

        train = df[:100]
        label = train['label']
        del train['label']

        test = df[100:400]
        Y = test['label']
        del test['label']

        #to_drop = ['driver', 'trip', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'speed6', 'speed7', 'speed8', 'speed9', 
        #        'speed10', 'speed11', 'speed12', 'speed13', 'speed14', 'speed15', 'speed16', 'speed17', 'speed18', 'speed19', 
        #        'speed20', 'speed21', 'speed22', 'speed23', 'speed24', 'speed25', 'speed26', 'speed27', 'speed28', 'speed29', 
        #        'speed30', 'speed31', 'speed32', 'speed33', 'speed34', 'speed35', 'speed36', 'speed37', 'speed38', 'speed39', 
        #        'speed40', 'speed41', 'speed42', 'speed43', 'speed44', 'speed45', 'speed46', 'speed47', 'speed48', 'speed49', 
        #        'speed50', 'speed51', 'speed52', 'speed53', 'speed54', 'speed55', 'speed56', 'speed57', 'speed58', 'speed59', 
        #        'speed60', 'speed61', 'speed62', 'speed63', 'speed64', 'speed65', 'speed66', 'speed67', 'speed68', 'speed69', 
        #        'speed70', 'speed71', 'speed72', 'speed73', 'speed74', 'speed75', 'speed76', 'speed77', 'speed78', 'speed79', 'speed80']
        to_drop = ['driver', 'trip']

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)
        
        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr) 
        
        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)
    
        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)
        
        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)
        
        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)

        # Linear models.
        to_drop = ['driver', 'trip', 'distance', 'sd_acceleration', 'final_angle', 'mean_acceleration', 'mean_avg_speed', 'sd_inst_speed',
                'sd_avg_speed', 'mean_inst_speed', 'points']

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)
        
        logit.fit(X_train, label)
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)

        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)
        
        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)

        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)

        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)

        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)

    print ""
    print "GBM:", gbm_metrics/steps
    print "AdaBoost:", ada_metrics/steps
    print "Extra Trees:", etree_metrics/steps
    print "RF:", rf_metrics/steps
    print "KN:", kn_metrics/steps
    print ""
    print "Logit:", logit_metrics/steps
    print "SVR:", svr_metrics/steps
    print "Ridge:", ridge_metrics/steps
    print "BayesianRidge:", bridge_metrics/steps
    print "Elastic Net:", enet_metrics/steps
    print "Neural Networks:", nnet_metrics/steps
    print ""
Example #55
#sc = supervised_clustering.SupervisedClusteringRegressor(clf, connectivity=A,
#        n_iterations=30, verbose=1, n_jobs=8,
#        cv=ShuffleSplit(X_train.shape[0], n_splits=10, test_fraction=0.6,
#            random_state=0))
t1 = time()
sc.fit(X_train, y_train)
sc_time = time() -t1
computed_coefs = sc.inverse_transform()
computed_coefs = np.reshape(computed_coefs, [size, size, size])
score = sc.score(X_test, y_test)


###############################################################################
# Compute the results for simple BayesianRidge
t1 = time()
clf.fit(X_train, y_train)
bayes_time = time() - t1
bayes_coefs = clf.coef_
bayes_score = clf.score(X_test, y_test)
bayes_coefs = bayes_coefs.reshape((size, size, size))


###############################################################################
# Plot the results

pl.close('all')
pl.figure()
pl.title('Scores of the supervised clustering')
pl.subplot(2, 1, 1)
pl.plot(np.arange(len(sc.scores_)), sc.scores_)
pl.xlabel('iteration')
pl.ylabel('score')
Example #56
def main():
    usage = 'usage: %prog [options] <repr_hdf5> <data_hdf5> <target_index>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_only', default=False, action='store_true', help='Use additional features only; no sequence features')
    parser.add_option('-b', dest='balance', default=False, action='store_true', help='Downsample the negative set to balance [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='postmodel', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='regression', default=False, action='store_true', help='Regression mode [Default: %default]')
    parser.add_option('-s', dest='seq_only', default=False, action='store_true', help='Use sequence features only; no additional features [Default: %default]')
    parser.add_option('--sample', dest='sample', default=None, type='int', help='Sample from the training set [Default: %default]')
    parser.add_option('-t', dest='target_hdf5', default=None, help='Extract targets from this HDF5 rather than data_hdf5 argument')
    parser.add_option('-x', dest='regex_add', default=None, help='Filter additional features using a comma-separated list of regular expressions')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide full data HDF5, representation HDF5, and target index or filename')
    else:
        repr_hdf5_file = args[0]
        data_hdf5_file = args[1]
        target_i = int(args[2])  # used as a column index below; the filename case is not handled in this excerpt

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    #######################################################
    # preprocessing
    #######################################################

    # load training targets
    data_hdf5_in = h5py.File(data_hdf5_file, 'r')
    if options.target_hdf5:
        target_hdf5_in = h5py.File(options.target_hdf5, 'r')
    else:
        target_hdf5_in = data_hdf5_in
    train_y = np.array(target_hdf5_in['train_out'])[:,target_i]
    test_y = np.array(target_hdf5_in['test_out'])[:,target_i]

    # load training representations
    if not options.add_only:
        repr_hdf5_in = h5py.File(repr_hdf5_file, 'r')
        train_x = np.array(repr_hdf5_in['train_repr'])
        test_x = np.array(repr_hdf5_in['test_repr'])
        repr_hdf5_in.close()

    if options.seq_only:
        add_labels = []

    else:
        # load additional features
        train_a = np.array(data_hdf5_in['train_add'])
        test_a = np.array(data_hdf5_in['test_add'])
        add_labels = np.array(data_hdf5_in['add_labels'])

        if options.regex_add:
            fi = filter_regex(options.regex_add, add_labels)
            train_a, test_a, add_labels = train_a[:,fi], test_a[:,fi], add_labels[fi]

        # append additional features
        if options.add_only:
            add_i = 0
            train_x, test_x = train_a, test_a
        else:
            add_i = train_x.shape[1]
            train_x = np.concatenate((train_x,train_a), axis=1)
            test_x = np.concatenate((test_x,test_a), axis=1)
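            # sequence-representation columns come first; additional features
            # start at column add_i (used for the coefficient table below)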

    data_hdf5_in.close()
    if options.target_hdf5:
        target_hdf5_in.close()

    # balance
    if options.balance:
        train_x, train_y = balance(train_x, train_y)

    # sample
    if options.sample is not None and options.sample < train_x.shape[0]:
        sample_indexes = random.sample(range(train_x.shape[0]), options.sample)
        train_x = train_x[sample_indexes]
        train_y = train_y[sample_indexes]


    #######################################################
    # model
    #######################################################
    if options.regression:
        # fit
        model = BayesianRidge(fit_intercept=True)
        model.fit(train_x, train_y)

        # accuracy
        acc_out = open('%s/r2.txt' % options.out_dir, 'w')
        print >> acc_out, model.score(test_x, test_y)
        acc_out.close()

        test_preds = model.predict(test_x)

        # plot a sample of predictions versus actual
        plt.figure()
        sns.jointplot(test_preds[:5000], test_y[:5000], joint_kws={'alpha':0.3})
        plt.savefig('%s/scatter.pdf' % options.out_dir)
        plt.close()

        # plot the distribution of residuals
        plt.figure()
        sns.distplot(test_y-test_preds)
        plt.savefig('%s/residuals.pdf' % options.out_dir)
        plt.close()

    else:
        # fit
        model = LogisticRegression(penalty='l2', C=1000)
        model.fit(train_x, train_y)

        # accuracy
        test_preds = model.predict_proba(test_x)[:,1].flatten()
        acc_out = open('%s/auc.txt' % options.out_dir, 'w')
        print >> acc_out, roc_auc_score(test_y, test_preds)
        acc_out.close()

        # compute and print ROC curve
        fpr, tpr, thresholds = roc_curve(test_y, test_preds)

        roc_out = open('%s/roc.txt' % options.out_dir, 'w')
        for i in range(len(fpr)):
            print >> roc_out, '%f\t%f\t%f' % (fpr[i], tpr[i], thresholds[i])
        roc_out.close()

        # compute and print precision-recall curve
        precision, recall, thresholds = precision_recall_curve(test_y, test_preds)

        prc_out = open('%s/prc.txt' % options.out_dir, 'w')
        for i in range(len(precision)):
            print >> prc_out, '%f\t%f' % (precision[i], recall[i])
        prc_out.close()

    # save model
    joblib.dump(model, '%s/model.pkl' % options.out_dir)

    #######################################################
    # analyze
    #######################################################
    # print coefficients table
    coef_out = open('%s/add_coefs.txt' % options.out_dir, 'w')
    for ai in range(len(add_labels)):
        if options.regression:
            coefi = model.coef_[add_i+ai]
        else:
            coefi = model.coef_[0,add_i+ai]
        print >> coef_out, add_labels[ai], coefi
    coef_out.close()
Example #57
y2 = svr.predict(x_test_scaled)

kr = KernelRidge(alpha=0.0001, coef0=1, degree=1, gamma=0.001, kernel='rbf',kernel_params=None)
kr.fit(x_train_scaled, y_train)
y3 = kr.predict(x_test_scaled)

lasso = Lasso(alpha=1e-09)
lasso.fit(x_train_scaled, y_train)
y4 = lasso.predict(x_test_scaled)

linear_ridge = Ridge(alpha=0.1)
linear_ridge.fit(x_train_scaled,y_train)
y5 = linear_ridge.predict(x_test_scaled)

bayesian_ridge = BayesianRidge(alpha_1=1e-05, alpha_2=10, lambda_1=10, lambda_2=1e-05)
bayesian_ridge.fit(x_train_scaled, y_train)
y6 = bayesian_ridge.predict(x_test_scaled)
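# alpha_1/alpha_2 and lambda_1/lambda_2 above are the Gamma hyperprior
# parameters on the noise and weight precisions; these particular values were
# presumably chosen by a hyperparameter search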

sgd = SGDRegressor(alpha=0.1, epsilon=0.001, l1_ratio=0.2, loss='squared_loss', penalty='none', power_t=0.2)
sgd.fit(x_train_scaled, y_train)
y7 = sgd.predict(x_test_scaled)

###########################################
print '########## TESTING ERRORS ##########'

print "MAE for Linear Regression:", mean_absolute_error(y_test, y_predicted)
print "MAE for SVR:", mean_absolute_error(y_test, y2)
print "MAE for Kernel Ridge Regression:", mean_absolute_error(y_test, y3)
print "MAE for Lasso Regression:", mean_absolute_error(y_test, y4)
print "MAE for Linear Ridge Regression:", mean_absolute_error(y_test, y5)
print "MAE for Bayesian Ridge Regression:", mean_absolute_error(y_test, y6)
Example #58
def main():
    parser = argparse.ArgumentParser(description="""Creates embeddings predictions.""")
    parser.add_argument('--train')
    parser.add_argument('--test')
    parser.add_argument('--embeddings')
    parser.add_argument('--cv',default=False)


    args = parser.parse_args()

    stoplist = stopwords.words("english")
    stoplist.extend("it's 've 's i'm he's she's you're we're they're i'll you'll he'll ".split(" "))


    embeddings={}
    for line in codecs.open(args.embeddings,encoding="utf-8").readlines():
        line = line.strip()
        if line:
            a= line.split(" ")
            embeddings[a[0]] = np.array([float(v) for v in a[1:]]) #cast to float, otherwise we cannot operate

    train_indices = []
    test_indices = []
    train_scores = []
    train_features = []
    test_features = []


    # if args.learner == "logisticregression":
    #     learner= LogisticRegression()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeclassification":
    #     learner = tree.DecisionTreeClassifier()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeregression":
    #     learner = tree.DecisionTreeRegressor()
    #     learner_type = "regression"
    # elif args.learner == "bayesianridge":
    #     learner = BayesianRidge()
    #     learner_type = "regression"
    # else:
    learner = BayesianRidge()
    learner_type = "regression"

    le = preprocessing.LabelEncoder()


    for line in open(args.train).readlines():
        (index, score, tweet) = line.strip().split("\t")
        train_indices.append(index)
        train_scores.append(float(score))
        tweet = tweet.split(" ")
        train_features.append(embedfeats(tweet,embeddings,stoplist))


    train_indices = np.array(train_indices)
    train_scores = np.array(train_scores)
    train_features = np.array(train_features)
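    # embedfeats (defined elsewhere) presumably reduces each tweet to a
    # fixed-length vector, e.g. an average of its word embeddings with
    # stopwords removed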

    train_scores_int = [roundup(v) for v in train_scores]
    le.fit(train_scores_int)

    train_scores_int_transformed = le.transform(train_scores_int)


    if args.cv:
        train_cv={}
        cross=cross_validation.KFold(len(train_scores),n_folds=10)
        acc=[]
        for train_index, test_index in cross:
            #if args.debug:
            #    print("TRAIN:", len(train_index), "TEST:", len(test_index))
            X=train_features
            y=train_scores
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]


            learner.fit(X_train,y_train)

            y_pred= learner.predict(X_test)
            assert(len(y_pred)==len(test_index))
            tids=train_indices[test_index]
            for twid,pred in zip(tids,y_pred):
                train_cv[twid] =  pred

            acc.append(cosine_similarity(y_test,y_pred)[0][0])
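            # note: recent scikit-learn versions require 2-D inputs here, e.g.
            # cosine_similarity(y_test.reshape(1, -1), y_pred.reshape(1, -1))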

        print >>sys.stderr, "Cosine of 10-folds:", acc
        print >>sys.stderr, "Macro average:", np.mean(np.array(acc)), np.std(np.array(acc))

        for twid in train_indices:
            print "{}\t{}".format(twid,train_cv[twid])
    else:

        for line in open(args.test).readlines():
            (index, score, tweet) = line.strip().split("\t")
            test_indices.append(index)
            #scores.append(score)
            tweet = tweet.split(" ")
            test_features.append(embedfeats(tweet,embeddings,stoplist))


        #print  np.array(train_features).shape
        # when features are generated, train and test

        if learner_type == "regression":
            learner.fit(train_features,train_scores)
        else:
                learner.fit(train_features,train_scores_int_transformed)

        predicted_scores= learner.predict(test_features)
        if learner_type != "regression":
            predicted_scores = le.inverse_transform(predicted_scores)
        for index, score in zip(test_indices,predicted_scores):
            print index+"\t"+str(score)
Example #59
def bayes_ridge_reg(x_data,y_data):
    br = BayesianRidge()
    br.fit(x_data,y_data)
    print 'br params',br.coef_,br.intercept_
    adjusted_result = br.predict(x_data)
    return map(int,list(adjusted_result))
Example #60
# avg_age (the mean age per honorific) is assumed to be computed further up;
# the loop header itself was cut off in this excerpt
for index, row in tt.iterrows():
    if pd.isnull(row['Age']):
        for key in avg_age.keys():
            if key in row['Name']:
                tt.loc[index, "Age"] = avg_age[key]
#--------------------------------------------------------------------------------
#

X =  td.loc[:,['Sex','Age', 'Fare','SibSp','Parch','Pclass']].values
X = np.where(np.isnan(X), -1, X)
X_ = tt.loc[:,['Sex','Age', 'Fare','SibSp','Parch', 'Pclass']].values
X_ = np.where(np.isnan(X_), -1, X_)

Y = td['Survived'].values

clf = BayesianRidge(lambda_1=10**-4, lambda_2=10**-4, alpha_1=10**2.75,alpha_2=10**3.3, compute_score=True) #0.78947
model = clf.fit(X, Y)

#Result
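# BayesianRidge is a regressor; rounding its continuous output at 0.5 turns it
# into a crude binary classifier for the 0/1 Survived label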
predict_result = model.predict(X_).round(0).astype(int)

result = pd.DataFrame.from_items([('PassengerId',tt['PassengerId']), ('Survived',predict_result)])
result.to_csv('result/bayes_result.csv', index=False)

t1 = pd.read_csv("result/elastic_result_077512.csv")
t2 = pd.read_csv("result/bayes_result.csv")
t3 = t1 == t2
# count how many predictions differ from the earlier ElasticNet submission
i = 0
for index, row in t3.iterrows():
    if not row['Survived']:
        i += 1