def add_overall_trend_feature(df_shop, target='pays_count'):
    biweek_max = df_shop.biweek_id.max()

    trend_name = 'trend_overall'
    coeff_name = 'trend_overall_coeff'
    df_shop[trend_name] = np.nan
    df_shop[coeff_name] = np.nan

    for m in range(biweek_max - 1, 0, -1):
        train_idx = df_shop.biweek_id >= m
        test_idx = df_shop.biweek_id == (m - 1)

        df_train = df_shop[train_idx]

        y = df_train[target]
        not_null = ~y.isnull()
        if not_null.sum() <= 7:
            continue

        x = df_train.days_from_beginning
        x_not_null = x[not_null].values.reshape(-1, 1)
        y = y[not_null].values
        lr = Ridge(alpha=1).fit(x_not_null, y)

        if m == biweek_max - 1:
            x = x.values.reshape(-1, 1)
            df_shop.loc[train_idx, trend_name] = lr.predict(x)
            df_shop.loc[train_idx, coeff_name] = lr.coef_[0]

        df_test = df_shop[test_idx]
        x = df_test.days_from_beginning.values.reshape(-1, 1)

        df_shop.loc[test_idx, trend_name] = lr.predict(x)
        df_shop.loc[test_idx, coeff_name] = lr.coef_[0]

def add_weekly_overall_trends(df_shop, regressor, trend_name, coeff_name, target='pays_count'):
    biweek_max = df_shop.biweek_id.max()

    df_shop[trend_name] = np.nan
    df_shop[coeff_name] = np.nan

    for m in range(biweek_max - 1, 0, -1):
        train_idx = df_shop.biweek_id >= m
        test_idx = df_shop.biweek_id == (m - 1)

        df_train = df_shop[train_idx]

        y = df_train[target]
        not_null = ~y.isnull()
        if not_null.sum() < 7:
            continue

        x = -df_train[regressor]
        x_not_null = x[not_null].values.reshape(-1, 1)
        y = y[not_null].values
        lr = Ridge(alpha=1).fit(x_not_null, y)

        if m == biweek_max - 1:
            x = x.values.reshape(-1, 1)
            df_shop.loc[train_idx, trend_name] = lr.predict(x)
            df_shop.loc[train_idx, coeff_name] = lr.coef_[0]

        df_test = df_shop[test_idx]
        x = -df_test[regressor].values.reshape(-1, 1)

        df_shop.loc[test_idx, trend_name] = lr.predict(x)
        df_shop.loc[test_idx, coeff_name] = lr.coef_[0]

def add_window_trend_overall_features(df_shop, target='pays_count'):
    biweek_max = df_shop.biweek_id.max()

    for biweeks_past in [2, 3, 4, 5, 6, 12, 18]:
        trend_name = 'trend_%d' % biweeks_past
        trend_coef_name = 'trend_coef_%d' % biweeks_past
        df_shop[trend_name] = np.nan
        df_shop[trend_coef_name] = np.nan

        for m in range(biweek_max, biweeks_past, -1):
            m_past = m - biweeks_past
            train_idx = (df_shop.biweek_id >= m_past) & (df_shop.biweek_id <= m)
            test_idx = df_shop.biweek_id == (m_past - 1)

            df_rolling_train = df_shop[train_idx]
            df_rolling_test = df_shop[test_idx]

            y = df_rolling_train[target]
            not_null = ~y.isnull()
            if not_null.sum() <= 7:
                continue
        
            x = df_rolling_train.days_from_beginning
            x_not_null = x[not_null].values.reshape(-1, 1)
            y = y[not_null].values
            lr = Ridge(alpha=1).fit(x_not_null, y)

            if m == biweek_max:
                x = x.values.reshape(-1, 1)
                df_shop.loc[train_idx, trend_name] = lr.predict(x)
                df_shop.loc[train_idx, trend_coef_name] = lr.coef_[0]

            x_val = df_rolling_test.days_from_beginning.values.reshape(-1, 1)
            df_shop.loc[test_idx, trend_name] = lr.predict(x_val)
            df_shop.loc[test_idx, trend_coef_name] = lr.coef_[0]
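
A minimal usage sketch for the three helpers above, on a synthetic per-shop frame. Column names follow the functions' assumptions (biweek_id counts back from the oldest period, days_from_beginning grows over time, pays_count is the target); the data and the trend_w/trend_w_coeff feature names are made up for illustration.

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

rng = np.random.RandomState(42)
n_days = 28 * 14  # 28 biweeks of daily rows
df_shop = pd.DataFrame({
    'days_from_beginning': np.arange(n_days),
    'biweek_id': 27 - np.arange(n_days) // 14,  # 27 = oldest biweek, 0 = newest
    'pays_count': 50 + 0.1 * np.arange(n_days) + rng.randn(n_days),
})

add_overall_trend_feature(df_shop)
add_weekly_overall_trends(df_shop, 'days_from_beginning', 'trend_w', 'trend_w_coeff')
add_window_trend_overall_features(df_shop)
print(df_shop[['biweek_id', 'trend_overall', 'trend_2']].tail())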
Example #4
def Ridge_model(train_linear, test_linear):
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(train_linear_fea, train_linear_tar)
    ridgecv_score = ridgecv.score(train_linear_fea, train_linear_tar)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    #ridge.set_params(alpha=6,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    write_pkl(ridgecv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/ridge_params.pkl')
    return test_prediction_ridge
    
    
Example #5
class OrderScorer(Scorer):

    def __init__(self):
        self.classifier = Ridge(alpha=0.1)
        self.cache_filename = 'subgraph_order_scorer_reg.pickle'

    def train(self, train_instances, train_labels, update_cache=True,
              sample_weight=None):
        """
        Trains a scorer to score the quality of an ordering of sentences
        Loads from cache if available
        """
        self.classifier.fit(train_instances, train_labels, sample_weight=sample_weight)
        if update_cache:
            pickle.dump(self.classifier, open(self.cache_filename, 'wb'))

    def test(self, test_instances, test_labels):
        """ Uses test set to evaluate the performance of the scorer and print it out """
        scores = self.classifier.predict(test_instances)
        # TODO: print report

    def load(self):
        if os.path.exists(self.cache_filename):
            self.classifier = pickle.load(open(self.cache_filename, 'rb'))
        else:
            raise Exception("No classifier exists! Must call train with update_cache=True") 

    def evaluate(self, test_instance):
        """ Applies the scoring function to a given test instance """
        return self.classifier.predict([test_instance])[0]

def forecast_future_attention(train_index, test_index, alpha):
    """Forecast future attention via train dataset index and test dataset index."""
    m, n = len(train_index), len(test_index)
    x_train_predict = attention_data[train_index, :num_train]
    x_test_predict = attention_data[test_index, :num_train]
    for i in range(num_train, age):
        if with_share == 1:
            x_train = np.hstack((x_train_predict, share_data[train_index, :i + 1]))
            x_test = np.hstack((x_test_predict, share_data[test_index, :i + 1]))
            norm = np.hstack((x_train[:, :i], attention_data[train_index, i].reshape(m, 1), share_data[train_index, :i + 1]))
        else:
            x_train = x_train_predict
            x_test = x_test_predict
            norm = np.hstack((x_train[:, :i], attention_data[train_index, i].reshape(m, 1)))
        x_train_norm = x_train / np.sum(norm, axis=1)[:, None]
        y_train = np.ones(m, )

        # == == == == == == == == Training with Ridge Regression == == == == == == == == #
        predictor = Ridge(fit_intercept=False, alpha=alpha)
        predictor.fit(x_train_norm, y_train)

        # == == == == == == == == Iteratively add forecasted value to x matrix == == == == == == == == #
        predict_train_value = (predictor.predict(x_train) - np.sum(x_train, axis=1)).reshape(m, 1)
        predict_train_value[predict_train_value < 0] = 0
        x_train_predict = np.hstack((x_train_predict, predict_train_value))
        predict_test_value = (predictor.predict(x_test) - np.sum(x_test, axis=1)).reshape(n, 1)
        predict_test_value[predict_test_value < 0] = 0
        x_test_predict = np.hstack((x_test_predict, predict_test_value))
    return x_test_predict[:, num_train: age]
Example #7
class TriFiLearn:

    def __init__(self):
        print "requesting x training set"
        # setX = pd.read_csv("http://localhost:8080/csv/dimension/x/version/floor10-test")
        setX = pd.read_csv("csv/training-set-x-version-floor10-test.csv")
        # keep labels/features on the instance so trainX/trainY can extend them
        self.labelsX = list(setX['x'].values)
        self.featuresX = list(setX.iloc[:, 1:].values)
        self.modelX = Ridge(normalize=True).fit(self.featuresX, self.labelsX)

        print "requesting y training set"
        # setY = pd.read_csv("http://localhost:8080/csv/dimension/y/version/floor10-test")
        setY = pd.read_csv("csv/training-set-y-version-floor10-test.csv")
        self.labelsY = list(setY['y'].values)
        self.featuresY = list(setY.iloc[:, 1:].values)
        self.modelY = Ridge(normalize=True).fit(self.featuresY, self.labelsY)

    def predictX(self, features):
        return self.modelX.predict(features)

    def predictY(self, features):
        return self.modelY.predict(features)

    def trainX(self, label, features):
        self.labelsX.append(label)
        self.featuresX.append(features)
        self.modelX = Ridge(normalize=True).fit(self.featuresX, self.labelsX)

    def trainY(self, label, features):
        self.labelsY.append(label)
        self.featuresY.append(features)
        self.modelY = Ridge(normalize=True).fit(self.featuresY, self.labelsY)

    def reset(self):
        self.__init__()
Example #8
def ridge_regression(data, predictors, alpha, models_to_plot={},test=None):
    #Fit the model
    ridgereg = Ridge(alpha=alpha,normalize=True)
    ridgereg.fit(data[predictors],data['y'])
    y_pred = ridgereg.predict(data[predictors])
    n = len(data['x'])
    #Check if a plot is to be made for the entered alpha
    if alpha in models_to_plot:
        #plt.subplot(models_to_plot[power])
        #plt.tight_layout()
        plt.figure()
        plt.plot(data['x'],data['y'],'.',markersize=15)
        plt.plot(data['x'],y_pred,color='red',linewidth=2)
        #plt.title('power: %d'%power)
        plt.savefig('./fig/ridge_alpha' + models_to_plot[alpha] + '.png',dpi=dpi)
    
    #Return the result in pre-defined format
    rss = (1.0/n)*sum((y_pred-data['y'])**2)
    ret = [rss]
    ret.extend([ridgereg.intercept_])
    ret.extend(ridgereg.coef_)
    if test is not None:
        nt = len(test['x'])
        y_test = ridgereg.predict(test[predictors])
        test_rss = (1.0/nt)*sum((y_test - test['y'])**2)
        ret.extend([test_rss])
    return ret
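
A made-up call for the function above: it only needs a frame holding 'x', 'y', and the listed predictor columns. Note the function uses Ridge(normalize=True), which requires scikit-learn older than 1.2.

import numpy as np
import pandas as pd

x = np.linspace(0, 2 * np.pi, 60)
data = pd.DataFrame({'x': x, 'y': np.sin(x) + np.random.normal(0, 0.15, 60)})
for p in range(2, 6):  # add polynomial powers as extra predictors
    data['x_%d' % p] = data['x'] ** p
predictors = ['x'] + ['x_%d' % p for p in range(2, 6)]

result = ridge_regression(data, predictors, alpha=1e-3)  # [rss, intercept, coef_1, ...]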
Example #9
def reg_lin():
    Xtrain, Xtest, Ytrain, Ytest = skcv.train_test_split(X, Y, train_size=.8)
    
    regressor = Ridge(alpha=1)  
    regressor.fit(Xtrain,np.log(Ytrain))
    
    Ypred = np.array(regressor.predict(Xtest),dtype=float) 
    
    print logscore( Ytest, np.exp(Ypred ) )
        
    validate = load_data('validate')
    validate = transformFeatures(validate)
    np.savetxt('results/validate.txt', np.exp(np.array( regressor.predict(validate), dtype=np.dtype('d'))))
Example #10
class ThreeRidgeEstimator(BaseEstimator):
    '''
    Three Ridge estimator for each class of variable
    '''
    def __init__(self, alpha1=1.0,
                 alpha2=1.0, alpha3=1.0):
        '''
        Initializes a new instance of this estimator

        alpha1:
            alpha parameter for the first ridge

        alpha2:
            alpha parameter for the second ridge

        alpha3:
            alpha parameter for the third ridge
        '''
        self.alpha1 = alpha1
        self.alpha2 = alpha2
        self.alpha3 = alpha3
        self.models = []

    def fit(self, X, Y):
        self.model1 = Ridge(alpha=self.alpha1)
        self.model1.fit(X, Y[:, 0:5])

        self.model2 = Ridge(alpha=self.alpha2)
        self.model2.fit(X, Y[:, 5:9])

        # fit the third block of targets ("k")
        self.model3 = Ridge(alpha=self.alpha3)
        self.model3.fit(X, Y[:, 9:])

    def predict(self, X):
        pred_s = self.model1.predict(X)
        pred_w = self.model2.predict(X)
        pred_k = self.model3.predict(X)

        pred_s_sum = pred_s.sum(axis=1)[:, np.newaxis]
        pred_s /= pred_s_sum

        pred_w_sum = pred_w.sum(axis=1)[:, np.newaxis]
        pred_w /= pred_w_sum

        predictions = np.hstack((pred_s, pred_w, pred_k))

        return predictions
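
A small synthetic smoke test for ThreeRidgeEstimator (shapes follow the column slices hard-coded in fit: targets 0:5, 5:9, and 9 onward; the data is random and purely illustrative):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 7)
Y = np.abs(rng.randn(100, 12))  # 5 + 4 + 3 target columns, kept positive

est = ThreeRidgeEstimator(alpha1=1.0, alpha2=0.5, alpha3=2.0)
est.fit(X, Y)
pred = est.predict(X)
# the first two blocks are renormalized to sum to 1 per row
print(pred[:, 0:5].sum(axis=1)[:3], pred[:, 5:9].sum(axis=1)[:3])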
Example #11
class MyRegression(object):
    def __init__(self, x_data, y_data):
        self.x_data = np.array(x_data)
        self.y_data = np.array(y_data)
        self.v_data = [x[0] for x in x_data]
        print "old accuracy", get_accuracy(self.v_data, self.y_data)

    def ols_linear_reg(self):
        self.lr = LinearRegression()
        self.lr.fit(self.x_data, self.y_data)
        adjusted_result = self.lr.predict(self.x_data)
        print "lr params", self.lr.coef_, self.lr.intercept_
        print "lr accuracy", get_accuracy(adjusted_result, self.y_data)
        return map(int, list(adjusted_result))

    def bayes_ridge_reg(self):
        br = BayesianRidge()
        br.fit(self.x_data, self.y_data)
        adjusted_result = br.predict(self.x_data)
        print "bayes ridge params", br.coef_, br.intercept_
        print "bayes ridge accuracy", get_accuracy(adjusted_result, self.y_data)
        return map(int, list(adjusted_result))

    def linear_ridge_reg(self):
        self.rr = Ridge()
        self.rr.fit(self.x_data, self.y_data)
        adjusted_result = self.rr.predict(self.x_data)
        print "ridge params", self.rr.coef_, self.rr.intercept_
        print "ridge accuracy", get_accuracy(adjusted_result, self.y_data)
        return map(int, list(adjusted_result))
Example #12
def ridgereg(a):
    print("Doing ridge regression")
    clf = Ridge(alpha=a)
    clf.fit(base_X, base_Y)
    print ("Score = %f" % clf.score(base_X, base_Y))
    clf_pred = clf.predict(X_test)
    write_to_file("ridge.csv", clf_pred)
Example #13
def test_brr_like_sklearn():
    n = 10000
    d = 10
    sigma_sqr = 5
    X = np.random.randn(n, d)
    beta_true = np.random.random(d)
    y = np.dot(X, beta_true) + np.sqrt(sigma_sqr) * np.random.randn(n)
    X_tr = X[:n // 2, :]
    y_tr = y[:n // 2]
    X_ts = X[n // 2:, :]
    #  y_ts = y[n // 2:]

    # prediction with my own bayesian ridge
    lambda_reg = 1
    brr = BayesianRidgeRegression(lambda_reg,
                                  add_ones=True,
                                  normalize_lambda=False)
    brr.fit(X_tr, y_tr)
    y_ts_brr = brr.predict(X_ts)

    # let's compare to scikit-learn's ridge regression
    rr = Ridge(lambda_reg)
    rr.fit(X_tr, y_tr)
    y_ts_rr = rr.predict(X_ts)

    assert np.mean(np.abs(y_ts_brr - y_ts_rr)) < 0.001, \
        "Predictions are different from sklearn's ridge regression."

def training(X,Y,X_test, pca='kpca', regressor='ridge', dim=50):
    # X and Y are numpy arrays
    print 'Input data and label shape: ', X.shape, Y.shape

    if pca == 'nopca': return simpleTraining(X, Y, X_test, regressor)

    model, P = getProjectionMatrixPCA(Y, dim) if pca=='pca' else getProjectionMatrixKPCA(dim)
    Y_train = np.dot(Y, P) if pca=='kpca' else np.dot(Y,P.transpose())


    regressors = []
    for i in range(dim):
        print 'at regressor number: ', i
        reg = Ridge() if regressor=='ridge' else SVR()
        y = [x[i] for x in Y_train]
        reg.fit(X, y)
        regressors.append(reg)

    Z_pred = []
    for reg in regressors:
        Z_pred.append(reg.predict(X_test))
    print 'prediction shapes:' , len(Z_pred), len(Z_pred[0])
    Z_pred = np.array(Z_pred)
    Y_pred = np.dot(P, Z_pred).transpose() if pca=='kpca' else np.dot(Z_pred.transpose(), P)
    return model, regressors, Y_pred
Example #15
def ridge_regression(data,target,alphas):
    plt.figure()
    mean_rmses=[]
    kf=KFold(len(target),10,True,None)
    for alpha0 in alphas:
        rmses=[]
        clf=Ridge(alpha=alpha0,normalize=True,solver='svd')
        for train_index, test_index in kf:
            data_train,data_test=data[train_index],data[test_index]
            target_train,target_test=target[train_index],target[test_index]
            clf.fit(data_train,target_train)
            rmse=sqrt(np.mean((clf.predict(data_test)-target_test)**2))
            rmses.append(rmse)
            
        mean_rmses.append(np.mean(rmses))
        x0=np.arange(1,11)
        plt.plot(x0,rmses,label='alpha='+str(alpha0),marker='o')
        
    lr = linear_model.LinearRegression(normalize = True)
    rmses = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        rmse = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
    mean_rmses.append(np.mean(rmses))
    x0=np.arange(1,11)
    plt.plot(x0,rmses,label='linear',marker='*')
    
    plt.title("RMSE comparison between different alpha values of Ridge regularization")
    plt.legend()
    plt.show()
#    print(mean_rmses)
    return mean_rmses
Example #16
class LogisticRegressionSeparator(BaseEstimator):

    def get_params(self, deep=True):
        return {}

    def fit(self, X, y):
        # lets predict which users will spend anything later
        classes = y - X[:, 0]
        classes = np.where(classes > 0.1, 1, 0)

        self.classifier = LogisticRegression(
                class_weight='balanced')

        self.classifier.fit(X, classes)
        results = self.classifier.predict(X)
        results = results == 1

        self.estimator = Ridge(alpha=0.05)
        self.estimator.fit(X[results], y[results])

    def predict(self, X):
        y = X[:,0].reshape(X.shape[0])
        labels = (self.classifier.predict(X) == 1)
        y[labels] = self.estimator.predict(X[labels])
        return y
Example #17
def regression_NumMosquitos(Xtr, ytr, Xte):
    from sklearn.linear_model import Ridge, RidgeCV
    #model_nm = RidgeCV(alphas=range(200, 401, 10), cv=5)
    model_nm = Ridge(alpha = 340)
    model_nm = model_nm.fit(Xtr, ytr)
    results_nm = model_nm.predict(Xte)
    return results_nm

def ridgeRegression(X,y):

    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Ridge Regression")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ridgeRegression = Ridge(alpha=1e-11,solver="cholesky")
    ridgeRegression.fit(scaled_Xp,y)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0,2,0.01)
    dummyX = dummyX.reshape((dummyX.shape[0],1))
    dummyXp = polynomialFeatures.fit_transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = ridgeRegression.predict(scaled_dummyXp)

    outputFILE = 'plot-ridgeRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h = 6.0, w = 10.0)
    ax.axis([0,2,0,15])
    ax.scatter(X,y,color="black",s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi = 600)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
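
A minimal invocation sketch for the function above, with synthetic data kept inside the hard-coded [0, 2] x-range and [0, 15] y-range of the plot (assumes the same imports the snippet relies on: numpy, matplotlib, PolynomialFeatures, StandardScaler, Ridge):

import numpy as np

rng = np.random.RandomState(1)
X = 2 * rng.rand(100, 1)  # inputs in [0, 2], matching the dummy grid
y = (1 + 2 * X + 1.5 * X ** 2 + 0.5 * rng.randn(100, 1)).ravel()
ridgeRegression(X, y)  # writes plot-ridgeRegression.png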
Example #19
    def _make_forecast(self, model, name, alpha=None, l1_ratio=None):
        """
        Output: DataFrame

        Train on the holdout set and make predictions for the next week
        """
        X_hold = self.hold_set[self.hold_set.columns[1:]]
        if 'lyft' in self.filename:
            y_hold = self.hold_set['avg_est_price']
        else:
            y_hold = self.hold_set['avg_price_est']
        if name.split("_")[0] == "ridgecv":
            model = Ridge(alpha=alpha)
        elif name.split("_")[0] == "lassocv":
            model = Lasso(alpha=alpha)
        elif name.split("_")[0] == "elasticnetcv":
            model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        model.fit(X_hold, y_hold)
        self.X_forecast = X_hold.copy()
        # assumes weekofyear is increasing
        self.X_forecast['weekofyear'] = self.X_forecast['weekofyear'].apply(lambda x: x+1)
        self.X_forecast.index = self.X_forecast.index + pd.Timedelta(days=7)
        self.y_forecast = model.predict(self.X_forecast)
        self.y_forecast = pd.DataFrame(self.y_forecast, index=self.X_forecast.index, columns=['y_forecast'])
        self.y_forecast = pd.concat([self.X_forecast, self.y_forecast], axis=1)
        saved_filename = "rideshare_app/data/{}_forecast.csv".format(name)
        self.y_forecast.to_csv(saved_filename)
        print "saved prediction values to {}".format(saved_filename)

def ridge_regression(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
   :param train_x: train
   :param train_y: text
   :param pred_x: test set to predict
   :param review_id: takes in a review id
   :param v_curve: run the model for validation curve
   :param l_curve: run the model for learning curve
   :param get_model: run the model
   :return:the predicted values,learning curve, validation curve
   """
    lin = Ridge(alpha=0.5)
    if get_model:
        print "Fitting Ridge..."
        lin.fit(train_x, np.log(train_y+1))
        gbr_pred = np.exp(lin.predict(pred_x))- 1
        for i in range(len(gbr_pred)):
            if gbr_pred[i] < 0:
                gbr_pred[i] = 0
        Votes = gbr_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_lin= np.concatenate((Id,Votes),axis=1)
        np.savetxt("submission_ridge.csv", submission_lin,header="Id,Votes", delimiter=',',fmt="%s, %0.2f", comments='')
    if v_curve:
        print "Working on Validation Curves"
        plot_validation_curve(Ridge(), "Validation Curve for Ridge Regression", train_x, np.log(train_y+1.0),
                              param_name="alpha", param_range=[0.1,0.2,0.5,1,10])
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(Ridge(), "Learning Curve for Linear Regression", train_x, np.log(train_y+1.0))
Example #21
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0  # m=0 disables the xgboost ensemble loop below; only the Ridge fit runs
         
        for j in range(m):
            clf=xgb_classifier(eta=0.05,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=500,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred*(1.0/(j+1))

            print j,llfun(y_test_cv,yqq)

        #y_pred/=m;
        clf=Ridge()#RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv))
        y_pred=clf.predict(X_test_cv)
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        break

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred,yreal,idx#np.mean(xx)
Example #22
def knn_twice(k):
	knn1 = neighbors.KNeighborsRegressor(n_neighbors=k)
	knn1.fit(trainf,trainlab)
	print 'here'
	tim = time.time();

	n = len(train)/1000
	pred1 = []
	for i in range(0,n):
		pred1.extend(knn1.predict(trainf[(i*1000):((i+1)*(1000))]))
		print(i)
	pred1.extend(knn1.predict(trainf[67000:67946]))
	print "time: " + str(time.time() - tim)
	#knn = neighbors.KNeighborsRegressor(n_neighbors=k)
	#knn.fit(pred1,trainlab)
	ridge = Ridge(alpha=1.0)
	ridge.fit(pred1, trainlab)

	n = 10
	pred2 = []
	for i in range(0,n):
		pred2.extend(knn1.predict(testf[(i*1000):((i+1)*(1000))].toarray()))
		print(i)	

	n = 10
	pred = []
	for i in range(0,n):
		pred.extend(ridge.predict(pred2[(i*1000):((i+1)*(1000))]))
		print(i)	

	#RMSE:
	testlab = np.array(test.ix[:,4:])
	err = format(np.sqrt(np.sum(np.array(np.array(pred-testlab)**2)/ (testf.shape[0]*24.0))))
	return err

def cross_valid(X,Y,n_fold):
	clf = Ridge(alpha=1.0)
	total_mean_square = 0
	total_coef = 0
	Y_np = np.array(Y)
	n_samples, n_features = len(X), len(X[0])
	kf_Y = cross_validation.KFold(n_samples, n_fold)
	index = []
	preds = []
	truths = []
	for train_index, test_index in kf_Y:
		X_train, X_test = X[train_index], X[test_index]
		y_train, y_test = Y_np[train_index], Y_np[test_index]
		

		clf.fit(X_train,y_train)
		y_pred = clf.predict(X_test)
		index += test_index.tolist()
		preds += map(lambda x: 1 if x > 0.5 else 0 ,y_pred.tolist())
		truths += y_test.tolist()
		#print "predict:",map(lambda x: 1 if x > 0.5 else 0,y_pred)
		#print "original:",y_test

		total_mean_square += mean_squared_error(y_test,y_pred) 
		total_coef += clf.coef_
	
		#print 'Coefficient of the prediction (pearsonr): ' , pearsonr(y_pred,y_test) 
	print 'All Coefficient of the prediction (pearsonr): ' , pearsonr(truths,preds) 
	print 'Average mean squared error is: ' , total_mean_square / n_fold

	diff_count = sum([abs(truth - pred) for truth, pred in zip(truths, preds)])
	acc =  100-1.* diff_count/len(truths)*100
	print 'prediction accuracy is %f'%(acc)
	return [total_coef, index , preds]
Example #24
def bowFitAndPrediction(predictData, textSeries, outcome,typeModel='binary'):
    print "Bag of words for %s" % (textSeries.name)
    
    if typeModel == 'continuous':
        bowModel = Ridge(alpha = 0.001)
    else:
        bowModel = LogisticRegression(penalty='l2',dual=False,tol=0.0001,fit_intercept=True, C=1, intercept_scaling=1, class_weight=None, random_state=423) 
    
    
    vectorizer = getFeatures(textSeries)
    
    X_train = vectorizer.transform(predictData)
        
    #Outcomes
    Y_train = outcome
    
    #Logistic regression, not sure if best
    bowModel.fit(X_train,Y_train)
    
    #Comment out later, fitting on CV data
    
    if typeModel == 'continuous':
        predict = bowModel.predict(X_train)
        yhat = predict
    else:
        predict = bowModel.predict_proba(X_train)
        yhat = predict[:,1]
    
    
    return (yhat, vectorizer, bowModel)
Example #25
def impute_age():
    X, P = gfa.platform_expression("GPL96")
    model = impute.KNNImputer()
    Xi = model.fit_transform(X, axis=1)

    age = array(P["age"].tolist())
    Xm = Xi.as_matrix()
    ix = array((age >= 10) & (age <= 120)).nonzero()[0]
    np.random.shuffle(ix)
    Xm = Xm[ix, :]
    age = age[ix]

    n_train = 2000
    n_test = 500
    # clf = SVR(C=1e-5, epsilon=1)
    # clf = LinearRegression()
    clf = Ridge()
    # clf = SimpleRegressor()
    # clf = Lasso()
    clf.fit(Xm[:n_train, :], age[:n_train])
    y = age[n_train : (n_train + n_test)]
    y_hat = clf.predict(Xm[n_train : (n_train + n_test)])
    dy = y - y_hat

    bias_tr = y_hat.mean() - age.mean()
    print("\nBias (vs train):\t\t", bias_tr)
    print("Bias (vs test):\t\t\t", dy.mean())
    print("Mean error:\t\t\t", fabs(dy).mean())
    print("Mean error (bias corrected):\t", fabs(dy - bias_tr).mean())
    print("MSE:\t\t\t\t", np.power(dy, 2).mean())
Example #26
    def RidgeRegression(self, filename, outputFile):
        pheno, geno = self.inputParse(filename)
        for row in geno:
            if len(row) % 2 != 0:
                return "Rows are not even."
        maxGeno = max(geno)
        allGeno = list(set(maxGeno))
        encoder = [i for i in range(len(allGeno))]
        lengthGeno = len(geno)
        length = len(geno)
        lenInnerGeno = len(geno[0])
        genoMake = [0 for x in range(len(allGeno))]
        dictionary = dict(zip(allGeno, encoder))
        for i in range(length):
            for x in range(lenInnerGeno):
                geno[i][x] = dictionary[geno[i][x]]
        phenoNaN = []
        for i in range(len(pheno)):
            if pheno[i] == 'NaN':
                phenoNaN.append(i)
        phenoNaN.reverse()
        for i in phenoNaN:
            del pheno[i]
        genoMiss = []
        for i in range(len(geno)):
            if i not in phenoNaN:
                genoMiss.append(geno[i])
        pheno = [float(i) for i in pheno]
        alpha = self.alphaOptimization(genoMiss, pheno)
        clf = Ridge(alpha=alpha)
        clf.fit(genoMiss, pheno)
        predicted = clf.predict(geno)
        predicted = np.transpose(predicted)
        np.savetxt(outputFile, np.transpose(predicted))
Example #27
def ridge_regressor(df):
    """
    INPUT: Pandas dataframe
    OUTPUT: R^2 and Mean Absolute Error performance metrics, feature coefficients
    """
    y = df.pop("price").values
    X = df.values
    feature_names = df.columns
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)

    clf = Ridge(alpha=1.0)
    clf.fit(xtrain, ytrain)

    score = clf.score(xtest, ytest)
    feat_imps = clf.coef_
    ypredict = clf.predict(xtest)
    mae = np.mean(np.absolute(ytest - ypredict))
    mae_percent = np.mean(np.absolute(ytest - ypredict) / ytest)
    return (
        "R^2 is ",
        score,
        "RMSE is ",
        rmse,
        "MAE percent is ",
        mae_percent,
        "Feature coefficients are ",
        zip(feature_names, feat_imps),
    )
Example #28
def _check_ridge_model(featureses, labels):
    """Plot ridge regression predictions"""
    for tfidf_count in FEATURES_SIZES:
        test_points = []
        for i in range(16):
            tmp = [i, 100]
            tmptmp = [0] * tfidf_count
            if tmptmp:
                tmp.extend(tmptmp)
            test_points.append(tmp)
        test_points = np.array(test_points)
        limit = tfidf_count + 2
        model = Ridge()
        model.fit(featureses[:, :limit], labels)
        predictions = model.predict(test_points)
        plt.plot(
            predictions,
            label=str(tfidf_count),
            linestyle=next(LINECYCLER),
            linewidth=3)
        # plt.text(test_points[-1, 0], predictions[-1], str(tfidf_count))
    plt.legend()
    plt.xlabel('Document order')
    plt.ylabel('Time (seconds)')
    plt.savefig('ridge_predictions.pdf')
Example #29
def traverse_movies_ridge():
	LBMAP = getLBMap()
	DMAP = createEmpty()

	P_ERRORS, ERRORS = [], []

	training_data, training_response = [], []

	for i in range(len(data)):

		movie = data[i]
		m_rev = movie['revenue']

		myvector = vectorizeMovie(movie, LBMAP, DMAP)

		if i > 100:
			model = Ridge(alpha = .5)
			model.fit(training_data, training_response)
			raw = math.fabs(model.predict(myvector) - m_rev)
			ERRORS.append(raw)
			#P_ERRORS.append(round(raw/m_rev, 4))
		
		training_data.append(myvector)
		training_response.append(m_rev)

		DMAP = update(movie, DMAP)

	#print 'all', avg_float_list(P_ERRORS)
	print 'all', avg_float_list(ERRORS)

def reg_skl_ridge(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    ridge = Ridge(alpha=param["alpha"], normalize=True)
    ridge.fit(X_tr, y_reg_tr)
    pred = ridge.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
    predictions = results.predict(X_test)
    MAPE_linear.append(1/a * sum(abs((y_test - predictions)/y_test)))
    
print("Linear regression: " + str(np.mean(MAPE_linear)))
#higher cross validation MAPE even with high R square might be a case of overfitting

#cross validation - ridge regression 
MAPE_ridge = [] #mean absolute percentage error
cv = KFold(n_splits = 10, shuffle=True)
for train_index, test_index in cv.split(X, y):
    a = len(test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    fit_ridge = Ridge(alpha=30)
    fit_ridge.fit(X_train, y_train)
    predictions = fit_ridge.predict(X_test)
    MAPE_ridge.append(1/a * sum(abs((y_test - predictions)/y_test)))

print("Ridge regression: " + str(np.mean(MAPE_ridge)))

#Cross Validation - spline regression
MAPE_spline = [] #mean absolute percentage error
cv = KFold(n_splits = 10, shuffle=True)
for train_index, test_index in cv.split(X, y):
    n = len(test_index)
    seconds_train, seconds_test = seconds.iloc[train_index], seconds.iloc[test_index]
    followers_train, followers_test = followers.iloc[train_index], followers.iloc[test_index]
    dummies_train, dummies_test = dummies.iloc[train_index], dummies.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    X_train = spline_transform(seconds_train, followers_train, dummies_train)
    results = sm.OLS(y_train, X_train).fit()
Example #32
ax[1].set_xlabel("target")
ax[1].set_ylabel("predict")
ax[1].scatter(t, th3, c='g', s=3)
plt.savefig("Linear_regression_advance.jpg")

# Tikhonov (quadratic) Regularizer
gamma = 0.2
wR = np.linalg.inv(X2.T @ X2 + gamma * np.identity(NumFeatures + 1)) @ X2.T @ t

l1 = Lasso(alpha=0.2)
l1.fit(X, t)
th_lasso = l1.predict(X)
print(' L1 Reg:{:.3f}'.format(error(t, th_lasso)))
l2 = Ridge(alpha=0.2)
l2.fit(X, t)
th_ridge = l2.predict(X)
print(' L2 Reg:{:.3f}'.format(error(t, th_ridge)))

fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(16, 16))
ax[0].bar(np.arange(len(wR)), wR)  # Tikhonov (quadratic) Regularizer
ax[1].bar(np.arange(len(l2.coef_)), l2.coef_)  # Ridge
ax[0].set_ylim(-900, 900)
ax[1].set_ylim(-900, 900)
ax[0].set_title("Tikhonov (quadratic) Regularizer")
ax[1].set_title("Ridge regularizer")
plt.savefig("compare L2 regularizer.jpg")

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))
ax[0].bar(np.arange(len(w)),
          w)  # Pseudo-increase solution to linear regression
ax[1].bar(np.arange(len(l1.coef_)), l1.coef_)  # Lasso
Example #33
#lin_reg = LinearRegression()
#plot_learning_curve(lin_reg, X, y)
from sklearn.pipeline import Pipeline
polynomial_regression = Pipeline([
    ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
    ("sgd_reg", LinearRegression()),
])
#plot_learning_curve(polynomial_regression, X, y)

# 1.5 Regularization: Ridge (l2)
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")  # Ridge uses a squared (l2) penalty
ridge_reg.fit(X, y)
print(ridge_reg.predict([[1.5]]))
sgd_reg = SGDRegressor(penalty="l2")  # penalty="l2" gives Ridge-style regularization
sgd_reg.fit(X, y.ravel())  # SGDRegressor expects a 1-D target: use .ravel()

# Lasso regularization (l1)
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
sgd_reg = SGDRegressor(penalty="l1")  # penalty="l1" gives Lasso-style (first-order) regularization
lasso_reg.fit(X, y)
lasso_reg.predict([[1.5]])

# Elastic Net (l2+l1)
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X, y)
elastic_net.predict([[1.5]])
Example #34
# In[17]:

from sklearn.linear_model import Ridge

model = Ridge(alpha=1, normalize=True)

model.fit(x_train, y_train)

# In[18]:

print('Training_score : ', model.score(x_train, y_train))

# In[19]:

y_pred = model.predict(x_test)

# In[ ]:

df_pred_actual = pd.DataFrame({'predicted': y_pred, 'actual': y_test})

# In[ ]:

df_pred_actual.head(50)

# In[ ]:

df_pred_actual.to_csv('Results.csv')

# In[22]:
Example #35
          validation_data=(X_test, y_test),
          batch_size=128,
          epochs=100,
          verbose=1)

y_preds = model.predict(X_test)
print(r2_score(y_test, y_preds))

# In[50]:

from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import Ridge
regressor = Ridge(alpha=50, max_iter=10000)
regressor.fit(X_train, y_train)
y_preds = regressor.predict(X_test)
print(r2_score(y_test, y_preds))
print(sqrt(mean_squared_error(y_test, y_preds)))

# In[37]:

# model = build_model(X_train.shape[1])
# model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=16, epochs=100, verbose=1)
#
# y_preds = model.predict(X_test)
# print(r2_score(y_test, y_preds))

# In[38]:

sqrt(mean_squared_error(y_test, y_preds))
Example #36
def NARMA_Test(test_length=800,
               train_length=800,
               num_loops=1,
               a=0,
               plot=True,
               N=400,
               eta=0.4,
               gamma=0.05,
               phi=np.pi / 6,
               tau=400,
               bits=8,
               preload=False):
    """
    Args:
        test_length: length of testing data
        train_length: length of training data
        num_loops: number of delay loops in reservoir
        a: ridge regression parameter
        N: number of virtual nodes
        plot: display calculated time series
        gamma: input gain
        eta: oscillation strength
        phi: phase of MZN
        tau: loop delay length
        bits: bit precision
        preload: preload mask and time-series data

    Returns:
        NRMSE: Normalized Root Mean Square Error
    """

    #Import u and m
    if preload:
        file1 = open("data/Input_sequence.txt", "r")
        file2 = open("data/mask_2.txt", "r")
        contents = file1.readlines()
        contents2 = file2.readlines()
        u = []
        m = []
        for i in range(1000):
            u.append(float(contents[i][0:contents[i].find("\t")]))
            if i < 400:
                m.append(float(contents2[i][0:contents2[i].find("\n")]))
        file1.close()
        file2.close()
        u = np.array(u)
        m = np.array(m)
    #Randomly initialize u and m
    else:
        u = np.random.rand(train_length + test_length) / 2.
        m = np.array(
            [random.choice([-0.1, 0.1]) for i in range(N // num_loops)])

    #Calculate NARMA10 target
    target = NARMA_Generator(len(u), u)

    #Instantiate Reservoir, feed in training and verification datasets
    r1 = DelayReservoir(N=N // num_loops,
                        eta=eta,
                        gamma=gamma,
                        theta=0.2,
                        loops=num_loops,
                        phi=phi)
    x = r1.calculateMZNBit(u[:train_length], m, bits)
    #x_ideal = r1.calculateMZN(u[:train_length],m)
    x_test = r1.calculateMZNBit(u[train_length:], m, bits)
    #x_test_ideal = r1.calculateMZN(u[train_length:],m)

    #Train using Ridge Regression
    #clf = RidgeCV(alphas = a,fit_intercept = True)
    clf = Ridge(alpha=a)
    clf.fit(x, target[:train_length])
    y_test = clf.predict(x_test)
    y_input = clf.predict(x)

    #Calculate NRMSE
    NRMSE = np.sqrt(np.mean(np.square(y_test[50:]-target[train_length+50:]))/\
            np.var(target[train_length+50:]))

    NRMSEi = np.sqrt(np.mean(np.square(y_input-target[:train_length]))/\
            np.var(target[:train_length]))

    #Write to File
    '''
    x_total = np.concatenate((x,x_test))
    x_total = x_total.flatten(order='C')
    file1 = open("data/64_bit_test_x.txt","w+")
    file2 = open("data/64_bit_test_y.txt","w+")
    for i in range(2*320000):
        file1.write("%f"%x_total[i]+"\n")
        if(i < 1600):
            file2.write("%f"%target[i]+"\n")
    file1.close()
    '''

    #Plot predicted Time Series
    if (plot == True):
        #fig, (ax1,ax2) = plt.subplots(2,1)
        #ax1.plot(x.flatten()[5000:])
        #ax2.plot(x_ideal.flatten()[5000:])
        #plt.plot(x.flatten()[:1200])
        plt.plot(y_test[50:], label='Prediction')
        plt.plot(target[train_length + 50:], label='Target')
        plt.title('NRMSE = %f' % NRMSE)
        plt.legend()
        plt.show()

    return NRMSE
Example #37
# Replace missing values in the LocationNormalized and ContractTime columns with the special string 'nan'.
train['LocationNormalized'].fillna('nan', inplace=True)
train['ContractTime'].fillna('nan', inplace=True)

# Apply DictVectorizer to one-hot encode the LocationNormalized and ContractTime features.
enc = DictVectorizer()
X_train_cat = enc.fit_transform(train[['LocationNormalized',
                                       'ContractTime']].to_dict('records'))

# Combine all the resulting features into a single object-feature matrix. Note that the text and
# categorical matrices are sparse, so their columns must be stacked with
# scipy.sparse.hstack.
X_train = hstack([X_train_text, X_train_cat])

# 3. Train a ridge regression with alpha=1. The target variable is stored in the SalaryNormalized column.

y_train = train['SalaryNormalized']
model = Ridge(alpha=1)
model.fit(X_train, y_train)

# 4. Build predictions for the two examples from salary-test-mini.csv. The resulting predicted
# values are the answer to the task; report them separated by a space.

test = pandas.read_csv('../source/salary-test-mini.csv')
X_test_text = vec.transform(text_transform(test['FullDescription']))
X_test_cat = enc.transform(test[['LocationNormalized',
                                 'ContractTime']].to_dict('records'))
X_test = hstack([X_test_text, X_test_cat])

y_test = model.predict(X_test)
print(1, '{:0.2f} {:0.2f}'.format(y_test[0], y_test[1]))
Example #38
    for power in range(2,degree+1):
        expanded[:,power-1]=data[:,0]**power
    return expanded

data = expand(data, 10)
train, temp = random_split(data, len(data)/2)
valid, test = random_split(temp, len(temp)/2)

lambs = np.linspace(0.01,0.2)
bestYs = []
result = []
best_err = 10000000 # very large number
for lamb in lambs:
    solver = Ridge(alpha = lamb, solver='cholesky',tol=0.00001)
    solver.fit(train[:,:-1],train[:,-1])
    ys = solver.predict(valid[:,:-1])
    valid_err = np.mean((ys-valid[:,-1])**2)
    result.append([lamb, valid_err])
    if valid_err < best_err:
        # keep the best so far
        best_err = valid_err
        bestYs = ys

result = np.array(result)

plt.plot(result[:,0], result[:,1], 'o')
plt.show()
    
    
'''
means = np.mean(Xs, 0)
stdevs = np.std(Xs, 0)
Example #39
    day_to_predict = datetime.date(year=year, month=month, day=day)
    precipitation_intensity = row["Precipitation Intensity"]
    precipitation_probability = row["Precipitation Probability"]
    dew_point = row["Dew Point"]
    highest_temp = row["Highest Temp"]
    lowest_temp = row["Lowest Temp"]
    humidity = row['Humidity']
    uv_index = row['UV Index']

    prediction_values = numpy.array([[year, month, day,
                                      precipitation_intensity,
                                      precipitation_probability,
                                      dew_point, highest_temp,
                                      lowest_temp, humidity, uv_index]])

    prediction = ridge.predict(prediction_values)
    write_string = str(prediction).strip("[]")
    prediction_list.append(prediction)

    #Predictions are outputted with brackets around the number which is a nuisance when
    # trying to graph the data in excel so I remove them before writing to the csv
    filtered_data.loc[index, "Predicted Generation [kWh]"] = str(prediction).strip("[]")

    filtered_data.loc[index, "Date"] = day_to_predict
    write_string = write_string.strip("[]")

# applies the filter to our graphs to remove the noise
filtered_predictions = filter(a, b, prediction_list, axis=0)
filtered_actual = filter(a, b, data["Generation [kWh]"], axis=0)
filtered_data["Filtered Predictions"] = filtered_predictions
filtered_data["Filtered Actual"] = filtered_actual
Example #40
for bool, feature in zip(mask, df.columns[1:].tolist()):
    if bool:
        new_features.append(feature)

#print(new_features)
features.value = new_features
stats.text = "Top 5 features according to Select K Best (Chi2) : " + str(new_features)
 '''
#print(new_features)


x_train_original,x_test_original,y_train_original,y_test_original=train_test_split(df1,y,test_size=0.25)

clf = Ridge()
clf.fit(x_train_original,y_train_original)
predictions=clf.predict(x_test_original)
scores = cross_val_score(clf,df1,y,cv=5,scoring='neg_mean_squared_error')

stats2.text += "Mean Squared Error: %.2f" % mean_squared_error(y_test_original, predictions) + '</br>'
stats2.text += " Variance score: %.2f" % r2_score(y_test_original, predictions) + '</br>'
stats2.text += " Cross Validation score: %.2f " % scores.mean()


''' p1 = figure(plot_height=350,title="PR Curve")
p1.x_range = Range1d(0,1)
p1.y_range = Range1d(0,1)
p1.line([0],[0],name ="line2")

tab1 = Panel(child=p1, title="PR Curve")
tabs = Tabs(tabs=[ tab1 ])
        '''
Example #41
# print(np.shape(coefs))
# ax = plt.gca()
# ax.plot(grid, coefs)
# ax.set_xscale('log')
# plt.axis('tight')
# plt.xlabel('alpha')
# plt.ylabel('weights')
# plt.show()

# Split data into 50/50 train/test
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

# Fit a ridge regression model with lambda 4
ridge4 = Ridge(alpha = 4, normalize = True)
ridge4.fit(X_train, y_train)                                # Fit a ridge regression on the training data
pred = ridge4.predict(X_test)                               # Use this model to predict the test data
# print(pd.Series(ridge4.coef_, index = X.columns))           # Print coefficients
print("MSE alpha 4: ", round(mean_squared_error(y_test, pred),2))    # Calculate the test MSE

# Fit a ridge regression model with lambda 10^10
ridge1010 = Ridge(alpha = 10**10, normalize = True)
ridge1010.fit(X_train, y_train)
pred = ridge1010.predict(X_test)
# print(pd.Series(ridge1010.coef_, index = X.columns))
print("MSE alpha 10^10: ", round(mean_squared_error(y_test, pred),2))

# Fit a ridge regression model with lambda 0 (Which is equivalent to least squares)
ridge = Ridge(alpha = 0, normalize = True)
ridge.fit(X_train, y_train)
pred = ridge.predict(X_test)
# print(pd.Series(ridge.coef_, index = X.columns))
Example #42
class RidgeClass:
    """
    Name      : Ridge
    Attribute : None
    Method    : predict, predict_by_cv, save_model
    """

    def __init__(self):
        # algorithm name
        self._name = 'ridge'

        # base path
        self._f_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))

        # suppress warning messages
        warnings.filterwarnings('ignore')

        # load the raw data
        data = pd.read_csv(self._f_path + "/regression/resource/regression_sample.csv", sep=",", encoding="utf-8")

        # split into training and test ranges
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)

        # prepare the training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # prepare the test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])

        # declare the model
        self._model = Ridge(alpha=0.5)

        # train the model
        self._model.fit(self._x_train, self._y_train)

    # data preprocessing
    def preprocessing(self, data):
        # features
        x = []
        # labels
        y = []
        # window size (7 days)
        base_interval = 7
        # temperatures
        temps = list(data["temperature"])

        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])

            xa = []

            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # plain prediction
    def predict(self, save_img=False, show_chart=False):
        # predict
        y_pred = self._model.predict(self._x_test)

        # score
        score = r2_score(self._y_test, y_pred)

        # report
        if hasattr(self._model, 'coef_') and hasattr(self._model, 'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')

        print(f'Score = {score}')

        # optionally save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)

        # predictions & score
        return [list(y_pred), score]

    # CV prediction (cross validation)
    def predict_by_cv(self):
        # for regression, implement cross validation as the actual project requires
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # save or refresh the model
    def save_model(self, renew=False):
        # save the model
        if not renew:
            # first save
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(self._f_path + f'/model/{self._name}_rg.pkl',
                          self._f_path + f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')

    # save the regression chart
    def save_chart_image(self, data, show_chart):
        # figure size
        plt.figure(figsize=(15, 10), dpi=100)

        # actual values
        plt.plot(self._y_test, c='r')

        # predicted values
        plt.plot(data, c='b')

        # save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')

        # show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
Example #43
These two regressors have a regularization effect;
ElasticNet is a combination of LASSO and Ridge.
'''

# Ridge training (parameters set manually)
from sklearn.linear_model import Ridge
Ridge_regressor = Ridge(alpha=1.0)
Ridge_regressor.fit(X_train, Y)

# RidgeCV training (alpha chosen via CV)
from sklearn.linear_model import RidgeCV
Ridge_regressor = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
Ridge_regressor.fit(X_train, Y)

# Ridge prediction
y_pred = Ridge_regressor.predict(X_test)


# ElasticNet training (parameters set manually)
from sklearn.linear_model import ElasticNet
ElasticNet_regressor = ElasticNet(alpha=1.0, l1_ratio=0.5)
ElasticNet_regressor.fit(X_train, Y)

# ElasticNetCV training (parameters chosen via CV)
from sklearn.linear_model import ElasticNetCV
ElasticNet_regressor = ElasticNetCV(cv=5)
ElasticNet_regressor.fit(X_train, Y)

# ElasticNet prediction
y_pred = ElasticNet_regressor.predict(X_test)
Example #44
def NARMA_Test_Compare(test_length=200,
                       train_length=800,
                       num_loops=1,
                       a=0,
                       plot=True,
                       N=400,
                       eta=0.5,
                       gamma=1,
                       phi=np.pi / 4,
                       r=1):
    """
    Compare with pre-determined NARMA10 series

    Args:
        test_length: length of verification data
        train_length: length of training data
        num_loops: number of delay loops in reservoir
        a: list of ridge regression constants for hyperparameter tuning
        N: number of virtual nodes
        plot: display calculated time series
        gamma: input gain
        eta: oscillation strength
        phi: phase of MZN
        r: loop length ratio

    Returns:
        NRMSE: Normalized Root Mean Square Error
    """

    #Import u and m
    file1 = open("data/uin_and_target.txt", "r")
    file2 = open("data/Mask.txt", "r")
    contents = file1.readlines()
    contents2 = file2.readlines()
    u = []
    target = []
    m = []
    for i in range(1000):
        u.append(float(contents[i][0:contents[i].find("\t")]))
        target.append(float(contents[i][contents[i].find("\t"):]))
        if i < 400:
            m.append(float(contents2[i][0:contents2[i].find("\n")]))
    file1.close()
    file2.close()
    u = np.array(u)
    m = np.array(m)
    target = np.array(target)

    #Instantiate Reservoir, feed in training and verification datasets
    r1 = DelayReservoir(N=N // num_loops,
                        eta=eta,
                        gamma=gamma,
                        theta=0.2,
                        loops=num_loops,
                        phi=phi)
    x = r1.calculateMZN(u[:train_length], m)
    x_test = r1.calculateMZN(u[train_length:], m)

    x = []
    file3 = open("data/X_node.txt", "r")
    contents3 = file3.readlines()
    print(len(contents3))
    for i in range(400000):
        x.append(float(contents3[i][:contents3[i].find("\n")]))

    x = np.array(x)
    x = x.reshape((-1, 1))
    x = x.reshape((1000, 400))

    #Train using Ridge Regression
    clf = Ridge(alpha=a, fit_intercept=True)
    clf.fit(x[:800], target[:train_length])
    w = clf.coef_
    y_train = x @ w
    y_test = clf.predict(x[800:])

    #Write to file

    x_total = np.concatenate((x, x_test))
    x_total = x_total.flatten(order='C')
    file3 = open("data/y_train2.txt", "w+")
    for i in range(800):
        file3.write("%f" % y_train[i] + "\n")
    file3.close()

    #Calculate NRMSE
    NRMSE = np.sqrt(np.mean(np.square(y_test[50:]-target[train_length+50:]))/\
            np.var(target[train_length+50:]))

    #Plot predicted Time Series

    if (plot == True):
        plt.plot(y_test[50:], label='Prediction')
        plt.plot(target[train_length + 50:], label='Target')
        plt.title('NRMSE = %f' % NRMSE)
        plt.legend()
        plt.show()

    return NRMSE
Example #45
    BINS[i] = NUM * i

#Loading data
X, Y = Loader.data_load(CLASS, PARTS, PATH)

#Using colour histograms if needed
X = Loader.histogram(X, BINS, NUM)
#Reshape and polynomize if needed
if FL:
    X = Loader.preproc(X, normalize=False, reshape=True)[0]
    print('DO ', X.shape)
    X = Loader.polynom(X, GRADE)
    print('POSLE', X.shape)
    X, TRAIN_IND, TEST_IND = Loader.preproc(X, reshape=False)
#preprocessing and data split if needed
if not FL:
    X, TRAIN_IND, TEST_IND = Loader.preproc(X)
print(X.shape)
print(X[TRAIN_IND].shape, X[TEST_IND].shape)
eval_set = [X[TEST_IND], Y[TEST_IND]]

#The Ridge
model = Ridge(alpha=ALPH, max_iter=ITER)
model.fit(X[TRAIN_IND], Y[TRAIN_IND])

#prediction
#X_TEST = X[TEST_IND]
Y_P = model.predict(X[TEST_IND])
accuracy = score(Y[TEST_IND], Y_P.round())
print('TEST ACCURACY = ', accuracy * 100, '%')

y_pred_lr=model_lr.predict(x_test)

get_performance(y_pred_lr)

get_plot(y_pred_lr)

get_performance(y_pred_lr)

"""# Ridge Regression"""

model_ridge = Ridge()
model_ridge.fit(x_train, y_train)

#generate predictions
y_pred_ridge=model_ridge.predict(x_test)

get_performance(y_pred_ridge)

get_plot(y_pred_ridge)

"""# Gradient Boosting Trees"""

# Model #2 - Gradient Boosting Trees
model_gb = GradientBoostingRegressor()
model_gb.fit(x_train, y_train)

# Infer
y_pred_gb = model_gb.predict(x_test)

get_performance(y_pred_gb)

class ExpectedRankRegression(ObjectRanker, Learner):
    def __init__(self,
                 n_object_features,
                 alpha=0.0,
                 l1_ratio=0.5,
                 tol=1e-4,
                 normalize=True,
                 fit_intercept=True,
                 random_state=None,
                 **kwargs):
        """
            Create an expected rank regression model.

            This model normalizes the ranks to [0, 1] and treats them as regression target. For α = 0 we employ simple
            linear regression. For α > 0 the model becomes ridge regression (when l1_ratio = 0) or elastic net
            (when l1_ratio > 0).

            Parameters
            ----------
            n_object_features : int
                Number of features of the object space
            alpha : float, optional
                Regularization strength
            l1_ratio : float, optional
                Ratio between pure L2 (=0) or pure L1 (=1) regularization.
            tol : float, optional
                Optimization tolerance
            normalize : bool, optional
                If True, the regressors will be normalized before fitting.
            fit_intercept : bool, optional
                If True, the linear model will also fit an intercept.
            random_state : int, RandomState instance or None, optional
                Seed of the pseudorandom generator or a RandomState instance
            **kwargs
                Keyword arguments for the algorithms

            References
            ----------
            .. [1] Kamishima, T., Kazawa, H., & Akaho, S. (2005, November).
                   "Supervised ordering-an empirical survey.",
                   Fifth IEEE International Conference on Data Mining.
        """
        self.normalize = normalize
        self.n_object_features = n_object_features
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.tol = tol
        self.logger = logging.getLogger('ERR')
        self.fit_intercept = fit_intercept
        self.random_state = check_random_state(random_state)
        self.weights = None

    def fit(self, X, Y, **kwargs):
        self.logger.debug('Creating the Dataset')
        x_train, y_train = complete_linear_regression_dataset(X, Y)
        assert x_train.shape[1] == self.n_object_features
        self.logger.debug('Finished the Dataset')
        if self.alpha < 1e-3:
            self.model = LinearRegression(normalize=self.normalize,
                                          fit_intercept=self.fit_intercept)
            self.logger.info("LinearRegression")
        else:
            if self.l1_ratio >= 0.01:
                self.model = ElasticNet(alpha=self.alpha,
                                        l1_ratio=self.l1_ratio,
                                        normalize=self.normalize,
                                        tol=self.tol,
                                        fit_intercept=self.fit_intercept,
                                        random_state=self.random_state)
                self.logger.info("Elastic Net")
            else:
                self.model = Ridge(alpha=self.alpha,
                                   normalize=self.normalize,
                                   tol=self.tol,
                                   fit_intercept=self.fit_intercept,
                                   random_state=self.random_state)
                self.logger.info("Ridge")
        self.logger.debug('Finished Creating the model, now fitting started')
        self.model.fit(x_train, y_train)
        self.weights = self.model.coef_.flatten()
        if self.fit_intercept:
            self.weights = np.append(self.weights, self.model.intercept_)
        self.logger.debug('Fitting Complete')

    def _predict_scores_fixed(self, X, **kwargs):
        n_instances, n_objects, n_features = X.shape
        self.logger.info(
            "For Test instances {} objects {} features {}".format(*X.shape))
        X1 = X.reshape(n_instances * n_objects, n_features)
        scores = n_objects - self.model.predict(X1)
        scores = scores.reshape(n_instances, n_objects)
        scores = normalize(scores)
        self.logger.info("Done predicting scores")
        return scores

    def predict_scores(self, X, **kwargs):
        return super().predict_scores(X, **kwargs)

    def predict_for_scores(self, scores, **kwargs):
        return ObjectRanker.predict_for_scores(self, scores, **kwargs)

    def predict(self, X, **kwargs):
        return super().predict(X, **kwargs)

    def clear_memory(self, **kwargs):
        pass

    def set_tunable_parameters(self,
                               alpha=0.0,
                               l1_ratio=0.5,
                               tol=1e-4,
                               **point):
        self.tol = tol
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        if len(point) > 0:
            self.logger.warning('This ranking algorithm does not support'
                                ' tunable parameters'
                                ' called: {}'.format(print_dictionary(point)))
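# A minimal usage sketch for this ranker. It assumes the surrounding csrank
# package supplies the ObjectRanker/Learner bases, print_dictionary, and
# complete_linear_regression_dataset, plus the scikit-learn version it pins
# (the 'normalize' argument above was removed in scikit-learn 1.2). The toy
# data below is illustrative only:
import numpy as np

X = np.random.rand(20, 5, 3)   # (n_instances, n_objects, n_features)
utilities = X.sum(axis=-1)
Y = np.argsort(np.argsort(-utilities, axis=-1), axis=-1)  # rank of each object per instance

err = ExpectedRankRegression(n_object_features=3, alpha=0.1, l1_ratio=0.0)  # l1_ratio < 0.01 -> Ridge branch
err.fit(X, Y)
scores = err.predict_scores(X)  # shape (20, 5); higher score = ranked earlier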
Example #48
    output_file = open('output.txt', 'w', encoding='utf-8')  # 'ANSI' is not a valid Python codec name; utf-8 assumed

    X_train, y_train = prepare_train_data_set()
    X_test = prepare_test_data_set()

    vectorizer = TfidfVectorizer(min_df=5)

    # 'enc' was used below without being defined; a DictVectorizer matches the
    # to_dict('records') usage
    from sklearn.feature_extraction import DictVectorizer
    enc = DictVectorizer()

    X_train_vector = coo_matrix(
        vectorizer.fit_transform(X_train.FullDescription))

    X_test_vector = coo_matrix(vectorizer.transform(X_test.FullDescription))

    X_train_category = coo_matrix(
        enc.fit_transform(X_train[['LocationNormalized',
                                   'ContractTime']].to_dict('records')))

    X_test_category = coo_matrix(
        enc.transform(X_test[['LocationNormalized',
                              'ContractTime']].to_dict('records')))

    X_train_stack = hstack([X_train_vector, X_train_category])
    X_test_stack = hstack([X_test_vector, X_test_category])

    clf = Ridge(alpha=1)
    clf.fit(X_train_stack, y_train)

    # the test set is assumed to contain exactly two rows, hence the unpack
    a, b = clf.predict(X_test_stack)
    print(round(a, 2), round(b, 2), sep=' ', file=output_file)

    output_file.close()
Example #49
    def model_selection(self, train):
        """
        Performs a test/train split on the training data. Gridsearches over three
        regularixation models (Lasso, Ridge, and ElasticNet), and fits a final
        model using the best performing model (Ridge) from the gridsearch stage.
        Returns the validation MSE and RMSE of the final model.

        Args:
            train: cleaned and scaled training data

        Returns:
            Validation MSE and RMSE of best performing gridsearched model.
        """
        # Test/Train split training data
        y = train['SalePrice']
        X = train.drop('SalePrice', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Gridsearch Lasso Model
        lasso = Lasso()
        param_list = {'alpha': np.linspace(.1, 1, 10)}
        lasso_grid = GridSearchCV(lasso,
                                  param_list,
                                  scoring='neg_mean_squared_error',
                                  cv=5)
        lasso_grid.fit(X_train, y_train)
        print('Model: {}, Best Params: {}, Best Score: {}'\
            .format(lasso, lasso_grid.best_params_, abs(lasso_grid.best_score_)))

        # Gridsearch Ridge Model
        ridge = Ridge()
        param_list = {
            'alpha': np.linspace(.1, 1, 10),
            'solver': ['auto', 'svd', 'lsqr', 'cholesky']
        }
        ridge_grid = GridSearchCV(ridge,
                                  param_list,
                                  scoring='neg_mean_squared_error',
                                  cv=5)
        ridge_grid.fit(X_train, y_train)
        print('Model: {}, Best Params: {}, Best Score: {}'\
            .format(ridge, ridge_grid.best_params_, abs(ridge_grid.best_score_)))

        # Gridsearch ElasticNet Model
        elastic = ElasticNet()
        param_list = {
            'alpha': np.linspace(0.5, 0.9, 20),
            'l1_ratio': np.linspace(0.9, 1.0, 10)
        }
        elastic_grid = GridSearchCV(elastic,
                                    param_list,
                                    scoring='neg_mean_squared_error',
                                    cv=5)
        elastic_grid.fit(X_train, y_train)
        print('Model: {}, Best Params: {}, Best Score: {}'\
            .format(elastic, elastic_grid.best_params_, abs(elastic_grid.best_score_)))

        # Best model on validation set of training data
        final_ridge = Ridge(alpha=1.0, solver='svd')
        final_ridge.fit(X_train, y_train)
        y_pred = final_ridge.predict(X_test)
        log_diff = np.log(y_pred + 1) - np.log(y_test + 1)
        score = np.sqrt(np.mean(log_diff**2))
        print('Validation MSE Score: {}'.format(
            mean_squared_error(y_test, y_pred)))
        print('Validation RMSLE Score: {}'.format(score))
Example #50
# splitting the dataset for training and testing

# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)

#importing ridge model

from sklearn.linear_model import Ridge

ridge = Ridge()

ridge.fit(X_train, y_train)

# In[12]:

pred_test = ridge.predict(X_test)
pred_test

# In[13]:

ridge.score(X_test, y_test)

# In[14]:

#MSE

from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, pred_test)

# In[16]:
Example #51
    print("fold n°{}".format(fold_ + 1))
    trn_data, trn_y = train.iloc[trn_idx][features], target.iloc[
        trn_idx].values
    val_data, val_y = train.iloc[val_idx][features], target.iloc[
        val_idx].values

    trn_data.fillna((trn_data.mean()), inplace=True)
    val_data.fillna((val_data.mean()), inplace=True)

    trn_data = trn_data.values
    val_data = val_data.values

    clf = Ridge(alpha=100)
    clf.fit(trn_data, trn_y)

    oof_ridge[val_idx] = clf.predict(val_data)
    predictions_ridge += clf.predict(tst_data) / folds.n_splits

np.save('oof_ridge', oof_ridge)
np.save('predictions_ridge', predictions_ridge)
np.sqrt(mean_squared_error(target.values, oof_ridge))

# In[ ]:

del tst_data
gc.collect()

# 3.78 CV is not bad, but it's far from what the best models can do in this competition. Let's take a look at a few non-linear models. We'll start with LightGBM.

# In[ ]:
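# The LightGBM cell itself is not included in this excerpt; what follows is a
# minimal sketch of one such fold step, reusing trn_data/trn_y and
# val_data/val_y from the Ridge loop above (the parameters are illustrative,
# not the original ones):
import lightgbm as lgb

params = {'objective': 'regression', 'metric': 'rmse',
          'learning_rate': 0.05, 'num_leaves': 63}  # illustrative values

train_set = lgb.Dataset(trn_data, label=trn_y)
val_set = lgb.Dataset(val_data, label=val_y)

model = lgb.train(params, train_set, num_boost_round=1000,
                  valid_sets=[val_set],
                  callbacks=[lgb.early_stopping(stopping_rounds=100)])
oof_lgb_fold = model.predict(val_data, num_iteration=model.best_iteration)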
Example #52
# (reconstructed opening of the assignment: drop the target and non-feature columns)
x_train = df_new.drop(columns=[
    'Total', 'Precipitation', 'Date', 'Day', 'Brooklyn Bridge',
    'Manhattan Bridge', 'Queensboro Bridge', 'Williamsburg Bridge'
])
y_train = df_new['Total']

#%%
from sklearn import preprocessing
from sklearn.linear_model import Ridge
reg = Ridge(alpha=100)
reg.fit(x_train, y_train)

#%%
reg.coef_

#%%
from sklearn.metrics import r2_score, mean_squared_error
y_pred = reg.predict(x_train)

print(r2_score(y_train, y_pred))
print(mean_squared_error(y_train, y_pred))

#%%
import yellowbrick
res = y_train - y_pred

#%%
from yellowbrick.regressor import ResidualsPlot
visualizer = ResidualsPlot(reg)
visualizer.score(x_train, y_train)  # evaluate the model on the training data
visualizer.poof()  # Draw/show/poof the data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

x_train = np.array([[0], [1], [2], [3], [4], [5]])
y_train = np.array([0, 1, 2, 3, 4, 5])

x_test = np.array([[6], [7], [8]])
y_test = np.array([6, 7, 8])

reg = Ridge(alpha=1.0,
            fit_intercept=True,
            normalize=False,  # 'normalize' was removed in scikit-learn 1.2; drop it on newer versions
            copy_X=True,
            max_iter=None,
            tol=0.001,
            solver='auto',
            random_state=None)
reg.fit(x_train, y_train)

y_predict = reg.predict(x_test)

print('Ridge_score: ', reg.score(x_train, y_train))
print('Ridge_coef_: ', reg.coef_)
print('Ridge_intercept_: ', reg.intercept_)

# Plot outputs
plt.scatter(x_test, y_test, color='black')
plt.plot(x_test, y_predict, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()
Example #54
# Ridge regression

import numpy as np
from sklearn.linear_model import Ridge
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

data = np.genfromtxt('Data/RidgeTestData01.txt', delimiter=',', skip_header=0)
plt.plot(data[:, 4])
# plt.show()
# data[:, :4] selects the first four columns of every row
X = data[:, :4]
y = data[:, 4]
poly = PolynomialFeatures(6)
X = poly.fit_transform(X)
train_set_X, test_set_X, train_set_y, test_set_y = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=0)
clf = Ridge(alpha=1.0, fit_intercept=True)
clf.fit(train_set_X, train_set_y)
clf.score(test_set_X, test_set_y)
start = 1  # next, plot a segment of the fitted curve over [start, end)
end = 18
y_pre = clf.predict(X)  # fitted values from calling predict
time = np.arange(start, end)
plt.plot(time, y[start:end], 'b', label="real")
plt.plot(time, y_pre[start:end], 'r', label='predict')
# show the real data (blue) and the fitted curve (red)
plt.legend(loc='upper left')  # set the legend position
plt.show()
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor(n_estimators=50)

lr.fit(X_train, y_train)
lasso.fit(X_train, y_train)
ridge.fit(X_train, y_train)
dtr.fit(X_train, y_train)
rfr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
y_pred_lasso = lasso.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_dtr = dtr.predict(X_test)
y_pred_rfr = rfr.predict(X_test)

pred = lasso.predict(df[[
    'day', 'month', 'year', 'wickets in 1 to 6 1st innings',
    'venue average runs in 1st innings', 'venue average wickets in 1st innings'
]])
act = df[['runs in 7 to 14 overs 1st innings']]

# percentage error of each prediction against the actual runs
dif = []
for i in range(len(pred)):
    k = ((abs(pred[i] - act['runs in 7 to 14 overs 1st innings'].iloc[i])) /
         act['runs in 7 to 14 overs 1st innings'].iloc[i])
    dif.append(k * 100)
# Code starts here
regressor = LinearRegression()
score = cross_val_score(regressor, X_train, y_train, cv=10)

mean_score = np.mean(score)

print(mean_score)

# --------------
from sklearn.linear_model import Lasso

# Code starts here
lasso = Lasso(random_state=0)

lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
r2_lasso = r2_score(y_test, y_pred)

# --------------
from sklearn.linear_model import Ridge

# Code starts here
ridge = Ridge(random_state=0)

ridge.fit(X_train, y_train)

y_pred = ridge.predict(X_test)

r2_ridge = r2_score(y_test, y_pred)
dfExp = scaler.fit_transform(dfExp)

X_train, X_test, y_train, y_test = train_test_split(dfExp,
                                                    yT,
                                                    test_size=0.33,
                                                    random_state=42)

## based on the label value, introduce more anomalous observations

# xTrain = xTrain.join(y_train, lsuffix='_caller', rsuffix='_other')
# print(xTrain)
# print(y_train)

clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.score(X_test, y_test, sample_weight=None)
# y_pred comes back 2-D (n_samples, 1); flatten it to a plain list
list_of_lists = y_pred
flattened = [val for sublist in list_of_lists for val in sublist]

dfOut = dfOg
dfOut = dfOut.drop([
    'travel_start_date', 'per_diem_based_on_rate', 'approval_date', 'taxi',
    'travel_end_date', 'air_fare', 'mileage_based_on_rate', 'mileage', 'hotel',
    'car_rental', 'per_diem'
],
                   axis=1)
dfOut['TotalExpensePred'] = pd.DataFrame(flattened)

dfOut.to_excel('outputRidge.xlsx')
dfLogistic = pd.read_excel('outputRidge.xlsx')
Example #58
    plt.plot(X, y, "b.", linewidth=3)
    plt.legend(loc='upper left')
    plt.xlabel('x')
    plt.axis([0, 3, 0, 4])
plt.figure()
plt.subplot(121)
plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)
plt.ylabel('y', rotation=0, fontsize=18)
plt.subplot(122)
plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)
plt.show()

#Now let's do Ridge Regression with scikit-learn
ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])
#We can also do this with SGD by setting the penalty hyperparameter to 'l2', i.e. adding an L2 regularization term to the cost function
sgd_reg = SGDRegressor(penalty='l2')
sgd_reg.fit(X,y)
sgd_reg.predict([[1.5]])

#----------------------------------------------------------LASSO REGRESSION---------------------------------------------------------
#Lasso stands for: Least Absolute Shrinkage and Selection Operator Regression. Very similar to Ridge regression, but the regularization
#term it adds to the cost function uses the l1 norm of the weight vector instead of half the square of the l2 norm used in Ridge.
#An important characteristic of Lasso is that it tends to completely eliminate (set to zero) the weights (thetas) of the least
#important features (see the standalone check after the plot below).

#Let's plot the same data as before but using Lasso models and smaller alphas instead of our Ridge model
plt.figure()
plt.subplot(121)
plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)
plt.ylabel('y', rotation=0)
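#To see that feature elimination concretely, a small standalone check on synthetic
#data (independent of the plots above; names and alpha are illustrative):
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(42)
X_demo = rng.randn(100, 5)
y_demo = 3 * X_demo[:, 0] + 0.5 * X_demo[:, 1] + 0.1 * rng.randn(100)  # only 2 informative features

lasso_demo = Lasso(alpha=0.1).fit(X_demo, y_demo)
print(lasso_demo.coef_)  # coefficients of the three uninformative features drop to (near) zero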
# Reading Data
data = pd.read_csv('headbrain.csv')
data.head()

# Collecting X and Y
X = data['Head Size(cm^3)'].values
Y = data['Brain Weight(grams)'].values

m = len(X)

X = X.reshape((m, 1))
#X = [x[0] for x in X1]
print(X)
#input()
# Model Initialization
# (the snippet originally built Ridge(alpha=0.05, normalize=True) and then
# immediately overwrote it; only the effective default model is kept here)
reg = Ridge()
# Data Fitting
reg = reg.fit(X, Y)
# Y Prediction
Y_pred = reg.predict(X)

# Model Evaluation
rmse = np.sqrt(mean_squared_error(Y, Y_pred))
r2 = reg.score(X, Y)

print("RMSE")
print(rmse)
print("R2 Score")
print(r2)
  plt.grid()

show_plot(X_train, 'Training Data')
show_plot(X_test, 'Testing Data')


from sklearn.linear_model import Ridge
# Note that Ridge regression performs linear least squares with L2 regularization.
# Create and train the Ridge Linear Regression  Model
regression_model = Ridge()
regression_model.fit(X_train, y_train)

lr_accuracy = regression_model.score(X_test, y_test)
print("Linear Regression Score: ", lr_accuracy)
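# As a sanity check on the note above: with fit_intercept=False, Ridge solves
# w = (X^T X + alpha*I)^-1 X^T y. A small standalone verification on synthetic
# data (names are illustrative):
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X_chk = rng.randn(50, 3)
y_chk = X_chk @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(50)

alpha_chk = 1.0
ridge_chk = Ridge(alpha=alpha_chk, fit_intercept=False).fit(X_chk, y_chk)
w_closed = np.linalg.solve(X_chk.T @ X_chk + alpha_chk * np.eye(3), X_chk.T @ y_chk)
print(np.allclose(ridge_chk.coef_, w_closed))  # True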

predicted_prices = regression_model.predict(X)
# flatten the (n, 1) prediction array into a plain list
Predicted = []
for i in predicted_prices:
  Predicted.append(i[0])

# collect the scaled closing prices the same way
close = []
for i in price_volume_target_scaled_df:
  close.append(i[0])

df_predicted = price_volume_target_df[['Date']]
df_predicted['Close'] = close
df_predicted['Prediction'] = Predicted
interactive_plot(df_predicted, "Original Vs. Prediction")

#LSTM Series model
price_volume_df = individual_stock(stock_price_df, stock_vol_df, 'sp500')