Example No. 1
def linear_learning(labels, train, test):
    label_log=np.log1p(labels)
    linear=LinearRegression()
    model=linear.fit(train, label_log)
    preds1=model.predict(test)
    preds=np.expm1(preds1)
    return  preds
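
The log1p/expm1 round trip above can also be expressed with scikit-learn's TransformedTargetRegressor; a minimal, self-contained sketch (the synthetic data is illustrative only):

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
train = rng.rand(50, 3)                                # illustrative features
labels = np.exp(train @ np.array([1.0, 2.0, 0.5]))     # positive, roughly log-linear target
test = rng.rand(10, 3)

# The wrapper log-transforms the target before fitting and applies expm1
# to the predictions, mirroring linear_learning above.
model = TransformedTargetRegressor(
    regressor=LinearRegression(), func=np.log1p, inverse_func=np.expm1)
model.fit(train, labels)
preds = model.predict(test)
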
def test_partial_dependence_easy_target(est, power):
    # If the target y only depends on one feature in an obvious way (linear or
    # quadratic) then the partial dependence for that feature should reflect
    # it.
    # Here we fit a linear regression model (with polynomial features if
    # needed) and compute r_squared to check that the partial dependence
    # correctly reflects the target.

    rng = np.random.RandomState(0)
    n_samples = 100
    target_variable = 2
    X = rng.normal(size=(n_samples, 5))
    y = X[:, target_variable]**power

    est.fit(X, y)

    averaged_predictions, values = partial_dependence(
        est, features=[target_variable], X=X, grid_resolution=1000)

    new_X = values[0].reshape(-1, 1)
    new_y = averaged_predictions[0]
    # add polynomial features if needed
    new_X = PolynomialFeatures(degree=power).fit_transform(new_X)

    lr = LinearRegression().fit(new_X, new_y)
    r2 = r2_score(new_y, lr.predict(new_X))

    assert r2 > .99
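
In recent scikit-learn releases, sklearn.inspection.partial_dependence returns a Bunch rather than a tuple, so the unpacking above no longer applies; a hedged sketch of the equivalent access (key names per the current docs, to the best of my knowledge):

from sklearn.inspection import partial_dependence

pd_results = partial_dependence(
    est, X=X, features=[target_variable], grid_resolution=1000)
averaged_predictions = pd_results["average"]
values = pd_results["grid_values"]   # this key was "values" in older releases
# the rest of the test proceeds as above
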
Example No. 3
 def _ols(self,x,y):
     lr = LinearRegression()
     coef_xy = lr.fit(y= y.reshape(-1, 1), X= x.reshape(-1, 1)).coef_
     coef_yx = lr.fit(y= x.reshape(-1, 1), X= y.reshape(-1, 1)).coef_
     r_xy = y - coef_xy*x
     r_yx = x - coef_yx*y
     return r_xy/np.std(r_xy), r_yx/np.std(r_yx)
Example No. 4
def linreg_ccv_plot_roc(num_folds):

    global data
    folds = pd.create_folds(data, num_folds)
    classifier = LinearRegression()
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    for i in range(num_folds):
        test_x, test_y, train_x, train_y = pd.split_into_sets(data, folds, i)
        probs = classifier.fit(train_x, train_y).predict(test_x)
        fpr, tpr, thresholds = roc_curve(test_y, probs) #takes, y_true and y_score
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(folds) 
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('%d-fold Clustered Cross-Validation' % num_folds)
    plt.legend(loc="lower right")
    plt.show()   
Example No. 5
 def _reduce_X(self,X,i):
     X_new = np.zeros(X.shape)
     lr = LinearRegression()
     for j in range(X_new.shape[1]):
         lr.fit(y= X[:,j].reshape(-1, 1), X= X[:,i].reshape(-1, 1))
         X_new[:,j] = X[:,j] - lr.coef_*X[:,i]
     return np.delete(X_new, i, axis=1)
Example No. 6
def linregress(X_train, X_test, y_train, y_test):
    coef = []
    for col in X_train.columns.tolist():
        X = StandardScaler().fit_transform(X_train[col])
        lr = LinearRegression()
        lr.fit(X.reshape(-1, 1), y_train)
        coef.append([col, lr.coef_])
    coef = sorted(coef, key=lambda x: x[1])[::-1]
    nos = [x[1] for x in coef]
    labs = [x[0] for x in coef]
    for lab in labs:
        if lab == 'doubles':
            labs[labs.index(lab)] = '2B'
        elif lab == 'triples':
            labs[labs.index(lab)] = '3B'
        elif lab == 'Intercept':
            idx = labs.index('Intercept')
            labs.pop(idx)
            nos.pop(idx)
    labs = [lab.upper() for lab in labs]
    x = range(len(nos))
    plt.plot(x,nos, lw=2, c='b')
    plt.xticks(x, labs)
    plt.title('Linear Regression Coefficients (Win Percentage)')
    plt.savefig('images/coefficients.png')
    plt.show()
    print labs
Example No. 7
def train_regressor(options, embed_map, wordvecs, worddict):
    """
    Return regressor to map word2vec to RNN word space
    """
    # Gather all words from word2vec that appear in wordvecs
    d = defaultdict(lambda : 0)
    for w in embed_map.vocab.keys():
        d[w] = 1
    shared = OrderedDict()
    count = 0
    for w in worddict.keys()[:options['n_words']-2]:
        if d[w] > 0:
            shared[w] = count
            count += 1

    # Get the vectors for all words in 'shared'
    w2v = numpy.zeros((len(shared), 300), dtype='float32')
    sg = numpy.zeros((len(shared), options['dim_word']), dtype='float32')
    for w in shared.keys():
        w2v[shared[w]] = embed_map[w]
        sg[shared[w]] = wordvecs[w]

    clf = LinearRegression()
    clf.fit(w2v, sg)
    return clf
def linearRegressionExample(X, Y):
	# fit_intercept defines whether we should fit an intercept term or not
	est = LinearRegression(fit_intercept=False)
	#fit the data
	est.fit(X,Y)
	# get coefficients
	est.coef_
Example No. 9
def normalize_money_with_date():
    with open('train_test.pickle') as f:
        train_set,test_set = pickle.load(f)
    
    money = float(np.max([movie['total_money'] for movie in train_set]))
    year_money = np.array([[movie['date'].year,float(movie['total_money'])/money] for movie in train_set],float)
    
    year_mean = np.zeros([5,2])
    for y in range(5):
        money = year_money[year_money[:,0] == 2011+y,1]
        plt.scatter(y*np.ones(np.shape(money)),money)
        mean = np.mean(money)
        year_mean[y,:] = np.array([1+y,mean],float)
    
    regressor = LinearRegression()
    regressor.fit(year_mean[:,0:1],year_mean[:,1])
    a,b = regressor.coef_, regressor.intercept_
    with open('coef.pickle') as f:
        coef = pickle.load(f)
        coef['normalize_year'] = {'a':a,'b':b,'base':2010}
    with open('coef.pickle','w') as f:
        pickle.dump(coef,f)
    
    print a,b,regressor.score(year_mean[:,0:1],year_mean[:,1])
    plt.plot(year_mean[:,1])
    plt.savefig('year_money.png')
Example No. 10
def train_leastSquareModel(X, y, fit_intercept=True, normalize=False, copy_X=True, n_jobs=1):
    """
    Train a regression model using Least Square method
    """
    model = LinearRegression(fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, n_jobs=n_jobs)
    model = model.fit(X, y)
    return model
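
Note that the normalize argument used above was deprecated in scikit-learn 1.0 and removed in 1.2; the commonly suggested (though not numerically identical) replacement is to scale the features in a pipeline. A minimal sketch with illustrative data:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.random.rand(20, 3)                    # illustrative data
y = X @ np.array([1.0, -2.0, 0.5]) + 3.0

# Scale the features explicitly instead of passing normalize=True.
model = make_pipeline(StandardScaler(), LinearRegression())
model.fit(X, y)
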
Example No. 11
    def RunLinearRegressionScikit(q):
      totalTimer = Timer()

      # Load input dataset.
      # If the dataset contains two files then the second file is the responses 
      # file.
      Log.Info("Loading dataset", self.verbose)
      if len(self.dataset) == 2:
        X = np.genfromtxt(self.dataset[0], delimiter=',')
        y = np.genfromtxt(self.dataset[1], delimiter=',')
      else:
        X = np.genfromtxt(self.dataset, delimiter=',')
        y = X[:, (X.shape[1] - 1)]
        X = X[:,:-1]

      try:
        with totalTimer:
          # Perform linear regression.
          model = SLinearRegression()
          model.fit(X, y, n_jobs=-1)
          b = model.coef_
      except Exception as e:
        q.put(-1)
        return -1

      time = totalTimer.ElapsedTime()
      q.put(time)
      return time
Example No. 12
def linear_regression(predictors, titanic):
	# Initialize our algorithm class
	alg = LinearRegression()
	# Generate cross validation folds for the titanic dataset.  It returns the row indices corresponding to train and test.
	# We set random_state to ensure we get the same splits every time we run this.
	kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

	predictions = []
	for train, test in kf:
	    # The predictors we're using to train the algorithm.  Note how we only take the rows in the train folds.
	    train_predictors = (titanic[predictors].iloc[train,:])
	    # The target we're using to train the algorithm.
	    train_target = titanic["Survived"].iloc[train]
	    # Training the algorithm using the predictors and target.
	    alg.fit(train_predictors, train_target)
	    # We can now make predictions on the test fold
	    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
	    predictions.append(test_predictions)
	# The predictions are in three separate numpy arrays.  Concatenate them into one.  
	# We concatenate them on axis 0, as they only have one axis.
	predictions = np.concatenate(predictions, axis=0)

	# Map predictions to outcomes (only possible outcomes are 1 and 0)
	predictions[predictions > .5] = 1
	predictions[predictions <=.5] = 0

	accuracy_list = [x == y for x, y in zip(titanic["Survived"], predictions)]

	num_acc = sum(accuracy_list)
	accuracy = sum(accuracy_list) / len(accuracy_list)
	accuracy = accuracy.item()
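
With the current scikit-learn API (KFold takes n_splits and the old cross_validation module is gone), the same cross-validated predictions can be obtained more compactly; a hedged sketch reusing the titanic and predictors names from above:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_predict

kf = KFold(n_splits=3)
preds = cross_val_predict(LinearRegression(), titanic[predictors],
                          titanic["Survived"], cv=kf)
preds = (preds > .5).astype(int)             # map predictions to 0/1 outcomes
accuracy = np.mean(preds == titanic["Survived"])
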
def calc_task_two_one():
    warnings.warn("deprecated", DeprecationWarning)
    model = LinearRegression()
    X = np.array(df[x_list].values)
    y = df['Price'].values
    model.fit(X, y)
    return model, X, y
Example No. 14
def linear_model(df):
    dff = df
    df2 = dff.fillna(0)
    linreg = LinearRegression()
    df2 = df2[pd.notnull(df2[['Mean Price', 'Volume']])]
    df3 = df2[['Mean Price','Volume']]

    x = df3[['Mean Price']]
    y = df3[['Volume']]

    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    linreg.fit(x_train, y_train)
    intercept = linreg.intercept_
    coef = linreg.coef_

    plt.plot(x_train, linreg.predict(x_train), c='g', lw=3, label='Fitted line')
    plt.scatter(x_train, y_train, c='k')
    plt.xlabel('Mean Price')
    plt.ylabel('Volume')
    plt.show()


    # # compute root mean squared error
    # print np.sqrt(metrics.mean_squared_error(y_test, prediction))
    #
    # rss = np.sum((y_test - linreg.predict(x_test)) ** 2)
    score = linreg.score(x_train, y_train)
    print score
Example No. 15
def solution_1(N=3):
  sales_dict, month_list, class_id_list = prepare_data()
  df = make_train_data(sales_dict, month_list, class_id_list, lastN=N)
  df = df.sample(frac=1).reset_index(drop=True)

  data_X = pd.DataFrame()
  for i in range(N):
    data_X["last_"+str(i+1)] = df["last_"+str(i+1)]
  # print(data_X)
  # data_X = pd.DataFrame({
  #            'last_1': df.last_1,
  #            'last_2': df.last_2,
  #            'last_3': df.last_3,
  #            })
  data_Y = df.Y
  test_size = int(len(df)/5)
  train_X = data_X[:-test_size]
  train_Y = data_Y[:-test_size]
  test_Y = data_Y[-test_size:]
  test_X = data_X[-test_size:]

  model = LinearRegression()
  model.fit(train_X, train_Y)
  print("train_size:", len(train_X), ", test_size", len(test_X))
  print("model.coef_ = ", model.coef_)
  print("model.score = ", model.score(test_X, test_Y))
  return model.score(test_X, test_Y), model.coef_,model
Example No. 16
def impute_error(df, is_plot=False):
    slr = LinearRegression()
    mask1 = df['Percent.Error'].notnull()
    mask0 = df['Percent.Error'].isnull()
    df1 = df[mask1]
    df0 = df[mask0]
    print df0.shape, df1.shape

    # linear regression 
    slr.fit(df1[['RadPeer.Score']], df1['Percent.Error'])
    predicted1 = slr.predict(df.loc[mask1, ['RadPeer.Score']])
    predicted0 = slr.predict(df.loc[mask0, ['RadPeer.Score']])

    df.loc[mask0, 'Percent.Error'] = predicted0

    if is_plot: # make plot 
        df1.plot(kind='scatter', x='RadPeer.Score', y='Percent.Error',
                 color='blue', alpha=0.4, label='126 non-null',
                 figsize=(7,7), zorder=2)
        plt.plot(df1[['RadPeer.Score']], predicted1, color='blue',
                 label='linear fit', zorder=1) 	
        plt.scatter(df0[['RadPeer.Score']], predicted0, color='red',
                    alpha=0.6, label='71 null', zorder=3) 	

        plt.legend(loc='upper left')
        plt.savefig('impute_error.png', bbox_inches='tight')
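
A hedged alternative for the same single-column imputation is scikit-learn's IterativeImputer (still an experimental API, hence the enabling import), applied to the df used above:

from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

cols = ['RadPeer.Score', 'Percent.Error']
imputer = IterativeImputer(estimator=LinearRegression())
df[cols] = imputer.fit_transform(df[cols])
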
Example No. 17
def plot_linear_regression():
    a = 0.5
    b = 1.0

    # x from 0 to 30
    x = 30 * np.random.random(20)

    # y = a*x + b with noise
    y = a * x + b + np.random.normal(size=x.shape)

    # create a linear regression model
    clf = LinearRegression()
    clf.fit(x[:, None], y)

    # predict y from the data
    x_new = np.linspace(0, 30, 100)
    y_new = clf.predict(x_new[:, None])

    # plot the results
    ax = plt.axes()
    ax.scatter(x, y)
    ax.plot(x_new, y_new)

    ax.set_xlabel('x')
    ax.set_ylabel('y')

    ax.axis('tight')
Example No. 18
def linear_regression(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: train
    :param train_y: text
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return:the predicted values,learning curve, validation curve
    """
    lin = LinearRegression(normalize=True)
    if get_model:
        print "Fitting Linear..."
        lin.fit(train_x, np.log(train_y+1))
        gbr_pred = np.exp(lin.predict(pred_x))- 1
        for i in range(len(gbr_pred)):
            if gbr_pred[i] < 0:
                gbr_pred[i] = 0
        Votes = gbr_pred[:,np.newaxis]
        Id = np.array(review_id)[:,np.newaxis]
        submission_lin= np.concatenate((Id,Votes),axis=1)
        np.savetxt("submission_lin.csv", submission_lin,header="Id,Votes", delimiter=',',fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if v_curve:
        pass
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(LinearRegression(), "Learning Curve for Linear Regression", train_x, np.log(train_y+1.0))
def bayesian_ridge_regression(feature_array, label_array):
    clf = BayesianRidge(compute_score=True)
    clf.fit(feature_array, label_array)

    ols = LinearRegression()
    ols.fit(feature_array, label_array)


    n_features = 9

    plt.figure(figsize=(6, 5))
    plt.title("Weights of the model")
    plt.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate")
    plt.plot(label_array, 'g-', label="Ground truth")
    plt.plot(ols.coef_, 'r--', label="OLS estimate")
    plt.xlabel("Features")
    plt.ylabel("Values of the weights")
    plt.legend(loc="best", prop=dict(size=12))

    plt.figure(figsize=(6, 5))
    plt.title("Histogram of the weights")
    plt.hist(clf.coef_, bins=n_features, log=True)
    # plt.plot(clf.coef_[feature_array], 5 * np.ones(len(feature_array)),
    #          'ro', label="Relevant features")
    plt.ylabel("Features")
    plt.xlabel("Values of the weights")
    plt.legend(loc="lower left")

    plt.figure(figsize=(6, 5))
    plt.title("Marginal log-likelihood")
    plt.plot(clf.scores_)
    plt.ylabel("Score")
    plt.xlabel("Iterations")
    plt.show()
Example No. 20
def quantify_higher_nesting(higher_dim, lower_dim):
    """
    Quantifies how well higher levels of the tree can be reconstructed from 
    lower levels
    """
    lr = LinearRegression()
    best_score = -1
    relationship = []
    # quantify how well the higher dimensional solution can reconstruct
    # the lower dimensional solution using a linear combination of two factors
    for higher_name, higher_c in higher_dim.iteritems():
        for lower_c1, lower_c2 in combinations(lower_dim.columns, 2):
            # combined prediction
            predict_mat = higher_dim.loc[:,[lower_c1, lower_c2]]
            lr.fit(predict_mat, higher_c)
            score = lr.score(predict_mat, higher_c)
            # individual correlation
            lower_subset = lower_dim.drop(higher_name, axis=1)
            higher_subset = higher_dim.drop([lower_c1, lower_c2], axis=1)
            corr = corr_lower_higher(higher_subset, lower_subset)
            if len(corr)==1:
                other_cols = [corr.iloc[0,0]]
            else:
                other_cols = corr.apply(lambda x: max(x**2)-sorted(x**2)[-2],
                                        axis=1)
            total_score = np.mean(np.append(other_cols, score))
            if total_score>best_score:
                best_score = total_score
                relationship = {'score': score,
                                'lower_factor': higher_c.name, 
                                'higher_factors': (lower_c1, lower_c2), 
                                'coefficients': lr.coef_}
    return relationship
Example No. 21
def event_prediction(thres,min_num_points,b_events,tc_list):
	# b_events: [(x,y,type,year)]
	events = ['accidentsAndIncidents','roadwork','precipitation','deviceStatus','obstruction','trafficConditions']
	ret = []	# [(accidentsAndIncidents,roadwork,precipitation,deviceStatus,obstruction,trafficConditions)]
	lr = LinearRegression()
	for xmin,xmax,ymin,ymax in tc_list:
		cnt = Counter([(e_type,year) for x,y,e_type,year in b_events.value if x>xmin and x<xmax and y>ymin and y<ymax])  # {(e_type,year):count}
		counts = []
		for e in events:
			year_count = {key[1]:val for key,val in cnt.items() if key[0] == e}	# {year:count}
			if len(year_count) == 0:
				counts.append("0.00")
				continue
			year_count_desc_c = sorted(year_count.items(), key=operator.itemgetter(1), reverse = True)  # [(year,count)], descending by count.
			current_max = year_count_desc_c[0][1]
			train_points = []	 # (year,count)
			for y,c in year_count_desc_c:
				if c >= thres*current_max:
					current_max = c
					train_points.append((y,c))
			if len(train_points) < min_num_points:
				# most recent year data for prediction
				year_count_desc_y = sorted(train_points, key=operator.itemgetter(0), reverse = True)  # [(year,count)], descending by year.
				counts.append("%.2f"%(year_count_desc_y[0][1]/12.0))	# use the most recent year for prediction, because we don't have sufficient samples for model training.
			else:
				# linear regression for prediction
				x = np.array([v[0] for v in train_points])
				y = np.array([v[1] for v in train_points])
				m = lr.fit(x[:, np.newaxis], y)
				counts.append("%.2f"%(m.predict(2015)[0]/12.0))
		ret.append(counts)
	return ret
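
Note that m.predict(2015) relies on an older scikit-learn accepting a bare scalar; current versions expect a 2-D array, so a hedged fix for that line would be:

counts.append("%.2f" % (m.predict(np.array([[2015]]))[0] / 12.0))
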
Example No. 22
def best_split_lin_reg(x_vect, y):
    node_lg = LinearRegression(n_jobs=NUM_CORES).fit(x_vect[:, np.newaxis], y)
    node_score = mse(y, node_lg.predict(x_vect[:, np.newaxis]))

    best_score = -np.inf
    best_split_value = None
    best_true_inds = None
    best_false_inds = None

    for split_value in np.unique(x_vect):
        true_inds = x_vect > split_value
        true_ratio = np.sum(true_inds) / float(len(y))
        true_score = ling_reg_score(true_inds, x_vect, y)

        false_inds = np.invert(true_inds)
        false_ratio = 1 - true_ratio
        false_score = ling_reg_score(false_inds, x_vect, y)

        score = node_score - (true_ratio * true_score + false_ratio * false_score)

        if score > best_score:
            best_score = score
            best_split_value = split_value
            best_true_inds = true_inds
            best_false_inds = false_inds

    return best_false_inds, best_true_inds, best_split_value, best_score
Example No. 23
def predict_residuals(train, test, forward):
    """
    Linear Regression
    Args:
        train: Training data
        test: Test data
        forward: If True, axis 0 holds the features and axis 1 the target; otherwise swapped

    Returns:
        Residuals: observed test targets minus the linear model's predictions

    """
    mdl = LinearRegression()
    # mdl = GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
    if forward:
        X = unpack(train, axis=0)
        y = unpack(train, axis=1)
        x_hat = unpack(test, axis=0)
        y_hat = unpack(test, axis=1)
    else:
        X = unpack(train, axis=1)
        y = unpack(train, axis=0)
        x_hat = unpack(test, axis=1)
        y_hat = unpack(test, axis=0)

    mdl.fit(X, y)
    return y_hat - mdl.predict(x_hat)
Example No. 24
    def do_stack_learn(self):
        reviews = AbstractEstimateBase.reviews
        # Collect several estimates
        es = np.array([
            self._usermodel.all_estimates(),
            self._similarmovie.all_estimates(k = 1),
            self._similarmovie.all_estimates(k = 2),
            self._similarmovie.all_estimates(k = 3),
            self._similarmovie.all_estimates(k = 4),
            self._similarmovie.all_estimates(k = 5),
        ])
        
        total_error = 0.0
        coefficients = []
        
        reg = LinearRegression()
        # Iterate over all users
        for u in range(reviews.shape[0]):
            es0 = np.delete(es, u, axis=1)
            r0 = np.delete(reviews, u, axis=0)
            X, Y = np.where(r0 > 0)
            X = es[:, X, Y]
            y = r0[r0 > 0]
            reg.fit(X.T, y)
            coefficients.append(reg.coef_)
        
            r0 = reviews[u]
            X = np.where(r0 > 0)
            p0 = reg.predict(es[:, u, X].squeeze().T)
            err0 = r0[r0 > 0] - p0
            total_error += np.dot(err0, err0)
        coefficients = np.array(coefficients)

        print coefficients
Example No. 25
def predict_device_byday_linear_regression():
    X,Y_unique,Y_all,X_raw = load_device_counter_byday()
    # print X
    # print Y_unique
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    training_size = 160
    # model.fit(X[:training_size],Y_unique[:training_size])
    model.fit(X[:training_size],Y_all[:training_size])

    start_index = 180
    end_index = 190
    X_to_predict = X[start_index:end_index]
    # X_to_predict.append([date_str_toordinal('2017-04-18')])
    # X_to_predict.append([date_str_toordinal('2017-03-27')])

    print X_to_predict
    # Y_real = Y_unique[start_index:end_index]
    Y_real = Y_all[start_index:end_index]
    print X_raw[start_index:end_index]
    y_predicted=model.predict(X_to_predict)
    # print y_predicted
    y_predicted = np.array(y_predicted).astype(int)
    print y_predicted
    print Y_real
    # print y_predicted - np.array(Y_real)

    # plt.subplot(111)
    # plt.scatter(X_to_predict,Y_real,c='r')
    plt.scatter(X_to_predict,y_predicted)
    # plt.plot(X_to_predict,y_predicted)
    plt.show()
Example No. 26
    def setUp(self):
        # initialize a Image Database Object
        self.model_db = ModelDatabase('test')

        # for test purposes
        self.x = np.array([1,2,3,4,5,6,7,8,9,10]).reshape(-1,1)
        self.y = np.array([2,4,6,8,10,12,14,16,18,20]).reshape(-1,1)
        self.z = np.array([4,8,12,16,20,24,28,32,36,40]).reshape(-1,1)
        self.k = np.array([5,5,5,5,5,5,5,5,5,5]).reshape(-1,1)
        self.l = np.array([-2,-4,-6,-8,-10,-12,-14,-16,-18,-20]).reshape(-1,1)

        # y = 2x ; Assume id : '121A'
        self.regression_model1 = LinearRegression()
        self.regression_model1.fit(self.x, self.y)

        # z = 4x ; Assume id : '243'
        self.regression_model2 = LinearRegression()
        self.regression_model2.fit(self.x,self.z)

        # k = 5 ; Assume id : '392'
        self.regression_model3 = LinearRegression()
        self.regression_model3.fit(self.x,self.k)

        # l = -2x ; Assume id : '41A3'
        self.regression_model4 = LinearRegression()
        self.regression_model4.fit(self.x,self.l)
    def get_tracks_params(self, x, y, labels, sample_weight=None):

        tracks_params = []

        unique_labels = numpy.unique(labels)
        track_ids = unique_labels[unique_labels != -1]

        if len(track_ids) == 0:
            return []

        for track_id in track_ids:

            x_track = x[labels == track_id]
            y_track = y[labels == track_id]

            if sample_weight is not None:
                sample_weight_track = sample_weight[labels == track_id]
            else:
                sample_weight_track = None

            lr = LinearRegression()
            lr.fit(x_track.reshape(-1,1), y_track, sample_weight_track)

            params = list(lr.coef_) + [lr.intercept_]
            tracks_params.append(params)

        return numpy.array(tracks_params)
def plot_EFA_relationships(all_results):
    EFA_all_results = {k:v.EFA for k,v in all_results.items()}
    scores = {k:v.get_scores() for k,v in EFA_all_results.items()}
    # quantify relationships using linear regression
    for name1, name2 in combinations(scores.keys(), 2):
        scores1 = scores[name1]
        scores2 = scores[name2]
        lr = LinearRegression()  
        cv_score = np.mean(cross_val_score(lr, scores1, scores2, cv=10))
        print(name1, name2, cv_score)
    # plot
    # plot task factors in task PCA space
    pca = PCA(2)
    task_pca = pca.fit_transform(scores['task'])
    palettes = ['Reds', 'Blues', 'Greens']
    all_colors = []
    # plot scores in task PCA space
    f, ax = plt.subplots(figsize=[12,8])
    ax.set_facecolor('white')

    for k,v in scores.items():
        palette = sns.color_palette(palettes.pop(), n_colors = len(v.columns))
        all_colors += palette
        lr = LinearRegression()
        lr.fit(task_pca, v)
        for i, coef in enumerate(lr.coef_):
            plt.plot([0,coef[0]], [0, coef[1]], linewidth=3, 
                     c=palette[i], label=k+'_'+str(v.columns[i]))
    leg = plt.legend(bbox_to_anchor=(.8, .5))
    frame = leg.get_frame()
    frame.set_color('black')
    beautify_legend(leg, all_colors)
Example No. 29
def lr_prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model):
    # Dataframe to store the model prediction
    df_model_lr = df_train.copy()
    for col in col_names:
        # X will store the features and the outcome Y
        X = df_train.copy()
        X = X.rename(columns={col:'Y'})
        X = pd.merge(X, df_day_avg_values[[col]], left_on='day_time', right_index=True)
        X = X.rename(columns={col:col+'avg'})

        # Building the neighbors (from adjacency list) with missing values filled as in model
        neighbors_col = ['S'+str(n) for n in adjacency_list[int(col[1:])]]
        X = X[['Y']].join(df_model[neighbors_col])

        X_train = X[X['Y'] != -1]
        X_test = X[X['Y'] == -1]
        test_indices = X[X['Y'] == -1].index
        col_values = X['Y']

        if len(X_test):
            # Models
            lr = LinearRegression()
            lr = lr.fit(X_train.drop('Y', axis=1), X_train.Y)
            col_values.loc[test_indices] = lr.predict(X_test.drop('Y', axis=1))

            # Filling the result with the current sensor prediction
            df_model_lr[col] = np.round(col_values)
    return df_model_lr
Example No. 30
def LinearRegressionPred(X, Y):
    lm = LinearRegression()
    lm.fit(X, Y)
    preds = lm.predict(X)
    preds_sorted = lm.predict(np.sort(X, 0))

    return preds_sorted
# all columns have p-values below 5%

# now perform linear regression
# first, scale the features
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
features=sc.fit_transform(features)


#train test split
from sklearn.model_selection import train_test_split
features_train,features_test,labels_train,labels_test=train_test_split(features,labels,random_state=0,test_size=0.005)

#now perform multiple linear regression
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(features_train,labels_train)
pred=regressor.predict(features_test)
print(pd.DataFrame({'actual':labels_test,'pred':pred}))
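
A hedged way to quantify the fit above, using the same variable names as the snippet:

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

print('R^2 :', r2_score(labels_test, pred))
print('RMSE:', np.sqrt(mean_squared_error(labels_test, pred)))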











Example No. 32
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# load the data
veriler = pd.read_csv('maaslar.csv')

x = veriler.iloc[:,1:2]
y = veriler.iloc[:,2:]
X = x.values
Y = y.values


#linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X,Y)

plt.scatter(X,Y,color='red')
plt.plot(x,lin_reg.predict(X), color = 'blue')
plt.show()


#polynomial regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
x_poly = poly_reg.fit_transform(X)
print(x_poly)
lin_reg2 = LinearRegression()
lin_reg2.fit(x_poly,y)
plt.scatter(X,Y,color = 'red')
Example No. 33
import numpy
import matplotlib.pyplot as plt

from ages_net_worths import ageNetWorthData

ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()



from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(ages_train, net_worths_train)

### get Katie's net worth (she's 27)
### sklearn predictions are returned in an array, so you'll want to index into
### the output to get what you want, e.g. net_worth = predict([[27]])[0] (not
### exact syntax, the point is the [0] at the end). In addition, make sure the
### argument to your prediction function is in the expected format - if you get
### a warning about needing a 2d array for your data, a list of lists will be
### interpreted by sklearn as such (e.g. [[27]]).
km_net_worth = reg.predict([[27]]) ### fill in the line of code to get the right value

### get the slope
### again, you'll get a 2-D array, so stick the [0][0] at the end
slope = reg.coef_[0, 0] ### fill in the line of code to get the right value

### get the intercept
### here you get a 1-D array, so stick [0] on the end to access
### the info we want
intercept = reg.intercept_[0] ### fill in the line of code to get the right value
Example No. 34
#! /usr/bin/env python3
# coding : utf-8

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split

clf = LinearRegression()
data = datasets.load_boston()
X = data.data
y = data.target
# X_train = X[:int(len(X) * 0.7)]
# X_test = X[int(len(X) * 0.7):]
# y_train = y[:int(len(y) * 0.7)]
# y_test = y[int(len(y) * 0.7):]
X_train, X_test, y_train, y_test = train_test_split(data.data,
                                                    data.target,
                                                    test_size=0.2)

clf.fit(X_train, y_train)
print(clf.coef_, clf.intercept_)
print(clf.score(X_test, y_test))

# from sklearn import datasets
# from sklearn.model_selection import cross_val_predict
# from sklearn import linear_model
# import matplotlib.pyplot as plt

# lr = linear_model.LinearRegression()
# boston = datasets.load_boston()
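
Note that load_boston was removed from scikit-learn in version 1.2; a hedged drop-in for the snippet above uses the California housing data instead:

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

data = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(data.data,
                                                    data.target,
                                                    test_size=0.2)

clf = LinearRegression()
clf.fit(X_train, y_train)
print(clf.coef_, clf.intercept_)
print(clf.score(X_test, y_test))
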
Example No. 35
advert = pd.read_excel("灰度表1.xlsx")
dataSet = pd.read_excel("预测表2.xlsx")
columns = [
    '线路价格(不含税)', '总里程', '业务类型', '需求类型1', '需求类型2', '是否续签', '车辆长度', '车辆吨位',
    '打包类型', '运输等级', '计划卸货等待时长', '计划运输时长', '线路总成本', '需求紧急程度'
]
advert = advert[columns]
dataSet = dataSet[columns[1:]]
dataSet = fillNaN(dataSet)
advert = fillNaN(advert)
advert.columns = columns

col = columns[1:]
X = advert[col]
y = advert['线路价格(不含税)']
lm1 = LinearRegression()
lm1.fit(X, y)
lm1_predict = lm1.predict(X[col])
print("R^2  lm1:", r2_score(y, lm1_predict))
# print(lm1.intercept_)
nparr = lm1.coef_.tolist()  # convert the model coefficients to a list for easier computation
lis = []
dataSet.columns = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'
]
for i in range(1489):
    count = 0.0
    for j in range(13):
        count += dataSet[str(j)][i] * nparr[j]
    lis.append([(count + lm1.intercept_), dataSet['12'][i]])
dataWrite(lis, 4)
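
The loop above recomputes the model's output by hand (each row's dot product with the coefficients, plus the intercept); assuming dataSet's columns stay in the same order as the training features, the same list can be built with a single predict call:

pred_prices = lm1.predict(dataSet.values)    # includes the intercept
lis = [[p, v] for p, v in zip(pred_prices, dataSet['12'])]
dataWrite(lis, 4)
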
Example No. 36
    def api_return():

        if 'id' in request.args:
            id = int(request.args['id'])

            month = 12
            year = 19
            listing = listings.query.get(id)

            property_type = listing.property_type.property_type
            room_type = listing.room_types
            neighbourhood = listing.neighborhood.neighborhood
            accommodates = listing.accommodates
            bedrooms = listing.bedrooms
            bathrooms = listing.bathrooms
            beds = listing.bedrooms

            df = pd.DataFrame(
                columns=['month', 'year', 'property_type', 'room_type',
                         'neighbourhood', 'accommodates', 'bedrooms',
                         'bathrooms', 'beds'],
                data=[[month, year, property_type, room_type, neighbourhood,
                       accommodates, bedrooms, bathrooms, beds]]
            )

            train = pd.read_csv('https://raw.githubusercontent.com/JimKing100/airbnb-app-4/master/Datascience/data/train.csv')
            train = train.drop(columns=['old_index'])
            target = 'price'

            features = train.columns.drop(target)
            X_train = train[features]
            y_train = train[target]

            pipeline = make_pipeline(
                ce.OrdinalEncoder(),
                LinearRegression()
            )

            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(df)
            y_pred = y_pred[0]

            transformers = make_pipeline(
                ce.OrdinalEncoder(),
                SimpleImputer(strategy='mean')
            )

            X_train_transformed = transformers.fit_transform(X_train)

            model = LinearRegression()
            model.fit(X_train_transformed, y_train)

            pro, con = explains(df, model, X_train_transformed, transformers, 0)

            output_str = jsonify(prediction=str(int(y_pred)),
                                 pros1=pro[0],
                                 pros2=pro[1],
                                 cons1=con[0],
                                 cons2=con[1])

        else:
            return "Error: no id field provided"

        return output_str
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=1 / 3,
                                                    random_state=0)

# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Visualising the Training set results
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='green')
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
# X = dataset.iloc[:, : -1].values
y = dataset.values[:, 4]

ct_X = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])],
                         remainder='passthrough')
X = np.array(ct_X.fit_transform(X), dtype=float)

# Avoiding the dummy variable trap
X = X[:, 1:]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the results
y_pred = regressor.predict(X=X_test)

# building the optimal model with backwards elimination
# setup variables for loop
X = np.append(arr=np.ones((len(X), 1)).astype(float), values=X, axis=1)
columns = [0, 1, 2, 3, 4, 5]  # Used columns
critical_value = 0.1
running = True

# This is working
while running:
    # Setup the new smaller set with more significant variables
Example No. 39
# Multiple linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# fit_intercept: whether to use a constant term
# in y = a + bX, a is the constant term

X=wine.drop(["index","type","quality"],axis=1)
y=wine.quality


X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=1)


model=LinearRegression()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
# Measure performance
#RMSE
np.round(np.sqrt(mean_squared_error(y_test,y_pred)),5)

#######################################################
from sklearn.linear_model import Ridge
ridge=Ridge(alpha=0.05)
ridge.fit(X_train,y_train)
y_pred = ridge.predict(X_test)
np.round(np.sqrt(mean_squared_error(y_test,y_pred)),5)

fig = plt.figure(figsize=(6,3))
ax=fig.add_subplot(111)
from twitterscraper import query_tweets_from_user
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression

sid = SentimentIntensityAnalyzer()
model = LinearRegression()

features = []
labels = []

# Increase the number of tweets
all_tweets = query_tweets_from_user("barackobama", limit=800)

# Now train on 600 tweets
training = all_tweets[:600]
testing = all_tweets[600:]

for tweet in training:
    tweetAnalysis = sid.polarity_scores(tweet.text)
    # Divide the number of retweets and likes by 1000 and convert to an integer, so that 30,000 and 30,500 both become 30
    # Multiply the probabilities to work with larger numbers; 0.10 becomes 10
    features.append([
        int(tweetAnalysis["neg"] * 100),
        int(tweetAnalysis["pos"] * 100),
        int(tweetAnalysis["neu"] * 100)
    ])
    labels.append(int(tweet.likes / 1000))

model = model.fit(features, labels)
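
The testing slice above is never used; a hedged sketch of how the held-out tweets could be scored with the fitted model, using the same feature construction as for training:

test_features, test_labels = [], []
for tweet in testing:
    tweetAnalysis = sid.polarity_scores(tweet.text)
    test_features.append([
        int(tweetAnalysis["neg"] * 100),
        int(tweetAnalysis["pos"] * 100),
        int(tweetAnalysis["neu"] * 100)
    ])
    test_labels.append(int(tweet.likes / 1000))

print(model.score(test_features, test_labels))  # R^2 on the held-out tweets
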
Example No. 41
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 10
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier as MLP
import pandas_datareader.data as web
from pandas import Series, DataFrame
from sklearn.linear_model import LinearRegression
import datetime, math
from sklearn.neighbors import KNeighborsRegressor as knn
import matplotlib.dates as mdates
clf = LinearRegression()  #n_jobs=-1)

days = 240
sight = 480
scaler = MinMaxScaler(feature_range=(0, 1))
start = datetime.datetime(2010, 1, 1)

end = datetime.datetime.today() + datetime.timedelta(days=days)
dayss = (end - start).days
predicted_list = [end - datetime.timedelta(days=x) for x in range(days)]
predicted_list.reverse()

stock = input("Stock: ").upper()
df = web.DataReader(stock, 'yahoo', start, end)
data = df['Adj Close']
X, y = [], []
Example No. 42
# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

#Linear Regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X,y)

#Polynomial Linear Regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X)

lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

# Visualising the Linear Regression results
plt.scatter(X, y, color = 'red')
plt.plot(X, lin_reg.predict(X), color = 'blue')
plt.title('Truth or Bluff (Linear Regression)')
plt.xlabel('Position level')
feature_cols = ["age", "sex", "bmi", "children", "smoker", "region"]
target = "charges"
#separating feature attributes and target attribute
y = df[target].values
X = df[feature_cols].values

#80/20 hold out approach
#splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

#training and fitting the model
reg = LinearRegression()
reg.fit(X_train, y_train)
X_input = [[12, 1, 28, 0, 1, 2]]
y_pred = reg.predict(X_test)

#plotting actual vs predicted. First n instances
df1 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
n = 25
df1 = df1.head(n)
df1.plot(kind='bar', figsize=(10, 8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.title('Actual vs Predicted Values')
plt.xlabel('Instance')
plt.ylabel('Charges')
plt.show()
Example No. 44
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import LinearRegression

df = pd.read_csv("train.csv")

# Specify the feature columns and the target
# The target is SalePrice
X = df[["OverallQual", "GrLivArea"]].values
y = df["SalePrice"].values

# Fit the linear regression
slr = LinearRegression()
slr.fit(X, y)

# Print the regression coefficients and the y-intercept
print("Coefficient : {0}".format(slr.coef_))
a1, a2 = slr.coef_
print("intercepts : {0}".format(slr.intercept_))
b = slr.intercept_

# 3D plot (draw the regression plane)
x, y, z = np.array(df["OverallQual"]), np.array(df["GrLivArea"]), np.array(
    df["SalePrice"].values)
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter3D(np.ravel(x), np.ravel(y), np.ravel(z))
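
The comment above promises the regression plane, but the snippet stops at the scatter plot; a hedged sketch that draws the fitted plane using the coefficients a1, a2 and intercept b already computed:

# Evaluate the fitted plane a1*x + a2*y + b on a grid and draw it.
xx, yy = np.meshgrid(np.linspace(x.min(), x.max(), 20),
                     np.linspace(y.min(), y.max(), 20))
zz = a1 * xx + a2 * yy + b
ax.plot_surface(xx, yy, zz, alpha=0.3)
plt.show()
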
Example No. 45
def clusteringPoints(train_data, if_show_cluster, Y1, Y2, height, width, side):
    
    
    # ===============================================
    num_cluster_choices = [1, 2, 3, 4] 
        
        
    # ===============================================
    # Initialize Average Score  
    best_avg_score      = np.inf
    
    
    # ===============================================
    # Start to select number of clusters
    for num_cluster in num_cluster_choices:

        
        # ===============================================
        # Do Clustering
        cluster = GaussianMixture(n_components = num_cluster, covariance_type = "full")
        gmm     = cluster.fit(train_data)
        labels  = gmm.predict(train_data)
        
        
        # ===============================================
        # Prepare Validation
        scores            = []
        X1_all            = []
        X2_all            = []
        
        
        # ===============================================
        # Start Validation
        for label in range(num_cluster):
            indices = np.where(labels == label)[0].tolist()


            # ===============================================
            # Regression on clusters that have more than 1 point
            if len(indices) > 1:
                reg_data = train_data[indices]
                reg      = LinearRegression().fit(reg_data[:, 0].reshape(-1, 1), reg_data[:, 1])
                
                
                # ===============================================
                # Get coefficient
                k = reg.coef_[0]
                b = reg.intercept_
                
                
                # ===============================================
                # Avoid Bad k-values
                if (side == "L" and k > 0) or (side == "R" and k < 0):
                    scores.append([-1, 1000])
                    break
                
                
                # ===============================================
                # Get coefficient
                score = reg.score(reg_data[:, 0].reshape(-1, 1), reg_data[:, 1])
                
                
                # ===============================================
                # Scores          
                scores.append([label, score])
                
                
                # ===============================================
                # Get drawing points for X coordinate
                X1 = (Y1 - b) / k
                X2 = (Y2 - b) / k
                
                
                # ===============================================
                # Append X coordinates to lists
                X1_all.append([label, X1])
                X2_all.append([label, X2])
                
                
                # ===============================================
                # Update Info
        
        
        # ===============================================
        # Check if this is a good clustering
        avg_score = sum(pair[1] for pair in scores) / len(scores)

        if abs(avg_score - 1) <= abs(best_avg_score - 1):
            
            
            # ===============================================
            # Update
            best_avg_score               = avg_score 
            best_scores                  = scores
            best_num_cluster            = num_cluster
            best_X1_all  = X1_all
            best_X2_all  = X2_all
            best_labels  = labels
     
  
    # ===============================================
    # Try to reduce the number of clusters
    while (len(best_scores) > 1):
        
        
        # ===============================================
        # Initialization & n choose 2
        if_reduced = False
        label_comb = combinations(list(set(best_labels)), 2)
        
        
        # ===============================================
        # Try to merge vassal_label into suzerain_label
        for vassal_label, suzerain_label in label_comb:
            
            
            # ===============================================
            # Get Merged Indices
            vassal_indices   = np.where(best_labels == vassal_label)[0].tolist()
            suzerain_indices = np.where(best_labels == suzerain_label)[0].tolist()
            indices          = np.concatenate((vassal_indices, suzerain_indices))
            
            
            # ===============================================
            # Do Regression for merged Data
            reg_data = train_data[indices]
            reg      = LinearRegression().fit(reg_data[:, 0].reshape(-1, 1), reg_data[:, 1])
            score    = reg.score(reg_data[:, 0].reshape(-1, 1), reg_data[:, 1])
            
            
            # ===============================================
            # After merging two clusters, what are the scores for all clusters?
            scores    = [pair for pair in best_scores 
                         if (pair[0] != vassal_label and pair[0] != suzerain_label)]
            scores.append([suzerain_label, score])
            
        

            # ===============================================
            # Get Average
            avg_score = sum(pair[1] for pair in scores) / len(scores)
            
            

            # ===============================================
            # Also need to worry about k
            temp_k = reg.coef_[0]
            if (side == "L" and temp_k <=0) or (side == "R" and temp_k >= 0):
                valid_k = True
            else:
                valid_k = False
                
    
            # ===============================================
            # Does this merging actually help us?
            # 0.03 is tolerance because we want to merge some clusters
            if (abs(avg_score - 1) <= abs(best_avg_score - 1) + 0.03) and valid_k:
                
                
                # ===============================================
                # Save information if  we want to use this merge later
                k              = temp_k
                b              = reg.intercept_
                new_X1         = (Y1 - b) / k
                new_X2         = (Y2 - b) / k
                removed_label  = vassal_label
                merged_label   = suzerain_label
                
                
                # ===============================================
                # Useful for next loop round
                best_avg_score = avg_score
                if_reduced     = True
         
            
        # ===============================================
        # If num of clusters is reduced, we use the best merging
        if if_reduced:        
            
            
            # ===============================================
            # Update information for drawing
            best_num_cluster  = best_num_cluster  - 1
            best_labels = [merged_label if item == removed_label else item for item in best_labels]
            best_X1_all = [pair for pair in best_X1_all if (pair[0] != removed_label and pair[0] != merged_label)]
            best_X2_all = [pair for pair in best_X2_all if (pair[0] != removed_label and pair[0] != merged_label)]
            best_X1_all.append([merged_label, new_X1])
            best_X2_all.append([merged_label, new_X2])
        
        
        # ===============================================
        # If not reduced, we just keep what we have before
        else:
            break


    # ===============================================
    # If we want to see clusters
    if if_show_cluster:
      plt.figure()
      plt.axis([0, width, 0, height])
      plt.scatter(train_data[:, 0], train_data[:, 1], c = best_labels, cmap= "viridis")
      plt.gca().invert_yaxis()
        
      
    return [pair[1] for pair in best_X1_all], [pair[1] for pair in best_X2_all], len(list(set(best_labels)))
Example No. 46
def auto_arima(y,
               X=None,
               start_p=2,
               d=None,
               start_q=2,
               max_p=5,
               max_d=2,
               max_q=5,
               start_P=1,
               D=None,
               start_Q=1,
               max_P=2,
               max_D=1,
               max_Q=2,
               max_order=5,
               m=1,
               seasonal=True,
               stationary=False,
               information_criterion='aic',
               alpha=0.05,
               test='kpss',
               seasonal_test='ocsb',
               stepwise=True,
               n_jobs=1,
               start_params=None,
               trend=None,
               method='lbfgs',
               maxiter=50,
               offset_test_args=None,
               seasonal_test_args=None,
               suppress_warnings=True,
               error_action='trace',
               trace=False,
               random=False,
               random_state=None,
               n_fits=10,
               return_valid_fits=False,
               out_of_sample_size=0,
               scoring='mse',
               scoring_args=None,
               with_intercept="auto",
               sarimax_kwargs=None,
               **fit_args):

    # NOTE: Doc is assigned BELOW this function

    # Temporary shim until we remove `exogenous` support completely
    X, fit_args = pm_compat.get_X(X, **fit_args)

    # pop out the deprecated kwargs
    fit_args = _warn_for_deprecations(**fit_args)

    # misc kwargs passed to various fit or test methods
    offset_test_args = val.check_kwargs(offset_test_args)
    seasonal_test_args = val.check_kwargs(seasonal_test_args)
    scoring_args = val.check_kwargs(scoring_args)
    sarimax_kwargs = val.check_kwargs(sarimax_kwargs)

    m = val.check_m(m, seasonal)
    trace = val.check_trace(trace)
    # can't have stepwise AND parallel
    n_jobs = val.check_n_jobs(stepwise, n_jobs)

    # validate start/max points
    start_p, max_p = val.check_start_max_values(start_p, max_p, "p")
    start_q, max_q = val.check_start_max_values(start_q, max_q, "q")
    start_P, max_P = val.check_start_max_values(start_P, max_P, "P")
    start_Q, max_Q = val.check_start_max_values(start_Q, max_Q, "Q")

    # validate d & D
    for _d, _max_d in ((d, max_d), (D, max_D)):
        if _max_d < 0:
            raise ValueError('max_d & max_D must be positive integers (>= 0)')
        if _d is not None:
            if _d < 0:
                raise ValueError('d & D must be None or a positive '
                                 'integer (>= 0)')

    # check on n_iter
    if random and n_fits < 0:
        raise ValueError('n_iter must be a positive integer '
                         'for a random search')

    # validate error action
    actions = {'warn', 'raise', 'ignore', 'trace', None}
    if error_action not in actions:
        raise ValueError('error_action must be one of %r, but got %r'
                         % (actions, error_action))

    # start the timer after the parameter validation
    start = time.time()

    # copy array
    y = check_endog(y, dtype=DTYPE)
    n_samples = y.shape[0]

    # the workhorse of the model fits
    fit_partial = functools.partial(
        solvers._fit_candidate_model,
        start_params=start_params,
        trend=trend,
        method=method,
        maxiter=maxiter,
        fit_params=fit_args,
        suppress_warnings=suppress_warnings,
        trace=trace,
        error_action=error_action,
        scoring=scoring,
        out_of_sample_size=out_of_sample_size,
        scoring_args=scoring_args,
        information_criterion=information_criterion,
    )

    # check for constant data
    if is_constant(y):
        warnings.warn('Input time-series is completely constant; '
                      'returning a (0, 0, 0) ARMA.')

        return _return_wrapper(
            solvers._sort_and_filter_fits(
                fit_partial(
                    y,
                    X=X,
                    order=(0, 0, 0),
                    seasonal_order=(0, 0, 0, 0),
                    with_intercept=val.auto_intercept(
                        with_intercept, False),  # False for the constant model
                    **sarimax_kwargs
                )
            ),
            return_valid_fits, start, trace)

    information_criterion = \
        val.check_information_criterion(information_criterion,
                                        out_of_sample_size)

    # the R code handles this, but I don't think statsmodels
    # will even fit a model this small...
    # if n_samples <= 3:
    #     if information_criterion != 'aic':
    #         warnings.warn('n_samples (%i) <= 3 '
    #                       'necessitates using AIC' % n_samples)
    #     information_criterion = 'aic'

    # adjust max p, q -- R code:
    # max.p <- min(max.p, floor(serieslength/3))
    # max.q <- min(max.q, floor(serieslength/3))
    max_p = int(min(max_p, np.floor(n_samples / 3)))
    max_q = int(min(max_q, np.floor(n_samples / 3)))

    # this is not in the R code and poses a risk that R did not consider...
    # if max_p|q has now dropped below start_p|q, correct it.
    start_p = min(start_p, max_p)
    start_q = min(start_q, max_q)

    # if it's not seasonal, we can avoid multiple 'if not is None' comparisons
    # later by just using this shortcut (hack):
    # TODO: can we remove this hack now?
    if not seasonal:
        D = m = -1

    # TODO: check rank deficiency, check for constant Xs, regress if necessary
    xx = y.copy()
    if X is not None:
        lm = LinearRegression().fit(X, y)
        xx = y - lm.predict(X)

    # choose the order of differencing
    # is the TS stationary?
    if stationary:
        d = D = 0

    # todo: or not seasonal ?
    if m == 1:
        D = max_P = max_Q = 0
    # m must be > 1 for nsdiffs
    elif D is None:  # we don't have a D yet and we need one (seasonal)
        D = nsdiffs(xx, m=m, test=seasonal_test, max_D=max_D,
                    **seasonal_test_args)

        if D > 0 and X is not None:
            diffxreg = diff(X, differences=D, lag=m)
            # check for constance on any column
            if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any():
                D -= 1

    # D might still be None if not seasonal
    if D > 0:
        dx = diff(xx, differences=D, lag=m)
    else:
        dx = xx

    # If D was too big, we might have gotten rid of x altogether!
    if dx.shape[0] == 0:
        raise ValueError("The seasonal differencing order, D=%i, was too "
                         "large for your time series, and after differencing, "
                         "there are no samples remaining in your data. "
                         "Try a smaller value for D, or if you didn't set D "
                         "to begin with, try setting it explicitly. This can "
                         "also occur in seasonal settings when m is too large."
                         % D)

    # difference the exogenous matrix
    if X is not None:
        if D > 0:
            diffxreg = diff(X, differences=D, lag=m)
        else:
            diffxreg = X
    else:
        # here's the thing... we're only going to use diffxreg if exogenous
        # was not None in the first place. However, PyCharm doesn't know that
        # and it thinks we might use it before assigning it. Therefore, assign
        # it to None as a default value and it won't raise the warning anymore.
        diffxreg = None

    # determine/set the order of differencing by estimating the number of
    # orders it would take in order to make the TS stationary.
    if d is None:
        d = ndiffs(
            dx,
            test=test,
            alpha=alpha,
            max_d=max_d,
            **offset_test_args,
        )

        if d > 0 and X is not None:
            diffxreg = diff(diffxreg, differences=d, lag=1)

            # if any columns are constant, subtract one order of differencing
            if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any():
                d -= 1

    # check differences (do we want to warn?...)
    if not suppress_warnings:  # TODO: context manager for entire block  # noqa: E501
        val.warn_for_D(d=d, D=D)

    if d > 0:
        dx = diff(dx, differences=d, lag=1)

    # check for constant
    if is_constant(dx):
        ssn = (0, 0, 0, 0) if not seasonal \
            else sm_compat.check_seasonal_order((0, D, 0, m))

        # Include the benign `ifs`, because R's auto.arima does. R has some
        # more options to control that we don't, but this is more readable
        # with a single `else` clause than a complex `elif`.
        if D > 0 and d == 0:
            with_intercept = val.auto_intercept(with_intercept, True)
            # TODO: if ever implemented in sm
            # fixed=mean(dx/m, na.rm = TRUE)
        elif D > 0 and d > 0:
            pass
        elif d == 2:
            pass
        elif d < 2:
            with_intercept = val.auto_intercept(with_intercept, True)
            # TODO: if ever implemented in sm
            # fixed=mean(dx, na.rm = TRUE)
        else:
            raise ValueError('data follow a simple polynomial and are not '
                             'suitable for ARIMA modeling')

        # perfect regression
        return _return_wrapper(
            solvers._sort_and_filter_fits(
                fit_partial(
                    y,
                    X=X,
                    order=(0, d, 0),
                    seasonal_order=ssn,
                    with_intercept=with_intercept,
                    **sarimax_kwargs
                )
            ),
            return_valid_fits, start, trace
        )

    # seasonality issues
    if m > 1:
        if max_P > 0:
            max_p = min(max_p, m - 1)
        if max_Q > 0:
            max_q = min(max_q, m - 1)

    # TODO: if approximation
    #   . we need method='css' or something similar for this

    # R determines whether to use a constant like this:
    #   allowdrift <- allowdrift & (d + D) == 1
    #   allowmean <- allowmean & (d + D) == 0
    #   constant <- allowdrift | allowmean
    # but we don't have `allowdrift` or `allowmean` so use just d and D
    if with_intercept == 'auto':
        with_intercept = (d + D) in (0, 1)
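        # e.g. d=0, D=0 -> a mean term may be fitted; d + D == 1 -> a drift term
        # may be fitted; d + D >= 2 -> no constant is added automatically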

    if not stepwise:

        # validate max_order
        if max_order is None:
            max_order = np.inf
        elif max_order < 0:
            raise ValueError('max_order must be None or a non-negative '
                             'integer (>= 0)')

        search = solvers._RandomFitWrapper(
            y=y,
            X=X,
            fit_partial=fit_partial,
            d=d,
            D=D,
            m=m,
            max_order=max_order,
            max_p=max_p,
            max_q=max_q,
            max_P=max_P,
            max_Q=max_Q,
            random=random,
            random_state=random_state,
            n_fits=n_fits,
            n_jobs=n_jobs,
            seasonal=seasonal,
            trace=trace,
            with_intercept=with_intercept,
            sarimax_kwargs=sarimax_kwargs,
        )

    else:
        if n_samples < 10:
            start_p = min(start_p, 1)
            start_q = min(start_q, 1)
            start_P = start_Q = 0

        # seed p, q, P, Q vals
        p = min(start_p, max_p)
        q = min(start_q, max_q)
        P = min(start_P, max_P)
        Q = min(start_Q, max_Q)

        # init the stepwise model wrapper
        search = solvers._StepwiseFitWrapper(
            y,
            X=X,
            start_params=start_params,
            trend=trend,
            method=method,
            maxiter=maxiter,
            fit_params=fit_args,
            suppress_warnings=suppress_warnings,
            trace=trace,
            error_action=error_action,
            out_of_sample_size=out_of_sample_size,
            scoring=scoring,
            scoring_args=scoring_args,
            p=p,
            d=d,
            q=q,
            P=P,
            D=D,
            Q=Q,
            m=m,
            max_p=max_p,
            max_q=max_q,
            max_P=max_P,
            max_Q=max_Q,
            seasonal=seasonal,
            information_criterion=information_criterion,
            with_intercept=with_intercept,
            **sarimax_kwargs,
        )

    sorted_res = search.solve()
    return _return_wrapper(sorted_res, return_valid_fits, start, trace)
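For context, a minimal sketch of how the public auto_arima entry point that wraps this search logic is typically called; this is only an illustration, assuming pmdarima is installed, with the bundled wineind loader standing in for any univariate series:

import pmdarima as pm

# a monthly series shipped with pmdarima, used purely as a stand-in
y = pm.datasets.load_wineind()

# stepwise search over (p, d, q)(P, D, Q) with a 12-period season
model = pm.auto_arima(
    y,
    seasonal=True, m=12,
    stepwise=True,
    suppress_warnings=True,
    trace=True,            # print each candidate's information criterion
)
print(model.summary())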
Ejemplo n.º 47
0
features = data.iloc[:, 3:].values
labels = data.iloc[:, 2].values

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.1, random_state=0)

#scaling of data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features_train = sc.fit_transform(features_train)
features_test = sc.transform(features_test)

#system is trained using linear regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(features_train, labels_train)

#prediction over the test data
pred = regressor.predict(features_test)

#prediction over next lot of data
x = pd.read_csv("rfmmed.csv")
testd = x.iloc[99:, 1:]
testd = sc.transform(testd)
pred1 = regressor.predict(testd)

score = regressor.score(features_train, labels_train)
print(score)
score = regressor.score(features_test, labels_test)
print(score)
import mglearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


x,y = mglearn.datasets.load_extended_boston()

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 0)


lr = LinearRegression().fit(x_train,y_train)

print("Training set score : %f" %lr.score(x_train,y_train))
print("Test set score : %f" %lr.score(x_test,y_test))
Ejemplo n.º 49
0
#
# Indeed, :class:`~sklearn.linear_model.LinearRegression` is a least squares
# approach minimizing the mean squared error (MSE) between the training and
# predicted targets. In contrast,
# :class:`~sklearn.linear_model.QuantileRegressor` with `quantile=0.5`
# minimizes the mean absolute error (MAE) instead.
#
# Let's first compute the training errors of such models in terms of mean
# squared error and mean absolute error. We will use the asymmetric Pareto
# distributed target to make it more interesting as mean and median are not
# equal.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

linear_regression = LinearRegression()
quantile_regression = QuantileRegressor(quantile=0.5, alpha=0)

y_pred_lr = linear_regression.fit(X, y_pareto).predict(X)
y_pred_qr = quantile_regression.fit(X, y_pareto).predict(X)

print(f"""Training error (in-sample performance)
    {linear_regression.__class__.__name__}:
    MAE = {mean_absolute_error(y_pareto, y_pred_lr):.3f}
    MSE = {mean_squared_error(y_pareto, y_pred_lr):.3f}
    {quantile_regression.__class__.__name__}:
    MAE = {mean_absolute_error(y_pareto, y_pred_qr):.3f}
    MSE = {mean_squared_error(y_pareto, y_pred_qr):.3f}
    """)

# %%
Ejemplo n.º 50
0
predictors = ["Dietary Calories (cal)", "Steps (count)"]

#lag Y variable, because our weight in the morning is a function of what we did yesterday
missing = df.loc[1][target]  #save the first value
Y = df[target].shift(-1).dropna().values  #shift removes the last value
Y = np.append(missing, Y)  #put the saved value back at the front of the Y array

#impute missing values
X = df[predictors]
from sklearn.impute import SimpleImputer  # sklearn.preprocessing.Imputer was removed in scikit-learn 0.22
imp = SimpleImputer(missing_values=0, strategy='mean')
X = imp.fit_transform(X)

#train the machine learning model
from sklearn.linear_model import LinearRegression
ols = LinearRegression(fit_intercept=True)  # 'normalize' was removed from LinearRegression in scikit-learn 1.2

ols.fit(X, Y)

#reporting from the model
coefs = ols.coef_
inter = round(ols.intercept_, 3)
print("-" * 15 + "model intercept" + "-" * 15)
print("all else constant the model predicts that {t} should be {i}".format(
    t=target, i=inter))

print("-" * 15 + "predictor variables coefficients" + "-" * 15)
for i, var in enumerate(predictors):
    print("for one unit increase in {v} your model predicts:".format(v=var))
    print("a {c} change in {t}".format(c=coefs[i], t=target))
Ejemplo n.º 51
0
                              'ACTION':'ACTION',
                              'THRILLER':'SUSPENSE',
                              'ANIMATION': 'ANIMATION',
                              'ROMANTIC COMEDY':'COMEDY',
                              'ROMANTIC DRAMA': 'DRAMA',
                              'ROMANCE': 'DRAMA',
                              'DOCUMENTARY': 'DOCUMENTARY'
                               })

# prediction
X_train = df_train.iloc[:,2:6].values
y_train = df_train['OBO'].values

X_test = df_test.iloc[:,2:6].values 

mod_reg = LinearRegression()
mod_reg.fit(X_train, y_train)
#y_pred = mod_reg.predict(X_test)

# APP
app = dash.Dash()

colors = {
    'background': '#111111',
    'text': '#808080'
}

available_indicators = np.array(['OBO', 'UA_T', 'TA_T', 'DI_T', 'FC_T'])


markdown_text ='''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
myData = pd.read_excel('data.xlsx')

# split datasets: 90% training data & 10% test data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(myData['ukuran'],
                                                    myData['harga'],
                                                    test_size=.1)

# linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# training
model.fit(myData[['ukuran']], myData['harga'])

# save model: pickle
import pickle

with open('1_modelPickle', 'wb') as modelku:
    pickle.dump(model, modelku)
Ejemplo n.º 53
0
 def __init__(self, k=3, buffer_ratio=4, allow_sub_k=True):
     self.k = k
     self.buffer_ratio = buffer_ratio
     self.model = LinearRegression()
     self.variance = 3.
     self.allow_sub_k = allow_sub_k
Ejemplo n.º 54
0
    
    
    # test_predict_plot =np.empty_like(data)
    # test_predict_plot[:,:]=np.nan
    # test_predict_plot[len(train_predict)+(lags*2)+1:len(data)-1,:]=test_predict
    
    # plt.plot(data, label='Observed', color='blue')
    # plt.plot(train_predict_plot, label='Training-set predictions', color='red', alpha=0.5)
    # plt.plot(test_predict_plot, label='Test-set predictions', color='yellow')
    # plt.legend(loc='best')
    # plt.show()
    
    # plt.savefig('Grafico MLP Lag' + str(lags) +'.png')
    
    from sklearn.linear_model import LinearRegression
    rl = LinearRegression().fit(X_train,y_train)
    rl_trainscore =rl.score(X_train, y_train)
    rl_testscore=rl.score(X_test, y_test)
    rl_predicttest =rl.predict(X_test)
    rl_predicttrain =rl.predict(X_train)
    rl_r2=pearsonr(y_test, rl_predicttest)
    # rl_r2=r2_score(y_test,rl_predicttest)
    # rl_rmse = mean_squared_error(y_test, rl_predicttest)
    # rl_mae = mean_absolute_error(y_test, rl_predicttest)
    
    print("Iniciando Regressão Linear")

    rl_r2_train = cross_val_score(rl, X_train, y_train, cv=splits, scoring=my_scorer)
    rl_r2_test = cross_val_score(rl, X_test, y_test, cv=splits, scoring=my_scorer)
    rl_mse_train = cross_val_score(rl, X_train, y_train, cv=splits, scoring='neg_mean_squared_error')
    rl_mae_train = cross_val_score(rl, X_train, y_train, cv=splits, scoring='neg_mean_absolute_error')
Ejemplo n.º 55
0
rnd_reg.fit(X_train, y_train)

y_pred_rf = rnd_reg.predict(X_test)

# R² score (coefficient of determination)
from sklearn.metrics import r2_score
result_score_random_reg = r2_score(y_test, y_pred_rf)
print('result score for random forest regression is ', result_score_random_reg)

####################################################################################################################################################

# Linear Regression

# Fitting Linear Regression to the dataset
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_linear = lin_reg.predict(X_test)

result_score_linear_prediction = r2_score(y_test, y_pred_linear)

print('result score for linear regression is ', result_score_linear_prediction)

####################################################################################################################################################

# Polynomial Regression

from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)
lin_reg_2 = LinearRegression()
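# (a plausible completion, not in the original snippet: fit the degree-2 model
# on the polynomial features and score it on the transformed test set)
lin_reg_2.fit(X_poly, y_train)
y_pred_poly = lin_reg_2.predict(poly_reg.transform(X_test))
result_score_poly_prediction = r2_score(y_test, y_pred_poly)
print('result score for polynomial regression is ', result_score_poly_prediction)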
Ejemplo n.º 56
0
    str = str.replace(',', '')
    return float(str.replace('+', ''))

# preprocess the data
import numpy as np
converter_1 = np.vectorize(lambda str: convertSize(str))
converter_2 = np.vectorize(lambda str: removePlus(str))

X[:, 2] = converter_1(X[:, 2])  # column file size
y = converter_2(y)  # column install

# split the dataset, 20% for testing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

from sklearn.decomposition import PCA
pca = PCA(n_components=1)
X_new = pca.fit_transform(X)

matrix_w = pca.components_.T
X_train, X_test = X_train.dot(matrix_w), X_test.dot(matrix_w)
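# note: pca.transform(X) also subtracts pca.mean_ before projecting; dotting the
# raw data with the components skips that centering, so this manual projection
# differs from pca.transform by a constant offset per component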

# use Linear Regression to train on the projected data
regr2 = LinearRegression()
regr2.fit(X_train, y_train)
y_pred = regr2.predict(X_test)
def validate():
    """
    run KFOLD method for regression 
    """
    #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    
    
    x = 105
    y = 106
    
    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])
    
    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)
        
        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            return "file already analyzed!"
        
        
        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        pred_surge.sort_values(by = 'date', inplace = True)
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)
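        # PCA(.95): keep the smallest number of principal components that
        # together explain at least 95% of the variance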
        
        #apply 10 fold cross validation
        kf = KFold(n_splits=10, shuffle=True, random_state=29)  # random_state requires shuffle=True
        
        metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
            #train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)
            
            #predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)    
            
            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
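            # note: folds whose correlation is not significant (p >= 0.05) are
            # skipped entirely, so metric_corr and metric_rmse average only the
            # significant folds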
            
        
        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)
        
        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')
        
        #original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis = 0)
        
        
        #save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
        
        #cd to dir_in
        os.chdir(dir_in)
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


df = pd.read_csv("AutoInsurSweden.csv", sep = '\t')
df['Y'] = [x.replace(',', '.') for x in df['Y']]
df['Y'] = df['Y'].astype(float)
print(df.head())
X = df['X']
Y = df['Y']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

plt.scatter(X_train, Y_train)
plt.xlabel("X_train")
plt.ylabel("Y_train")
plt.show()

print("X_train contain = ", X_train.shape, "    and    Y_train contain = ", Y_train.shape)
print("X_test  contain = ", X_test.shape, "    and    Y_test   contain = ", Y_test.shape)

model = LinearRegression()
model.fit(X_train.values.reshape(-1,1), Y_train.values.reshape(-1,1))
#prediction = model.predict(X_test.values.reshape(-1,1))

score = model.score(X_test.values.reshape(-1,1), Y_test.values.reshape(-1,1))

print("Score = ", score)
Ejemplo n.º 59
0
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# Importing the dataset
data = pd.read_csv('50_Startups.csv')
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values


ct  = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

x_train, x_test, y_train, y_test  = train_test_split(x, y, test_size=0.2,  random_state=1)

regressor = LinearRegression()
regressor.fit(x_train, y_train)

percentErrors = (abs(regressor.predict(x_test) - y_test) / y_test) * 100
AveragePercentError = sum(percentErrors) / len(percentErrors)
y_pred = regressor.predict(x_test)

np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
print("the accruacy of the model is:", 100 - AveragePercentError)

#print( (abs(regressor.predict(x_test) - y_test) /y_test)*100)



Ejemplo n.º 60
0
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# load the data
data = pd.read_csv('test.csv', encoding='gbk')

# draw a scatter plot and compute the correlation coefficient between x and y
plt.scatter(data.活动推广费,data.销售额)

print(data.corr())

# estimate the model parameters and build the regression model
'''
(1) First import LinearRegression, the solver class for simple linear regression
(2) Then use that class to build the model, giving the lrModel model variable
'''

lrModel = LinearRegression()
# (3) Next, pick out the independent and dependent variables
x = data[['活动推广费']]    # independent variable (promotion spend)
y = data[['销售额']]   # dependent variable (sales)

# model training
'''
Call the model's fit method to train it.
This training step is where the parameters are estimated,
fitting the model to the data.
'''
lrModel.fit(x,y)

# check the model's score after training
print(lrModel.score(x,y))
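
# (illustrative addition) the fitted parameters themselves can also be inspected
print(lrModel.intercept_, lrModel.coef_)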