def linear_learning(labels, train, test):
    label_log = np.log1p(labels)
    linear = LinearRegression()
    model = linear.fit(train, label_log)
    preds1 = model.predict(test)
    preds = np.expm1(preds1)
    return preds
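# Hedged aside (not part of the snippet above): the same log1p/expm1 target
# transform can be expressed with sklearn's TransformedTargetRegressor, which
# applies the inverse transform automatically at predict time. A minimal
# sketch, assuming numpy as np and LinearRegression are imported as above.
from sklearn.compose import TransformedTargetRegressor

def linear_learning_ttr(labels, train, test):
    model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.log1p, inverse_func=np.expm1)
    model.fit(train, labels)
    return model.predict(test)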
def test_partial_dependence_easy_target(est, power):
    # If the target y only depends on one feature in an obvious way (linear or
    # quadratic) then the partial dependence for that feature should reflect
    # it.
    # We here fit a linear regression model (with polynomial features if
    # needed) and compute r_squared to check that the partial dependence
    # correctly reflects the target.

    rng = np.random.RandomState(0)
    n_samples = 100
    target_variable = 2
    X = rng.normal(size=(n_samples, 5))
    y = X[:, target_variable] ** power

    est.fit(X, y)

    averaged_predictions, values = partial_dependence(
        est, features=[target_variable], X=X, grid_resolution=1000)

    new_X = values[0].reshape(-1, 1)
    new_y = averaged_predictions[0]
    # add polynomial features if needed
    new_X = PolynomialFeatures(degree=power).fit_transform(new_X)

    lr = LinearRegression().fit(new_X, new_y)
    r2 = r2_score(new_y, lr.predict(new_X))

    assert r2 > .99
def _ols(self, x, y):
    lr = LinearRegression()
    coef_xy = lr.fit(y=y.reshape(-1, 1), X=x.reshape(-1, 1)).coef_
    coef_yx = lr.fit(y=x.reshape(-1, 1), X=y.reshape(-1, 1)).coef_
    r_xy = y - coef_xy * x
    r_yx = x - coef_yx * y
    return r_xy / np.std(r_xy), r_yx / np.std(r_yx)
def linreg_ccv_plot_roc(num_folds):
    global data
    folds = pd.create_folds(data, num_folds)
    classifier = LinearRegression()
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    for i in range(num_folds):
        test_x, test_y, train_x, train_y = pd.split_into_sets(data, folds, i)
        probs = classifier.fit(train_x, train_y).predict(test_x)
        fpr, tpr, thresholds = roc_curve(test_y, probs)  # takes y_true and y_score
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1,
                 label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(folds)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('%d-fold Clustered Cross-Validation' % num_folds)
    plt.legend(loc="lower right")
    plt.show()
def _reduce_X(self, X, i):
    X_new = np.zeros(X.shape)
    lr = LinearRegression()
    for j in range(X_new.shape[1]):
        lr.fit(y=X[:, j].reshape(-1, 1), X=X[:, i].reshape(-1, 1))
        # flatten coef_ (shape (1, 1)) so the broadcast yields a 1-D column
        X_new[:, j] = X[:, j] - lr.coef_.ravel() * X[:, i]
    return np.delete(X_new, i, axis=1)
def linregress(X_train, X_test, y_train, y_test):
    coef = []
    for col in X_train.columns.tolist():
        # scale one column at a time (2-D input keeps StandardScaler happy)
        X = StandardScaler().fit_transform(X_train[col].values.reshape(-1, 1))
        lr = LinearRegression()
        lr.fit(X, y_train)
        coef.append([col, lr.coef_])
    coef = sorted(coef, key=lambda x: x[1])[::-1]
    nos = [x[1] for x in coef]
    labs = [x[0] for x in coef]
    for lab in labs:
        if lab == 'doubles':
            labs[labs.index(lab)] = '2B'
        elif lab == 'triples':
            labs[labs.index(lab)] = '3B'
        elif lab == 'Intercept':
            idx = labs.index('Intercept')
            labs.pop(idx)
            nos.pop(idx)
    labs = [lab.upper() for lab in labs]
    x = range(len(nos))
    plt.plot(x, nos, lw=2, c='b')
    plt.xticks(x, labs)
    plt.title('Linear Regression Coefficients (Win Percentage)')
    plt.savefig('images/coefficients.png')
    plt.show()
    print(labs)
def train_regressor(options, embed_map, wordvecs, worddict):
    """
    Return regressor to map word2vec to RNN word space
    """
    # Gather all words from word2vec that appear in wordvecs
    d = defaultdict(lambda: 0)
    for w in embed_map.vocab.keys():
        d[w] = 1
    shared = OrderedDict()
    count = 0
    for w in list(worddict.keys())[:options['n_words'] - 2]:
        if d[w] > 0:
            shared[w] = count
            count += 1

    # Get the vectors for all words in 'shared'
    w2v = numpy.zeros((len(shared), 300), dtype='float32')
    sg = numpy.zeros((len(shared), options['dim_word']), dtype='float32')
    for w in shared.keys():
        w2v[shared[w]] = embed_map[w]
        sg[shared[w]] = wordvecs[w]

    clf = LinearRegression()
    clf.fit(w2v, sg)
    return clf
def linearRegressionExample(X, Y):
    # fit_intercept defines whether we should fit an intercept term or not
    est = LinearRegression(fit_intercept=False)
    # fit the data
    est.fit(X, Y)
    # return the coefficients
    return est.coef_
def normalize_money_with_date():
    with open('train_test.pickle', 'rb') as f:
        train_set, test_set = pickle.load(f)
    money = float(np.max([movie['total_money'] for movie in train_set]))
    year_money = np.array([[movie['date'].year, float(movie['total_money']) / money]
                           for movie in train_set], float)
    year_mean = np.zeros([5, 2])
    for y in range(5):
        money = year_money[year_money[:, 0] == 2011 + y, 1]
        plt.scatter(y * np.ones(np.shape(money)), money)
        mean = np.mean(money)
        year_mean[y, :] = np.array([1 + y, mean], float)
    regressor = LinearRegression()
    regressor.fit(year_mean[:, 0:1], year_mean[:, 1])
    a, b = regressor.coef_, regressor.intercept_
    with open('coef.pickle', 'rb') as f:
        coef = pickle.load(f)
    coef['normalize_year'] = {'a': a, 'b': b, 'base': 2010}
    with open('coef.pickle', 'wb') as f:
        pickle.dump(coef, f)
    print(a, b, regressor.score(year_mean[:, 0:1], year_mean[:, 1]))
    plt.plot(year_mean[:, 1])
    plt.savefig('year_money.png')
def train_leastSquareModel(X, y, fit_intercept=True, normalize=False,
                           copy_X=True, n_jobs=1):
    """
    Train a regression model using the least-squares method
    """
    model = LinearRegression(fit_intercept=fit_intercept, normalize=normalize,
                             copy_X=copy_X, n_jobs=n_jobs)
    model = model.fit(X, y)
    return model
# nested helper: `self`, `SLinearRegression`, `Timer` and `Log` come from the
# enclosing scope of the benchmark class
def RunLinearRegressionScikit(q):
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the responses
    # file.
    Log.Info("Loading dataset", self.verbose)
    if len(self.dataset) == 2:
        X = np.genfromtxt(self.dataset[0], delimiter=',')
        y = np.genfromtxt(self.dataset[1], delimiter=',')
    else:
        X = np.genfromtxt(self.dataset, delimiter=',')
        y = X[:, (X.shape[1] - 1)]
        X = X[:, :-1]

    try:
        with totalTimer:
            # Perform linear regression (newer sklearn takes n_jobs in the
            # constructor rather than in fit).
            model = SLinearRegression(n_jobs=-1)
            model.fit(X, y)
            b = model.coef_
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def linear_regression(predictors, titanic):
    # Initialize our algorithm class
    alg = LinearRegression()
    # Generate cross-validation folds for the titanic dataset. It returns the
    # row indices corresponding to train and test.
    # We set random_state to ensure we get the same splits every time we run this.
    kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

    predictions = []
    for train, test in kf:
        # The predictors we're using to train the algorithm. Note how we only
        # take the rows in the train folds.
        train_predictors = titanic[predictors].iloc[train, :]
        # The target we're using to train the algorithm.
        train_target = titanic["Survived"].iloc[train]
        # Training the algorithm using the predictors and target.
        alg.fit(train_predictors, train_target)
        # We can now make predictions on the test fold
        test_predictions = alg.predict(titanic[predictors].iloc[test, :])
        predictions.append(test_predictions)

    # The predictions are in three separate numpy arrays. Concatenate them
    # into one. We concatenate them on axis 0, as they only have one axis.
    predictions = np.concatenate(predictions, axis=0)

    # Map predictions to outcomes (only possible outcomes are 1 and 0)
    predictions[predictions > .5] = 1
    predictions[predictions <= .5] = 0
    accuracy_list = [x == y for x, y in zip(titanic["Survived"], predictions)]
    num_acc = sum(accuracy_list)
    accuracy = sum(accuracy_list) / len(accuracy_list)
    accuracy = accuracy.item()
    return accuracy
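# Hedged companion sketch (not part of the original function): since
# "Survived" is binary, a classifier is the more idiomatic choice than
# thresholding LinearRegression output at 0.5. This assumes the same
# `titanic` DataFrame and `predictors` list as above.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def logistic_regression_accuracy(predictors, titanic):
    alg = LogisticRegression(max_iter=1000)
    scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
    return scores.mean()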
def calc_task_two_one():
    warnings.warn("deprecated", DeprecationWarning)
    model = LinearRegression()
    X = np.array(df[x_list].values)
    y = df['Price'].values
    model.fit(X, y)
    return model, X, y
def linear_model(df):
    dff = df
    df2 = dff.fillna(0)
    linreg = LinearRegression()
    df2 = df2[pd.notnull(df2[['Mean Price', 'Volume']])]
    df3 = df2[['Mean Price', 'Volume']]
    x = df3[['Mean Price']]
    y = df3[['Volume']]
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    linreg.fit(x_train, y_train)
    intercept = linreg.intercept_
    coef = linreg.coef_
    plt.plot(x_train, linreg.predict(x_train), c='g', lw=3, label='Fitted line')
    plt.scatter(x_train, y_train, c='k')
    plt.xlabel('Mean Price')
    plt.ylabel('Volume')
    plt.show()
    # # compute root mean squared error
    # print(np.sqrt(metrics.mean_squared_error(y_test, prediction)))
    # # rss = np.sum((y_test - linreg.predict(x_test)) ** 2)
    score = linreg.score(x_train, y_train)
    print(score)
def solution_1(N=3):
    sales_dict, month_list, class_id_list = prepare_data()
    df = make_train_data(sales_dict, month_list, class_id_list, lastN=N)
    df = df.sample(frac=1).reset_index(drop=True)

    data_X = pd.DataFrame()
    for i in range(N):
        data_X["last_" + str(i + 1)] = df["last_" + str(i + 1)]
    # print(data_X)
    # data_X = pd.DataFrame({
    #     'last_1': df.last_1,
    #     'last_2': df.last_2,
    #     'last_3': df.last_3,
    # })
    data_Y = df.Y

    test_size = int(len(df) / 5)
    train_X = data_X[:-test_size]
    train_Y = data_Y[:-test_size]
    test_Y = data_Y[-test_size:]
    test_X = data_X[-test_size:]

    model = LinearRegression()
    model.fit(train_X, train_Y)
    print("train_size:", len(train_X), ", test_size", len(test_X))
    print("model.coef_ = ", model.coef_)
    print("model.score = ", model.score(test_X, test_Y))
    return model.score(test_X, test_Y), model.coef_, model
def impute_error(df, is_plot=False):
    slr = LinearRegression()
    mask1 = df['Percent.Error'].notnull()
    mask0 = df['Percent.Error'].isnull()
    df1 = df[mask1]
    df0 = df[mask0]
    print(df0.shape, df1.shape)

    # linear regression
    slr.fit(df1[['RadPeer.Score']], df1['Percent.Error'])
    predicted1 = slr.predict(df.loc[mask1, ['RadPeer.Score']])
    predicted0 = slr.predict(df.loc[mask0, ['RadPeer.Score']])
    df.loc[mask0, 'Percent.Error'] = predicted0

    if is_plot:
        # make plot
        df1.plot(kind='scatter', x='RadPeer.Score', y='Percent.Error',
                 color='blue', alpha=0.4, label='126 non-null',
                 figsize=(7, 7), zorder=2)
        plt.plot(df1[['RadPeer.Score']], predicted1, color='blue',
                 label='linear fit', zorder=1)
        plt.scatter(df0[['RadPeer.Score']], predicted0, color='red',
                    alpha=0.6, label='71 null', zorder=3)
        plt.legend(loc='upper left')
        plt.savefig('impute_error.png', bbox_inches='tight')
def plot_linear_regression():
    a = 0.5
    b = 1.0

    # x from 0 to 30
    x = 30 * np.random.random(20)

    # y = a*x + b with noise
    y = a * x + b + np.random.normal(size=x.shape)

    # create a linear regression model
    clf = LinearRegression()
    clf.fit(x[:, None], y)

    # predict y from the data
    x_new = np.linspace(0, 30, 100)
    y_new = clf.predict(x_new[:, None])

    # plot the results
    ax = plt.axes()
    ax.scatter(x, y)
    ax.plot(x_new, y_new)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.axis('tight')
def linear_regression(train_x, train_y, pred_x, review_id,
                      v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: training features
    :param train_y: training target
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values, learning curve, validation curve
    """
    lin = LinearRegression(normalize=True)
    if get_model:
        print("Fitting Linear...")
        lin.fit(train_x, np.log(train_y + 1))
        lin_pred = np.exp(lin.predict(pred_x)) - 1
        for i in range(len(lin_pred)):
            if lin_pred[i] < 0:
                lin_pred[i] = 0
        Votes = lin_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_lin = np.concatenate((Id, Votes), axis=1)
        np.savetxt("submission_lin.csv", submission_lin, header="Id,Votes",
                   delimiter=',', fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if v_curve:
        pass
    if l_curve:
        print("Working on Learning Curves")
        plot_learning_curve(LinearRegression(),
                            "Learning Curve for Linear Regression",
                            train_x, np.log(train_y + 1.0))
def bayesian_ridge_regression(feature_array, label_array):
    clf = BayesianRidge(compute_score=True)
    clf.fit(feature_array, label_array)

    ols = LinearRegression()
    ols.fit(feature_array, label_array)

    n_features = 9
    plt.figure(figsize=(6, 5))
    plt.title("Weights of the model")
    plt.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate")
    plt.plot(label_array, 'g-', label="Ground truth")
    plt.plot(ols.coef_, 'r--', label="OLS estimate")
    plt.xlabel("Features")
    plt.ylabel("Values of the weights")
    plt.legend(loc="best", prop=dict(size=12))

    plt.figure(figsize=(6, 5))
    plt.title("Histogram of the weights")
    plt.hist(clf.coef_, bins=n_features, log=True)
    # plt.plot(clf.coef_[feature_array], 5 * np.ones(len(feature_array)),
    #          'ro', label="Relevant features")
    plt.ylabel("Features")
    plt.xlabel("Values of the weights")
    plt.legend(loc="lower left")

    plt.figure(figsize=(6, 5))
    plt.title("Marginal log-likelihood")
    plt.plot(clf.scores_)
    plt.ylabel("Score")
    plt.xlabel("Iterations")
    plt.show()
def quantify_higher_nesting(higher_dim, lower_dim):
    """ Quantifies how well higher levels of the tree can be reconstructed
    from lower levels """
    lr = LinearRegression()
    best_score = -1
    relationship = []
    # quantify how well the higher dimensional solution can reconstruct
    # the lower dimensional solution using a linear combination of two factors
    for higher_name, higher_c in higher_dim.items():
        for lower_c1, lower_c2 in combinations(lower_dim.columns, 2):
            # combined prediction
            predict_mat = higher_dim.loc[:, [lower_c1, lower_c2]]
            lr.fit(predict_mat, higher_c)
            score = lr.score(predict_mat, higher_c)
            # individual correlation
            lower_subset = lower_dim.drop(higher_name, axis=1)
            higher_subset = higher_dim.drop([lower_c1, lower_c2], axis=1)
            corr = corr_lower_higher(higher_subset, lower_subset)
            if len(corr) == 1:
                other_cols = [corr.iloc[0, 0]]
            else:
                other_cols = corr.apply(lambda x: max(x**2) - sorted(x**2)[-2],
                                        axis=1)
            total_score = np.mean(np.append(other_cols, score))
            if total_score > best_score:
                best_score = total_score
                relationship = {'score': score,
                                'lower_factor': higher_c.name,
                                'higher_factors': (lower_c1, lower_c2),
                                'coefficients': lr.coef_}
    return relationship
def event_prediction(thres, min_num_points, b_events, tc_list):
    # b_events: [(x, y, type, year)]
    events = ['accidentsAndIncidents', 'roadwork', 'precipitation',
              'deviceStatus', 'obstruction', 'trafficConditions']
    # ret: [(accidentsAndIncidents, roadwork, precipitation, deviceStatus,
    #        obstruction, trafficConditions)]
    ret = []
    lr = LinearRegression()
    for xmin, xmax, ymin, ymax in tc_list:
        # {(e_type, year): count}
        cnt = Counter([(e_type, year) for x, y, e_type, year in b_events.value
                       if x > xmin and x < xmax and y > ymin and y < ymax])
        counts = []
        for e in events:
            # {year: count}
            year_count = {key[1]: val for key, val in cnt.items() if key[0] == e}
            if len(year_count) == 0:
                counts.append("0.00")
                continue
            # [(year, count)], descending by count
            year_count_desc_c = sorted(year_count.items(),
                                       key=operator.itemgetter(1), reverse=True)
            current_max = year_count_desc_c[0][1]
            train_points = []  # (year, count)
            for y, c in year_count_desc_c:
                if c >= thres * current_max:
                    current_max = c
                    train_points.append((y, c))
            if len(train_points) < min_num_points:
                # use the most recent year for prediction, because we don't
                # have sufficient samples for model training
                year_count_desc_y = sorted(train_points,
                                           key=operator.itemgetter(0),
                                           reverse=True)
                counts.append("%.2f" % (year_count_desc_y[0][1] / 12.0))
            else:
                # linear regression for prediction
                x = np.array([v[0] for v in train_points])
                y = np.array([v[1] for v in train_points])
                m = lr.fit(x[:, np.newaxis], y)
                # predict expects a 2-D array in current sklearn
                counts.append("%.2f" % (m.predict(np.array([[2015]]))[0] / 12.0))
        ret.append(counts)
    return ret
def best_split_lin_reg(x_vect, y):
    node_lg = LinearRegression(n_jobs=NUM_CORES).fit(x_vect[:, np.newaxis], y)
    node_score = mse(y, node_lg.predict(x_vect[:, np.newaxis]))
    best_score = -np.inf
    best_split_value = None
    best_true_inds = None
    best_false_inds = None
    for split_value in np.unique(x_vect):
        true_inds = x_vect > split_value
        true_ratio = np.sum(true_inds) / float(len(y))
        true_score = ling_reg_score(true_inds, x_vect, y)
        false_inds = np.invert(true_inds)
        false_ratio = 1 - true_ratio
        false_score = ling_reg_score(false_inds, x_vect, y)
        score = node_score - (true_ratio * true_score + false_ratio * false_score)
        if score > best_score:
            best_score = score
            best_split_value = split_value
            best_true_inds = true_inds
            best_false_inds = false_inds
    return best_false_inds, best_true_inds, best_split_value, best_score
def predict_residuals(train, test, forward):
    """
    Linear Regression

    Args:
        train: training data
        test: test data
        forward: if True, regress axis 1 on axis 0; otherwise the reverse

    Returns:
        Residuals between the observed and estimated output
    """
    mdl = LinearRegression()
    # mdl = GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
    if forward:
        X = unpack(train, axis=0)
        y = unpack(train, axis=1)
        x_hat = unpack(test, axis=0)
        y_hat = unpack(test, axis=1)
    else:
        X = unpack(train, axis=1)
        y = unpack(train, axis=0)
        x_hat = unpack(test, axis=1)
        y_hat = unpack(test, axis=0)
    mdl.fit(X, y)
    return y_hat - mdl.predict(x_hat)
def do_stack_learn(self):
    reviews = AbstractEstimateBase.reviews
    # Collect several estimates
    es = np.array([
        self._usermodel.all_estimates(),
        self._similarmovie.all_estimates(k=1),
        self._similarmovie.all_estimates(k=2),
        self._similarmovie.all_estimates(k=3),
        self._similarmovie.all_estimates(k=4),
        self._similarmovie.all_estimates(k=5),
    ])

    total_error = 0.0
    coefficients = []
    reg = LinearRegression()
    # Iterate over all users
    for u in range(reviews.shape[0]):
        es0 = np.delete(es, u, axis=1)
        r0 = np.delete(reviews, u, axis=0)
        X, Y = np.where(r0 > 0)
        # index the estimates with the current user left out
        X = es0[:, X, Y]
        y = r0[r0 > 0]
        reg.fit(X.T, y)
        coefficients.append(reg.coef_)

        r0 = reviews[u]
        X = np.where(r0 > 0)
        p0 = reg.predict(es[:, u, X].squeeze().T)
        err0 = r0[r0 > 0] - p0
        total_error += np.dot(err0, err0)
    coefficients = np.array(coefficients)
    print(coefficients)
def predict_device_byday_linear_regression():
    X, Y_unique, Y_all, X_raw = load_device_counter_byday()
    # print(X)
    # print(Y_unique)
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    training_size = 160
    # model.fit(X[:training_size], Y_unique[:training_size])
    model.fit(X[:training_size], Y_all[:training_size])

    start_index = 180
    end_index = 190
    X_to_predict = X[start_index:end_index]
    # X_to_predict.append([date_str_toordinal('2017-04-18')])
    # X_to_predict.append([date_str_toordinal('2017-03-27')])
    print(X_to_predict)
    # Y_real = Y_unique[start_index:end_index]
    Y_real = Y_all[start_index:end_index]
    print(X_raw[start_index:end_index])

    y_predicted = model.predict(X_to_predict)
    # print(y_predicted)
    y_predicted = np.array(y_predicted).astype(int)
    print(y_predicted)
    print(Y_real)
    # print(y_predicted - np.array(Y_real))

    # plt.subplot(111)
    # plt.scatter(X_to_predict, Y_real, c='r')
    plt.scatter(X_to_predict, y_predicted)
    # plt.plot(X_to_predict, y_predicted)
    plt.show()
def setUp(self):
    # initialize an Image Database Object
    self.model_db = ModelDatabase('test')  # for test purposes
    self.x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).reshape(-1, 1)
    self.y = np.array([2, 4, 6, 8, 10, 12, 14, 16, 18, 20]).reshape(-1, 1)
    self.z = np.array([4, 8, 12, 16, 20, 24, 28, 32, 36, 40]).reshape(-1, 1)
    self.k = np.array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5]).reshape(-1, 1)
    self.l = np.array([-2, -4, -6, -8, -10, -12, -14, -16, -18, -20]).reshape(-1, 1)

    # y = 2x ; Assume id : '121A'
    self.regression_model1 = LinearRegression()
    self.regression_model1.fit(self.x, self.y)

    # z = 4x ; Assume id : '243'
    self.regression_model2 = LinearRegression()
    self.regression_model2.fit(self.x, self.z)

    # k = 5 ; Assume id : '392'
    self.regression_model3 = LinearRegression()
    self.regression_model3.fit(self.x, self.k)

    # l = -2x ; Assume id : '41A3'
    self.regression_model4 = LinearRegression()
    self.regression_model4.fit(self.x, self.l)
def get_tracks_params(self, x, y, labels, sample_weight=None):
    tracks_params = []
    unique_labels = numpy.unique(labels)
    track_ids = unique_labels[unique_labels != -1]
    if len(track_ids) == 0:
        return []
    for track_id in track_ids:
        x_track = x[labels == track_id]
        y_track = y[labels == track_id]
        if sample_weight is not None:
            sample_weight_track = sample_weight[labels == track_id]
        else:
            sample_weight_track = None
        lr = LinearRegression()
        lr.fit(x_track.reshape(-1, 1), y_track, sample_weight_track)
        params = list(lr.coef_) + [lr.intercept_]
        tracks_params.append(params)
    return numpy.array(tracks_params)
def plot_EFA_relationships(all_results):
    EFA_all_results = {k: v.EFA for k, v in all_results.items()}
    scores = {k: v.get_scores() for k, v in EFA_all_results.items()}
    # quantify relationships using linear regression
    for name1, name2 in combinations(scores.keys(), 2):
        scores1 = scores[name1]
        scores2 = scores[name2]
        lr = LinearRegression()
        cv_score = np.mean(cross_val_score(lr, scores1, scores2, cv=10))
        print(name1, name2, cv_score)

    # plot
    # plot task factors in task PCA space
    pca = PCA(2)
    task_pca = pca.fit_transform(scores['task'])
    palettes = ['Reds', 'Blues', 'Greens']
    all_colors = []
    # plot scores in task PCA space
    f, ax = plt.subplots(figsize=[12, 8])
    ax.set_facecolor('white')

    for k, v in scores.items():
        palette = sns.color_palette(palettes.pop(), n_colors=len(v.columns))
        all_colors += palette
        lr = LinearRegression()
        lr.fit(task_pca, v)
        for i, coef in enumerate(lr.coef_):
            plt.plot([0, coef[0]], [0, coef[1]], linewidth=3,
                     c=palette[i], label=k + '_' + str(v.columns[i]))
    leg = plt.legend(bbox_to_anchor=(.8, .5))
    frame = leg.get_frame()
    frame.set_color('black')
    beautify_legend(leg, all_colors)
def lr_prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model):
    # Dataframe to store the model prediction
    df_model_lr = df_train.copy()
    for col in col_names:
        # X will store the features and the outcome Y
        X = df_train.copy()
        X = X.rename(columns={col: 'Y'})
        X = pd.merge(X, df_day_avg_values[[col]], left_on='day_time',
                     right_index=True)
        X = X.rename(columns={col: col + 'avg'})
        # Building the neighbors (from adjacency list) with missing values
        # filled as in model
        neighbors_col = ['S' + str(n) for n in adjacency_list[int(col[1:])]]
        X = X[['Y']].join(df_model[neighbors_col])
        X_train = X[X['Y'] != -1]
        X_test = X[X['Y'] == -1]
        test_indices = X[X['Y'] == -1].index
        col_values = X['Y']
        if len(X_test):
            # Models
            lr = LinearRegression()
            lr = lr.fit(X_train.drop('Y', axis=1), X_train.Y)
            col_values.loc[test_indices] = lr.predict(X_test.drop('Y', axis=1))
        # Filling the result with the current sensor prediction
        df_model_lr[col] = np.round(col_values)
    return df_model_lr
def LinearRegressionPred(X, Y):
    lm = LinearRegression()
    lm.fit(X, Y)
    preds = lm.predict(X)
    # predict on X sorted along axis 0 so a fitted-line plot is monotone in x
    preds_sorted = lm.predict(np.sort(X, 0))
    return preds_sorted
# all columns have p-values below 5%
# now perform linear regression
# first, standardize the features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features = sc.fit_transform(features)

# train test split
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, random_state=0, test_size=0.005)

# now perform multiple linear regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(features_train, labels_train)

pred = regressor.predict(features_test)
print(pd.DataFrame({'actual': labels_test, 'pred': pred}))
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# load the data
veriler = pd.read_csv('maaslar.csv')

x = veriler.iloc[:, 1:2]
y = veriler.iloc[:, 2:]
X = x.values
Y = y.values

# linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, Y)

plt.scatter(X, Y, color='red')
plt.plot(x, lin_reg.predict(X), color='blue')
plt.show()

# polynomial regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=2)
x_poly = poly_reg.fit_transform(X)
print(x_poly)
lin_reg2 = LinearRegression()
lin_reg2.fit(x_poly, y)

plt.scatter(X, Y, color='red')
import numpy
import matplotlib.pyplot as plt

from ages_net_worths import ageNetWorthData

ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(ages_train, net_worths_train)

### get Katie's net worth (she's 27)
### sklearn predictions are returned in an array, so you'll want to index into
### the output to get what you want, e.g. net_worth = predict([[27]])[0] (not
### exact syntax, the point is the [0] at the end). In addition, make sure the
### argument to your prediction function is in the expected format - if you get
### a warning about needing a 2d array for your data, a list of lists will be
### interpreted by sklearn as such (e.g. [[27]]).
km_net_worth = reg.predict([[27]])  ### fill in the line of code to get the right value

### get the slope
### again, you'll get a 2-D array, so stick the [0][0] at the end
slope = reg.coef_[0, 0]  ### fill in the line of code to get the right value

### get the intercept
### here you get a 1-D array, so stick [0] on the end to access
### the info we want
intercept = reg.intercept_[0]  ### fill in the line of code to get the right value
#! /usr/bin/env python3
# coding : utf-8

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split

clf = LinearRegression()
# note: load_boston was removed in scikit-learn 1.2; this snippet targets
# older releases
data = datasets.load_boston()
X = data.data
y = data.target

# X_train = X[:int(len(X) * 0.7)]
# X_test = X[int(len(X) * 0.7):]
# y_train = y[:int(len(y) * 0.7)]
# y_test = y[int(len(y) * 0.7):]

X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    test_size=0.2)

clf.fit(X_train, y_train)
print(clf.coef_, clf.intercept_)
print(clf.score(X_test, y_test))

# from sklearn import datasets
# from sklearn.model_selection import cross_val_predict
# from sklearn import linear_model
# import matplotlib.pyplot as plt

# lr = linear_model.LinearRegression()
# boston = datasets.load_boston()
advert = pd.read_excel("灰度表1.xlsx")
dataSet = pd.read_excel("预测表2.xlsx")
columns = [
    '线路价格(不含税)', '总里程', '业务类型', '需求类型1', '需求类型2', '是否续签',
    '车辆长度', '车辆吨位', '打包类型', '运输等级', '计划卸货等待时长', '计划运输时长',
    '线路总成本', '需求紧急程度'
]
advert = advert[columns]
dataSet = dataSet[columns[1:]]
dataSet = fillNaN(dataSet)
advert = fillNaN(advert)
advert.columns = columns
col = columns[1:]
X = advert[col]
y = advert['线路价格(不含税)']

lm1 = LinearRegression()
lm1.fit(X, y)
lm1_predict = lm1.predict(X[col])
print("R^2 lm1:", r2_score(y, lm1_predict))
# print(lm1.intercept_)

# convert the model coefficients to a list for easier computation
nparr = lm1.coef_.tolist()
lis = []
dataSet.columns = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'
]
for i in range(1489):
    count = 0.0
    for j in range(13):
        count += dataSet[str(j)][i] * nparr[j]
    lis.append([(count + lm1.intercept_), dataSet['12'][i]])
dataWrite(lis, 4)
def api_return():
    if 'id' in request.args:
        id = int(request.args['id'])
        month = 12
        year = 19
        listing = listings.query.get(id)
        property_type = listing.property_type.property_type
        room_type = listing.room_types
        neighbourhood = listing.neighborhood.neighborhood
        accommodates = listing.accommodates
        bedrooms = listing.bedrooms
        bathrooms = listing.bathrooms
        beds = listing.beds

        df = pd.DataFrame(
            columns=['month', 'year', 'property_type', 'room_type',
                     'neighbourhood', 'accommodates', 'bedrooms', 'bathrooms',
                     'beds'],
            data=[[month, year, property_type, room_type, neighbourhood,
                   accommodates, bedrooms, bathrooms, beds]]
        )

        train = pd.read_csv('https://raw.githubusercontent.com/JimKing100/airbnb-app-4/master/Datascience/data/train.csv')
        train = train.drop(columns=['old_index'])

        target = 'price'
        features = train.columns.drop(target)
        X_train = train[features]
        y_train = train[target]

        pipeline = make_pipeline(
            ce.OrdinalEncoder(),
            LinearRegression()
        )
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(df)
        y_pred = y_pred[0]

        transformers = make_pipeline(
            ce.OrdinalEncoder(),
            SimpleImputer(strategy='mean')
        )
        X_train_transformed = transformers.fit_transform(X_train)

        model = LinearRegression()
        model.fit(X_train_transformed, y_train)

        pro, con = explains(df, model, X_train_transformed, transformers, 0)

        output_str = jsonify(prediction=str(int(y_pred)),
                             pros1=pro[0],
                             pros2=pro[1],
                             cons1=con[0],
                             cons2=con[1])
    else:
        return "Error: no id field provided"

    return output_str
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3,
                                                    random_state=0)

# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Visualising the Training set results
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='green')
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
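# Hedged addition (not in the original script): the classic continuation of
# this example also visualizes the test-set results against the same fitted
# line; a minimal sketch, assuming the variables defined above.
plt.scatter(X_test, y_test, color='red')
plt.plot(X_train, regressor.predict(X_train), color='green')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()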
X = dataset.iloc[:, :-1].values
y = dataset.values[:, 4]

ct_X = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])],
                         remainder='passthrough')
X = np.array(ct_X.fit_transform(X), dtype=float)

# Avoiding the dummy variable trap
X = X[:, 1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the results
y_pred = regressor.predict(X=X_test)

# Building the optimal model with backward elimination
# setup variables for loop
X = np.append(arr=np.ones((len(X), 1)).astype(float), values=X, axis=1)
columns = [0, 1, 2, 3, 4, 5]  # used columns
critical_value = 0.1
running = True  # This is working

while running:
    # Set up the new smaller set with more significant variables
# Multiple linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# fit_intercept = whether to fit a constant term
# in y = a + bX, a is the constant term
X = wine.drop(["index", "type", "quality"], axis=1)
y = wine.quality
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                    random_state=1)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Performance measurement
# RMSE
np.round(np.sqrt(mean_squared_error(y_test, y_pred)), 5)

#######################################################
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=0.05)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
np.round(np.sqrt(mean_squared_error(y_test, y_pred)), 5)

fig = plt.figure(figsize=(6, 3))
ax = fig.add_subplot(111)
from twitterscraper import query_tweets_from_user
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression

sid = SentimentIntensityAnalyzer()
model = LinearRegression()
features = []
labels = []

# Increase the number of tweets
all_tweets = query_tweets_from_user("barackobama", limit=800)

# Now train with 600 tweets
training = all_tweets[:600]
testing = all_tweets[600:]

for tweet in training:
    tweetAnalysis = sid.polarity_scores(tweet.text)
    # Divide the number of retweets and likes by 1000 and convert to an
    # integer, so that 30,000 and 30,500 both become 30.
    # Multiply the probabilities to work with larger numbers; 0.10 becomes 10.
    features.append([
        int(tweetAnalysis["neg"] * 100),
        int(tweetAnalysis["pos"] * 100),
        int(tweetAnalysis["neu"] * 100)
    ])
    labels.append(int(tweet.likes / 1000))

model = model.fit(features, labels)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 10
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier as MLP
import pandas_datareader.data as web
from pandas import Series, DataFrame
from sklearn.linear_model import LinearRegression
import datetime, math
from sklearn.neighbors import KNeighborsRegressor as knn
import matplotlib.dates as mdates

clf = LinearRegression()  # n_jobs=-1
days = 240
sight = 480
scaler = MinMaxScaler(feature_range=(0, 1))

start = datetime.datetime(2010, 1, 1)
end = datetime.datetime.today() + datetime.timedelta(days=days)
dayss = (end - start).days

predicted_list = [end - datetime.timedelta(days=x) for x in range(days)]
predicted_list.reverse()

stock = input("Stock: ").upper()
df = web.DataReader(stock, 'yahoo', start, end)
data = df['Adj Close']
X, y = [], []
# Splitting the dataset into the Training set and Test set
"""from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Linear Regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

# Visualising the Linear Regression results
plt.scatter(X, y, color='red')
plt.plot(X, lin_reg.predict(X), color='blue')
plt.title('Truth or Bluff (Linear Regression)')
plt.xlabel('Position level')
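# Hedged aside (not in the original script): the PolynomialFeatures +
# LinearRegression pair above can be fused into one estimator with a
# Pipeline, so the transform and the fit stay in sync at predict time.
# A minimal sketch, assuming X and y as defined above.
from sklearn.pipeline import make_pipeline
poly_model = make_pipeline(PolynomialFeatures(degree=4), LinearRegression())
poly_model.fit(X, y)
# poly_model.predict(...) applies the polynomial expansion automatically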
feature_cols = ["age", "sex", "bmi", "children", "smoker", "region"] target = "charges" #separating feature attributes and target attribute y = df[target].values X = df[feature_cols].values #80/20 hold out approach #splitting data into training and testing X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) #training and fitting the model reg = LinearRegression() reg.fit(X_train, y_train) X_input = [[12, 1, 28, 0, 1, 2]] y_pred = reg.predict(X_test) #plotting actual vs predicted. First n instances df1 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}) n = 25 df1 = df1.head(n) df1.plot(kind='bar', figsize=(10, 8)) plt.grid(which='major', linestyle='-', linewidth='0.5', color='green') plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black') plt.title('Actual vs Predicted Values') plt.xlabel('Instance') plt.ylabel('Charges') plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import LinearRegression

df = pd.read_csv("train.csv")

# Specify the feature columns and the target
# The target is SalePrice
X = df[["OverallQual", "GrLivArea"]].values
y = df["SalePrice"].values

# Fit the linear regression
slr = LinearRegression()
slr.fit(X, y)

# Show the regression coefficients and the intercept
print("Coefficient : {0}".format(slr.coef_))
a1, a2 = slr.coef_
print("intercepts : {0}".format(slr.intercept_))
b = slr.intercept_

# 3D plot (draw the regression plane)
x, y, z = np.array(df["OverallQual"]), np.array(df["GrLivArea"]), np.array(
    df["SalePrice"].values)

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter3D(np.ravel(x), np.ravel(y), np.ravel(z))
def clusteringPoints(train_data, if_show_cluster, Y1, Y2, height, width, side):
    # ===============================================
    num_cluster_choices = [1, 2, 3, 4]
    # ===============================================
    # Initialize Average Score
    best_avg_score = np.inf
    # ===============================================
    # Start to select number of clusters
    for num_cluster in num_cluster_choices:
        # ===============================================
        # Do Clustering
        cluster = GaussianMixture(n_components=num_cluster,
                                  covariance_type="full")
        gmm = cluster.fit(train_data)
        labels = gmm.predict(train_data)
        # ===============================================
        # Prepare Validation
        scores = []
        X1_all = []
        X2_all = []
        # ===============================================
        # Start Validation
        for label in range(num_cluster):
            indices = np.where(labels == label)[0].tolist()
            # ===============================================
            # Regression on clusters that have more than 1 point
            if len(indices) > 1:
                reg_data = train_data[indices]
                reg = LinearRegression().fit(reg_data[:, 0].reshape(-1, 1),
                                             reg_data[:, 1])
                # ===============================================
                # Get coefficients
                k = reg.coef_[0]
                b = reg.intercept_
                # ===============================================
                # Avoid bad k-values
                if (side == "L" and k > 0) or (side == "R" and k < 0):
                    scores.append([-1, 1000])
                    break
                # ===============================================
                # Get the fit score
                score = reg.score(reg_data[:, 0].reshape(-1, 1), reg_data[:, 1])
                # ===============================================
                # Scores
                scores.append([label, score])
                # ===============================================
                # Get drawing points for X coordinate
                X1 = (Y1 - b) / k
                X2 = (Y2 - b) / k
                # ===============================================
                # Append X coordinates to lists
                X1_all.append([label, X1])
                X2_all.append([label, X2])
        # ===============================================
        # Update Info
        # ===============================================
        # Check if this is a good clustering
        avg_score = sum(pair[1] for pair in scores) / len(scores)
        if abs(avg_score - 1) <= abs(best_avg_score - 1):
            # ===============================================
            # Update
            best_avg_score = avg_score
            best_scores = scores
            best_num_cluster = num_cluster
            best_X1_all = X1_all
            best_X2_all = X2_all
            best_labels = labels

    # ===============================================
    # Try to reduce the number of clusters
    while len(best_scores) > 1:
        # ===============================================
        # Initialization & n choose 2
        if_reduced = False
        label_comb = combinations(list(set(best_labels)), 2)
        # ===============================================
        # Try to merge vassal_label into suzerain_label
        for vassal_label, suzerain_label in label_comb:
            # ===============================================
            # Get Merged Indices
            vassal_indices = np.where(best_labels == vassal_label)[0].tolist()
            suzerain_indices = np.where(best_labels == suzerain_label)[0].tolist()
            indices = np.concatenate((vassal_indices, suzerain_indices))
            # ===============================================
            # Do Regression for merged Data
            reg_data = train_data[indices]
            reg = LinearRegression().fit(reg_data[:, 0].reshape(-1, 1),
                                         reg_data[:, 1])
            score = reg.score(reg_data[:, 0].reshape(-1, 1), reg_data[:, 1])
            # ===============================================
            # After merging two clusters, what are the scores for all clusters?
            scores = [pair for pair in best_scores
                      if (pair[0] != vassal_label and pair[0] != suzerain_label)]
            scores.append([suzerain_label, score])
            # ===============================================
            # Get Average
            avg_score = sum(pair[1] for pair in scores) / len(scores)
            # ===============================================
            # Also need to worry about k
            temp_k = reg.coef_[0]
            if (side == "L" and temp_k <= 0) or (side == "R" and temp_k >= 0):
                valid_k = True
            else:
                valid_k = False
            # ===============================================
            # Does this merging actually help us?
            # 0.03 is tolerance because we want to merge some clusters
            if (abs(avg_score - 1) <= abs(best_avg_score - 1) + 0.03) and valid_k:
                # ===============================================
                # Save information if we want to use this merge later
                k = temp_k
                b = reg.intercept_
                new_X1 = (Y1 - b) / k
                new_X2 = (Y2 - b) / k
                removed_label = vassal_label
                merged_label = suzerain_label
                # ===============================================
                # Useful for the next loop round
                best_avg_score = avg_score
                if_reduced = True
        # ===============================================
        # If the number of clusters was reduced, we use the best merging
        if if_reduced:
            # ===============================================
            # Update information for drawing (keep best_labels an array so
            # np.where keeps working on the next iteration)
            best_num_cluster = best_num_cluster - 1
            best_labels = np.array([merged_label if item == removed_label else item
                                    for item in best_labels])
            best_X1_all = [pair for pair in best_X1_all
                           if (pair[0] != removed_label and pair[0] != merged_label)]
            best_X2_all = [pair for pair in best_X2_all
                           if (pair[0] != removed_label and pair[0] != merged_label)]
            best_X1_all.append([merged_label, new_X1])
            best_X2_all.append([merged_label, new_X2])
        # ===============================================
        # If not reduced, we just keep what we have before
        else:
            break

    # ===============================================
    # If we want to see clusters
    if if_show_cluster:
        plt.figure()
        plt.axis([0, width, 0, height])
        plt.scatter(train_data[:, 0], train_data[:, 1], c=best_labels,
                    cmap="viridis")
        plt.gca().invert_yaxis()

    return ([pair[1] for pair in best_X1_all],
            [pair[1] for pair in best_X2_all],
            len(list(set(best_labels))))
def auto_arima(y, X=None, start_p=2, d=None, start_q=2, max_p=5,
               max_d=2, max_q=5, start_P=1, D=None, start_Q=1, max_P=2,
               max_D=1, max_Q=2, max_order=5, m=1, seasonal=True,
               stationary=False, information_criterion='aic', alpha=0.05,
               test='kpss', seasonal_test='ocsb', stepwise=True, n_jobs=1,
               start_params=None, trend=None, method='lbfgs', maxiter=50,
               offset_test_args=None, seasonal_test_args=None,
               suppress_warnings=True, error_action='trace', trace=False,
               random=False, random_state=None, n_fits=10,
               return_valid_fits=False, out_of_sample_size=0, scoring='mse',
               scoring_args=None, with_intercept="auto",
               sarimax_kwargs=None, **fit_args):
    # NOTE: Doc is assigned BELOW this function

    # Temporary shim until we remove `exogenous` support completely
    X, fit_args = pm_compat.get_X(X, **fit_args)

    # pop out the deprecated kwargs
    fit_args = _warn_for_deprecations(**fit_args)

    # misc kwargs passed to various fit or test methods
    offset_test_args = val.check_kwargs(offset_test_args)
    seasonal_test_args = val.check_kwargs(seasonal_test_args)
    scoring_args = val.check_kwargs(scoring_args)
    sarimax_kwargs = val.check_kwargs(sarimax_kwargs)

    m = val.check_m(m, seasonal)
    trace = val.check_trace(trace)

    # can't have stepwise AND parallel
    n_jobs = val.check_n_jobs(stepwise, n_jobs)

    # validate start/max points
    start_p, max_p = val.check_start_max_values(start_p, max_p, "p")
    start_q, max_q = val.check_start_max_values(start_q, max_q, "q")
    start_P, max_P = val.check_start_max_values(start_P, max_P, "P")
    start_Q, max_Q = val.check_start_max_values(start_Q, max_Q, "Q")

    # validate d & D
    for _d, _max_d in ((d, max_d), (D, max_D)):
        if _max_d < 0:
            raise ValueError('max_d & max_D must be positive integers (>= 0)')
        if _d is not None:
            if _d < 0:
                raise ValueError('d & D must be None or a positive '
                                 'integer (>= 0)')

    # check on n_iter
    if random and n_fits < 0:
        raise ValueError('n_iter must be a positive integer '
                         'for a random search')

    # validate error action
    actions = {'warn', 'raise', 'ignore', 'trace', None}
    if error_action not in actions:
        raise ValueError('error_action must be one of %r, but got %r'
                         % (actions, error_action))

    # start the timer after the parameter validation
    start = time.time()

    # copy array
    y = check_endog(y, dtype=DTYPE)
    n_samples = y.shape[0]

    # the workhorse of the model fits
    fit_partial = functools.partial(
        solvers._fit_candidate_model,
        start_params=start_params,
        trend=trend,
        method=method,
        maxiter=maxiter,
        fit_params=fit_args,
        suppress_warnings=suppress_warnings,
        trace=trace,
        error_action=error_action,
        scoring=scoring,
        out_of_sample_size=out_of_sample_size,
        scoring_args=scoring_args,
        information_criterion=information_criterion,
    )

    # check for constant data
    if is_constant(y):
        warnings.warn('Input time-series is completely constant; '
                      'returning a (0, 0, 0) ARMA.')
        return _return_wrapper(
            solvers._sort_and_filter_fits(
                fit_partial(
                    y,
                    X=X,
                    order=(0, 0, 0),
                    seasonal_order=(0, 0, 0, 0),
                    with_intercept=val.auto_intercept(
                        with_intercept, False),  # False for the constant model
                    **sarimax_kwargs
                )
            ),
            return_valid_fits, start, trace)

    information_criterion = \
        val.check_information_criterion(information_criterion,
                                        out_of_sample_size)

    # the R code handles this, but I don't think statsmodels
    # will even fit a model this small...
    # if n_samples <= 3:
    #     if information_criterion != 'aic':
    #         warnings.warn('n_samples (%i) <= 3 '
    #                       'necessitates using AIC' % n_samples)
    #     information_criterion = 'aic'

    # adjust max p, q -- R code:
    # max.p <- min(max.p, floor(serieslength/3))
    # max.q <- min(max.q, floor(serieslength/3))
    max_p = int(min(max_p, np.floor(n_samples / 3)))
    max_q = int(min(max_q, np.floor(n_samples / 3)))

    # this is not in the R code and poses a risk that R did not consider...
    # if max_p|q has now dropped below start_p|q, correct it.
    start_p = min(start_p, max_p)
    start_q = min(start_q, max_q)

    # if it's not seasonal, we can avoid multiple 'if not is None' comparisons
    # later by just using this shortcut (hack):
    # TODO: can we remove this hack now?
    if not seasonal:
        D = m = -1

    # TODO: check rank deficiency, check for constant Xs, regress if necessary
    xx = y.copy()
    if X is not None:
        lm = LinearRegression().fit(X, y)
        xx = y - lm.predict(X)

    # choose the order of differencing
    # is the TS stationary?
    if stationary:
        d = D = 0

    # todo: or not seasonal ?
    if m == 1:
        D = max_P = max_Q = 0  # m must be > 1 for nsdiffs
    elif D is None:  # we don't have a D yet and we need one (seasonal)
        D = nsdiffs(xx, m=m, test=seasonal_test, max_D=max_D,
                    **seasonal_test_args)

        if D > 0 and X is not None:
            diffxreg = diff(X, differences=D, lag=m)
            # check for constance on any column
            if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any():
                D -= 1

    # D might still be None if not seasonal
    if D > 0:
        dx = diff(xx, differences=D, lag=m)
    else:
        dx = xx

    # If D was too big, we might have gotten rid of x altogether!
    if dx.shape[0] == 0:
        raise ValueError("The seasonal differencing order, D=%i, was too "
                         "large for your time series, and after differencing, "
                         "there are no samples remaining in your data. "
                         "Try a smaller value for D, or if you didn't set D "
                         "to begin with, try setting it explicitly. This can "
                         "also occur in seasonal settings when m is too large."
                         % D)

    # difference the exogenous matrix
    if X is not None:
        if D > 0:
            diffxreg = diff(X, differences=D, lag=m)
        else:
            diffxreg = X
    else:
        # here's the thing... we're only going to use diffxreg if exogenous
        # was not None in the first place. However, PyCharm doesn't know that
        # and it thinks we might use it before assigning it. Therefore, assign
        # it to None as a default value and it won't raise the warning anymore.
        diffxreg = None

    # determine/set the order of differencing by estimating the number of
    # orders it would take in order to make the TS stationary.
    if d is None:
        d = ndiffs(
            dx,
            test=test,
            alpha=alpha,
            max_d=max_d,
            **offset_test_args,
        )

        if d > 0 and X is not None:
            diffxreg = diff(diffxreg, differences=d, lag=1)

            # if any columns are constant, subtract one order of differencing
            if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any():
                d -= 1

    # check differences (do we want to warn?...)
    if not suppress_warnings:  # TODO: context manager for entire block  # noqa: E501
        val.warn_for_D(d=d, D=D)

    if d > 0:
        dx = diff(dx, differences=d, lag=1)

    # check for constant
    if is_constant(dx):
        ssn = (0, 0, 0, 0) if not seasonal \
            else sm_compat.check_seasonal_order((0, D, 0, m))

        # Include the benign `ifs`, because R's auto.arima does. R has some
        # more options to control that we don't, but this is more readable
        # with a single `else` clause than a complex `elif`.
        if D > 0 and d == 0:
            with_intercept = val.auto_intercept(with_intercept, True)
            # TODO: if ever implemented in sm
            # fixed=mean(dx/m, na.rm = TRUE)
        elif D > 0 and d > 0:
            pass
        elif d == 2:
            pass
        elif d < 2:
            with_intercept = val.auto_intercept(with_intercept, True)
            # TODO: if ever implemented in sm
            # fixed=mean(dx, na.rm = TRUE)
        else:
            raise ValueError('data follow a simple polynomial and are not '
                             'suitable for ARIMA modeling')

        # perfect regression
        return _return_wrapper(
            solvers._sort_and_filter_fits(
                fit_partial(
                    y,
                    X=X,
                    order=(0, d, 0),
                    seasonal_order=ssn,
                    with_intercept=with_intercept,
                    **sarimax_kwargs
                )
            ),
            return_valid_fits, start, trace
        )

    # seasonality issues
    if m > 1:
        if max_P > 0:
            max_p = min(max_p, m - 1)
        if max_Q > 0:
            max_q = min(max_q, m - 1)

    # TODO: if approximation
    # . we need method='css' or something similar for this

    # R determines whether to use a constant like this:
    # allowdrift <- allowdrift & (d + D) == 1
    # allowmean <- allowmean & (d + D) == 0
    # constant <- allowdrift | allowmean
    # but we don't have `allowdrift` or `allowmean` so use just d and D
    if with_intercept == 'auto':
        with_intercept = (d + D) in (0, 1)

    if not stepwise:
        # validate max_order
        if max_order is None:
            max_order = np.inf
        elif max_order < 0:
            raise ValueError('max_order must be None or a positive '
                             'integer (>= 0)')

        search = solvers._RandomFitWrapper(
            y=y,
            X=X,
            fit_partial=fit_partial,
            d=d,
            D=D,
            m=m,
            max_order=max_order,
            max_p=max_p,
            max_q=max_q,
            max_P=max_P,
            max_Q=max_Q,
            random=random,
            random_state=random_state,
            n_fits=n_fits,
            n_jobs=n_jobs,
            seasonal=seasonal,
            trace=trace,
            with_intercept=with_intercept,
            sarimax_kwargs=sarimax_kwargs,
        )
    else:
        if n_samples < 10:
            start_p = min(start_p, 1)
            start_q = min(start_q, 1)
            start_P = start_Q = 0

        # seed p, q, P, Q vals
        p = min(start_p, max_p)
        q = min(start_q, max_q)
        P = min(start_P, max_P)
        Q = min(start_Q, max_Q)

        # init the stepwise model wrapper
        search = solvers._StepwiseFitWrapper(
            y,
            X=X,
            start_params=start_params,
            trend=trend,
            method=method,
            maxiter=maxiter,
            fit_params=fit_args,
            suppress_warnings=suppress_warnings,
            trace=trace,
            error_action=error_action,
            out_of_sample_size=out_of_sample_size,
            scoring=scoring,
            scoring_args=scoring_args,
            p=p,
            d=d,
            q=q,
            P=P,
            D=D,
            Q=Q,
            m=m,
            max_p=max_p,
            max_q=max_q,
            max_P=max_P,
            max_Q=max_Q,
            seasonal=seasonal,
            information_criterion=information_criterion,
            with_intercept=with_intercept,
            **sarimax_kwargs,
        )

    sorted_res = search.solve()
    return _return_wrapper(sorted_res, return_valid_fits, start, trace)
features = data.iloc[:, 3:].values
labels = data.iloc[:, 2].values

from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.1, random_state=0)

# scaling of data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features_train = sc.fit_transform(features_train)
features_test = sc.transform(features_test)

# system is trained using linear regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(features_train, labels_train)

# prediction done over test data
pred = regressor.predict(features_test)

# prediction over next lot of data
x = pd.read_csv("rfmmed.csv")
testd = x.iloc[99:, 1:]
testd = sc.transform(testd)
pred1 = regressor.predict(testd)

score = regressor.score(features_train, labels_train)
print(score)
score = regressor.score(features_test, labels_test)
print(score)
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

x, y = mglearn.datasets.load_extended_boston()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
lr = LinearRegression().fit(x_train, y_train)

print("Training set score : %f" % lr.score(x_train, y_train))
print("Test set score : %f" % lr.score(x_test, y_test))
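# Hedged follow-up (not in the original script): on the extended Boston data
# a large train/test gap like the one above is the textbook sign of
# overfitting, and the usual next step is a regularized model. A minimal
# sketch, assuming the same split as above.
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0).fit(x_train, y_train)
print("Ridge training set score : %f" % ridge.score(x_train, y_train))
print("Ridge test set score : %f" % ridge.score(x_test, y_test))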
#
# Indeed, :class:`~sklearn.linear_model.LinearRegression` is a least squares
# approach minimizing the mean squared error (MSE) between the training and
# predicted targets. In contrast,
# :class:`~sklearn.linear_model.QuantileRegressor` with `quantile=0.5`
# minimizes the mean absolute error (MAE) instead.
#
# Let's first compute the training errors of such models in terms of mean
# squared error and mean absolute error. We will use the asymmetric Pareto
# distributed target to make it more interesting as mean and median are not
# equal.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

linear_regression = LinearRegression()
quantile_regression = QuantileRegressor(quantile=0.5, alpha=0)

y_pred_lr = linear_regression.fit(X, y_pareto).predict(X)
y_pred_qr = quantile_regression.fit(X, y_pareto).predict(X)

print(f"""Training error (in-sample performance)
    {linear_regression.__class__.__name__}:
    MAE = {mean_absolute_error(y_pareto, y_pred_lr):.3f}
    MSE = {mean_squared_error(y_pareto, y_pred_lr):.3f}
    {quantile_regression.__class__.__name__}:
    MAE = {mean_absolute_error(y_pareto, y_pred_qr):.3f}
    MSE = {mean_squared_error(y_pareto, y_pred_qr):.3f}
    """)

# %%
predictors = ["Dietary Calories (cal)", "Steps (count)"] #lag Y variable, because our weight in the morning is function of what we did yesterday missing = df.loc[1][target] #save the first value Y = df[target].shift(-1).dropna().values #shift removes the last value Y = np.append(missing, Y) #attach the last value back onto the Y array #impute missing values X = df[predictors] from sklearn.preprocessing import Imputer imp = Imputer(missing_values=0, strategy='mean', axis=0) X = imp.fit_transform(X) #traing the machine learning model from sklearn.linear_model import LinearRegression ols = LinearRegression(fit_intercept=True, normalize=False) ols.fit(X, Y) #reporting from the model coefs = ols.coef_ inter = round(ols.intercept_, 3) print("-" * 15 + "model intercept" + "-" * 15) print("all else constant the model predicts that {t} should be {i}".format( t=target, i=inter)) print("-" * 15 + "predictor variables coefficients" + "-" * 15) for i, var in enumerate(predictors): print("for one unit increase in {v} your model predicts:".format(v=var)) print("a {c} change in {t}".format(c=coefs[i], t=target))
    'ACTION': 'ACTION',
    'THRILLER': 'SUSPENSE',
    'ANIMATION': 'ANIMATION',
    'ROMANTIC COMEDY': 'COMEDY',
    'ROMANTIC DRAMA': 'DRAMA',
    'ROMANCE': 'DRAMA',
    'DOCUMENTARY': 'DOCUMENTARY'
})

# prediction
X_train = df_train.iloc[:, 2:6].values
y_train = df_train['OBO'].values
X_test = df_test.iloc[:, 2:6].values

mod_reg = LinearRegression()
mod_reg.fit(X_train, y_train)
# y_pred = mod_reg.predict(X_test)

# APP
app = dash.Dash()

colors = {
    'background': '#111111',
    'text': '#808080'
}

available_indicators = np.array(['OBO', 'UA_T', 'TA_T', 'DI_T', 'FC_T'])

markdown_text = '''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

myData = pd.read_excel('data.xlsx')

# split datasets: 90% training data & 10% test data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    myData['ukuran'], myData['harga'], test_size=.1)

# linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# training (note: fits on the full dataset; the split above is left unused)
model.fit(myData[['ukuran']], myData['harga'])

# save model: pickle
import pickle
with open('1_modelPickle', 'wb') as modelku:
    pickle.dump(model, modelku)
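# Hedged companion sketch (not in the original script): reload the pickled
# model and use it for a prediction; the input value 100 is hypothetical.
with open('1_modelPickle', 'rb') as f:
    model_loaded = pickle.load(f)
print(model_loaded.predict([[100]]))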
def __init__(self, k=3, buffer_ratio=4, allow_sub_k=True):
    self.k = k
    self.buffer_ratio = buffer_ratio
    self.model = LinearRegression()
    self.variance = 3.
    self.allow_sub_k = allow_sub_k
# test_predict_plot = np.empty_like(data)
# test_predict_plot[:, :] = np.nan
# test_predict_plot[len(train_predict)+(lags*2)+1:len(data)-1, :] = test_predict
# plt.plot(data, label='Observed', color='blue')
# plt.plot(train_predict_plot, label='Training-data forecast', color='red', alpha=0.5)
# plt.plot(test_predict_plot, label='Test-data forecast', color='yellow')
# plt.legend(loc='best')
# plt.show()
# plt.savefig('Grafico MLP Lag' + str(lags) + '.png')

from sklearn.linear_model import LinearRegression

rl = LinearRegression().fit(X_train, y_train)
rl_trainscore = rl.score(X_train, y_train)
rl_testscore = rl.score(X_test, y_test)
rl_predicttest = rl.predict(X_test)
rl_predicttrain = rl.predict(X_train)
rl_r2 = pearsonr(y_test, rl_predicttest)
# rl_r2 = r2_score(y_test, rl_predicttest)
# rl_rmse = mean_squared_error(y_test, rl_predicttest)
# rl_mae = mean_absolute_error(y_test, rl_predicttest)

print("Starting linear regression")
rl_r2_train = cross_val_score(rl, X_train, y_train, cv=splits, scoring=my_scorer)
rl_r2_test = cross_val_score(rl, X_test, y_test, cv=splits, scoring=my_scorer)
rl_mse_train = cross_val_score(rl, X_train, y_train, cv=splits,
                               scoring='neg_mean_squared_error')
rl_mae_train = cross_val_score(rl, X_train, y_train, cv=splits,
                               scoring='neg_mean_absolute_error')
rnd_reg.fit(X_train, y_train)
y_pred_rf = rnd_reg.predict(X_test)

# R-squared score
from sklearn.metrics import r2_score
result_score_random_reg = r2_score(y_test, y_pred_rf)
print('result score for random forest regression is ', result_score_random_reg)

####################################################################################################################################################
# Linear Regression
# Fitting Linear Regression to the dataset
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

y_pred_linear = lin_reg.predict(X_test)
result_score_linear_prediction = r2_score(y_test, y_pred_linear)
print('result score for linear regression is ', result_score_linear_prediction)

####################################################################################################################################################
# Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)
lin_reg_2 = LinearRegression()
    str = str.replace(',', '')
    return float(str.replace('+', ''))

# preprocess the data
import numpy as np
converter_1 = np.vectorize(lambda s: convertSize(s))
converter_2 = np.vectorize(lambda s: removePlus(s))
X[:, 2] = converter_1(X[:, 2])  # file-size column
y = converter_2(y)              # install-count column

# hold out 20% of the dataset for testing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

from sklearn.decomposition import PCA
pca = PCA(n_components=1)
X_new = pca.fit_transform(X)
matrix_w = pca.components_.T
# note: this manual dot-product projection skips the mean-centering
# that pca.transform would apply
X_train, X_test = X_train.dot(matrix_w), X_test.dot(matrix_w)

# use Linear Regression to train on the reduced data
regr2 = LinearRegression()
regr2.fit(X_train, y_train)
y_pred = regr2.predict(X_test)
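The manual projection above both omits PCA's mean-centering and fits the PCA on the full dataset, which leaks test information. A sketch of the centered, leakage-free alternative, meant to run in place of the manual projection (i.e. before X_train/X_test are overwritten):

# fit PCA on the training split only, then use transform(), which centers the data
pca = PCA(n_components=1)
pca.fit(X_train)
X_train_red = pca.transform(X_train)
X_test_red = pca.transform(X_test)

regr2 = LinearRegression()
regr2.fit(X_train_red, y_train)
y_pred = regr2.predict(X_test_red)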
def validate():
    """
    Run the KFold cross-validation method for regression.
    """
    # define directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    # cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 105
    y = 106

    # empty dataframe for model validation
    df = pd.DataFrame(columns=['tg', 'lon', 'lat', 'num_year',
                               'num_95pcs', 'corrn', 'rmse'])

    # loop through the tide gauges
    for tg in range(x, y):
        os.chdir(dir_in)
        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        ##########################################
        # check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            print("file already analyzed!")
            continue  # skip to the next tide gauge instead of exiting the loop

        os.chdir(dir_in)

        # load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        # add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        # standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        # load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        # remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis=0, inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        # adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis=1)

        # merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2],
                              on='date', how='right')
        pred_surge.sort_values(by='date', inplace=True)

        # find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        # in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue

        pred_surge['date'] = pd.DataFrame(list(map(time_stamp,
                                                   pred_surge['date'])),
                                          columns=['date'])

        # prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        # apply PCA, keeping enough components for 95% of the variance
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        # apply 10-fold cross validation
        # (shuffle is required for random_state to have any effect)
        kf = KFold(n_splits=10, shuffle=True, random_state=29)

        metric_corr = []
        metric_rmse = []
        # combo = pd.DataFrame(columns=['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            # train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)

            # predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)),
            #                       pd.DataFrame(np.array(y_test))],
            #                      axis=1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis=0)

            # evaluation metric - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

        # number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -
                     pred_surge['date'][0]).days / 365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  # number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc,
              'avg_corr = ', np.mean(metric_corr),
              ' - avg_rmse (m) = ', np.mean(metric_rmse), '\n')

        # collect the validation summary for this tide gauge
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years,
                               num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year',
                          'num_95pcs', 'corrn', 'rmse']
        df = pd.concat([df, new_df], axis=0)

        # save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)

        # cd to dir_in
        os.chdir(dir_in)
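A trivial usage sketch for the routine above; note the hard-coded lustre paths mean it only runs as-is on the original cluster environment:

if __name__ == "__main__":
    validate()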
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

df = pd.read_csv("AutoInsurSweden.csv", sep='\t')
df['Y'] = [x.replace(',', '.') for x in df['Y']]
df['Y'] = df['Y'].astype(float)
print(df.head())

X = df['X']
Y = df['Y']
# train_test_split already returns the four splits; no np.asarray wrapper needed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

plt.scatter(X_train, Y_train)
plt.xlabel("X_train")
plt.ylabel("Y_train")
plt.show()

print("X_train shape = ", X_train.shape, " and Y_train shape = ", Y_train.shape)
print("X_test shape = ", X_test.shape, " and Y_test shape = ", Y_test.shape)

model = LinearRegression()
model.fit(X_train.values.reshape(-1, 1), Y_train.values.reshape(-1, 1))
# prediction = model.predict(X_test.values.reshape(-1, 1))
score = model.score(X_test.values.reshape(-1, 1), Y_test.values.reshape(-1, 1))
print("Score = ", score)
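Since this is a single-feature model, the fit is easy to inspect visually. A small sketch that overlays the fitted line on the test data, reusing model, X, X_test, and Y_test from the snippet above:

# visualize the fitted line against the held-out points
x_grid = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
plt.scatter(X_test, Y_test, label='test data')
plt.plot(x_grid, model.predict(x_grid), color='red', label='fitted line')
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()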
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# Importing the dataset
data = pd.read_csv('50_Startups.csv')
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# one-hot encode the categorical column (index 3)
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

regressor = LinearRegression()
regressor.fit(x_train, y_train)

percentErrors = (abs(regressor.predict(x_test) - y_test) / y_test) * 100
AveragePercentError = sum(percentErrors) / len(percentErrors)

y_pred = regressor.predict(x_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1),
                      y_test.reshape(len(y_test), 1)), 1))
print("the accuracy of the model is:", 100 - AveragePercentError)
# print((abs(regressor.predict(x_test) - y_test) / y_test) * 100)
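The percent-error "accuracy" above blows up when any y_test value is near zero, so it is worth reporting a complementary metric. A sketch reusing y_test and y_pred from the snippet above:

# R^2 as a scale-robust companion to the mean percent error
from sklearn.metrics import r2_score
print("R^2 on the test set:", r2_score(y_test, y_pred))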
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# load the data
data = pd.read_csv('test.csv', encoding='gbk')

# draw a scatter plot and compute the correlation between x and y
# (column names: 活动推广费 = promotion spend, 销售额 = sales)
plt.scatter(data.活动推广费, data.销售额)
print(data.corr())

# estimate the model parameters and build the regression model
'''
(1) First import LinearRegression, the solver class for simple linear regression
(2) Then use this class to build the model, giving the model variable lrModel
'''
lrModel = LinearRegression()

# (3) Next, select the independent and dependent variables
x = data[['活动推广费']]  # independent variable
y = data[['销售额']]      # dependent variable

# model training
'''
Call the model's fit method to train it.
Training is the parameter-estimation process,
which fits the model to the data.
'''
lrModel.fit(x, y)

# check the model's score after training
print(lrModel.score(x, y))
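A minimal sketch of using the trained model: predict sales for a few hypothetical promotion-spend values (the numbers below are illustrative, not from the dataset) and overlay the fitted line on the scatter plot.

# predict 销售额 (sales) for hypothetical 活动推广费 (promotion spend) values
new_spend = pd.DataFrame({'活动推广费': [10, 20, 30]})
print(lrModel.predict(new_spend))

# overlay the fitted line on the original scatter
xs = data[['活动推广费']].sort_values('活动推广费')
plt.scatter(data.活动推广费, data.销售额)
plt.plot(xs, lrModel.predict(xs), color='red')
plt.show()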