def combination_algorithm(AMDs_train, energy_train, AMDs_test, energy_test, type):
    NUMBER_OF_CLUSTER = 5
    if type == "kmeans_com":
        model = KMeans(n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "affinity_com":
        model = AffinityPropagation(damping=0.9, random_state=5).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "agglomerative_com":
        model = AgglomerativeClustering(n_clusters=NUMBER_OF_CLUSTER)
        y_clusters = model.fit_predict(AMDs_test)
    elif type == "birch_com":
        model = Birch(threshold=0.1, n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "minibatch_com":
        model = MiniBatchKMeans(n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "meanshift_com":
        model = MeanShift().fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    else:
        return

    new_energy = []
    new_energy_test = []
    for i in range(NUMBER_OF_CLUSTER):
        if i not in y_clusters:
            print("ERROR: ", i, " is not here")
            continue

        # collect the training samples assigned to cluster i
        index = 0
        temp_AMDs = []
        temp_energy = []
        for j in model.labels_:
            if i == j:
                temp_AMDs.append(AMDs_train[index])
                temp_energy.append(energy_train[index])
            index += 1

        # collect the test samples assigned to cluster i
        index = 0
        temp_AMDs_test = []
        temp_energy_test = []
        for j in y_clusters:
            if i == j:
                temp_AMDs_test.append(AMDs_test[index])
                temp_energy_test.append(energy_test[index])
            index += 1

        # fit a per-cluster linear model
        quadratic_featurizer = PolynomialFeatures(degree=1, interaction_only=True)
        X_train_quadratic = quadratic_featurizer.fit_transform(temp_AMDs)
        # transform (not fit_transform) the test data with the featurizer fitted on the training data
        X_test_quadratic = quadratic_featurizer.transform(temp_AMDs_test)
        model2 = LinearRegression()
        model2.fit(X_train_quadratic, temp_energy)
        temp_energy_pred = model2.predict(X_test_quadratic)
        new_energy.extend(temp_energy_pred)
        new_energy_test.extend(temp_energy_test)

        # per-cluster predicted-vs-given plot
        fig, ax = plt.subplots()
        ax.scatter(temp_energy_test, temp_energy_pred)
        ax.plot([np.min(temp_energy_test), np.max(temp_energy_test)],
                [np.min(temp_energy_test), np.max(temp_energy_test)], 'k--', lw=4)
        ax.set_xlabel('Given')
        ax.set_ylabel('Predicted')
        plt.savefig('./image/combination_algorithm' + str(i) + '.jpg')

    # overall predicted-vs-given plot and metrics
    fig, ax = plt.subplots()
    print("R^2 score of the combination algorithm is: ", r2_score(new_energy_test, new_energy))
    print("RMSE of the combination algorithm is: ",
          math.sqrt(mean_squared_error(new_energy_test, new_energy)))
    ax.scatter(new_energy_test, new_energy)
    ax.plot([np.min(new_energy_test), np.max(new_energy_test)],
            [np.min(new_energy_test), np.max(new_energy_test)], 'k--', lw=4)
    ax.set_xlabel('Given')
    ax.set_ylabel('Predicted')
    plt.savefig('./image/combination_algorithm.jpg')
st.sidebar.title("Upload Your Sales History")


def load_data(file):
    df = pd.read_csv(file, decimal=".")
    df2 = df.drop(["date"], axis=1)
    df2 = df2.replace(0, 0.01)
    df2['total'] = df2.sum(axis=1)
    return df, df2


if uploaded_file is not None:
    df, df2 = load_data(uploaded_file)

    # prepare models
    models = []
    models.append(('LR', LinearRegression()))
    models.append(('KNN', KNeighborsRegressor()))
    models.append(('RF', RandomForestRegressor()))
    models.append(('GB', GradientBoostingRegressor()))
    models.append(('XGBoost', XGBRegressor(verbosity=0)))
    models.append(('SVM', LinearSVR()))
    models.append(('Extra Trees', ExtraTreesRegressor()))
    models.append(('Naive', NaiveForecaster(strategy="last", sp=12)))
    models.append(('Theta', ThetaForecaster(sp=12)))
    models.append(('Exp_Smoothing', ExponentialSmoothing(trend="add", seasonal="additive", sp=12)))
    models.append(('TBATS', TBATS(sp=12, use_trend=True, use_box_cox=False)))

    forecast_horizon = st.sidebar.slider(label='Forecast Length (months)', min_value=3, max_value=36, value=12)
    window_length = st.sidebar.slider(label='Sliding Window Length', min_value=1, value=12)

    # evaluate each model in turn
    results1 = []
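    # A hedged sketch (not from the original app) of one way the evaluation loop that
    # fills `results1` could look: sklearn regressors are wrapped with sktime's
    # make_reduction, each model is fit on a temporal split of df2['total'], and a MAPE
    # per model is collected. The target column and the split sizes are assumptions.
    from sktime.forecasting.base import BaseForecaster, ForecastingHorizon
    from sktime.forecasting.compose import make_reduction
    from sktime.forecasting.model_selection import temporal_train_test_split
    from sktime.performance_metrics.forecasting import mean_absolute_percentage_error

    y_series = df2['total']
    y_tr, y_te = temporal_train_test_split(y_series, test_size=forecast_horizon)
    fh = ForecastingHorizon(y_te.index, is_relative=False)
    for name, model in models:
        # sktime forecasters are used as-is; sklearn regressors become forecasters
        forecaster = model if isinstance(model, BaseForecaster) else \
            make_reduction(model, window_length=window_length, strategy="recursive")
        forecaster.fit(y_tr)
        y_pred = forecaster.predict(fh)
        results1.append((name, mean_absolute_percentage_error(y_te, y_pred)))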
# In[10]:

feature_cols = [
    "Monthly Income",
    "Transaction Time",
    "Gender_Female",
    "Gender_Male",
    "City_Tier 1",
    "City_Tier 2",
    "City_Tier 3",
    "Record",
]

# In[11]:

X = df_new[feature_cols]
Y = df_new["Total Spend"]

# In[12]:

lm = LinearRegression()
lm.fit(X, Y)

# In[13]:

print(lm.intercept_)
print(lm.coef_)

# In[14]:

list(zip(feature_cols, lm.coef_))

# In[15]:

lm.score(X, Y)
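# In[16]:

# An illustrative addition (not in the original notebook): predict total spend for a
# single hypothetical customer, passed as a one-row DataFrame with the same feature
# columns the model was fit on. The feature values below are made up.
new_customer = pd.DataFrame(
    [[7500, 0.9, 1, 0, 1, 0, 0, 3]], columns=feature_cols)
print(lm.predict(new_customer))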
print(x)

"""**Encoding Categorical Data**"""

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))
print(x)

"""**Separate Test Set and Training Set**"""

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

"""**Training the Multiple Linear Regression Model**"""

from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(x_train, y_train)

"""**Predicting the Test Set Result**"""

y_pred = regressor.predict(x_test)
np.set_printoptions(precision=2)
print(y_pred)
print(y_test)
# No missing values, so no need for an imputer this time

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# We will split 10 observations to test, 20 to train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3, random_state=0)

# No need for feature scaling

# Fitting the Simple Linear Regression model to the Training Set
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()  # We are fine with the default parameters
regressor.fit(X_train, y_train)  # the machine is the regressor; it learns on the training set

# The machine can now, based on its learning experience, predict a new salary
# The regressor learned the correlation between experience and salary

# Predicting the test results - create a vector of predictions
y_pred = regressor.predict(X_test)  # vector of predictions of the dependent variable
# The predictions are pretty damn close

# Visualizing the results with matplotlib
plt.scatter(X_train, y_train, color='red')  # plots the real values
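# A minimal sketch of how this visualization is usually completed (the title and axis
# labels below are assumptions, not part of the original snippet): overlay the fitted
# regression line on the training scatter and label the chart.
plt.plot(X_train, regressor.predict(X_train), color='blue')  # fitted regression line
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()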
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.svm import LinearSVC
import mglearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import numpy as np

# Linear regression and L2 regularization
X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LinearRegression().fit(X_train, y_train)
print(lr.score(X_train, y_train), lr.score(X_test, y_test))

# L2 regularization (ridge): alpha is the regularization strength; the larger it is,
# the more the coefficients are pushed toward 0 and the better the generalization
ridge = Ridge(alpha=0.1).fit(X_train, y_train)
print(ridge.score(X_train, y_train), ridge.score(X_test, y_test))
mglearn.plots.plot_ridge_n_samples()
plt.show()

# L1 regularization
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print(lasso.score(X_train, y_train), lasso.score(X_test, y_test))

# Linear models for classification
# Scale the data; X is the set of Adj Close-derived features used to generate the
# model, and 'Label' holds the target values
X = preprocessing.scale(X)

X_lately = X[-forcast_out:]  # most recent rows, for which there are no labels yet
X = X[:-forcast_out]

# labels
y = np.array(dfreg['Label'])
y = y[:-forcast_out]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
len(X)

# Linear regression
clfreg = LinearRegression(n_jobs=-1)  # n_jobs=-1 means use all processors
clfreg.fit(X_train, y_train)

# Quadratic regression (degree 2)
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Polynomial regression of degree 3 (cubic)
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

# KNN regression
clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)
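# A hedged sketch of the usual next step (not shown in this excerpt): score each
# fitted model on the held-out split and forecast the most recent, still-unlabeled rows.
for name, clf in [('linear', clfreg), ('poly2', clfpoly2), ('poly3', clfpoly3), ('knn', clfknn)]:
    print(name, clf.score(X_test, y_test))
forecast_set = clfreg.predict(X_lately)  # predictions for the rows without labels yet
print(forecast_set)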
def GetForecast(): loc = request.args.get('loc') def PolynomialRegressionPrecip(degree,val): regressor = LinearRegression() regressor.fit(MainList, Preci) xx = np.linspace(0, 26, 100) yy = regressor.predict(xx.reshape(xx.shape[0], 1)) quadratic_featurizer = PolynomialFeatures(degree) X_quadratic = quadratic_featurizer.fit_transform(MainList) regressor_quadratic = LinearRegression() regressor_quadratic.fit(X_quadratic, Preci) xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1)) #print ('Residual sum of squares: %.2f' % np.mean(( regressor_quadratic.predict(X_quadratic)- Preci) ** 2)) 10.74 X_quadratic = quadratic_featurizer.fit_transform([30 + val]) output = regressor_quadratic.predict(X_quadratic) return (output) def PolynomialRegressionHumidity(degree,val): regressor = LinearRegression() regressor.fit(MainList, Humidity) xx = np.linspace(0, 26, 100) yy = regressor.predict(xx.reshape(xx.shape[0], 1)) quadratic_featurizer = PolynomialFeatures(degree) X_quadratic = quadratic_featurizer.fit_transform(MainList) regressor_quadratic = LinearRegression() regressor_quadratic.fit(X_quadratic, Humidity) xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1)) # print ('Residual sum of squares: %.2f' % np.mean(( regressor_quadratic.predict(X_quadratic)- Humidity) ** 2)) #error 7.08 X_quadratic = quadratic_featurizer.fit_transform([30 + val]) output = regressor_quadratic.predict(X_quadratic) return (output) df = pd.read_csv('static/WeatherData.csv') url = 'https://mean-wizards-v2.mybluemix.net/api/getdata?loc='+loc+'&limit=30' r = requests.get(url) ServerData =r.json() Temp = df['Temp'].values Temp = Temp[90:120] Humidity = df['Humidity'].values Humidity = Humidity[90:120] Preci = df['preci'].values Preci = Preci[90:120] WeatherData = df[['Temp','Humidity']].values X = [each for each in range(1,31)] MainList = [] for each in X: MainList.append([each]) slr = LinearRegression() WeatherData = WeatherData[:30] coData = [float(i['co']) for i in ServerData] co2Data = [float(i['co2']) for i in ServerData] no2Data = [float(i['no2']) for i in ServerData] pm25Data = [float(i['pm25']) for i in ServerData] so2Data = [float(i['so2']) for i in ServerData] print (len(coData)) print (len(WeatherData)) # In[284]: def Prediction(val): slr.fit(MainList,Temp) Temp_predict = [30 + val] Temp_output = slr.predict(Temp_predict) PredictedWeatherData['temp'] = Temp_output[0] Humid = PolynomialRegressionHumidity(9,val) PredictedWeatherData['Humidity'] = Humid[0] preci = PolynomialRegressionPrecip(22,val) PredictedWeatherData['preci'] = preci[0] AQIParameters = {} model = LinearRegression() model.fit(WeatherData, coData) X_predict = [PredictedWeatherData['temp'],PredictedWeatherData['Humidity']] y_predict = model.predict(X_predict) AQIParameters['co'] = y_predict[0] model.fit(WeatherData, co2Data) y_predict = model.predict(X_predict) AQIParameters['co2'] = y_predict[0] model.fit(WeatherData, no2Data) y_predict = model.predict(X_predict) AQIParameters['no2'] = y_predict[0] model.fit(WeatherData, pm25Data) y_predict = model.predict(X_predict) AQIParameters['pm25'] = y_predict[0] model.fit(WeatherData, so2Data) y_predict = model.predict(X_predict) AQIParameters['so2'] = y_predict[0] FinalDataRecord = {} FinalDataRecord = AQIParameters.copy() FinalDataRecord.update(PredictedWeatherData) CObr=[0, 1.0, 2.0, 10, 17, 34, 49]; SO2br=[0, 40, 80, 380, 800, 1600, 2400]; O3br=[0, 50, 100, 168, 208, 748, 1300] PM25br=[0, 30, 60, 90, 120, 250, 350.4]; PM10br=[0, 50, 100, 250, 350, 430, 504]; NO2br=[0, 40, 
80, 180, 280, 400, 540]; AQI=[0, 50, 100, 200, 300, 400, 500]; dummy = [] so2= FinalDataRecord['so2'] so2AQI=0; i=0; while(i<6): if( so2 > SO2br[i] and so2 <= SO2br[i+1]): so2AQI= ( ( AQI[i+1]-AQI[i] ) * ( so2 - SO2br[i] ) / ( SO2br[i+1] - SO2br[i] ) ) + AQI[i] break; else: i = i+1 dummy.append(so2AQI) no2= FinalDataRecord['no2'] no2AQI=0; i=0; while(i<6): if( no2 > NO2br[i] and no2 <= NO2br[i+1]): no2AQI= ( ( AQI[i+1]-AQI[i] ) * ( no2 - NO2br[i] ) / ( NO2br[i+1] - NO2br[i] ) ) + AQI[i] break; else: i = i+1 dummy.append(no2AQI) co= FinalDataRecord['co'] coAQI=0; i=0; while(i<6): if( co > CObr[i] and co <= CObr[i+1]): coAQI= ( ( AQI[i+1]-AQI[i] ) * ( co - CObr[i] ) / ( CObr[i+1] - CObr[i] ) ) + AQI[i] break; else: i = i+1 dummy.append(coAQI) pm25= FinalDataRecord['pm25'] pmAQI=0; i=0; while(i<6): if( pm25 > PM25br[i] and pm25 <= PM25br[i+1]): pmAQI= ( ( AQI[i+1]-AQI[i] ) * ( pm25 - PM25br[i] ) / ( PM25br[i+1] - PM25br[i] ) ) + AQI[i] break; else: i = i+1 dummy.append(pmAQI) finalAQI = max(dummy) FinalDataRecord['AQI'] = finalAQI return FinalDataRecord # In[285]: Data = [] Data.append(Prediction(1)) Data.append(Prediction(2)) Data.append(Prediction(3)) Data.append(Prediction(4)) Data.append(Prediction(5)) Data.append(Prediction(6)) Data.append(Prediction(7)) print(Data) return jsonify(results=Data)
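# The four AQI sub-index blocks above repeat the same piecewise-linear interpolation
# between pollutant breakpoints. A hedged refactoring sketch (not part of the original
# code; `sub_index` is a hypothetical helper) that reproduces that calculation:
def sub_index(value, breakpoints, aqi_scale):
    """Interpolate the AQI sub-index for `value` over the given breakpoint list."""
    for k in range(len(breakpoints) - 1):
        low, high = breakpoints[k], breakpoints[k + 1]
        if low < value <= high:
            return (aqi_scale[k + 1] - aqi_scale[k]) * (value - low) / (high - low) + aqi_scale[k]
    return 0

# Inside Prediction() each block could then collapse to a single call, e.g.:
#   so2AQI = sub_index(FinalDataRecord['so2'], SO2br, AQI)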
def fit_model(x_train, y_train):
    # Fits a linear regression to find the actual b and w that minimize the loss
    regression = LinearRegression()
    regression.fit(x_train, y_train)
    b_minimum, w_minimum = regression.intercept_[0], regression.coef_[0][0]
    return b_minimum, w_minimum
from sklearn.linear_model import LinearRegression


def linear_regression(X, y):
    return LinearRegression().fit(X, y)
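# Hypothetical usage of the helper above (the toy data points are made up):
model = linear_regression([[1.0], [2.0], [3.0], [4.0]], [2.1, 4.2, 5.9, 8.1])
print(model.coef_, model.intercept_)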
def __init__(self):
    self.linear_regression = LinearRegression()
pipelines.append(
    (
        "SVM",
        make_pipeline(
            preprocessing.StandardScaler(), LinearSVR(C=4, random_state=seed)
        ),
    )
)
pipelines.append(
    ("RF", make_pipeline(RandomForestRegressor(n_estimators=100, random_state=seed)))
)
pipelines.append(
    ("KNN", make_pipeline(preprocessing.StandardScaler(), KNeighborsRegressor()))
)
pipelines.append(("LM", make_pipeline(LinearRegression())))

#%%
plot_cv_scores(
    pipelines=pipelines,
    X=X,
    y=y,
    crossvalidation=crossvalidation,
    scoring=scoring,
    file_suffix="unoptimized_simple",
)
plot_cv_predictions(
    pipelines=pipelines,
    X=X,
# icept = np.mean(icepts)
x = np.array(x).reshape(-1, 1)
# x *= 0.510127
# x = x[~np.isnan(x) and x > float(-inf)].reshape(-1,1)
y = np.array(y).reshape(-1, 1)
# print x.shape, y.shape
# y = y[~np.isnan(y) and x > float(-inf)].reshape(-1,1)
# print x.shape, y.shape
data = np.concatenate((y, x), axis=1)
data = data[np.all(data != float('+inf'), axis=1)]
# print data.shape
print data[:10]
# np.save('data4regression.npy',data)
# data = np.load('data4regression.npy')
lreg = LinearRegression(normalize=True, n_jobs=-1)
lreg.fit(data[:, [1]], data[:, [0]])
print "coefficient: %f\t\tintercept: %f" % (lreg.coef_, lreg.intercept_)

for p in preds:
    if args['nn_score_fmt'] == "sphinx":
        scores = readSen(p)
    elif args['nn_score_fmt'] == "text":
        scores = np.loadtxt(p)
    print p + '.sen'
    writeSenScores(p + '.sen', scores, lreg.coef_, 0)
    os.system("""pocketsphinx_batch \
        -hmm {} \
        -lm {} \
        -cepdir {} \
import numpy as np
from sklearn.linear_model import LinearRegression


def regression_ceof(pts):
    x = np.array([pt[0] for pt in pts]).reshape(-1, 1)
    y = np.array([pt[1] for pt in pts])
    model = LinearRegression()
    model.fit(x, y)
    return model.coef_[0], model.intercept_
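# Hypothetical usage of the helper above (the (x, y) points are made up for illustration):
slope, intercept = regression_ceof([(0, 1.0), (1, 3.1), (2, 4.9), (3, 7.2)])
print(slope, intercept)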
def train_stage2(self, force=False, print_fnc=print): """ trains stage2 models, store it in self.stage2_model Args: force: force training even if we've already trained print_fnc: some function for printing/logging """ try: self.stage2_model if not force: raise ValueError( "stage2 model already trained, set force=True to force retraining" ) except AttributeError: pass # generate the stage2 training data if not already done try: self.stage2_data except AttributeError: self._generate_stage2_data() x_cols = self.exog_x_cols + [self.endog_x_col] if self.stage2_model_type == 'lgb': # lgb datasets for training df_train = self.stage2_data.loc[self.stage2_data['_purpose_'] == 'train2', :] df_val = self.stage2_data.loc[self.stage2_data['_purpose_'] == 'val2', :] dat_train = lgb.Dataset(df_train[x_cols], label=df_train[self.y_col]) dat_train.grouper = df_train[self.id_col] dat_val = lgb.Dataset(df_val[x_cols], label=df_val[self.y_col]) dat_val.grouper = df_val[self.id_col] # ok, now start training params = self.stage2_params print_every = 0 if print_fnc is None else params[ 'num_iterations'] // 10 eval_results = { } # store evaluation results as well with the trained model if self.stage2_objective == 'true': # copy the params because lgb modifies it during run...? gbm = lgb.train( params.copy(), train_set=dat_train, valid_sets=[dat_train, dat_val], valid_names=['train', 'val'], verbose_eval=print_every, fobj=lambda preds, dataset: co.grouped_sse_loss_grad_hess( preds, dataset.label, dataset.grouper), feval=lambda preds, dataset: ('grouped sse', co.grouped_sse_loss(preds, dataset.label, dataset.grouper ), False), callbacks=[lgb.record_evaluation(eval_results)]) elif self.stage2_objective == 'upper': gbm = lgb.train( params.copy(), train_set=dat_train, valid_sets=[dat_train, dat_val], valid_names=['train', 'val'], verbose_eval=print_every, callbacks=[lgb.record_evaluation(eval_results)]) else: raise ValueError("self.stage2_objective not recognized") gbm.eval_results = eval_results # save the model self.stage2_model = ModelWrapper(gbm) elif self.stage2_model_type == 'linear': df_train = self.stage2_data if self.stage2_objective == 'true': min_output = minimize(fun=co.grouped_sse_loss_linear, x0=np.zeros(shape=len(x_cols) + 1), args=(df_train, x_cols, self.y_col, self.id_col)) coefs = min_output.x[1:] intercept = min_output.x[0] model = LinearModel(coefs, intercept) elif self.stage2_objective == 'upper': model = LinearRegression() model.fit(df_train[x_cols], df_train[self.y_col]) else: raise ValueError("self.stage2_objective not recognized") # add a feature_name functionality to this object, then wrap it up and return model.feature_name = lambda: x_cols self.stage2_model = ModelWrapper(model) else: raise ValueError("self.stage2_model_type not recognized")
print(seenMovie)
print(metadata)
print("Data loaded")
print(seenMovie.shape, '\t', metadata.shape)

seenMovie = seenMovie.astype('int')

# split train and test set
X_train, X_test, y_train, y_test = train_test_split(metadata, seenMovie, test_size=0.3,
                                                    random_state=1, shuffle=True, stratify=seenMovie)

# build model 2: NNLS regression model
reg_nnls = LinearRegression(positive=True)
y_pred_nnls = reg_nnls.fit(X_train, y_train).predict(X_test)
r2_score_nnls = r2_score(y_test, y_pred_nnls)
print("NNLS R2 score", r2_score_nnls)

logLossVal_nnls = log_loss(y_test, y_pred_nnls, eps=1e-15, normalize=True,
                           sample_weight=None, labels=None)
scaled_test = minmax_scale(y_test, feature_range=(0, 1))
scaled_pred = minmax_scale(y_pred_nnls, feature_range=(0, 1))
mse_2 = calculateMeanSquareError(scaled_test, scaled_pred)
# m2_recall = recall_score(y_test, y_pred_nnls, average='binary')
# m2_precision = precision_score(y_test, y_pred_nnls, average='binary')
#------------------------------- Machine Learning Models --------------------------------------#

# Splitting data into train and test data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    cleanData.iloc[:, cleanData.columns != 'Absenteeism time in hours'],
    cleanData.iloc[:, 20], test_size=0.30, random_state=1)

#------------------------------------ Linear Regression Model ---------------------------------#
# Root Mean Squared Error: 2.898405340060082
# R^2 Score (coefficient of determination) = 0.2772050386036977
from sklearn.linear_model import LinearRegression

# Build Linear Regression model
lrModel = LinearRegression().fit(X_train, y_train)

# Predict for test records
lrModelPred = lrModel.predict(X_test)

# Storing results in a data frame for Actual and Predicted values
lrResult = pd.DataFrame({'Actual': y_test, 'Predicted': lrModelPred})
print(lrResult.head())


# Calculate RMSE and R-squared value
def RMSE(y_actual, y_predicted):
    rmse = np.sqrt(mean_squared_error(y_actual, y_predicted))
    return rmse


print("Root Mean Squared Error: " + str(RMSE(y_test, lrModelPred)))
print("R^2 Score (coefficient of determination) = " + str(r2_score(y_test, lrModelPred)))
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("../../../data/Position_Salaries.csv")
X = df.iloc[:, 1:2].values
Y = df.iloc[:, 2:].values

from sklearn.preprocessing import PolynomialFeatures

poly_feature = PolynomialFeatures(degree=2)
X_poly = poly_feature.fit_transform(X)

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_poly, Y)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.scatter(X, Y, color='r')
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
# transform (not fit_transform) the plotting grid with the featurizer fitted above
ax.plot(X_grid, lin_reg.predict(poly_feature.transform(X_grid)))
ax.set_title('level-salary curve')
ax.set_xlabel('level')
ax.set_ylabel('salary')
plt.show()
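# An illustrative comparison (not in the original script): score the degree-2
# polynomial model against a plain linear fit on the same data to see how much the
# quadratic term helps.
lin_plain = LinearRegression().fit(X, Y)
print("plain linear R^2:", lin_plain.score(X, Y))
print("degree-2 polynomial R^2:", lin_reg.score(X_poly, Y))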
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3, maxSentLength=30, emb_size=300, hidden_size=[300,10], margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'; rng = numpy.random.RandomState(23455) datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength) vocab_size=len(word2id)+1 mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt') wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2] indices_train_r=indices_train[1::2] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2] indices_test_r=indices_test[1::2] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] train_size = len(indices_train_l) test_size = len(indices_test_l) train_batch_start=range(train_size) test_batch_start=range(test_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int32') # indices_train_r=T.cast(indices_train_r, 'int32') # indices_test_l=T.cast(indices_test_l, 'int32') # indices_test_r=T.cast(indices_test_r, 'int32') rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng) # rand_values[0]=numpy.array(numpy.zeros(emb_size)) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec) embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.iscalar() x_index_l = T.imatrix() # now, x is the index matrix, must be integer x_index_r = T.imatrix() y = T.ivector() left_l=T.iscalar() right_l=T.iscalar() left_r=T.iscalar() right_r=T.iscalar() length_l=T.iscalar() length_r=T.iscalar() norm_length_l=T.fscalar() norm_length_r=T.fscalar() mts=T.fmatrix() wmf=T.fmatrix() # cost_tmp=T.fscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images 
filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0])) layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0])) layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size)) norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size)) norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts HL_layer_1_input=T.concatenate([ # mts, eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # uni_cosine, # sum_uni_l, # sum_uni_r, # sum_uni_l+sum_uni_r, 1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)), cosine(layer0_l_output_maxpool, layer0_r_output_maxpool), layer0_l_output_maxpool, layer0_r_output_maxpool, T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10), layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # layer1.output_cosine, layer1.output_vector_l, layer1.output_vector_r, T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10), # len_l, len_r layer1.output_attentions # wmf, ], axis=1)#, layer2.output, 
layer1.output_cosine], axis=1) HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input, mts, len_l, len_r # wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) HL_layer_1_input_size=1+1+ 1+1+3* nkerns[0] +1+1+3*nkerns[0]+10*10 HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2 HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh) LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1) LR_layer_input_with_extra=T.concatenate([HL_layer_2.output, HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output, LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2) # LR_layer_input=HL_layer_2.output # LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2) # layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum() # diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix) cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True) params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): clipped_grad = T.clip(grad_i, -0.5, 0.5) acc = acc_i + T.sqr(clipped_grad) updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True) train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is best_params = None best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False max_acc=0.0 nn_max_acc=0.0 best_iter=0 cost_tmp=0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data for index in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * train_size + minibatch_index +1 minibatch_index=minibatch_index+1 # if iter%update_freq != 0: # cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) # #print 'cost_ij: ', cost_ij # cost_tmp+=cost_ij # error_sum+=error_ij # else: cost_i, error_i= train_model(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) cost_tmp+=cost_i if iter < 6000 and iter %100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) if iter >= 6000 and iter % 100 == 0: # if iter%100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) test_losses=[] test_y=[] test_features=[] for index in test_batch_start: test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size], indices_test_r[index: index + batch_size], testY[index: index + batch_size], testLeftPad_l[index], testRightPad_l[index], testLeftPad_r[index], testRightPad_r[index], testLengths_l[index], testLengths_r[index], normalized_test_length_l[index], normalized_test_length_r[index], mt_test[index: index + batch_size], wm_test[index: index + batch_size]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc = (1-test_score) * 100. 
if test_acc > nn_max_acc: nn_max_acc = test_acc print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc #now, see the results of svm if use_svm: train_y=[] train_features=[] for index in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n') #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) lr=LinearRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) corr_count=0 corr_lr=0 test_size=len(test_y) for i in range(test_size): if results[i]==test_y[i]: corr_count+=1 if numpy.absolute(results_lr[i]-test_y[i])<0.5: corr_lr+=1 acc=corr_count*1.0/test_size acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_iter=iter if acc_lr> max_acc: max_acc=acc_lr best_iter=iter print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ', max_acc , ' at iter: ', best_iter if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2, max_q=5, start_P=1, D=None, start_Q=1, max_P=2, max_D=1, max_Q=2, max_order=5, m=1, seasonal=True, stationary=False, information_criterion='aic', alpha=0.05, test='kpss', seasonal_test='ocsb', stepwise=True, n_jobs=1, start_params=None, trend=None, method='lbfgs', maxiter=50, offset_test_args=None, seasonal_test_args=None, suppress_warnings=False, error_action='warn', trace=False, random=False, random_state=None, n_fits=10, return_valid_fits=False, out_of_sample_size=0, scoring='mse', scoring_args=None, with_intercept=True, sarimax_kwargs=None, **fit_args): # NOTE: Doc is assigned BELOW this function # pop out the deprecated kwargs fit_args = _warn_for_deprecations(**fit_args) start = time.time() # validate start/max points if any(_ < 0 for _ in (max_p, max_q, max_P, max_Q, start_p, start_q, start_P, start_Q)): raise ValueError('starting and max p, q, P & Q values must ' 'be positive integers (>= 0)') if max_p < start_p or max_q < start_q \ or max_P < start_P or max_Q < start_Q: raise ValueError('max p, q, P & Q must be >= than ' 'their starting values') # validate d & D for _d, _max_d in ((d, max_d), (D, max_D)): if _max_d < 0: raise ValueError('max_d & max_D must be positive integers (>= 0)') if _d is not None: if _d < 0: raise ValueError('d & D must be None or a positive ' 'integer (>= 0)') # v0.9.0+ - ignore this if it's explicitly set... # if _d > _max_d: # raise ValueError('if explicitly defined, d & D must be <= ' # 'max_d & <= max_D, respectively') # is stepwise AND parallel enabled? if stepwise and n_jobs != 1: n_jobs = 1 warnings.warn('stepwise model cannot be fit in parallel (n_jobs=%i). ' 'Falling back to stepwise parameter search.' % n_jobs) # check on m if m < 1: raise ValueError('m must be a positive integer (> 0)') # check on n_iter if random and n_fits < 0: raise ValueError('n_iter must be a positive integer ' 'for a random search') # validate error action actions = {'warn', 'raise', 'ignore', None} if error_action not in actions: raise ValueError('error_action must be one of %r, but got %r' % (actions, error_action)) # copy array y = check_endog(y, dtype=DTYPE) n_samples = y.shape[0] sarimax_kwargs = {} if not sarimax_kwargs else sarimax_kwargs # check for constant data if is_constant(y): warnings.warn('Input time-series is completely constant; ' 'returning a (0, 0, 0) ARMA.') return _return_wrapper( _post_ppc_arima( solvers._fit_arima(y, xreg=exogenous, order=(0, 0, 0), seasonal_order=(0, 0, 0, 0), start_params=start_params, trend=trend, method=method, maxiter=maxiter, fit_params=fit_args, suppress_warnings=suppress_warnings, trace=trace, error_action=error_action, scoring=scoring, out_of_sample_size=out_of_sample_size, scoring_args=scoring_args, with_intercept=with_intercept, **sarimax_kwargs)), return_valid_fits, start, trace) # test ic, and use AIC if n <= 3 if information_criterion not in VALID_CRITERIA: raise ValueError('auto_arima not defined for information_criteria=%s. ' 'Valid information criteria include: %r' % (information_criterion, VALID_CRITERIA)) # the R code handles this, but I don't think statsmodels # will even fit a model this small... 
# if n_samples <= 3: # if information_criterion != 'aic': # warnings.warn('n_samples (%i) <= 3 ' # 'necessitates using AIC' % n_samples) # information_criterion = 'aic' # adjust max p, q -- R code: # max.p <- min(max.p, floor(serieslength/3)) # max.q <- min(max.q, floor(serieslength/3)) max_p = int(min(max_p, np.floor(n_samples / 3))) max_q = int(min(max_q, np.floor(n_samples / 3))) # this is not in the R code and poses a risk that R did not consider... # if max_p|q has now dropped below start_p|q, correct it. start_p = min(start_p, max_p) start_q = min(start_q, max_q) # if it's not seasonal, we can avoid multiple 'if not is None' comparisons # later by just using this shortcut (hack): if not seasonal: D = m = -1 # choose the order of differencing xx = y.copy() if exogenous is not None: lm = LinearRegression().fit(exogenous, y) xx = y - lm.predict(exogenous) # is the TS stationary? if stationary: d = D = 0 # now for seasonality if m == 1: D = max_P = max_Q = 0 # m must be > 1 for nsdiffs elif D is None: # we don't have a D yet and we need one (seasonal) seasonal_test_args = seasonal_test_args \ if seasonal_test_args is not None else dict() D = nsdiffs(xx, m=m, test=seasonal_test, max_D=max_D, **seasonal_test_args) if D > 0 and exogenous is not None: diffxreg = diff(exogenous, differences=D, lag=m) # check for constance on any column if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any(): D -= 1 # D might still be None if not seasonal. Py 3 will throw and error for that # unless we explicitly check for ``seasonal`` if D > 0: dx = diff(xx, differences=D, lag=m) else: dx = xx # If D was too big, we might have gotten rid of x altogether! if dx.shape[0] == 0: raise ValueError( "The seasonal differencing order, D=%i, was too " "large for your time series, and after differencing, " "there are no samples remaining in your data. " "Try a smaller value for D, or if you didn't set D " "to begin with, try setting it explicitly. This can " "also occur in seasonal settings when m is too large." % D) # difference the exogenous matrix if exogenous is not None: if D > 0: diffxreg = diff(exogenous, differences=D, lag=m) else: diffxreg = exogenous else: # here's the thing... we're only going to use diffxreg if exogenous # was not None in the first place. However, PyCharm doesn't know that # and it thinks we might use it before assigning it. Therefore, assign # it to None as a default value and it won't raise the warning anymore. diffxreg = None # determine/set the order of differencing by estimating the number of # orders it would take in order to make the TS stationary. if d is None: offset_test_args = offset_test_args \ if offset_test_args is not None else dict() d = ndiffs(dx, test=test, alpha=alpha, max_d=max_d, **offset_test_args) if d > 0 and exogenous is not None: diffxreg = diff(diffxreg, differences=d, lag=1) # if any columns are constant, subtract one order of differencing if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any(): d -= 1 # check differences (do we want to warn?...) if error_action == 'warn' and not suppress_warnings: if D >= 2: warnings.warn( "Having more than one seasonal differences is " "not recommended. Please consider using only one " "seasonal difference.", ModelFitWarning) # if D is -1, this will be off, so we include the OR elif D + d > 2 or d > 2: warnings.warn( "Having 3 or more differencing operations is not " "recommended. 
Please consider reducing the total " "number of differences.", ModelFitWarning) if d > 0: dx = diff(dx, differences=d, lag=1) # check for constance if is_constant(dx): if exogenous is None and not (D > 0 or d < 2): raise ValueError('data follow a simple polynomial and are not ' 'suitable for ARIMA modeling') # perfect regression ssn = (0, 0, 0, 0) if not seasonal else (0, D, 0, m) return _return_wrapper( _post_ppc_arima( solvers._fit_arima(y, xreg=exogenous, order=(0, d, 0), seasonal_order=ssn, start_params=start_params, trend=trend, method=method, maxiter=maxiter, fit_params=fit_args, suppress_warnings=suppress_warnings, trace=trace, error_action=error_action, scoring=scoring, out_of_sample_size=out_of_sample_size, scoring_args=scoring_args, with_intercept=with_intercept, **sarimax_kwargs)), return_valid_fits, start, trace) # seasonality issues if m > 1: if max_P > 0: max_p = min(max_p, m - 1) if max_Q > 0: max_q = min(max_q, m - 1) if not stepwise: # validate max_order if max_order is None: max_order = np.inf elif max_order < 0: raise ValueError('max_order must be None or a positive ' 'integer (>= 0)') # NOTE: pre-1.5.2, we started at start_p, start_q, etc. However, when # using stepwise=FALSE in R, hyndman starts at 0. He only uses start_* # when stepwise=TRUE. # generate the set of (p, q, P, Q) FIRST, since it is contingent # on whether or not the user is interested in a seasonal ARIMA result. # This will reduce the search space for non-seasonal ARIMA models. # loop p, q. Make sure to loop at +1 interval, # since max_{p|q} is inclusive. if seasonal: gen = (((p, d, q), (P, D, Q, m)) for p in range(0, max_p + 1) for q in range(0, max_q + 1) for P in range(0, max_P + 1) for Q in range(0, max_Q + 1) if p + q + P + Q <= max_order) else: # otherwise it's not seasonal and we don't need the seasonal pieces gen = (((p, d, q), (0, 0, 0, 0)) for p in range(0, max_p + 1) for q in range(0, max_q + 1) if p + q <= max_order) # if we are fitting a random search rather than an exhaustive one, we # will scramble up the generator (as a list) and only fit n_iter ARIMAs if random: random_state = check_random_state(random_state) # make a list to scramble... gen = random_state.permutation(list(gen))[:n_fits] # get results in parallel all_res = Parallel(n_jobs=n_jobs)( delayed(solvers._fit_arima)(y, xreg=exogenous, order=order, seasonal_order=seasonal_order, start_params=start_params, trend=trend, method=method, maxiter=maxiter, fit_params=fit_args, suppress_warnings=suppress_warnings, trace=trace, error_action=error_action, out_of_sample_size=out_of_sample_size, scoring=scoring, scoring_args=scoring_args, with_intercept=with_intercept, **sarimax_kwargs) for order, seasonal_order in gen) # otherwise, we're fitting the stepwise algorithm... 
else: if n_samples < 10: start_p = min(start_p, 1) start_q = min(start_q, 1) start_P = start_Q = 0 # adjust to p, q, P, Q vals p = start_p = min(start_p, max_p) q = start_q = min(start_q, max_q) P = start_P = min(start_P, max_P) Q = start_Q = min(start_Q, max_Q) # init the stepwise model wrapper stepwise_wrapper = solvers._StepwiseFitWrapper( y, xreg=exogenous, start_params=start_params, trend=trend, method=method, maxiter=maxiter, fit_params=fit_args, suppress_warnings=suppress_warnings, trace=trace, error_action=error_action, out_of_sample_size=out_of_sample_size, scoring=scoring, scoring_args=scoring_args, p=p, d=d, q=q, P=P, D=D, Q=Q, m=m, start_p=start_p, start_q=start_q, start_P=start_P, start_Q=start_Q, max_p=max_p, max_q=max_q, max_P=max_P, max_Q=max_Q, seasonal=seasonal, information_criterion=information_criterion, with_intercept=with_intercept, **sarimax_kwargs) # do the step-through... all_res = stepwise_wrapper.solve_stepwise() # filter the non-successful ones filtered = _post_ppc_arima(all_res) # sort by the criteria - lower is better for both AIC and BIC # (https://stats.stackexchange.com/questions/81427/aic-guidelines-in-model-selection) sorted_res = sorted(filtered, key=(lambda mod: getattr(mod, information_criterion) ())) # remove all the cached .pmdpkl files... someday write this as an exit hook # in case of a KeyboardInterrupt or anything for model in sorted_res: model._clear_cached_state() return _return_wrapper(sorted_res, return_valid_fits, start, trace)
data.head()

# TO VISUALISE DATA
fig, axs = plt.subplots(1, 3, sharey=True)
data.plot(kind='scatter', x='TV', y='Sales', ax=axs[0], figsize=(14, 7))
data.plot(kind='scatter', x='Radio', y='Sales', ax=axs[1])
data.plot(kind='scatter', x='Newspaper', y='Sales', ax=axs[2])

# CREATING X & Y FOR LINEAR REGRESSION
feature_cols = ['TV']
X = data[feature_cols]
Y = data.Sales

# IMPORTING LINEAR REGRESSION ALGO FOR SIMPLE LINEAR REGRESSION
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X, Y)
print(lr.intercept_)
print(lr.coef_)

# MANUAL PREDICTION FOR A TV SPEND OF 50, USING THE FITTED INTERCEPT AND SLOPE
result = 6.97 + 0.0554 * 50
print(result)

# CREATE A DATAFRAME WITH THE MIN AND MAX VALUE OF THE TABLE
X_new = pd.DataFrame({'TV': [data.TV.min(), data.TV.max()]})
X_new.head()

preds = lr.predict(X_new)
preds
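# A small illustrative addition (not in the original script): the hard-coded 6.97 and
# 0.0554 above are presumably the rounded intercept and slope printed earlier; the same
# manual prediction can be written without hard-coding the numbers.
spend_at_50 = lr.intercept_ + lr.coef_[0] * 50
print(spend_at_50)  # should match lr.predict(pd.DataFrame({'TV': [50]}))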
def main(regressor="random_forest"): """ The main method """ # Fetch data from internet data = fetch_and_load_data() # Process median_income into categories data["income_cat"] = np.ceil(data["median_income"] / 1.5) data["income_cat"].where(data["income_cat"] < 5, 5.0, inplace=True) # Split data into training and testing sets train_data, test_data = split_train_test_stratified(data, "income_cat") # Extract labels and housing data housing_labels = train_data["median_house_value"].copy() housing = train_data.drop("median_house_value", axis=1) # split housing into categorical and numerical data # cat_attributes = ["ocean_proximity", "income_cat"] cat_attributes = ["ocean_proximity"] num_attributes = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income'] # Set up pipeline to prepare data with. full_pipeline = setup_pipeline(num_attributes, cat_attributes) # Prepare the data housing_prepared = full_pipeline.fit_transform(housing) print() # Select the appropriate regressor if regressor == "linear": reg_model = LinearRegression() reg_name = "Linear Regressor" elif regressor == "random_forest": reg_model = RandomForestRegressor() reg_name = "Random Forest Regressor" elif regressor == "decision_tree": reg_model = DecisionTreeRegressor() reg_name = "Decision Tree Regressor" elif regressor == "svr": reg_model = SVR(kernel="linear", gamma='auto') reg_name = "Support Vector Machine" else: error_mes = "Regressor '{regressor}' not recognised." raise ValueError(error_mes.format(regressor=regressor)) # Train regression model reg_model.fit(housing_prepared, housing_labels) display_model_performance(reg_model, housing_prepared, housing_labels, reg_name) if regressor == "random_forest": # Fine tune Random Forest param_grid = [ {'n_estimators': [50, 100, 1000], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [50, 100, 1000], 'max_features': [2, 4, 6]}] final_model = fine_tune_model(RandomForestRegressor(), param_grid, housing_prepared, housing_labels) # Get the best model weights print() print("Attribute weights:") feature_importances = final_model.feature_importances_ print_attribute_importances(feature_importances, num_attributes, full_pipeline) elif regressor == "linear": final_model = reg_model print("Coefficients used by linear model:") coeffs = final_model.coef_ print_attribute_importances(coeffs, num_attributes, full_pipeline) elif regressor == "decision_tree": # Fine tune Decision Tree param_grid = [{'criterion': ["mse", "friedman_mse", "mae"]}] final_model = fine_tune_model(DecisionTreeRegressor(), param_grid, housing_prepared, housing_labels) elif regressor == "svr": param_grid = [ {'kernel': ["linear"], "C": [10000, 100000]}, {'kernel': ["rbf"], "C": [10000, 100000], "gamma": [0.045, 0.05, 0.055]}] final_model = fine_tune_model(SVR(), param_grid, housing_prepared, housing_labels) else: final_model = reg_model print() # Evaluate on test set X_test = test_data.drop("median_house_value", axis=1) y_test = test_data["median_house_value"].copy() X_test_prepared = full_pipeline.transform(X_test) final_predictions = final_model.predict(X_test_prepared) final_mse = mean_squared_error(y_test, final_predictions) final_rmse = np.sqrt(final_mse) print("Final Standard Error:", final_rmse)
def validate(): """ run KFOLD method for regression """ #defining directories dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged" dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation" surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef" #cd to the lagged predictors directory os.chdir(dir_in) x = 824 y = 825 #empty dataframe for model validation df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse']) #looping through for tg in range(x,y): os.chdir(dir_in) tg_name = os.listdir()[tg] print(tg, tg_name) ########################################## #check if this tg is already taken care of ########################################## os.chdir(dir_out) if os.path.isfile(tg_name): return "file already analyzed!" os.chdir(dir_in) #load predictor pred = pd.read_csv(tg_name) pred.drop('Unnamed: 0', axis = 1, inplace = True) #add squared and cubed wind terms (as in WPI model) pickTerms = lambda x: x.startswith('wnd') wndTerms = pred.columns[list(map(pickTerms, pred.columns))] wnd_sqr = pred[wndTerms]**2 wnd_cbd = pred[wndTerms]**3 pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1) #standardize predictor data dat = pred.iloc[:,1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1) #load surge data os.chdir(surge_path) surge = pd.read_csv(tg_name) surge.drop('Unnamed: 0', axis = 1, inplace = True) #remove duplicated surge rows surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True) surge.reset_index(inplace = True) surge.drop('index', axis = 1, inplace = True) #adjust surge time format to match that of pred time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d')) surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date']) time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1) #merge predictors and surge to find common time frame pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right') pred_surge.sort_values(by = 'date', inplace = True) #find rows that have nans and remove them row_nan = pred_surge[pred_surge.isna().any(axis =1)] pred_surge.drop(row_nan.index, axis = 0, inplace = True) pred_surge.reset_index(inplace = True) pred_surge.drop('index', axis = 1, inplace = True) #in case pred and surge don't overlap if pred_surge.shape[0] == 0: print('-'*80) print('Predictors and Surge don''t overlap') print('-'*80) continue pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \ pred_surge['date'])), \ columns = ['date']) #prepare data for training/testing X = pred_surge.iloc[:,1:-1] y = pd.DataFrame(pred_surge['surge']) y = y.reset_index() y.drop(['index'], axis = 1, inplace = True) #apply PCA pca = PCA(.95) pca.fit(X) X_pca = pca.transform(X) #apply 10 fold cross validation kf = KFold(n_splits=10, random_state=29) metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs']) for train_index, test_index in kf.split(X): X_train, X_test = X_pca[train_index], X_pca[test_index] y_train, y_test = y['surge'][train_index], y['surge'][test_index] #train regression model lm = LinearRegression() lm.fit(X_train, y_train) #predictions predictions = lm.predict(X_test) # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \ # pd.DataFrame(np.array(y_test))], \ # axis = 1) # pred_obs.columns = ['pred', 'obs'] # combo = 
pd.concat([combo, pred_obs], axis = 0) #evaluation matrix - check p value if stats.pearsonr(y_test, predictions)[1] >= 0.05: print("insignificant correlation!") continue else: print(stats.pearsonr(y_test, predictions)) metric_corr.append(stats.pearsonr(y_test, predictions)[0]) print(np.sqrt(metrics.mean_squared_error(y_test, predictions))) metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions))) #number of years used to train/test model num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\ pred_surge['date'][0]).days/365 longitude = surge['lon'][0] latitude = surge['lat'][0] num_pc = X_pca.shape[1] #number of principal components corr = np.mean(metric_corr) rmse = np.mean(metric_rmse) print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' - avg_rmse (m) = ', \ np.mean(metric_rmse), '\n') #original size and pca size of matrix added new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T new_df.columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse'] df = pd.concat([df, new_df], axis = 0) #save df as cs - in case of interruption os.chdir(dir_out) df.to_csv(tg_name) #cd to dir_in os.chdir(dir_in)
# `data` is loaded earlier in the script (not shown) and has the columns
# "X", "Y" and "Expected_output"; the last two rows are held out for testing.
# train_x is restored here by symmetry with test_x below.
train_x = data[["X", "Y"]][:-2].values.reshape(-1, 2)
#train_x = np.reshape(train_x,(-1,2))
#print(train_x)
#train_x2 = data["Y"][:-2].values
#train_x2 = np.reshape(train_x2,(-1,1))
train_y = data["Expected_output"][:-2].values.reshape(-1, 1)
#train_y = np.reshape(train_y,(-1,1))

#test_x = pd.DataFrame(data,columns = data[["X","Y"]][-2:].values)
test_x = data[["X", "Y"]][-2:].values.reshape(-1, 2)
#test_x = np.reshape(test_x,(-1,2))
#test_x2 = data["Y"][-2:].values
#test_x2 = np.reshape(test_x2,(-1,1))
test_y = data["Expected_output"][-2:].values.reshape(-1, 1)
#test_y = np.reshape(test_y,(-1,1))
#print(test_x["X"])

model = LinearRegression()
model.fit(train_x, train_y)

coeff = model.coef_
intercept = model.intercept_

#fitted values on the training data (intercept + coeff[0]*x used only the first
#feature; model.predict applies both coefficients)
points = model.predict(train_x)
plt.plot(points, "ro")

predict_y = model.predict(test_x)
#plot held-out targets against their predictions (train_y and predict_y have
#different lengths, so plotting one against the other would fail)
plt.plot(test_y, predict_y, "b*")
print(predict_y)
plt.show()
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder

# basic_eda() and num_cat_separation() are helper functions defined elsewhere
# in this project.


def imputation(data):
    data = basic_eda(data)
    count_of_null = data.isnull().sum()
    percent_of_missing = data.isnull().sum() * 100 / len(data)
    missing_value_data = pd.DataFrame(
        {'percent_missing': percent_of_missing,
         'Count_of_Missing_Values': count_of_null})

    global numerical_column_names
    global categorical_column_names
    numerical_column_names, categorical_column_names = num_cat_separation(data)

    global data_null_treated
    data_null_treated = data.copy()
    label_encoder = LabelEncoder()

    # impute columns from most to least missing
    cols_to_be_imputed = missing_value_data[missing_value_data['percent_missing'] > 0].sort_values(
        'percent_missing', ascending=False).index
    cols_to_be_imputed = list(cols_to_be_imputed)
    # if target in cols_to_be_imputed:
    #     cols_to_be_imputed.remove(target)

    Imputed_column_array = []
    for i in cols_to_be_imputed:
        data_dup = data_null_treated.copy()

        # Columns with below 2 percent missing values are filled with median / mode
        below_2_percent_columns = missing_value_data[missing_value_data['percent_missing'] < 2].index
        below_2_percent_columns = list(below_2_percent_columns)
        if i in below_2_percent_columns:
            below_2_percent_columns.remove(i)
        for j in below_2_percent_columns:
            if j in numerical_column_names:
                data_dup[j] = data_dup[[j]].apply(
                    lambda x: x.fillna(x.median()), axis=0)
            else:
                data_dup[j] = data_dup[[j]].apply(
                    lambda x: x.fillna(data_dup[j].value_counts().index.max()))

        # Separating rows without nulls in column i for training
        data_dup_train = data_dup[data_dup[i].isna() == False]
        data_dup_train_copy = data_dup_train.copy()
        # Dropping null values in other columns
        data_dup_train = data_dup_train.dropna()
        # Separating rows with nulls in column i for prediction
        data_dup_test = data_dup[data_dup[i].isna()]

        # Removing columns having above 15 percent missing values
        above_15_percent_columns = missing_value_data[missing_value_data['percent_missing'] > 15].index
        above_15_percent_columns = list(above_15_percent_columns)
        if i in above_15_percent_columns:
            above_15_percent_columns.remove(i)
        data_dup_train = data_dup_train.drop(above_15_percent_columns, axis=1)
        data_dup_test = data_dup_test.drop(above_15_percent_columns, axis=1)

        # Train/test split
        x_test = data_dup_test.drop(i, axis=1)
        x_test = pd.get_dummies(x_test, drop_first=True)
        x_test_columns = x_test.columns
        for k in x_test_columns:
            if x_test[k].dtype == 'float64':
                x_test[k] = x_test[[k]].apply(
                    lambda x: x.fillna(x.median()), axis=0)
            else:
                x_test[k] = x_test[[k]].apply(lambda x: x.fillna(
                    x_test[k].value_counts().index.max()))

        x_train = data_dup_train.drop(i, axis=1)
        x_train = pd.get_dummies(x_train, drop_first=True)
        x_train = x_train[x_test.columns]

        y_train = data_dup_train[[i]]
        if y_train[i].dtype == 'O':
            y_train[i] = label_encoder.fit_transform(y_train[i])
        y_train[[i]] = y_train[[i]].astype('int')

        # Building models: regression for numerical targets, classification otherwise
        if i in numerical_column_names:
            model_rf = RandomForestRegressor(n_estimators=100, max_depth=6)
        else:
            model_rf = RandomForestClassifier(n_estimators=100, max_depth=6)
        model_rf.fit(x_train, y_train[i])
        rf_score = model_rf.score(x_train, y_train[i])
        print('RandomForest Score :', rf_score)

        if i in numerical_column_names:
            model_lr = LinearRegression()
        else:
            model_lr = LogisticRegression()
        model_lr.fit(x_train, y_train[i])
        lr_score = model_lr.score(x_train, y_train[i])
        print('\nLinear/Logistic Regression Score :', lr_score)

        # Checking which model is better
        if rf_score > lr_score:
            print('\nFor', i, 'RandomForest performs better, so we will go with it.\n')
            model = model_rf
            Imputed_column_array.append({i: 'Random Forest'})
        else:
            print('\n\nFor', i, 'Linear/Logistic Regression performs better, so we will go with it.')
            model = model_lr
            Imputed_column_array.append({i: 'Logistic Regression'})

        prediction = model.predict(x_test)
        print(prediction.dtype, '\n\n')
        # categorical targets were label-encoded above, so decode them
        # (checking for a specific dtype such as 'int32' is platform-dependent)
        if i not in numerical_column_names:
            prediction = label_encoder.inverse_transform(prediction.astype(int))
        prediction_df = pd.DataFrame(prediction)
        #print('\n\n Predicted count of ', i, ' :', prediction_df[0].value_counts())

        data_dup_test = data_dup_test.drop(i, axis=1)
        data_dup_test[i] = prediction
        data_dup_complete = pd.concat([data_dup_train_copy, data_dup_test])
        data_dup_complete = data_dup_complete.sort_index()
        predicted = data_dup_complete[[i]]
        data_null_treated = data_null_treated.drop(i, axis=1)
        data_null_treated[i] = predicted

    #feature_selection(data_null_treated, target)
    return (Imputed_column_array, data_null_treated)
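# Hypothetical usage sketch for imputation(): the file name is illustrative, and
# the helper functions basic_eda() / num_cat_separation() are assumed to exist
# as referenced above.
raw = pd.read_csv("input.csv")                # any frame with missing values
imputed_models, clean_df = imputation(raw)
print(imputed_models)                         # which model filled each column
print(clean_df.isnull().sum().sum())          # should now be 0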
# In[ ]:

X_train = train.drop("Survived", axis=1)
Y_train = train["Survived"]
X_test = test.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape


# In[ ]:

# The set of models I am going to compare
models = [
    LinearRegression(),
    LogisticRegressionCV(),
    Perceptron(),
    GaussianNB(),
    KNeighborsClassifier(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    RandomForestClassifier(),
    XGBClassifier()
]

# Create a table of comparison for the models
models_columns = ['Name', 'Parameters', 'Train Accuracy',
                  'Validation Accuracy', 'Execution Time']
models_df = pd.DataFrame(columns=models_columns)
predictions = pd.DataFrame(columns=['Survived'])
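# A loop along these lines could populate the comparison table (a sketch, not
# necessarily the notebook's own evaluation code): time each fit and use 5-fold
# cross-validation for the validation column.
import time
from sklearn.model_selection import cross_val_score

for model in models:
    start = time.time()
    model.fit(X_train, Y_train)
    train_acc = model.score(X_train, Y_train)        # R^2 for LinearRegression, accuracy otherwise
    val_acc = cross_val_score(model, X_train, Y_train, cv=5).mean()
    models_df.loc[len(models_df)] = [model.__class__.__name__,
                                     str(model.get_params()),
                                     train_acc,
                                     val_acc,
                                     time.time() - start]

models_df.sort_values('Validation Accuracy', ascending=False)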
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('Position_Salaries.csv')
df.head()

X = df.iloc[:, 1:2].values
y = df.iloc[:, -1].values

#fitting linear regression model
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()
linear_reg.fit(X, y)

#fitting polynomial regression model
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=5)
X_poly = poly_reg.fit_transform(X)
X_poly

lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

#visualising the linear model
plt.scatter(X, y, color='red')
plt.plot(X, linear_reg.predict(X), color='blue')
plt.title('Linear Model')
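# The snippet stops after plotting the linear model; a sketch of the matching
# polynomial plot (the usual companion step), reusing poly_reg and lin_reg_2
# from above, could look like this.
#visualising the polynomial model on a denser grid for a smooth curve
X_grid = np.arange(X.min(), X.max() + 0.1, 0.1).reshape(-1, 1)
plt.scatter(X, y, color='red')
plt.plot(X_grid, lin_reg_2.predict(poly_reg.transform(X_grid)), color='blue')
plt.title('Polynomial Model (degree 5)')
plt.show()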
gsFeatureSelector = RFECV(gs_b.best_estimator_, cv=5).fit(X_train, Y_train)
gsX = gsFeatureSelector.transform(X_train)
gsFeatureSupport = gsFeatureSelector.support_

gs_a = GridSearchCV(
    param_grid={'min_samples_leaf': np.linspace(5, 55, 10).astype(int),
                'min_samples_split': np.linspace(5, 55, 10).astype(int)},
    estimator=RandomForestClassifier(n_estimators=1000),
    scoring='accuracy')
# attach the feature selector to the model object so it travels with it
gs_a.support = gsFeatureSupport
gs_a.selector = gsFeatureSelector
gs_a.fit(gsX, Y_train)  # train the model

#%%
# Linear Regression model and feature selection
linearRegression = LinearRegression()
linearFeatureSelector = RFECV(linearRegression, cv=5).fit(X_train, Y_train)
LinearX = linearFeatureSelector.transform(X_train)
linearFeatureSupport = linearFeatureSelector.support_

# store selector in the Linear Regression model
linearRegression.support = linearFeatureSupport
linearRegression.selector = linearFeatureSelector

# train the Linear Regression model
linearRegression.fit(LinearX, Y_train)

# %%
'''
We can choose whether or not to load a model
'''
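# Because each estimator carries its own RFECV selector as an ad-hoc attribute,
# held-out data has to pass through the same transform before predicting.
# A short sketch; X_test is assumed to have the same columns as X_train.
gs_predictions = gs_a.predict(gs_a.selector.transform(X_test))
linear_predictions = linearRegression.predict(linearRegression.selector.transform(X_test))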
def nn_linear_regression(AMDs_train, AMDs_test, energy_train, energy_test):
    import random

    mse_min = 100  # kept from the original script; not used below
    r2_max = 0
    total_number = ATTRIBUTE_NUM  # number of AMD attributes (module-level constant)
    weight_list = [0.008, 0.04, 0.2, 1, 5, 25, 125]

    while True:
        # randomly split the attribute indices into a "center" group and three
        # further groups, each split into two halves with different weights
        remains = total_number
        center_number = random.randint(0, remains // 2) * 2
        remains -= center_number
        number_1 = random.randint(0, remains // 2) * 2
        remains -= number_1
        number_2 = random.randint(0, remains // 2) * 2
        remains -= number_2
        number_3 = remains

        center_list = []
        list_1 = []
        list_2 = []
        list_3 = []

        for i in range(center_number):
            temp = random.randint(0, total_number - 1)
            while temp in center_list:
                temp = random.randint(0, total_number - 1)
            center_list.append(temp)

        list_1_1 = []
        list_1_2 = []
        for i in range(number_1):
            temp = random.randint(0, total_number - 1)
            while (temp in center_list) or (temp in list_1):
                temp = random.randint(0, total_number - 1)
            list_1.append(temp)
            if i % 2 == 0:
                list_1_1.append(temp)
            else:
                list_1_2.append(temp)

        list_2_1 = []
        list_2_2 = []
        for i in range(number_2):
            temp = random.randint(0, total_number - 1)
            while (temp in center_list) or (temp in list_1) or (temp in list_2):
                temp = random.randint(0, total_number - 1)
            list_2.append(temp)
            if i % 2 == 0:
                list_2_1.append(temp)
            else:
                list_2_2.append(temp)

        for i in range(total_number):
            if i not in center_list and i not in list_1 and i not in list_2:
                list_3.append(i)
        list_3_1 = list_3[0:number_3 // 2]
        list_3_2 = list_3[number_3 // 2:number_3]

        def weight_row(row):
            """Scale each attribute by the weight of the group it was assigned to."""
            temp_row = []
            for i in range(total_number):
                if i in center_list:
                    temp_row.append(weight_list[3] * row[i])
                elif i in list_1_2:
                    temp_row.append(weight_list[4] * row[i])
                elif i in list_1_1:
                    temp_row.append(weight_list[2] * row[i])
                elif i in list_2_2:
                    temp_row.append(weight_list[5] * row[i])
                elif i in list_2_1:
                    temp_row.append(weight_list[1] * row[i])
                elif i in list_3_2:
                    temp_row.append(weight_list[6] * row[i])
                elif i in list_3_1:
                    temp_row.append(weight_list[0] * row[i])
                else:
                    print("index not found: ", i)
            return temp_row

        # apply the same weighting to train and test AMDs
        new_AMDs_train = [weight_row(row) for row in AMDs_train]
        new_AMDs_test = [weight_row(row) for row in AMDs_test]

        # Linear regression on the re-weighted AMDs
        # (the `normalize` argument was removed from LinearRegression in scikit-learn 1.2;
        # scale the inputs beforehand if normalisation is needed)
        linreg = LinearRegression(n_jobs=-1)
        linreg.fit(new_AMDs_train, energy_train)
        energy_pred = linreg.predict(new_AMDs_test)

        mse = round(mean_squared_error(energy_test, energy_pred), 4)
        r2 = round(r2_score(energy_test, energy_pred), 4)
        print("MSE of weighted linear regression is: ", mse)
        print("r2 is ", r2)

        fig, ax = plt.subplots()
        ax.scatter(energy_test, energy_pred)
        ax.plot([np.min(energy_test), np.max(energy_test)],
                [np.min(energy_test), np.max(energy_test)], 'k--', lw=4)
        ax.set_xlabel('Given')
        ax.set_ylabel('Predicted')
        # `index` is expected to be defined at module level by the calling script
        plt.savefig('./image/wlin_' + str(index) + '.jpg')
        break
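# Since `normalize=True` is gone from recent scikit-learn releases, the upstream
# guidance is to scale inside a pipeline instead. A minimal sketch of that
# variant of the fit/predict block inside the loop above (exact numerical
# equivalence with the old flag is not claimed):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

linreg = make_pipeline(StandardScaler(), LinearRegression(n_jobs=-1))
linreg.fit(new_AMDs_train, energy_train)
energy_pred = linreg.predict(new_AMDs_test)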