def elasticNet(X, y):
    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Elastic Net Regression")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)
    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    elasticNet = ElasticNet(alpha=1e-7, l1_ratio=0.5)
    elasticNet.fit(scaled_Xp, y)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0, 2, 0.01)
    dummyX = dummyX.reshape((dummyX.shape[0], 1))
    # reuse the fitted transformers instead of refitting them on the dummy grid
    dummyXp = polynomialFeatures.transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = elasticNet.predict(scaled_dummyXp)
    outputFILE = 'plot-elasticNet.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h=6.0, w=10.0)
    ax.axis([0, 2, 0, 15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    # savefig takes the file name as its first positional argument
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
def check_ElasticNet(X, y, pred, tol, reg_alpha, reg_lambda, weights):
    # Fit a reference ElasticNet whose penalty matches the (reg_alpha, reg_lambda) pair
    enet = ElasticNet(alpha=reg_alpha + reg_lambda,
                      l1_ratio=reg_alpha / (reg_alpha + reg_lambda))
    enet.fit(X, y)
    enet_pred = enet.predict(X)
    assert np.isclose(weights, enet.coef_, rtol=tol, atol=tol).all()
    assert np.isclose(enet_pred, pred, rtol=tol, atol=tol).all()
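# The check above compares an externally trained linear model against scikit-learn's
# ElasticNet by translating a separate (L1, L2) penalty pair into sklearn's
# (alpha, l1_ratio) parameterisation. A minimal sketch of that mapping, with
# made-up penalty values purely for illustration (not from the original test):
def to_sklearn_params(reg_alpha, reg_lambda):
    """Map separate L1/L2 strengths onto ElasticNet's (alpha, l1_ratio)."""
    alpha = reg_alpha + reg_lambda
    l1_ratio = reg_alpha / (reg_alpha + reg_lambda)
    return alpha, l1_ratio

alpha_demo, l1_ratio_demo = to_sklearn_params(reg_alpha=0.1, reg_lambda=0.4)  # example values
print(alpha_demo, l1_ratio_demo)  # 0.5 0.2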
def report_ff_en(): # Fastfood approximation of Gaussian kernel para = FastfoodPara(n, d) st = time() PHI_train, _ = FastfoodForKernel(trainData, para, sgm) elapsed_ff_kern_train = time() - st st = time() PHI_valid, _ = FastfoodForKernel(validationData, para, sgm) elapsed_ff_kern_valid = time() - st # Train elastic net on projected training data en = ElasticNet() st = time() en.fit(PHI_train.T, trainLabels) elapsed_en_fit = time() - st # Predict labels for projected validation data st = time() y_pred = en.predict(PHI_valid.T) elapsed_en_pred = time() - st # Report performance mse_proj = metrics.mean_squared_error(validationLabels, y_pred) # print("For projected data, MSE = {:0.4g}.".format(mse_proj)) return mse_proj, elapsed_en_fit, elapsed_ff_kern_train
def enet(a):
    print("Doing elastic net")
    clf3 = ElasticNet(alpha=a)
    clf3.fit(base_X, base_Y)
    print("Score = %f" % clf3.score(base_X, base_Y))
    clf3_pred = clf3.predict(X_test)
    write_to_file("elastic.csv", clf3_pred)
def enet_granger_causality_test(X_t, y_t, top_df, max_iter=10000000): """ Return the cv-parameters tested across the whole data :param X_t: :param y_t: :param top_df: :return: res_df, test_betas """ test_errs = np.zeros(len(top_df)) scores = np.zeros(len(top_df)) dfs = np.zeros(len(top_df)) test_coefs = np.zeros((len(top_df), X_t.shape[1])) for i in range(len(top_df)): alpha = top_df.iloc[i]["alpha"] lambda_min = top_df.iloc[i]["lambda.min"] enet = ElasticNet(l1_ratio=alpha, alpha=lambda_min, max_iter=max_iter) enet.fit(X_t, y_t) y_pred = enet.predict(X_t) test_errs[i] = np.average((y_t - y_pred)**2) scores[i] = enet.score(X_t, y_t) test_coefs[i] = enet.coef_ dfs[i] = len(np.where(enet.coef_)[0]) top_df["test_err"] = test_errs top_df["score"] = scores top_df["df"] = dfs return top_df, test_coefs
def fit_enet(train_X, train_y, test_X):
    """
    Use linear regression to predict. Elastic net is linear regression with
    combined L1 and L2 regularisation.
    :param train_X:
    :param train_y:
    :param test_X:
    :return:
    """
    enet = ElasticNet()
    enet.fit(train_X, train_y)
    # pprint() returns None, so format the coefficients directly instead
    model = "ElasticNet int %.2f coefs %s" % (enet.intercept_,
                                              np.array2string(enet.coef_))
    yhat_train = enet.predict(train_X)
    yhat_test = enet.predict(test_X)
    return model, yhat_train, yhat_test
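# For reference, a small sketch of the penalised least-squares objective that
# scikit-learn documents for ElasticNet (intercept handling omitted); this is
# illustrative only and not part of the original fit_enet helper:
import numpy as np

def elastic_net_objective(w, X, y, alpha=1.0, l1_ratio=0.5):
    """1/(2n)*||y - Xw||^2 + alpha*l1_ratio*||w||_1 + 0.5*alpha*(1-l1_ratio)*||w||^2"""
    n = X.shape[0]
    resid = y - X @ w
    return (resid @ resid) / (2 * n) \
        + alpha * l1_ratio * np.abs(w).sum() \
        + 0.5 * alpha * (1 - l1_ratio) * (w @ w)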
def ElasticNetRegression(input_dict):
    # from sklearn.datasets import load_iris
    # from sklearn import tree
    # iris = load_iris()
    # clf = tree.DecisionTreeClassifier()
    # clf = clf.fit(iris.data, iris.target)
    from sklearn.datasets import load_diabetes
    dta = load_diabetes()
    n_sample = dta.data     # feature matrix
    n_feature = dta.target  # target vector
    print("*******SAMPLES********")
    print(n_sample)
    print("*******TARGET*********")
    print(n_feature)
    from sklearn.linear_model import ElasticNet
    rgs = ElasticNet().fit(n_sample, n_feature)
    print(rgs)
    print(rgs.predict(n_sample))
def fit_model_12(self, toWrite=False):
    model = ElasticNet(alpha=1.0)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 12 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        # pickle writes bytes, so the file must be opened in binary mode
        f2 = open('model12/model.pkl', 'wb')
        pickle.dump(model, f2)
        f2.close()
def predict_linear(self, enet=True):
    """How well can we do on this SRFF with a linear regression
    (with optional elastic-net regularisation)?"""
    if enet:
        clf = ElasticNet()
    else:
        clf = LinearRegression()
    # we have to transpose X here because sklearn uses the
    # opposite order (rows v columns). maybe this is a sign that
    # I'm using the wrong order.
    clf.fit(self.train_X.T, self.train_y)
    yhat = clf.predict(self.test_X.T)
    err = self.defn(self.test_y, yhat)
    return clf.intercept_, clf.coef_, err
def report_orig_en():
    # Train elastic net on original training data
    en = ElasticNet()
    st = time()
    en.fit(trainData.T, trainLabels)
    elapsed_en_fit = time() - st
    # Predict labels for original validation data
    st = time()
    y_pred = en.predict(validationData.T)
    elapsed_en_pred = time() - st
    # Report performance
    mse_orig = metrics.mean_squared_error(validationLabels, y_pred)
    return mse_orig, elapsed_en_fit, 0.
def create_ml_classifier(df):
    import operator
    X = np.array(df.drop('base_ip_release', 1))
    y = np.array(df['base_ip_release'])
    #clf = LinearRegression()
    clf = ElasticNet(alpha=1, l1_ratio=0.5)
    #clf = Ridge(alpha=2)
    # train_X,test_X,train_y,test_y = cross_validation.train_test_split(X,y,train_size=0.9)
    #
    # sc = StandardScaler()
    # sc.fit(train_X)
    # X_train_std = sc.transform(train_X)
    # X_test_std = sc.transform(test_X)
    #
    # clf.fit(X_train_std,train_y)
    # print(clf.predict(X_test_std))
    # print(accuracy_score(test_y,clf.predict(X_test_std)))
    c = np.zeros(len(X) // 10)  # kept from the original; overwritten just below
    kf = k(len(y), n_folds=10)
    c = 0
    min_dict = {}
    get_error = []
    for train, test in kf:
        get_clif = clf.fit(X[train], y[train])
        p = clf.predict(X[test])
        #print(p)
        e = (p - y[test])
        #print(e, len(e))
        t = np.dot(e, e)
        # print(t)
        c += t
        # print(c)
        #print(p, y[test])
        min_dict[t] = get_clif
        get_error.append(t)
    #print(min_dict)
    min_error = min(get_error)
    print(sorted(min_dict.items(), key=operator.itemgetter(0)))
    print(min_dict[min_error])
    print(c)
    print(np.sqrt(c / len(X)))
    return min_dict[min_error]
def assert_regression_result(results, tol): regression_results = [r for r in results if r["param"]["objective"] == "reg:linear"] for res in regression_results: X = scale(res["dataset"].X, with_mean=isinstance(res["dataset"].X, np.ndarray)) y = res["dataset"].y reg_alpha = res["param"]["alpha"] reg_lambda = res["param"]["lambda"] pred = res["bst"].predict(xgb.DMatrix(X)) weights = xgb_get_weights(res["bst"])[1:] enet = ElasticNet(alpha=reg_alpha + reg_lambda, l1_ratio=reg_alpha / (reg_alpha + reg_lambda)) enet.fit(X, y) enet_pred = enet.predict(X) assert np.isclose(weights, enet.coef_, rtol=tol, atol=tol).all(), (weights, enet.coef_) assert np.isclose(enet_pred, pred, rtol=tol, atol=tol).all(), ( res["dataset"].name, enet_pred[:5], pred[:5])
def Lasso():
    # the local imports below shadow this function's name inside its own body,
    # which is intentional here
    from sklearn.linear_model import Lasso
    from sklearn.metrics import r2_score
    alpha = 0.1
    lasso = Lasso(alpha=alpha)
    trainDat = shortData
    trainLab = shortLabels
    lassoPred = lasso.fit(trainDat, trainLab)
    labPredict = lassoPred.predict(testDat)
    r2val = r2_score(testLab, labPredict)
    print(lasso)
    print("r^2 for lasso testing is: ", r2val)
    from sklearn.linear_model import ElasticNet
    enet = ElasticNet(alpha=alpha, l1_ratio=0.7)
    enetPred = enet.fit(trainDat, trainLab)
    labPredict_enet = enet.predict(testDat)
    r2val_enet = r2_score(testLab, labPredict_enet)
    print(enet)
    print("r^2 for enet testing is: ", r2val_enet)
# Bayesian Ridge Regression
print('bayesian ridge')
br = BayesianRidge(compute_score=True)
#br.fit(x[:, np.newaxis], y)
#br_sts_scores = br.predict(xt[:, np.newaxis])
br.fit(x, y)
br_sts_scores = br.predict(xt)

# Elastic Net
print('elastic net')
enr = ElasticNet()
#enr.fit(x[:, np.newaxis], y)
#enr_sts_scores = enr.predict(xt[:, np.newaxis])
enr.fit(x, y)
enr_sts_scores = enr.predict(xt)

# Passive Aggressive Regression
print('passive aggressive')
par = PassiveAggressiveRegressor()
par.fit(x, y)
par_sts_scores = par.predict(xt)
#par.fit(x[:, np.newaxis], y)
#par_sts_scores = par.predict(xt[:, np.newaxis])

# RANSAC Regression
print('ransac')
ransac = RANSACRegressor()
#ransac.fit(x[:, np.newaxis], y)
#ransac_sts_scores = ransac.predict(xt[:, np.newaxis])
WBC = 7.13914
Urine = 3739.39

PatientX = np.array([[ICD9_code, Oxygen, PO2, Bicarbonate, Bilirubin, Sodium,
                      Urea_Nitrogen, Potassium, WBC, Urine, Age, Gender,
                      Admission_Type, Admissin_Location, Insurance, Religion,
                      Marital_Status, Ethnicity, Diagnosis]])

# In[55]:

# Prediction using the Elastic Net model
model_Elnet = ElasticNet(alpha=0.01, l1_ratio=0.1)
model_Elnet.fit(x_train_res, y_train_res)
y_patientX = model_Elnet.predict(PatientX)[0]  # take the scalar prediction
if y_patientX > 0.5:
    y_pred_px = "POSITIVE"
    prob = 100 * y_patientX
else:
    y_pred_px = "NEGATIVE"
    prob = 100 - 100 * y_patientX
print("The newly admitted patient has been classified %s for the death prediction, "
      "with a probability of %f %%" % (y_pred_px, prob))

# In[73]:

from sklearn.preprocessing import normalize
def Final_Stacking(Train_DS, y, Actual_DS, Sample_DS): print("***************Starting Final Stacking*************** at Time: %s" %(tm.strftime("%H:%M:%S"))) t0 = time() #Setting Standard scaler for data # stdScaler = StandardScaler() # stdScaler.fit(Train_DS,y) # Train_DS = stdScaler.transform(Train_DS) # Actual_DS = stdScaler.transform(Actual_DS) #CV: 0.36225ff # clf = RandomForestRegressor(n_estimators=100,min_samples_leaf=18,max_features=None,bootstrap=True, # min_samples_split=23,max_depth=25) # clf=Lasso(alpha=0.02) # print("lasso CV") # Nfold_score = Nfold_Cross_Valid(Train_DS, y, clf) #clf = xgb.XGBRegressor(n_estimators=2000,max_depth=1,learning_rate=0.01,nthread=4,min_child_weight=7,subsample=1,colsample_bytree=0.7,silent=True,gamma=0.8) #cv:0.3872 #clf = ExtraTreesRegressor(n_estimators=1000,min_samples_leaf=19,max_features=12,bootstrap=True,min_samples_split=1,max_depth=25,n_jobs=-1) ################################################################################################################################################# #CV: # clf = RandomForestRegressor(n_estimators=1000,n_jobs=-1) # #Nfold_score = Nfold_Cross_Valid(Train_DS, y, clf) # clf.fit(Train_DS, y) # Pred_Actual = clf.predict(Actual_DS).reshape(-1,1) # preds_RFR = pd.DataFrame(Pred_Actual,columns=Sample_DS.columns[1:]).reset_index().sort(columns='Hazard',ascending= True).reset_index(drop=True).reset_index().sort(columns='index',ascending= True).reset_index(drop=True) # preds_RFR = preds_RFR.drop(['Hazard','index'], axis = 1) # # preds = pd.DataFrame(np.array(preds_RFR), index=Sample_DS.Id.values, columns=Sample_DS.columns[1:]) # preds.to_csv(file_path+'output/Submission_Stacking_RFR_1.csv', index_label='Id') # print("RFR Actual Model predicted") # print(preds_RFR.head(10)) # # sys.exit(0) ################################################################################################################################################ #CV:0.39009654989438813 # clf = xgb.XGBRegressor(n_estimators=2000,max_depth=2,learning_rate=0.01,nthread=4,min_child_weight=23,subsample=0.9,colsample_bytree=0.2,silent=True,gamma=0.9) # #clf = xgb.XGBRegressor(n_estimators=1000) # #Nfold_score = Nfold_Cross_Valid(Train_DS, y, clf) # clf.fit(Train_DS, y) # Pred_Actual = clf.predict(Actual_DS).reshape(-1,1) # preds_XGB = pd.DataFrame(Pred_Actual,columns=Sample_DS.columns[1:]).reset_index().sort(columns='Hazard',ascending= True).reset_index(drop=True).reset_index().sort(columns='index',ascending= True).reset_index(drop=True) # preds_XGB = preds_XGB.drop(['Hazard','index'], axis = 1) # # preds = pd.DataFrame(np.array(preds_XGB), index=Sample_DS.Id.values, columns=Sample_DS.columns[1:]) # preds.to_csv(file_path+'output/Submission_Stacking_XGB_1.csv', index_label='Id') # print("XGB Actual Model predicted") # print(preds_XGB.head(10)) ################################################################################################################################################# #CV: 0.3879 , LB:0.382419 #clf = ElasticNet(alpha=0.1, l1_ratio=0.3) #CV:0.3902-.3905 , 0.385 clf = ElasticNet(alpha=0.1, l1_ratio=0.1) clf = BayesianRidge(n_iter=300) Nfold_score = Nfold_Cross_Valid(Train_DS, y, clf) clf.fit(Train_DS, y) Pred_Actual = clf.predict(Actual_DS).reshape(-1,1) preds_ELN = pd.DataFrame(Pred_Actual,columns=Sample_DS.columns[1:]).reset_index().sort(columns='Hazard',ascending= True).reset_index(drop=True).reset_index().sort(columns='index',ascending= True).reset_index(drop=True) preds_ELN = preds_ELN.drop(['Hazard','index'], axis = 1) preds = 
pd.DataFrame(np.array(preds_ELN), index=Sample_DS.Id.values, columns=Sample_DS.columns[1:]) preds.to_csv(file_path+'output/Submission_Stacking_ELN_1.csv', index_label='Id') print("ELN Actual Model predicted") print(preds_ELN.head(10)) sys.exit(0) ####################################################################################################################################### Pred_Actual = NN1_Regressor(Train_DS, y, Actual_DS, grid= False) preds_NNT = pd.DataFrame(Pred_Actual,columns=Sample_DS.columns[1:]).reset_index().sort(columns='Hazard',ascending= True).reset_index(drop=True).reset_index().sort(columns='index',ascending= True).reset_index(drop=True) preds_NNT = preds_NNT.drop(['Hazard','index'], axis = 1) preds = pd.DataFrame(np.array(preds_NNT), index=Sample_DS.Id.values, columns=Sample_DS.columns[1:]) preds.to_csv(file_path+'output/Submission_Stacking_NNT_1.csv', index_label='Id') print("NNT Actual Model predicted") print(preds_NNT.head(10)) pred_Actual = (preds_XGB['level_0'] + preds_ELN['level_0'] + preds_NNT['level_0'])/3 #pred_Actual = np.power((Pred_Actual * Pred_Actual1 * Pred_Actual2), (1/3.0)) #Get the predictions for actual data set preds = pd.DataFrame(pred_Actual, index=Sample_DS.Id.values, columns=Sample_DS.columns[1:]) preds.to_csv(file_path+'output/Submission_Stacking_1.csv', index_label='Id') ######################################################################################################################################## print("***************Ending Final Stacking*************** at Time: %s" %(tm.strftime("%H:%M:%S"))) return pred_Actual
def get_en_prediction(train_data, train_truth, test_data, test_truth,
                      alpha=1.0, l1_ratio=0.5, iter_id=0):
    clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    clf.fit(train_data, train_truth)
    predicted = clf.predict(test_data)
    return predicted.ravel()
#%% #now for the estimation #split into test & train set train_X,test_X,train_Y,test_Y = train_test_split(lagged_stimuli,Y,test_size=0.3) #%% #try elastic net CV #enet_model = ElasticNetCV([.1,.3,.7,.9,.99],cv=3,n_jobs=-1) #enet_model.train(train_X,train_Y) #pred_Y = enet_model.predict(train_X) #for now enet = ElasticNet(l1_ratio=0.7) enet.fit(train_X,train_Y) pred_Y = enet.predict(train_X) #%% #non-linearity first by CV NN parameters_NN = { 'n_neighbors' : [5,10,20,40]} NN_nonl = KNeighborsRegressor() gs_NN = grid_search.RandomizedSearchCV(NN_nonl,parameters_NN,verbose=1) #%% #try Radius Neighbors Regr #parameters_radius = { 'weights' : ('uniform','distance') , 'radius' : [0.5,1.0,3.0,5.0,10.0,20.0]} #RN_nonl = RadiusNeighborsRegressor() #gs_RN = grid_search.RandomizedSearchCV(RN_nonl,parameters_radius,verbose=1) #%%
# It is made available under the MIT License import numpy as np from sklearn.datasets import load_svmlight_file from sklearn.cross_validation import KFold from sklearn.linear_model import ElasticNet from sklearn.metrics import mean_squared_error, r2_score data, target = load_svmlight_file('data/E2006.train') # Edit the lines below if you want to switch method: # met = LinearRegression(fit_intercept=True) met = ElasticNet(fit_intercept=True, alpha=.1) kf = KFold(len(target), n_folds=5) pred = np.zeros_like(target) for train, test in kf: met.fit(data[train], target[train]) pred[test] = met.predict(data[test]) print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) print('') met.fit(data, target) pred = met.predict(data) print('[EN 0.1] RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) print('[EN 0.1] R2 on training, {:.2}'.format(r2_score(target, pred)))
mean = X_train.mean(axis=0) X_train = (X_train - mean) / std X_test = (X_test - mean) / std std = y_train.std(axis=0) mean = y_train.mean(axis=0) y_train = (y_train - mean) / std y_test = (y_test - mean) / std gc.collect() print("- benchmarking ElasticNet") clf = ElasticNet(alpha=alpha, l1_ratio=0.5, fit_intercept=False) tstart = time() clf.fit(X_train, y_train) elnet_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test) elnet_results[i, j, 1] = time() - tstart gc.collect() print("- benchmarking SGD") clf = SGDRegressor(alpha=alpha / n_train, fit_intercept=False, max_iter=max_iter, learning_rate="invscaling", eta0=.01, power_t=0.25, tol=1e-3) tstart = time() clf.fit(X_train, y_train) sgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test),
class ElasticNet(Model): # X represents the features, Y represents the labels X = None Y = None prediction = None model = None def __init__(self): pass def __init__(self, X=None, Y=None, label_headers=None, l1_ratio=1, type='regressor', cfg=False): if X is not None: self.X = X if Y is not None: self.Y = Y self.type = type self.cfg = cfg self.mapping_dict = None self.label_headers = label_headers self.model = ElasticNetModel(l1_ratio=l1_ratio) def fit(self, X=None, Y=None): if X is not None: self.X = X if Y is not None: self.Y = Y if self.type == 'classifier': self.Y = self.map_str_to_number(self.Y) print('ElasticNet Train started............') self.model.fit(self.X, self.Y) print('ElasticNet completed..........') return self.model def predict(self, test_features): print('Prediction started............') self.predictions = self.model.predict(test_features) if self.type == 'classifier': predictions = predictions.round() print('Prediction completed..........') return self.predictions def save(self): if self.cfg: f = open('elasticnet_configs.txt', 'w') f.write(json.dumps(self.model.get_params())) f.close() print('No models will be saved for elasticnet') def featureImportance(self): return self.model.coef_ def map_str_to_number(self, Y): mapping_flag = False if self.mapping_dict is not None: for label_header in self.label_headers: Y[label_header] = Y[label_header].map(self.mapping_dict) return Y mapping_dict = None for label_header in self.label_headers: check_list = pd.Series(Y[label_header]) for item in check_list: if type(item) == str: mapping_flag = True break if mapping_flag: classes = Y[label_header].unique() mapping_dict = {} index = 0 for c in classes: mapping_dict[c] = index index += 1 Y[label_header] = Y[label_header].map(mapping_dict) mapping_flag = False self.mapping_dict = mapping_dict return Y def map_number_to_str(self, Y, classes): Y = Y.round() Y = Y.astype(int) if self.mapping_dict is not None: mapping_dict = self.mapping_dict else: mapping_dict = {} index = 0 for c in classes: mapping_dict[index] = c index += 1 inv_map = {v: k for k, v in mapping_dict.items()} return Y.map(inv_map) def getAccuracy(self, test_labels, predictions, origin=0, hitmissr=0.8): if self.type == 'classifier': correct = 0 df = pd.DataFrame(data=predictions.flatten()) test_labels = self.map_str_to_number(test_labels.copy()) for i in range(len(df)): if (df.values[i] == test_labels.values[i]): correct = correct + 1 else: correct = 0 df = pd.DataFrame(data=predictions.flatten()) for i in range(len(df)): if 1 - abs(df.values[i] - test_labels.values[i])/abs(df.values[i]) >= hitmissr: correct = correct + 1 return float(correct)/len(df) def getConfusionMatrix(self, test_labels, predictions, label_headers): df = pd.DataFrame(data=predictions.flatten()) if self.type == 'classifier': index = 0 for label_header in label_headers: classes = test_labels[label_header].unique() df_tmp = self.map_number_to_str(df.ix[:,index], classes) title = 'Normalized confusion matrix for NeuralNetwork (' + label_header + ')' self.plot_confusion_matrix(test_labels.ix[:,index], df_tmp, classes=classes, normalize=True, title=title) index = index + 1 else: return 'No Confusion Matrix for Regression' def getROC(self, test_labels, predictions, label_headers): predictions=pd.DataFrame(data=predictions.flatten()) predictions.columns=test_labels.columns.values if self.type == 'classifier': test_labels = self.map_str_to_number(test_labels) fpr, tpr, _ = roc_curve(test_labels, predictions) plt.figure(1) plt.plot([0, 1], [0, 1], 'k--') 
plt.plot(fpr, tpr) plt.xlabel('False positive rate') plt.ylabel('True positive rate') plt.title('ROC curve') plt.show() else: return 'No Confusion Matrix for Regression' def getRSquare(self, test_labels, predictions, mode='single'): df = pd.DataFrame(data=predictions.flatten()) if self.type == 'regressor': if mode == 'multiple': errors = r2_score(test_labels, df, multioutput='variance_weighted') else: errors = r2_score(test_labels, df) return errors else: return 'No RSquare for Classification' def getMSE(self, test_labels, predictions): df = pd.DataFrame(data=predictions.flatten()) if self.type == 'regressor': errors = mean_squared_error(test_labels, df) return errors else: return 'No MSE for Classification' def getMAPE(self, test_labels, predictions): df = pd.DataFrame(data=predictions.flatten()) if self.type == 'regressor': errors = np.mean(np.abs((test_labels - df.values) / test_labels)) * 100 return errors.values[0] else: return 'No MAPE for Classification' def getRMSE(self, test_labels, predictions): df = pd.DataFrame(data=predictions.flatten()) if self.type == 'regressor': errors = sqrt(mean_squared_error(test_labels, df)) return errors else: return 'No RMSE for Classification'
plt.ylabel('Residuals') plt.legend(loc='upper left') plt.hlines(y=0, xmin=-10, xmax=50, color='black', lw=2) plt.show() print('At alpha = 0.1, MSE train: %.3f, test: %.3f' % (mean_squared_error( y_train, y_train_pred), mean_squared_error(y_test, y_test_pred))) print('At alpha = 0.1, R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred))) ## alpha = 0.1 gives the best performing model ## Elastic Net Model from sklearn.linear_model import ElasticNet elanet1 = ElasticNet(alpha=1.0, l1_ratio=0.5) elanet1.fit(X_train, y_train) y_train_pred = elanet1.predict(X_train) y_test_pred = elanet1.predict(X_test) plt.scatter(y_train_pred, y_train_pred - y_train, c='green', marker='o', edgecolor='white', label='Training data') plt.scatter(y_test_pred, y_test_pred - y_test, c='red', marker='s', edgecolor='white', label='Test data') plt.title('Elastic Net Model with alpha = 1') plt.xlabel('Predicted values')
from sklearn.metrics import r2_score from sklearn.metrics import mean_squared_error print('MSE train: %.3f, test: %.3f' % (mean_squared_error( y_train, y_train_pred), mean_squared_error(y_test, y_test_pred))) print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred))) #ElasticNet from sklearn.linear_model import ElasticNet for n in floatrange(0.1, 0.5, 5): elanet = ElasticNet(alpha=n, l1_ratio=0.5) X = df.iloc[:, :-1].values y = df['MEDV'].values elanet.fit(X_train, y_train) y_train_pred = elanet.predict(X_train) y_test_pred = elanet.predict(X_test) #print('MSE train: %.3f, test: %.3f' % ( # mean_squared_error(y_train, y_train_pred), # mean_squared_error(y_test, y_test_pred))) #print('R^2 train: %.3f, test: %.3f' % ( # r2_score(y_train, y_train_pred), # r2_score(y_test, y_test_pred))) ary = np.array(range(100000)) np.linalg.norm(ary) sp.linalg.norm(ary) np.sqrt(np.sum(ary**2)) plt.scatter(y_train_pred, y_train_pred - y_train, c='steelblue', marker='o',
# Define the cross-validation scoring function
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X_train, y,
                                    scoring="neg_mean_squared_error", cv=5))
    return (rmse)

# Lasso
clf1 = LassoCV(alphas=[1, 0.1, 0.001, 0.0005, 0.0003, 0.0002, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))  # expm1(x) == exp(x) - 1 inverts log1p(x) == log(1 + x)
score1 = rmse_cv(clf1)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score1.mean(), score1.std()))

# ElasticNet
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))
score2 = rmse_cv(clf2)
print("\nElasticNet score: {:.4f} ({:.4f})\n".format(score2.mean(), score2.std()))

# print(lasso_preds)
# print(elas_preds)
# Id_list=[i for i in range(1461,2920)]
# print(len(Id_list))
# price_list=[]
# for i in range(0,1459):
#     new_list=[]
#     new_list=[Id_list[i],lasso_preds[i]]
#     price_list.append(new_list)
# print(price_list)
predictions_ridge = ridge.predict(test_features) rmse_ridge = sqrt(mean_squared_error(predictions_ridge, test_labels)) print("RMSE:", round(rmse_ridge, 2)) # In[23]: EN = ElasticNet(alpha=0.01, max_iter=10000, normalize=True, l1_ratio=0.8) EN.fit(train_features, train_labels) train_EN = EN.score(train_features, train_labels) test_EN = EN.score(test_features, test_labels) coeff_used = np.sum(EN.coef_ != 0) print("number of features used:", coeff_used) predictions_EN = EN.predict(test_features) rmse_EN = sqrt(mean_squared_error(predictions_EN, test_labels)) print("RMSE:", round(rmse_EN, 2)) # In[24]: df = pd.read_csv("data_clean.csv") del df["Unnamed: 0"] df = df[[ "gross_square_feet", "block", "land_square_feet", "lot", "age_of_building", "borough", "residential_units", "commercial_units", "total_units", "sale_price" ]] df['borough'] = df['borough'].astype('category')
from sklearn.linear_model import Lasso from sklearn.linear_model import Ridge from sklearn.linear_model import ElasticNet L = Lasso() R = Ridge() EN = ElasticNet() L.fit(X_train, y_train) R.fit(X_train, y_train) EN.fit(X_train, y_train) y_predL = L.predict(X_test) y_predR = R.predict(X_test) y_predEN = EN.predict(X_test) # Metrics Report from sklearn.metrics import mean_absolute_error as mae from sklearn.metrics import mean_squared_error as mse print("Lasso Mean Absolute Error: ", mae(y_test, y_predL)) print("Lasso Mean Squared Error: ", mse(y_test, y_predL)) print("Lasso Root Mean Squared Error: ", np.sqrt(mse(y_test, y_predL))) # Lasso Model Checking if np.sqrt(mse(y_test, y_predL)) < (0.1 * y_mean): print("ALgo works properly") else: print("Model needs some changes") from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import KFold  # sklearn.cross_validation has been removed
from sklearn.linear_model import LinearRegression, ElasticNet
import numpy as np
from sklearn.datasets import load_boston

boston = load_boston()
x = np.array([np.concatenate((v, [1])) for v in boston.data])
y = boston.target

FIT_EN = False
if FIT_EN:
    model = ElasticNet(fit_intercept=True, alpha=0.5)
else:
    model = LinearRegression(fit_intercept=True)

model.fit(x, y)
# predict on the whole matrix; predicting row by row on 1-D samples is not supported
p = model.predict(x)
e = p - y
total_error = np.dot(e, e)
rmse_train = np.sqrt(total_error / len(p))

kf = KFold(n_splits=10)
err = 0
for train, test in kf.split(x):
    model.fit(x[train], y[train])
    p = model.predict(x[test])
    e = p - y[test]
    err += np.dot(e, e)
rmse_10cv = np.sqrt(err / len(x))

print('RMSE on training: {}'.format(rmse_train))
print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
# print(X_train.shape) X_train = X_train.reshape((X_train.shape[0], 1)) # print(X_train.shape) # print(X_test.shape) X_test = X_test.reshape(X_test.shape[0], 1) # print(X_test.shape) regr = ElasticNet(random_state=0) regr.fit(X_train, y_train) # print(regr.score(X_train, y_train)) # print(regr.coef_) # print(regr.intercept_) y_predicted = regr.predict(X_test) print('y_test: ') print(y_test) print('y_predicted: ') print(y_predicted) predictions = list() for i in range(len(test_scaled)): X, y = test_scaled[i, 0:-1], test_scaled[i, -1] yhat = y_predicted[i] # print("Y_test: " + str(y) + " Yhat: " + str(yhat)) yhat = data_misc.invert_scale(scaler, X, yhat) # print("yhat no scaled:" + str(yhat))
lasso = Lasso(alpha=5) ridge = Ridge(alpha=3) lr = LinearRegression() dtr = DecisionTreeRegressor(max_depth=17) bagger = BaggingRegressor(net, verbose = 1) X_train, X_test, y_train, y_test = train_test_split(X_model, y) dtr.fit(X_train,y_train) dtr.score(X_test, y_test) pred = dtr.predict(X_test) plt.scatter(y_test, (pred*0.8)-y_test) net.fit(X_train, y_train) net.score(X_test, y_test) preds = net.predict(X_test) plt.scatter(y_test, (preds) - y_test, alpha = 0.7) scores = cross_val_score(net, scale(X_model), y, cv=12) scores.mean() X2 = pivoted[['compilation_0', 'compilation_1', 'compilation_2']] y2 = pivoted.compilation_3 X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2) lr.fit(X_train, y_train) lr.score(X_test, y_test) pivoted.head() mapped_pivot = pd.read_csv('pivot_catcherr.csv')
train[val] = train_tmp test[val] = test_tmp error_dict = {} elastic_models = {} finalError = [] for val in players: errors2 = [] ENreg = ElasticNet(alpha=0.5, l1_ratio=0.5, normalize=False) ENreg.fit(train[val]['DEFENSIVE_RATING'].values.reshape(-1, 1), train[val]['POINTS']) elastic_models[val] = ENreg prediction = ENreg.predict( test[val]['DEFENSIVE_RATING'].values.reshape(-1, 1)) prediction = np.round(prediction, 0) val_prediction = prediction val_actual = test[val]['POINTS'].values cnt = 0 compare = {} for val1 in prediction: compare[val1] = test[val]['POINTS'].values[cnt] cnt += 1 for i in range(len(prediction)): errors2.append(abs(prediction[i] - test[val]['POINTS'].values[i])) error_dict[val] = abs(prediction[i] - test[val]['POINTS'].values[i])
def elastic_net_regression(test, train, seed, alpha, n):
    model = ElasticNet(alpha=alpha, random_state=seed).fit(train.iloc[:, :n],
                                                           train['loss'])
    predicted = model.predict(test.iloc[:, :n])
    result = mean_absolute_error(test['loss'], predicted)
    print("Elastic Net: " + str(round(result, 2)))
# create an index object of our column names to plot our feature importance with xCols = xTrain.columns # now let's plot our coefficients plt.style.use('ggplot') plt.plot(range(len(xCols)), elasticNetCoef) plt.xticks(range(len(xCols)), xCols.values) plt.title( "Feature importance of independent variables for\nElasticNet model (coefficient values)" ) plt.margins(0.02) plt.show() # now let's do our 2020 predictions of CPI based on this model predict2020CPI = model.predict(x2020Forward) # reset our myData2020Forward index to zero so we can attach these predicted CPIs myData2020Forward.reset_index(drop=True, inplace=True) # now let's add this back into our 2020 dataframe myData2020Forward = pd.concat([ myData2020Forward, pd.DataFrame(predict2020CPI, columns=['predicted_CPI']) ], axis=1) # next we will change the dates from ordinal back to dates so we can union them back together myData2020Forward['Date'] = myData2020Forward.iloc[:, 0].astype(int).map( dt.date.fromordinal) myDataBefore2020['Date'] = myDataBefore2020.iloc[:, 0].astype(int).map(
pred2 = nn_reg.predict(testx) nn_reg.score(testx, testY) #Lasso lasso = Lasso(alpha=1) lasso.fit(trainx, trainy[y[i]]) pred3 = lasso.predict(testx) lasso.score(testx, testY) #Ridge ridge = Ridge(alpha=1.0) ridge.fit(trainx, trainy[y[i]]) pred4 = ridge.predict(testx) ridge.score(testx, testY) #ElasticNet elsnet = ElasticNet(alpha=1, l1_ratio=0.7) elsnet.fit(trainx, trainy[y[i]]) pred5 = elsnet.predict(testx) elsnet.score(testx, testY) #plot among them xx = np.linspace(0, max(testY), 100) fig = plt.figure(figsize=(10, 8)) ax = plt.gca() plt.scatter(testY, pred, label='Linear regression') plt.scatter(testY, pred2, label='$k$-NN') plt.scatter(testY, pred3, label='Lasso') plt.scatter(testY, pred4, label='Ridge') plt.scatter(testY, pred5, label='ElasticNet') plt.plot(xx, xx) plt.ylabel('Estimation', fontsize=16) plt.xlabel('True output', fontsize=16) plt.legend(fontsize=14)
X = df.iloc[:, 0:13].values
# use the house price (MEDV) as the target variable
y = df["MEDV"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
POLY = PolynomialFeatures(degree=2, include_bias=False)
X_train_pol = POLY.fit_transform(X_train)
X_test_pol = POLY.transform(X_test)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train_pol)
X_test_std = sc.transform(X_test_pol)
# model = LinearRegression()
# model = Lasso(alpha=0.1)
model = ElasticNet(alpha=0.1, l1_ratio=0.6)
model.fit(X_train_std, y_train)
y_train_pred = model.predict(X_train_std)
y_test_pred = model.predict(X_test_std)
print("MSE train: {0}, test: {1}".format(
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred)))
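# The preprocessing-then-fit sequence above (polynomial expansion, standardisation,
# ElasticNet) can also be written as a single scikit-learn pipeline; a minimal
# sketch assuming the same X_train/X_test/y_train/y_test split as above:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    ElasticNet(alpha=0.1, l1_ratio=0.6),
)
pipe.fit(X_train, y_train)          # fits the transformers and the model together
print(pipe.score(X_test, y_test))   # R^2 on the held-out split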
train_file = train_file.drop(train_file.columns[0], axis=1)
train_file = train_file.values
train_X_temp = train_file[5:50000, :-1]
train_Y = train_file[6:50001, -1]
train_X = np.zeros((train_X_temp.shape[0], 8 * 5))
for i in range(train_X_temp.shape[0]):
    for j in range(5):
        for k in range(8):
            train_X[i][j * 8 + k] = train_X_temp[i - j][k]

test_file_name = dir_path + "test2.csv"
test_file = read_csv(test_file_name, skiprows=1, header=None)
test_file = test_file.values
test_X = np.array(test_file[:, :-1])
test_y = test_file[:, -1]

#best_l1_ratio, best_alpha = train_EN_model(train_X, train_Y, test_X)
#print("Best L1 ratio, Best alpha", best_l1_ratio, best_alpha)
#enet = ElasticNet(l1_ratio=best_l1_ratio, alpha=best_alpha)

start = time.time()
enet = ElasticNet()
enet.fit(train_X, train_Y)
#model = "ElasticNet int %.2f coefs %s" % (enet.intercept_, pprint(enet.coef_))
prediction = enet.predict(test_X)
mse = np.mean((prediction - test_y)**2)
print("MSE: ", mse)
# print(prediction)
print("Score: ", enet.score(test_X, test_y))
print("Time: ", (time.time() - start))
def train(in_alpha, in_l1_ratio):
    import os
    import warnings
    import sys

    import pandas as pd
    import numpy as np
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import ElasticNet

    import mlflow
    import mlflow.sklearn

    def eval_metrics(actual, pred):
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file (make sure you're running this from the root of MLflow!)
    # Assumes wine-quality.csv is located in the same folder as the notebook
    wine_path = "wine-quality.csv"
    data = pd.read_csv(wine_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    # Set default value if no alpha is provided
    # (test the argument itself; calling float(None) would raise a TypeError)
    if in_alpha is None:
        alpha = 0.5
    else:
        alpha = float(in_alpha)

    # Set default value if no l1_ratio is provided
    if in_l1_ratio is None:
        l1_ratio = 0.5
    else:
        l1_ratio = float(in_l1_ratio)

    #mlflow.set_tracking_uri("http://zak-tracking-server-svc-myproject.192.168.64.12.nip.io")

    # Useful for multiple runs (only doing one run in this sample notebook)
    with mlflow.start_run():
        # Execute ElasticNet
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        # Evaluate Metrics
        predicted_qualities = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out metrics
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(lr, "model")
# Train classifier clf = ElasticNet() train = pd.read_csv("train/subtrain.csv", chunksize = 100000, iterator = True) all_classes = np.array([0, 1]) for chunk in train: y_train = chunk["click"] chunk = chunk[cols] chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"])) chunk.drop(["hour"], axis=1, inplace = True) Xcat = fh.transform(np.asarray(chunk.astype(str))) clf.fit(Xcat, y_train) # Create a submission file usecols = cols + ["id"] X_test = pd.read_csv("test/mtest.csv", usecols=usecols) X_test = X_test.join(pd.DataFrame([dayhour(x) for x in X_test.hour], columns=["wd", "hr"])) X_test.drop(["hour"], axis=1, inplace = True) X_enc_test = fh.transform(np.asarray(X_test.astype(str))) y_act = pd.read_csv("test/mtest.csv", usecols=['click']) y_pred = clf.predict(X_enc_test) with open('logloss.txt','a') as f: f.write('\n'+str(log_loss(y_act, y_pred))) with open("submission/submission_elnet.csv", "w") as f: f.write("id,click\n") for idx, xid in enumerate(X_test.id): f.write(str(xid) + "," + "{0:.10f}".format(y_pred[idx]) + "\n") f.close()
df_tmp[list(range(30))] = df_tmp[list(range(30))].where(df_tmp[list(range(30))]>.1,0) topic_0 + topic_7 + topic_8 + topic_9 + topic_12 + topic_14 + topic_16 + topic_17+ topic_20 + topic_23 + topic_24 + topic_25 + topic_28 X = df[[str(x) for x in [0,7,8,9,12,14,16,17,20,23,24,25,28]]+["black_proportion","log_income","log_price","total_RE"]] y = np.where(df['white_proportion']>np.median(df['white_proportion']),1,0) y= df['income'] OLR = OLS(y,X).fit() OLR.summary() OLR.predict(exog=X) df_full_results.params.sort_values() df_results.params.sort_values() df_results.summary() EN = ElasticNet(alpha = .02, l1_ratio=.001) EN.fit(X,y) EN.score(X,y) EN.predict(X) LinR = LinearRegression() LinR.fit(X,y) LinR.score(X,y) RR = Ridge() RR.fit(X,y).score(X,y) pd.Series(RR.coef_) from sklearn.svm import SVR, SVC supportR = SVR() supportR.fit(X,y) supportC = SVC() supportC.fit(X,y) supportC.score(X,y)
def train_and_evaluate(config_path): config = read_params(config_path) test_data_path = config["split_data"]["test_path"] train_data_path = config["split_data"]["train_path"] random_state = config["base"]["random_state"] model_dir = config["model_dir"] alpha = config["estimators"]["ElasticNet"]["params"]["alpha"] l1_ratio = config["estimators"]["ElasticNet"]["params"]["l1_ratio"] target = [config["base"]["target_col"]] train = pd.read_csv(train_data_path, sep=",") test = pd.read_csv(test_data_path, sep=",") train_y = train[target] test_y = test[target] train_x = train.drop(target, axis=1) test_x = test.drop(target, axis=1) lr = ElasticNet( alpha=alpha, l1_ratio=l1_ratio, random_state=random_state) lr.fit(train_x, train_y) predicted_qualities = lr.predict(test_x) (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities) print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio)) print(" RMSE: %s" % rmse) print(" MAE: %s" % mae) print(" R2: %s" % r2) ##################################################### scores_file = config["reports"]["scores"] params_file = config["reports"]["params"] with open(scores_file, "w") as f: scores = { "rmse": rmse, "mae": mae, "r2": r2 } json.dump(scores, f, indent=4) with open(params_file, "w") as f: params = { "alpha": alpha, "l1_ratio": l1_ratio, } json.dump(params, f, indent=4) ##################################################### os.makedirs(model_dir, exist_ok=True) model_path = os.path.join(model_dir, "model.joblib") joblib.dump(lr, model_path)
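# train_and_evaluate above relies on read_params and eval_metrics, which are not
# shown in this snippet; the following is a hypothetical sketch of what they might
# look like, inferred from the config keys referenced above (assumptions, not the
# original implementations):
import yaml
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def read_params(config_path):
    """Load the YAML config; keys used above include split_data.{train_path,test_path},
    base.{random_state,target_col}, model_dir, estimators.ElasticNet.params.{alpha,l1_ratio},
    and reports.{scores,params}."""
    with open(config_path) as f:
        return yaml.safe_load(f)

def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2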
What about the optimisation method?
'''
rg = ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True, normalize=False,
                precompute=False, max_iter=1000, copy_X=True, tol=0.0001,
                warm_start=False, positive=False, random_state=None,
                selection='cyclic')
rg.fit(X_train, Y_train)
Y_pre = rg.predict(X_test)
rg.score(X_test, Y_test)
rg.coef_
rg.intercept_
'''
alpha          overall multiplier on the two penalty terms; 0 reduces to ordinary least squares
l1_ratio       mixing parameter in [0, 1]; values in between give the elastic net
fit_intercept  whether to fit an intercept
normalize      whether to normalize the inputs
precompute     whether to use a precomputed Gram matrix to speed things up
max_iter       maximum number of iterations
copy_X         whether to copy X (otherwise it may be overwritten)
tol            optimisation tolerance
warm_start     whether to reuse the solution of the previous call as the initialization
positive       force the coefficients to be positive?
random_state   seed for the random number generator
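# As a companion to the parameter notes above, a small sketch showing how l1_ratio
# moves ElasticNet between its two special cases; the data and values below are
# invented purely for illustration:
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=20, noise=5.0, random_state=0)
# l1_ratio=1.0 -> pure L1 penalty (Lasso-like), tends to zero out coefficients;
# l1_ratio near 0 -> mostly L2 penalty (ridge-like), shrinks without zeroing.
for ratio in (1.0, 0.5, 0.01):
    m = ElasticNet(alpha=0.5, l1_ratio=ratio, random_state=0).fit(X_demo, y_demo)
    print(ratio, int((m.coef_ == 0).sum()), "coefficients exactly zero")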
#Random Forest Regressor (good cv, good mape) reg =RR(n_estimators = 100) reg.fit(x_train, y_train) cv_score = cross_val_score(reg,x_train,y_train,cv = 10) cv_score.mean() y_pred = reg.predict(x_test) reg.score(x_test,y_test) forecast_accuracy(y_pred,y_test) #Elastic Net regressor (very bad cv, very good mape) regr = EN(random_state=0) regr.fit(x_train, y_train) cv_score = cross_val_score(regr,x_train,y_train,cv = 10) cv_score.mean() y_pred = regr.predict(x_test) regr.score(x_test,y_test) forecast_accuracy(y_pred,y_test) #K-neighbors Regressor ( bad cv, good mape) regr2 = knr(10) regr2.fit(x_train, y_train) cv_score = cross_val_score(regr2,x_train,y_train,cv = 10) cv_score.mean() y_pred = regr2.predict(x_test) regr2.score(x_test,y_test) forecast_accuracy(y_pred,y_test) #SVR (bad cv, very good mape) regr3 = SVR() regr3.fit(x_train, y_train)
train, test = train_test_split(data) # The predicted column is "quality" which is a scalar from [3, 9] train_x = train.drop(["quality"], axis=1) test_x = test.drop(["quality"], axis=1) train_y = train[["quality"]] test_y = test[["quality"]] alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5 l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5 with mlflow.start_run(): lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42) lr.fit(train_x, train_y) predicted_qualities = lr.predict(test_x) (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities) print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio)) print(" RMSE: %s" % rmse) print(" MAE: %s" % mae) print(" R2: %s" % r2) mlflow.log_param("alpha", alpha) mlflow.log_param("l1_ratio", l1_ratio) mlflow.log_metric("rmse", rmse) mlflow.log_metric("r2", r2) mlflow.log_metric("mae", mae) mlflow.sklearn.log_model(lr, "model")
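# The run above stores the fitted estimator with mlflow.sklearn.log_model; a hedged
# sketch of reloading it later for scoring. The run id below is a placeholder, not a
# value from the original snippet:
import mlflow.sklearn

model_uri = "runs:/<RUN_ID>/model"            # fill in the actual run id
loaded = mlflow.sklearn.load_model(model_uri)
print(loaded.predict(test_x[:5]))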
lasso.fit(x_train_all, y_train_all)

# training mse
y_lasso_train = lasso.predict(x_train_all)
lasso_train_mse = mean_squared_error(y_train_all, y_lasso_train)

# test mse
y_lasso_pred = lasso.predict(x_test)
lasso_test_mse = mean_squared_error(y_test, y_lasso_pred)

## elastic net
elastic = ElasticNet(alpha=1.0, l1_ratio=0.5)
elastic.fit(x_train_all, y_train_all)

# training mse
y_elastic_train = elastic.predict(x_train_all)
elastic_train_mse = mean_squared_error(y_train_all, y_elastic_train)

# test mse (score the elastic-net predictions, not the lasso ones)
y_elastic_pred = elastic.predict(x_test)
elastic_test_mse = mean_squared_error(y_test, y_elastic_pred)

#### perform cross validation to select model

## ridge regression with CV
ridgeCV = RidgeCV(alphas=[0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0])
ridgeCV.fit(x_train_all, y_train_all)
ridgeCV.alpha_

# training mse
y_ridgeCV_train = ridgeCV.predict(x_train_all)
ridgeCV_train_mse = mean_squared_error(y_train_all, y_ridgeCV_train)
print(X_test.shape)
y_train = df_train["Purchase"]
df_train = df_train.drop("Purchase", axis=1)

#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import f_regression
#sel = SelectKBest(f_regression, k=10)
#X_tr=pd.DataFrame(sel.fit_transform(X_train,y_train))
#X_tst=pd.DataFrame(sel.transform(X_test))
#print(X_tr.shape)
#print(X_tst.shape)

from sklearn.linear_model import ElasticNet
model = ElasticNet(alpha=0.001)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
#print(y_pred.shape)
#print(key1.shape)
#print(key2.shape)
out = pd.DataFrame()
out["User_ID"] = key1
out["Product_ID"] = key2
out["Purchase"] = y_pred
out.to_csv('outavb.csv', index=False)
def elastic_net(datapath): # load mat datafile = os.path.join(datapath, 'data_numpy.mat') if os.path.exists(datafile) is False: print('Data file %s not found.' % datafile) data_numpy = sio.loadmat(datafile) # get training and test data train_x_raw = data_numpy['trainX_raw']; train_x_smooth= data_numpy['trainX_smooth']; train_y = data_numpy['trainY']; test_x_raw = data_numpy['testX_raw']; test_x_smooth = data_numpy['testX_smooth']; test_y = data_numpy['testY']; base_y = data_numpy['baseY']; train_y = train_y.ravel() t_start = time.perf_counter() x_fft = np.fft.fft(train_x_raw) raw_fft_time = time.perf_counter() - t_start train_x_raw_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1) x_fft = np.fft.fft(test_x_raw) test_x_raw_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1) t_start = time.perf_counter() x_fft = np.fft.fft(train_x_smooth) smooth_fft_time = time.perf_counter() - t_start train_x_smooth_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1) x_fft = np.fft.fft(test_x_smooth) test_x_smooth_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1) enet_raw = ElasticNet(alpha=0.5, max_iter=50000, tol=0.3) t_start = time.perf_counter() enet_raw.fit(train_x_raw, train_y) elastic_net_raw_time = time.perf_counter() - t_start pred_y = enet_raw.predict(test_x_raw) np.savetxt(os.path.join(datapath, 'elastic_net_raw.txt'), pred_y) enet_raw_fft = ElasticNet(alpha=0.5, max_iter=50000, tol=0.3) t_start = time.perf_counter() enet_raw_fft.fit(train_x_raw_fft, train_y) elastic_net_raw_fft_time = time.perf_counter() - t_start pred_y = enet_raw_fft.predict(test_x_raw_fft) np.savetxt(os.path.join(datapath, 'elastic_net_raw_fft.txt'), pred_y) enet_smooth = ElasticNet(alpha=0.5, max_iter=50000, tol=0.3) t_start = time.perf_counter() enet_smooth.fit(train_x_smooth, train_y) elastic_net_smooth_time = time.perf_counter() - t_start pred_y = enet_smooth.predict(test_x_smooth) np.savetxt(os.path.join(datapath, 'elastic_net_smooth.txt'), pred_y) enet_smooth_fft = ElasticNet(alpha=0.5, max_iter=50000, tol=0.3) t_start = time.perf_counter() enet_smooth_fft.fit(train_x_smooth_fft, train_y) elastic_net_smooth_fft_time = time.perf_counter() - t_start pred_y = enet_smooth_fft.predict(test_x_smooth_fft) np.savetxt(os.path.join(datapath, 'elastic_net_smooth_fft.txt'), pred_y) f_time = open(os.path.join(datapath, 'elastic_net_time.txt'), 'w') f_time.write(str(raw_fft_time) + '\n') f_time.write(str(smooth_fft_time)+ '\n') f_time.write(str(elastic_net_raw_time)+ '\n') f_time.write(str(elastic_net_raw_fft_time)+ '\n') f_time.write(str(elastic_net_smooth_time)+ '\n') f_time.write(str(elastic_net_smooth_fft_time)+ '\n') f_time.close()
flag = False for i in range(20): X, y = 0, 0 if (flag): diff_values = data_misc.difference(raw_values, 1) supervised = data_misc.timeseries_to_supervised(diff_values, 1) supervised_values = supervised.values train, test = supervised_values[0:-1], supervised_values[-1:] test_scaled = scaler.transform(test) X, y = test_scaled[0, 0:-1], test_scaled[0, -1] else: flag = True X, y = test_scaled[i, 0:-1], test_scaled[i, -1] yhat = regr.predict([X]) print("Y_test: " + str(y) + " Yhat: " + str(yhat)) yhat = data_misc.invert_scale(scaler, X, yhat) # Se recorre -1 porque para que no se alinie donde empezó yhat = data_misc.inverse_difference(raw_values, yhat, -1 - i) # store forecast predictions.append(yhat) allList.append(yhat) # df = DataFrame(raw_values) # print(df[0][i]) # columns = [df[0][i] for i in range(0,df.size)] # columns.append(yhat) raw_values = allList
# ElasticNet Regression
import numpy as np
from sklearn import datasets
from sklearn.linear_model import ElasticNet

# load the diabetes datasets
dataset = datasets.load_diabetes()
# fit a model to the data
model = ElasticNet(alpha=0.1)
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted - expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))
def stacking(ext_total_df): cat_col = [15, 18] lin_col = [3, 9, 10, 11, 14, 17, 21, 22, 23, 24, 25] + list(range( 27, 81)) + list(range(82, 108)) # ここまで欠損値無し? nan_is_m1 = [ "home_team_match_point_prev3_mean", "home_team_match_point_prev5_mean", "away_team_match_point_prev3_mean", "away_team_match_point_prev5_mean" ] lin_total_df = ext_total_df.iloc[:, [0] + lin_col] lin_total_df = pd.concat([ lin_total_df, pd.get_dummies(ext_total_df.iloc[:, cat_col].astype("str"), drop_first=True) ], axis=1) lin_total_df.loc[:, nan_is_m1] = lin_total_df.loc[:, nan_is_m1].replace(-1, 1.2) dropcol = ["attendance"] all_train_X = lin_total_df.query("1994 <= match_date_year").drop(dropcol, axis=1) all_train_y = np.log1p( lin_total_df.query("1994 <= match_date_year")["attendance"]) all_train_y2 = lin_total_df.query( "1994 <= match_date_year")["attendance"] / lin_total_df.query( "1994 <= match_date_year")["capacity"] for year in tqdm(range(2002, 2017, 2)): window = 6 duration = 2 Z = all_train_X.match_date_year Z2 = all_train_X.division train = (year - duration - window < Z) & (Z <= year - duration) val = (year - duration < Z) & (Z <= year) train_X = all_train_X.loc[train, :] train_y = all_train_y[train] train_y2 = all_train_y2[train] val_X = all_train_X.loc[val, :] scl = StandardScaler() scl.fit(train_X.values.astype(np.float64)) train_scl = scl.transform(train_X.values.astype(np.float64)) val_scl = scl.transform(val_X.values.astype(np.float64)) elastic_net = ElasticNet(alpha=10**-2.7, l1_ratio=0.75, max_iter=10000, random_state=2434) elastic_net.fit(train_scl, train_y2) ext_total_df.loc[val_X.index, "elastic_net"] = elastic_net.predict(val_scl).clip( 0, 1) train = (2010 < Z) & (Z <= 2016) val = (2017 == Z) | ((Z == 2018) & ((all_train_X["section"] <= 17) | (33 <= all_train_X["section"]))) train_X = all_train_X.loc[train, :] train_y = all_train_y[train] train_y2 = all_train_y2[train] val_X = all_train_X.loc[val, :] scl = StandardScaler() scl.fit(train_X.values.astype(np.float64)) train_scl = scl.transform(train_X.values.astype(np.float64)) val_scl = scl.transform(val_X.values.astype(np.float64)) elastic_net = ElasticNet(alpha=10**-2.7, l1_ratio=0.75, random_state=2434) elastic_net.fit(train_scl, train_y2) ext_total_df.loc[val_X.index, "elastic_net"] = elastic_net.predict(val_scl).clip(0, 1) return ext_total_df
def elasticnet():
    # the deprecated `normalize` argument (default False) is omitted for newer scikit-learn
    regressor = ElasticNet(alpha=1, l1_ratio=0.5)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    return (regressor.score(X_test, y_test),
            sqrt(mean_squared_error(y_test, y_predictions)))
y_test_pred_ridge = ridge_regressor.predict(X_test) print('MSE train Ridge: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred_ridge), mean_squared_error(y_test, y_test_pred_ridge))) print('R^2 train Ridge: %.3f, test: %.3f' % (r2_score( y_train, y_train_pred_ridge), r2_score(y_test, y_test_pred_ridge))) lasso_regressor.fit(X_train, y_train) y_train_pred_lasso = lasso_regressor.predict(X_train) y_test_pred_lasso = lasso_regressor.predict(X_test) print('MSE train Lasso: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred_lasso), mean_squared_error(y_test, y_test_pred_lasso))) print('R^2 train Lasso: %.3f, test: %.3f' % (r2_score( y_train, y_train_pred_lasso), r2_score(y_test, y_test_pred_lasso))) elastic_regressor.fit(X_train, y_train) y_train_pred_elastic = elastic_regressor.predict(X_train) y_test_pred_elastic = elastic_regressor.predict(X_test) print('MSE train elastic: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred_elastic), mean_squared_error(y_test, y_test_pred_elastic))) print('R^2 train elastic: %.3f, test: %.3f' % (r2_score( y_train, y_train_pred_elastic), r2_score(y_test, y_test_pred_elastic))) # Now let's do a Random Forest Regression # In[15]: from sklearn.ensemble import RandomForestRegressor forest = RandomForestRegressor(n_estimators=1000, criterion='mse', random_state=1, n_jobs=-1) forest.fit(X_train, y_train) y_train_pred = forest.predict(X_train)
import numpy as np from sklearn.datasets import load_svmlight_file from sklearn.cross_validation import KFold from sklearn.linear_model import ElasticNet, LinearRegression data, target = load_svmlight_file('data/E2006.train') # Edit the lines below if you want to switch method: # met = LinearRegression(fit_intercept=True) met = ElasticNet(fit_intercept=True, alpha=.1) kf = KFold(len(target), n_folds=10) err = 0 for train, test in kf: met.fit(data[train], target[train]) p = met.predict(data[test]) p = np.array(p).ravel() e = p - target[test] err += np.dot(e, e) rmse_10cv = np.sqrt(err / len(target)) met.fit(data, target) p = met.predict(data) p = p.ravel() e = p - target total_error = np.dot(e, e) rmse_train = np.sqrt(total_error / len(p))
gs_cv_lasso.best_params_ lasso_tuned = Lasso(**gs_cv_lasso.best_params_).fit(X_train, y_train) y_pred = lasso_tuned.predict(X_test) np.sqrt(mean_squared_error(y_test, y_pred)) pd.Series(lasso_tuned.coef_, index=X_train.columns) # ElasticNet REGRESSION df = load_advertising() X = df.drop('sales', axis=1) y = df[["sales"]] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=46) enet_model = ElasticNet().fit(X_train, y_train) y_pred = enet_model.predict(X_test) np.sqrt(mean_squared_error(y_test, y_pred)) # MODEL TUNING WITH GRIDSEARCHCV enet_params = { "l1_ratio": [0.1, 0.4, 0.5, 0.6, 0.8, 1], "alpha": [0.1, 0.01, 0.001, 0.2, 0.3, 0.5, 0.8, 0.9, 1] } enet_model = ElasticNet() gs_cv_enet = GridSearchCV(enet_model, enet_params, cv=10).fit(X_train, y_train) gs_cv_enet.best_params_ enet_tuned = ElasticNet(**gs_cv_enet.best_params_).fit(X_train, y_train)
print("Took {:0.4g}s to compute training Fastfood expansion.".format(elapsed_ff_kern_train)) st = time() PHI_valid, _ = FastfoodForKernel(validationData, para, sgm) elapsed_ff_kern_valid = time() - st print("Took {:0.4g}s to compute validation Fastfood expansion.".format(elapsed_ff_kern_valid)) # Train elastic net on projected training data en = ElasticNet() st = time() en.fit(PHI_train.T, trainLabels) elapsed_en_fit = time() - st print("Took {:0.4g}s to fit elastic net on projected training data.".format(elapsed_en_fit)) # Predict labels for projected validation data st = time() y_pred = en.predict(PHI_valid.T) elapsed_en_pred = time() - st print("Took {:0.4g}s to predict on projected validation data.".format(elapsed_en_pred)) # Report performance mse_proj = metrics.mean_squared_error(validationLabels, y_pred) print("For projected data, MSE = {:0.4g}.".format(mse_proj)) # Train elastic net on original training data en = ElasticNet() st = time() en.fit(trainData.T, trainLabels) elapsed_en_fit = time() - st print("Took {:0.4g}s to fit elastic net on original training data.".format(elapsed_en_fit)) # Predict labels for original validation data
elasticnet = ElasticNet()
elasticnet.fit(X_train, y_train)

# In[1385]:

elasticnet_score = elasticnet.score(X_test, y_test)
elasticnet_score

# In[1420]:

elasticnet_score = elasticnet.score(X_test, y_test)
elasticnet_score

# In[1386]:

elasticnet_pred = elasticnet.predict(X_test)

# In[1422]:

# The root mean squared error
print("Root mean squared error: %.2f" % sqrt(mean_squared_error(y_test, elasticnet_pred)))
# The mean absolute error
print("Mean absolute error: %.2f" % mean_absolute_error(y_test, elasticnet_pred))
# R-squared: 1 is perfect prediction
print('R-squared: %.2f' % r2_score(y_test, elasticnet_pred))

# In[1416]:

#Evaluate Models
print(ridge_scores.mean())
print(ridge_scores)

# combination of ridge and Lasso
print("Elastic net regularization")
for alpha in range(1, 5):
    elastic_net = ElasticNet(alpha)
    elastic_net_scores = cross_val_score(elastic_net, x, y, cv=5)
    print("alpha={a}".format(a=alpha))
    print(elastic_net_scores.mean())
    print(elastic_net_scores)

# best performing regressor for this data set was Elastic net with alpha=1
# with score = 0.472705248975
# draw scatter plot for values predicted with this regressor
print("Showing scatter plot for elastic net with alpha = 1")
elastic_net = ElasticNet(1)
elastic_net.fit(x, y)
predicted_y = elastic_net.predict(x)
fig = plt.figure()
plt.scatter(y, predicted_y, alpha=0.3)
fig.suptitle('Boston real estate pricing', fontsize=20)
plt.figtext(.5, .9, 'Elastic net regularization, alpha=1', fontsize=15, ha='center')
plt.xlabel('Actual value, $1000s', fontsize=18)
plt.ylabel('Predicted value, $1000s', fontsize=18)
plt.show()
sc_x = StandardScaler() sc_y = StandardScaler() X_std = sc_x.fit_transform(X) y_std = np.ravel(sc_y.fit_transform(y.reshape(-1, 1))) X_train, X_test, y_train, y_test = train_test_split( X_std, y_std, test_size=0.3, random_state=0) # train and test from sklearn.linear_model import ElasticNet from sklearn.metrics import mean_squared_error alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000] train_errors = [] test_errors = [] for alpha in alphas: model = ElasticNet(alpha=alpha, l1_ratio=0.5) model.fit(X_train, y_train) y_train_pred = model.predict(X_train) y_test_pred = model.predict(X_test) train_errors.append(mean_squared_error(y_train, y_train_pred)) test_errors.append(mean_squared_error(y_test, y_test_pred)) print(train_errors) print(test_errors)
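# Optional follow-up: the train/test errors collected above can be plotted against
# alpha on a log axis to visualise over- and under-regularisation (matplotlib is
# assumed to be available; this plot is not part of the original snippet):
import matplotlib.pyplot as plt

plt.semilogx(alphas, train_errors, marker='o', label='train MSE')
plt.semilogx(alphas, test_errors, marker='o', label='test MSE')
plt.xlabel('alpha')
plt.ylabel('mean squared error')
plt.legend()
plt.show()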
class linReg: def __init__(self, in_df): df = self.__imputeVals(in_df.copy()) self.X = df.drop(columns=["SalePrice"]).copy() self.y = np.log(df.SalePrice.values.reshape(-1, 1)) self._gridSearch = None self.pipeline_X = self.__make_pipe() self.pipeline_y = StandardScaler() self._searchSpace = None self._params = None self.lm = ElasticNet() def __imputeVals(self, in_df): return imputeVals(in_df) def __make_pipe(self): nonePipeline = make_pipeline( SimpleImputer(strategy="constant", fill_value="None"), OneHotEncoder(drop="first")) zeroPipeline = make_pipeline( SimpleImputer(strategy="constant", fill_value=0), OneHotEncoder(drop="first", categories="auto")) scalePipeline = make_pipeline( SimpleImputer(strategy="constant", fill_value=0), StandardScaler()) regressionPipeline = ColumnTransformer( [("setNone", nonePipeline, fillNone), ("setZero", zeroPipeline, fillZeroCat), ("transformed", scalePipeline, fillZeroCont), ("dictImputed", make_pipeline(dictImputer(imputeDict), OneHotEncoder( drop="first")), list(imputeDict.keys())), ("bool", "passthrough", imputeBool), ("categoricalInts", "passthrough", cat_to_int), ("dropped", "drop", dropList)], remainder="drop") return regressionPipeline def gridSearch(self, params, cv=5, njobs=-1, verbose=50): self._searchSpace = params #self._params = None piped_X = self.pipeline_X.fit_transform(self.X) piped_y = self.pipeline_y.fit_transform(self.y) self._gridSearch = GridSearchCV(self.lm, params, cv=cv, scoring="neg_mean_squared_error", n_jobs=njobs, verbose=verbose) self._gridSearch.fit(piped_X, piped_y) def getBestParams(self): if self._gridSearch is not None: return self._gridSearch.best_params_ else: raise ValueError() def getBestScore(self): if self._gridSearch is not None: return self._gridSearch.best_score_ else: raise ValueError() def fitModel(self, params): piped_X = self.pipeline_X.fit_transform(self.X) piped_y = self.pipeline_y.fit_transform(self.y) self._params = params self.lm.set_params(**params) self.lm.fit(piped_X, piped_y) def __invert(self, y): return np.exp(self.pipeline_y.inverse_transform(y)) def getTrainScore(self): piped_X = self.pipeline_X.transform(self.X) piped_y = self.pipeline_y.transform(self.y) return self.lm.score(piped_X, piped_y) # Root Mean Square Log Error def getRMSLE(self): piped_X = self.pipeline_X.transform(self.X) preds = self.pipeline_y.inverse_transform(self.lm.predict(piped_X)) return mean_squared_error(self.y, preds) def predict(self, test_X): piped_X = self.pipeline_X.transform(self.__imputeVals(test_X)) preds = self.lm.predict(piped_X) return self.__invert(preds)
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.cross_validation import KFold  # legacy KFold(n, n_folds=...) API
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt

data, target = load_svmlight_file('data/E2006.train')

# Change the following:
# from sklearn.linear_model import Lasso
# met = Lasso(alpha=0.1)
met = ElasticNet(alpha=0.1)

kf = KFold(len(target), n_folds=5)
pred = np.zeros_like(target)
for train, test in kf:
    met.fit(data[train], target[train])
    pred[test] = met.predict(data[test])

print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
print('')

# ElasticNetCV constructor (use all CPUs)
met = ElasticNetCV(n_jobs=-1)
kf = KFold(len(target), n_folds=5)
pred = np.zeros_like(target)
for train, test in kf:
    met.fit(data[train], target[train])
    pred[test] = met.predict(data[test])

print('[EN CV] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
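# A small follow-up sketch, assuming the fitted ElasticNetCV object `met` from the
# last loop is still in scope (it reflects the final fold's fit): the CV estimator
# exposes the regularization strength and L1/L2 mix it selected.
print('chosen alpha: {:.4g}'.format(met.alpha_))
print('chosen l1_ratio: {}'.format(met.l1_ratio_))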
# Standardize features and targets using training-set statistics
std = X_train.std(axis=0)
mean = X_train.mean(axis=0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

std = y_train.std(axis=0)
mean = y_train.mean(axis=0)
y_train = (y_train - mean) / std
y_test = (y_test - mean) / std

gc.collect()
print("- benching ElasticNet")
# l1_ratio is the current name of the parameter formerly called rho
clf = ElasticNet(alpha=alpha, l1_ratio=0.5, fit_intercept=False)
tstart = time()
clf.fit(X_train, y_train)
elnet_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test)
elnet_results[i, j, 1] = time() - tstart

gc.collect()
print("- benching SGD")
# max_iter is the current name of the parameter formerly called n_iter
n_iter = int(np.ceil(10 ** 4.0 / n_train))
clf = SGDRegressor(alpha=alpha, fit_intercept=False, max_iter=n_iter,
                   learning_rate="invscaling", eta0=.01, power_t=0.25)
tstart = time()
clf.fit(X_train, y_train)
sgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test)
sgd_results[i, j, 1] = time() - tstart
class RiskClassifier():
    def __init__(self, train=False):
        if train:
            self.train()
        #else:
            #Load classifier

    def train(self, plot=False):
        X, y = self.prepareData()
        n_samples = X.shape[0]
        X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
        X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]

        #######################################################################
        # Lasso
        alpha = 0.1
        self.lasso = Lasso(alpha=alpha)
        y_pred_lasso = self.lasso.fit(X_train, y_train).predict(X_test)
        r2_score_lasso = r2_score(y_test, y_pred_lasso)
        print(self.lasso)
        print("r^2 on test data : %f" % r2_score_lasso)

        #######################################################################
        # ElasticNet
        from sklearn.linear_model import ElasticNet
        self.enet = ElasticNet(alpha=alpha, l1_ratio=0.7)
        y_pred_enet = self.enet.fit(X_train, y_train).predict(X_test)
        r2_score_enet = r2_score(y_test, y_pred_enet)
        print(self.enet)
        print("r^2 on test data : %f" % r2_score_enet)

        if plot:
            import matplotlib.pyplot as plt
            plt.plot(self.enet.coef_, color='lightgreen', linewidth=2,
                     label='Elastic net coefficients')
            plt.plot(self.lasso.coef_, color='gold', linewidth=2,
                     label='Lasso coefficients')
            plt.legend(loc='best')
            plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
                      % (r2_score_lasso, r2_score_enet))
            plt.show()

    def classify(self, example, test=None):
        # Perform the classification
        y_pred_lasso = self.lasso.predict(example)
        y_pred_enet = self.enet.predict(example)
        if test is not None:
            r2_score_lasso = r2_score(test, y_pred_lasso)
            r2_score_enet = r2_score(test, y_pred_enet)
            print("r^2 Classify Lasso test: %f" % r2_score_lasso)
            print("r^2 Classify Enet test: %f" % r2_score_enet)
        return y_pred_lasso, y_pred_enet

    def getAllData(self):
        dfCorruption = pd.read_csv('../economy/Corruption.csv', sep=';', na_values=0)
        dfEducation = pd.read_csv('../economy/Education.csv', sep=';', na_values=0)
        dfGini = pd.read_csv('../economy/Gini.csv', sep=';', na_values=0)
        dfImports = pd.read_csv('../economy/Imports.csv', sep=';', na_values=0)
        dfInflation = pd.read_csv('../economy/Inflation.csv', sep=';', na_values=0)
        dfPopulation = pd.read_csv('../economy/Population.csv', sep=';', na_values=0)
        dfReserves = pd.read_csv('../economy/Reserves.csv', sep=';', na_values=0)
        dfUnemployment = pd.read_csv('../economy/Unemployment.csv', sep=';', na_values=0)
        riskData = pd.read_csv('../targets/psr.csv', na_values=0)

        # result = pd.concat([dfCorruption, dfEducation, dfGini, dfImports, dfInflation,
        #                     dfPopulation, dfReserves, dfUnemployment], axis=1, join='inner')
        allData = dfCorruption.set_index('Country Name') \
            .join(dfEducation.set_index('Country Name'), lsuffix='_corruption') \
            .join(dfGini.set_index('Country Name'), lsuffix='_education') \
            .join(dfImports.set_index('Country Name'), lsuffix='_gini') \
            .join(dfInflation.set_index('Country Name'), lsuffix='_imports') \
            .join(dfPopulation.set_index('Country Name'), lsuffix='_inflation') \
            .join(dfReserves.set_index('Country Name'), lsuffix='_population') \
            .join(dfUnemployment.set_index('Country Name'), lsuffix='_reserves',
                  rsuffix='_unemployment')
        allData = allData.join(riskData.set_index('Country Name'), how='inner')
        return allData

    def getDataFromYear(self, year):
        allData = self.getAllData()
        strYear = str(year)
        data = allData[[
            strYear + '_corruption', strYear + '_education', strYear + '_gini',
            strYear + '_imports', strYear + '_inflation', strYear + '_population',
            strYear + '_reserves', strYear + '_unemployment'
        ]].fillna(0)
        risk = allData[[strYear + '_PRS' + strYear[2:] + 'VA',
                        strYear + '_PRS' + strYear[2:] + 'PV',
                        strYear + '_PRS' + strYear[2:] + 'GE',
                        strYear + '_PRS' + strYear[2:] + 'RQ',
                        strYear + '_PRS' + strYear[2:] + 'RL',
                        strYear + '_PRS' + strYear[2:] + 'CC']].fillna(0)
        return data, risk

    def prepareData(self):
        # Someday we will have more data, so we'll load more years
        # into our classifier.
        data2016, risk2016 = self.getDataFromYear(2016)
        X = data2016.values
        y = risk2016['2016_PRS16PV'].values
        return X, y
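# A hypothetical usage sketch for the class above (assumes the ../economy and
# ../targets CSV files exist, and that example_X / example_y are a held-out feature
# matrix and matching risk values): train both models, then compare predictions.
clf = RiskClassifier(train=True)
pred_lasso, pred_enet = clf.classify(example_X, test=example_y)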