def helper(X_train, Y_train):
    # NOTE: `alpha` is expected to be defined in the enclosing scope.
    reg = linear_model.ElasticNet(alpha=alpha)
    reg.fit(X_train, Y_train)
    return reg.predict
def model_selector(model):
    # Ye good ol' ugly if-elif switch to choose the model
    if model == "linear":
        regr = linear_model.LinearRegression(n_jobs=-1)
    elif model == "lasso":
        regr = linear_model.Lasso(random_state=17)
    elif model == "elasticnet" or model == "elastic":
        regr = linear_model.ElasticNet(random_state=17)
    elif model == "bayesian":
        regr = linear_model.BayesianRidge()
    elif model == "decision tree regressor" or model == "dtr":
        regr = tree.DecisionTreeRegressor(max_depth=8, min_samples_leaf=17, random_state=17)
    elif model == "tweedie regressor 0" or model == "normal distribution":
        regr = linear_model.TweedieRegressor(power=0)
    elif model == "tweedie regressor 1" or model == "poisson distribution":
        regr = linear_model.TweedieRegressor(power=1)
    elif model == "extra trees regressor" or model == "etr":
        regr = ensemble.ExtraTreesRegressor(max_depth=8, min_samples_leaf=17, random_state=17)
    elif model == "random forest regressor" or model == "rfr":
        regr = ensemble.RandomForestRegressor(n_estimators=500, oob_score=True, random_state=17, n_jobs=-1)
    elif model == "adaboost extra trees" or model == "boost et":
        regr = AdaBoostRegressor(
            ensemble.ExtraTreesRegressor(max_depth=8, min_samples_leaf=17, random_state=17),
            n_estimators=500, random_state=17)
    elif model == "k neighbours" or model == "k neighbor":
        regr = neighbors.KNeighborsRegressor(n_jobs=-1)
    elif model == "gradient boosting regressor" or model == "gbr":
        regr = ensemble.GradientBoostingRegressor(random_state=17)
    elif model == "voting":
        clf1 = linear_model.LinearRegression(n_jobs=-1)
        clf2 = ensemble.RandomForestRegressor(max_depth=8, min_samples_leaf=17, random_state=17, n_jobs=-1)
        clf3 = ensemble.GradientBoostingRegressor(random_state=17)
        regr = ensemble.VotingRegressor(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], n_jobs=-1)
    elif model == "logistic":
        regr = linear_model.LogisticRegression(max_iter=250, random_state=17, n_jobs=-1)
    elif model == "gaussian":
        regr = GaussianNB()
    elif model == "decision tree classifier" or model == "dtc":
        regr = tree.DecisionTreeClassifier(max_depth=8, min_samples_leaf=17, random_state=17)
    elif model == "extra tree classifier" or model == "etc":
        regr = ensemble.ExtraTreesClassifier(max_depth=8, min_samples_leaf=17, random_state=17)
    elif model == "random forest classifier" or model == "rfc":
        regr = ensemble.RandomForestClassifier(max_depth=8, min_samples_leaf=17, random_state=17)
    elif model == "linear svc":
        regr = svm.LinearSVC(random_state=17)
    elif model == "k neighbour classifier" or model == "k neighbor classifier":
        regr = neighbors.KNeighborsClassifier(n_jobs=-1, n_neighbors=2)
    elif model == "svc":
        regr = svm.SVC(kernel="rbf", probability=True, random_state=17)
    return regr
# Generate sample data
n_samples_train, n_samples_test, n_features = 75, 150, 500
np.random.seed(0)
coef = np.random.randn(n_features)
coef[50:] = 0.0  # only the first 50 features are informative
X = np.random.randn(n_samples_train + n_samples_test, n_features)
y = np.dot(X, coef)

# Split train and test data
X_train, X_test = X[:n_samples_train], X[n_samples_train:]
y_train, y_test = y[:n_samples_train], y[n_samples_train:]

###############################################################################
# Compute train and test errors
alphas = np.logspace(-5, 1, 60)
enet = linear_model.ElasticNet(l1_ratio=0.7)
train_errors = list()
test_errors = list()
for alpha in alphas:
    enet.set_params(alpha=alpha)
    enet.fit(X_train, y_train)
    train_errors.append(enet.score(X_train, y_train))
    test_errors.append(enet.score(X_test, y_test))

i_alpha_optim = np.argmax(test_errors)
alpha_optim = alphas[i_alpha_optim]
print("Optimal regularization parameter : %s" % alpha_optim)

# Estimate the coef_ on full data with optimal regularization parameter
enet.set_params(alpha=alpha_optim)
coef_ = enet.fit(X, y).coef_
def lr_model(df):
    # dist_lambda = lambda x: distance.euclidean(x, (-122.3380, 47.6075))
    # location = list(zip(df["longitude"], df["latitude"]))
    # distances = list(map(dist_lambda, location))
    # # distance from Seattle art museum
    # df['distances'] = distances

    # Drop price outliers beyond three standard deviations
    df = df[np.abs(df['price'] - df['price'].mean()) <= (3 * df['price'].std())]

    # NOTE: 'distances' is only available if the commented-out block above is enabled
    numerical_columns = [
        "host_response_rate", "host_listings_count", "latitude", "longitude",
        "accommodates", "bathrooms", "bedrooms", "beds", "square_feet",
        "guests_included", "extra_people", "minimum_nights", "maximum_nights",
        "availability_30", "availability_60", "availability_90",
        "availability_365", "number_of_reviews", "review_scores_rating",
        "review_scores_accuracy", "review_scores_cleanliness",
        "review_scores_checkin", "review_scores_communication",
        "review_scores_location", "review_scores_value",
        "calculated_host_listings_count", "reviews_per_month", "distances"
    ]
    categorical_columns = [
        "host_is_superhost", "host_neighbourhood", "host_verifications",
        "host_has_profile_pic", "host_identity_verified", "neighbourhood",
        "neighbourhood_cleansed", "neighbourhood_group_cleansed",
        "is_location_exact", "property_type", "room_type", "calendar_updated",
        "has_availability", "requires_license", "instant_bookable",
        "cancellation_policy", "require_guest_profile_picture",
        "require_guest_phone_verification", "bed_type", "amenities"
    ]

    # std = StandardScaler()
    # data = df[numerical_columns].to_numpy()
    # std.fit(data)
    # df1 = pd.DataFrame(data)
    # df1.columns = numerical_columns
    # df2 = df[categorical_columns]
    # df3 = df['price']
    # df = pd.concat([df1, df2], axis=1, sort=False)
    # df['price'] = df3
    # X = df[categorical_columns + numerical_columns]
    # y = df['price']

    X = df[categorical_columns + numerical_columns]
    y = df['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

    categorical_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ('TruncatedSVD', TruncatedSVD(n_components=30, n_iter=7, random_state=42))
    ])
    numerical_pipe = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('outliers', RobustScaler()),
        ('pca', decomposition.PCA(n_components=25))
    ])
    preprocessing = ColumnTransformer([
        ('cat', categorical_pipe, categorical_columns),
        ('num', numerical_pipe, numerical_columns)
    ])
    slr = Pipeline([
        ('preprocess', preprocessing),
        ('classifier', linear_model.ElasticNet(alpha=0.1, l1_ratio=0.5))
    ])

    slr.fit(X_train, y_train)
    y_train_pred = slr.predict(X_train)
    y_test_pred = slr.predict(X_test)

    print("===testing accuracy===")
    # The mean squared error
    print('Mean squared error: %.2f' % mean_squared_error(y_test, y_test_pred))
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f' % r2_score(y_test, y_test_pred))
    fig, ax = plt.subplots()
    n = len(y_test)
    ax.scatter(y_test.values, y_test_pred, c='tab:blue', label='tab:blue',
               alpha=0.3, edgecolors='none')
    plt.xlabel('real price')
    plt.ylabel('predicted price')

    print("===training accuracy===")
    # The mean squared error
    print('Mean squared error: %.2f' % mean_squared_error(y_train, y_train_pred))
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f' % r2_score(y_train, y_train_pred))
    fig, ax = plt.subplots()
    n = len(y_train)
    ax.scatter(y_train.values, y_train_pred, c='tab:blue', label='tab:blue',
               alpha=0.3, edgecolors='none')
    plt.xlabel('real price')
    plt.ylabel('predicted price')
    plt.show()
    return slr
def fucking_paul(stock, log, Kin, Din, save_max, max_len, bitchCunt, tradeCost):
    arr = []
    buy = []
    sell = []
    diff = []
    perc = []
    cumld = []
    kar = []
    dar = []
    cumld = []
    kar1 = []
    dar1 = []
    Kvl = np.zeros(2)
    Dvl = Kvl
    s1ar = []
    s2ar = []
    shortDiff = []
    cuml = 1.0
    mdd = 0
    position = 0
    stopLoss = False
    bull = 0
    shit = 0
    maxP = 0
    minP = 0
    for i, closeData in enumerate(stock):
        model = linear_model.ElasticNet(alpha=29.91)
        arr.append(closeData)
        trainX, trainY = create_dataset(arr, Din)
        if i > Kin:
            rarr = np.reshape(arr[-Din:], [1, -1])
            model.fit(trainX, trainY)
            p = model.predict(rarr)
            if ((p > closeData) and position == -1 or position == 0):
                buy.append(closeData * (1 + tradeCost))
                bull += 1
                position = 1
            elif (p < closeData) and position == 1 or position == 0:
                sell.append(closeData * (1 - tradeCost))
                maxP = 0
                shit += 1
                position = -1
        if position == 1 and closeData > maxP:
            maxP = closeData
        elif position == -1 or position == 0:
            maxP = closeData
        if position == -1 and closeData < minP:
            minP = closeData
        elif position == 1 or position == 0:
            minP = closeData
        # DYNAMIC BITCHCUNT DISTANCE IN DEVELOPMENT
        # WILL BE BASED ON ANALYSIS OF VARIANCE, AND CORRELATION WITH ENVIRONMENTAL VOLATILITY
        if (closeData < (maxP * (1 - bitchCunt)) and position == 1):
            sell.append(closeData * (1 - tradeCost))
            maxP = 0
            shit += 1
            position = -1
        if (closeData > (minP * (1 + bitchCunt)) and position == -1):
            buy.append(closeData * (1 + tradeCost))
            shit += 1
            position = 1
    if position == 1:
        sell.append(stock[len(stock) - 1])
        shit += 1
    for i in range(bull):
        diff.append(sell[i] - buy[i])
        if i < bull - 1:
            shortDiff.append(sell[i] - buy[i + 1])
    for i in range(bull):
        perc.append(diff[i] / buy[i])
    for i in range(bull - 1):
        perc[i] += shortDiff[i] / sell[i]
    for i in range(bull):
        cumld.append(cuml)
        cuml += cuml * perc[i]
    for i in range(len(cumld)):
        if i > 1:
            peak = max(cumld[:i])
            trough = min(cumld[i:])
            dd = (peak - trough) / peak
            if dd > mdd:
                mdd = dd
    print("tik:", log, "cuml:", cuml)
    if cuml > save_max and len(perc) <= max_len:
        write_that_shit(log, Kin, Din, perc, cuml, mdd, bitchCunt)
        print(
            f'\tYEEEEEE BOIIIIS: Kin: {Kin} Din: {Din} BitchCunt: {bitchCunt} MDD = {mdd} LEN = {len(perc)} CUML = {cuml}'
        )  # DONT F*****G MOVE/INDENT WRITE_THAT_SHIT!!!!
    # if mdd < 0.5:
    #     plot(cumld)
    #     plot2(s1ar, s2ar)
    return cuml
    0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000
]
myML.plotML.PlotParam_Score(X_train, X_test, Y_train, Y_test,
                            "linear_model.Lasso()", logX=True, alpha=alphas)

# ---ElasticNet
data = myML.DataPre.load_datasets(mode="diabetes")
X_train, X_test, Y_train, Y_test = myML.DataPre.train_test_split(
    data.data, data.target, test_size=0.25, random_state=0)
from sklearn import linear_model
regr = linear_model.ElasticNet().fit(X_train, Y_train)
myML.LinearModel.showModelTest(regr, X_test, Y_test)

# test alpha and rhos
alphas = np.logspace(-1, 1)
rhos = np.linspace(0.01, 1)
myML.plotML.PlotParam_Score(X_train, X_test, Y_train, Y_test,
                            "linear_model.ElasticNet()", drawParam=2,
                            plot3D=True, alpha=alphas, l1_ratio=rhos)

# ---logistic regression
# Generate sample data
n_samples_train, n_samples_test, n_features = 75, 150, 500
np.random.seed(0)
coef = np.random.randn(n_features)
coef[50:] = 0.0  # only the first 50 features are informative
X = np.random.randn(n_samples_train + n_samples_test, n_features)
y = np.dot(X, coef)

# Split train and test data
X_train, X_test = X[:n_samples_train], X[n_samples_train:]
y_train, y_test = y[:n_samples_train], y[n_samples_train:]

# #############################################################################
# Compute train and test errors
alphas = np.logspace(-5, 1, 60)
enet = linear_model.ElasticNet(l1_ratio=0.7, max_iter=10000)
train_errors = list()
test_errors = list()
for alpha in alphas:
    enet.set_params(alpha=alpha)
    enet.fit(X_train, y_train)
    train_errors.append(enet.score(X_train, y_train))
    test_errors.append(enet.score(X_test, y_test))

i_alpha_optim = np.argmax(test_errors)
alpha_optim = alphas[i_alpha_optim]
print("Optimal regularization parameter : %s" % alpha_optim)

# Estimate the coef_ on full data with optimal regularization parameter
enet.set_params(alpha=alpha_optim)
coef_ = enet.fit(X, y).coef_
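A short, hedged sketch of how the train/test curves from the alpha sweep above might be visualized; it assumes matplotlib is available and reuses `alphas`, `train_errors`, `test_errors`, and `alpha_optim` as computed above.

```python
import matplotlib.pyplot as plt

# Plot R^2 on train and test sets as a function of alpha (log scale),
# and mark the alpha that maximizes the test score.
plt.semilogx(alphas, train_errors, label='Train')
plt.semilogx(alphas, test_errors, label='Test')
plt.axvline(alpha_optim, linestyle='--', color='k', label='Optimum on test')
plt.xlabel('alpha')
plt.ylabel('R^2 score')
plt.legend(loc='lower left')
plt.show()
```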
warnings.filterwarnings('ignore')

save_file = False
enable_stacking = True
enable_add_result = False
enable_lstm = False
lstm_file = 'result_Dec_16_11-06-09_LSTM_batched_1.89311.csv'
data_file = ["data/new_mixed_machine{}.csv".format(i) for i in range(1, 7)]
dl_norm = [DataFrameMix(data_file[i]) for i in range(6)]
# dl_norm = [DataLoader(g, DataConfig('norm')) for g in range(6)]

svr = svm.SVR(C=200, gamma=0.001)
regr = lm.Ridge(alpha=1300.0)
lsr = lm.Lasso(alpha=0.035)
enr = lm.ElasticNet(alpha=0.45, l1_ratio=0.5)
krr = kernel_ridge.KernelRidge(kernel='polynomial')
gbr = ensemble.GradientBoostingRegressor(loss='huber', max_features='sqrt',
                                         n_estimators=400)
rfr = ensemble.RandomForestRegressor(n_estimators=90)
xgbr = xgb.XGBRegressor(booster='gbtree', gamma=0.001, max_depth=3,
                        min_child_weight=2, n_estimators=100)
xgblr = xgb.XGBRegressor(booster='gblinear', n_estimators=300, gamma=0.0001)
lgbr = lgb.LGBMRegressor(num_leaves=3, min_data_in_leaf=11, max_bin=55,
                         learning_rate=0.05,
            lowest_test_rmse_lasso = test_rmse_lasso
            bestmask_lasso = masks[i]
            bestalpha_lasso = alpha
    print('Alpha = ' + str(alpha) + ' is done!')

print('Combination is: ' + str(bestmask_lasso))
print('Lowest test RMSE with Lasso Regularizer is ' + str(lowest_test_rmse_lasso)
      + ', alpha = ' + str(bestalpha_lasso))
print('-' * 20)

lambda1_list = [1e-4, 1e-3, 1e-2]
lambda2_list = [1e-2, 1e-1, 1, 1e1, 1e2, 1e3]
lowest_test_rmse_elasticnet = 1
for lambda1 in lambda1_list:
    for lambda2 in lambda2_list:
        elasticnet = linear_model.ElasticNet(alpha=(lambda1 + lambda2),
                                             l1_ratio=lambda1 / (lambda1 + lambda2))
        for i in range(32):
            enc = pre.OneHotEncoder(categorical_features=masks[i])
            data_enc_noshuffle = enc.fit_transform(data_noshuffle)
            if True in masks[i]:
                data_enc_noshuffle = data_enc_noshuffle.toarray().astype(int)
            else:
                data_enc_noshuffle = data_enc_noshuffle.astype(int)
            data_enc_noshuffle_folds = kfold(data_enc_noshuffle)
            _, test_rmse_elasticnet = cv(data_enc_noshuffle_folds,
                                         target_noshuffle_folds, elasticnet)
            if test_rmse_elasticnet < lowest_test_rmse_elasticnet:
                lowest_test_rmse_elasticnet = test_rmse_elasticnet
                bestmask_elasticnet = masks[i]
                bestlambda1 = lambda1
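A brief note on the parameterization used above: scikit-learn's ElasticNet minimizes `1/(2*n_samples) * ||y - Xw||^2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2`, so mapping `alpha = lambda1 + lambda2` and `l1_ratio = lambda1 / (lambda1 + lambda2)` makes `lambda1` the L1 weight and `lambda2` the (halved) L2 weight. The sketch below just checks that algebra; the specific lambda values are illustrative.

```python
# Sketch: how (lambda1, lambda2) map onto sklearn's (alpha, l1_ratio).
lambda1, lambda2 = 1e-3, 1e-1  # hypothetical L1 / L2 strengths
alpha = lambda1 + lambda2
l1_ratio = lambda1 / (lambda1 + lambda2)
assert abs(alpha * l1_ratio - lambda1) < 1e-12        # L1 term weight == lambda1
assert abs(alpha * (1 - l1_ratio) - lambda2) < 1e-12  # L2 term weight == lambda2 (x0.5 in the objective)
```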
print("=====================================") print("== Scaler + Elastic-net regression ==") print("=====================================") alphas = [.0001, .001, .01, .1, 1, 10, 100, 1000] l1_ratio = [.1, .5, .9] print("----------------------------") print("-- Parallelize outer loop --") print("----------------------------") enet = Pipeline([ ('standardscaler', preprocessing.StandardScaler()), ('enet', lm.ElasticNet(max_iter=10000)), ]) param_grid = {'enet__alpha':alphas , 'enet__l1_ratio':l1_ratio} enet_cv = GridSearchCV(enet, cv=5, param_grid=param_grid) %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5, n_jobs=-1) print("Test r2:%.2f" % scores.mean()) print("-----------------------------------------------") print("-- Parallelize outer loop + built-in CV --") print("-- Remark: scaler is only done on outer loop --") print("-----------------------------------------------") enet_cv = Pipeline([ ('standardscaler', preprocessing.StandardScaler()), ('enet', lm.ElasticNetCV(max_iter=10000, l1_ratio=l1_ratio, alphas=alphas)),
def createHealthForecasterModels():
    import pickle

    ## Aggregate relevant data for ML:
    data = createDataTable()

    fixedFactors = [
        'Age', 'Sex', 'Urban', 'ENT', 'OBGYN', 'Old_age_midLife_syndrome',
        'alcohol_poisoning', 'dermatological', 'digestive', 'endocrine',
        'heart', 'hematological', 'infectious_parasitic', 'injury',
        'muscular_rheumatological', 'neurological', 'noDiagnosis', 'noReport',
        'other', 'pyschiatric', 'respiratory', 'sexualDysfunction', 'tumor',
        'unknown', 'urinary', 'High_BP', 'Diabetes', 'Heart_attack',
        'Internal_bleeding', 'Pregnant', 'Height'
    ]
    fixedFactorIdxs = [
        list(data.columns).index(varName) for varName in fixedFactors
    ]
    lifestyleFactors = [
        'Smoker', 'Cups_water_daily', 'Alcohol_frequency', 'Weight', 'Kcal',
        'Carbs', 'Fat', 'Protein', 'Activity_level', 'Daily_screen_time',
        'Hours_of_sleep'
    ]
    lifestyleFactorIdxs = [
        list(data.columns).index(varName) for varName in lifestyleFactors
    ]
    responseVariables = [
        'Insulin', 'Triglycerides', 'HDL_C', 'LDL_C', 'Urea', 'Uric_acid',
        'APO_A', 'Lipoprotein_A', 'High_sensitivity_CRP', 'Creatinine',
        'APO_B', 'Mg', 'Ferritin', 'Hemoglobin', 'White_blood_cell',
        'Red_blood_cell', 'Platelet', 'Glucose_field', 'HbA1c',
        'Total_protein', 'Albumin', 'Glucose', 'Total_cholestorol',
        'Alanine_AT', 'Transferrin', 'Transferrin_receptor', 'Systol', 'Diastol'
    ]
    responseVariableIdxs = [
        list(data.columns).index(varName) for varName in responseVariables
    ]
    fatRelatedIdxs = [
        responseVariables.index('APO_A'),
        responseVariables.index('Lipoprotein_A'),
        responseVariables.index('HDL_C'),
        responseVariables.index('LDL_C'),
        responseVariables.index('APO_B'),
        responseVariables.index('Triglycerides'),
        responseVariables.index('Total_cholestorol')
    ]
    gluRelatedIdxs = [
        responseVariables.index('Insulin'),
        responseVariables.index('HbA1c'),
        responseVariables.index('Glucose')
    ]

    inputFeatures = fixedFactors + lifestyleFactors
    X = data[inputFeatures].to_numpy()
    Y = data[responseVariables].to_numpy()
    # Y_zscore = (Y - np.mean(Y, axis=0)) / np.std(Y, axis=0)
    # X_Train, X_Test, Y_Train, Y_Test, cv = shuffleAndSplit(X, Y, test_size=.2, n_splits=5)
    # X_Train, X_Test, Y_Train_zscore, Y_Test_zscore, cv = shuffleAndSplit(X, Y_zscore, test_size=.2, n_splits=5)

    ## Create a second model to predict weight:
    # fixedFactors2 = ['age', 'sex', 'urban', 'ENT', 'OBGYN', 'Old_age_midLife_syndrome', 'alcohol_poisoning',
    #                  'dermatological', 'digestive', 'endocrine', 'heart', 'hematological', 'infectious_parasitic', 'injury',
    #                  'muscular_rheumatological', 'neurological', 'noDiagnosis', 'noReport', 'other', 'pyschiatric', 'respiratory',
    #                  'sexualDysfunction', 'tumor', 'unknown', 'urinary', 'highBP', 'diabetes', 'heart_attack', 'internal_bleeding',
    #                  'pregnant', 'height']
    # fixedFactorIdxs2 = [list(data.columns).index(varName) for varName in fixedFactors]
    # lifestyleFactors2 = ['smoker', 'cups_water_daily', 'Alcohol_frequency', 'kcal', 'carbo', 'fat', 'protn', 'Activity_level', 'Daily_screen_time', 'Hours_of_sleep']
    # lifestyleFactorIdxs2 = [list(data.columns).index(varName) for varName in lifestyleFactors]
    # responseVariables2 = ['weight']
    # responseVariableIdxs2 = [list(data.columns).index(varName) for varName in responseVariables2]
    # inputFeatures2 = fixedFactors2 + lifestyleFactors2
    # X2 = data[fixedFactors2 + lifestyleFactors2].to_numpy()
    # Y2 = data[responseVariables2].to_numpy()
    # X_Train2, X_Test2, Y_Train2, Y_Test2, cv = shuffleAndSplit(X2, Y2, test_size=.2, n_splits=5)

    models = dict(
        ols=linear_model.LinearRegression(),
        lasso=linear_model.Lasso(alpha=0.75),
        ridge=linear_model.Ridge(alpha=0.75),
        elastic=linear_model.ElasticNet(alpha=0.1, l1_ratio=0.75),
        randomForest=ensemble.RandomForestRegressor(
            random_state=0,
            max_features='auto',
            min_samples_leaf=50,
            # max_depth=3,
            n_estimators=200))

    # Also define models to predict z_score Target Matrix
    # models_zscore = dict(ols=linear_model.LinearRegression(),
    #                      lasso=linear_model.Lasso(alpha=.5),
    #                      ridge=linear_model.Ridge(alpha=.5),
    #                      elastic=linear_model.ElasticNet(alpha=.5, l1_ratio=0.5),
    #                      randomForest=ensemble.RandomForestRegressor(random_state=0,
    #                                                                  max_features='auto',
    #                                                                  min_samples_leaf=10,
    #                                                                  n_estimators=200)
    # weightModel = dict(ols=linear_model.LinearRegression(),
    #                    lasso=linear_model.Lasso(alpha=.5),
    #                    ridge=linear_model.Ridge(alpha=.5),
    #                    elastic=linear_model.ElasticNet(alpha=.5, l1_ratio=0.5),
    #                    randomForest=ensemble.RandomForestRegressor(random_state=0,
    #                                                                max_features='auto',
    #                                                                min_samples_leaf=10,
    #                                                                n_estimators=200))
    # print('Training trainedWeightBPModels')
    # trainedWeightModels = {}
    # for name, mdl in weightModel.items():
    #     print('Training ' + str(name) + '...')
    #     trainedWeightModels.update({name: mdl.fit(X2, Y2.ravel())})
    # print('finished')

    # Train models
    print('Training trainedModels')
    trainedModels = {}
    for name, mdl in models.items():
        print('Training ' + str(name) + '...')
        trainedModels.update({name: mdl.fit(X, Y)})
    print('finished')

    # pickle.dump([trainedModels, trainedWeightModels, inputFeatures, responseVariables, inputFeatures2, responseVariables2], open("models.p", "wb"))
    pickle.dump([trainedModels, inputFeatures, responseVariables, data],
                open("models.p", "wb"))
    # return trainedModels, trainedWeightModels, inputFeatures, responseVariables, inputFeatures2, responseVariables2
    return trainedModels, inputFeatures, responseVariables
dict_log_alpha["svm"] = 1e-4 dict_log_alpha["svr"] = np.array([1e-2, 1e-2]) # Set models to be tested models = {} models["lasso"] = Lasso(estimator=None) models["enet"] = ElasticNet(estimator=None) models["wLasso"] = WeightedLasso(estimator=None) models["logreg"] = SparseLogreg(estimator=None) models["svm"] = SVM(estimator=None) models["svr"] = SVR(estimator=None) custom_models = {} custom_models["lasso"] = Lasso(estimator=celer.Lasso( warm_start=True, fit_intercept=False)) custom_models["enet"] = ElasticNet( estimator=linear_model.ElasticNet(warm_start=True, fit_intercept=False)) custom_models["logreg"] = SparseLogreg( estimator=celer.LogisticRegression(warm_start=True, fit_intercept=False)) # Compute "ground truth" with cvxpylayer dict_cvxpy_func = { 'lasso': lasso_cvxpy, 'enet': enet_cvxpy, 'wLasso': weighted_lasso_cvxpy, 'logreg': logreg_cvxpy, 'svm': svm_cvxpy, 'svr': svr_cvxpy } dict_vals_cvxpy = {} dict_grads_cvxpy = {}
ycols = ['PRICE']
xcols = list(set(df.columns) - set(ycols))
X = df.loc[:, xcols].values
y = np.ravel(df.loc[:, ycols].values)

# specify cross-validation
k = 10  # number of folds
cvsplitter = ms.KFold(n_splits=k, shuffle=True, random_state=0)  # cross-validation splitter

# specify all models
models = list()
models.append(('LR', sl.LinearRegression()))
models.append(('RIDGE', sl.Ridge()))
models.append(('LASSO', sl.Lasso()))
models.append(('EN', sl.ElasticNet()))
models.append(('KNN', neighbors.KNeighborsRegressor()))
models.append(('CART', tree.DecisionTreeRegressor()))
models.append(('SVM', svm.SVR()))

# fit and compute scores
scoring = 'neg_mean_squared_error'
algs = list()
scores = list()
for entry in models:
    score = -1 * ms.cross_val_score(
        entry[1], X, y, cv=cvsplitter, scoring=scoring)
    scores.append(score)
    algs.append(entry[0])
    # print('{0} - {1:.4f} - {2:.4f}'.format(entry[0], np.mean(score), np.std(score, ddof=1)))
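For completeness, a small hedged sketch of how the collected scores might be summarized once the loop above finishes; it assumes matplotlib is available and reuses `algs` and `scores` from above.

```python
import matplotlib.pyplot as plt

# Report mean and standard deviation of the CV MSE for each algorithm,
# then compare the score distributions with a box plot.
for name, score in zip(algs, scores):
    print('{0}: mean MSE = {1:.4f}, std = {2:.4f}'.format(
        name, np.mean(score), np.std(score, ddof=1)))

fig, ax = plt.subplots()
ax.boxplot(scores, labels=algs)
ax.set_ylabel('cross-validated MSE')
plt.show()
```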
def get_model_obj(modelType, n_clusters=None, **kwargs):
    if modelType == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        # 6 seems to give the best trade-off between accuracy and precision
        knn = KNeighborsClassifier(n_neighbors=6, **kwargs)
        return knn
    elif modelType == 'gaussianNB':
        from sklearn.naive_bayes import GaussianNB
        gnb = GaussianNB(**kwargs)
        return gnb
    elif modelType == 'multinomialNB':
        from sklearn.naive_bayes import MultinomialNB
        # TODO: figure out how to configure binomial distribution
        mnb = MultinomialNB(**kwargs)
        return mnb
    elif modelType == 'bernoulliNB':
        from sklearn.naive_bayes import BernoulliNB
        bnb = BernoulliNB(**kwargs)
        return bnb
    elif modelType == 'randomForest':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier(random_state=234, **kwargs)
        return rfc
    elif modelType == 'svm':
        from sklearn.svm import SVC
        svc = SVC(random_state=0, probability=True, **kwargs)
        return svc
    elif modelType == 'LinearRegression':
        # assert column, "Column name required for building a linear model"
        # assert dataframe[column].shape == target.shape
        from sklearn import linear_model
        l_reg = linear_model.LinearRegression(**kwargs)
        return l_reg
    elif modelType == 'RidgeRegression':
        from sklearn.linear_model import Ridge
        if not kwargs:
            kwargs = {'alpha': 0.5}
        ridge_reg = Ridge(**kwargs)
        return ridge_reg
    elif modelType == 'RidgeRegressionCV':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alphas': [0.1, 1.0, 10.0]}
        ridge_cv_reg = linear_model.RidgeCV(**kwargs)
        return ridge_cv_reg
    elif modelType == 'LassoRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1}
        lasso_reg = linear_model.Lasso(**kwargs)
        return lasso_reg
    elif modelType == 'ElasticNetRegression':
        from sklearn.metrics import r2_score
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1, 'l1_ratio': 0.7}
        enet_reg = linear_model.ElasticNet(**kwargs)
        return enet_reg
    elif modelType == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        log_reg = LogisticRegression(random_state=123, **kwargs)
        return log_reg
    elif modelType == 'RANSACRegression':
        from sklearn.linear_model import LinearRegression, RANSACRegressor
        ransac_model = RANSACRegressor(LinearRegression())
        return ransac_model
    elif modelType == 'kde':
        from sklearn.neighbors.kde import KernelDensity
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2, **kwargs)
        return kde
    elif modelType == 'AR':
        import statsmodels.api as sm
        # fit an AR model and forecast
        ar_fitted = sm.tsa.AR(dataframe).fit(maxlag=9, method='mle', disp=-1, **kwargs)
        # ts_forecast = ar_fitted.predict(start='2008', end='2050')
        return ar_fitted
    elif modelType == 'SARIMAX':
        mod = sm.tsa.statespace.SARIMAX(df.riders, trend='n', order=(0, 1, 0),
                                        seasonal_order=(1, 1, 1, 12), **kwargs)
        return mod
    elif modelType == 'sgd':
        # Online classifiers http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_comparison.html
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(**kwargs)
        return sgd
    elif modelType == 'perceptron':
        from sklearn.linear_model import Perceptron
        perceptron = Perceptron(**kwargs)
        return perceptron
    elif modelType == 'xgboost':
        import xgboost as xgb
        xgbm = xgb.XGBClassifier(**kwargs)
        return xgbm
    elif modelType == 'baseNN':
        from keras.models import Sequential
        from keras.layers import Dense
        # create model
        model = Sequential()
        assert args.get('inputParams', None)
        assert args.get('outputParams', None)
        model.add(Dense(inputParams))
        model.add(Dense(outputParams))
        if args.get('compileParams'):
            # Compile model
            model.compile(compileParams)  # loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model
    elif modelType == 'lightGBMRegression':
        from pylightgbm.models import GBMRegressor
        lgbm_lreg = GBMRegressor(num_iterations=100, early_stopping_round=10,
                                 num_leaves=10, min_data_in_leaf=10)
        return lgbm_lreg
    elif modelType == 'lightGBMBinaryClass':
        from pylightgbm.models import GBMClassifier
        lgbm_bc = GBMClassifier(metric='binary_error', min_data_in_leaf=1)
        return lgbm_bc
    # Clustering models
    elif modelType == 'KMeans':
        assert n_clusters, "Number of clusters argument mandatory"
        cluster_callable = KMeans
        # seed of 10 for reproducibility.
        clusterer = cluster_callable(n_clusters=n_clusters, random_state=10)
        return clusterer
    elif modelType == 'dbscan':
        if not n_clusters:
            logging.warn("Number of clusters irrelevant for cluster type : %s" % (modelType))
        cluster_callable = DBSCAN
        clusterer = cluster_callable(eps=0.5)
        return clusterer
    elif modelType == 'affinity_prop':
        if not n_clusters:
            logging.warn("Number of clusters irrelevant for cluster type : %s" % (modelType))
        clusterer = AffinityPropagation(damping=.9, preference=-200)
        return clusterer
    elif modelType == 'spectral':
        assert n_clusters, "Number of clusters argument mandatory"
        clusterer = SpectralClustering(n_clusters=n_clusters, eigen_solver='arpack',
                                       affinity="nearest_neighbors")
        return clusterer
    elif modelType == 'birch':
        if not n_clusters:
            logging.warn("Number of clusters irrelevant for cluster type : %s" % (modelType))
        clusterer = Birch(n_clusters=2)
        return clusterer
    elif modelType == 'agglomerativeCluster':
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(dataframe, n_neighbors=10, include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        clusterer = AgglomerativeClustering(n_clusters=cluster, linkage='ward',
                                            connectivity=connectivity)
        return clusterer
    elif modelType == 'meanShift':
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(dataframe, quantile=0.3)
        clusterer = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        return clusterer
    elif modelType == 'gmm':
        from sklearn import mixture
        gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
        return gmm
    elif modelType == 'dgmm':
        from sklearn import mixture
        dgmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type='full')
        return dgmm
    else:
        raise ValueError('Unknown model type: see utils.py for available')
    # SVM
    regression(svm.SVR(kernel="rbf")),
    regression(svm.NuSVR(kernel="rbf")),
    classification_binary(svm.SVC(kernel="rbf", **SVC_PARAMS)),
    classification_binary(svm.SVC(kernel="linear", **SVC_PARAMS)),
    classification_binary(svm.SVC(kernel="poly", degree=2, **SVC_PARAMS)),
    classification_binary(svm.SVC(kernel="sigmoid", **SVC_PARAMS)),
    classification_binary(svm.NuSVC(kernel="rbf", **SVC_PARAMS)),
    classification(svm.SVC(kernel="rbf", **SVC_PARAMS)),
    classification(svm.NuSVC(kernel="rbf", **SVC_PARAMS)),

    # Linear Regression
    regression(linear_model.LinearRegression()),
    regression(linear_model.HuberRegressor()),
    regression(linear_model.ElasticNet(random_state=RANDOM_SEED)),
    regression(linear_model.ElasticNetCV(random_state=RANDOM_SEED)),
    regression(linear_model.TheilSenRegressor(random_state=RANDOM_SEED)),
    regression(linear_model.Lars()),
    regression(linear_model.LarsCV()),
    regression(linear_model.Lasso(random_state=RANDOM_SEED)),
    regression(linear_model.LassoCV(random_state=RANDOM_SEED)),
    regression(linear_model.LassoLars()),
    regression(linear_model.LassoLarsCV()),
    regression(linear_model.LassoLarsIC()),
    regression(linear_model.OrthogonalMatchingPursuit()),
    regression(linear_model.OrthogonalMatchingPursuitCV()),
    regression(linear_model.Ridge(random_state=RANDOM_SEED)),
    regression(linear_model.RidgeCV()),
    regression(linear_model.BayesianRidge()),
    regression(linear_model.ARDRegression()),
# In[89]:

# ridge regression
rreg = linear_model.Ridge()
rreg.fit(X, Y)
Y_p_r = rreg.predict(X_t)
print("Ridge Regression")
print("Mean squared error: %.2f" % mean_squared_error(Y_t, Y_p_r))
print("Mean absolute error: %.2f" % mean_absolute_error(Y_t, Y_p_r))

# In[90]:

# lasso regression
lareg = linear_model.Lasso()
lareg.fit(X, Y)
Y_p_l = lareg.predict(X_t)
print("Lasso Regression")
print("Mean squared error: %.2f" % mean_squared_error(Y_t, Y_p_l))
print("Mean absolute error: %.2f" % mean_absolute_error(Y_t, Y_p_l))

# In[93]:

# elasticnet regression
enreg = linear_model.ElasticNet()
enreg.fit(X, Y)
Y_p_en = enreg.predict(X_t)
print("ElasticNet Regression")
print("Mean squared error: %.2f" % mean_squared_error(Y_t, Y_p_en))
print("Mean absolute error: %.2f" % mean_absolute_error(Y_t, Y_p_en))
        train(algos, X_train, y_train)
        errorList = errors(algos, X_test, y_test)
        g = 0
        for algo in algos:
            results[g, b] = errorList[algo]
            g = g + 1
        b = b + 1

    finalResult = {}
    g = 0
    for algo in algos:
        finalResult[algo] = np.mean(results[g, :])
        g = g + 1
    return finalResult


algos = {
    'elasticNet': linear_model.ElasticNet(alpha=0.9, l1_ratio=.1)
}

data = pullAndMerge('SP500', 'TWEXB', 'CPIHOSNS', 'A191RL1Q225SBEA', 'PSAVERT',
                    'T10YFF', 'DGS10', 'BAMLH0A0HYM2', 'T10Y2Y', 'FEDFUNDS',
                    'USROA', 'USROE', 'USSTHPI', 'STLFSI', 'NCBCMDPMVCE',
                    'MPRIME', 'CILACBQ158SBOG', 'INTDSRUSM193N',
                    'TERMCBAUTO48NS', 'TOTLL', 'BAMLH0A0HYM2EY', 'BAMLC0A0CM',
                    'RU3000VTR', 'TOTBKCR')
print(data.head())

dates = data['DATE']
num = data['SP500']
volatility = getFinalVolt(data['SP500'], regDepth=200, smoothDepth=200)[400:]
indicators = data.drop(['DATE', 'SP500'], axis=1).as_matrix()[400:]
X, y = setLag(indicators, volatility, lag=90)
async def ensemble_score(self, target, model_type, websocket, app_id, user_id, exclude_list=[]):
    await websocket.send_text(json.dumps({"type": "message", "data": "Importing different models", "percent": 5}))
    all_estimators = {
        'categorical': {
            "gradient_boost_c": {"name": "Gradient Boosting Classifier", "model": ensemble.GradientBoostingClassifier(), "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']},
            "rf_c": {"name": "RandomForest Classifier", "model": ensemble.RandomForestClassifier(), "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']},
            "dt_c": {"name": "Decision Tree Classifier", "model": tree.DecisionTreeClassifier(), "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']},
            "ada_c": {"name": "Ada Boosting", "model": ensemble.AdaBoostClassifier(), "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']},
            "lda_c": {"name": "Linear Discriminant Analysis", "model": discriminant_analysis.LinearDiscriminantAnalysis(), "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']},
            "ridge_c": {"name": "Ridge Classifier", "model": linear_model.RidgeClassifier(), "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']},
            "logistic_c": {"name": "Logistic Regression", "model": linear_model.LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=10000), "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']},
            "knn_c": {"name": "K Neighbours Classifier", "model": neighbors.KNeighborsClassifier(), "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']},
            "qda_c": {"name": "Quadratic Discriminant Analysis", "model": discriminant_analysis.QuadraticDiscriminantAnalysis(), "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']}
        },
        'numerical': {
            "linear_r": {"name": "Linear Regression", "model": linear_model.LinearRegression(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "lasso_r": {"name": "Lasso Regression", "model": linear_model.Lasso(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "ridge_r": {"name": "Ridge Regression", "model": linear_model.Ridge(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "enet_r": {"name": "Elastic Net", "model": linear_model.ElasticNet(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "least_angle_r": {"name": "Least Angle Regression", "model": linear_model.Lars(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "lasso_least_angle_r": {"name": "Lasso Least Angle Regression", "model": linear_model.LassoLars(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "gradient_boost_r": {"name": "Gradient Boosting Regression", "model": ensemble.GradientBoostingRegressor(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "rf_r": {"name": "RandomForest Regression", "model": ensemble.RandomForestRegressor(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "dt_r": {"name": "Decision Tree Regression", "model": tree.DecisionTreeRegressor(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "ada_r": {"name": "Ada Boosting Regression", "model": ensemble.AdaBoostRegressor(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "svm_linear_r": {"name": "SVM - Linear Kernel", "model": svm.SVR(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "knn_r": {"name": "K Neighbours Classifier", "model": neighbors.KNeighborsRegressor(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']},
            "naive_r": {"name": "Naive Bayes", "model": linear_model.BayesianRidge(), "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']}
        }
    }
    await websocket.send_text(json.dumps({"type": "message", "data": model_type, "percent": 6}))
    exclude_list.append(target)
    await websocket.send_text(json.dumps({"type": "message", "data": 'Selecting Target', "percent": 7}, cls=NpEncoder))

    for col in self.columns:
        await websocket.send_text(json.dumps({"type": "message", "data": 'Detecting dates..', "percent": 8}, cls=NpEncoder))
        if self[col].dtype == 'object':
            try:
                self[col] = pd.to_datetime(self[col])
                await websocket.send_text(json.dumps({"type": "message", "data": 'Convert objects to dates..', "percent": 9}, cls=NpEncoder))
            except ValueError:
                pass

    # Exclude all columns which are given in list
    X = self[self.columns.difference(exclude_list)]
    features = X.columns
    await websocket.send_text(json.dumps({"type": "message", "data": 'Selecting Features', "percent": 10}, cls=NpEncoder))

    if model_type == 'categorical':
        y = self[target].astype('int')
        estimators = all_estimators[model_type]
        await websocket.send_text(json.dumps({"type": "message", "data": 'Classification models initiating', "percent": 11}))
    else:
        y = self[target].astype('float')
        estimators = all_estimators[model_type]
        await websocket.send_text(json.dumps({"type": "message", "data": 'Regression models initiating', "percent": 11}))

    counter = 10
    # skf = model_selection.StratifiedKFold(n_splits=2, random_state=None)
    await websocket.send_text(json.dumps({"type": "message", "data": 'Cross validation initiated', "percent": counter}))

    model_summary = []
    for index, model in enumerate(estimators.keys()):
        logger.info(estimators[model]['name'] + ' estimator initiating')
        score_auc = np.empty((0, 0))
        score_recall = np.empty((0, 0))
        score_acc = np.empty((0, 0))
        score_precision = np.empty((0, 0))
        score_kappa = np.empty((0, 0))
        score_f1 = np.empty((0, 0))
        score_mcc = np.empty((0, 0))
        score_mae = np.empty((0, 0))
        score_mse = np.empty((0, 0))
        score_rmse = np.empty((0, 0))
        score_r2 = np.empty((0, 0))
        score_rmsle = np.empty((0, 0))
        score_mape = np.empty((0, 0))
        start = time.time()

        # for train_index, test_index in skf.split(X, y):
        counter = counter + index
        # X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        # y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=.3)
        await websocket.send_text(json.dumps({"type": "message", "data": estimators[model]['name'] + ' estimator initiating', "percent": counter}))
        clf = estimators[model]['model']
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        logger.info(estimators[model]['name'] + ' estimator completed')

        if model_type == 'categorical':
            # score_auc = np.append(score_auc, metrics.roc_auc_score(y_pred, y_test, multi_class="ovo"))
            score_acc = np.append(score_acc, metrics.accuracy_score(y_pred, y_test))
            score_recall = np.append(score_recall, metrics.recall_score(y_pred, y_test, average=None))
            score_precision = np.append(score_precision, metrics.precision_score(y_pred, y_test, average=None))
            score_kappa = np.append(score_kappa, metrics.cohen_kappa_score(y_pred, y_test))
            score_mcc = np.append(score_mcc, metrics.matthews_corrcoef(y_pred, y_test))
            score_f1 = np.append(score_f1, metrics.f1_score(y_pred, y_test, average='micro'))
        if model_type == 'numerical':
            score_acc = np.append(score_acc, metrics.explained_variance_score(y_test, y_pred))
            score_mae = np.append(score_mae, metrics.mean_absolute_error(y_test, y_pred))
            score_mse = np.append(score_mse, metrics.mean_squared_error(y_test, y_pred))
            score_rmse = np.append(score_rmse, metrics.mean_squared_error(y_test, y_pred, squared=False))
            score_r2 = np.append(score_r2, metrics.r2_score(y_test, y_pred))
            score_rmsle = np.append(score_rmsle, np.sqrt(np.mean(
                np.square(np.log1p(y_test.ravel() - y_test.ravel().min() + 1)
                          - np.log1p(y_pred.ravel() - y_pred.ravel().min() + 1)))))
            # print("RMSE", y_test)
            # print("RMSE", y_test.shape)
            # print("RMSE2", y_pred.shape)
            # print("Predicted", y_pred)

        await websocket.send_text(json.dumps({"type": "message", "data": estimators[model]['name'] + ' accuracy', "percent": counter}))
        end = time.time()

        summary = {}
        for col in features:
            summary[col] = {
                "min": X[col].min(),
                "max": X[col].max(),
                "default": X[col].iloc[0],
                "dtype": str(X[col].dtype)
            }

        if model_type == 'categorical':
            temp = {
                "name": estimators[model]['name'],
                "type": model_type,
                "scores": score_acc,
                "accuracy": score_acc.mean(),
                "time_elasped": (end - start),
                # "auc": score_auc.mean(),
                'f1': score_f1.mean(),
                "recall": score_recall.mean(),
                "precision": score_precision.mean(),
                "kappa": score_kappa.mean(),
                "features": summary,
                "target": target
            }
            # app_id, model_id, name, score)
            await websocket.send_text(json.dumps({"type": "message", "data": estimators[model]['name'] + ' saving into cloud', "percent": counter}, cls=NpEncoder))
            model_id, model_path = await self.save_model_to_s3(clf, app_id, user_id, estimators[model]['name'], score_acc.mean(), json.dumps(temp, cls=NpEncoder))
            temp['model_id'] = model_id
            model_summary.append(temp)
        else:
            temp = {
                "name": estimators[model]['name'],
                "type": model_type,
                "scores": score_acc,
                "accuracy": score_acc.mean(),
                "time_elasped": (end - start),
                "mae": score_mae.mean(),
                "mse": score_mse.mean(),
                "rmse": score_rmse.mean(),
                "rmsle": score_rmsle.mean(),
                "r2": score_r2.mean(),
                "features": summary,
                "target": target
            }
            await websocket.send_text(json.dumps({"type": "message", "data": estimators[model]['name'] + ' saving into cloud', "percent": counter}, cls=NpEncoder))
            model_id, model_path = await self.save_model_to_s3(clf, app_id, user_id, estimators[model]['name'], score_acc.mean(), json.dumps(temp, cls=NpEncoder))
            temp['model_id'] = model_id
            model_summary.append(temp)

    return model_summary
# Excluding outliers from data
train = train[train['visitors_pool_total'] < 3500]


# Extract validation data from training data
def create_validation_data(training_data):
    train_validation, test_validation = train_test_split(training_data, test_size=0.3)
    return train_validation, test_validation


# Creating elastic net object (alpha=0.5, l1_ratio=0.8, max_iter=1000000,
# tol=0.0000001) and predictors
model = linear_model.ElasticNet(alpha=0.5, l1_ratio=0.8, fit_intercept=True,
                                normalize=False, max_iter=1000000, tol=0.0000001)
predictors = list(train.columns.values)
predictors.remove('date')
predictors.remove('visitors_pool_total')

# Calculating error (RMSE) for the model
rmse_group = list()
model_counter = 0
for iter in xrange(CROSS_VALIDATION_ITER):
    model_counter += 1
    train_validation, test_validation = create_validation_data(train)
    model.fit(train_validation[predictors], train_validation['visitors_pool_total'])
image_paths = lookup.values()
mr = get_images_df(file_paths=image_paths, mask=standard_mask)
image_ids = [int(os.path.basename(x).split(".")[0]) for x in image_paths]
mr.index = image_ids

# What we can do is generate a predicted image for a particular set of concepts
# (e.g., for a left-out image) by simply multiplying the concept vector by the
# regression parameters at each voxel. Then you can do the Mitchell trick of
# asking whether you can accurately classify two left-out images by matching
# them with the two predicted images.
regression_params = pandas.DataFrame(0, index=mr.columns, columns=concepts)

print "Training voxels..."
for voxel in mr.columns:
    train = mr.index
    Y = mr.loc[train, voxel].tolist()
    Xtrain = X.loc[train, :]
    # Use regularized regression
    clf = linear_model.ElasticNet(alpha=0.1)
    clf.fit(Xtrain, Y)
    regression_params.loc[voxel, :] = clf.coef_.tolist()

regression_params.to_csv(output_file, sep="\t")

# GENERATE BRAIN IMAGES FOR REGRESSION PARAMS
image_folder = "%s/regression_param_images" % (update)
if not os.path.exists(image_folder):
    os.mkdir(image_folder)

for concept in regression_params.columns.tolist():
    data = regression_params[concept].tolist()
    empty_nii = numpy.zeros(standard_mask.shape)
    empty_nii[standard_mask.get_data() != 0] = data
    nii = nibabel.Nifti1Image(empty_nii, affine=standard_mask.get_affine())
def elasticRegression(train, trainLable, testData):
    clf = linear_model.ElasticNet(alpha=0.6, l1_ratio=0.5)
    clf.fit(train, trainLable)
    predict = clf.predict(testData)
    return predict
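A small hedged usage sketch for the helper above, on synthetic data; the toy arrays and coefficients are assumptions for illustration only.

```python
import numpy as np
from sklearn import linear_model

# Toy regression data: 100 training samples, 5 features, 10 held-out samples.
rng = np.random.RandomState(0)
X_train = rng.randn(100, 5)
y_train = X_train @ np.array([1.0, 0.0, -2.0, 0.5, 0.0]) + 0.1 * rng.randn(100)
X_test = rng.randn(10, 5)

preds = elasticRegression(X_train, y_train, X_test)
print(preds.shape)  # (10,)
```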
                                 y_train, cv=10,
                                 scoring=metrics.make_scorer(rmse))
crossValidate.get('fit_time').mean()
crossValidate.get('score_time').mean()
crossValidate.get('test_score').mean()
crossValidate.get('train_score').mean()

# Ridge: calculates the sum of the squared values of the weights, called L2.
# It shrinks the parameters, therefore it is mostly used to prevent multicollinearity.
# It reduces the model complexity by coefficient shrinkage.
ridge_estimator = linear_model.Ridge()
ridge_grid = {'alpha': [1.0], 'max_iter': [50]}
ridgeModel = fit_model(ridge_estimator, ridge_grid, X_train1, y_train)
coef = ridgeModel.coef_
intercept = ridgeModel.intercept_

# LASSO: Least Absolute Shrinkage and Selection Operator
# Calculates the sum of the absolute values of the weights, called L1.
# Nullifies low-importance features with a 0 coefficient.
# It is generally used when we have a larger number of features, because it automatically does feature selection.
lasso_estimator = linear_model.Lasso()
lasso_grid = {'alpha': [0.5, 0.9]}
lassoModel = fit_model(lasso_estimator, lasso_grid, X_train1, y_train)
coef = lassoModel.coef_
intercept = lassoModel.intercept_

# ElasticNet is a combination of Ridge & LASSO
enet_estimator = linear_model.ElasticNet()
enet_grid = {'alpha': [0.1], 'l1_ratio': [0.3]}
fit_model(enet_estimator, enet_grid, X_train1, y_train)
# TODO: sample weights: 'class_weight' and 'sample_weight' can be used
''' RUNNING OPTIONS FOR MODELS '''
visualizeCGI = True
numAutoLabel = 0  # worse than label spreading?
folds = 5
scaler = preprocessing.StandardScaler(copy=False)
# scaler = preprocessing.MinMaxScaler(copy=False)
regressors = [
    dummy.DummyRegressor(),
    svm.LinearSVR(),
    svm.SVR(kernel='rbf', gamma='scale'),
    linear_model.Ridge(),
    linear_model.Lasso(),
    linear_model.ElasticNet(),
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    ensemble.AdaBoostRegressor()
]
classifiers = [
    dummy.DummyClassifier(),
    svm.LinearSVC(),
    svm.SVC(kernel='rbf', gamma='scale'),
    neighbors.KNeighborsClassifier(),
    ensemble.RandomForestClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.AdaBoostClassifier()
]
def elastic_net(data, y, x_val, y_val, i):
    from sklearn import linear_model
    reg = linear_model.ElasticNet(random_state=i)
    reg.fit(data, y)
    y_predict = reg.predict(x_val)
    return reg.score(x_val, y_val)
noise_sd = 10
X, y, coef = datasets.make_regression(n_samples=50, n_features=100,
                                      noise=noise_sd, n_informative=2,
                                      random_state=42, coef=True)

# Use this to tune the noise parameter such that snr < 5
print("SNR:", np.std(np.dot(X, coef)) / noise_sd)

# param grid over alpha & l1_ratio
param_grid = {'alpha': 10. ** np.arange(-3, 3), 'l1_ratio': [.1, .5, .9]}

# Wrap
model = GridSearchCV(lm.ElasticNet(max_iter=10000), param_grid, cv=5)

# 1) Biased usage: fit on all data, omit outer CV loop
model.fit(X, y)
print("Train r2:%.2f" % metrics.r2_score(y, model.predict(X)))
print(model.best_params_)

# 2) User-made outer CV, useful to extract specific information
cv = KFold(len(y), n_folds=5, random_state=42)
y_test_pred = np.zeros(len(y))
y_train_pred = np.zeros(len(y))
alphas = list()

for train, test in cv:
    X_train, X_test, y_train, y_test = X[train, :], X[test, :], y[train], y[test]
                                 format='pkl')
fifth_features = ngrams.load_data(
    '../datasets/grams_dict2002-2016/5grams_feature_relative.pkl',
    format='pkl')

# List of models to run
model_zoo = Counter()
# model_zoo['OLS'] = linear_model.LinearRegression()
model_zoo['OLS'] = linear_model.RidgeCV(
    np.array([10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]))
model_zoo['PLS'] = cross_decomposition.PLSRegression(n_components=200)
model_zoo['RF-30'] = RandomForestRegressor(max_features="sqrt", n_estimators=30)
model_zoo['RF-log50'] = RandomForestRegressor(max_features="log2", n_estimators=50)
model_zoo['Elastic Net'] = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.7)

# choosing number of grams
features_list = [
    uni_feature, bi_features, tri_features, forth_features, fifth_features
]

# process and stack ngram features, make it numpy array if PLS will be used
ngrams.stack_processed_features(train_data, test_data, judge_year_index,
                                features_list, to_array=True)

# Run various models
youngsters = np.where(y_t1 <= AGE)
y_old = np.delete(y_t1, youngsters)
data_old = np.delete(X, youngsters, axis=0)
data_old = np.delete(data_old, np.where(areas_notdeg2 == 1), axis=1)

adults = np.where(y_t1 > AGE)
y_young = np.delete(y_t1, adults)
data_young = np.delete(X, adults, axis=0)
data_young = np.delete(data_young, np.where(areas_notdeg2 == 1), axis=1)

# prepare a range of parameters to test
alphas = np.array([1, ])
l1_ratio = np.array([0.9])

# We have chosen to just normalize the data by default; you could GridSearchCV this if you wanted
model = linear_model.ElasticNet()
grid = GridSearchCV(estimator=model,
                    param_grid=dict(alpha=alphas, l1_ratio=l1_ratio))
grid.fit(data_old, y_old)

# we want 30 features
grid.best_estimator_.alpha = 0.01
grid.best_estimator_.l1_ratio = .9999
lm = linear_model.ElasticNet(alpha=grid.best_estimator_.alpha,
                             l1_ratio=grid.best_estimator_.l1_ratio)

error_old_tot = np.empty((data_old.shape[1], NUM_REPS))
for idx in range(NUM_REPS):
    X_train, X_test, y_train, y_test_old = train_test_split(data_old, y_old)
    lm.fit(X_train, y_train)
    error_old_tot[:, idx] = lm.coef_
mean_pred_old = np.nanmean(error_old_tot, axis=1)
def calculate_spot_features(spot_id, all_input, all_params):
    bm_data, meta_data, db_path, NN_OUTPUT, scramble_seed = all_input
    alpha, radius, l1_ratio = all_params

    conn = sqlite3.connect(db_path)
    conn.text_factory = str
    c = conn.cursor()

    sel_cmd = ('SELECT {coi1} FROM {tn1} WHERE {coi2}=={0}').format(
        spot_id, coi1='cell_id', coi2='spot_id', tn1='cells')
    c.execute(sel_cmd)
    results = c.fetchall()
    cell_id = np.asarray([results[i][0] for i in xrange(len(results))])
    spot_bm = bm_data[cell_id - 1, :]      # biomarker values of the spot
    spot_meta = meta_data[cell_id - 1, :]  # x,y coordinates of each cell

    sel_cmd = ('SELECT {coi1} FROM {tn1} WHERE {coi2}=={0}').format(
        spot_id, coi1='spot_name', coi2='spot_id', tn1='spots')
    c.execute(sel_cmd)
    results = c.fetchall()
    spot_name = results[0][0]

    # get nearest neighbors
    spot_xy = spot_meta[:, :2]
    tree = KDTree(spot_xy)
    ind, dist = tree.query_radius(spot_xy, r=radius, return_distance=True)

    # if scramble
    if scramble_seed:
        np.random.seed(scramble_seed)
        spot_bm = spot_bm[np.random.permutation(np.arange(len(cell_id))), :]
        output_name = os.path.join(NN_OUTPUT, spot_name + '_seed_' + str(scramble_seed) + '.mat')
    else:
        output_name = os.path.join(NN_OUTPUT, spot_name + '.mat')
    if os.path.isfile(output_name):
        return 1

    # num_non_zeros_nn = []  # number of contributing neighbors
    entropies = []        # entropy
    angle_std = []        # std
    neighbor_id = []      # id of neighbor
    residuals = []        # residuals at each cells
    coeff_neighbors = []  # coefficient of neighbor each cell
    pmf_neighbors = []
    for j in xrange(spot_bm.shape[0]):
        clf = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        cell_bm = spot_bm[j, :].reshape(1, -1)  # cell of question
        cell_xy = spot_xy[j, :]
        nb_bm = spot_bm[ind[j][dist[j] != 0], :]  # eliminate itself
        nb_xy = spot_xy[ind[j][dist[j] != 0], :]
        curr_neigh_ind = ind[j][dist[j] != 0]
        neighbor_id.append(curr_neigh_ind)
        if np.min(nb_bm.shape) > 0:
            clf.fit(nb_bm.T, cell_bm.T)
            pred = clf.predict(nb_bm.T).reshape(1, -1)
            resid_biomarkers = np.square(cell_bm - pred)
            residuals.append(np.sqrt(resid_biomarkers.sum()) / cell_bm.shape[1])
            coeff_neighbors.append(clf.coef_)
            nb_diff_xy = nb_xy - cell_xy
            nb_diff_xy = nb_diff_xy / np.linalg.norm(nb_diff_xy, axis=1).reshape(-1, 1)  # normalize the vector
            if len(clf.coef_) >= 2:  # and (np.linalg.norm(clf.coef_) > 0):
                nb_angles = [0]
                for k in xrange(len(clf.coef_) - 1):
                    a = np.dot(nb_diff_xy[0, :], nb_diff_xy[k + 1, :])
                    a = np.sign(a) * np.minimum(1, np.abs(a))
                    nb_angles.append(math.acos(a))
                nb_angles = [(nb_angles[i] + np.pi / 100) % 2 * np.pi for i in xrange(len(nb_angles))]
                bin_vec = np.linspace(0, 2 * np.pi, num=36)
                nb_bin = [(bin_vec - nb_angles[i] >= 0).nonzero()[0][0] for i in range(len(nb_angles))]
                pk = np.zeros(36)
                for i in xrange(len(nb_angles)):
                    pk[nb_bin[i]] = pk[nb_bin[i]] + np.abs(clf.coef_[i])
                pmf_neighbors.append(np.array(pk))
                cell_entropy = scipy.stats.entropy(pk + 1e-6)
                if np.isinf(cell_entropy):
                    mat_dict = {'passed': 0, 'spot_meta': spot_meta, 'entropies': entropies,
                                'residuals': residuals, 'angle_std': angle_std, 'pmf': pmf_neighbors,
                                'coeff_neigh': coeff_neighbors, 'neighbor_id': neighbor_id}
                    scipy.io.savemat(os.path.join(NN_OUTPUT, spot_name + '.mat'), mat_dict)
                    return 0
                entropies.append(cell_entropy)
                coef_ = np.abs(clf.coef_) / np.sum(np.abs(clf.coef_))
                cos_mean_resultant = np.sum([coef_[i] * math.cos(nb_angles[i]) for i in range(len(coef_))])
                sin_mean_resultant = np.sum([coef_[i] * math.sin(nb_angles[i]) for i in range(len(coef_))])
                mean_resultant = np.sqrt(cos_mean_resultant ** 2 + sin_mean_resultant ** 2)
                cell_angle_std = np.sqrt(2 * (1 - mean_resultant))
                if np.isnan(cell_angle_std):
                    mat_dict = {'passed': 0, 'spot_meta': spot_meta, 'entropies': entropies,
                                'residuals': residuals, 'angle_std': angle_std, 'pmf': pmf_neighbors,
                                'coeff_neigh': coeff_neighbors, 'neighbor_id': neighbor_id}
                    scipy.io.savemat(output_name, mat_dict)
                    return 0
                angle_std.append(cell_angle_std)
            else:
                pmf_neighbors.append([0])
                entropies.append(0)
                angle_std.append(0)
        else:
            # print spot_name, j, nb_bm.shape
            residuals.append(0)
            entropies.append(0)
            angle_std.append(0)
            pmf_neighbors.append([])
            coeff_neighbors.append([])

    mat_dict = {'passed': 1, 'spot_meta': spot_meta, 'entropies': entropies,
                'residuals': residuals, 'angle_std': angle_std, 'pmf': pmf_neighbors,
                'coeff_neigh': coeff_neighbors, 'neighbor_id': neighbor_id}
    scipy.io.savemat(output_name, mat_dict)
    return 1
def __init__(self, method, yrange, params, i=0):
    # TODO: yrange doesn't currently do anything. Remove or do something with it!
    self.algorithm_list = [
        'PLS', 'GP', 'OLS', 'OMP', 'Lasso', 'Elastic Net', 'Ridge',
        'Bayesian Ridge', 'ARD', 'LARS', 'LASSO LARS', 'SVR', 'KRR',
    ]
    self.method = method
    self.outliers = None
    self.ransac = False
    print(params)

    if self.method[i] == 'PLS':
        self.model = PLSRegression(**params[i])

    if self.method[i] == 'OLS':
        self.model = linear.LinearRegression(**params[i])

    if self.method[i] == 'OMP':
        # check whether to do CV or not
        self.do_cv = params[i]['CV']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # Remove CV parameter
        params_temp.pop('CV')
        if self.do_cv is False:
            self.model = linear.OrthogonalMatchingPursuit(**params_temp)
        else:
            if 'precompute' in params[i]:
                params_temp.pop('precompute')
            self.model = linear.OrthogonalMatchingPursuitCV(**params_temp)

    if self.method[i] == 'LASSO':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # check whether to do CV or not
        try:
            self.do_cv = params[i]['CV']
            # Remove CV parameter
            params_temp.pop('CV')
        except:
            self.do_cv = False
        if self.do_cv is False:
            self.model = linear.Lasso(**params_temp)
        else:
            params_temp.pop('alpha')
            self.model = linear.LassoCV(**params_temp)

    if self.method[i] == 'Elastic Net':
        params_temp = copy.copy(params[i])
        try:
            self.do_cv = params[i]['CV']
            params_temp.pop('CV')
        except:
            self.do_cv = False
        if self.do_cv is False:
            self.model = linear.ElasticNet(**params_temp)
        else:
            params_temp['l1_ratio'] = [.1, .5, .7, .9, .95, .99, 1]
            self.model = linear.ElasticNetCV(**params_temp)

    if self.method[i] == 'Ridge':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        try:
            # check whether to do CV or not
            self.do_cv = params[i]['CV']
            # Remove CV parameter
            params_temp.pop('CV')
        except:
            self.do_cv = False
        if self.do_cv:
            self.model = linear.RidgeCV(**params_temp)
        else:
            self.model = linear.Ridge(**params_temp)

    if self.method[i] == 'BRR':
        self.model = linear.BayesianRidge(**params[i])

    if self.method[i] == 'ARD':
        self.model = linear.ARDRegression(**params[i])

    if self.method[i] == 'LARS':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        try:
            # check whether to do CV or not
            self.do_cv = params[i]['CV']
            # Remove CV parameter
            params_temp.pop('CV')
        except:
            self.do_cv = False
        if self.do_cv is False:
            self.model = linear.Lars(**params_temp)
        else:
            self.model = linear.LarsCV(**params_temp)

    if self.method[i] == 'LASSO LARS':
        model = params[i]['model']
        params_temp = copy.copy(params[i])
        params_temp.pop('model')
        if model == 0:
            self.model = linear.LassoLars(**params_temp)
        elif model == 1:
            self.model = linear.LassoLarsCV(**params_temp)
        elif model == 2:
            self.model = linear.LassoLarsIC(**params_temp)
        else:
            print("Something went wrong, 'model' should be 0, 1, or 2")

    if self.method[i] == 'SVR':
        self.model = svm.SVR(**params[i])

    if self.method[i] == 'KRR':
        self.model = kernel_ridge.KernelRidge(**params[i])

    if self.method[i] == 'GP':
        # get the method for dimensionality reduction and the number of components
        self.reduce_dim = params[i]['reduce_dim']
        self.n_components = params[i]['n_components']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # Remove parameters not accepted by Gaussian Process
        params_temp.pop('reduce_dim')
        params_temp.pop('n_components')
        self.model = GaussianProcess(**params_temp)
# In[8]:

# features_df = df[['sqft_living', 'bathrooms', 'sqft_living15', 'grade', 'bedrooms', 'floors', 'waterfront',
#                   'view', 'sqft_above', 'sqft_basement', 'sqft_lot15', 'lat', 'is_renovated_in_last_10_years']]
features_df = df.drop('price', axis=1)
features_df = SelectPercentile(percentile=75).fit(
    features_df, df.price).transform(features_df)
features_df = StandardScaler().fit(features_df).transform(features_df)

# In[9]:

x_train, x_test, y_train, y_test = train_test_split(features_df, df.price)

# In[15]:

linear_regr = linear_model.ElasticNet(
    alpha=0.001, max_iter=5000)  # RandomForestRegressor(n_estimators=75)
model = linear_regr.fit(x_train, y_train)
predictions = model.predict(x_test)

# In[17]:

plt.scatter(y_test, predictions)
plt.rcParams["figure.figsize"] = (15, 12)
plt.show()

# In[19]:

print("Mean squared error: %.3f" % mean_squared_error(y_test, predictions))
print("Mean absolute error: %.3f" % mean_absolute_error(y_test, predictions))
print('Variance score: %.3f' % r2_score(y_test, predictions))