regression(linear_model.LassoLarsIC()), regression(linear_model.OrthogonalMatchingPursuit()), regression(linear_model.OrthogonalMatchingPursuitCV()), regression(linear_model.Ridge(random_state=RANDOM_SEED)), regression(linear_model.RidgeCV()), regression(linear_model.BayesianRidge()), regression(linear_model.ARDRegression()), regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)), regression( linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)), # Logistic Regression classification( linear_model.LogisticRegression(random_state=RANDOM_SEED)), classification( linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifierCV()), classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegression(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)), classification_binary( linear_model.RidgeClassifier(random_state=RANDOM_SEED)), classification_binary(linear_model.RidgeClassifierCV()), classification_binary( linear_model.SGDClassifier(random_state=RANDOM_SEED)), # Decision trees regression(tree.DecisionTreeRegressor(**TREE_PARAMS)),
def classify(X, y):
    """Fit a default ``LogisticRegressionCV`` on ``(X, y)`` and return it.

    Parameters
    ----------
    X : training features, passed straight to ``fit``.
    y : training labels, passed straight to ``fit``.

    Returns
    -------
    The fitted ``linear_model.LogisticRegressionCV`` instance.
    """
    # ``fit`` returns the estimator itself, so the call can be chained.
    return linear_model.LogisticRegressionCV().fit(X, y)
test_X = imp.transform(test_X) #Get method print "Choose model. Options:" print "1. Logistic regression" print "2. Support vector classification (linear kernel)" print "3. Support vector classification (RBF kernel)" print "4. Decision trees" print "5. Random forests" print "6. Extra trees" choice = int(raw_input("Choose model (1-6): ")) #Make prediction if choice == 1: from sklearn import linear_model as lm clf = lm.LogisticRegressionCV() clf.fit(X, y) print "We chose from these C values for CV: ", print clf.Cs_ print "Best C value is: ", print clf.C_ print "Slope is: ", print clf.coef_ print "Intercept is: ", print clf.intercept_ howgood = clf.score(X, y) print "In-sample score is: %.5f" % howgood test_y = clf.predict(test_X) #print test_y elif choice == 2: from sklearn import svm, grid_search
print_image(vector2image(X_train[1,:]), 'Train Cat Example') print_image(vector2image(X_train[0,:]), 'Train Dog Example') print_image(vector2image(X_test[0,:]), 'Test Example') print_image(vector2image(X_test2[0,:]), 'Test2 Example') #-------------------------------- # II - Train Logistic Regression #-------------------------------- # we don't train a statsmodels GLM, because data are too heavy for its implentation. if not os.path.isfile('obj/adv.pkl'): # sklearn LR with L2 regularization # search for the best C hyperparameter lr_l2_CV = linear_model.LogisticRegressionCV(penalty = 'l2', solver='sag', Cs=100, random_state = 42, n_jobs=-1) # Cs=100 : grid of 100 values lr_l2_CV.fit(X_train, y_train) bestC = lr_l2_CV.C_[0] print('Best C found: {0}'.format(bestC)) del lr_l2_CV # retrain LR with the best C # this is the same than above, but currently adversarialLogistic # doesn't support linear_model.LogisticRegressionCV lr_l2 = linear_model.LogisticRegression(penalty = 'l2', solver='sag', random_state = 42, C=bestC, n_jobs=-1) lr_l2.fit(X_train, y_train) lr_l2_acc_is = lr_l2.score(X = X_train, y = y_train) lr_l2_acc_oos = lr_l2.score(X = X_test, y = y_test) print('Accuracy in-sample: {0}'.format(lr_l2_acc_is))
GridSearchCV(cv=None,
             estimator=LogisticRegression(C=1.0,
                                          intercept_scaling=1,
                                          dual=False,
                                          fit_intercept=True,
                                          penalty='l2',
                                          tol=0.0001),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]})
clf = clf.fit(X, y)
y_predicted2 = clf.predict(X_test)  # predicted class
cm2 = ConfusionMatrix(y_test, y_predicted2)
cm2.print_stats()
acc2 = accuracy_score(y_test, y_predicted2)
#print(classification_report(y_test, y_predicted2))
# Evaluate THIS model: use its own predictions and its own confusion matrix
# (the previous version mixed in `y_predicted` / `cmatrix` from another model).
cmatrix2 = confusion_matrix(y_test, y_predicted2)
ROI2 = cmatrix2[1, 1] * 100 + cmatrix2[0, 0] * 15 + cmatrix2[0, 1] * (
    -15) + cmatrix2[1, 0] * (-30)

## Next, try the built-in cross-validated logistic regression
logisticCV = linear_model.LogisticRegressionCV(
    class_weight='balanced', scoring='roc_auc')  # alternative: scoring='accuracy'
logisticCV = logisticCV.fit(X, y)
y_predicted = logisticCV.predict(X_test)  # predicted class
cm = ConfusionMatrix(y_test, y_predicted)
cm.print_stats()
acc = accuracy_score(y_test, y_predicted)
#print(classification_report(y_test, y_predicted))
cmatrix = confusion_matrix(y_test, y_predicted)
# Same per-cell weighting as ROI2 above, applied to the CV model's matrix.
ROI = cmatrix[1, 1] * 100 + cmatrix[0, 0] * 15 + cmatrix[0, 1] * (
    -15) + cmatrix[1, 0] * (-30)
y, x = dmatrices(formula, rawdf, return_type="dataframe") y = y.values.flatten() logreg = linear_model.LogisticRegression(C=0.1, penalty='l1', tol=0.01) logreg.fit(x, y) scores = cross_val_score(logreg, x, y, cv=5) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) coeffdf = pd.DataFrame({'feature': x.columns, 'coeff': np.transpose(logreg.coef_).flatten()}) nflist = coeffdf[coeffdf.coeff != 0].feature.values.tolist() print(len(nflist)) # feature selection using best model from cross validation and get the best features fslogreg = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear') fslogreg.fit(x, y) fsmodel = SelectFromModel(fslogreg, prefit=True) x_new = fsmodel.transform(x) x_new.shape coeffdf = pd.DataFrame({'feature': x.columns, 'coeff': np.transpose(fslogreg.coef_).flatten()}) nflist = coeffdf[coeffdf.coeff != 0].feature.values.tolist() print(len(nflist)) ''' L2 regularisation on new features ''' formula = ModelDesc([Term([LookupFactor('rating')])], [Term([LookupFactor(c)]) for c in nflist]) y, x = dmatrices(formula, rawdf, return_type="dataframe")
X = data[:, :-1] y = data[:, -1] return X, y # Cargamos los datos de entrenamiento X_train, y_train = read_csv( '../datasets-clasificacion/train_countryriskmoodys.csv') # Cargamos los datos de generalización X_test, y_test = read_csv( '../datasets-clasificacion/test_countryriskmoodys.csv') # Escalamos datos scaler = preprocessing.StandardScaler(with_mean=False).fit(X_train) X_train_escalado = scaler.transform(X_train) X_test_escalado = scaler.transform(X_test) # ================================================================= # Probemos a optimizar el coste cd la regresión logística # http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV print( 'Rendimiento con datos estandarizados y optimizando el parámetro de coste') regr_cv = linear_model.LogisticRegressionCV(solver='liblinear') regr_cv.fit(X_train_escalado, y_train) y_test_predict = regr_cv.predict(X_test_escalado) accuracy = metrics.accuracy_score(y_test, y_test_predict) conf_matrix = metrics.confusion_matrix(y_test, y_test_predict) print('Precisión global:\t%0.2f' % (accuracy)) accuracy_per_class(conf_matrix)
def get_classifier(x, y):
    """Fit and return a multinomial logistic regression without intercept.

    The estimator is a ``linear_model.LogisticRegressionCV`` configured with
    ``fit_intercept=False`` and ``multi_class='multinomial'``; every other
    setting is left at the library default.

    Parameters
    ----------
    x : training features, passed to ``fit``.
    y : training labels, passed to ``fit``.

    Returns
    -------
    The fitted estimator.
    """
    settings = {'fit_intercept': False, 'multi_class': 'multinomial'}
    model = linear_model.LogisticRegressionCV(**settings)
    model.fit(x, y)
    return model
X_all, names, adjpvals = apply_ttest( X_all, names, outfile_root + "adj_pvals_features.txt") X_scaled = preprocessing.scale(X_all) make_scatter_plots(X_scaled, names, outfile_root + 'scatter_plots.pdf') data = np.zeros((3, 4)) dfper = pd.DataFrame( data, columns=['accuracy', 'precision', 'recall', 'roc_auc'], index=['DT', 'LR', 'RF']) # performance table clf = linear_model.LogisticRegressionCV(refit=True, random_state=PRNG) LR_imp, dfper = build_model(clf, X_scaled, Y, names, "LR", dfper, outfile_root + "LR_features.txt") clf = tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_split=1e-07, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=PRNG,
def analyze_logistic(X,
                     y,
                     model,
                     scale_columns,
                     analyze_params=False,
                     balance_outcomes=False):
    """
    Analyse a logistic regression model on a single train/test split.

    Plots cumulative gain and confusion matrices, prints accuracy and area
    ratio statistics, and optionally runs a grid search over learning rate and
    number of epochs using ``CV``. A default
    ``linear_model.LogisticRegressionCV`` is fitted on the same data for
    comparison.

    Inputs:
    - X: design matrix, shape (n, p)
    - y: targets, shape (n,)
    - model: estimator under test; must provide ``set_eta``, ``set_n_epochs``,
      ``fit`` and ``predict(X, probability=...)``
    - scale_columns: list of indices of which columns to MinMax scale
    - analyze_params: boolean, option to perform grid search of learning rate
      and n_epochs
    - balance_outcomes: boolean, option to undersample the 0-labelled training
      rows until both outcomes are equally frequent

    Returns nothing; results are plotted and printed.
    """
    # split data in train/validate and test
    X_train_val, X_test, y_train_val, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.1)

    # balance training set such that outcomes are 50/50 in training data
    if balance_outcomes:
        non_default_inds = np.where(y_train_val == 0)[0]
        default_inds = np.where(y_train_val == 1)[0]
        remove_size = len(non_default_inds) - len(default_inds)
        # Nothing to remove when class 0 is not the majority (a negative size
        # would make np.random.choice raise).
        if remove_size > 0:
            remove_inds = np.random.choice(non_default_inds,
                                           size=remove_size,
                                           replace=False)
            # The indices are positions within the training split, so they
            # must be removed from the training split -- deleting from the
            # full X / y would drop unrelated rows and leak test samples back
            # into the training data.
            X_train_val = np.delete(X_train_val, remove_inds, axis=0)
            y_train_val = np.delete(y_train_val, remove_inds, axis=0)

    # scale continuous features
    minmaxscaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = ColumnTransformer(remainder='passthrough',
                               transformers=[('minmaxscaler', minmaxscaler,
                                              scale_columns)])

    # Scale only the test data at this point; the training data is scaled
    # after the grid search (presumably CV does its own scaling -- verify
    # against CV's implementation).
    scaler.fit(X_train_val)
    X_test = scaler.transform(X_test)

    if analyze_params:
        eta_vals = np.linspace(-1, -4, 4)  # log10 of the learning rates
        n_epoch_vals = np.array([10, 100, 500, 1000])

        max_accuracy = 0
        best_eta = 0
        best_n_epochs = 0

        # Grid search over learning rate and number of epochs. Rows are
        # collected in a list because DataFrame.append no longer exists in
        # pandas >= 2.0.
        grid_rows = []
        for eta in eta_vals:
            model.set_eta(10**eta)
            for epoch in n_epoch_vals:
                model.set_n_epochs(epoch)

                # perform cross validation
                mse, r2, accuracy = CV(X_train_val, y_train_val, model)
                grid_rows.append({
                    'log eta': eta,
                    'n_epochs': epoch,
                    'mse': mse,
                    'r2': r2,
                    'accuracy': accuracy
                })

                # check if current configuration is better
                if accuracy > max_accuracy:
                    max_accuracy = accuracy
                    best_eta = eta
                    best_n_epochs = epoch

        error_scores = pd.DataFrame(
            grid_rows,
            columns=['log eta', 'n_epochs', 'mse', 'r2', 'accuracy'])

        # set optimal model parameters
        model.set_eta(10**best_eta)
        model.set_n_epochs(best_n_epochs)

        # plot heatmap of grid search
        acc_table = pd.pivot_table(error_scores,
                                   values='accuracy',
                                   index=['log eta'],
                                   columns='n_epochs')
        best_rows, best_cols = np.where(acc_table.values == max_accuracy)

        plt.figure()
        ax = sns.heatmap(acc_table,
                         annot=True,
                         fmt='.2g',
                         cbar=True,
                         linewidths=1,
                         linecolor='white',
                         cbar_kws={'label': 'Accuracy'})
        # Outline the best cell; with ties only the first match is marked.
        if len(best_rows):
            ax.add_patch(
                Rectangle((best_cols[0], best_rows[0]),
                          1,
                          1,
                          fill=False,
                          edgecolor='red',
                          lw=2))
        ax.set_xlabel('Number of epochs')
        ax.set_ylabel(r'log$_{10}$ of Learning rate')

        # Widen the y-limits by half a cell on each side.
        bottom, top = ax.get_ylim()
        ax.set_ylim(bottom + 0.5, top - 0.5)
        plt.show()

    # scale training data
    X_train_val = scaler.transform(X_train_val)

    # model under test
    model.fit(X_train_val, y_train_val)
    pred_train = model.predict(X_train_val)
    pred_test = model.predict(X_test)

    # sklearn reference model
    clf = linear_model.LogisticRegressionCV()
    clf.fit(X_train_val, y_train_val)
    pred_skl = clf.predict(X_test)

    # get accuracy scores
    accuracy_on_test = accuracy_score(y_test, pred_test)
    accuracy_on_train = accuracy_score(y_train_val, pred_train)
    accuracy_skl = accuracy_score(y_test, pred_skl)

    # predicted probabilities
    pred_train_prob = model.predict(X_train_val, probability=True)
    pred_test_prob = model.predict(X_test, probability=True)

    # get area ratio and plot cumulative gain
    area_ratio_train = cumulative_gain_area_ratio(y_train_val,
                                                  pred_train_prob,
                                                  title='Training results')
    area_ratio_test = cumulative_gain_area_ratio(y_test,
                                                 pred_test_prob,
                                                 title=None)
    plt.show()

    # plot confusion matrix
    ax1 = plot_confusion_matrix(y_test,
                                pred_test,
                                normalize=True,
                                cmap='Blues',
                                title=' ')
    ax2 = plot_confusion_matrix(y_train_val,
                                pred_train,
                                normalize=True,
                                cmap='Blues',
                                title='Training data')

    # Both plots get the limits derived from the first one.
    bottom, top = ax1.get_ylim()
    ax1.set_ylim(bottom + 0.5, top - 0.5)
    ax2.set_ylim(bottom + 0.5, top - 0.5)
    plt.show()

    # print some stats
    print('===accuracy and area ratio stats===')
    print('accuracy on test:', accuracy_on_test)
    print('accuracy on train:', accuracy_on_train)
    print('accuracy skl:', accuracy_skl)
    print('area ratio train:', area_ratio_train)
    print('area ratio test:', area_ratio_test)

    if analyze_params:
        print('===grid search stats===')
        print('max accuracy:', max_accuracy)
        print('eta:', best_eta)
        print('n_epochs:', best_n_epochs)
R = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) xx, yy = np.dot(R, [xx, yy]) ## skalowanie xx /= max(np.absolute(xx)) yy /= max(np.absolute(yy)) ## przypisanie do X X[row, ::2] = xx X[row, 1::2] = yy X = decomposition.PCA().fit_transform(X) ## UTWORZENIE OBIEKTU KLASYFIKATORA WRAZ Z CROSS-VALIDACJĄ Cs = np.linspace(10, 12, 60) clf = linear_model.LogisticRegressionCV(Cs=Cs, fit_intercept=True, max_iter=10000, n_jobs=-1).fit(X, y) print(clf.score(X, y)) print(clf.scores_) print(clf.C_) ## TWORZENIE CONFUSSION MATRICES fig, (ax1, ax2) = plt.subplots(2) fig.suptitle('Confusion matrices (not)normalized') ax1.set_title('Nie znormalizowany') conf_mat_disp = metrics.plot_confusion_matrix(clf, X, y, display_labels=y_labels, cmap=plt.cm.Blues,
def fit_lasso(X, knockoffs, y, y_dist=None, use_lars=False, **kwargs):
    """Fit a cross-validated L1-penalised model on ``[X, knockoffs]``.

    The columns of ``X`` and ``knockoffs`` are concatenated and then permuted
    with ``random_permutation_inds`` before fitting.

    Parameters
    ----------
    X : array with ``p`` columns.
    knockoffs : array concatenated to ``X`` along axis 1.
    y : response passed to the estimator's ``fit``.
    y_dist : ``"gaussian"``, ``"binomial"`` or None. When None it is obtained
        from ``parse_y_dist(y)``.
    use_lars : for ``"gaussian"`` only, use ``LassoLarsCV`` instead of
        ``LassoCV``.
    **kwargs : ``max_iter`` (default 500), ``tol`` (default 1e-3) and ``cv``
        (default 5) are extracted; everything else is forwarded to the
        estimator constructor. ``tol`` is not passed to ``LassoLarsCV``.

    Returns
    -------
    (gl, inds, rev_inds) : the fitted estimator and the two index arrays
        returned by ``random_permutation_inds(2 * p)``.

    Raises
    ------
    ValueError
        If ``y_dist`` is neither ``"gaussian"`` nor ``"binomial"``.
    """
    # Parse some kwargs/defaults
    max_iter = kwargs.pop("max_iter", 500)
    tol = kwargs.pop("tol", 1e-3)
    cv = kwargs.pop("cv", 5)
    if y_dist is None:
        y_dist = parse_y_dist(y)

    # Bind data
    p = X.shape[1]
    features = np.concatenate([X, knockoffs], axis=1)

    # Randomize coordinates to make sure everything is symmetric
    inds, rev_inds = random_permutation_inds(2 * p)
    features = features[:, inds]

    # Silence warnings only while fitting. The context manager restores the
    # caller's warning filters afterwards, including when fitting (or the
    # y_dist check) raises; a filterwarnings/resetwarnings pair would leave
    # "ignore" installed on error and wipe the caller's filters on success.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        if y_dist == "gaussian":
            if use_lars:
                gl = linear_model.LassoLarsCV(
                    cv=cv,
                    verbose=False,
                    max_iter=max_iter,
                    **kwargs,
                ).fit(features, y)
            else:
                gl = linear_model.LassoCV(
                    alphas=DEFAULT_REG_VALS,
                    cv=cv,
                    verbose=False,
                    max_iter=max_iter,
                    tol=tol,
                    **kwargs,
                ).fit(features, y)
        elif y_dist == "binomial":
            gl = linear_model.LogisticRegressionCV(
                Cs=1 / DEFAULT_REG_VALS,
                penalty="l1",
                max_iter=max_iter,
                tol=tol,
                cv=cv,
                verbose=False,
                solver="liblinear",
                **kwargs,
            ).fit(features, y)
        else:
            raise ValueError(
                f"y_dist must be one of gaussian, binomial, not {y_dist}")

    return gl, inds, rev_inds
def fill_sex_and_age():
    """Run ``fill`` for the 'sex' column and then the 'age' column.

    One default ``LogisticRegressionCV`` instance is shared by all four
    ``fill`` calls. The 'age' passes read the files written by the 'sex'
    passes, so the order of the steps matters.
    """
    clf = linear_model.LogisticRegressionCV()
    # (input file, column, output file) for each step, in execution order.
    steps = (
        (file_tf_users, 'sex', file_tf_fs_users),
        (file_tf_dd_users, 'sex', file_tf_dd_fs_users),
        (file_tf_fs_users, 'age', file_tf_fsa_users),
        (file_tf_dd_fs_users, 'age', file_tf_dd_fsa_users),
    )
    for source, column, target in steps:
        fill(source, column, 0, target, clf)
def _mlp_params(learning_rate):
    """Return the MLP settings shared by all tasks for one learning-rate schedule."""
    return dict(hidden_layer_sizes=(30, 30),
                activation='relu',
                solver='adam',
                alpha=0.0001,
                batch_size='auto',
                learning_rate=learning_rate,
                learning_rate_init=0.001,
                power_t=0.5,
                max_iter=200,
                shuffle=True,
                random_state=None,
                tol=0.0001,
                verbose=False,
                warm_start=False,
                momentum=0.9,
                nesterovs_momentum=True,
                early_stopping=False,
                validation_fraction=0.1,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-08,
                n_iter_no_change=10)


def _eigenpro_params(kernel, n_components, **extra):
    """Return the EigenPro settings shared by all tasks, plus any ``extra`` overrides."""
    params = dict(batch_size="auto",
                  n_epoch=10,
                  n_components=n_components,
                  subsample_size="auto",
                  kernel=kernel,
                  gamma=None,
                  degree=2,
                  coef0=1,
                  kernel_params=None,
                  random_state=None)
    params.update(extra)
    return params


def instanciate_estimators(clf_type, classifiers, clf_seed, y=None, **kw):
    """Build the requested estimators for a task type and seed them.

    Parameters
    ----------
    clf_type : ``'binary'``, ``'multiclass'`` or ``'regression'``.
    classifiers : iterable of estimator names to return (keys of the table
        built below, e.g. ``'L2RegularizedLinearModel'``, ``'XGB'``).
    clf_seed : value assigned to ``random_state`` on every estimator that
        exposes that parameter.
    y : labels; used to print class fractions for the two classification
        types and to count classes for the multiclass XGB estimator.
    **kw : accepted and ignored.

    Returns
    -------
    list of estimators, in the order given by ``classifiers``.

    Raises
    ------
    ValueError
        If ``clf_type`` is not one of the three supported values.
    KeyError
        If a name in ``classifiers`` is not in the table.
    """
    score_metric, _ = get_score_metric(clf_type)
    param_grid_LGBM = {
        'learning_rate': [0.1, .05, .5],
        'num_leaves': [7, 15, 31]
    }
    param_grid_XGB = {'learning_rate': [0.1, .05, .3], 'max_depth': [3, 6, 9]}
    param_grid_MLP = {
        'learning_rate_init': [.001, .0005, .005],
        'hidden_layer_sizes': [(30, ), (50, ), (100, ), (30, 30), (50, 50),
                               (100, 100)]
    }
    param_grid_EigenProGaussian = {'bandwidth': [1, 5, 25]}
    n_components_eigenpro = 160
    param_grid_nystroem_ridgecv = {
        'kernel_approx__n_components': [1000, 3000],
        'kernel_approx__degree': [2, 3],
    }

    def grid_search(estimator, param_grid):
        # 3-fold grid search scored with the task metric; a new scorer object
        # is created for every search.
        return GridSearchCV(estimator=estimator,
                            param_grid=param_grid,
                            cv=3,
                            scoring=metrics.make_scorer(score_metric))

    def assemble(linear, boosting, lgbm, xgb, mlp_cls, eigenpro_cls,
                 polynomial_extra):
        # ``linear``, ``boosting``, ``lgbm`` and ``xgb`` are zero-argument
        # factories so that each table entry gets its own fresh instance
        # (``linear`` is needed twice).
        return {
            'L2RegularizedLinearModel':
            linear(),
            'GradientBoosting':
            boosting(),
            'LGBM':
            grid_search(lgbm(), param_grid_LGBM),
            'XGB':
            grid_search(xgb(), param_grid_XGB),
            'MLP':
            mlp_cls(**_mlp_params('constant')),
            'MLPGridSearchCV':
            grid_search(mlp_cls(**_mlp_params('adaptive')), param_grid_MLP),
            'EigenProPolynomial':
            eigenpro_cls(**_eigenpro_params(
                "polynomial", n_components_eigenpro, **polynomial_extra)),
            'EigenProGaussian160':
            grid_search(
                eigenpro_cls(
                    **_eigenpro_params("gaussian", n_components_eigenpro)),
                param_grid_EigenProGaussian),
            'EigenProGaussian1000':
            grid_search(eigenpro_cls(**_eigenpro_params("gaussian", 1000)),
                        param_grid_EigenProGaussian),
            'NystroemRidgeCV':
            grid_search(
                Pipeline([('kernel_approx',
                           Nystroem(kernel="polynomial",
                                    n_components=None,
                                    random_state=clf_seed,
                                    degree=2)), ('classifier', linear())]),
                param_grid_nystroem_ridgecv),
        }

    if clf_type == 'binary':
        print(('Fraction by class: True: %0.2f; False: %0.2f' %
               (list(y).count(True) / len(y), list(y).count(False) / len(y))))
        cw = 'balanced'
        clfs = assemble(
            linear=lambda: linear_model.LogisticRegressionCV(
                class_weight=cw,
                max_iter=100,
                solver='sag',
                penalty='l2',
                n_jobs=1,
                cv=3,
                multi_class='multinomial'),
            boosting=lambda: ensemble.GradientBoostingClassifier(
                n_estimators=100),
            lgbm=lambda: LGBMClassifier(
                n_estimators=100, n_jobs=1, is_unbalance=True),
            xgb=lambda: XGBClassifier(n_estimators=100, n_jobs=1),
            mlp_cls=MLPClassifier,
            eigenpro_cls=FKC_EigenPro,
            polynomial_extra={'bandwidth': 5},
        )
    elif clf_type == 'multiclass':
        print('fraction of the most frequent class:',
              max([list(y).count(x) for x in set(list(y))]) / len(list(y)))
        clfs = assemble(
            linear=lambda: linear_model.LogisticRegressionCV(
                penalty='l2',
                n_jobs=1,
                cv=3,
                multi_class='multinomial',
                solver='sag',
                max_iter=100),
            boosting=lambda: ensemble.GradientBoostingClassifier(
                n_estimators=100),
            lgbm=lambda: LGBMClassifier(n_estimators=100, n_jobs=1),
            xgb=lambda: XGBClassifier(n_estimators=100,
                                      n_jobs=1,
                                      objective='multi:softmax',
                                      num_class=len(np.unique(y))),
            mlp_cls=MLPClassifier,
            eigenpro_cls=FKC_EigenPro,
            # The multiclass polynomial EigenPro model sets no bandwidth.
            polynomial_extra={},
        )
    elif clf_type == 'regression':
        clfs = assemble(
            linear=lambda: linear_model.RidgeCV(cv=3),
            boosting=lambda: ensemble.GradientBoostingRegressor(
                n_estimators=100),
            lgbm=lambda: LGBMRegressor(n_estimators=100, n_jobs=1),
            xgb=lambda: XGBRegressor(n_estimators=100, n_jobs=1),
            mlp_cls=MLPRegressor,
            eigenpro_cls=FKR_EigenPro,
            polynomial_extra={'bandwidth': 5},
        )
    else:
        raise ValueError("{} not recognized".format(clf_type))

    clfs = [clfs[clf] for clf in classifiers]
    for clf in clfs:
        # Search wrappers carry the real model in ``.estimator``; plain
        # estimators have no such attribute and are seeded directly.
        try:
            if 'random_state' in clf.estimator.get_params():
                clf.estimator.set_params(random_state=clf_seed)
        except AttributeError:
            if 'random_state' in clf.get_params():
                clf.set_params(random_state=clf_seed)
    return clfs
def main():
    """Parse the command line, build the chosen classifier and run it.

    Command-line options:
      --classifier      one of ``lr``, ``svm``, ``ffn`` (must be supplied)
      --name            model name base; when omitted ``baseline`` is used
      --feature         feature type handed to ``extract_features``
      --output-dirpath  directory handed to ``run_model``

    An omitted or unknown ``--classifier`` aborts with a usage error (exit
    status 2) before any feature extraction is done.
    """
    parser = argparse.ArgumentParser(
        description='Extract features and run models')
    parser.add_argument('--classifier',
                        dest='classifier_type',
                        help='lr svm ffn',
                        default='')
    parser.add_argument(
        '--name',
        dest='model_name',
        help=
        'model name base, automatically appends experiment features and classifier, None just puts classifier and features',
        default=None)
    parser.add_argument('--feature',
                        dest='feature',
                        help='feature type',
                        default='user')
    parser.add_argument(
        '--output-dirpath',
        dest='output_dirpath',
        help='output dirpath; default /projects/websci2020_tumblr_identity',
        default='/projects/websci2020_tumblr_identity')
    args = parser.parse_args()

    feature_type = args.feature
    output_dirpath = args.output_dirpath

    # Classifier definitions. Stored as factories so that only the selected
    # estimator is instantiated.
    classifiers = {
        'lr':
        lambda: linear_model.LogisticRegressionCV(
            cv=10, n_jobs=10, max_iter=10000, verbose=0),
        'svm':
        lambda: model_selection.GridSearchCV(svm.LinearSVC(
            dual=False, max_iter=10000, verbose=0), {
                'C': [.01, .1, 1, 10, 100],
                'penalty': ['l2']
            },
                                             n_jobs=10,
                                             cv=10,
                                             verbose=2),
        'ffn':
        lambda: neural_network.MLPClassifier(hidden_layer_sizes=(32, 50),
                                             activation='relu',
                                             early_stopping=True,
                                             verbose=2),
    }

    # Reject a missing/unknown classifier up front; the default '' is not a
    # valid choice and would otherwise only fail with a KeyError after the
    # feature extraction had already run.
    if args.classifier_type not in classifiers:
        parser.error('--classifier must be one of: ' +
                     ', '.join(sorted(classifiers)))

    # ### Post baseline
    print("Extracting post baseline features...")
    X_train, y_train, X_test, y_test = extract_features(feature_type)

    clf = classifiers[args.classifier_type]()

    print("Running post baseline...")
    if args.model_name is None:
        model_name = f'baseline_{args.classifier_type}'
    else:
        model_name = f'{args.model_name}_{args.classifier_type}'
    model, score, baseline_preds = run_model(model_name, clf, X_train, y_train,
                                             X_test, y_test, feature_type,
                                             output_dirpath)
    print(f'\tBaseline score: {score: .4f}')
def plot_ROC(X, y, outfile_name, PRNG):
    """Plot cross-validated mean ROC curves for three classifiers.

    A decision tree ('DT'), a cross-validated logistic regression ('LR') and a
    1000-tree random forest ('RF') are each evaluated with 10-fold stratified
    cross-validation. For every classifier the fold ROC curves are
    interpolated onto a common FPR grid and averaged; the mean curve is drawn
    with its AUC in the legend and the figure is saved to ``outfile_name``.

    Parameters
    ----------
    X : feature array; indexed with the fold index arrays (``X[train]``), so
        it must support integer-array indexing.
    y : label array, indexed the same way. Column 1 of ``predict_proba`` is
        used as the score.
    outfile_name : path given to ``plt.savefig``.
    PRNG : ``random_state`` for all three classifiers.
    """
    cv = StratifiedKFold(n_splits=10)
    # NOTE(review): `min_impurity_split` and `presort` are not accepted by
    # recent scikit-learn releases -- confirm the pinned version.
    clf1 = tree.DecisionTreeClassifier(
        class_weight=None,
        criterion='gini',
        max_depth=None,
        max_features=None,
        max_leaf_nodes=None,
        min_impurity_split=1e-07,
        min_samples_leaf=5,  # min 5 in each leaf
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        presort=False,
        random_state=PRNG,
        splitter='best')
    clf2 = linear_model.LogisticRegressionCV(refit=True, random_state=PRNG)
    clf3 = RandomForestClassifier(n_estimators=1000, random_state=PRNG)

    clfnames = ['DT', 'LR', 'RF']
    # One fixed colour per classifier. The fold loop must not draw from (or
    # rebind) this sequence, otherwise each curve ends up in whatever colour
    # the last fold happened to receive.
    colors = ['blue', 'green', 'darkorange']
    lw = 1.5
    mean_fpr = np.linspace(0, 1, 100)

    plt.figure(figsize=(4, 3), dpi=300)
    for clf, cnames, color in zip([clf1, clf2, clf3], clfnames, colors):
        mean_tpr = np.zeros_like(mean_fpr)
        for train, test in cv.split(X, y):
            probas_ = clf.fit(X[train], y[train]).predict_proba(X[test])
            # Compute this fold's ROC curve and add it, resampled onto the
            # common FPR grid, to the running sum.
            fpr, tpr, _ = metrics.roc_curve(y[test], probas_[:, 1])
            mean_tpr += np.interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0  # pin the curve to the origin

        mean_tpr /= cv.get_n_splits(X, y)
        mean_tpr[-1] = 1.0  # pin the curve to (1, 1)
        mean_auc = metrics.auc(mean_fpr, mean_tpr)
        plt.plot(mean_fpr,
                 mean_tpr,
                 color=color,
                 label='%s (%0.2f)' % (cnames, mean_auc),
                 lw=lw)

    font = {'size': 12, 'weight': 'normal', 'family': 'sans-serif'}
    plt.rc('font', **font)
    # Chance diagonal.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title("")
    plt.legend(loc="lower right", prop={'size': 12})
    plt.savefig(outfile_name)
#df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 3).all(axis=1)]
#df.describe()

# In[345]:

# Creation of the model: every column except 'double' is a feature.
# `axis` is passed by keyword; pandas >= 2.0 rejects it as a positional
# argument of DataFrame.drop.
X = np.array(df.drop(['double'], axis=1))
y = np.array(df['double'])
X_val = np.array(df_val.drop(['double'], axis=1))
y_val = np.array(df_val['double'])
model = linear_model.LogisticRegressionCV(Cs=100,
                                          class_weight={
                                              0: 2.3,
                                              1: 3.7
                                          },
                                          cv=10,
                                          random_state=7)
model.fit(X, y)

# In[346]:

# Store the model and print the confusion matrix on the validation set
joblib.dump(model, 'model.pkl')
y_pred = model.predict(X_val)
cm = metrics.confusion_matrix(y_val, y_pred)
print(cm)

# In[347]:
def logistic_deconvolution(estimation_train,
                           estimation_test,
                           stimuli_train,
                           stimuli_test,
                           logistic_window,
                           delay=0):
    """ Learn a deconvolution filter for classification given a time window
    using logistic regression.

    Parameters
    ----------
    estimation_train: numpy array of shape [n_scans_train, n_categories]
        estimation of the categories time series for the train data

    estimation_test: numpy array of shape [n_scans_test, n_categories]
        estimation of the categories time series for the test data

    stimuli_train: numpy array of shape [n_scans_train, n_categories]
        time series of the train stimuli with one-hot encoding

    stimuli_test: numpy array of shape [n_scans_test, n_categories]
        time series of the test stimuli with one-hot encoding

    logistic_window: int
        size of the time window to be used for creating train and test data

    delay: int, optional
        delay between time series and stimuli to be applied to the data.
        Defaults to 0.

    Returns
    -------
    accuracy: float
        mean accuracy of the fitted classifier on the test windows, as
        returned by the estimator's ``score`` method
    """
    log = linear_model.LogisticRegressionCV()

    # Add a delay between time series and stimuli if needed
    if delay != 0:
        estimation_train, estimation_test = (estimation_train[delay:],
                                             estimation_test[delay:])
        stimuli_train, stimuli_test = (stimuli_train[:-delay],
                                       stimuli_test[:-delay])

    # Scans where any column other than the first is active (column 0 is
    # presumably the 'rest' category -- confirm against the caller).
    train_mask = np.sum(stimuli_train[:, 1:], axis=1).astype(bool)
    test_mask = np.sum(stimuli_test[:, 1:], axis=1).astype(bool)

    # Scans that are both stimulated and early enough to start a full window.
    # The same scan list selects windows and labels, so the two always have
    # equal length; taking labels for every masked scan would produce more
    # labels than windows whenever a stimulus falls within the last
    # ``logistic_window - 1`` scans.
    train_scans = [
        scan for scan in range(len(estimation_train) - logistic_window + 1)
        if train_mask[scan]
    ]
    test_scans = [
        scan for scan in range(len(estimation_test) - logistic_window + 1)
        if test_mask[scan]
    ]

    # Create train and test time windows (each window flattened to 1-D)
    time_windows_train = [
        estimation_train[scan:scan + logistic_window].ravel()
        for scan in train_scans
    ]
    time_windows_test = [
        estimation_test[scan:scan + logistic_window].ravel()
        for scan in test_scans
    ]

    # Create train and test stimuli labels (index of the active category)
    stimuli_train = np.argmax(stimuli_train[train_scans], axis=1)
    stimuli_test = np.argmax(stimuli_test[test_scans], axis=1)

    # Fit logistic regression
    log.fit(time_windows_train, stimuli_train)
    accuracy = log.score(time_windows_test, stimuli_test)

    return accuracy
regression(linear_model.LassoLars()), regression(linear_model.LassoLarsIC()), regression(linear_model.OrthogonalMatchingPursuit()), regression(linear_model.OrthogonalMatchingPursuitCV()), regression(linear_model.Ridge(random_state=RANDOM_SEED)), regression(linear_model.RidgeCV()), regression(linear_model.BayesianRidge()), regression(linear_model.ARDRegression()), regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)), regression(linear_model.PassiveAggressiveRegressor( random_state=RANDOM_SEED)), # Logistic Regression classification(linear_model.LogisticRegression( random_state=RANDOM_SEED)), classification(linear_model.LogisticRegressionCV( random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifierCV()), classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)), classification_binary(linear_model.LogisticRegression( random_state=RANDOM_SEED)), classification_binary(linear_model.LogisticRegressionCV( random_state=RANDOM_SEED)), classification_binary(linear_model.RidgeClassifier( random_state=RANDOM_SEED)), classification_binary(linear_model.RidgeClassifierCV()), classification_binary(linear_model.SGDClassifier( random_state=RANDOM_SEED)), # Decision trees
if_else('max_dtsl', '>', 10).if_else('max_dxcs', '>', 3).if_else('max_dxmj', '>', 1000).\ if_else('max_jzgd', '>', 100).if_else('max_jznl', '<', 0).if_else('max_jznl', '>', 10000).\ if_else('max_rzsl', '>', 50).if_else('max_zdmj', '>', 10000).if_else('ssdts', '>', 10).if_else('xfcds', '>', 10).\ if_else('xfdts', '>', 10).if_else('yhsl', '>', 300).if_else('zgrs', '>', 1000).if_else('zgsl', '>', 300).\ fill_na('zjhzsj', 20010101000000).fill_na('zjjcsj', 20010101000000).if_else('zjyhsl', '>', 10).\ if_else('zjzgsl', '>', 10).date_diff('zjhzsj', deadline).date_diff('zjjcsj', deadline).\ col_diff_if_else('yhsl', 'zgsl').col_diff_if_else('zjyhsl', 'zjzgsl') # 提取需要的数据列 fire_data = fire_data.fire_data.ix[:, [ 'dwid', 'Y', 'aqsks', 'dwdj_1', 'dwdj_2', 'dwxz_1', 'dwxz_2', 'dwxz_3', 'dwxz_4', 'hzsl', 'jcsl', 'jzmj', 'jzsl', 'sfgpdw', 'sfzdyhdw', 'ssdts', 'xfcds', 'xfdts', 'yhsl', 'zdxfss', 'zgrs', 'zjyhsl', 'zjzgsl', 'hzts_to_deadline', 'jcts_to_deadline', 'yhsl_minus_zgsl', 'zjyhsl_minus_zjzgsl', 'zddw', 'ybdw', 'jxdw', 'wxp', 'max_jzzt', 'max_jznl', 'max_jzgd', 'max_zdmj', 'max_dscs', 'max_dsmj', 'max_dxcs', 'max_dxmj', 'max_nhdj', 'max_rnrs', 'max_dtsl', 'max_xfkzs', 'max_rzsl', 'max_xfsssl' ]].fillna(0) # 运行模型:弹性网络模型 # enet = lm.ElasticNetCV(l1_ratio=1, cv=10, n_jobs=1) # l1_ratio=1表示Lasso回归 # enet.fit(X=fire_data.ix[:, 2:], y=fire_data.ix[:, 1]) # joblib.dump(enet, model_path+'/fire_risk_model_enet.pkl') # 运行模型:逻辑回归模型 lgr = lm.LogisticRegressionCV(cv=10, penalty='l1', solver='liblinear', n_jobs=1) lgr.fit(X=fire_data.ix[:, 2:], y=fire_data.ix[:, 1]) joblib.dump(lgr, model_path + '/fire_risk_model_lgr.pkl')
def compare_algorithm(data, target):
    """Fit a fixed roster of classifiers and print a comparison table.

    The data is split once with ``train_test_split`` (no extra arguments are
    passed). Each classifier is fitted on the training part and evaluated on
    the held-out part; one table row per classifier records its class name,
    train/held-out accuracy, precision, recall and AUC. The table is printed
    sorted by held-out accuracy, best first.

    Args:
        data: Feature matrix handed to ``train_test_split``.
        target: Labels handed to ``train_test_split`` alongside ``data``.

    Returns:
        None. The table is only printed.
    """
    x_train, x_cross, y_train, y_cross = train_test_split(data, target)

    candidates = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),
        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=0.001),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(max_iter=1000, tol=0.001),
        linear_model.Perceptron(max_iter=1000, tol=0.001),
        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),
        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        xgb.XGBClassifier(),
    ]

    MLA_compare = pd.DataFrame(columns=[])

    for position, estimator in enumerate(candidates):
        estimator.fit(x_train, y_train)
        held_out_pred = estimator.predict(x_cross)
        # The ROC points are derived from the predicted labels themselves.
        false_pos, true_pos, _thresholds = roc_curve(y_cross, held_out_pred)

        # Values are evaluated and written in the order the columns appear.
        row = {
            'MLA Name': type(estimator).__name__,
            'MLA Train Accuracy': round(estimator.score(x_train, y_train), 4),
            'MLA Test Accuracy': round(estimator.score(x_cross, y_cross), 4),
            'MLA Precission': precision_score(y_cross, held_out_pred),
            'MLA Recall': recall_score(y_cross, held_out_pred),
            'MLA AUC': auc(false_pos, true_pos),
        }
        for column, value in row.items():
            MLA_compare.loc[position, column] = value

    MLA_compare = MLA_compare.sort_values(by=['MLA Test Accuracy'],
                                          ascending=False)
    print(MLA_compare)
""" import sklearn as sk import sklearn.linear_model as skl import sklearn.preprocessing as skp import sklearn.datasets as skd import sklearn.decomposition as skD import numpy as np import kk_utils as kk dataset = skd.load_breast_cancer() linreg = skl.LinearRegression() rreg = skl.RidgeCV() lasreg = skl.LassoCV() logreg = skl.LogisticRegressionCV() scores = [] ''' # Experimental Part for n_components in range(60,65): print('Number of components:',n_components) pca = pca.set_params(n_components=n_components) X = dataset.data X = kk.MeanNormalizer(X) X = pca.fit_transform(X) score,best_cv = kk.fitModel(linreg, X, Y, cv=True, ncv = 10) cvs.append(best_cv) scores.append(score) ''' # BEST SCORE CLASSIFICATION
loss_clf_test=round(hamming_loss(y_test,y_clf_test),4) loss_train.append(loss_clf_train); loss_test.append(loss_clf_test) return [y_clf_train,y_clf_test,acc_clf_train, acc_clf_test,loss_clf_train,loss_clf_test] def get_classifier_results(): return pandas.DataFrame({'classifier':classifier_list, 'classifier_name':classifier_names, 'clf_dataset':clf_datasets, 'acc_train':acc_train,'acc_test':acc_test, 'loss_train':loss_train,'loss_test':loss_test}) classifier_list,classifier_names,clf_datasets=[],[],[] acc_train,acc_test,loss_train,loss_test=[],[],[],[] df_list=['classifier_name','acc_train','acc_test','loss_train','loss_test'] clf=[linear_model.LogisticRegression(solver='liblinear',multi_class='ovr'), linear_model.LogisticRegressionCV(solver='liblinear',multi_class='ovr'), linear_model.SGDClassifier(max_iter=1000,tol=0.00001), linear_model.RidgeClassifier(),linear_model.RidgeClassifierCV(), LinearDiscriminantAnalysis(),QuadraticDiscriminantAnalysis(), svm.LinearSVC(),svm.SVC(gamma='scale',C=10.0,kernel='poly'), svm.NuSVC(gamma='scale',kernel='poly'), KNeighborsClassifier(),RadiusNeighborsClassifier(radius=30), NearestCentroid(), DecisionTreeClassifier(),ExtraTreeClassifier(),GaussianNB(), BernoulliNB(),MultinomialNB(), BaggingClassifier(),RandomForestClassifier(n_estimators=64), AdaBoostClassifier(),GradientBoostingClassifier(), linear_model.Perceptron(max_iter=1000,tol=0.00001), linear_model.PassiveAggressiveClassifier(max_iter=1000,tol=0.00001), GaussianProcessClassifier(),LabelPropagation(),LabelSpreading()]
def trial_dataparams(data, target):
    """Cross-validate every scaler / pipeline / classifier-set combination.

    A stratified 90/10 split is made first; only the 90% training part is
    passed on to ``train_cls``. For each of two scalers, each of the two
    pipeline builders (``get_pipe_pca`` / ``get_pipe_nopca``) and each of the
    two classifier sets (plain and class-balanced), ``train_cls`` is called
    with a shuffled 5-fold ``StratifiedKFold`` and four scorers. All returned
    tables are stacked, sorted by the ``'CV F1 Mean'`` column (best first) and
    written to ``trial_dataparams_.csv`` under ``PROCESSED`` using ``;`` as
    the separator.

    Args:
        data: Feature matrix.
        target: Labels; also used to stratify the split.

    Returns:
        None. The result is only written to disk.
    """
    random_state = 42
    X, y = data, target
    # X_test / y_test are split off here but not used in this function.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state, stratify=y)
    kfolds = StratifiedKFold(n_splits=5, shuffle=True,
                             random_state=random_state)
    scoring = {
        'Precision': make_scorer(precision_score),
        'Recall': make_scorer(recall_score),
        'F1_score': make_scorer(f1_score),
        'Accuracy': make_scorer(accuracy_score)
    }

    cls_balanced = [
        ('dtc', tree.DecisionTreeClassifier(class_weight='balanced',
                                            random_state=random_state)),
        ('rfc', ensemble.RandomForestClassifier(n_estimators=100,
                                                class_weight='balanced',
                                                random_state=random_state)),
        ('lr', linear_model.LogisticRegressionCV(class_weight='balanced',
                                                 random_state=random_state)),
        ('svc', svm.SVC(probability=True, class_weight='balanced',
                        random_state=random_state)),
        # NOTE(review): scale_pos_weight=13 is presumably the
        # negative/positive class ratio of the data set -- confirm.
        ('xgb', XGBClassifier(n_estimators=100, objective='binary:logistic',
                              scale_pos_weight=13,
                              random_state=random_state))
    ]
    cls = [
        ('dtc', tree.DecisionTreeClassifier(random_state=random_state)),
        ('bc', ensemble.BaggingClassifier(n_estimators=100,
                                          random_state=random_state)),
        ('gbc', ensemble.GradientBoostingClassifier(
            n_estimators=100, random_state=random_state)),
        ('rfc', ensemble.RandomForestClassifier(n_estimators=100,
                                                random_state=random_state)),
        ('lr', linear_model.LogisticRegressionCV(random_state=random_state)),
        ('knn', neighbors.KNeighborsClassifier()),
        ('svc', svm.SVC(probability=True, random_state=random_state)),
        ('xgb', XGBClassifier(n_estimators=100, objective='binary:logistic',
                              random_state=random_state))
    ]

    # Collect the per-combination tables and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    partial_results = []
    for scaler in [StandardScaler(),
                   RobustScaler(quantile_range=(2.5, 97.5))]:
        for pipelinesteps in [get_pipe_pca(scaler), get_pipe_nopca(scaler)]:
            for cls_train in [cls, cls_balanced]:
                partial_results.append(
                    train_cls(cls_train, pipelinesteps,
                              X_train=X_train, y_train=y_train,
                              kfolds=kfolds, scoring=scoring))
    # assumes train_cls returns a DataFrame -- TODO confirm
    dfout = pd.concat(partial_results, ignore_index=True)

    dfout = dfout.sort_values(by='CV F1 Mean', ascending=False)
    dfout.to_csv(PROCESSED + os.sep + 'trial_dataparams_.csv',
                 index=False, sep=';')
#grid_knn=grid(neighbors.KNeighborsClassifier()).grid_get(x_train_c,y_train,knn_grid) #grid_forest=grid(RandomForestClassifier()).grid_get(x_train_c,y_train,forest_grid) #grid_dtree=grid(tree.DecisionTreeClassifier()).grid_get(x_train_c,y_train,dtree_grid) #grid_lrc=grid(linear_model.LogisticRegressionCV()).grid_get(x_train_c,y_train,lrc_grid) #grid_rc=grid(linear_model.RidgeClassifierCV()).grid_get(x_train_c,y_train,rc_grid) # In[66]: svc = svm.SVC(C=5, gamma=1e-05, kernel='linear') knn = neighbors.KNeighborsClassifier(algorithm='kd_tree', n_neighbors=6, weights='distance') dtree = tree.DecisionTreeClassifier(criterion='gini', min_samples_split=0.05, random_state=0) lrc = linear_model.LogisticRegressionCV(Cs=1000) #rc=linear_model.RidgeClassifierCV(grid_rc) forest = RandomForestClassifier(criterion='gini', max_depth=8, n_estimators=80) bayes = naive_bayes.GaussianNB() models = [svc, forest] meta_model = knn # In[67]: from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone class stacking(BaseEstimator, RegressorMixin, TransformerMixin): def __init__(self, mod, meta_model): self.mod = mod self.meta_model = meta_model
# Report the mean AUC, mean balanced accuracy and total fit time of the run.
print("Test AUC:%.2f; bACC:%.2f, Time: %.2fs" % (
    scores['test_roc_auc'].mean(),
    scores['test_balanced_accuracy'].mean(),
    scores['fit_time'].sum(),
))

# %%
# Models with built-in cross-validation
# --------------------------------------
#
# Let sklearn select the best parameters over a default grid.
#
# **Classification**

print("== Logistic Ridge (L2 penalty) ==")
# Inner 5-fold CV picks the regularisation; the outer 5-fold CV scores it.
mod_cv = lm.LogisticRegressionCV(
    cv=5,
    scoring='balanced_accuracy',
    class_weight='balanced',
    n_jobs=-1,
)
scores = cross_val_score(mod_cv, X, y, cv=5)
print("Test ACC:%.2f" % scores.mean())

# %%
# **Regression**

X, y, coef = datasets.make_regression(
    n_samples=50,
    n_features=100,
    n_informative=2,
    noise=10,
    coef=True,
    random_state=42,
)
print("== Ridge (L2 penalty) ==")
# In[ ]: MLA = [ #Ensemble Methods ensemble.AdaBoostClassifier(), ensemble.BaggingClassifier(), ensemble.ExtraTreesClassifier(), ensemble.GradientBoostingClassifier(), ensemble.RandomForestClassifier(), #Gaussian Processes gaussian_process.GaussianProcessClassifier(), #GLM linear_model.LogisticRegressionCV(), linear_model.PassiveAggressiveClassifier(), linear_model.RidgeClassifierCV(), linear_model.SGDClassifier(), linear_model.Perceptron(), #Navies Bayes naive_bayes.BernoulliNB(), naive_bayes.GaussianNB(), #Nearest Neighbor neighbors.KNeighborsClassifier(), #SVM svm.SVC(probability=True), svm.NuSVC(probability=True),
def auto_model(X, y, X_pred, sub):
    """Train a set of models, a Keras net and a stacking net; fill ``sub``.

    Stages, in order:

    1. Fit a preprocessing pipeline on ``X`` and ``X_pred`` concatenated and
       transform both.
    2. For every entry of the local ``models`` dict, collect out-of-fold
       predictions over a shuffled 5-fold ``KFold`` and score them with MSE
       and ROC-AUC. A model that raises is reported and left out of the
       scores and of the stacking inputs.
    3. Train a dense Keras network per fold and add its out-of-fold and
       fold-averaged test predictions as the ``'keras_nn'`` column.
    4. Train a small Keras network per fold on the stacked predictions and
       average its test predictions over the folds.
    5. Print the scores and store the stacked predictions in
       ``sub['target']``.

    Args:
        X: Training features; indexed with integer index arrays
            (presumably a 2-D numpy array -- confirm with the caller).
        y: Training targets, indexed the same way as ``X``.
        X_pred: Features of the rows to predict.
        sub: Object exposing ``sub.target``; ``sub['target']`` is overwritten.

    Returns:
        None. ``sub`` is modified in place and Keras checkpoints are written
        to ``NN_ml__model_loop_<fold>.hdf5`` in the working directory.
    """
    # NOTE(review): max_iter is given as a float (1e8 / 1e4); recent
    # scikit-learn releases may reject non-integer values -- confirm.
    models = {
        'ridge ': linear_model.Ridge(alpha=.5, max_iter=1e8),
        'ridgeCV': linear_model.RidgeCV(cv=3),
        'lasso ': linear_model.Lasso(alpha=1e-6, max_iter=1e8),
        'lr ': linear_model.LogisticRegression(solver='lbfgs', max_iter=1e4),
        'lrCV ': linear_model.LogisticRegressionCV(solver='lbfgs',
                                                   max_iter=1e4, cv=5),
        'mlp_clf': neural_network.MLPClassifier(
            solver='lbfgs', alpha=1e-5,
            hidden_layer_sizes=(256, 64, 32, 32, 32), random_state=1),
        'mlp_reg': neural_network.MLPRegressor(
            solver='lbfgs', alpha=1e-5,
            hidden_layer_sizes=(256, 64, 32, 32, 32), random_state=1),
        'svc ': svm.SVC(),
        'rfreg ': ensemble.RandomForestRegressor(max_depth=4),
        'rfclf ': ensemble.RandomForestClassifier(max_depth=4),
        'lgbclf ': lgb.LGBMClassifier(gamma='auto', num_leaves=4,
                                      learning_rate=0.001, n_estimators=2000),
        'lgbreg ': lgb.LGBMRegressor(gamma='auto', num_leaves=31,
                                     learning_rate=0.001, n_estimators=20000),
        'knn ': neighbors.KNeighborsClassifier(n_neighbors=5, n_jobs=15),
        'nb ': naive_bayes.GaussianNB(),
        'dt ': tree.DecisionTreeClassifier(),
        # Disabled; the 'catreg '/'catclf ' branch below only fires if these
        # entries are re-enabled.
        # 'catreg ': CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=3, verbose = True),
        # 'catclf ': CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=2, verbose = True),
    }
    print('\nall models: ', list(models.keys()))

    # In[all scalers]
    from sklearn import preprocessing
    from sklearn import feature_selection
    from sklearn.decomposition import PCA
    preprocessings = {
        'standards': preprocessing.StandardScaler(),
        'minmaxs': preprocessing.MinMaxScaler(),
        'robusts': preprocessing.RobustScaler(),
        'PCA': PCA(),
        # NOTE(review): the key says PowerTransformer, the value is a
        # Normalizer.
        'PowerTransformer': preprocessing.Normalizer(),
        'variance_threshold':
        feature_selection.VarianceThreshold(threshold=0.5),
    }
    print('all preprocessings: ', list(preprocessings.keys()), '\n')

    # In[Preprocessing Pipline]
    from sklearn.pipeline import make_pipeline
    from sklearn.model_selection import KFold
    nfolds = 5
    kf = KFold(n_splits=nfolds, random_state=2019, shuffle=True)
    pipe_preprocessing = make_pipeline(
        # preprocessings['variance_threshold'],
        preprocessings['standards'],
        # preprocessings['minmaxs'],
        # preprocessings['robusts'],
        # preprocessings['PCA'],
        # preprocessings['PowerTransformer'],
    )
    # The pipeline is fitted on the training rows and the rows to predict
    # together.
    full = np.concatenate((X, X_pred), axis=0)
    pipe_preprocessing.fit(full)
    X = pipe_preprocessing.transform(X)
    X_pred = pipe_preprocessing.transform(X_pred)
    #
    mse_score = []
    auc_score = []
    # Names of the models that were scored, aligned index-for-index with
    # mse_score / auc_score so the best/worst report names the right model
    # even when some models failed.
    scored_models = []
    valid_score = {}
    # NOTE(review): oof takes its type from y; if y is an integer numpy array,
    # float predictions would be truncated on assignment -- confirm.
    oof = y * 0
    X_s = pd.DataFrame()
    X_s_preds = pd.DataFrame()
    for name, model in models.items():
        try:
            print('trainning in: ', name)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                if name in ('lgbclf ', 'lgbreg '):
                    model.fit(X_train, y_train,
                              eval_set=[(X_test, y_test)],
                              eval_metric='l1',
                              early_stopping_rounds=5,
                              verbose=0)
                elif name in ('catreg ', 'catclf '):
                    model.fit(X_train, y_train, eval_set=(X_test, y_test))
                else:
                    model.fit(X_train, y_train)
                oof[test_index] = model.predict(X_test)
            model_mse = mean_squared_error(y, oof)
            try:
                model_auc = roc_auc_score(y, oof)
            except Exception:
                # AUC could not be computed for these predictions; the model
                # is still kept, with a score of 0.
                model_auc = 0
            # Predict before any bookkeeping so that a failure here leaves
            # the scores and stacking inputs untouched.
            test_preds = model.predict(X_pred)
            X_s[name] = oof
            X_s_preds[name] = test_preds
        except Exception as exc:
            print('error in: ', name)
            print('  reason: ', repr(exc))
            continue
        mse_score.append(model_mse)
        auc_score.append(model_auc)
        scored_models.append(name)
        valid_score[name] = [
            'mse: ', "{0:.4f}".format(model_mse),
            'auc: ', "{0:.4f}".format(model_auc)
        ]
    print('-------------------------------------------')

    # In[Keras NN]
    X_s0 = np.asarray(X)
    X_s0_preds = np.asarray(X_pred)
    oof = np.zeros(len(X_s0))
    predictions = np.asarray(sub.target * 0.)
    pipe_preprocessing = make_pipeline(
        preprocessings['minmaxs'],
        preprocessings['PowerTransformer'],
    )
    full = np.concatenate((X_s0, X_s0_preds), axis=0)
    pipe_preprocessing.fit(full)
    # NOTE(review): the rescaled X / X_pred are only used to generate the fold
    # indices; the network below is trained on X_s0 / X_s0_preds.
    X = pipe_preprocessing.transform(X_s0)
    X_pred = pipe_preprocessing.transform(X_s0_preds)
    #
    print('\nstart kerasNN ... ')
    for fold_, (train_index, test_index) in enumerate(kf.split(X)):
        model = Sequential([
            Dense(256, input_shape=(X_s0.shape[1], )),
            Activation('relu'),
            Dense(128),
            Activation('relu'),
            Dense(64),
            Activation('relu'),
            Dense(32),
            Activation('relu'),
            Dense(32),
            Activation('relu'),
            Dense(1),
            Activation('sigmoid'),
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy')
        file_path = "NN_ml_" + "_model_" + "loop_" + str(fold_) + ".hdf5"
        X_tr, X_val = X_s0[train_index], X_s0[test_index]
        y_tr, y_val = y[train_index], y[test_index]
        # lr_reduced is not defined in this function; it must be provided by
        # the enclosing module.
        callbacks = [
            EarlyStopping(monitor='val_loss', mode='min', patience=20),
            ModelCheckpoint(filepath=file_path,
                            monitor='val_loss',
                            mode='min',
                            save_best_only=True), lr_reduced
        ]
        model.fit(X_tr, y_tr,
                  epochs=750,
                  batch_size=512,
                  callbacks=callbacks,
                  shuffle=True,
                  validation_data=(X_val, y_val),
                  verbose=1)
        # Restore the checkpoint with the lowest validation loss.
        model.load_weights(file_path)
        oof[test_index] = np.ndarray.flatten(model.predict(X_val))
        predictions += np.ndarray.flatten(model.predict(X_s0_preds))
    predictions /= nfolds
    X_s['keras_nn'] = oof
    X_s_preds['keras_nn'] = predictions
    oof_k = np.asarray(oof)

    # In[Stacking]
    X_s = np.asarray(X_s)
    X_s_preds = np.asarray(X_s_preds)
    oof = np.zeros(len(X_s))
    predictions = np.asarray(sub.target * 0.)
    pipe_preprocessing = make_pipeline(preprocessings['minmaxs'], )
    full = np.concatenate((X_s, X_s_preds), axis=0)
    pipe_preprocessing.fit(full)
    # NOTE(review): as above, the rescaled arrays only drive the fold split;
    # the stacking network is trained on X_s / X_s_preds.
    X = pipe_preprocessing.transform(X_s)
    X_pred = pipe_preprocessing.transform(X_s_preds)
    #
    print('\nstart stacking... ')
    for fold_, (train_index, test_index) in enumerate(kf.split(X)):
        model = Sequential([
            Dense(512, input_shape=(X_s.shape[1], )),
            Activation('linear'),
            Dense(1),
            Activation('sigmoid'),
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy')
        file_path = "NN_ml_" + "_model_" + "loop_" + str(fold_) + ".hdf5"
        X_tr, X_val = X_s[train_index], X_s[test_index]
        y_tr, y_val = y[train_index], y[test_index]
        callbacks = [
            EarlyStopping(monitor='val_loss', mode='min', patience=20),
            ModelCheckpoint(filepath=file_path,
                            monitor='val_loss',
                            mode='min',
                            save_best_only=True), lr_reduced
        ]
        model.fit(X_tr, y_tr,
                  epochs=750,
                  batch_size=512,
                  callbacks=callbacks,
                  shuffle=True,
                  validation_data=(X_val, y_val),
                  verbose=1)
        model.load_weights(file_path)
        oof[test_index] = np.ndarray.flatten(model.predict(X_val))
        predictions += np.ndarray.flatten(model.predict(X_s_preds))
    predictions /= nfolds

    # In[showing result]
    print('\nvalid score: \n', pd.DataFrame(valid_score).transpose())
    inds = np.argmax(mse_score)
    print('\nmse score - worst model found: ', scored_models[inds],
          np.max(mse_score))
    inds = np.argmin(auc_score)
    print('auc score - worst model found: ', scored_models[inds],
          np.min(auc_score))
    inds = np.argmin(mse_score)
    print('mse score - best model found: ', scored_models[inds],
          np.min(mse_score))
    inds = np.argmax(auc_score)
    print('auc score - best model found: ', scored_models[inds],
          np.max(auc_score))
    # The summary scores are best-effort: a metric that cannot be computed is
    # reported and skipped rather than aborting before sub is filled.
    try:
        mse_score_s = mean_squared_error(y, oof_k)
        print('\rmse score - keras_nn: ', mse_score_s)
        mse_score_s = mean_squared_error(y, oof)
        print('\rmse score - stacking: ', mse_score_s)
    except Exception as exc:
        print('mse summary skipped: ', repr(exc))
    try:
        auc_score_s = roc_auc_score(y, oof_k)
        print('auc score - keras_nn: ', auc_score_s)
        auc_score_s = roc_auc_score(y, oof)
        print('auc score - stacking: ', auc_score_s)
    except Exception as exc:
        print('auc summary skipped: ', repr(exc))
    sub['target'] = predictions
#Machine Learning Algorithm (MLA) Selection and initialization CLF = [ #Ensemble Methods ('ada', ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier())), ('bc', ensemble.BaggingClassifier()), ('etc', ensemble.ExtraTreesClassifier()), ('gbc', ensemble.GradientBoostingClassifier()), ('xgbc', xgb.XGBClassifier(max_depth=3)), # xgb.XGBClassifier()), # ('rfc', ensemble.RandomForestClassifier(n_estimators=50)), #Gaussian Processes ('gpc', gaussian_process.GaussianProcessClassifier()), #GLM - remove linear models, since this is a classifier algorithm ('lr', linear_model.LogisticRegressionCV()), ('pac', linear_model.PassiveAggressiveClassifier()), ('rc', linear_model.RidgeClassifierCV()), ('sgd', linear_model.SGDClassifier()), ('pct', linear_model.Perceptron()), #Navies Bayes ('gnb', naive_bayes.GaussianNB()), #Nearest Neighbor ('knn', neighbors.KNeighborsClassifier(n_neighbors=3)), #SVM ('svc', svm.SVC(probability=True)), ('lsvc', svm.LinearSVC()),
coefficient_results.append([col_name]) coefficient_results[counter_coe].append(scipy.stats.pearsonr(df_temp[col_name].values.tolist(), Y)[0]) counter_coe+=1 X = df_temp.values.tolist() X = np.array(X) X = X/X.max(axis=0) #five fold cross-validation numData.append(len(X)) score1= [] score2= [] kf = KFold(len(X), n_folds = 5, shuffle = True ) for train, test in kf: X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test] #X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2) logreg = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear', Cs=[1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8], refit=True) logreg.fit(X_train,y_train) print "step"+str(step[i]) print "Percentage of positive data: ",float(sum(Y))/len(Y) y_pred = logreg.predict(X_test) #print "Accuracy score w/o feature selection: ",logreg.score(X_test,y_test) score1.append(logreg.score(X_test,y_test)) """ # perform recursive feature selection(backward selectoin) rfecv = RFECV(estimator=logreg, step=1, cv=StratifiedKFold(y_train, 4),scoring='accuracy') #rfecv = RFECV(estimator=logreg, step=1, cv=StratifiedKFold(y_train, 4),scoring='roc_auc') rfecv.fit(X_train, y_train) print("Optimal number of features : %d" % rfecv.n_features_) print "rfecv support ",rfecv.support_ print "rfecv ranking ", rfecv.ranking_