rus = RandomUnderSampler(random_state=42)
# test_X, test_y = rus.fit_resample(test_X, test_y)
print("Training data size: ", train_X.shape)
print("Test data size: ", test_X.shape)

# Normalize using StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)
# NOTE: If you are adding your own models, train them on train_X and evaluate them on test_X.

# Models to try
try_models = [
    LogisticRegression(),
    LogisticRegressionCV(),
    KNeighborsClassifier(n_neighbors=20),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=500),
    MLPClassifier(hidden_layer_sizes=(300,), verbose=True, max_iter=500, alpha=0.00001),
    SVC()
]

# Gather metrics here
accuracy_by_model = {}

# Train then evaluate each model
i = 0
for model in try_models:
    # (assumed completion -- the snippet is cut off here) fit on the scaled
    # training split and record test accuracy under the model's class name
    model.fit(train_X, train_y)
    accuracy_by_model[type(model).__name__] = model.score(test_X, test_y)
    i += 1
y_train = data_train['Class Label'].values
# Drop the label column so it does not leak into the features
X_train = data_train.drop(columns=['Class Label']).values
y_test = data_test['Class Label'].values
X_test = data_test.drop(columns=['Class Label']).values

df.head()

# Fit a logistic regression classifier to the training set and report the accuracy of the classifier on the test set
clf = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))),
                           penalty='l2',
                           cv=10,
                           random_state=777,
                           fit_intercept=True,
                           solver='newton-cg',
                           tol=10)  # note: tol=10 is an unusually loose stopping tolerance
clf.fit(X_train, y_train)
print('\n')
print("The optimized L2 regularization parameter is:", clf.C_)

# The coefficients
print('Estimated beta1: \n', clf.coef_)
print('Estimated beta0: \n', clf.intercept_)

# Scoring
clf_y_pred_test = clf.predict(X_test)
clf_y_pred_test = clf_y_pred_test.reshape(len(clf_y_pred_test), 1)
test_df = pd.DataFrame(clf_y_pred_test)
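# A short follow-up sketch (not in the original): report test accuracy for the
# predictions collected above.
from sklearn.metrics import accuracy_score
print("Test accuracy:", accuracy_score(y_test, clf.predict(X_test)))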
df = pd.read_csv("data/march_madness_history.csv") df['Winner'] =np.where(df['Winner'] =="TEAM_1", 1,0) df.head() # %% # Build models meta_data = dict( title="NCAA March Madness 2021", description="Very simple Estimate", analyst = "Kevin Joy", tags=["NCAA", "Basketball", "March Madness"] ) model = ct.Models( df = df[~df.isnull()], formulas = ['Winner~Round+Favorite', 'Winner~Round + Favorite + Seed_diff','Winner~Round + Favorite * Seed_diff + np.square(Seed_diff)' ], models = [LogisticRegression(), LogisticRegressionCV(cv=5), RandomForestClassifier(), AdaBoostClassifier()], test_split=.10, **meta_data, Type ="classifier" ) model_reg = ct.Models( df = df, formulas = ['Spread~Round+Favorite', 'Spread~Round + Favorite + Seed_diff','Spread~Round + Favorite * Seed_diff + np.square(Seed_diff)' ], models = [LinearRegression(), LassoCV(), RidgeCV(), RandomForestRegressor()], test_split=.10, **meta_data, Type = 'regression' )
regsNames = ['LinearRegression', 'RidgeCV', 'LassoCV', 'ElasticNetCV']
Regs = [
    LinearRegression(normalize=True),
    RidgeCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0], cv=10, normalize=True),
    LassoCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0], cv=10, normalize=True),
    ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0], cv=10, normalize=True)
]

Classifiers = [
    LogisticRegressionCV(cv=10),
    DecisionTreeClassifier(max_depth=3),
    svm.SVC(kernel='rbf', probability=True),
    svm.SVC(kernel='linear', probability=True),
    neighbors.KNeighborsClassifier(n_neighbors=7)
]  # naive_bayes.GaussianNB(), neighbors.KNeighborsClassifier(n_neighbors=7)
ClassifiersNames = [
    'LogisticRegressionCV', 'DecisionTreeClassifier', 'svm.SVC_rbf',
    'svm.SVC_linear', 'neighbors.KNeighborsClassifier'
]

RegsComp = [
    LinearRegression(normalize=True),
    DecisionTreeRegressor(max_depth=3),
    svm.SVR(kernel='rbf'),
    metrics.accuracy_score(y_test, lr_predict_test)))
print(metrics.confusion_matrix(y_test, lr_predict_test))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test))
print(metrics.recall_score(y_test, lr_predict_test))

#%% [markdown]
# ### LogisticRegressionCV

#%%
from sklearn.linear_model import LogisticRegressionCV
# n_jobs=-1 uses all cores to parallelize the cross-validation
lr_cv_model = LogisticRegressionCV(n_jobs=-1, random_state=42, Cs=3, cv=10,
                                   refit=False, class_weight="balanced")
lr_cv_model.fit(X_train, y_train.ravel())

#%% [markdown]
# ### Predict on Test data

#%%
lr_cv_predict_test = lr_cv_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_cv_predict_test)))
print(metrics.confusion_matrix(y_test, lr_cv_predict_test))
test_label = test_label.values

################################
X_train = train_data
X_test = test_data
Y_train = train_label
Y_test = test_label

# Standardize the training data
ss = StandardScaler()
X_train = ss.fit_transform(X_train)  # fit on the training data first, then transform it
X_test = ss.transform(X_test)  # the test data must be scaled with the same fitted scaler

lr = LogisticRegressionCV(multi_class="ovr", fit_intercept=True,
                          Cs=np.logspace(-2, 2, 20), cv=2, penalty="l2",
                          solver="lbfgs", tol=0.01)
re = lr.fit(X_train, Y_train)
r = re.score(X_train, Y_train)
print('R(score):', r)
print('coefficient:', re.coef_)
print("intercept:", re.intercept_)
print("sparsity ratio of the coefficients: %.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100))
print("=========sigmoid-transformed values, i.e. the probability p=========")
print(re.predict_proba(X_test))  # sigmoid-transformed values, i.e. the probability p

# Saving and persisting the model
import joblib  # sklearn.externals.joblib was removed from scikit-learn; use the joblib package directly
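# A minimal sketch (assumed; the file name is hypothetical) of the save/load
# round trip the joblib import above presumably leads into.
joblib.dump(lr, "lr_cv_model.pkl")
restored = joblib.load("lr_cv_model.pkl")
print(restored.score(X_test, Y_test))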
def QuickML_Ensembling(X_train, y_train, X_test, y_test='', modeltype='Regression',
                       Boosting_Flag=False, scoring='', verbose=0):
    """Quickly builds and runs multiple models for a clean data set (only numerics)."""
    start_time = time.time()
    seed = 99
    FOLDS = 5
    model_dict = {}
    model_tuples = []
    if len(X_train) <= 100000 and X_train.shape[1] < 50:
        NUMS = 100
    else:
        try:
            X_train = X_train.sample(frac=0.30, random_state=99)
            y_train = y_train[X_train.index]
        except:
            pass
        NUMS = 200
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        # scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        # random_state is only valid with shuffle=True; shuffle=False plus a seed raises an error
        scv = KFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV', model5))
        else:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV', model5))
        if Boosting_Flag is None:
            model6 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Bagging_Regressor', model6))
        elif not Boosting_Flag:
            model6 = LinearSVR()
            model_tuples.append(('Linear_SVR', model6))
        else:
            model6 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        model7 = KNeighborsRegressor(n_neighbors=8)
        model_tuples.append(('KNN_Regressor', model7))
        if Boosting_Flag is None:
            #### A boosting model may already be present, so choose a different kind of regressor here
            model8 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model8))
        elif not Boosting_Flag:
            #### A boosting model may already be present, so choose a different kind of regressor here
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost', model8))
        else:
            model8 = RandomForestRegressor(bootstrap=False, max_depth=10, max_features='auto',
                                           min_samples_leaf=2, n_estimators=200, random_state=99)
            model_tuples.append(('RF_Regressor', model8))
    else:
        if scoring == '':
            scoring = 'accuracy'
        num_classes = len(np.unique(y_test))
        scv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LinearDiscriminantAnalysis()
            model_tuples.append(('Linear_Discriminant', model5))
        else:
            model5 = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 10, 100],
                                          solver='liblinear', random_state=seed)
            model_tuples.append(('Logistic_Regression_CV', model5))
        if Boosting_Flag is None:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        elif not Boosting_Flag:
            model6 = LinearSVC()
            model_tuples.append(('Linear_SVC', model6))
        else:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        if modeltype == 'Binary_Classification':
            model7 = GaussianNB()
        else:
            model7 = MultinomialNB()
        model_tuples.append(('Naive_Bayes', model7))
        if Boosting_Flag is None:
            #### A boosting model may already be present, so choose a different kind of classifier here
            model8 = RandomForestClassifier(bootstrap=False, max_depth=10, max_features='auto',
                                            min_samples_leaf=2, n_estimators=200, random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
        elif not Boosting_Flag:
            #### A boosting model may already be present, so choose a different kind of classifier here
            sgd_best_model = SGDClassifier(alpha=1e-06, loss='log', max_iter=1000, penalty='l2',
                                           learning_rate='constant', eta0=.1, random_state=3, tol=None)
            model8 = OneVsRestClassifier(sgd_best_model)
            model_tuples.append(('One_vs_Rest_Classifier', model8))
        else:
            model8 = RandomForestClassifier(bootstrap=False, max_depth=10, max_features='auto',
                                            min_samples_leaf=2, n_estimators=200, random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
    model_dict = dict(model_tuples)
    models, results = run_ensemble_models(model_dict, X_train, y_train, X_test, y_test,
                                          scoring, modeltype)
    return models, results
impute_value = train.Age.median()
train.Age.fillna(impute_value, inplace=True)
test.Age.fillna(impute_value, inplace=True)

train['IsFemale'] = (train.Sex == 'female').astype(int)
test['IsFemale'] = (test.Sex == 'female').astype(int)

predictors = ['Pclass', 'IsFemale', 'Age']
X_train = train[predictors].values
X_test = test[predictors].values
y_train = train['Survived'].values
X_train[:5]
y_train[:5]

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
y_predict[:10]
(y_predict == test['Survived'].values).mean()

from sklearn.linear_model import LogisticRegressionCV
model_cv = LogisticRegressionCV(10)  # the first positional argument is Cs
model_cv.fit(X_train, y_train)
y_predict = model_cv.predict(X_test)
y_predict.shape
test.shape

from sklearn.model_selection import cross_val_score
model = LogisticRegression(C=10)
scores = cross_val_score(model, X_train, y_train, cv=4)
scores
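# A one-line follow-up (not in the original) summarizing the 4-fold scores above.
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))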
# 1) Set ridge regression penalty
# 2) Search 100 values of lambda
# 3) Set 10-fold cross validation
# 4) Use the liblinear solver
# 5) Set class weight to balanced
# 6) Use accuracy as the scoring measure
# 7) Start with 1,000 iterations and increase as necessary

# In[19]:

# Build a logistic regression model as a baseline
logit_reg = LogisticRegressionCV(penalty="l2", Cs=100, solver='liblinear', cv=10,
                                 class_weight='balanced', scoring='accuracy',
                                 max_iter=1000)
logit_reg.fit(train_X, train_y)

# In[20]:

# display confusion matrices for train and test data
classificationSummary(train_y, logit_reg.predict(train_X))
classificationSummary(test_y, logit_reg.predict(test_X))

# In[21]:
def train(self, train_path, as_text=False, standardization=False, cut=True, multitrain=False):
    sys.stderr.write("o Reading training data...\n")
    if multitrain:
        df_train, todrop_train = self.read_conll_sentbreak(
            train_path, neighborwindowsize=self.windowsize,
            as_text=as_text, cut=False, multitrain=multitrain)
    else:
        df_train, todrop_train = self.read_conll_sentbreak(
            train_path, neighborwindowsize=self.windowsize,
            as_text=as_text, cut=cut)
    cols2keep = [col for col in df_train.columns if col not in todrop_train]
    X_train = df_train[cols2keep]
    Y_train = df_train['gold_seg']
    df_train = None
    predictors_train = list(X_train)

    # standardization of vectors
    if standardization:
        from sklearn import preprocessing
        std_scale = preprocessing.StandardScaler().fit(X_train)
        # keep a DataFrame so the column sort below still works on the scaled matrix
        X_train = pd.DataFrame(std_scale.transform(X_train), columns=X_train.columns)

    gc.collect()  # Free up memory for csr_matrix conversion
    X_train = X_train[sorted(X_train.columns)]
    # X_train = csr_matrix(X_train)

    logmodel = LogisticRegressionCV(cv=3, n_jobs=3, penalty='l1',
                                    solver="liblinear", random_state=42)

    if multitrain:
        if X_train.shape[0] <= 95000:
            multitrain_preds = get_multitrain_preds(logmodel, X_train, Y_train, 5)
            multitrain_preds = "\n".join(multitrain_preds.strip().split("\n"))
            with io.open(script_dir + os.sep + "multitrain" + os.sep + self.name + '_' + self.corpus,
                         'w', newline="\n") as f:
                sys.stderr.write("o Serializing multitraining predictions\n")
                f.write(multitrain_preds)
        else:
            sys.stderr.write('o Skipping multitrain\n')

    # Fit complete dataset
    logmodel.fit(X_train, Y_train)
    logmodel.sparsify()

    if multitrain and X_train.shape[0] > 95000:
        preds, probas = zip(*self.predict(train_path, as_text=False))
        with io.open(script_dir + os.sep + "multitrain" + os.sep + self.name + '_' + self.corpus,
                     'w', newline="\n") as f:
            sys.stderr.write("o Serializing predictions from partial model\n")
            outlines = [str(preds[i]) + "\t" + str(probas[i]) for i in range(len(probas))]
            outlines = "\n".join(outlines)
            f.write(outlines + "\n")

    pickle_objects = (logmodel, predictors_train)
    pickle.dump(pickle_objects, open(self.model_path, 'wb'))
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
features.plot(kind='barh', figsize=(25, 25))

model = SelectFromModel(clf, prefit=True)
train_reduce = model.transform(train)
train_reduce.shape
test_reduce = model.transform(test)
test_reduce.shape

# MODEL BUILDING
logreg = LogisticRegression()
logreg_cv = LogisticRegressionCV()
rf = RandomForestClassifier()
gboost = GradientBoostingClassifier()

models = [logreg, logreg_cv, rf, gboost]
for model in models:
    print('Cross-validation of: {0}'.format(model.__class__))
    score = compute_score(clf=model, X=train_reduce, y=targets, scoring='accuracy')
    print('CV score = {0}'.format(score))
    print('****')
    result = pd.DataFrame(result, index=['HorseWin', 'HorseRankTop3', 'HorseRankTop50Percent'])
    return result


df_train = pd.read_csv('data/training.csv')
train_X = df_train[[
    'actual_weight', 'declared_horse_weight', 'draw', 'win_odds',
    'recent_ave_rank', 'jockey_ave_rank', 'trainer_ave_rank', 'race_distance'
]].values
train_Y = np.ravel(df_train[['finishing_position']].values)

# 3.1.1
print("Start LogisticRegression CV")
start = time.time()
lr_model = LogisticRegressionCV(cv=10, random_state=3320)
lr_model.fit(train_X, train_Y)
print("End LogisticRegression CV, Time: %s s" % (time.time() - start))

# 3.1.2
print("Start GaussianNB CV")
start = time.time()
skf_list = list(StratifiedKFold(n_splits=10, random_state=3320, shuffle=True).split(train_X, train_Y))
nb_model = cvTrain(GaussianNB(), train_X, train_Y)
print("End GaussianNB CV, Time: %s s" % (time.time() - start))

print("Start self NaiveBayes")
start = time.time()
clf = NaiveBayes()
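# A short sketch (not in the original; the test file path is hypothetical) of
# scoring the fitted lr_model on held-out races with the same feature columns.
df_test = pd.read_csv('data/testing.csv')
test_X = df_test[[
    'actual_weight', 'declared_horse_weight', 'draw', 'win_odds',
    'recent_ave_rank', 'jockey_ave_rank', 'trainer_ave_rank', 'race_distance'
]].values
print(lr_model.predict(test_X)[:10])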
def QuickML_Ensembling(X_train, y_train, X_test, y_test='', modeltype='Regression',
                       Boosting_Flag=False, scoring='', verbose=0):
    """Quickly builds and runs multiple models for a clean data set (only numerics)."""
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Voting models
    estimators = []
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS, random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            metrics1 = rmse(results1, y_test).mean() if not isinstance(y_test, str) else 0
            estimators.append(('Bagging1', model5, metrics1))
        else:
            model5 = LassoLarsCV(cv=scv)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            metrics1 = rmse(results1, y_test).mean() if not isinstance(y_test, str) else 0
            estimators.append(('LassoLarsCV Regression', model5, metrics1))
        model6 = LassoCV(alphas=np.logspace(-10, -1, 50), cv=scv, random_state=seed)
        results2 = model6.fit(X_train, y_train).predict(X_test)
        metrics2 = rmse(results2, y_test).mean() if not isinstance(y_test, str) else 0
        estimators.append(('LassoCV Regularization', model6, metrics2))
        model7 = RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv)
        results3 = model7.fit(X_train, y_train).predict(X_test)
        metrics3 = rmse(results3, y_test).mean() if not isinstance(y_test, str) else 0
        estimators.append(('RidgeCV Regression', model7, metrics3))
        ## Create an ensemble model ####
        if Boosting_Flag:
            model8 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS, random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            metrics4 = rmse(results4, y_test).mean() if not isinstance(y_test, str) else 0
            estimators.append(('Bagging2', model8, metrics4))
        else:
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                min_samples_leaf=2, max_depth=1, random_state=seed),
                n_estimators=NUMS, random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            metrics4 = rmse(results4, y_test).mean() if not isinstance(y_test, str) else 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if verbose >= 2:
            print('QuickML_Ensembling Model results:')
            print('    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                  % (estimator_names[0], metrics1, estimator_names[1], metrics2,
                     estimator_names[2], metrics3, estimator_names[3], metrics4))
    else:
        if scoring == '':
            scoring = 'accuracy'
        # shuffle=True is required when a random_state is given to StratifiedKFold
        scv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            model5 = ExtraTreesClassifier(n_estimators=NUMS, min_samples_leaf=2, random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            metrics1 = accu(results1, y_test).mean() if not isinstance(y_test, str) else 0
            estimators.append(('Bagging', model5, metrics1))
        else:
            model5 = LogisticRegressionCV(Cs=np.linspace(0.01, 100, 20), cv=scv,
                                          scoring=scoring, random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            metrics1 = accu(results1, y_test).mean() if not isinstance(y_test, str) else 0
            estimators.append(('Logistic Regression', model5, metrics1))
        model6 = LinearDiscriminantAnalysis()
        results2 = model6.fit(X_train, y_train).predict(X_test)
        metrics2 = accu(results2, y_test).mean() if not isinstance(y_test, str) else 0
        estimators.append(('Linear Discriminant', model6, metrics2))
        if modeltype == 'Binary_Classification':
            float_cols = X_train.columns[(X_train.dtypes == float).values].tolist()
            int_cols = X_train.columns[(X_train.dtypes == int).values].tolist()
            if (X_train[float_cols + int_cols] < 0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = GaussianNB()
        else:
            float_cols = X_train.columns[(X_train.dtypes == float).values].tolist()
            int_cols = X_train.columns[(X_train.dtypes == int).values].tolist()
            if (X_train[float_cols + int_cols] < 0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = MultinomialNB()
        results3 = model7.fit(X_train, y_train).predict(X_test)
        metrics3 = accu(results3, y_test).mean() if not isinstance(y_test, str) else 0
        estimators.append(('Naive Bayes', model7, metrics3))
        if Boosting_Flag:
            #### If Boosting_Flag is True, a boosting model is already present, so choose bagging here.
            model8 = ExtraTreesClassifier(n_estimators=NUMS, min_samples_leaf=2, random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            metrics4 = accu(results4, y_test).mean() if not isinstance(y_test, str) else 0
            estimators.append(('Bagging', model8, metrics4))
        else:
            ## Create an ensemble model ####
            model8 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                n_estimators=NUMS, random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            metrics4 = accu(results4, y_test).mean() if not isinstance(y_test, str) else 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if not isinstance(y_test, str):
            if verbose >= 2:
                print('QuickML_Ensembling Model results:')
                print('    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                      % (estimator_names[0], metrics1, estimator_names[1], metrics2,
                         estimator_names[2], metrics3, estimator_names[3], metrics4))
        else:
            if verbose >= 1:
                print('QuickML_Ensembling completed.')
    stacks = np.c_[results1, results2, results3, results4]
    if verbose == 1:
        print('    Time taken for Ensembling: %0.1f seconds' % (time.time() - start_time))
    return estimator_names, stacks
#########################################################
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    path = u'..\\9.Regression\\iris.data'  # path to the data file
    # data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    data = pd.read_csv(path, header=None)
    x, y = data[range(4)], data[4]
    y = pd.Categorical(y).codes
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=50)

    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 4, 'eta': 0.3, 'silent': 1,
             'objective': 'multi:softmax', 'num_class': 3}

    bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
    y_hat = bst.predict(data_test)
    result = y_test == y_hat
    print('accuracy:\t', float(np.sum(result)) / len(y_hat))
    print('END.....\n')

    models = [('LogisticRegression', LogisticRegressionCV(Cs=10, cv=3)),
              ('RandomForest', RandomForestClassifier(n_estimators=30, criterion='gini'))]
    for name, model in models:
        model.fit(x_train, y_train)
        print(name, 'training-set accuracy:', accuracy_score(y_train, model.predict(x_train)))
        print(name, 'test-set accuracy:', accuracy_score(y_test, model.predict(x_test)))
tmp_text = " ".join( [row.complaint_type, row.descriptor, row.location_type]) except Exception: print(row) break raw_text.append(tmp_text) print("{} elapsed".format(time() - tick)) print("Split data & fit vectorizer/classifier") tick = time() # train/test split X_train, X_test, y_train, y_test = train_test_split(raw_text, targets, test_size=0.1, random_state=19) # BoNG (size=1,2) vec = CountVectorizer(ngram_range=(1, 2), lowercase=True, binary=False, stop_words="english") # LogisticRegression with automatic regularization tuning lr = LogisticRegressionCV(class_weight="balanced") # fit on train data lr.fit(vec.fit_transform(X_train), y_train) print("{} elapsed".format(time() - tick)) print("\nEVAL on held-out data\n") # eval on test print( classification_report(y_test, lr.predict(vec.transform(X_test)), digits=3))
def train_and_score(clf, X_train, y_train, X_test, y_test):
    clf = clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    cf = confusion_matrix(y_test, preds)
    print(plot_confusion_matrix(cf, class_names=positions))
    print("  Accuracy: ", accuracy_score(y_test, preds))
    print("  F1 score: ", metrics.f1_score(y_test, preds, average='weighted'))


# Logistic Regression
LR = LogisticRegressionCV(cv=5, random_state=20, solver='lbfgs', multi_class='multinomial')
train_and_score(LR, X_train_dev, y_train_dev, X_test, y_test)
plot_learning_curve(LR, "Logistic Regression Curve", X_train_dev, y_train_dev)

# create a new knn model
knn_model = KNeighborsClassifier()
# create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 25)}
# use gridsearch to test all values for n_neighbors
KNN = GridSearchCV(knn_model, param_grid, cv=5)
train_and_score(KNN, X_train_dev, y_train_dev, X_test, y_test)
# knn model
new_feat_test['total_sec'] = time_delt_sec_scaled[idx_split:]

# print(get_auc_lr_valid(csr_matrix(hstack([X_train_sparse, new_feat_train[['start_month', 'start_hour']].values.reshape(-1, 2)])), y_train))
# print(get_auc_lr_valid(csr_matrix(hstack([X_train_sparse, new_feat_train[['start_month', 'morning']].values.reshape(-1, 2)])), y_train))
# print(get_auc_lr_valid(csr_matrix(hstack([X_train_sparse, new_feat_train[['start_month', 'start_hour', 'morning']].values.reshape(-1, 3)])), y_train))

mm_train = csr_matrix(hstack([
    X_train_sparse,
    new_feat_train[['start_month', 'start_hour', 'morning', 'day', 'evening',
                    'night', 'total_sites', 'total_sec']].values.reshape(-1, 8)
]))
mm_test = csr_matrix(hstack([
    X_test_sparse,
    new_feat_test[['start_month', 'start_hour', 'morning', 'day', 'evening',
                   'night', 'total_sites', 'total_sec']].values.reshape(-1, 8)
]))

# logit.fit(mm_train, y_train)
# test_preds = logit.predict_proba(mm_test)[:, 1]

"""Tuning the regularization coefficient"""
C = np.logspace(-3, 1, 10)
time_split = TimeSeriesSplit(n_splits=10)
logitCV = LogisticRegressionCV(Cs=C, cv=time_split, scoring='roc_auc')
logitCV.fit(mm_train, y_train)
# print(get_auc_lr_valid(mm_train, y_train, C=logitCV.C_[0]))
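# An assumed follow-up, mirroring the commented-out lines above: read off the
# selected C and score the held-out sessions.
print("chosen C:", logitCV.C_[0])
test_preds = logitCV.predict_proba(mm_test)[:, 1]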
# PCA analysis; PCA is fit only on the training data and then applied to both the
# training and validation data
pca = PCA()
pca.fit(X_train)
pcaTrain = pca.transform(X_train)
pcaValid = pca.transform(X_valid)
print(np.shape(pcaTrain))

# The dataset is not balanced: more samples of class 0 than 1. But the class
# representation in the validation set is similar to that in the training data.
print(np.bincount(y_train))
print(np.bincount(y_valid))
print()

# Models, nr of folds = 5
logCV = LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5)  # Lasso-regularized logistic regression
ridgeCV = RidgeClassifierCV(alphas=np.array([0.01, 0.1, 1, 100, 500, 1000, 5000, 10000]), cv=5)
forest = ExtraTreesClassifier(n_estimators=nFeat)

logResOrg = []
ridgeResOrg = []
logResPCA = []
ridgeResPCA = []
for ix in range(10):
    logCV.fit(X_train, y_train)
    ridgeCV.fit(X_train, y_train)
y = Gy[1]
true_beta = Gy[0]
# es = EarlyStopping(monitor='val_loss', patience=30, verbose=2)

autoencoder = Sequential()
autoencoder.add(Dense(r_hat, activation=aut_met, use_bias=False, input_shape=(p,)))
autoencoder.add(Dense(p, activation=aut_met, use_bias=False))
autoencoder.compile(loss=aut_loss, optimizer=keras.optimizers.Adam())
autoencoder.fit(X, X, epochs=aut_epoch, batch_size=8, verbose=aut_verb)

C = autoencoder.predict(X)
E = X - C
sigma = np.sqrt(np.sum(E ** 2) / (n * p))
X_ko = C + sigma * np.random.randn(n, p)
Xnew = np.hstack((X, X_ko))

log = LogisticRegressionCV(penalty='l1', solver='liblinear', n_jobs=-1, cv=10).fit(Xnew, y.reshape((n,)))
beta = log.coef_[0]
W = (beta[:p]) ** 2 - (beta[p:]) ** 2
t = np.sort(np.concatenate(([0], abs(W))))
ratio = [sum(W <= -tt) / max(1, sum(W >= tt)) for tt in t[:p]]
ind = np.where(np.array(ratio) <= q)[0]
if len(ind) == 0:
    T = float('inf')
else:
    T = t[ind[0]]
selected = np.where(W >= T)[0]

ratio_plus = [(1 + sum(W <= -tt)) / max(1, sum(W >= tt)) for tt in t[:p]]
ind_plus = np.where(np.array(ratio_plus) <= q)[0]
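# Assumed completion of the knockoff+ step, by direct analogy with the
# T/selected computation above (not in the original snippet).
if len(ind_plus) == 0:
    T_plus = float('inf')
else:
    T_plus = t[ind_plus[0]]
selected_plus = np.where(W >= T_plus)[0]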
x_train_cat, x_test_cat, y_train_cat, y_test_cat = data_splits
x_train_cat, x_val_cat, y_train_cat, y_val_cat = train_test_split(
    x_train_cat, y_train_cat, test_size=0.25, random_state=random_seed)  # 0.25 x 0.7 = 0.175
eval_set_cat = [(x_val_cat, y_val_cat)]
feature_names_cat = list(x_train_cat.columns.values)

# ## 2. Modelling Helper Functions

# # 3. Model Training

# ## 3.1 Logistic Regression

logit_cv = LogisticRegressionCV(cv=5, n_jobs=-1, random_state=random_seed,
                                refit=True, scoring=custom_cost_scorer)
logit_cv.fit(x_train, y_train)
logit_report = report(logit_cv, x_train, y_train, x_test, y_test,
                      importance_plot=True, feature_labels=feature_names,
                      confusion_labels=confusion_lbs, verbose=False)

# ## 3.2 Random Forests

# ### Training
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, \
    RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

models = [
    DecisionTreeClassifier(random_state=rs, max_depth=15),
    SVC(gamma='auto'),
    NuSVC(gamma='auto'),
    LinearSVC(),
    SGDClassifier(max_iter=100, tol=1e-3),
    KNeighborsClassifier(),
    LogisticRegression(solver='lbfgs'),
    LogisticRegressionCV(cv=3),
    BernoulliNB(),
    BaggingClassifier(),
    ExtraTreesClassifier(n_estimators=200),
    RandomForestClassifier(n_estimators=200),
    AdaBoostClassifier(),
    GradientBoostingClassifier(random_state=rs),
    MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(2, 1), max_iter=2000)
]


def score_model(X, y, estimator, **kwargs):
    """
    Test various estimators.
    """
    y = LabelEncoder().fit_transform(y)
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('one_hot_encoder', OneHotEncoder()),
        ('estimator', estimator),
    ])
    # (assumed completion -- the snippet is cut off here) fit the pipeline and
    # return a weighted F1 score on the training data
    model.fit(X, y, **kwargs)
    return f1_score(y, model.predict(X), average='weighted')
def Classification(self):
    return {
        'RF': {
            'estimator': RandomForestClassifier(oob_score=True, n_estimators=100, n_jobs=10),
            'parameters': {
                'GSCV': {
                    'max_features': [0.6, 0.7, 0.8, 0.9, 'auto', 'log2', None],
                    'max_depth': [3, 4, 5, None],
                    'n_estimators': [100],
                    'class_weight': ['balanced', 'balanced_subsample', None],
                    'min_samples_split': [2, 3, 4, 5],
                    'min_samples_leaf': [2, 3, 4, 5],
                    'criterion': ['gini', 'entropy']
                },
                'RSCV': {
                    'max_features': [0.6, 0.7, 0.8, 0.9, 0.99, 'auto', 'log2', None],
                    'max_depth': [3, 4, 5, 6, None],
                    'n_estimators': [100],
                    'criterion': ['gini', 'entropy'],
                    'class_weight': ['balanced', 'balanced_subsample', None],
                    'min_samples_split': [2, 3, 4, 5, 6, 8, 11, 15],
                    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 9, 11, 15],
                }
            }
        },
        'GBDT': {
            'estimator': GradientBoostingClassifier(n_estimators=200, learning_rate=0.1,
                                                    subsample=1.0, max_depth=5),
            'parameters': {
                'GSCV': {
                    'max_features': (0.5, 0.75, 0.8, 0.9, 'auto'),
                    'loss': ['deviance', 'exponential'],
                    'max_depth': [3, 4, 5],
                    'n_estimators': [100],
                    'learning_rate': [0.005, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9],
                    'min_samples_split': [2, 3, 4, 5],
                    'subsample': [0.75, 0.85, 0.95]
                },
                'RSCV': {
                    'max_features': (0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 'auto', 'log2', None),
                    'loss': ['deviance', 'exponential'],
                    'max_depth': [3, 4, 5, 6, 7, 8, 9, 11, 15],
                    'n_estimators': [100],
                    'learning_rate': [0.005, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9],
                    'min_samples_split': [2, 3, 4, 5, 6, 8],
                    'subsample': [0.7, 0.8, 0.9, 1]
                },
            }
        },
        'XGB': {
            'estimator': XGBClassifier(n_estimators=100,
                                       objective='binary:logistic',  # multi:softprob
                                       booster='gbtree', silent=True, max_depth=4,
                                       missing=None, reg_alpha=0, reg_lambda=1,
                                       learning_rate=0.1, n_jobs=10),
            'parameters': {
                'GSCV': {
                    'colsample_bytree': [0.75, 0.85, 0.95],
                    'subsample': [0.75, 0.85, 0.95],
                    'reg_alpha': [0, 0.1, 0.5, 1, 2],
                    'reg_lambda': [0.1, 0.5, 1, 2, 2.5],
                    'max_depth': [3, 4, 5, 6],
                    'n_estimators': [100],
                    'learning_rate': [0.01, 0.05, 0.1, 0.2],
                    # 'min_child_weight': [1, 2],
                },
                'RSCV': {
                    'subsample': [0.7, 0.8, 0.85],
                    'colsample_bytree': [0.7, 0.8, 0.9],
                    'colsample_bylevel': [0.7, 0.8, 0.9],
                    'max_delta_step': [0, 1],
                    'colsample_bynode': [1],
                    'scale_pos_weight': [0.8, 1, 1.2],
                    'base_score': [0.5],
                    'n_estimators': [100],
                    'gamma': [0, 0.005, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9],
                    'min_child_weight': [1, 2, 3, 4, 5],
                    'min_samples_split': [2, 3, 4, 5, 6, 7],
                    'max_depth': [3, 4, 5, 6, 7, 8],
                    'reg_lambda': [0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.96, 1],
                    'reg_alpha': [0, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7],
                    'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
                },
            }
        },
        'LGBM': {
            'estimator': LGBMClassifier(boosting_type='gbdt', num_leaves=33, max_depth=5,
                                        learning_rate=0.1, n_estimators=100, class_weight=None,
                                        min_split_gain=0.0, min_child_weight=1e-3,
                                        min_child_samples=10, subsample=0.8, subsample_freq=0,
                                        colsample_bytree=0.8, reg_alpha=0.0, reg_lambda=0.0,
                                        random_state=None, n_jobs=-1, silent=False,
                                        importance_type='split', verbose=-1),
            'parameters': {
                'GSCV': {
                    'num_leaves': [9, 17, 33, 65],
                    'max_depth': [-1, 3, 4, 5, 6],
                    'learning_rate': [0.01, 0.05, 0.1, 0.2],
                    'n_estimators': [100],
                    'reg_alpha': [0.01, 0.1, 0.5, 1, 1.5, 2],
                    'reg_lambda': [0.01, 0.1, 0.5, 1, 1.5, 2],
                    'class_weight': ['balanced', None],
                    'subsample': [0.7, 0.78, 0.85],
                    'colsample_bytree': [0.7, 0.78, 0.85],
                },
                'RSCV': {
                    'num_leaves': [9, 17, 33, 65, 129],
                    'max_depth': [-1, 3, 4, 5, 6, 7],
                    'learning_rate': [0.01, 0.05, 0.1, 0.2],
                    'n_estimators': [100],
                    'reg_alpha': [0, 0.01, 0.1, 0.5, 0.75, 1, 1.5, 2, 2.5, 3],
                    'reg_lambda': [0, 0.01, 0.1, 0.5, 0.75, 1, 1.5, 2, 2.5, 3],
                    'class_weight': ['balanced', None],
                    'subsample': [0.7, 0.78, 0.85],
                    'colsample_bytree': [0.7, 0.78, 0.85],
                }
            }
        },
        'AdaB_DT': {
            'estimator': AdaBoostClassifier(DecisionTreeClassifier(criterion='gini', splitter='best',
                                                                   class_weight='balanced',
                                                                   min_samples_leaf=1,
                                                                   max_features=None),
                                            algorithm='SAMME.R', learning_rate=1, n_estimators=500),
            'parameters': {
                'GSCV': {
                    'n_estimators': range(400, 800, 1000),
                    'algorithm': ['SAMME', 'SAMME.R'],
                    'learning_rate': [0.3, 0.5, 0.7, 0.9, 0.95, 1, 2]
                },
                'RSCV': {}
            }
        },
        'MLP': {
            'estimator': MLPClassifier(max_iter=3000, solver='lbfgs', alpha=1e-05, tol=1e-4,
                                       hidden_layer_sizes=(50, 50), random_state=None),
            'parameters': {
                'GSCV': {
                    'alpha': [10**i for i in [-6, -5, -4, -3, -1, 1]],
                    'max_iter': [15000],
                    'solver': ['adam', 'lbfgs', 'sgd'],
                    'activation': ["logistic", 'identity', "relu", "tanh"],
                    'hidden_layer_sizes': [(50, 30, 20, 10, 10), (15, 25, 15, 10),
                                           (8, 15, 10, 10), (30, 10, 40, 10), (10, 20, 10, 10)],
                    'learning_rate': ["constant", "invscaling", "adaptive"],
                    'tol': [1e-4]
                },
                'RSCV': {
                    'hidden_layer_sizes': [(randint.rvs(8, 30, 1), randint.rvs(5, 20, 1),
                                            randint.rvs(5, 30, 1)),
                                           (randint.rvs(8, 40, 1), randint.rvs(5, 40, 1))],
                    'activation': ["logistic", 'identity', "relu", "tanh"],
                    'solver': ['adam', 'lbfgs', 'sgd'],
                    'alpha': [10**i for i in [-6, -5, -4, -3, -1, 1]],  # uniform(1e-06, 0.9)
                    'max_iter': [15000],
                    'learning_rate': ["constant", "invscaling", "adaptive"]
                }
            }
        },
        'LinearSVM': {
            'estimator': LinearSVC(penalty='l2', dual=True, tol=0.0001, C=1,
                                   max_iter=2e6, random_state=None),
            'parameters': {
                'GSCV': {
                    'penalty': ['l2'],
                    'dual': [True],
                    'loss': ['hinge', 'squared_hinge'],
                    'tol': [5e-7, 1e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3, 1e-3, 1e-2, 5e-2],
                    # 'C': [0.1, 0.3, 0.5, 0.8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 19, 20, 50],
                    'C': np.power(10, np.linspace(-2, 2, 20)),
                    'max_iter': [4e6, 6e6, 8e6, 1.4e7]
                },
                'RSCV': {}
            }
        },
        'LinearSVMl1': {
            'estimator': LinearSVC(penalty='l1', dual=False, tol=0.0001, C=1,
                                   max_iter=3e6, random_state=None),
            'parameters': {
                'GSCV': {
                    'penalty': ['l1', 'l2'],
                    'dual': [False, True],
                    'loss': ['hinge', 'squared_hinge'],
                    'tol': [5e-7, 1e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3, 1e-3, 1e-2, 5e-2],
                    'C': np.power(10, np.linspace(-2, 2, 20)),
                    'max_iter': [4e6, 6e6, 8e6, 1.4e7]
                },
                'RSCV': {}
            }
        },
        'SVMlinear': {
            'estimator': SVC(kernel="linear", probability=True, C=1.0,
                             decision_function_shape='ovr', random_state=None),
            'parameters': {
                'GSCV': [{
                    'kernel': ['linear'],
                    'tol': [5e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3, 1e-3, 1e-2, 5e-2, 5e-1],
                    'C': np.power(10, np.linspace(-2, 2, 20)),
                }],
                'RSCV': {}
            }
        },
        'SVMrbf': {
            'estimator': SVC(kernel='rbf', gamma='scale', probability=True, C=1,
                             decision_function_shape='ovr'),
            'parameters': {
                'GSCV': {
                    'kernel': ['rbf'],
                    'gamma': [1e1, 1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 5e-3, 5e-4, 'auto'],
                    'tol': [5e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3, 1e-3, 1e-2, 5e-2, 5e-1],
                    'C': np.power(10, np.linspace(-2, 2, 20)),
                },
                'RSCV': {}
            }
        },
        'SVM': {
            'estimator': SVC(kernel='rbf', gamma='scale', probability=True, C=1,
                             decision_function_shape='ovr'),
            'parameters': {
                'GSCV': [
                    {
                        'kernel': ['rbf', 'sigmoid'],
                        'gamma': [1e1, 1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 5e-3, 5e-4, 'auto'],
                        'tol': [5e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3, 1e-3, 1e-2, 5e-2, 5e-1],
                        'C': np.power(10, np.linspace(-2, 2, 20)),
                    },
                    {
                        'kernel': ['poly'],
                        'degree': [2, 3, 4, 5],
                        'gamma': [1e1, 1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 5e-3, 5e-4, 'auto'],
                        'tol': [5e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3, 1e-3, 1e-2, 5e-2, 5e-1],
                        'C': np.power(10, np.linspace(-2, 2, 20)),
                    },
                ],
                'RSCV': {}
            }
        },
        'nuSVMrbf': {
            'estimator': NuSVC(kernel='rbf', gamma='scale', probability=True, nu=0.5,
                               decision_function_shape='ovr'),
            'parameters': {
                'GSCV': {
                    'kernel': ['rbf'],
                    'nu': [0.8, 0.9, 1],
                    'gamma': [1e-2, 1e-3, 1e-4, 1e-5, 5e-2, 5e-3, 5e-4, 'auto'],
                    'tol': [1e-6, 1e-5, 5e-5, 1e-4, 5e-3, 3e-3, 1e-3, 1e-2, 5e-2]
                },
                'RSCV': {}
            }
        },
        'SGD': {
            'estimator': SGDClassifier(penalty='l2', loss='hinge', alpha=0.0001,
                                       max_iter=5000, tol=1e-3, n_jobs=2),
            'parameters': {
                'GSCV': [{
                    'penalty': ['l2', 'elasticnet'],
                    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                    'alpha': [5e-5, 1e-4, 5e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1],
                    'l1_ratio': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9],
                    'max_iter': [5e5],
                    # 'eta0': [0, 0.0001, 0.001, 0.01, 0.1],
                    # 'learning_rate': ['optimal'],
                    'tol': [1e-3, 5e-3, 1e-4, 1e-5, 1e-2]
                }],
                'RSCV': {}
            }
        },
        'KNN': {
            'estimator': KNeighborsClassifier(n_neighbors=5, weights='uniform',
                                              algorithm='auto', leaf_size=30, p=2),
            'parameters': {
                'GSCV': {
                    'n_neighbors': [3, 4, 5, 6, 7, 8, 10, 12, 15],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                    'leaf_size': [10, 20, 30, 40, 50, 60, 70, 100],
                    'p': [1, 2]
                },
                'RSCV': {}
            }
        },
        'RNN': {
            'estimator': RadiusNeighborsClassifier(radius=1, weights='uniform', algorithm='auto',
                                                   leaf_size=30, p=2, metric='minkowski',
                                                   outlier_label=None),
            'parameters': {
                'GSCV': {
                    'radius': [1, 2, 3, 4, 5, 10, 15, 20, 23, 26, 30, 35, 40],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                    'leaf_size': [10, 20, 30, 50, 70, 100, 200],
                    'p': [1, 2]
                },
                'RSCV': {}
            }
        },
        'GNB': {
            'estimator': GaussianNB(priors=None, var_smoothing=1e-09),
            'parameters': {
                'GSCV': {
                    'var_smoothing': np.dot(np.array([[1e-11, 1e-10, 1e-09, 1e-08, 1e-07]]).T,
                                            np.array([[1, 3, 5, 7]])).flatten()
                },
                'RSCV': {}
            }
        },
        'BNB': {
            'estimator': BernoulliNB(alpha=1.0, binarize=0.5, fit_prior=True, class_prior=None),
            'parameters': {
                'GSCV': {
                    'alpha': [i / 10 for i in range(1, 21, 1)],
                    'binarize': [i / 10 for i in range(1, 10, 1)]
                },
                'RSCV': {}
            }
        },
        'MNB': {
            'estimator': MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None),
            'parameters': {
                'GSCV': {'alpha': [i / 10 for i in range(1, 21, 1)]},
                'RSCV': {}
            }
        },
        'CNB': {
            'estimator': ComplementNB(alpha=1.0, fit_prior=True, class_prior=None, norm=False),
            'parameters': {
                'GSCV': {'alpha': [i / 10 for i in range(1, 21, 1)]},
                'RSCV': {}
            }
        },
        'DT': {
            'estimator': DecisionTreeClassifier(criterion='gini', splitter='best',
                                                class_weight='balanced', min_samples_leaf=1,
                                                max_features=None),
            'parameters': {
                'GSCV': {
                    'max_features': (0.4, 0.5, 0.6, 0.7, 0.8, 'sqrt', 'log2'),
                    'min_samples_leaf': (1, 2, 3),
                    'max_depth': (5, 8, 10, 15, 25, 30, None),
                    'criterion': ['gini', 'entropy']
                },
                'RSCV': {}
            }
        },
        'LR': {
            'estimator': LogisticRegression(random_state=None, solver='liblinear', penalty='l1',
                                            fit_intercept=True, max_iter=10000, l1_ratio=None,
                                            multi_class='auto'),
            'parameters': {
                'GSCV': [
                    {'penalty': ['l1'], 'tol': [1e-3, 1e-4, 1e-5], 'l1_ratio': [None],
                     'solver': ['liblinear', 'saga']},
                    {'penalty': ['l2'], 'tol': [1e-3, 1e-4, 1e-5], 'l1_ratio': [None],
                     'solver': ['lbfgs', 'sag']},
                    {'penalty': ['elasticnet'], 'tol': [1e-3, 1e-4, 1e-5],
                     'l1_ratio': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95], 'solver': ['saga']},
                ],
                'RSCV': {}
            }
        },
        'LRCV': {
            'estimator': LogisticRegressionCV(random_state=None, solver='saga',
                                              penalty='elasticnet', Cs=10, cv=8,
                                              fit_intercept=True, max_iter=6e3, n_jobs=30,
                                              l1_ratios=[0.005, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9],
                                              multi_class='auto'),
            'parameters': {
                'GSCV': [
                    {'penalty': ['elasticnet'],
                     'Cs': [10, np.power(10, np.arange(-4, 4, 0.4)),
                            np.power(10, np.arange(-3, 3, 0.3))],
                     'l1_ratios': [[0.005, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9]],
                     'max_iter': [4e4], 'cv': [10], 'solver': ['saga']},
                ],
                'RSCV': [
                    {'penalty': ['l1'], 'Cs': [np.power(10, np.arange(-2, 4, 0.3))],
                     'l1_ratios': [None], 'max_iter': [6e4], 'cv': [10],
                     'solver': ['liblinear', 'saga']},
                    {'penalty': ['l2'], 'l1_ratios': [None], 'max_iter': [6e4], 'cv': [10],
                     'Cs': [np.power(10, np.arange(-2, 4, 0.3))], 'solver': ['lbfgs', 'sag']},
                    {'penalty': ['elasticnet'], 'Cs': [np.power(10, np.arange(-2, 4, 0.3))],
                     'max_iter': [6e4], 'cv': [10], 'solver': ['saga']},
                ],
            }
        },
        'LassoCV': {
            'estimator': LassoCV(cv=10, alphas=[0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1, 10],
                                 max_iter=20000, n_jobs=-1),
            'parameters': {
                'GSCV': {'n_alphas': [200, 500], 'normalize': [False, True]},
                'RSCV': {}
            }
        },
        'Lasso': {
            'estimator': Lasso(alpha=1, max_iter=15000),
            'parameters': {
                'GSCV': {
                    'alpha': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1, 10],
                    'max_iter': [15000],
                    'normalize': [False, True],
                    'precompute': [False, True]
                },
                'RSCV': {}
            }
        },
        'LLIC': {
            'estimator': LassoLarsIC(criterion='aic', fit_intercept=True, verbose=False,
                                     normalize=True, precompute='auto', max_iter=10000,
                                     eps=2.220446049250313e-16, copy_X=True, positive=False),
            'parameters': {
                'GSCV': {'criterion': ['aic', 'bic']},
                'RSCV': {}
            }
        },
        'ENet': {
            'estimator': ElasticNetCV(cv=10,
                                      alphas=[0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1, 10],
                                      max_iter=20000, random_state=None, n_jobs=20,
                                      l1_ratio=[.01, .05, .1, .3, .5, .7, .9, .98]),
            'parameters': {
                'GSCV': {
                    'n_alphas': [200, 500],
                    'max_iter': [50000],
                    'tol': [1e-3, 1e-4, 1e-5],
                    'normalize': [False, True]
                },
                'RSCV': {}
            }
        },
    }
plt.scatter(X[:, 0], X[:, 1], s=60, c=y, cmap=plt.cm.coolwarm)
plt.xlabel('Feature $x_1$', fontsize=15)
plt.ylabel('Feature $x_2$', fontsize=15)
plt.title("2D Multi-Classification Dataset", fontsize=15)
# plt.show()

clf = LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                           fit_intercept=True, intercept_scaling=1.0, max_iter=200,
                           multi_class='multinomial', n_jobs=1, penalty='l2',
                           random_state=0, refit=True, scoring=None, solver='sag',
                           tol=0.0001, verbose=0)
clf.fit(X, y)


def plot_decision_boundary(pred_func):
    # Set min and max values and give it some padding
    x_min, x_max = X_val[:, 0].min() - 0.5, X_val[:, 0].max() + 0.5
    y_min, y_max = X_val[:, 1].min() - 0.5, X_val[:, 1].max() + 0.5
    h = 0.01
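    # Assumed completion of this well-known plotting pattern (not in the
    # original snippet; y_val is assumed to exist alongside X_val): evaluate
    # pred_func over a grid and draw the decision regions.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.4)
    plt.scatter(X_val[:, 0], X_val[:, 1], s=60, c=y_val, cmap=plt.cm.coolwarm)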
# print(words)
# new_vector = vec.transform(words)
# print(type(new_vector))
# vec = TfidfTransformer()
# vec.fit(cv.transform(words))
# vec.fit(cv.transform(z_words))

lr = LogisticRegression(penalty='l2')
loss = cross_val_score(lr, vec.transform(words), y_train, cv=5, scoring='neg_log_loss')
print('logloss of each fold is: ', -loss)
print('cv logloss is:', -loss.mean())

Cs = [1, 10, 100, 1000]
lrcv = LogisticRegressionCV(Cs=Cs, cv=5, scoring='neg_log_loss', penalty='l1',
                            solver='liblinear', multi_class='ovr')
lrcv.fit(vec.transform(words), y_train)
print(lrcv.scores_)

# Call the logistic regression algorithm
lr = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=100,
                        class_weight={0: 0.28, 1: 0.72})
param = {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0, 2.5, 3.0,
               1.6, 1.7, 1.8, 1.9, 2.1, 2.2, 2.3, 2.4]}
gc_lr = GridSearchCV(lr, param_grid=param, cv=3)
gc_lr.fit(vec.transform(words), y_train)
# In[28]:

X[0]

# In[29]:

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

# In[30]:

logreg = LogisticRegressionCV(cv=10, random_state=0, solver='lbfgs',
                              multi_class='multinomial', max_iter=10000)

# In[31]:

logreg.fit(X_train, y_train)

# In[32]:

print(logreg.coef_)
logreg.fit(X / np.std(X, 0), Y)
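# A short sketch (not in the original): after the refit on X / np.std(X, 0),
# every feature has unit variance, so absolute coefficient size gives a rough
# importance ranking.
importance = np.abs(logreg.coef_).mean(axis=0)  # average magnitude over classes
print(np.argsort(importance)[::-1])  # feature indices, most influential first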
def compute_single_model(j, data_i, data_j, task, model_type, n_trees, n_folds,
                         max_depth, symmetric):
    print("Examining response variable %d out of %d" % (j, np.shape(data_j)[1]))
    if symmetric:
        X = data_i[:, list(range(0, j)) + list(range(j + 1, data_i.shape[1]))]
    else:
        X = data_i
    y = data_j[:, j]
    score = []
    importance = []
    for k in range(0, n_folds):
        if task == "regression":
            rfm = RandomForestRegressor(n_estimators=n_trees, max_depth=max_depth,
                                        max_features='sqrt', n_jobs=-1)
            X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
            rfm.fit(X_train, y_train)
            score.append(rfm.score(X_test, y_test))
            importance.append(rfm.feature_importances_)
        else:
            if "rf" in model_type:
                rfm = RandomForestClassifier(n_estimators=n_trees, max_depth=max_depth,
                                             max_features='sqrt', n_jobs=-1)
                try:
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, train_size=0.8, stratify=y)
                    rfm.fit(X_train, y_train)
                    y_test_matrix = np.eye(len(set(y_test.tolist())))[y_test.tolist()]
                    score.append(roc_auc_score(y_test_matrix, rfm.predict_proba(X_test)))
                    importance.append(rfm.feature_importances_)
                except:
                    score.append(0.)
                    importance.append(np.zeros(np.shape(X)[1]))
            else:
                rfm = LogisticRegressionCV()
                try:
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, train_size=0.8, stratify=y)
                    rfm.fit(X_train, y_train)
                    y_test_matrix = np.eye(len(set(y_test.tolist())))[y_test.tolist()]
                    score.append(roc_auc_score(y_test_matrix, rfm.predict_proba(X_test)))
                    importance.append(rfm.coef_)
                except:
                    score.append(0.)
                    importance.append(np.zeros(np.shape(X)[1]))
    score = np.mean(score)
    importance = np.mean(np.vstack(importance), axis=0)
    if symmetric:
        arr = np.zeros(data_i.shape[1])
        arr[:j] = importance[:j]
        arr[(j + 1):] = importance[j:]
        importance = arr
    return (score, importance)
def run(submission_name):
    print('Prepare datasets...')
    train, X_test = prepare()
    y_train = train['open_account_flg']
    X_train = train.drop('open_account_flg', axis=1)

    # Part 1. Boosting. (Part 2 must be commented out)
    l1_features = {
        'xgb': [feature for feature in X_train.columns
                if feature not in ['monthly_payment', 'monthly_payment_to_income',
                                   'credit_sum_to_income']],
        'nn': [feature for feature in X_train.columns
               if feature not in ['credit_sum_to_income'] and feature[:13] != 'living_region'],
        'rf': [feature for feature in X_train.columns if feature[:13] != 'living_region'],
        'gbm': [feature for feature in X_train.columns
                if feature not in ['monthly_income', 'monthly_payment_to_income']]
    }
    l1_models_pool = {
        'xgb': calibrated(xgb_bag()),
        # 'nn': calibrated(nn_bag()),
        # 'rf': calibrated(rf()),
        'gbm': calibrated(gbm_bag())
    }
    l2_model = LogisticRegressionCV(cv=5, scoring='roc_auc', max_iter=10000, solver='sag',
                                    class_weight='balanced', n_jobs=N_JOBS, random_state=SEED)
    l1_df, cv_score = fit_stacking(l1_models_pool, l2_model, X_train, y_train, X_test,
                                   l1_features=l1_features)
    print()
    print('Predict...')
    pred = l2_model.predict_proba(l1_df)[:, 1]
    # End of part 1

    # Part 2. Over previous submissions (Part 1 must be commented out)
    df = pd.DataFrame({
        'stacked': pd.read_csv(
            'submissions/stacking_xgb_gbm_calibrated_l2_lrcv_calibrated_0.79973117848.csv',
            index_col='_ID_')['_VAL_'].as_matrix(),  # .as_matrix() was removed in pandas 1.0; use .values on newer pandas
        'xgb': load_l1_predictions('xgb')[0],
        'gbm': load_l1_predictions('gbm')[0]
    })
    pred = df.mean(axis=1)
    cv_score = '___'
    # End of part 2

    print()
    print('Build submission...')
    df = pd.read_csv('data/credit_test.csv', sep=';')
    submission = pd.DataFrame({'_ID_': df['client_id'], '_VAL_': pred})
    print('Write submission to file (%s.csv)...' % submission_name)
    submission.to_csv('submissions/%s_%s.csv' % (submission_name, cv_score), index=False)
    print('Done!')
train = pd.concat([train, train_nan], axis=0)
del train_nan
# test
test = pd.concat([test, test_nan], axis=0)
del test_nan

y = train['renewal']
x = train.drop('renewal', axis=1)

ros = over_sampling.ADASYN()
rus = under_sampling.NearMiss()
rcs = combine.SMOTEENN()
rcs2 = combine.SMOTETomek()

log = BaggingClassifier(LogisticRegressionCV(Cs=6))
rf = BaggingClassifier(RandomForestClassifier())
gbc = BaggingClassifier(GradientBoostingClassifier(n_estimators=250, learning_rate=0.01))
sv = SVC(C=0.8, probability=True)

# the original sampler list had a dangling duplicate rcs2 that zip() silently dropped
for sample, sample_name in zip([rcs2, ros, rus, rcs], ['rcs2', 'ros', 'rus', 'rcs']):
    print(sample_name)
    x_rs, y_rs = sample.fit_sample(x, y)
    for model, model_name in zip([log, rf, gbc], ['log', 'rf', 'gbc']):
        model.fit(x_rs, y_rs)
        filename = ('C:/Users/cheekati/Desktop/ml/AV Mck/' + str(model_name)
                    + str(sample_name) + '.pkl')
        f = open(filename, 'wb')
        pickle.dump(model, f)
        print('model complete')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.linear_model import LogisticRegressionCV

# loading iris dataset into memory
iris = sns.load_dataset("iris")
sns.pairplot(iris, hue='species')

# Separating dependent and independent variables
X = iris.values[:, :4]
y = iris.values[:, 4]

# Dividing dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=1)

# creating an object of logistic regression
model = LogisticRegressionCV()
# fitting model with training data
model.fit(X_train, y_train)
print("Accuracy={:.2f}".format(model.score(X_test, y_test)))
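# A short follow-up sketch (not in the original): per-class precision/recall
# on the same held-out split.
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(X_test)))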
        'methods': ['predict', 'predict_proba', 'predict_log_proba', 'score'],
        'dataset': 'classifier',
    },
    {
        'model': LogisticRegression(max_iter=100, multi_class='multinomial'),
        'methods': ['decision_function', 'predict', 'predict_proba',
                    'predict_log_proba', 'score'],
        'dataset': 'classifier',
    },
    {
        'model': LogisticRegressionCV(max_iter=100),
        'methods': ['decision_function', 'predict', 'predict_proba',
                    'predict_log_proba', 'score'],
        'dataset': 'classifier',
    },
    {
        'model': RandomForestRegressor(n_estimators=10),
        'methods': ['predict', 'score'],
        'dataset': 'regression',
    },
    {
        'model': LinearRegression(),
        'methods': ['predict', 'score'],