# df_test = pd.read_csv(data_path('test.csv'))
#
# labels = df['TARGET']
# df_test_id = df_test['ID']
#
# colls = ['saldo_var30', 'var15', 'saldo_var5', 'ind_var30', 'var38', 'saldo_medio_var5_ult3', 'num_meses_var5_ult3', 'saldo_medio_var5_hace3', 'var36', 'num_meses_var39_vig_ult3', 'num_var30', 'num_var5', 'num_var4', 'num_var45_hace2']
# print(sorted(colls))
#
# df = df[colls]
# df_test = df_test[colls]
# poly = PolynomialFeatures(2)
# df = poly.fit_transform(df)
# df_test = poly.transform(df_test)

clf = GradientBoostingClassifier(verbose=3)
# clf = RandomForestClassifier()
clf.fit(df, labels)

scores = cross_validation.cross_val_score(clf, df, labels, cv=5, scoring='roc_auc')
print(scores.mean(), scores)

from src.submission import make_submission
make_submission('gradient_boosting.csv', df_test_id, clf.predict_proba(df_test))
# Let's print the result
percentage_hist = 0
print("Proportion of selected features from each histone marker")
for i in range(5):
    value = histone_marker[i][1] / 2.5
    print("%s : %f" % (histone_marker[i][0], value))
    percentage_hist += value
print("Total : %f " % percentage_hist)

""" V. Improvement of the accuracy with different classifiers"""

# First step: training the classifiers seen in class
classifiers1 = [(RandomForestClassifier(), "Random Forest"),
                (ExtraTreesClassifier(), "Extra-Trees"),
                (AdaBoostClassifier(), "AdaBoost"),
                (GradientBoostingClassifier(), "GB-Trees")]
classifiers3 = [(KNeighborsClassifier(), "KNeighbors")]
Results = []
Predicted_data = []
counter = 0

# Splitting the training data in two in order to test the accuracy
X_train2, X_test2, y_train2, y_test2 = train_test_split(x_train, y_train, test_size=0.2)

# Implementing the 1st group of classifiers
estimators = [100, 500, 1000]
for clf, name in classifiers1:
    for est_val in estimators:
        clf.n_estimators = est_val
        # clf.n_jobs = -1
joblib.dump([elo_bins, mg_quants], blundermodel_dir + 'groups.p')

features = [
    'side', 'halfply', 'moverscore', 'bestmove_is_capture', 'bestmove_is_check',
    'depth', 'seldepth', 'num_bestmoves', 'num_bestmove_changes',
    'bestmove_depths_agreeing', 'deepest_change'
]

modelnum = 0
for elo_name, elo_df in train_df.groupby(train_df['elo_groups']):
    msg('working on elo group %s, of size %i' % (elo_name, elo_df.shape[0]))

    msg('computing perfect-move model')
    gbc = GradientBoostingClassifier(min_samples_split=500,
                                     min_samples_leaf=300,
                                     n_estimators=NUM_ESTIMATORS,
                                     verbose=1,
                                     subsample=0.5,
                                     learning_rate=0.2)
    X = elo_df[features]
    y = (elo_df['clipped_movergain'] == 0)
    gbc.fit(X, y)
    joblib.dump([elo_name, 1.0, gbc], '%s%i.p' % (blundermodel_dir, modelnum))
    modelnum = modelnum + 1

    for mg_quant in mg_quants:
        msg('computing mg_quant %f' % mg_quant)
        gbr = GradientBoostingRegressor(loss='quantile',
                                        alpha=mg_quant,
                                        min_samples_split=500,
                                        min_samples_leaf=300,
                                        n_estimators=NUM_ESTIMATORS,
    'U_behaviors_sum10', 'Item_sale10', 'Item_sale5', 'Item_sale3', 'Item_sale1',
    'car5', 'car4', 'car3', 'car2', 'car1', 'buy5', 'buy4', 'buy3', 'buy2', 'buy1',
    'I_order10', 'I_order5', 'I_order3', 'I_order1', 'I_buyer10', 'I_buyer5',
    'I_buyer3', 'I_buyer1', 'behav1', 'behav2', 'behav3', 'behav4', 'last_time'
]
df_train = pd.read_csv("train_feature.csv")
df_validation = pd.read_csv("validation_feature.csv")

# Data normalization
ui = df_train[["user_id", "item_id"]]
samples = df_train[features]
target = df_train["tag"]
classifier = GradientBoostingClassifier(n_estimators=200,
                                        learning_rate=1.0,
                                        max_depth=5,
                                        random_state=0)
classifier.fit(samples, target)  # Learn from the training data; no return value needed
validation_feature = df_validation[features]
x = classifier.predict(validation_feature)  # Classify the validation data and return the predicted labels
print(x)
validation_ui = df_validation[["user_id", "item_id"]]
validation_ui["tag"] = x
validation_result = validation_ui[validation_ui.tag == 1][["user_id", "item_id"]]
os.chdir('..')
validation_result.to_csv("predict_v_Gbrt.csv", index=False)
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from plot_confusion_matrix import gen_confusion_matrix_figure

X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

xgb = xgboost.XGBClassifier(objective="multi:softprob",
                            nthread=-1,
                            reg_alpha=0.7,
                            reg_lambda=0.05,
                            subsample=0.9)
gbrt = GradientBoostingClassifier(random_state=0)
forest = RandomForestClassifier(n_jobs=-1, random_state=0)
lr = LogisticRegression(C=0.03)
eclf = VotingClassifier(estimators=[('xgboost', xgb), ('gbrt', gbrt),
                                    ('forest', forest), ('logistic regression', lr)],
                        voting='soft',
                        weights=None)

classifier_list = [xgb, gbrt, forest, lr, eclf]
for clf in classifier_list:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    y_train_pred = clf.fit(X_train, y_train).predict(X_train)
    # Compute confusion matrix
    'dateOfBirth', 'popularity'
]]
got_target = got.loc[:, 'isAlive']

X_train, X_test, y_train, y_test = train_test_split(got_data,
                                                    got_target.values.ravel(),
                                                    test_size=0.1,
                                                    random_state=508,
                                                    stratify=got_target)

# Building a gbm
gbm = GradientBoostingClassifier(
    loss='deviance',
    learning_rate=1.5,
    n_estimators=100,
    max_depth=3,
    criterion='friedman_mse',
    warm_start=False,
    random_state=508,
)

gbm_basic_fit = gbm.fit(X_train, y_train)
gbm_basic_predict = gbm_basic_fit.predict(X_test)

# Training and Testing Scores
print('Training Score', gbm_basic_fit.score(X_train, y_train).round(4))
print('Testing Score:', gbm_basic_fit.score(X_test, y_test).round(4))

cv_lr_3 = cross_val_score(gbm, got_data, got_target, cv=3, scoring='roc_auc')
'''
This one almost always gets the best model. In v2, changed to use early stopping,
which has the result of setting n_estimators to a very high value (1000) and fixing
both validation_fraction and n_iter_no_change to results derived from Bayes testing.
max_depth and subsample were also always fixed in Bayes-mode testing. Learning rate
and splitting still show some variability depending on the data type, so we left a
couple of options in GridSearch.

For some reason, it does seem to struggle specifically with homozygous SNVs. I wonder
if the frequency of FP is just low enough to make it a challenge for this model type.
'''
# " Most data scientist see number of trees, tree depth and the learning rate as most crucial parameters" - https://www.datacareer.de/blog/parameter-tuning-in-gradient-boosting-gbm/
#'''
CLASSIFIERS.append((
    'GradientBoosting',
    GradientBoostingClassifier(random_state=0,
                               learning_rate=0.1,
                               loss='exponential',
                               max_depth=4,
                               max_features='sqrt',
                               n_estimators=200),
    {
        'random_state': [0],
        'n_estimators': [1000],  # prior tests: 100, 200; OBSOLETE: since adding n_iter_no_change, just set to a big number
        'max_depth': [6],  # prior tests: 3, 4
        'learning_rate': [0.05, 0.1, 0.5],  # prior tests: 0.01, 0.2; from Bayes mode, all results were in the 0.04-0.2 range with the occasional "high" rate near 0.5
        'loss': ['exponential'],  # prior tests: 'deviance'
        'max_features': ['sqrt'],
        'min_samples_split': [2, 15, 50
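# --- Illustrative sketch (not part of the original classifier list): the early-stopping
# setup described in the note above, where n_estimators is only an upper bound and
# training stops once the held-out validation score stops improving. All parameter
# values and data below are assumptions for demonstration only.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X_demo, y_demo = make_classification(n_samples=2000, n_features=20, random_state=0)
gbc_es = GradientBoostingClassifier(n_estimators=1000,        # upper bound on boosting rounds
                                    learning_rate=0.1,
                                    max_depth=4,
                                    validation_fraction=0.1,   # fraction held out to monitor the loss
                                    n_iter_no_change=10,       # stop after 10 rounds without improvement
                                    random_state=0)
gbc_es.fit(X_demo, y_demo)
print(gbc_es.n_estimators_)  # number of boosting stages actually fitted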
def gradient_boosting_classifier(X_train, y_train):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200, random_state=2)
    model.fit(X_train, y_train)
    return model
def gradient_boosting_classifier(train_x, train_y):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(train_x, train_y)
    print(model.feature_importances_)  # Show each feature's importance; larger means more important
    return model
def main():
    experiment_config = {
        'comment': 'Keel run',
        'experiment_repetitions': 5,
        'n_splits': 5,
        'random_seed': int(os.urandom(1)[0] / 255 * (2**32)),
    }

    classifiers = [
        ('LR', LogisticRegression()),
        ('GBM', GradientBoostingClassifier(), [{'n_estimators': [50, 100, 200]}]),
        ('KNN', KNeighborsClassifier(), [{'n_neighbors': [3, 5, 8]}])
    ]

    oversampling_methods = [
        ('None', None),
        ('RandomOverSampler', RandomOverSampler()),
        ('SMOTE', SMOTE(), [{'k_neighbors': [3, 5, 20]}]),
        ('B1-SMOTE', SMOTE(kind='borderline1'), [{'k_neighbors': [3, 5, 20]}]),
        ('B2-SMOTE', SMOTE(kind='borderline2'), [{'k_neighbors': [3, 5, 20]}]),
        ('KMeansSMOTE', KMeansSMOTE(), [
            {
                'imbalance_ratio_threshold': [1, float('Inf')],
                'density_power': [0, 2, None],  # None corresponds to n_features
                'smote_args': [{'k_neighbors': 3}, {'k_neighbors': 5},
                               {'k_neighbors': 20}, {'k_neighbors': float('Inf')}],
                'kmeans_args': [{'n_clusters': 2}, {'n_clusters': 20},
                                {'n_clusters': 50}, {'n_clusters': 100},
                                {'n_clusters': 250}, {'n_clusters': 500}],
                'use_minibatch_kmeans': [True],
                'n_jobs': [-1]
            },
            # SMOTE limit case
            {
                'imbalance_ratio_threshold': [float('Inf')],
                'kmeans_args': [{'n_clusters': 1}],
                'smote_args': [{'k_neighbors': 3}, {'k_neighbors': 5}],
                'use_minibatch_kmeans': [True],
                'n_jobs': [-1]
            }
        ])
    ]

    datasets = read_csv_dir(cfg['dataset_dir'])
    experiment = BinaryExperiment(
        datasets,
        classifiers,
        oversampling_methods,
        n_jobs=-1,
        experiment_repetitions=experiment_config['experiment_repetitions'],
        random_state=experiment_config['random_seed'],
        n_splits=experiment_config['n_splits'],
        scoring=['geometric_mean_score', 'average_precision', 'roc_auc', 'f1',
                 'fp', 'fn', 'tp', 'tn'])

    with warnings.catch_warnings():
        warnings.filterwarnings(action='ignore',
                                message='Adapting smote_args\.k_neighbors')
        experiment.run()

    path = cfg['results_dir']
    if 'session_id' not in globals():
        session_id = (datetime.utcnow() +
                      timedelta(hours=2, minutes=0)).strftime("%Y-%m-%d %Hh%M")
        os.makedirs('{}/{}'.format(path, session_id))
    experiment.save('{}/{}/experiment.p'.format(path, session_id))

    # stringify oversampling methods
    experiment_config['oversampling_methods'] = re.sub('\\n *', ' ',
                                                       str(oversampling_methods))

    # save experiment config
    pd.Series(experiment_config).to_csv('{}/{}/experiment_config.csv'.format(
        path, session_id))
def feature_selection(self, X, y, method):
    """
    Purpose: select features
    Input:
        X: train data
        y: label
        method: the method to use
    Return:
    """
    X_indices = np.arange(X.shape[-1])
    score = []

    # Removing features with low variance
    # Correlation coefficient
    # SelectKBest(lambda X, Y: np.array(map(lambda x: pearsonr(x, Y), X.T)).T, k=2).fit_transform(data, target)
    # Mutual information
    # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(data, target)

    # Univariate feature selection (for classification)
    if method == 'chi-squared':
        skb = SelectKBest(chi2)
        skb.fit_transform(X, y)
        score = skb.scores_

    # Univariate feature selection (for regression)
    if method == 'f_regression':
        skb = SelectKBest(f_regression)
        skb.fit_transform(X, y)
        score = skb.scores_

    # L1-based feature selection (for classification)
    if method == 'LinearSVC':
        lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
        sfm = SelectFromModel(lsvc, prefit=True)
        X_new = sfm.transform(X)

    # L1-based feature selection (for regression)
    elif method == 'LassoCV':
        lasso = LassoCV().fit(X, y)
        score = lasso.coef_
        sfm = SelectFromModel(lasso, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Tree-based feature selection (for classification)
    elif method == 'ExtraTreesClassifier':
        clf = ExtraTreesClassifier()
        clf = clf.fit(X, y)
        print(clf.feature_importances_)
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Tree-based feature selection (for regression)
    elif method == 'ExtraTreesRegressor':
        clf = ExtraTreesRegressor()
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Tree-based feature selection (for classification)
    elif method == 'GradientBoostingClassifier':
        clf = GradientBoostingClassifier(learning_rate=0.01)
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Tree-based feature selection (for regression)
    elif method == 'GradientBoostingRegressor':
        clf = GradientBoostingRegressor(learning_rate=0.01)
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Print the feature ranking
    indices = np.argsort(score)[::-1]
    print("Feature ranking:")
    for f in X_indices:
        print("feature %d: %s (%f)" %
              (indices[f], self.columns[indices[f]], score[indices[f]]))

    # Draw plot
    plt.figure()
    # plt.bar(indices, score, width=0.2, color='r')
    plt.barh(indices, score, height=0.2, color='r')
    plt.title(method)
    plt.xlabel("score")
    plt.ylabel("feature")
    plt.grid(axis='x')
    plt.show()
    pass
def gbc_y():
    from sklearn.ensemble import GradientBoostingClassifier
    regressor_gb = GradientBoostingClassifier()
    regressor_gb.fit(X_train, y_train)
    y_pred_gb = regressor_gb.predict_proba(X_valid)
    return y_pred_gb[:, 1]
def return_model(mode, **kwargs):
    if inspect.isclass(mode):
        assert getattr(mode, 'fit', None) is not None, 'Custom model family should have a fit() method'
        model = mode(**kwargs)
    elif mode == 'logistic':
        solver = kwargs.get('solver', 'liblinear')
        n_jobs = kwargs.get('n_jobs', None)
        max_iter = kwargs.get('max_iter', 5000)
        model = LogisticRegression(solver=solver, n_jobs=n_jobs,
                                   max_iter=max_iter, random_state=666)
    elif mode == 'Tree':
        model = DecisionTreeClassifier(random_state=666)
    elif mode == 'RandomForest':
        n_estimators = kwargs.get('n_estimators', 50)
        model = RandomForestClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'GB':
        n_estimators = kwargs.get('n_estimators', 50)
        model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'AdaBoost':
        n_estimators = kwargs.get('n_estimators', 50)
        model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'SVC':
        kernel = kwargs.get('kernel', 'rbf')
        model = SVC(kernel=kernel, random_state=666)
    elif mode == 'LinearSVC':
        model = LinearSVC(loss='hinge', random_state=666)
    elif mode == 'GP':
        model = GaussianProcessClassifier(random_state=666)
    elif mode == 'KNN':
        n_neighbors = kwargs.get('n_neighbors', 5)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif mode == 'NB':
        model = MultinomialNB()
    elif mode == 'linear':
        model = LinearRegression(random_state=666)
    elif mode == 'ridge':
        alpha = kwargs.get('alpha', 1.0)
        model = Ridge(alpha=alpha, random_state=666)
    elif 'conv' in mode:
        tf.reset_default_graph()
        address = kwargs.get('address', 'weights/conv')
        hidden_units = kwargs.get('hidden_layer_sizes', [20])
        activation = kwargs.get('activation', 'relu')
        weight_decay = kwargs.get('weight_decay', 1e-4)
        learning_rate = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 1000)
        early_stopping = kwargs.get('early_stopping', 10)
        warm_start = kwargs.get('warm_start', False)
        batch_size = kwargs.get('batch_size', 256)
        kernel_sizes = kwargs.get('kernel_sizes', [5])
        strides = kwargs.get('strides', [5])
        channels = kwargs.get('channels', [1])
        validation_fraction = kwargs.get('validation_fraction', 0.)
        global_averaging = kwargs.get('global_averaging', 0.)
        optimizer = kwargs.get('optimizer', 'sgd')
        if mode == 'conv':
            model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter,
                            learning_rate=learning_rate, weight_decay=weight_decay,
                            validation_fraction=validation_fraction, early_stopping=early_stopping,
                            optimizer=optimizer, warm_start=warm_start, address=address,
                            hidden_units=hidden_units, strides=strides,
                            global_averaging=global_averaging, kernel_sizes=kernel_sizes,
                            channels=channels, random_seed=666)
        elif mode == 'conv_reg':
            model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter,
                            learning_rate=learning_rate, weight_decay=weight_decay,
                            validation_fraction=validation_fraction, early_stopping=early_stopping,
                            optimizer=optimizer, warm_start=warm_start, address=address,
                            hidden_units=hidden_units, strides=strides,
                            global_averaging=global_averaging, kernel_sizes=kernel_sizes,
                            channels=channels, random_seed=666)
    elif 'NN' in mode:
        solver = kwargs.get('solver', 'adam')
        hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,))
        if isinstance(hidden_layer_sizes, list):
            hidden_layer_sizes = list(hidden_layer_sizes)
        activation = kwargs.get('activation', 'relu')
        learning_rate_init = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 5000)
        early_stopping = kwargs.get('early_stopping', False)
        warm_start = kwargs.get('warm_start', False)
        if mode == 'NN':
            model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                  activation=activation, learning_rate_init=learning_rate_init,
                                  warm_start=warm_start, max_iter=max_iter,
                                  early_stopping=early_stopping)
        if mode == 'NN_reg':
            model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                 activation=activation, learning_rate_init=learning_rate_init,
                                 warm_start=warm_start, max_iter=max_iter,
                                 early_stopping=early_stopping)
    else:
        raise ValueError("Invalid mode!")
    return model
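# --- Assumed usage sketch (not in the original module): build the gradient boosting
# variant through the return_model factory above and fit it on synthetic data.
if __name__ == '__main__':
    from sklearn.datasets import make_classification

    X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)
    gb_model = return_model('GB', n_estimators=100)
    gb_model.fit(X_demo, y_demo)
    print(gb_model.score(X_demo, y_demo))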
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

data_loc = 'data/uci_data/poker_hand/poker-hand-training-true.data.txt'
col_names = ['S1', 'C1', 'S2', 'C2', 'S3', 'C3', 'S4', 'C4', 'S5', 'C5', 'hand']
df = pd.read_csv(data_loc, names=col_names)
df_x = df.drop(col_names[-1], axis=1)
df_y = df[col_names[-1]]

'''
We lower the learning rate and increase the number of estimators proportionally.
The parameters we have tuned might not be the optimal values, but they make a good benchmark.
'''
gb_tuned = GradientBoostingClassifier(random_state=1,
                                      learning_rate=0.05,
                                      n_estimators=500,
                                      max_depth=7,
                                      min_samples_split=2,
                                      min_samples_leaf=1,
                                      max_features=10,
                                      subsample=1)
scores = cross_val_score(gb_tuned, df_x, df_y, cv=5, scoring='accuracy')
print(scores)
print("mean: %0.6f, std: %0.6f" % (scores.mean(), scores.std()))
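# --- Optional sketch (an assumption, not the original tuning run): GridSearchCV is
# imported above but unused, so this shows how the learning_rate / n_estimators
# trade-off mentioned in the docstring could be searched. The grid values are illustrative.
param_grid = {'learning_rate': [0.1, 0.05, 0.01],
              'n_estimators': [250, 500, 1000]}
search = GridSearchCV(GradientBoostingClassifier(random_state=1),
                      param_grid, cv=3, scoring='accuracy', n_jobs=-1)
search.fit(df_x, df_y)
print(search.best_params_, search.best_score_)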
# In[58]:

# Boosting

# In[69]:

# Boosting on oversampled data
from sklearn.ensemble import GradientBoostingClassifier

lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=learning_rate,
                                        max_features=2,
                                        max_depth=2,
                                        random_state=0)
    gb_clf.fit(X_t, y_t)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_t, y_t)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))

# In[60]:

gb_clf = GradientBoostingClassifier(n_estimators=20,
                                    learning_rate=0.75,
                                    max_features=2,
                                    max_depth=2,
                                    random_state=0)
gb_clf.fit(X_t, y_t)
print("Learning rate: ", learning_rate)
print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_t, y_t)))
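# In[ ]:

# --- Assumed follow-up cell (not in the original notebook): record the validation
# accuracy for each learning rate so the best one can be selected programmatically
# instead of being read off the printout above.
val_scores = {}
for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate,
                                        max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_t, y_t)
    val_scores[learning_rate] = gb_clf.score(X_test, y_test)
best_lr = max(val_scores, key=val_scores.get)
print("Best learning rate:", best_lr)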
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=4)

# Average CV score on the training set was: 0.7802373007044865
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
    Nystroem(gamma=0.2, kernel="cosine", n_components=10),
    GradientBoostingClassifier(learning_rate=0.5, max_depth=4, max_features=0.45,
                               min_samples_leaf=4, min_samples_split=12,
                               n_estimators=100, subsample=0.8500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def define_clfs_params():
    clfs = {
        'BG': BaggingClassifier(n_estimators=10),
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                         max_depth=6, n_estimators=10),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                 algorithm="SAMME", n_estimators=200),
        'DT': DecisionTreeClassifier(),
        'KNN': KNeighborsClassifier(n_neighbors=3),
        'NB': GaussianNB()
    }

    grid = {
        'BG': {'n_estimators': [10, 50]},
        'RF': {'n_estimators': [1, 10, 100, 1000, 10000],
               'max_depth': [1, 5, 10, 20, 50, 100],
               'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 5, 10],
               'n_jobs': [-1]},
        'LR': {'penalty': ['l1', 'l2'],
               'C': [0.00001, 0.001, 0.1, 1, 10]},
        'SVM': {'C': [0.01, 0.1, 1],
                'kernel': ['linear']},
        'GB': {'n_estimators': [1, 10, 100],
               'learning_rate': [0.01, 0.1, 0.5],
               'subsample': [0.1, 0.5, 1.0],
               'max_depth': [1, 5, 20]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'],
               'n_estimators': [1, 10, 100]},
        'DT': {'criterion': ['gini', 'entropy'],
               'max_depth': [1, 5, 10, 20, 50, 100],
               'min_samples_split': [2, 5, 10]},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']},
        'NB': {}
    }
    return clfs, grid
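# --- Hedged sketch (not part of the original module) of how the clfs/grid pair returned
# above is typically consumed: walk each model's parameter grid, refit, and score.
# Assumes X_train, y_train, X_test, y_test are already defined elsewhere.
from sklearn.model_selection import ParameterGrid

def run_defined_clfs(X_train, y_train, X_test, y_test):
    clfs, grid = define_clfs_params()
    for name, clf in clfs.items():
        for params in ParameterGrid(grid[name]):
            clf.set_params(**params)
            clf.fit(X_train, y_train)
            print(name, params, clf.score(X_test, y_test))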
b_data = Binarizer(threshold=0.5).fit_transform(boston.data)
print(b_data[0:5, :])

# One-hot (dummy) encode the boston target values; returns the encoded data
o_target = OneHotEncoder().fit_transform(boston.target)
print(o_target[0:5])

### Feature selection ###
# Variance threshold method; returns the data after feature selection
# The threshold parameter is the variance cutoff
VarianceThreshold(threshold=3).fit_transform(iris.data)

# Chi-squared test: select the K best features and return the selected data
select_data = SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)

# Recursive feature elimination; returns the data after feature selection
# estimator is the base model
# n_features_to_select is the number of features to keep
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(iris.data, iris.target)

# Feature selection using L1-penalized logistic regression as the base model
SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit_transform(iris.data, iris.target)

# Feature selection using GBDT as the base model
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)
## Explore results

# Scikit-learn classification

## Step 1: Create and fit gradient boosting classifier
parameters = {'n_estimators': 120,
              'learning_rate': 0.12,
              'min_samples_split': 3,
              'min_samples_leaf': 2}

from sklearn.datasets import load_digits
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

gbc = GradientBoostingClassifier(**parameters)

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=28743)

gbc.fit(X_train, y_train)

## Step 2: Initialize Neptune
import neptune

neptune.init('shared/sklearn-integration', api_token='ANONYMOUS')

## Step 3: Create an Experiment
neptune.create_experiment(params=parameters,
    X1_re, y1_re, test_size=0.2, train_size=300)  #, random_state = 0)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2_re, y2_re, test_size=0.2, train_size=300)  #, random_state = 0)

sc = StandardScaler()
X1_train = sc.fit_transform(X1_train)
X2_train = sc.fit_transform(X2_train)

PCA1_train = PCA(n_components=8).fit(X1_train).transform(X1_train)
PCA2_train = PCA(n_components=8).fit(X2_train).transform(X2_train)

y1_train = np.array(y1_train).ravel()
y2_train = np.array(y2_train).ravel()

forest = RandomForestClassifier(max_depth=None)
forestBoost = GradientBoostingClassifier(max_depth=None)
MLP = MLPClassifier()
svm = SVC()
knn = KNeighborsClassifier()

Names = np.vstack((Names1, Names2))
Names = pd.DataFrame(Names, columns=["Name", "ExName"])
mydict = dict(zip(Names.Name, Names.ExName))

X1_columns = pd.DataFrame(X1_column_names, columns=["Name"])
X1_columns = X1_columns.replace(mydict)
X1_column_names = pd.DataFrame(X1_column_names)
Named = np.hstack((X1_column_names, X1_columns))
Named = pd.DataFrame(Named, columns=["Name", "ExName"])
pred = pipe.predict(X_test)

# In[35]:

pipe.predict(['I think I am going to love it here!'])

# In[37]:

accuracy_score(y_test, pred)

# In[64]:

abc = AdaBoostClassifier()
bag = BaggingClassifier()
gbc = GradientBoostingClassifier()
rfc = RandomForestClassifier()
lr = LogisticRegression()

# In[65]:

lr.fit(X_train, y_train)

# In[68]:

lr_pred = lr.predict(X_test)

# In[69]:

abc.fit(X_train, y_train)
clf_RF = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
y_predict = clf_RF.predict(X_test)
fpr_RF, tpr_RF, thr_RF = roc_curve(y_test, y_predict)
pr_RF, rec_RF, thr_RF = precision_recall_curve(y_test, y_predict, pos_label=1)
delta_RF = datetime.now() - startTime
i = list(thr_RF).index(1)
print('\tPrecision: {0}'.format(pr_RF[i]))
print('\tRecall: {0}'.format(rec_RF[i]))
print('\tTime: {0}'.format(delta_RF))

# Run Gradient Boosting
print('Running Gradient Boosting...')
startTime = datetime.now()
clf_GB = GradientBoostingClassifier(n_estimators=100).fit(X_train, y_train)
y_predict = clf_GB.predict(X_test)
fpr_GB, tpr_GB, _ = roc_curve(y_test, y_predict)
pr_GB, rec_GB, thr_GB = precision_recall_curve(y_test, y_predict)
delta_GB = datetime.now() - startTime
i = list(thr_GB).index(1)
print('\tPrecision: {0}'.format(pr_GB[i]))
print('\tRecall: {0}'.format(rec_GB[i]))
print('\tTime: {0}'.format(delta_GB))

# ===========================================================================
# Repeat process with an oversampled balanced dataset
print('Running models with oversampled balanced dataset...')

# Count number of fraud and non-fraud data points
min_idx = y_train == 1
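# --- Optional variant (an assumption, not in the original script): roc_curve and
# precision_recall_curve sweep decision thresholds, so feeding predicted probabilities
# instead of hard labels gives smoother, more informative curves for the GB model above.
y_score_GB = clf_GB.predict_proba(X_test)[:, 1]
fpr_GB_p, tpr_GB_p, _ = roc_curve(y_test, y_score_GB)
pr_GB_p, rec_GB_p, thr_GB_p = precision_recall_curve(y_test, y_score_GB)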
def train_model(datasetvar, dataset):
    x = datasetvar
    y = dataset['Churn'].values
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    print(sss)
    print('Number of splits of the training and test data:', sss.get_n_splits(x, y))

    # Build the training and test sets
    for train_index, test_index in sss.split(x, y):
        print('train:', train_index, 'test:', test_index)
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]  # y is a NumPy array, so use positional indexing
        print('Original feature shape:', x.shape,
              'training feature shape:', x_train.shape,
              'test feature shape:', x_test.shape)
        print('Original label shape:', y.shape,
              'training label shape:', y_train.shape,
              'test label shape:', y_test.shape)

    # Use classification algorithms; ten classifiers are compared here
    Classifier = [['Random Forest', RandomForestClassifier()],
                  ['Support Vector Machine', SVC()],
                  ['LogisticRegression', LogisticRegression()],
                  ['KNN', KNeighborsClassifier(n_neighbors=5)],
                  ['Naive Bayes', GaussianNB()],
                  ['Decision Tree', DecisionTreeClassifier()],
                  ['AdaBoostClassifier', AdaBoostClassifier()],
                  ['GradientBoostingClassifier', GradientBoostingClassifier()],
                  ['XGB', XGBClassifier()],
                  ['CatBoost', CatBoostClassifier(logging_level='Silent')]]

    # Train the models
    Classify_result = []
    names = []
    prediction = []
    for name, classifier in Classifier:
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        f1score = f1_score(y_test, y_pred)  # requires sklearn.metrics.f1_score; kept to match the result index below
        class_eva = pd.DataFrame([recall, precision, f1score])
        Classify_result.append(class_eva)
        name = pd.Series(name)
        names.append(name)
        y_pred = pd.Series(y_pred)
        prediction.append(y_pred)

    # Collect the results
    names = pd.DataFrame(names)
    names = names[0].tolist()
    result = pd.concat(Classify_result, axis=1)
    result.columns = names
    result.index = ['recall', 'precision', 'f1score']
    print(result)

    # Apply the model
    pred_x = datasetvar.tail(10)

    # Extract customerID
    pred_id = telcom_id.tail(10)

    # Use naive Bayes to predict the outcome for the prediction set
    model = GaussianNB()
    model.fit(x_train, y_train)
    pred_y = model.predict(pred_x)

    # Prediction results
    predDf = pd.DataFrame({'customerID': pred_id, 'Churn': pred_y})
    print(predDf)
# Stack base predictions for training the meta model
#stacked_predictions = np.column_stack((rf_fit.predict(x_train), et_fit.predict(x_train), ada_fit.predict(x_train), gb_fit.predict(x_train), svc_fit.predict(x_train)))

# Train the meta model
from sklearn.linear_model import LinearRegression
#meta_model = LinearRegression()
#meta_model.fit(stacked_predictions, t_train)

from sklearn import preprocessing

satsuki = pd.read_csv('haruten.csv', index_col=0)
mm = preprocessing.MinMaxScaler()  # Create the scaler instance
satsuki_seiki = mm.fit_transform(satsuki)

arima = pd.read_csv('arima.csv', index_col=0)

from sklearn.ensemble import VotingClassifier

estimators = [
    ('svc', SVC()),
    ('rf', RandomForestClassifier()),
    ('et', ExtraTreesClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier()),
]

sum = 0
buy = 0

voting = VotingClassifier(estimators)
voting.fit(x, t)
print(voting.predict(satsuki_seiki))
    return df


df = pd.read_csv('drugsCom_raw/drugsComTrain_raw.tsv', sep='\t', index_col=0)
df['date'] = pd.to_datetime(df['date'])
df = rm_sym(df)
df_tem2 = df.sample(20000)
#df_tem2.groupby('rating_cate').size() / df_tem2.groupby('rating_cate').size().sum()

## Generate table of words with their counts
con_vec = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
X_train = con_vec.fit_transform(df_tem2['review'])
y_train = df_tem2['rating_cate']

## Test set
test = pd.read_csv("drugsCom_raw/drugsComTest_raw.tsv", sep='\t', index_col=0)
test = rm_sym(test)
X_test = con_vec.transform(test['review'])
y_test = test['rating_cate']

pickle.dump(con_vec, open("gbc_20000_600_tfidf.sav", 'wb'))

gbc = GradientBoostingClassifier(n_estimators=600)
gbc.fit(X_train, y_train)
y_test_predict = gbc.predict(X_test)
acc = accuracy_score(y_test, y_test_predict)

with open("gbc_20000_600_accuracy.txt", 'w') as outfile:
    outfile.write(str(acc))

pickle.dump(gbc, open("gbc_20000_600_gbc.sav", 'wb'))
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 4
plst = list(param.items()) + [('eval_metric', '[email protected]')]

watchlist = [(xgmat, 'train')]
# boost 10 trees
num_round = 10
print('loading data end, start to boost trees')

print("training GBM from sklearn")
tmp = time.time()
gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
gbm.fit(data, label)
print("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
#raw_input()

print("training xgboost")
threads = [1, 2, 4, 16]
for i in threads:
    param['nthread'] = i
    tmp = time.time()
    plst = list(param.items()) + [('eval_metric', '[email protected]')]
    bst = xgb.train(plst, xgmat, num_round, watchlist)
    print("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))

print('finish training')
from General.Paths import Gitlab_Path
import pandas as pd
from Scoring.scoring_func import f1_scores_plot
import numpy as np
from time import time

fold1_df = load_dataframe(filename='fold1_NA_features.dat')
fold2_df = load_dataframe(filename='fold2_NA_features.dat')
del fold1_df['id']
del fold2_df['id']

n_features = int(len(fold1_df.columns) / 4)

p0 = time()
clf = GradientBoostingClassifier('deviance',
                                 learning_rate=0.05,
                                 n_estimators=100,
                                 max_features=n_features)
clf.fit(fold1_df.iloc[:, 1:], fold1_df.iloc[:, 0])
preds_ens = clf.predict_proba(fold2_df.iloc[:, 1:])[:, 1]
print(time() - p0)

## Ensemble the predictions
true_values = fold2_df['label']
df, best_index = f1_scores_plot(preds_ens, true_values)
df['f1_score'][best_index]  #Li

### Check performance on fold3
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']

dw_cols = [x for x in fold1_df.columns if x[-2:] == 'dw' and x[:3] == 'pca']
for i in corr_mat:
    for j in corr_mat:
        if (i == j):
            continue
        else:
            if (corr_mat[i][j] > 0.2):
                a.add(i)
print(a)

sve = SVC()
sve.fit(data_pd, Y_train)
print(sve.score(data_pd1, Y_test))
print(sve.score(data_pd, Y_train))

grb = GradientBoostingClassifier()
grb.fit(data_pd, Y_train)
print(grb.score(data_pd1, Y_test))
print(grb.score(data_pd, Y_train))

cor_matt = data_pd.corr()
eig_vals, eig_vecs = np.linalg.eig(cor_matt)
#print(eig_vals)
#print(eig_vecs)

'''fitting and transforming pca'''
pca = PCA(n_components=9)
train_features = pca.fit_transform(data_pd)
test_features = pca.transform(data_pd1)

sve1 = SVC()
# List of comments
comments = []

# https://stackoverflow.com/questions/49100615/nltk-detecting-whether-a-sentence-is-interogative-or-not
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()
posts_text = [post.text for post in posts]

# Divide into train (first 80%) and test (last 20%)
train_text = posts_text[:int(len(posts_text) * 0.8)]
test_text = posts_text[int(len(posts_text) * 0.8):]

# Get TF-IDF features
vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             min_df=0.001,
                             max_df=0.7,
                             analyzer='word')
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

y = [post.get('class') for post in posts]
y_train = y[:int(len(posts_text) * 0.8)]
y_test = y[int(len(posts_text) * 0.8):]

gb = GradientBoostingClassifier(n_estimators=400, random_state=0)
gb.fit(X_train, y_train)

question_comments = []
for comment in comments:
    type_of_comment = gb.predict(vectorizer.transform([comment]))
    if (type_of_comment == 'ynQuestion' or type_of_comment == 'whQuestion'
            or '?' in comment):
        question_comments.append(comment)

question_comments
plt.savefig('plt_heatmap_svc.png', bbox_inches='tight')
plt.show()
sns.heatmap(table, mask=mask, vmax=.65, square=True, cmap="RdBu_r")

""" Gradient Boosting --------------------------------------------------------- """

from sklearn.ensemble import GradientBoostingClassifier

clf_fb = GradientBoostingClassifier(n_estimators=200)  # 0.001, 1000
clf_fb.fit(regressors_train_pca, target_train_bin)
target_validation_bin_predicted_gb = clf_fb.predict(regressors_validation_pca)

# Accuracy of predictions
accuracy_score(target_validation_bin, target_validation_bin_predicted_gb)

# Confusion matrix
print(confusion_matrix(target_test.Box_Office_Range_Bins,
                       target_validation_bin_predicted_gb))

""" Neural Network --------------------------------------------------------------