def test_bagging_classifier_with_missing_inputs():
    """Check that BaggingClassifier can accept X with missing/infinite data.

    A pipeline whose first step is ``FunctionTransformer(replace)`` must be
    usable both on its own and as the base estimator of a
    ``BaggingClassifier``.  A pipeline without that step must raise
    ``ValueError`` on the same data, directly and when wrapped in bagging.
    """
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        # ``np.NINF`` was removed in NumPy 2.0; ``-np.inf`` is the same value.
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    # NOTE(review): ``replace`` is defined elsewhere; presumably it swaps the
    # non-finite entries for finite ones - confirm against its definition.
    pipeline = make_pipeline(FunctionTransformer(replace), classifier)
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert y.shape == y_hat.shape
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    with pytest.raises(ValueError):
        pipeline.fit(X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    with pytest.raises(ValueError):
        bagging_classifier.fit(X, y)
def trainModel(x, y):
    """Fit a bagged LightGBM classifier and choose a decision threshold.

    The data is split 80/20 with a fixed seed.  A ``BaggingClassifier``
    wrapping an ``LGBMClassifier`` is fitted on the training part and dumped
    to ``../result/lgb.m``.  The threshold is taken from the training-set ROC
    curve at the point maximising ``tpr - fpr``; F1, recall and precision at
    that threshold are printed for both partitions.

    Returns:
        tuple: ``(model, thres)`` - the fitted ensemble and the threshold.
    """
    from sklearn.ensemble import BaggingClassifier

    def self_metric(y_true, y_pred):
        # Custom evaluation metric for lightgbm/xgboost.  It is not used by
        # the plain ``fit`` call below.
        return 'f1', -f1_score(y_true, 1 * (y_pred >= 0.5)), False

    lgb_params = {
        "num_leaves": 81,
        "n_estimators": 550,
        "learning_rate": 0.2,  # absolutely required parameters
        "class_weight": {
            1: 1,
            0: 1
        },
        "reg_lambda": 2,  # experimental only
    }

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.8, random_state=345)

    booster = LGBMClassifier(**lgb_params)
    model = BaggingClassifier(base_estimator=booster,
                              n_estimators=100,
                              max_samples=0.8,
                              max_features=0.8)
    model.fit(x_train, y_train)
    joblib.dump(model, "../result/lgb.m")

    # The decisive part: pick a sensible threshold for the F1 metric.
    # TODO: sweep many candidate thresholds (2000+) and compute F1 directly
    # for each one to find the best threshold more precisely.
    train_scores = model.predict_proba(x_train)[:, 1]
    test_scores = model.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_train, train_scores)
    thres = thresholds[(tpr - fpr).argmax()]
    print("训练集阈值", thres)

    pre_train = 1 * (train_scores >= thres)
    pre_test = 1 * (test_scores >= thres)
    reports = (
        ("f1_score", f1_score),
        ("recall_score", recall_score),
        ("precision_score", precision_score),
    )
    for metric_name, metric in reports:
        print("train " + metric_name, metric(y_train, pre_train))
        print("test " + metric_name, metric(y_test, pre_test))
    return model, thres
def test_probability():
    """Check that bagged class probabilities are consistent.

    For each configuration the rows of ``predict_proba`` must sum to one and
    ``exp(predict_log_proba)`` must equal ``predict_proba``.
    """
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    cases = (
        # Normal case
        (DecisionTreeClassifier, {}),
        # Degenerate case, where some classes are missing
        (LogisticRegression, {"max_samples": 5}),
    )

    with np.errstate(divide="ignore", invalid="ignore"):
        for make_base, bagging_kwargs in cases:
            ensemble = BaggingClassifier(base_estimator=make_base(),
                                         random_state=rng,
                                         **bagging_kwargs)
            ensemble.fit(X_train, y_train)

            row_totals = np.sum(ensemble.predict_proba(X_test), axis=1)
            assert_array_almost_equal(row_totals, np.ones(len(X_test)))

            from_log = np.exp(ensemble.predict_log_proba(X_test))
            assert_array_almost_equal(ensemble.predict_proba(X_test),
                                      from_log)
def test_bagging_classifier_with_missing_inputs():
    """Check that BaggingClassifier can accept X with missing/infinite data.

    A pipeline whose first step is
    ``FunctionTransformer(replace, validate=False)`` must be usable both on
    its own and as the base estimator of a ``BaggingClassifier``.  A pipeline
    without that step must raise ``ValueError`` on the same data, directly
    and when wrapped in bagging.
    """
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        # ``np.NINF`` was removed in NumPy 2.0; ``-np.inf`` is the same value.
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    # NOTE(review): ``replace`` is defined elsewhere; presumably it swaps the
    # non-finite entries for finite ones - confirm against its definition.
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
def test_parallel_classification():
    """Check that bagging outputs do not depend on ``n_jobs``.

    For ``predict_proba`` (decision trees) and ``decision_function`` (SVC)
    the output of an ensemble fitted with ``n_jobs=3`` is compared between
    ``n_jobs=1`` and ``n_jobs=2`` at prediction time, and against an
    ensemble fitted with ``n_jobs=1`` from the start.
    """
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    def make_svc():
        return SVC(decision_function_shape="ovr")

    scenarios = (
        (DecisionTreeClassifier, "predict_proba"),
        (make_svc, "decision_function"),
    )
    for make_base, method_name in scenarios:
        fitted_parallel = BaggingClassifier(
            make_base(), n_jobs=3, random_state=0).fit(X_train, y_train)

        fitted_parallel.set_params(n_jobs=1)
        out_one_job = getattr(fitted_parallel, method_name)(X_test)
        fitted_parallel.set_params(n_jobs=2)
        out_two_jobs = getattr(fitted_parallel, method_name)(X_test)
        assert_array_almost_equal(out_one_job, out_two_jobs)

        fitted_serial = BaggingClassifier(
            make_base(), n_jobs=1, random_state=0).fit(X_train, y_train)
        out_serial = getattr(fitted_serial, method_name)(X_test)
        assert_array_almost_equal(out_one_job, out_serial)
class MultiLabel(Model):
    """Predict an action with one classifier per ``'-'``-separated part.

    Action names are looked up in ``ACTIONS``.  ``'SHIFT'`` is treated as a
    single-part action and is used as the label for every part.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used.
        self.clf1 = RandomForestClassifier(n_estimators=100)
        self.clf2 = BaggingClassifier(n_jobs=1)
        self.clf3 = BaggingClassifier(n_jobs=1)

    def train(self, x, y):
        """Fit the three classifiers on the parts of the actions in ``y``."""

        def part(action_index, position):
            name = ACTIONS[action_index]
            if name == 'SHIFT':
                return 'SHIFT'
            return name.split('-')[position]

        classifiers = (self.clf1, self.clf2, self.clf3)
        targets = [np.array([part(i, position) for i in y])
                   for position in range(3)]
        for classifier, target in zip(classifiers, targets):
            classifier.fit(x, target)

    def predict(self, x):
        """Return ``(action, alter_action)`` for the sample(s) in ``x``.

        The second and third parts are the most probable non-``'SHIFT'``
        labels of ``clf2`` and ``clf3``; the first part is always
        ``'REDUCE'``.  When ``clf1`` predicts ``'SHIFT'`` the composed action
        is returned as the alternative, otherwise the alternative is
        ``'INVALID'``.
        """

        def best_non_shift(classifier, probabilities):
            ranked = classifier.classes_[np.argsort(probabilities).squeeze()]
            top = ranked[-1]
            return ranked[-2] if top == 'SHIFT' else top

        proba_first = self.clf1.predict_proba(x)
        proba_second = self.clf2.predict_proba(x)
        proba_third = self.clf3.predict_proba(x)

        # fix the action if needed
        composed = '-'.join([
            'REDUCE',
            best_non_shift(self.clf2, proba_second),
            best_non_shift(self.clf3, proba_third),
        ])

        if self.clf1.classes_[np.argmax(proba_first)] == 'SHIFT':
            return 'SHIFT', composed
        return composed, 'INVALID'
def getPredictionResults(trainData, testData, featureSet, qIds=False):
    """Return the positive-class probability for every row of ``testData``.

    Both arrays are indexed as follows by this function: column ``0`` is
    compared against the entries of ``qIds``, column ``-2`` is cast to ``int``
    and used as the label, and column ``-1`` is handed to
    ``extractFeatureValues`` together with ``featureSet``.

    Args:
        trainData: 2-D array of training rows.
        testData: 2-D array of test rows.
        featureSet: Feature specification forwarded to
            ``extractFeatureValues``.
        qIds: Falsy to train one model on all rows; otherwise an iterable of
            ids, and one model is trained and applied per id.

    Returns:
        1-D array of probabilities (column 1 of ``predict_proba``).  In the
        per-id mode, rows whose id is not in ``qIds`` keep the value ``0``.
    """

    def _features(samples):
        # NaN/inf produced by the extractor are replaced by finite numbers.
        return np.nan_to_num(np.array(
            [np.array(extractFeatureValues(sample, featureSet))
             for sample in samples]))

    def _positive_proba(trainRows, testRows):
        trainFeatures = _features(trainRows[:, -1])
        testFeatures = _features(testRows[:, -1])
        trainLabels = trainRows[:, -2].astype('int')
        base = GradientBoostingClassifier(**GBC_PARAMS, loss="deviance")
        clf = BaggingClassifier(base_estimator=base, n_estimators=10,
                                bootstrap=False)
        # Fit once: the original chained ``.fit(...).fit(...)`` simply
        # retrained the whole ensemble a second time.
        clf.fit(trainFeatures, trainLabels)
        return clf.predict_proba(testFeatures)[:, 1]

    if not qIds:
        return _positive_proba(trainData, testData)

    probs = np.zeros(len(testData))
    for qId in qIds:
        trainMask = trainData[:, 0] == qId
        testMask = testData[:, 0] == qId
        probs[testMask] = _positive_proba(trainData[trainMask],
                                          testData[testMask])
    return probs
def test_probability():
    """Check the probability outputs of a bagged classifier.

    The rows of ``predict_proba`` must sum to one, and ``predict_proba`` must
    equal the exponential of ``predict_log_proba``.
    """
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    expected_totals = np.ones(len(X_test))

    def check(fitted):
        totals = np.sum(fitted.predict_proba(X_test), axis=1)
        assert_array_almost_equal(totals, expected_totals)
        assert_array_almost_equal(
            fitted.predict_proba(X_test),
            np.exp(fitted.predict_log_proba(X_test)))

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        tree_bagging = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(), random_state=rng)
        check(tree_bagging.fit(X_train, y_train))

        # Degenerate case, where some classes are missing
        tiny_bagging = BaggingClassifier(
            base_estimator=LogisticRegression(), random_state=rng,
            max_samples=5)
        check(tiny_bagging.fit(X_train, y_train))
def adaboost_train(train_file, test_file):
    """Train a bagged linear model and write test-set probabilities to CSV.

    Despite its name, this function fits a ``BaggingClassifier``.  Training
    and test matrices are scaled together (without centring), the 50000 best
    features by chi-squared score are selected on the training rows, and an
    80/20 split of the training rows is used to print ROC AUC scores.
    Probabilities for the test file are written to
    ``/home/chuangxin/Bagging_result.csv``.

    Args:
        train_file: Path passed to ``readFile``; must yield ``(_, x, y)``.
        test_file: Path passed to ``readFile``; must yield ``(ids, x)``.

    Returns:
        The fitted ``BaggingClassifier``.
    """
    from sklearn.preprocessing import scale
    from sklearn.feature_selection import SelectKBest, chi2
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # older releases only ship sklearn.cross_validation
        from sklearn.cross_validation import train_test_split
    from sklearn.ensemble import BaggingClassifier
    from sklearn.metrics import roc_auc_score

    _, x, y = readFile(train_file)
    print('reading done.')
    ts = x.shape[0]
    ids, x2 = readFile(test_file)  # ``ids`` avoids shadowing builtin ``id``
    print(x.shape)
    print(x2.shape)

    # Scale both sets together so they share the same per-feature scale.
    x = np.concatenate((x, x2))
    print('concatenate done.')
    x = scale(x, with_mean=False)
    print('scale done.')
    x2 = x[ts:]
    x = x[0:ts]

    # Fit the selector on the labelled rows and apply it to BOTH matrices.
    # Previously only ``x`` was reduced, so ``x2`` kept a different number of
    # features than the model was trained on.
    selector = SelectKBest(chi2, k=50000).fit(x, y)
    x = selector.transform(x)
    x2 = selector.transform(x2)

    tmp_array = np.arange(x.shape[0])
    train_i, test_i = train_test_split(tmp_array, train_size=0.8,
                                       random_state=500)
    train_x = x[train_i]
    test_x = x[test_i]
    train_y = y[train_i]
    test_y = y[test_i]

    # NOTE(review): ``LR`` is defined elsewhere - presumably an alias of
    # LogisticRegression; confirm.
    bagging = BaggingClassifier(LR(penalty='l2', dual=True), n_estimators=10,
                                max_samples=0.6, max_features=0.6)
    bagging.fit(train_x, train_y)
    print('train done.')

    res = bagging.predict(train_x)
    print(res)

    res = bagging.predict_proba(train_x)
    print(res)
    score = roc_auc_score(train_y, res[:, 1])
    print(score)
    print('-----------------------------------------')
    print(res[:, 1])

    res = bagging.predict_proba(test_x)
    score = roc_auc_score(test_y, res[:, 1])
    print(score)

    y = bagging.predict_proba(x2)
    output = pd.DataFrame(data={"id": ids, "sentiment": y[:, 1]})
    output.to_csv("/home/chuangxin/Bagging_result.csv", index=False,
                  quoting=3)
    return bagging
def test_parallel_classification():
    """Check that bagging outputs do not depend on ``n_jobs``.

    Covers ``predict_proba`` with decision trees and ``decision_function``
    with SVC, and checks that ``decision_function`` raises ``ValueError``
    with the expected message when given one extra feature column.
    """
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    # predict_proba
    tree_ensemble = BaggingClassifier(
        DecisionTreeClassifier(), n_jobs=3, random_state=0)
    tree_ensemble.fit(X_train, y_train)

    tree_ensemble.set_params(n_jobs=1)
    proba_one_job = tree_ensemble.predict_proba(X_test)
    tree_ensemble.set_params(n_jobs=2)
    proba_two_jobs = tree_ensemble.predict_proba(X_test)
    assert_array_almost_equal(proba_one_job, proba_two_jobs)

    serial_trees = BaggingClassifier(
        DecisionTreeClassifier(), n_jobs=1, random_state=0)
    serial_trees.fit(X_train, y_train)
    assert_array_almost_equal(proba_one_job, serial_trees.predict_proba(X_test))

    # decision_function
    svc_ensemble = BaggingClassifier(
        SVC(gamma='scale', decision_function_shape='ovr'),
        n_jobs=3, random_state=0)
    svc_ensemble.fit(X_train, y_train)

    svc_ensemble.set_params(n_jobs=1)
    decisions_one_job = svc_ensemble.decision_function(X_test)
    svc_ensemble.set_params(n_jobs=2)
    decisions_two_jobs = svc_ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions_one_job, decisions_two_jobs)

    # One extra all-zero column makes the feature count mismatch.
    extra_column = np.zeros((X_test.shape[0], 1))
    X_err = np.hstack((X_test, extra_column))
    expected_message = ("Number of features of the model "
                        "must match the input. Model n_features is {0} "
                        "and input n_features is {1} "
                        "".format(X_test.shape[1], X_err.shape[1]))
    assert_raise_message(ValueError, expected_message,
                         svc_ensemble.decision_function, X_err)

    serial_svc = BaggingClassifier(
        SVC(gamma='scale', decision_function_shape='ovr'),
        n_jobs=1, random_state=0)
    serial_svc.fit(X_train, y_train)
    assert_array_almost_equal(decisions_one_job,
                              serial_svc.decision_function(X_test))
def test_parallel_classification():
    """Check parallel classification.

    Outputs of ``predict_proba`` and ``decision_function`` must be the same
    for different ``n_jobs`` values, and ``decision_function`` must reject
    input with a wrong number of features.
    """
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    def bag(base, n_jobs):
        model = BaggingClassifier(base, n_jobs=n_jobs, random_state=0)
        return model.fit(X_train, y_train)

    def query(model, method, n_jobs):
        # Switch the job count on the fitted model, then evaluate on X_test.
        model.set_params(n_jobs=n_jobs)
        return getattr(model, method)(X_test)

    # predict_proba
    tree_bag = bag(DecisionTreeClassifier(), 3)
    y1 = query(tree_bag, "predict_proba", 1)
    y2 = query(tree_bag, "predict_proba", 2)
    assert_array_almost_equal(y1, y2)

    y3 = bag(DecisionTreeClassifier(), 1).predict_proba(X_test)
    assert_array_almost_equal(y1, y3)

    # decision_function
    svc_bag = bag(SVC(gamma='scale', decision_function_shape='ovr'), 3)
    decisions1 = query(svc_bag, "decision_function", 1)
    decisions2 = query(svc_bag, "decision_function", 2)
    assert_array_almost_equal(decisions1, decisions2)

    X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
    assert_raise_message(ValueError,
                         "Number of features of the model "
                         "must match the input. Model n_features is {0} "
                         "and input n_features is {1} "
                         "".format(X_test.shape[1], X_err.shape[1]),
                         svc_bag.decision_function,
                         X_err)

    decisions3 = bag(SVC(gamma='scale', decision_function_shape='ovr'),
                     1).decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions3)
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, depth=4, lrate=.1, n_fold=5, n_bag=50, subrow=.5,
                  subcol=.8):
    """Cross-validate and apply a bagged XGBoost classifier.

    Out-of-fold probabilities for the training file and probabilities for the
    test file (after refitting on all training rows) are written with
    ``np.savetxt``.  Progress and AUC values go to a log file whose name
    encodes the hyper-parameters and the training file's base name.

    Args:
        train_file: Path of the training data, read with ``load_data``.
        test_file: Path of the test data, read with ``load_data``.
        predict_valid_file: Output path for out-of-fold predictions.
        predict_test_file: Output path for test predictions.
        n_est: ``n_estimators`` of the XGBoost model.
        depth: ``max_depth`` of the XGBoost model.
        lrate: ``learning_rate`` of the XGBoost model.
        n_fold: Number of stratified folds.
        n_bag: Number of bagged models.
        subrow: ``max_samples`` of the bagging ensemble.
        subcol: ``max_features`` of the bagging ensemble.
    """
    # Strips the last four characters of the file name (e.g. an extension).
    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='xg_bag{}_{}_{}_{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, lrate, subrow, subcol,
                            feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    xg = xgb.XGBClassifier(max_depth=depth, learning_rate=lrate,
                           n_estimators=n_est, colsample_bytree=.8,
                           subsample=.5, nthread=4)
    clf = BG(xg, n_estimators=n_bag, max_samples=subrow, max_features=subcol)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    # Probabilities need a floating-point buffer.  ``np.zeros_like(y)`` would
    # inherit the label dtype and truncate every value to 0 for int labels.
    p_val = np.zeros(np.shape(y), dtype=float)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def test_parallel_classification():
    """Check parallel classification.

    ``predict_proba`` and ``decision_function`` must give the same output
    whatever ``n_jobs`` is, and ``decision_function`` must raise a matching
    ``ValueError`` when the input has one feature too many.
    """
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    def fit_bagging(base, jobs):
        return BaggingClassifier(
            base, n_jobs=jobs, random_state=0).fit(X_train, y_train)

    # predict_proba
    ensemble = fit_bagging(DecisionTreeClassifier(), 3)
    probabilities = {}
    for jobs in (1, 2):
        ensemble.set_params(n_jobs=jobs)
        probabilities[jobs] = ensemble.predict_proba(X_test)
    assert_array_almost_equal(probabilities[1], probabilities[2])

    ensemble = fit_bagging(DecisionTreeClassifier(), 1)
    assert_array_almost_equal(probabilities[1],
                              ensemble.predict_proba(X_test))

    # decision_function
    ensemble = fit_bagging(SVC(decision_function_shape='ovr'), 3)
    decisions = {}
    for jobs in (1, 2):
        ensemble.set_params(n_jobs=jobs)
        decisions[jobs] = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions[1], decisions[2])

    X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
    err_msg = (f"Number of features of the model must match the input. Model "
               f"n_features is {X_test.shape[1]} and input n_features is "
               f"{X_err.shape[1]} ")
    with pytest.raises(ValueError, match=err_msg):
        ensemble.decision_function(X_err)

    ensemble = fit_bagging(SVC(decision_function_shape='ovr'), 1)
    assert_array_almost_equal(decisions[1],
                              ensemble.decision_function(X_test))
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, depth=4, lrate=.1, n_fold=5):
    """Cross-validate and apply a bagged gradient-boosting classifier.

    Both files are read with ``load_svmlight_file`` and converted to dense
    arrays.  Out-of-fold probabilities and test probabilities (after
    refitting on all training rows) are written with ``np.savetxt``.

    Args:
        train_file: Path of the svmlight training file.
        test_file: Path of the svmlight test file.
        predict_valid_file: Output path for out-of-fold predictions.
        predict_test_file: Output path for test predictions.
        n_est: ``n_estimators`` of the boosting model.
        depth: ``max_depth`` of the boosting model.
        lrate: ``learning_rate`` of the boosting model.
        n_fold: Number of stratified folds.
    """
    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    # ``toarray`` returns a plain ndarray; ``todense`` returns ``np.matrix``,
    # which recent scikit-learn releases refuse as estimator input.
    X = X.toarray()
    X_tst = X_tst.toarray()

    logging.info('Validation...')
    gbm = GBM(max_depth=depth, learning_rate=lrate, n_estimators=n_est,
              random_state=2015)
    clf = BG(base_estimator=gbm, n_estimators=5, max_samples=0.8,
             max_features=0.8, bootstrap=True, bootstrap_features=True,
             random_state=42, verbose=0)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    # Probabilities need a floating-point buffer regardless of y's dtype.
    p_val = np.zeros(np.shape(y), dtype=float)
    lloss = 0.
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        lloss += log_loss(y[i_val], p_val[i_val])

    # NOTE(review): this is the SUM of the per-fold losses, not their mean.
    logging.info('Log Loss = {:.4f}'.format(lloss))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def main():
    """Train a bagged random forest on booking rows and write a submission.

    ``input/train.csv`` is read in chunks of 100000 rows and only rows with
    ``is_booking == 1`` are kept.  The model is then applied chunk by chunk to
    ``input/test.csv``; ``get5Best`` turns each row of class probabilities
    into the ``hotel_cluster`` value written to
    ``submission_random_forest.csv``.
    """
    # The competition datafiles are in the directory /input
    # Read output csv format in case the file does not exists
    submit = pd.read_csv('sample_submission.csv')

    # Training cols
    print("Loading training csv.")
    feature_cols = ['site_name', 'user_location_region', 'is_package',
                    'srch_adults_cnt', 'srch_children_cnt',
                    'srch_destination_id', 'hotel_market', 'hotel_country']
    train_cols = feature_cols + ['hotel_cluster']
    train_chunk = pd.read_csv('input/train.csv', chunksize=100000)
    print("Training csv loaded.")

    # Collect the filtered chunks and concatenate once: concatenating inside
    # the loop re-copies everything read so far on every iteration, and
    # seeding with an empty object-dtype frame can upcast the columns.
    booked_chunks = []
    for chunk in train_chunk:
        booked_chunks.append(chunk.loc[chunk['is_booking'] == 1, train_cols])
        print("Chunk done")
    if booked_chunks:
        train = pd.concat(booked_chunks)
    else:
        train = pd.DataFrame(columns=train_cols)

    # Load each column
    x_train = train[feature_cols].values
    y_train = train['hotel_cluster'].values

    # Run RandomForest on training data
    print("Training RandomForest.")
    rf = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=4)
    bclf = BaggingClassifier(rf, n_estimators=2, n_jobs=4)
    bclf.fit(x_train, y_train)
    print("Training done.")

    print("Loading testing csv.")
    test_chunk = pd.read_csv('input/test.csv', chunksize=100000)
    print("Begin testing each chunk.")

    # Read each chunk to test; probabilities are stacked once at the end.
    chunk_probas = []
    for i, chunk in enumerate(test_chunk):
        test_X = np.nan_to_num(chunk[feature_cols].values)
        chunk_probas.append(bclf.predict_proba(test_X))
        print("Chunk id: " + str(i))
    predict = np.concatenate(chunk_probas) if chunk_probas else np.array([])

    submit['hotel_cluster'] = np.apply_along_axis(get5Best, 1, predict)
    submit.to_csv('submission_random_forest.csv', index=False)
def bagging(X_train, X_test, y_train, y_test):
    """Fit a 100-estimator ``BaggingClassifier`` and print its scores.

    Prints 5-fold cross-validated precision and accuracy on the training
    data, followed by precision and accuracy on the test data.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The classifier fitted on ``X_train`` / ``y_train``.
    """
    from sklearn.ensemble import BaggingClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import precision_score
    import numpy as np

    bagc = BaggingClassifier(n_estimators=100)
    bagc.fit(X_train, y_train)
    # The probability prediction that used to be computed here was never
    # read (its only consumer, a ROC plot, is disabled), so it was dropped.
    y_preds_bagc_bin = bagc.predict(X_test)

    # cross_val_score fits clones, so ``bagc`` itself stays fitted as above.
    bagc_prec = np.mean(
        cross_val_score(bagc, X_train, y_train, scoring='precision', cv=5))
    bagc_acc = np.mean(
        cross_val_score(bagc, X_train, y_train, scoring='accuracy', cv=5))
    bagc_test_prec = precision_score(y_test, y_preds_bagc_bin)
    bagc_test_acc = accuracy_score(y_test, y_preds_bagc_bin)

    print("The cross validated precision score is {:0.3}".format(bagc_prec))
    print("The cross validated accuracy score is {:0.3}".format(bagc_acc))
    print("The test precision score is {:0.3}".format(bagc_test_prec))
    print("The test accuracy score is {:0.3}".format(bagc_test_acc))
    return bagc
def svm_ensemble(train_x, train_y, test_x, test_y):
    """Evaluate a bagging ensemble of grid-searched linear SVMs.

    Each bagged member is a ``GridSearchCV`` over ``C`` for a linear,
    probability-enabled, class-balanced ``SVC``.  ``ESTIMATOR`` members are
    trained, each on 20% of the samples.

    Args:
        train_x: Training features (2-D).
        train_y: Training labels.
        test_x: Test features.
        test_y: Test labels.

    Returns:
        tuple: ``(tpr_values, fpr_values, roc_auc, acc, recall, svm_imp)``.
        The first three come from ``get_auc``, ``acc`` is the accuracy,
        ``recall`` the per-class recall, and ``svm_imp`` the mean absolute
        coefficient per feature over all members.
    """
    # Set the parameters by cross-validation (default:5-fold)
    tuned_svm = [{'C': [1, 10, 100, 1000]}]
    svm_clf = GridSearchCV(
        svm.SVC(kernel="linear", probability=True, class_weight="balanced"),
        tuned_svm, scoring="accuracy")
    svm_bagging = BaggingClassifier(svm_clf, n_estimators=ESTIMATOR,
                                    max_samples=0.2)
    svm_bagging.fit(train_x, train_y)

    probas = svm_bagging.predict_proba(test_x)[:, 1]
    tpr_values, fpr_values, roc_auc = get_auc(probas, test_y)

    y_true, y_pred = test_y, svm_bagging.predict(test_x)
    recall = recall_score(y_true, y_pred, average=None)
    acc = accuracy_score(y_true, y_pred)

    # Size the buffer from the data instead of a hard-coded 22 features.
    n_features = np.shape(train_x)[1]
    svm_imp = np.zeros((ESTIMATOR, n_features))
    for i, grid_base in enumerate(svm_bagging.estimators_):
        base = grid_base.best_estimator_
        # Each member's best model is refitted on the FULL training set
        # before its coefficients are read.
        base.fit(train_x, train_y)
        svm_imp[i] = abs(base.coef_)
    svm_imp = np.mean(svm_imp, axis=0)
    return tpr_values, fpr_values, roc_auc, acc, recall, svm_imp
def vipsClassification(featureSetup, testSet, featureSets, resFileName):
    """Train on the VIPS training set, score ``testSet`` and report results.

    The last column of the loaded test data is overwritten with the predicted
    positive-class probabilities, and the rows are written to
    ``<testSet>_Error_Analysis.csv``.  A report is then produced by
    ``questionReporting``.

    Args:
        featureSetup: Base name of the JSON feature files to load.
        testSet: Name of the test-set directory; also used in output names.
        featureSets: Keys of ``FEATURE_SETS`` whose entries are concatenated.
        resFileName: Prefix of the ``_qRep.csv`` report file.
    """
    featureSet = []
    for feat in featureSets:
        featureSet += FEATURE_SETS[feat]

    featurePath = VIPS_FEATURES_PATH
    resPath = VIPS_RES_PATH

    # The question ids of the training set are returned but not needed here.
    trainData, _ = loadJsonData(
        join(featurePath, "train", featureSetup + ".json"))
    testData, testQids = loadJsonData(
        join(featurePath, testSet, featureSetup + ".json"))
    res = [([], testData, testSet, [])]

    trainFeatures = np.nan_to_num(np.array(
        [np.array(extractFeatureValues(trainSample, featureSet))
         for trainSample in trainData[:, -1]]))
    trainLabels = trainData[:, -2].astype('int')
    base = GradientBoostingClassifier(**GBC_PARAMS, loss="deviance",
                                      subsample=1)
    clf = BaggingClassifier(base_estimator=base, n_jobs=8, n_estimators=8,
                            bootstrap=False).fit(trainFeatures, trainLabels)

    testFeatures = np.nan_to_num(np.array(
        [np.array(extractFeatureValues(testSample, featureSet))
         for testSample in testData[:, -1]]))
    probs = clf.predict_proba(testFeatures)[:, 1]

    # Replace the raw sample column by the prediction before dumping rows.
    testData[:, -1] = probs
    # ``newline=""`` is what the csv module requires of its file objects;
    # without it, extra blank lines appear on platforms that translate "\n".
    with open(join(resPath, testSet + "_Error_Analysis.csv"), "w",
              newline="") as f:
        featureWriter = csv.writer(f)
        featureWriter.writerows(testData)

    res[0][0].append(probs)
    res[0][3].append(testSet)
    resFile = join(resPath, resFileName + "_qRep.csv")
    questionReporting(res, resFile, testQids)
def BaggingClassifierPhase(fullData):
    """Fit a default ``BaggingClassifier`` and report training-set metrics.

    Features are columns ``0 .. NewDimension[1] - 2`` of ``fullData`` (cast
    to float) and the target is column ``NewDimension[1]``.  The weighted F1
    score, the confusion matrix and 10-fold cross-validation scores are
    printed, ``findROC`` is called, and the mean predicted probability of
    column 1 of ``predict_proba`` is printed and returned.

    Args:
        fullData: 2-D array holding features and target.

    Returns:
        float: Mean of ``predict_proba(Features)[:, 1]``.
    """
    Features = fullData[:, 0:NewDimension[1] - 1].astype(float)
    Target = fullData[:, NewDimension[1]]
    Bagmodel = BaggingClassifier()
    Bagmodel.fit(Features, Target)
    print(
        "Bagging Classifier is built and making predictions and calculating score of the model."
    )
    TargetMat = np.ravel(Target)  # Converts into 1D numpy.array
    predicted = Bagmodel.predict(Features)
    print(
        "\nf1_score:\n",
        metrics.f1_score(Target, predicted, pos_label='1.0',
                         average='weighted'))
    print("\nConfusion matrix:\n   0.0\t 1.0\n",
          metrics.confusion_matrix(TargetMat, predicted))
    print("Calculating the score of the model for 10-folds.")
    print("\nScore of the model for each of the 10 folds:\n",
          cross_val_score(Bagmodel, Features, TargetMat, cv=10))
    findROC(TargetMat, predicted)

    probability = Bagmodel.predict_proba(Features)
    # Column 1 is the probability of the SECOND entry of ``classes_`` (the
    # old comment said "Flag = 0", which would be column 0).
    AverageProbability = float(np.mean(probability[:, 1]))
    print("\nAverage probability of deals being Captured is: ",
          AverageProbability)
    return AverageProbability
def othertest(precisionk, draw='False'):
    """Evaluate a bagged logistic regression on the cleaned data set.

    Reads ``./data/cleaned_knnimpute.csv``, indexes it by ``sid``, drops rows
    whose ``Y`` is NaN, makes a random 80/20 split, keeps the columns chosen
    by ``precisionCol(train, precisionk)`` plus ``Y``, and prints the test
    ROC AUC.

    Args:
        precisionk: Forwarded to ``precisionCol`` for column selection.
        draw: The string ``'True'`` to also plot and save the AUC figure.
    """
    cleandata = pd.read_csv("./data/cleaned_knnimpute.csv")
    cleandata.index = cleandata.sid
    cleandata = cleandata.drop('sid', axis=1)
    cleandata = cleandata[~np.isnan(cleandata['Y'])]

    # After c is chosen, use this to draw AUC plot
    train_id, test_id = train_test_split(cleandata.index,
                                         test_size=0.2)  # test_ratio = 0.2
    # The ids are index labels, so label-based ``.loc`` replaces ``.ix``,
    # which no longer exists in current pandas.
    train = cleandata.loc[train_id]
    test = cleandata.loc[test_id]

    coltest = list(precisionCol(train, precisionk))
    coltest.append('Y')
    train = train[coltest]
    test = test[coltest]

    model = BaggingClassifier(
        base_estimator=linear_model.LogisticRegression(),
        n_estimators=100, max_features=200, n_jobs=-1)
    model.fit(train.drop('Y', axis=1), train['Y'])

    test_X = test.drop('Y', axis=1)
    fpr, tpr, thresholds = roc_curve(test['Y'],
                                     model.predict_proba(test_X)[:, 1])
    print(auc(fpr, tpr))

    if draw == 'True':
        plotAUC(test['Y'], model.decision_function(test_X),
                'Gradient Boosting')
        plt.savefig("testnorm_randomforest.png", dpi=120)
def query_by_bagging(X, y, current_model, batch_size, rng, base_model=SVC(C=1, kernel='linear'), n_bags=5, method="KL", D=None):
    """
    Select unlabeled samples by disagreement of a bagged committee.

    A ``BaggingClassifier`` of ``n_bags`` copies of ``base_model`` is fitted
    on the samples selected by ``y.known`` and evaluated on the rest.

    :param X: Feature matrix, indexable by the boolean mask ``y.known``
    :param y: Labels exposing ``known`` (mask) and ``unknown_ids``
    :param current_model: Unused by this strategy
    :param batch_size: Number of samples to select
    :param rng: Passed as ``random_state`` of the bagging ensemble
    :param base_model: Model that will be **fitted every iteration**
    :param n_bags: Number of bags on which train n_bags models
    :param method: 'entropy' or 'KL'
    :param D: Unused by this strategy
    :return: Tuple of the selected ids (taken from ``y.unknown_ids``) and the
        fitness of every unlabeled sample divided by the maximum fitness
    """
    assert method == 'entropy' or method == 'KL'
    eps = 0.0000001
    if method == 'KL':
        assert hasattr(base_model, 'predict_proba'), "Model with probability prediction needs to be passed to this strategy!"

    unknown = np.invert(y.known)
    clfs = BaggingClassifier(base_model, n_estimators=n_bags, random_state=rng)
    clfs.fit(X[y.known], y[y.known])

    # ``eps`` keeps every logarithm below finite.  It used to be applied in
    # the entropy branch only, so a zero probability made the KL fitness NaN.
    pc = clfs.predict_proba(X[unknown]) + eps

    # Settles page 17
    if method == 'entropy':
        # Sum of p*log(p) is the negative entropy: smallest = most uncertain.
        fitness = np.sum(pc * np.log(pc), axis=1)
        ids = np.argsort(fitness)[:batch_size]
    elif method == 'KL':
        p = np.array([clf.predict_proba(X[unknown])
                      for clf in clfs.estimators_]) + eps
        # Mean KL divergence of each member from the committee consensus.
        fitness = np.mean(np.sum(p * np.log(p / pc), axis=2), axis=0)
        ids = np.argsort(fitness)[-batch_size:]

    return y.unknown_ids[ids], fitness/np.max(fitness)
class BaggingClassifierImpl():
    """Thin wrapper that forwards every call to a wrapped ``SKLModel``.

    The constructor arguments are stored in ``_hyperparams`` (with
    ``base_estimator`` passed through ``make_sklearn_compat``) and used to
    build the wrapped model.
    """

    def __init__(self, base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):
        self._hyperparams = dict(
            base_estimator=make_sklearn_compat(base_estimator),
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            oob_score=oob_score,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        ``y`` is only forwarded when it is not ``None``.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict(X)``."""
        model = self._wrapped_model
        return model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)``."""
        model = self._wrapped_model
        return model.predict_proba(X)

    def decision_function(self, X):
        """Return the wrapped model's ``decision_function(X)``."""
        model = self._wrapped_model
        return model.decision_function(X)
def cv_layer_1(clf_arguments, clf_type, datasets, non_clf_arguments):
    """Run the outer cross-validation loop as a generator.

    For every ``(train, test)`` pair in ``datasets`` the inner layer
    ``cv_layer_2`` chooses the classifier arguments on the training part.
    A classifier built with them (wrapped in ``BaggingClassifier`` when
    ``non_clf_arguments["bagging"]`` is truthy) is fitted and scored on the
    test part with ``interpret_score``.

    Yields:
        tuple: A deep copy of ``(score, best_arg)`` for each outer fold.
    """
    print("running %s experiment" % clf_type.__name__)
    use_bagging_key = "bagging"
    for outer_train, outer_test in datasets:
        train_X, train_y = outer_train
        test_X, test_y = outer_test

        inner_dataset = (train_X, train_y)
        _, best_arg = cv_layer_2(clf_arguments, clf_type, inner_dataset,
                                 non_clf_arguments)

        model = clf_type(**best_arg)
        if non_clf_arguments[use_bagging_key]:
            model = BaggingClassifier(model)
        model.fit(train_X, train_y)

        probabilities = model.predict_proba(test_X)
        weights = compute_validation_weights(test_y)
        score = interpret_score(probabilities, test_y,
                                validation_weights=weights, show_roc=True)
        result = (score, best_arg)
        yield copy.deepcopy(result)
def Model_2(train, test):
    '''
    Train a bagged XGBoost model and save its predictions as CSV files.

    Positive-class probabilities go to "Submission_2.csv" and hard class
    predictions to "Prediction_2.csv", both next to the "ID" column of test.

    train : Training set
    test : Test set
    '''

    def featurize(sequence):
        # Composition vector from AAC, then mass per residue, charge and
        # isoelectric point appended in that order.
        length = len(sequence)
        features = AAC(sequence)
        features = features + [mass.calculate_mass(sequence=sequence) / length]
        features = features + [electrochem.charge(sequence, length)]
        features = features + [ProteinAnalysis(sequence).isoelectric_point()]
        return features

    # Preprocessing
    # NOTE(review): the column names differ between the two frames
    # ('Sequence' vs ' Sequence', ' Label') - confirm against the data files.
    X_train = np.array([featurize(seq) for seq in train['Sequence']])
    X_test = np.array([featurize(seq) for seq in test[' Sequence']])
    Y_train = np.array(train[' Label'])
    X_train, Y_train = shuffle(X_train, Y_train, random_state=3)

    # Training
    param = {
        'max_depth': 25,
        'objective': 'reg:logistic',
        'n_estimators': 100,
        'booster': 'gbtree',
        'colsample_bylevel': 0.7,
        'colsample_bytree': 1,
        'n_thread': 2,
    }
    booster = XGBClassifier(**param, random_state=3)
    clf = BaggingClassifier(base_estimator=booster, n_estimators=23,
                            random_state=3, n_jobs=-1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_prob = clf.predict_proba(X_test)[:, 1]
    Y_pred = clf.predict(X_test)

    result = pd.DataFrame()
    result["ID"] = test["ID"]
    for labels, path in ((Y_prob, "Submission_2.csv"),
                         (Y_pred, "Prediction_2.csv")):
        result["Label"] = labels
        result.to_csv(path, index=False)
def drawROC(XY, X, Y, class_label_list):
    """Plot the ROC curve of a default ``BaggingClassifier``.

    ``X`` / ``Y`` are split 75/25 with a fixed seed; the classifier is fitted
    on the larger part and its column-1 probabilities on the smaller part are
    scored against the last entry of ``class_label_list`` as the positive
    label.  ``XY`` is accepted but not used.
    """
    fit_X, eval_X, fit_Y, eval_Y = train_test_split(
        X, Y, test_size=0.25, random_state=42)

    # classify
    classifier = BaggingClassifier(random_state=42).fit(fit_X, fit_Y)

    # find out required parameters
    probabilities = classifier.predict_proba(eval_X)
    last_position = len(class_label_list) - 1
    positive_label = class_label_list[last_position]
    fpr, tpr, threshold = metrics.roc_curve(
        eval_Y, probabilities[:, 1], pos_label=positive_label)
    roc_auc = metrics.auc(fpr, tpr)

    # Plotting in graph
    unit_range = [0, 1]
    plt.title('ROC curve')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot(unit_range, unit_range, 'r--')
    plt.xlim(unit_range)
    plt.ylim(unit_range)
    plt.ylabel('Actual Positive Rate')
    plt.xlabel('Not Actual Positive Rate')
    plt.show()
def bagged_tree():
    """Cross-validate and fit a bagged decision tree, then write predictions.

    Features come from ``load_features`` with missing values set to 0.  The
    mean 5-fold ROC AUC is printed, the model is fitted on all training rows,
    column-1 probabilities for the test rows are written to
    ``data/submission_bagged.csv``, and the value returned by ``roc_auc`` is
    printed.
    """
    raw_train, raw_test = load_features()
    train_features = raw_train.fillna(value=0)
    test_features = raw_test.fillna(value=0)

    Y_train = train_features["outcome"]
    X_train = train_features.drop(["bidder_id", "outcome"], axis=1)
    X_test = test_features.drop(["bidder_id"], axis=1)

    print("Training Bagged forest classifier model")
    bag_class = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                  n_estimators=3000)
    print("Model trained")

    print("Cross validation score (Bagged Forest) : ")
    fold_scores = cross_val_score(bag_class, X_train, Y_train, cv=5,
                                  scoring='roc_auc')
    cv_score = np.mean(fold_scores)
    print(cv_score)

    print("Generating submission file")
    bag_class.fit(X_train, Y_train)
    test_features['prediction'] = bag_class.predict_proba(X_test)[:, 1]
    submission = test_features[['bidder_id', 'prediction']]
    submission.to_csv('data/submission_bagged.csv', index=False)
    print("Output file successfully created")

    print("Generating auc curve and auc score")
    auc = roc_auc(train_features, bag_class)
    print("AUC score : " + str(auc))
def train_predict(train_file, test_file, predict_valid_file,
                  predict_test_file, n_est, depth, n_fold=5, n_bag=50):
    """Cross-validate a bagged extra-trees model and save its predictions.

    Parameters
    ----------
    train_file, test_file : paths handed to ``load_data``.
    predict_valid_file : path receiving the out-of-fold class-1 probabilities.
    predict_test_file : path receiving the test-set class-1 probabilities.
    n_est, depth : ``n_estimators`` / ``max_depth`` of the base ``ET`` model.
    n_fold : number of stratified folds.
    n_bag : number of bagged copies of the base model.
    """
    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='et_bag{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    et = ET(n_estimators=n_est, max_depth=depth, random_state=2015,
            class_weight='auto', bootstrap=True)
    clf = BG(et, n_estimators=n_bag, max_samples=.8, max_features=.9)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    # Probabilities must be stored as floats.  zeros_like(y) alone would
    # inherit the label dtype and silently truncate them when y is integer.
    p_val = np.zeros_like(y, dtype=float)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def main():
    """Train a bagged random forest and write the top-5 cluster submission.

    Reads "input/train.csv" and "input/test.csv", fits on the selected
    feature columns (missing values filled with 0), and writes
    'random_forest_submission.csv' with a 1-based id followed by the five
    values that ``get5Best`` returns for each row of predicted probabilities.
    """
    # The competition datafiles are in the directory /input
    # Read competition data files:
    train_csv = pd.read_csv("input/train.csv")
    test_csv = pd.read_csv("input/test.csv")

    # One shared column list so train and test can never drift apart.
    feature_columns = [
        'site_name', 'posa_continent', 'user_location_country',
        'user_location_region', 'user_location_city',
        'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
        'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
        'srch_destination_id', 'srch_destination_type_id', 'hotel_continent',
        'hotel_country', 'hotel_market'
    ]

    # Prepare train by taking columns and filling NaNs
    train = train_csv[feature_columns].fillna(0)
    target = train_csv[['hotel_cluster']].fillna(0)

    # Prepare test by taking columns and filling NaNs
    test = test_csv[feature_columns].fillna(0)

    # Run Random Forest on training data
    # (single-argument print calls behave the same on Python 2 and 3)
    print("Training Random Forest")
    rf = RandomForestClassifier(n_estimators=100, n_jobs=2)
    svc = BaggingClassifier(rf, n_estimators=2, n_jobs=2)
    svc.fit(train, target.values.ravel())

    # Predict with testing data
    print("Predicting test data")
    predict = svc.predict_proba(test)

    # Generate submission
    print("Generating submission")
    stuff = np.apply_along_axis(get5Best, 1, predict)
    n_rows = len(predict)
    subm = np.empty((n_rows, 6))
    subm[:, 0] = np.arange(1, n_rows + 1)  # ids start at 1
    subm[:, 1:6] = stuff[:, 0:5]
    np.savetxt('random_forest_submission.csv', subm,
               fmt='%d,%d %d %d %d %d', delimiter=',',
               header='id,hotel_cluster', comments='')

    print("Done")
def averaged_2models(trainX, validX, trainy, validy, testX, model1, model2,
                     sub1, sub2):
    """
    Try averaging two models with different weights (grid over 0.1 .. 0.9)
    to see whether the blend does better.

    sub1 and sub2 are used to name the submission files.
    Better to use averaging_probas to avoid retraining classifiers.
    """
    def wrap(model):
        # MLP / XGBoost models are bagged; everything else is calibrated.
        if isinstance(model, (MLPClassifier, xgb.XGBClassifier)):
            return BaggingClassifier(model)
        return CalibratedClassifierCV(model, 'isotonic', 3)

    calib1 = wrap(model1)
    calib1.fit(trainX, trainy)
    print("model1 trained")

    calib2 = wrap(model2)
    calib2.fit(trainX, trainy)
    print("model2 trained")

    has_validation = len(validy) != 0
    if has_validation:
        valid1 = calib1.predict_proba(validX)
        print("Evaluation model1(kaggle) of validation set :",
              evaluation(validy, valid1))
        valid2 = calib2.predict_proba(validX)
        print("Evaluation model2(kaggle) of validation set :",
              evaluation(validy, valid2))

    res1 = calib1.predict_proba(testX)
    saveResult(res1, "../" + sub1 + "_submission.csv")
    res2 = calib2.predict_proba(testX)
    saveResult(res2, "../" + sub2 + "_submission.csv")

    for tenth in range(1, 10):
        weight = tenth / 10.0
        blended_test = weight * res1 + (1 - weight) * res2
        if has_validation:
            blended_valid = weight * valid1 + (1 - weight) * valid2
            print("Evaluation (kaggle) of validation set :",
                  evaluation(validy, blended_valid))
        saveResult(
            blended_test,
            "../combined_csv/combined_{:1.2f}{}_{:1.2f}{}.csv".format(
                weight, sub1, 1 - weight, sub2))
def train_predict(train_file, test_file, predict_valid_file,
                  predict_test_file, n_est=100, depth=4, lrate=.1, n_fold=5):
    """Cross-validate a bagged XGBoost model on svmlight data and save output.

    Out-of-fold class-1 probabilities go to ``predict_valid_file``; the
    probabilities of a model refit on all training rows go to
    ``predict_test_file``.  The mean per-fold log loss is logged.
    """
    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    # Densify both matrices before modelling.
    X, X_tst = X.todense(), X_tst.todense()

    base = XGBoostClassifier(n_estimators=n_est, eta=lrate, max_depth=depth,
                             n_jobs=8)
    clf = BaggingClassifier(base_estimator=base, n_estimators=5,
                            max_samples=0.9, max_features=0.9,
                            random_state=42)

    folds = StratifiedKFold(y, n_folds=n_fold, shuffle=True,
                            random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    lloss = 0.
    for fit_idx, held_idx in folds:
        clf.fit(X[fit_idx], y[fit_idx])
        p_val[held_idx] = clf.predict_proba(X[held_idx])[:, 1]
        lloss += log_loss(y[held_idx], p_val[held_idx])

    logging.info('Log Loss = {:.4f}'.format(lloss / n_fold))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    for target, values in ((predict_valid_file, p_val),
                           (predict_test_file, p_tst)):
        np.savetxt(target, values, fmt='%.6f')
def create_estimators(self, X_train, y_train, X_test):
    """Fit one bagged ensemble per (model, parameter set) pair.

    For every model in ``self.models`` and every entry of its parameter grid,
    the column-1 ``predict_proba`` output on ``X_test`` is appended to
    ``self.predictions``.
    """
    for model in self.models:
        for parameters in self.create_parameter_grid(model):
            configured = model.set_params(**parameters)
            bagger = BaggingClassifier(base_estimator=configured,
                                       n_estimators=self.estimators,
                                       max_samples=0.95,
                                       n_jobs=3)
            bagger.fit(X_train, y_train)
            self.predictions.append(bagger.predict_proba(X_test)[:, 1])
def bagging_classifier(x_train, y_train, x_test):
    '''
    creates a bagging classifier (max_features=.75) fitted on the training
    data

    returns column 0 of that classifier's predicted class probabilities
    on x_test
    '''
    model = BaggingClassifier(max_features=.75)
    model.fit(x_train, y_train)
    probabilities = model.predict_proba(x_test)
    return probabilities[:, 0]
def run():
    """Evaluate a bagged linear SVM on ./data/australian.csv and plot its ROC.

    Column 14 of the whitespace-separated file is the target.  Prints a
    classification report, the ROC AUC and the accuracy on a stratified
    40% hold-out split, then shows the ROC curve.
    """
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as pyplot
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
    from sklearn.svm import SVC
    from sklearn.ensemble import BaggingClassifier

    # Raw string: '\s' is not a valid escape in an ordinary string literal.
    df = pd.read_table("./data/australian.csv", sep=r'\s+', header=None)

    y = df[14]
    X = df.drop(columns=14)
    y.value_counts()

    # Split features and target into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, stratify=y, test_size=0.4)

    # Instantiate the Classifiers
    clf = BaggingClassifier(SVC(kernel='linear', random_state=1),
                            max_samples=0.5, max_features=1.0,
                            n_estimators=20)
    clf.fit(X_train, y_train)

    # Make predictions for the test set
    y_pred_test = clf.predict(X_test)

    # View the classification report
    print(classification_report(y_test, y_pred_test))

    # keep probabilities for the positive outcome only
    clf_probs = clf.predict_proba(X_test)[:, 1]

    # calculate scores
    clf_auc = roc_auc_score(y_test, clf_probs)

    # summarize scores
    print('Bagged_SVM: ROC AUC=%.3f' % (clf_auc))
    print("accuracy_score is %.3f" % (accuracy_score(y_test, y_pred_test, normalize=True)))

    # calculate roc curves
    clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_probs)

    # plot the roc curve for the model
    pyplot.plot(clf_fpr, clf_tpr, marker='.', label='Bagged_SVM')
    # axis labels
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    # show the legend
    pyplot.legend()
    # show the plot
    pyplot.show()
def test_parallel_classification():
    # Check parallel classification: outputs must not depend on n_jobs,
    # neither at fit time nor at prediction time.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    def fitted(base, n_jobs):
        model = BaggingClassifier(base, n_jobs=n_jobs, random_state=0)
        return model.fit(X_train, y_train)

    def output(model, method, n_jobs):
        model.set_params(n_jobs=n_jobs)
        return getattr(model, method)(X_test)

    # predict_proba
    ensemble = fitted(DecisionTreeClassifier(), 3)
    y1 = output(ensemble, 'predict_proba', 1)
    y2 = output(ensemble, 'predict_proba', 2)
    assert_array_almost_equal(y1, y2)

    ensemble = fitted(DecisionTreeClassifier(), 1)
    y3 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y3)

    # decision_function
    ensemble = fitted(SVC(decision_function_shape='ovr'), 3)
    decisions1 = output(ensemble, 'decision_function', 1)
    decisions2 = output(ensemble, 'decision_function', 2)
    assert_array_almost_equal(decisions1, decisions2)

    ensemble = fitted(SVC(decision_function_shape='ovr'), 1)
    decisions3 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions3)
def logistic_regression(xTrain, yTrain, xTest, model_eval='recall',
                        ensemble=None, soft_output=True, verbose=True):
    """Grid-search a logistic regression and return the best estimator.

    Parameters
    ----------
    xTrain, yTrain : training data and labels.
    xTest : data scored after fitting (see the note below).
    model_eval : ``scoring`` argument of the grid search.
    ensemble : None or 'bagging'; any other value prints a message and
        returns None.
    soft_output : use ``predict_proba`` (True) or ``predict`` (False) on
        xTest.
    verbose : print progress messages.

    Returns
    -------
    The refit best estimator of the grid search.

    NOTE(review): the scores computed on xTest (and the bagging model, when
    requested) are not returned -- only the grid-search estimator is.  Kept
    for backward compatibility; confirm whether callers expect the scores.
    """
    # Check params
    if ensemble is not None and ensemble != 'bagging':
        print(
            'Ensemble method not recognized. Supported value is: "bagging"'
        )
        return

    param_grid = {'C': np.logspace(-3, 3, 10)}

    if verbose:
        print('Training logistic regression classifier')

    stkfold = StratifiedKFold(yTrain, n_folds=5)
    gs = GridSearchCV(LogisticRegression(class_weight='auto'),
                      param_grid,
                      scoring=model_eval,
                      cv=stkfold,
                      refit=True,
                      n_jobs=-1)
    gs.fit(xTrain, yTrain)
    est = gs.best_estimator_

    if ensemble == 'bagging':
        if verbose:
            print('Training bagging estimator')
        scorer = BaggingClassifier(est,
                                   n_estimators=1000,
                                   oob_score=True,
                                   n_jobs=-1)
        scorer.fit(xTrain, yTrain)
    else:
        scorer = est

    # Result intentionally discarded, as in the original implementation.
    if soft_output:
        scorer.predict_proba(xTest)
    else:
        scorer.predict(xTest)

    return est
def train_and_test(X_train, X_test, y_train, y_test):
    """Fit a 500-estimator BaggingClassifier and return its test ROC AUC.

    The class-1 probabilities on X_test are scored against y_test with
    ``pos_label=1``; the AUC is printed and returned.
    """
    forest = BaggingClassifier(n_estimators=500, random_state=1234)
    forest = forest.fit(X_train, y_train)
    proba = forest.predict_proba(X_test)[:, 1]

    y_test = np.array(y_test)

    fpr, tpr, thresholds = metrics.roc_curve(y_test, proba, pos_label=1)
    # This is an area under the ROC curve (higher is better), not a loss.
    auc_score = metrics.auc(fpr, tpr)
    print(auc_score)  # call form works on both Python 2 and 3
    return auc_score
def Bagging(X_train, y_train, X_test):
    """Fit a bagged MLP and return ``(fitted model, predict_proba(X_test))``.

    Each bag uses half of the samples and half of the features.
    """
    base_mlp = MLPClassifier(solver='lbfgs',
                             alpha=1e-5,
                             hidden_layer_sizes=(5, 2),
                             random_state=1)
    ensemble = BaggingClassifier(base_mlp, max_samples=0.5, max_features=0.5)
    fitted = ensemble.fit(X_train, y_train)
    return fitted, fitted.predict_proba(X_test)
def predict_with_best_model(estimator, xtrain, ytrain, xtest):
    """Bag ``estimator`` ten times and return ``predict_proba(xtest)``.

    Each copy is fitted on 90% of the samples and 90% of the features, drawn
    without replacement.  The fitted base estimator is printed.
    """
    from sklearn.ensemble import BaggingClassifier

    model = BaggingClassifier(base_estimator=estimator,
                              n_estimators=10,
                              max_samples=0.9,
                              max_features=0.9,
                              n_jobs=1,
                              bootstrap=False,
                              bootstrap_features=False,
                              oob_score=False)
    model = model.fit(xtrain, ytrain)
    y = model.predict_proba(xtest)

    # Single-argument print calls behave the same on Python 2 and 3.
    print("Model used: ")
    print(model.base_estimator_)

    return y
class BaggingClassifier(BaseEstimator):
    """Config-driven wrapper around ``SK_BaggingClassifier``.

    Parameters
    ----------
    base_estimator : mapping with keys 'class' and 'params', or None
        'class' is resolved through ``dynamic_load`` and instantiated with
        ``**params``.  None lets the wrapped classifier use its own default
        base estimator.
    bag_kwargs : dict or None
        Extra keyword arguments for ``SK_BaggingClassifier``; None means no
        extra arguments.
    """

    def __init__(self, base_estimator=None, bag_kwargs=None):
        # Keep the constructor arguments on the instance so they remain
        # inspectable (presumably what BaseEstimator.get_params relies on;
        # verify against the BaseEstimator in use).
        self.base_estimator = base_estimator
        self.bag_kwargs = bag_kwargs

        # The declared defaults used to crash: None is neither
        # subscriptable nor unpackable with **.
        if base_estimator is None:
            inner = None
        else:
            klass = dynamic_load(base_estimator['class'])
            inner = klass(**base_estimator['params'])
        extra = {} if bag_kwargs is None else bag_kwargs
        self.__clf = SK_BaggingClassifier(base_estimator=inner, **extra)

    def fit(self, X, y):
        """Fit the wrapped classifier and return what its ``fit`` returns."""
        return self.__clf.fit(X, y)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for X."""
        return self.__clf.predict_proba(X)
def tuned_classifier(TR1, TR1_outcome, TR2, type, tuned_parameters, model_type, ensemble_methods):
    """Fit (optionally tuned / bagged) ``type`` on TR1 and score TR2.

    Without ``tuned_parameters`` the estimator is fitted as given; otherwise
    a 10-fold accuracy grid search selects the model.  With
    ``ensemble_methods`` the model is wrapped in a BaggingClassifier.  On the
    tuned path, the model types "SVM", "Decision Tree" and "SDG" are scored
    through a prefit sigmoid CalibratedClassifierCV.

    Returns a dict with keys 'prediction', 'model' and 'type'.
    """
    def bagged(base):
        ensemble = BaggingClassifier(base, max_samples=1.0, max_features=1.0)
        return ensemble.fit(TR1, TR1_outcome)

    # Step 2- K-fold cross validation on TR1 to obtain optimal model.
    if tuned_parameters:
        search = GridSearchCV(type, tuned_parameters, cv=10, scoring="accuracy")
        model = search.fit(TR1, TR1_outcome)
        if ensemble_methods:
            model = bagged(model.best_estimator_)
        if model_type in ("SVM", "Decision Tree", "SDG"):
            calibrated = CalibratedClassifierCV(model, cv='prefit', method='sigmoid')
            scorer = calibrated.fit(TR1, TR1_outcome)
        else:
            scorer = model
        prediction = np.array(scorer.predict_proba(TR2))
    else:
        model = type
        model.fit(TR1, TR1_outcome)
        if ensemble_methods:
            model = bagged(model)
        prediction = model.predict_proba(TR2)

    return {'prediction': prediction, 'model': model, 'type': model_type}
class BaggingLearner(AbstractLearner):
    """Learner that delegates to a BaggingClassifier of KNeighborsClassifier."""

    def __init__(self):
        neighbours = KNeighborsClassifier()
        self.learner = BaggingClassifier(neighbours)

    def _train(self, x_train, y_train):
        # Keep whatever fit() hands back as the active learner.
        fitted = self.learner.fit(x_train, y_train)
        self.learner = fitted

    def _predict(self, x):
        labels = self.learner.predict(x)
        return labels

    def _predict_proba(self, x):
        probabilities = self.learner.predict_proba(x)
        return probabilities
def phenotype_imputation(data, config):
    '''
    Function to impute the labels on II based on the classifier learned on I.

    Parameters
    ----------
    data : an object of class Dataset that contains: genotypes, covariates,
        labels and information about random folds
    config : an object of class ConfigState. It contains the user-entered
        parameters in a YAML format.
        See the config_file parameter in the main script for more details.
    '''
    # Parameters for this task
    task_name = "phenotype_imputation"
    n_estimators = config.get_entry(task_name, "n_estimators")
    romans_trn = config.get_entry(task_name, "romans_used_for_learning")
    romans_tst = config.get_entry(task_name, "romans_used_for_imputing")
    num_folds = data.num_folds

    # The output height is taken from the first fold's imputation set.
    size_of_two = find_vec_entries_that_contain(data.folds[:,0], romans_tst).shape[0]
    soft_labels = np.zeros((size_of_two, num_folds))
    X_scaled = preprocessing.scale(data.clin_covariate.transpose()).transpose()

    fpr, tpr, thres = dict(), dict(), dict()
    roc_auc = np.zeros(num_folds)

    # Iterate through the folds:
    for i, fold in enumerate(data.folds.transpose()):
        logging.info("Fold=%d" % (i + 1))
        sel_trn = find_vec_entries_that_contain(fold, [romans_trn])
        sel_tst = find_vec_entries_that_contain(fold, [romans_tst])

        # Alternative noted by the author for a small set I:
        # max_samples=0.8 with bootstrap=False.
        model = BaggingClassifier(
            base_estimator=linear_model.LogisticRegression(),
            n_estimators=n_estimators,
            max_samples=0.632,
            max_features=5,
            bootstrap=True,
            bootstrap_features=True,
            oob_score=False,
            n_jobs=1,
            random_state=None,
            verbose=0)
        model.fit(X_scaled[:, sel_trn].transpose(),
                  data.labels[:, sel_trn].transpose())

        fold_scores = model.predict_proba(X_scaled[:, sel_tst].transpose())
        soft_labels[:, i] = fold_scores[:, 1]

        fpr[i], tpr[i], thres[i] = metrics.roc_curve(data.labels[0, sel_tst],
                                                     soft_labels[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    # Save the output of this task
    config.save_variable(task_name, "%f", soft_labels=soft_labels, roc_auc=roc_auc)
def validation(df_features_driver, df_features_other_train, df_features_other_test):
    """Train on driver + other-train rows and score driver + other-test rows.

    The feature columns are everything from position 4 onwards after the
    index has been reset.  Returns whatever ``AUC.AUC`` computes from a
    two-column string array of (label, class-1 probability), where the
    first 200 test rows are labelled '1' and the rest '0'.
    """
    # pd.concat replaces DataFrame.append, which newer pandas removed.
    df_train = pd.concat([df_features_driver, df_features_other_train])
    df_train.reset_index(inplace=True)
    df_train.Driver = df_train.Driver.astype(int)

    df_test = pd.concat([df_features_driver, df_features_other_test])
    df_test.reset_index(inplace=True)
    df_test.Driver = df_test.Driver.astype(int)

    # So far, the best result was achieved by bagging a RandomForestClassifier.
    # NOTE(review): min_samples_split=1 is kept from the original; verify
    # that the installed scikit-learn accepts it.
    model = BaggingClassifier(base_estimator=RandomForestClassifier(
        n_estimators=10, max_depth=None, min_samples_split=1, random_state=0))

    feature_columns_train = df_train.iloc[:, 4:]
    feature_columns_test = df_test.iloc[:, 4:]

    # Train the classifier
    model.fit(feature_columns_train, df_train.Driver)
    probs_array = model.predict_proba(feature_columns_test)

    # Column 1 should contain the driver of interest.
    driver_probs = probs_array[:, 1]

    # presumably the driver frame always contributes 200 rows -- TODO confirm
    n_driver_rows = 200
    # Build the (label, probability) table in one pass instead of growing
    # it with np.vstack row by row (quadratic) through the removed `.ix`.
    probs_list = np.array([
        ['1' if row < n_driver_rows else '0', prob]
        for row, prob in enumerate(driver_probs)
    ])

    df_auc = AUC.AUC(probs_list)
    return df_auc
def test_parallel():
    """Check parallel computations.

    Predictions must be identical whatever ``n_jobs`` was used for fitting
    or is used for predicting, for classifiers and regressors alike.
    """
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        # predict_proba
        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict_proba(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y3)

        # decision_function
        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        decisions1 = ensemble.decision_function(X_test)
        ensemble.set_params(n_jobs=2)
        decisions2 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions2)

        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        decisions3 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions3)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        # Use the loop variable: a hard-coded n_jobs=3 here made both
        # iterations identical and left n_jobs=-1 untested.
        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=n_jobs,
                                    random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=1,
                                    random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y3)
dval = dtrain.ix[rows[:n_val]] label_dtrain = label.ix[rows[n_val:]] label_dval = label.ix[rows[:n_val]] #clf.cv(dtrain,label[0].values,5) calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5) calibrated_clf.fit(dtrain_sp, label_dtrain[0].values) pred = calibrated_clf.predict_proba(dval) print("ROC score", metrics.roc_auc_score(label_dval[0].values, pred[:,1])) pred = calibrated_clf.predict_proba(dtest) sample = pd.read_csv('/Users/IkkiTanaka/Documents/KDDCup/sampleSubmission.csv',header=None) preds = pd.concat([sample[0],pd.DataFrame(pred[:,1])],axis=1) preds.to_csv('/Users/IkkiTanaka/Documents/KDDCup/pred/xgb/sk_GBM2.csv' ,header=None,index=False) #bagging clf = XGBoostClassifier(nthread=8,booster='gbtree',eta=0.08,gamma=1.0,max_depth=4,min_child_weight=4,subsample=0.9,colsample_bytree=0.9,l=0,alpha=0,lambda_bias=0,objective="binary:logistic",eval_metric='auc',seed=19920407,num_class=1,max_delta_step=0,early_stopping_rounds=None,num_round=450) bagging_clf = BaggingClassifier(base_estimator=clf, n_estimators=5, max_samples=.6, max_features=.8, bootstrap=True, bootstrap_features=False, oob_score=True, n_jobs=-1, random_state=19920407, verbose=1) bagging_clf.fit(dtrain_sp, label_dtrain[0].values) pred = bagging_clf.predict_proba(dval) print("ROC score", metrics.roc_auc_score(label_dval[0].values, pred[:,1]))
print "processing parameter combo:", i # configure model with j-th combo of parameters x = param_grid[i] model.C = x[0] model.class_weight = x[1] # loop over folds for j in range(0,n_folds): idx0 = np.where(fold_index != j) idx1 = np.where(fold_index == j) x0 = np.array(xtrain)[idx0,:][0]; x1 = np.array(xtrain)[idx1,:][0] y0 = np.array(ytrain)[idx0]; y1 = np.array(ytrain)[idx1] # fit the model on observations associated with subject whichSubject in this fold model.fit(x0, y0) mvalid[idx1,i] = model.predict_proba(x1)[:,1] # fit on complete dataset model.fit(xtrain, ytrain) mfull[:,i] = model.predict_proba(xtest)[:,1] ## store the results # add indices etc mvalid = pd.DataFrame(mvalid) mvalid.columns = [model_type + str(i) for i in range(0, mvalid.shape[1])] mvalid['QuoteNumber'] = id_train mvalid['QuoteConversion_Flag'] = ytrain mfull = pd.DataFrame(mfull) mfull.columns = [model_type + str(i) for i in range(0, mfull.shape[1])]
print "adaboost test:",roc_auc_score(y_test, ada.predict_proba(X_test)[:,1]) #print "adaboost train:",roc_auc_score(y_train, ada.predict_proba(X_train)[:,1]) #print "Fitting Decision Tree..." #dt.fit(X_train, y_train) #print "Decision tree test:", roc_auc_score(y_test, dt.predict_proba(X_test)[:,1]) #print "Decision tree train:",roc_auc_score(y_train, dt.predict_proba(X_train)[:,1]) #print "Fitting Random Forest..." #rf.fit(X_train, y_train) #print "random forest test:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]) #print "random forest train:", roc_auc_score(y_train, rf.predict_proba(X_train)[:,1]) print "Bagging Decision Trees..." bagged_dt.fit(X_train, y_train) print "bagged dt test:", roc_auc_score(y_test, bagged_dt.predict_proba(X_test)[:,1]) #print "bagged dt train",roc_auc_score(y_train, bagged_dt.predict_proba(X_train)[:,1]) print "Bagging RandomForests..." bagged_rf.fit(X_train, y_train) print "bagged rf test",roc_auc_score(y_test, bagged_rf.predict_proba(X_test)[:,1]) #print "bagged rf train",roc_auc_score(y_train, bagged_rf.predict_proba(X_train)[:,1]) '''print "Calibrating Bagged Decision Trees..." calibrated_dt.fit(X_train, y_train) print "calibrated_dt test:", roc_auc_score(y_test, calibrated_dt.predict_proba(X_test)[:,1]) print "Calibrating Bagged Random Forests..." calibrated_rf.fit(X_train, y_train) print "calibrated_rf test:", roc_auc_score(y_test, calibrated_rf.predict_proba(X_test)[:,1]) '''
# Bag the regressor `lm` and the classifier `log` with identical settings,
# fit both on the training features, and write their test predictions.
shared_bagging_options = dict(
    n_estimators=75,
    max_samples=n_samp,
    max_features=n_feat,
    bootstrap=True,
    oob_score=False,
    warm_start=False,
    n_jobs=-1,
)

lm_bagged = BaggingRegressor(base_estimator=lm, **shared_bagging_options)
log_bagged = BaggingClassifier(base_estimator=log, **shared_bagging_options)

lm_bagged.fit(X=train[features], y=train['y'])
log_bagged.fit(X=train[features], y=train['y'])

lm_bagged_preds = lm_bagged.predict(X=test[features])
log_bagged_preds = log_bagged.predict_proba(X=test[features])

# Output file names carry the sampling settings used for this run.
run_tag = (n_samp, n_feat)
write_function(lm_bagged_preds,
               '/tmp/lm_bagged_preds_nsamp-%s_nfeat-%s.txt' % run_tag)
write_function(second_pos_clip(log_bagged_preds),
               '/tmp/log_bagged_preds_nsamp-%s_nfeat-%s.txt' % run_tag)
X = df_train.values
# Test features restricted to, and ordered like, the training columns.
X_test = df_test[df_train.columns].values

rfc = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=1, random_state=rs)
eclf = BaggingClassifier(rfc, n_estimators=2, n_jobs=1, max_samples=0.1, max_features=3)
eclf.fit(X, y)

# Need to chunk to avoid memory error.
# Predict on X_test (aligned with the training columns) rather than on the
# raw df_test.values, and collect the pieces in a list so that they are
# concatenated once instead of re-copied on every iteration.
test_chunks = np.array_split(X_test, 50)
chunk_probs = []
for i, chunk in enumerate(test_chunks):
    if len(chunk):  # array_split yields empty chunks for < 50 rows
        chunk_probs.append(eclf.predict_proba(chunk))
    print(i)
test_y = np.concatenate(chunk_probs)

test_prob = np.array(test_y)
print(test_prob.shape)


def makespace(x):
    """Join the integer form of every entry of x with single spaces."""
    return " ".join(str(int(z)) for z in x)


# Column indices of the five largest probabilities per row.
# NOTE(review): these are positions in eclf.classes_; they equal the
# cluster labels only if the classes are 0..K-1 -- confirm.
submissions = (-test_prob).argsort()[:, :5]
submit = pd.read_csv(work_dir + 'sample_submission.csv')
intermediate = np.apply_along_axis(makespace, 1, submissions)
submit['hotel_cluster'] = intermediate
# setup bagging classifier bag0 = BaggingClassifier(base_estimator=model, n_estimators=nbag, max_samples=0.05, max_features=0.97, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=1, random_state=seed_value, verbose=2) bag0.fit(x0, y0) prx = bag0.predict_proba(x1)[:,1] mvalid[idx1,i] = prx print log_loss(y1, prx) print "finished fold:", j # fit on complete dataset bag0.fit(xtrain, ytrain) mfull[:,i] = bag0.predict_proba(xtest)[:,1] print "finished full prediction" ======= for i in range(len(param_grid)): print "processing parameter combo:", i # configure model with j-th combo of parameters x = param_grid[i]
n_estimators=n_estimators/10, random_state=1, n_jobs=nb_parallel ).fit(x_local_train, y_local_train) else: M = RForestRegress(n_estimators, random_state=1, n_jobs=nb_parallel ).fit(x_local_train, y_local_train) else: vprint(verbose, "[-] task not recognized") break vprint(verbose, "[+] Fitting success, time spent so far %5.2f sec" % (time.time() - start)) # Make predictions on local validation set if task == 'binary.classification': y_local_valid_pred = M.predict_proba(x_local_valid)[:, 1] elif task == 'multiclass.classification': y_local_valid_pred = M.predict_proba(x_local_valid).T elif task == 'multilabel.classification': y_local_valid_pred = np.array([Ms[i].predict_proba(x_local_valid)[:, 1] for i in range(K)]).T elif task == 'regression': y_local_valid_pred = M.predict(x_local_valid) # Local validation # x_local_valid, y_local_valid metric_type = D.info['metric'] if 'f1_metric' == metric_type: metric = f1_metric(y_local_valid, y_local_valid_pred) elif 'r2_metric' == metric_type: metric = r2_metric(y_local_valid, y_local_valid_pred)
clf = AdaBoostClassifier(base_estimator=None, n_estimators=125, learning_rate=0.025, algorithm='SAMME.R', random_state=190, ) clf.fit(x0, y0) pr1 = clf.predict_proba(x1)[:,1] resmat[ii,0] = roc_auc_score(y1, pr1) # bagging + ada clf0 = AdaBoostClassifier(base_estimator=None, n_estimators=125, learning_rate=0.025, algorithm='SAMME.R', random_state=190, ) clf1 = BaggingClassifier(base_estimator=clf0, n_estimators=25, max_samples=0.5, max_features=0.95, bootstrap=False, bootstrap_features=False, oob_score=False, n_jobs=-1, random_state=xseed + 1, verbose=1) clf1.fit(x0, y0) pr2 = clf1.predict_proba(x1)[:,1] resmat[ii,1] = roc_auc_score(y1, pr2) # bagging clf0 = ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, verbose = 1, class_weight = 'auto', min_samples_leaf = 5, random_state = xseed) clf1 = AdaBoostClassifier(base_estimator=clf0, n_estimators=25, learning_rate=0.025, algorithm='SAMME.R', random_state=190, ) clf2 = BaggingClassifier(base_estimator=clf1, n_estimators=25, max_samples=0.25, max_features=0.9, bootstrap=False, bootstrap_features=False, oob_score=False, n_jobs=-1, random_state=xseed + 1, verbose=1) clf2.fit(x0, y0) pr3 = clf2.predict_proba(x1)[:,1] resmat[ii,2] = roc_auc_score(y1, pr3)
forest = forest.fit(X_train, y_train) proba = forest.predict_proba(X_test) proba = proba[:, 1] y_test = np.array(y_test) fpr, tpr, thresholds = metrics.roc_curve(y_test, proba, pos_label=1) loss = metrics.auc(fpr, tpr) print loss return loss def kfold_validation(data=train, y=y, trials=10): skf = cross_validation.StratifiedKFold(y, n_folds=10) error = 0.0 for train_index, test_index in skf: X_train, X_test = data[train_index], data[test_index] y_train, y_test = y[train_index], y[test_index] error += train_and_test(X_train, X_test, y_train, y_test) return error/trials score = kfold_validation() print score forest = BaggingClassifier(n_estimators=1000, random_state=1234) forest = forest.fit(train, y) proba = forest.predict_proba(test) proba = proba[:, 1] submission = pd.DataFrame({"bidder_id": idx, "prediction": proba}) submission.to_csv("submissions/submission_bag1.csv", index=False) print 'Done.'
'''for name, clf in clfs: clf.fit(train_[cols], train_["TripType"]) clf.predict(test_[cols]) preds = clf.predict_proba(test_[cols]) #print(confusion_matrix(test['class'], clf.predict(test[cols]))) print (pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"])) print (classification_report(test_['TripType'], clf.predict(test_[cols]))) score=accuracy_score(test_['TripType'],clf.predict(test_[cols])) table.append([name,score]) print (table) ''' clf=BaggingClassifier(GradientBoostingClassifier()) clf.fit(train_[cols], train_["TripType"]) clf.predict(test_[cols]) preds = clf.predict_proba(test_[cols]) #print(confusion_matrix(test['class'], clf.predict(test[cols]))) print (pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"])) print (classification_report(test_['TripType'], clf.predict(test_[cols]))) score=accuracy_score(test_['TripType'],clf.predict(test_[cols])) table.append([score]) print (table) eclf = VotingClassifier(estimators = [('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))), ('RandomForest', RandomForestClassifier(10)), ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))], voting='soft', weights=[7,1,1]) eclf.fit(train[cols], train["TripType"]) #use the classifier to predict predicted=eclf.predict(test[cols]) #print (accuracy_score(predicted,test['TripType']))
print predicted_Tree print prob_Tree len_prob_tree = len(prob_Tree) count = 0 while count < len_prob_tree: print (prob_Tree[count][1]) count += 1 print print ("bagging classification") res_bagging = BaggingClassifier() res_bagging.fit(learnImage_list, learnImageType_list) predicted_bag = res_bagging.predict(learnUnknownImage_list) prob_bag = res_bagging.predict_proba(learnUnknownImage_list) print predicted_bag print prob_bag len_prob_bag = len(prob_bag) count = 0 while count < len_prob_bag: print (prob_bag[count][1]) count += 1 """ >>> X = [[0], [1], [2], [3]] >>> y = [0, 0, 1, 1] >>> from sklearn.neighbors import KNeighborsClassifier >>> neigh = KNeighborsClassifier(n_neighbors=3)
clf2 = AdaBoostClassifier(n_estimators=50)
clf3 = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0, max_depth=1, random_state=0)
# svm
# The original call passed `degree` twice (3 and 2), which is a
# SyntaxError.  The explicitly appended value 2 is kept.
# NOTE(review): confirm that degree=2 is the intended polynomial degree.
clf4 = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
               gamma=0.5, kernel='poly', degree=2, max_iter=-1,
               probability=False, random_state=None, shrinking=True,
               tol=0.001, verbose=False)
# logit
clf5 = linear_model.LogisticRegression()

# Bag the SVM and the logistic regression (20 copies, 70% of samples each).
clf4 = BaggingClassifier(clf4, max_samples=0.7, max_features=1.0, n_estimators=20)
clf5 = BaggingClassifier(clf5, max_samples=0.7, max_features=1.0, n_estimators=20)

clf1 = clf1.fit(x2_train, y_train)
clf2 = clf2.fit(x2_train, y_train)
clf3 = clf3.fit(x2_train, y_train)
clf4 = clf4.fit(x2_train, y_train)
clf5 = clf5.fit(x2_train, y_train)

# Test rows: the six base columns followed by the 'dow' and 'dis' entries.
x1_test = list(zip(data2_2.X, data2_2.Y, data2_2.year, data2_2.month, data2_2.day, data2_2.hour))
x2_test = [
    list(row) + data2_2['dow'][i] + data2_2['dis'][i]
    for i, row in enumerate(x1_test)
]

r1 = clf1.predict_proba(x2_test)
r2 = clf2.predict_proba(x2_test)
r3 = clf3.predict_proba(x2_test)
r4 = clf4.predict_proba(x2_test)
r5 = clf5.predict_proba(x2_test)
# Compare the log loss of a bagged random forest with that of an
# isotonic-calibrated random forest on a 20% hold-out split of train.csv.
# Single-argument print calls are used so the script runs on Python 2 and 3.
X = pd.read_csv('train.csv')
X = X.drop('id', axis=1)

y = X.target.values
y = LabelEncoder().fit_transform(y)

X = X.drop('target', axis=1)
print(X.head(3))

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=36)
print('X splitted')

clf = RFC(n_estimators=250, n_jobs=-1)
# use BaggingClassifier to make 5 predictions, and average
clfbag = BC(clf, n_estimators=5)
print('fitting bag clf ...')
clfbag.fit(Xtrain, ytrain)
print('done !')
ypreds = clfbag.predict_proba(Xtest)
# author's note: will be 0.60 also
print("%.2f" % log_loss(ytest, ypreds, eps=1e-15, normalize=True))

clf = RFC(n_estimators=250, n_jobs=-1)
# isotonic works better than the default sigmoid in this case
clfcali = CalibratedClassifierCV(clf, method='isotonic', cv=5)
print('fitting calibration clf ...')
clfcali.fit(Xtrain, ytrain)
print('done !')
ypreds = clfcali.predict_proba(Xtest)
# author's note: will be 0.49 also
print("%.2f" % log_loss(ytest, ypreds, eps=1e-15, normalize=True))