def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators does not",
                         clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
def query_by_bagging(X, y, current_model, batch_size, rng,
                     base_model=SVC(C=1, kernel='linear'), n_bags=5,
                     method="KL", D=None):
    """
    Query-by-bagging committee strategy (see Settles, p. 17).

    :param base_model: model that will be fitted every iteration
    :param n_bags: number of bagged committee members to train
    :param method: 'entropy' or 'KL'
    :return: ids of the examples to query and their normalized fitness scores
    """
    assert method == 'entropy' or method == 'KL'
    eps = 0.0000001
    if method == 'KL':
        assert hasattr(base_model, 'predict_proba'), \
            "Model with probability prediction needs to be passed to this strategy!"
    clfs = BaggingClassifier(base_model, n_estimators=n_bags, random_state=rng)
    clfs.fit(X[y.known], y[y.known])
    pc = clfs.predict_proba(X[np.invert(y.known)])
    # Settles page 17
    if method == 'entropy':
        pc += eps
        fitness = np.sum(pc * np.log(pc), axis=1)
        ids = np.argsort(fitness)[:batch_size]
    elif method == 'KL':
        p = np.array([clf.predict_proba(X[np.invert(y.known)])
                      for clf in clfs.estimators_])
        fitness = np.mean(np.sum(p * np.log(p / pc), axis=2), axis=0)
        ids = np.argsort(fitness)[-batch_size:]
    return y.unknown_ids[ids], fitness / np.max(fitness)
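# A minimal usage sketch for query_by_bagging above. The function expects a
# label object exposing a boolean `known` mask and an `unknown_ids` index
# array; the `Labels` helper below is a hypothetical stand-in for whatever
# wrapper the original codebase uses, written here only so the call runs.
import numpy as np


class Labels:
    def __init__(self, values, known):
        self.values = np.asarray(values)
        self.known = np.asarray(known)               # boolean mask of labeled rows
        self.unknown_ids = np.where(~self.known)[0]  # indices of unlabeled rows

    def __getitem__(self, idx):
        return self.values[idx]


rng_seed = 0
X_pool = np.random.RandomState(rng_seed).randn(100, 5)
y_pool = Labels((X_pool[:, 0] > 0).astype(int), np.arange(100) < 20)
# 'entropy' picks the unlabeled points on which the committee is least certain
query_ids, fitness = query_by_bagging(X_pool, y_pool, current_model=None,
                                      batch_size=5, rng=rng_seed,
                                      method='entropy')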
def test_base():
    # Check BaseEnsemble methods.
    ensemble = BaggingClassifier(
        base_estimator=Perceptron(tol=1e-3, random_state=None), n_estimators=3)

    iris = load_iris()
    ensemble.fit(iris.data, iris.target)
    ensemble.estimators_ = []  # empty the list and create estimators manually

    ensemble._make_estimator()
    random_state = np.random.RandomState(3)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(append=False)

    assert_equal(3, len(ensemble))
    assert_equal(3, len(ensemble.estimators_))

    assert_true(isinstance(ensemble[0], Perceptron))
    assert_equal(ensemble[0].random_state, None)
    assert_true(isinstance(ensemble[1].random_state, int))
    assert_true(isinstance(ensemble[2].random_state, int))
    assert_not_equal(ensemble[1].random_state, ensemble[2].random_state)

    np_int_ensemble = BaggingClassifier(base_estimator=Perceptron(tol=1e-3),
                                        n_estimators=np.int32(3))
    np_int_ensemble.fit(iris.data, iris.target)
class ADABoost(Base):

    def train(self, data=None, plugin=None):
        """
        Train the mllib model from a dataframe.
        """
        super(ADABoost, self).train(data, plugin)
        #cl = svm.SVC(gamma=0.001, C=100, kernel='linear', probability=True)
        X = self.X_train.iloc[:, :-1]
        Y = self.X_train.iloc[:, -1]
        self.scaler = StandardScaler().fit(X)
        cl = SGDClassifier(loss='hinge')
        # The pipeline scales its input itself, so X is passed in raw here;
        # scaling manually *and* inside the pipeline would scale twice.
        p = Pipeline([("Scaler", self.scaler), ("svm", cl)])
        self.clf = BaggingClassifier(p, n_estimators=50)
        #self.clf = AdaBoostClassifier(p, n_estimators=10)
        #self.clf = AdaBoostClassifier(SGDClassifier(loss='hinge'), algorithm='SAMME', n_estimators=10)
        self.clf.fit(X, Y)

    def predict(self, file, plugin=None):
        super(ADABoost, self).predict(file, plugin)
        data = file.vector
        X = data[plugin]
        # self.clf wraps the pipeline, so the scaler is applied internally
        guess = self.clf.predict(X)
        return self.getTag(guess)
def test_bagging_sample_weight_unsupported_but_passed():
    estimator = BaggingClassifier(DummyZeroEstimator())
    rng = check_random_state(0)

    estimator.fit(iris.data, iris.target).predict(iris.data)
    assert_raises(ValueError, estimator.fit, iris.data, iris.target,
                  sample_weight=rng.randint(10, size=(iris.data.shape[0])))
def bagging(X_train, X_test, y_train, y_test, n_est=51):
    estimators = range(1, n_est)
    decision_clf = DecisionTreeClassifier()
    scores1 = []  # test scores
    scores2 = []  # train scores
    for est in estimators:
        bagging_clf = BaggingClassifier(decision_clf, n_estimators=est,
                                        max_samples=0.67, max_features=0.67,
                                        bootstrap=True, random_state=9)
        bagging_clf.fit(X_train, y_train)
        # test line
        y_pred_bagging1 = bagging_clf.predict(X_test)
        score_bc_dt1 = accuracy_score(y_test, y_pred_bagging1)
        scores1.append(score_bc_dt1)
        # train line
        y_pred_bagging2 = bagging_clf.predict(X_train)
        score_bc_dt2 = accuracy_score(y_train, y_pred_bagging2)
        scores2.append(score_bc_dt2)

    plt.figure(figsize=(10, 6))
    plt.title('Bagging Info')
    plt.xlabel('Estimators')
    plt.ylabel('Scores')
    plt.plot(estimators, scores1, 'g', label='test line', linewidth=3)
    plt.plot(estimators, scores2, 'c', label='train line', linewidth=3)
    plt.legend()
    plt.show()
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False), classifier
    )
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
def train_dts(observations, targets, method='bagging'):
    """Trains a decision tree for each output

    :param observations: our train dataset
    :param targets: multiple target variables.
    :param method: bagging, random_forest, boosting
    :return: the dt models in a list, one for each target variable
    """
    n_targets = len(targets[0])
    tars = np.array(targets)
    dts = []
    for i in range(n_targets):
        act_tar = tars[:, i].tolist()
        dt = None
        if method == 'bagging':
            dt = BaggingClassifier(tree.DecisionTreeClassifier(),
                                   n_estimators=100, max_samples=0.5,
                                   max_features=1.)
        elif method == 'random_forest':
            dt = RandomForestClassifier(n_estimators=100)
        elif method == 'boosting':
            dt = AdaBoostClassifier(n_estimators=100)
        else:
            dt = tree.DecisionTreeClassifier()
        # the dt cannot be trained if the outputs are all equal. In that
        # case, we create a fake dt
        if len(set(act_tar)) > 1:
            # We want to have a balanced data set while training.
            bal_observations, bal_tar = sample_balanced_dataset(observations, act_tar)  # from data_manipulation
            dt.fit(bal_observations, bal_tar)
        else:
            dt = FakeClassifier(act_tar[0])
        dts.append(dt)
    return dts
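# Usage sketch for train_dts; sample_balanced_dataset and FakeClassifier come
# from the surrounding codebase, so trivial stand-ins are defined here just to
# make the call run. The data is invented for illustration.
import numpy as np

def sample_balanced_dataset(observations, act_tar):  # hypothetical stand-in
    return observations, act_tar

class FakeClassifier:  # hypothetical stand-in for constant targets
    def __init__(self, value):
        self.value = value
    def predict(self, X):
        return [self.value] * len(X)

observations = np.random.RandomState(0).randn(200, 4)
targets = np.random.RandomState(1).randint(0, 2, size=(200, 3)).tolist()
models = train_dts(observations, targets, method='bagging')
# one model per target variable; predict each target for the first row
print([m.predict(observations[:1])[0] for m in models])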
def test_warm_start_smaller_n_estimators():
    # Test if warm start'ed second fit with smaller n_estimators raises error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert_true(isinstance(estimator[0].steps[-1][1].random_state, int))
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
                                max_features=0.5, random_state=1,
                                bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert_equal(len(estimators_samples), len(estimators))
    assert_equal(len(estimators_samples[0]), len(X) // 2)
    assert_equal(estimators_samples[0].dtype.kind, 'i')

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)
def test_estimators_samples_deterministic():
    # This test is a regression test to check that with a random step
    # (e.g. SparseRandomProjection) and a given random state, the results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes. Check issue #9524 for full
    # discussion.
    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)
    pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()

    estimator = clf.estimators_[0]
    estimator_sample = clf.estimators_samples_[0]
    estimator_feature = clf.estimators_features_[0]

    X_train = (X[estimator_sample])[:, estimator_feature]
    y_train = y[estimator_sample]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
class BaggingSK(PoolGenerator):
    '''
    This class should not be used; use brew.generation.bagging.Bagging
    instead.
    '''

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote'):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                            n_estimators=n_classifiers,
                                            max_samples=1.0,
                                            max_features=1.0)

        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        #self.classes_ = set(y)

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
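# Hypothetical usage sketch for BaggingSK; Ensemble, Combiner and
# PoolGenerator are assumed to be importable from brew as in the class above.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
pool = BaggingSK(base_classifier=DecisionTreeClassifier(), n_classifiers=10)
pool.fit(iris.data, iris.target)
print(pool.predict(iris.data[:5]))  # combined majority-vote predictions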
def baggedDecisionTree(X_train, y_train, X_test, y_test, nEstimators):

    print("\n### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###")
    print("baggedDecisionTree()\n")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        n_estimators=nEstimators,
        # max_samples=X_train.shape[0],
        bootstrap=True,
        oob_score=True,
        n_jobs=-1  # use all available cores
    )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree.fit(X_train, y_train)
    y_pred = myBaggedDecisionTree.predict(X_test)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    print("nEstimators:      " + str(nEstimators))
    print("out-of-bag score: " + str(myBaggedDecisionTree.oob_score_))
    print("accuracy score:   " + str(accuracy_score(y_test, y_pred)))
    print("out-of-bag decision function:")
    print(str(myBaggedDecisionTree.oob_decision_function_))

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return(None)
def train_classifiers(data):
    train_vars = [
        'X', 'Y', 'Darkness', 'Moon', 'Hour', 'DayOfWeekInt', 'Day', 'Month',
        'Year', 'PdDistrictInt', 'TemperatureC', 'Precipitationmm',
        'InPdDistrict', 'Conditions', 'AddressCode',
    ]
    weather_mapping = {
        'Light Drizzle': 1,
        'Drizzle': 2,
        'Light Rain': 3,
        'Rain': 4,
        'Heavy Rain': 5,
        'Thunderstorm': 6,
    }
    data.Precipitationmm = data.Precipitationmm.fillna(-1)
    data.Conditions = data.Conditions.map(weather_mapping).fillna(0)

    train, test = split(data)
    X_train = train[train_vars]
    y_train = train.CategoryInt
    X_test = test[train_vars]
    y_test = test.CategoryInt

    bdt_real_2 = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=8),
        n_estimators=10,
        learning_rate=1
    )
    #bdt_real = DecisionTreeClassifier(max_depth=None, min_samples_split=1,
    #                                  random_state=6065)
    bdt_real = BaggingClassifier(base_estimator=bdt_real_2,
                                 random_state=6065,
                                 n_estimators=100)
    #bdt_real = RandomForestClassifier(random_state=6065,
    #                                  n_estimators=200)
    #bdt_real = ExtraTreesClassifier(random_state=6065,
    #                                min_samples_split=5,
    #                                n_estimators=200)
    bdt_real.fit(X_train, y_train)

    y_predict = pandas.Series(bdt_real.predict(X_test))
    print(len(y_predict[y_predict == y_test]))
    print(len(y_predict))
    return bdt_real
def classification(self, x_train, y_train):
    ml = BaggingClassifier(DecisionTreeClassifier())
    ml.fit(x_train, y_train)
    # print(y_train[0])
    # print(x_train[0])
    y_pred = ml.predict(x_train)
    print('y_train ', y_train)
    print('y_pred  ', y_pred.tolist())
def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when valid integer max_samples supplied by user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=max_samples,
                                max_features=0.5, random_state=1)
    bagging.fit(X, y)
    assert_equal(bagging._max_samples, max_samples)
def create_estimators(self, X_train, y_train, X_test):
    for model in self.models:
        param_grid = self.create_parameter_grid(model)
        for parameters in param_grid:
            clf = BaggingClassifier(base_estimator=model.set_params(**parameters),
                                    n_estimators=self.estimators,
                                    max_samples=0.95, n_jobs=3)
            clf.fit(X_train, y_train)
            prediction = clf.predict_proba(X_test)[:, 1]
            self.predictions.append(prediction)
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                max_features=0.5, oob_score=True,
                                random_state=1)
    assert_equal(bagging.fit(X, y).oob_score_, bagging.fit(X, y).oob_score_)
def test_bagging_small_max_features():
    # Check that Bagging estimator can accept low fractional max_features
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])

    bagging = BaggingClassifier(LogisticRegression(),
                                max_features=0.3, random_state=1)
    bagging.fit(X, y)
def adaboost_train(train_file, test_file):
    _, x, y = readFile(train_file)
    print('reading done.')
    ts = x.shape[0]
    id, x2 = readFile(test_file)
    print(x.shape)
    print(x2.shape)
    x = np.concatenate((x, x2))
    print('concatenate done.')

    from sklearn.preprocessing import scale
    x = scale(x, with_mean=False)
    print('scale done.')
    x2 = x[ts:]
    x = x[0:ts]

    from sklearn.feature_selection import SelectKBest, chi2
    x = SelectKBest(chi2, k=50000).fit_transform(x, y)

    from sklearn.model_selection import train_test_split
    tmp_array = np.arange(x.shape[0])
    train_i, test_i = train_test_split(tmp_array, train_size=0.8,
                                       random_state=500)
    train_x = x[train_i]
    test_x = x[test_i]
    train_y = y[train_i]
    test_y = y[test_i]

    from sklearn.ensemble import BaggingClassifier
    bagging = BaggingClassifier(LR(penalty='l2', dual=True), n_estimators=10,
                                max_samples=0.6, max_features=0.6)
    bagging.fit(train_x, train_y)
    print('train done.')
    res = bagging.predict(train_x)
    print(res)
    from sklearn.metrics import roc_auc_score
    score = roc_auc_score(train_y, res)

    res = bagging.predict_proba(train_x)
    print(res)
    score = roc_auc_score(train_y, res[:, 1])
    print(score)
    print('-----------------------------------------')
    print(res[:, 1])

    res = bagging.predict_proba(test_x)
    score = roc_auc_score(test_y, res[:, 1])
    print(score)

    y = bagging.predict_proba(x2)
    output = pd.DataFrame(data={"id": id, "sentiment": y[:, 1]})
    output.to_csv("/home/chuangxin/Bagging_result.csv", index=False, quoting=3)

    return bagging
def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")
def main():
    '''main function'''
    bagging = BaggingClassifier(DecisionTreeClassifier())
    iris = load_iris()
    x = iris.data
    y = iris.target
    #train, test, train_, test_ = train_test_split(x, y, test_size=0.2, random_state=42)
    bagging.fit(x, y)
    bagging.predict(x[:2])
    print(bagging.score(x[:2], y[:2]))
def phenotype_imputation(data, config):
    '''
    Function to impute the labels on II based on the classifier learned on I.

    Parameters
    ----------
    data : an object of class Dataset that contains: genotypes, covariates,
        labels and information about random folds
    config : an object of class ConfigState. It contains the user-entered
        parameters in a YAML format. See the config_file parameter in the
        main script for more details.
    '''
    # Parameters for this task
    num_folds = data.num_folds
    task_name = "phenotype_imputation"
    n_estimators = config.get_entry(task_name, "n_estimators")
    romans_trn = config.get_entry(task_name, "romans_used_for_learning")
    romans_tst = config.get_entry(task_name, "romans_used_for_imputing")

    # Iterate through the folds:
    i = 0
    size_of_two = find_vec_entries_that_contain(data.folds[:, 0],
                                                romans_tst).shape[0]
    soft_labels = np.zeros((size_of_two, num_folds))
    X_scaled = preprocessing.scale(data.clin_covariate.transpose()).transpose()
    fpr = dict()
    tpr = dict()
    thres = dict()
    roc_auc = np.zeros(num_folds)
    for fold in data.folds.transpose():
        logging.info("Fold=%d" % (i + 1))
        sel_trn = find_vec_entries_that_contain(fold, [romans_trn])
        sel_tst = find_vec_entries_that_contain(fold, [romans_tst])
        model = BaggingClassifier(
            base_estimator=linear_model.LogisticRegression(),
            #n_estimators=n_estimators, max_samples=0.632,  # for small set I
            n_estimators=n_estimators, max_samples=0.8,
            max_features=5,
            #bootstrap=True, bootstrap_features=True, oob_score=False,  # for small set I
            bootstrap=False, bootstrap_features=True, oob_score=False,
            n_jobs=1, random_state=None, verbose=0)
        model.fit(X_scaled[:, sel_trn].transpose(),
                  data.labels[:, sel_trn].transpose())
        soft_labels[:, i] = model.predict_proba(
            X_scaled[:, sel_tst].transpose())[:, 1]
        fpr[i], tpr[i], thres[i] = metrics.roc_curve(data.labels[0, sel_tst],
                                                     soft_labels[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])
        i += 1

    # Save the output of this task
    config.save_variable(task_name, "%f", soft_labels=soft_labels,
                         roc_auc=roc_auc)
class BaggingDecisionTrees(object):

    def __init__(self, n_estimators):
        self.classifier = BaggingClassifier(n_estimators=n_estimators)

    def fit(self, xs, ys):
        xs = xs.values
        ys = ys['y']
        self.classifier.fit(xs, ys)

    def predict(self, xs):
        xs = xs.values
        ys = self.classifier.predict(xs)
        return ys
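# Usage sketch for BaggingDecisionTrees; the wrapper expects pandas objects
# (features as a DataFrame, labels as a DataFrame with a 'y' column), which
# is assumed from its use of .values and ys['y'].
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
xs = pd.DataFrame(iris.data)
ys = pd.DataFrame({'y': iris.target})

model = BaggingDecisionTrees(n_estimators=25)
model.fit(xs, ys)
print(model.predict(xs)[:10])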
def main():
    # The competition datafiles are in the directory /input
    # Read output csv format in case the file does not exist
    submit = pd.read_csv('sample_submission.csv')

    # Training cols
    print("Loading training csv.")
    #train_cols = ['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster']
    train_cols = ['site_name', 'user_location_region', 'is_package',
                  'srch_adults_cnt', 'srch_children_cnt',
                  'srch_destination_id', 'hotel_market', 'hotel_country',
                  'hotel_cluster']
    train = pd.DataFrame(columns=train_cols)
    train_chunk = pd.read_csv('input/train.csv', chunksize=100000)
    print("Training csv loaded.")

    # Read each chunk to train
    for chunk in train_chunk:
        #train = pd.concat([train, chunk])
        train = pd.concat([train, chunk[chunk['is_booking'] == 1][train_cols]])
        print("Chunk done")

    # Load each column
    #x_train = train[['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market']].values
    x_train = train[['site_name', 'user_location_region', 'is_package',
                     'srch_adults_cnt', 'srch_children_cnt',
                     'srch_destination_id', 'hotel_market',
                     'hotel_country']].values
    y_train = train['hotel_cluster'].values

    # Run RandomForest on training data
    print("Training RandomForest.")
    rf = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=4)
    bclf = BaggingClassifier(rf, n_estimators=2, n_jobs=4)
    bclf.fit(x_train, y_train)
    print("Training done.")

    print("Loading testing csv.")
    test_chunk = pd.read_csv('input/test.csv', chunksize=100000)
    print("Begin testing each chunk.")
    predict = np.array([])

    # Read each chunk to test
    for i, chunk in enumerate(test_chunk):
        #test_X = chunk[['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market']].values
        test_X = chunk[['site_name', 'user_location_region', 'is_package',
                        'srch_adults_cnt', 'srch_children_cnt',
                        'srch_destination_id', 'hotel_market',
                        'hotel_country']].values
        test_X = np.nan_to_num(test_X)
        if i > 0:
            predict = np.concatenate([predict, bclf.predict_proba(test_X)])
        else:
            predict = bclf.predict_proba(test_X)
        print("Chunk id: " + str(i))

    submit['hotel_cluster'] = np.apply_along_axis(get5Best, 1, predict)
    submit.head()
    submit.to_csv('submission_random_forest.csv', index=False)
def bagging_with_base_estimator(base_estimator, x_train, x_test, y_train,
                                y_test, rands=None):
    """
    Predict the lemons using a Bagging Classifier and a random seed both
    for the number of features, as well as for the size of the sample to
    train the data on

    ARGS:

    - x_train: :class:`pandas.DataFrame` of the x_training data
    - y_train: :class:`pandas.Series` of the y_training data
    - x_test: :class:`pandas.DataFrame` of the x_testing data
    - y_test: :class:`pandas.Series` of the y_testing data
    - rands: a :class:`tuple` of the (rs, rf) to seed the sample and
      features of the BaggingClassifier. If `None`, then rands are
      generated and provided in the return `Series`

    RETURNS:

    :class:`pandas.Series` of the f1-scores and random seeds
    """
    # create a dictionary for the return values
    ret_d = {'train-f1': [], 'test-f1': [], 'rs': [], 'rf': []}

    # use the randoms provided if there are any, otherwise generate them
    if not rands:
        rs = numpy.random.rand()
        rf = numpy.random.rand()
        while rf < 0.1:
            rf = numpy.random.rand()
    else:
        rs, rf = rands[0], rands[1]

    # place them into the dictionary
    ret_d['rs'], ret_d['rf'] = rs, rf

    # create and run the bagging classifier
    bc = BaggingClassifier(base_estimator=base_estimator, n_estimators=300,
                           max_samples=rs, max_features=rf, n_jobs=1)
    bc.fit(x_train, y_train)
    y_hat_train = bc.predict(x_train)
    ret_d['train-f1'] = f1_score(y_train, y_hat_train)
    y_hat_test = bc.predict(x_test)
    ret_d['test-f1'] = f1_score(y_test, y_hat_test)
    return pandas.Series(ret_d)
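# Usage sketch for bagging_with_base_estimator on synthetic data; the
# original "lemons" dataset is not shown here, so make_classification stands
# in for it.
import pandas
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, random_state=0)
x_tr, x_te, y_tr, y_te = train_test_split(pandas.DataFrame(X),
                                          pandas.Series(y), random_state=0)
# let the function draw its own (rs, rf) seeds, then reuse them
result = bagging_with_base_estimator(DecisionTreeClassifier(),
                                     x_tr, x_te, y_tr, y_te)
repeat = bagging_with_base_estimator(DecisionTreeClassifier(), x_tr, x_te,
                                     y_tr, y_te,
                                     rands=(result['rs'], result['rf']))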
def train(data, labels):
    """
    classifier = VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(n_estimators=400, n_jobs=-1)),
        ('ada', AdaBoostClassifier(n_estimators=50,
                                   base_estimator=RandomForestClassifier(
                                       n_estimators=40, n_jobs=-1))),
        ('nc', NearestCentroid())
    ])
    """
    classifier = BaggingClassifier(
        base_estimator=AdaBoostClassifier(
            base_estimator=RandomForestClassifier(n_estimators=40, n_jobs=-1)),
        n_jobs=-1)
    classifier.fit(data, labels)
    return classifier
def TrainKNeighbors(p_subject, p_save):
    print("Welcome to TrainKNeighbors(" + p_subject + ", " + str(p_save) + ")")
    training_data = pd.read_pickle(input_data_paths[p_subject])

    # Ictal vs interictal
    kneighbors = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                   max_features=0.5)
    y = training_data.T["classification"]
    kneighbors.fit(training_data[:-2].T, y)

    # Save models
    if p_save:
        model_save_filename = "/Users/dryu/Documents/DataScience/Seizures/data/models/KN_" + p_subject + ".pkl"
        model_save_file = open(model_save_filename, 'wb')  # pickle needs binary mode
        pickle.dump(kneighbors, model_save_file)
        model_save_file.close()
    return {"simultaneous": kneighbors}
def train_bagging():
    model = build_model()
    bagging_model = BaggingClassifier(base_estimator=model,
                                      n_estimators=bagging_num_estimator,
                                      max_samples=bagging_sample_fraction,
                                      oob_score=bagging_use_oob)
    # train model
    bagging_model.fit(XC, yc)

    # persist model
    if persist_model:
        models = bagging_model.estimators_
        for m in zip(range(0, len(models)), models):
            model_file = model_file_directory + "/" + model_file_prefix + "_" + str(m[0] + 1) + ".mod"
            joblib.dump(m[1], model_file)

    score = bagging_model.score(XC, yc)
    print("average error %.3f" % (1.0 - score))
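# Hypothetical companion to train_bagging above: reload the individually
# persisted member estimators and majority-vote their predictions. The
# model_file_directory / model_file_prefix names are assumed from the
# training code; this is a sketch, not part of the original module.
import glob
import numpy as np
from joblib import load
from scipy import stats

def predict_from_saved_members(X):
    members = [load(f) for f in
               sorted(glob.glob(model_file_directory + "/" +
                                model_file_prefix + "_*.mod"))]
    votes = np.array([m.predict(X) for m in members])
    # majority vote across the reloaded members, column by column
    return stats.mode(votes, axis=0).mode.ravel()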
class ShapeletForestClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 n_estimators=100,
                 max_depth=None,
                 min_samples_split=2,
                 n_shapelets=10,
                 min_shapelet_size=0,
                 max_shapelet_size=1,
                 metric='euclidean',
                 metric_params=None,
                 bootstrap=True,
                 n_jobs=None,
                 random_state=None):
        """A shapelet forest classifier"""
        self.n_estimators = n_estimators
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_shapelets = n_shapelets
        self.min_shapelet_size = min_shapelet_size
        self.max_shapelet_size = max_shapelet_size
        self.metric = metric
        self.metric_params = metric_params
        self.random_state = random_state

    def predict(self, X, check_input=True):
        return self.classes_[np.argmax(self.predict_proba(
            X, check_input=check_input), axis=1)]

    def predict_proba(self, X, check_input=True):
        # Correct formatting of X
        if len(X.iloc[0]) == 1:  # univariate
            X = [np.array(X.iloc[i].iloc[0]).tolist()
                 for i in range(0, len(X))]
        else:  # multivariate
            X = [[np.array(X.iloc[i].iloc[j]).tolist()
                  for j in range(0, len(X.iloc[i]))]
                 for i in range(0, len(X))]

        if check_input:
            X = check_array(X, dtype=np.float64, allow_nd=True, order="C")

        if X.ndim < 2 or X.ndim > 3:
            raise ValueError("illegal input dimensions X.ndim ({})".format(
                X.ndim))

        if self.n_dims_ > 1 and X.ndim != 3:
            raise ValueError("illegal input dimensions X.ndim != 3")

        if X.shape[-1] != self.n_timestep_:
            raise ValueError("illegal input shape ({} != {})".format(
                X.shape[-1], self.n_timestep_))

        if X.ndim > 2 and X.shape[1] != self.n_dims_:
            raise ValueError("illegal input shape ({} != {})".format(
                X.shape[1], self.n_dims_))

        if X.dtype != np.float64 or not X.flags.contiguous:
            X = np.ascontiguousarray(X, dtype=np.float64)

        X = X.reshape(X.shape[0], self.n_dims_ * self.n_timestep_)
        return self.bagging_classifier_.predict_proba(X)

    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit a random shapelet forest classifier"""
        # Correct formatting of X
        if len(X.iloc[0]) == 1:  # univariate
            X2 = [np.array(X.iloc[i].iloc[0]).tolist()
                  for i in range(0, len(X))]
        else:  # multivariate
            X2 = [[np.array(X.iloc[i].iloc[j]).tolist()
                   for j in range(0, len(X.iloc[i]))]
                  for i in range(0, len(X))]

        random_state = check_random_state(self.random_state)
        if check_input:
            X = check_array(X2, dtype=np.float64, allow_nd=True, order="C")
            y = check_array(y, ensure_2d=False)

        if X.ndim < 2 or X.ndim > 3:
            raise ValueError("illegal input dimension")

        n_samples = X.shape[0]
        self.n_timestep_ = X.shape[-1]
        if X.ndim > 2:
            n_dims = X.shape[1]
        else:
            n_dims = 1

        self.n_dims_ = n_dims

        if y.ndim == 1:
            self.classes_, y = np.unique(y, return_inverse=True)
        else:
            _, y = np.nonzero(y)
            if len(y) != n_samples:
                raise ValueError("Single label per sample expected.")
            self.classes_ = np.unique(y)

        if len(y) != n_samples:
            raise ValueError("Number of labels={} does not match "
                             "number of samples={}".format(len(y), n_samples))

        if X.dtype != np.float64 or not X.flags.contiguous:
            X = np.ascontiguousarray(X, dtype=np.float64)

        if not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=np.intp)

        shapelet_tree_classifier = ShapeletTreeClassifier(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            n_shapelets=self.n_shapelets,
            min_shapelet_size=self.min_shapelet_size,
            max_shapelet_size=self.max_shapelet_size,
            metric=self.metric,
            metric_params=self.metric_params,
            random_state=random_state,
        )

        if n_dims > 1:
            shapelet_tree_classifier.force_dim = n_dims

        self.bagging_classifier_ = BaggingClassifier(
            base_estimator=shapelet_tree_classifier,
            bootstrap=self.bootstrap,
            n_jobs=self.n_jobs,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        X = X.reshape(n_samples, n_dims * self.n_timestep_)
        self.bagging_classifier_.fit(X, y, sample_weight=sample_weight)
        return self
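# Usage sketch for ShapeletForestClassifier; it expects a nested pandas
# DataFrame with one time series per cell (the sktime-style format implied
# by the X.iloc[i].iloc[j] access above). ShapeletTreeClassifier must be
# importable from the surrounding package; the toy data is invented here.
import numpy as np
import pandas as pd

rs = np.random.RandomState(0)
series = [pd.Series(rs.randn(50)) for _ in range(20)]  # 20 univariate series
X_nested = pd.DataFrame({'dim_0': series})
y = rs.randint(0, 2, size=20)

forest = ShapeletForestClassifier(n_estimators=10, random_state=0)
forest.fit(X_nested, y)
print(forest.predict(X_nested.iloc[:5]))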
accuracy_score(y_test, pre_rf)  # check the accuracy

# # bagging

# In[40]:

from sklearn.ensemble import BaggingClassifier

# In[41]:

bg = BaggingClassifier(RandomForestClassifier(), n_estimators=20,
                       max_features=1.0, max_samples=0.5)
bg.fit(x_train, y_train)  # fitting the model

# In[42]:

pre_bag = bg.predict(x_test)  # predicting the results

# In[43]:

accuracy_score(y_test, pre_bag)

# # ada boosting

# In[44]:

from sklearn.ensemble import AdaBoostClassifier
# The opening of this statement was truncated; a bagged SVC is assumed from
# the keyword arguments and the commented-out alternative below.
sv = BaggingClassifier(svm.SVC(probability=True,
                               decision_function_shape='ovo',
                               class_weight='balanced', C=100, gamma=0.1),
                       n_jobs=4)
# sv = svm.SVC(probability=True, class_weight='balanced', random_state=42, C=100, gamma=0.1)

# X_sv = train_dataset_full[train_dataset_full['type'] != 6.0].drop(columns=['type', 'session'])
# y_sv = train_dataset_full[train_dataset_full['type'] != 6.0]['type']

X_sv = train_dataset_full.drop(columns=['type', 'session'])
y_sv = train_dataset_full['type']
# print(y_sv.isna())

#%%
mod_sv = sv.fit(X_sv, y_sv)

#%%
# sv.estimators_

#%%
# del train_dataset_full
with open('svm_trained_with_type_6.pkl', 'wb') as handle:
    pkl.dump(mod_sv, handle, protocol=-1)

#%%
svm_model = mod_sv
# svm_model = pkl.load(open('svm_trained_paper.pkl', 'rb'))
test_svm_full = pd.concat(test_svm.values)
svm_predicted = svm_model.predict_proba(
    test_svm_full.drop(columns=['type', 'session']).dropna())
X_train = count_vect.transform(X_train1)

clf = MLPClassifier(alpha=1, random_state=65)
clf.fit(X_train, y_train)

clf2 = SVC(probability=True, gamma=2, C=1)
clf2.fit(X_train, y_train)

clf3 = DecisionTreeClassifier(random_state=0)
clf3.fit(X_train, y_train)

clf4 = PassiveAggressiveClassifier()
clf4.fit(X_train, y_train)

clf5 = BaggingClassifier(random_state=54)
clf5.fit(X_train, y_train)

clf6 = ExtraTreesClassifier(random_state=0)
clf6.fit(X_train, y_train)

clf7 = GradientBoostingClassifier(random_state=32)
clf7.fit(X_train, y_train)

vc = VotingClassifier(estimators=[
    ('mlp', clf), ('dt', clf3), ('et', clf6), ('bag', clf5), ('grad', clf7)
], voting='soft', weights=[0.3, 0.1, 0.2, 0.1, 0.3])
vc.fit(X_train, y_train)

predicted = clf.predict(X_test)
predicted2 = clf2.predict(X_test)
predicted3 = clf3.predict(X_test)
#--------------------------------------------------------------------------------#
## Evaluate Bagging performance
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)

## Decision Tree classifier
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
acc_test = accuracy_score(y_pred, y_test)
print("Test set accuracy of dt: {:.2f}".format(acc_test))

# Fit bc to the training set
bc.fit(X_train, y_train)

# Predict test set labels
y_pred = bc.predict(X_test)

# Evaluate acc_test
acc_test = accuracy_score(y_pred, y_test)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))
print("-" * 38)
#--------------------------------------------------------------------------------#

#--------------------------------------------------------------------------------#
## Out of Bag Evaluation
## Prepare the ground
# Instantiate dt
y_pred6 = model6.predict(x_test)
accuracy6 = accuracy_score(y_test, y_pred6)
print("AdaBoost Accuracy: %.2f%%" % (accuracy6 * 100.0))

# ****** 7) Bagging ********************
from sklearn.ensemble import BaggingClassifier
tree7 = DecisionTreeClassifier(criterion='entropy')
model7 = BaggingClassifier(base_estimator=tree7,
                           n_estimators=60,
                           max_samples=1.0,
                           max_features=1.0,
                           bootstrap=True,
                           bootstrap_features=False,
                           n_jobs=1,
                           random_state=1)
model7.fit(x_train, y_train)
y_pred7 = model7.predict(x_test)
accuracy7 = accuracy_score(y_test, y_pred7)
print("Bagging Accuracy: %.2f%%" % (accuracy7 * 100.0))

# ****** 8) Random Forest ********************
from sklearn.ensemble import RandomForestClassifier
model8 = RandomForestClassifier(n_estimators=60, random_state=0, n_jobs=-1)
model8.fit(x_train, y_train)
y_pred8 = model8.predict(x_test)
accuracy8 = accuracy_score(y_test, y_pred8)
print("Random Forest Accuracy: %.2f%%" % (accuracy8 * 100.0))

# ****** 9) XGBoost ********************
from xgboost import XGBClassifier
model9 = XGBClassifier()
enc = LabelEncoder()
enc.fit(y)
y = enc.fit_transform(y)

X = df.iloc[:, :6]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

dectree = tree.DecisionTreeClassifier(max_depth=5)
bag = BaggingClassifier(n_estimators=100, oob_score=True)
rf = RandomForestClassifier(n_estimators=1000, oob_score=True,
                            max_features='auto')
boost = AdaBoostClassifier(n_estimators=1000)

dectree.fit(X_train, y_train)
bag.fit(X_train, y_train)
rf.fit(X_train, y_train)
boost.fit(X_train, y_train)

print('Tree', 'Bagging', 'Boosting', 'Random Forest\n',
      np.round_(dectree.score(X_test, y_test), 2),
      np.round_(bag.score(X_test, y_test), 2),
      np.round_(boost.score(X_test, y_test), 2),
      np.round_(rf.score(X_test, y_test), 2),
      '\nTraining error\n',
      np.round_(dectree.score(X_train, y_train), 2),
      np.round_(bag.score(X_train, y_train), 2),
      np.round_(boost.score(X_train, y_train), 2),
      np.round_(rf.score(X_train, y_train), 2))

print('RF out-of-bag error:\n', 1 - rf.oob_score_)
print('Bagging out-of-bag error:\n', 1 - bag.oob_score_)

print(
    pd.DataFrame(rf.feature_importances_,
def test_warm_start_with_oob_score_fails():
    # Check using oob_score and warm_start simultaneously fails
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
    with pytest.raises(ValueError):
        clf.fit(X, y)
def model_train(model_type, X_train_, X_valid, y_train_, y_valid):
    """
    tree, lightgbm, xgboost, catboost, randomforest, adaboost, logit, knn,
    gmm, svm, lda, naivebayes
    """
    if model_type == "tree":
        treeclf = DecisionTreeClassifier(max_depth=7)
        treeclf.fit(X_train_, y_train_)
        pred_model = treeclf
        del treeclf
    elif model_type == "bagging":
        bagclf = BaggingClassifier(KNeighborsClassifier(),
                                   max_samples=0.5, max_features=0.5)
        bagclf.fit(X_train_, y_train_)
        pred_model = bagclf
        del bagclf
    elif model_type == "lightgbm":  # 0.8711 --> 22 minutes
        # dtrain = lgb.Dataset(X_train, label=y_train)  #, categorical_feature=categorical_columns)
        # dvalid = lgb.Dataset(X_valid, label=y_valid)  #, categorical_feature=categorical_columns)
        lgbclf = lgb.LGBMClassifier(
            num_leaves=512,        # was 512 - default 31
            n_estimators=512,      # default 100, was 512
            max_depth=8,           # default -1, was 9
            learning_rate=0.1,     # default 0.1
            feature_fraction=0.4,  # default 1, was 0.4
            bagging_fraction=0.4,  # default 1, was 0.4 - subsample by row
            metric="auc",          # binary_logloss / auc
            boosting_type="gbdt",  # goss / dart --> speed: goss > gbdt > dart
            lambda_l1=0.4,         # default 0 - 0.4
            lambda_l2=0.6,         # default 0 - 0.6
            scale_pos_weight=18,   # default 1
        )
        lgbclf.fit(X_train_, y_train_)
        pred_model = lgbclf
        del lgbclf
    elif model_type == "xgboost":  # very slow, 0.8614
        # scale_pos_weight and adjust settings
        # https://stats.stackexchange.com/questions/243207/what-is-the-proper-usage-of-scale-pos-weight-in-xgboost-for-imbalanced-datasets
        xgbclf = xgb.XGBClassifier(
            num_leaves=512,
            n_estimators=512,
            max_depth=25,
            learning_rate=0.1,
            feature_fraction=0.4,
            bagging_fraction=0.4,
            subsample=0.85,
            metric="auc",  # binary_logloss
            colsample_bytree=0.85,
            boosting_type="gbdt",  # goss / dart --> speed: goss > gbdt > dart
            reg_alpha=0.4,
            reg_lambda=0.6,
            scale_pos_weight=82.9,
        )
        xgbclf.fit(X_train_, y_train_)
        pred_model = xgbclf
        del xgbclf
    elif model_type == "catboost":
        # should this be done for the other models as well?
        ycopy = y_train_.copy()
        ycopy["target_class"] = ycopy["target_class"].apply(
            lambda x: 1 if (x >= 0.5) else 0)
        X_train_1, X_valid_1, y_train_1, y_valid_1 = train_test_split(
            X_train_, ycopy.values.flatten(), test_size=0.05)
        params = {
            "loss_function": "Logloss",  # objective function
            "eval_metric": "AUC",        # metric
            "verbose": 200,  # print info about training every 200 iterations
        }
        catclf = catboost.CatBoostClassifier(**params)
        catclf.fit(
            X_train_1, y_train_1,  # data to train on (required, unless X is a pool object)
            eval_set=(X_valid_1, y_valid_1),  # data to validate on
            use_best_model=True,  # drop trees created after the iteration with the best validation score
            plot=True,  # visualize the training process (not shown in a published kernel)
        )
        del X_train_1, X_valid_1, y_train_1, y_valid_1
        pred_model = catclf
        del catclf
    elif model_type == "randomforest":  # 0.8476
        # why is a single prediction associated with 1.6???
        # https://towardsdatascience.com/an-implementation-and-explanation-of-the-random-forest-in-python-77bf308a9b76
        rfclf = RandomForestClassifier(n_estimators=512, bootstrap=True,
                                       max_features="sqrt")
        rfclf.fit(X_train_, y_train_)
        pred_model = rfclf
        del rfclf
    elif model_type == "adaboost":  # 0.851 --> 8:16:45 hours
        # https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781787286382/9/ch09lvl1sec95/tuning-an-adaboost-regressor
        # https://towardsdatascience.com/boosting-algorithm-adaboost-b6737a9ee60c
        adaclf = AdaBoostClassifier(n_estimators=512, learning_rate=0.0069)
        adaclf.fit(X_train_, y_train_)
        pred_model = adaclf
        del adaclf
    elif model_type == "logit":
        # 0.7764 --> 12:52 minutes without GridSearch; with GridSearch 63.8%
        # why is a single prediction associated with 1.6??? with grid search 0.5
        logregclf = LogisticRegression(penalty="l1", solver="saga", tol=1e-3)
        pipe = Pipeline([("model", logregclf)])
        param_grid = {"model__max_iter": [1000]}
        # adding grid search to logit
        logregclf_cv = GridSearchCV(pipe, param_grid=param_grid,
                                    scoring="roc_auc", cv=3)
        logregclf_cv.fit(X_train_, y_train_)
        # print('best_params_={}\nbest_score_={}'.format(repr(logregclf_cv.best_params_), repr(logregclf_cv.best_score_)))
        logregclf = logregclf_cv.best_estimator_
        pred_model = logregclf
        del logregclf
    elif model_type == "knn":  # 0.612, time: 3:21:59.613695
        # https://www.quora.com/How-can-I-choose-the-best-K-in-KNN-K-nearest-neighbour-classification
        knnclf = KNeighborsClassifier(n_neighbors=3, leaf_size=30)  # , 'p': 1
        knnclf.fit(X_train_, y_train_)
        pred_model = knnclf
        del knnclf
    elif model_type == "gmm":
        # https://www.kaggle.com/albertmistu/detect-anomalies-using-gmm
        gmmclf = GaussianMixture()  # gaussian mixture model
        ycopy = y_train_.copy()
        ycopy["target_class"] = ycopy["target_class"].apply(
            lambda x: 1 if (x >= 0.5) else 0)
        gmmclf.fit(X_train_, ycopy)
        pred_model = gmmclf
        del gmmclf
    elif model_type == "svm":
        # https://www.kaggle.com/kojr1234/fraud-detection-using-svm
        svcclf = SVC(kernel="rbf", gamma=4 * 1e-3, C=10)
        svcclf.fit(X_train_, y_train_)
        pred_model = svcclf
        del svcclf
    elif model_type == "lda":
        ldaclf = LinearDiscriminantAnalysis()
        ldaclf.fit(X_train_, y_train_)
        pred_model = ldaclf
        del ldaclf
    elif model_type == "naivebayes":
        gnbclf = GaussianNB()  # priors=[0.995, 0.005]
        gnbclf.fit(X_train_, y_train_)
        pred_model = gnbclf
        del gnbclf
    else:
        print("Please, try one of the possible models")

    del X_train_, y_train_
    print("finish train")
    return pred_model, X_valid.copy(), y_valid.copy()
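# Usage sketch for model_train on synthetic data; plain numpy arrays are
# enough for the "bagging" branch (the 'target_class' column is only needed
# by the catboost/gmm branches, which expect a pandas DataFrame).
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

Xs, ys = make_classification(n_samples=400, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(Xs, ys, test_size=0.25,
                                          random_state=0)
pred_model, X_va_out, y_va_out = model_train("bagging", X_tr, X_va,
                                             y_tr, y_va)
print(pred_model.score(X_va_out, y_va_out))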
def main():
    ###########################################################################
    # Preparing the dataset
    # ---------------------
    # In this part we load the breast cancer dataset from scikit-learn and
    # preprocess it in order to pass it to the DS models. An important point
    # here is to normalize the data so that it has zero mean and unit
    # variance, which is a common requirement for many machine learning
    # algorithms. This step can be easily done using the StandardScaler class.

    rng = np.random.RandomState(123)
    data = load_breast_cancer()
    X = data.data
    y = data.target

    # split the data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=rng)

    # Scale the variables to have 0 mean and unit variance
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Split the data into training and DSEL for DS techniques
    X_train, X_dsel, y_train, y_dsel = train_test_split(X_train, y_train,
                                                        test_size=0.5,
                                                        random_state=rng)

    # Train a pool of 100 base classifiers
    pool_classifiers = BaggingClassifier(Perceptron(max_iter=10),
                                         n_estimators=100, random_state=rng)
    pool_classifiers.fit(X_train, y_train)

    # Initialize the DS techniques
    knorau = KNORAU(pool_classifiers)
    kne = KNORAE(pool_classifiers)
    desp = DESP(pool_classifiers)
    ola = OLA(pool_classifiers)
    mcb = MCB(pool_classifiers)

    ###########################################################################
    # Calibrating base classifiers
    # -----------------------------
    # Some dynamic selection techniques require the base classifiers to
    # estimate probabilities in order to estimate their competence level.
    # Since the Perceptron model is not a probabilistic classifier (it does
    # not implement the predict_proba method), it needs to be calibrated for
    # probability estimation before being used by such DS techniques. This
    # step can be conducted using the CalibratedClassifierCV class from
    # scikit-learn. Note that in this example we pass a prefitted pool of
    # classifiers to the calibration method in order to use exactly the same
    # pool used in the other DS methods.
    calibrated_pool = []
    for clf in pool_classifiers:
        calibrated = CalibratedClassifierCV(base_estimator=clf, cv='prefit')
        calibrated.fit(X_dsel, y_dsel)
        calibrated_pool.append(calibrated)

    apriori = APriori(calibrated_pool)
    meta = METADES(calibrated_pool)

    knorau.fit(X_dsel, y_dsel)
    kne.fit(X_dsel, y_dsel)
    desp.fit(X_dsel, y_dsel)
    ola.fit(X_dsel, y_dsel)
    mcb.fit(X_dsel, y_dsel)
    apriori.fit(X_dsel, y_dsel)
    meta.fit(X_dsel, y_dsel)

    ###########################################################################
    # Evaluating the methods
    # -----------------------
    # Let's now evaluate the methods on the test set. We also use the
    # performance of Bagging (pool of classifiers without any selection) as a
    # baseline comparison. We can see that the majority of DS methods achieve
    # higher classification accuracy.
    print('Evaluating DS techniques:')
    print('Classification accuracy KNORA-Union: ',
          knorau.score(X_test, y_test))
    print('Classification accuracy KNORA-Eliminate: ',
          kne.score(X_test, y_test))
    print('Classification accuracy DESP: ', desp.score(X_test, y_test))
    print('Classification accuracy OLA: ', ola.score(X_test, y_test))
    print('Classification accuracy A priori: ', apriori.score(X_test, y_test))
    print('Classification accuracy MCB: ', mcb.score(X_test, y_test))
    print('Classification accuracy META-DES: ', meta.score(X_test, y_test))
    print('Classification accuracy Bagging: ',
          pool_classifiers.score(X_test, y_test))
# =============================================================================
# # Bagging Classifier
# =============================================================================
# Instantiate dt
dt = DecisionTreeClassifier(random_state=6)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, bootstrap=True, n_estimators=60,
                       random_state=6)

# Fit bc to the training set
bc.fit(x_train, y_train)

# Predict test set labels
y_pred = bc.predict(x_test)

# Evaluate training and test acc score.
print("")
print("Bagging result :-")
print("Training Accuracy: {:.3f}".format(bc.score(x_train, y_train)))
print("Testing Accuracy: {:.3f}".format(bc.score(x_test, y_test)))

# =============================================================================
# # Random Forest Classifier
# =============================================================================
# Instantiate a RandomForest 'rf'
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
import numpy as np
from sklearn import svm
from sklearn.model_selection import KFold
from randomforest_featureselection import clf, xtrain, ytrain, xtest, ytest, X_important_train, X_important_test
from sklearn.metrics import accuracy_score

from sklearn.ensemble import BaggingClassifier
clf_imp = BaggingClassifier(svm.SVC(kernel='linear', C=1).fit(xtrain, ytrain))
clf_imp.fit(X_important_train, ytrain)

from sklearn.model_selection import cross_val_score, KFold

n_folds = []
n_folds.append(('K2', 2))
n_folds.append(('K4', 4))
n_folds.append(('K5', 5))
n_folds.append(('K10', 10))

seed = 7
for name, n_split in n_folds:
    results = []
    names = []
    print(name)
    # shuffle=True so that random_state actually affects the fold assignment
    kfold = KFold(n_splits=n_split, shuffle=True, random_state=seed)
    cv_results = cross_val_score(clf_imp, X_important_train, ytrain,
mask_threshold_0 = y_proba[:, 0] >= 0.46
y_proba[mask_threshold_0, :] = 0

mask_threshold_1 = y_proba[:, 1] >= 0.5
y_proba[mask_threshold_1, :] = 1

y_pred = y_proba[:, 0]

df_score_filter_methods.loc['rf', 'with ROC Curve'] = f1_score(
    y_test, y_pred, average=None)[0]

#%% Bootstrap Aggregating
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier(clf_rf, n_estimators=100, max_samples=0.7,
                            max_features=0.15, bootstrap_features=True)
y_pred = bagging.fit(x_train, y_train).predict(x_test)
df_score_filter_methods.loc['rf', 'with Bagging'] = my_f1_score(y_test, y_pred)

#%% Boosting
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

clf_boost = AdaBoostClassifier(clf.best_estimator_, n_estimators=500)
y_pred = clf_boost.fit(x_train, y_train).predict(x_test)
df_score_filter_methods.loc['rf', 'AdaBoostClassifier'] = my_f1_score(y_test, y_pred)

#%% Blending
from sklearn.ensemble import VotingClassifier

clf_voting = VotingClassifier(estimators=[
    ('clf_boost', clf_boost),
    ('clf.best_estimator_', clf.best_estimator_),
    ('clf_rf', clf_rf), ('clf_neigh', clf_neigh), ('clf_svc', clf_svc)],
    voting='hard')  # 'soft'
def fit(self, df_X, df_y):
    logger.info("Fitting LightningClassification")

    if not df_y.shape[0] == df_X.shape[0]:
        raise ValueError("number of regions is not equal")
    if df_y.shape[1] != 1:
        raise ValueError("y needs to have 1 label column")

    if self.scale:
        # Scale motif scores
        df_X[:] = scale(df_X, axis=0)

    idx = list(range(df_y.shape[0]))
    y = df_y.iloc[idx]
    X = df_X.loc[y.index].values
    y = y.values.flatten()

    # Convert (putative) string labels
    label = LabelEncoder()
    y = label.fit_transform(y)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    logger.debug("Setting parameters through cross-validation")
    # Determine best parameters based on CV
    self.clf.fit(X_train, y_train)

    logger.debug("Average score ({} fold CV): {}".format(
        self.kfolds, self.clf.score(X_test, y_test)))

    logger.debug("Estimate coefficients using bootstrapping")

    # Estimate coefficients using bootstrapping
    # b = BaggingClassifier(self.clf.best_estimator_,
    #                       max_samples=0.75, n_jobs=-1, random_state=state)
    b = BaggingClassifier(self.clf.best_estimator_,
                          max_samples=0.75, n_jobs=-1)
    b.fit(X, y)

    # Get mean coefficients
    coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0)

    # Create dataframe of predicted coefficients
    if len(label.classes_) == 2:
        self.act_ = pd.DataFrame(np.hstack((-coeffs.T, coeffs.T)))
    else:
        self.act_ = pd.DataFrame(coeffs.T)

    # Convert labels back to original names
    self.act_.columns = label.inverse_transform(range(len(label.classes_)))
    self.act_.index = df_X.columns

    if self.permute:
        # Permutations
        logger.debug("Permutations")
        random_dfs = []
        for _ in range(10):
            y_random = np.random.permutation(y)
            b.fit(X, y_random)
            coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0)

            if len(label.classes_) == 2:
                random_dfs.append(
                    pd.DataFrame(np.hstack((-coeffs.T, coeffs.T))))
            else:
                random_dfs.append(pd.DataFrame(coeffs.T))
        random_df = pd.concat(random_dfs)

        # Select cutoff based on percentile
        high_cutoffs = random_df.quantile(0.99)
        low_cutoffs = random_df.quantile(0.01)

        # Set significance
        self.sig_ = pd.DataFrame(index=df_X.columns)
        self.sig_["sig"] = False

        for col, c_high, c_low in zip(self.act_.columns,
                                      high_cutoffs, low_cutoffs):
            self.sig_["sig"].loc[self.act_[col] >= c_high] = True
            self.sig_["sig"].loc[self.act_[col] <= c_low] = True
    logger.info("Done")
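# A minimal standalone sketch of the bootstrap-coefficient idea used above:
# bag a linear model, then average the per-member coefficients to get a more
# stable estimate. Written against plain scikit-learn; names are illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=300, n_features=10,
                                     random_state=0)
bag = BaggingClassifier(LogisticRegression(max_iter=1000),
                        max_samples=0.75, n_estimators=50, random_state=0)
bag.fit(X_demo, y_demo)
mean_coefs = np.array([est.coef_ for est in bag.estimators_]).mean(axis=0)
print(mean_coefs.shape)  # (1, n_features) for the binary case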
# Accuracy percentages
print("Using NuSVC the accuracy rate is ", np.mean(scoresNu) * 100)
print("Using SVC the accuracy rate is ", np.mean(scoresSvc) * 100)

# Validation matrices
print("SVM - Nu matrix: ", matrizCruzada(predsvNu))
print("SVM - SVC matrix: ", matrizCruzada(predsvSvc))

# Fourth algorithm: ENSEMBLE METHODS
# Bagging meta-estimator
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                            max_features=0.5)
bagging.fit(data_train, target_train)
preBag = bagging.predict(data_test)
scoresBag = cross_val_score(bagging, atributos, target, cv=5,
                            scoring='accuracy')

# Random Forests
forests = RandomForestClassifier(n_estimators=10, max_depth=None,
                                 min_samples_split=2, random_state=0)
forests.fit(data_train, target_train)
preFo = forests.predict(data_test)
scoresFo = cross_val_score(forests,
a5 = metrics.accuracy_score(labels_test, pred5)
q1.append(("MLP", {
    "alpha": 0.05,
    "solver": "adam",
    "batch_size": 800,
    "max_iter": 200,
    "beta_1": 0.85,
    "beta_2": 0.7
}, a5))

# bagging
bagging = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(),
                            n_estimators=9, max_samples=20586,
                            max_features=17)
start = time.perf_counter()
bagging.fit(datasets, l)
pred6 = bagging.predict(datasets_test)
end = time.perf_counter()
t1.append(end - start)
a6 = metrics.accuracy_score(labels_test, pred6)
q1.append(("bagging", {
    "base_estimator": "tree.DecisionTreeClassifier()",
    "n_estimators": 9,
    "max_samples": 20586,
    "max_features": 17
}, a6))

# draw table and show each classifier's parameters, accuracy and training time
table = Texttable()
table.add_rows([
    ["classifier", "parameters", "accuracy", "training time"],
for j in range(1, 100):
    X_train_aug = np.concatenate((X_train_aug, np.roll(X_train, j, axis=1)))
    X_train_aug = np.concatenate((X_train_aug, -np.roll(X_train, j, axis=1)))
    y_train_aug = np.concatenate((y_train_aug, y_train))
    y_train_aug = np.concatenate((y_train_aug, y_train))

# Apply data augmentation on testing data
X_test_aug, y_test_aug = X_test, y_test
X_test_aug = np.concatenate((X_test_aug, -X_test))
y_test_aug = np.concatenate((y_test_aug, y_test))
for j in range(1, 100):
    X_test_aug = np.concatenate((X_test_aug, np.roll(X_test, j, axis=1)))
    X_test_aug = np.concatenate((X_test_aug, -np.roll(X_test, j, axis=1)))
    y_test_aug = np.concatenate((y_test_aug, y_test))
    y_test_aug = np.concatenate((y_test_aug, y_test))

# Fit the model
clf.fit(X_train_aug, y_train_aug)
train_score = clf.score(X_test, y_test)
train_score_aug = clf.score(X_test_aug, y_test_aug)

# Save the score
scores = np.append(scores, train_score)
scores_aug = np.append(scores_aug, train_score_aug)

# Print final score
with open('ris/OUT-score_alglorithms.txt', mode='a') as f:
    print('Average score:', scores.mean(), '+-',
          scores.std() / np.sqrt(n_splits), file=f)
    print('Average score (augmented):', scores_aug.mean(), '+-',
          scores_aug.std() / np.sqrt(n_splits), file=f)

params = {'chat_id': telegram_bot_id['chat_id'],
          'text': '[python] End Bagging Classifier k-fold validation.'}
requests.post('https://api.telegram.org/' + telegram_bot_id['bot_id'] +
              '/sendMessage', params=params)

params = {'chat_id': telegram_bot_id['chat_id'],
          'text': '[python] End data augmentation part.'}
test_prediction = model.predict(X_test)

# build a submit table format
submit_table = y_test
submit_table.buy_next_day = test_prediction
submit_table = submit_table.loc[submit_table.buy_next_day == 1.0]
submit_pair = submit_table.reset_index(level=[0, 1, 2])
submit_pair = submit_pair.loc[:, ['user_id', 'item_id']]
submit_pair.user_id = submit_pair.user_id.apply(str)
submit_pair.item_id = submit_pair.item_id.apply(str)
submit_pair.to_csv('tianchi_mobile_recommendation_predict.csv', index=False)

# train the model with linear svm
n_estimators = 10
model_svm = BaggingClassifier(LinearSVC(class_weight='balanced'),
                              max_samples=1.0 / n_estimators,
                              n_estimators=n_estimators)
res_svm = model_svm.fit(X_train, y_train.values.ravel())

# train prediction
train_prediction_svm = model_svm.predict(X_train)
train_accuracy_svm = np.mean(train_prediction_svm == y_train.buy_next_day.values)
train_f1_svm, train_precision_svm, train_recall_svm = F1(
    y_train.buy_next_day.values, train_prediction_svm)

# cv prediction
cv_prediction_svm = model_svm.predict(X_cv)
cv_accuracy_svm = np.mean(cv_prediction_svm == y_cv.buy_next_day.values)
cv_f1_svm, cv_precision_svm, cv_recall_svm = F1(y_cv.buy_next_day.values,
                                                cv_prediction_svm)

# local test prediction
local_test_prediction_svm = model_svm.predict(X_local_test)
local_test_accuracy_svm = np.mean(
    local_test_prediction_svm == y_local_test.buy_next_day.values)
local_test_f1_svm, local_test_precision_svm, local_test_recall_svm = F1(
    y_local_test.buy_next_day.values, local_test_prediction_svm)

# test prediction submit
test_prediction_svm = model_svm.predict(X_test)  # the bagged SVM, not the earlier model
for n in range(1, 30):
    my_bgc = MyBaggingClassifier(tree_clf, n_estimators=n, max_samples=110,
                                 max_features=10)
    my_bgc.fit(bigDataset_X_train, bigDataset_Y_train)
    y = my_bgc.predict(bigDataset_X_test)
    myBaggingClassifierError.append(accuracy_score(y, bigDataset_Y_test))

baggingClassifierError = []
for n in range(1, 30):
    bgc = BaggingClassifier(tree_clf, n_estimators=n, max_samples=110,
                            max_features=10)
    bgc.fit(smallDataset_X_train, smallDataset_Y_train)
    y = bgc.predict(smallDataset_X_test)
    baggingClassifierError.append(accuracy_score(y, smallDataset_Y_test))

randomForestClassifierError = []
for n in range(1, 30):
    bgc = RandomForestClassifier(n_estimators=n, max_features=8)
    bgc.fit(bigDataset_X_train, bigDataset_Y_train)
    y = bgc.predict(bigDataset_X_test)
    randomForestClassifierError.append(accuracy_score(y, bigDataset_Y_test))

gradientBoostingClassifierError = []
for n in range(1, 30):
    bgc = GradientBoostingClassifier(n_estimators=n, max_features=8)
    bgc.fit(bigDataset_X_train, bigDataset_Y_train)
    y = bgc.predict(bigDataset_X_test)
# Naive Bayes classifier
classifierNB = GaussianNB()
classifierNB.fit(x_train, y_train)
pred = classifierNB.predict(X_ul)

# XGBoost classifier
classifierXGB = XGBClassifier(n_estimators=20, n_jobs=-1)
classifierXGB.fit(x_train, y_train)
pred = classifierXGB.predict(X_ul)

# Bagging Classifier
classifierBG = BaggingClassifier(tree.DecisionTreeClassifier(),
                                 n_estimators=20, n_jobs=-1)
classifierBG.fit(x_train, y_train)
pred = classifierBG.predict(X_ul)

# Gradient Boosting Classifier
classifierGB = GradientBoostingClassifier(n_estimators=20, learning_rate=1.0,
                                          max_depth=1).fit(x_train, y_train)
pred = classifierGB.predict(X_ul)

# Adaboost classifier
classifierAB = AdaBoostClassifier(base_estimator=RandomForestClassifier(
    n_estimators=20, criterion='entropy', n_jobs=-1), n_estimators=20)
classifierAB.fit(x_train, y_train)
pred = classifierAB.predict(X_ul)
myGBDT = GradientBoostingClassifier()
myBagging = BaggingClassifier(SVC(C=0.5), n_estimators=100)
print("--training model...")
myROC = 0
rocList = []
for k in tqdm.tqdm(kfModel.split(X)):
    trainX = X[k[0]]
    trainY = y[k[0]]
    testX = X[k[1]]
    testY = y[k[1]]
    myGBDT.fit(trainX, trainY)
    preY1 = myGBDT.predict(testX)
    myBagging.fit(trainX, trainY)
    preY2 = myBagging.predict(testX)
    # average the two models' hard predictions into a simple ensemble score
    preY = (preY1 + preY2) / 2
    try:
        tmpROC = roc_auc_score(testY, preY)
    except:
        continue
    rocList.append(tmpROC)
    if tmpROC > myROC:
        myROC = tmpROC
        joblib.dump(myGBDT, "model/myModel.model")
    print("roc:", myROC)
    print("recall:", recall_score(testY, preY))
    print("precision:", precision_score(testY, preY))
length = (len(data) - 2) // 2
for j in range(0, length):
    value = float64(data[j * 2 + 1])
    c = words_list[data[j * 2]]
    row.append(i)
    column.append(c)
    element.append((value + 1.0) * (value + 0.8))
i = i + 1
label.append(train_id_to_label[data[length * 2]])

feature = coo_matrix((element, (row, column)), shape=(i, tot_word))
source.close()
print("finish step 4")

X_train, X_test, Y_train, Y_test = train_test_split(feature, label,
                                                    train_size=0.8,
                                                    random_state=215)
bagging = BaggingClassifier(LogisticRegression(penalty='l1',
                                               solver='liblinear',
                                               C=0.1204, random_state=215),
                            n_estimators=4, max_samples=0.9,
                            max_features=0.9, random_state=214)
bagging.fit(X_train, Y_train)
print("finish step 5")

predict_X_test = bagging.predict_proba(X_test)
source = open("bagging_validproba.csv", "w", newline="")
writer = csv.writer(source)
for each in predict_X_test:
    writer.writerow([each[1]])
source.close()

y_score = []
for each in predict_X_test:
    y_score.append(each[1])
print(metrics.roc_auc_score(Y_test, y_score))

row = []
column = []
element = []
print("Errors: " + str(wrong), " Correct :" + str(right)) print("Accuracy: " + str(right/(right+wrong)*100)) print(classification_report(test[1], m[0][0])) print(confusion_matrix(test[1], m[0][0])) """## Bagging --- """ bagging1 = BaggingClassifier(base_estimator=clf_LR2, n_estimators=5, max_samples=0.8, max_features=0.8) bagging2 = BaggingClassifier(base_estimator=clf_NN1, n_estimators=5, max_samples=0.8, max_features=0.8) bagging3 = BaggingClassifier(base_estimator=clf_RBF1, n_estimators=5, max_samples=0.8, max_features=0.8) bagging4 = BaggingClassifier(base_estimator=clf1, n_estimators=5, max_samples=0.8, max_features=0.8) start_time = time.time() bagging1.fit(train[0], train[1]) print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() bagging2.fit(train[0], train[1]) print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() bagging3.fit(train[0], train[1]) print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() bagging4.fit(train[0], train[1]) print("--- %s seconds ---" % (time.time() - start_time)) accuracy(bagging1,test[0],test[1]) accuracy(bagging2,test[0],test[1]) accuracy(bagging3,test[0],test[1]) accuracy(bagging4,test[0],test[1])
rfc = RandomForestClassifier(random_state=4)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
roc_score = roc_auc_score(y_test, y_pred)
print("Random forest score: ", roc_score)

# Code ends here


# --------------
# Import Bagging Classifier
from sklearn.ensemble import BaggingClassifier

# Code starts here
bagging_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=100, max_samples=100,
                                random_state=0)
bagging_clf.fit(X_train, y_train)
score_bagging = bagging_clf.score(X_test, y_test)
print("Bagging 100 DTrees : ", score_bagging)

# Code ends here


# --------------
# Import libraries
from sklearn.ensemble import VotingClassifier

# Various models
clf_1 = LogisticRegression()
clf_2 = DecisionTreeClassifier(random_state=4)
clf_3 = RandomForestClassifier(random_state=4)

model_list = [('lr', clf_1), ('DT', clf_2), ('RF', clf_3)]

# Code starts here
voting_clf_hard = VotingClassifier(estimators=model_list, voting='hard')
voting_clf_hard.fit(X_train, y_train)
err_ctree2_tr = ctree.score(test_data, test_label)
# 0.904761904762

export_graphviz(ctree, out_file='ctree_entropy.dot', feature_names=words,
                class_names=author_names, filled=True, rounded=True,
                special_characters=True)
graph_gini = pydot.graph_from_dot_file('ctree_entropy.dot')
graph_gini.write_png('ctree_entropy.png')

# feature evaluation
ind_entropy = np.argsort(ctree.feature_importances_)
features_entropy = np.array(words)[ind_entropy][::-1]

###############################################################################
# Bagging
bagging = BaggingClassifier()
bagging.fit(training_data, training_label)
err_bag_tr = bagging.score(training_data, training_label)
err_bag_ts = bagging.score(test_data, test_label)
# 0.996604414261
# 0.94444444444

###############################################################################
# Boosting
# AdaBoost
adaboost = AdaBoostClassifier()
adaboost.fit(training_data, training_label)
err_ada_tr = adaboost.score(training_data, training_label)
err_ada_ts = adaboost.score(test_data, test_label)
# 0.9015280135823429
# 0.8134920634920634
# NOW z1 IS NEW x

# VIEWING THE IMAGE
plt.imshow(z1[0].reshape(28, 28))

# IMPLEMENTING CLASSIFIER MODELS

# BAGGING CLASSIFIER
model = DecisionTreeClassifier()
num_trees = 100
model1 = BaggingClassifier(base_estimator=model)
model1

# SPLITTING THE DATA INTO TRAIN AND TEST
z1_train, z1_test, y_train, y_test = train_test_split(z1, y, test_size=0.3)

model1.fit(z1_train, y_train)
pred = model1.predict(z1_test)
metrics.accuracy_score(y_test, pred)
print(classification_report(y_test, pred))
confusion_matrix(y_test, pred)

# RANDOM FOREST CLASSIFIER
rf = RandomForestClassifier()
rf.fit(z1_train, y_train)
pred1 = rf.predict(z1_test)
metrics.accuracy_score(y_test, pred1)
print(classification_report(y_test, pred1))
confusion_matrix(y_test, pred1)

# GRADIENT BOOSTING CLASSIFIER
model2 = GradientBoostingClassifier(n_estimators=30, verbose=1)
title_dummies = pd.get_dummies(train_data['Title'], prefix='Title')
train_data = pd.concat([train_data, title_dummies], axis=1)
train_data.drop(columns=['Title'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(train_data, target,
                                                    test_size=0.25,
                                                    random_state=0)

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)

submit_df = pd.read_csv('dataset/test.csv')
submit_data = make_df(submit_df, [
    'Pclass',
    'Sex',
    'Age',
    'Embarked',
    'Name',
    'SibSp',
    'Parch',
])
submit_data['Title'] = submit_data['Name'].map(lambda x: add_title(x))
submit_data.drop(columns=['Name'], inplace=True)
submit_data['Embarked'] = submit_data['Embarked'].map(
'''
#################################################################################################
############################################ ENSEMBLE ###########################################
#################################################################################################
'''
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

'''
################################### BOOTSTRAP AGGREGATING (BAGGING) ############################
'''
classificadorBagging = BaggingClassifier(votingClf, max_samples=0.5,
                                         max_features=1.0, n_estimators=5)
classificadorBagging.fit(previsores_treinamento, classe_treinamento)
print("Bagging " + str(classificadorBagging.score(previsores_teste, classe_teste)))

'''
################################### ADAPTIVE BOOSTING (ADA-BOOST) ##############################
'''
# build an AdaBoost ensemble of 5 copies of the voting classifier
classificadorAdaBoost = AdaBoostClassifier(votingClf, n_estimators=5,
                                           learning_rate=1)
classificadorAdaBoost.fit(previsores_treinamento, classe_treinamento)
print("Ada-Boost " + str(classificadorAdaBoost.score(previsores_teste, classe_teste)))

'''
xt = previsores[:10]