Example no. 1
1
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators does not",
                         clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
Example no. 2
0
def query_by_bagging(X, y, current_model, batch_size, rng, base_model=SVC(C=1, kernel='linear'), n_bags=5, method="KL", D=None):
    """
    :param base_model: Model that will be  **fitted every iteration**
    :param n_bags: Number of bags on which train n_bags models
    :param method: 'entropy' or 'KL'
    :return:
    """
    assert method == 'entropy' or method == 'KL'
    eps = 0.0000001
    if method == 'KL':
        assert hasattr(base_model, 'predict_proba'), "Model with probability prediction needs to be passed to this strategy!"
    clfs = BaggingClassifier(base_model, n_estimators=n_bags, random_state=rng)
    clfs.fit(X[y.known], y[y.known])
    pc = clfs.predict_proba(X[np.invert(y.known)])
    # Settles page 17
    if method == 'entropy':
        pc += eps
        fitness = np.sum(pc * np.log(pc), axis=1)
        ids = np.argsort(fitness)[:batch_size]
    elif method == 'KL':
        pc += eps  # guard against division by zero in the KL term
        p = np.array([clf.predict_proba(X[np.invert(y.known)]) for clf in clfs.estimators_]) + eps
        fitness = np.mean(np.sum(p * np.log(p / pc), axis=2), axis=0)
        ids = np.argsort(fitness)[-batch_size:]

    return y.unknown_ids[ids], fitness/np.max(fitness)
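# A minimal, self-contained sketch of the disagreement measure used above
# (query-by-committee with soft votes, following Settles' survey): train a
# bagged committee, average its members' probabilities, and rank unlabeled
# points by the mean KL divergence of each member from the consensus. The
# dataset and model choices below are illustrative assumptions, not part of
# the original snippet.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, random_state=0)
X_labeled, y_labeled, X_pool = X[:50], y[:50], X[50:]

committee = BaggingClassifier(LogisticRegression(), n_estimators=5, random_state=0)
committee.fit(X_labeled, y_labeled)

eps = 1e-7
consensus = committee.predict_proba(X_pool) + eps            # committee consensus
member_probs = np.array([m.predict_proba(X_pool)
                         for m in committee.estimators_]) + eps
# Mean KL divergence of each member from the consensus, per pool point.
kl = np.mean(np.sum(member_probs * np.log(member_probs / consensus), axis=2), axis=0)
query_ids = np.argsort(kl)[-10:]                             # most informative points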
Example no. 3
0
def test_base():
    # Check BaseEnsemble methods.
    ensemble = BaggingClassifier(
        base_estimator=Perceptron(tol=1e-3, random_state=None), n_estimators=3)

    iris = load_iris()
    ensemble.fit(iris.data, iris.target)
    ensemble.estimators_ = []  # empty the list and create estimators manually

    ensemble._make_estimator()
    random_state = np.random.RandomState(3)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(append=False)

    assert_equal(3, len(ensemble))
    assert_equal(3, len(ensemble.estimators_))

    assert_true(isinstance(ensemble[0], Perceptron))
    assert_equal(ensemble[0].random_state, None)
    assert_true(isinstance(ensemble[1].random_state, int))
    assert_true(isinstance(ensemble[2].random_state, int))
    assert_not_equal(ensemble[1].random_state, ensemble[2].random_state)

    np_int_ensemble = BaggingClassifier(base_estimator=Perceptron(tol=1e-3),
                                        n_estimators=np.int32(3))
    np_int_ensemble.fit(iris.data, iris.target)
Example no. 4
0
class ADABoost(Base):

    def train(self, data=None, plugin=None):
        """ Train the bagged linear model from the training dataframe """
        super(ADABoost, self).train(data, plugin)

        # cl = svm.SVC(gamma=0.001, C=100, kernel='linear', probability=True)

        X = self.X_train.iloc[:, :-1]
        Y = self.X_train.iloc[:, -1]

        self.scaler = StandardScaler().fit(X)
        X = self.scaler.transform(X)

        cl = SGDClassifier(loss='hinge')
        p = Pipeline([("Scaler", self.scaler), ("svm", cl)])

        self.clf = BaggingClassifier(p, n_estimators=50)
        # self.clf = AdaBoostClassifier(p, n_estimators=10)
        # self.clf = AdaBoostClassifier(SGDClassifier(loss='hinge'), algorithm='SAMME', n_estimators=10)

        self.clf.fit(X, Y)

    def predict(self, file, plugin=None):
        super(ADABoost, self).predict(file, plugin)

        data = file.vector
        X = data[plugin]
        X = self.scaler.transform(X)
        guess = self.clf.predict(X)
        return self.getTag(guess)
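# A standalone sketch of the same idea on synthetic data (the dataset and
# parameters below are assumptions): bagging a scaler + hinge-loss SGD
# pipeline, so each bootstrap sample fits its own scaler instead of the data
# being scaled once up front and then again inside the pipeline.
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, random_state=0)
pipe = Pipeline([("Scaler", StandardScaler()), ("svm", SGDClassifier(loss="hinge"))])
clf = BaggingClassifier(pipe, n_estimators=50, random_state=0)
clf.fit(X, y)
print(clf.score(X, y))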
Example no. 5
0
def test_bagging_sample_weight_unsupported_but_passed():
    estimator = BaggingClassifier(DummyZeroEstimator())
    rng = check_random_state(0)

    estimator.fit(iris.data, iris.target).predict(iris.data)
    assert_raises(ValueError, estimator.fit, iris.data, iris.target,
                  sample_weight=rng.randint(10, size=(iris.data.shape[0])))
def bagging(X_train, X_test, y_train, y_test, n_est):
    estimators = range(1, n_est)
    decision_clf = DecisionTreeClassifier()
    scores1 = []  # test accuracies
    scores2 = []  # train accuracies

    for est in estimators:
        bagging_clf = BaggingClassifier(decision_clf, n_estimators=est, max_samples=0.67,
                                        max_features=0.67, bootstrap=True, random_state=9)
        bagging_clf.fit(X_train, y_train)
        # test line
        y_pred_bagging1 = bagging_clf.predict(X_test)
        score_bc_dt1 = accuracy_score(y_test, y_pred_bagging1)
        scores1.append(score_bc_dt1)
        # train line
        y_pred_bagging2 = bagging_clf.predict(X_train)
        score_bc_dt2 = accuracy_score(y_train, y_pred_bagging2)
        scores2.append(score_bc_dt2)
    
    plt.figure(figsize=(10, 6))
    plt.title('Bagging Info')
    plt.xlabel('Estimators')
    plt.ylabel('Scores')
    plt.plot(estimators,scores1,'g',label='test line', linewidth=3)
    plt.plot(estimators,scores2,'c',label='train line', linewidth=3)
    plt.legend()
    plt.show()
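# A possible way to call the helper above, assuming scikit-learn's iris data
# (the split parameters are illustrative).
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, random_state=9)
bagging(X_tr, X_te, y_tr, y_te, n_est=51)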
Example no. 7
0
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
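# The `replace` transformer passed to FunctionTransformer above is defined
# elsewhere in the original test module; a sketch of one plausible
# implementation (an assumption, not the original) that makes the snippet
# self-contained:
import numpy as np

def replace(X):
    # Cast to float (None becomes NaN) and zero out non-finite entries so the
    # downstream tree can be fitted.
    X = X.astype(float)
    X[~np.isfinite(X)] = 0
    return X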
Example no. 8
0
def train_dts(observations,targets,method='bagging'):
    """Trains a decision tree for each output

    :param observations: our train dataset
    :param targets: multiple target variables.
    :param method: bagging,random_forest,boosting
    :return: the dt models in a list, one for each target variable
    """
    n_targets = len(targets[0])

    tars = np.array(targets)
    dts = []
    for i in range(n_targets):
        act_tar = tars[:,i].tolist()

        dt = None
        if method == 'bagging': dt = BaggingClassifier(tree.DecisionTreeClassifier(),n_estimators=100,max_samples=0.5, max_features=1.)
        elif method == 'random_forest': dt = RandomForestClassifier(n_estimators=100)
        elif method == 'boosting': dt = AdaBoostClassifier(n_estimators=100)
        else: dt = tree.DecisionTreeClassifier()
        # the dt cannot be trained if the outputs are all equal. In that case, we create a fake dt
        if len(set(act_tar)) > 1:
            # We want to have a balanced data set while training.
            bal_observations, bal_tar = sample_balanced_dataset(observations,act_tar) #from data_manipulation
            dt.fit(bal_observations,bal_tar)
        else:
            dt = FakeClassifier(act_tar[0])
        dts.append(dt)

    return dts
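# `FakeClassifier` and `sample_balanced_dataset` come from the surrounding
# project and are not shown here. A minimal sketch of what the constant
# predictor could look like (an assumption, used only to make the snippet
# readable on its own):
import numpy as np

class FakeClassifier(object):
    """Stand-in model that always predicts the single label seen at training time."""

    def __init__(self, constant_label):
        self.constant_label = constant_label

    def predict(self, X):
        return np.full(len(X), self.constant_label)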
Example no. 9
0
def test_warm_start_smaller_n_estimators():
    # Test if warm start'ed second fit with smaller n_estimators raises error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
Example no. 10
0
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert_true(isinstance(estimator[0].steps[-1][1].random_state,
                           int))
Example no. 11
0
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
                                max_features=0.5, random_state=1,
                                bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert_equal(len(estimators_samples), len(estimators))
    assert_equal(len(estimators_samples[0]), len(X) // 2)
    assert_equal(estimators_samples[0].dtype.kind, 'i')

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)
Example no. 12
0
def test_estimators_samples_deterministic():
    # This test is a regression test to check that with a random step
    # (e.g. SparseRandomProjection) and a given random state, the results
    # generated at fit time can be identically reproduced at a later time using
    # data saved in object attributes. Check issue #9524 for full discussion.

    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)
    pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()

    estimator = clf.estimators_[0]
    estimator_sample = clf.estimators_samples_[0]
    estimator_feature = clf.estimators_features_[0]

    X_train = (X[estimator_sample])[:, estimator_feature]
    y_train = y[estimator_sample]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
Example no. 13
0
class BaggingSK(PoolGenerator):
    '''
    This class should not be used, use brew.generation.bagging.Bagging instead.
    '''

    def __init__(self, base_classifier=None, n_classifiers=100, combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                n_estimators=n_classifiers, max_samples=1.0, max_features=1.0)
        
        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        #self.classes_ = set(y)

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
def baggedDecisionTree( X_train, y_train, X_test, y_test, nEstimators ):

    print("\n### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###")
    print("baggedDecisionTree()\n")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree = BaggingClassifier(
        base_estimator = DecisionTreeClassifier(),
        n_estimators   = nEstimators,
        # max_samples    = X_train.shape[0],
        bootstrap      = True,
        oob_score      = True,
        n_jobs         = -1 # use all available cores
        )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree.fit(X_train,y_train)
    y_pred = myBaggedDecisionTree.predict(X_test)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    print( "nEstimators: "      + str(nEstimators)                     )
    print( "out-of-bag score: " + str(myBaggedDecisionTree.oob_score_) )
    print( "accuracy score: "   + str(accuracy_score(y_test,y_pred))   )
    print( "out-of-bag decision function:" )
    print( str(myBaggedDecisionTree.oob_decision_function_) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
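# A hedged usage example for the helper above, on synthetic data (the dataset
# and estimator count are assumptions, not part of the original snippet):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
baggedDecisionTree(X_tr, y_tr, X_te, y_te, nEstimators=25)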
Example no. 15
0
def train_classifiers(data):
    train_vars = [
        'X', 'Y',
        'Darkness',
        'Moon',
        'Hour',
        'DayOfWeekInt',
        'Day',
        'Month',
        'Year',
        'PdDistrictInt',
        'TemperatureC',
        'Precipitationmm',
        'InPdDistrict',
        'Conditions',
        'AddressCode',
    ]
    weather_mapping = {
        'Light Drizzle': 1,
        'Drizzle': 2,
        'Light Rain': 3,
        'Rain': 4,
        'Heavy Rain': 5,
        'Thunderstorm': 6,
    }
    data.Precipitationmm = data.Precipitationmm.fillna(-1)
    data.Conditions = data.Conditions.map(weather_mapping).fillna(0)

    train, test = split(data)
    X_train = train[train_vars]
    y_train = train.CategoryInt
    X_test = test[train_vars]
    y_test = test.CategoryInt

    bdt_real_2 = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=8),
        n_estimators=10,
        learning_rate=1
    )

    #bdt_real = DecisionTreeClassifier(max_depth=None, min_samples_split=1,
                                      #random_state=6065)

    bdt_real = BaggingClassifier(base_estimator=bdt_real_2,
                                random_state=6065,
                                n_estimators=100)

    #bdt_real = RandomForestClassifier(random_state=6065,
                                      #n_estimators=200)

    #bdt_real = ExtraTreesClassifier(random_state=6065,
                                    #min_samples_split=5,
                                    #n_estimators=200)

    bdt_real.fit(X_train, y_train)
    y_predict = pandas.Series(bdt_real.predict(X_test))
    print(len(y_predict[y_predict == y_test]))
    print(len(y_predict))
    return bdt_real
Example no. 16
0
    def classification(self, x_train, y_train):
        ml = BaggingClassifier(DecisionTreeClassifier())
        ml.fit(x_train, y_train)
#         print(y_train[0])
#         print(x_train[0])
        y_pred = ml.predict(x_train)
        print('y_train ', y_train)
        print('y_pred ', y_pred.tolist())
Example no. 17
0
def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when valid integer max_samples supplied by user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=max_samples, max_features=0.5, random_state=1)
    bagging.fit(X, y)
    assert_equal(bagging._max_samples, max_samples)
Example no. 18
0
    def create_estimators(self, X_train, y_train, X_test):
        for model in self.models:
            param_grid = self.create_parameter_grid(model)
            for parameters in param_grid:
                clf = BaggingClassifier(base_estimator=model.set_params(**parameters),
                                        n_estimators=self.estimators, max_samples=0.95, n_jobs=3)
                clf.fit(X_train, y_train)
                prediction = clf.predict_proba(X_test)[:, 1]
                self.predictions.append(prediction)
Example no. 19
0
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                max_features=0.5, oob_score=True,
                                random_state=1)
    assert_equal(bagging.fit(X, y).oob_score_, bagging.fit(X, y).oob_score_)
Example no. 20
0
def test_bagging_small_max_features():
    # Check that Bagging estimator can accept low fractional max_features

    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])

    bagging = BaggingClassifier(LogisticRegression(),
                                max_features=0.3, random_state=1)
    bagging.fit(X, y)
Example no. 21
0
def adaboost_train(train_file,test_file):
    _, x, y = readFile(train_file)
    print('reading done.')
    ts = x.shape[0]
    id, x2 = readFile(test_file)

    print(x.shape)
    print(x2.shape)

    x = np.concatenate((x, x2))
    print('concatenate done.')
    from sklearn.preprocessing import scale
    x = scale(x, with_mean=False)
    print('scale done.')

    x2 = x[ts:]
    x = x[0:ts]

    from sklearn.feature_selection import SelectKBest, chi2
    selector = SelectKBest(chi2, k=50000).fit(x, y)
    x = selector.transform(x)
    x2 = selector.transform(x2)  # keep the test features in the same reduced space


    from sklearn.model_selection import train_test_split
    tmp_array = np.arange(x.shape[0])
    train_i, test_i = train_test_split(tmp_array, train_size=0.8, random_state=500)

    train_x = x[train_i]
    test_x = x[test_i]
    train_y = y[train_i]
    test_y = y[test_i]

    from sklearn.ensemble import BaggingClassifier
    bagging = BaggingClassifier(LR(penalty='l2', dual=True), n_estimators=10, max_samples=0.6, max_features=0.6)
    bagging.fit(train_x, train_y)
    print('train done.')
    res = bagging.predict(train_x)
    print(res)
    from sklearn.metrics import roc_auc_score
    score = roc_auc_score(train_y, res)

    res = bagging.predict_proba(train_x)
    print(res)
    score = roc_auc_score(train_y, res[:, 1])
    print(score)
    print('-----------------------------------------')

    print(res[:, 1])
    res = bagging.predict_proba(test_x)
    score = roc_auc_score(test_y, res[:, 1])
    print(score)

    y = bagging.predict_proba(x2)
    output = pd.DataFrame(data={"id": id, "sentiment": y[:, 1]})
    output.to_csv("/home/chuangxin/Bagging_result.csv", index=False, quoting=3)

    return bagging
Example no. 22
0
def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")
Example no. 23
0
def main():
    '''main function'''
    bagging = BaggingClassifier(DecisionTreeClassifier())
    iris = load_iris()
    x = iris.data
    y = iris.target
    #train, test, train_, test_ = train_test_split(x, y, test_size=0.2, random_state=42)
    bagging.fit(x, y)
    bagging.predict(x[:2])
    print(bagging.score(x[:2], y[:2]))
def phenotype_imputation(data, config):
    ''' 
    Function to impute the labels on II based on the classifier learned on I.
    
    Parameters 
    ---------- 
    data : an object of class Dataset that contains: genotypes, covariates, 
        labels and information about random folds 

    config : an object of class ConfigState. It contains the user-entered 
        parameters in a YAML format.
        See the config_file parameter in the main script for more details.
    '''
    # Parameters for this task
    num_folds = data.num_folds  
    task_name    = "phenotype_imputation"
    n_estimators = config.get_entry(task_name, "n_estimators")
    romans_trn   = config.get_entry(task_name, "romans_used_for_learning")
    romans_tst   = config.get_entry(task_name, "romans_used_for_imputing")
    
    # Iterate through the folds: 
    i = 0
    size_of_two = find_vec_entries_that_contain(data.folds[:,0], romans_tst).shape[0]
    soft_labels = np.zeros((size_of_two, num_folds))
    X_scaled = preprocessing.scale(data.clin_covariate.transpose()).transpose()
    fpr = dict()
    tpr = dict()
    thres = dict()
    roc_auc = np.zeros(num_folds)
    for fold in data.folds.transpose():      
        logging.info("Fold=%d" % (i + 1))
        sel_trn = find_vec_entries_that_contain(fold,[romans_trn])
        sel_tst = find_vec_entries_that_contain(fold,[romans_tst])

        model = BaggingClassifier(base_estimator=linear_model.LogisticRegression(),
                    n_estimators=n_estimators, max_samples=0.632, 
# for small set I   n_estimators=n_estimators, max_samples=0.8, 
                    max_features=5, 
                    bootstrap=True, bootstrap_features=True, oob_score=False, 
# for small set I   bootstrap=False, bootstrap_features=True, oob_score=False, 
                    n_jobs=1, random_state=None, verbose=0)
            
        model.fit(X_scaled[:,sel_trn].transpose(), data.labels[:,sel_trn].transpose())

        soft_labels[:,i] = model.predict_proba(X_scaled[:,sel_tst].transpose())[:,1]
        fpr[i], tpr[i], thres[i] = metrics.roc_curve(data.labels[0,sel_tst], soft_labels[:,i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])
        i+=1

    # Save the output of this task
    config.save_variable(task_name, "%f", soft_labels=soft_labels, roc_auc=roc_auc)
Example no. 25
0
class BaggingDecisionTrees(object):

    def __init__(self, n_estimators):
        self.classifier = BaggingClassifier(n_estimators=n_estimators)

    def fit(self, xs, ys):
        xs = xs.values
        ys = ys['y']
        self.classifier.fit(xs, ys)

    def predict(self, xs):
        xs = xs.values
        ys = self.classifier.predict(xs)
        return ys
def main():
    # The competition datafiles are in the directory /input

    # Read output csv format in case the file does not exist
    submit = pd.read_csv('sample_submission.csv')

    # Training cols
    print ("Loading training csv.")
    #train_cols = ['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster']
    train_cols = ['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country', 'hotel_cluster']
    train = pd.DataFrame(columns=train_cols)
    train_chunk = pd.read_csv('input/train.csv', chunksize=100000)
    print ("Training csv loaded.")

    # Read each chunk to train
    for chunk in train_chunk:
        #train = pd.concat( [ train, chunk ] )
        train = pd.concat( [ train, chunk[chunk['is_booking']==1][train_cols] ] )
        print ("Chunk done")
    # Load each column
    #x_train = train[['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market']].values
    x_train = train[['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']].values
    y_train = train['hotel_cluster'].values

    # Run RandomForest on training data
    print ("Training RandomForest.")
    rf = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=4)
    bclf = BaggingClassifier(rf, n_estimators=2, n_jobs=4)
    bclf.fit(x_train, y_train)
    print ("Training done.")

    print ("Loading testing csv.")
    test_chunk = pd.read_csv('input/test.csv', chunksize=100000)
    print ("Begin testing each chunk.")
    predict = np.array([])
    # Read each chunk to test
    for i, chunk in enumerate(test_chunk):
        #test_X = chunk[['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market']].values
        test_X = chunk[['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']].values
        test_X = np.nan_to_num(test_X)
        if i > 0:
            predict = np.concatenate( [predict, bclf.predict_proba(test_X)])
        else:
            predict = bclf.predict_proba(test_X)
        print ("Chunk id: " + str(i))

    submit['hotel_cluster'] = np.apply_along_axis(get5Best, 1, predict)
    submit.head()
    submit.to_csv('submission_random_forest.csv', index=False)
def bagging_with_base_estimator(base_estimator, x_train, x_test, y_train,
                                y_test, rands = None):
    """
    Predict the lemons using a Bagging Classifier and a random seed
    both for the number of features, as well as for the size of the
    sample to train the data on

    ARGS:

        - x_train: :class:`pandas.DataFrame` of the x_training data

        - y_train: :class:`pandas.Series` of the y_training data

        - x_test: :class:`pandas.DataFrame` of the x_testing data

        - y_test: :class:`pandas.Series` of the y_testing data

        - rands: a :class:`tuple` of the (rs, rf) to seed the sample
        and features of the BaggingClassifier.  If `None`, then
        rands are generated and provided in the return `Series`

    RETURNS:

        :class:`pandas.Series` of the f1-scores and random seeds
    """
    #create a dictionary for the return values
    ret_d = {'train-f1':[], 'test-f1':[], 'rs':[], 'rf':[]}

    #use the randoms provided if there are any, otherwise generate them
    if not rands:
        rs = numpy.random.rand()
        rf = numpy.random.rand()
        while rf < 0.1:
            rf = numpy.random.rand()
    else:
        rs, rf = rands[0], rands[1]
    #place them into the dictionary
    ret_d['rs'], ret_d['rf'] = rs, rf
    #create and run the bagging classifier
    bc = BaggingClassifier(base_estimator = base_estimator, n_estimators = 300,
                           max_samples = rs, max_features = rf, n_jobs = 1)

    bc.fit(x_train, y_train)
    y_hat_train = bc.predict(x_train)
    ret_d['train-f1'] = f1_score(y_train, y_hat_train)
    y_hat_test = bc.predict(x_test)
    ret_d['test-f1'] = f1_score(y_test, y_hat_test)
    return pandas.Series(ret_d)
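# A possible call of the function above on synthetic data, wrapping the
# arrays in pandas objects as the docstring expects (the dataset, seeds and
# the decision-tree base estimator are illustrative assumptions):
import pandas
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=400, random_state=0)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)
scores = bagging_with_base_estimator(DecisionTreeClassifier(),
                                     pandas.DataFrame(x_tr), pandas.DataFrame(x_te),
                                     pandas.Series(y_tr), pandas.Series(y_te),
                                     rands=(0.8, 0.5))
print(scores[['train-f1', 'test-f1']])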
Example no. 28
0
def train(data, labels):
    """
    classifier = VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(n_estimators=400, n_jobs=-1)),
        ('ada', AdaBoostClassifier(n_estimators=50,
                                   base_estimator=RandomForestClassifier(
                                       n_estimators=40, n_jobs=-1))),
        ('nc', NearestCentroid())
    ])
    """
    classifier = BaggingClassifier(base_estimator=AdaBoostClassifier(
        base_estimator=RandomForestClassifier(n_estimators=40, n_jobs=-1)),
                                   n_jobs=-1)

    classifier.fit(data, labels)
    return classifier
Example no. 29
0
def TrainKNeighbors(p_subject, p_save):
    print("Welcome to TrainKNeighbors(" + p_subject + ", " + str(p_save) + ")")
    training_data = pd.read_pickle(input_data_paths[p_subject])

    # Ictal vs interictal
    kneighbors = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
    y = training_data.T["classification"]
    kneighbors.fit(training_data[:-2].T, y)

    # Save models
    if p_save:
        model_save_filename = "/Users/dryu/Documents/DataScience/Seizures/data/models/KN_" + p_subject + ".pkl"
        model_save_file = open(model_save_filename, 'wb')  # pickle needs a binary file handle
        pickle.dump(kneighbors, model_save_file)
        model_save_file.close()

    return {"simultaneous": kneighbors}
Example no. 30
0
def train_bagging():
    model = build_model()
    bagging_model = BaggingClassifier(base_estimator=model, n_estimators=bagging_num_estimator,
                                      max_samples=bagging_sample_fraction, oob_score=bagging_use_oob)

    # train model
    bagging_model.fit(XC, yc)

    # persist model
    if persist_model:
        models = bagging_model.estimators_
        for m in zip(range(0, len(models)), models):
            model_file = model_file_directory + "/" + model_file_prefix + "_" + str(m[0] + 1) + ".mod"
            joblib.dump(m[1], model_file)

    score = bagging_model.score(XC, yc)
    print("average error %.3f" % (1.0 - score))
class ShapeletForestClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 n_estimators=100,
                 max_depth=None,
                 min_samples_split=2,
                 n_shapelets=10,
                 min_shapelet_size=0,
                 max_shapelet_size=1,
                 metric='euclidean',
                 metric_params=None,
                 bootstrap=True,
                 n_jobs=None,
                 random_state=None):
        """A shapelet forest classifier
        """
        self.n_estimators = n_estimators
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_shapelets = n_shapelets
        self.min_shapelet_size = min_shapelet_size
        self.max_shapelet_size = max_shapelet_size
        self.metric = metric
        self.metric_params = metric_params
        self.random_state = random_state

    def predict(self, X, check_input=True):
        return self.classes_[np.argmax(self.predict_proba(
            X, check_input=check_input),
                                       axis=1)]

    def predict_proba(self, X, check_input=True):
        # Correct formatting of X
        if len(X.iloc[0]) == 1:  # UNI
            X = [
                np.array(X.iloc[i].iloc[0]).tolist() for i in range(0, len(X))
            ]
        else:  # MULTI
            X = [[
                np.array(X.iloc[i].iloc[j]).tolist()
                for j in range(0, len(X.iloc[i]))
            ] for i in range(0, len(X))]

        if check_input:
            X = check_array(X, dtype=np.float64, allow_nd=True, order="C")

        if X.ndim < 2 or X.ndim > 3:
            raise ValueError("illegal input dimensions X.ndim ({})".format(
                X.ndim))

        if self.n_dims_ > 1 and X.ndim != 3:
            raise ValueError("illegal input dimensions X.ndim != 3")

        if X.shape[-1] != self.n_timestep_:
            raise ValueError("illegal input shape ({} != {})".format(
                X.shape[-1], self.n_timestep_))

        if X.ndim > 2 and X.shape[1] != self.n_dims_:
            raise ValueError("illegal input shape ({} != {})".format(
                X.shape[1], self.n_dims_))

        if X.dtype != np.float64 or not X.flags.contiguous:
            X = np.ascontiguousarray(X, dtype=np.float64)

        X = X.reshape(X.shape[0], self.n_dims_ * self.n_timestep_)
        return self.bagging_classifier_.predict_proba(X)

    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit a random shapelet forest classifier
        """
        # Correct formatting of X
        if len(X.iloc[0]) == 1:  # UNI
            X2 = [
                np.array(X.iloc[i].iloc[0]).tolist() for i in range(0, len(X))
            ]
        else:  # MULTI
            X2 = [[
                np.array(X.iloc[i].iloc[j]).tolist()
                for j in range(0, len(X.iloc[i]))
            ] for i in range(0, len(X))]

        random_state = check_random_state(self.random_state)
        if check_input:
            X = check_array(X2, dtype=np.float64, allow_nd=True, order="C")
            y = check_array(y, ensure_2d=False)

        if X.ndim < 2 or X.ndim > 3:
            raise ValueError("illegal input dimension")

        n_samples = X.shape[0]
        self.n_timestep_ = X.shape[-1]
        if X.ndim > 2:
            n_dims = X.shape[1]
        else:
            n_dims = 1

        self.n_dims_ = n_dims

        if y.ndim == 1:
            self.classes_, y = np.unique(y, return_inverse=True)
        else:
            _, y = np.nonzero(y)
            if len(y) != n_samples:
                raise ValueError("Single label per sample expected.")
            self.classes_ = np.unique(y)

        if len(y) != n_samples:
            raise ValueError("Number of labels={} does not match "
                             "number of samples={}".format(len(y), n_samples))

        if X.dtype != np.float64 or not X.flags.contiguous:
            X = np.ascontiguousarray(X, dtype=np.float64)

        if not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=np.intp)

        shapelet_tree_classifier = ShapeletTreeClassifier(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            n_shapelets=self.n_shapelets,
            min_shapelet_size=self.min_shapelet_size,
            max_shapelet_size=self.max_shapelet_size,
            metric=self.metric,
            metric_params=self.metric_params,
            random_state=random_state,
        )

        if n_dims > 1:
            shapelet_tree_classifier.force_dim = n_dims

        self.bagging_classifier_ = BaggingClassifier(
            base_estimator=shapelet_tree_classifier,
            bootstrap=self.bootstrap,
            n_jobs=self.n_jobs,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        X = X.reshape(n_samples, n_dims * self.n_timestep_)
        self.bagging_classifier_.fit(X, y, sample_weight=sample_weight)
        return self
Example no. 32
0
accuracy_score(y_test, pre_rf)  # check the accuracy

# # bagging

# In[40]:

from sklearn.ensemble import BaggingClassifier

# In[41]:

bg = BaggingClassifier(RandomForestClassifier(),
                       n_estimators=20,
                       max_features=1.0,
                       max_samples=0.5)
bg.fit(x_train, y_train)  # fitting the model

# In[42]:

pre_bag = bg.predict(x_test)  # predicting the results

# In[43]:

accuracy_score(y_test, pre_bag)

# # ada boosting

# In[44]:

from sklearn.ensemble import AdaBoostClassifier
Example no. 33
0
                               decision_function_shape='ovo',
                               class_weight='balanced',
                               C=100,
                               gamma=0.1),
                       n_jobs=4)
# sv = svm.SVC(probability=True, class_weight='balanced', random_state=42, C=100, gamma=0.1)

# X_sv = train_dataset_full[train_dataset_full['type'] != 6.0 ].drop(columns=['type', 'session'])
# y_sv = train_dataset_full[train_dataset_full['type'] != 6.0 ]['type']

X_sv = train_dataset_full.drop(columns=['type', 'session'])
y_sv = train_dataset_full['type']

# print(y_sv.isna())
#%%
mod_sv = sv.fit(X_sv, y_sv)
#%%
# sv.estimators_
#%%
# del train_dataset_full

with open('svm_trained_with_type_6.pkl', 'wb') as handle:
    pkl.dump(mod_sv, handle, protocol=-1)
#%%

#%%
svm_model = mod_sv
# svm_model = pkl.load(open('svm_trained_paper.pkl', 'rb'))
test_svm_full = pd.concat(test_svm.values)
svm_predicted = svm_model.predict_proba(
    test_svm_full.drop(columns=['type', 'session']).dropna())
Example no. 34
0
X_train = count_vect.transform(X_train1)

clf = MLPClassifier(alpha=1, random_state=65)
clf.fit(X_train, y_train)

clf2 = SVC(probability = True, gamma=2, C=1)
clf2.fit(X_train, y_train)

clf3 = DecisionTreeClassifier(random_state = 0)
clf3.fit(X_train, y_train)

clf4 = PassiveAggressiveClassifier()
clf4.fit(X_train, y_train)

clf5 = BaggingClassifier(random_state=54)
clf5.fit(X_train, y_train)

clf6 = ExtraTreesClassifier(random_state=0)
clf6.fit(X_train, y_train)

clf7 = GradientBoostingClassifier(random_state=32)
clf7.fit(X_train, y_train)

vc = VotingClassifier(estimators=[
    ('mlp', clf), ('dt', clf3), ('et', clf6), ('bag', clf5), ('grad', clf7)
], voting='soft', weights=[0.3, 0.1, 0.2, 0.1, 0.3])
vc.fit(X_train, y_train)

predicted = clf.predict(X_test)
predicted2 = clf2.predict(X_test)
predicted3 = clf3.predict(X_test)
Example no. 35
0
#--------------------------------------------------------------------------------#
## Evaluate Bagging performance
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)
## Decision Tree classifier
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
acc_test = accuracy_score(y_pred, y_test)
print("Test set accuracy of dt: {:.2f}".format(acc_test))

# Fit bc to the training set
bc.fit(X_train, y_train)

# Predict test set labels
y_pred = bc.predict(X_test)

# Evaluate acc_test
acc_test = accuracy_score(y_pred, y_test)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))
print("-" * 38)
#--------------------------------------------------------------------------------#

#--------------------------------------------------------------------------------#
## Out of Bag Evaluation
## Prepare the ground

# Instantiate dt
y_pred6 = model6.predict(x_test)
accuracy6 = accuracy_score(y_test, y_pred6)
print("AdaBoost Accuracy: %.2f%%" % (accuracy6 * 100.0))

# ****** 7) Bagging ********************
from sklearn.ensemble import BaggingClassifier
tree7 = DecisionTreeClassifier(criterion='entropy')
model7 = BaggingClassifier(base_estimator=tree7,
                           n_estimators=60,
                           max_samples=1.0,
                           max_features=1.0,
                           bootstrap=True,
                           bootstrap_features=False,
                           n_jobs=1,
                           random_state=1)
model7.fit(x_train, y_train)
y_pred7 = model7.predict(x_test)
accuracy7 = accuracy_score(y_test, y_pred7)
print("Bagging Accuracy: %.2f%%" % (accuracy7 * 100.0))

# ****** 8) Random Forest ********************
from sklearn.ensemble import RandomForestClassifier
model8 = RandomForestClassifier(n_estimators=60, random_state=0, n_jobs=-1)
model8.fit(x_train, y_train)
y_pred8 = model8.predict(x_test)
accuracy8 = accuracy_score(y_test, y_pred8)
print("Random Forest Accuracy: %.2f%%" % (accuracy8 * 100.0))

# ****** 9) XGBoost ********************
from xgboost import XGBClassifier
model9 = XGBClassifier()
Example no. 37
0
enc = LabelEncoder()
enc.fit(y)
y = enc.fit_transform(y)
X = df.iloc[:, :6]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
dectree = tree.DecisionTreeClassifier(max_depth=5)
bag = BaggingClassifier(n_estimators=100, oob_score=True)
rf = RandomForestClassifier(n_estimators=1000,
                            oob_score=True,
                            max_features='auto')
boost = AdaBoostClassifier(n_estimators=1000)

dectree.fit(X_train, y_train)
bag.fit(X_train, y_train)
rf.fit(X_train, y_train)
boost.fit(X_train, y_train)
print('Tree', 'Bagging', 'Boosting', 'Random Forest\n',
      np.round_(dectree.score(X_test, y_test), 2),
      np.round_(bag.score(X_test, y_test), 2),
      np.round_(boost.score(X_test, y_test), 2),
      np.round_(rf.score(X_test, y_test), 2), '\nTraining error\n',
      np.round_(dectree.score(X_train, y_train), 2),
      np.round_(bag.score(X_train, y_train), 2),
      np.round_(boost.score(X_train, y_train), 2),
      np.round_(rf.score(X_train, y_train), 2))
print('RF out-of-bag error:\n', 1 - rf.oob_score_)
print('Bagging out-of-bag error:\n', 1 - bag.oob_score_)
print(
    pd.DataFrame(rf.feature_importances_,
Example no. 38
0
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert isinstance(estimator[0].steps[-1][1].random_state, int)
Example no. 39
0
def test_warm_start_with_oob_score_fails():
    # Check using oob_score and warm_start simultaneously fails
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
    with pytest.raises(ValueError):
        clf.fit(X, y)
Example no. 40
0
def model_train(model_type, X_train_, X_valid, y_train_, y_valid):
    """ tree,lightgbm, xgboost, catboost, randomforest, adaboost, logit, knn, gmm, svn, lda, naivebayes """

    if model_type == "tree":
        treeclf = DecisionTreeClassifier(max_depth=7)
        treeclf.fit(X_train_, y_train_)
        pred_model = treeclf
        del treeclf
    if model_type == "bagging":
        bagclf = BaggingClassifier(KNeighborsClassifier(),
                                   max_samples=0.5,
                                   max_features=0.5)
        bagclf.fit(X_train_, y_train_)
        pred_model = bagclf
        del bagclf

    if model_type == "lightgbm":
        # 0.8711  --> 22 minuti e 1
        # dtrain = lgb.Dataset(X_train, label=y_train) #,categorical_feature = categorical_columns)
        # dvalid = lgb.Dataset(X_valid, label=y_valid) #,categorical_feature = categorical_columns)
        lgbclf = lgb.LGBMClassifier(
            num_leaves=512,  # was 512 - default 31
            n_estimators=512,  # default 100 was 512
            max_depth=8,  # default -1, was 9
            learning_rate=0.1,  # default 0.1
            feature_fraction=0.4,  # default 1 was 0.4,
            bagging_fraction=0.4,  # default 1 was 0.4, # subsample by row
            metric="auc",  # binary_logloss auc
            boosting_type="gbdt",  # goss # dart --> speed: goss>gbdt>dart
            lambda_l1=0.4,  # default 0 - 0.4
            lambda_l2=0.6,  # default 0 - 0.6
            scale_pos_weight=18,  # defualt 1
        )

        lgbclf.fit(X_train_, y_train_)
        pred_model = lgbclf
        del lgbclf

    elif model_type == "xgboost":
        # sooo slow  #0.8614
        # scale_pos_weight and adjust settings
        # https://stats.stackexchange.com/questions/243207/what-is-the-proper-usage-of-scale-pos-weight-in-xgboost-for-imbalanced-datasets
        xgbclf = xgb.XGBClassifier(
            num_leaves=512,
            n_estimators=512,
            max_depth=25,
            learning_rate=0.1,
            feature_fraction=0.4,
            bagging_fraction=0.4,
            subsample=0.85,
            metric="auc",  # binary_logloss
            colsample_bytree=0.85,
            boosting_type="gbdt",  # goss # dart --> speed: goss>gbdt>dart
            reg_alpha=0.4,
            reg_lambda=0.6,
            scale_pos_weight=82.9,
        )
        xgbclf.fit(X_train_, y_train_)
        pred_model = xgbclf
        del xgbclf

    elif model_type == "catboost":
        # does this need to be done for the other models too?
        ycopy = y_train_.copy()
        ycopy["target_class"] = ycopy["target_class"].apply(lambda x: 1 if
                                                            (x >= 0.5) else 0)
        X_train_1, X_valid_1, y_train_1, y_valid_1 = train_test_split(
            X_train_, ycopy.values.flatten(), test_size=0.05)
        params = {
            "loss_function": "Logloss",  # objective function
            "eval_metric": "AUC",  # metric
            "verbose":
            200,  # output to stdout info about training process every 200 iterations
        }
        catclf = catboost.CatBoostClassifier(**params)
        catclf.fit(
            X_train_1,
            y_train_1,  # data to train on (required parameters, unless we provide X as a pool object, will be shown below)
            eval_set=(X_valid_1, y_valid_1),  # data to validate on
            use_best_model=
            True,  # True if we don't want to save trees created after iteration with the best validation score
            plot=
            True,  # True for visualization of the training process (it is not shown in a published kernel - try executing this code)
        )

        del X_train_1, X_valid_1, y_train_1, y_valid_1
        pred_model = catclf
        del catclf

    elif model_type == "randomforest":
        # 0.8476
        # what sense does a single prediction of 1.6 make???
        # https://towardsdatascience.com/an-implementation-and-explanation-of-the-random-forest-in-python-77bf308a9b76
        rfclf = RandomForestClassifier(n_estimators=512,
                                       bootstrap=True,
                                       max_features="sqrt")

        rfclf.fit(X_train_, y_train_)
        pred_model = rfclf
        del rfclf

    elif model_type == "adaboost":
        # 0.851 --> 8:16:45 hours
        # https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781787286382/9/ch09lvl1sec95/tuning-an-adaboost-regressor
        # https://towardsdatascience.com/boosting-algorithm-adaboost-b6737a9ee60c
        adaclf = AdaBoostClassifier(n_estimators=512, learning_rate=0.0069)
        adaclf.fit(X_train_, y_train_)
        pred_model = adaclf
        del adaclf

    elif model_type == "logit":
        # 0.7764 --> 12:52 minutes without GridSearch, with gridsearch 63.8%
        ## what sense does a single prediction of 1.6 make??? with grid search 0.5
        # https://towardsdatascience.com/an-implementation-and-explanation-of-the-random-forest-in-python-77bf308a9b76
        logregclf = LogisticRegression(penalty="l1", solver="saga", tol=1e-3)
        pipe = Pipeline([("model", logregclf)])
        param_grid = {"model__max_iter": [1000]}
        # adding grid_search to logit
        logregclf_cv = GridSearchCV(pipe,
                                    param_grid=param_grid,
                                    scoring="roc_auc",
                                    cv=3)

        logregclf_cv.fit(X_train_, y_train_)
        # print('best_params_={}\nbest_score_={}'.format(repr(logregclf_cv.best_params_), repr(logregclf_cv.best_score_)))

        logregclf = logregclf_cv.best_estimator_

        pred_model = logregclf

        del logregclf

    elif model_type == "knn":
        # 0.612  Time: 3:21:59.613695
        # https://www.quora.com/How-can-I-choose-the-best-K-in-KNN-K-nearest-neighbour-classification
        knnclf = KNeighborsClassifier(n_neighbors=3,
                                      leaf_size=30)  # ), 'p': 1})
        knnclf.fit(X_train_, y_train_)
        pred_model = knnclf
        del knnclf

    elif model_type == "gmm":
        # 0  Time:
        # https://www.kaggle.com/albertmistu/detect-anomalies-using-gmm
        gmmclf = GaussianMixture()  # gaussian mixture model
        ycopy = y_train_.copy()
        ycopy["target_class"] = ycopy["target_class"].apply(lambda x: 1 if
                                                            (x >= 0.5) else 0)
        gmmclf.fit(X_train_, ycopy)
        pred_model = gmmclf
        del gmmclf
    elif model_type == "svm":
        # 0  Time:
        # https://www.kaggle.com/kojr1234/fraud-detection-using-svm
        svcclf = SVC(kernel="rbf", gamma=4 * 1e-3, C=10)
        svcclf.fit(X_train_, y_train_)
        pred_model = svcclf
        del svcclf
    elif model_type == "lda":
        # 0  Time:
        ldaclf = LinearDiscriminantAnalysis()
        ldaclf.fit(X_train_, y_train_)
        pred_model = ldaclf
        del ldaclf
    elif model_type == "naivebayes":
        # 0  Time:
        gnbclf = GaussianNB()  # priors = [0.995,0.005])
        gnbclf.fit(X_train_, y_train_)
        pred_model = gnbclf
        del gnbclf

    else:
        print("Please, try one of the possible models")

    del X_train_, y_train_
    print("finish train")

    return pred_model, X_valid.copy(), y_valid.copy()
Example no. 41
0
def main():
    ###############################################################################
    # Preparing the dataset
    # ---------------------
    # In this part we load the breast cancer dataset from scikit-learn and
    # preprocess it in order to pass to the DS models. An important point here is
    # to normalize the data so that it has zero mean and unit variance, which is
    # a common requirement for many machine learning algorithms.
    # This step can be easily done using the StandardScaler class.

    rng = np.random.RandomState(123)
    data = load_breast_cancer()
    X = data.data
    y = data.target
    # split the data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=rng)

    # Scale the variables to have 0 mean and unit variance
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Split the data into training and DSEL for DS techniques
    X_train, X_dsel, y_train, y_dsel = train_test_split(X_train,
                                                        y_train,
                                                        test_size=0.5,
                                                        random_state=rng)

    # Train a pool of 100 base classifiers
    pool_classifiers = BaggingClassifier(Perceptron(max_iter=10),
                                         n_estimators=100,
                                         random_state=rng)
    pool_classifiers.fit(X_train, y_train)

    # Initialize the DS techniques
    knorau = KNORAU(pool_classifiers)
    kne = KNORAE(pool_classifiers)
    desp = DESP(pool_classifiers)
    ola = OLA(pool_classifiers)
    mcb = MCB(pool_classifiers)

    ###############################################################################
    # Calibrating base classifiers
    # -----------------------------
    # Some dynamic selection techniques require that the base classifiers estimate
    # probabilities in order to estimate their competence level. Since the Perceptron
    # model is not a probabilistic classifier (it does not implement the
    # predict_proba method), it needs to be calibrated for
    # probability estimation before being used by such DS techniques. This step can
    # be conducted using the CalibratedClassifierCV class from scikit-learn. Note
    # that in this example we pass a prefit pool of classifiers to the
    # calibration method in order to use exactly the same pool used in the other
    # DS methods.
    calibrated_pool = []
    for clf in pool_classifiers:
        calibrated = CalibratedClassifierCV(base_estimator=clf, cv='prefit')
        calibrated.fit(X_dsel, y_dsel)
        calibrated_pool.append(calibrated)

    apriori = APriori(calibrated_pool)
    meta = METADES(calibrated_pool)

    knorau.fit(X_dsel, y_dsel)
    kne.fit(X_dsel, y_dsel)
    desp.fit(X_dsel, y_dsel)
    ola.fit(X_dsel, y_dsel)
    mcb.fit(X_dsel, y_dsel)
    apriori.fit(X_dsel, y_dsel)
    meta.fit(X_dsel, y_dsel)

    ###############################################################################
    # Evaluating the methods
    # -----------------------
    # Let's now evaluate the methods on the test set. We also use the performance
    # of Bagging (pool of classifiers without any selection) as a baseline
    # comparison. We can see that the majority of DS methods achieve higher
    # classification accuracy.

    print('Evaluating DS techniques:')
    print('Classification accuracy KNORA-Union: ',
          knorau.score(X_test, y_test))
    print('Classification accuracy KNORA-Eliminate: ',
          kne.score(X_test, y_test))
    print('Classification accuracy DESP: ', desp.score(X_test, y_test))
    print('Classification accuracy OLA: ', ola.score(X_test, y_test))
    print('Classification accuracy A priori: ', apriori.score(X_test, y_test))
    print('Classification accuracy MCB: ', mcb.score(X_test, y_test))
    print('Classification accuracy META-DES: ', meta.score(X_test, y_test))
    print('Classification accuracy Bagging: ',
          pool_classifiers.score(X_test, y_test))
# =============================================================================
# # Bagging Classifier
# =============================================================================

# Instantiate dt
dt = DecisionTreeClassifier(random_state=6)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt,
                       bootstrap=True,
                       n_estimators=60,
                       random_state=6)

# Fit bc to the training set
bc.fit(x_train, y_train)

# Predict test set labels
y_pred = bc.predict(x_test)

# Evaluate training and test acc score.
print("")
print("Bagging result :-")
print("Training Accuracy: {:.3f}".format(bc.score(x_train, y_train)))
print("Testing Accuracy: {:.3f}".format(bc.score(x_test, y_test)))

# =============================================================================
# # Random Forest Classifier
# =============================================================================

# Instantiate a RandomForest 'rf'
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
import numpy as np
from sklearn import svm
from sklearn.model_selection import KFold
from randomforest_featureselection import clf, xtrain, ytrain, xtest, ytest, X_important_train, X_important_test
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier

clf_imp = BaggingClassifier(svm.SVC(kernel='linear', C=1))  # the bagging estimator clones and refits the base SVC itself
clf_imp.fit(X_important_train, ytrain)

from sklearn.model_selection import cross_val_score, KFold
n_folds = []
n_folds.append(('K2', 2))
n_folds.append(('K4', 4))
n_folds.append(('K5', 5))
n_folds.append(('K10', 10))

seed = 7

for name, n_split in n_folds:
    results = []
    names = []
    print(name)
    kfold = KFold(n_splits=n_split, shuffle=True, random_state=seed)
    cv_results = cross_val_score(clf_imp,
                                 X_important_train,
                                 ytrain,
mask_threshold_0 = y_proba[:,0]>=0.46
y_proba[mask_threshold_0,:]=0

mask_threshold_1 = y_proba[:,1]>=0.5
y_proba[mask_threshold_1,:]=1

y_pred = y_proba[:,0]

df_score_filter_methods.loc['rf','with ROC Curve'] = f1_score(y_test, y_pred, average=None)[0]

#%% Bootstrap Aggregating 
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(clf_rf,n_estimators = 100,max_samples=0.7, max_features=0.15,bootstrap_features=True)

y_pred = bagging.fit(x_train,y_train).predict(x_test)
df_score_filter_methods.loc['rf','with Bagging'] = my_f1_score(y_test, y_pred)

#%% Boosting
#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
clf_boost = AdaBoostClassifier(clf.best_estimator_,n_estimators=500)

y_pred = clf_boost.fit(x_train,y_train).predict(x_test)
df_score_filter_methods.loc['rf','AdaBoostClassifier'] = my_f1_score(y_test, y_pred)
#%%Blending
from sklearn.ensemble import VotingClassifier

clf_voting = VotingClassifier(estimators=[
        ('clf_boost', clf_boost), ('clf.best_estimator_', clf.best_estimator_), ('clf_rf', clf_rf),('clf_neigh', clf_neigh),('clf_svc', clf_svc)], voting='hard')#'soft'
    
Example no. 45
0
    def fit(self, df_X, df_y):
        logger.info("Fitting LightningClassification")

        if not df_y.shape[0] == df_X.shape[0]:
            raise ValueError("number of regions is not equal")
        if df_y.shape[1] != 1:
            raise ValueError("y needs to have 1 label column")

        if self.scale:
            # Scale motif scores
            df_X[:] = scale(df_X, axis=0)

        idx = list(range(df_y.shape[0]))

        y = df_y.iloc[idx]
        X = df_X.loc[y.index].values
        y = y.values.flatten()

        # Convert (putative) string labels
        label = LabelEncoder()
        y = label.fit_transform(y)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        logger.debug("Setting parameters through cross-validation")
        # Determine best parameters based on CV
        self.clf.fit(X_train, y_train)

        logger.debug("Average score ({} fold CV): {}".format(
            self.kfolds, self.clf.score(X_test, y_test)))

        logger.debug("Estimate coefficients using bootstrapping")

        # Estimate coefficients using bootstrapping
        # b = BaggingClassifier(self.clf.best_estimator_,
        #        max_samples=0.75, n_jobs=-1, random_state=state)
        b = BaggingClassifier(self.clf.best_estimator_,
                              max_samples=0.75,
                              n_jobs=-1)
        b.fit(X, y)

        # Get mean coefficients
        coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0)

        # Create dataframe of predicted coefficients
        if len(label.classes_) == 2:
            self.act_ = pd.DataFrame(np.hstack((-coeffs.T, coeffs.T)))
        else:
            self.act_ = pd.DataFrame(coeffs.T)

        # Convert labels back to original names
        self.act_.columns = label.inverse_transform(range(len(label.classes_)))
        self.act_.index = df_X.columns

        if self.permute:
            # Permutations
            logger.debug("Permutations")
            random_dfs = []
            for _ in range(10):
                y_random = np.random.permutation(y)
                b.fit(X, y_random)
                coeffs = np.array([e.coef_
                                   for e in b.estimators_]).mean(axis=0)

                if len(label.classes_) == 2:
                    random_dfs.append(
                        pd.DataFrame(np.hstack((-coeffs.T, coeffs.T))))
                else:
                    random_dfs.append(pd.DataFrame(coeffs.T))
            random_df = pd.concat(random_dfs)

            # Select cutoff based on percentile
            high_cutoffs = random_df.quantile(0.99)
            low_cutoffs = random_df.quantile(0.01)

            # Set significance
            self.sig_ = pd.DataFrame(index=df_X.columns)
            self.sig_["sig"] = False

            for col, c_high, c_low in zip(self.act_.columns, high_cutoffs,
                                          low_cutoffs):
                self.sig_["sig"].loc[self.act_[col] >= c_high] = True
                self.sig_["sig"].loc[self.act_[col] <= c_low] = True
        logger.info("Done")
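
# A small, self-contained sketch of the bootstrapped-coefficient idea used above,
# on synthetic data with a plain logistic regression (all names and data below are
# illustrative, not part of the original class):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
bag_demo = BaggingClassifier(LogisticRegression(max_iter=1000),
                             n_estimators=25, max_samples=0.75, random_state=0)
bag_demo.fit(X_demo, y_demo)
# average the per-estimator coefficients to obtain more stable feature weights
mean_coef = np.array([est.coef_ for est in bag_demo.estimators_]).mean(axis=0)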
Esempio n. 46
0
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
# Accuracy percentages
print("Using NuSVC the accuracy rate is ",
      np.mean(scoresNu) * 100)
print("Using SVC the accuracy rate is ", np.mean(scoresSvc) * 100)

# Validation matrices
print("SVM matrix - Nu: ", matrizCruzada(predsvNu))
print("SVM matrix - SVC: ", matrizCruzada(predsvSvc))

# Fourth algorithm: ENSEMBLE METHODS

#Bagging meta-estimator
bagging = BaggingClassifier(KNeighborsClassifier(),
                            max_samples=0.5,
                            max_features=0.5)
bagging.fit(data_train, target_train)
preBag = bagging.predict(data_test)
scoresBag = cross_val_score(bagging,
                            atributos,
                            target,
                            cv=5,
                            scoring='accuracy')
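
# Mirroring the accuracy prints above, a short sketch reporting the bagged KNN's
# cross-validated accuracy (assumes the same np already used in this script):
print("Using Bagging(KNN) the accuracy rate is ", np.mean(scoresBag) * 100)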

#Random Forests
forests = RandomForestClassifier(n_estimators=10,
                                 max_depth=None,
                                 min_samples_split=2,
                                 random_state=0)
forests.fit(data_train, target_train)
preFo = forests.predict(data_test)
scoresFo = cross_val_score(forests,
                           atributos,
                           target,
                           cv=5,
                           scoring='accuracy')
Esempio n. 48
0
a5 = metrics.accuracy_score(labels_test, pred5)
q1.append(("MLP", {
    "alpha": 0.05,
    "solver": "adam",
    "batch_size": 800,
    "max_iter": 200,
    "beta_1": 0.85,
    "beta_2": 0.7
}, a5))

bagging = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(),
                            n_estimators=9,
                            max_samples=20586,
                            max_features=17)  # bagging
start = time.perf_counter()
bagging.fit(datasets, l)
pred6 = bagging.predict(datasets_test)
end = time.perf_counter()
t1.append(end - start)
a6 = metrics.accuracy_score(labels_test, pred6)
q1.append(("bagging", {
    "base_estimator": "tree.DecisionTreeClassifier()",
    "n_estimators": 9,
    "max_samples": 20586,
    "max_features": 17
}, a6))

# draw table and show each classifier's parameters, accuracy and training time
table = Texttable()
table.add_rows([["classifier", "parameters", "accuracy", "training time"]] +
               [[name, str(params), acc, t] for (name, params, acc), t in zip(q1, t1)])
print(table.draw())

    for j in range(1,100):
        X_train_aug = np.concatenate((X_train_aug, np.roll(X_train, j, axis=1)))
        X_train_aug = np.concatenate((X_train_aug, -np.roll(X_train, j, axis=1)))
        y_train_aug = np.concatenate((y_train_aug, y_train))
        y_train_aug = np.concatenate((y_train_aug, y_train))
    # Apply data augmentation on testing data
    X_test_aug, y_test_aug = X_test, y_test
    X_test_aug = np.concatenate((X_test_aug, -X_test))
    y_test_aug = np.concatenate((y_test_aug, y_test))
    for j in range(1,100):
        X_test_aug = np.concatenate((X_test_aug, np.roll(X_test, j, axis=1)))
        X_test_aug = np.concatenate((X_test_aug, -np.roll(X_test, j, axis=1)))
        y_test_aug = np.concatenate((y_test_aug, y_test))
        y_test_aug = np.concatenate((y_test_aug, y_test))
    # Fit the model
    clf.fit(X_train_aug, y_train_aug)
    train_score = clf.score(X_test, y_test)
    train_score_aug = clf.score(X_test_aug, y_test_aug)
    # Save the score
    scores = np.append(scores, train_score)
    scores_aug = np.append(scores_aug, train_score_aug)
    
# Print final score
with open('ris/OUT-score_alglorithms.txt', mode='a') as f:
    print('Average score:', scores.mean(), '+-', scores.std() / np.sqrt(n_splits), file=f)
    print('Average score (augmented):', scores_aug.mean(), '+-', scores_aug.std() / np.sqrt(n_splits), file=f)

params = {'chat_id': telegram_bot_id['chat_id'], 'text': '[python] End Bagging Classifier k-fold validation.'}
requests.post('https://api.telegram.org/' + telegram_bot_id['bot_id'] + '/sendMessage', params=params)

params = {'chat_id': telegram_bot_id['chat_id'], 'text': '[python] End data augmentation part.'}
Esempio n. 50
0
test_prediction = model.predict(X_test);
#build a submit table format
submit_table = y_test;
submit_table.buy_next_day = test_prediction;
submit_table = submit_table.loc[submit_table.buy_next_day==1.0];
submit_pair = submit_table.reset_index(level=[0, 1, 2]);
submit_pair = submit_pair.loc[:, ['user_id', 'item_id']];
submit_pair.user_id = submit_pair.user_id.apply(str);
submit_pair.item_id = submit_pair.item_id.apply(str);
submit_pair.to_csv('tianchi_mobile_recommendation_predict.csv', index=False);


#train the model with linear svm
n_estimators = 10;
model_svm = BaggingClassifier(LinearSVC(class_weight='balanced'), max_samples=1.0 / n_estimators, n_estimators=n_estimators);
res_svm = model_svm.fit(X_train, y_train.values.ravel());
#train prediction
train_prediction_svm = model_svm.predict(X_train);
train_accuracy_svm = np.mean(train_prediction_svm==y_train.buy_next_day.values);
train_f1_svm, train_precision_svm, train_recall_svm = F1(y_train.buy_next_day.values, train_prediction_svm);
#cv prediction
cv_prediction_svm = model_svm.predict(X_cv);
cv_accuracy_svm = np.mean(cv_prediction_svm==y_cv.buy_next_day.values);
cv_f1_svm, cv_precision_svm, cv_recall_svm = F1(y_cv.buy_next_day.values, cv_prediction_svm);
#local test prediction
local_test_prediction_svm = model_svm.predict(X_local_test);
local_test_accuracy_svm = np.mean(local_test_prediction_svm==y_local_test.buy_next_day.values);
local_test_f1_svm, local_test_precision_svm, local_test_recall_svm = F1(y_local_test.buy_next_day.values, local_test_prediction_svm);
#test prediction submit
test_prediction_svm = model_svm.predict(X_test);
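
# The max_samples=1.0/n_estimators pattern above trains each LinearSVC on roughly a
# 1/n_estimators subsample, a common way to keep linear-SVM training cheap on large
# data; a self-contained sketch of the same idea on toy data (names are illustrative):
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC

X_toy, y_toy = make_classification(n_samples=2000, n_features=20, random_state=0)
n_estimators_toy = 10
fast_linear_svm = BaggingClassifier(LinearSVC(),
                                    n_estimators=n_estimators_toy,
                                    max_samples=1.0 / n_estimators_toy,
                                    n_jobs=-1)
fast_linear_svm.fit(X_toy, y_toy)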
Esempio n. 51
0
for n in range(1, 30):
    my_bgc = MyBaggingClassifier(tree_clf,
                                 n_estimators=n,
                                 max_samples=110,
                                 max_features=10)
    my_bgc.fit(bigDataset_X_train, bigDataset_Y_train)
    y = my_bgc.predict(bigDataset_X_test)
    myBaggingClassifierError.append(accuracy_score(y, bigDataset_Y_test))

baggingClassifierError = []
for n in range(1, 30):
    bgc = BaggingClassifier(tree_clf,
                            n_estimators=n,
                            max_samples=110,
                            max_features=10)
    bgc.fit(smallDataset_X_train, smallDataset_Y_train)
    y = bgc.predict(smallDataset_X_test)
    baggingClassifierError.append(accuracy_score(y, smallDataset_Y_test))

randomForestClassifierError = []
for n in range(1, 30):
    bgc = RandomForestClassifier(n_estimators=n, max_features=8)
    bgc.fit(bigDataset_X_train, bigDataset_Y_train)
    y = bgc.predict(bigDataset_X_test)
    randomForestClassifierError.append(accuracy_score(y, bigDataset_Y_test))

gradientBoostingClassifierError = []
for n in range(1, 30):
    bgc = GradientBoostingClassifier(n_estimators=n, max_features=8)
    bgc.fit(bigDataset_X_train, bigDataset_Y_train)
    y = bgc.predict(bigDataset_X_test)
    gradientBoostingClassifierError.append(accuracy_score(y, bigDataset_Y_test))
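
# A sketch (assuming matplotlib is available) of plotting the four curves collected
# above against n_estimators; note the *Error lists actually hold accuracy_score values.
import matplotlib.pyplot as plt

ns = range(1, 30)
plt.plot(ns, myBaggingClassifierError, label="MyBaggingClassifier")
plt.plot(ns, baggingClassifierError, label="BaggingClassifier")
plt.plot(ns, randomForestClassifierError, label="RandomForestClassifier")
plt.plot(ns, gradientBoostingClassifierError, label="GradientBoostingClassifier")
plt.xlabel("n_estimators")
plt.ylabel("accuracy")
plt.legend()
plt.show()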
Esempio n. 52
0
    #Naive Bayes classifier
    classifierNB = GaussianNB()
    classifierNB.fit(x_train, y_train)
    pred = classifierNB.predict(X_ul)

    #XGBoost classifier
    classifierXGB = XGBClassifier(n_estimators=20, n_jobs=-1)
    classifierXGB.fit(x_train, y_train)
    pred = classifierXGB.predict(X_ul)

    #Bagging Classifier
    classifierBG = BaggingClassifier(tree.DecisionTreeClassifier(),
                                     n_estimators=20,
                                     n_jobs=-1)
    classifierBG.fit(x_train, y_train)
    pred = classifierBG.predict(X_ul)

    #Gradient Boosting Classifier
    classifierGB = GradientBoostingClassifier(n_estimators=20,
                                              learning_rate=1.0,
                                              max_depth=1).fit(
                                                  x_train, y_train)
    pred = classifierGB.predict(X_ul)

    #Adaboost classifier
    classifierAB = AdaBoostClassifier(base_estimator=RandomForestClassifier(
        n_estimators=20, criterion='entropy', n_jobs=-1),
                                      n_estimators=20)
    classifierAB.fit(x_train, y_train)
    pred = classifierAB.predict(X_ul)
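
    # Each classifier above overwrites the same `pred` variable; a small sketch
    # (the dictionary name is my own) keeping the predictions side by side instead:
    preds = {
        "naive_bayes": classifierNB.predict(X_ul),
        "xgboost": classifierXGB.predict(X_ul),
        "bagging": classifierBG.predict(X_ul),
        "gradient_boosting": classifierGB.predict(X_ul),
        "adaboost": classifierAB.predict(X_ul),
    }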
Esempio n. 53
0
    myGBDT=GradientBoostingClassifier()
    myBagging=BaggingClassifier(SVC(C=0.5),n_estimators=100)

    print("--training model...")
    myROC=0
    rocList=[]
    for k in tqdm.tqdm(kfModel.split(X)):
        trainX=X[k[0]]
        trainY=y[k[0]]
        testX=X[k[1]]
        testY=y[k[1]]

        myGBDT.fit(trainX,trainY)
        preY1=myGBDT.predict(testX)
        
        myBagging.fit(trainX,trainY)
        preY2=myBagging.predict(testX)

        preY=(preY1+preY2)/2

        try:
            tmpROC=roc_auc_score(testY,preY)
        except:
            continue
        rocList.append(tmpROC)
        if tmpROC>myROC:
            myROC=tmpROC
            joblib.dump(myGBDT,"model/myModel.model")
            print("roc:",myROC)
            print("recall:",recall_score(testY,preY))
            print("precision:",precision_score(testY,preY))
Esempio n. 54
0
	length = (len(data) - 2) / 2
	for j in range(0,length):
		value = (float64)(data[j * 2 + 1])
		c = words_list[data[j * 2]]
		row.append(i)
		column.append(c)
		element.append((value + 1.0) * (value + 0.8))
	i = i + 1
	label.append(train_id_to_label[data[length * 2]])
feature = coo_matrix((element,(row,column)),shape=(i,tot_word))
source.close()
print "finish step 4"

X_train,X_test,Y_train,Y_test = train_test_split(feature,label,train_size = 0.8,random_state = 215)
bagging = BaggingClassifier(LogisticRegression(penalty = 'l1',solver = 'liblinear',C = 0.1204,random_state = 215),n_estimators = 4,max_samples = 0.9,max_features = 0.9,random_state = 214)
bagging.fit(X_train,Y_train)
print "finish step 5"
predict_X_test = bagging.predict_proba(X_test)
source = open("bagging_validproba.csv","wb")
writer = csv.writer(source)
for each in predict_X_test:
	writer.writerow([each[1]])
source.close()
y_score = []
for each in predict_X_test:
	y_score.append(each[1])
print metrics.roc_auc_score(Y_test,y_score)

row = []
column = []
element = []
Esempio n. 55
0
print("Errors: " + str(wrong), " Correct :" + str(right))
print("Accuracy: " + str(right/(right+wrong)*100))
print(classification_report(test[1], m[0][0]))
print(confusion_matrix(test[1], m[0][0]))

"""## Bagging

---
"""

bagging1 = BaggingClassifier(base_estimator=clf_LR2, n_estimators=5, max_samples=0.8, max_features=0.8)
bagging2 = BaggingClassifier(base_estimator=clf_NN1, n_estimators=5, max_samples=0.8, max_features=0.8)
bagging3 = BaggingClassifier(base_estimator=clf_RBF1, n_estimators=5, max_samples=0.8, max_features=0.8)
bagging4 = BaggingClassifier(base_estimator=clf1, n_estimators=5, max_samples=0.8, max_features=0.8)
start_time = time.time()
bagging1.fit(train[0], train[1]) 
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
bagging2.fit(train[0], train[1]) 
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
bagging3.fit(train[0], train[1])
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
bagging4.fit(train[0], train[1]) 
print("--- %s seconds ---" % (time.time() - start_time))

accuracy(bagging1,test[0],test[1])
accuracy(bagging2,test[0],test[1])
accuracy(bagging3,test[0],test[1])
accuracy(bagging4,test[0],test[1])
Esempio n. 56
0
rfc = RandomForestClassifier(random_state=4)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
roc_score = roc_auc_score(y_test, y_pred)
print("Random forest score: ", roc_score)
# Code ends here

# --------------
# Import Bagging Classifier
from sklearn.ensemble import BaggingClassifier
# Code starts here
bagging_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=100,
                                max_samples=100,
                                random_state=0)
bagging_clf.fit(X_train, y_train)
score_bagging = bagging_clf.score(X_test, y_test)
print("Bagging 100 DTrees : ", score_bagging)
# Code ends here

# --------------
# Import libraries
from sklearn.ensemble import VotingClassifier
# Various models
clf_1 = LogisticRegression()
clf_2 = DecisionTreeClassifier(random_state=4)
clf_3 = RandomForestClassifier(random_state=4)
model_list = [('lr', clf_1), ('DT', clf_2), ('RF', clf_3)]
# Code starts here
voting_clf_hard = VotingClassifier(estimators=model_list, voting='hard')
voting_clf_hard.fit(X_train, y_train)
err_ctree2_tr = ctree.score(test_data,test_label)
#0.904761904762
export_graphviz(ctree, out_file='ctree_entropy.dot',
                feature_names=words, class_names=author_names,
                filled=True, rounded=True,
                special_characters=True)
graph_gini = pydot.graph_from_dot_file('ctree_entropy.dot')
graph_gini.write_png('ctree_entropy.png')
# feature evaluation
ind_entropy = np.argsort(ctree.feature_importances_)
features_entropy = np.array(words)[ind_entropy][::-1]

###############################################################################
# Bagging
bagging = BaggingClassifier()
bagging.fit(training_data, training_label)
err_bag_tr =  bagging.score(training_data, training_label)
err_bag_ts =  bagging.score(test_data,test_label)
#0.996604414261
#0.94444444444


###############################################################################
# Boosting
# AdaBoost
adaboost = AdaBoostClassifier()
adaboost.fit(training_data, training_label)
err_ada_tr =  adaboost.score(training_data, training_label)
err_ada_ts =  adaboost.score(test_data,test_label)
#0.9015280135823429
#0.8134920634920634
Esempio n. 58
0
# NOW z1 IS NEW x

# VIEWING THE IMAGE
plt.imshow(z1[0].reshape(28, 28))

# IMPLEMENTING CLASSIFIER MODELS
# BAGGING CLASSIFIER
model = DecisionTreeClassifier()
num_trees = 100
model1 = BaggingClassifier(base_estimator=model, n_estimators=num_trees)
model1

# SPLITTING THE DATA INTO TRAIN AND TEST
z1_train, z1_test, y_train, y_test = train_test_split(z1, y, test_size=0.3)

model1.fit(z1_train, y_train)
pred = model1.predict(z1_test)
metrics.accuracy_score(y_test, pred)
print(classification_report(y_test, pred))
confusion_matrix(y_test, pred)

# RANDOM FOREST CLASSIFIER
rf = RandomForestClassifier()
rf.fit(z1_train, y_train)
pred1 = rf.predict(z1_test)
metrics.accuracy_score(y_test, pred1)
print(classification_report(y_test, pred1))
confusion_matrix(y_test, pred1)

# GRADIENT BOOSTING CLASSIFIER
model2 = GradientBoostingClassifier(n_estimators=30, verbose=1)
Esempio n. 59
0
title_dummies = pd.get_dummies(train_data['Title'], prefix='Title')
train_data = pd.concat([train_data, title_dummies], axis=1)
train_data.drop(columns=['Title'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(train_data,
                                                    target,
                                                    test_size=0.25,
                                                    random_state=0)

bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500,
                            max_samples=100,
                            bootstrap=True,
                            n_jobs=-1)

bag_clf.fit(X_train, y_train)
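
# Because bootstrap=True, the ensemble can also report an out-of-bag estimate; a
# short optional sketch (the oob variant below is my own addition, not in the original):
bag_clf_oob = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=500,
                                max_samples=100,
                                bootstrap=True,
                                oob_score=True,
                                n_jobs=-1)
bag_clf_oob.fit(X_train, y_train)
print("OOB accuracy:", bag_clf_oob.oob_score_)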
submit_df = pd.read_csv('dataset/test.csv')

submit_data = make_df(submit_df, [
    'Pclass',
    'Sex',
    'Age',
    'Embarked',
    'Name',
    'SibSp',
    'Parch',
])

submit_data['Title'] = submit_data['Name'].map(lambda x: add_title(x))
submit_data.drop(columns=['Name'], inplace=True)
submit_data['Embarked'] = submit_data['Embarked'].map(
Esempio n. 60
0
'''
#################################################################################################
############################################ ENSEMBLE ###########################################
#################################################################################################
'''

from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
'''
################################### BOOTSTRAP AGGREGATING (BAGGING) ########################################
'''

classificadorBagging = BaggingClassifier(votingClf,
                                         max_samples=0.5,
                                         max_features=1.0,
                                         n_estimators=5)
classificadorBagging.fit(previsores_treinamento, classe_treinamento)
print("Bagging " +
      str(classificadorBagging.score(previsores_teste, classe_teste)))
'''
################################### ADAPTIVE BOOSTING (ADA-BOOST) ########################################
'''
# build an AdaBoost ensemble on top of the voting classifier
classificadorAdaBoost = AdaBoostClassifier(votingClf,
                                           n_estimators=5,
                                           learning_rate=1)
classificadorAdaBoost.fit(previsores_treinamento, classe_treinamento)
print("Ada-Boost " +
      str(classificadorAdaBoost.score(previsores_teste, classe_teste)))
'''
xt = previsores[:10]