Example #1
0
def test_bagging_classifier_with_missing_inputs():
    """Check that BaggingClassifier can accept X with missing/infinite data.

    The first half fits a bagged pipeline whose first step is
    ``FunctionTransformer(replace)`` on data containing ``None``, NaN and
    +/-inf, then exercises ``predict``, ``predict_log_proba`` and
    ``predict_proba``.  The second half drops that step and expects both the
    bare pipeline and the bagged pipeline to raise ``ValueError`` on ``fit``.

    ``replace`` is defined outside this block; presumably it substitutes the
    non-finite entries -- confirm against its definition.
    """
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        # ``np.NINF`` was removed in NumPy 2.0; ``-np.inf`` is the same value
        # and works on every NumPy release.
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(FunctionTransformer(replace), classifier)
    # The pipeline on its own must cope with the data before it is bagged.
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert y.shape == y_hat.shape
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    with pytest.raises(ValueError):
        pipeline.fit(X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    with pytest.raises(ValueError):
        bagging_classifier.fit(X, y)
Example #2
0
def trainModel(x, y):
    """Train a bagged LightGBM classifier and pick a decision threshold.

    The data is split 80/20 (``random_state=345``).  A ``BaggingClassifier``
    of 100 ``LGBMClassifier`` estimators is fitted on the training part and
    dumped to ``../result/lgb.m``.  The threshold is the ROC point of the
    training split that maximises ``tpr - fpr``; F1, recall and precision at
    that threshold are printed for both splits.

    Args:
        x: Feature matrix accepted by ``train_test_split``.
        y: Labels; the metrics used here treat them as binary.

    Returns:
        tuple: ``(model, thres)`` -- the fitted ensemble and the threshold.
    """

    # Custom evaluation metric for lightgbm/xgboost.  It is not used by the
    # bagged model below; it belongs to the plain-LightGBM early-stopping
    # variant (``eval_metric=self_metric``) that is currently disabled.
    def self_metric(y_true, y_pred):
        score = -f1_score(y_true, 1 * (y_pred >= 0.5))
        return 'f1', score, False

    from sklearn.ensemble import BaggingClassifier
    params = {
        "num_leaves": 81,
        "n_estimators": 550,
        "learning_rate": 0.2,  # parameters that are definitely needed
        "class_weight": {
            1: 1,
            0: 1
        },
        "reg_lambda": 2  # experimental only
    }
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=345)
    lg = LGBMClassifier(**params)
    # The estimator is passed positionally: the keyword was renamed from
    # ``base_estimator`` to ``estimator`` in newer scikit-learn releases, and
    # the first positional slot works with both spellings.
    model = BaggingClassifier(lg,
                              n_estimators=100,
                              max_samples=0.8,
                              max_features=0.8)
    model.fit(x_train, y_train)
    joblib.dump(model, "../result/lgb.m")

    # The key part: choose a sensible threshold for the F1 metric.
    # TODO: sweep many thresholds (2000+) and compute F1 directly to see which
    # one is best; that would be more precise.
    pre_train = model.predict_proba(x_train)[:, 1]
    pre_test = model.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_train, pre_train)
    thre_index = (tpr - fpr).argmax()
    thres = thresholds[thre_index]

    print("训练集阈值", thres)
    # Turn probabilities into 0/1 predictions at the chosen threshold.
    pre_train = 1 * (pre_train >= thres)
    pre_test = 1 * (pre_test >= thres)
    print("train f1_score", f1_score(y_train, pre_train))
    print("test f1_score", f1_score(y_test, pre_test))
    print("train recall_score", recall_score(y_train, pre_train))
    print("test recall_score", recall_score(y_test, pre_test))
    print("train precision_score", precision_score(y_train, pre_train))
    print("test precision_score", precision_score(y_test, pre_test))
    return model, thres
def test_probability():
    """Check the probability outputs of BaggingClassifier on iris.

    For a normal ensemble and for a degenerate one (only five samples per
    estimator, so some classes can be missing from a bag) the test asserts
    that ``predict_proba`` rows sum to one and that ``predict_proba`` equals
    ``exp(predict_log_proba)``.
    """
    # One RandomState is shared by the split and by both ensembles, so the
    # order of the statements below determines the random draws.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    # Floating-point divide/invalid warnings are silenced for the whole block
    # (presumably they come from taking the log of zero probabilities).
    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case.  The estimator is passed positionally because the
        # ``base_estimator`` keyword was renamed to ``estimator`` in newer
        # scikit-learn releases; the positional form works with both.
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     random_state=rng).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BaggingClassifier(LogisticRegression(),
                                     random_state=rng,
                                     max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
def test_bagging_classifier_with_missing_inputs():
    """Check that BaggingClassifier can accept X with missing/infinite data.

    A bagged pipeline that starts with
    ``FunctionTransformer(replace, validate=False)`` must fit and predict on
    data containing ``None``, NaN and +/-inf.  Without that step, fitting the
    pipeline (bare or bagged) must raise ``ValueError``.

    ``replace`` is defined outside this block; presumably it substitutes the
    non-finite entries -- confirm against its definition.
    """
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        # ``np.NINF`` was removed in NumPy 2.0; ``-np.inf`` is the same value
        # and works on every NumPy release.
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    # The pipeline on its own must cope with the data before it is bagged.
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
Example #5
0
def test_parallel_classification():
    """Verify that bagged classification output does not depend on ``n_jobs``.

    ``predict_proba`` (decision trees) and ``decision_function`` (SVC) are
    each evaluated with one and two jobs on an ensemble fitted with three
    jobs, and once more on an ensemble fitted with a single job; all results
    must agree.
    """
    rng = check_random_state(0)

    # Classification data
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    def fitted_bag(base, jobs):
        # Every ensemble uses seed 0 so separately fitted ones are comparable.
        bag = BaggingClassifier(base, n_jobs=jobs, random_state=0)
        return bag.fit(X_train, y_train)

    # predict_proba
    tree_bag = fitted_bag(DecisionTreeClassifier(), 3)
    tree_bag.set_params(n_jobs=1)
    proba_one_job = tree_bag.predict_proba(X_test)
    tree_bag.set_params(n_jobs=2)
    proba_two_jobs = tree_bag.predict_proba(X_test)
    assert_array_almost_equal(proba_one_job, proba_two_jobs)

    tree_bag = fitted_bag(DecisionTreeClassifier(), 1)
    proba_serial_fit = tree_bag.predict_proba(X_test)
    assert_array_almost_equal(proba_one_job, proba_serial_fit)

    # decision_function
    svc_bag = fitted_bag(SVC(decision_function_shape="ovr"), 3)
    svc_bag.set_params(n_jobs=1)
    scores_one_job = svc_bag.decision_function(X_test)
    svc_bag.set_params(n_jobs=2)
    scores_two_jobs = svc_bag.decision_function(X_test)
    assert_array_almost_equal(scores_one_job, scores_two_jobs)

    svc_bag = fitted_bag(SVC(decision_function_shape="ovr"), 1)
    scores_serial_fit = svc_bag.decision_function(X_test)
    assert_array_almost_equal(scores_one_job, scores_serial_fit)
Example #6
0
class MultiLabel(Model):
    """Model that predicts the three dash-separated parts of an action label.

    ``clf1`` learns the first part of each ``ACTIONS`` entry, ``clf2`` the
    second and ``clf3`` the third.  The literal label ``'SHIFT'`` has no
    further parts and is used unchanged as the target for ``clf2``/``clf3``.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used here.
        forest_size = 100
        self.clf1 = RandomForestClassifier(n_estimators=forest_size)
        self.clf2 = BaggingClassifier(n_jobs=1)
        self.clf3 = BaggingClassifier(n_jobs=1)

    def train(self, x, y):
        """Fit the three classifiers; ``y`` holds indices into ``ACTIONS``."""
        names = [ACTIONS[i] for i in y]

        def part(name, position):
            # 'SHIFT' cannot be split, so it stands in for its own parts.
            return 'SHIFT' if name == 'SHIFT' else name.split('-')[position]

        first_parts = np.array([name.split('-')[0] for name in names])
        second_parts = np.array([part(name, 1) for name in names])
        third_parts = np.array([part(name, 2) for name in names])
        self.clf1.fit(x, first_parts)
        self.clf2.fit(x, second_parts)
        self.clf3.fit(x, third_parts)

    def predict(self, x):
        """Return ``(action, alter_action)`` for the input ``x``.

        The ranking below squeezes the probability array, so this presumably
        expects a single sample -- confirm against callers.
        """
        proba1 = self.clf1.predict_proba(x)
        proba2 = self.clf2.predict_proba(x)
        proba3 = self.clf3.predict_proba(x)

        def best_non_shift(clf, proba):
            # Classes ordered by ascending probability; fall back to the
            # runner-up when the most likely class is 'SHIFT'.
            ranked = clf.classes_[np.argsort(proba).squeeze()]
            if ranked[-1] != 'SHIFT':
                return ranked[-1]
            return ranked[-2]

        second = best_non_shift(self.clf2, proba2)
        third = best_non_shift(self.clf3, proba3)
        shift_predicted = self.clf1.classes_[np.argmax(proba1)] == 'SHIFT'
        # The first part of the composed label is always 'REDUCE'.
        reduce_label = '-'.join(['REDUCE', second, third])
        if shift_predicted:
            return 'SHIFT', reduce_label
        return reduce_label, 'INVALID'
Example #7
0
def getPredictionResults(trainData, testData, featureSet, qIds=False):
    """Train bagged gradient boosting and return column-1 probabilities.

    The last column of ``trainData``/``testData`` holds the samples passed to
    ``extractFeatureValues``, the second-to-last column of ``trainData`` the
    integer labels, and the first column the question id used for grouping.

    Args:
        trainData: 2-D array of training rows.
        testData: 2-D array of test rows.
        featureSet: Feature selection forwarded to ``extractFeatureValues``.
        qIds: Falsy to train one model on all rows; otherwise an iterable of
            question ids, one model being trained and applied per id.

    Returns:
        1-D array with ``predict_proba(...)[:, 1]`` for the test rows.  In the
        per-question mode, rows whose id is not in ``qIds`` stay at 0.
    """
    def featureMatrix(samples):
        # NaN/inf in the extracted features are replaced by finite numbers.
        return np.nan_to_num(np.array(
            [np.array(extractFeatureValues(sample, featureSet)) for sample in samples]))

    def positiveProbs(trainRows, testRows):
        trainFeatures = featureMatrix(trainRows[:, -1])
        testFeatures = featureMatrix(testRows[:, -1])
        trainLabels = trainRows[:, -2].astype('int')
        # NOTE(review): loss="deviance" is the legacy name of the log-loss
        # objective; recent scikit-learn releases may reject it -- confirm.
        base = GradientBoostingClassifier(**GBC_PARAMS, loss="deviance")
        # The estimator is passed positionally so that both the old
        # ``base_estimator`` and the new ``estimator`` keyword eras work.
        clf = BaggingClassifier(base, n_estimators=10, bootstrap=False)
        # Fit once: the original chained ``.fit(...).fit(...)`` trained the
        # whole ensemble twice and threw the first result away.
        clf.fit(trainFeatures, trainLabels)
        return clf.predict_proba(testFeatures)[:, 1]

    if not qIds:
        return positiveProbs(trainData, testData)

    probs = np.zeros(len(testData))
    for qId in qIds:
        testMask = testData[:, 0] == qId
        probs[testMask] = positiveProbs(trainData[trainData[:, 0] == qId],
                                        testData[testMask])
    return probs
Example #8
0
def test_probability():
    """Check the probability outputs of BaggingClassifier on iris.

    For a normal ensemble and for a degenerate one (only five samples per
    estimator, so some classes can be missing from a bag) the test asserts
    that ``predict_proba`` rows sum to one and that ``predict_proba`` equals
    ``exp(predict_log_proba)``.
    """
    # One RandomState is shared by the split and by both ensembles, so the
    # order of the statements below determines the random draws.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    # Floating-point divide/invalid warnings are silenced for the whole block
    # (presumably they come from taking the log of zero probabilities).
    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case.  The estimator is passed positionally because the
        # ``base_estimator`` keyword was renamed to ``estimator`` in newer
        # scikit-learn releases; the positional form works with both.
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     random_state=rng).fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BaggingClassifier(LogisticRegression(),
                                     random_state=rng,
                                     max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
Example #9
0
def adaboost_train(train_file,test_file):
    """Train a bagged logistic regression and write test-set probabilities.

    Despite its name the function builds a ``BaggingClassifier``.  Train and
    test features are scaled together, reduced to the 50000 best chi2
    features, and the training rows are split 80/20 for a hold-out AUC.
    Column-1 probabilities for the test file are written to
    ``/home/chuangxin/Bagging_result.csv``.

    Args:
        train_file: Path handed to ``readFile``; three values are unpacked
            from the result (the first is ignored, then features, labels).
        test_file: Path handed to ``readFile``; two values are unpacked
            (ids, features).

    Returns:
        The fitted ``BaggingClassifier``.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    _, x, y = readFile(train_file)
    print('reading done.')
    ts = x.shape[0]
    ids, x2 = readFile(test_file)

    print(x.shape)
    print(x2.shape)

    # Scale train and test rows together, then split them apart again.
    x = np.concatenate((x, x2))
    print('concatenate done.')
    from sklearn.preprocessing import scale
    x = scale(x, with_mean=False)
    print('scale done.')

    x2 = x[ts:]
    x = x[0:ts]

    from sklearn.feature_selection import SelectKBest, chi2
    # Fit the selector on the labelled rows and apply it to BOTH sets.  The
    # original reduced only the training rows, which left the test rows with
    # a different number of columns than the model was trained on.
    selector = SelectKBest(chi2, k=50000).fit(x, y)
    x = selector.transform(x)
    x2 = selector.transform(x2)

    try:
        from sklearn.cross_validation import train_test_split
    except ImportError:
        # The module was moved in later scikit-learn releases.
        from sklearn.model_selection import train_test_split
    tmp_array = np.arange(x.shape[0])
    train_i, test_i = train_test_split(tmp_array, train_size=0.8, random_state=500)

    train_x = x[train_i]
    test_x = x[test_i]
    train_y = y[train_i]
    test_y = y[test_i]

    from sklearn.ensemble import BaggingClassifier
    bagging = BaggingClassifier(LR(penalty='l2', dual=True), n_estimators=10,
                                max_samples=0.6, max_features=0.6)
    bagging.fit(train_x, train_y)
    print('train done.')
    res = bagging.predict(train_x)
    print(res)
    from sklearn.metrics import roc_auc_score

    # AUC on the rows the model was trained on.
    res = bagging.predict_proba(train_x)
    print(res)
    score = roc_auc_score(train_y, res[:, 1])
    print(score)
    print('-----------------------------------------')

    print(res[:, 1])
    # AUC on the held-out 20%.
    res = bagging.predict_proba(test_x)
    score = roc_auc_score(test_y, res[:, 1])
    print(score)

    y = bagging.predict_proba(x2)
    output = pd.DataFrame(data={"id": ids, "sentiment": y[:, 1]})
    # quoting=3 is csv.QUOTE_NONE.
    output.to_csv("/home/chuangxin/Bagging_result.csv", index=False, quoting=3)

    return bagging
Example #10
0
def test_parallel_classification():
    """Verify that bagged classification output does not depend on ``n_jobs``.

    Covers ``predict_proba`` with decision trees and ``decision_function``
    with SVC, and checks that ``decision_function`` rejects input that has
    one feature more than the training data.
    """
    rng = check_random_state(0)

    # Classification data
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    def fitted_bag(base, jobs):
        # Every ensemble uses seed 0 so separately fitted ones are comparable.
        bag = BaggingClassifier(base, n_jobs=jobs, random_state=0)
        return bag.fit(X_train, y_train)

    def new_svc():
        return SVC(gamma='scale', decision_function_shape='ovr')

    # predict_proba
    tree_bag = fitted_bag(DecisionTreeClassifier(), 3)
    tree_bag.set_params(n_jobs=1)
    proba_one_job = tree_bag.predict_proba(X_test)
    tree_bag.set_params(n_jobs=2)
    proba_two_jobs = tree_bag.predict_proba(X_test)
    assert_array_almost_equal(proba_one_job, proba_two_jobs)

    proba_serial_fit = fitted_bag(DecisionTreeClassifier(), 1).predict_proba(X_test)
    assert_array_almost_equal(proba_one_job, proba_serial_fit)

    # decision_function
    svc_bag = fitted_bag(new_svc(), 3)
    svc_bag.set_params(n_jobs=1)
    scores_one_job = svc_bag.decision_function(X_test)
    svc_bag.set_params(n_jobs=2)
    scores_two_jobs = svc_bag.decision_function(X_test)
    assert_array_almost_equal(scores_one_job, scores_two_jobs)

    # Append a zero column so the feature count no longer matches.
    extra_column = np.zeros((X_test.shape[0], 1))
    X_err = np.hstack((X_test, extra_column))
    expected_message = ("Number of features of the model "
                        "must match the input. Model n_features is {0} "
                        "and input n_features is {1} "
                        "".format(X_test.shape[1], X_err.shape[1]))
    assert_raise_message(ValueError, expected_message,
                         svc_bag.decision_function, X_err)

    scores_serial_fit = fitted_bag(new_svc(), 1).decision_function(X_test)
    assert_array_almost_equal(scores_one_job, scores_serial_fit)
Example #11
0
def test_parallel_classification():
    """Check that ``n_jobs`` has no influence on bagged classifier output.

    ``predict_proba`` (decision trees) and ``decision_function`` (SVC) are
    compared across job counts, and ``decision_function`` must raise
    ``ValueError`` for input with an extra feature column.
    """
    rng = check_random_state(0)

    # Classification data
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    # predict_proba: same fitted ensemble, evaluated with 1 and then 2 jobs
    bag = BaggingClassifier(DecisionTreeClassifier(), n_jobs=3, random_state=0)
    bag.fit(X_train, y_train)
    probas = []
    for jobs in (1, 2):
        bag.set_params(n_jobs=jobs)
        probas.append(bag.predict_proba(X_test))
    assert_array_almost_equal(probas[0], probas[1])

    # ... and an ensemble fitted with a single job from the start
    bag = BaggingClassifier(DecisionTreeClassifier(), n_jobs=1, random_state=0)
    bag.fit(X_train, y_train)
    assert_array_almost_equal(probas[0], bag.predict_proba(X_test))

    # decision_function
    bag = BaggingClassifier(
        SVC(gamma='scale', decision_function_shape='ovr'),
        n_jobs=3, random_state=0)
    bag.fit(X_train, y_train)
    decisions = []
    for jobs in (1, 2):
        bag.set_params(n_jobs=jobs)
        decisions.append(bag.decision_function(X_test))
    assert_array_almost_equal(decisions[0], decisions[1])

    # One additional all-zero column makes the feature count mismatch.
    X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
    message = ("Number of features of the model "
               "must match the input. Model n_features is {0} "
               "and input n_features is {1} "
               "".format(X_test.shape[1], X_err.shape[1]))
    assert_raise_message(ValueError, message, bag.decision_function, X_err)

    bag = BaggingClassifier(
        SVC(gamma='scale', decision_function_shape='ovr'),
        n_jobs=1, random_state=0)
    bag.fit(X_train, y_train)
    assert_array_almost_equal(decisions[0], bag.decision_function(X_test))
Example #12
0
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  depth=4,
                  lrate=.1,
                  n_fold=5,
                  n_bag=50,
                  subrow=.5,
                  subcol=.8):
    """Cross-validate and train a bagged XGBoost classifier, saving predictions.

    Args:
        train_file: Training data path, read with ``load_data``.
        test_file: Test data path, read with ``load_data``.
        predict_valid_file: Output path for the out-of-fold predictions.
        predict_test_file: Output path for the test predictions.
        n_est: ``n_estimators`` of the XGBoost model.
        depth: ``max_depth`` of the XGBoost model.
        lrate: ``learning_rate`` of the XGBoost model.
        n_fold: Number of stratified CV folds.
        n_bag: Number of bagged estimators.
        subrow: ``max_samples`` of the bagging ensemble.
        subcol: ``max_features`` of the bagging ensemble.

    Side effects:
        Configures logging to a file named after the parameters and writes
        both prediction files with ``np.savetxt``.
    """

    # File name without its last four characters (presumably the extension).
    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='xg_bag{}_{}_{}_{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, lrate, subrow, subcol,
                            feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    xg = xgb.XGBClassifier(max_depth=depth,
                           learning_rate=lrate,
                           n_estimators=n_est,
                           colsample_bytree=.8,
                           subsample=.5,
                           nthread=4)

    clf = BG(xg, n_estimators=n_bag, max_samples=subrow, max_features=subcol)
    # Legacy StratifiedKFold API: the object is iterated directly and yields
    # (train indices, validation indices) pairs.
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    # Out-of-fold probabilities need a floating-point buffer.  A plain
    # ``np.zeros_like(y)`` inherits the label dtype, so integer labels would
    # truncate every stored probability to 0.  Float labels keep their dtype.
    label_dtype = np.asarray(y).dtype
    if not np.issubdtype(label_dtype, np.floating):
        label_dtype = np.float64
    p_val = np.zeros_like(y, dtype=label_dtype)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn],
                clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #13
0
def test_parallel_classification():
    """Verify that bagged classification output does not depend on ``n_jobs``.

    Covers ``predict_proba`` with decision trees and ``decision_function``
    with SVC, and checks that ``decision_function`` raises ``ValueError``
    with the expected message for input that has an extra feature column.
    """
    rng = check_random_state(0)

    # Classification data
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    def fitted_bag(base, jobs):
        # Every ensemble uses seed 0 so separately fitted ones are comparable.
        return BaggingClassifier(
            base, n_jobs=jobs, random_state=0).fit(X_train, y_train)

    # predict_proba
    tree_bag = fitted_bag(DecisionTreeClassifier(), 3)
    tree_bag.set_params(n_jobs=1)
    proba_one_job = tree_bag.predict_proba(X_test)
    tree_bag.set_params(n_jobs=2)
    proba_two_jobs = tree_bag.predict_proba(X_test)
    assert_array_almost_equal(proba_one_job, proba_two_jobs)

    tree_bag = fitted_bag(DecisionTreeClassifier(), 1)
    assert_array_almost_equal(proba_one_job, tree_bag.predict_proba(X_test))

    # decision_function
    svc_bag = fitted_bag(SVC(decision_function_shape='ovr'), 3)
    svc_bag.set_params(n_jobs=1)
    scores_one_job = svc_bag.decision_function(X_test)
    svc_bag.set_params(n_jobs=2)
    scores_two_jobs = svc_bag.decision_function(X_test)
    assert_array_almost_equal(scores_one_job, scores_two_jobs)

    # Append a zero column so the feature count no longer matches.
    padding = np.zeros((X_test.shape[0], 1))
    X_err = np.hstack((X_test, padding))
    # The message is used as a regular expression by ``pytest.raises``.
    err_msg = (f"Number of features of the model must match the input. Model "
               f"n_features is {X_test.shape[1]} and input n_features is "
               f"{X_err.shape[1]} ")
    with pytest.raises(ValueError, match=err_msg):
        svc_bag.decision_function(X_err)

    svc_bag = fitted_bag(SVC(decision_function_shape='ovr'), 1)
    assert_array_almost_equal(scores_one_job,
                              svc_bag.decision_function(X_test))
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  depth=4,
                  lrate=.1,
                  n_fold=5):
    """Cross-validate and train a bagged GBM on svmlight data, saving predictions.

    Args:
        train_file: Training data in svmlight format.
        test_file: Test data in svmlight format.
        predict_valid_file: Output path for the out-of-fold predictions.
        predict_test_file: Output path for the test predictions.
        n_est: ``n_estimators`` of the GBM.
        depth: ``max_depth`` of the GBM.
        lrate: ``learning_rate`` of the GBM.
        n_fold: Number of stratified CV folds.

    Side effects:
        Logs progress and the mean per-fold log loss, and writes both
        prediction files with ``np.savetxt``.
    """

    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    # ``toarray()`` returns a plain ndarray; ``todense()`` returns the legacy
    # ``np.matrix`` type, which newer scikit-learn releases no longer accept.
    X = X.toarray()
    X_tst = X_tst.toarray()

    logging.info('Validation...')
    gbm = GBM(max_depth=depth,
              learning_rate=lrate,
              n_estimators=n_est,
              random_state=2015)

    # The estimator is passed positionally so that both the old
    # ``base_estimator`` and the new ``estimator`` keyword eras work.
    clf = BG(gbm,
             n_estimators=5,
             max_samples=0.8,
             max_features=0.8,
             bootstrap=True,
             bootstrap_features=True,
             random_state=42,
             verbose=0)

    # Legacy StratifiedKFold API: the object is iterated directly and yields
    # (train indices, validation indices) pairs.
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    lloss = 0.
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        lloss += log_loss(y[i_val], p_val[i_val])

    # Report the mean over folds; the accumulated sum alone would be n_fold
    # times larger than an actual log-loss value.
    logging.info('Log Loss = {:.4f}'.format(lloss / n_fold))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def main():
    """Train a bagged random forest on booking rows and write a submission.

    Reads ``input/train.csv`` in chunks and keeps only rows with
    ``is_booking == 1``.  It fits ``BaggingClassifier(RandomForestClassifier)``
    on the selected columns, predicts class probabilities for
    ``input/test.csv`` chunk by chunk, and stores ``get5Best`` of every
    probability row in the ``hotel_cluster`` column of
    ``submission_random_forest.csv``.
    """
    # The sample submission provides the layout of the output file.
    submit = pd.read_csv('sample_submission.csv')

    feature_cols = ['site_name', 'user_location_region', 'is_package',
                    'srch_adults_cnt', 'srch_children_cnt',
                    'srch_destination_id', 'hotel_market', 'hotel_country']
    train_cols = feature_cols + ['hotel_cluster']

    print("Loading training csv.")
    train_chunk = pd.read_csv('input/train.csv', chunksize=100000)
    print("Training csv loaded.")

    # Collect the filtered chunks and concatenate once.  Growing a DataFrame
    # with pd.concat inside the loop copies all previous rows every time, and
    # seeding it with an empty frame can force object dtype on the result.
    booked_chunks = []
    for chunk in train_chunk:
        booked_chunks.append(chunk[chunk['is_booking'] == 1][train_cols])
        print("Chunk done")
    if booked_chunks:
        train = pd.concat(booked_chunks)
    else:
        train = pd.DataFrame(columns=train_cols)

    x_train = train[feature_cols].values
    y_train = train['hotel_cluster'].values

    # Run RandomForest on training data
    print("Training RandomForest.")
    rf = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=4)
    bclf = BaggingClassifier(rf, n_estimators=2, n_jobs=4)
    bclf.fit(x_train, y_train)
    print("Training done.")

    print("Loading testing csv.")
    test_chunk = pd.read_csv('input/test.csv', chunksize=100000)
    print("Begin testing each chunk.")
    # Same pattern as above: gather per-chunk probabilities, join them once.
    chunk_probas = []
    for i, chunk in enumerate(test_chunk):
        # Missing values in the test features are replaced by finite numbers.
        test_X = np.nan_to_num(chunk[feature_cols].values)
        chunk_probas.append(bclf.predict_proba(test_X))
        print("Chunk id: " + str(i))
    predict = np.concatenate(chunk_probas) if chunk_probas else np.array([])

    submit['hotel_cluster'] = np.apply_along_axis(get5Best, 1, predict)
    submit.to_csv('submission_random_forest.csv', index=False)
Example #16
0
def bagging(X_train, X_test, y_train, y_test):
    """Fit a 100-estimator BaggingClassifier and print its scores.

    Prints the mean 5-fold cross-validated precision and accuracy on the
    training data, and the precision and accuracy of the fitted model on the
    test data.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``BaggingClassifier`` fitted on the full training data.
    """
    from sklearn.ensemble import BaggingClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import precision_score
    import numpy as np

    bagc = BaggingClassifier(n_estimators=100)
    bagc.fit(X_train, y_train)

    # Only hard predictions are needed.  The original also computed
    # predict_proba(X_test)[:, 1] for a ROC plot that is disabled, which cost
    # a full pass over all 100 estimators for an unused result.
    y_preds_bagc_bin = bagc.predict(X_test)

    bagc_prec = np.mean(cross_val_score(bagc, X_train, y_train, scoring='precision', cv=5))
    bagc_acc = np.mean(cross_val_score(bagc, X_train, y_train, scoring='accuracy', cv=5))
    bagc_test_prec = precision_score(y_test, y_preds_bagc_bin)
    bagc_test_acc = accuracy_score(y_test, y_preds_bagc_bin)
    print("The cross validated precision score is {:0.3}".format(bagc_prec))
    print("The cross validated accuracy score is {:0.3}".format(bagc_acc))
    print("The test precision score is {:0.3}".format(bagc_test_prec))
    print("The test accuracy score is {:0.3}".format(bagc_test_acc))

    return bagc
Example #17
0
def svm_ensemble(train_x, train_y, test_x, test_y):
    """Bag grid-searched linear SVMs and report metrics and feature weights.

    Args:
        train_x: Training features.
        train_y: Training labels.
        test_x: Test features.
        test_y: Test labels.

    Returns:
        tuple: ``(tpr_values, fpr_values, roc_auc, acc, recall, svm_imp)``.
        The first three come from ``get_auc``, ``acc`` is the test accuracy,
        ``recall`` the per-class test recall, and ``svm_imp`` the mean
        absolute coefficient per feature across the bagged estimators.
    """
    # Set the parameters by cross-validation (default:5-fold)
    tuned_svm = [{'C': [1, 10, 100, 1000]}]
    svm_clf = GridSearchCV(svm.SVC(kernel="linear",
                                   probability=True,
                                   class_weight="balanced"),
                           tuned_svm,
                           scoring="accuracy")
    svm_bagging = BaggingClassifier(svm_clf,
                                    n_estimators=ESTIMATOR,
                                    max_samples=0.2)
    svm_bagging.fit(train_x, train_y)
    probas = svm_bagging.predict_proba(test_x)[:, 1]
    tpr_values, fpr_values, roc_auc = get_auc(probas, test_y)
    y_true, y_pred = test_y, svm_bagging.predict(test_x)
    recall = recall_score(y_true, y_pred, average=None)
    acc = accuracy_score(y_true, y_pred)

    # Size the importance table from the data; the width used to be
    # hard-coded to 22 features.
    n_features = np.shape(train_x)[1]
    svm_imp = np.zeros((ESTIMATOR, n_features))
    for i, grid_base in enumerate(svm_bagging.estimators_):
        base = grid_base.best_estimator_
        # NOTE(review): the best estimator is refitted on the FULL training
        # set before its coefficients are read, which also mutates the fitted
        # ensemble -- confirm this is intended.
        base.fit(train_x, train_y)
        svm_imp[i] = abs(base.coef_)
    svm_imp = np.mean(svm_imp, axis=0)
    return tpr_values, fpr_values, roc_auc, acc, recall, svm_imp
Example #18
0
def vipsClassification(featureSetup, testSet, featureSets, resFileName):
    """Train on the VIPS training data, score a test set and write reports.

    Args:
        featureSetup: Base name of the JSON feature files to load.
        testSet: Name of the test-set directory under ``VIPS_FEATURES_PATH``;
            also used as prefix of the error-analysis CSV.
        featureSets: Keys into ``FEATURE_SETS`` whose features are combined.
        resFileName: Prefix of the question-report file.

    Side effects:
        Overwrites the last column of the loaded test data with the predicted
        probabilities, writes ``<testSet>_Error_Analysis.csv`` and calls
        ``questionReporting`` to produce ``<resFileName>_qRep.csv``.
    """
    featureSet = []
    for feat in featureSets:
        featureSet += FEATURE_SETS[feat]
    featurePath = VIPS_FEATURES_PATH
    resPath = VIPS_RES_PATH

    trainData, trainQIds = loadJsonData(join(featurePath, "train", featureSetup + ".json"))
    testData, testQids = loadJsonData(join(featurePath, testSet, featureSetup + ".json"))
    # One result tuple with two lists that are filled in further down.
    res = [([], testData, testSet, [])]

    trainFeatures = np.nan_to_num(np.array([np.array(extractFeatureValues(trainSample, featureSet)) for trainSample in trainData[:,-1]]))
    trainLabels = trainData[:,-2].astype('int')
    # NOTE(review): loss="deviance" is the legacy name of the log-loss
    # objective; recent scikit-learn releases may reject it -- confirm.
    base = GradientBoostingClassifier(**GBC_PARAMS, loss="deviance", subsample=1)
    # The estimator is passed positionally so that both the old
    # ``base_estimator`` and the new ``estimator`` keyword eras work.
    clf = BaggingClassifier(base, n_jobs=8, n_estimators=8, bootstrap=False).fit(trainFeatures, trainLabels)

    testFeatures = np.nan_to_num(np.array([np.array(extractFeatureValues(testSample, featureSet)) for testSample in testData[:,-1]]))
    probs = clf.predict_proba(testFeatures)[:,1]
    # Replace the raw sample column by the predicted probability.
    testData[:,-1] = probs
    # newline='' is what the csv module requires for files it writes;
    # without it, platforms that translate line endings get blank rows.
    with open(join(resPath, testSet + "_Error_Analysis.csv"), "w", newline="") as f:
        featureWriter = csv.writer(f)
        featureWriter.writerows(testData)
    res[0][0].append(probs)
    res[0][3].append(testSet)
    resFile = join(resPath, resFileName + "_qRep.csv")
    questionReporting(res, resFile, testQids)
def BaggingClassifierPhase(fullData):
    """Fit a default BaggingClassifier on ``fullData`` and report its scores.

    Prints the weighted F1 score, the confusion matrix and the 10-fold
    cross-validation scores, calls ``findROC`` on the predictions, and
    returns the mean of the second column of ``predict_proba``.  All
    predictions are made on the same rows the model was fitted on.

    Args:
        fullData: 2-D array; columns are selected via ``NewDimension[1]``.

    Returns:
        float: Average of ``predict_proba(...)[:, 1]`` over all rows.
    """
    targetColumn = NewDimension[1]
    # NOTE(review): the feature slice stops at targetColumn - 1 (exclusive),
    # so column targetColumn - 1 is not used as a feature -- confirm.
    Features = fullData[:, 0:targetColumn - 1].astype(float)
    Target = fullData[:, targetColumn]

    Bagmodel = BaggingClassifier()
    Bagmodel.fit(Features, Target)
    print(
        "Bagging Classifier is built and making predictions and calculating score of the model."
    )

    # Flatten the target to a 1-D numpy array.
    TargetMat = np.ravel(Target)
    predicted = Bagmodel.predict(Features)

    weightedF1 = metrics.f1_score(Target,
                                  predicted,
                                  pos_label='1.0',
                                  average='weighted')
    print("\nf1_score:\n", weightedF1)

    confusion = metrics.confusion_matrix(TargetMat, predicted)
    print("\nConfusion matrix:\n    0.0\t  1.0\n", confusion)

    print("Calculating the score of the model for 10-folds.")
    # Similar scores for every fold would indicate the model is not overfitting.
    foldScores = cross_val_score(Bagmodel, Features, TargetMat, cv=10)
    print("\nScore of the model for each of the 10 folds:\n", foldScores)

    findROC(TargetMat, predicted)

    # Column 1 is the probability of the second entry of classes_
    # (presumably the "captured" flag -- confirm).
    classProbs = Bagmodel.predict_proba(Features)[:, 1].tolist()
    total = 0
    for prob in classProbs:
        total += prob
    AverageProbability = total / len(classProbs)
    print("\nAverage probability of deals being Captured is: ",
          AverageProbability)
    return AverageProbability
Example #20
0
def othertest(precisionk, draw='False'):
    """Train a bagged logistic regression on a random 80/20 split and print its AUC.

    Reads ``./data/cleaned_knnimpute.csv``, indexes it by ``sid``, drops rows
    whose ``Y`` is NaN, keeps the columns returned by
    ``precisionCol(train, precisionk)`` plus ``Y``, fits the model on the
    training part and prints the ROC AUC on the test part.

    :param precisionk: forwarded unchanged to ``precisionCol``.
    :param draw: the *string* ``'True'`` triggers ``plotAUC`` and saves
        ``testnorm_randomforest.png``; any other value skips plotting.
    """
    cleandata = pd.read_csv("./data/cleaned_knnimpute.csv")
    cleandata.index = cleandata.sid
    # Keyword ``axis`` works on every pandas version; the positional form does not.
    cleandata = cleandata.drop('sid', axis=1)
    # Keep only the rows that actually carry a label.
    cleandata = cleandata[~np.isnan(cleandata['Y'])]

    # After c is chosen, use this to draw AUC plot
    train_id, test_id = train_test_split(cleandata.index,
                                         test_size=0.2)  # test_ratio = 0.2
    # Label-based selection; ``.ix`` no longer exists in current pandas.
    train = cleandata.loc[train_id]
    test = cleandata.loc[test_id]

    coltest = list(precisionCol(train, precisionk))
    coltest.append('Y')
    train = train[coltest]
    test = test[coltest]

    model = BaggingClassifier(base_estimator=linear_model.LogisticRegression(),
                              n_estimators=100,
                              max_features=200,
                              n_jobs=-1)
    model.fit(train.drop('Y', axis=1), train['Y'])

    test_features = test.drop('Y', axis=1)
    fpr, tpr, thresholds = roc_curve(
        test['Y'],
        model.predict_proba(test_features)[:, 1])
    # Function-call form prints the same text on Python 2 and parses on Python 3.
    print(auc(fpr, tpr))

    if draw == 'True':
        plotAUC(test['Y'], model.decision_function(test_features),
                'Gradient Boosting')
        plt.savefig("testnorm_randomforest.png", dpi=120)
Example #21
0
def query_by_bagging(X, y, current_model, batch_size, rng, base_model=SVC(C=1, kernel='linear'), n_bags=5, method="KL", D=None):
    """
    Query-by-bagging selection: fit a bagged committee on the known samples
    and rank the remaining ones by committee disagreement.

    :param X: sample matrix, indexed with the boolean mask ``y.known``
    :param y: label container exposing ``known`` and ``unknown_ids``
    :param current_model: not used by this strategy
    :param batch_size: number of sample ids to return
    :param rng: passed as ``random_state`` to the bagging ensemble
    :param base_model: Model that will be **fitted every iteration**
    :param n_bags: Number of bags on which train n_bags models
    :param method: 'entropy' or 'KL'
    :param D: not used by this strategy
    :return: tuple of (selected entries of ``y.unknown_ids``,
        fitness divided by its maximum)
    """
    assert method in ('entropy', 'KL')
    eps = 1e-7
    if method == 'KL':
        assert hasattr(base_model, 'predict_proba'), "Model with probability prediction needs to be passed to this strategy!"

    committee = BaggingClassifier(base_model, n_estimators=n_bags, random_state=rng)
    committee.fit(X[y.known], y[y.known])

    unlabeled = X[np.invert(y.known)]
    consensus = committee.predict_proba(unlabeled)

    # Settles page 17
    if method == 'entropy':
        # eps keeps log() finite when a class has zero consensus probability.
        consensus += eps
        fitness = np.sum(consensus * np.log(consensus), axis=1)
        ids = np.argsort(fitness)[:batch_size]
    elif method == 'KL':
        # NOTE(review): no eps here, so a zero probability yields NaN/inf terms.
        member_probs = np.array([member.predict_proba(unlabeled) for member in committee.estimators_])
        per_member_kl = np.sum(member_probs * np.log(member_probs / consensus), axis=2)
        fitness = np.mean(per_member_kl, axis=0)
        ids = np.argsort(fitness)[-batch_size:]

    return y.unknown_ids[ids], fitness/np.max(fitness)
Example #22
0
class BaggingClassifierImpl():
    """Wrapper that builds an ``SKLModel`` from bagging hyperparameters and delegates to it."""

    def __init__(self, base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):
        """Store the hyperparameters and construct the wrapped model.

        ``base_estimator`` is first passed through ``make_sklearn_compat``;
        every other argument is forwarded to ``SKLModel`` unchanged.
        """
        self._hyperparams = dict(
            base_estimator=make_sklearn_compat(base_estimator),
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            oob_score=oob_score,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (``y`` is only passed on when given) and return ``self``."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict(X)``."""
        delegate = self._wrapped_model
        return delegate.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)``."""
        delegate = self._wrapped_model
        return delegate.predict_proba(X)

    def decision_function(self, X):
        """Return the wrapped model's ``decision_function(X)``."""
        delegate = self._wrapped_model
        return delegate.decision_function(X)
Example #23
0
def cv_layer_1(clf_arguments, clf_type, datasets, non_clf_arguments):
    """Outer cross-validation loop; yields one ``(score, best_arg)`` pair per fold.

    For every ``(train, test)`` pair in ``datasets`` the inner search
    ``cv_layer_2`` picks the best arguments on the training part, a fresh
    ``clf_type(**best_arg)`` is fitted (wrapped in a ``BaggingClassifier``
    when ``non_clf_arguments["bagging"]`` is truthy) and its test
    probabilities are scored with ``interpret_score``.

    This is a generator: nothing, including the banner print, runs until
    it is iterated.
    """
    print("running %s experiment" % clf_type.__name__)

    for (train_X, train_y), (test_X, test_y) in datasets:
        # The inner layer only ever sees this fold's training data.
        inner_dataset = (train_X, train_y)
        best_res, best_arg = cv_layer_2(clf_arguments, clf_type,
                                        inner_dataset, non_clf_arguments)

        estimator = clf_type(**best_arg)
        if non_clf_arguments["bagging"]:
            estimator = BaggingClassifier(estimator)
        estimator.fit(train_X, train_y)

        fold_proba = estimator.predict_proba(test_X)
        fold_weights = compute_validation_weights(test_y)
        fold_score = interpret_score(fold_proba,
                                     test_y,
                                     validation_weights=fold_weights,
                                     show_roc=True)

        # Deep copy so later folds cannot mutate what was already yielded.
        yield copy.deepcopy((fold_score, best_arg))
def Model_2(train, test):
    ''' Trains the model and saves the predictions in CSV files.

        train : Training set, read through the 'Sequence' and ' Label' columns
        test : Test set, read through the ' Sequence' and 'ID' columns

        Writes class-1 probabilities to "Submission_2.csv" and predicted
        labels to "Prediction_2.csv".
    '''
    # NOTE(review): the train set is read via 'Sequence' while the test set uses
    # ' Sequence' (leading space) -- confirm this matches the real CSV headers.

    def featurize(seq):
        # AAC features followed by mass per residue, charge and isoelectric point.
        extras = [
            mass.calculate_mass(sequence=seq) / len(seq),
            electrochem.charge(seq, len(seq)),
            ProteinAnalysis(seq).isoelectric_point(),
        ]
        return AAC(seq) + extras[0:1] + extras[1:2] + extras[2:3]

    # Preprocessing
    X_train = np.array([featurize(seq) for seq in train['Sequence']])
    X_test_rows = [featurize(seq) for seq in test[' Sequence']]
    Y_train = np.array(train[' Label'])
    X_test = np.array(X_test_rows)

    X_train, Y_train = shuffle(X_train, Y_train, random_state=3)

    # Training
    param = {
        'max_depth': 25,
        'objective': 'reg:logistic',
        'n_estimators': 100,
        'booster': 'gbtree',
        'colsample_bylevel': 0.7,
        'colsample_bytree': 1,
        'n_thread': 2,
    }
    xgb = XGBClassifier(**param, random_state=3)
    clf = BaggingClassifier(base_estimator=xgb, n_estimators=23, random_state=3, n_jobs=-1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_prob = [row[1] for row in clf.predict_proba(X_test)]
    Y_pred = clf.predict(X_test)

    # The same frame is written twice: probabilities first, then hard labels.
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_2.csv", index=False)
    result["Label"] = Y_pred
    result.to_csv("Prediction_2.csv", index=False)
Example #25
0
def drawROC(XY, X, Y, class_label_list):
    """Plot the ROC curve of a default BaggingClassifier on a 25% hold-out split.

    ``X``/``Y`` are split with ``random_state=42``, the classifier is fitted on
    the 75% part and column 1 of its ``predict_proba`` on the hold-out part is
    scored against the last entry of ``class_label_list`` as positive label.
    ``XY`` is accepted but not used.
    """
    X_fit, X_holdout, Y_fit, Y_holdout = train_test_split(X,
                                                          Y,
                                                          test_size=0.25,
                                                          random_state=42)

    # classification
    bagger = BaggingClassifier(random_state=42)
    bagger = bagger.fit(X_fit, Y_fit)

    # find out required parameters
    holdout_proba = bagger.predict_proba(X_holdout)
    positive_label = class_label_list[len(class_label_list) - 1]
    fpr, tpr, threshold = metrics.roc_curve(Y_holdout,
                                            holdout_proba[:, 1],
                                            pos_label=positive_label)
    roc_auc = metrics.auc(fpr, tpr)

    # Plotting: ROC curve in blue, chance diagonal as a red dashed line.
    plt.title('ROC curve')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('Actual Positive Rate')
    plt.xlabel('Not Actual Positive Rate')
    plt.show()
def bagged_tree():
    """Cross-validate and fit 3000 bagged decision trees, then write a submission file.

    Features come from ``load_features()`` with NaNs replaced by 0.  The
    5-fold ROC-AUC mean is printed, the model is fitted on the full training
    set, column 1 of ``predict_proba`` on the test set is written to
    ``data/submission_bagged.csv`` and the value returned by ``roc_auc`` is
    printed.
    """
    train_features, test_features = load_features()
    train_features = train_features.fillna(value=0)
    test_features = test_features.fillna(value=0)

    Y_train = train_features["outcome"]
    X_train = train_features.drop(["bidder_id", "outcome"], axis=1)
    X_test = test_features.drop(["bidder_id"], axis=1)

    print("Training Bagged forest classifier model")
    bag_class = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                  n_estimators=3000)
    # NOTE(review): this message is emitted before any fitting has happened.
    print("Model trained")

    print("Cross validation score (Bagged Forest) : ")
    fold_aucs = cross_val_score(bag_class, X_train, Y_train, cv=5, scoring='roc_auc')
    cv_score = np.mean(fold_aucs)
    print(cv_score)

    print("Generating submission file")
    bag_class.fit(X_train, Y_train)
    test_proba = bag_class.predict_proba(X_test)
    test_features['prediction'] = test_proba[:, 1]
    submission = test_features[['bidder_id', 'prediction']]
    submission.to_csv('data/submission_bagged.csv', index=False)
    print("Output file successfully created")

    print("Generating auc curve and auc score")
    auc_value = roc_auc(train_features, bag_class)
    print("AUC score : " + str(auc_value))
Example #27
0
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est,
                  depth,
                  n_fold=5,
                  n_bag=50):
    """Cross-validate and fit bagged extra-trees, saving out-of-fold and test predictions.

    :param train_file: training data path, read with ``load_data``; its base
        name minus the last four characters is used in the log file name.
    :param test_file: test data path, read with ``load_data``.
    :param predict_valid_file: output path for the out-of-fold probabilities.
    :param predict_test_file: output path for the test probabilities.
    :param n_est: ``n_estimators`` of each extra-trees model.
    :param depth: ``max_depth`` of each extra-trees model.
    :param n_fold: number of stratified CV folds.
    :param n_bag: number of bagged extra-trees models.
    """
    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='et_bag{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    et = ET(n_estimators=n_est,
            max_depth=depth,
            random_state=2015,
            class_weight='auto',
            bootstrap=True)

    clf = BG(et, n_estimators=n_bag, max_samples=.8, max_features=.9)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    # Explicit float buffer: np.zeros_like(y) alone inherits y's dtype, so
    # integer labels would silently truncate every stored probability to 0.
    p_val = np.zeros_like(y, dtype=float)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn],
                clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #28
0
def main():
    """Train a bagged random forest on the hotel data and write the submission CSV.

    Reads ``input/train.csv`` and ``input/test.csv``, fills NaNs with 0, fits
    two bagged 100-tree random forests on the selected feature columns with
    ``hotel_cluster`` as target, and writes for every test row its 1-based id
    plus the five values returned by ``get5Best`` to
    ``random_forest_submission.csv``.
    """
    # Same feature columns for train and test; defined once to keep them in sync.
    feature_columns = [
        'site_name', 'posa_continent', 'user_location_country',
        'user_location_region', 'user_location_city',
        'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
        'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
        'srch_destination_id', 'srch_destination_type_id', 'hotel_continent',
        'hotel_country', 'hotel_market'
    ]

    # The competition datafiles are in the directory /input
    train_csv = pd.read_csv("input/train.csv")
    test_csv = pd.read_csv("input/test.csv")

    # Prepare train/test by taking columns and filling NaNs
    train = train_csv[feature_columns].fillna(0)
    target = train_csv[['hotel_cluster']].fillna(0)
    test = test_csv[feature_columns].fillna(0)

    # Run Random Forest on training data.  print() with one argument emits the
    # same text on Python 2 and is valid syntax on Python 3.
    print("Training Random Forest")
    rf = RandomForestClassifier(n_estimators=100, n_jobs=2)
    bagged_rf = BaggingClassifier(rf, n_estimators=2, n_jobs=2)
    bagged_rf.fit(train, target.values.ravel())

    # Predict with testing data
    print("Predicting test data")
    predict = bagged_rf.predict_proba(test)

    # Generate submission: column 0 is the row id, columns 1-5 the five picks.
    print("Generating submission")
    stuff = np.apply_along_axis(get5Best, 1, predict)
    subm = np.empty((len(predict), 6))
    subm[:, 0] = np.arange(1, len(predict) + 1)
    subm[:, 1:] = stuff[:, :5]
    np.savetxt('random_forest_submission.csv',
               subm,
               fmt='%d,%d %d %d %d %d',
               delimiter=',',
               header='id,hotel_cluster',
               comments='')

    print("Done")
Example #29
0
def averaged_2models(trainX, validX, trainy, validy, testX, model1, model2,
                     sub1, sub2):
    """
    Average the predicted probabilities of two models over a weight grid.

    Each model is wrapped (BaggingClassifier for MLP / XGBoost models,
    3-fold isotonic CalibratedClassifierCV otherwise), fitted on the training
    data and used to predict the test set.  Individual results are saved as
    "../<sub>_submission.csv"; for every weight x in 0.1 .. 0.9 the blend
    x * res1 + (1 - x) * res2 is saved under "../combined_csv/".  When
    validy is non-empty the individual and blended validation scores are
    printed as well.

    sub1 and sub2 are parameters to change name of submission file.
    Better to use averaging_probas to avoid retraining classifiers.
    """

    def wrap(model):
        # Bag MLP / XGBoost models; calibrate everything else.
        if isinstance(model, MLPClassifier) or isinstance(model,
                                                          xgb.XGBClassifier):
            return BaggingClassifier(model)
        return CalibratedClassifierCV(model, 'isotonic', 3)

    calib1 = wrap(model1)
    calib1.fit(trainX, trainy)
    print("model1 trained")

    calib2 = wrap(model2)
    calib2.fit(trainX, trainy)
    print("model2 trained")

    if len(validy) != 0:
        valid1 = calib1.predict_proba(validX)
        print("Evaluation model1(kaggle) of validation set :",
              evaluation(validy, valid1))
        valid2 = calib2.predict_proba(validX)
        print("Evaluation model2(kaggle) of validation set :",
              evaluation(validy, valid2))

    res1 = calib1.predict_proba(testX)
    saveResult(res1, "../" + sub1 + "_submission.csv")
    res2 = calib2.predict_proba(testX)
    saveResult(res2, "../" + sub2 + "_submission.csv")

    for tenth in range(1, 10):
        x = tenth / 10.0
        combres = (x * res1 + (1 - x) * res2)
        if len(validy) != 0:
            pred_valid = (x * valid1 + (1 - x) * valid2)
            print("Evaluation (kaggle) of validation set :",
                  evaluation(validy, pred_valid))
        out_path = "../combined_csv/combined_{:1.2f}{}_{:1.2f}{}.csv".format(
            x, sub1, 1 - x, sub2)
        saveResult(combres, out_path)
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  depth=4,
                  lrate=.1,
                  n_fold=5):
    """Cross-validate and fit bagged XGBoost models on svmlight data.

    Out-of-fold class-1 probabilities go to ``predict_valid_file`` and the
    test probabilities of the model refitted on all data go to
    ``predict_test_file``.  The mean per-fold log loss is logged.
    """
    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    # Work on dense matrices from here on.
    X = X.todense()
    X_tst = X_tst.todense()

    booster = XGBoostClassifier(n_estimators=n_est,
                                eta=lrate,
                                max_depth=depth,
                                n_jobs=8)
    bagged = BaggingClassifier(base_estimator=booster,
                               n_estimators=5,
                               max_samples=0.9,
                               max_features=0.9,
                               random_state=42)

    folds = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    total_loss = 0.
    for fit_idx, held_idx in folds:
        bagged.fit(X[fit_idx], y[fit_idx])
        p_val[held_idx] = bagged.predict_proba(X[held_idx])[:, 1]
        total_loss += log_loss(y[held_idx], p_val[held_idx])

    mean_loss = total_loss / n_fold
    logging.info('Log Loss = {:.4f}'.format(mean_loss))

    logging.info('Retraining with 100% data...')
    bagged.fit(X, y)
    p_tst = bagged.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #31
0
 def create_estimators(self, X_train, y_train, X_test):
     """Fit one bagged ensemble per (model, parameter set) and collect test probabilities.

     For every model in ``self.models`` and every parameter set produced by
     ``self.create_parameter_grid(model)``, the model is reconfigured in
     place, bagged with ``self.estimators`` members, fitted on the training
     data, and column 1 of its ``predict_proba`` on ``X_test`` is appended
     to ``self.predictions``.
     """
     for model in self.models:
         for parameters in self.create_parameter_grid(model):
             configured = model.set_params(**parameters)
             bagger = BaggingClassifier(base_estimator=configured, n_estimators=self.estimators, max_samples=0.95, n_jobs=3)
             bagger.fit(X_train, y_train)
             self.predictions.append(bagger.predict_proba(X_test)[:, 1])
def bagging_classifier(x_train, y_train, x_test):
    '''
    Fit a BaggingClassifier (max_features=.75) on the training data.

    Returns column 0 of its predicted class probabilities on x_test,
    i.e. the probability of the first class in the model's class order.
    '''
    model = BaggingClassifier(max_features=.75)
    model.fit(x_train, y_train)
    test_proba = model.predict_proba(x_test)
    return test_proba[:, 0]
Example #33
0
def run():
    """Train a bagged linear SVM on ``./data/australian.csv`` and report its quality.

    Column 14 of the whitespace-separated, header-less file is the target.
    A stratified 60/40 split is used; the classification report, ROC AUC and
    accuracy on the test part are printed and the ROC curve is shown.
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as pyplot
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
    from sklearn.svm import SVC
    from sklearn.ensemble import BaggingClassifier

    # Raw string: '\s' in a normal literal is an invalid escape sequence that
    # newer Python versions warn about; the pattern itself is unchanged.
    df = pd.read_table("./data/australian.csv", sep=r'\s+', header=None)
    y = df[14]
    X = df.drop(columns=14)

    # Split features and target into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=1,
                                                        stratify=y,
                                                        test_size=0.4)

    # Instantiate the classifier: 20 linear SVMs, each on half of the samples.
    clf = BaggingClassifier(SVC(kernel='linear', random_state=1),
                            max_samples=0.5,
                            max_features=1.0,
                            n_estimators=20)
    clf.fit(X_train, y_train)

    # Make predictions for the test set
    y_pred_test = clf.predict(X_test)
    print(classification_report(y_test, y_pred_test))

    # keep probabilities for the positive outcome only
    clf_probs = clf.predict_proba(X_test)[:, 1]
    # calculate and summarize scores
    clf_auc = roc_auc_score(y_test, clf_probs)
    print('Bagged_SVM: ROC AUC=%.3f' % (clf_auc))
    print("accuracy_score is %.3f" %
          (accuracy_score(y_test, y_pred_test, normalize=True)))

    # calculate and plot the roc curve for the model
    clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_probs)
    pyplot.plot(clf_fpr, clf_tpr, marker='.', label='Bagged_SVM')
    # axis labels
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    # show the legend
    pyplot.legend()
    # show the plot
    pyplot.show()
def test_parallel_classification():
    """Bagging predictions on iris must not depend on ``n_jobs``.

    Checked for ``predict_proba`` with decision trees and for
    ``decision_function`` with SVC, both by changing ``n_jobs`` on a fitted
    ensemble and by fitting a fresh ensemble with ``n_jobs=1``.
    """
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    # --- predict_proba: fitted with 3 jobs, predicted with 1 and 2 jobs ---
    tree_bag = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=3,
                                 random_state=0)
    tree_bag = tree_bag.fit(X_train, y_train)

    tree_bag.set_params(n_jobs=1)
    y1 = tree_bag.predict_proba(X_test)
    tree_bag.set_params(n_jobs=2)
    y2 = tree_bag.predict_proba(X_test)
    assert_array_almost_equal(y1, y2)

    # Same seed, fitted sequentially from scratch.
    tree_bag_serial = BaggingClassifier(DecisionTreeClassifier(),
                                        n_jobs=1,
                                        random_state=0)
    tree_bag_serial = tree_bag_serial.fit(X_train, y_train)
    y3 = tree_bag_serial.predict_proba(X_test)
    assert_array_almost_equal(y1, y3)

    # --- decision_function: same procedure with an SVC base estimator ---
    svc_bag = BaggingClassifier(SVC(decision_function_shape='ovr'),
                                n_jobs=3,
                                random_state=0)
    svc_bag = svc_bag.fit(X_train, y_train)

    svc_bag.set_params(n_jobs=1)
    decisions1 = svc_bag.decision_function(X_test)
    svc_bag.set_params(n_jobs=2)
    decisions2 = svc_bag.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions2)

    svc_bag_serial = BaggingClassifier(SVC(decision_function_shape='ovr'),
                                       n_jobs=1,
                                       random_state=0)
    svc_bag_serial = svc_bag_serial.fit(X_train, y_train)
    decisions3 = svc_bag_serial.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions3)
Example #35
0
def logistic_regression(xTrain,
                        yTrain,
                        xTest,
                        model_eval='recall',
                        ensemble=None,
                        soft_output=True,
                        verbose=True):
    """Grid-search a logistic regression over ``C`` and return the best estimator.

    :param xTrain: 2-D training features.
    :param yTrain: training labels, also used to build the stratified folds.
    :param xTest: test features, scored with the selected model.
    :param model_eval: ``scoring`` argument of the grid search.
    :param ensemble: ``None`` or ``'bagging'``; any other value prints a
        message and returns ``None`` without training anything.
    :param soft_output: score with ``predict_proba`` when true, else ``predict``.
    :param verbose: print progress messages.
    :return: the refitted best estimator of the grid search.
    """
    # Check params
    if ensemble is not None and ensemble != 'bagging':
        print(
            'Ensemble method not recognized. Supported value is: "bagging"'
        )
        return

    # Unpacking requires xTrain to be 2-D; the values themselves are unused.
    nSamples, nFeatures = np.shape(xTrain)
    param_grid = {'C': np.logspace(-3, 3, 10)}

    if verbose:
        print('Training logistic regression classifier')
    stkfold = StratifiedKFold(yTrain, n_folds=5)
    gs = GridSearchCV(LogisticRegression(class_weight='auto'),
                      param_grid,
                      scoring=model_eval,
                      cv=stkfold,
                      refit=True,
                      n_jobs=-1)
    gs.fit(xTrain, yTrain)
    est = gs.best_estimator_

    # NOTE(review): ``scores`` (and the bagged model) are computed but never
    # returned or stored; only ``est`` leaves this function.  Confirm whether
    # callers were meant to receive the scores instead.
    if ensemble == 'bagging':
        if verbose:
            print('Training bagging estimator')
        bag = BaggingClassifier(est,
                                n_estimators=1000,
                                oob_score=True,
                                n_jobs=-1)
        bag.fit(xTrain, yTrain)
        scorer = bag
    else:
        scorer = est

    if soft_output:
        scores = scorer.predict_proba(xTest)
    else:
        scores = scorer.predict(xTest)

    return est
def train_and_test(X_train, X_test, y_train, y_test):
    """Fit a 500-member BaggingClassifier and return its ROC AUC on the test set.

    The AUC is computed from column 1 of ``predict_proba`` with
    ``pos_label=1``; it is printed and returned (under the local name
    ``loss``, although it is an AUC value).
    """
    forest = BaggingClassifier(n_estimators=500, random_state=1234)
    forest = forest.fit(X_train, y_train)
    proba = forest.predict_proba(X_test)[:, 1]
    y_test = np.array(y_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, proba, pos_label=1)
    loss = metrics.auc(fpr, tpr)
    # Function-call form: same output on Python 2, valid syntax on Python 3.
    print(loss)
    return loss
def Bagging(X_train, y_train, X_test):
    """Fit bagged small MLPs and return ``(fitted ensemble, test probabilities)``.

    Each MLP (lbfgs solver, hidden layers of 5 and 2 units) is trained on
    half of the samples and half of the features.
    """
    base_mlp = MLPClassifier(solver='lbfgs',
                             alpha=1e-5,
                             hidden_layer_sizes=(5, 2),
                             random_state=1)
    ensemble = BaggingClassifier(base_mlp, max_samples=0.5, max_features=0.5)
    ensemble = ensemble.fit(X_train, y_train)
    test_proba = ensemble.predict_proba(X_test)
    return ensemble, test_proba
Example #38
0
def train_and_test(X_train, X_test, y_train, y_test):
    """Fit a 500-member BaggingClassifier and return its ROC AUC on the test set.

    The AUC is computed from column 1 of ``predict_proba`` with
    ``pos_label=1``; it is printed and returned (under the local name
    ``loss``, although it is an AUC value).
    """
    forest = BaggingClassifier(n_estimators=500, random_state=1234)
    forest = forest.fit(X_train, y_train)
    proba = forest.predict_proba(X_test)[:, 1]
    y_test = np.array(y_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, proba, pos_label=1)
    loss = metrics.auc(fpr, tpr)
    # Function-call form: same output on Python 2, valid syntax on Python 3.
    print(loss)
    return loss
Example #39
0
def predict_with_best_model(estimator, xtrain, ytrain, xtest):
    """Bag ``estimator`` ten times without bootstrap and return test probabilities.

    Each member sees 90% of the samples and 90% of the features, drawn
    without replacement.  The fitted base estimator is printed and the
    ``predict_proba`` output for ``xtest`` is returned.
    """
    from sklearn.ensemble import BaggingClassifier
    model = BaggingClassifier(base_estimator=estimator, n_estimators=10, max_samples=0.9, max_features=0.9, n_jobs=1,
                              bootstrap=False, bootstrap_features=False, oob_score=False)
    model = model.fit(xtrain, ytrain)
    y = model.predict_proba(xtest)
    # Single-argument print() calls: identical output on Python 2, valid on Python 3.
    print("Model used: ")
    print(model.base_estimator_)
    return y
Example #40
0
File: ensemble.py Project: smly/ume
class BaggingClassifier(BaseEstimator):
    """Bagging classifier configured from a plain ``{'class', 'params'}`` description."""

    def __init__(self, base_estimator=None, bag_kwargs=None):
        """Build the wrapped ``SK_BaggingClassifier``.

        :param base_estimator: mapping with a ``'class'`` entry (resolved via
            ``dynamic_load``) and a ``'params'`` entry (its keyword
            arguments).  ``None`` passes ``base_estimator=None`` on to
            ``SK_BaggingClassifier`` instead of failing on the lookup.
        :param bag_kwargs: extra keyword arguments for
            ``SK_BaggingClassifier``; ``None`` means no extra arguments.
        """
        if base_estimator is None:
            # The declared default used to raise TypeError on the subscript.
            svc = None
        else:
            klass = dynamic_load(base_estimator['class'])
            svc = klass(**base_estimator['params'])
        # ``**None`` is a TypeError, so fall back to an empty mapping.
        extra_kwargs = {} if bag_kwargs is None else bag_kwargs
        self.__clf = SK_BaggingClassifier(base_estimator=svc, **extra_kwargs)

    def fit(self, X, y):
        """Fit the wrapped classifier and return what its ``fit`` returns."""
        return self.__clf.fit(X, y)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``."""
        return self.__clf.predict_proba(X)
Example #41
0
def tuned_classifier(TR1, TR1_outcome, TR2, type, tuned_parameters, model_type, ensemble_methods):
	"""Fit (optionally grid-searched and bagged) ``type`` on TR1 and predict TR2.

	Without ``tuned_parameters`` the estimator is fitted as given; with them a
	10-fold accuracy grid search is run first.  When ``ensemble_methods`` is
	truthy the (best) estimator is wrapped in a BaggingClassifier and refitted.
	In the tuned path, model types "SVM", "Decision Tree" and "SDG" are
	additionally passed through a prefit sigmoid CalibratedClassifierCV.

	Returns a dict with keys 'prediction', 'model' and 'type'.
	"""
	if tuned_parameters:
		# Step 2- K-fold cross validation on TR1 to obtain the optimal model.
		search = GridSearchCV(type, tuned_parameters, cv=10, scoring="accuracy")
		model = search.fit(TR1, TR1_outcome)
		if ensemble_methods:
			bagged = BaggingClassifier(model.best_estimator_, max_samples=1.0, max_features=1.0)
			model = bagged.fit(TR1, TR1_outcome)
		needs_calibration = (model_type == "SVM" or model_type == "Decision Tree" or model_type == "SDG")
		if needs_calibration:
			calibrated = CalibratedClassifierCV(model, cv='prefit', method='sigmoid').fit(TR1, TR1_outcome)
			prediction = np.array(calibrated.predict_proba(TR2))
		else:
			prediction = np.array(model.predict_proba(TR2))
	else:
		model = type
		model.fit(TR1, TR1_outcome)
		if ensemble_methods:
			bagged = BaggingClassifier(model, max_samples=1.0, max_features=1.0)
			model = bagged.fit(TR1, TR1_outcome)
		prediction = model.predict_proba(TR2)

	return {'prediction': prediction, 'model': model, 'type': model_type}
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, depth=4, lrate=.1, n_fold=5):
    """Bagged-XGBoost pipeline: stratified CV, refit on all data, save predictions.

    Both input files are read as svmlight and densified.  Out-of-fold class-1
    probabilities are written to ``predict_valid_file``, test probabilities to
    ``predict_test_file``; the average log loss over the folds is logged.
    """
    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)
    X, X_tst = X.todense(), X_tst.todense()

    xg = XGBoostClassifier(n_estimators=n_est, eta=lrate, max_depth=depth,
                           n_jobs=8)
    clf = BaggingClassifier(base_estimator=xg, n_estimators=5,
                            max_samples=0.9, max_features=0.9,
                            random_state=42)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    fold_losses = 0.
    for trn_rows, val_rows in cv:
        clf.fit(X[trn_rows], y[trn_rows])
        # Store first, then score what was stored.
        p_val[val_rows] = clf.predict_proba(X[val_rows])[:, 1]
        fold_losses += log_loss(y[val_rows], p_val[val_rows])

    logging.info('Log Loss = {:.4f}'.format(fold_losses / n_fold))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    for out_path, values in ((predict_valid_file, p_val),
                             (predict_test_file, p_tst)):
        np.savetxt(out_path, values, fmt='%.6f')
Example #43
0
class BaggingLearner(AbstractLearner):
    """Learner backed by a BaggingClassifier of k-nearest-neighbour classifiers."""

    def __init__(self):
        knn = KNeighborsClassifier()
        self.learner = BaggingClassifier(knn)

    def _train(self, x_train, y_train):
        """Fit the ensemble and keep whatever ``fit`` returns as the learner."""
        fitted = self.learner.fit(x_train, y_train)
        self.learner = fitted

    def _predict(self, x):
        """Return the ensemble's predicted labels for ``x``."""
        labels = self.learner.predict(x)
        return labels

    def _predict_proba(self, x):
        """Return the ensemble's predicted class probabilities for ``x``."""
        probabilities = self.learner.predict_proba(x)
        return probabilities
def phenotype_imputation(data, config):
    '''
    Impute the labels on set II with a classifier learned on set I, fold by fold.

    Parameters
    ----------
    data : an object of class Dataset that contains: genotypes, covariates,
        labels and information about random folds

    config : an object of class ConfigState. It contains the user-entered
        parameters in a YAML format.
        See the config_file parameter in the main script for more details.

    For every column of ``data.folds`` a bagged logistic regression is fitted
    on the learning samples and its class-1 probabilities on the imputing
    samples are stored together with the fold's ROC AUC.  Both arrays are
    saved through ``config.save_variable``.
    '''
    # Parameters for this task
    task_name = "phenotype_imputation"
    num_folds = data.num_folds
    n_estimators = config.get_entry(task_name, "n_estimators")
    romans_trn = config.get_entry(task_name, "romans_used_for_learning")
    romans_tst = config.get_entry(task_name, "romans_used_for_imputing")

    # NOTE(review): romans_tst is passed bare here but wrapped in a list inside
    # the loop below -- confirm which form find_vec_entries_that_contain expects.
    size_of_two = find_vec_entries_that_contain(data.folds[:, 0], romans_tst).shape[0]
    soft_labels = np.zeros((size_of_two, num_folds))
    roc_auc = np.zeros(num_folds)
    fpr, tpr, thres = dict(), dict(), dict()

    # Scaling is applied to the transposed covariates and transposed back.
    X_scaled = preprocessing.scale(data.clin_covariate.transpose()).transpose()

    # Iterate through the folds:
    for i, fold in enumerate(data.folds.transpose()):
        logging.info("Fold=%d" % (i + 1))
        sel_trn = find_vec_entries_that_contain(fold, [romans_trn])
        sel_tst = find_vec_entries_that_contain(fold, [romans_tst])

        # Alternative kept from the original for a small set I:
        # max_samples=0.8 and bootstrap=False.
        model = BaggingClassifier(base_estimator=linear_model.LogisticRegression(),
                                  n_estimators=n_estimators, max_samples=0.632,
                                  max_features=5,
                                  bootstrap=True, bootstrap_features=True, oob_score=False,
                                  n_jobs=1, random_state=None, verbose=0)

        train_X = X_scaled[:, sel_trn].transpose()
        train_y = data.labels[:, sel_trn].transpose()
        model.fit(train_X, train_y)

        test_X = X_scaled[:, sel_tst].transpose()
        soft_labels[:, i] = model.predict_proba(test_X)[:, 1]
        fpr[i], tpr[i], thres[i] = metrics.roc_curve(data.labels[0, sel_tst], soft_labels[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    # Save the output of this task
    config.save_variable(task_name, "%f", soft_labels=soft_labels, roc_auc=roc_auc)
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, depth=4, lrate=.1, n_fold=5):
    """Cross-validate and fit bagged gradient boosting on svmlight data.

    :param train_file: svmlight training file.
    :param test_file: svmlight test file.
    :param predict_valid_file: output path for out-of-fold class-1 probabilities.
    :param predict_test_file: output path for test class-1 probabilities.
    :param n_est: ``n_estimators`` of each gradient boosting model.
    :param depth: ``max_depth`` of each gradient boosting model.
    :param lrate: ``learning_rate`` of each gradient boosting model.
    :param n_fold: number of stratified CV folds.
    """
    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    X = X.todense()
    X_tst = X_tst.todense()

    logging.info('Validation...')
    gbm = GBM(max_depth=depth, learning_rate=lrate, n_estimators=n_est,
              random_state=2015)

    clf = BG(base_estimator=gbm, n_estimators=5, max_samples=0.8,
             max_features=0.8, bootstrap=True, bootstrap_features=True,
             random_state=42, verbose=0)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    lloss = 0.
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        lloss += log_loss(y[i_val], p_val[i_val])

    # Report the per-fold mean: the raw sum grows with n_fold and is not
    # comparable across runs or with the sibling train_predict variants.
    logging.info('Log Loss = {:.4f}'.format(lloss / n_fold))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #46
0
def validation(df_features_driver, df_features_other_train, df_features_other_test):
    """Fit a bagged random forest for one driver and score it with ``AUC.AUC``.

    The driver's rows are stacked on top of ``df_features_other_train`` to
    build the training frame and on top of ``df_features_other_test`` to build
    the test frame, so the driver's rows always come first in both.

    Args:
        df_features_driver: feature rows of the driver of interest.
        df_features_other_train: rows of other drivers used for training.
        df_features_other_test: rows of other drivers used for testing.

    Returns:
        The result of ``AUC.AUC`` applied to an ``(n_rows, 2)`` string array
        of ``[flag, probability]`` pairs, where the flag is ``'1'`` for the
        first 200 test rows and ``'0'`` for the rest.

    Raises:
        ValueError: if the test frame has fewer than 200 rows.
    """
    # Number of leading test rows flagged as the driver of interest.
    n_flagged = 200

    df_train = pd.concat([df_features_driver, df_features_other_train])
    df_train.reset_index(inplace=True)
    df_train.Driver = df_train.Driver.astype(int)

    df_test = pd.concat([df_features_driver, df_features_other_test])
    df_test.reset_index(inplace=True)
    df_test.Driver = df_test.Driver.astype(int)

    # So far, the best result was achieved by using a RandomForestClassifier with Bagging
    # model = BaggingClassifier(base_estimator = ExtraTreesClassifier())
    # model = BaggingClassifier(base_estimator = svm.SVC(gamma=2, C=1))
    # model = BaggingClassifier(base_estimator = linear_model.LogisticRegression())
    # model = BaggingClassifier(base_estimator = AdaBoostClassifier())
    # model = RandomForestClassifier()
    # NOTE(review): recent scikit-learn releases may reject
    # min_samples_split=1 -- confirm against the installed version.
    model = BaggingClassifier(
        base_estimator=RandomForestClassifier(
            n_estimators=10, max_depth=None, min_samples_split=1,
            random_state=0))

    # Everything from the fifth column onward is used as a feature.
    feature_columns_train = df_train.iloc[:, 4:]
    feature_columns_test = df_test.iloc[:, 4:]

    # Train the classifier
    model.fit(feature_columns_train, df_train.Driver)

    # Array with the class probabilities for every test row
    probs_array = model.predict_proba(feature_columns_test)

    n_rows = len(probs_array)
    if n_rows < n_flagged:
        raise ValueError(
            'expected at least %d test rows, got %d' % (n_flagged, n_rows))

    # Column 1 should contain the driver of interest
    driver_probs = probs_array[:, 1]
    flags = ['1'] * n_flagged + ['0'] * (n_rows - n_flagged)
    # Build the (flag, probability) table in one pass rather than growing it
    # row by row with np.vstack, which copies the whole array every time.
    probs_list = np.array(
        [[flag, prob] for flag, prob in zip(flags, driver_probs)])

    df_auc = AUC.AUC(probs_list)

    return df_auc
Example #47
0
def test_parallel():
    """Check parallel computations.

    For each ``n_jobs`` setting used at fit time, predictions must not depend
    on the ``n_jobs`` used at predict time, and must match an ensemble fitted
    sequentially with the same seed.
    """
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        # predict_proba
        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict_proba(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y2)

        # Sequential fit with the same seed must give the same ensemble.
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y3)

        # decision_function
        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        decisions1 = ensemble.decision_function(X_test)
        ensemble.set_params(n_jobs=2)
        decisions2 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions2)

        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        decisions3 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions3)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        # Use the loop variable: a hard-coded n_jobs=3 here would run the
        # same configuration twice and never exercise n_jobs=-1.
        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=n_jobs,
                                    random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=1,
                                    random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y3)
Example #48
0
# Hold out the first n_val entries of `rows` as the validation split; the
# remaining entries provide the training labels.
dval = dtrain.ix[rows[:n_val]]

label_dtrain = label.ix[rows[n_val:]]
label_dval = label.ix[rows[:n_val]]


#clf.cv(dtrain,label[0].values,5)


# Wrap the classifier in an isotonic calibration with 5-fold CV, fit it on
# dtrain_sp and score the held-out split by ROC AUC.
# NOTE(review): dtrain_sp is defined outside this excerpt -- presumably the
# training split of dtrain; confirm.
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
calibrated_clf.fit(dtrain_sp, label_dtrain[0].values)
pred = calibrated_clf.predict_proba(dval)
print("ROC score", metrics.roc_auc_score(label_dval[0].values, pred[:,1]))

# Predict the test set and write the submission: first column of the sample
# file next to the column-1 probabilities, without header or index.
pred = calibrated_clf.predict_proba(dtest)
sample = pd.read_csv('/Users/IkkiTanaka/Documents/KDDCup/sampleSubmission.csv',header=None)
preds = pd.concat([sample[0],pd.DataFrame(pred[:,1])],axis=1)
preds.to_csv('/Users/IkkiTanaka/Documents/KDDCup/pred/xgb/sk_GBM2.csv' ,header=None,index=False)



#bagging
# Rebind clf to a fresh XGBoost model, which becomes the base estimator below.
clf = XGBoostClassifier(nthread=8,booster='gbtree',eta=0.08,gamma=1.0,max_depth=4,min_child_weight=4,subsample=0.9,colsample_bytree=0.9,l=0,alpha=0,lambda_bias=0,objective="binary:logistic",eval_metric='auc',seed=19920407,num_class=1,max_delta_step=0,early_stopping_rounds=None,num_round=450)

# Five bagged copies, each on 60% of the rows and 80% of the columns.
bagging_clf = BaggingClassifier(base_estimator=clf, n_estimators=5, max_samples=.6, max_features=.8, bootstrap=True, bootstrap_features=False, oob_score=True, n_jobs=-1, random_state=19920407, verbose=1)
bagging_clf.fit(dtrain_sp, label_dtrain[0].values)
pred = bagging_clf.predict_proba(dval)
print("ROC score", metrics.roc_auc_score(label_dval[0].values, pred[:,1]))


Example #49
0
            print "processing parameter combo:", i
            # configure model with the i-th combo of parameters:
            # element 0 is C, element 1 is class_weight
            x = param_grid[i]
            model.C = x[0]
            model.class_weight = x[1]
            
            # loop over folds
            for j in range(0,n_folds):
                # idx0: rows outside fold j (training); idx1: rows in fold j
                idx0 = np.where(fold_index != j)
                idx1 = np.where(fold_index == j)
                x0 = np.array(xtrain)[idx0,:][0]; x1 = np.array(xtrain)[idx1,:][0]
                y0 = np.array(ytrain)[idx0]; y1 = np.array(ytrain)[idx1]
			
                # fit on the rows outside fold j, then store the out-of-fold
                # probabilities in column i of mvalid
                model.fit(x0, y0)
                mvalid[idx1,i] = model.predict_proba(x1)[:,1]
                
            # fit on complete dataset
            model.fit(xtrain, ytrain)
            mfull[:,i] = model.predict_proba(xtest)[:,1]
            
        
    ## store the results
    # add indices etc
    # One column per parameter combo, named model_type + combo index, plus
    # the id and target columns.
    mvalid = pd.DataFrame(mvalid)
    mvalid.columns = [model_type + str(i) for i in range(0, mvalid.shape[1])]
    mvalid['QuoteNumber'] = id_train
    mvalid['QuoteConversion_Flag'] = ytrain
    
    mfull = pd.DataFrame(mfull)
    mfull.columns = [model_type + str(i) for i in range(0, mfull.shape[1])]
# Report test-set ROC AUC for each ensemble. `ada` is scored as-is here
# (presumably fitted earlier -- confirm); the bagged models are fitted below
# before being scored.
print "adaboost test:",roc_auc_score(y_test, ada.predict_proba(X_test)[:,1])
#print "adaboost train:",roc_auc_score(y_train, ada.predict_proba(X_train)[:,1])

#print "Fitting Decision Tree..."
#dt.fit(X_train, y_train)
#print "Decision tree test:", roc_auc_score(y_test, dt.predict_proba(X_test)[:,1])
#print "Decision tree train:",roc_auc_score(y_train, dt.predict_proba(X_train)[:,1])

#print "Fitting Random Forest..."
#rf.fit(X_train, y_train)
#print "random forest test:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])
#print "random forest train:", roc_auc_score(y_train, rf.predict_proba(X_train)[:,1])

# Fit the bagged decision trees and score column-1 probabilities on the test set.
print "Bagging Decision Trees..."
bagged_dt.fit(X_train, y_train)
print "bagged dt test:", roc_auc_score(y_test, bagged_dt.predict_proba(X_test)[:,1])
#print "bagged dt train",roc_auc_score(y_train, bagged_dt.predict_proba(X_train)[:,1])

# Same procedure for the bagged random forests.
print "Bagging RandomForests..."
bagged_rf.fit(X_train, y_train)
print "bagged rf test",roc_auc_score(y_test, bagged_rf.predict_proba(X_test)[:,1])
#print "bagged rf train",roc_auc_score(y_train, bagged_rf.predict_proba(X_train)[:,1])

# The calibration experiments below are disabled by wrapping them in a string.
'''print "Calibrating Bagged Decision Trees..."
calibrated_dt.fit(X_train, y_train)
print "calibrated_dt test:", roc_auc_score(y_test, calibrated_dt.predict_proba(X_test)[:,1])

print "Calibrating Bagged Random Forests..."
calibrated_rf.fit(X_train, y_train)
print "calibrated_rf test:", roc_auc_score(y_test, calibrated_rf.predict_proba(X_test)[:,1])
'''
        
        # Bag 75 copies of the regressor `lm`, each drawn with n_samp samples
        # and n_feat features, using all available cores.
        lm_bagged = BaggingRegressor(
          base_estimator = lm, 
          n_estimators = 75, 
          max_samples = n_samp, 
          max_features = n_feat,
          bootstrap = True, 
          oob_score = False, 
          warm_start = False, 
          n_jobs = -1
        )
        
        # Same bagging configuration around the classifier `log`.
        log_bagged = BaggingClassifier(
          base_estimator = log, 
          n_estimators = 75, 
          max_samples = n_samp, 
          max_features = n_feat,
          bootstrap = True, 
          oob_score = False, 
          warm_start = False, 
          n_jobs = -1
        )
        
        # Fit both ensembles on the same features/target, then predict the
        # test rows: point predictions from the regressor, class
        # probabilities from the classifier.
        lm_bagged.fit(X = train[features], y = train['y'])
        log_bagged.fit(X = train[features], y = train['y'])        
        lm_bagged_preds = lm_bagged.predict(X = test[features])
        log_bagged_preds = log_bagged.predict_proba(X = test[features])
        
        # Persist both prediction sets; file names encode n_samp and n_feat.
        # NOTE(review): second_pos_clip is defined elsewhere -- presumably it
        # extracts and clips the positive-class column; confirm.
        write_function(lm_bagged_preds, '/tmp/lm_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat))
        write_function(second_pos_clip(log_bagged_preds), '/tmp/log_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat))
Example #52
0
# Training matrix, plus a test matrix restricted to (and ordered by) the
# training columns so both share the same feature layout.
X = df_train.values#_sparse
# X_train, X_val, y_train, y_val = cross_validation.train_test_split(X, y, test_size=0.3)
X_test = df_test[df_train.columns].values#_sparse

# Two bagged random forests, each fitted on 10% of the rows and 3 features.
rfc = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=1, random_state=rs)
eclf = BaggingClassifier(rfc, n_estimators=2, n_jobs=1,max_samples=0.1,max_features=3)

eclf.fit(X, y)

# Need to chunk to avoid memory error.
# Predict on X_test (column-aligned with the training data) rather than the
# raw df_test values, and gather the per-chunk results in a list so the
# output is concatenated once instead of being re-copied on every iteration.
test_chunks = np.array_split(X_test,50)
chunk_probs = []
for i, chunk in enumerate(test_chunks):
	test_X = chunk
	chunk_probs.append(eclf.predict_proba(test_X))
	print(i)
test_y = np.concatenate(chunk_probs)


test_prob = np.array(test_y)
print(test_prob.shape)

def makespace(x):
	"""Return the items of *x*, truncated to int, joined by single spaces."""
	as_text = map(str, map(int, x))
	return " ".join(as_text)

# For every row keep the indices of the five highest probabilities
# (argsort of the negated scores sorts in descending order).
submissions = (-test_prob).argsort()[:,:5]
submit = pd.read_csv(work_dir+'sample_submission.csv')
# Turn each row of five indices into one space-separated string.
intermediate = np.apply_along_axis(makespace, 1, submissions)
submit['hotel_cluster'] = intermediate
Example #53
0
		
                # setup bagging classifier: nbag copies of `model`, each
                # trained on 5% of the rows and 97% of the features
                bag0 = BaggingClassifier(base_estimator=model, 
                        n_estimators=nbag, 
                        max_samples=0.05, 
                        max_features=0.97, 
                        bootstrap=True, 
                        bootstrap_features=False, 
                        oob_score=False, 
                        warm_start=False, 
                        n_jobs=1,
                        random_state=seed_value,
                        verbose=2)
                        
                # fit on (x0, y0), predict x1 and store the column-1
                # probabilities in column i of mvalid
                bag0.fit(x0, y0)
                prx = bag0.predict_proba(x1)[:,1]
                mvalid[idx1,i] = prx
                print log_loss(y1, prx)
                print "finished fold:", j
                
            # fit on complete dataset
            bag0.fit(xtrain, ytrain)
            mfull[:,i] = bag0.predict_proba(xtest)[:,1]
            print "finished full prediction"
        
=======
    for i in range(len(param_grid)):

        print "processing parameter combo:", i
        # configure model with j-th combo of parameters
        x = param_grid[i]
Example #54
0
                                         n_estimators=n_estimators/10,
                                         random_state=1, n_jobs=nb_parallel
                                         ).fit(x_local_train, y_local_train)
                else:
                    M = RForestRegress(n_estimators, random_state=1,
                                       n_jobs=nb_parallel
                                       ).fit(x_local_train, y_local_train)
            else:
                vprint(verbose, "[-] task not recognized")
                break
            vprint(verbose, "[+] Fitting success, time spent so far %5.2f sec"
                   % (time.time() - start))

            # Make predictions on local validation set
            if task == 'binary.classification':
                y_local_valid_pred = M.predict_proba(x_local_valid)[:, 1]
            elif task == 'multiclass.classification':
                y_local_valid_pred = M.predict_proba(x_local_valid).T
            elif task == 'multilabel.classification':
                y_local_valid_pred = np.array([Ms[i].predict_proba(x_local_valid)[:, 1] for i in range(K)]).T
            elif task == 'regression':
                y_local_valid_pred = M.predict(x_local_valid)

            # Local validation
            # x_local_valid, y_local_valid
            metric_type = D.info['metric']

            if 'f1_metric' == metric_type:
                metric = f1_metric(y_local_valid, y_local_valid_pred)
            elif 'r2_metric' == metric_type:
                metric = r2_metric(y_local_valid, y_local_valid_pred)
Example #55
0
 # plain AdaBoost baseline: AUC goes to column 0 of resmat
 clf = AdaBoostClassifier(base_estimator=None, n_estimators=125, 
                      learning_rate=0.025, algorithm='SAMME.R', 
                      random_state=190, )
 clf.fit(x0, y0)
 pr1 = clf.predict_proba(x1)[:,1]
 resmat[ii,0] = roc_auc_score(y1, pr1)
 # bagging + ada: 25 bagged copies of the same AdaBoost setup, each on 50%
 # of the rows and 95% of the features, sampled without replacement;
 # AUC goes to column 1
 clf0 = AdaBoostClassifier(base_estimator=None, n_estimators=125, 
                      learning_rate=0.025, algorithm='SAMME.R', 
                      random_state=190, )
 clf1 = BaggingClassifier(base_estimator=clf0, n_estimators=25, 
                         max_samples=0.5, max_features=0.95, bootstrap=False, 
                         bootstrap_features=False, oob_score=False, 
                         n_jobs=-1, random_state=xseed + 1, verbose=1)
 clf1.fit(x0, y0)                        
 pr2 = clf1.predict_proba(x1)[:,1]
 resmat[ii,1] = roc_auc_score(y1, pr2)
 # bagging + ada + extra trees: AdaBoost over ExtraTrees, wrapped in a
 # bagging ensemble; AUC goes to column 2.
 # Note that clf0 and clf1 are rebound here.
 clf0 = ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, verbose = 1, 
                        class_weight = 'auto', min_samples_leaf = 5, 
                        random_state = xseed)
 clf1 = AdaBoostClassifier(base_estimator=clf0, n_estimators=25, 
                      learning_rate=0.025, algorithm='SAMME.R', 
                      random_state=190, )
 clf2 = BaggingClassifier(base_estimator=clf1, n_estimators=25, 
                         max_samples=0.25, max_features=0.9, bootstrap=False, 
                         bootstrap_features=False, oob_score=False, 
                         n_jobs=-1, random_state=xseed + 1, verbose=1)
 clf2.fit(x0, y0)                        
 pr3 = clf2.predict_proba(x1)[:,1]
 resmat[ii,2] = roc_auc_score(y1, pr3)
    forest = forest.fit(X_train, y_train)
    proba = forest.predict_proba(X_test)
    proba = proba[:, 1]
    y_test = np.array(y_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, proba, pos_label=1)
    loss = metrics.auc(fpr, tpr)
    print loss
    return loss


def kfold_validation(data=train, y=y, trials=10):
    """Return the mean ``train_and_test`` score over stratified folds.

    Args:
        data: feature matrix, indexable by arrays of row indices. The default
            is the module-level ``train`` as bound when this function was
            defined.
        y: labels aligned with ``data``; the default is bound the same way.
        trials: number of stratified folds to run and to average over.

    Returns:
        The sum of the per-fold ``train_and_test`` results divided by
        ``trials``.
    """
    # The fold count must equal the divisor below; with a fixed 10 folds any
    # other value of `trials` produced a wrongly scaled average.
    skf = cross_validation.StratifiedKFold(y, n_folds=trials)
    error = 0.0
    for train_index, test_index in skf:
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = y[train_index], y[test_index]
        error += train_and_test(X_train, X_test, y_train, y_test)
    return error/trials


# Cross-validated score with the default data, labels and fold count.
score = kfold_validation()
print score

# Fit a 1000-estimator bagging ensemble (default base estimator) on the full
# training set and write the column-1 probabilities as the submission.
forest = BaggingClassifier(n_estimators=1000, random_state=1234)
forest = forest.fit(train, y)
proba = forest.predict_proba(test)
proba = proba[:, 1]
submission = pd.DataFrame({"bidder_id": idx, "prediction": proba})
submission.to_csv("submissions/submission_bag1.csv", index=False)
print 'Done.'
  
'''for name, clf in clfs:
    clf.fit(train_[cols], train_["TripType"])
    clf.predict(test_[cols])
    preds = clf.predict_proba(test_[cols])
    #print(confusion_matrix(test['class'], clf.predict(test[cols])))
    print (pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"]))
    print (classification_report(test_['TripType'], clf.predict(test_[cols])))
    score=accuracy_score(test_['TripType'],clf.predict(test_[cols]))
    table.append([name,score])
print (table)
'''
# Bagging ensemble of gradient-boosting classifiers, fitted on the TripType target.
clf=BaggingClassifier(GradientBoostingClassifier())
clf.fit(train_[cols], train_["TripType"])
# Predict once and reuse the labels for every report below, instead of
# re-running the whole ensemble for each metric.
clf_predictions = clf.predict(test_[cols])
preds = clf.predict_proba(test_[cols])
#print(confusion_matrix(test['class'], clf.predict(test[cols])))
print (pd.crosstab(test_['TripType'], clf_predictions, rownames=["Actual"], colnames=["Predicted"]))
print (classification_report(test_['TripType'], clf_predictions))
score=accuracy_score(test_['TripType'],clf_predictions)
table.append([score])
print (table)

# Soft-voting ensemble of bagged 20-NN, a 10-tree random forest and bagged
# decision trees; the bagged KNN gets weight 7, the other two weight 1 each.
# NOTE(review): this uses `train`/`test`, while the code just above uses
# `train_`/`test_` -- confirm which frames are intended.
eclf = VotingClassifier(estimators = [('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))),
    ('RandomForest', RandomForestClassifier(10)),
    ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))],
    voting='soft', weights=[7,1,1])
eclf.fit(train[cols], train["TripType"])
#use the classifier to predict
predicted=eclf.predict(test[cols])
#print (accuracy_score(predicted,test['TripType']))
Example #58
0
# Dump the tree predictions, then column 1 of every probability row.
print predicted_Tree
print prob_Tree
len_prob_tree = len(prob_Tree)
count = 0
while count < len_prob_tree:
    print (prob_Tree[count][1])
    count += 1


print
print ("bagging classification")

# Default BaggingClassifier fitted on the labelled images, then applied to
# the unknown images; the output is printed the same way as for the tree.
res_bagging = BaggingClassifier()
res_bagging.fit(learnImage_list, learnImageType_list)
predicted_bag = res_bagging.predict(learnUnknownImage_list)
prob_bag = res_bagging.predict_proba(learnUnknownImage_list)
print predicted_bag
print prob_bag
len_prob_bag = len(prob_bag)
count = 0
while count < len_prob_bag:
    print (prob_bag[count][1])
    count += 1


"""

>>> X = [[0], [1], [2], [3]]
>>> y = [0, 0, 1, 1]
>>> from sklearn.neighbors import KNeighborsClassifier
>>> neigh = KNeighborsClassifier(n_neighbors=3)
Example #59
0
# Individual models; clf1 is created before this excerpt.
clf2 = AdaBoostClassifier(n_estimators=50)
clf3 = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0,max_depth=1, random_state=0)
# `degree` was passed twice (3 and 2), which Python rejects as a repeated
# keyword argument. The value written next to kernel='poly' (2) is kept.
# NOTE(review): confirm that degree=2 is the intended polynomial degree.
clf4 = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
gamma=0.5, kernel='poly',degree = 2, max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False )  #svm
clf5 = linear_model.LogisticRegression()#logit
# Wrap the SVM and the logistic regression in 20-estimator bagging
# ensembles, each estimator seeing 70% of the rows and all features.
clf4 = BaggingClassifier(clf4,max_samples=0.7, max_features=1.0,n_estimators=20)
clf5 = BaggingClassifier(clf5,max_samples=0.7, max_features=1.0,n_estimators=20)


clf1 = clf1.fit(x2_train, y_train)
clf2 = clf2.fit(x2_train, y_train)
clf3 = clf3.fit(x2_train, y_train)
clf4 = clf4.fit(x2_train, y_train)
clf5 = clf5.fit(x2_train, y_train)

# Build the test rows: six base columns followed by the per-row 'dow' and
# 'dis' entries (concatenated onto the list, so they must be list-like).
x1_test = list(zip(data2_2.X,data2_2.Y,data2_2.year,data2_2.month,data2_2.day,data2_2.hour))
x2_test = [0]*len(x1_test)
for i in range(0,len(x1_test)):
    x2_test[i]= list(x1_test[i])+data2_2['dow'][i]+data2_2['dis'][i]


# Class probabilities from each of the five fitted models.
r1 = clf1.predict_proba(x2_test)
r2 = clf2.predict_proba(x2_test)
r3 = clf3.predict_proba(x2_test)
r4 = clf4.predict_proba(x2_test)
r5 = clf5.predict_proba(x2_test)



# Load the training data, drop the id column and split off the
# label-encoded target.
X = pd.read_csv('train.csv')
X= X.drop('id', axis=1)
y = X.target.values
y = LabelEncoder().fit_transform(y)
X = X.drop('target', axis=1)
print X.head(3)

# Hold out 20% of the rows for evaluation (fixed seed).
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=36)
print 'X splitted'

clf = RFC(n_estimators=250, n_jobs=-1)
# use BaggingClassifier to make 5 predictions, and average
clfbag = BC(clf, n_estimators=5)
print 'fitting bag clf ...'
clfbag.fit(Xtrain, ytrain)
print 'done !'
ypreds = clfbag.predict_proba(Xtest)
# will be 0.60 also
print "%.2f" % log_loss(ytest, ypreds, eps=1e-15, normalize=True)

# Same base model again, this time wrapped in probability calibration so the
# two log-loss figures can be compared.
clf = RFC(n_estimators=250, n_jobs=-1)
# isotonic works better than the default sigmoid in this case
clfcali = CalibratedClassifierCV(clf, method='isotonic', cv=5)
print 'fitting calibration clf ...'
clfcali.fit(Xtrain, ytrain)
print 'done !'
ypreds = clfcali.predict_proba(Xtest)
# will be 0.49 also
print "%.2f" % log_loss(ytest, ypreds, eps=1e-15, normalize=True)