Example #1
def test_set_estimator_none(drop):
    """VotingClassifier set_params should be able to set estimators as None or
    drop"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] is drop
    assert len(eclf2.estimators_) == 2
    assert all(isinstance(est, (LogisticRegression, GaussianNB))
               for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] is drop

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        eclf2.set_params(voting='soft').fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr=drop, rf=drop, nb=drop).fit(X, y)
    assert record if drop is None else not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X1, y1)
    assert record if drop is None else not record
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.],
                                         [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
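A minimal usage sketch of the behaviour tested above, assuming a recent scikit-learn (>= 0.22) where an estimator is disabled by setting it to the string 'drop' (None is the older, deprecated spelling); the dataset and names are illustrative:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X_demo, y_demo = load_iris(return_X_y=True)
eclf = VotingClassifier(estimators=[('lr', LogisticRegression(max_iter=1000)),
                                    ('nb', GaussianNB())],
                        voting='hard')
# Disable the 'nb' member; only the remaining estimators are fitted.
eclf.set_params(nb='drop').fit(X_demo, y_demo)
assert len(eclf.estimators_) == 1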
Example #2
    def ensembleModel(self, list_of_models, train, cross_validation):
        logging.info(
            "preparing Target Variable for train and cross validation")
        train_Y = train[self.target]
        cross_validation_Y = cross_validation[self.target]
        logging.info("preparing train and CV data")
        train_X = train[train.columns.difference([self.ID, self.target])]
        cross_validation_X = cross_validation[
            cross_validation.columns.difference([self.ID, self.target])]

        clf = VotingClassifier(estimators=list_of_models,
                               voting='soft',
                               weights=[1, 5])
        clf.fit(train_X, train_Y)

        self.saveModel(clf, "rfc_and_xgb_model")

        print("accurary on cross_validation set",
              clf.score(cross_validation_X, cross_validation_Y))
        print('Overall RFC AUC on whole train set:',
              roc_auc_score(train_Y,
                            clf.predict_proba(train_X)[:, 1]))
        print(
            'Overall RFC AUC on whole cross_validation set:',
            roc_auc_score(cross_validation_Y,
                          clf.predict_proba(cross_validation_X)[:, 1]))

        return clf
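For context on the weights=[1, 5] argument above: with voting='soft' the ensemble prediction is a weighted average of each member's predict_proba output. A small sketch (dataset and names are illustrative) verifying that equivalence:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(random_state=0)
lr = LogisticRegression().fit(X_demo, y_demo)
rf = RandomForestClassifier(random_state=0).fit(X_demo, y_demo)

vc = VotingClassifier(estimators=[('lr', LogisticRegression()),
                                  ('rf', RandomForestClassifier(random_state=0))],
                      voting='soft', weights=[1, 5]).fit(X_demo, y_demo)

# Soft voting averages per-estimator probabilities with the given weights.
manual = np.average([lr.predict_proba(X_demo), rf.predict_proba(X_demo)],
                    axis=0, weights=[1, 5])
np.testing.assert_allclose(vc.predict_proba(X_demo), manual)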
Example #3

def test_set_params():
    """set_params should be able to set estimators"""
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
                             weights=[1, 2])
    assert_true('lr' in eclf1.named_estimators)
    assert_true(eclf1.named_estimators.lr is eclf1.estimators[0][1])
    assert_true(eclf1.named_estimators.lr is eclf1.named_estimators['lr'])
    eclf1.fit(X, y)
    assert_true('lr' in eclf1.named_estimators_)
    assert_true(eclf1.named_estimators_.lr is eclf1.estimators_[0])
    assert_true(eclf1.named_estimators_.lr is eclf1.named_estimators_['lr'])

    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)
    assert_false(hasattr(eclf2, 'nb'))

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params())
    assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params())

    eclf1.set_params(lr__C=10.0)
    eclf2.set_params(nb__max_depth=5)

    assert_true(eclf1.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(eclf2.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(eclf1.get_params()["lr__C"],
                 eclf1.get_params()["lr"].get_params()['C'])
Example #4

def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = ('Underlying estimator \'knn\' does not support sample weights.')
    assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)
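A minimal sketch of the behaviour tested above: sample_weight passed to VotingClassifier.fit is forwarded to every member's fit, so each underlying estimator must accept it (names and data are illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X_demo, y_demo = make_classification(random_state=0)
weights = np.where(y_demo == 1, 5.0, 1.0)  # up-weight the positive class

eclf = VotingClassifier([('lr', LogisticRegression()), ('nb', GaussianNB())],
                        voting='soft')
eclf.fit(X_demo, y_demo, sample_weight=weights)  # both members support sample_weight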
Example #5
def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(gamma='scale', probability=True, random_state=123)
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('svc', clf3)],
                             voting='soft').fit(X,
                                                y,
                                                sample_weight=np.ones(
                                                    (len(y), )))
    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('svc', clf3)],
                             voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    sample_weight = np.random.RandomState(123).uniform(size=(len(y), ))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[('lr', clf1), ('svc', clf3),
                                         ('knn', clf4)],
                             voting='soft')
    msg = ('Underlying estimator \'knn\' does not support sample weights.')
    assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)
Example #6
def test_set_params():
    """set_params should be able to set estimators"""
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)],
                             voting='soft',
                             weights=[1, 2])
    assert 'lr' in eclf1.named_estimators
    assert eclf1.named_estimators.lr is eclf1.estimators[0][1]
    assert eclf1.named_estimators.lr is eclf1.named_estimators['lr']
    eclf1.fit(X, y)
    assert 'lr' in eclf1.named_estimators_
    assert eclf1.named_estimators_.lr is eclf1.estimators_[0]
    assert eclf1.named_estimators_.lr is eclf1.named_estimators_['lr']

    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)
    assert not hasattr(eclf2, 'nb')

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params())
    assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params())

    eclf1.set_params(lr__C=10.0)
    eclf2.set_params(nb__max_depth=5)

    assert eclf1.estimators[0][1].get_params()['C'] == 10.0
    assert eclf2.estimators[1][1].get_params()['max_depth'] == 5
    assert_equal(eclf1.get_params()["lr__C"],
                 eclf1.get_params()["lr"].get_params()['C'])
Example #7
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(
        all([
            not isinstance(est, RandomForestClassifier)
            for est in eclf2.estimators_
        ]))
    assert_true(eclf2.get_params()["rf"] is None)

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(ValueError, msg,
                         eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
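For reference on the expected arrays above: with voting='soft', flatten_transform controls the shape that transform returns. A small sketch under illustrative data:

import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X_demo = np.array([[1.], [2.], [3.], [4.]])
y_demo = np.array([0, 0, 1, 1])
ests = [('lr', LogisticRegression()), ('nb', GaussianNB())]

flat = VotingClassifier(ests, voting='soft', flatten_transform=True).fit(X_demo, y_demo)
deep = VotingClassifier(ests, voting='soft', flatten_transform=False).fit(X_demo, y_demo)

print(flat.transform(X_demo).shape)  # (4, 4): (n_samples, n_classifiers * n_classes)
print(deep.transform(X_demo).shape)  # (2, 4, 2): (n_classifiers, n_samples, n_classes)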
Example #8

def test_estimator_weights_format():
    # Test estimator weights inputs as list and array
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    eclf1 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)], weights=[1, 2], voting="soft")
    eclf2 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)], weights=np.array((1, 2)), voting="soft")
    eclf1.fit(X, y)
    eclf2.fit(X, y)
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
Example #9
def classification_results(train,test):
    #Derivation of NBDriver using training data 
    """
    Arguments:
        train = feature matrix derived from Brown et al.
        test = feature matrix derived from Martelotto et al.
    Returns:
        best_model = Best ensemble model derived using the training data
        X_red= Dataframe derived after sampling that was used to train the model
        scores= probability based classification scores
    """
    sen, spe, acc, auc, c, m, s = [], [], [], [], [], [], []
    train_x = train.drop('Label', axis=1)
    train_y = train['Label']
    test_x = test.drop('Label', axis=1)
    test_y = test['Label']
    #Random undersampling to reduce the majority class size
    samp=RepeatedEditedNearestNeighbours(random_state=42)
    X_samp,y_samp=samp.fit_resample(train_x,train_y)
    X_samp = pd.DataFrame(X_samp, columns = train_x.columns)
    #Experimenting with different numbers of top features derived from the tree-based feature extraction method 
    top_n_feats=[30,40,50,60,70]
    X_r=feature_reduction_using_trees(X_samp,y_samp) 
    cols=X_r.columns
    for n in top_n_feats:
        print("For top: ",n," features")
        X_red=X_r[cols[0:n]]
        sv=SVC(kernel="linear",probability=True,C=0.01,random_state=42) #chosen from 5foldCV based grid search
        kde=KDEClassifier(bandwidth=1.27) #chosen from 5foldCV based grid search
        best_model = VotingClassifier(estimators=[('sv', sv), ('kde', kde)],
                        voting='soft',weights=[4, 7]) #best combination of weights selected by a brute force search (possible weights 1-10) using a cross-validation approach on the training data  
        
        best_model.fit(X_red,y_samp)
        y_probs = best_model.predict_proba(test_x[X_red.columns])[:,1]
        thresholds = arange(0, 1, 0.001)
        scores = [roc_auc_score(test_y, to_labels(y_probs, t)) for t in thresholds]
        ix= argmax(scores)
        y_test_predictions = np.where(best_model.predict_proba(test_x[X_red.columns])[:,1] > thresholds[ix], 2, 1)
        print("Thresh: ",thresholds[ix])
        sensi= sensitivity_score(test_y, y_test_predictions, pos_label=2)
        speci=specificity_score(test_y,y_test_predictions,pos_label=2)
        accu=accuracy_score(test_y,y_test_predictions)
        auro=roc_auc_score(test_y,y_test_predictions)
        mcc=metrics.matthews_corrcoef(test_y,y_test_predictions)
        tn, fp, fn, tp = confusion_matrix(test_y, y_test_predictions).ravel()
        ppv=tp/(tp+fp)
        npv=tn/(tn+fn)
        sen=tp/(tp+fn)
        spe=tn/(tn+fp)
        score=ppv+npv+sen+spe
        print("For kmer size: ",len(train.columns[0]))
        print("for top ",n," features")
        print(list(X_red.columns.values),"\n")
        score_dict={"Sen":sen,"Spe":spe,"PPV":ppv,"NPV":npv,"AUC":auro,"MCC":mcc,"ACC":accu}
        print(score)
        print(score_dict)
        df=pd.DataFrame(y_test_predictions)
        y_samp = pd.DataFrame(y_samp, columns = ['x'])
    return best_model,X_red,scores
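classification_results above calls a to_labels helper that is not shown. A plausible sketch of the threshold-tuning idea, with to_labels reconstructed from how it is used (the 1/2 labels match the function's pos_label choices; data here is illustrative):

import numpy as np
from sklearn.metrics import roc_auc_score

def to_labels(probs, threshold, pos=2, neg=1):
    # Hypothetical helper matching its use above: binarise probabilities.
    return np.where(probs > threshold, pos, neg)

y_true = np.array([1, 1, 2, 2])             # illustrative held-out labels
y_probs = np.array([0.1, 0.4, 0.35, 0.8])   # illustrative positive-class scores
thresholds = np.arange(0, 1, 0.001)
scores = [roc_auc_score(y_true, to_labels(y_probs, t)) for t in thresholds]
print("best threshold:", thresholds[int(np.argmax(scores))])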
Example #10
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(all([not isinstance(est, RandomForestClassifier) for est in
                     eclf2.estimators_]))
    assert_true(eclf2.get_params()["rf"] is None)

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(
        ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.],
                                         [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
Example #11

def test_parallel_predict():
    """Check parallel backend of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1).fit(X, y)
    eclf2 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2).fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
Example #12
def test_estimator_weights_format():
    # Test estimator weights inputs as list and array
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)],
                             weights=[1, 2],
                             voting='soft')
    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)],
                             weights=np.array((1, 2)),
                             voting='soft')
    eclf1.fit(X, y)
    eclf2.fit(X, y)
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
Example #13
def my_classifier_predictions(X_train, Y_train, X_test, X_testt, Y_testt):
    #TODO: complete this

    clf1 = GradientBoostingClassifier()
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[('gbc', clf1), ('gnb', clf3)],
                            voting='soft')
    eclf = eclf.fit(X_train, Y_train)
    Y_train_pred = eclf.predict(X_train)
    print('train:')
    print(roc_auc_score(Y_train, Y_train_pred))
    Y_pred = eclf.predict_proba(X_test)[:, 1]
    print('test:')

    Y_predt = eclf.predict(X_testt)
    print(roc_auc_score(Y_testt, Y_predt))

    # parameters1=np.arange(0.5,1,0.01)
    # score1=[]
    # score2=[]
    # for parameter1 in parameters1:
    # 	clf=GradientBoostingClassifier(subsample=parameter1,max_features=28,max_depth=3,learning_rate=0.16,n_estimators=60,random_state=RANDOM_STATE)
    # 	clf=clf.fit(X_train,Y_train)
    # 	score_train=clf.score(X_train,Y_train)
    # 	score_test=clf.score(X_test,Y_test)
    # 	score1.append(score_train)
    # 	score2.append(score_test)
    # 	print(parameter1)
    # print(score1)
    # print(score2)
    # return 0

    # best_params_
    return Y_pred.flatten()
Example #14
def test_voting_classifier_set_params():
    # check equivalence in the output when setting underlying estimators
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()

    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
                             weights=[1, 2]).fit(X, y)
    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert eclf2.estimators[0][1].get_params() == clf1.get_params()
    assert eclf2.estimators[1][1].get_params() == clf2.get_params()
Example #15
class VotingEnsemble(BaseEnsembleModel):
    def __init__(self, learners, weights=None, random_drop_rate=0.5):
        super(VotingEnsemble, self).__init__(learners)
        self.weights = weights
        self.learners = learners
        self.random_drop_rate = random_drop_rate
        self.classifier = VotingClassifier(estimators=[
            ('{}_{}'.format(learner.__class__, i), learner)
            for i, learner in enumerate(learners)
        ],
                                           voting='soft',
                                           weights=weights)

    def _fit(self, X, y):
        X_, _, y_, _ = train_test_split(X,
                                        y,
                                        test_size=self.random_drop_rate,
                                        random_state=random.randint(1, 100000))
        self.classifier.fit(X_, y_)

    def _predict(self, X):
        return self.classifier.predict(X)

    def _predict_proba(self, X):
        return self.classifier.predict_proba(X)
Example #16
class VotingClassifierImpl():
    def __init__(self,
                 estimators=None,
                 voting='hard',
                 weights=None,
                 n_jobs=None,
                 flatten_transform=True):
        self._hyperparams = {
            'estimators': estimators,
            'voting': voting,
            'weights': weights,
            'n_jobs': n_jobs,
            'flatten_transform': flatten_transform
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
Example #17
def voting(X_train, X_test, y_train, y_test, estimators):
    # If model is already saved, load it and test it
    if os.path.isfile("./Models/votingModel.pkl"):
        ensemble = joblib.load('./Models/votingModel.pkl')
        predictions = ensemble.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:,1])
        auc_score = auc(fpr, tpr)
        train_acc = ensemble.score(X_train, y_train)
        acc = ensemble.score(X_test, y_test)
        print("Voting Ensemble Training Acc:", train_acc)
        print("Voting Ensemble Acc:", acc)
        print("Voting Ensemble AUC:", auc_score)
        return ensemble

    # Else create, train, save model and test model
    else:
        ensemble = VotingClassifier(estimators, voting='soft')
        ensemble.fit(X_train, y_train)
        joblib.dump(ensemble, "./Models/votingModel.pkl")
        predictions = ensemble.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:,1])
        auc_score = auc(fpr, tpr)
        acc = ensemble.score(X_test, y_test)
        print("Voting Ensemble Acc:", acc)
        print("Voting Ensemble AUC:", auc_score)
        return ensemble
Example #18

    def process_cell(self, df_cell_train, df_cell_test, window):

        place_counts = df_cell_train.place_id.value_counts()
        mask = (place_counts[df_cell_train.place_id.values] >= th).values
        df_cell_train = df_cell_train.loc[mask]

        # Working on df_test
        row_ids = df_cell_test.index

        # Preparing data
        le = LabelEncoder()
        y = le.fit_transform(df_cell_train.place_id.values)
        X = df_cell_train.drop(['place_id', ], axis=1).values.astype(int)
        X_test = df_cell_test.values.astype(int)

        # Applying the classifier
        clf1 = KNeighborsClassifier(n_neighbors=50, weights='distance',
                                    metric='manhattan')
        clf2 = RandomForestClassifier(n_estimators=50, n_jobs=-1)
        eclf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)], voting='soft')

        eclf.fit(X, y)
        y_pred = eclf.predict_proba(X_test)
        pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
        return pred_labels, row_ids
Example #19
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'],
                           axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf1 = KNeighborsClassifier(n_neighbors=25,
                                weights='distance',
                                metric='manhattan')
    clf2 = RandomForestClassifier(n_estimators=30, n_jobs=-1)
    eclf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)],
                            voting='soft')

    eclf.fit(X, y)
    y_pred = eclf.predict_proba(X_test)
    pred_labels = le.inverse_transform(
        np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
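The argsort expression above extracts the top-3 most probable classes per row. A tiny self-contained sketch of the same indexing trick:

import numpy as np

y_pred = np.array([[0.1, 0.5, 0.3, 0.05],    # probabilities: 2 samples, 4 classes
                   [0.7, 0.12, 0.1, 0.08]])
labels = np.array(['a', 'b', 'c', 'd'])

# Sort ascending, reverse each row, keep the first three columns.
top3 = labels[np.argsort(y_pred, axis=1)[:, ::-1][:, :3]]
print(top3)  # [['b' 'c' 'a'] ['a' 'b' 'c']]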
Example #20
def ensemble(X_train, X_test, y_train):
    alg1 = linear_model.LogisticRegression()
    alg2 = svm.SVC(probability=True)
    alg3 = GaussianNB()
    alg4 = KNeighborsClassifier(n_neighbors=5)
    alg5 = MLPClassifier(hidden_layer_sizes=(30, 30, 30))
    alg6 = GradientBoostingClassifier()
    estimators = []

    estimators.append(('logistic', alg1))
    estimators.append(('svm', alg2))
    estimators.append(('Gaussian', alg3))
    estimators.append(('KNeighbors', alg4))
    estimators.append(('MLP', alg5))
    estimators.append(('grad', alg6))
    ensemble = VotingClassifier(estimators,
                                voting='soft',
                                weights=[1, 1, 2, 2, 2, 2])

    ensemble.fit(X_train, y_train)

    predictions = ensemble.predict(X_test)
    y_prob = ensemble.predict_proba(X_test)

    return predictions, y_prob
Example #21
class VotingEnsembler(BaseModel):
    """Class that combines models using a voting method. A hard voting method is
    equivalent to a majority voting. A soft voting returns the class with the
    highest probability (calculated as the sum of the probabilities predicted by
    each model)"""
    def __init__(self, configs, score_method, predict_as_probability,
                 voting_method):
        BaseModel.__init__(self, configs, score_method, predict_as_probability)
        self.models_ = []
        self.model_weights_ = []
        self.voting_method_ = voting_method
        self.ensemble_model_ = None

    def init(self):
        """Method responsible for the ensembler initialization"""
        if not self.register_models():
            return False

        self.ensemble_model_ = VotingClassifier(estimators=self.models_,\
          voting=self.voting_method_, weights=self.model_weights_)
        return True

    def register_models(self):
        """Method used to register the prediction models"""
        models = self.configs_['pipeline_models']

        for i in range(0, len(models)):
            new_model = PipelineModel(models[i], self.score_method_,
                                      self.predict_as_probability_)

            if not new_model.init():
                print('Error registering model', models[i]['model']['id'],
                      'in ensemble')
                return False
            print('Model', new_model.get_name(), 'registered in ensemble')
            self.models_.append(
                (models[i]['model']['id'], new_model.get_sklearn_pipeline()))

            if 'weight' not in models[i]:
                self.model_weights_.append(1)
            else:
                self.model_weights_.append(models[i]['weight'])
        return True

    def get_name(self):
        """Get the label associated with a model (used for printing)"""
        if 'label' in self.configs_:
            return self.configs_['label']
        return self.configs_['id']

    def fit(self, input_data, targets):
        """Train the ensemble model"""
        self.ensemble_model_.fit(input_data, targets)

    def predict(self, input_data):
        """Predict the results of an ensemble model"""
        if not self.predict_as_probability_:
            return self.ensemble_model_.predict(input_data)
        else:
            return self.ensemble_model_.predict_proba(input_data)
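A minimal sketch contrasting the two voting methods the docstring describes, on illustrative data: 'hard' takes a majority over predicted labels, while 'soft' takes the argmax of the (weighted) summed probabilities.

from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(random_state=0)
ests = [('lr', LogisticRegression()), ('nb', GaussianNB()),
        ('dt', DecisionTreeClassifier(random_state=0))]

hard = VotingClassifier(ests, voting='hard').fit(X_demo, y_demo)  # majority of labels
soft = VotingClassifier(ests, voting='soft').fit(X_demo, y_demo)  # argmax of summed probas
print(hard.predict(X_demo[:5]), soft.predict(X_demo[:5]))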
Example #22
    def test_pipeline_voting_tfidf_svc(self):
        pipe1 = Pipeline([
            ('tfidf1', TfidfVectorizer()),
            ('svc', SVC(probability=True, kernel='linear'))])
        pipe2 = Pipeline([
            ('tfidf2', TfidfVectorizer(norm='l2', use_idf=False)),
            ('sgd', SGDClassifier(alpha=0.0001, penalty='l2',
                                  loss='modified_huber'))])
        pipe3 = Pipeline([
            ('tfidf3', TfidfVectorizer()),
            ('mnb', MultinomialNB())])
        voting = VotingClassifier(
            [('p1', pipe1), ('p2', pipe2), ('p3', pipe3)],
            voting='soft', flatten_transform=False)
        data = numpy.array(["first sentance", "second sentence",
                            "many sentances", "dummy sentance",
                            "no sentance at all"])
        y = numpy.array([0, 0, 1, 0, 1])
        voting.fit(data, y)
        expected_label = voting.predict(data)
        expected_proba = voting.predict_proba(data)
        df = pandas.DataFrame(data)
        df.columns = ['text']

        model_onnx = convert_sklearn(
            voting, initial_types=[('text', StringTensorType([None, 1]))],
            target_opset=TARGET_OPSET,
            options={id(voting): {'zipmap': False}})
        # with open("debug.onnx", "wb") as f:
        #     f.write(model_onnx.SerializeToString())
        sess = InferenceSession(model_onnx.SerializeToString())
        got = sess.run(None, {'text': data.reshape((-1, 1))})
        assert_almost_equal(expected_proba, got[1], decimal=5)
        assert_almost_equal(expected_label, got[0])
Example #23
def voting_model(X_train, X_test, y_train):
    vclf = VotingClassifier(estimators=[('xgb', xgb), ('rf', rf),
                                        ('lgb', lgb)],
                            voting='soft',
                            weights=[1, 1, 1])
    vclf.fit(X_train, y_train)
    predictions = vclf.predict_proba(X_test)[:, 1]
    return predictions
Example #24
def test_parallel_fit():
    """Check parallel backend of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1
    ).fit(X, y)
    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2
    ).fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
Example #25
def test_notfitted():
    eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
                                        ('lr2', LogisticRegression())],
                            voting='soft')
    ereg = VotingRegressor([('dr', DummyRegressor())])
    msg = ("This %s instance is not fitted yet. Call \'fit\'"
           " with appropriate arguments before using this estimator.")
    with pytest.raises(NotFittedError, match=msg % 'VotingClassifier'):
        eclf.predict(X)
    with pytest.raises(NotFittedError, match=msg % 'VotingClassifier'):
        eclf.predict_proba(X)
    with pytest.raises(NotFittedError, match=msg % 'VotingClassifier'):
        eclf.transform(X)
    with pytest.raises(NotFittedError, match=msg % 'VotingRegressor'):
        ereg.predict(X_r)
    with pytest.raises(NotFittedError, match=msg % 'VotingRegressor'):
        ereg.transform(X_r)
Example #26
def ensembleClassifier(models_folder,
                       data_folder,
                       save_suffix,
                       extension,
                       x=None,
                       y_true=None,
                       x_test=None,
                       y_test=None,
                       save=True):
    if x is None:
        x, y_true, x_test, y_test = load_data(data_folder, save_suffix,
                                              extension)

    print("loading models")
    Gradient_Boost_CV = pickle.load(
        open("{}{}{}{}".format(models_folder, "GBCV", save_suffix, extension),
             'rb'))
    Logistic_CV = pickle.load(
        open("{}{}{}{}".format(models_folder, "LRCV", save_suffix, extension),
             'rb'))
    Forest_CV = pickle.load(
        open("{}{}{}{}".format(models_folder, "RFCV", save_suffix, extension),
             'rb'))
    SVMCV = pickle.load(
        open("{}{}{}{}".format(models_folder, "SVMCV", save_suffix, extension),
             'rb'))

    gb_param = Gradient_Boost_CV.best_params_
    lr_param = Logistic_CV.best_params_
    rf_param = Forest_CV.best_params_
    sv_param = SVMCV.best_params_

    gb = GradientBoostingClassifier(**gb_param)
    rf = RandomForestClassifier(**rf_param)
    lr = LogisticRegression(**lr_param)
    sv = SVC(**sv_param)

    classifier = VotingClassifier(estimators=[('gb', gb), ('lr', lr),
                                              ('rf', rf), ('sv', sv)],
                                  voting="soft")

    print("start fitting my model...")
    classifier.fit(x, y_true)

    if (save):
        with open(
                "{}Ensembler{}{}".format(models_folder, save_suffix,
                                         extension), 'wb') as f:
            pickle.dump(classifier, f)

    "finishing off..."
    test_score = classifier.score(x_test, y_test)
    y_pred = classifier.predict(x_test)
    y_pred_proba = classifier.predict_proba(x_test)

    print(test_score)

    return test_score, y_pred, y_pred_proba
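The pattern above (unpacking each search's best_params_ into a fresh estimator before ensembling) can be reproduced in a few lines; a sketch with an in-memory grid search instead of pickled ones, on illustrative data:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_classification(random_state=0)
search = GridSearchCV(RandomForestClassifier(random_state=0),
                      {'n_estimators': [10, 50]}, cv=3).fit(X_demo, y_demo)

# Re-instantiate the tuned model from best_params_, then ensemble it as above.
rf = RandomForestClassifier(random_state=0, **search.best_params_)
classifier = VotingClassifier(estimators=[('rf', rf), ('lr', LogisticRegression())],
                              voting='soft').fit(X_demo, y_demo)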
Example #27
def process_one_cell(df_train, df_test, x_min, x_max, y_min, y_max):

    x_border_augment = 0.025
    y_border_augment = 0.0125

    #Working on df_train
    df_cell_train = df_train[(df_train['x'] >= x_min-x_border_augment) & (df_train['x'] < x_max+x_border_augment) &
                               (df_train['y'] >= y_min-y_border_augment) & (df_train['y'] < y_max+y_border_augment)]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    # to be delete: df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    df_cell_test = df_test[(df_test['x'] >= x_min) & (df_test['x'] < x_max) &
                               (df_test['y'] >= y_min) & (df_test['y'] < y_max)]
    row_ids = df_cell_test.index

    if(len(df_cell_train) == 0 or len(df_cell_test) == 0):
        return None, None

    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= fw[0]
    df_cell_train.loc[:,'y'] *= fw[1]
    df_cell_test.loc[:,'x'] *= fw[0]
    df_cell_test.loc[:,'y'] *= fw[1]

    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values.astype(float)

    if 'place_id' in df_cell_test.columns:

        cols = df_cell_test.columns
        cols = cols.drop('place_id')

        X_test = df_cell_test[cols].values.astype(float)

    else:

        X_test = df_cell_test.values.astype(float)

    #Applying the classifier
    # clf = KNeighborsClassifier(n_neighbors=26, weights='distance',
    #                            metric='manhattan')
    clf1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=26, weights='distance',
                                metric='manhattan'), n_jobs=-1, n_estimators=50)
    clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # predict_proba below requires soft voting; it is unavailable with voting='hard'
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='soft')

    eclf.fit(X, y)
    y_pred = eclf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])

    return pred_labels, row_ids
Example #28
def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('svc', clf3)],
                             voting='soft').fit(X,
                                                y,
                                                sample_weight=np.ones(
                                                    (len(y), )))
    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('svc', clf3)],
                             voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    sample_weight = np.random.RandomState(123).uniform(size=(len(y), ))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    # check that an error is raised and indicative if sample_weight is not
    # supported.
    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[('lr', clf1), ('svc', clf3),
                                         ('knn', clf4)],
                             voting='soft')
    msg = ('Underlying estimator KNeighborsClassifier does not support '
           'sample weights.')
    with pytest.raises(ValueError, match=msg):
        eclf3.fit(X, y, sample_weight)

    # check that _parallel_fit_estimator will raise the right error
    # it should raise the original error if this is not linked to sample_weight
    class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
        def fit(self, X, y, sample_weight):
            raise TypeError('Error unrelated to sample_weight.')

    clf = ClassifierErrorFit()
    with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
        clf.fit(X, y, sample_weight=sample_weight)
Example #29
def main():

    train_path = csvPath + "train\\";   
    test_path = csvPath +"test\\"
    
    data = getTrainY()
    
    # Read Train Data
    train_id, Xtrain_data, damaged_train_Image, Ytrain = prepareData(train_path, data)
    print "Read Train Data"
   
   # Generated Train Model
    surfFeatures, temp = generateSURFFeatures(Xtrain_data)
    centroids, histo_tr = generateTrainKmeans(surfFeatures, temp)
    
    print "Generated Train Model"

    # Read Test Data
    test_id, test_data, damaged_test_Image = prepareTestData(test_path)
    print "Read Test Data"    
    
    surfTestFeatures, temp1 = generateSURFFeatures(test_data)
    histo_te = generateTestKmeans(surfTestFeatures, centroids,temp1 )    
     
    print "Generated Test Model"
    
    # Classifier
    clf3 = SVC(probability=True, decision_function_shape='ovr')

    # Scaling data
    standard_scaler = StandardScaler()
    svm_tr = standard_scaler.fit(histo_tr)
    svm_trf = svm_tr.transform(histo_tr)
    
    svm_tr1 = standard_scaler.fit(histo_te)    
    svm_tef = svm_tr1.transform(histo_te)
    
    
    clf1 = DecisionTreeClassifier(max_depth = 3)
    clf2 = KNeighborsClassifier()
    eclf = VotingClassifier(estimators=[('dt',clf1),('knn',clf2),('svc',clf3)],voting='soft',weights=[2,1,2])

    # Cross validation
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(svm_trf, Ytrain, test_size=0.3, random_state=0)

    # Fit the data
    eclf.fit(X_train, y_train)
    print(eclf.score(X_test, y_test))
    pred_out = []

    for j in range(len(test_data)):
        pred_out.append(eclf.predict_proba([svm_tef[j]]))
     
    generateOutputCSV(test_id, pred_out, damaged_test_Image,'output.csv')
      
    print "Done!!!!!!"
Example #30
def voting_process(df_list, label_list, scale=False):
    random_state = np.random.RandomState(20180213)
    vt_results = {
        'prediction': [],
        'probability': [],
        'y_test': [],
        'y_score': []
    }
    try:
        if scale:
            df_list = [scale_df(df) for df in df_list]
            print('DF Scaling successful.')
    except:
        raise ValueError('Failed to execute DF Scaling.')

    for x, y in zip(df_list, label_list):

        try:
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=.2, random_state=random_state)
        except:
            raise ValueError('Train/Test split failed.')
        vt = VotingClassifier(estimators=[
            ('basic_log', LogisticRegression()),
            ('et', ExtraTreesClassifier()), ('ada', AdaBoostClassifier()),
            ('rf', RandomForestClassifier()),
            ('gbm',
             GradientBoostingClassifier(n_estimators=100,
                                        max_depth=5,
                                        learning_rate=0.1))
        ],
                              voting='soft')
        weighting = lambda x: 1 if x else 50
        vt.fit(x_train, y_train, sample_weight=[weighting(i) for i in y_train])

        vt_results['y_test'].append(y_test)
        vt_results['prediction'].append(vt.predict(x_test))
        vt_results['probability'].append(vt.predict_proba(x_test)[::, 1])
        try:
            vt_results['y_score'].append(vt.decision_function(x_test))
        except:
            vt_results['y_score'].append(vt.predict_proba(x_test)[::, 1])
    return vt_results
Example #31
def test_notfitted():
    eclf = VotingClassifier(
        estimators=[("lr1", LogisticRegression()),
                    ("lr2", LogisticRegression())],
        voting="soft",
    )
    ereg = VotingRegressor([("dr", DummyRegressor())])
    msg = ("This %s instance is not fitted yet. Call 'fit'"
           " with appropriate arguments before using this estimator.")
    with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
        eclf.predict(X)
    with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
        eclf.predict_proba(X)
    with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
        eclf.transform(X)
    with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
        ereg.predict(X_r)
    with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
        ereg.transform(X_r)
Example #32
def train_voting_classifier(estimators, X_train, y_train):
    seed_everything(seed=1903)
    voting_clf = VotingClassifier(estimators=estimators, voting='soft')
    voting_clf.fit(X_train, y_train)
    vc_pred = voting_clf.predict_proba(X_test)[:, 1]  # positive-class probability
    score = roc_auc_score(y_test, vc_pred)
    print(f'voting: {score:0.5f}')
    vc_df = pd.DataFrame(data=[score],
                         columns=['Voting Classifier Score'],
                         index=["ROC AUC Score"])
    return vc_df, voting_clf, vc_pred
Example #33
def voteClassification(featureMatrix, targets, testFeatureMatrix):
    # voting using SVM, nearest neighbours, and random forest
    clf1 = svm.SVC(kernel='poly', probability=True, degree=3)
    clf2 = KNeighborsClassifier(n_neighbors=5)
    clf3 = RandomForestClassifier(n_estimators=25)
    clf = VotingClassifier(estimators=[('svm', clf1), ('nei', clf2),
                                       ('rf', clf3)],
                           voting='soft',
                           weights=[1, 1, 1])
    clf.fit(featureMatrix, targets)
    return clf.predict_proba(testFeatureMatrix)
Example #34
def run():
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as pyplot
        import seaborn as sns
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
        from sklearn.ensemble import VotingClassifier
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.naive_bayes import GaussianNB
        from sklearn.ensemble import ExtraTreesClassifier
        
        df = pd.read_table("./data/australian.csv", sep='\s+', header=None)
        y = df[14]
        X = df.drop(columns = 14)
        y.value_counts()
        # Split features and target into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y, test_size = 0.4)
        
        # Instantiate the Classifiers
        
        estimators = [('GNB', GaussianNB()), ('extc', ExtraTreesClassifier()), ('KNN', KNeighborsClassifier())]
        
        clf = VotingClassifier(estimators=estimators, voting='soft')
        
        clf.fit(X_train, y_train)
        # Make predictions for the test set
        y_pred_test = clf.predict(X_test)


        # View accuracy score
        
        print(classification_report(y_test, y_pred_test))

        clf_probs = clf.predict_proba(X_test)
        # keep probabilities for the positive outcome only
        clf_probs = clf_probs[:, 1]
        # calculate scores
        clf_auc = roc_auc_score(y_test, clf_probs)
        # summarize scores
        print('ensemble: ROC AUC=%.3f' % (clf_auc))
        print("accuracy_score is %.3f" % (accuracy_score(y_test, y_pred_test, normalize=True)))
        # calculate roc curves
        clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_probs)
        # plot the roc curve for the model
        pyplot.plot(clf_fpr, clf_tpr, marker='.', label='Ensemble')
        # axis labels
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        # show the legend
        pyplot.legend()
        # show the plot
        pyplot.show()
Example #35
    def voting_model(self, X_train, X_test, y_train, bst_xgb, bst_forest,
                     bst_gradient, bst_lgb):

        vclf = VotingClassifier(estimators=[('xgb', bst_xgb),
                                            ('rf', bst_forest),
                                            ('gbm', bst_gradient),
                                            ('lgb', bst_lgb)],
                                voting='soft',
                                weights=[2, 1, 1, 2])
        vclf.fit(X_train, y_train)
        predictions = vclf.predict_proba(X_test)[:, 1]
        return predictions
Example #36
def main():
    df_train = pd.read_csv('data/train_data.csv')
    df_valid = pd.read_csv('data/valid_data.csv')
    df_test = pd.read_csv('data/test_data.csv')

    feature_cols = [f for f in list(df_train) if "feature" in f]
    target_col = df_train.columns[-1]

    X_train = df_train[feature_cols]
    y_train = df_train[target_col]

    X_valid = df_valid[feature_cols]
    y_valid = df_valid[target_col]

    X_test = df_test[feature_cols]

    clf1 = LogisticRegression(C=1e-2, penalty='l2', n_jobs=-1)
    clf2 = RandomForestClassifier(n_jobs=-1, warm_start=True)
    clf3 = CatBoostClassifier(learning_rate=1e-2)

    ensemble = VotingClassifier(
        estimators=[('lr', clf1), ('rf', clf2), ('cb', clf3)],
        voting='soft',
        n_jobs=-1)

    print('Fitting...')
    start_time = time.time()
    ensemble.fit(X_train, y_train)
    print('Fit: {}s'.format(time.time() - start_time))

    p_valid = ensemble.predict_proba(X_valid)
    loss = log_loss(y_valid, p_valid)
    print('Loss: {}'.format(loss))

    p_test = ensemble.predict_proba(X_test)
    df_pred = pd.DataFrame({'id': df_test['id'], 'probability': p_test[:, 1]})
    csv_path = 'predictions/predictions_{}_{}.csv'.format(
        int(time.time()), loss)
    df_pred.to_csv(csv_path, columns=('id', 'probability'), index=None)
    print('Saved: {}'.format(csv_path))
Example #37
def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    # check that an error is raised and indicative if sample_weight is not
    # supported.
    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = ('Underlying estimator KNeighborsClassifier does not support '
           'sample weights.')
    with pytest.raises(ValueError, match=msg):
        eclf3.fit(X, y, sample_weight)

    # check that _parallel_fit_estimator will raise the right error
    # it should raise the original error if this is not linked to sample_weight
    class ClassifierErrorFit(BaseEstimator, ClassifierMixin):
        def fit(self, X, y, sample_weight):
            raise TypeError('Error unrelated to sample_weight.')
    clf = ClassifierErrorFit()
    with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
        clf.fit(X, y, sample_weight=sample_weight)
Example #38
def main(argv):
    trainX = pd.read_csv('trainingData.txt','\t', header = None)
    trainX.drop(trainX.columns[len(trainX.columns)-1], axis = 1, inplace = True)
    trainY = pd.read_csv("trainingTruth.txt", header = None, names = ['Y'])
    df = trainX.join(trainY)
    index = df.isnull().sum(axis=1) <= 2
    df = df[index]
    df.fillna(df.median(), inplace = True)
    print(len(df))
    #df.dropna(axis=0, inplace=True) # drop the row with NA in training.
    X = df.iloc[:,0:-1].values
    Y = df['Y'].values

    Y_binary = np.ones((len(Y),3)) * (-1)
    for i in range(3):
        index = Y == (i+1)
        Y_binary[index,i] = 1

    X_scaled = preprocessing.scale(X)
    X_PCA = PCA(n_components=30).fit_transform(X_scaled)

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1, n_estimators=20)
    clf3 = GaussianNB()

    clf4 = DecisionTreeClassifier(max_depth=4)
    clf5 = KNeighborsClassifier(n_neighbors=7)
    clf6 = SVC(kernel='rbf', probability=True)
    clf7 = AdaBoostClassifier(random_state=1)

    testX = pd.read_csv('testData.txt','\t', header = None)
    testX.drop(testX.columns[len(testX.columns)-1], axis = 1, inplace = True)
    testX.fillna(testX.median(), inplace = True) # Handle NA in test data, although not necessary for this assignment.

    testX_scaled = preprocessing.scale(testX)
    testX_PCA = PCA(n_components=30).fit_transform(testX_scaled)

    proba = np.zeros((len(testX),3))
    for i in range(3):
        eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3),
                                         ('dt', clf4), ('kn', clf5), ('svc', clf6)], 
                                 voting='soft').fit(X_PCA,Y_binary[:,i])

        proba[:,i] = eclf.predict_proba(testX_PCA)[:,1]
        

    # Write to file
    results = pd.DataFrame(proba)
    results['prediction'] = np.argmax(proba, axis=1) + 1
    results.to_csv('testY.txt', sep='\t', header = False, index = False)

    print(results.iloc[0:10,:])
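The per-class loop above is a hand-rolled one-vs-rest scheme. scikit-learn's OneVsRestClassifier can wrap the soft-voting ensemble and produce one probability column per class directly; a sketch on illustrative data:

from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB

X_demo, y_demo = make_classification(n_classes=3, n_informative=4, random_state=0)
base = VotingClassifier([('lr', LogisticRegression()), ('nb', GaussianNB())],
                        voting='soft')
ovr = OneVsRestClassifier(base).fit(X_demo, y_demo)
proba = ovr.predict_proba(X_demo)  # shape (n_samples, 3): one column per class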
Example #39
def voting_classifier(files, var):
    res_df = list()
    fpath = files['path']
    datasets = pd.read_csv(fpath)
    datasets = datasets.dropna()
    datasets.drop_duplicates(inplace=True)
    dsets = shuffle(datasets)
    if int(var) == 1:
        d_train, d_test, l_train, l_test = model_selection.train_test_split(datasets['text'],datasets['spam'],test_size=0.33, random_state=42)
        dtrain_msg = features_transform(mail=d_train, dtrain=d_train, var1='VOTE')
        n_est1 = hp.BAG_class(files)
        n_est2 = hp.RFC_class(files)
        n_est3 = hp.AB_class(files)
        alp_dict = hp.MNB_class(files)
        bag_class =  BaggingClassifier(n_estimators=100)
        model_rf=RandomForestClassifier(n_estimators=20,criterion='entropy')
        ada_class = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=62)
        modelMNB = naive_bayes.MultinomialNB()
        eclf = VotingClassifier(estimators=[('BgC', bag_class), ('RF', model_rf), ('Ada', ada_class), ('MNB', modelMNB) ], voting='soft')
        train_classifier(eclf, dtrain_msg, l_train, typ="VOTE")
        eclf.fit(dtrain_msg, l_train)
        pred_train = eclf.predict(dtrain_msg)
        mnb_dict = model_assessment(u_classify='Voting EM', y_data=l_train, predicted_class=pred_train)
        return mnb_dict
    elif int(var) == 2:
        print("Inside training phase : ")
        d_test = dsets['text']
        eclf = load(open('VOTE.pkl', 'rb'))
        vect = load(open('vectVOTE.pkl', 'rb'))
        tf = TfidfTransformer()
        load_vect = CountVectorizer(vocabulary=vect)
        last = len(d_test)
        for i in range(0, last):
            if d_test.get(i) is not None:
                tup = [d_test[i]]
                # NOTE: the vectorizer/TF-IDF are refit per message; only the
                # vocabulary is fixed, so the idf statistics differ from the
                # training ones (see the pipeline sketch after this example).
                dtest_msg = tf.fit_transform(load_vect.fit_transform(tup))
                pred_test = eclf.predict(dtest_msg)
                pred = eclf.predict_proba(dtest_msg)
                res_df.append((i+1, [pred[0][0], pred[0][1], pred_test[0]]))
        df = pd.DataFrame.from_dict(dict(res_df), orient='index',
                                    columns=['Class 0', 'Class 1', 'Result'])
        print(df.head(15))
        return df
    else:
        d_train, d_test, l_train, l_test = model_selection.train_test_split(datasets['text'],datasets['spam'],test_size=0.33, random_state=42)
        eclf = load(open('VOTE.pkl', 'rb'))
        vect = load(open('vectVOTE.pkl', 'rb'))
        tf = TfidfTransformer()
        load_vect = CountVectorizer(vocabulary=vect)
        dtest_msg = tf.fit_transform(load_vect.fit_transform(d_test))
        pred_test = eclf.predict(dtest_msg)
        mnb_dict = model_assessment(u_classify='Voting EM', y_data=l_test, predicted_class=pred_test)
        return mnb_dict
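
Branch 2 above refits the TF-IDF statistics on every single message, so the transform-time features do not match training. A minimal sketch of the more robust alternative, pickling the whole fitted pipeline instead of only the vocabulary (file name and toy corpus are illustrative assumptions):

from pickle import dump, load
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB())])
pipe.fit(["free money now", "meeting at noon", "win cash prizes"], [1, 0, 1])
dump(pipe, open('spam_pipeline.pkl', 'wb'))          # persist vocabulary, idf and model together
loaded = load(open('spam_pipeline.pkl', 'rb'))
print(loaded.predict_proba(["free meeting money"]))  # idf statistics match training exactly
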
Ejemplo n.º 40
0
def voting_class(X,training_target,Y):
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier
    
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
    eclf.fit(X[:,0:6],training_target)
    proba = eclf.predict_proba(Y[:,0:6])
    pred = eclf.predict(Y[:,0:6])  # predict() requires the feature matrix
    return pred, proba
Ejemplo n.º 41
0
def all_classifer(X_train, y_train, X_test, y_test):
    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced').fit(X_train, y_train)
    score1 = scores(y_test, rf.predict(X_test), rf.predict_proba(X_test)[:,1], 'RT')
    gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.05).fit(X_train, y_train)
    score2 = scores(y_test, gbc.predict(X_test), gbc.predict_proba(X_test)[:,1], 'gbc')
    # min_samples_split must be >= 2 in scikit-learn
    ets = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0).fit(X_train, y_train)
    score3 = scores(y_test, ets.predict(X_test), ets.predict_proba(X_test)[:,1], 'ets')
#    lgr = LogisticRegression()
#    score4=scores(y_test,lgr.fit(X_train,y_train).predict(X_test),'lgr') 
    ab = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=50, learning_rate=0.7).fit(X_train, y_train)
    score5 = scores(y_test, ab.predict(X_test), ab.predict_proba(X_test)[:,1], 'abboost')
#    print roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
#    bagging=BaggingClassifier()
#    score8=scores(y_test,bagging.fit(X_train,y_train).predict(X_test),'bagging')    
    
#    dt = DecisionTreeClassifier(max_depth=None, min_samples_split=1,random_state=0)
#    score6=scores(y_test,dt.fit(X_train,y_train).predict(X_test),'dt') 
    # Weight each member's vote by its individual score from above.
    eclf = VotingClassifier(estimators=[('rf', rf), ('gd', gbc), ('ETs', ets), ('ab', ab)],
                            voting='soft', weights=[score1[0], score2[0], score3[0], score5[0]])
    score7 = scores(y_test, eclf.fit(X_train, y_train).predict(X_test), eclf.predict_proba(X_test)[:,1], 'voting')
    print(eclf)
    return [score1, score2, score3, score5, score7]
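
The weighting idea above (each member votes in proportion to its own score) can be made less optimistic by scoring on cross-validation rather than the test set. A minimal, self-contained sketch under that assumption:

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=400, random_state=0)
members = [('rf', RandomForestClassifier(n_estimators=100, random_state=0)),
           ('gb', GradientBoostingClassifier(n_estimators=50, random_state=0))]
weights = [cross_val_score(est, X, y, cv=3).mean() for _, est in members]
eclf = VotingClassifier(estimators=members, voting='soft', weights=weights).fit(X, y)
print(weights, eclf.score(X, y))
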
Ejemplo n.º 42
0
class VtClassifier(Model):
    '''
    Voting Classifier
    '''

    def __init__(self, *args):
        Model.__init__(self)
        self.modelIndex = ['GNB', 'SVClassifier', 'LRModel', 'ABClassifier', 'GBClassifier']
        self.models = []
        self.estimators = []
        for arg in args:
            index = self.modelIndex.index(arg)
            if index == 0:
                self.models.append(Model())
                self.estimators.append((arg, Model().model))
            elif index == 1:
                self.models.append(SVClassifier())
                self.estimators.append((arg, SVClassifier().model))
            elif index == 2:
                self.models.append(LRModel())
                self.estimators.append((arg, LRModel().model))
            elif index == 3:
                self.models.append(ABClassifier())
                self.estimators.append((arg, ABClassifier().model))
            elif index == 4:
                self.models.append(GBClassifier())
                self.estimators.append((arg, GBClassifier().model))
        self.model = VotingClassifier(estimators=self.estimators, voting='hard')

    def train(self, data, target):
        for model in self.models:
            model.train(data, target)
        self.model.fit(data, target)

    def predict(self, test):
        # voting='hard' exposes only class labels; calling predict_proba on a
        # hard-voting ensemble raises AttributeError.
        return self.model.predict(test)
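
A minimal sketch of the hard- versus soft-voting contrast behind the fix above: hard voting exposes only class labels, while soft voting also exposes averaged probabilities (synthetic data for illustration):

from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(random_state=0)
members = [('lr', LogisticRegression(max_iter=1000)), ('gnb', GaussianNB())]
hard = VotingClassifier(estimators=members, voting='hard').fit(X, y)
soft = VotingClassifier(estimators=members, voting='soft').fit(X, y)
print(hard.predict(X)[:5])        # majority vote over predicted labels
print(soft.predict_proba(X)[:2])  # averaged probabilities; 'hard' voting has no predict_proba
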
    pred_test = clf.predict(test_[cols])  # predict once and reuse below
    preds = clf.predict_proba(test_[cols])
    #print(confusion_matrix(test['class'], pred_test))
    print(pd.crosstab(test_['TripType'], pred_test, rownames=["Actual"], colnames=["Predicted"]))
    print(classification_report(test_['TripType'], pred_test))
    score = accuracy_score(test_['TripType'], pred_test)
#    table.append([name,score])
print(score)
'''
clf= VotingClassifier(estimators = [('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))),
    ('RandomForest', RandomForestClassifier(10)),
    ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))],
    voting='soft', weights=[7,1,1])
clf.fit(train_[cols], train_["TripType"])
clf.predict(test_[cols])
preds = clf.predict_proba(test_[cols])
#print(confusion_matrix(test['class'], clf.predict(test[cols])))
print (pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"]))
print (classification_report(test_['TripType'], clf.predict(test_[cols])))
score=accuracy_score(test_['TripType'],clf.predict(test_[cols]))
#table.append([score])
print (score)

eclf = VotingClassifier(estimators = [('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))),
    ('BaggingRandomForest', RandomForestClassifier(10)),
    ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))],
    voting='soft', weights=[7,1,1])
eclf.fit(train[cols], train["TripType"])
#use the classifier to predict
predicted=eclf.predict(test[cols])
#print (accuracy_score(predicted,test['TripType']))
'''
####################################
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=200,max_depth = 15,random_state=1)
clf3 = GaussianNB()

clf4 = xgb.XGBClassifier(missing=np.nan, max_depth=15, n_estimators=200, learning_rate=0.02, nthread=16, subsample=0.95, colsample_bytree=0.85, seed=4242)
clf5 = AdaBoostClassifier(n_estimators=300, learning_rate=0.02,random_state=1)

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('xgb', clf4),('adb',clf5)], voting='soft')

print("fitting..")
eclf1 = eclf1.fit(X_train, y_train)

print("predicting..")
rfpreds = eclf1.predict_proba(X_test)

print("arrived at verdict..")
###################################

fpr, tpr, thresholds = roc_curve(y_test, rfpreds[:,1], pos_label=1)
plt.figure()
plt.plot(fpr, tpr)
plt.show()

print(auc(fpr, tpr))
bestMCCR = 0
for threshold in thresholds:
    predicted = rfpreds[:,1] > threshold
    CCR1, CCR2, mCCR = MCCR(predicted, y_test, 0, 1)
    bestMCCR = max(bestMCCR, mCCR)
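
The loop above sweeps every ROC threshold and keeps the best MCCR. MCCR is a custom helper defined elsewhere; a minimal, self-contained sketch of the same sweep using balanced accuracy as a stand-in criterion:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, roc_curve
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
proba = LogisticRegression(max_iter=1000).fit(X_tr, y_tr).predict_proba(X_te)[:, 1]
fpr, tpr, thresholds = roc_curve(y_te, proba)
best = max(balanced_accuracy_score(y_te, (proba > t).astype(int)) for t in thresholds)
print(best)
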
Ejemplo n.º 45
0
param1 = {'max_depth':7, 'learning_rate':0.1, 'silent':0, 'objective':'multi:softprob','num_class':5,
        'eval_metric':'mlogloss','subsample':0.75,'colsample_bytree':0.85,'reg_lambda':1,'n_estimators':num_round}

param2 = {'max_depth':6, 'learning_rate':0.1, 'silent':0, 'objective':'multi:softprob','num_class':5,
        'eval_metric':'mlogloss','subsample':0.85,'colsample_bytree':0.75,'reg_lambda':1,'n_estimators':num_round}

param3 = {'max_depth':8, 'learning_rate':0.03, 'silent':0, 'objective':'multi:softprob','num_class':5,
        'eval_metric':'mlogloss','subsample':0.65,'colsample_bytree':0.75,'reg_lambda':1,'n_estimators':num_round}

param4 = {'max_depth':9, 'learning_rate':0.03, 'silent':0, 'objective':'multi:softprob','num_class':5,
        'eval_metric':'mlogloss','subsample':0.55,'colsample_bytree':0.65,'reg_lambda':1,'n_estimators':num_round}

param5 = {'max_depth':12, 'learning_rate':0.03, 'silent':0, 'objective':'multi:softprob','num_class':5,
        'eval_metric':'mlogloss','subsample':1,'colsample_bytree':1,'reg_lambda':1,'n_estimators':num_round}

# The parameter dicts must be unpacked with ** so each key becomes a keyword
# argument; passing a dict positionally would assign it to max_depth.
bst1 = xgb.XGBClassifier(**param1)
bst2 = xgb.XGBClassifier(**param2)
bst3 = xgb.XGBClassifier(**param3)
bst4 = xgb.XGBClassifier(**param4)
bst5 = xgb.XGBClassifier(**param5)

# prob = (bst1.predict(test) + bst2.predict(test) + bst3.predict(test) +  bst4.predict(test) +  bst5.predict(test))/5

model = VotingClassifier(estimators=[('xgb1',bst1),('xgb2',bst2),('xgb3',bst3),('xgb4',bst4),('xgb5',bst5)],voting='soft')
model.fit(X,y)
prob = model.predict_proba(test)
sub = pd.DataFrame(prob,columns=['Adoption','Died','Euthanasia','Return_to_owner','Transfer'])
sub['ID'] = IDs
sub = sub[['ID','Adoption','Died','Euthanasia','Return_to_owner','Transfer']]
sub.to_csv('sub.csv',index=False)
Ejemplo n.º 46
0
    def fit(self):
        clf_list=[]
        # # KNN
        # print "KNN"
        # knn = KNeighborsClassifier(n_neighbors=35, weights='distance', leaf_size=2)
        # print "Fitting KNN"
        # knn.fit(self.X_train, self.y_train)
        # print('KNN {score}'.format(score=log_loss(self.y_test, knn.predict_proba(self.X_test))))
        # self.clfs['knn'] = knn
        # clf_list.append(knn)
        # Random forests
        print "Random forest on gini"
        rfc = RandomForestClassifier(n_estimators=43,
                                     criterion='gini',
                                     random_state=4141,
                                     n_jobs=-1,
                                     max_depth=21,
                                     max_features=0.12)
        print "Fitting random forest with gini"
        rfc.fit(self.X_train, self.y_train)
        print('RFC LogLoss {score}'.format(score=log_loss(self.y_test, rfc.predict_proba(self.X_test))))
        self.clfs['rfc']=rfc
        clf_list.append(rfc)
        print "Random forest with entropy"
        rfc2 = RandomForestClassifier(n_estimators=80,
                                      criterion='entropy',
                                      random_state=1337,
                                      n_jobs=-1,
                                      max_depth=36,
                                      max_features=0.06)
        print "Fitting random forest with entropy"
        rfc2.fit(self.X_train, self.y_train)
        print('RFC2 LogLoss {score}'.format(score=log_loss(self.y_test, rfc2.predict_proba(self.X_test))))
        self.clfs['rfc2']=rfc2
        clf_list.append(rfc2)
        # Logistic regression
        print "Logistic regression on logloss"
        logreg = LogisticRegression(C=1.05, penalty='l2')
        print "Fitting logistic regression"
        logreg.fit(self.X_train, self.y_train)
        print('LR LogLoss {score}'.format(score=log_loss(self.y_test, logreg.predict_proba(self.X_test))))
        self.clfs['lr']=logreg
        clf_list.append(logreg)

        # # gradient boosting
        # gbt1=GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth = 1, random_state = 0)
        # print "Fitting gradient boosting tree"
        # gbt1.fit(self.X_train, self.y_train)
        # print('Gbt1 LogLoss {score}'.format(score=log_loss(self.y_test, gbt1.predict_proba(self.X_test))))
        # self.clfs['gbt1']=gbt1
        # clf_list.append(gbt1)

        # # Bad performance
        # # Multinomial Naive Bayes
        # print "Multinomial naive bayes"
        # mnb = MultinomialNB(fit_prior=False,alpha=0.25)
        # print "Fitting multinomial naive bayes"
        # mnb.fit(self.X_train, self.y_train)
        # print('MNB {score}'.format(score=log_loss(self.y_test, mnb.predict_proba(self.X_test))))
        # self.clfs['mnb'] = mnb
        # clf_list.append(mnb)

        # Adaboost
        print "Adaboost trees"
        abc = AdaBoostClassifier(n_estimators=100,learning_rate=0.5)
        print "Fitting Adaboost trees"
        abc.fit(self.X_train, self.y_train)
        print('ABC {score}'.format(score=log_loss(self.y_test, abc.predict_proba(self.X_test))))
        self.clfs['abc'] = abc
        clf_list.append(abc)


        # Ensemble the models
        eclf3 = VotingClassifier(estimators=[('lr', logreg), ('rf', rfc), ('rf2', rfc2), ('abc', abc)],
                                 voting='soft', weights=[2, 2, 2, 1])
        # Hack: inject the already-fitted estimators instead of calling
        # eclf3.fit(); this leans on VotingClassifier internals (fit normally
        # sets estimators_ as well as the label encoder) and can break in
        # newer scikit-learn releases.
        eclf3.estimators_ = clf_list
        print("Dig into the voting classifier")
        innerClfs = eclf3.estimators_
        print("Check estimators")
        print(innerClfs)
        print('Ensemble LogLoss {score}'.format(score=log_loss(self.y_test, eclf3.predict_proba(self.X_test))))
        self.ensembleClf = eclf3
        print("Ensemble fitting finished")
XGBClassifier1 = xgb.XGBClassifier(  # earlier hyperparameters truncated in the source
                                   seed=23)
XGBClassifier2 = xgb.XGBClassifier(objective='binary:logistic',
                                   missing=9999999999,
                                   max_depth=8,
                                   n_estimators=1000,
                                   learning_rate=0.05,
                                   nthread=4,
                                   subsample=0.8,
                                   colsample_bytree=0.5,
                                   min_child_weight=8,
                                   seed=1313)
#0.825461
classifier = VotingClassifier([('clf1', XGBClassifier1), ('clf2', XGBClassifier2)], voting='soft', weights=[1, 1])

classifier.fit(X_train, y_train)


testingPreds = classifier.predict_proba(sel_test)
submission = pd.DataFrame({"ID": test.index, "TARGET": testingPreds[:,1]})
submission.to_csv("XGBoostEnsembled.csv", index=False)


# mapFeat = dict(zip(["f"+str(i) for i in range(len(features))],features))
# ts = pd.Series(clf.booster().get_fscore())
# #ts.index = ts.reset_index()['index'].map(mapFeat)
# ts.sort_values()[-15:].plot(kind="barh", title=("features importance"))
#
# featp = ts.sort_values()[-15:].plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
# plt.title('XGBoost Feature Importance')
# fig_featp = featp.get_figure()
# fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)
Ejemplo n.º 48
0
xgb2 = xgb.XGBClassifier(max_depth=11,
                            n_estimators=100,
                            learning_rate=0.03,
                            subsample=0.96,
                            colsample_bytree=0.45,
                            colsample_bylevel=0.45,
                            objective='binary:logistic',
                            nthread=4,
                            seed=1313)
#score = log_loss(y_test, extc.predict_proba(X_test)[:, 1])

X_train, X_test, y_train, y_test = model_selection.train_test_split(train, target, random_state=1301, test_size=0.3)  # sklearn.cross_validation was removed; its helpers live in sklearn.model_selection

clfs = [('etc', etc1), ('rf', rf1), ('xgb', xgb1), ('etc2', etc2)]
# # set up ensemble of rf_1 and rf_2
clf = VotingClassifier(estimators=clfs, voting='soft', weights=[1, 1, 1, 1])
st = time.time()
scores = model_selection.cross_val_score(clf, X_train, y_train, scoring='neg_log_loss', cv=5, verbose=2)
print(scores.mean()*-1)  # 'neg_log_loss' is negated, so flip the sign back
print("time elaspe", time.time() - st)
exit()

clf.fit(train, target)
print('Predict...')
y_pred = clf.predict_proba(test)

# print y_pred

pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:, 1]}).to_csv('data/extra_trees_1_7.csv', index=False)
bagged_rf.fit(X_train, y_train)
print("bagged rf test", roc_auc_score(y_test, bagged_rf.predict_proba(X_test)[:,1]))
#print("bagged rf train", roc_auc_score(y_train, bagged_rf.predict_proba(X_train)[:,1]))

'''print "Calibrating Bagged Decision Trees..."
calibrated_dt.fit(X_train, y_train)
print "calibrated_dt test:", roc_auc_score(y_test, calibrated_dt.predict_proba(X_test)[:,1])

print "Calibrating Bagged Random Forests..."
calibrated_rf.fit(X_train, y_train)
print "calibrated_rf test:", roc_auc_score(y_test, calibrated_rf.predict_proba(X_test)[:,1])
'''
print "Voting with all models...."
voted_model = VotingClassifier(estimators=[('one', ada), ('two', bagged_rf), ('four', bagged_dt)], voting='soft')
voted_model.fit(X_train, y_train)
print "Voted Model test:",roc_auc_score(y_test, voted_model.predict_proba(X_test)[:,1])
#print "Voted Model train",roc_auc_score(y_train, voted_model.predict_proba(X_train)[:,1])

####Loading test file and saving predictions

print "Saving Voted Submission"
X_test = np.genfromtxt ('test_normal_286.csv', delimiter=",")
ncounts = np.zeros((X_test.shape[0], 1))
for i in range(0, X_test.shape[0]):
	ncounts[i, 0] = (X_test[i, :] == 0).sum(0)
X_test = np.append(X_test, ncounts, axis = 1)

categories_test = clusters.predict(X_test)
cats = np.zeros((len(categories_test), 1))
for i in range(0, cats.shape[0]):
	cats[i, 0] = categories_test[i]
Ejemplo n.º 50
0
def main(argv):
  f = open("trainingData.txt")

  cols = []
  for col_i in range(N_COLS):
    col = []
    cols.append(col)

  rows = []
  rows_na = []
  n_na = 0
  curr_n_na = 0
  freq_dict = {}

  while True:
    row = f.readline()
    if row == "": break
    features = [float(number) if number != 'NA' else PLACEHOLDER for number in row.split()]

    curr_n_na = 0
    for col_i in range(N_COLS):
      if features[col_i] != PLACEHOLDER:
        cols[col_i].append(features[col_i])
      else:
        n_na += 1
        curr_n_na += 1

    rows.append(features)
    rows_na.append(curr_n_na)

    if curr_n_na in freq_dict:
      freq_dict[curr_n_na] += 1
    else:
      freq_dict[curr_n_na] = 1
  f.close()

  print ("NA distribution: ",freq_dict)
  print ("Total # of NA:", n_na)

  medians = []
  for col_i in range(N_COLS):
    medians.append(statistics.median(cols[col_i]))

  for i, features in enumerate(rows):
    for j, feature in enumerate(features):
      if feature == PLACEHOLDER:
        rows[i][j] = medians[j]

  X = np.array(rows)

  f = open("trainingTruth.txt")
  rows = []
  while True:
    row = f.readline()
    if row == "": break
    rows.append(int(row))
  f.close()

  Y = np.array(rows)

  print ("# of each label:", np.bincount(Y))

  take = []
  for i in range(X.shape[0]):
    if Y[i] == 1 and rows_na[i] == 0:
      take.append(i)
    elif Y[i] == 2 and rows_na[i] == 0:
      take.append(i)
    elif Y[i] == 3 and rows_na[i] <= 1:
      take.append(i)

  X = X[take]
  Y = Y[take]

  print ("# of each label after normalization:", np.bincount(Y))

  # Fit the scaler and PCA on the training data only, then reuse them on the test data.
  scaler = preprocessing.StandardScaler().fit(X)
  X_scaled = scaler.transform(X)
  pca = PCA(n_components=3).fit(X_scaled)
  X_PCA = pca.transform(X_scaled)

  clf1 = LogisticRegression(random_state=1)
  clf2 = RandomForestClassifier(random_state=1, n_estimators=20)
  clf3 = GaussianNB()
  clf4 = DecisionTreeClassifier(max_depth=4)
  clf5 = KNeighborsClassifier(n_neighbors=7)
  clf6 = SVC(kernel='rbf', probability=True)

  estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3),
              ('dt', clf4), ('kn', clf5), ('svc', clf6)]

  eclf = VotingClassifier(estimators, voting='soft').fit(X_PCA,Y)

  testX = pd.read_csv('testData.txt', sep='\t', header=None)
  testX.drop(testX.columns[len(testX.columns)-1], axis=1, inplace=True)
  # testX.fillna(testX.median(), inplace = True) # Handle NA in test data, although not necessary for this assignment.

  # Reuse the training-set scaler and PCA rather than refitting them on the test data.
  testX_scaled = scaler.transform(testX)
  testX_PCA = pca.transform(testX_scaled)

  proba = eclf.predict_proba(testX_PCA)
  prediction = eclf.predict(testX_PCA)
  
  # Write to file
  results = pd.DataFrame(proba)
  results['prediction'] = prediction
  results.to_csv('testY_1114.txt', sep='\t', header = False, index = False)
  # results['prediction'].to_csv('testY_1114.txt', sep='\t', header = False, index = False)

  print(results.iloc[0:10,:])

  return  # early return: the per-estimator diagnostics below are kept but never run

  for i, estimator in enumerate(estimators):
    print (i)

    curr_clf = estimator[1]
    curr_clf.fit(X_PCA, Y)

    proba = curr_clf.predict_proba(testX_PCA)
    prediction = curr_clf.predict(testX_PCA)

    results = pd.DataFrame(proba)
    results['prediction'] = prediction
    
    print(results.iloc[0:10,:])
Ejemplo n.º 51
0
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split


class VotingWeightSearchCV(BaseEstimator, ClassifierMixin, TransformerMixin):
    """
    Soft voting classifier that optimizes the per-estimator weights on a
    held-out split of the training data
    """
    def __init__(self, estimators, test_size=0.33, starting_weights=None,
                 verbose=0, random_state=None, refit=False):
        self.test_size = test_size
        self.estimators = estimators
        self.verbose = verbose
        self.random_state = random_state
        self.refit = refit

        if starting_weights is not None:
            self.starting_weights = starting_weights
        else:
            self.starting_weights = [0.5] * len(estimators)

        self.best_estimator_ = None
        self.weights_ = None
        self.peak_score_ = None

    def _log(self, msg, verbosity=0):
        if self.verbose >= verbosity:
            print("{pre} {ind}{msg}".format(
                pre="(SW)",
                ind="".join(["  "] * verbosity),
                msg=msg
            ))

    def fit(self, X, y):
        """Train and find the optimum weights.

        https://www.kaggle.com/hsperr/otto-group-product-classification-challenge/finding-ensamble-weights/code
        https://www.kaggle.com/sushanttripathy/otto-group-product-classification-challenge/wrapper-for-models-ensemble/code
        """

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size = self.test_size,
            random_state = self.random_state,
            stratify = y
        )

        fitted_estimators = []
        predictions = []

        def log_loss_func(weights):
            final_prediction = 0
            for weight, prediction in zip(weights, predictions):
                final_prediction += weight * prediction

            return log_loss(y_test, final_prediction)

        # Fit on train set
        self._log("Fitting on train subset...")

        for label, clf in self.estimators:
            self._log("fitting {0}...".format(label), 1)
            fitted_clf = clone(clf).fit(X_train, y_train)
            fitted_estimators.append((label, fitted_clf))

        # Predict on test set
        self._log("Predict on test subset...")

        for label, clf in fitted_estimators:
            self._log("predict using {0}...".format(label), 1)
            predictions.append(clf.predict_proba(X_test))

        # Search weights
        self._log("Searching weights...")

        cons = ({"type": "eq", "fun": lambda w: 1 - sum(w)})
        bounds = [(0,1)]*len(predictions)
        res = minimize(
            log_loss_func,
            self.starting_weights,
            method = "SLSQP",
            bounds = bounds,
            constraints = cons
        )

        self.weights_ = list(res["x"])
        self.peak_score_ = res["fun"]

        self._log("Best weights: {0}".format(self.weights_), 1)
        self._log("Peak score: {0}".format(self.peak_score_), 1)

        # Build voting classifier
        self.best_estimator_ = VotingClassifier(
            estimators = self.estimators,
            voting = "soft",
            weights = self.weights_
        )

        if self.refit:
            self._log("Refitting using best weights...")
            self.best_estimator_.fit(X, y)
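        # NOTE: with refit=False the composed VotingClassifier above is never
        # fitted, so predict/predict_proba/transform require refit=True (or a
        # manual best_estimator_.fit) before use.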

        return self

    def predict(self, X):
        return self.best_estimator_.predict(X)

    def predict_proba(self, X):
        return self.best_estimator_.predict_proba(X)

    def transform(self, X):
        return self.best_estimator_.transform(X)
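
A minimal usage sketch for the searcher above (the member estimators and synthetic data are illustrative assumptions, not part of the original):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=400, random_state=0)
searcher = VotingWeightSearchCV(
    estimators=[('lr', LogisticRegression(max_iter=1000)), ('gnb', GaussianNB())],
    verbose=1, random_state=0, refit=True)
searcher.fit(X, y)
print(searcher.weights_)      # optimized soft-voting weights (sum to 1)
print(searcher.peak_score_)   # held-out log-loss at the optimum
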