Esempio n. 1
0
def main(directory, tools_directory, non_tools_dir):
    """Classify the publications in ``directory`` and copy them into two folders.

    A voting ensemble built from the first three entries of the module-level
    ``classifier_list`` is trained on the data returned by ``fetch_from_file``
    and applied to the test set returned by ``get_test_set(directory)``.
    Files with a truthy prediction are copied to ``tools_directory``; all
    others go to ``non_tools_dir``.

    Both output directories are deleted (if present) and recreated, so any
    previous content is lost.

    Exits the process with status 1 when ``directory`` is ``None`` or is not
    an existing directory.
    """
    global path
    path = sys.path[0]
    start = time.time()
    if directory is None or not os.path.isdir(directory):
        print("Please input directory containing pdf publications to classify")
        sys.exit(1)
    x_train, y_train = fetch_from_file()
    x_test, test_files = get_test_set(directory)
    # Just for testing, update machine learning part later

    x_train, x_test = normalize_scale(x_train, x_test)
    # Estimator names must be unique: the original labelled two of the three
    # members "second", which scikit-learn rejects / which makes the members
    # indistinguishable by name.
    classifier = VotingClassifier(
        [
            ("first", classifier_list[0]),
            ("second", classifier_list[1]),
            ("third", classifier_list[2]),
        ]
    )
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Start from empty output directories on every run.
    for output_dir in (tools_directory, non_tools_dir):
        if os.path.isdir(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir)

    # NOTE(review): paths are built by plain string concatenation, so the
    # directory arguments presumably must end with a path separator - confirm
    # against the callers and the names returned by get_test_set.
    for num, pub in zip(y_pred, test_files):
        destination = tools_directory if num else non_tools_dir
        shutil.copy2(directory + pub, destination + pub)

    print("Classification:    Seconds taken: " + str(time.time() - start))
Esempio n. 2
0
def voting_fit(X, y, RESULT_TEST_PATH, RESULT_PATH):
    """Fit a soft-voting ensemble and write its predictions to a CSV file.

    The individual models come from the ``fit_*`` helpers (called in the same
    order as before).  The test features are read from ``RESULT_TEST_PATH``;
    the columns ``PassengerId`` and ``Survived`` are written to
    ``RESULT_PATH``.
    """
    tuned_ada = fit_adaboost(X, y)
    tuned_extra = fit_extratree(X, y)
    tuned_rf = fit_rf(X, y)
    tuned_gbdt = fit_xgboost(X, y)
    # The SVC result is not part of the ensemble, but the helper is still run.
    fit_svc(X, y)
    tuned_lr = fit_lr(X, y)

    members = [
        ('rfc', tuned_rf),
        ('extc', tuned_extra),
        ('lr', tuned_lr),
        ('adac', tuned_ada),
        ('gbc', tuned_gbdt),
    ]
    ensemble = VotingClassifier(estimators=members, voting='soft', n_jobs=4)
    ensemble.fit(X, y)

    frame = pd.read_csv(RESULT_TEST_PATH)
    predictions = ensemble.predict(np.array(frame))
    # Append the predictions as the last column.
    frame.insert(frame.columns.size, 'Survived', predictions)

    frame = frame[['PassengerId', 'Survived']]
    frame['PassengerId'] = frame['PassengerId'].apply(np.int64)
    frame.to_csv(RESULT_PATH, index=False)
    print("finish!")
Esempio n. 3
0
 def _voting(estimators, **kwargs):
     """Wrap ``estimators`` in a single-job VotingClassifier.

     Each estimator is registered under its ``shStr`` attribute.  The returned
     object gets ``lgStr`` / ``shStr`` attributes made of the members'
     respective strings joined with ' + '.
     """
     separator = ' + '
     members = [(est.shStr, est) for est in estimators]
     ensemble = VotingClassifier(members, n_jobs=1, **kwargs)
     ensemble.lgStr = separator.join(est.lgStr for est in estimators)
     ensemble.shStr = separator.join(est.shStr for est in estimators)
     return ensemble
def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset.

    Every base classifier and both the hard- and soft-voting ensembles must
    reproduce the training labels exactly.
    """
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array([[-1.1, -1.5],
                  [-1.2, -1.4],
                  [-3.4, -2.2],
                  [1.1, 1.2],
                  [2.1, 1.4],
                  [3.1, 2.3]])

    y = np.array([1, 1, 1, 2, 2, 2])
    expected = [1, 1, 1, 2, 2, 2]

    # Compare the label sequences themselves.  The previous
    # ``all(pred) == all(expected)`` reduced both sides to ``True`` and thus
    # passed for any prediction made of non-zero labels.
    for clf in (clf1, clf2, clf3):
        assert_equal(clf.fit(X, y).predict(X).tolist(), expected)

    for voting in ('hard', 'soft'):
        eclf = VotingClassifier(estimators=[
                                ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                                voting=voting,
                                weights=[1, 1, 1])
        assert_equal(eclf.fit(X, y).predict(X).tolist(), expected)
Esempio n. 5
0
 def predict(self,X_test):
     '''
     Predict the class of each sample with the two-stage ensemble.

     Stage one: every classifier in ``self.stage_one_clfs`` predicts on
     ``X_test``.  With ``use_append`` true the predictions are appended to
     ``X_test`` as extra columns; with ``use_append`` false the predictions
     alone form the stage-two input.

     Stage two: a hard-voting VotingClassifier over ``self.stage_two_clfs``
     (weighted by ``self.weights``) predicts on the stage-two input.
     '''
     append_mode = self.use_append == True
     replace_mode = self.use_append == False

     if append_mode:
         self.__X_test = X_test
     elif replace_mode:
         stage_one_columns = []

     # first stage
     for entry in self.stage_one_clfs:
         stage_one_pred = entry[1].predict(X_test)
         # turn the flat prediction vector into a single column
         column = np.reshape(stage_one_pred, (len(stage_one_pred), 1))
         if append_mode:
             self.__X_test = np.hstack((self.__X_test, column))
         elif replace_mode:
             stage_one_columns.append(column)

     if replace_mode:
         self.__X_test = np.array(stage_one_columns).T[0]

     # second stage
     # NOTE(review): the VotingClassifier is created here and used without a
     # visible fit() call - confirm how it is expected to be fitted.
     majority_voting = VotingClassifier(estimators=self.stage_two_clfs, voting="hard", weights=self.weights)
     return majority_voting.predict(self.__X_test)
    def process_cell(self, df_cell_train, df_cell_test, window):
        """Train a KNN + random-forest soft-voting model on one cell.

        Training rows whose ``place_id`` occurs fewer than ``th`` times
        (module-level threshold) are dropped.  Returns the three most probable
        ``place_id`` values for every test row, best first, together with the
        test row index.  ``window`` is not used by this method.
        """
        # Keep only places that are frequent enough in this cell.
        counts_per_place = df_cell_train.place_id.value_counts()
        frequent = (counts_per_place[df_cell_train.place_id.values] >= th).values
        df_cell_train = df_cell_train.loc[frequent]

        test_row_ids = df_cell_test.index

        # Encode the labels and build integer feature matrices.
        encoder = LabelEncoder()
        train_labels = encoder.fit_transform(df_cell_train.place_id.values)
        train_features = df_cell_train.drop(['place_id', ], axis=1).values.astype(int)
        test_features = df_cell_test.values.astype(int)

        knn = KNeighborsClassifier(n_neighbors=50, weights='distance',
                                   metric='manhattan')
        forest = RandomForestClassifier(n_estimators=50, n_jobs=-1)
        ensemble = VotingClassifier(estimators=[('knn', knn), ('rf', forest)], voting='soft')
        ensemble.fit(train_features, train_labels)

        probabilities = ensemble.predict_proba(test_features)
        # Sort classes by descending probability and keep the top three.
        top_three = np.argsort(probabilities, axis=1)[:, ::-1][:, :3]
        return encoder.inverse_transform(top_three), test_row_ids
Esempio n. 7
0
def test_transform():
    """Check the output shape of VotingClassifier.transform on a toy dataset.

    The default and ``flatten_transform=True`` must both give a (4, 6) array;
    ``flatten_transform=False`` gives (3, 4, 2), which matches the flat result
    once its first two axes are swapped and it is reshaped.
    """
    lr = LogisticRegression(random_state=123)
    rf = RandomForestClassifier(random_state=123)
    gnb = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    def fitted_soft_vote(**options):
        # Fresh estimator list per ensemble, as in three separate constructions.
        return VotingClassifier(
            estimators=[('lr', lr), ('rf', rf), ('gnb', gnb)],
            voting='soft',
            **options).fit(X, y)

    default_vote = fitted_soft_vote()
    flat_vote = fitted_soft_vote(flatten_transform=True)
    stacked_vote = fitted_soft_vote(flatten_transform=False)

    assert_array_equal(default_vote.transform(X).shape, (4, 6))
    assert_array_equal(flat_vote.transform(X).shape, (4, 6))
    assert_array_equal(stacked_vote.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(default_vote.transform(X),
                              flat_vote.transform(X))
    restacked = stacked_vote.transform(X).swapaxes(0, 1).reshape((4, 6))
    assert_array_almost_equal(restacked, flat_vote.transform(X))
def test_tie_situation():
    """Check that hard voting picks the smaller class label when votes tie.

    For sample 73 of the module-level ``X`` / ``y`` the two base classifiers
    disagree (2 vs 1); the ensemble is expected to answer 1.
    """
    lr = LogisticRegression(random_state=123)
    rf = RandomForestClassifier(random_state=123)
    vote = VotingClassifier(estimators=[("lr", lr), ("rf", rf)], voting="hard")
    tie_index = 73
    for model, expected_label in ((lr, 2), (rf, 1), (vote, 1)):
        assert_equal(model.fit(X, y).predict(X)[tie_index], expected_label)
Esempio n. 9
0
def process_one_cell(df_train, df_test, x_min, x_max, y_min, y_max):
    """Train and apply a voting model on one rectangular grid cell.

    Training rows are taken from the cell enlarged by a fixed border; test
    rows are taken from the exact cell.  Training rows whose ``place_id``
    occurs fewer than ``th`` times (module-level threshold) are dropped and
    the ``x`` / ``y`` columns are scaled by the module-level weights
    ``fw[0]`` / ``fw[1]``.

    Returns ``(pred_labels, row_ids)`` where ``pred_labels`` holds the three
    most probable ``place_id`` values per test row (best first) and
    ``row_ids`` is the index of the test rows.  Returns ``(None, None)`` when
    the cell has no training rows or no test rows.
    """
    x_border_augment = 0.025
    y_border_augment = 0.0125

    # Working on df_train: cell plus border, then drop rare places.
    df_cell_train = df_train[(df_train['x'] >= x_min-x_border_augment) & (df_train['x'] < x_max+x_border_augment) &
                             (df_train['y'] >= y_min-y_border_augment) & (df_train['y'] < y_max+y_border_augment)]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    # Explicit copies so the in-place scaling below acts on independent
    # frames instead of on slices of the inputs.
    df_cell_train = df_cell_train.loc[mask].copy()

    # Working on df_test: exact cell, no border.
    df_cell_test = df_test[(df_test['x'] >= x_min) & (df_test['x'] < x_max) &
                           (df_test['y'] >= y_min) & (df_test['y'] < y_max)].copy()
    row_ids = df_cell_test.index

    if len(df_cell_train) == 0 or len(df_cell_test) == 0:
        return None, None

    # Feature engineering on x and y
    df_cell_train.loc[:, 'x'] *= fw[0]
    df_cell_train.loc[:, 'y'] *= fw[1]
    df_cell_test.loc[:, 'x'] *= fw[0]
    df_cell_test.loc[:, 'y'] *= fw[1]

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values.astype(float)

    # The test frame may or may not carry the label column.
    if 'place_id' in df_cell_test.columns:
        cols = df_cell_test.columns.drop('place_id')
        X_test = df_cell_test[cols].values.astype(float)
    else:
        X_test = df_cell_test.values.astype(float)

    # Applying the classifier
    clf1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=26, weights='distance',
                                                  metric='manhattan'), n_jobs=-1, n_estimators=50)
    clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Soft voting is required: predict_proba is not available on a
    # VotingClassifier configured with voting='hard', so the original
    # combination failed with AttributeError.
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='soft')

    eclf.fit(X, y)
    y_pred = eclf.predict_proba(X_test)
    # Classes sorted by descending probability; keep the best three.
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])

    return pred_labels, row_ids
def test_tie_situation():
    """Check that hard voting picks the smaller class label when votes tie.

    For sample 73 of the module-level ``X`` / ``y`` the two base classifiers
    disagree (2 vs 1); the ensemble is expected to answer 1.
    """
    lr = LogisticRegression(random_state=123, solver='liblinear')
    rf = RandomForestClassifier(random_state=123)
    vote = VotingClassifier(estimators=[('lr', lr), ('rf', rf)],
                            voting='hard')
    tie_index = 73
    for model, expected_label in ((lr, 2), (rf, 1), (vote, 1)):
        assert_equal(model.fit(X, y).predict(X)[tie_index], expected_label)
Esempio n. 11
0
def classify():
    train_X,Y = load_svmlight_file('data/train_last')
    test_X,test_Y = load_svmlight_file('data/test_last')
    train_X = train_X.toarray()
    test_X = test_X.toarray()
    Y = [int(y) for y in Y]
    # print 'Y:',len(Y)
    rows = pd.read_csv('data/log_test2.csv',index_col=0).sort_index().index.unique()
    train_n = train_X.shape[0]
    m = train_X.shape[1]
    test_n = test_X.shape[0]
    print train_n,m,#test_n
     # 先用训练集训练出所有的分类器
    print 'train classify...'
    clf1 = LinearDiscriminantAnalysis()
    clf2 = GaussianNB()
    clf3 = LogisticRegression()
    clf4 = RandomForestClassifier()
    clf5 = KNeighborsClassifier(n_neighbors=12)
    clf6 = AdaBoostClassifier()
    # x_train,x_test,y_train,y_test = train_test_split(train_X,Y,test_size=0.2) # 对训练集进行划分

    # print x_train.shape
    # print x_test.shape
    # clf.fit(train_X,Y)
    clf = VotingClassifier(estimators=[('la',clf1),('nb',clf2),('lr',clf3),('rf',clf4),('nn',clf5),('ac',clf6)], voting='soft', weights=[1.5,1,1,1,1,1])
    # clf1.fit(x_train,y_train)
    # clf2.fit(x_train,y_train)
    # clf3.fit(x_train,y_train)
    # clf4.fit(x_train,y_train)
    clf.fit(train_X,Y)
    print 'end train classify'

    print 'start classify....'
    # print metrics.classification_report(Y,predict_Y)
    # clf2.fit(train_X,Y)
    # print 'clf2 fited...'
    # clf3.fit(train_X,Y)
    # print 'clf3 fited...'
    # clf4.fit(train_X,Y)
    # print 'clf4 fited...'
    # clf1.fit(train_X,Y)
    # print 'clf1 fited...'
    # 第一个分类结果
    predict_Y = clf.predict(train_X)
    # predict_Y = clf.predict(train_X)
    print 'classify result:'
    print metrics.classification_report(Y,predict_Y)

    predict_Y = clf.predict(test_X)
    # print predict_Y,len(predict_Y)
    print 'end classify...'
    # predict_Y = clf.predict(X[cnt_train:]) # 训练注释这一行,输出测试集打开这一行,注释之后的print metric
    # predict_Y = clf.predict(test_X) # 训练注释这一行,输出测试集打开这一行,注释之后的print metric
    DataFrame(predict_Y,index=rows).to_csv('data/info_test2.csv', header=False)
def test_multilabel():
    """Fit a hard-voting ensemble on multilabel data.

    A NotImplementedError raised by ``fit`` is tolerated; the test does not
    fail when no error is raised.
    """
    X, y = make_multilabel_classification(n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123)
    one_vs_rest = OneVsRestClassifier(SVC(kernel="linear"))
    vote = VotingClassifier(estimators=[("ovr", one_vs_rest)], voting="hard")

    try:
        vote.fit(X, y)
    except NotImplementedError:
        # Multilabel input is not supported; nothing more to check.
        pass
def test_predict_for_hard_voting():
    # Hard voting must cope with a member that returns non-integer (float)
    # predictions; the test only checks that fit and predict run through.
    members = [
        ('fsvc', FaultySVC(random_state=123)),
        ('gnb', GaussianNB()),
        ('svc', SVC(probability=True, random_state=123)),
    ]
    vote = VotingClassifier(estimators=members, weights=[1, 2, 3],
                            voting='hard')

    vote.fit(X, y)
    vote.predict(X)
Esempio n. 14
0
    def train(self):
        """Fit one VotingClassifier per xy bin and dump it to disk.

        For every bin id in ``self.xy_bins`` (ascending) the rows of
        ``self.df`` in that bin are used to fit a VotingClassifier built from
        ``self.models``; the fitted model is written with joblib to the path
        produced by ``xybins_file_name_str.format(bin_id)``.
        """
        for bin_id in sorted(self.xy_bins):
            target_path = xybins_file_name_str.format(bin_id)
            print('Training model: {} of {}'.format(bin_id, max(self.xy_bins)))
            frame = self.df
            in_bin = frame[frame.xy_bin == bin_id]
            features = in_bin[self.features]
            labels = in_bin.place_id

            voter = VotingClassifier(self.models)
            voter.fit(features, labels)
            joblib.dump(voter, target_path, compress=3)
def test_sample_weight_kwargs():
    """Check that VotingClassifier hands sample_weight to fit as a keyword."""
    class MockClassifier(BaseEstimator, ClassifierMixin):
        """Stub whose fit asserts that it received a 'sample_weight' keyword."""
        def fit(self, X, y, *args, **sample_weight):
            assert_true('sample_weight' in sample_weight)

    vote = VotingClassifier(estimators=[('mock', MockClassifier())], voting='soft')

    # Should not raise an error.
    uniform_weights = np.ones((len(y),))
    vote.fit(X, y, sample_weight=uniform_weights)
Esempio n. 16
0
def main(argv):
    """Train one-vs-rest soft-voting ensembles and write test predictions.

    Reads 'trainingData.txt' / 'trainingTruth.txt' and 'testData.txt'
    (tab-separated, last column of the data files dropped), fits one binary
    ensemble per class (labels 1..3) on standardised, PCA-reduced features,
    and writes the per-class probabilities plus the arg-max prediction to
    'testY.txt'.  ``argv`` is not used.
    """
    trainX = pd.read_csv('trainingData.txt', sep='\t', header=None)
    trainX.drop(trainX.columns[len(trainX.columns)-1], axis=1, inplace=True)
    trainY = pd.read_csv("trainingTruth.txt", header=None, names=['Y'])
    df = trainX.join(trainY)
    # Keep rows with at most two missing values, then impute column medians.
    df = df[df.isnull().sum(axis=1) <= 2]
    df = df.fillna(df.median())
    print(len(df))
    X = df.iloc[:, 0:-1].values
    Y = df['Y'].values

    # One +1/-1 target column per class label 1, 2, 3.
    Y_binary = np.ones((len(Y), 3)) * (-1)
    for i in range(3):
        Y_binary[Y == (i+1), i] = 1

    # Fit the scaler and the PCA on the training data only; the same fitted
    # transforms are applied to the test data below.  Fitting them again on
    # the test set (as before) puts train and test into different feature
    # spaces, so the classifiers would see inconsistent inputs.
    scaler = preprocessing.StandardScaler().fit(X)
    pca = PCA(n_components=30).fit(scaler.transform(X))
    X_PCA = pca.transform(scaler.transform(X))

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1, n_estimators=20)
    clf3 = GaussianNB()

    clf4 = DecisionTreeClassifier(max_depth=4)
    clf5 = KNeighborsClassifier(n_neighbors=7)
    clf6 = SVC(kernel='rbf', probability=True)

    testX = pd.read_csv('testData.txt', sep='\t', header=None)
    testX.drop(testX.columns[len(testX.columns)-1], axis=1, inplace=True)
    testX.fillna(testX.median(), inplace=True)  # Handle NA in test data, although not necessary for this assignment.

    testX_PCA = pca.transform(scaler.transform(testX.values))

    proba = np.zeros((len(testX), 3))
    for i in range(3):
        eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3),
                                            ('dt', clf4), ('kn', clf5), ('svc', clf6)],
                                voting='soft').fit(X_PCA, Y_binary[:, i])

        # Column 1 is the probability of the positive (+1) target.
        proba[:, i] = eclf.predict_proba(testX_PCA)[:, 1]

    # Write to file
    results = pd.DataFrame(proba)
    results['prediction'] = np.argmax(proba, axis=1) + 1
    results.to_csv('testY.txt', sep='\t', header=False, index=False)

    print(results.iloc[0:10, :])
Esempio n. 17
0
File: util.py Progetto: pvigier/sa
def train_classifier(algorithm, features, train):
    """Fit a soft-voting classifier on ``features`` against train['sentiment'].

    ``algorithm`` selects the members: every key among 'rf', 'lr' and 'mb'
    contained in it adds a random forest, a logistic regression or a
    multinomial naive Bayes model respectively.  Returns the fitted ensemble.
    """
    print('Train classifier ({})...'.format(algorithm))
    available = (
        ('rf', lambda: RandomForestClassifier(n_estimators=100)),
        ('lr', lambda: LogisticRegression()),
        ('mb', lambda: MultinomialNB()),
    )
    estimators = [(key, build()) for key, build in available if key in algorithm]
    # Training
    classifier = VotingClassifier(estimators=estimators, voting='soft')
    classifier.fit(features, train['sentiment'])
    return classifier
def test_predict_proba_on_toy_problem():
    """Compare soft-voting probabilities with a hand-computed weighted mean.

    Also checks that ``predict_proba`` on a hard-voting ensemble raises
    AttributeError.
    """
    lr = LogisticRegression(random_state=123)
    rf = RandomForestClassifier(random_state=123)
    gnb = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    # Reference probabilities of the three members.
    lr_proba = np.array([[0.59790391, 0.40209609],
                         [0.57622162, 0.42377838],
                         [0.50728456, 0.49271544],
                         [0.40241774, 0.59758226]])

    rf_proba = np.array([[0.8, 0.2],
                         [0.8, 0.2],
                         [0.2, 0.8],
                         [0.3, 0.7]])

    gnb_proba = np.array([[0.9985082, 0.0014918],
                          [0.99845843, 0.00154157],
                          [0., 1.],
                          [0., 1.]])

    def weighted_mean(row, col):
        # Weights 2, 1, 1 -> divide by their sum.
        return (2*lr_proba[row][col] + rf_proba[row][col] + gnb_proba[row][col]) / 4

    checked_cells = [(0, 0), (1, 1), (2, 1), (3, 1)]
    expected = [weighted_mean(row, col) for row, col in checked_cells]

    soft_vote = VotingClassifier(estimators=[
                                 ('lr', lr), ('rf', rf), ('gnb', gnb)],
                                 voting='soft',
                                 weights=[2, 1, 1])
    soft_proba = soft_vote.fit(X, y).predict_proba(X)

    for (row, col), target in zip(checked_cells, expected):
        assert_almost_equal(target, soft_proba[row][col], decimal=1)

    attribute_error_raised = False
    try:
        hard_vote = VotingClassifier(estimators=[
                                     ('lr', lr), ('rf', rf), ('gnb', gnb)],
                                     voting='hard')
        hard_vote.fit(X, y).predict_proba(X)
    except AttributeError:
        attribute_error_raised = True

    if not attribute_error_raised:
        raise AssertionError('AttributeError for voting == "hard"'
                             ' and with predict_proba not raised')
Esempio n. 19
0
def run_voting(training_set, train_set_labels, validation_set, validation_set_labels):
    """Fit a hard-voting KNN / logistic-regression / SVM ensemble and print its
    accuracy on the validation set.

    Both input sets are passed through ``standard_data`` first; the label
    arrays are flattened with ``ravel()``.
    """
    from sklearn.ensemble import VotingClassifier
    train_inputs = standard_data(training_set)
    valid_inputs = standard_data(validation_set)

    knn = KNeighborsClassifier(weights='uniform', n_neighbors=5)
    log_reg = sklearn.linear_model.LogisticRegression(
        penalty='l2', dual=False, tol=0.01, C=1.0, fit_intercept=True,
        intercept_scaling=1, class_weight=None, random_state=None, solver='newton-cg',
        max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=2)
    support_vectors = svm.SVC(decision_function_shape='ovo', tol=0.001)

    members = [('knn', knn), ('lr', log_reg), ('svm', support_vectors)]
    ensemble = VotingClassifier(estimators=members, voting='hard')
    ensemble.fit(train_inputs, train_set_labels.ravel())

    print(ensemble.score(valid_inputs, validation_set_labels.ravel()))
Esempio n. 20
0
    def classifier(self, scoring, cv, eval_using):
        """Tune, combine and evaluate SGD-based classifiers by hard voting.

        For every entry of ``scoring`` the base classifiers and the bagging /
        AdaBoost meta-classifiers are tuned with ``param_tuner``, combined in
        a hard-voting ensemble fitted on the training data, and evaluated on
        the test data with ``report(*eval_using, ...)``; the results are
        printed.

        Parameters
        ----------
        scoring : iterable
            Scores handed one at a time to ``param_tuner``.
        cv
            Cross-validation setting forwarded to ``param_tuner``.
        eval_using : iterable
            Positional arguments forwarded to ``report``.
        """
        adaclf = AdaBoostClassifier(algorithm='SAMME')
        # Standardise with statistics learned from the training data only and
        # apply the same transformation to the test data.  Fitting a second
        # scaler on the test set (as before) shifts the test features
        # relative to what the classifiers were trained on.
        scaler = StandardScaler().fit(self.xtr)
        xtr = scaler.transform(self.xtr)
        xte = scaler.transform(self.xte)

        # iterate over each grid score for param tuner
        for score in scoring:

            print('Tuning parameters of inital classifiers...')
            passive_params = param_tuner(PassiveAggressiveClassifier(),
                                         score=score, cv=cv, xtr=xtr,
                                         ytr=self.ytr)
            passclf = PassiveAggressiveClassifier().set_params(**passive_params)
            sgd_params = param_tuner(SGDClassifier(), score=score, cv=cv,
                                     xtr=xtr, ytr=self.ytr)
            sgdclf = SGDClassifier().set_params(**sgd_params)

            # cant use resampling/bagging with passive aggressive classifier
            # will raise ValueError: The number of class labels must be > 1
            # since resampling may results in training sets with 1 class.

            print('\n'+'Tuning meta-classifiers with tuned classifier/s...')
            bagsgd_params = param_tuner(BaggingClassifier(sgdclf),
                                        score=score, cv=cv, xtr=xtr,
                                        ytr=self.ytr)
            bg_sgdclf = BaggingClassifier(sgdclf).set_params(**bagsgd_params)

            # NOTE: adaclf is a single object shared by all iterations;
            # set_params mutates it in place and returns it.
            adasgd_params = param_tuner(adaclf.set_params(base_estimator=sgdclf),
                                        score=score, cv=cv, xtr=xtr,
                                        ytr=self.ytr)
            ada_sgdclf = adaclf.set_params(**adasgd_params)

            print('Voting on meta-classifiers/classifiers then predicting...')
            vote = VotingClassifier(estimators=[('BagSGD', bg_sgdclf),
                                                ('adaboostSGD', ada_sgdclf),
                                                ('Passive', passclf)],
                                    voting='hard').fit(xtr, self.ytr)

            # Only the prediction step is timed.
            start = time.time()
            y_true, y_pred = self.yte, vote.predict(xte)
            print('\n' + '-'*5, 'FINAL PREDICTION RESULTS','-'*5 +'\n',
                  '{0:.4f}'.format(time.time()-start)+'--prediction time(secs)')

            clf_evaluation = report(*eval_using, y_true=y_true, y_pred=y_pred)
            for reports in clf_evaluation:
                print('---',reports)
                print(clf_evaluation[reports])
def test_estimator_weights_format():
    # Weights given as a list and as a numpy array must yield identical
    # soft-voting probabilities.
    lr = LogisticRegression(random_state=123)
    rf = RandomForestClassifier(random_state=123)

    def soft_vote(weights):
        return VotingClassifier(estimators=[("lr", lr), ("rf", rf)], weights=weights, voting="soft")

    from_list = soft_vote([1, 2])
    from_array = soft_vote(np.array((1, 2)))
    from_list.fit(X, y)
    from_array.fit(X, y)
    assert_array_equal(from_list.predict_proba(X), from_array.predict_proba(X))
Esempio n. 22
0
def do_ml(ticker):
    """Train a voting classifier for ``ticker`` and return its test accuracy.

    The feature sets come from ``extract_featuresets``; 25% of the samples
    are held out for scoring.  The accuracy and the distribution of the
    predicted labels are printed.
    """
    X, y, df = extract_featuresets(ticker)
    split = cross_validation.train_test_split(X, y, test_size=0.25)
    X_train, X_test, y_train, y_test = split

    voters = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    clf = VotingClassifier(voters)
    clf.fit(X_train, y_train)

    confidence = clf.score(X_test, y_test)
    print('Accuracy', confidence)
    print('Predicted spread:', Counter(clf.predict(X_test)))

    return confidence
Esempio n. 23
0
def train_assembling_average(categories, comments, badwords):
    """Fit a soft-voting ensemble of two text pipelines.

    One pipeline uses TF-IDF word n-grams (1 to 3), the other a
    ``CustomTransformer`` built from ``badwords``; both end in the same
    logistic-loss SGD classifier.  The TF-IDF pipeline gets weight 3, the
    custom one weight 1.  Returns the ensemble fitted on
    ``comments`` / ``categories``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import SGDClassifier
    from sklearn.ensemble import VotingClassifier

    def make_sgd():
        # Identical settings for both pipelines.
        return SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)

    tfidf = TfidfVectorizer(lowercase=True, ngram_range=(1, 3), analyzer="word", min_df=3)
    text_pipeline = Pipeline([('vect', tfidf), ('clf', make_sgd())])

    badword_features = CustomTransformer(badwords)
    custom_pipeline = Pipeline([('vect', badword_features), ('clf', make_sgd())])

    ensemble = VotingClassifier(estimators=[('text', text_pipeline), ('custom', custom_pipeline)],
                                voting='soft', weights=[3,1])
    return ensemble.fit(comments, categories)
def createVotingClassifier(n_trees,X,y,depth,min_saples=2,max_feat=0.2,overhead=2.0,voting_='soft'):
    """Build a VotingClassifier from random forests fitted on data chunks.

    ``n_trees`` forests are created.  Forest ``i`` is fitted on the ``i``-th
    consecutive chunk of ``int(overhead*len(X)/n_trees)`` samples; once the
    chunks run past the end of the data, ``X`` and ``y`` are shuffled and the
    first chunk of the shuffled data is used instead.

    The forests are attached to the returned VotingClassifier as its
    ``estimators_`` without calling ``fit`` on the ensemble.

    NOTE(review): only ``estimators_`` is set here; presumably the caller
    provides any further fitted attributes the ensemble needs for predict -
    confirm.
    """
    N_data = int(overhead*len(X)/n_trees)
    print(str(N_data)+' will be used by classifier')
    named_estimators = []
    fitted_estimators = []
    for i in range(n_trees):
        clf = RandomForestClassifier(max_depth=depth,min_samples_leaf=min_saples,max_features=max_feat)
        # Indentation normalised to spaces: the mixed tabs/spaces of the
        # original are a TabError on Python 3.
        chunk_start, chunk_stop = i*N_data, (i+1)*N_data
        if chunk_stop < len(X):
            clf.fit(X[chunk_start:chunk_stop], y[chunk_start:chunk_stop])
        else:
            X, y = shuffle(X, y)
            clf.fit(X[:N_data], y[:N_data])
        named_estimators.append((str(i), clf))
        fitted_estimators.append(clf)
    tmp = VotingClassifier(estimators=named_estimators, voting=voting_)
    tmp.estimators_ = fitted_estimators
    return tmp
def test_transform():
    """Check the output shape of VotingClassifier.transform on a toy dataset.

    With the default ``flatten_transform`` a DeprecationWarning is expected
    and the result has shape (3, 4, 2), like ``flatten_transform=False``;
    ``flatten_transform=True`` gives (4, 6).
    """
    lr = LogisticRegression(random_state=123)
    rf = RandomForestClassifier(random_state=123)
    gnb = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    def fitted_soft_vote(**options):
        # Fresh estimator list per ensemble, as in three separate constructions.
        return VotingClassifier(
            estimators=[('lr', lr), ('rf', rf), ('gnb', gnb)],
            voting='soft',
            **options).fit(X, y)

    default_vote = fitted_soft_vote()
    flat_vote = fitted_soft_vote(flatten_transform=True)
    stacked_vote = fitted_soft_vote(flatten_transform=False)

    warn_msg = ("'flatten_transform' default value will be "
                "changed to True in 0.21. "
                "To silence this warning you may"
                " explicitly set flatten_transform=False.")
    default_result = assert_warns_message(DeprecationWarning, warn_msg,
                                          default_vote.transform, X)
    assert_array_equal(default_result.shape, (3, 4, 2))
    assert_array_equal(flat_vote.transform(X).shape, (4, 6))
    assert_array_equal(stacked_vote.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(default_result.swapaxes(0, 1).reshape((4, 6)),
                              flat_vote.transform(X))
    restacked = stacked_vote.transform(X).swapaxes(0, 1).reshape((4, 6))
    assert_array_almost_equal(restacked, flat_vote.transform(X))
Esempio n. 26
0
def combine_voting_NB_classifier(X_train, X_test, y_train, y_test,X_train_meta, X_test_meta, y_train_meta, y_test_meta):
    """Fit a hard-voting ensemble on the meta features and report on it.

    The ensemble consists of a Bernoulli naive Bayes model and a nearest
    centroid model (an SVC is fitted as well but is not part of the vote).
    Predictions for ``X_test_meta`` are saved to 'oto_wyniki.csv' and a
    classification report against ``y_test_meta`` is printed.  The non-meta
    arguments are not used.
    """
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import NearestCentroid
    from sklearn.ensemble import VotingClassifier
    from sklearn.svm import SVC

    bernoulli = BernoulliNB(alpha = 0.10000000000000001).fit(X_train_meta, y_train_meta)
    # Fitted as before; its result is not used by the ensemble below.
    SVC(C=100, gamma=0.1).fit(X_train_meta, y_train_meta)
    centroid = NearestCentroid().fit(X_train_meta, y_train_meta)

    ensemble = VotingClassifier(estimators=[('nb1', bernoulli),('nb2', centroid)], voting='hard')
    ensemble = ensemble.fit(X_train_meta, y_train_meta)
    y_voting_predicted = ensemble.predict(X_test_meta)

    np.savetxt('oto_wyniki.csv',y_voting_predicted, delimiter=',')
    print("\n Here is the classification report for Voting classifier:")
    print(metrics.classification_report(y_test_meta, y_voting_predicted))
def vclas(w1,w2,w3, w4, w5):
    """Plot training and test scores of five classifiers and their ensemble.

    The module-level ``trainX`` / ``trainY`` are split 60/40.  ``w1`` to
    ``w5`` are the soft-voting weights of logistic regression, Gaussian naive
    Bayes, random forest, extra trees and gradient boosting, in that order.
    """
    Xtrain, Xtest, ytrain, ytest = cv.train_test_split(trainX, trainY, test_size=0.4)

    clf1 = LogisticRegression()
    clf2 = GaussianNB()
    clf3 = RandomForestClassifier(n_estimators=10, bootstrap=True)
    clf4 = ExtraTreesClassifier(n_estimators=10, bootstrap=True)
    clf5 = GradientBoostingClassifier(n_estimators=10)

    clfes = [clf1, clf2, clf3, clf4, clf5]

    eclf = VotingClassifier(estimators=[('lr', clf1), ('gnb', clf2), ('rf', clf3),('et',clf4), ('gb',clf5)],
                            voting='soft',
                            weights=[w1, w2, w3, w4, w5])

    # Fit the individual models first, the ensemble last.
    for model in clfes + [eclf]:
        model.fit(Xtrain, ytrain)

    N = 6
    ind = np.arange(N)
    width = 0.3
    fig, ax = plt.subplots()
    bar_style = dict(width=width, alpha=0.5)

    # Blue bar: training score, red bar: test score.
    for i, clf in enumerate(clfes):
        print(clf,i)
        p1 = ax.bar(i, clf.score(Xtrain, ytrain), color="blue", **bar_style)
        p2 = ax.bar(i + width, clf.score(Xtest, ytest), color="red", **bar_style)
    ax.bar(len(clfes) + width, eclf.score(Xtrain, ytrain), color="blue", **bar_style)
    ax.bar(len(clfes) + width * 2, eclf.score(Xtest, ytest), color="red", **bar_style)

    # Dashed line between the single models and the ensemble.
    plt.axvline(4.8, color='k', linestyle='dashed')
    ax.set_xticks(ind + width)
    tick_labels = ['LogisticRegression',
                   'GaussianNB',
                   'RandomForestClassifier',
                   'ExtraTrees',
                   'GradientBoosting',
                   'VotingClassifier']
    ax.set_xticklabels(tick_labels, rotation=40, ha='right')
    plt.title('Training and Test Score for Different Classifiers')
    plt.legend([p1[0], p2[0]], ['training', 'test'], loc='lower left')
    plt.show()
Esempio n. 28
0
def acc_VotingClassifier():
    """Cross-validate a weighted hard-voting ensemble and predict a validation set.

    Runs 10-fold cross-validation over the module-level ``X`` / ``y``,
    accumulating the accuracy and a 10x10 confusion matrix.  The ensemble
    from the last fold then predicts the validation set that matches the
    feature count of ``X`` (13: MFCC, 1000: FFT, 100: KPCA).

    Returns
    -------
    tuple
        ``(mean_accuracy, accuracy_from_confusion_matrix, valid_mfcc,
        valid_fft, valid_kpca)``.  The validation entries that do not apply
        to the current feature set are ``None``.
    """
    kf = KFold(900, n_folds=10,shuffle=True)
    acc = 0.0
    conf_mat = [[0 for i in range(10)] for j in range(10)]
    clf1 = GaussianNB()
    clf2 = RandomForestClassifier(n_estimators=20,max_features=None,class_weight="balanced_subsample")
    clf4 = LogisticRegression()
    eclf = VotingClassifier(estimators=[('gnb', clf1), ('rf', clf2),  ('lr', clf4)], voting='hard', weights=[1,3,3])
    for fold_number, (train_index, test_index) in enumerate(kf, 1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        eclf = eclf.fit(X_train, y_train)
        y_predict = eclf.predict(X_test)
        acc_loop = getAccuracy(y_predict,y_test)
        conf_mat = buildConfusionMatrix(conf_mat,y_predict,y_test)
        print("*** Accuracy*** for "+str(fold_number)+"th time: "+str(acc_loop))
        acc += acc_loop

    # Only one of the three validation predictions can be computed per run.
    # All three are returned, so each needs a default; without it the return
    # statement raised UnboundLocalError for the two unassigned names.
    valid_mfcc = valid_fft = valid_kpca = None

    # Checking if the data set is transformed into MFCC(13) or FFT(1000) or KPCA features(else)
    n_features = X.shape[1]
    if n_features == 13:
        print('In 13 features if')
        valid_mfcc = eclf.predict(validation_set_mfcc)
    elif n_features == 1000:
        print('In 1000 features elif')
        valid_fft = eclf.predict(validation_set_fft)
    elif n_features == 100:
        print('In KPCA features else')
        valid_kpca = eclf.predict(validation_set_kpca)
    acc = (acc/10.0)
    printConfusionMatrix(conf_mat)
    return acc, getAccuracyFromConfusion(conf_mat),valid_mfcc, valid_fft, valid_kpca
Esempio n. 29
0
 def fit_voting(self):
     """Build a weighted voting ensemble over externally stored model results.

     Each named model is wrapped in an ``ExternalModel`` pointing at its
     validation and test result files.  The voting weights are searched with
     ``basinhopping`` so that the number of correctly predicted validation
     labels is maximised; the first weight is fixed to 1 before all weights
     are normalised to sum to 1.

     Returns a ``(name, estimator)`` tuple; the estimator already carries
     ``le_`` and ``estimators_``.
     """
     voting = 'soft'
     # Further candidate models (svm / logreg variants) are currently disabled.
     names = [
         'word2vec_bayes',
         'cnn_word(embedding=google)',
         'rnn_word(embedding=google)',
     ]
     classifiers = []
     for name in names:
         result_files = {
             self.val_docs: os.path.join(self.data_dir, 'results/val/{}.json'.format(name)),
             self.test_docs: os.path.join(self.data_dir, 'results/test/{}.json'.format(name)),
         }
         classifiers.append(ExternalModel(result_files))

     per_model_scores = []
     for model in classifiers:
         model_scores = model.predict_proba(self.val_docs)
         if voting == 'hard':
             model_scores = Binarizer(1 / 3).transform(model_scores)
         per_model_scores.append(model_scores)
     stacked_scores = np.array(per_model_scores)
     first_scores = stacked_scores[0]
     other_scores = stacked_scores[1:]

     le = LabelEncoder().fit(self.classes_)
     val_label_indexes = le.transform(self.val_labels())
     n_free_weights = len(classifiers) - 1

     def negative_hits(w_):
         # Objective for the minimiser: minus the number of validation
         # documents whose arg-max class equals the true label.
         combined = first_scores + other_scores * w_.reshape((len(w_), 1, 1))
         predicted = np.argmax(combined.sum(axis=0), axis=1)
         return -(val_label_indexes == predicted).sum()

     # assume w_0=1 as w is invariant to scaling
     search = basinhopping(
         negative_hits, np.ones(n_free_weights), niter=1000,
         minimizer_kwargs=dict(method='L-BFGS-B', bounds=[(0, None)] * n_free_weights)
     )
     w = np.hstack([[1], search.x])
     w /= w.sum()
     logging.info('w: {}'.format(w))

     estimator = VotingClassifier(list(zip(names, classifiers)), voting=voting, weights=w)
     # Attach the fitted state by hand instead of calling fit().
     estimator.le_ = le
     estimator.estimators_ = classifiers
     return 'vote({})'.format(','.join(names)), estimator
Esempio n. 30
0
def all_classifer(X_train,y_train,X_test,y_test):
    """Train four tree ensembles and a soft-voting blend, scoring each on the test split.

    Every model is fitted on ``(X_train, y_train)`` and handed to the
    module-level ``scores`` helper together with its hard predictions and the
    second column of ``predict_proba`` for ``X_test``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Evaluation features.
        y_test: Evaluation labels.

    Returns:
        list: ``scores(...)`` results for the random forest, gradient boosting,
        extra trees, AdaBoost and voting models, in that order.
    """
    def fit_and_score(model, label):
        # Fit first so predict/predict_proba below see the trained model.
        model.fit(X_train, y_train)
        return scores(y_test, model.predict(X_test),
                      model.predict_proba(X_test)[:, 1], label)

    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    score1 = fit_and_score(rf, 'RT')

    # Fitted exactly once; a second identical fit only doubled the training time.
    gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.05)
    score2 = fit_and_score(gbc, 'gbc')

    # min_samples_split=2 is the smallest value scikit-learn accepts. A node
    # holding one sample can never be split, so this grows the same trees as
    # the former value of 1 while remaining valid on current releases.
    ets = ExtraTreesClassifier(n_estimators=100, max_depth=None,
                               min_samples_split=2, random_state=0)
    score3 = fit_and_score(ets, 'ets')

    ab = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=50, learning_rate=0.7)
    score5 = fit_and_score(ab, 'abboost')

    # Each member is weighted by the first element of its own score tuple.
    # NOTE(review): those scores come from the test split, so the ensemble's
    # test score is optimistically biased -- confirm this is acceptable.
    eclf = VotingClassifier(
        estimators=[('rf', rf), ('gd', gbc), ('ETs', ets), ('ab', ab)],
        voting='soft',
        weights=[score1[0], score2[0], score3[0], score5[0]])
    score7 = fit_and_score(eclf, 'voting')
    # Function-call form prints the same text on Python 2 and parses on Python 3.
    print(eclf)
    return [score1, score2, score3, score5, score7]
Esempio n. 31
0
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Loading some example data
# Only feature columns 0 and 2 are kept, so the data is two-dimensional and
# the decision regions can be drawn on a plane.
iris = datasets.load_iris()
X = iris.data[:, [0, 2]]
y = iris.target

# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
# probability=True enables predict_proba on the SVC, which soft voting relies on.
clf3 = SVC(kernel='rbf', probability=True)
# Soft-voting ensemble; the tree and the SVC count twice as much as k-NN.
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),
                                    ('svc', clf3)],
                        voting='soft',
                        weights=[2, 1, 2])

# The individual models are fitted as well as the ensemble, so all four are
# available as trained estimators afterwards.
clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
eclf.fit(X, y)

# Plotting decision regions
# Grid over both selected features, padded by 1 on each side, with step 0.1.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

# 2x2 grid of axes sharing x per column and y per row.
f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))
# Hold out 25% of each data set; the *_0 names are the 75% parts that get
# fitted below and the *_1 names are the 25% hold-outs.
reduce_train_arr_0, reduce_train_arr_1, answer_train_0, answer_train_1 = train_test_split(
    reduce_train_arr, answer_train, test_size=0.25, random_state=42)

test_zero_time_arr_0, test_zero_time_arr_1, answer_test_zero__0, answer_test_zero__1 = train_test_split(
    test_zero_time_arr, test_zero_time_answer, test_size=0.25, random_state=42)
"""__________________________Start_train_______________________________"""
# Models used below:
#   - a voting classifier over a 3700-tree random forest and a logistic regression
#   - a separate 3500-tree random forest regressor

rfc = RandomForestClassifier(n_estimators=3700,
                             random_state=100,
                             min_samples_leaf=7)
log = LogisticRegression(penalty='l2', C=15, random_state=0, max_iter=2200)
# No `voting` argument is given, so scikit-learn's default (hard voting) applies.
voiting = VotingClassifier(estimators=[
    ('rfc', rfc),
    ('log', log),
])
rf = RandomForestRegressor(n_estimators=3500,
                           random_state=100,
                           min_samples_leaf=5)
# Train the models: the regressor on the reduce_train split, the voting
# classifier on the test_zero_time split.
rf.fit(reduce_train_arr_0, answer_train_0)
voiting.fit(test_zero_time_arr_0, answer_test_zero__0)

# Continuous regressor output on the held-out part of the reduce_train data.
predictions_b = rf.predict(reduce_train_arr_1)

# OptimizedRounder is defined outside this snippet.
# NOTE(review): presumably it learns thresholds that map the continuous
# predictions onto discrete labels -- confirm against its definition.
optR = OptimizedRounder()
optR.fit(predictions_b.reshape(-1, ), answer_train_1)
coefficients = optR.coefficients()
predictions = optR.predict(predictions_b.reshape(-1, ), coefficients)
print("\ncoef=", coefficients)
Esempio n. 33
0
print('Dummy', clf.score(test_X, test_y))

# Each entry pairs the printed label with a factory for the model, so every
# estimator is still constructed immediately before it is fitted and scored.
# `clf` ends up bound to the voting ensemble, as before.
for label, build in (
        ('Log', lambda: LogisticRegression(solver='lbfgs',
                                           multi_class='multinomial',
                                           n_jobs=3)),
        ('NB', lambda: MultinomialNB()),
        ('RF', lambda: RandomForestClassifier()),
        ('DT', lambda: DecisionTreeClassifier()),
        ('voting', lambda: VotingClassifier(
            estimators=[
                ('log', LogisticRegression(solver='lbfgs',
                                           multi_class='multinomial')),
                ('NB', MultinomialNB()),
                ('RF', RandomForestClassifier()),
                ('DT', DecisionTreeClassifier()),
            ],
            n_jobs=3)),
):
    clf = build()
    clf.fit(train_X, train_y)
    print(label, clf.score(test_X, test_y))
# clf = GaussianNB()
# clf.fit(train_X, train_y)
# print('GNB', clf.score(test_X, test_y))
# clf = KNeighborsClassifier(n_jobs=3)
# clf.fit(train_X, train_y)
# print('KNN', clf.score(test_X, test_y))
# exit()
# clf = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=10)
# clf.fit(train_X, train_y)
# print('Ada + DT', clf.score(test_X, test_y))
# clf = SVC()
#
# 3)Boosting.

# ## Voting Classifier
#
# It is the simplest way of combining predictions from many different simple machine learning models. It gives an average prediction result based on the predictions of all the submodels. The submodels, or base models, are all of different types.

# In[ ]:

from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of seven heterogeneous classifiers, fitted on the
# training split. Both SVCs set probability=True so that they expose
# predict_proba, which soft voting relies on.
ensemble_lin_rbf = VotingClassifier(estimators=[
    ('KNN', KNeighborsClassifier(n_neighbors=10)),
    ('RBF', svm.SVC(probability=True, kernel='rbf', C=0.5, gamma=0.1)),
    ('RFor', RandomForestClassifier(n_estimators=500, random_state=0)),
    ('LR', LogisticRegression(C=0.05)),
    ('DT', DecisionTreeClassifier(random_state=0)), ('NB', GaussianNB()),
    ('svm', svm.SVC(kernel='linear', probability=True))
],
                                    voting='soft').fit(train_X, train_Y)
# Hold-out accuracy of the fitted ensemble.
print('The accuracy for ensembled model is:',
      ensemble_lin_rbf.score(test_X, test_Y))
# 10-fold cross-validated accuracy on the full X / Y.
cross = cross_val_score(ensemble_lin_rbf, X, Y, cv=10, scoring="accuracy")
print('The cross validated score is', cross.mean())

# ## Bagging
#
# Bagging is a general ensemble method. It works by applying similar classifiers on small partitions of the dataset and then taking the average of all the predictions. Due to the averaging, there is a reduction in variance. Unlike the Voting Classifier, Bagging makes use of similar classifiers.
#
# #### Bagged KNN
#
Esempio n. 35
0
# Gradient-boosted trees: shallow (depth 2), 500 rounds, half of the rows
# sampled per round.
clf_xgb = xgb.XGBClassifier(max_depth=2,
                            n_estimators=500,
                            subsample=0.5,
                            learning_rate=0.1)
clf_xgb.fit(X, y)

clf_pctr = Perceptron(class_weight='balanced')
clf_pctr = clf_pctr.fit(X, y)

# Weighted hard-voting ensemble. `weights` lines up one-to-one with the six
# active estimators; the commented-out entries have no weight.
clf_vote = VotingClassifier(
    estimators=[
        #('tree', clf_tree),
        ('knn', clf_knn),
        ('svm', clf_svm),
        ('extra', clf_ext),
        #('gb', clf_gb),
        ('xgb', clf_xgb),
        ('percep', clf_pctr),
        ('logistic', clf_log),
        #('RF', clf_rf),
    ],
    weights=[2, 2, 3, 3, 1, 2],
    voting='hard')
clf_vote.fit(X, y)

# The submission is predicted with the SVM alone; clf_vote is fitted above
# but is not the model selected here.
clf = clf_svm
# Forward-fill missing test values. ffill() is the supported spelling of the
# deprecated fillna(method='pad') and gives the same result.
df2 = test.loc[:, cols].ffill()
surv_pred = clf.predict(df2)

submit = pd.DataFrame({
    'PassengerId': test.loc[:, 'PassengerId'],
    'Survived': surv_pred.T
Esempio n. 36
0
# Mean of the true-positive rate tp/(tp+fn) and the true-negative rate
# tn/(tn+fp), using the confusion-matrix counts of the preceding classifier.
score = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("classification report:\n", classification_report(y_test, y_pred))

#Decision Tree Classifier
Decisiontree = DecisionTreeClassifier()
Decisiontree.fit(X_train, y_train)
y_pred = Decisiontree.predict(X_test)
# Unpacking into four counts only works for a 2x2 (binary) confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
score = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("classification report:\n", classification_report(y_test, y_pred))

#Ensemble using Voting classifier of Decision tree, Random forest, linear support vector classifier, Logistics Regression, Neural network, Gradient boosting classifier
# No `voting` argument is given, so scikit-learn's default (hard voting)
# applies; n_jobs=-1 fits the members in parallel on all cores.
ensemble = VotingClassifier(estimators=[('GBM', GBM), ('DT', Decisiontree),
                                        ('RF', Randomforest),
                                        ('SVC', Linear_SVC),
                                        ('NN', MLP_Classifier), ('LR', lr)],
                            n_jobs=-1)
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
score = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("classification report:\n", classification_report(y_test, y_pred))
Esempio n. 37
0
    max_depth=2,
    n_estimators=250,
    subsample=0.7,
    learning_rate=0.1
    )
clf_xgb_grid.fit(X,y)
# 5-fold cross-validated score of the tuned XGBoost model.
score_xgb_grid = cross_val_score(clf_xgb_grid, X, y, cv=5).mean()
print("xgb grid",score_xgb_grid)
# Feature importances paired with column names, largest first.
print ("xgb grid",pd.DataFrame(list(zip(X.columns, np.transpose(clf_xgb_grid.feature_importances_))) \
            ).sort_values(1, ascending=False))

# Equal-weight hard-voting ensemble of the four tuned models.
clf_vote = VotingClassifier(
    estimators=[
        ('log', clf_log),
        ('knn', clf_knn),
        ('rf', clf_rf),
        ('xgb_grid', clf_xgb_grid)
        ],
    weights=[1,1,1,1],
    voting='hard')
clf_vote.fit(X,y)

scores = cross_val_score(clf_vote, X, y, cv=5, scoring='accuracy')
print("Voting: Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))


# Forward-fill missing test values. ffill() is the supported spelling of the
# deprecated fillna(method='pad') and gives the same result.
df2 = test.loc[:,cols].ffill()
# Function-call form: the bare `print x` statement is a SyntaxError on
# Python 3 and was inconsistent with every other print in this script.
print(df2.head(1))
# Parallel lists: clf_name[i] is the label for clf_list[i].
clf_list = [clf_vote, clf_rf, clf_xgb_grid, clf_log, clf_knn]
clf_name = ['vote', 'rf', 'xgb_grid', 'clf_log', 'clf_knn']
Esempio n. 38
0
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
# NOTE(review): the path and delimiter strings look like template
# placeholders that must be replaced before running -- confirm.
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
# View the record array as a plain float64 matrix with one row per record,
# then drop the column at the position of the 'class' field.
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
# Split with train_test_split's default proportions and a fixed seed.
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

# Pipeline stages, in order:
#   1. a union of a single-member VotingClassifier (k-NN, k=5) and an
#      identity FunctionTransformer that passes the input features through
#   2. RandomizedPCA
#   3. a 500-tree random forest as the final estimator
# NOTE(review): sklearn.cross_validation and RandomizedPCA exist only in old
# scikit-learn releases -- confirm the target version before modernising.
exported_pipeline = make_pipeline(
    make_union(
        VotingClassifier([("est",
                           KNeighborsClassifier(n_neighbors=5,
                                                weights="uniform"))]),
        FunctionTransformer(lambda X: X)), RandomizedPCA(iterated_power=1),
    RandomForestClassifier(n_estimators=500))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
        'AdaBoosting': ab,
        'GradientBoosting': gb
    }
    
    return models


# ### Voting Classifier
# It is the simplest way of ensembling, giving an average prediction result based on the prediction of all the submodels.

# In[ ]:


from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over every (name, model) pair from base_learners(),
# evaluated by 10-fold cross-validated accuracy on the training data.
models = base_learners()
ensemble_voting = VotingClassifier(estimators=list(models.items()),
                                   voting='soft')
scores = cross_val_score(ensemble_voting,
                         Xtrain,
                         ytrain,
                         cv=10,
                         scoring="accuracy")
print('The cross validated score is', scores.mean())


# ### Bagging
# 
# Bagging is a general ensemble method. It works by applying similar classifiers on small partitions of the dataset and then taking the average of all the predictions. Due to the averaging, there is reduction in variance. Unlike Voting Classifier, Bagging makes use of similar classifiers. Actually, random forest is a bagging method of decision tree. **sklearn** provides a **BaggingClassifier** to wrap various base learners.

# #### Bagged KNN
# 
# Bagging works best with models with high variance. We can use KNN with a small value of **n_neighbours**, since a small *n_neighbours* makes the model's predictions depend strongly on the particular training sample.

# In[ ]:

Esempio n. 40
0
def test_sample_weight():
    """Check how VotingClassifier.fit handles the sample_weight argument."""
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    svc = SVC(probability=True, random_state=123)

    # Unit weights must give the same model as passing no weights at all.
    unit_weighted = VotingClassifier(
        estimators=[('lr', logreg), ('rf', forest), ('svc', svc)],
        voting='soft')
    unit_weighted.fit(X, y, sample_weight=np.ones((len(y), )))
    unweighted = VotingClassifier(
        estimators=[('lr', logreg), ('rf', forest), ('svc', svc)],
        voting='soft')
    unweighted.fit(X, y)
    assert_array_equal(unit_weighted.predict(X), unweighted.predict(X))
    assert_array_equal(unit_weighted.predict_proba(X),
                       unweighted.predict_proba(X))

    # A single-member ensemble must match that member fitted directly with
    # the same non-uniform weights.
    rng = np.random.RandomState(123)
    sample_weight = rng.uniform(size=(len(y), ))
    solo = VotingClassifier(estimators=[('lr', logreg)], voting='soft')
    solo.fit(X, y, sample_weight)
    logreg.fit(X, y, sample_weight)
    assert_array_equal(solo.predict(X), logreg.predict(X))
    assert_array_equal(solo.predict_proba(X), logreg.predict_proba(X))

    # k-NN does not accept sample weights, so fitting must fail and name the
    # offending estimator.
    knn = KNeighborsClassifier()
    with_knn = VotingClassifier(
        estimators=[('lr', logreg), ('svc', svc), ('knn', knn)],
        voting='soft')
    msg = "Underlying estimator 'knn' does not support sample weights."
    assert_raise_message(ValueError, msg, with_knn.fit, X, y, sample_weight)
Esempio n. 41
0
## Implementing Neural Network

import keras
from keras.layers import Dense
from keras.models import Sequential

# Fully connected binary classifier: 6 inputs -> 5 -> 4 -> 4 -> 3 -> 1.
classifier10 = Sequential()
# The first layer also declares the input width.
classifier10.add(Dense(input_dim=6, activation='relu', output_dim=5, init='uniform'))
# Remaining ReLU hidden layers differ only in width.
for hidden_units in (4, 4, 3):
    classifier10.add(Dense(activation='relu', output_dim=hidden_units, init='uniform'))
# A single sigmoid unit pairs with the binary cross-entropy loss below.
classifier10.add(Dense(activation='sigmoid', output_dim=1, init='uniform'))
classifier10.compile(optimizer='adam',
                     loss='binary_crossentropy',
                     metrics=['accuracy'])
classifier10.fit(X_train, y_train, batch_size=10, nb_epoch=100)

## Implementing the Voting Classifier

from sklearn.ensemble import VotingClassifier

# Voting ensemble over classifier3..classifier9; the Keras network
# (classifier10) is not one of its members. No `voting` argument is given, so
# scikit-learn's default (hard voting) applies; n_jobs=-1 fits the members in
# parallel on all cores.
voting = VotingClassifier(estimators = [('RF', classifier3),
                                         ('svc', classifier4), 
                                         ('gaussian', classifier5), 
                                         ('bagging', classifier6),
                                         ('adaboost', classifier7),
                                         ('grboost', classifier8),
                                         ('xgboost', classifier9)], n_jobs = -1)
  
voting.fit(X_train, y_train)
# Predictions and confusion matrix on the labelled test split.
y_pred = voting.predict(X_test)
cm9 = confusion_matrix(y_test, y_pred)
# Predictions for a second feature set, X_test1; no labels for it are used here.
y_pred1 = voting.predict(X_test1)
Esempio n. 42
0
def test_set_params():
    """Check that set_params can swap whole estimators and set nested parameters."""
    logreg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    bayes = GaussianNB()

    reference = VotingClassifier([('lr', logreg), ('rf', forest)],
                                 voting='soft',
                                 weights=[1, 2])
    reference.fit(X, y)

    # Start with naive Bayes in the 'nb' slot, then replace it with the forest.
    swapped = VotingClassifier([('lr', logreg), ('nb', bayes)],
                               voting='soft',
                               weights=[1, 2])
    swapped.set_params(nb=forest)
    swapped.fit(X, y)
    # The replacement must not leak onto the ensemble as an attribute.
    assert_false(hasattr(swapped, 'nb'))

    # With the same members, both ensembles must predict identically.
    assert_array_equal(reference.predict(X), swapped.predict(X))
    assert_array_equal(reference.predict_proba(X), swapped.predict_proba(X))
    first_member = swapped.estimators[0][1]
    second_member = swapped.estimators[1][1]
    assert_equal(first_member.get_params(), logreg.get_params())
    assert_equal(second_member.get_params(), forest.get_params())

    # Nested <name>__<param> keys must reach the underlying estimators.
    reference.set_params(lr__C=10.0)
    swapped.set_params(nb__max_depth=5)

    assert_true(reference.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(swapped.estimators[1][1].get_params()['max_depth'] == 5)
    ensemble_params = reference.get_params()
    assert_equal(ensemble_params["lr__C"],
                 reference.get_params()["lr"].get_params()['C'])
Esempio n. 43
0
    plt.text(0.30,
             2.8,
             '46% Difference between \n duration and contacts',
             color='k',
             fontsize=15)


# Draw the feature-importance chart and write the current figure to disk.
feature_importance_graph(indices, importances, feature_names)
plt.savefig('static/img9.png')

# In[87]:

from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the three previously built classifiers.
voting_clf = VotingClassifier(estimators=[('gbc', grad_clf), ('nav', nav_clf),
                                          ('neural', neural_clf)],
                              voting='soft')

voting_clf.fit(X_train, y_train)

# In[88]:

from sklearn.metrics import accuracy_score

# Refit every model, including voting_clf (already fitted above), on the
# training split and print its class name with its test accuracy.
for clf in (grad_clf, nav_clf, neural_clf, voting_clf):
    clf.fit(X_train, y_train)
    predict = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, predict))


@app.route('/')
Esempio n. 44
0
class Ensemble(object):
    """Voting and stacking ensemble over four fixed first-level classifiers.

    The first level is a gradient-boosting model, an MLP pipeline, an
    RBM + logistic-regression pipeline and an SVD + logistic-regression
    pipeline. In 'vote' mode they are combined by a soft ``VotingClassifier``;
    in 'stack' mode their minority-class probabilities feed a second-level
    logistic regression. Fitted models are cached as pickles under
    ``./models/`` and reloaded from there when present.
    """
    def __init__(self, algorithm='stack', threshold=.5):
        """Configure logging and build the (unfitted) first-level classifiers.

        Args:
            algorithm: Ensemble type. Either 'vote' or 'stack'.
            threshold: Value for custom thresholding of the minority class.

        Raises:
            ValueError: If ``algorithm`` is neither 'stack' nor 'vote'.
        """
        logging.config.fileConfig('./logging.conf')

        if algorithm not in ('stack', 'vote'):
            # ValueError is a subclass of Exception, so callers that catch
            # Exception keep working.
            raise ValueError('invalid algorithm type')
        self.algorithm = algorithm

        self.threshold = threshold
        rbm = Pipeline(steps=[
            ('minmax', MinMaxScaler()),
            ('rbm', BernoulliRBM(learning_rate=0.001, n_iter=20,
                                 n_components=100)),
            ('logistic', LogisticRegression(C=6.0)),
        ])
        svd = Pipeline(steps=[
            ('svd', TruncatedSVD(n_components=20)),
            ('logistic', LogisticRegression(C=6.0)),
        ])
        gbc = GradientBoostingClassifier(learning_rate=.1,
                                         max_depth=5,
                                         n_estimators=36)
        mlp = Pipeline(steps=[
            ('stdScaler', StandardScaler()),
            ('mlp', MLPClassifier(alpha=10.0**-7, random_state=1,
                                  early_stopping=True,
                                  hidden_layer_sizes=(20, 10, 10),
                                  max_iter=1000, batch_size=128)),
        ])

        # Object variable holding the classifiers. Kept as an OrderedDict
        # because the order of the classifiers must be maintained: it fixes
        # the column order of the stacking features.
        self.estimators = OrderedDict([('gbc', gbc), ('mlp', mlp),
                                       ('rbm', rbm), ('svd', svd)])

    def classification_report(self, name, labels_test, preds):
        """Print the classification report for one classifier.

        Args:
            name: Classifier name.
            labels_test: Test labels.
            preds: Predictions based on test features.

        Returns:
            None
        """
        print('{} Classification Report'.format(name))
        print(
            classification_report(labels_test,
                                  preds,
                                  target_names=['Default', 'Paid']))

    def confusion_matrix(self, name, labels_test, preds):
        """Print the confusion matrix for one classifier.

        Args:
            name: Classifier name.
            labels_test: Test labels.
            preds: Predictions based on test features.

        Returns:
            None
        """
        print('{} Confusion Matrix ({} samples): '.format(
            name, len(labels_test)))
        print(confusion_matrix(labels_test, preds))

    def fit(self, features_train, labels_train):
        """Fit the ensemble selected by ``self.algorithm``.

        Args:
            features_train: Array of training features.
            labels_train: Array of training labels.

        Returns:
            None
        """
        logging.debug('Entering fit()')
        if self.algorithm == 'vote':
            self.__fit_vote(features_train, labels_train)
        elif self.algorithm == 'stack':
            self.__fit_stack(features_train, labels_train)
        logging.debug('Exiting fit()')

    def predict(self, features):
        """Generate predictions from the ensemble selected by ``self.algorithm``.

        Args:
            features: Array of features.

        Returns:
            Array of predictions, or None if ``self.algorithm`` is neither
            'vote' nor 'stack'.
        """
        logging.debug('Entering predict()')
        preds = None
        if self.algorithm == 'vote':
            preds = self.__predict_vote(features)
        elif self.algorithm == 'stack':
            preds = self.__predict_stack(features)
        logging.debug('Exiting predict()')
        return preds

    def test(self, features_train, labels_train, features_test, labels_test):
        """Display test results of the first-level predictors.

        Loads or fits every first-level classifier, prints each one's
        confusion matrix and classification report on the test data, then
        prints the pairwise Pearson correlation of their predictions.

        Args:
            features_train: Array of training features.
            labels_train: Array of training labels.
            features_test: Array of test features.
            labels_test: Array of test labels.

        Returns:
            None
        """
        self.__fit_level1(features_train, labels_train, '{} not pickled')

        # Print confusion matrix and score for each clf.
        corr_list = []
        for name, clf in self.estimators.items():
            preds = clf.predict(features_test)
            self.confusion_matrix(name, labels_test, preds)
            print()
            self.classification_report(name, labels_test, preds)
            corr_list.append((name, preds))
        clf_list = [name for name, _ in corr_list]

        # Print a matrix of correlations between clfs.
        frame = pd.DataFrame(index=clf_list, columns=clf_list)
        for (name_a, preds_a), (name_b, preds_b) in itertools.combinations(
                corr_list, 2):
            res = pearsonr(preds_a, preds_b)[0]
            # .loc writes straight into the frame. Chained indexing
            # (frame[col][row] = ...) may assign to a temporary copy.
            frame.loc[name_b, name_a] = res
            frame.loc[name_a, name_b] = res
        frame['mean'] = frame.mean(skipna=True, axis=1)
        pd.options.display.width = 180
        print('Correlation Matrix')
        print(frame)

    # Private class variable containing the vectorized threshold prediction
    # function. The builtin int replaces np.int, which was only an alias for
    # it and no longer exists in current NumPy releases.
    __custom_predict = np.vectorize(vfunc, otypes=[int])

    def __fit_level1(self, features_train, labels_train, miss_msg):
        """Load each first-level estimator from disk, fitting the missing ones.

        Estimators with a pickle under ``./models/<name>.pkl`` are loaded; the
        others are fitted by ``lvl1_fit`` in a multiprocessing pool. Either
        way the result replaces the entry in ``self.estimators``.

        Args:
            features_train: Array of training features.
            labels_train: Array of training labels.
            miss_msg: Debug-log template, formatted with the estimator name
                when no pickle is found.

        Returns:
            None
        """
        pool = mp.Pool(processes=mp.cpu_count())
        pending = []  # async results for the estimators that must be fitted
        try:
            # Iterate over a snapshot because entries are replaced in the loop.
            for name, clf in list(self.estimators.items()):
                try:
                    self.estimators[name] = joblib.load('./models/' + name +
                                                        '.pkl')
                except FileNotFoundError:
                    logging.debug(miss_msg.format(name))
                    pending.append(
                        pool.apply_async(lvl1_fit,
                                         args=(clf, name, features_train,
                                               labels_train)))
        finally:
            # close()/join() rather than a `with` block: Pool.__exit__ calls
            # terminate(), which would kill fits that are still running.
            pool.close()
            pool.join()

        for result in pending:
            item = result.get()
            # Reassign the fitted clf to the estimator dictionary.
            self.estimators[item['name']] = item['fittedclf']

    def __fit_stack(self, features_train, labels_train):
        """Fit the stacking ensemble (first level, then second level).

        The second-level logistic regression is loaded from
        ``./models/lrc.pkl`` when available. Otherwise it is trained on
        5-fold out-of-fold predictions of the first-level classifiers and
        cached to that path.

        Args:
            features_train: Array of training features.
            labels_train: Array of training labels.

        Returns:
            None
        """
        logging.debug('Entering __fit_stack()')

        # Fit 1st level estimators with a multiprocessing pool of workers.
        self.__fit_level1(features_train, labels_train,
                          'Level 1: {} not pickled')

        try:
            # Try to load the 2nd level estimator from disk.
            self.lrc = joblib.load('./models/lrc.pkl')
        except FileNotFoundError:  # 2nd level estimator not fitted yet
            logging.debug('Level 2: LRC not pickled')
            folds = list(
                StratifiedKFold(n_splits=5).split(features_train,
                                                  labels_train))
            # Frame holding the k-fold test results of the 1st level
            # classifiers, one column per classifier plus the label column.
            lvl2_frame = pd.DataFrame(index=range(0, len(features_train)),
                                      columns=list(self.estimators.keys()))
            lvl2_frame[LABEL_COL] = labels_train

            # The pool is created only on this branch, so a cached lrc.pkl no
            # longer leaves a pool of idle worker processes behind.
            pool = mp.Pool(processes=mp.cpu_count())
            results = []
            try:
                # One task per (classifier, fold): fit on the training part
                # and predict the held-out part. Those predictions become the
                # training data for the 2nd level classifier.
                for name, clf in self.estimators.items():
                    col_loc = lvl2_frame.columns.get_loc(name)
                    for fold, (train_idx, test_idx) in enumerate(folds,
                                                                 start=1):
                        X_train = features_train[train_idx]
                        X_test = features_train[test_idx]
                        Y_train = labels_train[train_idx]
                        results.append(
                            pool.apply_async(lvl2_fit,
                                             args=(clf, name, fold, test_idx,
                                                   col_loc, X_train, Y_train,
                                                   X_test)))
            finally:
                pool.close()
                pool.join()

            # Fetch worker results and write them into the frame that trains
            # the 2nd level / logistic regression classifier.
            for result in results:
                item = result.get()
                lvl2_frame.iloc[item['test_idx'],
                                item['col_loc']] = item['preds']

            #lvl2_frame.to_csv('./models/lvl2frame.csv')
            self.lrc = LogisticRegression(C=2.0)
            ti = time()
            X = lvl2_frame.drop(LABEL_COL, axis=1).values
            Y = lvl2_frame[LABEL_COL].values
            self.lrc.fit(X, Y)
            logging.debug('LRC fit time: {:0.4f}'.format(time() - ti))
            # Cache the logistic regressor to disk.
            joblib.dump(self.lrc, './models/lrc.pkl')
        logging.debug('Exiting __fit_stack()')

    def __fit_vote(self, features_train, labels_train):
        """Fit the voting ensemble, a soft scikit-learn ``VotingClassifier``.

        The classifier is loaded from ``./models/voteclf.pkl`` when available;
        otherwise it is fitted and cached to that path.

        Args:
            features_train: Array of training features.
            labels_train: Array of training labels.

        Returns:
            None
        """
        logging.debug('Entering __fit_vote()')
        try:
            self.voteclf = joblib.load('./models/voteclf.pkl')
        except FileNotFoundError:
            ti = time()
            self.voteclf = VotingClassifier(estimators=list(
                self.estimators.items()),
                                            voting='soft',
                                            n_jobs=-1)
            self.voteclf.fit(features_train, labels_train)
            logging.debug('fit time: {:0.4f}'.format(time() - ti))
            # Cache the fitted model to disk.
            joblib.dump(self.voteclf, './models/voteclf.pkl')
        logging.debug('Exiting __fit_vote()')

    def __predict_stack(self, features):
        """Predict with the stacking ensemble.

        Collects each first-level classifier's probability for the
        ``MINORITY_POS`` class and uses those columns as the feature set of
        the second-level classifier (logistic regression).

        Args:
            features: Array of features.

        Returns:
            Array of predictions.
        """
        logging.debug('Entering __predict_stack()')
        lvl1_frame = pd.DataFrame()
        # 1st level predictions
        for name, clf in self.estimators.items():
            lvl1_frame[name] = clf.predict_proba(features)[:, MINORITY_POS]

        # 2nd level predictions
        preds = self.__predict_with_threshold(self.lrc, lvl1_frame.values)

        logging.debug('Exiting __predict_stack()')
        return preds

    def __predict_vote(self, features):
        """Predict with the fitted voting classifier, applying the threshold.

        Args:
            features: Array of features.

        Returns:
            Array of predictions.
        """
        logging.debug('Entering __predict_vote()')
        preds = self.__predict_with_threshold(self.voteclf, features)
        logging.debug('Exiting __predict_vote()')
        return preds

    def __predict_with_threshold(self, clf, features):
        """Predict with ``clf`` and apply minority-class thresholding.

        The classifier's ``MINORITY_POS`` probability, its plain prediction
        and ``self.threshold`` are passed element-wise to the vectorized
        ``vfunc``, which is defined outside this class.

        Args:
            clf: SKLearn classifier.
            features: Array of features.

        Returns:
            Array of predictions.
        """
        logging.debug('Entering __predict_with_threshold()')
        ti = time()
        minority_proba = clf.predict_proba(features)[:, MINORITY_POS]
        predictions = Ensemble.__custom_predict(minority_proba,
                                                clf.predict(features),
                                                self.threshold)
        logging.debug('prediction time: {:0.4f}'.format(time() - ti))
        logging.debug('Exiting __predict_with_threshold()')
        return predictions
Esempio n. 45
0
# We can see that all of them have comparable performance, with only the Naive Bayes Classifier performing notably worse. Since they arrive at their assessments using different techniques, they will tend to make different errors as well. If we combine all the models in some way, we usually get better performance than from any individual model, since the errors of any individual model are in a way 'compensated for' by the other models. This is the whole idea behind ensemble learning. In this particular case, we can use a soft voting classifier, which looks at the probabilities of each individual classifier's prediction - the more confident the predictor, the more weight its 'vote' gets in the ensemble.

# In[ ]:


# (name, estimator) pairs for the soft-voting ensemble. SVC sets
# probability=True so that it exposes predict_proba.
classifiers = [
    ('KNeighbors', KNeighborsClassifier(n_neighbors=3)),
    ('LogisticRegression', LogisticRegression(solver="lbfgs", C=10)),
    ('GaussianNB', GaussianNB()),
    ('SupportVectorMachine', SVC(probability=True)),
    ('Random Forest',
     RandomForestClassifier(max_depth=4, n_estimators=100, n_jobs=-1)),
    ('AdaBoost', AdaBoostClassifier()),
    ('GradientBoosting', GradientBoostingClassifier()),
]
# fit() returns the estimator itself, so construction and fitting are chained.
vc = VotingClassifier(estimators=classifiers,
                      voting='soft').fit(X_train_features, y_train)

preds = vc.predict(X_test_features)


# Let's produce the output file:

# In[ ]:


submission = pd.DataFrame({
    "PassengerId": data_test["PassengerId"],
    "Survived": preds,
})
submission.to_csv('titanic.csv', index=False)
Esempio n. 46
0
# Bag-of-words features: the vocabulary is learned from the training texts.
cv = CountVectorizer(tokenizer=my_tokenizer)
X_train_bag_of_words = cv.fit_transform(train_list)
# Dense document-term table labelled with the vocabulary, indexed like the
# source table. It is built for inspection; the models below are fitted on
# the vectorizer output instead.
# NOTE(review): get_feature_names() was replaced by get_feature_names_out()
# in newer scikit-learn releases -- confirm the target version.
dtm = pd.DataFrame(X_train_bag_of_words.toarray(),
                   columns=cv.get_feature_names())
dtm.index = tsv_read.index

# Creating models

clf1 = MultinomialNB(alpha=0.7)
clf2 = RandomForestClassifier(n_estimators=55,
                              random_state=0,
                              criterion='entropy')
clf3 = SVC(C=2)

# Ensemble of models
# Hard (majority-label) voting.

eclf = VotingClassifier(estimators=[('MNB', clf1), ('RFC', clf2),
                                    ('SVC', clf3)],
                        voting='hard')

# Target classification y for training
y = tsv_read['sentiment']

# For testing
# transform (not fit_transform): the test texts reuse the training vocabulary.
X_test_bag_of_words = cv.transform(test_list)

# Fitting the models
model = eclf.fit(X_train_bag_of_words, y)

# Predicting
predict_and_test(model, X_test_bag_of_words)
Esempio n. 47
0
# Train-test split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y)

# Data oversampling, applied to the training portion only
method = RandomOverSampler()
# imbalanced-learn renamed fit_sample() to fit_resample() (0.4) and later
# removed the old name; call whichever this installation provides.
if hasattr(method, 'fit_resample'):
    X_resampled, y_resampled = method.fit_resample(X_train, y_train)
else:
    X_resampled, y_resampled = method.fit_sample(X_train, y_train)

# Defining classifiers
clf1 = LogisticRegression(class_weight='balanced', random_state=1)
clf2 = RandomForestClassifier(class_weight='balanced', random_state=1)
clf3 = GaussianNB()
ensemble_model = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                              ('gnb', clf3)],
                                  voting='hard')
ensemble_model.fit(X_resampled, y_resampled)

# Evaluate on the untouched (not oversampled) test portion
predicted = ensemble_model.predict(X_test)
f1_score_value = f1_score(y_test, predicted, average='weighted')
#print(f1_score_value)
from sklearn.metrics import classification_report, confusion_matrix
# Print classification report using predictions
print(classification_report(y_test, predicted))
# Print confusion matrix using predictions
print(confusion_matrix(y_test, predicted))

# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back to the vendored copy only on
# installations that still ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the fitted ensemble to disk
filename = 'file.sav'
joblib.dump(ensemble_model, filename)
Esempio n. 48
0
######################################################
'''
array = instance_data.values
X = array[:,0:39]
Y = array[:,40]
seed = 7
num_trees = 100
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
'''

######################################################
# Voting Ensemble for Classification
######################################################

seed = 7
# KFold does not shuffle unless shuffle=True, so a random_state never
# influenced these folds; scikit-learn >= 0.24 raises a ValueError when
# random_state is given while shuffle is False.  Leaving it out keeps the
# same (unshuffled, consecutive) folds on every version.
kfold = model_selection.KFold(n_splits=10)
# create the sub models as (name, estimator) pairs
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))
# create the ensemble model (VotingClassifier's default voting mode)
ensemble = VotingClassifier(estimators)
# mean cross-validated score over the 10 folds
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())
print('10-fold cross validation:\n')

# Cross-validated ROC AUC of each individual classifier
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    print('ROC AUC: %0.2f (+/- %0.2f) [%s]' %
          (scores.mean(), scores.std(), label))

from sklearn.ensemble import VotingClassifier

# Replace Majority Vote Classifier by using VotingClassifier from scikit-learn
mv_clf = VotingClassifier(estimators=[('pipe1', pipe1), ('clf2', clf2),
                                      ('pipe3', pipe3)],
                          voting='soft')
clf_labels += ['Majority voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]
# Same evaluation again, now including the ensemble.  The metric is still
# ROC AUC (scoring='roc_auc'), so the line is labelled accordingly; it used
# to be printed as "Accuracy".
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    print('ROC AUC: %0.2f (+/- %0.2f) [%s]' %
          (scores.mean(), scores.std(), label))

# Get parameter from VotingClassifier
# Need to run the params to get the name of each classifier's params
# for example : pipe1__clf__C, pipe1__clf__n_jobs, clf2__max_depth, clf2__criterion, etc
Esempio n. 50
0
# Drop the columns that are not used as features and separate the target.
test = test.drop(feature_drop, axis=1)
train = train.drop(['PassengerId'], axis=1)
train_data = train.drop('Survived', axis=1)
target = train['Survived']
print(train_data.shape, target.shape)

# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in print() only added a stray "None" line to the output.
train.info()

# train (VotingClassifier)
# NOTE(review): np.random.seed() returns None, so random_state is None here
# and the split is not reproducible; the call also re-seeds NumPy's global
# generator.  Kept as is - confirm whether a fixed seed was intended.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, target, test_size=0.2, random_state=np.random.seed())
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf),
                                          ('svc', svm_clf)],
                              voting='hard')

# Fit every model (the ensemble last) and report its hold-out accuracy.
# The ensemble is fitted here, so no separate fit is needed beforehand.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

# Predict the competition test set with the fitted ensemble.
test_data = test.drop('PassengerId', axis=1).copy()
prediction = voting_clf.predict(test_data)

submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': prediction
})
Esempio n. 51
0
# ROC curve of the random forest classifier, with its AUC in the legend.
plt.title('Receiver Operating Characteristic Random Forest Classifier')
plt.plot(fpr_RFC, tpr_RFC, 'b',
         label='AUC Random Forest Classifier= %0.2f' % roc_auc_RFC)
plt.legend(loc='lower right')
# Dashed red diagonal as a reference line.
plt.plot([0, 1], [0, 1], 'r--')
# Both axes are limited to the unit interval.
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.xlabel('False Positive Rate Random Forest Classifier')
plt.ylabel('True Positive Rate Random Forest Classifier')
plt.show()

### Hard-voting ensemble built from the three models above
eclf1 = VotingClassifier(
    voting='hard',
    estimators=[
        ('NaiveBayes', clf),
        ('LogisticReg', clf_Reg),
        ('RandomForest', clf_RFC),
    ],
).fit(X_train, data_train['final_senti_vote'])

# Predict the test set and report the ensemble's accuracy
y_predicted_Ensemble = eclf1.predict(X_test)
print("Accuracy Ensemble Classifier: %.2f" %
      accuracy_score(y_test, y_predicted_Ensemble))
##Accuracy Ensemble Classifier: 0.80

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC_Naive_Bayes = %0.2f' % roc_auc)
plt.plot(fpr_Reg,
         tpr_Reg,
         'b',
         label='AUC Logistic Regression = %0.2f' % roc_auc_Reg)
plt.plot(fpr_RFC,
         tpr_RFC,
Esempio n. 52
0
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from qa.estimators import TextFeatureTransformer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from joblib import dump

# Hard-voting ensemble of three classifiers; run_pipeline() uses it as the
# "model" step of its Pipeline.
model_estimator = VotingClassifier(
    voting="hard",
    estimators=[
        ("logistic", LogisticRegression(max_iter=1000)),
        ("guass", GaussianNB()),
        ("svc", SVC(gamma='auto')),
    ],
)


def run_pipeline(dataset, labels, outfile, accuracy_threshold=0.85):
    """ running trainng pipleine and save the
    model """

    pipeline = Pipeline(steps=[
        ("transform", TextFeatureTransformer()),
        ("model", model_estimator),
    ])

    train_x, test_x, train_y, test_y = train_test_split(dataset, labels)
    pipeline.fit(train_x, train_y)
    scores = cross_val_score(pipeline, train_x, train_y, cv=5)

    try:
Esempio n. 53
0
# Bagging ensemble of k-nearest-neighbour classifiers.  The base estimator
# is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (the old name was
# removed in 1.4); the positional form works on both old and new versions.
bag = BaggingClassifier(neighbors.KNeighborsClassifier())
bag.fit(input_train, target_train)
target_predict = bag.predict(input_test)
print(accuracy_score(target_test,target_predict))

# AdaBoost with its default settings
ada = AdaBoostClassifier()
ada.fit(input_train, target_train)
target_predict = ada.predict(input_test)
print(accuracy_score(target_test,target_predict))

# Members of the soft-voting ensemble (all three are classifiers, despite
# the "Reg" suffix in the names).  SVC needs probability=True so that it
# offers predict_proba.
kNNReg = neighbors.KNeighborsClassifier()
logisticReg = linear_model.LogisticRegression()
sVMReg = svm.SVC(probability=True)

maj = VotingClassifier(estimators=[('lr', logisticReg), ('svc', sVMReg), ('knn', kNNReg)],voting='soft')

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
colors = ['black', 'orange', 'blue', 'green']
linestyles = [':', '--', '-.', '-']
# One ROC curve per model, each fitted on the training split and scored on
# the test split.
for clf, label, clr, ls \
    in zip([logisticReg,sVMReg,kNNReg,maj], ['lr','svc','knn','maj'], colors, linestyles):
    # assuming the label of the positive class is 1
    y_pred = clf.fit(input_train,target_train).predict_proba(input_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=target_test,y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)
    plt.plot(fpr, tpr,color=clr,linestyle=ls,label='%s (auc = %0.2f)' % (label, roc_auc))
plt.legend(loc='lower right')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
Esempio n. 54
0
#Logistic Regression
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression(multi_class='auto', solver='lbfgs')
lg.fit(xtrain, ytrain)
lgPredict = lg.predict(xtest)
label.append('Log. Regression')

# scikit-learn metrics take (y_true, y_pred).  With the arguments swapped
# the confusion matrix comes out transposed (rows would be predictions),
# so the true labels are passed first.
print(confusion_matrix(ytest, lgPredict))
print(accuracy_score(ytest, lgPredict))  #result
accuracy.append(accuracy_score(ytest, lgPredict))
#hyper parameter tuning -solver->warn << solver->auto

#voting classifier
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(estimators=[('GaussianNB', nb), ('DecisionTree', Dtc),
                                  ('LogisticRegression', lg)],
                      voting='hard')
vc.fit(xtrain, ytrain)
vcPredict = vc.predict(xtest)
label.append('Vot. Classifier')

print(confusion_matrix(ytest, vcPredict))  # result
print(accuracy_score(ytest, vcPredict))
accuracy.append(accuracy_score(ytest, vcPredict))

# Bagging Classifier
from sklearn.ensemble import BaggingClassifier
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4), while the first positional parameter is the estimator in both.
bg = BaggingClassifier(Dtc, n_estimators=12, random_state=25)
bg.fit(xtrain, ytrain)
bgPredict = bg.predict(xtest)
label.append('Bagging')
Esempio n. 55
0
                               class_weight='balanced',
                           ))])

    # Second ensemble member.  Despite the name `char_clf`, the vectorizer is
    # configured with analyzer="word" and counts word unigrams and bigrams.
    char_clf = Pipeline([
        ('vect',
         CountVectorizer(analyzer="word",
                         ngram_range=(1, 2),
                         stop_words=None,
                         max_features=None,
                         decode_error='ignore')),
        #('tfidf', TfidfTransformer(use_idf=False)),
        # probability=True lets the SVC take part in the soft voting below
        ('clf', SVC(C=5.2, kernel='linear', probability=True))
    ])

    # Soft-voting ensemble of the two pipelines
    vot_clf = VotingClassifier(estimators=[('glove', glove_clf),
                                           ('linear', char_clf)],
                               voting='soft')

    print char_clf.named_steps

    print "TRAIN"
    print 80 * '='
    # NOTE(review): labels passed to the constructor together with `n_folds`
    # looks like the legacy sklearn.cross_validation.StratifiedKFold API -
    # confirm against the import, which is outside this fragment.
    cv = StratifiedKFold(data.Stance, n_folds=10, shuffle=True, random_state=1)

    # Out-of-fold predictions of the ensemble for every abstract
    pred_stances = cross_val_predict(vot_clf,
                                     data.Abstract,
                                     data.Stance,
                                     cv=cv)

    # Per-class metrics of the out-of-fold predictions, four decimal digits
    print classification_report(data.Stance, pred_stances, digits=4)
Esempio n. 56
0
# Code starts here
# Bagging of decision trees: 100 estimators, each trained on 100 samples.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
# in 1.4); the positional form works on both old and new versions.
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=100,
                                max_samples=100,
                                random_state=0)
bagging_clf.fit(X_train, y_train)

# Mean accuracy on the test split
score_bagging = bagging_clf.score(X_test, y_test)
print(score_bagging)
# Code ends here

# --------------
# Import libraries
from sklearn.ensemble import VotingClassifier

# Various models
clf_1 = LogisticRegression()
clf_2 = DecisionTreeClassifier(random_state=4)
clf_3 = RandomForestClassifier(random_state=4)

model_list = [('lr', clf_1), ('DT', clf_2), ('RF', clf_3)]

# Code starts here
# Hard voting: the ensemble predicts the majority label of its members
voting_clf_hard = VotingClassifier(estimators=model_list, voting='hard')
voting_clf_hard.fit(X_train, y_train)
hard_voting_score = voting_clf_hard.score(X_test, y_test)
print(hard_voting_score)

# Code ends here
                  ['clf', clf3]])

clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']

# 10-fold cross-validated ROC AUC of each individual classifier
print('10-fold cross validation:\n')
for clf, label in zip((pipe1, clf2, pipe3), clf_labels):
    scores = cross_val_score(clf, X_train, y_train,
                             scoring='roc_auc', cv=10)
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

## Create the voting classifier
mv_clf = VotingClassifier(
    voting="soft",
    estimators=[('lr', pipe1), ('dt', clf2), ('kn', pipe3)],
)
clf_labels.append('Majority Voting')
all_clf = [pipe1, clf2, pipe3, mv_clf]

# Same evaluation once more, this time including the ensemble
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(clf, X_train, y_train,
                             scoring='roc_auc', cv=10)
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
########################################################
## Evaluate and tune the ensemble classifier
## Plot the ROC curves and compute the AUC
from sklearn.metrics import roc_curve
Esempio n. 58
0
def training():
    """Fit several classifiers on the module-level split and print their scores.

    Reads ``x_train``, ``y3_train``, ``x_test`` and ``y3_test`` from the
    enclosing module.  Prints train/test accuracy and a classification
    report for a hard-voting ensemble (logistic regression, linear SVC,
    k-nearest neighbours, decision tree), for a random forest and for a
    gradient boosting classifier.  Returns nothing.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import SGDClassifier
    from sklearn.feature_selection import f_regression
    from sklearn.metrics import classification_report
    from sklearn.svm import LinearSVC #linear support vector machine classifier
    from sklearn.neighbors import KNeighborsClassifier #k-nearest neighbours classifier
    from sklearn.tree import DecisionTreeClassifier #decision tree classifier
    from sklearn.ensemble import RandomForestClassifier #random forest classifier
    from sklearn.ensemble import GradientBoostingClassifier #gradient boosted decision trees
    from sklearn.ensemble import VotingClassifier
    from sklearn.metrics import accuracy_score

    # The individual fits and predictions of lr/lsvc/knc/dtc are only needed
    # by the commented-out per-model reports further down; they are kept so
    # those diagnostics can be re-enabled without further changes.
    lr = LogisticRegression() #initialize Logistic Regression
    lr.fit(x_train,y3_train) #train
    lr_y_predict = lr.predict(x_test) #predict

    lsvc = LinearSVC() #initialize the linear support vector machine classifier
    lsvc.fit(x_train,y3_train)
    lsvc_y_predict = lsvc.predict(x_test)

    knc = KNeighborsClassifier()
    knc.fit(x_train,y3_train)
    knc_y_predict = knc.predict(x_test)

    dtc = DecisionTreeClassifier()
    dtc.fit(x_train,y3_train)
    dtc_y_predict = dtc.predict(x_test)

    rfc = RandomForestClassifier()
    rfc.fit(x_train,y3_train)
    rfc_y_pred = rfc.predict(x_test)

    gbc = GradientBoostingClassifier()
    gbc.fit(x_train,y3_train)
    gbc_y_pred = gbc.predict(x_test)

    voting_clf = VotingClassifier( estimators=[("lr", lr), ("svm", lsvc), ("knc", knc), ("dtc", dtc)], voting="hard" )

    # VotingClassifier fits clones of its members, so refitting lr/lsvc/knc/dtc
    # here (as an earlier loop did) was redundant; only the ensemble needs
    # fitting, and only its predictions are reported below.
    voting_clf.fit( x_train, y3_train )
    y_pred = voting_clf.predict( x_test )
    #print( clf.__class__.__name__, accuracy_score(y3_test, y_pred) )
    #print( voting_clf.score(x_train,y3_train) )
    #print (classification_report(y3_test,y_pred,target_names = ['not rec','rec', 'very re']))

    # The first accuracy line is measured on the training split, the second
    # on the test split.  classification_report expects (y_true, y_pred);
    # with the arguments swapped, precision and recall trade places.
    print ('The Accuracy of EnsembleLearning: ', voting_clf.score(x_train,y3_train))
    print ('The Accuracy of EnsembleLearning: ', voting_clf.score(x_test,y3_test))
    print (classification_report(y3_test, y_pred))

    #print ('Accuracy of LR Classifier:', lr.score(x_train,y3_train))
    #print ('Accuracy of LR Classifier:', lr.score(x_test,y3_test))
    #print(f_regression(x_train,y3_train)[1])
    #print (classification_report(y3_test,lr_y_predict,target_names = ['not rec','rec', 'very re']))
    #print ('---------------------------------------------------------')
    #print ('The Accuracy of Linear SVC is',lsvc.score(x_train,y3_train))
    #print ('The Accuracy of Linear SVC is',lsvc.score(x_test,y3_test))
    #print (classification_report(y3_test,lsvc_y_predict,target_names=['not rec','rec', 'very re']))
    #print ('---------------------------------------------------------')
    #print ('The Accuracy of K-nearest Neighbor Classifier is ', knc.score(x_train,y3_train))
    #print ('The Accuracy of K-nearest Neighbor Classifier is ', knc.score(x_test,y3_test))
    #print (classification_report(y3_test,knc_y_predict,target_names=['not rec','rec', 'very re']))
    #print ('---------------------------------------------------------')
    #print ('Decision Tree Score: ', dtc.score(x_train,y3_train))
    #print ('Decision Tree Score: ', dtc.score(x_test,y3_test))
    #print (classification_report(y3_test, dtc_y_predict,target_names=['not rec','rec', 'very re']))
    print ('---------------------------------------------------------')
    print ('The Accuracy of RandomForestClassifier: ', rfc.score(x_train,y3_train))
    print ('The Accuracy of RandomForestClassifier: ', rfc.score(x_test,y3_test))
    print (classification_report(y3_test, rfc_y_pred))
    print ('---------------------------------------------------------')
    print ('The Accuracy of GradientBoostingClassifier: ', gbc.score(x_train,y3_train))
    print ('The Accuracy of GradientBoostingClassifier: ', gbc.score(x_test,y3_test))
    print (classification_report(y3_test, gbc_y_pred))
Esempio n. 59
0
def test_predictproba_hardvoting():
    """A hard-voting ensemble must refuse predict_proba with AttributeError."""
    members = [(name, LogisticRegression()) for name in ('lr1', 'lr2')]
    hard_voter = VotingClassifier(estimators=members, voting='hard')
    expected = "predict_proba is not available when voting='hard'"
    assert_raise_message(AttributeError, expected,
                         hard_voter.predict_proba, X)
Esempio n. 60
0
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None.

    Throughout the test an ensemble whose random forest has weight 0
    (``eclf1``) is compared with one whose random forest was replaced by
    ``None`` through ``set_params`` (``eclf2``).
    """
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    # Reference ensemble: 'rf' is present but carries weight 0
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 0, 0.5]).fit(X, y)

    # Ensemble under test: 'rf' is set to None before fitting
    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    # The None entry stays in `estimators` and get_params(), but no random
    # forest is among the fitted `estimators_`
    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(
        all([
            not isinstance(est, RandomForestClassifier)
            for est in eclf2.estimators_
        ]))
    assert_true(eclf2.get_params()["rf"] is None)

    # Same comparison after switching both ensembles to soft voting
    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    # Setting every estimator to None must make fit() raise ValueError
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(ValueError, msg,
                         eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    # Two-sample, two-class toy data so the expected outputs can be written out
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[0, 0.5]).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 0.5])
    eclf2.set_params(rf=None).fit(X1, y1)
    # eclf1 still holds both estimators, so its expected output has two
    # leading entries; eclf2 (rf=None) has only one
    assert_array_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]]))
    assert_array_equal(eclf2.transform(X1), np.array([[[1., 0.], [0., 1.]]]))
    # Switch to hard voting without refitting: the expected output has one
    # column per remaining estimator (two for eclf1, one for eclf2)
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))