Esempio n. 1
0
def run_voting_model(X, y, cv_split):
    """Cross-validate and fit hard- and soft-voting ensembles.

    Builds a VotingClassifier from a fixed subset of the default-parameter
    models returned by ``get_algorithms()``, evaluates both voting schemes
    with ``cross_validate`` and prints an accuracy summary for each.

    Returns
    -------
    tuple
        (estimator list, hard-vote CV results, soft-vote CV results)
    """
    # Model zoo with default hyper-parameters.
    MLA_dict = get_algorithms()
    # Keys kept: models exposing 'predict_proba' (required for the soft
    # vote) that are not perfectly correlated with another model.
    clf_keys = ("ada", "etc", "gbc", "rfc", "gpc", "lr",
                "bnb", "gnb", "knn", "svc", "lda", "qda")
    vote_est = [(key, MLA_dict[key]) for key in clf_keys]

    cv_results = []
    for label, scheme in (("Hard", "hard"), ("Soft", "soft")):
        voter = ensemble.VotingClassifier(estimators=vote_est, voting=scheme)
        scores = model_selection.cross_validate(voter,
                                                X,
                                                y,
                                                cv=cv_split,
                                                return_train_score=True,
                                                n_jobs=-1)
        voter.fit(X, y)

        print("{} Voting Training accuracy: {:.2f}".format(
            label, scores["train_score"].mean() * 100))
        print("{} Voting Test accuracy: {:.2f}".format(
            label, scores["test_score"].mean() * 100))
        print("{} Voting Test 3*std: +/- {:.2f}".format(
            label, scores["test_score"].std() * 100 * 3))
        print("-" * 10)
        cv_results.append(scores)

    return vote_est, cv_results[0], cv_results[1]
Esempio n. 2
0
def train_and_evaluate(complete_tag_count, prediction, predicted_class, nonpredicted_class):
    """Cross-validate a battery of classifiers plus soft/hard voting
    ensembles on the tag-count features and print accuracy for each.

    Parameters
    ----------
    complete_tag_count : array-like
        Feature rows, one per user.
    prediction : str
        Name of the target; when 'age', a predicted-gender column (from
        ``cross_val_predict`` on ``nonpredicted_class``) is appended to
        the features before scoring.
    predicted_class : array-like
        Labels of the target attribute.
    nonpredicted_class : array-like
        Labels of the auxiliary attribute (used only in the 'age' branch).

    NOTE(review): uses the long-removed ``sklearn.cross_validation`` API
    and ``min_samples_split=1`` (invalid on modern scikit-learn, which
    requires >= 2 or a float) — confirm the pinned sklearn version.
    """

    tag_total = np.array(complete_tag_count)
    predicted_final = np.array(predicted_class)
    nonpredicted_final = np.array(nonpredicted_class)

    clf1 = linear_model.LogisticRegression(n_jobs=9)
    clf2 = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=9)
    clf3 = ensemble.ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=1, random_state=0, criterion='entropy',
                                        n_jobs=9)
    clf4 = tree.DecisionTreeClassifier(max_depth=3)
    clf5 = naive_bayes.GaussianNB()
    clf6 = naive_bayes.BernoulliNB()
    clf7 = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=1, random_state=0)
    clf8 = ensemble.AdaBoostClassifier(n_estimators=100)
    clf9 = OneVsRestClassifier(clf4, n_jobs=9)
    clf10 = svm.SVC(kernel='linear', probability=True, C=0.05)

    eclf = ensemble.VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('ext', clf3), ('dt', clf4), ('gnb', clf5),
                                                 ('bnb', clf6), ('gbc', clf7), ('ada', clf8), ('1vr', clf9), ('svc', clf10)
                                                 ], voting='soft')
    eclf2 = ensemble.VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('ext', clf3), ('dt', clf4), ('gnb', clf5),
                                                 ('bnb', clf6), ('gbc', clf7), ('ada', clf8), ('1vr', clf9), ('svc', clf10)
                                                 ], voting='hard')

    cv = cross_validation.StratifiedKFold(predicted_final, 10)

    for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10, eclf, eclf2], ['Logistic Regression',
        'Random Forest', 'Extra Trees', 'Decision Tree','Gaussian NB', 'Bernoulli NB', 'Gradient Boosting Classifier',
        'AdaBoost', 'One vs Rest', 'SVC Linear', 'Soft Voting Ensemble', 'Hard Voting Ensemble']):
        if prediction == 'age':
            # Augment the features with a gender column predicted from the
            # auxiliary labels.
            results = cross_validation.cross_val_predict(clf, tag_total, nonpredicted_final, cv=cv)
            final_tags = []
            for i in range(len(tag_total)):
                user = tag_total[i]
                user_gender = results[i]
                if user_gender == 'M' or user_gender == 'MALE':
                    g = 0
                elif user_gender == 'F' or user_gender == 'FEMALE':
                    g = 1
                else:
                    # FIX: previously `g` was left unbound (NameError on the
                    # first unexpected label) or silently reused the previous
                    # user's value; fail loudly instead.
                    raise ValueError("Unexpected gender label: %r" % (user_gender,))
                user = np.append(user, g)
                final_tags.append(user)
        else:
            final_tags = tag_total
        scores = cross_validation.cross_val_score(clf, final_tags, predicted_final, cv=cv, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
Esempio n. 3
0
def define_model(modelname):
    """
    Return an unfitted scikit-learn classifier for the given model code.

    Input
    ----
    modelname: str
       short model code, e.g. 'LR' for Logistic Regression

    Output
    ------
    clf: model object
       Model Object Classifier

    Raises
    ------
    ConfigError
        If ``modelname`` is not one of the known codes.

    NOTE(review): the 'VC'/'VC2' ensembles use ``min_samples_split=1``,
    which modern scikit-learn rejects (must be >= 2 or a float) — confirm
    the pinned sklearn version.
    """
    if modelname == 'LR':
        return linear_model.LogisticRegression()
    elif modelname == 'NN':
        return neighbors.KNeighborsClassifier()
    elif modelname == 'DT':
        return tree.DecisionTreeClassifier()
    elif modelname == 'RF':
        return ensemble.RandomForestClassifier()
    elif modelname == 'NB':
        return naive_bayes.GaussianNB()
    elif modelname == 'SVM':
        return svm.SVC()
    elif modelname == 'ET':
        return ensemble.ExtraTreesClassifier()
    elif modelname == 'SGD':
        return linear_model.SGDClassifier()
    elif modelname == 'AB':
        # AdaBoost over decision stumps.
        return ensemble.AdaBoostClassifier(
            tree.DecisionTreeClassifier(max_depth=1)
        )
    elif modelname == 'GB':
        return ensemble.GradientBoostingClassifier()
    elif modelname == 'VC':
        return ensemble.VotingClassifier(estimators=[
            ('RFC', ensemble.RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=1, random_state=0)), ('ETC', ensemble.ExtraTreesClassifier(max_depth=None, max_features=5, n_estimators=10, random_state=0, min_samples_split=1)), ('ABC', ensemble.AdaBoostClassifier())],
            voting='soft')
    elif modelname == 'VC2':
        return ensemble.VotingClassifier(estimators=[
            ('LR', linear_model.LogisticRegression(C=0.1, random_state=1)), ('RFC', ensemble.RandomForestClassifier(max_depth=None, n_estimators=10, random_state=0, min_samples_split=1)), ('ETC', ensemble.ExtraTreesClassifier(max_depth=None, max_features=5, n_estimators=10, random_state=0, min_samples_split=1))],
            voting='soft')

    else:
        # FIX: the message previously formatted the undefined name `model`,
        # raising NameError instead of the intended ConfigError.
        raise ConfigError("Can't find the model: {}".format(modelname))
def get_classifiers(feature_extraction_technique, data, label, report):
    """Evaluate four base classifiers and five soft-voting ensembles.

    Each (name, model) pair is passed to ``training_and_evaluation``.
    random_state is fixed to 0 where applicable so the experiment can be
    replicated.
    """
    # Fresh-instance factories: every ensemble owns its own copies of the
    # base estimators, exactly as in the original literal list.
    def nb():
        return naive_bayes.MultinomialNB(alpha=1.0)

    def lr():
        return linear_model.LogisticRegression(C=1.0, max_iter=100, random_state=0)

    def svc():
        return svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True)

    def rf():
        return ensemble.RandomForestClassifier(n_estimators=100, criterion='gini', random_state=0)

    def soft_vote(estimators):
        return ensemble.VotingClassifier(estimators=estimators, voting='soft')

    classifiers = [
        ('NB', nb()),
        ('LR', lr()),
        ('SVM', svc()),
        ('RF', rf()),
        # Soft-voting ensembles over every size-3/size-4 subset used here.
        ('SVE1', soft_vote([('NB', nb()), ('LR', lr()),
                            ('SVM', svc()), ('RF', rf())])),
        ('SVE2', soft_vote([('NB', nb()), ('LR', lr()), ('SVM', svc())])),
        ('SVE3', soft_vote([('LR', lr()), ('SVM', svc()), ('RF', rf())])),
        ('SVE4', soft_vote([('NB', nb()), ('SVM', svc()), ('RF', rf())])),
        ('SVE5', soft_vote([('NB', nb()), ('LR', lr()), ('RF', rf())])),
    ]

    for name, model in classifiers:
        training_and_evaluation(feature_extraction_technique, name, model, data, label, report)
Esempio n. 5
0
def all_classifiers():
    """Return the list of (name, estimator) pairs to evaluate.

    Only the hard-voting ensemble is currently active; the commented-out
    entries below document previously tried alternatives.
    """
    soft_voting_classifiers = [
        ('RF', RandomForestClassifier(n_jobs=-1)),
        ('GB', ensemble.GradientBoostingClassifier()),
        ('LR', LogisticRegression(n_jobs=-1, solver='saga')),
        # ('GNB', GaussianNB())
    ]
    hard_voting_classifiers = [
        ('RF', RandomForestClassifier(n_jobs=-1)),
        ('GB', ensemble.GradientBoostingClassifier()),
        ('GNB', GaussianNB()),
        ('LR', LogisticRegression(n_jobs=-1, solver='saga')),
    ]
    hard_voter = ensemble.VotingClassifier(estimators=hard_voting_classifiers,
                                           voting='hard',
                                           n_jobs=-1)
    # Previously evaluated alternatives (kept for reference):
    # ('BalancedRandomForest', RandomForestClassifier(max_depth=None, class_weight="balanced", n_jobs=-1)),
    # ('RandomForest', RandomForestClassifier(max_depth=None, n_jobs=-1)),
    # ('GradientBoosting', gradient_booster()),
    # ('AdaBoost', AdaBoostClassifier()),
    # ('BalancedSVM', svm.SVC(class_weight='balanced')),
    # ('SVM', svm.SVC()),
    # ('GaussianNB', GaussianNB()),
    # ('LogisticRegression', LogisticRegression(n_jobs=-1, solver='saga')),
    # ('SoftVoting', ensemble.VotingClassifier(
    #     estimators=soft_voting_classifiers, voting='soft', n_jobs=-1)),
    return [('HardVoting', hard_voter)]
Esempio n. 6
0
 def fit(self, X, y):
     """Grid-search voting weights for a categorical+numerical NB ensemble.

     Builds two pipelines (categorical features -> BernoulliNB, numerical
     features -> GaussianNB), combines them in a VotingClassifier whose
     voting scheme is fixed to 'soft' through the parameter grid, and
     tunes the pair of voting weights with a 5-split stratified shuffle
     CV. The fitted GridSearchCV object is stored on ``self.clf``.

     NOTE(review): uses Python-2 print statements — this block predates
     Python 3.
     """
     # Split into categorical,numerical categories:
     self.cat_clf = pipeline.Pipeline((('cat-tf', CategoricalTransformer()),
                                       ('bnb', naive_bayes.BernoulliNB())))
     self.num_clf = pipeline.Pipeline(
         (('num-tf', NumericalTransformer()), ('gnb',
                                               naive_bayes.GaussianNB())))
     # Candidate weight pairs [a, 1-a] for a in {0.0, 0.1, ..., 1.0}.
     weights_range = [[
         a, 1.0 - a
     ] for a in [0., .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0]]
     voting_range = ['soft']
     param_grid = dict(voting=voting_range, weights=weights_range)
     print "Using param grid " + str(param_grid)
     # 5 stratified 80/20 shuffle splits; fixed seed for reproducibility.
     cv = model_selection.StratifiedShuffleSplit(n_splits=5,
                                                 test_size=0.2,
                                                 random_state=0)
     self.clf = ensemble.VotingClassifier(
         estimators=[('num-clf', self.num_clf), ('cat-clf', self.cat_clf)])
     # Wrap the ensemble in the grid search; self.clf now refers to it.
     self.clf = model_selection.GridSearchCV(self.clf,
                                             param_grid=param_grid,
                                             cv=cv,
                                             n_jobs=7)
     self.clf.fit(X, y)
     print "Best params: " + str(
         self.clf.best_params_) + " and corresponding score is " + str(
             self.clf.best_score_)
Esempio n. 7
0
def model_ensemble(train, test, label):
    """Fit six base classifiers plus a weighted soft-voting ensemble on the
    Titanic-style training frame and return ``test`` with a predicted
    'Survived' column appended.
    """
    base_models = [
        ('gradient boost', GradientBoostingClassifier(loss='deviance',
                                                      learning_rate=0.1,
                                                      n_estimators=100)),
        ('knn', KNeighborsClassifier(n_neighbors=3)),
        ('labelprop', sm.LabelPropagation(kernel='rbf',
                                          gamma=20,
                                          n_neighbors=3,
                                          alpha=1,
                                          max_iter=100,
                                          tol=0.0001)),
        ('svm', svm.SVC(kernel='rbf', probability=True)),
        ('logistic reg', linear_model.LogisticRegression()),
        ('rforest', ske.RandomForestClassifier(n_estimators=100,
                                               criterion="gini")),
    ]

    voter = ske.VotingClassifier(estimators=base_models,
                                 voting='soft',
                                 weights=[2, 4, 3, 5, 2, 3])

    # Fit each base model individually, then the ensemble.
    # NOTE(review): VotingClassifier clones and refits its estimators, so
    # the individual fits only matter if the base objects are used later.
    for _, clf in base_models:
        clf.fit(train[label], train["Survived"])
    voter.fit(train[label], train["Survived"])

    test_predict = pd.DataFrame.copy(test)
    test_predict["Survived"] = voter.predict(test_predict[label])
    return test_predict
Esempio n. 8
0
    def getAndScoreVotingEnsemble(self,
                                  trainingDataFrame,
                                  predictorColumns,
                                  labelColumn,
                                  votingMethod="hard"):
        """Cross-validate and fit a voting ensemble over ``self.MLA``.

        Parameters
        ----------
        trainingDataFrame : pandas.DataFrame
            Frame holding both predictors and the label column.
        predictorColumns : list
            Column names used as features.
        labelColumn : str
            Name of the target column.
        votingMethod : str
            'hard' or 'soft' (passed straight to VotingClassifier).
        """

        trainingInputs = trainingDataFrame[predictorColumns]
        #trainingInputs = preprocessing.normalize(trainingInputs, axis=0)

        trainingLabels = trainingDataFrame[labelColumn]

        # 10 random 70/30 splits with a fixed seed for reproducibility.
        cv_split = model_selection.ShuffleSplit(n_splits=10,
                                                test_size=.3,
                                                train_size=.7,
                                                random_state=0)

        voter = ensemble.VotingClassifier(estimators=self.MLA,
                                          voting=votingMethod)

        # FIX: 'train_score' is printed below, but cross_validate omits it
        # unless return_train_score=True (the default on modern sklearn).
        voter_cv = model_selection.cross_validate(voter,
                                                  trainingInputs,
                                                  trainingLabels,
                                                  cv=cv_split,
                                                  return_train_score=True)

        voter.fit(trainingInputs, trainingLabels)

        print("{} Voting Training mean Score: {:.2f}".format(
            votingMethod, voter_cv['train_score'].mean() * 100))
        print("{} Voting Test mean Score: {:.2f}".format(
            votingMethod, voter_cv['test_score'].mean() * 100))
        print("{} Voting Test Score 3*std: +/- {:.2f}".format(
            votingMethod, voter_cv['test_score'].std() * 100 * 3))
        print('-' * 10)
Esempio n. 9
0
def main(name):
    """Train a GradientBoostingClassifier on the combined train+validation
    split of dataset ``name`` and save its test predictions to
    'gdbt_<name>.npy'.

    NOTE(review): the voting ensemble (``model``) and the SVC grid search
    (``grid``) are constructed but never fitted or used — only ``model3``
    (gradient boosting) is trained; confirm whether that is intentional.
    Uses a Python-2 print statement.
    """
    data = Data(name)
    data.readData()
    model1 = SVC()
    model2 = RandomForestClassifier(n_estimators=100)
    model3 = Ensemble.GradientBoostingClassifier(n_estimators=100)
    model4 = KNeighborsClassifier()
    # Weighted voting ensemble over the four base models (unused below).
    model = Ensemble.VotingClassifier(estimators=[('svm', model1),
                                                  ('rf', model2),
                                                  ('gb', model3),
                                                  ('kn', model4)],
                                      weights=[3, 2, 2, 1])
    grid = GridSearchCV(estimator=model1, param_grid={'C': [0.5, 2, 10]}, cv=5)
    # Concatenate the train and validation folds into one training set.
    all_feature = np.concatenate((np.array(data.feature_train,
                                           dtype=np.float32),
                                  np.array(data.feature_validation,
                                           dtype=np.float32)))
    all_label = np.concatenate((np.array(data.label_train, dtype=np.float32),
                                np.array(data.label_validation,
                                         dtype=np.float32)))
    model3.fit(all_feature, all_label)
    # model2.fit(data.feature_train, data.label_train)
    # print "Best params: " , grid.best_estimator_.get_params()
    ans = model3.predict(data.test)
    print ans, sum(ans)
    np.save('gdbt_' + name + '.npy', ans)
Esempio n. 10
0
def train_classify(train_file, test_file):
    """Generate features for the train/test files, oversample the training
    set, then either write soft-voting-ensemble predictions to a file
    (when the module-level ``test`` flag is truthy) or plot per-classifier
    CV metrics for every model in the module-level ``models`` dict.

    NOTE(review): relies on module-level ``test``, ``models`` and
    ``candidate`` names not visible here — confirm their definitions.
    Uses a Python-2 print statement.
    """
    train_vectors, train_class, test_vectors = feature_generation(
        train_file, test_file)
    plot_distribution(train_class, train_file + ' Before sampling')
    # Balance the class distribution before training.
    train_vectors, train_class = over_sample(train_vectors, train_class)

    if test:
        # Soft-voting ensemble over four of the base models.
        eclf = ensemble.VotingClassifier(estimators=[
            ('nbm', models['Multinomial NB']),
            ('tree', models['Decision Tree']),
            ('rf', models['Random Forest']),
            ('lr', models['Logistic Regression']),
        ],
                                         voting='soft')
        preds = classify(eclf, train_vectors, train_class, test_vectors)
        # One ';;'-separated "index;;prediction" line per test sample.
        f = open('data/' + candidate + '_predictions' + '.txt', 'w+')
        for index, pred in enumerate(preds):
            f.write(str(index + 1) + ';;' + str(preds[index]) + '\n')
        f.close()
    else:
        metrics = []
        for index, model in enumerate(models):
            print "Classifying using", model
            accScore, precision, recall, f1score = classify(
                models[model], train_vectors, train_class, test_vectors)
            metrics.append({})
            metrics[index]['Classifier'] = model
            metrics[index]['accuracy'] = accScore
            metrics[index]['possitive f1score'] = f1score[0]
            metrics[index]['negative f1score'] = f1score[1]
        # Bar-plot the collected per-classifier metrics.
        pd.io.json.json_normalize(metrics).plot(kind='bar', x='Classifier')
        plt.title(train_file)
        plt.grid(True, axis='y')
        plt.ylim(ymax=1)
        plt.xticks(rotation=0)
Esempio n. 11
0
def fit_model(X_train, Y_train, X_2, Y_2, X_3, Y_3):
    """ Learn the classifier, prints metrics"""
    # Gradient Boosting Classifier: grid-search n_estimators via 5-fold CV.
    gb = Class_Fit(clf=ensemble.GradientBoostingClassifier)
    param_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
    gb.grid_search(parameters=param_grid, Kfold=5)
    gb.grid_fit(X=X_train, Y=Y_train)

    # Rebuild the booster with the best parameters and wrap it in a
    # (single-estimator) soft-voting classifier, then train it.
    gb_best = ensemble.GradientBoostingClassifier(**gb.grid.best_params_)
    voting_clf = ensemble.VotingClassifier(estimators=[('gb', gb_best)],
                                           voting='soft')
    voting_clf = voting_clf.fit(X_train, Y_train)

    # Baseline: default argmax predictions on the balanced sample.
    predictions_baseline = voting_clf.predict(X_2)  # baseline
    print("____________________")
    print('Balanced sampling baseline model metrics:')
    print_result(Y_2, predictions_baseline)

    # Same data, but with a 0.8 positive-probability decision threshold.
    thresholded_balanced = (voting_clf.predict_proba(X_2)[:, 1] >= 0.8).astype(bool)
    print('Balanced sampling threshold metrics :')
    print_result(Y_2, thresholded_balanced)

    # Thresholded and baseline predictions on the "prof-only" set.
    thresholded_prof = (voting_clf.predict_proba(X_3)[:, 1] >= 0.8).astype(bool)
    print('Only prof threshold metrics:')
    print_result(Y_3, thresholded_prof)

    baseline_prof = voting_clf.predict(X_3)
    print('Only prof metrics with baseline classifier:')
    print_result(Y_3, baseline_prof)
    return voting_clf
Esempio n. 12
0
def voting_ensemble():
    """Cross-validate, fit and report two voting ensembles on the
    module-level dataset (X, Y and the X_train/X_test splits)."""
    n_trees = 100

    # Ensemble 1: hard vote — predicted class labels, majority rule.
    hard_voter = ensemble.VotingClassifier(
        estimators=[
            ('lr', linear_model.LogisticRegression(random_state=1)),
            ('gbm', ensemble.GradientBoostingClassifier(random_state=1)),
        ],
        voting='hard')

    # Ensemble 2: soft vote — argmax of summed predicted probabilities,
    # recommended for an ensemble of well-calibrated classifiers.
    soft_voter = ensemble.VotingClassifier(
        estimators=[
            ('lda', discriminant_analysis.LinearDiscriminantAnalysis()),
            ('rf', ensemble.RandomForestClassifier(random_state=1,
                                                   n_estimators=n_trees,
                                                   max_features=3)),
        ],
        voting='soft')

    # (A weighted soft vote would be a natural third variant: some models
    # are more valuable than others.)
    models = [('Voting Classifier 1', hard_voter),
              ('Voting Classifier 2', soft_voter)]

    # Fit & evaluate each ensemble.
    for name, model in models:
        # Score with two different metrics via cross-validation.
        for scoring in ('accuracy', 'roc_auc'):
            cross_validation(name, model, X, Y, scoring)

        # Fit on the training split and predict the held-out test split.
        Y_pred = model.fit(X_train, Y_train).predict(X_test)

        # Classification report & confusion matrix need a separate
        # train/evaluate split.
        classification_report(name, Y_test, Y_pred)
        confusion_matrix(name, Y_test, Y_pred)
Esempio n. 13
0
 def fit(self, X, y):
     """Fit a voting ensemble of two per-column-type Naive Bayes pipelines.

     One branch transforms categorical columns and feeds BernoulliNB; the
     other transforms and scales numerical columns for GaussianNB. The
     combined (default-voting) classifier is stored on ``self.clf``.
     """
     # Categorical branch: transformer + Bernoulli NB.
     self.cat_clf = pipeline.Pipeline((
         ('cat-tf', CategoricalTransformer()),
         ('bnb', naive_bayes.BernoulliNB()),
     ))
     # Numerical branch: transformer + standard scaling + Gaussian NB.
     self.num_clf = pipeline.Pipeline((
         ('num-tf', NumericalTransformer()),
         ('scaler', preprocessing.StandardScaler()),
         ('gnb', naive_bayes.GaussianNB()),
     ))
     branches = [('num-clf', self.num_clf), ('cat-clf', self.cat_clf)]
     self.clf = ensemble.VotingClassifier(estimators=branches)
     self.clf.fit(X, y)
Esempio n. 14
0
def vote_comparison(vote_est, trainX, trainY):
    """Cross-validate and fit hard- and soft-voting ensembles over
    ``vote_est`` and print score summaries for both.

    NOTE(review): relies on a module-level ``cv_split`` — confirm it is
    defined in the enclosing module.
    """
    for label, scheme in (('Hard', 'hard'), ('Soft', 'soft')):
        voter = ensemble.VotingClassifier(estimators=vote_est, voting=scheme)
        voter_cv = model_selection.cross_validate(voter, trainX, trainY,
                                                  cv=cv_split,
                                                  return_train_score=True)
        voter.fit(trainX, trainY)

        print('{} Voting Training w/bin score mean: {:.2f}'.format(
            label, voter_cv['train_score'].mean() * 100))
        print('{} Voting Test w/bin score mean: {:.2f}'.format(
            label, voter_cv['test_score'].mean() * 100))
        print('{} Voting Test w/bin score 3*std: {:.2f}'.format(
            label, voter_cv['test_score'].std() * 100 * 3))
        print('-' * 10)
Esempio n. 15
0
 def model(self, **kwargs):
     """Build the voting ensemble used by this wrapper.

     Combines a tuned RBF-kernel SVM with a tuned random forest in a
     VotingClassifier (default, i.e. hard, voting). ``kwargs`` is
     accepted for interface compatibility but not used.
     """
     estimators = [
         ('svm', svm.SVC(C=20000, gamma=1e-3, kernel='rbf')),
         ('rfor', ensemble.RandomForestClassifier(criterion='entropy',
                                                  max_depth=17,
                                                  max_features='auto',
                                                  n_estimators=150,
                                                  random_state=0)),
     ]
     return ensemble.VotingClassifier(estimators)
Esempio n. 16
0
def hard_vote_tune(trainX, trainY, vote_est, test):
    """Fit a hard-voting ensemble over ``vote_est`` and predict ``test``.

    Cross-validation scores are still computed (their reporting prints
    were disabled upstream) but only the fitted ensemble's predictions
    are returned. Relies on a module-level ``cv_split``.
    """
    voter = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    voter_cv = model_selection.cross_validate(voter, trainX, trainY,
                                              cv=cv_split,
                                              return_train_score=True)
    voter.fit(trainX, trainY)
    return voter.predict(test)  # 68.899%
Esempio n. 17
0
def real_test(x_train, y_train, x_test, y_test, FINAL_ALGO):
    """Fit every classifier in the module-level FINAL_ALGOS dict plus a
    voting ensemble of all of them, and return a name->accuracy dict.

    NOTE(review): the ``FINAL_ALGO`` parameter is never used — the body
    reads the module-level ``FINAL_ALGOS`` instead; confirm which was
    intended.
    """
    results = {}
    voting_estimators = []
    # FIX: dict.iteritems() is Python-2-only; items() behaves the same on
    # both Python 2 and 3.
    for classfier_name, clf in FINAL_ALGOS.items():
        voting_estimators.append((classfier_name, clf))
        clf.fit(x_train, y_train)
        prediction = clf.predict(x_test)
        accuracy = metrics.accuracy_score(prediction, y_test)
        results[classfier_name] = accuracy
    # Ensemble of all the individually fitted classifiers.
    clf = ensemble.VotingClassifier(estimators=voting_estimators)
    clf.fit(x_train, y_train)
    accuracy = metrics.accuracy_score(clf.predict(x_test), y_test)
    results["Voting"] = accuracy
    return results
def machine_learning(ticker):
    """Train a voting ensemble on the features for ``ticker`` and return
    its held-out accuracy, printing accuracy and the prediction spread.
    """
    features, labels, _ = extract_features(ticker)
    # Hold out 25% of the samples for testing.
    split = model_selection.train_test_split(features, labels, test_size=0.25)
    features_train, features_test, labels_train, labels_test = split

    estimators = [('Linear_SVC', svm.LinearSVC()),
                  ('K_Neighbors', neighbors.KNeighborsClassifier()),
                  ('Random_Forest', ensemble.RandomForestClassifier())]
    classifier_a = ensemble.VotingClassifier(estimators)
    classifier_a.fit(features_train, labels_train)

    accuracy = classifier_a.score(features_test, labels_test)
    predictions = classifier_a.predict(features_test)
    print('Accuracy:', accuracy)
    print('Prediction Spread:', collections.Counter(predictions))
    return accuracy
Esempio n. 19
0
def SklearnVotingClassifier(X_train, Y_train, X_test):
    """
    Fit four base classifiers and a hard-voting ensemble, then predict
    labels for the (normalised) test set.

    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: List[numpy.ndarray] 
    """
    from sklearn import ensemble
    from sklearn import neighbors
    from sklearn import tree
    from sklearn import svm
    from sklearn import linear_model
    from sklearn.preprocessing import StandardScaler

    # Data normalization: fit the scaler on train+test stacked together so
    # both sets share one scaling.
    scaler = StandardScaler()
    norm_val = scaler.fit(np.vstack((X_train, X_test)))
    X_train = norm_val.transform(X_train)
    X_test = norm_val.transform(X_test)

    SVM_2 = svm.SVC(kernel='linear', C=1)
    SVM_2 = SVM_2.fit(X=X_train, y=Y_train)

    Logistic_model = linear_model.LogisticRegression()
    Logistic_model = Logistic_model.fit(X_train, Y_train)

    Decision_Tree = tree.DecisionTreeClassifier()
    Decision_Tree = Decision_Tree.fit(X_train, Y_train)

    KNN_2 = neighbors.KNeighborsClassifier(n_neighbors=5)
    KNN_2 = KNN_2.fit(X_train, Y_train)

    voting_classifier = ensemble.VotingClassifier(estimators=[
        ('SVM', SVM_2), ('LogisticRegression', Logistic_model),
        ('DecisionTree', Decision_Tree), ('KNN', KNN_2)
    ],
                                                  voting='hard')

    voting_classifier.fit(X_train, Y_train)
    vote_pred = voting_classifier.predict(X_test)
    # FIX: the original printed accuracy against an undefined name
    # ``y_true`` (guaranteed NameError at runtime); no ground-truth labels
    # for X_test are passed in here, so the accuracy print was removed.

    return ([vote_pred])
Esempio n. 20
0
def voting(X_tra, y_tra, X_val, y_val, index_no, classifier_num):
    """Build a VotingClassifier from globally configured estimator codes
    and evaluate it on the validation split via ``processLearning``.

    NOTE(review): depends on module-level configuration helpers (GVal,
    dVM, dataRegulationSKL, processLearning) not visible here; the
    semantics of the dVM[3400]/dVM[3401] entries are inferred from the
    inline comment below and should be confirmed.
    """

    #
    classifier_list = GVal.getPARA('classifier_list_PARA')
    # dVM[3400] = ['estimators', [21, 23, 25, 30, 31], [21, 23, 25, 30, 31]]
    estims = []
    for i in range(len(dVM[3400][2])):
        # Each configured code yields a (name, classifier) pair: the name
        # comes from classifier_list[code][1]; the classifier is the first
        # return value of calling classifier_list[<first two digits of the
        # code>][0](...) on the current data split.
        clf_temp = (classifier_list[dVM[3400][2][i]][1], classifier_list[int(
            str(dVM[3400][2][i])[0:2])][0](X_tra, y_tra, X_val, y_val,
                                           index_no, dVM[3400][2][i])[0])
        estims.append(clf_temp)

    # Re-shape/re-order the splits for scikit-learn (project helper).
    y_tra, X_tra, y_val, X_val, weights = dataRegulationSKL(
        y_tra, X_tra, y_val, X_val, index_no)
    # dVM[3401][2] holds the configured voting scheme ('hard'/'soft').
    clf = skemb.VotingClassifier(estimators=estims, voting=dVM[3401][2])
    clf.fit(X_tra, y_tra)

    return processLearning(clf, X_tra, y_tra, X_val, y_val)
Esempio n. 21
0
def majority_vote(x_train, y_train):
    """Score a voting ensemble of the FINAL_ALGOS classifiers with 10-fold
    cross-validation and return a DataFrame of per-fold accuracies.
    """
    result_df = pd.DataFrame()
    estimators = [(name, FINAL_ALGOS[name]) for name in FINAL_ALGOS.keys()]

    fold_iter = cross_validation.KFold(len(x_train),
                                       shuffle=True,
                                       n_folds=10,
                                       random_state=0)
    for foldnum, (train, val) in enumerate(fold_iter, start=1):
        tr_data, val_data, tr_targets, val_targets = helper.folds_to_split(
            x_train, y_train, train, val)
        # .as_matrix() is the pre-pandas-1.0 spelling of .values (kept for
        # compatibility with the old stack this code targets).
        tr_targets = tr_targets.as_matrix().ravel()
        val_targets = val_targets.as_matrix().ravel()

        clf = ensemble.VotingClassifier(estimators=estimators)
        clf.fit(tr_data, tr_targets)
        accuracy = metrics.accuracy_score(clf.predict(val_data), val_targets)
        result_df.loc[foldnum, "Voting"] = accuracy
    return result_df
Esempio n. 22
0
def voting_model():
    """Return an (unfitted) weighted soft-voting ensemble of five tuned
    classifiers: random forest, logistic regression, gradient boosting,
    extra trees and a decision tree."""
    forest = ensemble.RandomForestClassifier(
        n_estimators=750, criterion='gini', max_features='sqrt',
        max_depth=3, min_samples_split=4, min_samples_leaf=2,
        n_jobs=50, random_state=42, verbose=1)

    boosting = ensemble.GradientBoostingClassifier(
        n_estimators=900, learning_rate=0.0008, loss='exponential',
        min_samples_split=3, min_samples_leaf=2, max_features='sqrt',
        max_depth=3, random_state=42, verbose=1)

    extra_trees = ensemble.ExtraTreesClassifier(
        n_estimators=750, max_features='sqrt', max_depth=35, n_jobs=50,
        criterion='entropy', random_state=42, verbose=1)

    logistic = LogisticRegression(penalty='l1', C=2, max_iter=100,
                                  solver='liblinear', n_jobs=32)

    decision_tree = tree.DecisionTreeClassifier(criterion="entropy")

    # Soft vote, weighted to favour the booster and logistic regression.
    return ensemble.VotingClassifier(
        estimators=[('rf', forest), ('lr', logistic), ('gbm', boosting),
                    ('et', extra_trees), ('ds', decision_tree)],
        voting='soft', weights=[3, 4, 5, 2, 2], n_jobs=50)
Esempio n. 23
0
def main():
    """Train and evaluate a soft-voting RF+GB ensemble on CSV data.

    Reads the training set path from ``sys.argv[1]``. When VALIDATING
    is set, runs cross-validation; otherwise reads the test set from
    ``sys.argv[2]`` and prints the final score.
    """
    train = pd.read_csv(sys.argv[1])
    train = handle_missing_values(train)
    train = preprocess(train)

    # All parameters were optimized using grid search with
    # cross-validation (GridSearchCV).
    rfc = ensemble.RandomForestClassifier(
        n_estimators=100, max_depth=9, max_features=3,
        min_samples_leaf=1e-5, min_samples_split=1e-5,
        criterion='entropy', random_state=360)

    gbc = ensemble.GradientBoostingClassifier(
        n_estimators=275, max_depth=6, max_features=9,
        min_samples_leaf=1e-9, min_samples_split=1e-9,
        subsample=0.9, random_state=360)

    # Soft vote, weighting gradient boosting almost twice as heavily.
    clf = ensemble.VotingClassifier(
        estimators=[('rfc', rfc), ('gbc', gbc)],
        weights=[3.5, 6.5], voting='soft')

    x_train = train.drop(FTR_DVCAT, axis=1).values
    y_train = train[FTR_DVCAT].values

    if VALIDATING:
        cross_validate(clf, x_train, y_train)
        return

    # Held-out evaluation on the separately supplied test file.
    test = preprocess(pd.read_csv(sys.argv[2]))
    x_test = test.drop(FTR_DVCAT, axis=1).values
    y_test = test[FTR_DVCAT].values
    print(solve(clf, x_train, y_train, x_test, y_test))
Esempio n. 24
0
# Vectorize the raw movie-review text into TF-IDF features.
movie_tfidf = extract_features(movie_sentiment_data.data)

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    movie_tfidf, movie_sentiment_data.target, test_size=0.30, random_state=42)

# Fit four base classifiers and report their individual test accuracies.
# similar to nltk.NaiveBayesClassifier.train()
clf1 = linear_model.LogisticRegression()
clf1.fit(X_train, y_train)
print('Logistic Regression performance: {}'.format(clf1.score(X_test, y_test)))

clf2 = linear_model.SGDClassifier()
clf2.fit(X_train, y_train)
print('SGDClassifier performance: {}'.format(clf2.score(X_test, y_test)))

clf3 = naive_bayes.MultinomialNB()
clf3.fit(X_train, y_train)
print('MultinomialNB performance: {}'.format(clf3.score(X_test, y_test)))

clf4 = naive_bayes.BernoulliNB()
clf4.fit(X_train, y_train)
print('BernoulliNB performance: {}'.format(clf4.score(X_test, y_test)))

# Majority (hard) vote over the four base classifiers; hard voting works
# here because it never requires predict_proba from the estimators.
voting_model = ensemble.VotingClassifier(estimators=[('lr', clf1),
                                                     ('sgd', clf2),
                                                     ('mnb', clf3),
                                                     ('bnb', clf4)],
                                         voting='hard')
voting_model.fit(X_train, y_train)
print('Voting classifier performance: {}'.format(
    voting_model.score(X_test, y_test)))
Esempio n. 25
0
def _make_ensemble(classifiers, voting='hard'):
  """Wrap *classifiers* in a VotingClassifier, auto-naming them c0, c1, ...

  Args:
    classifiers: iterable of estimator instances.
    voting: 'hard' (majority vote) or 'soft' (probability-averaged).
  """
  named = []
  for index, clf in enumerate(classifiers):
    named.append(('c%d' % index, clf))
  return ensemble.VotingClassifier(named, voting=voting)
Esempio n. 26
0
result = model_selection.cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy of RF: {result.mean()*100:.2f}%')


# 2.3 ExtraTreesClassifier (ET)
# Idea: a Random Forest variant that trains each base learner on the
# full training set and randomizes both the node split threshold and
# the feature choice.
model = ensemble.ExtraTreesClassifier(n_estimators=100,
                                      max_features=4,
                                      random_state=3)
# BUG FIX: scikit-learn >= 0.24 raises ValueError when random_state is
# set while shuffle is False, so shuffling must be enabled explicitly.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
result = model_selection.cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy of ET: {result.mean()*100:.2f}%')


# 2.4 Voting ensemble
# Idea: combine several different base classifiers and predict by vote.
base1 = tree.DecisionTreeClassifier()
base2 = svm.SVC(gamma='auto')
base3 = naive_bayes.GaussianNB()

# Named (label, estimator) pairs, as VotingClassifier expects.
bases = [
    ('Decision Tree', base1),
    ('SVC', base2),
    ('Naive Bayes', base3),
]

model = ensemble.VotingClassifier(estimators=bases)
# Same shuffle fix as above (random_state requires shuffle=True).
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
result = model_selection.cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy of Voting: {result.mean()*100:.2f}%')
Esempio n. 27
0
    inplace=False)
titanic2.shape

# Training features are the rows of the combined frame that correspond
# to the original training set; target is the Survived column.
x_train = titanic2[0:titanic_train.shape[0]]
x_train.shape
x_train.info()
y_train = titanic_train['Survived']

#create estimators for voting classifier
dt_estimator = tree.DecisionTreeClassifier(random_state=100)  #Model1
rf_estimator = ensemble.RandomForestClassifier(random_state=100)  #Model2
ada_estimator = ensemble.AdaBoostClassifier(random_state=100)  #Model3

# Soft vote over the three models, weighting AdaBoost the highest.
voting_estimator = ensemble.VotingClassifier(estimators=[('dt', dt_estimator),
                                                         ('rf', rf_estimator),
                                                         ('ada', ada_estimator)
                                                         ],
                                             voting='soft',
                                             weights=[10, 20, 25])
# Grid keys use the '<estimator name>__<param>' convention to tune the
# base estimators through the VotingClassifier.
voting_grid = {
    'dt__max_depth': [3, 5, 7],
    'rf__n_estimators': [20],
    'rf__max_features': [5, 6],
    'rf__max_depth': [5],
    'ada__n_estimators': [10]
}
grid_voting_estimator = model_selection.GridSearchCV(voting_estimator,
                                                     voting_grid,
                                                     cv=10,
                                                     n_jobs=5)
grid_voting_estimator.fit(x_train, y_train)
# BUG FIX: GridSearchCV.grid_scores_ was removed in scikit-learn 0.20;
# cv_results_ is the replacement holding the per-candidate scores.
print(grid_voting_estimator.cv_results_)
Esempio n. 28
0
                                                    max_depth=6,
                                                    n_estimators=866,
                                                    subsample=0.95))

    RandomForest = (ensemble
                    .RandomForestClassifier(max_features=4,
                                            min_samples_leaf=3,
                                            n_estimators=424))

    estimators = [('svm', SVM),
                  ('ada', AdaBoost),
                  ('gb', GradientBoosting),
                  ('rf', RandomForest)]

    model = ensemble.VotingClassifier(estimators=estimators,
                                      voting='hard',
                                      n_jobs=4)

    model.fit(X_train, y_train)

    # There is 1 test sample without a Fare variable (sample #152). Since
    # all our algorithms use the Fare variable, we'll just ignore this sample
    # and mark it as not surviving

    X_test = np.delete(X_test, (152), axis=0)

    X_test = scale(X_test)
    y_test_predict = model.predict(X_test)
    y_test_predict = np.insert(y_test_predict, (152), 0, axis=0)

    predictions = np.column_stack([np.array(range(892, 1310)), y_test_predict])
Esempio n. 29
0
# In[ ]:

from sklearn import model_selection, ensemble, svm
import xgboost as xgb

# initialise classifiers
# Six base models sharing a fixed random_state for reproducibility.
rf_clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=0)
et_clf = ensemble.ExtraTreesClassifier(n_estimators=100, random_state=0)
gb_clf = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=0)
ada_clf = ensemble.AdaBoostClassifier(n_estimators=100, random_state=0)
svm_clf = svm.LinearSVC(C=0.1, random_state=0)
xgb_clf = xgb.XGBClassifier(n_estimators=100)

# Majority-vote ensemble over all six classifiers (default voting='hard',
# which LinearSVC supports since no predict_proba is needed).
e_clf = ensemble.VotingClassifier(estimators=[('xgb', xgb_clf), (
    'rf', rf_clf), ('et', et_clf), ('gbc', gb_clf), ('ada',
                                                     ada_clf), ('svm',
                                                                svm_clf)])

# score using cross validation
# 10-fold CV for every base model plus the voting ensemble itself;
# `features` and `target` are defined earlier in the file.
clf_list = [xgb_clf, rf_clf, et_clf, gb_clf, ada_clf, svm_clf, e_clf]
name_list = [
    'XGBoost', 'Random Forest', 'Extra Trees', 'Gradient Boosted', 'AdaBoost',
    'Support Vector Machine', 'Ensemble'
]

for clf, name in zip(clf_list, name_list):
    scores = model_selection.cross_val_score(clf, features, target, cv=10)
    print("Accuracy: %0.2f +/- %0.2f (%s 95%% CI)" %
          (scores.mean(), scores.std() * 2, name))

# **We choose SVM for as our predictor:**
Esempio n. 30
0
    
    ('rfc', ensemble.RandomForestClassifier()),
    
    #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
    ('knn', neighbors.KNeighborsClassifier()),
    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
   ('xgb', XGBClassifier()),
   ('lgbm',LGBMClassifier())

]

seed = 123
# 10 random 60/30 train/test splits; note ShuffleSplit does not stratify.
skf = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = seed )
#Hard Vote or majority rules
vote_hard = ensemble.VotingClassifier(estimators = vote_est , voting = 'hard')
# NOTE(review): return_train_score is not requested here, so the
# commented-out 'train_score' prints below would raise KeyError if
# re-enabled — pass return_train_score=True first.
vote_hard_cv = model_selection.cross_validate(vote_hard, data1_x_bin, data[Target], cv  = skf,scoring='f1')
vote_hard.fit(data1_x_bin, data[Target])
#print("Hard Voting Training w/bin score mean: {:.2f}". format(vote_hard_cv['train_score'].mean()*100)) 
print("Hard Voting Test w/bin score mean: {:.2f}". format(vote_hard_cv['test_score'].mean()*100))
print("Hard Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_hard_cv['test_score'].std()*100*3))
print('-'*10)


#Soft Vote or weighted probabilities
# Soft voting averages predict_proba, so every estimator in vote_est
# must expose predict_proba.
vote_soft = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft')
vote_soft_cv = model_selection.cross_validate(vote_soft, data1_x_bin, data[Target], cv  = skf,scoring='f1')
vote_soft.fit(data1_x_bin, data[Target])

#print("Soft Voting Training w/bin score mean: {:.2f}". format(vote_soft_cv['train_score'].mean()*100)) 
print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_soft_cv['test_score'].mean()*100))