Example 1
def test_svm_model(kernel,
                   training_examples,
                   training_labels,
                   C=1.0,
                   gamma='auto',
                   n_estimators=10):
    model = ensemble.BaggingClassifier(svm.SVC(kernel=kernel,
                                               gamma=gamma,
                                               random_state=RAND_SEED,
                                               probability=True),
                                       n_estimators=n_estimators,
                                       max_samples=0.632)
    model.fit(training_examples, training_labels)
    test_set, test_labels, test_idxs = make_test_set(training_examples,
                                                     training_labels)
    test_score = model.score(test_set, test_labels)
    get_true_false_positive_negative(model.predict(test_set), test_labels)
    return model, test_score
Example 2
def ModelSelection(train_data, features, label):
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)
    x_train, x_test, y_train, y_test = train_test_split(train_data[features],
                                                        train_data[label],
                                                        test_size=0.2)
    row_index = 0
    MLA_predict = pd.DataFrame(index=x_test.index)  # collect each algorithm's test-set predictions
    for alg in MLA:

        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)
        row_index += 1

    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
Example 3
    def __init__(self, df, run_prefix):
        #code that will prepare the data
        y = df.PHENO
        X = df.drop(columns=['PHENO'])

        # Split the data
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.3, random_state=42)  # 70:30
        IDs_train = X_train.ID
        IDs_test = X_test.ID
        X_train = X_train.drop(columns=['ID'])
        X_test = X_test.drop(columns=['ID'])

        # Saving the prepped data the other classes will need
        self.df = df
        self.run_prefix = run_prefix
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.IDs_train = IDs_train
        self.IDs_test = IDs_test

        # Where the results will be stored
        self.log_table = None
        self.best_algo = None
        self.algo = None
        self.rfe_df = None

        #The methods we will use
        self.algorithms = [
            linear_model.LogisticRegression(solver='lbfgs'),
            ensemble.RandomForestClassifier(n_estimators=100),
            ensemble.AdaBoostClassifier(),
            ensemble.GradientBoostingClassifier(),
            linear_model.SGDClassifier(loss='modified_huber'),
            svm.SVC(probability=True, gamma='scale'),
            neural_network.MLPClassifier(),
            neighbors.KNeighborsClassifier(),
            discriminant_analysis.LinearDiscriminantAnalysis(),
            discriminant_analysis.QuadraticDiscriminantAnalysis(),
            ensemble.BaggingClassifier(),
            xgboost.XGBClassifier()
        ]
Example 4
def all_classifiers():
    # Model Data
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]
    return MLA
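
A hypothetical way to exercise this helper, assuming X_train, X_test, y_train and y_test already exist elsewhere (placeholder names, not part of the original snippet):

for clf in all_classifiers():
    clf.fit(X_train, y_train)
    print(type(clf).__name__, clf.score(X_test, y_test))
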
Example 5
    def __init__(self, df, run_prefix, max_iter, cv_count):
        self.run_prefix = run_prefix
        self.max_iter = max_iter
        self.cv_count = cv_count

        self.y_tune = df.PHENO
        self.IDs_tune = df.ID
        self.X_tune = df.drop(columns=['PHENO', 'ID'])

        best_algo_name_in = run_prefix + '.best_algorithm.txt'
        best_algo_df = pd.read_csv(best_algo_name_in,
                                   header=None,
                                   index_col=False)
        self.best_algo = str(best_algo_df.iloc[0, 0])

        self.algorithms = [
            linear_model.LogisticRegression(),
            ensemble.RandomForestClassifier(),
            ensemble.AdaBoostClassifier(),
            ensemble.GradientBoostingClassifier(),
            linear_model.SGDClassifier(loss='modified_huber'),
            svm.SVC(probability=True),
            neural_network.MLPClassifier(),
            neighbors.KNeighborsClassifier(),
            discriminant_analysis.LinearDiscriminantAnalysis(),
            discriminant_analysis.QuadraticDiscriminantAnalysis(),
            ensemble.BaggingClassifier(),
            xgboost.XGBClassifier()
        ]
        self.log_table = None
        self.best_algo_name_in = None
        self.best_algo_df = None
        self.hyperparameters = None
        self.scoring_metric = None
        self.cv_tuned = None
        self.cv_baseline = None
        self.algo = None
        self.searchCVResults = None
        self.rand_search = None
        self.algo_tuned = None
        self.tune_out = None
Example 6
def testAllClassifiers(Xfile, yfile):
    X, Xtrain, Xtest, y, ytrain, ytest = loadAndSplitData(Xfile, yfile)
    clfs = [
        linear_model.Perceptron(max_iter=1000),
        neighbors.KNeighborsClassifier(15, weights='uniform'),
        linear_model.LogisticRegression(),
        tree.DecisionTreeClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.RandomForestClassifier(),
        svm.LinearSVC()
    ]

    clfNames = [
        "perceptron", "kNN, k=15", "logistic regression", "decision tree",
        "bagging", "boosting", "random forest", "support vector machines"
    ]

    for i, clf in enumerate(clfs):
        clf.fit(Xtrain, ytrain)
        print(clfNames[i] + " :", clf.score(Xtest, ytest))
Example 7
def constructModel(corpus, classList, features, modelOutput):
    """
    Trains a bagging ensemble of decision trees on the corpus.

    Args:
        corpus: A list of lists, containing the GC content, coverage, and class number.
        classList: A list of class names.
        features: List of variables used by each contig.
        modelOutput: Location to save the model as GraphViz DOT, or False to save no model.
    Returns:
        classifier: A BaggingClassifier object trained on the training split of the corpus.
    """
    corpus.sort()  # just in case
    X = []
    Y = []
    for item in corpus:
        X.append(item[:-1]) # all but the last item
        Y.append(item[-1]) # only the last item
    X_train, X_test, Y_train, Y_test = mscv.train_test_split(X, Y, test_size=0.3, random_state=0)
    # TODO: implement classifier testing and comparison, now only baggingClassifier is used as per paper
    #treeClassifier = tree.DecisionTreeClassifier()
    #treeClassifier = treeClassifier.fit(X_train, Y_train)
    #click.echo("Decision tree classifier built, score is %s out of 1.00" % treeClassifier.score(X_test, Y_test))
    baggingClassifier = ensemble.BaggingClassifier()
    baggingClassifier = baggingClassifier.fit(X_train, Y_train)
    click.echo("Bagging classifier built, score is %s out of 1.00" % baggingClassifier.score(X_test, Y_test))
    #forestClassifier = ensemble.RandomForestClassifier(n_estimators=10)
    #forestClassifier = forestClassifier.fit(X_train, Y_train)
    #click.echo("Random forest classifier built, score is %s out of 1.00" % forestClassifier.score(X_test, Y_test))
    #adaClassifier = ensemble.AdaBoostClassifier(n_estimators=100)
    #adaClassifier = adaClassifier.fit(X_train, Y_train)
    #click.echo("AdaBoost classifier built, score is %s out of 1.00" % adaClassifier.score(X_test, Y_test))
    #gradientClassifier = ensemble.GradientBoostingClassifier(n_estimators=100)
    #gradientClassifier = gradientClassifier.fit(X_train, Y_train)
    #click.echo("Gradient tree boosting classifier built, score is %s out of 1.00" % gradientClassifier.score(X_test, Y_test))
    if modelOutput:
        with open(modelOutput, 'w') as dotfile:
            # export_graphviz renders a single tree, so export the first tree of the fitted ensemble
            tree.export_graphviz(baggingClassifier.estimators_[0], out_file=dotfile, feature_names=features,
                                 class_names=classList, filled=True, rounded=True, special_characters=True)
    return baggingClassifier
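
For reference, a hypothetical call matching the docstring above (toy values, not taken from the original project):

toy_corpus = [[0.42, 30.0, 0], [0.55, 12.5, 1], [0.47, 22.0, 0], [0.61, 8.0, 1]]  # [GC, coverage, class]
toy_model = constructModel(toy_corpus, classList=["class_a", "class_b"],
                           features=["gc_content", "coverage"], modelOutput=False)
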
Example 8
def Cross(X, Y, typ=0, n=5):
    if typ == 0:
        print('Starting SVM')
        classifier = make_pipeline(
            preprocessing.StandardScaler(), svm.LinearSVC())
    elif typ == 1:
        print('Starting DTree')
        classifier = make_pipeline(
            preprocessing.StandardScaler(), tree.DecisionTreeClassifier())
    elif typ == 2:
        print('Starting RForest')
        classifier = make_pipeline(
            preprocessing.StandardScaler(), ensemble.RandomForestClassifier())
    elif typ == 3:
        print('Starting GaussianNB')
        classifier = make_pipeline(
            preprocessing.StandardScaler(), naive_bayes.GaussianNB())
    elif typ == 4:
        print('Starting AdaBoost')
        classifier = make_pipeline(
            preprocessing.StandardScaler(), ensemble.AdaBoostClassifier())
    elif typ == 5:
        print('Starting Bagging')
        classifier = make_pipeline(
            preprocessing.StandardScaler(), ensemble.BaggingClassifier())
    elif typ == 6:
        print('Starting ExTree')
        classifier = make_pipeline(
            preprocessing.StandardScaler(), ensemble.ExtraTreesClassifier())
    elif typ == 7:
        print('Starting GradBoost')
        classifier = make_pipeline(
            preprocessing.StandardScaler(), ensemble.GradientBoostingClassifier())
    else:
        return
    scores = cross_val_score(classifier, X, Y, cv=n)
    # print(scores)
    print(sum(scores) / n)
Example 9
def tryAllClassifers(Xtrain, Xtest, ytrain, ytest):
    # try with different classifiers
    _, axes = plt.subplots(3, 3, figsize=(14, 14))
    models = [
        neighbors.KNeighborsClassifier(n_neighbors=5),
        linear_model.LogisticRegression(),
        svm.SVC(),
        tree.DecisionTreeClassifier(),
        neural_network.MLPClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier()
    ]
    for ax, model in zip(axes.flatten(), models):
        clf = model.fit(Xtrain, ytrain)
        # clf = linear_model.LogisticRegression().fit(Xtrain, ytrain)
        metrics.plot_confusion_matrix(clf,
                                      Xtest,
                                      ytest,
                                      ax=ax,
                                      values_format='d')
        f1 = metrics.f1_score(y_true=ytest, y_pred=clf.predict(Xtest))
        ax.set(title=f'{type(clf).__name__}\nF1={f1:.2f}')
Example 10
    def Bagging(self):

        # model = ensemble.BaggingClassifier(svm.SVC(gamma=0.6, kernel='rbf',C=0.3))
        model = ensemble.BaggingClassifier(
            KNeighborsClassifier(n_neighbors=14), n_estimators=200)
        # 3. Train the model
        model.fit(self.x_train, self.y_train)
        # 4. Predict on the test set
        pred_y = model.predict(self.x_test)

        # 5. Evaluate the model
        score = round(metrics.accuracy_score(self.y_test, pred_y), 2)
        # 6. Apply the model to the new record
        pred = model.predict(self.pred)[0]
        if pred:
            pred = "-Survived-"
        else:
            pred = "-Died-"

        # Result string
        str1 = f"Prediction: {pred}"
        # Model evaluation string
        str2 = f"Bagging model accuracy: {score}"
        return str1, str2
Example 11
if __name__ == "__main__":
    # --------------- Data preparation --------------- #
    train = scipy.io.loadmat('./TrainGaborized.mat')
    public = scipy.io.loadmat('./PublicGaborized.mat')
    hidden = scipy.io.loadmat('./HiddenGaborized.mat')
    labeled_images = scipy.io.loadmat('./labeled_images.mat')
    train_labels = labeled_images['tr_labels']
    train_images = train['TrainImages']
    public_images = public['PublicImages']
    hidden_images = hidden['HiddenImages']
    # ------------------------------------------------ #
    svc = SVC(C=100, cache_size=500, class_weight='auto', coef0=0.0, degree=8, gamma=1e-4,
              kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
    engine = ensemble.BaggingClassifier(base_estimator=svc, n_estimators=50)
    engine.fit(train_images.T, train_labels.reshape(-1))
    # save_model(engine)
    public_predictions = engine.predict(public_images.T)
    hidden_predictions = engine.predict(hidden_images.T)
    predictions = public_predictions
    for hidden_pred in hidden_predictions:
        predictions = np.append(predictions, hidden_pred)
    file_name = "solution2"
    create_csv(file_name, predictions)
    create_mat(file_name, predictions)
    # Perform cross validation
    print("Starting cross validation...")
    kfold = cross_validation.KFold(train_labels.shape[0], n_folds=8, shuffle=True)
    scores = cross_validation.cross_val_score(engine, train_images.T, train_labels.reshape(-1), n_jobs=-1, cv=kfold)
    print('Cross validation performances: ', scores)
Example 12
                    annot_kws={'fontsize': 12})

    plt.title('Pearson Correlation of Features', y=1.05, size=15)


correlation_heatmap(data1)

# # Step 5: Model Data

# In[*]

#Machine Learning Algorithm (MLA) Selection and initialization
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(n_estimators=100),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #Naive Bayes
Example 13
def classifyAndTest():
    global currentDataFile 
    global trainedModel
    global featuresList
    global predictedLabels

    #Variables for Cost sensitive learning
    has_costs_columns_boolean = False
    costs = []

    #Read in the file as a pandas dataframe
    try:
        numColumns = len(currentDataFile.columns)
        X_input = currentDataFile.iloc[:,0:(numColumns - 1)]
        y = currentDataFile.iloc[:,(numColumns - 1): (numColumns)]
  
        if 'Costs' in X_input.columns:
            has_costs_columns_boolean = True
            costs = X_input['Costs'].to_numpy()
            del X_input['Costs']
            
        
        '''Deal with Categorical Features'''
        categoricals = X_input.select_dtypes(include=['object'])
    
        if (not categoricals.empty):
            ohe_categoricals = pd.get_dummies(X_input.select_dtypes(include=['object']).copy())
        else:
            ohe_categoricals = categoricals

        X = pd.concat([X_input.select_dtypes(exclude=['object']), ohe_categoricals],axis = 1)       
         
        featuresList = convert_dataframe_schema(X)   

        X = X.to_numpy()
        y = y.to_numpy()
    except:
        modelTrainingResults.insert(tk.END,"ERROR: Unable to process file\n")
        return
    
    if(costSensitiveToggle.get()):
        if not (has_costs_columns_boolean):
            modelTrainingResults.insert(tk.END,"ERROR: No costs for cost-sensitive learning\n") 
            return
        else:
            y_pandas = currentDataFile.iloc[:,(numColumns - 1): (numColumns)]
            weight_dict = construct_weight_vector_simple(costs,y_pandas,costSensitiveType.get())
            clf = tree.DecisionTreeClassifier(class_weight = weight_dict)
       
    else:   
        models = { 
            'SVM': svm.SVC(),
            'Random Forest': ensemble.RandomForestClassifier(),
            'Adaboost': ensemble.AdaBoostClassifier(),
            'Bagging': ensemble.BaggingClassifier(),
            'Gradient Boosting': GradientBoostingClassifier(loss = 'deviance', max_depth = 6, n_estimators = 100),
            'Decision Tree': tree.DecisionTreeClassifier()
            }  
     
        clf = models.get(modelChoice.get(),"Invalid choice of Model")
         
    '''Write Results '''
    skf = StratifiedKFold(n_splits=crossVals.get(), shuffle = True)
    stratifiedAccuracy = 0.0
    
    for train_indices, test_indices in skf.split(X, np.ravel(y)):
        clf_test = clf.fit(X[train_indices],np.ravel(y[train_indices]))
        y_pred = clf_test.predict(X[test_indices])
        stratifiedAccuracy += accuracy_score(y[test_indices], y_pred) *100
       
    start = time.time()
    trainedModel  = clf.fit(X,np.ravel(y))
    elapsed_time = (time.time() - start)
    
    predictedLabels = clf.predict(X)
    
    modelTrainingResults.insert(tk.END, "Results for " + str(dataFileName.get()) + " using " + str(modelChoice.get()) + "\n")
    modelTrainingResults.insert(tk.END, "Time to build model is " + str(elapsed_time) + " seconds\n" )
    modelTrainingResults.insert(tk.END, "Accuracy when trained on all data is " + str(clf.score(X, y) * 100) + "%\n" )
    modelTrainingResults.insert(tk.END, "Average accuracy over cross-validated sets is " + str(stratifiedAccuracy/crossVals.get()) + "%\n\n")
Example 14
def getClassifier(data, target):
    score = 0
    temp = 0

    # Classifier to use in BaggingClassifier
    classifier1 = ensemble.ExtraTreesClassifier(min_samples_split=3,
                                                n_estimators=10,
                                                max_features=4)

    # Classifier for GridSearch
    classifier = ensemble.BaggingClassifier(classifier1)

    # Params
    param_grid = {'n_estimators': range(5, 25)}
    #param_grid = {'n_estimators' : np.linspace(10,11, num = 2)}

    # GridSearch
    grid_search = sklearn.grid_search.GridSearchCV(
        classifier,
        param_grid,
        scoring=sklearn.metrics.make_scorer(accuracy_score),
        cv=5,
        n_jobs=4)
    grid_search.fit(data, target)
    clf = grid_search.best_estimator_

    # Print Estimator
    print(clf)

    # Print Cross of Validations Scores
    print(cross_val_score(clf, data, target, cv=5, scoring='accuracy'))

    # Print Mean of Cross Validations Scores
    temp = np.mean(cross_val_score(clf, data, target, cv=5,
                                   scoring='accuracy'))
    print("Built-in Cross-Validation: {} ".format(temp))

    # Martins Version of Cross Validation
    chunk_size = len(data) // CVSize  # integer division so the slice indices stay ints
    for x in range(CVSize):

        # These describe where to cut to get the cross-validation chunk
        first_step = x * chunk_size
        second_step = (x + 1) * chunk_size

        # Get the data parts we train on
        cross_data = np.vstack((data[:first_step], data[second_step:]))
        cross_target = np.append(target[:first_step], target[second_step:])

        # fit and save the coef
        clf.fit(cross_data, cross_target)

        # Slice out the held-out chunk
        sample_data = data[first_step:second_step]
        sample_target = target[first_step:second_step]

        # Score the model on the held-out chunk (accuracy, despite the RMSE naming in the original comments)
        pred = clf.predict(sample_data)
        acc = accuracy_score(sample_target, pred)
        score += acc

    score = score / CVSize

    print("Manual cross-validation accuracy: {} ".format(score))

    # Get global score
    #clf.fit(data, target)
    #pred = clf.predict(data)
    #RMSE = accuracy_score(target, pred)
    #print("RMSE on whole dataset {}".format(RMSE))

    # Return estimator/classifier
    return clf
Example 15
# generate the accuracy metric
metrics.accuracy_score(y_valid, y_pred)
# 0.7425373134328358

#-- AdaBoosted Model --########################################################

# AdaBoost does not support kNN

#-- Bagging Model -############################################################

# initiate the base model
tunebayes = naive_bayes.MultinomialNB(alpha=0, fit_prior=True)

# initiate bag model
bagbayes = ensemble.BaggingClassifier(base_estimator=tunebayes)

# save the parameter features to tune as a dictionary
params = {
    'n_estimators': [10, 50, 100, 200, 400, 800],
    'max_samples': [1.0, 0.9, 0.8, 0.7, 0.6],
    'max_features': [1.0, 0.8, 0.6, 0.4, 0.2],
    'random_state': [123]
}

# initiate the tuning procedure, optimise on accuracy
tunebagbayes = model_selection.GridSearchCV(estimator=bagbayes,
                                            param_grid=params,
                                            scoring='accuracy')

# tune the model
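
The snippet is truncated here; a hypothetical continuation of the tuning step (placeholder names, not part of the original code) might look like:

tunebagbayes.fit(X_train, y_train)   # X_train / y_train are placeholders for the training data
print(tunebagbayes.best_params_)     # best bagging configuration found by the grid search
print(tunebagbayes.best_score_)      # mean cross-validated accuracy of that configuration
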
Example 16
    },
    'model': ensemble.AdaBoostClassifier()
}

bagging = {
    'features': {
        'base_estimator':
        [tree.DecisionTreeClassifier(max_depth=8, random_state=random_seed)],
        # floats mean fractions of the samples/features; an integer 1 would mean a single sample or feature
        'max_samples': [1.0, 0.75, 0.5, 0.25],
        'max_features': [1.0, 0.75, 0.5],
        'n_estimators': [20],
        'bootstrap': [True, False],
        'bootstrap_features': [True, False],
        'random_state': [random_seed]
    },
    'model': ensemble.BaggingClassifier()
}

logit = {
    'features': {
        'C': [0.001, 0.01, 0.1, 1, 100, 1000, 10000],
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'class_weight': ['balanced'],  # 'auto' has been removed from scikit-learn
        'random_state': [random_seed]
    },
    'model': linear_model.LogisticRegression()
}

knn = {
    'features': {
        'n_neighbors': list(range(5, 25, 5)),
Example 17
    svm.SVC(kernel="rbf", C=1, probability=True),
    svm.SVC(kernel="rbf", C=0.1, probability=True),
    svm.SVC(kernel="rbf", C=0.025, probability=True),
    tree.DecisionTreeClassifier(),
    ensemble.RandomForestClassifier(n_estimators=200),  # chosen
    ensemble.AdaBoostClassifier(n_estimators=100),
    ensemble.AdaBoostClassifier(n_estimators=100, algorithm='SAMME.R'),
    ensemble.AdaBoostClassifier(n_estimators=100,
                                algorithm='SAMME.R',
                                learning_rate=1.2),
    ensemble.AdaBoostClassifier(n_estimators=200),
    ensemble.AdaBoostClassifier(n_estimators=200, algorithm='SAMME.R'),
    ensemble.AdaBoostClassifier(n_estimators=200,
                                algorithm='SAMME.R',
                                learning_rate=1.2),
    ensemble.BaggingClassifier(n_estimators=10),
    ensemble.BaggingClassifier(n_estimators=10, bootstrap=False),
    ensemble.BaggingClassifier(n_estimators=20),
    ensemble.BaggingClassifier(n_estimators=20, bootstrap=False),
    ensemble.BaggingClassifier(n_estimators=50),
    ensemble.BaggingClassifier(n_estimators=50, bootstrap=False),
    ensemble.BaggingClassifier(n_estimators=100),
    ensemble.BaggingClassifier(n_estimators=100, bootstrap=False),
    naive_bayes.GaussianNB(),
    naive_bayes.GaussianNB(priors=None),
    neural_network.MLPClassifier()
]


def print_full(x):
    pd.set_option('display.max_rows', len(x))
Example 18
pass_ratio = rawstat.iloc[:, 1] / rawstat.iloc[:, 2]
shot_ratio = rawstat.iloc[:, 3] / rawstat.iloc[:, 4]

ratio.append(pass_ratio)
ratio.append(shot_ratio)
ratio = np.array(ratio)
ratio = ratio.astype('float')

x_min, x_max = ratio[0].min() - 0.05, ratio[0].max() + 0.05
y_min, y_max = ratio[1].min() - 0.05, ratio[1].max() + 0.05
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                     np.arange(y_min, y_max, 0.01))

boost_tree = ensemble.AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=3)).fit(ratio.T, category)
bag_tree = ensemble.BaggingClassifier(
    tree.DecisionTreeClassifier(max_depth=3)).fit(ratio.T, category)

plt.figure(1)
fig, axarr = plt.subplots(1, 2)
for i in [0, 1]:
    decision_tree = tree.DecisionTreeClassifier(max_depth=i + 4).fit(
        ratio.T, category)
    tree_result = decision_tree.predict(np.c_[xx.ravel(),
                                              yy.ravel()]).reshape(xx.shape)

    axarr[i].pcolormesh(xx, yy, tree_result, cmap=plt.cm.Paired)
    axarr[i].scatter(pass_ratio[category == 0],
                     shot_ratio[category == 0],
                     c='r',
                     marker='o')
    axarr[i].scatter(pass_ratio[category == 1],
Example 19
    def get_skl_estimator(self, **default_parameters):
        return ensemble.BaggingClassifier(**default_parameters)
Example 20
              " and num_trees = " + str(tup[1]))
        h_ens = boosting(Xtrn, ytrn, max_depth=tup[0], num_stumps=tup[1])
        y_pred = [predict_ensemble_example(x, h_ens) for x in Xtst]
        confusion_matrix(y_pred, ytst)
        print()

    ######### Problem c - Scikit-learn
    #### Bagging
    l = [(3, 10), (3, 20), (5, 10), (5, 20)]
    print("-------Problem c - Scikit-learn bagging-------")
    for tup in l:
        print("Scikit-learn bagging with max_depth = " + str(tup[0]) +
              " and num_trees = " + str(tup[1]))
        cart = tree.DecisionTreeClassifier(max_depth=tup[0])
        num_trees = tup[1]
        model = ensemble.BaggingClassifier(base_estimator=cart,
                                           n_estimators=num_trees)
        clf = model.fit(Xtrn, ytrn)
        y_pred = clf.predict(Xtst)
        confusion_matrix(y_pred, ytst)
        print()

    #### Boosting
    l = [(1, 20), (1, 40), (2, 20), (2, 40)]
    print("-------Problem c - Scikit-learn AdaBoost-------")
    for tup in l:
        print("Scikit-learn AdaBoost with max_depth = " + str(tup[0]) +
              " and num_stumps = " + str(tup[1]))
        cart = tree.DecisionTreeClassifier(max_depth=tup[0])
        num_trees = tup[1]
        model = ensemble.AdaBoostClassifier(base_estimator=cart,
                                            n_estimators=num_trees)
Example 21
train_Acc = []
test_Acc = []
## Random Forest Classifier
clf = ensemble.RandomForestClassifier(n_estimators=numBaseClassifiers)
clf.fit(X_train, Y_train)
Y_predict_train_EM = clf.predict(X_train)
Y_predict_test_EM = clf.predict(X_test)
train_Acc.append(accuracy_score(Y_train, Y_predict_train_EM))
test_Acc.append(accuracy_score(Y_test, Y_predict_test_EM))
print(
    "Ensemble Method by Random Forest Classifier gives train accuracy: %f and test accuracy: %f "
    % (accuracy_score(Y_train, Y_predict_train_EM),
       accuracy_score(Y_test, Y_predict_test_EM)))
## Bagging Classifier
clf = ensemble.BaggingClassifier(
    DecisionTreeClassifier(max_depth=max_depth_EM),
    n_estimators=numBaseClassifiers)
clf.fit(X_train, Y_train)
Y_predict_train_EM = clf.predict(X_train)
Y_predict_test_EM = clf.predict(X_test)
train_Acc.append(accuracy_score(Y_train, Y_predict_train_EM))
test_Acc.append(accuracy_score(Y_test, Y_predict_test_EM))
print(
    "Ensemble Method by Bagging Classifier gives train accuracy: %f and test accuracy: %f "
    % (accuracy_score(Y_train, Y_predict_train_EM),
       accuracy_score(Y_test, Y_predict_test_EM)))
## Adaboost Classifier
clf = ensemble.AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=max_depth_EM),
    n_estimators=numBaseClassifiers)
clf.fit(X_train, Y_train)
Example 22
    ['PassengerId', 'Name', 'Age', 'Ticket', 'Cabin', 'Survived'],
    axis=1,
    inplace=False)
titanic2.shape

#Extract only train records 0:891
X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']

#oob score is computed as part of the model construction process
dt_estimator = tree.DecisionTreeClassifier()
#This is the actual bagging model
#base_estimator specifies which model to bag; here we build it from a decision tree classifier
bt_estimator = ensemble.BaggingClassifier(base_estimator=dt_estimator,
                                          random_state=2017)
#n_estimators is the number of trees to grow
#the base_estimator__ double-underscore prefix addresses parameters of the nested estimator
bt_grid = {'n_estimators': [5, 6], 'base_estimator__max_depth': [3, 4, 5]}

grid_bt_estimator = model_selection.GridSearchCV(bt_estimator,
                                                 bt_grid,
                                                 cv=10,
                                                 n_jobs=5)
grid_bt_estimator.fit(X_train, y_train)
print(grid_bt_estimator.grid_scores_)  #In scikit-learn version 0.18

print(grid_bt_estimator.best_score_)
print(grid_bt_estimator.best_params_)
print(grid_bt_estimator.score(X_train, y_train))
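
The comments above mention the out-of-bag (OOB) score, but the snippet never enables it; a minimal sketch of how it could be read, reusing the same X_train/y_train (a hypothetical addition, not part of the original code):

bt_oob = ensemble.BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=4),
                                    n_estimators=50,
                                    oob_score=True,   # estimate accuracy on the samples left out of each bootstrap
                                    random_state=2017)
bt_oob.fit(X_train, y_train)
print(bt_oob.oob_score_)
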
Example 23
count_vectorizer = feature_extraction.text.CountVectorizer()

train_vectors = count_vectorizer.fit_transform(train["text"])
print(train_vectors)
test_vectors = count_vectorizer.transform(test["text"])
"""
clf = linear_model.RidgeClassifier()

scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=3, scoring="f1")
print(scores)
"""

Methodes = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    #gaussian_process.GaussianProcessClassifier(),

    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.LogisticRegression(C=1000, random_state=0,
                                    solver='liblinear'),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
Example 24
import numpy as np
from sklearn import preprocessing, neighbors, ensemble, decomposition, model_selection
import pandas as pd
import pickle

f = open('knn.pickle','wb')
df = pd.read_csv('voice.csv')
df.replace('?',-99999, inplace=True)


X = np.array(df[['meanfun','Q25','sd','IQR','sfm','meanfreq','mode']])
y = np.array(df['label'])
'''pca = decomposition.PCA()
X = pca.fit_transform(X)'''
gender_encoder = preprocessing.LabelEncoder()
y = gender_encoder.fit_transform(y)
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2,random_state=5)

clf = ensemble.BaggingClassifier(neighbors.KNeighborsClassifier(),max_features=7)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
pickle.dump(clf,f)
f.close()
Example 25
from sklearn import neighbors
from sklearn.grid_search import ParameterGrid
from datetime import timedelta
import matplotlib.pyplot as plt
from scipy import optimize
#dictionary of models and parameters - inspiration from
#https://github.com/rayidghani/magicloops/blob/master/simpleloop.py

MODELS = {
    'decision_tree': tree.DecisionTreeClassifier(),
    'logistic_regression': linear_model.LogisticRegression(),
    'knn': neighbors.KNeighborsClassifier(),
    'random_forest': ensemble.RandomForestClassifier(),
    'support_vector_machine': svm.SVC(),
    'boosting': ensemble.AdaBoostClassifier(),
    'bagging': ensemble.BaggingClassifier()
}

PARAMS = {
    'decision_tree': {
        'max_depth': [1, 3, 5, 8, 20]
    },
    'logistic_regression': {
        'C': [0.001, 0.01, 0.1, 1, 10]
    },
    'knn': {
        'n_neighbors': [5, 10, 25]
    },
    'random_forest': {
        'n_estimators': [1, 2, 3, 4, 10]
    },
Example 26
""" 2. Bagging

    · Clearly, a boosting strategy is strongly influenced by the training set;
      to obtain better generalization, the individual classifiers should be as independent as possible.

    · Bagging strategy:
      Step 1: Sample the given dataset to obtain N sub-datasets (which may overlap).
      Step 2: Train N individual classifiers on the N sub-datasets.
      Step 3: Combine the individual classifiers in parallel; for classification, predict by majority vote.
"""

''' 2.1 BaggingClassifier (implements the basic bagging strategy) '''
base = tree.DecisionTreeClassifier()
model = ensemble.BaggingClassifier(base_estimator=base,
                                   n_estimators=10,
                                   random_state=1)
# shuffle=True so that random_state actually takes effect (recent scikit-learn raises an error otherwise)
kfold = model_selection.KFold(n_splits=20, random_state=1, shuffle=True)
result = model_selection.cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy of Bagging: {result.mean()*100:.2f}%')
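
# A rough sketch of the three bagging steps described in the docstring above, in plain numpy
# (illustrative only; it assumes X and y are numpy arrays with non-negative integer class labels
#  and that numpy is imported as np):
#
#   rng = np.random.default_rng(1)
#   members = []
#   for _ in range(10):
#       idx = rng.integers(0, len(X), size=len(X))                           # Step 1: bootstrap sample
#       members.append(tree.DecisionTreeClassifier().fit(X[idx], y[idx]))    # Step 2: train one classifier
#   votes = np.array([m.predict(X) for m in members])
#   y_vote = np.apply_along_axis(lambda c: np.bincount(c).argmax(), 0, votes)  # Step 3: majority vote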


''' 2.2 RandomForestClassifier: Random Forest (RF)
    Idea: on top of bagging, pick a random subset of features as split candidates when growing each tree
'''
model = ensemble.RandomForestClassifier(n_estimators=30, random_state=1)
kfold = model_selection.KFold(n_splits=10, random_state=1, shuffle=True)
result = model_selection.cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy of RF: {result.mean()*100:.2f}%')

Example 27
test_accuracy = clf.score(test_array, test_label)

print("--- Decision Tree Classifier ---")
print("tree_depth:", tree_depth)
print("train accuracy:", train_accuracy)
#print("validate accuracy:", validate_accuracy)
print("test accuracy:", test_accuracy)
print("")

# Bagged Decision Tree
from sklearn import ensemble
tree_depth = 3
est = 15
clf = ensemble.BaggingClassifier(
    tree.DecisionTreeClassifier(max_depth=tree_depth),
    max_samples=1.0,
    max_features=1.0,
    n_estimators=est)

clf = clf.fit(train_array, train_label)

train_accuracy = clf.score(train_array, train_label)
#validate_accuracy = clf.score(validate_array, validate_label)
test_accuracy = clf.score(test_array, test_label)

print("--- Bagging Tree Classifier ---")
print("n_estimators:", est)
print("train accuracy:", train_accuracy)
#print("validate accuracy:", validate_accuracy)
print("test accuracy:", test_accuracy)
print("")
Example 28
# coding=utf-8
"""Comparison of various classifiers acting alone and inside a bagging ensemble."""

from sklearn import datasets, model_selection, metrics, tree, ensemble

if __name__ == "__main__":
    print("Loading data...")
    X, y = datasets.load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

    print("Fitting classifiers...")
    t = tree.DecisionTreeClassifier()
    t.fit(X_train, y_train)

    e = ensemble.BaggingClassifier(tree.DecisionTreeClassifier(),
                                   n_estimators=35,
                                   max_features=0.5,
                                   max_samples=0.5)
    e.fit(X_train, y_train)

    print("Evaluating classifiers...")

    print("#" * 128)
    print("Decision tree:")
    print("Test:")
    print(metrics.classification_report(y_test, t.predict(X_test)))
    print(metrics.confusion_matrix(y_test, t.predict(X_test)))
    print("Training:")
    print(metrics.classification_report(y_train, t.predict(X_train)))
    print(metrics.confusion_matrix(y_train, t.predict(X_train)))

    print("#" * 128)
Example 29
stkf = sms.StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

C_space = np.logspace(-3, 2, 6)
for c in tqdm.tqdm(C_space):
    lr = slm.LogisticRegression(C=c, random_state=1)
    print(
        c,
        sms.cross_val_score(lr, R_train_2, y_train_2, scoring='accuracy',
                            cv=stkf).mean())

#  I don't expect much from the logit model, so tune it right away

### LR
lr = slm.LogisticRegression(C=0.1, random_state=1)
bg_lr = se.BaggingClassifier(base_estimator=lr,
                             n_estimators=100,
                             random_state=1,
                             n_jobs=1)

params = {
    'max_features': [3, 6, 12, 24, 48, 96, 192, 384],
    'max_samples': [0.5, 0.75, 0.9]
}
rs_lr = sms.RandomizedSearchCV(estimator=bg_lr,
                               n_jobs=2,
                               cv=stkf,
                               verbose=2,
                               param_distributions=params,
                               scoring='accuracy',
                               n_iter=20,
                               random_state=1)
rs_lr.fit(R_train_2, y_train_2)
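
A hypothetical follow-up (not in the original snippet) to inspect the tuned bagged logistic regression:

print(rs_lr.best_params_)   # best max_features / max_samples combination found
print(rs_lr.best_score_)    # mean cross-validated accuracy of that combination
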
Example 30
# Predict probability on test data
y_pred_proba_ada = ada.predict_proba(X_test)
# Accuracy metrics (log-loss)
logloss = metrics.log_loss(y_test, y_pred_proba_ada)
print('Log-loss: {:.6f}'.format(logloss))


# ### Bagging classifier

# In[ ]:

# Bagging classifier
# the base estimator is a decision tree if not stated otherwise
bagg = ensemble.BaggingClassifier(base_estimator=ensemble.ExtraTreesClassifier(n_estimators=50, 
                                  criterion='entropy', max_depth=5), n_estimators=100, 
                                  max_samples=0.6, max_features=0.8, oob_score=True, n_jobs=-1)
# Fit
bagg.fit(X_train, y_train)


# In[ ]:

# Predict probability on test data
y_pred_proba_bagg = bagg.predict_proba(X_test)
# Accuracy metrics (log-loss)
logloss = metrics.log_loss(y_test, y_pred_proba_bagg)
print('Log-loss: {:.6f}'.format(logloss))


# ### Stochastic Gradient Descent