Example #1
def determined_train_and_predict(train_datas, train_labels, test_ids, test_datas):
    classifier = AdaBoostClassifier(RandomForestClassifier(n_estimators=300), algorithm="SAMME", n_estimators=400)
#     classifier = RandomForestClassifier(n_estimators=300)
    classifier.fit(train_datas, train_labels)

    predict_labels = classifier.predict(test_datas)
    result_dic = {}
    result_dic['Id'] = test_ids
    result_dic['Response'] = predict_labels
    out_file_content = pd.DataFrame(result_dic)
    out_file_content.to_csv('sample3.csv', index=False)
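The function above omits its imports and expects pre-split data; a minimal sketch of the assumed setup (standard scikit-learn/pandas import paths, hypothetical toy arrays):

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

rng = np.random.RandomState(0)
train_datas = rng.rand(100, 5)           # hypothetical feature matrix
train_labels = rng.randint(0, 2, 100)    # hypothetical binary targets
test_datas = rng.rand(20, 5)
test_ids = np.arange(1, 21)

# Slow: 400 boosting rounds over 300-tree forests; writes sample3.csv with Id/Response columns.
determined_train_and_predict(train_datas, train_labels, test_ids, test_datas)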
Example #2
def AB(pth):
    # Load the precomputed bag-of-features descriptors for the training images.
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    # idf is computed but never used below; image_paths is assumed to be defined at module level.
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')

    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelAB = AdaBoostClassifier(n_estimators=100)
    modelAB.fit(train_desc, np.array(train_labels))  # train_labels and img_classes are module-level globals

    joblib.dump((modelAB, img_classes, stdSlr), pth + "/ab-bof.pkl", compress=3)
    test(pth, "ab-")
Example #3
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #4
def ada():
    pipeline = Pipeline([('count_vectorizer',
                          CountVectorizer(binary=True,
                                          ngram_range=(1, 2),
                                          max_features=15000,
                                          stop_words=stopwords)),
                         ('clf', AdaBoostClassifier())])
    train_report(pipeline)
Example #5
class AdaBoostClassifierImpl():
    def __init__(self,
                 base_estimator=None,
                 n_estimators=50,
                 learning_rate=1.0,
                 algorithm='SAMME.R',
                 random_state=None):
        self._hyperparams = {
            'base_estimator': base_estimator,
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'algorithm': algorithm,
            'random_state': random_state
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
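A hedged usage sketch for the wrapper above; it assumes SKLModel is an alias for sklearn.ensemble.AdaBoostClassifier (which the hyperparameter names suggest but the excerpt does not show) and an older scikit-learn release that still accepts the base_estimator and SAMME.R names:

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier as SKLModel  # assumed alias

X, y = make_classification(n_samples=200, n_features=8, random_state=0)
impl = AdaBoostClassifierImpl(n_estimators=25, random_state=0)  # class defined above
impl.fit(X, y)
print(impl.predict(X[:5]))            # class labels
print(impl.predict_proba(X[:5]))      # per-class probabilities
print(impl.decision_function(X[:5]))  # boosted decision scores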
Example #6
class AdaBoostClassifierImpl():

    def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
        if isinstance(base_estimator, lale.operators.Operator):
            if isinstance(base_estimator, lale.operators.IndividualOp):
                base_estimator = base_estimator._impl_instance()._wrapped_model
            else:
                raise ValueError("If base_estimator is a Lale operator, it needs to be an individual operator. ")
        self._hyperparams = {
            'base_estimator': base_estimator,
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'algorithm': algorithm,
            'random_state': random_state}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
Example #7
def check_classifiers(n_samples=10000):
    """
    This function is not tested by default, it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_features = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = GaussianNB()

    uBoost_SAMME = uBoostClassifier(
        uniform_features=uniform_features,
        uniform_label=1,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        algorithm="SAMME")

    uBoost_SAMME_R = uBoostClassifier(
        uniform_features=uniform_features,
        uniform_label=1,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        algorithm="SAMME.R")

    uBoost_SAMME_R_threaded = uBoostClassifier(
        uniform_features=uniform_features,
        uniform_label=1,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        n_threads=3,
        subsample=0.9,
        algorithm="SAMME.R")

    clf_dict = OrderedDict({
        "Ada": ada,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R,
        "uBOOST.R2": uBoost_SAMME_R_threaded
    })

    cvms = {}
    for clf_name, clf in clf_dict.items():
        clf.fit(trainX, trainY)
        p = clf.predict_proba(testX)
        metric = KnnBasedCvM(uniform_features=uniform_features)
        metric.fit(testX, testY)
        cvms[clf_name] = metric(testY, p, sample_weight=np.ones(len(testY)))

    assert cvms['uBOOST'] < cvms['ada']
    print(cvms)
Example #8
def classify(X,y,cv):
    #clf = DecisionTreeClassifier(criterion='entropy',min_samples_split=10,random_state=5)
    #clf = RandomForestClassifier(n_estimators=1000)
    clf = AdaBoostClassifier()
    #clf = ExtraTreesClassifier()

    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score) / score.shape[0]))
    clf = clf.fit(X,y)

    #print 'Feature Importances'
    #print clf.feature_importances_
    #X = clf.transform(X,threshold=.3)
    
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(clf.predict(X)))
    fp=0
    tp=0
    fn=0
    tn=0
    for a in range(len(y)):
        if y[a]==preds[a]:
            if preds[a]==0:
                tn+=1
            elif preds[a]==1:
                tp+=1
        elif preds[a]==1:fp+=1
        elif preds[a]==0:fn+=1
    
    print('correct positives:', tp)
    print('correct negatives:', tn)
    print('false positives:', fp)
    print('false negatives:', fn)
    print('precision:', float(tp) / (tp + fp))
    print('recall (tp)/(tp+fn):', float(tp) / (tp + fn))
    print('false positive rate (fp)/(fp+tn):', float(fp) / (fp + tn))
    print('false positive rate2 (fp)/(fp+tp):', float(fp) / (fp + tp))
    print('prediction accuracy: %s%s\n' % (100 * float(tp + tn) / (tp + tn + fp + fn), '%'))
    return clf
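The manual TP/TN/FP/FN counting in classify() can be reproduced with scikit-learn's own metrics; a self-contained sketch on synthetic data:

from collections import Counter

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

X, y = make_classification(n_samples=300, random_state=0)
clf = AdaBoostClassifier().fit(X, y)
preds = clf.predict(X)

print('predictions counter', Counter(preds))
tn, fp, fn, tp = confusion_matrix(y, preds).ravel()   # same counts as the manual loop
print('tp:', tp, 'tn:', tn, 'fp:', fp, 'fn:', fn)
print(classification_report(y, preds))                # precision/recall/accuracy in one call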
Example #9
 def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
     if isinstance(base_estimator, lale.operators.Operator):
         if isinstance(base_estimator, lale.operators.IndividualOp):
             base_estimator = base_estimator._impl_instance()._wrapped_model
         else:
             raise ValueError("If base_estimator is a Lale operator, it needs to be an individual operator. ")
     self._hyperparams = {
         'base_estimator': base_estimator,
         'n_estimators': n_estimators,
         'learning_rate': learning_rate,
         'algorithm': algorithm,
         'random_state': random_state}
     self._wrapped_model = SKLModel(**self._hyperparams)
Example #10
 def __init__(self,
              base_estimator=None,
              n_estimators=50,
              learning_rate=1.0,
              algorithm='SAMME.R',
              random_state=None):
     self._hyperparams = {
         'base_estimator': base_estimator,
         'n_estimators': n_estimators,
         'learning_rate': learning_rate,
         'algorithm': algorithm,
         'random_state': random_state
     }
     self._wrapped_model = SKLModel(**self._hyperparams)
Example #11
def get_feature_ranking(X_train, y_train):
    print("feature ranking running....-> LogisticRegression")
    model1 = LogisticRegression(max_iter=500)
    rfe = RFECV(estimator=model1,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    logr_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        logr_ranking.append([d, x])
    logr_ranking = pd.DataFrame(logr_ranking, columns=['features1', 'logr'])
    logr_ranking.sort_values('features1', inplace=True)

    print("feature ranking running....-> GradientBoostingClassifier")
    model2 = GradientBoostingClassifier()
    rfe = RFECV(estimator=model2,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    gboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        gboost_ranking.append([d, x])
    gboost_ranking = pd.DataFrame(gboost_ranking,
                                  columns=['features2', 'gboost'])
    gboost_ranking.sort_values('features2', inplace=True)

    print("feature ranking running....-> AdaBoostClassifier")
    model3 = AdaBoostClassifier()
    rfe = RFECV(estimator=model3,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    adaboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        adaboost_ranking.append([d, x])
    adaboost_ranking = pd.DataFrame(adaboost_ranking,
                                    columns=['features3', 'adaboost'])
    adaboost_ranking.sort_values('features3', inplace=True)

    feature_sum = logr_ranking['logr'] + gboost_ranking[
        'gboost'] + adaboost_ranking['adaboost']
    df_ranked = pd.concat([logr_ranking['features1'], feature_sum], axis=1)
    df_ranked.sort_values(0, inplace=True)

    return df_ranked
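A hypothetical call to get_feature_ranking(); the function reads X_train.columns, so it needs a pandas DataFrame, and the column names and toy data below are made up:

import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=6, n_informative=3, random_state=1)
X = pd.DataFrame(X, columns=['f%d' % i for i in range(6)])

ranked = get_feature_ranking(X, pd.Series(y))  # relies on the imports of the original module
print(ranked)  # features with the lowest summed rank come first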
Example #12
def check_classifiers(n_samples=10000, output_name_pattern=None):
    """
    This function is not tested by default, it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_variables = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = HidingClassifier(train_variables=trainX.columns[1:],
                                   base_estimator=GaussianNB())

    uBoost_SAMME = uBoostClassifier(uniform_variables=uniform_variables,
                                    n_neighbors=50,
                                    efficiency_steps=5,
                                    n_estimators=50,
                                    algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(uniform_variables=uniform_variables,
                                      n_neighbors=50,
                                      efficiency_steps=5,
                                      n_estimators=50,
                                      algorithm="SAMME.R")

    clf_dict = ClassifiersDict({
        "Ada": ada,
        "Ideal": ideal_bayes,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R
    })

    clf_dict.fit(trainX, trainY)

    predictions = Predictions(clf_dict, testX, testY)
    # predictions.print_mse(uniform_variables, in_html=False)
    print(predictions.compute_metrics())

    predictions.sde_curves(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "mse_curves", bbox="tight")
    _ = pl.figure()
    predictions.learning_curves()
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "learning_curves", bbox="tight")
    predictions.efficiency(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "efficiency_curves", bbox="tight")
Example #13
def defaultModels(df_xmat, df_ymat_cat):

    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]

    cv = StratifiedKFold(n_splits=10)

    res = []

    for clf in classifiers:

        print('processing...' + str(clf)[:10])

        metrics_cv = []

        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):

            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]

            clf.fit(X_train, y_train)

            metrics_cv.append(clf.score(X_test, y_test))

        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])

    return res
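For a single estimator, the inner loop above matches scikit-learn's cross_val_score; a brief sketch on synthetic data, AdaBoost only:

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

X, y = make_classification(n_samples=300, random_state=0)
scores = cross_val_score(AdaBoostClassifier(), X, y,
                         cv=StratifiedKFold(n_splits=10), scoring='accuracy')
print(scores.mean(), scores.std())  # the same mean/std defaultModels() records per classifier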
Example #14
    def __init__(self):
        self.random_rate=33
        clf1=SVC(C=1.0,random_state=33)
        clf2=XGBClassifier(n_estimators=220,learning_rate=0.2,min_child_weight=2.3)
        clf3=RandomForestClassifier(n_estimators=80,random_state=330,n_jobs=-1)
        clf4=BaggingClassifier(n_estimators=40,random_state=101)
        clf5=AdaBoostClassifier(n_estimators=70,learning_rate=1.5,random_state=33)
        clf6=GradientBoostingClassifier(n_estimators=250,learning_rate=0.23,random_state=33)

        clf7=XGBClassifier(n_estimators=100,learning_rate=0.12,min_child_weight=1)


        base_model=[
            ['svc',clf1],
            ['xgbc',clf2],
            ['rfc',clf3],
            ['bgc',clf4],
            ['adbc',clf5],
            ['gdbc',clf6]
        ]

        self.base_models=base_model
        self.XGB=clf7
Example #15
def init_model(input_data, target_data):
    model = AdaBoostClassifier(n_estimators=285, learning_rate=0.19, algorithm='SAMME.R')
    model.fit(input_data, target_data)    
    return model
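A short, hypothetical round trip through init_model(); the data is synthetic, and the 285/0.19 settings are simply the original author's tuned values:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=0)
model = init_model(X, y)    # relies on AdaBoostClassifier being imported in the original module
print(model.score(X, y))    # training accuracy
print(model.predict(X[:10]))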
Example #16
        print("LEARNING STEP")

    #default
    classifier = "not_init"

    if alg == 0:
        classifier = DecisionTreeClassifier(max_depth=tree_depth)
    if alg == 1:
        classifier = RandomForestClassifier(n_estimators=random_forest_size,
                                            random_state=seed,
                                            n_jobs=10)
    if alg == 2:
        classifier = create_ensemble(seed)
    if alg == 3:
        classifier = AdaBoostClassifier(DecisionTreeClassifier(),
                                        n_estimators=boosting_size,
                                        random_state=seed)
    if alg == 4:
        scaler = StandardScaler()
        svr = SVR(kernel='rbf',
                  cache_size=4000,
                  C=1e3,
                  gamma=0.0001,
                  max_iter=200000,
                  epsilon=0.0001)
        classifier = Pipeline([('standardize', scaler), ('svr', svr)])
    if alg == 5:
        classifier = GaussianNB()

    if classifier == "not_init":
        print("Classifier not init, exit")
Example #17
class SkClassifier(MultiClassifier):
    type_map = dict(
        MultiClassifier.type_map,
        c=float,
        gamma=float,
        cache=int,
        n_estimators=int,
        n_neighbors=int,
        radius=float,
        # probability=str_to_bool, # TODO
        # class_weights # TODO
    )

    base_param_grid = {'svc__C': np.logspace(-2, 3, 5)}

    classifiers = {
        'svm': {
            'build':
            lambda self: SVC(kernel='linear',
                             C=self.c,
                             probability=self.probability,
                             cache_size=self.cache,
                             class_weight=SVM_CLASS_WEIGHTS
                             if SVM_CLASS_WEIGHTS else None),
            'param_grid':
            dict(base_param_grid),
            'test_params': {
                'probability': False
            },
            'roc_params': {
                'probability': True
            },
        },
        'svm-rbf': {
            'build':
            lambda self: SVC(kernel='rbf',
                             C=self.c,
                             probability=self.probability,
                             cache_size=self.cache,
                             gamma=self.gamma,
                             class_weight=SVM_CLASS_WEIGHTS
                             if SVM_CLASS_WEIGHTS else None),
            'param_grid':
            dict(base_param_grid, svc__gamma=np.logspace(-9, 3, 5)),
            'test_params': {
                'probability': False
            },
            'roc_params': {
                'probability': True
            },
        },
        'mlp': {
            'build':
            lambda self: MLPClassifier(solver='lbfgs',
                                       alpha=1e-5,
                                       hidden_layer_sizes=(5, 2),
                                       random_state=42),
            'param_grid':
            dict(),
        },
        'knn': {
            'build':
            lambda self: KNeighborsClassifier(n_neighbors=self.n_neighbors,
                                              n_jobs=self.n_jobs),
            'param_grid':
            dict(),
        },
        'rnn': {
            'build':
            lambda self: RadiusNeighborsClassifier(
                radius=self.radius, n_jobs=self.n_jobs, outlier_label=0),
            'param_grid':
            dict(),
        },
        'ada': {
            'build': lambda self: AdaBoostClassifier(),
            'param_grid': dict(),
        },
        'ada-svm': {
            'build':
            lambda self: AdaBoostClassifier(base_estimator=SVC(
                probability=True,
                kernel='rbf',
                C=self.c,
                gamma=self.gamma,
                cache_size=self.cache)),
            'param_grid':
            dict(),
        },
        'ada-sgd': {
            'build':
            lambda self: AdaBoostClassifier(
                base_estimator=SGDClassifier(loss='hinge'), algorithm='SAMME'),
            'param_grid':
            dict(),
        },
        'rf': {
            'build':
            lambda self: RandomForestClassifier(n_estimators=self.n_estimators,
                                                n_jobs=self.n_jobs),
            'param_grid':
            dict(),
        },
        'et': {
            'build':
            lambda self: ExtraTreesClassifier(n_estimators=self.n_estimators,
                                              n_jobs=self.n_jobs),
            'param_grid':
            dict(),
        },
        'gnb': {
            'build': lambda self: GaussianNB(),
            'param_grid': dict(),
        },
        'bnb': {
            'build': lambda self: BernoulliNB(),
            'param_grid': dict(),
        },
    }

    def __init__(self,
                 classifier_name='svm',
                 c=1000,
                 gamma=0.02,
                 cache=2000,
                 n_estimators=200,
                 n_neighbors=16,
                 radius=1.0,
                 probability=True,
                 **kwargs):
        super().__init__(classifier_name, **kwargs)

        logger.info(
            'Initializing Scikit-learn classifier {}'.format(classifier_name))
        self.c = c
        self.gamma = gamma
        self.cache = cache
        self.probability = probability
        self.n_estimators = n_estimators
        self.n_neighbors = n_neighbors
        self.radius = radius

    def load_model(self, model_dir):
        super().load_model(model_dir=model_dir)
        self.clf = joblib.load(os.path.join(model_dir, 'clf.pkl'))

    def save_model(self, output_dir):
        super().save_model(output_dir=output_dir)
        joblib.dump(self.clf, os.path.join(output_dir, 'clf.pkl'))

    def _build_classifier(self, *args, **kwargs):
        return self.classifier_dict['build'](self)

    def _get_param_grid(self):
        return self.classifier_dict['param_grid']

    def _get_test_params(self):
        return self.classifier_dict.get('test_params', {})

    def _get_cv_params(self):
        return {
            **self._get_test_params(),
            **self.classifier_dict.get('cv_params', {})
        }

    def _get_roc_params(self):
        return self.classifier_dict.get('roc_params', {})
Example #18
from sklearn.svm import SVC
import os
import warnings
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pickle
from sklearn.model_selection import train_test_split
import shutil
from statistics import mean

warnings.filterwarnings('ignore')

classifiers = [
    AdaBoostClassifier(),
    BaggingClassifier(),
    BernoulliNB(),
    CalibratedClassifierCV(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    GaussianProcessClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    LabelPropagation(),
    LabelSpreading(),
    LinearDiscriminantAnalysis(),
    LogisticRegression(),
    LogisticRegressionCV(),
Example #19
# NOTE:  Adjust training set / test set division ratio:
divratio = 0.3


# Normalization (L1 & L2):
# NOTE:  Change 'normtype' value to 'l1' / 'l2' to change normalization type:
normtype = 'l2'  # or 'l1'
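Presumably normtype is fed to a row normalizer; a minimal sketch of what 'l1' versus 'l2' normalization does (the use of sklearn's Normalizer here is an assumption, not shown in the script):

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[3.0, 4.0],
              [1.0, 1.0]])
print(Normalizer(norm='l2').fit_transform(X))  # rows scaled to unit Euclidean length
print(Normalizer(norm='l1').fit_transform(X))  # rows scaled to unit absolute sum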


# model_selection is used for manually enabling the individual models.
# NOTE:  Setting the boolean value enables/disables the model.
model_selection = {
    'ExtraTrees': ( True, ExtraTreesClassifier(n_estimators='warn', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None) ),
    'RandomForest': ( True, RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) ),
    'AdaBoost': ( True, AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None) ),
    'DecisionTree': ( True, DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False) ),
    'GradientBoosting': (True, GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1, n_iter_no_change=None, tol=0.0001) ),
    'BernoulliNB': (True, BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None) ),
    'BaggingClassifier': (True, BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0) ),
    'NearestNeighbors': (True, KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None) ), # (n_neighbors=4) ),
    'LogisticRegressionCV': (True, LogisticRegressionCV(Cs=10, fit_intercept=True, cv='warn', dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='warn', random_state=None, l1_ratios=None) ),
    'LDA': (True, LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001) ),
    'LogisticRegression': (True, LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None) ),
    'CalibratedClassifierCV': (True, CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv='warn') ),
    'LinearSVC': (True, LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000) ),
    'LinearSVM': ( True, SVC(kernel='linear', C=0.025) ),  # (C=0.01, penalty='l1', dual=False) ),
    'RBF_SVM': (True, SVC(gamma='auto') ),#gamma=2, C=1) ), #
    'Nu_SVM': (True, NuSVC(gamma='auto') ),
    'GaussianProcess': (False, GaussianProcessClassifier() ), #(1.0 * RBF(1.0)) ),
    'NeuralNet': (True, MLPClassifier(alpha=1, max_iter=1000) ),
Example #20
from sklearn.tree import DecisionTreeClassifier

# classification models
classifiers = {
    'K-Nearest Neighbors (Braycurtis norm)':
    KNeighborsClassifier(n_neighbors=3, algorithm='auto', metric='braycurtis'),
    'Random Forest':
    RandomForestClassifier(n_estimators=80, n_jobs=1),
    'SVM':
    SVC(gamma=2, C=1),
    'Linear Support Vector Machine':
    SVC(kernel="linear", C=0.025),
    'Decision Tree':
    DecisionTreeClassifier(max_depth=5),
    'Ada Boost':
    AdaBoostClassifier(n_estimators=80, learning_rate=0.4),
    'Naive Bayes':
    GaussianNB(),
}
vc = VotingClassifier(estimators=list(classifiers.items()), voting='hard')


def evaluate_model(model_name, model, x, y):
    """Evaluate model accuracy via cross validation."""
    print('%s:' % model_name)
    model.fit(x, y.values.ravel())
    print('CV f1_micro (not reusing data): %s' % np.mean(
        cross_val_score(model, x, y.values.ravel(), cv=5, scoring='f1_micro')))
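A hedged usage of evaluate_model() with the hard-voting ensemble vc defined above; the toy DataFrame stands in for the real feature matrix:

import pandas as pd
from sklearn.datasets import make_classification

Xd, yd = make_classification(n_samples=200, n_features=6, random_state=0)
evaluate_model('Voting ensemble', vc, pd.DataFrame(Xd), pd.DataFrame(yd))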


def predict(x, y, signal_matrix, verbose=1):
Example #21
from sklearn.cluster import SpectralBiclustering, SpectralClustering, SpectralCoclustering
from sklearn.manifold import SpectralEmbedding, TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import TheilSenRegressor
from sklearn.mixture.dpgmm import VBGMM  # only available in old scikit-learn; removed in 0.20
from sklearn.feature_selection import VarianceThreshold

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


clf_dict = {'ARDRegression':ARDRegression(),
			'AdaBoostClassifier':AdaBoostClassifier(),
			'AdaBoostRegressor':AdaBoostRegressor(),
			'AdditiveChi2Sampler':AdditiveChi2Sampler(),
			'AffinityPropagation':AffinityPropagation(),
			'AgglomerativeClustering':AgglomerativeClustering(),
			'BaggingClassifier':BaggingClassifier(),
			'BaggingRegressor':BaggingRegressor(),
			'BayesianGaussianMixture':BayesianGaussianMixture(),
			'BayesianRidge':BayesianRidge(),
			'BernoulliNB':BernoulliNB(),
			'BernoulliRBM':BernoulliRBM(),
			'Binarizer':Binarizer(),
			'Birch':Birch(),
			'CCA':CCA(),
			'CalibratedClassifierCV':CalibratedClassifierCV(),
			'DBSCAN':DBSCAN(),
Example #22
g_train = g.iloc[train_ind, :]
g_test = g.iloc[test_ind, :]

clf = tree.DecisionTreeClassifier(criterion='gini',
                                  max_depth=6,
                                  min_samples_leaf=3)
####################
clf = RandomForestClassifier(criterion='gini',
                             max_depth=6,
                             min_samples_leaf=3,
                             n_estimators=50)
####################
clf = AdaBoostClassifier(DecisionTreeClassifier(criterion='gini',
                                                max_depth=6,
                                                min_samples_leaf=3),
                         n_estimators=200,
                         learning_rate=0.1)
####################
clf = neighbors.KNeighborsClassifier(100, weights='uniform')
clf = neighbors.KNeighborsClassifier(100, weights='distance')
####################
clf = GaussianNB()
##############################
t0 = time()
param_grid = {
    'C': [150, 500, 750, 1000],
    'gamma': [0.0005, 0.001, 0.05, .01],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)  # 'auto' was renamed to 'balanced' in scikit-learn
clf = clf.fit(X_train, y_train)
Example #23
etclf.fit(x_train, y_train)

# Print Confusion Matrix
metrics.confusion_matrix(etclf.predict(x_test), y_test)


from sklearn.ensemble import RandomForestClassifier

rdclf = RandomForestClassifier(n_estimators=20, max_depth=10)
rdclf.fit(x_train, y_train)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)


from sklearn.ensemble import AdaBoostClassifier

adaclf = AdaBoostClassifier(n_estimators=20)
adaclf.fit(x_train, y_train)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)




metrics.confusion_matrix(etclf.predict(x_test), y_test)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)


#The base random forest model seems to do best here.


Example #24
import time
    #    homesite.train_x = homesite.train_x[reduced_range]
    #    homesite.train_y = homesite.train_y[reduced_range]

    C = [256, 512]
    for c in C:
        # Creating classifier.
        mean_acc = 0.0
        mean_recall = 0.0
        mean_precision = 0.0
        mean_tpr = 0.0
        mean_fpr = np.linspace(0, 1, 100)
        all_tpr = []

        cvs = StratifiedKFold(homesite.train_y, n_folds=5)

        clf = AdaBoostClassifier(n_estimators=c, random_state=0)

        # Train classifier.
        print "\nTraining classifier param %d" % c
        for i, (train, test) in enumerate(cvs):
            sm = OverSampler(verbose=False, ratio=2.5)

            train_oversampled_x, train_oversampled_train_y = sm.fit_transform(
                homesite.train_x[train], homesite.train_y[train])

            probas_ = clf.fit(train_oversampled_x,
                              train_oversampled_train_y).predict_proba(
                                  homesite.train_x[test])

            fpr, tpr, thresholds = roc_curve(homesite.train_y[test],
                                             probas_[:, 1])
Example #25
for f in field:
    print("field", f)
    temp = groups[f].median()
    for i in range(0, 100945):
        if (isnull(dataset.loc[i, f])):
            condition = dataset.loc[i, '_conds']
            dataset.loc[i, f] = temp[condition]
            print("values: ", dataset.loc[i, f], " ; ", temp[condition])

dataset['_heatindexm'].fillna(dataset['_heatindexm'].median(), inplace=True)
dataset['_hum'].fillna(dataset['_hum'].median(), inplace=True)
dataset['_tempm'].fillna(dataset['_tempm'].median(), inplace=True)
dataset['_vism'].fillna(dataset['_vism'].median(), inplace=True)

dataset = dataset.values
X = dataset[:, 1:len(dataset[0])]
Y = dataset[:, 0]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
for dept in range(5, 8):
    for feats in range(5, 8):
        classifier = AdaBoostClassifier(DecisionTreeClassifier(
            max_depth=dept,
            max_features=feats,
            splitter="best",
            criterion="entropy"),
                                        learning_rate=1.0)
        classifier.fit(X_train, Y_train)
        print("depth: ", dept, "features: ", feats)
        print("Score", classifier.score(X_train, Y_train))
Example #26
l_train = l_train.join(pd.get_dummies(l_train['Transmission']))
l_train = l_train.join(pd.get_dummies(l_train['WheelType']))
l_train = l_train.join(pd.get_dummies(l_train['Size']))


l_train = l_train.drop(['Auction','Transmission','WheelType','Size'],axis=1)
l_train = l_train.dropna()

data = l_train.drop('IsBadBuy',axis=1)
target = l_train['IsBadBuy']
x_train, x_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=.3)  # sklearn.cross_validation was removed; current releases use sklearn.model_selection.train_test_split


# AdaBoost Runs the best

model = AdaBoostClassifier()
clf = model.fit(x_train, y_train)
scores = clf.score(x_train,y_train)

print(metrics.classification_report(y_train, clf.predict(x_train)))
print(metrics.classification_report(y_test, clf.predict(x_test)))
y_pred = clf.predict(x_test)

metrics.roc_auc_score(y_train,clf.predict(x_train))
metrics.roc_auc_score(y_test,clf.predict(x_test))

# Create a submission
#submission = pd.DataFrame({ 'RefId' : l_test.RefId, 'prediction' : y_pred })
#submission.to_csv('/users/alexandersedgwick/desktop/submission.csv')

Example #27
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        df = pd.read_csv(path, encoding="ISO-8859-1")

        filename = request.form['filename']

        str1 = request.form['feature']
        str2 = request.form['label']

        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)

        X = X.str.lower()
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)

        X = pd.Series(texts)
        """
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))

        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)

        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()

        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf5 = GaussianNB()

        clf5.fit(X_train_tfidf.toarray(), y_train)
        pred = clf5.predict(tfidfvect.transform(X_test).toarray())
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))

        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred_LR = clf6.predict(tfidfvect.transform(X_test))
        a6 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))

        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred_LR = clf7.predict(tfidfvect.transform(X_test))
        a7 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))

        start = time()
        clf8 = BernoulliNB()

        clf8.fit(X_train_tfidf.toarray(), y_train)
        pred = clf8.predict(tfidfvect.transform(X_test).toarray())
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))

        start = time()
        clf9 = Perceptron(n_jobs=-1)

        clf9.fit(X_train_tfidf.toarray(), y_train)
        pred = clf9.predict(tfidfvect.transform(X_test).toarray())
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))
        start = time()
        clf10 = RidgeClassifierCV()

        clf10.fit(X_train_tfidf.toarray(), y_train)
        pred = clf10.predict(tfidfvect.transform(X_test).toarray())
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)

        clf11.fit(X_train_tfidf.toarray(), y_train)
        pred = clf11.predict(tfidfvect.transform(X_test).toarray())
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))
        start = time()
        clf12 = SGDClassifier(n_jobs=-1)  # NOTE: printed as "XGBC" below, but this is a second SGDClassifier, not XGBoost

        clf12.fit(X_train_tfidf.toarray(), y_train)
        pred = clf12.predict(tfidfvect.transform(X_test).toarray())
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a5:
            pickle.dump(clf5, open(filename + '_model', 'wb'))
        elif max_list == a6:
            pickle.dump(clf6, open(filename + '_model', 'wb'))
        elif max_list == a7:
            pickle.dump(clf7, open(filename + '_model', 'wb'))
        elif max_list == a8:
            pickle.dump(clf8, open(filename + '_model', 'wb'))
        elif max_list == a9:
            pickle.dump(clf9, open(filename + '_model', 'wb'))
        elif max_list == a10:
            pickle.dump(clf10, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html",
                               ac1=a1,
                               ac2=a2,
                               ac3=a3,
                               ac4=a4,
                               ac5=a5,
                               ac6=a6,
                               ac7=a7,
                               ac8=a8,
                               ac9=a9,
                               ac10=a10,
                               ac11=a11,
                               ac12=a12)
Example #28
# Parameters
n_classes = 3
n_estimators = 30
plot_colors = "bry"
plot_step = 0.02

# Load data
iris = load_iris()

plot_idx = 1

for pair in ([0, 1], [0, 2], [2, 3]):
    for model in (DecisionTreeClassifier(),
                  RandomForestClassifier(n_estimators=n_estimators),
                  ExtraTreesClassifier(n_estimators=n_estimators),
                  AdaBoostClassifier(DecisionTreeClassifier(),
                                     n_estimators=n_estimators)):
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target
        # Shuffle
        idx = np.arange(X.shape[0])
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
        # Standardize
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        X = (X - mean) / std
        # Train
        clf = model.fit(X, y)
Example #29
    temp=groups[f].median()
    for i in range(0,768):
        if (dataset.loc[i,f]==0) & (dataset.loc[i,'outcome']==0):
            dataset.loc[i,f]=temp[0]
        if (dataset.loc[i,f]==0) & (dataset.loc[i,'outcome']==1):
            dataset.loc[i,f]=temp[1]


dataset = dataset.values
X = dataset[:,0:len(dataset[0]) -1]
Y = dataset[:, (len(dataset[0])-1)]


#this is for decision tree
data=[[0,0,0,0,0]]
df=pd.DataFrame(data,columns=['feats','depth','split','max_leaf','acc'])
for feats in range(2, 7):
    for dept in range(2, 6):
        acc = 0
        for split in range(5,40,5):
            for leaf in range(7,10):
                for i in range(20):
                    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
                    classifier = AdaBoostClassifier(
                        DecisionTreeClassifier(max_depth=dept,
                                               max_features=feats,
                                               min_samples_split=split,
                                               splitter="best",
                                               criterion="entropy",
                                               max_leaf_nodes=leaf),
                        learning_rate=1.0)
                    classifier.fit(X_train, Y_train)
                    res = classifier.score(X_test, Y_test)
                    acc = acc + res
                acc = acc / 20    
                print('feats:', feats, 'Depth:', dept,'split:',split,'max_leaf',leaf, 'acc:', acc*100)
                df=df.append({'feats':feats,'depth':dept,'split':split,'max_leaf':leaf,'acc':acc},ignore_index=True)
df.to_csv('Adaboost_result.csv', sep=',')
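DataFrame.append, used to collect the grid-search rows above, was removed in pandas 2.x; the usual replacement is to gather the rows in a list and build the frame once (a sketch with one hypothetical row):

import pandas as pd

rows = []
rows.append({'feats': 2, 'depth': 3, 'split': 5, 'max_leaf': 7, 'acc': 0.8})  # one dict per grid point
df = pd.DataFrame(rows, columns=['feats', 'depth', 'split', 'max_leaf', 'acc'])
df.to_csv('Adaboost_result.csv', sep=',')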
Example #30
from matplotlib import pyplot as plt

from custom_models import LoanPytorchModel

#Pulling in all data from 2007-2014
wayne_all = WayneLoanApprovalLoader(savename='wayneall_indicator',
                                    csvfile='wayne_county_2007_2014.tsv')

# We have some data, now lets choose a model and some metrics, before putting them into experiment objects.
lr1 = LogisticRegression()
lr2 = LogisticRegression()

lrb1 = LogisticRegression(class_weight='balanced')
lrb2 = LogisticRegression(class_weight='balanced')

ada1 = AdaBoostClassifier()
ada2 = AdaBoostClassifier()

timemodels = [lr1, lr2]

criterion = accuracy_score  # Thankfully this task has a pretty easy evaluation... you either get it right or wrong

# Getting temporally contiguous cuts of data, putting them into different experiments
data_time1 = wayne_all.get_dates([2007, 2008, 2009, 2010])
expmt_time1 = StratifiedExperiment(timemodels[0],
                                   criterion,
                                   data_time1[:, :-1],
                                   data_time1[:, -1],
                                   test_size=0.8)

data_time2 = wayne_all.get_dates([2011, 2012, 2013, 2014])
Example #31
    # Train and test random forests.
    # load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin"
    load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.bin"
    homesite = Data()
    homesite.load_sliptted_data(load_path)
    del homesite.test_x  # Deleted to save memory.

    clf_ann = NeuralNetwork(path = "../homesite_data/ann_weights.bin", lr = 0.00005, \
                        lamb = 0)
    train_output_ann = clf_ann.get_hidden_output(homesite.train_x)
    validation_output_ann = clf_ann.get_hidden_output(homesite.validation_x)
    # train_output_ann = np.hstack((train_output_ann, homesite.train_x))
    # validation_output_ann = np.hstack((validation_output_ann, homesite.validation_x))

    for c in range(1, 10):
        # Train classifier.
        print "Training classifier."
        clf = AdaBoostClassifier(n_estimators=1 + 100 * c)
        clf.fit(train_output_ann, homesite.train_y)

        # Test classifier.
        print('Testing classifier.')
        predicted_labels = clf.predict_proba(validation_output_ann)[:, 1]

        # Show final results.
        results = confusion_matrix(homesite.validation_y,
                                   np.round(predicted_labels))
        accuracy, precision, recall = compute_performance_metrics(results)
        auc = compute_auc(homesite.validation_y, predicted_labels)
Example #32
plt.title("Variance VS Components")
plt.show()

# Selecting the ideal number of components and fitting the data
pca = PCA(n_components=35)
X = pca.fit_transform(X)

### Training the models ###
models = [
    ("Gaussian NB", GaussianNB()),
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("Logistic Regression", LogisticRegression()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("AdaBoost", AdaBoostClassifier()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("Neural Net", MLPClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("Extra Trees", ExtraTreesClassifier()),
    # ("SVM", SVC(kernel="linear")),
    ("XGBOOST Classifer", XGBClassifier()),
]

## Model comparison ###
start = timeit.default_timer()

accuracies = []
for name, model in models:

    # kfold = model_selection.KFold(n_splits=10)
Example #33
train_data = train_data.dropna()
train_data = preprocess_data(train_data)

X = train_data[['is_1', 'is_2', 'is_3', 'Fare', 'is_male', 'is_female']]
Y = train_data['Survived']

XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=0.2)

n_estimators = 100

models = [
    DecisionTreeClassifier(max_depth=3),
    BaggingClassifier(n_estimators=n_estimators),
    RandomForestClassifier(n_estimators=n_estimators),
    ExtraTreesClassifier(n_estimators=n_estimators),
    AdaBoostClassifier(n_estimators=n_estimators)
]

model_title = [
    'DecisionTree', 'Bagging', 'RandomForest', 'ExtraTrees', 'AdaBoost'
]

surv_preds, surv_probs, scores, fprs, tprs, thres = ([] for i in range(6))

for i, model in enumerate(models):
    print('Fitting {0}'.format(model_title[i]))

    clf = model.fit(XTrain, YTrain)
    surv_preds.append(model.predict(XTest))
    surv_probs.append(model.predict_proba(XTest))
    scores.append(model.score(XTest, YTest))
Example #34
    tuned_parameters = [{'n_estimators':[5, 10, 100, 200],
                         'criterion':['gini', 'entropy'],
                         'max_features':['log2', 'sqrt'],
                         'max_depth':[10, 100]
                     }]
    algo = RandomForestClassifier()

elif choice=='i' or choice=='I':
    print("\n**********************************\n")
    print("  \t AdaBoost Classifier")
    tuned_parameters = [{'n_estimators':[5, 10, 50, 100, 200],
                         'learning_rate':[0.1, 0.2, 0.5, 1],
                         'algorithm':['SAMME', 'SAMME.R'],
                         'random_state':[1, 2, 3, 5]
                     }]
    algo = AdaBoostClassifier()
    
elif choice=='j' or choice=='J':
    print("\n**********************************\n")
    print("  \t Gradient Boosting Classifier")
    tuned_parameters = [{'n_estimators':[5, 10, 50, 100, 200],
                         'learning_rate':[0.1, 0.2, 0.5, 1],
                         'min_impurity_decrease': [0.0001],
                         'max_depth':[10, 100]
                     }]
    algo = GradientBoostingClassifier()
    
elif choice=='k' or choice=='K':
    print("\n**********************************\n")
    print("  \t XG Boost")
    tuned_parameters = [{'n_estimators':[5, 10, 50, 100, 200],
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

#:# preprocessing

transform_pipeline = Pipeline([('scaler', StandardScaler())])

X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train),
                       columns=X_train.columns)

#:# model

params = {'learning_rate': 0.5, 'n_estimators': 300}

classifier = AdaBoostClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# e595f5d5683f3e3692608020cd5bde18
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
Example #36
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier


DECISION_TREE = DecisionTreeClassifier()
LOGISTIC_REGRESSION = LogisticRegression()
NAIVE_BAYS = GaussianNB()

K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)


# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()

def getClassifierMap():
    CLASSIFIER_MAP = {
    "DECISION_TREE": DECISION_TREE,
    "LOGISTIC_REGRESSION": LOGISTIC_REGRESSION,
    "NAIVE_BAYS": NAIVE_BAYS,