Example #1
def ada():
    pipeline = Pipeline([('count_vectorizer',
                          CountVectorizer(binary=True,
                                          ngram_range=(1, 2),
                                          max_features=15000,
                                          stop_words=stopwords)),
                         ('clf', AdaBoostClassifier())])
    train_report(pipeline)
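The snippet relies on a train_report helper defined elsewhere in the source project. A minimal, hypothetical stand-in, assuming text lists X_train/X_test and label arrays y_train/y_test already exist, could look like this:

from sklearn.metrics import classification_report

def train_report(pipeline):
    # Hypothetical stand-in for the project's helper: fit on the training split
    # and print a per-class report on the held-out split.
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    print(classification_report(y_test, predictions))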
Example #2
def check_classifiers(n_samples=10000):
    """
    This function is not tested by default, it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_features = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = GaussianNB()

    uBoost_SAMME = uBoostClassifier(
        uniform_features=uniform_features,
        uniform_label=1,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        algorithm="SAMME")

    uBoost_SAMME_R = uBoostClassifier(
        uniform_features=uniform_features,
        uniform_label=1,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        algorithm="SAMME.R")

    uBoost_SAMME_R_threaded = uBoostClassifier(
        uniform_features=uniform_features,
        uniform_label=1,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        n_threads=3,
        subsample=0.9,
        algorithm="SAMME.R")

    clf_dict = OrderedDict({
        "Ada": ada,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R,
        "uBOOST.R2": uBoost_SAMME_R_threaded
    })

    cvms = {}
    for clf_name, clf in clf_dict.items():
        clf.fit(trainX, trainY)
        p = clf.predict_proba(testX)
        metric = KnnBasedCvM(uniform_features=uniform_features)
        metric.fit(testX, testY)
        cvms[clf_name] = metric(testY, p, sample_weight=np.ones(len(testY)))

    assert cvms['uBOOST'] < cvms['Ada']
    print(cvms)
Example #3
def get_feature_ranking(X_train, y_train):
    print("feature ranking running....-> LogisticRegression")
    model1 = LogisticRegression(max_iter=500)
    rfe = RFECV(estimator=model1,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    logr_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        logr_ranking.append([d, x])
    logr_ranking = pd.DataFrame(logr_ranking, columns=['features1', 'logr'])
    logr_ranking.sort_values('features1', inplace=True)

    print("feature ranking running....-> GradientBoostingClassifier")
    model2 = GradientBoostingClassifier()
    rfe = RFECV(estimator=model2,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    gboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        gboost_ranking.append([d, x])
    gboost_ranking = pd.DataFrame(gboost_ranking,
                                  columns=['features2', 'gboost'])
    gboost_ranking.sort_values('features2', inplace=True)

    print("feature ranking running....-> AdaBoostClassifier")
    model3 = AdaBoostClassifier()
    rfe = RFECV(estimator=model3,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    adaboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        adaboost_ranking.append([d, x])
    adaboost_ranking = pd.DataFrame(adaboost_ranking,
                                    columns=['features3', 'adaboost'])
    adaboost_ranking.sort_values('features3', inplace=True)

    feature_sum = logr_ranking['logr'] + gboost_ranking[
        'gboost'] + adaboost_ranking['adaboost']
    df_ranked = pd.concat([logr_ranking['features1'], feature_sum], axis=1)
    df_ranked.sort_values(0, inplace=True)

    return df_ranked
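The three RFECV blocks above differ only in the estimator. A hedged refactor sketch (not part of the original) that produces the same kind of summed ranking in a loop might look like this:

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import pandas as pd


def get_feature_ranking_compact(X_train, y_train):
    estimators = {
        'logr': LogisticRegression(max_iter=500),
        'gboost': GradientBoostingClassifier(),
        'adaboost': AdaBoostClassifier(),
    }
    rankings = pd.DataFrame({'features': X_train.columns})
    for name, model in estimators.items():
        rfe = RFECV(estimator=model, step=1, cv=StratifiedKFold(2), scoring='accuracy')
        rfe.fit(X_train, y_train)
        rankings[name] = rfe.ranking_
    # A lower summed rank means the feature survives elimination across all three models.
    rankings['rank_sum'] = rankings[list(estimators)].sum(axis=1)
    return rankings.sort_values('rank_sum')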
Example #4
def check_classifiers(n_samples=10000, output_name_pattern=None):
    """
    This function is not tested by default, it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_variables = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = HidingClassifier(train_variables=trainX.columns[1:],
                                   base_estimator=GaussianNB())

    uBoost_SAMME = uBoostClassifier(uniform_variables=uniform_variables,
                                    n_neighbors=50,
                                    efficiency_steps=5,
                                    n_estimators=50,
                                    algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(uniform_variables=uniform_variables,
                                      n_neighbors=50,
                                      efficiency_steps=5,
                                      n_estimators=50,
                                      algorithm="SAMME.R")

    clf_dict = ClassifiersDict({
        "Ada": ada,
        "Ideal": ideal_bayes,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R
    })

    clf_dict.fit(trainX, trainY)

    predictions = Predictions(clf_dict, testX, testY)
    # predictions.print_mse(uniform_variables, in_html=False)
    print(predictions.compute_metrics())

    predictions.sde_curves(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "mse_curves", bbox_inches="tight")
    _ = pl.figure()
    predictions.learning_curves()
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "learning_curves", bbox_inches="tight")
    predictions.efficiency(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "efficiency_curves", bbox_inches="tight")
Example #5
def defaultModels(df_xmat, df_ymat_cat):

    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]

    cv = StratifiedKFold(n_splits=10)

    res = []

    for clf in classifiers:

        print('processing...' + str(clf)[:10])

        metrics_cv = []

        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):

            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]

            clf.fit(X_train, y_train)

            metrics_cv.append(clf.score(X_test, y_test))

        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])

    return res
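A hedged usage sketch (not from the original): tabulating the returned cross-validated scores, assuming df_xmat and df_ymat_cat are already loaded:

import pandas as pd

res = defaultModels(df_xmat, df_ymat_cat)
scores = pd.DataFrame(res, columns=['classifier', 'mean_accuracy', 'std_accuracy'])
print(scores.sort_values('mean_accuracy', ascending=False))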
Example #6
    def __init__(self):
        self.random_rate = 33
        clf1 = SVC(C=1.0, random_state=33)
        clf2 = XGBClassifier(n_estimators=220, learning_rate=0.2, min_child_weight=2.3)
        clf3 = RandomForestClassifier(n_estimators=80, random_state=330, n_jobs=-1)
        clf4 = BaggingClassifier(n_estimators=40, random_state=101)
        clf5 = AdaBoostClassifier(n_estimators=70, learning_rate=1.5, random_state=33)
        clf6 = GradientBoostingClassifier(n_estimators=250, learning_rate=0.23, random_state=33)

        clf7 = XGBClassifier(n_estimators=100, learning_rate=0.12, min_child_weight=1)

        base_model = [
            ['svc', clf1],
            ['xgbc', clf2],
            ['rfc', clf3],
            ['bgc', clf4],
            ['adbc', clf5],
            ['gdbc', clf6]
        ]

        self.base_models = base_model
        self.XGB = clf7
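The constructor stores the base estimators alongside a separate XGB model, which points to a stacking-style ensemble. A hedged method sketch (not part of the original class) of one way the stored models could be combined is below; a production version would normally use out-of-fold predictions to avoid leaking training labels into the meta-model.

    def fit_stack(self, X_train, y_train):
        # Hedged sketch: fit each stored base model, then feed their training-set
        # predictions to the XGB meta-model (assumes numpy is imported as np).
        meta_features = []
        for name, clf in self.base_models:
            clf.fit(X_train, y_train)
            meta_features.append(clf.predict(X_train))
        self.XGB.fit(np.column_stack(meta_features), y_train)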
Example #7
# NOTE:  Adjust the training-set / test-set split ratio:
divratio = 0.3


# Normalization (L1 & L2):
# NOTE:  Change 'normtype' value to 'l1' / 'l2' to change normalization type:
normtype = 'l2'  # or 'l1'


# model_selection is used for manually enabling the individual models.
# NOTE:  Setting the boolean value enables/disables the model.
model_selection = {
    'ExtraTrees': ( True, ExtraTreesClassifier(n_estimators='warn', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None) ),
    'RandomForest': ( True, RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) ),
    'AdaBoost': ( True, AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None) ),
    'DecisionTree': ( True, DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False) ),
    'GradientBoosting': (True, GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1, n_iter_no_change=None, tol=0.0001) ),
    'BernoulliNB': (True, BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None) ),
    'BaggingClassifier': (True, BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0) ),
    'NearestNeighbors': (True, KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None) ), # (n_neighbors=4) ),
    'LogisticRegressionCV': (True, LogisticRegressionCV(Cs=10, fit_intercept=True, cv='warn', dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='warn', random_state=None, l1_ratios=None) ),
    'LDA': (True, LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001) ),
    'LogisticRegression': (True, LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None) ),
    'CalibratedClassifierCV': (True, CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv='warn') ),
    'LinearSVC': (True, LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000) ),
    'LinearSVM': ( True, SVC(kernel='linear', C=0.025) ),  # (C=0.01, penalty='l1', dual=False) ),
    'RBF_SVM': (True, SVC(gamma='auto') ),#gamma=2, C=1) ), #
    'Nu_SVM': (True, NuSVC(gamma='auto') ),
    'GaussianProcess': (False, GaussianProcessClassifier() ), #(1.0 * RBF(1.0)) ),
    'NeuralNet': (True, MLPClassifier(alpha=1, max_iter=1000) ),
Example #8
from sklearn.svm import SVC
import os
import warnings
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pickle
from sklearn.model_selection import train_test_split
import shutil
from statistics import mean

warnings.filterwarnings('ignore')

classifiers = [
    AdaBoostClassifier(),
    BaggingClassifier(),
    BernoulliNB(),
    CalibratedClassifierCV(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    GaussianProcessClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    LabelPropagation(),
    LabelSpreading(),
    LinearDiscriminantAnalysis(),
    LogisticRegression(),
    LogisticRegressionCV(),
Example #9
from sklearn.cluster.bicluster import SpectralBiclustering
from sklearn.cluster.spectral import SpectralClustering
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.manifold.spectral_embedding_ import SpectralEmbedding
from sklearn.preprocessing.data import StandardScaler
from sklearn.manifold.t_sne import TSNE
from sklearn.linear_model.theil_sen import TheilSenRegressor
from sklearn.mixture.dpgmm import VBGMM
from sklearn.feature_selection.variance_threshold import VarianceThreshold

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


clf_dict = {'ARDRegression':ARDRegression(),
			'AdaBoostClassifier':AdaBoostClassifier(),
			'AdaBoostRegressor':AdaBoostRegressor(),
			'AdditiveChi2Sampler':AdditiveChi2Sampler(),
			'AffinityPropagation':AffinityPropagation(),
			'AgglomerativeClustering':AgglomerativeClustering(),
			'BaggingClassifier':BaggingClassifier(),
			'BaggingRegressor':BaggingRegressor(),
			'BayesianGaussianMixture':BayesianGaussianMixture(),
			'BayesianRidge':BayesianRidge(),
			'BernoulliNB':BernoulliNB(),
			'BernoulliRBM':BernoulliRBM(),
			'Binarizer':Binarizer(),
			'Birch':Birch(),
			'CCA':CCA(),
			'CalibratedClassifierCV':CalibratedClassifierCV(),
			'DBSCAN':DBSCAN(),
Example #10
from sklearn.tree import DecisionTreeClassifier

# classification models
classifiers = {
    'K-Nearest Neighbors (Braycurtis norm)':
    KNeighborsClassifier(n_neighbors=3, algorithm='auto', metric='braycurtis'),
    'Random Forest':
    RandomForestClassifier(n_estimators=80, n_jobs=1),
    'SVM':
    SVC(gamma=2, C=1),
    'Linear Support Vector Machine':
    SVC(kernel="linear", C=0.025),
    'Decision Tree':
    DecisionTreeClassifier(max_depth=5),
    'Ada Boost':
    AdaBoostClassifier(n_estimators=80, learning_rate=0.4),
    'Naive Bayes':
    GaussianNB(),
}
vc = VotingClassifier(estimators=list(classifiers.items()), voting='hard')


def evaluate_model(model_name, model, x, y):
    """Evaluate model accuracy via cross validation."""
    print('%s:' % model_name)
    model.fit(x, y.values.ravel())
    print('CV f1_micro (not reusing data): %s' % np.mean(
        cross_val_score(model, x, y.values.ravel(), cv=5, scoring='f1_micro')))
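# Hedged usage sketch (not from the original): evaluate each individual model and the
# hard-voting ensemble, assuming the feature matrix x and label frame y are already loaded.
for clf_name, clf_model in classifiers.items():
    evaluate_model(clf_name, clf_model, x, y)
evaluate_model('Voting (hard)', vc, x, y)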


def predict(x, y, signal_matrix, verbose=1):
Example #11
    # Train and test random forests.
    # load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin"
    load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.bin"
    homesite = Data()
    homesite.load_sliptted_data(load_path)
    del homesite.test_x  # Deleted to save memory.

    clf_ann = NeuralNetwork(path = "../homesite_data/ann_weights.bin", lr = 0.00005, \
                        lamb = 0)
    train_output_ann = clf_ann.get_hidden_output(homesite.train_x)
    validation_output_ann = clf_ann.get_hidden_output(homesite.validation_x)
    # train_output_ann = np.hstack((train_output_ann, homesite.train_x))
    # validation_output_ann = np.hstack((validation_output_ann, homesite.validation_x))

    for c in range(1, 10):
        # Train classifier.
        print "Training classifier."
        clf = AdaBoostClassifier(n_estimators=1 + 100 * c)
        clf.fit(train_output_ann, homesite.train_y)

        # Test classifier.
        print('Testing classifier.')
        predicted_labels = clf.predict_proba(validation_output_ann)[:, 1]

        # Show final results.
        results = confusion_matrix(homesite.validation_y,
                                   np.round(predicted_labels))
        accuracy, precision, recall = compute_performance_metrics(results)
        auc = compute_auc(homesite.validation_y, predicted_labels)
Example #12
for f in field:
    print("field", f)
    temp = groups[f].median()
    for i in range(0, 100945):
        if (isnull(dataset.loc[i, f])):
            condition = dataset.loc[i, '_conds']
            dataset.loc[i, f] = temp[condition]
            print("values: ", dataset.loc[i, f], " ; ", temp[condition])

dataset['_heatindexm'].fillna(dataset['_heatindexm'].median(), inplace=True)
dataset['_hum'].fillna(dataset['_hum'].median(), inplace=True)
dataset['_tempm'].fillna(dataset['_tempm'].median(), inplace=True)
dataset['_vism'].fillna(dataset['_vism'].median(), inplace=True)

dataset = dataset.values
X = dataset[:, 1:len(dataset[0])]
Y = dataset[:, 0]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
for dept in range(5, 8):
    for feats in range(5, 8):
        classifier = AdaBoostClassifier(DecisionTreeClassifier(
            max_depth=dept,
            max_features=feats,
            splitter="best",
            criterion="entropy"),
                                        learning_rate=1.0)
        classifier.fit(X_train, Y_train)
        print("depth: ", dept, "features: ", feats)
        print("Score", classifier.score(X_train, Y_train))
    #    homesite.train_x = homesite.train_x[reduced_range]
    #    homesite.train_y = homesite.train_y[reduced_range]

    C = [256, 512]
    for c in C:
        # Creating classifier.
        mean_acc = 0.0
        mean_recall = 0.0
        mean_precision = 0.0
        mean_tpr = 0.0
        mean_fpr = np.linspace(0, 1, 100)
        all_tpr = []

        cvs = StratifiedKFold(homesite.train_y, n_folds=5)

        clf = AdaBoostClassifier(n_estimators=c, random_state=0)

        # Train classifier.
        print "\nTraining classifier param %d" % c
        for i, (train, test) in enumerate(cvs):
            sm = OverSampler(verbose=False, ratio=2.5)

            train_oversampled_x, train_oversampled_train_y = sm.fit_transform(
                homesite.train_x[train], homesite.train_y[train])

            probas_ = clf.fit(train_oversampled_x,
                              train_oversampled_train_y).predict_proba(
                                  homesite.train_x[test])

            fpr, tpr, thresholds = roc_curve(homesite.train_y[test],
                                             probas_[:, 1])
Example #14
def select_algorithm(X_all, y_all):
    print("\n\n")

    algos = [
        {
            "Name": "NB",
            "Classifier": GaussianNB(),
            "ParamGrid": {},
        },
        {
            "Name": "SVM",
            "Classifier": SVC(),
            "ParamGrid": {
                # 'kernel': ['linear', 'rbf'],
                # 'C': [1, 1e3, 5e3, 1e4, 5e4, 1e5],
                #    'gamma': ['auto', 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
                #    'max_iter': [-1, 1, 2, 3, 4, 5],
            },
        },
        {
            "Name": "KNN",
            "Classifier": KNeighborsClassifier(),
            "ParamGrid": {
                "n_neighbors": [3, 4, 5, 6, 7, 8, 9],
                "p": [1, 2, 3],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            },
        },
        {
            "Name": "ADA",
            "Classifier": AdaBoostClassifier(),
            "ParamGrid": {
                "n_estimators": [10, 20, 30, 40, 50, 60, 70, 80],
                "learning_rate": [0.5, 1.0, 1.5, 2.0],
                'algorithm': ['SAMME', 'SAMME.R'],
            },
        },
        {
            "Name": "CART",
            "Classifier": DecisionTreeClassifier(),
            "ParamGrid": {
                "min_samples_split": [2, 3, 4, 5],
                "max_depth": [None, 4, 5, 6, 7, 8],
                'criterion': ['gini', 'entropy'],
            },
        },
    ]

    results = []
    names = [d['Name'] for d in algos]

    best_clf = None
    best_score = 0

    print("Selecting algorithm using cross validation")
    print("With GridSearchCV for parameter tuning")
    print("Candidates are: {0}".format(names))
    for algo in algos:
        # name = algo["Classifier"].__doc__[:24].strip()
        name = algo["Name"]
        print("\n\n")
        print("{0}".format(name))

        t0 = time()
        cv_score, cv_clf = tune_and_test_classifier(algo["Classifier"],
                                                    algo["ParamGrid"],
                                                    my_dataset, features_list)
        print("Algorithm tuned and tested in %0.3fs" % (time() - t0))
        print("Score: {0}".format(cv_score))

        if cv_score > best_score:
            best_score = cv_score
            best_clf = cv_clf

    print("\n\n")
    print("Best classifier:")
    print(best_clf)
    print("Best score: {0}".format(best_score))

    assert best_score > 0, "THRESHOLD NOT MET"

    print("\n\n")
    return best_clf
Example #15
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        df = pd.read_csv(path, encoding="ISO-8859-1")

        filename = request.form['filename']

        str1 = request.form['feature']
        str2 = request.form['label']

        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)

        X = X.str.lower()
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)

        X = pd.Series(texts)
        """
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))

        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)

        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()

        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf5 = GaussianNB()

        clf5.fit(X_train_tfidf.toarray(), y_train)
        pred = clf5.predict(tfidfvect.transform(X_test).toarray())
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))

        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred_LR = clf6.predict(tfidfvect.transform(X_test))
        a6 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))

        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred_LR = clf7.predict(tfidfvect.transform(X_test))
        a7 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))

        start = time()
        clf8 = BernoulliNB()

        clf8.fit(X_train_tfidf.toarray(), y_train)
        pred = clf8.predict(tfidfvect.transform(X_test).toarray())
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))

        start = time()
        clf9 = Perceptron(n_jobs=-1)

        clf9.fit(X_train_tfidf.toarray(), y_train)
        pred = clf9.predict(tfidfvect.transform(X_test).toarray())
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))
        start = time()
        clf10 = RidgeClassifierCV()

        clf10.fit(X_train_tfidf.toarray(), y_train)
        pred = clf10.predict(tfidfvect.transform(X_test).toarray())
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)

        clf11.fit(X_train_tfidf.toarray(), y_train)
        pred = clf11.predict(tfidfvect.transform(X_test).toarray())
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))
        start = time()
        clf12 = XGBClassifier(n_jobs=-1)

        clf12.fit(X_train_tfidf.toarray(), y_train)
        pred = clf12.predict(tfidfvect.transform(X_test).toarray())
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a5:
            pickle.dump(clf5, open(filename + '_model', 'wb'))
        elif max_list == a6:
            pickle.dump(clf6, open(filename + '_model', 'wb'))
        elif max_list == a7:
            pickle.dump(clf7, open(filename + '_model', 'wb'))
        elif max_list == a8:
            pickle.dump(clf8, open(filename + '_model', 'wb'))
        elif max_list == a9:
            pickle.dump(clf9, open(filename + '_model', 'wb'))
        elif max_list == a10:
            pickle.dump(clf10, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html",
                               ac1=a1,
                               ac2=a2,
                               ac3=a3,
                               ac4=a4,
                               ac5=a5,
                               ac6=a6,
                               ac7=a7,
                               ac8=a8,
                               ac9=a9,
                               ac10=a10,
                               ac11=a11,
                               ac12=a12)
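The twelve near-identical fit/predict/score blocks above differ only in the estimator. A hedged refactor sketch (not the original code) of the same idea as a loop, reusing the names defined in the example and keeping only the sparse-friendly estimators for brevity:

candidates = {
    'SVC': LinearSVC(),
    'LR': LogisticRegression(n_jobs=-1, multi_class='multinomial', solver='newton-cg'),
    'RFC': RandomForestClassifier(n_jobs=-1),
    'MNB': MultinomialNB(),
    'ABC': AdaBoostClassifier(),
}
best_acc, best_clf = 0.0, None
for clf_name, clf in candidates.items():
    start = time()
    clf.fit(X_train_tfidf, y_train)
    acc = accuracy_score(y_test, clf.predict(tfidfvect.transform(X_test)))
    print("accuracy {}: {} and time: {} s".format(clf_name, acc, time() - start))
    if acc > best_acc:
        best_acc, best_clf = acc, clf
pickle.dump(best_clf, open(filename + '_model', 'wb'))
pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))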
Example #16
# Parameters
n_classes = 3
n_estimators = 30
plot_colors = "bry"
plot_step = 0.02

# Load data
iris = load_iris()

plot_idx = 1

for pair in ([0, 1], [0, 2], [2, 3]):
    for model in (DecisionTreeClassifier(),
                  RandomForestClassifier(n_estimators=n_estimators),
                  ExtraTreesClassifier(n_estimators=n_estimators),
                  AdaBoostClassifier(DecisionTreeClassifier(),
                                     n_estimators=n_estimators)):
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target
        # Shuffle
        idx = np.arange(X.shape[0])
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
        # Standardize
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        X = (X - mean) / std
        # Train
        clf = model.fit(X, y)
Example #17
        print("LEARNING STEP")

    #default
    classifier = "not_init"

    if alg == 0:
        classifier = DecisionTreeClassifier(max_depth=tree_depth)
    if alg == 1:
        classifier = RandomForestClassifier(n_estimators=random_forest_size,
                                            random_state=seed,
                                            n_jobs=10)
    if alg == 2:
        classifier = create_ensemble(seed)
    if alg == 3:
        classifier = AdaBoostClassifier(DecisionTreeClassifier(),
                                        n_estimators=boosting_size,
                                        random_state=seed)
    if alg == 4:
        scaler = StandardScaler()
        svr = SVR(kernel='rbf',
                  cache_size=4000,
                  C=1e3,
                  gamma=0.0001,
                  max_iter=200000,
                  epsilon=0.0001)
        classifier = Pipeline([('standardize', scaler), ('svr', svr)])
    if alg == 5:
        classifier = GaussianNB()

    if classifier == "not_init":
        print("Classifier not init, exit")
Example #18
from matplotlib import pyplot as plt

from custom_models import LoanPytorchModel

#Pulling in all data from 2007-2014
wayne_all = WayneLoanApprovalLoader(savename='wayneall_indicator',
                                    csvfile='wayne_county_2007_2014.tsv')

# We have some data, now let's choose a model and some metrics before putting them into experiment objects.
lr1 = LogisticRegression()
lr2 = LogisticRegression()

lrb1 = LogisticRegression(class_weight='balanced')
lrb2 = LogisticRegression(class_weight='balanced')

ada1 = AdaBoostClassifier()
ada2 = AdaBoostClassifier()

timemodels = [lr1, lr2]

criterion = accuracy_score  # Thankfully this task has a pretty easy evaluation... you either get it right or wrong

# Getting temporally contiguous cuts of data, putting them into different experiments
data_time1 = wayne_all.get_dates([2007, 2008, 2009, 2010])
expmt_time1 = StratifiedExperiment(timemodels[0],
                                   criterion,
                                   data_time1[:, :-1],
                                   data_time1[:, -1],
                                   test_size=0.8)

data_time2 = wayne_all.get_dates([2011, 2012, 2013, 2014])
Example #19
    temp=groups[f].median()
    for i in range(0,768):
        if (dataset.loc[i,f]==0) & (dataset.loc[i,'outcome']==0):
            dataset.loc[i,f]=temp[0]
        if (dataset.loc[i,f]==0) & (dataset.loc[i,'outcome']==1):
            dataset.loc[i,f]=temp[1]


dataset = dataset.values
X = dataset[:,0:len(dataset[0]) -1]
Y = dataset[:, (len(dataset[0])-1)]


#this is for decision tree
data=[[0,0,0,0,0]]
df=pd.DataFrame(data,columns=['feats','depth','split','max_leaf','acc'])
for feats in range(2, 7):
    for dept in range(2, 6):
        acc = 0
        for split in range(5,40,5):
            for leaf in range(7,10):
                for i in range(20):
                    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
                    classifier = AdaBoostClassifier(
                        DecisionTreeClassifier(max_depth=dept,
                                               max_features=feats,
                                               min_samples_split=split,
                                               splitter="best",
                                               criterion="entropy",
                                               max_leaf_nodes=leaf),
                        learning_rate=1.0)
                    classifier.fit(X_train, Y_train)
                    res = classifier.score(X_test, Y_test)
                    acc = acc + res
                acc = acc / 20    
                print('feats:', feats, 'Depth:', dept,'split:',split,'max_leaf',leaf, 'acc:', acc*100)
                df=df.append({'feats':feats,'depth':dept,'split':split,'max_leaf':leaf,'acc':acc},ignore_index=True)
df.to_csv('Adaboost_result.csv', sep=',')
Example #20
g_train = g.iloc[train_ind, :]
g_test = g.iloc[test_ind, :]

clf = tree.DecisionTreeClassifier(criterion='gini',
                                  max_depth=6,
                                  min_samples_leaf=3)
####################
clf = RandomForestClassifier(criterion='gini',
                             max_depth=6,
                             min_samples_leaf=3,
                             n_estimators=50)
####################
clf = AdaBoostClassifier(DecisionTreeClassifier(criterion='gini',
                                                max_depth=6,
                                                min_samples_leaf=3),
                         n_estimators=200,
                         learning_rate=0.1)
####################
clf = neighbors.KNeighborsClassifier(100, weights='uniform')
clf = neighbors.KNeighborsClassifier(100, weights='distance')
####################
clf = GaussianNB()
##############################
t0 = time()
param_grid = {
    'C': [150, 500, 750, 1000],
    'gamma': [0.0005, 0.001, 0.05, .01],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train, y_train)
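The excerpt stops right after fitting the grid search; a hedged follow-up (not in the original) that reports the tuned SVC using the t0 timer set above:

print("GridSearchCV fitted in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)
print("Best parameters:", clf.best_params_)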
Example #21
plt.title("Variance VS Components")
plt.show()

# Selecting the ideal number of components and fitting the data
pca = PCA(n_components=35)
X = pca.fit_transform(X)

### Training the models ###
models = [
    ("Gaussian NB", GaussianNB()),
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("Logistic Regression", LogisticRegression()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("AdaBoost", AdaBoostClassifier()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("Neural Net", MLPClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("Extra Trees", ExtraTreesClassifier()),
    # ("SVM", SVC(kernel="linear")),
    ("XGBOOST Classifer", XGBClassifier()),
]

## Model comparison ###
start = timeit.default_timer()

accuracies = []
for name, model in models:

    # kfold = model_selection.KFold(n_splits=10)
Example #22
class SkClassifier(MultiClassifier):
    type_map = dict(
        MultiClassifier.type_map,
        c=float,
        gamma=float,
        cache=int,
        n_estimators=int,
        n_neighbors=int,
        radius=float,
        # probability=str_to_bool, # TODO
        # class_weights # TODO
    )

    base_param_grid = {'svc__C': np.logspace(-2, 3, 5)}

    classifiers = {
        'svm': {
            'build':
            lambda self: SVC(kernel='linear',
                             C=self.c,
                             probability=self.probability,
                             cache_size=self.cache,
                             class_weight=SVM_CLASS_WEIGHTS
                             if SVM_CLASS_WEIGHTS else None),
            'param_grid':
            dict(base_param_grid),
            'test_params': {
                'probability': False
            },
            'roc_params': {
                'probability': True
            },
        },
        'svm-rbf': {
            'build':
            lambda self: SVC(kernel='rbf',
                             C=self.c,
                             probability=self.probability,
                             cache_size=self.cache,
                             gamma=self.gamma,
                             class_weight=SVM_CLASS_WEIGHTS
                             if SVM_CLASS_WEIGHTS else None),
            'param_grid':
            dict(base_param_grid, svc__gamma=np.logspace(-9, 3, 5)),
            'test_params': {
                'probability': False
            },
            'roc_params': {
                'probability': True
            },
        },
        'mlp': {
            'build':
            lambda self: MLPClassifier(solver='lbfgs',
                                       alpha=1e-5,
                                       hidden_layer_sizes=(5, 2),
                                       random_state=42),
            'param_grid':
            dict(),
        },
        'knn': {
            'build':
            lambda self: KNeighborsClassifier(n_neighbors=self.n_neighbors,
                                              n_jobs=self.n_jobs),
            'param_grid':
            dict(),
        },
        'rnn': {
            'build':
            lambda self: RadiusNeighborsClassifier(
                radius=self.radius, n_jobs=self.n_jobs, outlier_label=0),
            'param_grid':
            dict(),
        },
        'ada': {
            'build': lambda self: AdaBoostClassifier(),
            'param_grid': dict(),
        },
        'ada-svm': {
            'build':
            lambda self: AdaBoostClassifier(base_estimator=SVC(
                probability=True,
                kernel='rbf',
                C=self.c,
                gamma=self.gamma,
                cache_size=self.cache)),
            'param_grid':
            dict(),
        },
        'ada-sgd': {
            'build':
            lambda self: AdaBoostClassifier(
                base_estimator=SGDClassifier(loss='hinge'), algorithm='SAMME'),
            'param_grid':
            dict(),
        },
        'rf': {
            'build':
            lambda self: RandomForestClassifier(n_estimators=self.n_estimators,
                                                n_jobs=self.n_jobs),
            'param_grid':
            dict(),
        },
        'et': {
            'build':
            lambda self: ExtraTreesClassifier(n_estimators=self.n_estimators,
                                              n_jobs=self.n_jobs),
            'param_grid':
            dict(),
        },
        'gnb': {
            'build': lambda self: GaussianNB(),
            'param_grid': dict(),
        },
        'bnb': {
            'build': lambda self: BernoulliNB(),
            'param_grid': dict(),
        },
    }

    def __init__(self,
                 classifier_name='svm',
                 c=1000,
                 gamma=0.02,
                 cache=2000,
                 n_estimators=200,
                 n_neighbors=16,
                 radius=1.0,
                 probability=True,
                 **kwargs):
        super().__init__(classifier_name, **kwargs)

        logger.info(
            'Initializing Scikit-learn classifier {}'.format(classifier_name))
        self.c = c
        self.gamma = gamma
        self.cache = cache
        self.probability = probability
        self.n_estimators = n_estimators
        self.n_neighbors = n_neighbors
        self.radius = radius

    def load_model(self, model_dir):
        super().load_model(model_dir=model_dir)
        self.clf = joblib.load(os.path.join(model_dir, 'clf.pkl'))

    def save_model(self, output_dir):
        super().save_model(output_dir=output_dir)
        joblib.dump(self.clf, os.path.join(output_dir, 'clf.pkl'))

    def _build_classifier(self, *args, **kwargs):
        return self.classifier_dict['build'](self)

    def _get_param_grid(self):
        return self.classifier_dict['param_grid']

    def _get_test_params(self):
        return self.classifier_dict.get('test_params', {})

    def _get_cv_params(self):
        return {
            **self._get_test_params(),
            **self.classifier_dict.get('cv_params', {})
        }

    def _get_roc_params(self):
        return self.classifier_dict.get('roc_params', {})
Example #23
    tuned_parameters = [{'n_estimators':[5, 10, 100, 200],
                         'criterion':['gini', 'entropy'],
                         'max_features':['log2', 'sqrt'],
                         'max_depth':[10, 100]
                     }]
    algo = RandomForestClassifier()

elif choice=='i' or choice=='I':
    print("\n**********************************\n")
    print("  \t AdaBoost Classifier")
    tuned_parameters = [{'n_estimators':[5, 10, 50, 100, 200],
                         'learning_rate':[0.1, 0.2, 0.5, 1],
                         'algorithm':['SAMME', 'SAMME.R'],
                         'random_state':[1, 2, 3, 5]
                     }]
    algo = AdaBoostClassifier()
    
elif choice=='j' or choice=='J':
    print("\n**********************************\n")
    print("  \t Gradient Boosting Classifier")
    tuned_parameters = [{'n_estimators':[5, 10, 50, 100, 200],
                         'learning_rate':[0.1, 0.2, 0.5, 1],
                         'min_impurity_decrease': [0.0001],
                         'max_depth':[10, 100]
                     }]
    algo = GradientBoostingClassifier()
    
elif choice=='k' or choice=='K':
    print("\n**********************************\n")
    print("  \t XG Boost")
    tuned_parameters = [{'n_estimators':[5, 10, 50, 100, 200],
Example #24
train_data = train_data.dropna()
train_data = preprocess_data(train_data)

X = train_data[['is_1', 'is_2', 'is_3', 'Fare', 'is_male', 'is_female']]
Y = train_data['Survived']

XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=0.2)

n_estimators = 100

models = [
    DecisionTreeClassifier(max_depth=3),
    BaggingClassifier(n_estimators=n_estimators),
    RandomForestClassifier(n_estimators=n_estimators),
    ExtraTreesClassifier(n_estimators=n_estimators),
    AdaBoostClassifier(n_estimators=n_estimators)
]

model_title = [
    'DecisionTree', 'Bagging', 'RandomForest', 'ExtraTrees', 'AdaBoost'
]

surv_preds, surv_probs, scores, fprs, tprs, thres = ([] for i in range(6))

for i, model in enumerate(models):
    print('Fitting {0}'.format(model_title[i]))

    clf = model.fit(XTrain, YTrain)
    surv_preds.append(model.predict(XTest))
    surv_probs.append(model.predict_proba(XTest))
    scores.append(model.score(XTest, YTest))
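    # Hedged continuation sketch (not part of the original excerpt): fill the fpr/tpr/threshold
    # lists initialized above, assuming from sklearn.metrics import roc_curve is available.
    fpr, tpr, thr = roc_curve(YTest, surv_probs[-1][:, 1])
    fprs.append(fpr)
    tprs.append(tpr)
    thres.append(thr)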
Example #25
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier


DECISION_TREE = DecisionTreeClassifier()
LOGISTIC_REGRESSION = LogisticRegression()
NAIVE_BAYS = GaussianNB()

K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)


# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()

def getClassifierMap():
    CLASSIFIER_MAP = {
    "DECISION_TREE": DECISION_TREE,
    "LOGISTIC_REGRESSION": LOGISTIC_REGRESSION,
    "NAIVE_BAYS": NAIVE_BAYS,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

#:# preprocessing

transform_pipeline = Pipeline([('scaler', StandardScaler())])

X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train),
                       columns=X_train.columns)

#:# model

params = {'learning_rate': 0.5, 'n_estimators': 300}

classifier = AdaBoostClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# e595f5d5683f3e3692608020cd5bde18
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
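# Hedged follow-up sketch (not from the original): derive precision and recall from the
# confusion-matrix counts computed above.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
print(f'precision: {precision}')
print(f'recall: {recall}')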
def all_classifier_models():
    models = []
    metrix = []
    c_report = []
    train_accuracy = []
    test_accuracy = []
    
    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma = 'auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    
    test_accuracy= []
    names = []
    for name, model in models:
        try:
            m = model
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            train_acc = round(m.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test,y_pred) *100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception as exc:
            print("Exception occurred for", name, ":", exc)
    return metrix,test_accuracy,names
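A hedged usage sketch (not from the original) that runs the comparison and ranks the fitted models by test accuracy, assuming X_train, X_test, y_train and y_test are already defined:

metrix, test_accuracy, names = all_classifier_models()
for clf_name, train_acc, test_acc in sorted(metrix, key=lambda row: row[2], reverse=True):
    print(f"{clf_name}: train {train_acc}%  test {test_acc:.2f}%")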