Example #1
import os
import pickle

import numpy as np
import pandas as pd
from sklearn import metrics, model_selection
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm


def train(
    features_npy: str,
    targets_csv: str,
    n_splits: int = 5,
    n_repeats: int = 10,
    logdir: str = '.',
    random_seed: int = 82
):
    model = LogisticRegression(
        penalty='elasticnet',
        C=1.0,
        class_weight='balanced',
        random_state=random_seed,
        solver='saga',
        max_iter=200,
        n_jobs=-1,
        l1_ratio=1.0
    )

    X = np.load(features_npy)
    df = pd.read_csv(targets_csv)
    y = df['label'].values

    logs = []
    splitter = model_selection.RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_seed
    )
    pbar = tqdm(
        # note: RepeatedStratifiedKFold ignores `groups`; see the sketch below
        splitter.split(X, y, groups=df['id']), desc='folds', total=splitter.get_n_splits()
    )
    for i, (train_index, valid_index) in enumerate(pbar):
        model_ = clone(model)
        X_train, X_test = X[train_index], X[valid_index]
        y_train, y_test = y[train_index], y[valid_index]

        model_.fit(X_train, y_train)
        preds = model_.predict_proba(X_test)[:, 1]
        logs.append({'auc': metrics.roc_auc_score(y_test, preds)})
        pbar.set_postfix(**logs[-1])
    auc_ = np.array([it['auc'] for it in logs])
    print(f'AUC (mean): {auc_.mean()}\tAUC (std): {auc_.std()}')
    with open(os.path.join(logdir, 'logs.pkl'), 'wb') as f:
        pickle.dump(logs, f)

    # train final model on all data
    model.fit(X, y)
    with open(os.path.join(logdir, 'model.pkl'), 'wb') as f:
        pickle.dump(model, f)
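One caveat on the call above: RepeatedStratifiedKFold silently ignores its `groups` argument, so rows sharing an id can end up in both the training and validation folds. If grouping by `df['id']` actually matters, StratifiedGroupKFold (scikit-learn 1.0+) keeps each group in a single fold; a minimal sketch on synthetic data:

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

rng = np.random.RandomState(0)
X = rng.randn(100, 4)
y = rng.randint(0, 2, 100)        # binary labels
groups = rng.randint(0, 20, 100)  # e.g. a per-row customer id

# every group lands entirely on one side of each split
for train_idx, test_idx in StratifiedGroupKFold(n_splits=5).split(X, y, groups):
    assert set(groups[train_idx]).isdisjoint(groups[test_idx])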
Example #2
def get_models_CV_scores(X_train, Y_train, models, cv_loops):
    # Spot Check Algorithms with cross validation
    # evaluate each model in turn
    scores = []
    names = []
    results = []
    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(n_repeats=cv_loops,
                                                        n_splits=10)
        try:
            cv_results = model_selection.cross_val_score(model,
                                                         X_train,
                                                         Y_train,
                                                         cv=kfold,
                                                         n_jobs=-1,
                                                         scoring=SCORING)
        except ValueError as e:
            print('ValueError: {}'.format(e))
        else:
            scores.append(cv_results)
            names.append(name)
            results.append({
                "model": SkFModel(name, model),
                "score": cv_results.mean()
            })
    # Compare Algorithms
    fig = plt.figure()
    plt.title('Source: {}, {} Attributes'.format(os.path.basename(TRAIN_FILE),
                                                 X_train.shape[-1]),
              fontsize=10)
    plt.suptitle('Algorithm Comparison', fontsize=16)
    ax = fig.add_subplot(111)
    plt.boxplot(scores)
    ax.set_xticklabels(names)
    plt.ylabel('Score ({})'.format(SCORING), fontsize=14)
    if config['outputs']['save_charts']:
        fig.savefig(os.path.join(
            OUTPUT, '{}_Algorithm_Comparison.png'.format(X_train.shape[-1])),
                    dpi=1200)
    if config['outputs']['show_charts']:
        plt.show()
    plt.close(fig)
    return results
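A leaner alternative to the try/except above: cross_val_score can absorb fit failures itself via its error_score parameter, recording NaN for folds that raise instead of aborting the whole sweep. A sketch using the names from this example:

import numpy as np

cv_results = model_selection.cross_val_score(model, X_train, Y_train,
                                             cv=kfold, n_jobs=-1,
                                             scoring=SCORING,
                                             error_score=np.nan)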
Example #3
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics
from tqdm import tqdm

repetitions = 10
datasets = h.datasets()      # h: project-local helper module (not shown here)
clfs = h.classifiers()

for dataset in datasets:
    print(dataset)
    # Gather dataset
    ds = pd.read_csv(dataset[0], header=None).to_numpy()
    X, y = ds[:, :-1], ds[:, -1].astype("int")

    # CV
    for repetition in range(repetitions):
        cv = model_selection.RepeatedStratifiedKFold(
            n_splits=2, n_repeats=5, random_state=np.random.randint(9999))
        fold = 0
        k_accuracies = []
        for train, test in cv.split(X, y):
            fold_X_train, fold_y_train = X[train], y[train]
            fold_X_test, fold_y_test = X[test], y[test]

            clf_accuracies = []
            for clf_n in clfs:
                clf = clfs[clf_n]
                clf.fit(fold_X_train, fold_y_train)
                probas = clf.predict_proba(fold_X_test)
                prediction = np.argmax(probas, axis=1)
                accuracy = metrics.accuracy_score(fold_y_test, prediction)
                clf_accuracies.append(accuracy)
            k_accuracies.append(clf_accuracies)
Example #4
# model objects (HDDT, RF, HDRF, ...) are defined earlier in the original script
models.append(('HDDT', HDDT))
models.append(('RF', RF))
models.append(('HDRF', HDRF))
models.append(('ADADT', ADADT))
models.append(('ADASVM', ADASVM))
models.append(('BAGDT', BAGDT))
models.append(('BAGSVM', BAGSVM))
models.append(('XGB', XGB))

# COMPUTE RESULTS
results = []
names = []
scoring = make_scorer(metrics.fbeta_score, beta=2)
for name, model in models:
    repkfold = model_selection.RepeatedStratifiedKFold(n_splits=5,
                                                       n_repeats=3,
                                                       random_state=seed)
    cv_results_new = model_selection.cross_val_score(model,
                                                     X_train,
                                                     Y_train,
                                                     cv=repkfold,
                                                     scoring=scoring)
    results.append(cv_results_new)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results_new.mean(), cv_results_new.std())
    print(msg)

# BOXPLOT
fig = plt.figure()
ax = fig.add_subplot(111)
plt.boxplot(results, showmeans=True)
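For reference, RepeatedStratifiedKFold(n_splits=5, n_repeats=3) makes cross_val_score return 5 × 3 = 15 scores per model, which is the distribution each box above summarizes. A self-contained check:

from sklearn import model_selection
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, random_state=0)
cv = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)
scores = model_selection.cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=cv)
assert scores.shape == (15,)  # n_splits * n_repeats scores
print(scores.mean(), scores.std())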
Example #5
    'bank_r_b_124002971',
    'pay_by_ach',
    'bank_a_left_blank',
    'addres_z_84010'
]
target_col = 'good_loan'
thresh = 0.0

# INITIALIZE ------------------------------------------------------------------

# Load data
data = pd.read_csv(data_file, index_col='customer_id')

# Create evaluation folds
fold_list = model_selection.RepeatedStratifiedKFold(n_splits=5,
                                                    n_repeats=2,
                                                    random_state=1111)

# LOOK AT DATA ----------------------------------------------------------------

print('Data Shape: ', data.shape)

# Print min/max/mean/std
print(data.agg(['min', 'mean', 'median', 'max', 'std']).transpose())

# Look at correlation
rfpimp.plot_corr_heatmap(data[train_cols], figsize=(10, 8))
plt.show(block=False)

# BUILD A CLASSIFIER ----------------------------------------------------------
Example #6
    return history, test_acc

def get_generators(train_index, test_index, graph_labels, batch_size):
    train_gen = generator.flow(
        train_index, targets=graph_labels.iloc[train_index].values, batch_size=batch_size
    )
    test_gen = generator.flow(
        test_index, targets=graph_labels.iloc[test_index].values, batch_size=batch_size
    )

    return train_gen, test_gen


stratified_folds = model_selection.RepeatedStratifiedKFold(
    n_splits=folds, n_repeats=n_repeats
).split(graph_labels, graph_labels)

for i, (train_index, test_index) in enumerate(stratified_folds):
    print(i)
    train_gen, test_gen = get_generators(
        train_index, test_index, graph_labels, batch_size=20
    )

    model = create_graph_classification_model(generator)

    history, acc = train_fold(model, train_gen, test_gen, es, epochs)

    test_accs.append(acc)

print(np.mean(test_accs))
Example #7
RFE wrapper selection
"""
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

# Create the estimator
estimator3 = RandomForestRegressor()
# Create the selector with 3-fold cross-validation and 10 repetitions
selector3 = RFECV(estimator3,
                  min_features_to_select=1,
                  step=1,
                  cv=model_selection.RepeatedStratifiedKFold(n_splits=3,
                                                             n_repeats=10),
                  scoring='r2',
                  n_jobs=-1)
selector3 = selector3.fit(dataFrame3[colsNotSalePrice2],
                          dataFrame3["SalePrice"])
for col in colsNotSalePrice2[selector3.ranking_ == 1]:
    print("%s" % col, end=", ")

# Forward selection with SequentialFeatureSelector
from mlxtend import feature_selection

# SFS for random forests
sfs3 = feature_selection.SequentialFeatureSelector(
    estimator3,
    k_features=79,
    forward=True,
    scoring="r2",
    cv=model_selection.RepeatedStratifiedKFold(n_splits=3, n_repeats=10),
    n_jobs=-1)
Example #8
    def fit_predict(self,
                    X,
                    y,
                    test_X,
                    categorical_feature,
                    num_iterations=150,
                    early_stopping_rounds=30):
        random_seed = 2019
        model_idx = 0
        predicted = []
        random_seed += 1
        kfold = model_selection.RepeatedStratifiedKFold(
            n_splits=self.n_split, n_repeats=10, random_state=random_seed)
        data = lgb.Dataset(X,
                           label=y,
                           categorical_feature=categorical_feature,
                           free_raw_data=True)
        # data initialization for time calculation
        lgb.train(self.params_list[0],
                  data,
                  1,
                  categorical_feature=categorical_feature)
        for fold_idx, (train_index,
                       valid_index) in enumerate(kfold.split(X, y)):
            self.timer.print("{} model learning".format(model_idx))
            learn_start_time = time.time()

            # train_X, valid_X = X[train_index], X[valid_index]
            # train_y, valid_y = y[train_index], y[valid_index]
            # train_data = lgb.Dataset(train_X, label=train_y)
            # valid_data = lgb.Dataset(valid_X, label=valid_y)
            train_data = data.subset(train_index)
            valid_data = data.subset(valid_index)

            random_seed += 1
            params = self.params_list[model_idx % len(self.params_list)]
            params["seed"] = random_seed

            model = lgb.train(
                params,
                train_data,
                num_iterations,
                valid_data,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=50,
                categorical_feature=categorical_feature,
                callbacks=[TimeoutCallback(self.timer, params['metric'])])
            self.models.append(model)
            print(model.current_iteration())

            predicted.append(model.predict(test_X))
            gc.collect()

            # del train_X, valid_X, train_y, valid_y, train_data, valid_data
            del train_data, valid_data
            gc.collect()
            self.learning_time.append(
                (time.time() - learn_start_time) * num_iterations / min([
                    model.current_iteration() + early_stopping_rounds,
                    num_iterations
                ]))

            self.timer.print_memory_usage()
            if self.timer.time_remain < (1.5 * np.max(self.learning_time) +
                                         0.05 * self.timer.time_budget + 10):
                break
            model_idx += 1
            if model_idx >= self.max_model:
                break
        if len(predicted) > 0:
            return np.stack(predicted).mean(axis=0)
        else:
            return np.zeros(len(test_X))
Example #9
    START_TIME = datetime.now().isoformat(timespec='minutes')
    print(f'Start time is: {START_TIME}')
    tpot_name = f'exported_pipeline.time.{START_TIME}.{args.feat}.tpot.py'
    run_log = 'run_' + tpot_name

    print("TPOT runscript written to:", runscripts_dir / run_log)
    copyfile(os.path.realpath(__file__), runscripts_dir / run_log)

    # Read in data (spatial, temporal, features)
    import cnmfereview as cr
    X_train, X_test, y_train, y_test = cr.set_up_remote_job(DATA_PATH,
                                                            feature=args.feat)

    # stratified k-fold keeps the same proportion of labels in each fold
    kf = model_selection.RepeatedStratifiedKFold(n_splits=cv_folds,
                                                 n_repeats=5,
                                                 random_state=random_state)

    config_dict = None  # 'TPOT light'
    tpot = TPOTClassifier(population_size=population_size,
                          verbosity=2,
                          scoring='f1',
                          random_state=random_state,
                          cv=kf,
                          n_jobs=n_jobs,
                          max_time_mins=max_time_mins,
                          max_eval_time_mins=max_eval_time_mins,
                          config_dict=config_dict,
                          memory=None,
                          periodic_checkpoint_folder=checkpoint_path)
    print(f'Starting TPOT training at: {START_TIME}')
Example #10
print(dict(zip(unique, counts)))  # 0s are outweighed by 1s at a ratio of 0.217, so models may predict 0s less reliably than 1s

#general train test splits
Xtrain, Xtest, ytrain, ytest = ms.train_test_split(x_data, y_data,
                                                   random_state=1)
#feature selection to reduce unnecessary variables
selector = fs.SelectKBest(fs.f_classif, k=5)
selector.fit(Xtrain, ytrain)
# note: in the code shown, the selector is never applied; Xtrain/Xtest would
# need selector.transform(...) for the selection to take effect

#Support Vector Machine training w/o weight

from sklearn import svm
model = svm.SVC()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
cv = ms.RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
print("SVM normal Accuracy Score", met.accuracy_score(ytest, y_model)) #accuracy
cross_val = ms.cross_val_score(model, Xtrain, ytrain, scoring='roc_auc', cv=cv, n_jobs=-1) #cross val
print("SVM normal Cross Validation Score", np.mean(cross_val))

#confusion matrix
mat = met.confusion_matrix(ytest, y_model)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true SVM(normal)')
plt.ylabel('predicted SVM(normal)')
plt.show()

#SVM with "weights"

from sklearn import svm
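# The snippet is truncated here; a minimal completion (an assumption, not the
# original code): the "weights" variant presumably uses class_weight='balanced',
# evaluated with the same repeated CV as above.
model_w = svm.SVC(class_weight='balanced')
model_w.fit(Xtrain, ytrain)
y_model_w = model_w.predict(Xtest)
print("SVM weighted Accuracy Score", met.accuracy_score(ytest, y_model_w))
cross_val_w = ms.cross_val_score(model_w, Xtrain, ytrain, scoring='roc_auc',
                                 cv=cv, n_jobs=-1)
print("SVM weighted Cross Validation Score", np.mean(cross_val_w))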
Example #11
def k_fold_cross_validation(x, y, splits, repeats):

    seed = 7

    # classifiers for the ensemble
    # (note: penalty='l1' requires solver='liblinear' or 'saga' in recent scikit-learn)
    clf1 = LogisticRegression(random_state=seed, C=625, penalty='l1')
    clf2 = MultinomialNB(alpha=1130)
    clf3 = GaussianNB()
    clf4 = KNeighborsClassifier(n_neighbors=450)
    clf5 = ExtraTreesClassifier(random_state=seed,
                                criterion='gini',
                                n_estimators=1000,
                                max_features=5)
    clf6 = QuadraticDiscriminantAnalysis()
    eclf = VotingClassifier(estimators=[('LR', clf1), ('NBM', clf2),
                                        ('NBG', clf3), ('KNN', clf4),
                                        ('ET', clf5), ('ADQ', clf6)],
                            voting='hard')

    # Algorithms compared
    models = []

    models.append(
        ('RL', LogisticRegression(random_state=seed, C=625, penalty='l1')))
    models.append(('ADL', LinearDiscriminantAnalysis()))
    models.append(('ADQ', QuadraticDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier(n_neighbors=450)))
    models.append(('NBG', GaussianNB()))
    models.append(('NBM', MultinomialNB(alpha=1130)))
    models.append(('SVML', SVC(random_state=seed, kernel='linear', C=0.1)))
    models.append(
        ('SVMR', SVC(random_state=seed, kernel='rbf', C=1, gamma=0.0001)))
    models.append(('RF',
                   RandomForestClassifier(random_state=seed,
                                          criterion='entropy',
                                          n_estimators=1000,
                                          max_features=5)))
    models.append(('ET',
                   ExtraTreesClassifier(random_state=seed,
                                        criterion='gini',
                                        n_estimators=1000,
                                        max_features=5)))
    models.append(('ENS', eclf))

    # loop that evaluates each algorithm
    score = 'accuracy'
    results1 = []
    names1 = []
    mean1 = []
    std1 = []

    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(n_splits=splits,
                                                        n_repeats=repeats,
                                                        random_state=seed)
        cv_results = model_selection.cross_val_score(model,
                                                     x,
                                                     y,
                                                     cv=kfold,
                                                     scoring=score)
        results1.append(cv_results)
        names1.append(name)
        mean1.append(cv_results.mean() * 100)
        std1.append(cv_results.std() * 100)
        msg = "%s: %f (%f)" % (name, cv_results.mean() * 100,
                               cv_results.std() * 100)
        print(msg)

    list_results_acc = list(zip(names1, results1))
    print(list_results_acc)
    df_results_acc = pd.DataFrame(list_results_acc)
    if part_ign == 3:
        df_results_acc.to_csv('df_results_acc_3.csv', sep=';')
    if part_ign == 10:
        df_results_acc.to_csv('df_results_acc_10.csv', sep=';')
    if part_ign == 19:
        df_results_acc.to_csv('df_results_acc_19.csv', sep=';')

    if score == 'accuracy':
        list_acc = list(zip(names1, mean1, std1))
        df_acc = pd.DataFrame(list_acc)
        if part_ign == 3:
            df_acc.to_csv('df_acc_3.csv', sep=';')
        if part_ign == 10:
            df_acc.to_csv('df_acc_10.csv', sep=';')
        if part_ign == 19:
            df_acc.to_csv('df_acc_19.csv', sep=';')

    # classifiers for the ensemble
    clf1 = LogisticRegression(random_state=seed, C=625, penalty='l1')
    clf2 = MultinomialNB(alpha=15)
    clf3 = GaussianNB()
    clf4 = KNeighborsClassifier(n_neighbors=10)
    clf5 = ExtraTreesClassifier(random_state=seed,
                                criterion='entropy',
                                n_estimators=1000,
                                max_features=17)
    clf6 = QuadraticDiscriminantAnalysis()
    eclf = VotingClassifier(estimators=[('LR', clf1), ('NBM', clf2),
                                        ('NBG', clf3), ('KNN', clf4),
                                        ('ET', clf5), ('ADQ', clf6)],
                            voting='hard')

    models = []

    models.append(
        ('RL', LogisticRegression(random_state=seed, C=625, penalty='l1')))
    models.append(('ADL', LinearDiscriminantAnalysis()))
    models.append(('ADQ', QuadraticDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier(n_neighbors=10)))
    models.append(('NBG', GaussianNB()))
    models.append(('NBM', MultinomialNB(alpha=15)))
    models.append(('SVML', SVC(random_state=seed, kernel='linear', C=10)))
    models.append(
        ('SVMR', SVC(random_state=seed, kernel='rbf', C=10, gamma=0.001)))
    models.append(('RF',
                   RandomForestClassifier(random_state=seed,
                                          criterion='gini',
                                          n_estimators=1000,
                                          max_features=17)))
    models.append(('ET',
                   ExtraTreesClassifier(random_state=seed,
                                        criterion='entropy',
                                        n_estimators=1000,
                                        max_features=17)))
    models.append(('ENS', eclf))

    # loop that evaluates each algorithm
    score = 'f1_macro'
    results2 = []
    names2 = []
    mean2 = []
    std2 = []

    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(n_splits=splits,
                                                        n_repeats=repeats,
                                                        random_state=seed)
        cv_results = model_selection.cross_val_score(model,
                                                     x,
                                                     y,
                                                     cv=kfold,
                                                     scoring=score)
        results2.append(cv_results)
        names2.append(name)
        mean2.append(cv_results.mean() * 100)
        std2.append(cv_results.std() * 100)
        msg = "%s: %f (%f)" % (name, cv_results.mean() * 100,
                               cv_results.std() * 100)
        print(msg)

    list_results_f1 = list(zip(names2, results2))
    print(list_results_f1)
    df_results_f1 = pd.DataFrame(list_results_f1)
    if part_ign == 3:
        df_results_f1.to_csv('df_results_f1_3.csv', sep=';')
    if part_ign == 10:
        df_results_f1.to_csv('df_results_f1_10.csv', sep=';')
    if part_ign == 19:
        df_results_f1.to_csv('df_results_f1_19.csv', sep=';')

    if score == 'f1_macro':
        list_f1 = list(zip(names2, mean2, std2))
        df_f1 = pd.DataFrame(list_f1)
        if part_ign == 3:
            df_f1.to_csv('df_f1_3.csv', sep=';')
        if part_ign == 10:
            df_f1.to_csv('df_f1_10.csv', sep=';')
        if part_ign == 19:
            df_f1.to_csv('df_f1_19.csv', sep=';')

    # plot the comparison chart
    fig = plt.figure(figsize=(15, 5))
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    plt.subplot(211)
    plt.boxplot(results1)
    ax1.set_xticklabels(names1, fontsize=14)
    plt.ylabel('Acurácia', fontsize=18)
    plt.xlabel('(a)', fontsize=18)
    plt.yticks(rotation='horizontal', fontsize=14)
    plt.axhline(y=0.4656, xmin=0, xmax=1, color='g')
    plt.axhline(y=0.5024, xmin=0, xmax=1, color='b')
    plt.subplot(212)
    plt.xlabel('(b)\nClassificadores', fontsize=18)
    plt.boxplot(results2)
    plt.ylabel('F1-score', fontsize=18)
    ax2.set_xticklabels(names2, fontsize=14)
    plt.yticks(rotation='horizontal', fontsize=14)
    ax2.annotate(
        'RL = Regressao Logistica\nADL = Analise Discr. Linear\n\
ADQ = Analise Discr. Quadratica\nKNN = K-Nearest Neighbors\n\
NBG = Naive Bayes Gaussiano\nNBM = Naive Bayes Multinomial\n\
SVML = SVM Linear\nSVMR = SVM kernel rbf\nRF = Random Forest\n\
ET = Extra Trees',

        # The point that we'll place the text in relation to
        xy=(1.01, 0.5),
        # Interpret the x as axes coords, and the y as figure coords
        xycoords=('axes fraction', 'figure fraction'),

        # The distance from the point that the text will be at
        xytext=(0, 0),
        # Interpret `xytext` as an offset in points...
        textcoords='offset points',

        # Any other text parameters we'd like
        size=12,
        ha='left',
        va='center')
    plt.show()
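One small refinement to the CSV dumps inside k_fold_cross_validation: passing explicit column names to pd.DataFrame makes the saved files self-describing (the column names below are suggestions, not from the original):

df_results_acc = pd.DataFrame(list_results_acc, columns=['model', 'scores'])
df_acc = pd.DataFrame(list_acc, columns=['model', 'mean_pct', 'std_pct'])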
Example #12
import pickle
import sys
import warnings

import numpy as np
import pylab
from sklearn import model_selection, neighbors
from sklearn.preprocessing import scale

import descritores as desc

warnings.simplefilter("ignore")

ss = pylab.loadtxt(sys.argv[1])
path = sys.argv[2]
dim = ss.shape[1] - 8
with open(path + "classes.txt", "rb") as f, open(path + "names.pkl", "rb") as g:
    cl = pickle.load(f)
    nomes = pickle.load(g)

clf = neighbors.KNeighborsClassifier(n_neighbors=3)
it = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=50)

for s in ss:
    sigma = s[4:4 + dim]
    SI, DB, CH = s[dim + 4], s[dim + 5], s[dim + 6]

    db = {}
    for im_file in nomes:  # figure names
        nmbe = desc.bendenergy(path + im_file, sigma)
        db[im_file] = np.hstack((cl[im_file], np.log(nmbe())))

    Y = np.array([db[i][0] for i in db.keys()]).astype(int)
    X = scale(np.array([db[i][1:] for i in db.keys()]))
    res = model_selection.cross_val_score(clf, X, Y, cv=it, scoring="accuracy")
    st = "{0} {1} {2} {3} {4} {5}".format(s[1], s[2], s[3], SI, DB, CH)
Example #13
# from sklearn.ensemble import RandomForestRegressor
# from sklearn import model_selection

# # Create the estimator
# estimator3 = RandomForestRegressor()
# # Create the selector with 4-fold cross-validation and 10 repetitions
# selector3 = RFECV(estimator3, step=1, cv=model_selection.RepeatedStratifiedKFold(n_splits=4, n_repeats=10),
#               scoring='r2', n_jobs=-1)
# selector3 = selector3.fit(dataFrame3[colsNotSalePrice2], dataFrame3["SalePrice"])

# pd.to_pickle(selector3, "./PickledObjects/selector3.pkl")

from mlxtend import feature_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

estimator = RandomForestRegressor()
sfs3 = feature_selection.SequentialFeatureSelector(
    estimator,
    k_features=79,
    forward=True,
    scoring="r2",
    # note: stratified CV expects class labels; for the continuous SalePrice
    # target, model_selection.RepeatedKFold would be the matching splitter
    cv=model_selection.RepeatedStratifiedKFold(n_splits=3, n_repeats=10),
    n_jobs=-1)

sfs4 = sfs3.fit(dataFrame3[colsNotSalePrice2], dataFrame3["SalePrice"])

pd.to_pickle(sfs4, "./PickledObjects/sfs4.pkl")