def objective_xgb(space):
    """Hyperopt objective: mean log loss of an XGBoost classifier over stratified folds.

    Expects xgb, StratifiedKFold, log_loss, STATUS_OK and the X_train_pred /
    Y_train_new frames to be available in the enclosing module.
    """
    numfolds = 5
    total = 0
    kf = StratifiedKFold(n_splits=numfolds, shuffle=True, random_state=666)

    clf = xgb.XGBClassifier(n_estimators=100,
                            max_depth=space['max_depth'],
                            learning_rate=space['learning_rate'],
                            min_child_weight=space['min_child_weight'],
                            subsample=space['subsample'],
                            colsample_bytree=space['colsample_bytree'])

    for train_index, test_index in kf.split(X_train_pred, Y_train_new.is_duplicate):
        xtrain, xtest = X_train_pred.iloc[train_index], X_train_pred.iloc[test_index]
        ytrain, ytest = Y_train_new.iloc[train_index], Y_train_new.iloc[test_index]

        eval_set = [(xtrain, ytrain), (xtest, ytest)]
        clf.fit(xtrain, ytrain.values.ravel(), eval_metric="logloss",
                eval_set=eval_set, early_stopping_rounds=50)

        pred = clf.predict_proba(xtest)[:, 1]
        logloss = log_loss(ytest, pred)
        print("SCORE:", logloss)
        total += logloss

    total = total / numfolds
    print(total)
    return {'loss': total, 'status': STATUS_OK}
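The dictionary returned above is the shape hyperopt expects from an objective, so a driver along the following lines would work. This is a hedged sketch: the search-space ranges and max_evals value are illustrative assumptions, not part of the original example.

# Hedged sketch: drive objective_xgb with hyperopt (assumed ranges, not from the original snippet).
from hyperopt import fmin, tpe, hp, Trials

space = {
    'max_depth': hp.choice('max_depth', list(range(3, 11))),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'min_child_weight': hp.choice('min_child_weight', list(range(1, 10))),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
}

trials = Trials()
best = fmin(fn=objective_xgb, space=space, algo=tpe.suggest, max_evals=50, trials=trials)
print(best)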
Example #2
def cross_validation(data, label, para_c, para_o):
    kfold = para_c['kfold']
    neg = 0
    pos = 1
    gF1 = []
    ggmean = []
    gauc = []
    path = 'collection.xls'
    from vae4 import mnist_vae
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=kfold)
    for train_index, test_index in skf.split(data, label):
        train = data[train_index]
        test = data[test_index]
        train, test = standard_scale(train, test)
        train_label = label[train_index]
        test_label = label[test_index]
        negative = train[train_label == neg]
        positive = train[train_label == pos]
        from sklearn.naive_bayes import GaussianNB
        gnb = GaussianNB()
        #        from sklearn.ensemble import RandomForestClassifier
        #        gnb = RandomForestClassifier()
        if para_c['over_sampling'] == 'SMOTE':
            s = Smote(positive, N=100)
            gene = s.over_sampling()
        elif para_c['over_sampling'] == 'vae':
            gene_size = positive.shape[0]
            gene = mnist_vae(positive, gene_size, para_o)
        elif para_c['over_sampling'] == 'random_walk':
            gene_size = positive.shape[0]
            gene = random_walk(positive, gene_size)
        else:
            gene = []
        train, train_label = app(positive, negative, gene)
        y_predne = gnb.fit(train, train_label).predict(test)
        temf, temg, tema = compute(test_label, y_predne)
        print('F1', temf, 'AUC', tema, 'gmean', temg)
        gF1.append(temf)
        ggmean.append(temg)
        gauc.append(tema)
    print(
        '##########################zhouying###################################'
    )
    #    if para_c['over_sampling'] == 'vae':
    #        write(path,dict(para_c,**para_o),{'F1':gF1,'AUC':gauc,'gmean':ggmean})
    #    else:
    #        write(path,para_c,{'F1':gF1,'AUC':gauc,'gmean':ggmean})
    print('mean F1:', np.mean(gF1), 'mean AUC:', np.mean(gauc), 'mean gmean:',
          np.mean(ggmean))
    return
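A hedged call sketch for cross_validation: 'kfold' and 'over_sampling' are the keys the function actually reads; the data and label arrays, and any VAE options placed in para_o, are assumptions.

# Hedged usage sketch (data, label and the option values are assumed).
para_c = {'kfold': 5, 'over_sampling': 'SMOTE'}   # other branches: 'vae', 'random_walk', or anything else for no oversampling
para_o = {}                                       # only consulted by the 'vae' branch
cross_validation(data, label, para_c, para_o)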
def classifer_stacking(data_file,alertgroup_name,classifier_list):
    classifiers = {'KNN':KNeighborsClassifier(),
                   # n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric_params=None, n_jobs=1),
                   # 'LR': LogisticRegression(),
                   'RF':  RandomForestClassifier(),
                   # n_estimators=60,max_depth=13,min_samples_split=120,min_samples_leaf=20,random_state=10
                   'DT': tree.DecisionTreeClassifier(),
                   # criterion='gini',splitter=random,max_features=None,max_depth=13,min_samples_leaf=2
                   'GBDT': GradientBoostingClassifier()
                       # loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1,max_depth=3,verbose=0,presort='auto')
                   # 'XGB':xgboost_classifier
                   }
    all_data = pd.read_csv(data_file, sep=',', dtype=str)
    for alertgroup, group in all_data.groupby('alertgroup'):
        if alertgroup == alertgroup_name:
            train_x, test_x, train_y, test_y = get_data(group, split=True)
            arr_x = train_x.values
            arr_y = train_y.values
            max_fs = 0
            best_model = None
            # StratifiedKFold.split() needs both features and labels, and the
            # fold variables get their own names so the hold-out test set from
            # get_data() is not overwritten by the fold loop.
            stratified_folder = StratifiedKFold(n_splits=3, shuffle=False)

            for fold_train_index, fold_test_index in stratified_folder.split(arr_x, arr_y):
                fold_train_x = arr_x[fold_train_index]
                fold_train_y = arr_y[fold_train_index]
                fold_test_x = arr_x[fold_test_index]
                fold_test_y = arr_y[fold_test_index]
                classifiers_list = [classifiers[cl] for cl in classifier_list]
                stack_model = StackingClassifier(classifiers=classifiers_list, use_probas=True,
                                                 average_probas=True, meta_classifier=classifiers['RF'])

                stack_model.fit(fold_train_x, fold_train_y)
                predict = stack_model.predict(fold_test_x)
                fbetascore = fbeta_score(fold_test_y, predict, beta=1)
                print(' f1score:' + str(fbetascore))
                if fbetascore > max_fs:
                    max_fs = fbetascore
                    best_model = stack_model

            stack_model = best_model
            predict = stack_model.predict(test_x)
            precision = metrics.precision_score(test_y, predict)
            recall = metrics.recall_score(test_y, predict)
            fbetascore = fbeta_score(test_y, predict, beta=0.5)
            accuracy = metrics.accuracy_score(test_y, predict)
            print('final performance:')
            print(alertgroup_name)
            print('precision: %.6f%%' % (100 * precision))
            print('recall: %.6f%%' % (100 * recall))
            print('f0.5score: %.6f%%' % (100 * fbetascore))
            print('accuracy: %.6f%%' % (100 * accuracy))

            return best_model
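A hedged call sketch for classifer_stacking; the CSV path and alert-group name below are placeholders, and the classifier keys must come from the dictionary defined at the top of the function.

# Hedged usage sketch (file name and group name are placeholders).
best_model = classifer_stacking('alerts.csv', 'some_alertgroup', ['KNN', 'RF', 'DT', 'GBDT'])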
def objective(space):
    """Hyperopt objective: mean log loss of a random forest over stratified folds.

    Expects RandomForestClassifier, StratifiedKFold, log_loss, STATUS_OK and the
    X_train_new / Y_train_new frames to be available in the enclosing module.
    """
    numfolds = 5
    total = 0
    kf = StratifiedKFold(n_splits=numfolds, shuffle=True, random_state=13)

    rf = RandomForestClassifier(n_estimators=200,
                                max_depth=space['max_depth'],
                                max_features=space['max_features'],
                                criterion=space['criterion'],
                                min_impurity_split=0.0005,
                                random_state=666,
                                n_jobs=-1)

    for train_index, test_index in kf.split(X_train_new, Y_train_new.is_duplicate):
        xtrain, xtest = X_train_new.iloc[train_index], X_train_new.iloc[test_index]
        ytrain, ytest = Y_train_new.iloc[train_index], Y_train_new.iloc[test_index]

        rf.fit(xtrain, ytrain.values.ravel())

        pred = rf.predict_proba(xtest)[:, 1]
        logloss = log_loss(ytest, pred)
        print("SCORE:", logloss)
        total += logloss

    total = total / numfolds
    print(total)
    return {'loss': total, 'status': STATUS_OK}
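A hedged search-space sketch for the random-forest objective above; the ranges are illustrative and the hyperopt imports are assumed.

# Hedged sketch: illustrative hyperopt space for objective() (not from the original snippet).
from hyperopt import fmin, tpe, hp

rf_space = {
    'max_depth': hp.choice('max_depth', list(range(4, 30))),
    'max_features': hp.uniform('max_features', 0.2, 1.0),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
}
best_rf = fmin(fn=objective, space=rf_space, algo=tpe.suggest, max_evals=50)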
Example #5
def create_cross_validation(data, positive, N):
    """Split the full dataset into N stratified cross-validation folds.

    data: a list [X, y], with X of shape (n_samples, n_features) and y of
    shape (n_samples, 1); positive: the label value treated as the positive
    class (mapped to 1, everything else to 0); N: number of folds.
    """
    label = data[1]
    data = data[0]
    # Binarize via a precomputed mask so the relabelling also works when positive == 0.
    pos_mask = (label == positive)
    label[pos_mask] = 1
    label[~pos_mask] = 0
    result = {}
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=N)
    i = 0
    for train_index, test_index in skf.split(data, label):
        train = data[train_index]
        test = data[test_index]
        train_label = label[train_index]
        test_label = label[test_index]
        result[str(i)] = [train, train_label, test, test_label]
        i = i + 1
    return result
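A hedged usage sketch for create_cross_validation; X, y and the positive label value are assumed.

# Hedged usage sketch (X, y and the positive label are assumed).
folds = create_cross_validation([X, y], positive=1, N=5)
train, train_label, test, test_label = folds['0']   # unpack the first fold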
# Create a Classifier Service.
# Classifier process starts using a default configuration.
classifier = Classifier.run(Config())

# Prepare arrays to keep true/predicted labels to display a report later.
true_labels = []
predicted_labels = []

# Run stratified K-fold validation.
labels = list(dataset.get_labels())
if sklearn_version < 18:
    train_test_indices = StratifiedKFold(labels, n_folds=10)
else:
    skf = StratifiedKFold(n_splits=10)
    train_test_indices = skf.split(labels, labels)

for train_idx, test_idx in train_test_indices:
    # Clear the classifier (call `clear` RPC).
    classifier.clear()

    # Split the dataset to train/test dataset.
    (train_ds, test_ds) = (dataset[train_idx], dataset[test_idx])

    # Train the classifier using train dataset.
    for (idx, label) in classifier.train(train_ds):
        # You can peek records being trained.
        #print('train[{0}]: (label: {1}) => {2}'.format(idx, label, train_ds[idx]))
        pass

    # Test the classifier using test dataset.
Example #7
def grid_search(data, label, para_c, para_o):
    kfold = para_c['kfold']
    neg = 0
    pos = 1
    gF1 = []
    ggmean = []
    gauc = []
    path = 'collection.xls'
    mF1 = 0
    maxF1 = {}
    mgmean = 0
    maxgmean = {}
    mauc = 0
    maxauc = {}
    from vae4 import mnist_vae
    from sklearn.model_selection import StratifiedKFold
    from sklearn.preprocessing import MinMaxScaler
    for hidden_encoder_dim in np.arange(1, data.shape[1], 5):
        para_o['hidden_encoder_dim'] = hidden_encoder_dim
        for hidden_decoder_dim in np.arange(1, data.shape[1], 5):
            para_o['hidden_decoder_dim'] = hidden_decoder_dim
            for epochs in np.arange(20, 50, 10):
                para_o['epochs'] = epochs
                for batch_size in np.arange(1, 20, 3):
                    para_o['batch_size'] = batch_size
                    for learning_rate in np.linspace(0.001, 0.1, 10):
                        para_o['learning_rate'] = learning_rate
                        for lam in np.linspace(0, 0.25 * learning_rate, 4):
                            para_o['lam'] = lam
                            skf = StratifiedKFold(n_splits=kfold)
                            for train_index, test_index in skf.split(
                                    data, label):
                                train = data[train_index]
                                test = data[test_index]
                                min_max_scaler = MinMaxScaler()
                                train = min_max_scaler.fit_transform(train)
                                test = min_max_scaler.transform(test)
                                train_label = label[train_index]
                                test_label = label[test_index]
                                negative = train[train_label == neg]
                                positive = train[train_label == pos]
                                from sklearn.naive_bayes import GaussianNB
                                gnb = GaussianNB()
                                gene_size = negative.shape[0] - positive.shape[
                                    0]
                                gene = mnist_vae(positive, gene_size, para_o)

                                train, train_label = app(
                                    positive, negative, gene)
                                #        print(train.shape)
                                y_predne = gnb.fit(train,
                                                   train_label).predict(test)
                                temf, temg, tema = compute(
                                    test_label, y_predne)
                                gF1.append(temf)
                                ggmean.append(temg)
                                gauc.append(tema)
                            if mF1 < np.mean(gF1):
                                mF1 = np.mean(gF1)
                                maxF1 = para_o.copy()
                            if mgmean < np.mean(ggmean):
                                mgmean = np.mean(ggmean)
                                maxgmean = para_o.copy()
                            if mauc < np.mean(gauc):
                                mauc = np.mean(gauc)
                                maxauc = para_o.copy()
                            gF1 = []
                            ggmean = []
                            gauc = []
                            print(
                                '##########################zhouying###################################'
                            )
    print(
        '##########################zhouying###################################'
    )
    #    print(dict(para_c,**maxF1))
    #    print({'max F1':mF1})
    #    print(dict(para_c,**maxgmean))
    #    print({'max gmean':mgmean})
    #    print(dict(para_c,**maxauc))
    #    print({'max auc':mauc})
    write(path, dict(para_c, **maxF1), {'max F1': mF1})
    write(path, dict(para_c, **maxgmean), {'max gmean': mgmean})
    write(path, dict(para_c, **maxauc), {'max auc': mauc})
    return
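A hedged call sketch for grid_search; the data and label arrays and the 'kfold' value are assumed, and para_o is filled in by the grid loops themselves.

# Hedged usage sketch (data and label arrays are assumed; para_o starts empty).
para_c = {'kfold': 5, 'over_sampling': 'vae'}
grid_search(data, label, para_c, para_o={})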
Example #8
    def BuildModel(this, clip_csv=None):
        """
            Builds the DNN model used to classify partial clips

            @TODO:  document me
        """
        cfg = this.config

        if not clip_csv:
            clip_csv = cfg.full_clips_csv

        #first: load the clip lists
        clipFiles = pd.read_csv(clip_csv)

        #second : load the actual clip data
        this.log.debug('loading audio data')
        X_train = this._prepare_data(clipFiles)

        #third, index and binarize the labels
        this.log.debug('binarizing labels')
        y_train = pd.get_dummies(clipFiles['label'])

        # now we can actually build the model
        if (cfg.useDummyModel):
            model = this._buildDummyModel()
        else:
            model = this._buildModel()

        # and run it
        clipFiles['label_idx'] = clipFiles['label'].astype(
            'category').cat.codes
        try:
            skf = StratifiedKFold(clipFiles.label_idx, n_folds=cfg.num_folds)
        except TypeError:
            n_samples = len(clipFiles.label_idx)
            skf = StratifiedKFold(n_splits=cfg.num_folds)
            skf = skf.split(np.zeros(n_samples), clipFiles.label_idx)

        for i, (train_split, val_split) in enumerate(skf):
            X, y = X_train[train_split], y_train.values[train_split]
            X_val, y_val = X_train[val_split], y_train.values[val_split]

            checkpoint = ModelCheckpoint(cfg.models_dir + '/best_%d.h5' % i,
                                         monitor='val_loss',
                                         verbose=1,
                                         save_best_only=True)
            early = EarlyStopping(monitor="val_loss", mode="min", patience=5)

            tb = TensorBoard(log_dir=cfg.logs_dir + '/fold_%i' % i,
                             write_graph=True)

            callbacks_list = [checkpoint, early, tb]

            print("#" * 50)
            print("Fold: ", i)

            history = model.fit(X,
                                y,
                                validation_data=(X_val, y_val),
                                callbacks=callbacks_list,
                                batch_size=64,
                                epochs=cfg.max_epochs)

            # run predict on our test set
            model.load_weights(cfg.models_dir + '/best_%d.h5' % i)
            predictions = model.predict(X_train, batch_size=64, verbose=1)

            # save the column names for the model
            columns = pd.Series(['name'] + list(y_train))
            columns.to_csv(cfg.column_names_csv, header=False)

            # Save train predictions
            np.save(cfg.self_prediction_dir + "/train_predictions_%d.npy" % i,
                    predictions)
            y_predict = pd.DataFrame(predictions, columns=list(y_train))
            y_predict.to_csv(cfg.self_prediction_dir +
                             "/train_predictions_%d.csv" % i,
                             index=True,
                             index_label='idx')
Example #9
xgb_model = xgb.XGBClassifier(n_estimators=300,
                              max_depth=5,         # 7
                              learning_rate=0.05,  # 0.168
                              min_child_weight=7,
                              subsample=0.97,
                              colsample_bytree=0.82794)

# Drop the 'pred_lgbm' meta-feature column before refitting.
del X_train_pred['pred_lgbm']
del X_test_pred['pred_lgbm']

pred_test_full = np.zeros(X_test_pred.shape[0])

for train_index, test_index in kf.split(X_train_preds_new, Y_train_new.is_duplicate):
    xtrain, xtest = X_train_preds_new.iloc[train_index], X_train_preds_new.iloc[test_index]
    ytrain, ytest = Y_train_new.iloc[train_index], Y_train_new.iloc[test_index]

    eval_set = [(xtrain, ytrain), (xtest, ytest)]
    xgb_model.fit(xtrain, ytrain.values.ravel(), eval_metric="logloss",
                  eval_set=eval_set, early_stopping_rounds=50)

    # Accumulate predictions on the held-out test set across folds.
    pred_test = xgb_model.predict_proba(X_test_pred)[:, 1]
    pred_test_full += pred_test

    pred = xgb_model.predict_proba(xtest)[:, 1]
Example #11
                            min_impurity_split = 0.005356707662170046,
                            random_state = 13,
                            warm_start = True,
                            n_jobs = -1
                            )

    for train_index, test_index in kf.split(X_train_rf, Y_train_new.is_duplicate):
        xtrain, xtest = X_train_rf.iloc[train_index], X_train_rf.iloc[test_index]
        ytrain, ytest = Y_train_new.iloc[train_index], Y_train_new.iloc[test_index]

        rf.fit(xtrain, ytrain)
        pred = rf.predict_proba(xtest)[:, 1]

        logloss = log_loss(ytest, pred)
        total += logloss
    total = total / numfolds
    print(total)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
tprs_test = []
aucs_test = []
mean_fpr_test = np.linspace(0, 1, 100)

tpot_plot = plt.figure()
epochs = 100
for epoch in range(epochs):
    X = x_train
    Y = y_train.values
    X_test = x_test
    Y_test = y_test.values
    i = 0
    for train, test in cv.split(X, Y):
        probas_ = gbc.fit(X[train], Y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve for this fold.
        fpr, tpr, thresholds = roc_curve(Y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        #plt.plot(fpr, tpr, lw=1, alpha=0.3, label='Epoch %d, ROC fold %d (AUC = %0.2f)' % (epoch, i, roc_auc))
        #i += 1
    test_probas_ = gbc.predict_proba(X_test)
    # Compute ROC curve and area under the curve on the held-out test set.
    fpr_test, tpr_test, thresholds_test = roc_curve(Y_test, test_probas_[:, 1])
    tprs_test.append(interp(mean_fpr_test, fpr_test, tpr_test))
    tprs_test[-1][0] = 0.0
    roc_auc_test = auc(fpr_test, tpr_test)