Example #1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier as XGBC


def study(data):
    x = data.loc[:, data.columns != 'y']
    y = data['y']
    xtrain, xtest, ytrain, ytest = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=480)
    # sweep n_estimators from 500 to 595 in steps of 5
    xticks = np.arange(500, 600, 5)
    train_scores = []
    test_scores = []
    for i in xticks:
        model = XGBC(n_estimators=i, learning_rate=0.05)
        model.fit(xtrain, ytrain)
        ytrain_pred = model.predict(xtrain)
        ytest_pred = model.predict(xtest)
        train_score = accuracy_score(ytrain, ytrain_pred)
        test_score = accuracy_score(ytest, ytest_pred)
        train_scores.append(train_score)
        test_scores.append(test_score)
    # sorted_feature_importances = model.feature_importances_[np.argsort(-model.feature_importances_)]
    test_scores = np.array(test_scores, dtype='float32')
    sorted_test_scores = test_scores[np.argsort(-test_scores)]
    sorted_xtick = xticks[np.argsort(-test_scores)]
    print([*zip(sorted_test_scores, sorted_xtick)])
    plt.plot(xticks, train_scores, label='train')
    plt.plot(xticks, test_scores, label='test')
    plt.legend()
    plt.show()
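A minimal usage sketch, assuming a hypothetical data.csv whose binary target column is named y:

import pandas as pd

data = pd.read_csv('data.csv')  # hypothetical path
study(data)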
Example #2
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier as XGBC


def xgbcv(num_round, subsample, eta, max_depth):
    # cross-validated objective for a hyperparameter search; X, y, score (a
    # scoring name) and kfold are assumed to be defined in the enclosing scope
    val = cross_val_score(
        XGBC(n_estimators=int(num_round),  # the sklearn wrapper takes n_estimators, not num_round
             subsample=float(subsample),
             eta=min(eta, 0.999),
             max_depth=int(max_depth),
             random_state=2),
        X, y, scoring=score, cv=kfold
    ).mean()
    return val
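This signature follows the objective convention of the bayes_opt package; a hedged sketch of driving it that way (the bounds are illustrative, not from the source):

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=xgbcv,
    pbounds={'num_round': (50, 500),
             'subsample': (0.5, 1.0),
             'eta': (0.01, 0.3),
             'max_depth': (3, 10)},
    random_state=2,
)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best score and the parameters that produced it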
Example #3
# In[]:
X = pd.DataFrame(X, columns=["X1", "X2"])
y = pd.DataFrame(y, columns=["y"])
data = pd.concat([X, y], axis=1)

# In[]:
ft.Sample_imbalance(data, "y")

# In[]:
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
# In[]:
ft.sample_category(Ytest, Ytrain)

# In[]:
# Fit the model via the sklearn API #
clf = XGBC().fit(Xtrain, Ytrain)
ypred = clf.predict(Xtest)
ypred_proba = clf.predict_proba(Xtest)
# In[]:
print(clf.score(Xtest, Ytest))  # default metric: accuracy
print(cm(Ytest, ypred, labels=[1, 0]))  # put the minority class first
print(recall(Ytest, ypred))
print(auc(Ytest, clf.predict_proba(Xtest)[:, 1]))

# In[]:
clf = XGBC(scale_pos_weight=10).fit(Xtrain, Ytrain)  # ratio of negative (0) to positive (1) samples
ypred = clf.predict(Xtest)
ypred_proba = clf.predict_proba(Xtest)
# In[]:
print(clf.score(Xtest, Ytest))  # default metric: accuracy
print(cm(Ytest, ypred, labels=[1, 0]))  # put the minority class first
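scale_pos_weight=10 hard-codes a roughly 10:1 negative-to-positive imbalance; a minimal sketch of deriving the weight from the training labels instead:

neg = int((Ytrain.values == 0).sum())
pos = int((Ytrain.values == 1).sum())
clf = XGBC(scale_pos_weight=neg / pos).fit(Xtrain, Ytrain)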
Example #4
# split into train/test sets with t_t_s (train_test_split); because the
# datasets were combined for uniform one-hot and label encoding, shuffle is
# disabled, and the test set is known to hold 15060 rows
test_set_size = test_dataset_nomissing.shape[0]
print('\n test_set_size...')
print(test_set_size)
X_train, X_test, Y_train, Y_test = t_t_s(rescaledX,
                                         Y,
                                         test_size=test_set_size,
                                         random_state=seed,
                                         shuffle=False)

# instantiate XGBC class using defaults
model = XGBC()

# fit model to training datasets
print('\n training the model...')
model.fit(X_train, Y_train)

# view trained model
print('\n model...')
print(model)

# make predictions for test data
print('\n making predictions...')
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
train_accuracy = model.score(X_train, Y_train)  # mean accuracy on the training set
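The snippet stops after scoring the training set; a minimal follow-up that scores the held-out rows as well, assuming the usual sklearn metric import:

from sklearn.metrics import accuracy_score

print('\n test accuracy...')
print(accuracy_score(Y_test, predictions))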
Example #5
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, roc_auc_score)
from xgboost import XGBClassifier as XGBC


def train(data):
    x = data.loc[:, data.columns != 'y']
    y = data['y']
    xtrain, xtest, ytrain, ytest = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=100)
    model = XGBC(n_estimators=500, learning_rate=0.05, eval_metric='auc')
    model.fit(xtrain, ytrain)

    # train score: 0.900719
    #  test score: 0.893923
    # train score: 0.920631
    #  test score: 0.899448
    # train score: 0.927821
    #  test score: 0.903867

    ytrain_pred = model.predict(xtrain)
    ytest_pred = model.predict(xtest)
    train_score = accuracy_score(ytrain, ytrain_pred)
    test_score = accuracy_score(ytest, ytest_pred)
    print('train score: %f \n test score: %f' % (train_score, test_score))
    print('roc auc', roc_auc_score(ytrain, ytrain_pred))  # AUC of hard labels; use predict_proba for a threshold-free score

    sorted_feature_importances = model.feature_importances_[np.argsort(
        -model.feature_importances_)]
    feature_importance_names = x.columns[np.argsort(
        -model.feature_importances_)]
    print([*zip(feature_importance_names, sorted_feature_importances)])
    fi = pd.DataFrame(
        [*zip(feature_importance_names, sorted_feature_importances)],
        columns=['name', 'score'])
    fi = fi.sort_values(by=['score'], ascending=True)
    fi = fi.reset_index(drop=True)

    ax = plt.gca()
    ax.hlines(y=fi.index,
              xmin=0,
              xmax=fi.score,
              color='firebrick',
              alpha=0.4,
              linewidth=30)
    for index, row in fi.iterrows():
        plt.text(row['score'],
                 index,
                 round(row['score'], 2),
                 horizontalalignment='left',
                 verticalalignment='center',
                 fontdict={
                     'color': 'black',
                     'fontsize': 30
                 })

    plt.yticks(fi.index, fi.name, fontsize=30)
    # ax.scatter(x=fi.index, y=fi.score, s=75, color='firebrick', alpha=0.7)
    plt.show()

    train_confusion_matrix = confusion_matrix(ytrain, ytrain_pred)
    test_confusion_matrix = confusion_matrix(ytest, ytest_pred)
    print('train confusion matrix:\n %s' % train_confusion_matrix)
    print('test confusion matrix:\n %s' % test_confusion_matrix)
    train_classification_report = classification_report(ytrain, ytrain_pred)
    test_classification_report = classification_report(ytest, ytest_pred)
    print('train classification report:\n %s' % train_classification_report)
    print('test classification report:\n %s' % test_classification_report)
    return model, fi
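A usage sketch mirroring Example #1, with data.csv as a hypothetical input whose binary target column is y:

data = pd.read_csv('data.csv')  # hypothetical path
model, fi = train(data)
print(fi.tail())  # the highest-scoring features end up last after the ascending sort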
Example #6
def predict_sent(vecs, xgb_model_analyze):
    # restore a saved sentiment classifier from disk; `predict` and
    # `w2v_model` are helpers/globals defined elsewhere in the module
    xgb = XGBC()
    xgb.load_model(xgb_model_analyze)
    pred = predict(vecs, w2v_model, xgb, 300)
    df = pd.DataFrame(pred, columns=['sent'])
    return df
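load_model expects a file written by XGBoost's own save_model; a hedged sketch of producing one (the filename and training data are illustrative):

trained = XGBC().fit(X_train, y_train)    # assumes training data in scope
trained.save_model('xgb_sentiment.json')  # JSON is the recommended format
df = predict_sent(vectors, 'xgb_sentiment.json')  # `vectors` is hypothetical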
Example #7
                        find_ID(X_test[i].tolist(), X_all.tolist()))

        to_get_single_stats = False
        if to_get_single_stats:
            print('Accuracy:', sum(res_pred == y_test) / float(len(y_test)))
            print(clf.coef_[0])
            fig, ax = plt.subplots()
            plt.barh(range(len(y_ticklabels)), clf.coef_[0])
            ax.set_yticklabels(y_ticklabels)
            plt.xlabel('coefs from linear SVM', fontsize=20)
            plt.tight_layout()
            plt.show()
    elif use_method == 'xgbc':
        from xgboost import XGBClassifier as XGBC
        from xgboost import plot_importance
        clf = XGBC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
        #print res_pred
        #print y_test

        print('Accuracy:', sum(res_pred == y_test) / float(len(y_test)))

        print(clf.feature_importances_)
        plot_importance(clf)
        ax = plt.gca()
        curr_labels = ax.get_yticklabels()
        curr_inds = []
        for one_label in curr_labels:
            curr_label = one_label.get_text()
            one_ind = int(curr_label[1:])  # labels look like 'f12'; [1] would keep only the first digit
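For reference, a self-contained sketch of the plot_importance pattern this example relies on, run on synthetic data:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from xgboost import XGBClassifier as XGBC, plot_importance

X, y = make_classification(n_samples=200, n_features=8, random_state=0)
clf = XGBC().fit(X, y)
plot_importance(clf)  # bars are labelled f0..f7 unless feature names are set
plt.tight_layout()
plt.show()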
Example #8
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc

class_1 = 500
class_2 = 50

centers = [[0.0, 0.0], [2.0, 2.0]]
clusters_std = [1.5, 0.5]

X, y = make_blobs(n_samples=[class_1, class_2], centers=centers, cluster_std=clusters_std, random_state=0,
                  shuffle=False)
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

clf = XGBC().fit(Xtrain, Ytrain)
ypred = clf.predict(Xtest)
clf.score(Xtest, Ytest)
cm(Ytest, ypred, labels=[1, 0])
recall(Ytest, ypred)
auc(Ytest, clf.predict_proba(Xtest)[:, 1])

clf_ = XGBC(scale_pos_weight=10).fit(Xtrain, Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest, Ytest)
cm(Ytest, ypred_, labels=[1, 0])
recall(Ytest, ypred_)
auc(Ytest, clf_.predict_proba(Xtest)[:, 1])

for i in [1, 5, 10, 20, 30]:
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain, Ytrain)
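The final sweep refits with increasing scale_pos_weight but its loop body is truncated; a hedged continuation that evaluates each weighting the same way the lines above do:

for i in [1, 5, 10, 20, 30]:
    clf_i = XGBC(scale_pos_weight=i).fit(Xtrain, Ytrain)
    ypred_i = clf_i.predict(Xtest)
    print(i,
          clf_i.score(Xtest, Ytest),                     # accuracy
          recall(Ytest, ypred_i),                        # minority-class recall
          auc(Ytest, clf_i.predict_proba(Xtest)[:, 1]))  # ROC AUC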
Example #9
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier as XGBC

battles = pd.read_csv('data/battles.csv')
character_predictions = pd.read_csv('data/character-predictions.csv')
battle, character_pred = q01_feature_engineering(battles,
                                                 character_predictions)
death_preds = q08_preprocessing(character_pred)
# balanced 350/350 sample of the two outcomes (pd.concat replaces the
# DataFrame.append call removed in pandas 2.x)
X = pd.concat([
    death_preds[death_preds.actual == 0].sample(350, random_state=62),
    death_preds[death_preds.actual == 1].sample(350, random_state=62),
]).copy(deep=True).astype(np.float64)
Y = X.actual.values
tX = death_preds[~death_preds.index.isin(X.index)].copy(deep=True).astype(
    np.float64)
tY = tX.actual.values
X.drop(['SNo', 'actual', 'DateoFdeath'], axis=1, inplace=True)
tX.drop(['SNo', 'actual', 'DateoFdeath'], axis=1, inplace=True)

clf_xgb = XGBC(subsample=.8, colsample_bytree=.8, seed=14, max_depth=3)


def q09_XGBoost(X_train, y_train, X_test, y_test, clf_xgb):
    'write your solution here'
    model = clf_xgb
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pred_prob = model.predict_proba(X_test)  # was predict_proba(tX); probabilities must align with y_test
    roc_auc = roc_auc_score(y_test, pred_prob[:, 1])
    accuracy = accuracy_score(y_test, y_pred)
    return roc_auc, accuracy
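A usage sketch with the frames built above; X/Y are the balanced training sample and tX/tY the held-out rows:

roc_auc, accuracy = q09_XGBoost(X, Y, tX, tY, clf_xgb)
print('ROC AUC: %.3f, accuracy: %.3f' % (roc_auc, accuracy))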
Example #10
xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)

(y == 1).sum() / y.shape[0]

# In[11]:

x.shape

# In[12]:

y.shape

# In[13]:

clf = XGBC().fit(xtrain, ytrain)
ypred = clf.predict(xtest)

# In[14]:

ypred

# In[15]:

cm(ytest, ypred, labels=[1, 0])

# In[16]:

recall(ytest, ypred)

# In[17]:
Example #11
clf = RandomForestClassifier()
print('_' * 20, clf.__class__.__name__, '_' * 20)
print("Training the data")

t0 = time()
results_rf = test_classifier(clf, my_dataset, feature_list, folds)
print("done in %0.3fs" % (time() - t0))

cm_rf = [[results_rf['true_negatives'], results_rf['false_negatives']],
         [results_rf['true_positives'], results_rf['false_positives']]]

# In[44]:

from xgboost import XGBClassifier as XGBC

clf = XGBC()

print('_' * 20, clf.__class__.__name__, '_' * 20)
print("Training the data")

t0 = time()
results_xgb = test_classifier(clf, my_dataset, feature_list, folds)
print("done in %0.3fs" % (time() - t0))

cm_xgb = [[results_xgb['true_negatives'], results_xgb['false_negatives']],
          [results_xgb['true_positives'], results_xgb['false_positives']]]
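# `test_classifier` and its result dict are project-specific; under that
# assumption, precision and recall follow directly from the tallies above
# (a hedged sketch):
precision_xgb = results_xgb['true_positives'] / float(
    results_xgb['true_positives'] + results_xgb['false_positives'])
recall_xgb = results_xgb['true_positives'] / float(
    results_xgb['true_positives'] + results_xgb['false_negatives'])
print('precision: %.3f  recall: %.3f' % (precision_xgb, recall_xgb))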

# In[36]:

from sklearn.linear_model import LogisticRegression
def optimCurveFit(strategy, method_clsf, ratio=0.8, NV_type='NVequals'):
    constrain_time = True

    ######################
    #TODO Step 1: Data input
    ######################
    data_set = 'mitdb'  # 'ecgiddb', 'mitdb'
    channel = 0
    records, IDs, fss, annss = mf.load_data(
        data_set, channel)  #, num_persons=60, record_time=20)
    fs = fss[0]

    records = np.array(records)
    IDs = np.array(IDs)
    annss = np.array(annss)
    ######################

    ######################
    #TODO Step 2: Data selection
    ######################

    if strategy in ('allN_data', 'all_data'):
        pass  # no record selection needed
    elif strategy == 'NV_data':
        NV_inds = [6, 15, 18, 23, 24, 26, 29, 31, 33, 35, 39, 41, 42, 46]
        #for i in NV_inds: #range(annss.shape[0]): #
        #    print i, Counter(annss[i][1])['V']

        records = records[NV_inds, :]
        IDs = IDs[NV_inds]
        annss = annss[NV_inds, :]

        ## re-number the IDs so they are contiguous
        for i in range(len(NV_inds)):
            IDs[i] = i
    elif strategy == 'combine_IDs':
        num_to_combine = 4
        print(IDs)

        for i in range(int(len(records) / num_to_combine)):
            for j in range(num_to_combine - 1):
                IDs[i * num_to_combine + j + 1] = IDs[i * num_to_combine + j]
            #IDs[i*2+1] = IDs[i*2]
        for i in range(len(IDs)):
            IDs[i] //= num_to_combine  # integer division, matching Python 2's '/' on ints

    if constrain_time:
        look_time = 600.  # in s
        look_ind = int(look_time * fs)
        records = records[:, :look_ind]
        annss = annss[:, :look_ind]

    recs = []
    for i in range(len(records)):
        curr_rec = Rec(records[i], fs, IDs[i], annss[i])
        recs.append(curr_rec)
    ######################

    ######################
    #TODO Step 3: Data filtering
    ######################

    ######################

    ######################
    #TODO Step 4: Data segmentation
    ######################
    USE_BIOSPPY_FILTERED = True
    sigs, labels_bySegs = mf.get_seg_data(records,
                                          IDs,
                                          fss,
                                          USE_BIOSPPY_FILTERED,
                                          annss=annss)
    sigs, labels_bySegs = np.array(sigs), np.array(labels_bySegs)
    mrks_bySegs = np.array([x[-1] for x in labels_bySegs])

    if strategy == 'allN_data':
        N_masks = (mrks_bySegs == 'N')
        sigs = sigs[N_masks, :]
        labels_bySegs = labels_bySegs[N_masks]

    IDs_bySegs = [int(x[:-1]) for x in labels_bySegs]
    mrks_bySegs = [x[-1] for x in labels_bySegs]
    IDs_bySegs, mrks_bySegs = np.array(IDs_bySegs), np.array(mrks_bySegs)

    segs = []
    for i in range(len(sigs)):
        curr_seg = Seg(sig=sigs[i],
                       fs=fs,
                       ID=IDs_bySegs[i],
                       mrk=mrks_bySegs[i])
        segs.append(curr_seg)
    segs = np.array(segs)
    ######################

    #for one_label in labels_all:
    #    if ('N' in one_label) or ('V' in one_label):
    #        print one_label
    #quit()

    #segs_all, labels_all = np.array(segs_all), np.array(labels_all)

    ######################
    #TODO Step 5: feature extraction
    ######################
    X_all = []
    y_all = []
    method_feat = 'PCA'  # 'template_matching'

    if method_feat == 'PCA':
        feat_dim = 20
        pca = PCA(n_components=feat_dim)
        X_all = np.array([x.sig for x in segs])
        X_all = pca.fit(X_all).transform(X_all)

        for i in range(len(segs)):
            segs[i].feat = X_all[i, :]
        y_all = np.array([x.ID for x in segs])

    X_all = np.array(X_all)
    ######################

    ######################
    #TODO Step 6: Data split
    ######################
    if strategy != 'NV_data':
        X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                            y_all,
                                                            test_size=0.2,
                                                            random_state=42)
    else:
        X_train, X_test, y_train, y_test = [], [], [], []
        y_test_mrks = []
        for i in range(len(NV_inds)):
            curr_mrks = mrks_bySegs[IDs_bySegs == i]  # current person's beat markers
            #print curr_mrks

            curr_segs = segs[IDs_bySegs == i]
            curr_labels = labels_bySegs[IDs_bySegs == i]

            curr_inds_Vs = np.where(curr_mrks == 'V')[0]
            curr_inds_Ns = np.where(curr_mrks == 'N')[0]

            curr_num_Vs = sum(np.array(curr_mrks) == 'V')  #all his Vs
            curr_num_Ns = sum(np.array(curr_mrks) == 'N')

            if NV_type == 'fixV':
                train_num_Vs = int(curr_num_Vs * .8)
                train_num_Ns = min(
                    [int(curr_num_Ns * .8),
                     int(ratio * train_num_Vs)])
            elif NV_type == 'NVequals':
                train_num_Vs = int(curr_num_Vs * ratio)
                train_num_Ns = train_num_Vs

            train_inds_Vs = random.sample(list(curr_inds_Vs), train_num_Vs)  # random.sample needs a sequence, not an ndarray
            test_inds_Vs = [
                x for x in curr_inds_Vs if not (x in train_inds_Vs)
            ]

            #test_inds_Vs = curr_inds_Vs[~ train_inds_Vs]
            train_inds_Ns = random.sample(list(curr_inds_Ns), train_num_Ns)
            test_inds_Ns = [
                x for x in curr_inds_Ns if not (x in train_inds_Ns)
            ]

            #print len(train_inds_Vs), len(test_inds_Vs)
            #print len(train_inds_Ns), len(test_inds_Ns)

            #test_inds_Ns = curr_inds_Vs[~ train_inds_Ns]
            #        print train_inds_Ns
            #        print test_inds_Ns

            curr_IDs = IDs_bySegs[IDs_bySegs == i]
            #print curr_IDs

            for one_seg in curr_segs[train_inds_Vs]:
                X_train.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[train_inds_Vs]:
                y_train.append(one_lab)

            for one_seg in curr_segs[train_inds_Ns]:
                X_train.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[train_inds_Ns]:
                y_train.append(one_lab)

            for one_seg in curr_segs[test_inds_Vs]:
                X_test.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[test_inds_Vs]:
                y_test.append(one_lab)
            for one_mrk in curr_mrks[test_inds_Vs]:
                y_test_mrks.append(one_mrk)

            for one_seg in curr_segs[test_inds_Ns]:
                X_test.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[test_inds_Ns]:
                y_test.append(one_lab)
            for one_mrk in curr_mrks[test_inds_Ns]:
                y_test_mrks.append(one_mrk)

            #print i
            #print len(X_train), len(y_train), len(X_test), len(y_test)

    X_train, y_train, X_test, y_test = \
    np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

    ######################

    #print X_train.shape, y_train.shape, X_test.shape, y_test.shape
    #quit()
    #print X_train
    #print X_test
    #y_train = [int(y[:-1]) for y in y_train]
    #y_test = [int(y[:-1]) for y in y_test]

    ######################
    #TODO Step 7: Model training
    ######################
    time_before_training = Time()  # Time is presumably an alias of time.time

    if method_clsf == 'SVM':
        not_trained = True
        import joblib  # sklearn.externals.joblib was removed in modern sklearn
        if not_trained:
            clf = svm.SVC(kernel='rbf', C=10., gamma=0.1)
            clf.fit(X_train, y_train)
            joblib.dump(clf, 'test_clf.pkl')
        else:
            clf = joblib.load('test_clf.pkl')
        res_pred = clf.predict(X_test)
    elif method_clsf == 'Logit':
        clf = LR(C=10.)
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'kNN':
        clf = KNC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'DTC':
        clf = DTC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'boosting':
        clf = XGBC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'GNB':
        clf = GNB()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'DL':
        not_trained = True
        import joblib  # sklearn.externals.joblib was removed in modern sklearn

        if not_trained:
            model = Sequential()
            model.add(
                Dense(feat_dim, activation='relu', input_shape=(feat_dim, )))
            #model.add(Dense(input_dim,activation='relu'))

            num_categs = len(set(y_train))

            print(y_train, num_categs)
            Y_train = np_utils.to_categorical(y_train, num_categs)
            Y_test = np_utils.to_categorical(y_test, num_categs)

            model.add(Dense(num_categs, activation='softmax'))

            model.compile(loss='categorical_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
            X_train = np.array(X_train)
            Y_train = np.array(Y_train)
            #print X_train.shape
            #print Y_train.shape

            model.fit(X_train,
                      Y_train,
                      validation_split=0.2,
                      batch_size=32,
                      epochs=50,  # Keras renamed nb_epoch to epochs
                      verbose=0)
            #model.save('test_clf_DL.pkl')
        else:
            model = keras.models.load_model('test_clf_DL.pkl')
        #score = model.evaluate(X_test, Y_test, verbose=0)

    time_after_training = Time()

    ######################
    #TODO Step 8: Model testing
    ######################
    if method_clsf != 'DL':
        res_pred = clf.predict(X_test)
    else:
        res_pred = model.predict_classes(X_test)
    ######################

    ######################
    #TODO Step 9: Result output
    ######################
    train_time = time_after_training - time_before_training

    print_res = False
    if print_res:
        print('')
        print('Parameters:')
        print('strategy:', strategy)
        print('constrain_time:', constrain_time)
        print('ratio:', ratio)
        print('method_clsf:', method_clsf)

        #print ''

        print('Results:')
        print('Used time for training:', train_time)

    res_look = []
    for i in range(len(res_pred)):
        res_look.append((res_pred[i], y_test[i]))
    #print res_look

    if False:
        res_pred_IDs = np.array([y[:-1] for y in res_pred])
        res_pred_mrks = np.array([y[-1] for y in res_pred])

        only_test_ID = True
        if only_test_ID:
            to_be_predct = res_pred_IDs
            to_be_tested = y_test
        else:
            to_be_predct = res_pred
            to_be_tested = y_test

    ##TODO: adjust accordingly
    if strategy == 'NV_data':
        look_stat = 'V'
        y_test_mrks = np.array(y_test_mrks)
        #print y_test_mrks
        to_be_predct = res_pred[y_test_mrks == look_stat]
        to_be_tested = y_test[y_test_mrks == look_stat]

        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested,
                                       type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (round(res_by_seg, 3), round(res_by_categ, 3))
        accuBySeg_V = one_res[0]
        #print len(to_be_predct), one_res

        look_stat = 'N'
        to_be_predct = res_pred[y_test_mrks == look_stat]
        to_be_tested = y_test[y_test_mrks == look_stat]

        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested,
                                       type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (round(res_by_seg, 3), round(res_by_categ, 3))
        accuBySeg_N = one_res[0]
        #print len(to_be_predct), one_res
        return [accuBySeg_V, accuBySeg_N, train_time]
    else:
        to_be_predct = res_pred
        to_be_tested = y_test

        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested,
                                       type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (round(res_by_seg, 3), round(res_by_categ, 3))
        return [one_res[0], train_time]
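A hedged usage sketch; the strategy and classifier names come from the branches above, while the mf module and its data files must already be available:

# compare per-class accuracy across classifiers at a fixed train ratio
for method in ['SVM', 'Logit', 'kNN', 'DTC', 'boosting', 'GNB']:
    accu_V, accu_N, train_time = optimCurveFit('NV_data', method, ratio=0.8)
    print(method, 'V accuracy:', accu_V, 'N accuracy:', accu_N,
          'train time:', train_time)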