Example #1
0
def gen_model():
    '''
    for online submit
    train on entire data
    '''
    # The pickle file holds three objects dumped back-to-back:
    # pids, features, labels.
    with open('../data/features_all_v2.5.pkl', 'rb') as my_input:
        all_pid = dill.load(my_input)
        all_feature = dill.load(my_input)
        all_label = dill.load(my_input)

    all_pid = np.array(all_pid)
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)
    print('all feature shape: {0}'.format(all_feature.shape))

    # Bag five independent MyXGB learners into one Encase ensemble.
    clf_final = Encase([MyXGB() for _ in range(5)])

    print('start training')
    clf_final.fit(all_feature, all_label)
    print('done training')

    # Sanity check: F1 on the training data itself (optimistic by design).
    pred = clf_final.predict(all_feature)
    print(MyEval.F1Score3(pred, all_label))

    with open('../model/v2.5_xgb5_all_v2.pkl', 'wb') as fout:
        dill.dump(clf_final, fout)
    print('save model done')
Example #2
0
def XGBcv(all_pid, all_feature, all_label, 
          subsample, max_depth, colsample_bytree, min_child_weight):
    '''
    5-fold stratified cross-validation of one MyXGB hyper-parameter setting.

    subsample, max_depth, colsample_bytree and min_child_weight are
    forwarded verbatim to MyXGB.  Returns the mean per-fold F1 score
    (as computed by MyEval.F1Score3).  all_pid is accepted for interface
    symmetry with the other CV helpers but is not used.

    TODO: 
        try kf = StratifiedKFold(n_splits=5, shuffle=True)
    '''
    ## k-fold cross validation
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)
    F1_list = []
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    for train_index, test_index in kf.split(all_feature, all_label):
        train_data = all_feature[train_index]
        train_label = all_label[train_index]
        test_data = all_feature[test_index]
        test_label = all_label[test_index]
        
        clf = MyXGB(subsample=subsample, 
                    max_depth=max_depth, 
                    colsample_bytree=colsample_bytree, 
                    min_child_weight=min_child_weight)
        clf.fit(train_data, train_label)
        
        pred = clf.predict(test_data)
        # False suppresses per-fold verbose output inside F1Score3.
        F1_list.append(MyEval.F1Score3(pred, test_label, False))

    # Compute the mean once (the original recomputed it for print and return).
    avg_f1 = np.mean(F1_list)
    print('\n\nAvg F1: ', avg_f1)
    return avg_f1
Example #3
0
def gen_model():
    '''Train a single MyXGB on the full v1.6 feature set and pickle it.'''
    # Three objects were dumped sequentially: pids, features, labels.
    with open('../data/features_all_v1.6.pkl', 'rb') as my_input:
        all_pid = dill.load(my_input)
        all_feature = dill.load(my_input)
        all_label = dill.load(my_input)

    all_pid, all_feature, all_label = (
        np.array(all_pid), np.array(all_feature), np.array(all_label))

    clf = MyXGB()
    clf.fit(all_feature, all_label)

    # Training-set F1, printed as a sanity check.
    print(MyEval.F1Score3(clf.predict(all_feature), all_label))

    with open('../model/v1.6_xgb.pkl', 'wb') as fout:
        dill.dump(clf, fout)
Example #4
0
def TestEncase(all_pid, all_feature, all_label):
    '''
    5-fold stratified CV of a 5-member MyXGB Encase ensemble.

    For each fold a fresh ensemble is trained and its test-fold F1
    recorded.  All five fold-ensembles are then combined into one
    Encase, evaluated on the full data set (optimistic: each sample
    was seen in training by 4 of the 5 sub-ensembles) and pickled to
    a file whose name embeds the average fold F1.
    '''
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)
    all_pid = np.array(all_pid)

    clf_final_list = []
    F1_list = []

    ## k-fold cross validation
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    print('all feature shape: {0}'.format(all_feature.shape))
    for train_index, test_index in kf.split(all_feature, all_label):
        train_data = all_feature[train_index]
        train_label = all_label[train_index]
        test_data = all_feature[test_index]
        test_label = all_label[test_index]

        # Fresh 5-learner bagged ensemble per fold.
        clf_final = Encase([MyXGB() for _ in range(5)])
        clf_final.fit(train_data, train_label)

        pred = clf_final.predict(test_data)
        # False suppresses per-fold verbose output inside F1Score3.
        F1_list.append(MyEval.F1Score3(pred, test_label, False))

        clf_final_list.append(clf_final)

    avg_f1 = np.mean(F1_list)
    print('\n\nAvg F1: ', avg_f1)

    # Ensemble-of-ensembles over all folds, scored on the full data.
    clf_final_final = Encase(clf_final_list)
    pred = clf_final_final.predict(all_feature)
    print(MyEval.F1Score3(pred, all_label))

    with open('../../tmp_model/v2.5_v0.1/v2.5_v0.1_' + str(avg_f1) + '.pkl',
              'wb') as fout:
        dill.dump(clf_final_final, fout)
Example #5
0
def TestXGB(fout, original_pid, original_label, all_pid, all_feature,
            all_label):
    '''
    Group-aware 5-fold CV of MyXGB, writing per-fold and average F1 to fout.

    Records in (all_pid, all_feature, all_label) are segments whose pid is
    '<original_pid>_<suffix>'.  Folds are split over the ORIGINAL pids so
    that all segments of one recording land in the same fold (no leakage).
    shrink_set_to_seq collapses segment-level predictions back to one
    prediction per recording; the averaged F1 is sequence-level.
    '''
    ## k-fold cross validation over original (un-segmented) recordings
    original_pid = np.array(original_pid)
    original_label = np.array(original_label)
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)
    # Hoisted: the original rebuilt np.array(all_pid) inside every fold.
    all_pid_arr = np.array(all_pid)

    F1_list_set = []
    F1_list_seq = []
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    i_fold = 1
    # kf.split only uses len() of its first argument, so passing the
    # label array as X is fine here.
    for original_train_index, original_test_index in kf.split(
            original_label, original_label):

        original_train_pid = set(original_pid[original_train_index])
        original_test_pid = set(original_pid[original_test_index])

        # Map each segment to its recording's fold via the pid prefix.
        train_index = []
        test_index = []
        for ii in range(len(all_pid)):
            ii_pid = all_pid[ii].split('_')[0]
            if ii_pid in original_train_pid:
                train_index.append(ii)
            elif ii_pid in original_test_pid:
                test_index.append(ii)
            else:
                print('wrong')
        train_index = np.array(train_index)
        test_index = np.array(test_index)

        train_data = all_feature[train_index]
        train_label = all_label[train_index]
        train_pid = all_pid_arr[train_index]
        test_data = all_feature[test_index]
        test_label = all_label[test_index]
        test_pid = all_pid_arr[test_index]

        clf = MyXGB()
        clf.fit(train_data, train_label)

        pred = clf.predict(test_data)
        pred_train = clf.predict(train_data)

        _, pred_train_seq = shrink_set_to_seq(train_pid, pred_train)
        _, train_label_seq = shrink_set_to_seq(train_pid, train_label)
        print('pred_train')
        MyEval.F1Score3(pred_train, train_label)
        print('pred_train_seq')
        MyEval.F1Score3(pred_train_seq, train_label_seq)

        _, pred_seq = shrink_set_to_seq(test_pid, pred)
        _, test_label_seq = shrink_set_to_seq(test_pid, test_label)
        print('\n pred')
        F1_list_set.append(MyEval.F1Score3(pred, test_label))
        print('pred_seq')
        f1_pred = MyEval.F1Score3(pred_seq, test_label_seq)
        F1_list_seq.append(f1_pred)
        print('=====================================')
        fout.write('{0}, {1} \n'.format(i_fold, f1_pred))
        i_fold += 1

    avg_f1 = np.mean(F1_list_seq)
    print('\n\nAvg F1: ', avg_f1)
    # BUG FIX: the original wrote f1_pred (last fold only) on the avg row.
    fout.write('avg, {0} \n'.format(avg_f1))
Example #6
0
     feat_deep_centerwave = np.array(dill.load(my_input))
     print('feat_deep_centerwave shape: ', feat_deep_centerwave.shape)
 
 with open('../data/feat_resnet.pkl', 'rb') as my_input:
     feat_resnet = np.array(dill.load(my_input))
     print('feat_resnet shape: ', feat_resnet.shape)
     
 
 # k-fold cross validation
 all_feature = np.c_[all_feature, feat_deep_centerwave, feat_resnet]
 all_label = np.array(all_label)
 
 train_data = all_feature
 train_label = all_label
 
 clf = MyXGB()
 clf.fit(train_data, train_label)
 print('train done')
 
 imp_scores = clf.get_importance()
 feat_num = all_feature.shape[1]
 imp_scores_key_num = set([int(k[1:]) for k in imp_scores.keys()])
 print(feat_num)
 print(len(imp_scores))
 
 pred_train = clf.predict(train_data)
 MyEval.F1Score3(pred_train, train_label)
 
 with open('../../stat/feat_imp_v2.5_v0.1_v0.1.csv', 'w') as fout:
     for i in range(1,feat_num+1):
         if i in imp_scores_key_num:
Example #7
0
def TestExp(all_pid, all_feature, all_label, method, i_iter):
    '''
    Run one 5-fold stratified CV experiment for the paper's comparison table.

    method selects both the classifier family (ENCASE ensemble / plain
    XGBoost / logistic regression) and the feature-column subset.  For
    every fold the result of MyEval.F14Exp(pred, test_label) is appended
    to ../../stat/res_exp_for_paper.csv as:
        method, i_iter, i_fold, res[0], res[1], res[2], res[3], res[4]

    Raises ValueError for an unknown method (the original fell through
    the if/elif chain and crashed with NameError on `pred`).
    '''
    # Classifier factories shared across methods.
    def _encase():
        return Encase([MyXGB() for _ in range(5)])

    def _xgb():
        return MyXGB(n_estimators=100, num_round=50)

    # method -> (feature-column subset or None for all columns, factory).
    # Column ranges: 258-557 = expert features, 0-557 = expert+centerwave;
    # the named single-feature methods use hand-picked columns.
    configs = {
        'ENCASE_E': (list(range(258, 558)), _encase),
        'ENCASE_EC': (list(range(0, 558)), _encase),
        'ENCASE_ECD': (None, _encase),
        'XGBoost_E': (list(range(258, 558)), _xgb),
        'XGBoost_EC': (list(range(0, 558)), _xgb),
        'XGBoost_ECD': (None, _xgb),
        'LR_E': (list(range(258, 558)), MyLR),
        'LR_EC': (list(range(0, 558)), MyLR),
        'LR_ECD': (None, MyLR),
        'SampleEn': ([300, 301, 302, 303], MyLR),
        'CDF': ([304, 305, 306], MyLR),
        'MAD': ([307], MyLR),
        'Variability': ([346, 347, 348, 349, 350], MyLR),
    }
    if method not in configs:
        raise ValueError('unknown method: {0}'.format(method))
    selected_cols, make_clf = configs[method]

    kf = StratifiedKFold(n_splits=5, shuffle=True)
    i_fold = 1
    print('all feature shape: {0}'.format(all_feature.shape))
    for train_index, test_index in kf.split(all_feature, all_label):
        train_data = all_feature[train_index]
        train_label = all_label[train_index]
        test_data = all_feature[test_index]
        test_label = all_label[test_index]

        if selected_cols is not None:
            train_data = train_data[:, selected_cols]
            test_data = test_data[:, selected_cols]

        clf_final = make_clf()
        clf_final.fit(train_data, train_label)
        pred = clf_final.predict(test_data)

        res = MyEval.F14Exp(pred, test_label)
        print(res)

        # Append-mode so repeated i_iter runs accumulate in one CSV.
        with open('../../stat/res_exp_for_paper.csv', 'a') as fout:
            fout.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                method, i_iter, i_fold, res[0], res[1], res[2], res[3],
                res[4]))

        i_fold += 1