Code example #1
import dill
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

import ReadData  # project-local helper modules, assumed importable
import MyEval
from OptF import OptF      # assumed module path for the project's OptF classifier
from LRSimp import LRSimp  # assumed module path for the LR baseline below


def TestOptF():
    '''
    Results, per fold: confusion matrix [[TN, FP], [FN, TP]]
    (rows: true N/O, columns: predicted N/O), F1 for class N,
    F1 for class O, and their mean.

    LR:
    [[ 915.   97.]
     [ 267.  223.]]
    0.834092980857 0.550617283951
    0.692355132404
    [[ 970.   94.]
     [ 204.  233.]]
    0.866845397676 0.609947643979
    0.738396520828
    [[ 932.  102.]
     [ 259.  208.]]
    0.837752808989 0.535392535393
    0.686572672191
    [[ 884.   62.]
     [ 307.  248.]]
    0.827328029949 0.573410404624
    0.700369217286
    [[ 919.   75.]
     [ 292.  215.]]
    0.833560090703 0.539523212045
    0.686541651374
    Avg F1:  0.700847038817
    
    
    OptF: 
    [[ 721.  291.]
     [ 130.  360.]]
    0.774020397209 0.631025416301
    0.702522906755
    [[ 794.  270.]
     [ 116.  321.]]
    0.804457953394 0.624513618677
    0.714485786036
    [[ 709.  325.]
     [ 130.  337.]]
    0.757074212493 0.596988485385
    0.677031348939
    [[ 719.  227.]
     [ 160.  395.]]
    0.787945205479 0.671197960918
    0.729571583199
    [[ 719.  275.]
     [ 158.  349.]]
    0.768572955639 0.617152961981
    0.69286295881
    Avg F1:  0.703294916748
    
    Conclusion:
        OptF improves F1 on class O but lowers F1 on class N;
        after averaging there is no obvious difference
        (0.7008 for LR vs 0.7033 for OptF).
    '''
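    # Each fold in the docstring reports a 2x2 confusion matrix
    # [[TN, FP], [FN, TP]], the F1 for each class, and their mean. A
    # minimal sketch of that computation (a hypothetical helper, not the
    # project's MyEval.F1Score2, whose internals are not shown here):
    def f1_from_confusion(tn, fp, fn, tp):
        f1_n = 2.0 * tn / (2.0 * tn + fp + fn)  # F1 with N as the positive class
        f1_o = 2.0 * tp / (2.0 * tp + fp + fn)  # F1 with O as the positive class
        return f1_n, f1_o, (f1_n + f1_o) / 2.0
    # e.g. LR fold 1: f1_from_confusion(915, 97, 267, 223)
    #      -> (0.8341, 0.5506, 0.6924), matching the values above.
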
    # The pickle holds three sequential dill dumps: ids, features, labels.
    with open('../../data2/features_all.pkl', 'rb') as my_input:
        all_pid = dill.load(my_input)
        all_feature = dill.load(my_input)
        all_label = dill.load(my_input)

    ### preprocess: keep only the N and O classes, then binarize the labels
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)
    selected = [i for i, x in enumerate(all_label) if x == 'N' or x == 'O']
    all_feature = all_feature[selected]
    all_label = all_label[selected]
    all_label_num = np.array(ReadData.LabelTo2(all_label, 'O'))
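    # ReadData.LabelTo2 is a project helper whose internals are not shown
    # here; presumably it binarizes the labels with the given class as
    # positive. A minimal sketch of that assumption:
    #     all_label_num = np.array([1 if x == 'O' else 0 for x in all_label])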
    
    ## k-fold cross validation
    ## (the commented-out block below is the LR baseline that produced the
    ## "LR:" results in the docstring; kept for reference)
#    F1_list = []
#    kf = KFold(n_splits=5)
#    for train_index, test_index in kf.split(all_label):
#        train_data = all_feature[train_index]
#        train_label = all_label[train_index]
#        train_label_num = all_label_num[train_index]
#        test_data = all_feature[test_index]
#        test_label = all_label[test_index]
#        test_label_num = all_label_num[test_index]
#        
#        clf = LRSimp()
#        clf.fit(train_data, train_label)
#        pred = []
#        n_row, n_col = test_data.shape
#        for i in range(n_row):
#            pred.extend(clf.predict(list(test_data[i])))
#    #        break
#        F1_list.append(MyEval.F1Score2(pred, test_label))
#    
#    print('\n\nAvg F1: ', np.mean(F1_list))

    F1_list = []
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(all_label):
        train_data = all_feature[train_index]
        train_label_num = all_label_num[train_index]
        test_data = all_feature[test_index]
        test_label = all_label[test_index]

        # Fit the scaler on the training folds only, then apply it to both
        # sides, so the model is fit and evaluated on the same scale.
        scaler = StandardScaler()
        train_data = scaler.fit_transform(train_data)
        test_data = scaler.transform(test_data)

        clf = OptF()
        clf.fit(train_data, train_label_num)
        pred = []
        for row in test_data:
            # OptF.predict_prob is assumed to return the probability of
            # the positive ('O') class for a single sample.
            pred_prob = clf.predict_prob(list(row))[0]
            pred.append('O' if pred_prob > 0.5 else 'N')
        F1_list.append(MyEval.F1Score2(pred, test_label))
    
    print('\n\nAvg F1: ', np.mean(F1_list))
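

# Run the experiment when this file is executed as a script:
if __name__ == "__main__":
    TestOptF()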