Example #1
def train_xgb():
    # Train an XGBoost classifier on the combined hand-crafted and
    # deep-model feature matrices.
    # Assumed module-level imports, as elsewhere in the repository:
    #   import numpy as np; import pandas as pd
    #   from sklearn.model_selection import train_test_split
    import xgboost as xgb

    # Set our parameters for xgboost
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.02,
        'max_depth': 4,
    }
    feature1 = DataUtil.load_matrix('../feature_train/feature_min_max.txt')
    feature2 = pd.read_csv('../feature_train/feature_deepnet.csv').values
    print(feature1.shape)
    feature = np.concatenate([feature1, feature2], axis=1)
    print(feature.shape)
    label = np.load('../data/train_label.npy')
    x_train, x_valid, y_train, y_valid = train_test_split(feature, label)
    feature1_test = DataUtil.load_matrix('../feature_test/feature_min_max.txt')
    feature2_test = pd.read_csv('../feature_test/feature_deepnet.csv').values
    feature_test = np.concatenate([feature1_test, feature2_test], axis=1)
    d_train = xgb.DMatrix(x_train, label=y_train)
    d_valid = xgb.DMatrix(x_valid, label=y_valid)
    d_test = xgb.DMatrix(feature_test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    bst = xgb.train(params,
                    d_train,
                    5000,
                    watchlist,
                    early_stopping_rounds=50,
                    verbose_eval=10)
    pd.DataFrame(bst.predict(d_test)).to_csv('../result/result.csv')
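Every example on this page depends on a project-local DataUtil helper that is not shown. A minimal stand-in is sketched below, assuming matrices are stored as plain whitespace-delimited text with one row per line; the repository's real implementation may differ.

# Hypothetical stand-in for the project's DataUtil helper (an assumption,
# not the repository's code): plain-text, whitespace-delimited storage.
import numpy as np

class DataUtil:
    @staticmethod
    def save_matrix(path, mat, mode):
        arr = np.asarray(list(mat))
        if arr.ndim == 1:
            # scalar-per-row features are stored as a single column
            arr = arr.reshape(-1, 1)
        with open(path, mode) as f:
            for row in arr:
                f.write(' '.join(str(v) for v in row) + '\n')

    @staticmethod
    def save_vector(path, vec, mode):
        with open(path, mode) as f:
            for v in vec:
                f.write(str(v) + '\n')

    @staticmethod
    def load_matrix(path):
        # ndmin=2 keeps single-column files as (n, 1), so downstream
        # np.concatenate(..., axis=1) calls keep working.
        return np.loadtxt(path, ndmin=2)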
Example #2
def train():
    """Train the ESIM model with stratified K-fold cross-validation."""

    Train_left = np.load('./data/X_train_question1.npy')
    Train_right = np.load('./data/X_train_question2.npy')
    Train_label= np.load('./data/train_label.npy')
    Train_label = Train_label.astype(np.int64)
    # stack_tr = np.zeros((Train_label.shape[0]))
    statistics_feature = DataUtil.load_matrix('./feature_train/feature_min_max.txt')
    sta2 = pd.read_csv('./feature_train/feature_deepnet.csv').values
    # StratifiedKFold from sklearn.model_selection: construct, then .split(X, y)
    # (shuffle=True is needed for random_state to take effect).
    skf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=27)
    for k, (tr, va) in enumerate(skf.split(Train_left, Train_label)):
        model = esim()
        print(' stack:{}/{}'.format(k + 1, N_FOLD))
        X_train_left = Train_left[tr]
        X_train_right = Train_right[tr]
        Y_train = Train_label[tr]
        train_stistics = statistics_feature[tr]
        train_sta2 = sta2[tr]
        val_sta2 = sta2[va]
        val_stistics = statistics_feature[va]
        X_val_left = Train_left[va]
        X_val_right = Train_right[va]
        Y_val = Train_label[va]
        print ("Train...")
        checkpoint = ModelCheckpoint('./model_file/CIKM_dec_Attention_classify_{}.hdf5'.format(k), monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
        early = EarlyStopping(monitor='val_loss', mode='min', patience=10)
        callbacks_list = [checkpoint, early]
        model.fit([X_train_left, X_train_right, train_stistics, train_sta2],
                  Y_train, batch_size=128, epochs=N_EPOCH, verbose=1,
                  validation_data=([X_val_left, X_val_right, val_stistics, val_sta2], Y_val),
                  callbacks=callbacks_list)
Example #3
def train():
    """Train the decomposable-attention model with stratified K-fold cross-validation."""

    Train_left = np.load('../data/X_train_question1.npy')
    Train_right = np.load('../data/X_train_question2.npy')
    Train_label = np.load('../data/train_label.npy')
    Train_label = Train_label.astype(np.int64)
    # Out-of-fold predictions, filled per fold for stacking.
    stack_tr = np.zeros(Train_label.shape[0])
    statistics_feature = DataUtil.load_matrix(
        '../feature_train/feature_min_max.txt')
    sta2 = pd.read_csv('../feature_train/feature_deepnet.csv').values
    # StratifiedKFold from sklearn.model_selection: construct, then .split(X, y)
    # (shuffle=True is needed for random_state to take effect).
    skf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=27)
    for k, (tr, va) in enumerate(skf.split(Train_left, Train_label)):
        model = decomposable_attention()
        print(' stack:{}/{}'.format(k + 1, N_FOLD))
        X_train_left = Train_left[tr]
        X_train_right = Train_right[tr]
        Y_train = Train_label[tr]
        train_stistics = statistics_feature[tr]
        train_sta = sta2[tr]
        val_sta2 = sta2[va]
        val_stistics = statistics_feature[va]
        X_val_left = Train_left[va]
        X_val_right = Train_right[va]
        Y_val = Train_label[va]
        (X_train_left1, X_train_left2, X_train_right1, X_train_right2,
         train_stistics1, train_stistics2, train_sta1, train_sta2,
         Y_train1, Y_train2) = train_test_split(
             X_train_left, X_train_right, train_stistics, train_sta, Y_train,
             test_size=0.2, stratify=Y_train)

        print("Train...")
        checkpoint = ModelCheckpoint(
            '../model_file/attention1_{}.hdf5'.format(k),
            monitor='val_loss',
            verbose=1,
            save_best_only=True,
            mode='min')
        early = EarlyStopping(monitor='val_loss', mode='min', patience=5)
        callbacks_list = [checkpoint, early]
        model.fit([X_train_left1, X_train_right1, train_stistics1, train_sta1],
                  Y_train1,
                  batch_size=128,
                  epochs=N_EPOCH,
                  verbose=1,
                  validation_data=([
                      X_train_left2, X_train_right2, train_stistics2,
                      train_sta2
                  ], Y_train2),
                  callbacks=callbacks_list)
        model.load_weights('../model_file/attention1_{}.hdf5'.format(k))
        val_pre = model.predict(
            [X_val_left, X_val_right, val_stistics, val_sta2]).flatten()

        print(val_pre.shape, val_pre)
        stack_tr[va] += val_pre
        print('log_loss', log_loss(Y_val, val_pre))
    df_train_result = pd.DataFrame({'Score': stack_tr})
    df_train_result.to_csv('../result/attention1_train.txt',
                           header=False,
                           index=False)
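The esim() and decomposable_attention() constructors are defined elsewhere in the repository. All the training loops above require of them is the interface: a compiled Keras model that takes four inputs (left and right word-id sequences plus the two statistical feature blocks) and outputs a single sigmoid score, so the monitored 'val_loss' corresponds to log loss. A minimal sketch under those assumptions follows; MAX_LEN, VOCAB_SIZE, STAT_DIM and STAT2_DIM are placeholders, and the layers are a stand-in, not the authors' architecture.

# Hypothetical stand-in for esim()/decomposable_attention(); only the
# four-input/one-output interface is taken from the examples above.
from keras.layers import Input, Embedding, Bidirectional, LSTM, Concatenate, Dense
from keras.models import Model

MAX_LEN, VOCAB_SIZE, STAT_DIM, STAT2_DIM = 30, 50000, 40, 20  # made-up sizes

def esim():
    left = Input(shape=(MAX_LEN,))
    right = Input(shape=(MAX_LEN,))
    stats = Input(shape=(STAT_DIM,))
    stats2 = Input(shape=(STAT2_DIM,))

    embed = Embedding(VOCAB_SIZE, 128)   # shared embedding
    encode = Bidirectional(LSTM(64))     # shared sentence encoder
    l_vec = encode(embed(left))
    r_vec = encode(embed(right))

    merged = Concatenate()([l_vec, r_vec, stats, stats2])
    merged = Dense(128, activation='relu')(merged)
    out = Dense(1, activation='sigmoid')(merged)

    model = Model([left, right, stats, stats2], out)
    # binary_crossentropy makes the monitored 'val_loss' a log loss
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model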
Example #4
def test():
    """Predict scores on the test set."""
    # merged_model is assumed to be built and compiled at module level.
    test_left = np.load('../data/test_left.npy')
    test_right = np.load('../data/test_right.npy')
    statistics_feature = DataUtil.load_matrix('../feature_test/feature_min_max.txt')
    sta2 = pd.read_csv('../feature_test/feature_deepnet.csv').values
    result_np = np.zeros(len(test_right))
    # Average the predictions of the five fold models.
    for i in range(5):
        merged_model.load_weights('../model_file/deepnet1_{}.hdf5'.format(i))
        score = merged_model.predict([test_left, test_right,
                                      statistics_feature, sta2])
        score = np.reshape(score, (len(score),))
        result_np += score
    result_df = pd.DataFrame({"score": result_np / 5})
    result_df.to_csv('../result/deepnet1_test.txt', index=False, header=False)
Example #5
def test():
    """Predict scores on the test set and write a timestamped submission."""
    version = 'clean_stops_number_punciton'
    model = esim()
    test_left = np.load('./data/test_left.npy')
    test_right = np.load('./data/test_right.npy')
    statistics_feature = DataUtil.load_matrix('./feature_test/feature_min_max.txt')
    sta2 = pd.read_csv('./feature_test/feature_deepnet.csv').values
    result_np = np.zeros(len(test_right))
    for i in range(N_FOLD):
        model.load_weights('./model_file/CIKM_dec_Attention_classify_{}.hdf5'.format(i))
        score = model.predict([test_left, test_right, statistics_feature, sta2])
        score = np.reshape(score, (len(score),))
        result_np += score
    # Average the fold predictions and write a timestamped submission file.
    result_df = pd.DataFrame({"score": result_np / N_FOLD})
    import datetime
    unique_flag = datetime.datetime.now().strftime('%m_%d_%H_%M')
    result_df.to_csv('./result/submit_' + version + '_{}.txt'.format(unique_flag),
                     index=False, header=False)
Example #6
def test():
    """Predict test-set scores by averaging the ESIM fold models."""
    version = 'esim'
    # model =  decomposable_attention()
    model = esim()
    test_left = np.load('../data/test_left.npy')
    test_right = np.load('../data/test_right.npy')
    statistics_feature = DataUtil.load_matrix('../feature_test/feature_min_max.txt')
    sta2 = pd.read_csv('../feature_test/feature_deepnet.csv').values
    result_np = np.zeros(len(test_right))
    for i in range(N_FOLD):
        model.load_weights('../model_file/esim_{}.hdf5'.format(i))
        score = model.predict([test_left, test_right, statistics_feature, sta2])
        score = np.reshape(score, (len(score),))
        result_np += score
    result_df = pd.DataFrame({"score": result_np / N_FOLD})
    result_df.to_csv('../result/esim1_test.txt', index=False, header=False)
Example #7
def save_feature(df, step):
    """Compute string/statistical features for each question pair and save them to disk."""
    # powerful_word_oside_feature = df.apply(extract_powerful_word_oside, axis=1)
    # DataUtil.save_matrix('../feature_{}/powerful_word_oside_feature.txt'.format(step), powerful_word_oside_feature, 'w')
    PowerfulWordDoubleSideRate_feature = df.apply(extract_PowerfulWordDoubleSideRate, axis=1)
    DataUtil.save_matrix('../feature_{}/PowerfulWordDoubleSideRate_feature.txt'.format(step), PowerfulWordDoubleSideRate_feature, 'w')
    # PowerfulWordOneSideRate_feautre = df.apply(extract_PowerfulWordOneSideRate, axis=1)
    # DataUtil.save_matrix('../feature_{}/PowerfulWordOneSideRate_feautre.txt'.format(step), PowerfulWordOneSideRate_feautre, 'w')

    powerful_word_dside_feature = df.apply(extract_powerful_word_dside, axis=1).values
    DataUtil.save_matrix('../feature_{}/powerful_word_dside_feature.txt'.format(step), powerful_word_dside_feature, 'w')
    ngramDistance_feature = df.apply(extract_ngramDistance, axis=1).values
    DataUtil.save_matrix('../feature_{}/ngramDistance_feature_feature.txt'.format(step), ngramDistance_feature, 'w')
    NgramDiceDistance_feature = df.apply(extract_NgramDiceDistance, axis=1).values
    DataUtil.save_matrix('../feature_{}/NgramDiceDistance_feature.txt'.format(step), NgramDiceDistance_feature, 'w')
    NgramJaccardCoef_feature = df.apply(extract_NgramJaccardCoef, axis=1).values
    DataUtil.save_matrix('../feature_{}/NgramJaccardCoef_feature.txt'.format(step), NgramJaccardCoef_feature, 'w')
    Distance_feature = df.apply(extract_edit_Distance, axis=1)
    DataUtil.save_matrix('../feature_{}/Distance_feature.txt'.format(step), Distance_feature, 'w')
    no_feature = df.apply(extract_no, axis=1)
    DataUtil.save_matrix('../feature_{}/feature_no.txt'.format(step), no_feature, 'w')
    # word-match feature
    word_match = df.apply(word_match_share, axis=1)
    DataUtil.save_vector('../feature_{}/word_match.txt'.format(step), word_match, 'w')
    # tf-idf word share
    tf_idf_word_share_feature = df.apply(tf_idf_word_share, axis=1)
    DataUtil.save_vector('../feature_{}/tf_idf_word_share_feature.txt'.format(step), tf_idf_word_share_feature, 'w')
    # length features
    print("start")
    len_feature = df.apply(len_word_sentence_feature, axis=1).values
    lendiff_feature = df.apply(lengthdiff, axis=1)
    lendiffrate_feature = df.apply(LengthDiffRate, axis=1)
    DataUtil.save_matrix('../feature_{}/len_feature.txt'.format(step), len_feature, 'w')
    DataUtil.save_matrix('../feature_{}/lendiff_feature.txt'.format(step), lendiff_feature, 'w')
    DataUtil.save_matrix('../feature_{}/lendiffrate_feature.txt'.format(step), lendiffrate_feature, 'w')

    tfidf_feature = df.apply(extract_tfidf_feature, axis=1)
    DataUtil.save_matrix('../feature_{}/tfidf_feature.txt'.format(step), tfidf_feature, 'w')
    # duplicate-count feature
    dul_num = df.apply(extract_dul_num, axis=1)
    DataUtil.save_matrix('../feature_{}/dul_num.txt'.format(step), dul_num, 'w')
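The extractor functions passed to df.apply above (extract_ngramDistance, word_match_share, and so on) are also project-local. As one illustration, word_match_share is a common question-pair feature: the fraction of words the two questions share. A hypothetical sketch, assuming the DataFrame has question1/question2 columns, could look like this:

# Hypothetical sketch of one extractor; the column names are assumptions.
def word_match_share(row):
    q1_words = set(str(row['question1']).split())
    q2_words = set(str(row['question2']).split())
    if not q1_words or not q2_words:
        return 0.0
    shared = q1_words & q2_words
    # ratio of shared words to total words across both questions
    return 2.0 * len(shared) / (len(q1_words) + len(q2_words))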
Example #8
def save_feature(step):
    """Load the individual feature files and assemble them into one matrix."""
    # feature file paths
    NgramDiceDistance_feature_path = '../feature_{}/NgramDiceDistance_feature.txt'.format(
        step)
    NgramJaccardCoef_feature_path = '../feature_{}/NgramJaccardCoef_feature.txt'.format(
        step)
    Distance_feature_path = '../feature_{}/Distance_feature.txt'.format(step)
    no_feature_path = '../feature_{}/feature_no.txt'.format(step)
    word_match_path = '../feature_{}/word_match.txt'.format(step)
    tf_idf_word_share_feature_path = '../feature_{}/tf_idf_word_share_feature.txt'.format(
        step)
    len_feature_path = '../feature_{}/len_feature.txt'.format(step)
    lendiff_feature_path = '../feature_{}/lendiff_feature.txt'.format(step)
    lendiffrate_feature_path = '../feature_{}/lendiffrate_feature.txt'.format(
        step)
    tfidf_feature_path = '../feature_{}/tfidf_feature.txt'.format(step)
    ngramDistance_feature_path = '../feature_{}/ngramDistance_feature_feature.txt'.format(
        step)
    powerful_word_dside_feature_path = '../feature_{}/powerful_word_dside_feature.txt'.format(
        step)
    powerful_word_oside_feature_path = '../feature_{}/powerful_word_oside_feature.txt'.format(
        step)
    PowerfulWordDoubleSideRate_feature_path = '../feature_{}/PowerfulWordDoubleSideRate_feature.txt'.format(
        step)
    PowerfulWordOneSideRate_feautre_path = '../feature_{}/PowerfulWordOneSideRate_feautre.txt'.format(
        step)
    dul_num_path = '../feature_{}/dul_num.txt'.format(step)
    # load features
    powerful_word_dside_feature = DataUtil.load_matrix(
        powerful_word_dside_feature_path)
    PowerfulWordDoubleSideRate_feature = DataUtil.load_matrix(
        PowerfulWordDoubleSideRate_feature_path)
    no_feature = DataUtil.load_matrix(no_feature_path)
    # no_feature_min_max_transfer= preprocessing.MinMaxScaler()
    # no_feature_train_minmax = no_feature_min_max_transfer.fit_transform(no_feature)
    word_match_feature = DataUtil.load_matrix(word_match_path)
    ngramDistance_feature = DataUtil.load_matrix(ngramDistance_feature_path)
    tf_idf_word_share_feature = DataUtil.load_matrix(
        tf_idf_word_share_feature_path)

    dul_num = DataUtil.load_matrix(dul_num_path)
    dul_num = preprocessing.scale(dul_num)
    NgramDiceDistance_feature = DataUtil.load_matrix(
        NgramDiceDistance_feature_path)
    NgramJaccardCoef_feature = DataUtil.load_matrix(
        NgramJaccardCoef_feature_path)
    # Distance_feature = DataUtil.load_matrix(Distance_feature_path)
    len_feature = DataUtil.load_matrix(len_feature_path)
    lendiff_feature = DataUtil.load_matrix(lendiff_feature_path)
    lendiffrate_feature = DataUtil.load_matrix(lendiffrate_feature_path)
    tfidf_feature = DataUtil.load_matrix(tfidf_feature_path)
    tfidf_feature = np.nan_to_num(tfidf_feature)
    # The train csv carries one extra leading column, so skip one more there.
    if step == 'train':
        cut_index = 3
    else:
        cut_index = 2
    train_distance_feature = pd.read_csv(
        '../feature_{}/w2vec_features_scale.csv'.format(step), encoding='gbk')
    train_distance_feature = train_distance_feature.fillna(value=0)
    train_distance_feature = train_distance_feature.iloc[:, cut_index:]
    train_distance_feature.to_csv(
        '../feature_{}/feature_deepnet.csv'.format(step), index=False)
    comb_feature = pd.read_csv('../feature_{}/comb.csv'.format(step),
                               encoding='gbk')
    print('comb', comb_feature.shape)
    print('distance_feature', train_distance_feature.shape)
    feature = np.concatenate(
        (comb_feature, ngramDistance_feature, lendiffrate_feature,
         lendiff_feature, len_feature, tfidf_feature,
         powerful_word_dside_feature, NgramJaccardCoef_feature,
         NgramDiceDistance_feature, dul_num, no_feature, word_match_feature,
         tf_idf_word_share_feature),
        axis=1)
    DataUtil.save_matrix('../feature_{}/feature.txt'.format(step), feature,
                         'w')
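The np.concatenate(axis=1) call above only succeeds if every block is 2-D with the same number of rows, which is why load_matrix should return (n, k) arrays even for single-column features. A hypothetical sanity check (not part of the repository) can catch a mis-shaped block early:

# Hypothetical helper: verify all feature blocks are 2-D and row-aligned
# before attempting np.concatenate(..., axis=1).
import numpy as np

def check_blocks(named_blocks):
    n_rows = None
    for name, block in named_blocks.items():
        block = np.asarray(block)
        assert block.ndim == 2, '{} is {}-D, expected 2-D'.format(name, block.ndim)
        if n_rows is None:
            n_rows = block.shape[0]
        assert block.shape[0] == n_rows, '{} has {} rows, expected {}'.format(
            name, block.shape[0], n_rows)
        print(name, block.shape)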
Example #9


if __name__ == '__main__':
    # Build features for both splits, then standardize them on a shared scale.
    save_feature('train')
    save_feature('test')
    train_feature = DataUtil.load_matrix('../feature_train/feature.txt')
    test_feature = DataUtil.load_matrix('../feature_test/feature.txt')
    print(train_feature.shape)
    feature = np.concatenate([train_feature, test_feature], axis=0)
    # Note: despite the 'min_max' file names, this applies standard
    # (z-score) scaling, fitted on train and test together.
    scale_transfer = preprocessing.StandardScaler()
    scale_transfer_fit = scale_transfer.fit(feature)
    train_feature_min_max = scale_transfer_fit.transform(train_feature)
    test_feature_min_max = scale_transfer_fit.transform(test_feature)
    DataUtil.save_matrix('../feature_train/feature_min_max.txt',
                         train_feature_min_max, 'w')
    DataUtil.save_matrix('../feature_test/feature_min_max.txt',
                         test_feature_min_max, 'w')