Example #1
import os
import pickle

# get_Character_index, load_vec_txt, load_vec_txt_DoubleEmd, make_idx_Char_index,
# calSensitiValues and the Sensitivity module are assumed to be defined elsewhere
# in the same project.


def get_data(trainfile,
             testfile,
             w2v_file,
             c2v_file,
             base_datafile,
             user_datafile,
             w2v_k,
             c2v_k=100,
             data_split=1,
             maxlen=50):
    """
    数据处理的入口函数
    Converts the input files  into the model input formats

    """
    '''
    pos_vob, pos_idex_word = get_Feature_index([trainfile,devfile,testfile])
    pos_train = make_idx_POS_index(trainfile, max_s, pos_vob)
    pos_dev = make_idx_POS_index(devfile, max_s, pos_vob)
    pos_test = make_idx_POS_index(testfile, max_s, pos_vob)
    pos_W, pos_k = load_vec_character(pos_vob, 30)
    # pos_k, pos_W = load_vec_onehot(pos_vob)

    # print('entlabel vocab size:'+str(len(entlabel_vob)))
    print('shape in pos_W:', pos_W.shape)
    '''

    if not os.path.exists(base_datafile):

        print("Precess base data....")
        char_vob, idex_2char, target_vob, idex_2target, max_s = get_Character_index(
            {trainfile})
        print("source char size: ", char_vob.__len__())
        print("max_s: ", max_s)
        # max_s = 136
        # print("max_s: ", max_s)
        print("source char: ", len(idex_2char))
        print("target vocab size: ", len(target_vob), str(target_vob))
        print("target vocab size: ", len(idex_2target))

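        # Pick the character-embedding loader according to the embedding file type.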
        if 'DoubleEmd' in c2v_file:
            char_k, char_W = load_vec_txt_DoubleEmd(c2v_file, char_vob, c2v_k)
        else:
            char_k, char_W = load_vec_txt(c2v_file, char_vob, c2v_k)
        print('character_W shape:', char_W.shape)

        print("base dataset created!")
        with open(base_datafile, 'wb') as out:
            pickle.dump([
                char_vob, target_vob, idex_2char, idex_2target, char_W, char_k,
                max_s
            ], out, 0)

    else:
        print("base data already exists ....")
        with open(base_datafile, 'rb') as f:
            (char_vob, target_vob, idex_2char, idex_2target, char_W, char_k,
             max_s) = pickle.load(f)

    train_all, target_all = make_idx_Char_index(trainfile, max_s, char_vob,
                                                target_vob)

    file = './data/subtask1_training_all.txt'
    EntCharDict, OutECDict, count_allc, count_entc = Sensitivity.GetVariousDist(
        file)
    train_all_SensitiV = calSensitiValues(trainfile, max_s, EntCharDict,
                                          OutECDict)

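    # Hold out one fifth of the training data as the test fold; data_split
    # (1-5) selects which fifth is used for testing.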
    extra_test_num = int(len(train_all) / 5)
    # test_all, test_target_all = make_idx_Char_index(testfile, max_s, char_vob, target_vob)
    # test = train_all[:extra_test_num]
    # test_label = target_all[:extra_test_num]
    # train = train_all[extra_test_num:] + test_all[:]
    # train_label = target_all[extra_test_num:] + test_target_all[:]
    # print('extra_test_num', extra_test_num)

    test_start = extra_test_num * (data_split - 1)
    test_end = extra_test_num * data_split
    test = train_all[test_start:test_end]
    test_SensitiV = train_all_SensitiV[test_start:test_end]
    test_label = target_all[test_start:test_end]
    train = train_all[:test_start] + train_all[test_end:]
    train_SensitiV = train_all_SensitiV[:test_start] + train_all_SensitiV[test_end:]
    train_label = target_all[:test_start] + target_all[test_end:]
    print('extra_test_num....data_split', extra_test_num, data_split)

    print('train len  ', len(train), len(train_label))
    print('test len  ', len(test), len(test_label))

    print("dataset created!")
    with open(user_datafile, 'wb') as out:
        pickle.dump(
            [train, train_SensitiV, train_label, test, test_SensitiV, test_label],
            out, 0)
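
A minimal usage sketch (not part of the original listing): get_data returns nothing and instead writes its outputs to base_datafile and user_datafile as pickles. The trainfile and c2v_file paths below reuse those shown in Example #2; testfile, w2v_file, base_datafile, user_datafile and w2v_k are illustrative placeholders.

get_data(trainfile='./data/subtask1_training_all.conll.txt',
         testfile='./data/subtask1_test.conll.txt',  # placeholder
         w2v_file='./data/preEmbedding/word2vec.txt',  # placeholder
         c2v_file='./data/preEmbedding/CCKS2019_DoubleEmd_Char2Vec.txt',
         base_datafile='./data/base_data.pkl',  # placeholder
         user_datafile='./data/user_data.pkl',  # placeholder
         w2v_k=100,  # placeholder
         c2v_k=100,
         data_split=1,
         maxlen=50)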
Example #2
    print("dataset created!")
    out = open(user_datafile, 'wb')
    pickle.dump(
        [train, train_SensitiV, train_label, test, test_SensitiV, test_label],
        out, 0)
    out.close()


if __name__ == "__main__":
    print(20 * 2)

    trainfile = './data/subtask1_training_all.conll.txt'

    c2v_file = "./data/preEmbedding/CCKS2019_DoubleEmd_Char2Vec.txt"

    print("Precess base data....")
    char_vob, idex_2char, target_vob, idex_2target, max_s = get_Character_index(
        {trainfile})
    print("source char size: ", char_vob.__len__())
    print("max_s: ", max_s)
    max_s = 136
    print("max_s: ", max_s)
    print("source char: ", len(idex_2char))
    print("target vocab size: ", len(target_vob), str(target_vob))
    print("target vocab size: ", len(idex_2target))

    file = './data/subtask1_training_all.txt'
    EntCharDict, OutECDict = Sensitivity.GetVariousDist(file)
    train_all_SensitiV = calSensitiValues(trainfile, max_s, EntCharDict,
                                          OutECDict)
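
For reference, a hedged sketch (not in the original code) of reading back the user_datafile pickle written by get_data, unpacking in the same order used by pickle.dump above; the path is a placeholder and must match the one passed to get_data.

import pickle

with open('./data/user_data.pkl', 'rb') as f:  # placeholder path
    (train, train_SensitiV, train_label,
     test, test_SensitiV, test_label) = pickle.load(f)
print('train/test sizes:', len(train), len(test))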