def get_data(trainfile, testfile, w2v_file, c2v_file, base_datafile,
             user_datafile, w2v_k, c2v_k=100, data_split=1, maxlen=50):
    """Entry point for data preprocessing.

    Converts the input files into the model input formats: builds (or loads
    from ``base_datafile``) the character/target vocabularies and the
    character-embedding matrix, turns ``trainfile`` into index sequences plus
    per-character sensitivity values, carves out a 1/5 test fold selected by
    ``data_split``, and pickles the resulting split to ``user_datafile``.

    Args:
        trainfile: CoNLL-style training file; also the source of the test fold.
        testfile: unused by the current split logic (kept for interface
            compatibility with callers).
        w2v_file: unused here (word-level embeddings; kept for compatibility).
        c2v_file: character-embedding text file; a file name containing
            "DoubleEmd" selects the dedicated loader.
        base_datafile: pickle cache for vocabularies/embeddings; reused
            when it already exists on disk.
        user_datafile: output pickle receiving the train/test split.
        w2v_k: unused here (kept for interface compatibility).
        c2v_k: character-embedding dimension passed to the loader.
        data_split: 1-based index of the 1/5 fold used as the test set.
        maxlen: unused here (kept for interface compatibility).

    Returns:
        Tuple ``(train, train_SensitiV, train_label, test, test_SensitiV,
        test_label)`` — the same objects pickled to ``user_datafile``.
    """
    if not os.path.exists(base_datafile):
        print("Precess base data....")
        char_vob, idex_2char, target_vob, idex_2target, max_s = \
            get_Character_index({trainfile})
        print("source char size: ", len(char_vob))
        print("max_s: ", max_s)
        print("source char: ", len(idex_2char))
        print("target vocab size: ", len(target_vob), str(target_vob))
        print("target vocab size: ", len(idex_2target))

        # "DoubleEmd" embedding files use a different text layout and need
        # their own loader.
        if 'DoubleEmd' in c2v_file:
            char_k, char_W = load_vec_txt_DoubleEmd(c2v_file, char_vob, c2v_k)
        else:
            char_k, char_W = load_vec_txt(c2v_file, char_vob, c2v_k)
        print('character_W shape:', char_W.shape)

        print("base dataset created!")
        # Protocol 0 kept for compatibility with existing cache files.
        with open(base_datafile, 'wb') as out:
            pickle.dump([char_vob, target_vob, idex_2char, idex_2target,
                         char_W, char_k, max_s], out, 0)
    else:
        print("base data has existed ....")
        with open(base_datafile, 'rb') as cache:
            (char_vob, target_vob, idex_2char, idex_2target,
             char_W, char_k, max_s) = pickle.load(cache)

    train_all, target_all = make_idx_Char_index(trainfile, max_s, char_vob,
                                                target_vob)

    # Character-level sensitivity statistics from the raw text corpus.
    # NOTE(review): this path is hard-coded while the indexed data comes from
    # `trainfile` — confirm both always refer to the same corpus.
    file = './data/subtask1_training_all.txt'
    EntCharDict, OutECDict, count_allc, count_entc = \
        Sensitivity.GetVariousDist(file)
    train_all_SensitiV = calSensitiValues(trainfile, max_s, EntCharDict,
                                          OutECDict)

    # 5-fold-style split: fold `data_split` (1-based) becomes the test set,
    # the remaining four folds the training set.
    extra_test_num = len(train_all) // 5
    lo = extra_test_num * (data_split - 1)
    hi = extra_test_num * data_split
    test = train_all[lo:hi]
    test_SensitiV = train_all_SensitiV[lo:hi]
    test_label = target_all[lo:hi]
    train = train_all[:lo] + train_all[hi:]
    train_SensitiV = train_all_SensitiV[:lo] + train_all_SensitiV[hi:]
    train_label = target_all[:lo] + target_all[hi:]

    print('extra_test_num....data_split', extra_test_num, data_split)
    print('train len ', len(train), len(train_label))
    print('test len ', len(test), len(test_label))

    print("dataset created!")
    # Protocol 0 kept for compatibility with existing consumers of this file.
    with open(user_datafile, 'wb') as out:
        pickle.dump([train, train_SensitiV, train_label,
                     test, test_SensitiV, test_label], out, 0)

    return train, train_SensitiV, train_label, test, test_SensitiV, test_label
# NOTE(review): a second copy of get_data()'s final
# print/open/pickle.dump/close sequence appeared here, outside the function,
# referencing names (`train`, `user_datafile`, ...) that do not exist at
# module level. It duplicated the dump already performed inside get_data()
# and has been removed.

if __name__ == "__main__":
    print(20 * 2)

    trainfile = './data/subtask1_training_all.conll.txt'
    c2v_file = "./data/preEmbedding/CCKS2019_DoubleEmd_Char2Vec.txt"

    print("Precess base data....")
    char_vob, idex_2char, target_vob, idex_2target, max_s = \
        get_Character_index({trainfile})
    print("source char size: ", len(char_vob))
    print("max_s: ", max_s)
    # Override the observed max sentence length with a fixed value.
    # NOTE(review): 136 matches the commented-out constant inside
    # get_data() — presumably the length the models were built for; confirm.
    max_s = 136
    print("max_s: ", max_s)
    print("source char: ", len(idex_2char))
    print("target vocab size: ", len(target_vob), str(target_vob))
    print("target vocab size: ", len(idex_2target))

    file = './data/subtask1_training_all.txt'
    # NOTE(review): GetVariousDist is unpacked into 4 values inside
    # get_data() but only 2 here — one call site is likely out of date;
    # verify the function's actual return arity.
    EntCharDict, OutECDict = Sensitivity.GetVariousDist(file)
    train_all_SensitiV = calSensitiValues(trainfile, max_s, EntCharDict,
                                          OutECDict)