def preprocess(p, h, chars_per_word, preprocessor, save_dir, data_paths,
               word_vector_save_path, normalize_word_vectors,
               max_loaded_word_vectors=None, word_vectors_load_path=None,
               include_word_vectors=True, include_chars=True,
               include_syntactical_features=True, include_exact_match=True):
    preprocessor.get_all_words_with_parts_of_speech([data_path[1] for data_path in data_paths])
    print('Found', len(preprocessor.unique_words), 'unique words')
    print('Found', len(preprocessor.unique_parts_of_speech), 'unique parts of speech')

    # Init mappings of the preprocessor
    preprocessor.init_word_to_vectors(vectors_file_path=get_word2vec_file_path(word_vectors_load_path),
                                      needed_words=preprocessor.unique_words,
                                      normalize=normalize_word_vectors,
                                      max_loaded_word_vectors=max_loaded_word_vectors)
    preprocessor.init_chars(words=preprocessor.unique_words)
    preprocessor.init_parts_of_speech(parts_of_speech=preprocessor.unique_parts_of_speech)

    # Process and save the data
    preprocessor.save_word_vectors(word_vector_save_path)
    for dataset, input_path in data_paths:
        data = preprocessor.parse(input_file_path=input_path,
                                  max_words_p=p, max_words_h=h,
                                  chars_per_word=chars_per_word)

        # Determine which part of data we need to dump
        if not include_exact_match:
            del data[6:8]  # Exact match feature
        if not include_syntactical_features:
            del data[4:6]  # Syntactical POS tags
        if not include_chars:
            del data[2:4]  # Character features
        if not include_word_vectors:
            del data[0:2]  # Word vectors

        data_saver = ChunkDataManager(save_data_path=os.path.join(save_dir, dataset))
        data_saver.save([np.array(item) for item in data])
def preprocess(p, h, chars_per_word, preprocessor, save_dir, data_paths,
               word_vector_save_path, normalize_word_vectors,
               max_loaded_word_vectors=None, word_vectors_load_path=None,
               include_word_vectors=True, include_chars=True,
               include_syntactical_features=True, include_exact_match=True):
    preprocessor.get_all_words_with_parts_of_speech([data_path[1] for data_path in data_paths])
    print('Found', len(preprocessor.unique_words), 'unique words')
    print('Found', len(preprocessor.unique_parts_of_speech), 'unique parts of speech')

    # Init mappings of the preprocessor
    preprocessor.init_word_to_vectors(vectors_file_path=word_vectors_load_path,
                                      needed_words=preprocessor.unique_words,
                                      normalize=normalize_word_vectors,
                                      max_loaded_word_vectors=max_loaded_word_vectors)
    # preprocessor.init_chars(words=preprocessor.unique_words)
    preprocessor.init_parts_of_speech(parts_of_speech=preprocessor.unique_parts_of_speech)

    # Process and save the data
    # Save the vectors of the relevant words to file (words missing from the pretrained
    # vectors get randomly initialized vectors)
    preprocessor.save_word_vectors(word_vector_save_path)
    for dataset, input_path in data_paths:
        # data_paths example:
        # [('train', '/home/gswyhq/data/LCQMC/train.txt'),
        #  ('test', '/home/gswyhq/data/LCQMC/test.txt'),
        #  ('dev', '/home/gswyhq/data/LCQMC/dev.txt')]
        data = preprocessor.parse(input_file_path=input_path,
                                  max_words_p=p, max_words_h=h,
                                  chars_per_word=chars_per_word)
        # Returned data: sentence-1 word-id list, sentence-2 word-id list,
        # sentence-1 padded char matrix, sentence-2 padded char matrix,
        # sentence-1 padded POS matrix, sentence-2 padded POS matrix,
        # sentence-1 exact-match flags (word appears in sentence 2),
        # sentence-2 exact-match flags (word appears in sentence 1), label

        # Determine which part of data we need to dump
        if not include_exact_match:
            del data[6:8]  # Exact match feature
        if not include_syntactical_features:
            del data[4:6]  # Syntactical POS tags
        if not include_chars:
            del data[2:4]  # Character features
        if not include_word_vectors:
            del data[0:2]  # Word vectors

        data_saver = ChunkDataManager(save_data_path=os.path.join(save_dir, dataset))
        data_saver.save([np.array(item) for item in data])
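# Hedged usage sketch (not part of the original repository): one way the preprocess()
# function above could be driven end to end. SNLIPreprocessor is the class used elsewhere
# in this repo; the paths, sequence limits and chars_per_word below are illustrative
# assumptions only.
def _example_run_preprocess():
    example_paths = [('train', 'data/snli_1.0/snli_1.0_train.jsonl'),
                     ('test', 'data/snli_1.0/snli_1.0_test.jsonl'),
                     ('dev', 'data/snli_1.0/snli_1.0_dev.jsonl')]
    preprocess(p=32, h=32, chars_per_word=16,
               preprocessor=SNLIPreprocessor(),
               save_dir='data',
               data_paths=example_paths,
               word_vector_save_path='data/word-vectors.npy',
               normalize_word_vectors=False,
               word_vectors_load_path='data/glove.840B.300d.txt')  # assumed GloVe file location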
def test_DUCTAC(model, data_path, voca_name, re_exp, args):
    # data_list = [0, 1]
    data_path = data_path + '_' + voca_name
    data_list = [0, 1, 2, 3]
    folders = [os.path.join(data_path, d) for d in os.listdir(data_path)
               if os.path.isdir(os.path.join(data_path, d)) and re.search(re_exp, d)]
    print('{} folders are found from {}'.format(len(folders), data_path))

    time_takes = []
    for i, folder in enumerate(folders):
        x_test = ChunkDataManager(load_data_path=folder).load(load_list=data_list)

        start_test = time.time()
        y_pred = model.predict(x_test, batch_size=args.batch_size)
        finish_test = time.time() - start_test

        base_name = os.path.basename(folder)
        time_takes.append(finish_test)
        print('[{} {}] elapsed testing time: {:3.3f} secs'.format(i + 1, base_name, finish_test))

        matfile_name = os.path.join(folder, 'prediction.mat')
        sio.savemat(matfile_name, {'pred_siamese': y_pred})

    print('average elapsed testing time of 1 doc: {}'.format(np.mean(time_takes)))

    # copy prediction files
    dest_path = os.path.dirname(data_path)
    aux_name = '_' + voca_name
    dest_path = os.path.join(dest_path, 'predictions', 'model' + aux_name)
    copy_predfile(data_path, dest_path, re_exp)
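# Hedged usage sketch (not part of the original repository): test_DUCTAC() above expects a
# trained Keras model, the root of the preprocessed DUC/TAC data, the vocabulary name used
# during preprocessing and an args object carrying batch_size. The checkpoint path, the
# custom_objects remark and the concrete data path are assumptions for illustration.
def _example_run_test_DUCTAC():
    import argparse
    from keras.models import load_model
    model = load_model('checkpoints/model.hdf5')       # custom layers may require custom_objects
    example_args = argparse.Namespace(batch_size=70)   # only batch_size is read by test_DUCTAC
    test_DUCTAC(model,
                data_path='data/DUC2004/data',          # folders are read from data_path + '_' + voca_name
                voca_name='voca50k_300d',               # 'voca' + str(voca_size / 1000) + 'k_' + str(voca_dim) + 'd'
                re_exp='^d[0-9]+',
                args=example_args)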
os.makedirs(save_folder)
params = vars(args)
print(json.dumps(params, indent=2))
config_fn = os.path.join(save_folder, 'config.json')
if not args.testing:
    with open(config_fn, 'w') as outfile:
        json.dump(params, outfile)

''' Prepare data '''
voca_name = 'voca' + str(args.voca_size / 1000) + 'k_' + str(args.voca_dim) + 'd'
dataset_list = args.dataset_list
for dataset in dataset_list:
    if dataset.startswith('cnn'):
        train_cnndm = ChunkDataManager(
            load_data_path=os.path.join(args.load_dir, 'cnn_dm', 'train_' + voca_name)).load(load_list=args.data_list)
        test_cnndm = ChunkDataManager(
            load_data_path=os.path.join(args.load_dir, 'cnn_dm', 'test_' + voca_name)).load(load_list=args.data_list)
        val_cnndm = ChunkDataManager(
            load_data_path=os.path.join(args.load_dir, 'cnn_dm', 'val_' + voca_name)).load(load_list=args.data_list)
        if not args.testing:
            test_cnndm = val_cnndm
        print('# cnn_dm samples: train:{}, test:{}'.format(train_cnndm[0].shape[0], test_cnndm[0].shape[0]))
                    default='logs', help='Tensorboard logs dir', type=str)
parser.add_argument('--word_vec_path', default='data/word-vectors.npy',
                    help='Save path for word vectors', type=str)
parser.add_argument('--omit_word_vectors', action='store_true')
parser.add_argument('--omit_chars', action='store_true')
parser.add_argument('--omit_syntactical_features', action='store_true')
parser.add_argument('--omit_exact_match', action='store_true')
parser.add_argument('--train_word_embeddings', action='store_true')
args = parser.parse_args()

''' Prepare data '''
word_embedding_weights = np.load(args.word_vec_path)
train_data = ChunkDataManager(load_data_path=os.path.join(args.load_dir, 'train')).load()
test_data = ChunkDataManager(load_data_path=os.path.join(args.load_dir, 'test')).load()
dev_data = ChunkDataManager(load_data_path=os.path.join(args.load_dir, 'dev')).load()

''' Getting dimensions of the input '''
chars_per_word = train_data[3].shape[-1] if not args.omit_chars else 0
syntactical_feature_size = train_data[5].shape[-1] if not args.omit_syntactical_features else 0

''' Prepare the model and optimizers '''
adam = L2Optimizer(Adam())
adadelta = L2Optimizer(Adadelta(lr=0.5, rho=0.95, epsilon=1e-8))
sgd = L2Optimizer(SGD(lr=3e-3))
model = DIIN(p=train_data[0].shape[-1],  # or None
             h=train_data[1].shape[-1],  # or None
                    default='logs', help='Tensorboard logs dir', type=str)
parser.add_argument('--word_vec_path', default='data/word-vectors.npy',
                    help='Save path for word vectors', type=str)
parser.add_argument('--omit_word_vectors', default=0, type=int)
parser.add_argument('--omit_chars', default=0, type=int)
parser.add_argument('--omit_exact_match', default=0, type=int)
parser.add_argument('--train_word_embeddings', default=True)
parser.add_argument('--is_train', default=1, type=int)
args = parser.parse_args()

''' Prepare data '''
word_embedding_weights = np.load(args.word_vec_path)
all_data = ChunkDataManager(load_data_path=os.path.join(args.load_dir, 'train')).load()
predict_data = ChunkDataManager(load_data_path=os.path.join(args.load_dir, 'dev')).load()
is_train = args.is_train

if args.omit_exact_match == 1:
    del all_data[4:6]
    del predict_data[4:6]
if args.omit_chars == 1:
    del all_data[2:4]
    del predict_data[2:4]

train_data = [[] for i in range(len(all_data))]
test_data = [[] for i in range(len(all_data))]
dev_data = [[] for i in range(len(all_data))]
for i, all_data_i in enumerate(all_data):
def preprocess_unified(p, h, preprocessors, save_dir, data_paths, dataset_to_save,
                       word_vectors_load_path=None, normalize_word_vectors=False,
                       voca_size=[50000], voca_dim=300, data_root_dir='data',
                       word_vector_save_path=None, word2id_save_path=None,
                       max_loaded_word_vectors=None, include_word_vectors=True,
                       include_exact_match=False, include_chars=False,
                       include_syntactical_features=False):
    unified_preprocessor = SNLIPreprocessor()

    # load word vectors
    if voca_dim == 300:
        word2vec_func = get_word2vec_file_path
    elif voca_dim == 100:
        word2vec_func = get_word2vec_100d_file_path
    unified_preprocessor.call_load_word_vector(file_path=word2vec_func(word_vectors_load_path),
                                               normalize=normalize_word_vectors,
                                               max_words=max_loaded_word_vectors)

    if False and os.path.exists(word2id_save_path) and os.path.exists(word_vector_save_path):
        unified_preprocessor.load_word2id_dict(word2id_save_path)
        unified_preprocessor.vectors = np.load(word_vector_save_path)
    else:
        # get all tokenized words from all datasets
        for dataset, ppr in preprocessors.iteritems():
            data_path = data_paths[dataset]
            if dataset.endswith('nli'):
                if dataset.startswith('s'):
                    train_path = os.path.join(data_path, dataset + '_1.0_train.jsonl')
                    test_path = os.path.join(data_path, dataset + '_1.0_test.jsonl')
                    dev_path = os.path.join(data_path, dataset + '_1.0_dev.jsonl')
                    paths = [('train', train_path), ('test', test_path), ('dev', dev_path)]
                elif dataset.startswith('m'):
                    train_path = os.path.join(data_path, dataset + '_1.0_train.jsonl')
                    test_matched_path = os.path.join(data_path, dataset + '_1.0_dev_matched.jsonl')
                    test_mismatched_path = os.path.join(data_path, dataset + '_1.0_dev_mismatched.jsonl')
                    paths = [('train', train_path),
                             ('test_matched', test_matched_path),
                             ('test_mismatched', test_mismatched_path)]
                ppr.get_all_words_with_parts_of_speech([path[1] for path in paths])
                print('Found {} unique words, {} unique parts of speech from {}'.format(
                    len(ppr.unique_words), len(ppr.unique_parts_of_speech), dataset))
                unified_preprocessor.unique_parts_of_speech = \
                    unified_preprocessor.unique_parts_of_speech.union(ppr.unique_parts_of_speech)
            elif dataset.startswith('DUC'):
                folders = [os.path.join(data_path, d) for d in os.listdir(data_path)
                           if os.path.isdir(os.path.join(data_path, d)) and re.search('^d[0-9]+', d)]
                ppr.get_all_words_DUC(folders)
                print('Found {} unique words from {}'.format(len(ppr.unique_words), dataset))
            elif dataset.startswith('cnn'):
                paths = [('train', data_path), ('test', data_path), ('val', data_path)]
                ppr.gen_all_pairs(paths)
                ppr.get_all_words()
                print('Found {} unique words from {}'.format(len(ppr.unique_words), dataset))

            unified_preprocessor.unique_words = unified_preprocessor.unique_words.union(ppr.unique_words)

            # merge the word-frequency dicts
            temp_dict = dict()
            for d in (unified_preprocessor.unique_words_freq, ppr.unique_words_freq):
                for word, freq in d.items():
                    if word in temp_dict:
                        temp_dict[word] += freq
                    else:
                        temp_dict[word] = freq
            unified_preprocessor.unique_words_freq = temp_dict

    # sort to get the 50k most frequent words
    sorted_dict = sorted(unified_preprocessor.unique_words_freq.items(),
                         key=operator.itemgetter(1), reverse=True)

    for v_size in voca_size:
        voca_name = 'voca' + str(v_size / 1000) + 'k_' + str(voca_dim) + 'd'
        selected_dict = sorted_dict[:v_size]  # list of tuples
        unified_preprocessor.unique_words_voca = [word[0] for word in selected_dict]
        print('Found {} unique words, {} unique parts of speech from unified'.format(
            len(unified_preprocessor.unique_words), len(unified_preprocessor.unique_parts_of_speech)))
        print('Found {} unique words freq(dict), {} {} words(list)'.format(
            len(unified_preprocessor.unique_words_freq),
            len(unified_preprocessor.unique_words_voca), voca_name))

        n_print = 20
        print('Top {} frequent words'.format(n_print))
        for it in range(n_print):
            print('{} - {}:{}'.format(it + 1, selected_dict[it][0], selected_dict[it][1]))

        # initialize w2v, word2id
        # ADD <START>, <END>, UNK, ZERO into the vocabulary
        unified_preprocessor.init_word_to_vectors(needed_words=unified_preprocessor.unique_words_voca,
                                                  normalize=normalize_word_vectors)
        unified_preprocessor.init_chars(words=unified_preprocessor.unique_words)
        unified_preprocessor.init_parts_of_speech(parts_of_speech=unified_preprocessor.unique_parts_of_speech)

        # dirname = os.path.dirname(word2id_save_path)
        word2id_path = os.path.join(data_root_dir, 'word2id_' + voca_name + '_unified.pkl')
        wordvec_path = os.path.join(data_root_dir, 'word-vectors_' + voca_name + '_unified.npy')
        unified_preprocessor.save_word2id_dict(word2id_path)
        unified_preprocessor.save_word_vectors(wordvec_path)

        # assign word ids and save
        for dataset in dataset_to_save:
            print('***** [{}] data saving *****'.format(dataset))
            data_path = data_paths[dataset]
            if dataset.endswith('nli'):
                if dataset.startswith('s'):
                    train_path = os.path.join(data_path, dataset + '_1.0_train.jsonl')
                    test_path = os.path.join(data_path, dataset + '_1.0_test.jsonl')
                    dev_path = os.path.join(data_path, dataset + '_1.0_dev.jsonl')
                    paths = [('train_' + voca_name, train_path),
                             ('test_' + voca_name, test_path),
                             ('dev_' + voca_name, dev_path)]
                elif dataset.startswith('m'):
                    train_path = os.path.join(data_path, dataset + '_1.0_train.jsonl')
                    test_matched_path = os.path.join(data_path, dataset + '_1.0_dev_matched.jsonl')
                    test_mismatched_path = os.path.join(data_path, dataset + '_1.0_dev_mismatched.jsonl')
                    paths = [('train_' + voca_name, train_path),
                             ('test_matched_' + voca_name, test_matched_path),
                             ('test_mismatched_' + voca_name, test_mismatched_path)]
                for dataset_var, input_path in paths:
                    data = unified_preprocessor.parse(input_file_path=input_path,
                                                      max_words_p=p, max_words_h=h)

                    # Determine which part of data we need to dump
                    if not include_exact_match:
                        del data[6:8]  # Exact match feature
                    if not include_syntactical_features:
                        del data[4:6]  # Syntactical POS tags
                    if not include_chars:
                        del data[2:4]  # Character features
                    if not include_word_vectors:
                        del data[0:2]  # Word vectors

                    data_saver = ChunkDataManager(save_data_path=os.path.join(data_path, dataset_var))
                    data_saver.save([np.array(item) for item in data])
            elif dataset.startswith('cnn'):
                cnn_dm = preprocessors['cnn_dm']
                cnn_dm.assign_w2id(unified_preprocessor.word_to_id)
                data_path_1up = os.path.dirname(data_path)  # data/cnn_dm
                paths = ['train_' + voca_name, 'test_' + voca_name, 'val_' + voca_name]
                for i, path_name in enumerate(paths):
                    data_saver = ChunkDataManager(save_data_path=os.path.join(data_path_1up, path_name))
                    data_saver.save([np.array(item) for item in cnn_dm.data_all[i]])
            elif dataset.startswith('DUC'):
                DUC_save_dir = 'data_' + voca_name
                save_path = os.path.join(data_path, DUC_save_dir)  # save_dir
                folders = [os.path.join(data_path, d) for d in os.listdir(data_path)
                           if os.path.isdir(os.path.join(data_path, d)) and re.search('^d[0-9]+', d)]
                for dir in folders:
                    file_name = os.path.basename(dir)
                    file_path = os.path.join(dir, file_name + '.txt')
                    sents = unified_preprocessor.load_txt_data(file_path=file_path)  # list of lists: processed words in sentences
                    sents_processed = []
                    sents_raw_words = []
                    for sent in sents:
                        sent_tmp = []
                        sent_tmp.append(unified_preprocessor.word_to_id['<START>'])
                        word_tokens = word_tokenize(sent)
                        for word in word_tokens:
                            word = preprocess_word(word)
                            if word in unified_preprocessor.word_to_id:
                                sent_tmp.append(unified_preprocessor.word_to_id[word])
                            else:
                                sent_tmp.append(unified_preprocessor.word_to_id['<UNK>'])
                        sent_tmp.append(unified_preprocessor.word_to_id['<END>'])
                        sents_processed.append(sent_tmp)
                        sents_raw_words.append(word_tokens)

                    # test pair for similarity measure
                    sents_pair_p = []
                    sents_pair_h = []
                    sents_exact_pair_p = []
                    sents_exact_pair_h = []
                    sent_len = len(sents_processed)
                    for i in range(sent_len):
                        for j in range(i + 1, sent_len):
                            sents_pair_p.append(sents_processed[i])
                            sents_pair_h.append(sents_processed[j])
                            # exact words
                            premise_exact_match = unified_preprocessor.calculate_exact_match(
                                sents_raw_words[i], sents_raw_words[j])
                            hypothesis_exact_match = unified_preprocessor.calculate_exact_match(
                                sents_raw_words[j], sents_raw_words[i])
                            sents_exact_pair_p.append(premise_exact_match)
                            sents_exact_pair_h.append(hypothesis_exact_match)

                    data = []
                    w2id_p = pad_sequences(sents_pair_p, maxlen=p + 2,
                                           padding='post', truncating='post', value=0.)
                    w2id_h = pad_sequences(sents_pair_h, maxlen=h + 2,
                                           padding='post', truncating='post', value=0.)
                    data.append(w2id_p)
                    data.append(w2id_h)

                    sents_exact_pair_p = pad_sequences(sents_exact_pair_p, maxlen=p,
                                                       padding='post', truncating='post', value=0.)
                    sents_exact_pair_h = pad_sequences(sents_exact_pair_h, maxlen=h,
                                                       padding='post', truncating='post', value=0.)
                    data.append(sents_exact_pair_p)
                    data.append(sents_exact_pair_h)

                    # save as npy
                    data_saver = ChunkDataManager(save_data_path=os.path.join(save_path, file_name))
                    data_saver.save([np.array(item) for item in data])
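# Hedged usage sketch (not part of the original repository): one way preprocess_unified()
# above might be wired up. SNLIPreprocessor appears elsewhere in this repo; CNNDMPreprocessor
# and DUCPreprocessor are hypothetical stand-ins for whatever classes the repo actually uses
# for cnn_dm and DUC, and every path below is an illustrative assumption.
def _example_run_preprocess_unified():
    example_preprocessors = {'snli': SNLIPreprocessor(),
                             'cnn_dm': CNNDMPreprocessor(),   # hypothetical class name
                             'DUC2004': DUCPreprocessor()}    # hypothetical class name
    example_data_paths = {'snli': 'data/snli_1.0',
                          'cnn_dm': 'data/cnn_dm/chunked',
                          'DUC2004': 'data/DUC2004/data'}
    preprocess_unified(p=32, h=32,
                       preprocessors=example_preprocessors,
                       save_dir='data',
                       data_paths=example_data_paths,
                       dataset_to_save=['snli', 'cnn_dm', 'DUC2004'],
                       voca_size=[50000], voca_dim=300,
                       data_root_dir='data')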
def preprocess_DUC(p, h, preprocessor, data_path, save_dir,
                   word_vector_save_path, word_vectors_load_path, word2id_save_path,
                   normalize_word_vectors, max_loaded_word_vectors=None):
    """
    :param p:                       maximum number of words in the text (premise)
    :param h:                       maximum number of words in the hypothesis
    :param preprocessor:            preprocessor
    :param data_path:               root directory of the dataset
    :param word_vector_save_path:   path to save the word vectors (vectors only)
    :param word_vectors_load_path:  path to load the GloVe word vectors
    :param word2id_save_path:       path to save the word2id mapping
    :param normalize_word_vectors:  whether to normalize the word vectors
    :param max_loaded_word_vectors: maximum number of word vectors to load
    :return: (premise_word_ids, hypothesis_word_ids,
              premise_chars, hypothesis_chars,
              premise_syntactical_one_hot, hypothesis_syntactical_one_hot,
              premise_exact_match, hypothesis_exact_match)
    """
    # dirs = [x[0] for x in os.walk(data_path)]  # os.walk finds all sub-directories
    # folders = [d for d in dirs if os.path.basename(d).startswith('d')]
    folders = [os.path.join(data_path, d) for d in os.listdir(data_path)
               if os.path.isdir(os.path.join(data_path, d)) and re.search('^d[0-9]+', d)]

    if os.path.exists(word2id_save_path) and os.path.exists(word_vector_save_path):
        preprocessor.load_word2id_dict(word2id_save_path)
        preprocessor.vectors = np.load(word_vector_save_path)
    else:
        preprocessor.get_all_words_DUC(folders)
        print('Found', len(preprocessor.unique_words), 'unique words from DUC')
        preprocessor.init_word_to_vectors(vectors_file_path=get_word2vec_file_path(word_vectors_load_path),
                                          needed_words=preprocessor.unique_words,
                                          normalize=normalize_word_vectors,
                                          max_loaded_word_vectors=max_loaded_word_vectors)
        preprocessor.save_word2id_dict(word2id_save_path)
        preprocessor.save_word_vectors(word_vector_save_path)

    save_path = os.path.join(data_path, save_dir)
    for dir in folders:
        file_name = os.path.basename(dir)
        file_path = os.path.join(dir, file_name + '.txt')
        sents = preprocessor.load_txt_data(file_path=file_path)

        # list of lists: processed words in sentences
        sents_processed = []
        for sent in sents:
            sent_tmp = []
            for word in sent.split():
                w = preprocessor.word_to_id[word.translate(None, ',.')]
                sent_tmp.append(w)
            sents_processed.append(sent_tmp)

        sents_pair_p = []
        sents_pair_h = []
        sent_len = len(sents_processed)
        for i in range(sent_len):
            for j in range(i + 1, sent_len):
                sents_pair_p.append(sents_processed[i])
                sents_pair_h.append(sents_processed[j])

        data = [sents_pair_p, sents_pair_h]
        data[0] = pad_sequences(data[0], maxlen=p, padding='post', truncating='post', value=0.)
        data[1] = pad_sequences(data[1], maxlen=h, padding='post', truncating='post', value=0.)

        # save as npy
        data_saver = ChunkDataManager(save_data_path=os.path.join(save_path, file_name))
        data_saver.save([np.array(item) for item in data])
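# Hedged usage sketch (not part of the original repository): driving preprocess_DUC() above
# for one DUC-style directory tree. The preprocessor class and every path below are
# illustrative assumptions; word_vectors_load_path=None falls back to whatever default
# get_word2vec_file_path() resolves to.
def _example_run_preprocess_DUC():
    preprocess_DUC(p=32, h=32,
                   preprocessor=SNLIPreprocessor(),  # any preprocessor exposing the methods used above
                   data_path='data/DUC2004/data',
                   save_dir='data_voca50k_300d',
                   word_vector_save_path='data/word-vectors_DUC.npy',
                   word_vectors_load_path=None,
                   word2id_save_path='data/word2id_DUC.pkl',
                   normalize_word_vectors=False)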