def run(self):
    """Build the padded id datasets from the SQuAD json files and export them to hdf5.

    :return: None
    """
    logger.info('handle glove file...')
    self.__handle_glove()

    logger.info('read squad json...')
    raw_train = self.__read_json(self.__train_path)
    raw_dev = self.__read_json(self.__dev_path)

    logger.info('transform word to id...')
    train_nopad = self.__build_data(raw_train)
    dev_nopad = self.__build_data(raw_dev)

    # dataset statistics stored alongside the exported data
    self.__attr['train_size'] = len(train_nopad['answer_range'])
    self.__attr['dev_size'] = len(dev_nopad['answer_range'])
    self.__attr['word_dict_size'] = len(self.__word2id)
    self.__attr['char_dict_size'] = len(self.__char2id)
    self.__attr['embedding_size'] = self.__embedding_size
    self.__attr['oov_word_num'] = self.__oov_num

    logger.info('padding id vectors...')
    self.__data['train'] = {
        'context': pad_sequences(train_nopad['context'],
                                 maxlen=self.__max_context_token_len,
                                 padding='post',
                                 value=self.padding_idx),
        'question': pad_sequences(train_nopad['question'],
                                  maxlen=self.__max_question_token_len,
                                  padding='post',
                                  value=self.padding_idx),
        # train keeps raw answer ranges; only dev gets answer padding
        'answer_range': np.array(train_nopad['answer_range']),
        'samples_id': np.array(train_nopad['samples_id']),
    }
    self.__data['dev'] = {
        'context': pad_sequences(dev_nopad['context'],
                                 maxlen=self.__max_context_token_len,
                                 padding='post',
                                 value=self.padding_idx),
        'question': pad_sequences(dev_nopad['question'],
                                  maxlen=self.__max_question_token_len,
                                  padding='post',
                                  value=self.padding_idx),
        'answer_range': pad_sequences(dev_nopad['answer_range'],
                                      maxlen=self.__max_answer_len,
                                      padding='post',
                                      value=self.answer_padding_idx),
        'samples_id': np.array(dev_nopad['samples_id']),
    }

    logger.info('export to hdf5 file...')
    self.__export_squad_hdf5()
    logger.info('finished.')
def doc_to_tensor(self, cand_names, get_doc_id=None, hierarchical=False):
    """Convert candidate documents from natural language to an id tensor.

    :param cand_names: candidate document names
    :param get_doc_id: function mapping a name to document ids
                       (defaults to ``self.doc_reader.get_doc_id``)
    :param hierarchical: when True keep sentence structure, padding both
                         sentence length and sentence count
    :return: long tensor of document ids
    """
    if get_doc_id is None:
        get_doc_id = self.doc_reader.get_doc_id

    if not hierarchical:
        # flat: one id sequence per document, padded to the max doc length
        docs_id = [get_doc_id(name, hierarchical) for name in cand_names]
        docs_array = pad_sequences(docs_id,
                                   maxlen=self._max_doc_length,
                                   padding='post',
                                   value=Vocabulary.PAD_IDX)
        return to_long_tensor(docs_array)

    per_doc = []
    for name in cand_names:
        # pad each sentence of the document to the max sentence length
        sent_ids = get_doc_id(name, hierarchical)
        sent_array = pad_sequences(sent_ids,
                                   maxlen=self._max_sent_length,
                                   padding='post',
                                   value=Vocabulary.PAD_IDX)
        doc_tensor = to_long_tensor(sent_array)

        # then truncate or zero-pad the number of sentences
        num_sents = doc_tensor.size(0)
        if num_sents > self._max_sent_num:
            doc_tensor = doc_tensor[:self._max_sent_num, :]
        else:
            filler = torch.zeros(self._max_sent_num - num_sents,
                                 self._max_sent_length).long()
            doc_tensor = torch.cat([doc_tensor, filler], dim=0)
        per_doc.append(doc_tensor)

    return torch.stack(per_doc, dim=0)
def dict2array(data_doc):
    """Pack a list of per-document feature dicts into padded numpy arrays.

    :param data_doc: list of dicts, each with keys 'token', 'pos', 'ent',
                     'em', 'em_lemma', 'right_space'
    :return: dict mapping each feature to a (num_docs, max_token_len)
             padded array (empty features stay as empty lists)
    """
    feature_names = ('token', 'pos', 'ent', 'em', 'em_lemma', 'right_space')
    data = {name: [] for name in feature_names}

    longest = 0
    for doc in data_doc:
        assert doc.keys() == data.keys()
        if len(doc['token']) > longest:
            longest = len(doc['token'])
        for name, values in doc.items():
            # skip empty feature lists so padding never sees them
            if values:
                data[name].append(values)

    for name in data:
        if data[name]:
            data[name] = pad_sequences(data[name],
                                       maxlen=longest,
                                       padding='post',
                                       value=PreprocessData.padding_idx)
    return data
def dict2array(data_doc):
    """Transform a list of feature dicts into a dict of padded numpy arrays.

    :param data_doc: list of dicts, each with keys 'token', 'pos', 'ent',
                     'em', 'em_lemma', 'right_space'
    :return: dict mapping each feature to a padded 2-D array; features that
             collected no rows remain empty lists
    """
    data = {
        'token': [],
        'pos': [],
        'ent': [],
        'em': [],
        'em_lemma': [],
        'right_space': [],
    }

    max_len = 0
    for ele in data_doc:
        # every document must carry exactly the expected feature set
        assert ele.keys() == data.keys()
        max_len = len(ele['token']) if len(ele['token']) > max_len else max_len
        for k in ele.keys():
            if len(ele[k]) > 0:
                data[k].append(ele[k])

    for k in data.keys():
        if len(data[k]) > 0:
            data[k] = pad_sequences(data[k],
                                    maxlen=max_len,
                                    padding='post',
                                    value=PreprocessData.padding_idx)
    return data
def __pad_contents_sequences(self, all_contents):
    """Pad every per-sample content list and stack them into one array.

    :param all_contents: iterable of per-sample token-id sequences
    :return: numpy array of shape (num_samples, num_contents, max_context_token_len)
    """
    padded = [
        pad_sequences(contents,
                      maxlen=self.__max_context_token_len,
                      padding='post',
                      value=self.padding_idx)
        for contents in all_contents
    ]
    return np.stack(padded)
def run(self):
    """Build the padded id datasets from the dataset json and export to hdf5.

    Also exports the additional per-token features after the main export.
    :return: None
    """
    self._train_mode = True

    logger.info('handle embeddings file...')
    self._handle_emb()

    logger.info('read dataset json...')
    raw_train = self._read_json(self._train_path)
    raw_dev = self._read_json(self._dev_path)

    logger.info('transform word to id...')
    train_nopad = self._build_data(raw_train, training=True)
    dev_nopad = self._build_data(raw_dev, training=False)

    # dataset statistics stored alongside the exported data
    self._attr['train_size'] = len(train_nopad['answer_range'])
    self._attr['dev_size'] = len(dev_nopad['answer_range'])
    self._attr['word_dict_size'] = len(self._word2id)
    self._attr['char_dict_size'] = len(self._char2id)
    self._attr['pos_dict_size'] = len(self._pos2id)
    self._attr['ent_dict_size'] = len(self._ent2id)
    self._attr['embedding_size'] = self._embedding_size
    self._attr['oov_word_num'] = self._oov_num

    logger.info('padding id vectors...')
    self._data['train'] = {
        'context': dict2array(train_nopad['context']),
        'question': dict2array(train_nopad['question']),
        # train keeps raw answer ranges; only dev gets answer padding
        'answer_range': np.array(train_nopad['answer_range']),
        'samples_id': np.array(train_nopad['samples_id']),
    }
    self._data['dev'] = {
        'context': dict2array(dev_nopad['context']),
        'question': dict2array(dev_nopad['question']),
        'answer_range': pad_sequences(dev_nopad['answer_range'],
                                      maxlen=self._max_answer_len,
                                      padding='post',
                                      value=self.answer_padding_idx),
        'samples_id': np.array(dev_nopad['samples_id']),
    }

    logger.info('export to hdf5 file...')
    self._export_squad_hdf5()
    self._export_add_features()
    logger.info('finished.')
def run(self):
    """Build the padded id datasets from the SQuAD json files and export to hdf5.

    :return: None
    """
    logger.info('handle glove file...')
    self._handle_glove()

    logger.info('read squad json...')
    train_context_qas = self._read_json(self._train_path)
    dev_context_qas = self._read_json(self._dev_path)

    logger.info('transform word to id...')
    caches = {
        'train': self._build_data(train_context_qas, training=True),
        'dev': self._build_data(dev_context_qas, training=False),
    }

    # dataset statistics stored alongside the exported data
    self._attr['train_size'] = len(caches['train']['answer_range'])
    self._attr['dev_size'] = len(caches['dev']['answer_range'])
    self._attr['word_dict_size'] = len(self._word2id)
    self._attr['char_dict_size'] = len(self._char2id)
    self._attr['pos_dict_size'] = len(self._pos2id)
    self._attr['ent_dict_size'] = len(self._ent2id)
    self._attr['embedding_size'] = self._embedding_size
    self._attr['oov_word_num'] = self._oov_num

    logger.info('padding id vectors...')
    self._data['train'] = {
        'context': dict2array(caches['train']['context']),
        'question': dict2array(caches['train']['question']),
        # train keeps raw answer ranges; only dev gets answer padding
        'answer_range': np.array(caches['train']['answer_range']),
        'samples_id': np.array(caches['train']['samples_id']),
    }
    self._data['dev'] = {
        'context': dict2array(caches['dev']['context']),
        'question': dict2array(caches['dev']['question']),
        'answer_range': pad_sequences(caches['dev']['answer_range'],
                                      maxlen=self._max_answer_len,
                                      padding='post',
                                      value=self.answer_padding_idx),
        'samples_id': np.array(caches['dev']['samples_id']),
    }

    logger.info('export to hdf5 file...')
    self._export_squad_hdf5()
    logger.info('finished.')
def dict2array(self, data_doc, use_domain_tag=False):
    """Pack per-document feature dicts into padded numpy arrays.

    :param data_doc: list of dicts with keys 'token', 'pos', 'ent', 'em',
                     'em_lemma', 'right_space' (plus 'domain_tag' when
                     ``use_domain_tag`` is True)
    :param use_domain_tag: include the 'domain_tag' feature column
    :return: dict mapping each feature to a (num_docs, max_token_len)
             padded array; features with no rows stay as empty lists
    """
    columns = ['token', 'pos', 'ent', 'em', 'em_lemma', 'right_space']
    if use_domain_tag:
        columns.append('domain_tag')
    data = {col: [] for col in columns}

    max_len = 0
    for doc in data_doc:
        # every document must carry exactly the expected feature set
        assert doc.keys() == data.keys()
        token_len = len(doc['token'])
        if token_len > max_len:
            max_len = token_len
        for col in columns:
            if len(doc[col]) > 0:
                data[col].append(doc[col])

    for col in data.keys():
        if len(data[col]) > 0:
            # NOTE(review): pads with literal 0 here, unlike the module-level
            # dict2array which uses PreprocessData.padding_idx — confirm intended
            data[col] = pad_sequences(data[col],
                                      maxlen=max_len,
                                      padding='post',
                                      value=0)
    return data
def run(self):
    """Build the padded id datasets (train/dev/test) for medqa and export to hdf5.

    :return: None
    """
    logger.info('handle word2vec file...')
    # Read the word2vec embedding file to obtain the word -> vector dict,
    # e.g. {'a': [0.99, 0.23, -0.12, 0.33]}
    self.__handle_word2vec()

    logger.info('read train/dev/test json file...')
    train_context_qas = self.__read_json(self.__train_path)
    logger.info('train json file loading completed')
    dev_context_qas = self.__read_json(self.__dev_path)
    logger.info('dev json file loading completed')
    test_context_qas = self.__read_json(self.__test_path)
    logger.info('test json file loading completed')

    logger.info('transform word to id...')
    train_cache_nopad = self.__build_data(train_context_qas, training=True)
    # NOTE(review): dev/test are also built with training=True here, unlike
    # sibling preprocessors that pass training=False — confirm intended.
    dev_cache_nopad = self.__build_data(dev_context_qas, training=True)
    test_cache_nopad = self.__build_data(test_context_qas, training=True)

    # dataset statistics stored alongside the exported data
    self.__attr['train_size'] = len(train_cache_nopad['samples_labels'])
    self.__attr['dev_size'] = len(dev_cache_nopad['samples_labels'])
    self.__attr['test_size'] = len(test_cache_nopad['samples_labels'])
    self.__attr['word_dict_size'] = len(self.__word2id)
    self.__attr['char_dict_size'] = len(self.__char2id)
    self.__attr['embedding_size'] = self.__embedding_size
    self.__attr['oov_word_num'] = self.__oov_num
    # bug fix: these two assignments previously ended with a stray trailing
    # comma, which stored the attributes as 1-element tuples instead of ints
    self.__attr['max_question_ans_token_len'] = self.__max_question_ans_token_len
    self.__attr['max_context_token_len'] = self.__max_context_token_len

    logger.debug("self.__question_ans_longer_100_count=======" + str(self.__question_ans_longer_100_count))
    logger.debug("self.__content_longer_max_count=======" + str(self.__content_longer_max_count))

    def pack_split(cache_nopad):
        # one padded split: contents stacked, question_ans padded, rest as arrays
        return {
            'contents': self.__pad_contents_sequences(cache_nopad['contents']),
            'question_ans': pad_sequences(cache_nopad['question_ans'],
                                          maxlen=self.__max_question_ans_token_len,
                                          padding='post',
                                          value=self.padding_idx),
            'samples_ids': np.array(cache_nopad['samples_ids']),
            'samples_labels': np.array(cache_nopad['samples_labels']),
            'samples_categorys': np.array(cache_nopad['samples_categorys']),
            'samples_logics': np.array(cache_nopad['samples_logics']),
        }

    logger.info('padding id vectors........')
    for split_name, cache_nopad in (('test', test_cache_nopad),
                                    ('dev', dev_cache_nopad),
                                    ('train', train_cache_nopad)):
        padding_start_time = time.time()
        logger.info('padding %s id vectors...' % split_name)
        self.__data[split_name] = pack_split(cache_nopad)
        logger.info('padding %s dataset using time= %.2f' % (split_name, time.time() - padding_start_time))

    logger.info('export to hdf5 file...')
    export_h5_start_time = time.time()
    self.__export_medqa_hdf5()
    logger.info('export medqa hdf5 using time= %.2f' % (time.time() - export_h5_start_time))
    logger.info('finished!!!!!!!!!!!!!!!!!!!!!!!!')
def mrcqa_batch(self, contexts, question, single_question=True):
    """Run MRC-QA inference over a batch of contexts and reconstruct answer text.

    :param contexts: list of context documents to answer over
    :param question: a single question (when single_question) or a list of
                     questions aligned with contexts
    :param single_question: when True, the one question is replicated for
                            every context before building the batch
    :return: list of answer strings, one per (context, question) pair
    """
    if single_question:
        # replicate the single question once per context
        questions = []
        for i in range(len(contexts)):
            questions.append(question)
        data_nopad = self.build_data(contexts, questions)
    else:
        data_nopad = self.build_data(contexts, question)
    # pad to model input shape; answer_range is padded with -1 (no answer known)
    data_pad = {
        'context': self.dict2array(data_nopad['context']),
        'question': self.dict2array(data_nopad['question']),
        'answer_range': pad_sequences(data_nopad['answer_range'],
                                      padding='post',
                                      value=-1),
        'samples_id': np.array(data_nopad['samples_id'])
    }
    batch_data = self.dataset.get_input_dataloader(
        self.global_config['test']['batch_size'],
        self.global_config['global']['num_data_workers'],
        shuffle=False,
        input_data=data_pad)
    # batch_data = dataset.get_dataloader_test(32, 5)
    batch_cnt = len(batch_data)
    answer = []
    cdict = data_pad['context']
    # per-token flags saying whether a space follows the token; used to
    # rebuild the original answer string spacing
    right_space = cdict['right_space']
    # cnt tracks the absolute sample index across batches
    cnt = 0
    for bnum, batch in enumerate(batch_data):
        batch = [x.to(self.device) if x is not None else x for x in batch]
        bat_context = batch[0]
        bat_answer_range = batch[-1]
        # forward
        batch_input = batch[:len(batch) - 1]
        _, tmp_ans_range, _ = self.model.forward(*batch_input)
        tmp_context_ans = zip(bat_context.cpu().data.numpy(),
                              tmp_ans_range.cpu().data.numpy())
        # generate initial answer text
        i = 0
        for c, a in tmp_context_ans:
            # a[0]/a[1] are the predicted answer start/end token indices
            cur_no = cnt + i
            tmp_ans = self.dataset.sentence_id2word(c[a[0]:(a[1] + 1)])
            cur_space = right_space[cur_no][a[0]:(a[1] + 1)]
            # join the answer tokens, re-inserting spaces where the original
            # text had them
            cur_ans = ''
            for j, word in enumerate(tmp_ans):
                cur_ans += word
                if cur_space[j]:
                    cur_ans += ' '
            answer.append(cur_ans.strip())
            i += 1
        cnt += i
        logging.info('batch=%d/%d' % (bnum, batch_cnt))
        # manual release memory, todo: really effect?
        del bat_context, bat_answer_range, batch, batch_input
        del tmp_ans_range
        # torch.cuda.empty_cache()
    return answer
def run(self):
    """Build the padded id datasets for all configured splits and export to hdf5.

    Always processes train/dev/test; additionally processes train2/dev2 when
    ``self._finetune`` is set and train3/dev3 when ``self._finetune2`` is set.
    :return: None
    """
    logger.info('handle glove file...')
    self._handle_glove()

    logger.info('read squad json...')
    train_context_qas = self._read_json(self._train_path)
    dev_context_qas = self._read_json(self._dev_path)
    test_context_qas = self._read_json(self._test_path)
    if self._finetune:
        train2_context_qas = self._read_json(self._train2_path)
        dev2_context_qas = self._read_json(self._dev2_path)
    if self._finetune2:
        train3_context_qas = self._read_json(self._train3_path)
        dev3_context_qas = self._read_json(self._dev3_path)

    logger.info('transform word to id...')
    train_cache_nopad = self._build_data(train_context_qas, training=True)
    dev_cache_nopad = self._build_data(dev_context_qas, training=False)
    test_cache_nopad = self._build_data(test_context_qas, training=False)
    if self._finetune:
        train2_cache_nopad = self._build_data(train2_context_qas, training=True)
        dev2_cache_nopad = self._build_data(dev2_context_qas, training=False)
        self._attr['train2_size'] = len(train2_cache_nopad['answer_range'])
        self._attr['dev2_size'] = len(dev2_cache_nopad['answer_range'])
    if self._finetune2:
        train3_cache_nopad = self._build_data(train3_context_qas, training=True)
        dev3_cache_nopad = self._build_data(dev3_context_qas, training=False)
        self._attr['train3_size'] = len(train3_cache_nopad['answer_range'])
        self._attr['dev3_size'] = len(dev3_cache_nopad['answer_range'])

    # dataset statistics stored alongside the exported data
    self._attr['train_size'] = len(train_cache_nopad['answer_range'])
    self._attr['dev_size'] = len(dev_cache_nopad['answer_range'])
    self._attr['test_size'] = len(test_cache_nopad['answer_range'])
    self._attr['word_dict_size'] = len(self._word2id)
    self._attr['char_dict_size'] = len(self._char2id)
    self._attr['pos_dict_size'] = len(self._pos2id)
    self._attr['ent_dict_size'] = len(self._ent2id)
    self._attr['embedding_size'] = self._embedding_size
    self._attr['oov_word_num'] = self._oov_num

    logger.info('padding id vectors...')

    def pack_split(cache_nopad):
        # shared packing for every split: padded context/question feature
        # arrays, padded answer ranges and raw sample ids
        return {
            'context': dict2array(cache_nopad['context'], self._use_domain_tag),
            'question': dict2array(cache_nopad['question'], self._use_domain_tag),
            'answer_range': pad_sequences(cache_nopad['answer_range'],
                                          maxlen=self._max_answer_len,
                                          padding='post',
                                          value=self.answer_padding_idx),
            'samples_id': np.array(cache_nopad['samples_id']),
        }

    self._data['train'] = pack_split(train_cache_nopad)
    self._data['dev'] = pack_split(dev_cache_nopad)
    self._data['test'] = pack_split(test_cache_nopad)
    if self._finetune:
        self._data['train2'] = pack_split(train2_cache_nopad)
        self._data['dev2'] = pack_split(dev2_cache_nopad)
    if self._finetune2:
        self._data['train3'] = pack_split(train3_cache_nopad)
        self._data['dev3'] = pack_split(dev3_cache_nopad)

    logger.info('export to hdf5 file...')
    self._export_squad_hdf5()
    logger.info('finished.')