Example #1
0
    def run(self):
        """
        Main entry point: build the preprocessed SQuAD dataset and export
        it to an HDF5 file.

        Steps: load GloVe embeddings, read the train/dev json files,
        convert words to ids, record dataset-level attributes, pad the id
        vectors to fixed lengths, and write everything out via
        ``__export_squad_hdf5``.
        :return: None
        """
        logger.info('handle glove file...')
        self.__handle_glove()

        logger.info('read squad json...')
        train_context_qas = self.__read_json(self.__train_path)
        dev_context_qas = self.__read_json(self.__dev_path)

        logger.info('transform word to id...')
        train_cache_nopad = self.__build_data(train_context_qas)
        dev_cache_nopad = self.__build_data(dev_context_qas)

        # dataset-level attributes stored alongside the arrays in the hdf5 file
        self.__attr['train_size'] = len(train_cache_nopad['answer_range'])
        self.__attr['dev_size'] = len(dev_cache_nopad['answer_range'])
        self.__attr['word_dict_size'] = len(self.__word2id)
        self.__attr['char_dict_size'] = len(self.__char2id)
        self.__attr['embedding_size'] = self.__embedding_size
        self.__attr['oov_word_num'] = self.__oov_num

        logger.info('padding id vectors...')
        self.__data['train'] = {
            'context': pad_sequences(train_cache_nopad['context'],
                                     maxlen=self.__max_context_token_len,
                                     padding='post',
                                     value=self.padding_idx),
            'question': pad_sequences(train_cache_nopad['question'],
                                      maxlen=self.__max_question_token_len,
                                      padding='post',
                                      value=self.padding_idx),
            'answer_range': np.array(train_cache_nopad['answer_range']),
            'samples_id': np.array(train_cache_nopad['samples_id'])}
        # NOTE(review): train answer ranges go straight to np.array while dev
        # ranges are padded to __max_answer_len -- presumably dev samples can
        # carry a variable number of candidate answers; confirm upstream.
        self.__data['dev'] = {
            'context': pad_sequences(dev_cache_nopad['context'],
                                     maxlen=self.__max_context_token_len,
                                     padding='post',
                                     value=self.padding_idx),
            'question': pad_sequences(dev_cache_nopad['question'],
                                      maxlen=self.__max_question_token_len,
                                      padding='post',
                                      value=self.padding_idx),
            'answer_range': pad_sequences(dev_cache_nopad['answer_range'],
                                          maxlen=self.__max_answer_len,
                                          padding='post',
                                          value=self.answer_padding_idx),
            'samples_id': np.array(dev_cache_nopad['samples_id'])}

        logger.info('export to hdf5 file...')
        self.__export_squad_hdf5()

        logger.info('finished.')
Example #2
0
    def doc_to_tensor(self, cand_names, get_doc_id=None, hierarchical=False):
        """
        Convert documents referenced by name into padded id tensors.

        :param cand_names: names of the candidate documents
        :param get_doc_id: function mapping a name to document ids;
            defaults to ``self.doc_reader.get_doc_id``
        :param hierarchical: when True, pad each document to a fixed
            (sentence number, sentence length) grid; otherwise pad flat
            token sequences to the maximum document length
        :return: long tensor with the padded document ids
        """
        if get_doc_id is None:
            get_doc_id = self.doc_reader.get_doc_id

        if not hierarchical:
            # flat mode: one token-id sequence per candidate document
            docs_id = [get_doc_id(name, hierarchical) for name in cand_names]
            docs_id_array = pad_sequences(docs_id,
                                          maxlen=self._max_doc_length,
                                          padding='post',
                                          value=Vocabulary.PAD_IDX)
            return to_long_tensor(docs_id_array)

        per_doc_tensors = []
        for name in cand_names:

            # first pad every sentence to the fixed sentence length
            sent_ids = get_doc_id(name, hierarchical)
            sent_array = pad_sequences(sent_ids,
                                       maxlen=self._max_sent_length,
                                       padding='post',
                                       value=Vocabulary.PAD_IDX)
            doc_tensor = to_long_tensor(sent_array)

            # then truncate or zero-pad to the fixed sentence number
            sent_num = doc_tensor.size()[0]
            if sent_num > self._max_sent_num:
                doc_tensor = doc_tensor[:self._max_sent_num, :]
            else:
                filler = torch.zeros(self._max_sent_num - sent_num,
                                     self._max_sent_length).long()
                doc_tensor = torch.cat([doc_tensor, filler], dim=0)

            per_doc_tensors.append(doc_tensor)

        return torch.stack(per_doc_tensors, dim=0)
Example #3
0
def dict2array(data_doc):
    """
    Transform a list of per-document feature dicts into padded numpy arrays.

    :param data_doc: list of dicts, each with exactly the keys 'token',
        'pos', 'ent', 'em', 'em_lemma' and 'right_space', mapping to
        per-token lists
    :return: dict with the same keys; every feature that is non-empty in at
        least one element is padded (post) to the longest 'token' sequence
        with ``PreprocessData.padding_idx``; features empty everywhere stay
        as an empty list
    :raises ValueError: if an element's keys do not match the expected set
    """
    data = {
        'token': [],
        'pos': [],
        'ent': [],
        'em': [],
        'em_lemma': [],
        'right_space': []
    }
    max_len = 0

    for ele in data_doc:
        # validate explicitly: a bare `assert` is stripped under `python -O`
        if ele.keys() != data.keys():
            raise ValueError('unexpected feature keys: %s' % sorted(ele.keys()))

        # the 'token' feature defines the padding length for every feature
        max_len = max(max_len, len(ele['token']))

        for k in ele.keys():
            if len(ele[k]) > 0:
                data[k].append(ele[k])

    for k in data.keys():
        if len(data[k]) > 0:
            data[k] = pad_sequences(data[k],
                                    maxlen=max_len,
                                    padding='post',
                                    value=PreprocessData.padding_idx)

    return data
Example #4
0
def dict2array(data_doc):
    """
    Collect per-token feature lists from every element of *data_doc* and
    pad each non-empty feature to the longest 'token' sequence.

    :param data_doc: [{'token': [], 'pos': [], 'ent': [], 'em': [],
        'em_lemma': [], 'right_space': []}, ...]
    :return: dict of padded numpy arrays (features empty everywhere
        remain empty lists)
    """
    feature_names = ('token', 'pos', 'ent', 'em', 'em_lemma', 'right_space')
    data = {name: [] for name in feature_names}
    max_len = 0

    for element in data_doc:
        assert element.keys() == data.keys()

        token_len = len(element['token'])
        if token_len > max_len:
            max_len = token_len

        for name in element.keys():
            if len(element[name]) > 0:
                data[name].append(element[name])

    for name in data.keys():
        if len(data[name]) > 0:
            data[name] = pad_sequences(data[name],
                                       maxlen=max_len,
                                       padding='post',
                                       value=PreprocessData.padding_idx)

    return data
 def __pad_contents_sequences(self, all_contents):
     """
     Pad every content group to the maximum context token length and
     stack the padded groups into one array.

     :param all_contents: iterable of content groups, each a list of
         token-id sequences
     :return: numpy array stacking one padded array per group
     """
     padded_groups = [
         pad_sequences(contents,
                       maxlen=self.__max_context_token_len,
                       padding='post',
                       value=self.padding_idx)
         for contents in all_contents
     ]
     return np.stack(padded_groups)
Example #6
0
    def run(self):
        """
        Main entry point: build the preprocessed dataset and export it to
        an HDF5 file, plus the additional-feature file.

        Steps: load embeddings, read the train/dev json files, convert
        words to ids, record dataset-level attributes, pad the id vectors,
        then export via ``_export_squad_hdf5`` and ``_export_add_features``.
        :return: None
        """
        self._train_mode = True

        logger.info('handle embeddings file...')
        self._handle_emb()

        logger.info('read dataset json...')
        train_context_qas = self._read_json(self._train_path)
        dev_context_qas = self._read_json(self._dev_path)

        logger.info('transform word to id...')
        train_cache_nopad = self._build_data(train_context_qas, training=True)
        dev_cache_nopad = self._build_data(dev_context_qas, training=False)

        # dataset-level attributes stored alongside the arrays in the hdf5 file
        self._attr['train_size'] = len(train_cache_nopad['answer_range'])
        self._attr['dev_size'] = len(dev_cache_nopad['answer_range'])
        self._attr['word_dict_size'] = len(self._word2id)
        self._attr['char_dict_size'] = len(self._char2id)
        self._attr['pos_dict_size'] = len(self._pos2id)
        self._attr['ent_dict_size'] = len(self._ent2id)
        self._attr['embedding_size'] = self._embedding_size
        self._attr['oov_word_num'] = self._oov_num

        logger.info('padding id vectors...')
        self._data['train'] = {
            'context': dict2array(train_cache_nopad['context']),
            'question': dict2array(train_cache_nopad['question']),
            'answer_range': np.array(train_cache_nopad['answer_range']),
            'samples_id': np.array(train_cache_nopad['samples_id'])
        }
        # NOTE(review): dev answer ranges are padded to _max_answer_len while
        # train ranges are not -- presumably dev samples can carry several
        # candidate answers; confirm upstream.
        self._data['dev'] = {
            'context':
            dict2array(dev_cache_nopad['context']),
            'question':
            dict2array(dev_cache_nopad['question']),
            'answer_range':
            pad_sequences(dev_cache_nopad['answer_range'],
                          maxlen=self._max_answer_len,
                          padding='post',
                          value=self.answer_padding_idx),
            'samples_id':
            np.array(dev_cache_nopad['samples_id'])
        }

        logger.info('export to hdf5 file...')
        self._export_squad_hdf5()
        self._export_add_features()

        logger.info('finished.')
Example #7
0
    def run(self):
        """
        Main entry point: build the preprocessed SQuAD dataset and export
        it to an HDF5 file.

        Steps: load GloVe embeddings, read the train/dev json files,
        convert words to ids, record dataset-level attributes, pad the id
        vectors, then export via ``_export_squad_hdf5``.
        :return: None
        """
        logger.info('handle glove file...')
        self._handle_glove()

        logger.info('read squad json...')
        train_context_qas = self._read_json(self._train_path)
        dev_context_qas = self._read_json(self._dev_path)

        logger.info('transform word to id...')
        train_cache_nopad = self._build_data(train_context_qas, training=True)
        dev_cache_nopad = self._build_data(dev_context_qas, training=False)

        # dataset-level attributes stored alongside the arrays in the hdf5 file
        self._attr['train_size'] = len(train_cache_nopad['answer_range'])
        self._attr['dev_size'] = len(dev_cache_nopad['answer_range'])
        self._attr['word_dict_size'] = len(self._word2id)
        self._attr['char_dict_size'] = len(self._char2id)
        self._attr['pos_dict_size'] = len(self._pos2id)
        self._attr['ent_dict_size'] = len(self._ent2id)
        self._attr['embedding_size'] = self._embedding_size
        self._attr['oov_word_num'] = self._oov_num

        logger.info('padding id vectors...')
        self._data['train'] = {
            'context': dict2array(train_cache_nopad['context']),
            'question': dict2array(train_cache_nopad['question']),
            'answer_range': np.array(train_cache_nopad['answer_range']),
            'samples_id': np.array(train_cache_nopad['samples_id'])
        }
        # NOTE(review): dev answer ranges are padded while train ranges are
        # not -- presumably dev samples can carry several candidate answers;
        # confirm upstream.
        self._data['dev'] = {
            'context': dict2array(dev_cache_nopad['context']),
            'question': dict2array(dev_cache_nopad['question']),
            'answer_range': pad_sequences(dev_cache_nopad['answer_range'],
                                          maxlen=self._max_answer_len,
                                          padding='post',
                                          value=self.answer_padding_idx),
            'samples_id': np.array(dev_cache_nopad['samples_id'])
        }

        logger.info('export to hdf5 file...')
        self._export_squad_hdf5()

        logger.info('finished.')
    def dict2array(self, data_doc, use_domain_tag=False):
        """
        Transform a list of per-document feature dicts into padded numpy
        arrays.

        :param data_doc: [{'token': [], 'pos': [], 'ent': [], 'em': [],
            'em_lemma': [], 'right_space': [], 'domain_tag': []}, ...]
        :param use_domain_tag: also collect the 'domain_tag' feature
        :return: dict of padded arrays (features empty everywhere remain
            empty lists)
        """
        feature_names = ['token', 'pos', 'ent', 'em', 'em_lemma',
                         'right_space']
        if use_domain_tag:
            feature_names.append('domain_tag')
        data = {name: [] for name in feature_names}

        max_len = 0
        for element in data_doc:
            assert element.keys() == data.keys()

            token_len = len(element['token'])
            if token_len > max_len:
                max_len = token_len

            for name in element.keys():
                if len(element[name]) > 0:
                    data[name].append(element[name])

        for name in data.keys():
            if len(data[name]) > 0:
                data[name] = pad_sequences(data[name],
                                           maxlen=max_len,
                                           padding='post',
                                           value=0)

        return data
    def run(self):
        """
        Main entry point: build the preprocessed MedQA dataset and export
        it to an HDF5 file.

        Steps: load word2vec embeddings, read the train/dev/test json
        files, convert words to ids, record dataset-level attributes, pad
        the id vectors to fixed lengths, and export via
        ``__export_medqa_hdf5``.
        :return: None
        """
        logger.info('handle word2vec file...')
        # read the word2vec embedding file to build a word-to-vector dict,
        # e.g. {'a': [0.99, 0.23, -0.12, 0.33]}
        self.__handle_word2vec()

        logger.info('read train/dev/test json file...')
        train_context_qas = self.__read_json(self.__train_path)
        logger.info('train json file loading completed')
        dev_context_qas = self.__read_json(self.__dev_path)
        logger.info('dev json file loading completed')
        test_context_qas = self.__read_json(self.__test_path)
        logger.info('test json file loading completed')

        logger.info('transform word to id...')
        train_cache_nopad = self.__build_data(train_context_qas, training=True)
        # NOTE(review): dev/test are also built with training=True -- confirm
        # this is intended (sibling preprocessors pass training=False here)
        dev_cache_nopad = self.__build_data(dev_context_qas, training=True)
        test_cache_nopad = self.__build_data(test_context_qas, training=True)

        # dataset-level attributes stored alongside the arrays in the hdf5 file
        self.__attr['train_size'] = len(train_cache_nopad['samples_labels'])
        self.__attr['dev_size'] = len(dev_cache_nopad['samples_labels'])
        self.__attr['test_size'] = len(test_cache_nopad['samples_labels'])

        self.__attr['word_dict_size'] = len(self.__word2id)
        self.__attr['char_dict_size'] = len(self.__char2id)
        self.__attr['embedding_size'] = self.__embedding_size
        self.__attr['oov_word_num'] = self.__oov_num
        # bug fix: the original assignments ended with stray trailing commas,
        # which stored these two attributes as 1-element tuples, not ints
        self.__attr['max_question_ans_token_len'] = self.__max_question_ans_token_len
        self.__attr['max_context_token_len'] = self.__max_context_token_len

        logger.debug("self.__question_ans_longer_100_count=======" + str(self.__question_ans_longer_100_count))
        logger.debug("self.__content_longer_max_count=======" + str(self.__content_longer_max_count))

        logger.info('padding id vectors........')
        padding_start_time = time.time()
        logger.info('padding test id vectors...')
        self.__data['test'] = {
            'contents': self.__pad_contents_sequences(test_cache_nopad['contents']),
            'question_ans': pad_sequences(test_cache_nopad['question_ans'],
                                          maxlen=self.__max_question_ans_token_len,
                                          padding='post',
                                          value=self.padding_idx),
            'samples_ids': np.array(test_cache_nopad['samples_ids']),
            'samples_labels': np.array(test_cache_nopad['samples_labels']),
            'samples_categorys': np.array(test_cache_nopad['samples_categorys']),
            'samples_logics': np.array(test_cache_nopad['samples_logics'])}
        logger.info('padding test dataset using time= %.2f' % (time.time() - padding_start_time))
        padding_start_time = time.time()

        logger.info('padding dev id vectors...')
        self.__data['dev'] = {
            'contents': self.__pad_contents_sequences(dev_cache_nopad['contents']),
            'question_ans': pad_sequences(dev_cache_nopad['question_ans'],
                                          maxlen=self.__max_question_ans_token_len,
                                          padding='post',
                                          value=self.padding_idx),
            'samples_ids': np.array(dev_cache_nopad['samples_ids']),
            'samples_labels': np.array(dev_cache_nopad['samples_labels']),
            'samples_categorys': np.array(dev_cache_nopad['samples_categorys']),
            'samples_logics': np.array(dev_cache_nopad['samples_logics'])}
        logger.info('padding dev dataset using time= %.2f' % (time.time() - padding_start_time))
        padding_start_time = time.time()

        logger.info('padding train id vectors...')
        self.__data['train'] = {
            'contents': self.__pad_contents_sequences(train_cache_nopad['contents']),
            'question_ans': pad_sequences(train_cache_nopad['question_ans'],
                                          maxlen=self.__max_question_ans_token_len,
                                          padding='post',
                                          value=self.padding_idx),
            'samples_ids': np.array(train_cache_nopad['samples_ids']),
            'samples_labels': np.array(train_cache_nopad['samples_labels']),
            'samples_categorys': np.array(train_cache_nopad['samples_categorys']),
            'samples_logics': np.array(train_cache_nopad['samples_logics'])}
        logger.info('padding train dataset using time= %.2f' % (time.time() - padding_start_time))

        logger.info('export to hdf5 file...')
        export_h5_start_time = time.time()
        self.__export_medqa_hdf5()
        logger.info('export medqa hdf5 using time= %.2f' % (time.time() - export_h5_start_time))

        logger.info('finished!!!!!!!!!!!!!!!!!!!!!!!!')
    def mrcqa_batch(self, contexts, question, single_question=True):
        """
        Run MRC-QA inference over the given contexts and return one
        extracted answer string per context.

        :param contexts: list of context documents to answer over
        :param question: a single question (when ``single_question`` is
            True it is replicated for every context), otherwise a list of
            questions aligned with ``contexts``
        :param single_question: whether ``question`` is one question shared
            by all contexts
        :return: list of answer strings, one per input sample
        """
        if single_question:
            # replicate the single question so each context gets a copy
            questions = []
            for i in range(len(contexts)):
                questions.append(question)
            data_nopad = self.build_data(contexts, questions)
        else:
            data_nopad = self.build_data(contexts, question)
        # pad to batchable arrays; answer_range is a dummy (-1 padded) at
        # inference time
        data_pad = {
            'context':
            self.dict2array(data_nopad['context']),
            'question':
            self.dict2array(data_nopad['question']),
            'answer_range':
            pad_sequences(data_nopad['answer_range'], padding='post',
                          value=-1),
            'samples_id':
            np.array(data_nopad['samples_id'])
        }
        batch_data = self.dataset.get_input_dataloader(
            self.global_config['test']['batch_size'],
            self.global_config['global']['num_data_workers'],
            shuffle=False,
            input_data=data_pad)
        # batch_data = dataset.get_dataloader_test(32, 5)

        batch_cnt = len(batch_data)
        answer = []

        # per-token flags telling whether a space follows each context token,
        # used below to rebuild the answer text with correct spacing
        cdict = data_pad['context']
        right_space = cdict['right_space']

        # cnt tracks the global sample offset across batches so right_space
        # rows line up with the samples in the current batch
        cnt = 0
        for bnum, batch in enumerate(batch_data):
            batch = [x.to(self.device) if x is not None else x for x in batch]
            bat_context = batch[0]
            bat_answer_range = batch[-1]

            # forward
            batch_input = batch[:len(batch) - 1]
            _, tmp_ans_range, _ = self.model.forward(*batch_input)

            tmp_context_ans = zip(bat_context.cpu().data.numpy(),
                                  tmp_ans_range.cpu().data.numpy())

            # generate initial answer text
            i = 0
            for c, a in tmp_context_ans:
                cur_no = cnt + i
                # a = (start, end) token indices; slice is inclusive of end
                tmp_ans = self.dataset.sentence_id2word(c[a[0]:(a[1] + 1)])
                cur_space = right_space[cur_no][a[0]:(a[1] + 1)]

                cur_ans = ''
                for j, word in enumerate(tmp_ans):
                    cur_ans += word
                    if cur_space[j]:
                        cur_ans += ' '
                answer.append(cur_ans.strip())
                i += 1
            cnt += i
            logging.info('batch=%d/%d' % (bnum, batch_cnt))

            # manual release memory, todo: really effect?
            del bat_context, bat_answer_range, batch, batch_input
            del tmp_ans_range
            # torch.cuda.empty_cache()
        return answer
    def run(self):
        """
        Main entry point: build the preprocessed dataset (train/dev/test,
        plus optional fine-tune splits) and export it to an HDF5 file.

        Steps: load GloVe embeddings, read all configured json splits,
        convert words to ids, record dataset-level attributes, pad the id
        vectors, then export via ``_export_squad_hdf5``. When
        ``_finetune`` / ``_finetune2`` are set, the additional
        train2/dev2 / train3/dev3 splits are processed the same way.
        :return: None
        """
        logger.info('handle glove file...')
        self._handle_glove()

        logger.info('read squad json...')
        train_context_qas = self._read_json(self._train_path)
        dev_context_qas = self._read_json(self._dev_path)
        test_context_qas = self._read_json(self._test_path)
        if self._finetune:
            train2_context_qas = self._read_json(self._train2_path)
            dev2_context_qas = self._read_json(self._dev2_path)

        if self._finetune2:
            train3_context_qas = self._read_json(self._train3_path)
            dev3_context_qas = self._read_json(self._dev3_path)
        #print(train_context_qas)
        #print(dev_context_qas)

        logger.info('transform word to id...')
        train_cache_nopad = self._build_data(train_context_qas, training=True)
        dev_cache_nopad = self._build_data(dev_context_qas, training=False)
        test_cache_nopad = self._build_data(test_context_qas, training=False)
        if self._finetune:
            train2_cache_nopad = self._build_data(train2_context_qas,
                                                  training=True)
            dev2_cache_nopad = self._build_data(dev2_context_qas,
                                                training=False)
            self._attr['train2_size'] = len(train2_cache_nopad['answer_range'])
            self._attr['dev2_size'] = len(dev2_cache_nopad['answer_range'])
        if self._finetune2:
            train3_cache_nopad = self._build_data(train3_context_qas,
                                                  training=True)
            dev3_cache_nopad = self._build_data(dev3_context_qas,
                                                training=False)
            self._attr['train3_size'] = len(train3_cache_nopad['answer_range'])
            self._attr['dev3_size'] = len(dev3_cache_nopad['answer_range'])

        # dataset-level attributes stored alongside the arrays in the hdf5 file
        self._attr['train_size'] = len(train_cache_nopad['answer_range'])
        self._attr['dev_size'] = len(dev_cache_nopad['answer_range'])
        self._attr['test_size'] = len(test_cache_nopad['answer_range'])
        self._attr['word_dict_size'] = len(self._word2id)
        self._attr['char_dict_size'] = len(self._char2id)
        self._attr['pos_dict_size'] = len(self._pos2id)
        self._attr['ent_dict_size'] = len(self._ent2id)
        self._attr['embedding_size'] = self._embedding_size
        self._attr['oov_word_num'] = self._oov_num

        logger.info('padding id vectors...')

        #to check array type of answer_range and samples_id
        self._data['train'] = {
            'context':
            dict2array(train_cache_nopad['context'], self._use_domain_tag),
            'question':
            dict2array(train_cache_nopad['question'], self._use_domain_tag),
            # 'answer_range': np.array(train_cache_nopad['answer_range']),
            'answer_range':
            pad_sequences(train_cache_nopad['answer_range'],
                          maxlen=self._max_answer_len,
                          padding='post',
                          value=self.answer_padding_idx),
            'samples_id':
            np.array(train_cache_nopad['samples_id'])
        }
        self._data['dev'] = {
            'context':
            dict2array(dev_cache_nopad['context'], self._use_domain_tag),
            'question':
            dict2array(dev_cache_nopad['question'], self._use_domain_tag),
            'answer_range':
            pad_sequences(dev_cache_nopad['answer_range'],
                          maxlen=self._max_answer_len,
                          padding='post',
                          value=self.answer_padding_idx),
            'samples_id':
            np.array(dev_cache_nopad['samples_id'])
        }
        self._data['test'] = {
            'context':
            dict2array(test_cache_nopad['context'], self._use_domain_tag),
            'question':
            dict2array(test_cache_nopad['question'], self._use_domain_tag),
            'answer_range':
            pad_sequences(test_cache_nopad['answer_range'],
                          maxlen=self._max_answer_len,
                          padding='post',
                          value=self.answer_padding_idx),
            'samples_id':
            np.array(test_cache_nopad['samples_id'])
        }

        # optional fine-tune splits: same layout as the main splits above
        if self._finetune:
            self._data['train2'] = {
                'context':
                dict2array(train2_cache_nopad['context'],
                           self._use_domain_tag),
                'question':
                dict2array(train2_cache_nopad['question'],
                           self._use_domain_tag),
                # 'answer_range': np.array(train_cache_nopad['answer_range']),
                'answer_range':
                pad_sequences(train2_cache_nopad['answer_range'],
                              maxlen=self._max_answer_len,
                              padding='post',
                              value=self.answer_padding_idx),
                'samples_id':
                np.array(train2_cache_nopad['samples_id'])
            }
            self._data['dev2'] = {
                'context':
                dict2array(dev2_cache_nopad['context'], self._use_domain_tag),
                'question':
                dict2array(dev2_cache_nopad['question'], self._use_domain_tag),
                'answer_range':
                pad_sequences(dev2_cache_nopad['answer_range'],
                              maxlen=self._max_answer_len,
                              padding='post',
                              value=self.answer_padding_idx),
                'samples_id':
                np.array(dev2_cache_nopad['samples_id'])
            }

        if self._finetune2:
            self._data['train3'] = {
                'context':
                dict2array(train3_cache_nopad['context'],
                           self._use_domain_tag),
                'question':
                dict2array(train3_cache_nopad['question'],
                           self._use_domain_tag),
                # 'answer_range': np.array(train_cache_nopad['answer_range']),
                'answer_range':
                pad_sequences(train3_cache_nopad['answer_range'],
                              maxlen=self._max_answer_len,
                              padding='post',
                              value=self.answer_padding_idx),
                'samples_id':
                np.array(train3_cache_nopad['samples_id'])
            }
            self._data['dev3'] = {
                'context':
                dict2array(dev3_cache_nopad['context'], self._use_domain_tag),
                'question':
                dict2array(dev3_cache_nopad['question'], self._use_domain_tag),
                'answer_range':
                pad_sequences(dev3_cache_nopad['answer_range'],
                              maxlen=self._max_answer_len,
                              padding='post',
                              value=self.answer_padding_idx),
                'samples_id':
                np.array(dev3_cache_nopad['samples_id'])
            }

        logger.info('export to hdf5 file...')
        self._export_squad_hdf5()

        logger.info('finished.')