Ejemplo n.º 1
0
 def get_batch(self, bucket_dbs, bucket_id, data):
     """Turn sentence pairs into per-position encoder/decoder arrays.

     Every sentence is converted with ``data_utils.sentence_indice``.
     Encoder sequences are padded with ``PAD_ID`` up to the bucket's
     encoder size and then reversed. Decoder sequences are wrapped in
     ``GO_ID`` / ``EOS_ID`` and padded up to the bucket's decoder size.

     Args:
         bucket_dbs: Not used by this method.
         bucket_id: Index into ``self.buckets`` selecting the
             ``(encoder_size, decoder_size)`` pair.
         data: Iterable of ``(encoder_sentence, decoder_sentence)``
             pairs. The first ``self.batch_size`` pairs are used.

     Returns:
         ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``.
         The first two are lists of int32 arrays, one per position, each
         holding that position's token for every batch entry. The third
         is a list of float32 arrays, one per decoder position, holding
         0.0 where the following decoder token is padding (and everywhere
         at the final position) and 1.0 otherwise.
     """
     enc_len, dec_len = self.buckets[bucket_id]

     enc_rows = []
     dec_rows = []
     for question, answer in data:
         question_ids = data_utils.sentence_indice(question)
         answer_ids = data_utils.sentence_indice(answer)

         # Pad the question up to the bucket width, then flip it so the
         # padding sits in front of the real tokens.
         n_enc_pad = enc_len - len(question_ids)
         padded_question = question_ids + [data_utils.PAD_ID] * n_enc_pad
         enc_rows.append(padded_question[::-1])

         # GO and EOS occupy two of the decoder slots.
         n_dec_pad = dec_len - len(answer_ids) - 2
         framed_answer = (
             [data_utils.GO_ID] + answer_ids + [data_utils.EOS_ID])
         dec_rows.append(framed_answer + [data_utils.PAD_ID] * n_dec_pad)

     batch_encoder_inputs = []
     for pos in range(enc_len):
         column = [enc_rows[row][pos] for row in range(self.batch_size)]
         batch_encoder_inputs.append(np.array(column, dtype=np.int32))

     batch_decoder_inputs = []
     batch_weights = []
     final_pos = dec_len - 1
     for pos in range(dec_len):
         column = [dec_rows[row][pos] for row in range(self.batch_size)]
         batch_decoder_inputs.append(np.array(column, dtype=np.int32))

         if pos == final_pos:
             # Nothing follows the last position, so it never counts.
             weights = np.zeros(self.batch_size, dtype=np.float32)
         else:
             # A position counts only if the token after it is not padding.
             weights = np.array(
                 [0.0 if dec_rows[row][pos + 1] == data_utils.PAD_ID else 1.0
                  for row in range(self.batch_size)],
                 dtype=np.float32)
         batch_weights.append(weights)

     return (batch_encoder_inputs, batch_decoder_inputs, batch_weights)
Ejemplo n.º 2
0
 def get_batch(self, bucket_dbs, bucket_id, data):
     """Convert raw sentence pairs into the arrays the model is fed.

     Args:
         bucket_dbs: Not used by this method.
         bucket_id: Index into ``self.buckets`` selecting the
             ``(encoder_size, decoder_size)`` pair.
         data: Iterable of ``(encoder_sentence, decoder_sentence)``
             pairs. The first ``self.batch_size`` pairs are used.

     Returns:
         ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``:
         three lists with one array per sequence position. Encoder and
         decoder arrays are int32 token ids; weight arrays are float32
         and line up one-to-one with the decoder arrays.
     """
     source_len, target_len = self.buckets[bucket_id]
     source_rows, target_rows = [], []

     for source_text, target_text in data:
         source_ids = data_utils.sentence_indice(source_text)
         target_ids = data_utils.sentence_indice(target_text)

         # Encoder side: pad to the fixed bucket length, then reverse so
         # the padding comes first and the real tokens come last.
         source_fill = [data_utils.PAD_ID] * (source_len - len(source_ids))
         flipped = list(reversed(source_ids + source_fill))
         source_rows.append(flipped)

         # Decoder side: the start (GO_ID) and end (EOS_ID) markers take
         # two slots, hence the "- 2" when sizing the padding. The
         # decoder sequence is not reversed.
         fill_count = target_len - len(target_ids) - 2
         head = [data_utils.GO_ID] + target_ids
         tail = [data_utils.EOS_ID] + [data_utils.PAD_ID] * fill_count
         target_rows.append(head + tail)

     sample_ids = range(self.batch_size)

     # Transpose: one array per position instead of one list per sentence.
     batch_encoder_inputs = [
         np.array([source_rows[k][t] for k in sample_ids], dtype=np.int32)
         for t in range(source_len)
     ]
     batch_decoder_inputs = [
         np.array([target_rows[k][t] for k in sample_ids], dtype=np.int32)
         for t in range(target_len)
     ]

     # One weight array per decoder position.
     batch_weights = []
     for t in range(target_len):
         step_weights = np.ones(self.batch_size, dtype=np.float32)
         if t + 1 == target_len:
             # The final position has no following token.
             step_weights[:] = 0.0
         else:
             for k in sample_ids:
                 if target_rows[k][t + 1] == data_utils.PAD_ID:
                     step_weights[k] = 0.0
         batch_weights.append(step_weights)

     return batch_encoder_inputs, batch_decoder_inputs, batch_weights
Ejemplo n.º 3
0
    def get_batch(self, bucket_dbs, bucket_id, data):
        """Convert the sentences of a batch to numeric ids via ``data_utils``.

        Args:
            bucket_dbs: Not used by this method.
            bucket_id: Index into ``self.buckets`` selecting the
                ``(encoder_size, decoder_size)`` pair.
            data: Iterable of ``(encoder_sentence, decoder_sentence)``
                pairs. The first ``self.batch_size`` pairs are used.

        Returns:
            ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``:
            lists holding one array per sequence position. Token arrays
            are int32; weight arrays are float32 and contain only 0.0
            and 1.0.
        """
        encoder_size, decoder_size = self.buckets[bucket_id]

        def column(rows, position):
            # Collect one position across the first batch_size rows.
            return np.array(
                [rows[n][position] for n in range(self.batch_size)],
                dtype=np.int32,
            )

        padded_inputs = []
        framed_targets = []
        for raw_input, raw_target in data:
            # sentence_indice maps a sentence to a list of ids.
            input_ids = data_utils.sentence_indice(raw_input)
            target_ids = data_utils.sentence_indice(raw_target)

            # Encoder: fill the remaining width with PAD_ID, then feed
            # the sequence in reverse order.
            missing = encoder_size - len(input_ids)
            padded = input_ids + [data_utils.PAD_ID] * missing
            padded_inputs.append(padded[::-1])

            # Decoder: GO_ID first, EOS_ID after the sentence, PAD_ID
            # for whatever width is left once those two are accounted for.
            decoder_pad_size = decoder_size - len(target_ids) - 2
            framed = [data_utils.GO_ID] + target_ids
            framed = framed + [data_utils.EOS_ID]
            framed = framed + [data_utils.PAD_ID] * decoder_pad_size
            framed_targets.append(framed)

        batch_encoder_inputs = [
            column(padded_inputs, position)
            for position in range(encoder_size)
        ]

        batch_decoder_inputs = []
        batch_weights = []
        for position in range(decoder_size):
            batch_decoder_inputs.append(column(framed_targets, position))

            # The target for a position is the decoder token one step to
            # the right. Padded targets, and the last position (which
            # has no target), get weight 0; everything else gets 1.
            last = position == decoder_size - 1
            batch_weights.append(np.array(
                [
                    0.0
                    if (last
                        or framed_targets[n][position + 1]
                        == data_utils.PAD_ID)
                    else 1.0
                    for n in range(self.batch_size)
                ],
                dtype=np.float32,
            ))

        return batch_encoder_inputs, batch_decoder_inputs, batch_weights
Ejemplo n.º 4
0
 def get_batch(self, bucket_dbs, bucket_id, data):
     """Build position-major encoder, decoder and weight arrays.

     Args:
         bucket_dbs: Not used by this method.
         bucket_id: Index into ``self.buckets`` selecting the
             ``(encoder_size, decoder_size)`` pair.
         data: Iterable of ``(encoder_sentence, decoder_sentence)``
             pairs. The first ``self.batch_size`` pairs are used.

     Returns:
         ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``,
         each a list with one array per position (int32 token ids for
         the first two, float32 weights for the third).
     """
     encoder_size, decoder_size = self.buckets[bucket_id]

     encoder_rows, decoder_rows = [], []
     for ask, reply in data:
         # Convert both sentences to id lists.
         ask_ids = data_utils.sentence_indice(ask)
         reply_ids = data_utils.sentence_indice(reply)

         # Encoder row: pad on the right, then reverse the fresh list.
         encoder_row = ask_ids + [data_utils.PAD_ID] * (
             encoder_size - len(ask_ids))
         encoder_row.reverse()
         encoder_rows.append(encoder_row)

         # Decoder row: GO, sentence, EOS, then padding.
         spare = decoder_size - len(reply_ids) - 2
         decoder_row = [data_utils.GO_ID] + reply_ids
         decoder_row = decoder_row + [data_utils.EOS_ID]
         decoder_rows.append(decoder_row + [data_utils.PAD_ID] * spare)

     lanes = range(self.batch_size)

     batch_encoder_inputs = [
         np.array([encoder_rows[b][t] for b in lanes], dtype=np.int32)
         for t in range(encoder_size)
     ]

     batch_decoder_inputs, batch_weights = [], []
     for t in range(decoder_size):
         batch_decoder_inputs.append(
             np.array([decoder_rows[b][t] for b in lanes], dtype=np.int32))

         if t < decoder_size - 1:
             # Weight is 0 where the next decoder token is padding.
             following = [decoder_rows[b][t + 1] for b in lanes]
             flags = [
                 0.0 if token == data_utils.PAD_ID else 1.0
                 for token in following
             ]
             batch_weights.append(np.array(flags, dtype=np.float32))
         else:
             # Last position: there is no next token, so weight is 0.
             batch_weights.append(
                 np.zeros(self.batch_size, dtype=np.float32))

     return batch_encoder_inputs, batch_decoder_inputs, batch_weights
Ejemplo n.º 5
0
 def get_batch(self, bucket_dbs, bucket_id, data):
     """Pad, frame and transpose sentence pairs for one bucket.

     Args:
         bucket_dbs: Not used by this method.
         bucket_id: Index into ``self.buckets`` selecting the
             ``(encoder_size, decoder_size)`` pair.
         data: Iterable of ``(encoder_sentence, decoder_sentence)``
             pairs. The first ``self.batch_size`` pairs are used.

     Returns:
         ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``.
         Each is a list indexed by sequence position whose items are
         arrays over the batch: int32 ids for encoder and decoder,
         float32 0.0/1.0 values for the weights.
     """
     encoder_size, decoder_size = self.buckets[bucket_id]

     # Convert every pair to ids first (encoder side, then decoder side).
     id_pairs = [
         (data_utils.sentence_indice(src), data_utils.sentence_indice(dst))
         for src, dst in data
     ]

     # Encoder rows are right-padded and then reversed.
     encoder_rows = [
         list(reversed(
             src_ids
             + [data_utils.PAD_ID] * (encoder_size - len(src_ids))))
         for src_ids, _ in id_pairs
     ]

     # Decoder rows are GO + ids + EOS followed by padding; the two
     # markers are subtracted from the available width.
     decoder_rows = [
         [data_utils.GO_ID] + dst_ids + [data_utils.EOS_ID]
         + [data_utils.PAD_ID] * (decoder_size - len(dst_ids) - 2)
         for _, dst_ids in id_pairs
     ]

     batch_encoder_inputs = []
     for step in range(encoder_size):
         tokens = [encoder_rows[idx][step] for idx in range(self.batch_size)]
         batch_encoder_inputs.append(np.array(tokens, dtype=np.int32))

     batch_decoder_inputs = []
     batch_weights = []
     for step in range(decoder_size):
         tokens = [decoder_rows[idx][step] for idx in range(self.batch_size)]
         batch_decoder_inputs.append(np.array(tokens, dtype=np.int32))

         weights = np.ones(self.batch_size, dtype=np.float32)
         has_next = step != decoder_size - 1
         for idx in range(self.batch_size):
             # Zero the weight unless a non-padding token follows.
             if not has_next:
                 weights[idx] = 0.0
             elif decoder_rows[idx][step + 1] == data_utils.PAD_ID:
                 weights[idx] = 0.0
         batch_weights.append(weights)

     return batch_encoder_inputs, batch_decoder_inputs, batch_weights
Ejemplo n.º 6
0
 def get_batch(self, bucket_id, data):
     """Assemble encoder inputs, decoder inputs and weights for a bucket.

     Args:
         bucket_id: Index into ``self.buckets``; the entry supplies the
             length limits for the ask (encoder) and answer (decoder)
             sides.
         data: Iterable of ``(ask, answer)`` sentence pairs. The first
             ``self.batch_size`` pairs are used.

     Returns:
         ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``:
         lists with one array per position. Token arrays are int32;
         weight arrays are float32.
     """
     encoder_size, decoder_size = self.buckets[bucket_id]

     def encoder_row(ids):
         # Right-pad to the bucket's encoder size, then reverse.
         padding = [data_utils.PAD_ID] * (encoder_size - len(ids))
         return list(reversed(ids + padding))

     def decoder_row(ids):
         # GO + ids + EOS, then pad up to the bucket's decoder size.
         padding = [data_utils.PAD_ID] * (decoder_size - len(ids) - 2)
         return [data_utils.GO_ID] + ids + [data_utils.EOS_ID] + padding

     def gather(rows, position):
         # One position's token for each of the first batch_size rows.
         return np.array(
             [rows[k][position] for k in range(self.batch_size)],
             dtype=np.int32)

     encoder_inputs, decoder_inputs = [], []
     for ask, answer in data:
         # Convert the raw sentences to id lists.
         ask_ids = data_utils.sentence_indice(ask)
         answer_ids = data_utils.sentence_indice(answer)
         encoder_inputs.append(encoder_row(ask_ids))
         decoder_inputs.append(decoder_row(answer_ids))

     batch_encoder_inputs = [
         gather(encoder_inputs, p) for p in range(encoder_size)
     ]

     batch_decoder_inputs = []
     batch_weights = []
     for p in range(decoder_size):
         batch_decoder_inputs.append(gather(decoder_inputs, p))

         weights = np.ones(self.batch_size, dtype=np.float32)
         at_end = p == decoder_size - 1
         for k in range(self.batch_size):
             # Weight drops to 0 at the last position or when the next
             # decoder token is padding.
             if at_end or decoder_inputs[k][p + 1] == data_utils.PAD_ID:
                 weights[k] = 0.0
         batch_weights.append(weights)

     return batch_encoder_inputs, batch_decoder_inputs, batch_weights