Example #1
    def create_masks(self, inp, tar):
        # Encoder padding mask
        if inp is None or tar is None:
            return None, None, None
        enc_padding_mask = create_padding_mask(inp)

        # Used in the second attention block in the decoder.
        # This padding mask is used to mask the encoder outputs.
        dec_padding_mask = create_padding_mask(inp)

        # Used in the first attention block in the decoder.
        # Used to pad and mask future tokens in the input received by the decoder.
        look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = create_padding_mask(tar)
        combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

        return enc_padding_mask, combined_mask, dec_padding_mask
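Neither helper is defined in these snippets; below is a minimal sketch, assuming the pad id is 0 and the mask shapes follow the standard TensorFlow Transformer recipe that this example appears to use.

import tensorflow as tf

def create_padding_mask(seq):
    # 1.0 wherever seq equals the assumed pad id 0, so attention can ignore it
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # extra dims broadcast over heads and query positions: (batch_size, 1, 1, seq_len)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    # strictly upper-triangular 1s hide future tokens from each query position
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)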
Example #2
  def forward(self, enc_in, dec_in, padding_mask=None):
    enc_in, dec_in = self.embed_input(enc_in, dec_in)

    if self.encoder_reduce_dim:
      padding_mask = u.create_padding_mask(enc_in, self.pad_idx)

    enc_out = self.encoder(enc_in, padding_mask=padding_mask)

    dec_out = self.decoder(dec_in, enc_out, padding_mask=padding_mask)

    output = self.output_projection(dec_out)

    return self.softmax(output) if self.apply_softmax else output
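The utility `u.create_padding_mask` used here (and in Examples #3 and #4) is imported from a module these snippets don't show; a plausible minimal version, assuming it flags pad positions in a batch of token ids:

import torch

def create_padding_mask(tensor, pad_idx):
    # boolean mask, True at padded positions: (batch_size, seq_len)
    return tensor == pad_idx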
Example #3
    def custom_collator(self, batch):
        encoder_inputs, decoder_inputs = zip(*batch)
        encoder_input_batch = pad_sequence(
            encoder_inputs,
            batch_first=True,
            padding_value=self.vocab_2_idx[self.pad])
        decoder_input_batch = torch.LongTensor(
            u.pad_documents(decoder_inputs, self.vocab_2_idx[self.pad]))

        if self.create_mask:
            padding_mask_batch = u.create_padding_mask(
                encoder_input_batch, self.vocab_2_idx[self.pad])
            return encoder_input_batch, decoder_input_batch, padding_mask_batch

        return encoder_input_batch, decoder_input_batch
Example #4
    def __call__(self, batch):
        encoder_inputs, decoder_inputs = zip(*batch)
        encoder_input_batch = pad_sequence(encoder_inputs,
                                           batch_first=True,
                                           padding_value=self.pad_idx).float()
        decoder_input_batch = pad_sequence(decoder_inputs,
                                           batch_first=True,
                                           padding_value=self.pad_idx)

        if self.create_mask:
            padding_mask_batch = u.create_padding_mask(encoder_input_batch,
                                                       self.pad_idx)
            return encoder_input_batch, decoder_input_batch, padding_mask_batch

        return encoder_input_batch, decoder_input_batch
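Both collators (Examples #3 and #4) are meant to be handed to a DataLoader as its collate_fn; a hypothetical usage sketch (the `Collator` constructor and `dataset` are assumptions, not shown in the examples):

from torch.utils.data import DataLoader

collator = Collator(pad_idx=0, create_mask=True)  # hypothetical constructor
loader = DataLoader(dataset, batch_size=32, collate_fn=collator)
for encoder_batch, decoder_batch, padding_mask in loader:
    ...  # feed the batch to the model's forward pass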
Example #5
def evaluate(test_dataset):
    predictions = []
    tars = []
    for (batch, (inp, tar)) in tqdm(enumerate(test_dataset)):
        enc_padding_mask = create_padding_mask(inp)
        predict = transformer(inp, False, enc_padding_mask=enc_padding_mask)
        predictions.append(predict)
        tars.append(tar)
    predictions = tf.concat(predictions, axis=0)
    tars = tf.concat(tars, axis=0)
    mi_f1 = micro_f1(tars, predictions)
    ma_f1 = macro_f1(tars, predictions)

    predictions = np.where(predictions > 0.5, 1, 0)
    tars = np.where(tars > 0.5, 1, 0)

    sample_f1 = f1_score(tars, predictions, average='samples')
    return mi_f1, ma_f1, sample_f1, tars, predictions
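`micro_f1` and `macro_f1` are also not defined in these snippets; a minimal sketch built on scikit-learn, assuming sigmoid outputs binarized at 0.5 (the threshold is an assumption):

import numpy as np
from sklearn.metrics import f1_score

def micro_f1(y_true, y_pred, threshold=0.5):
    # binarize both arrays before scoring
    y_true = np.where(np.asarray(y_true) > threshold, 1, 0)
    y_pred = np.where(np.asarray(y_pred) > threshold, 1, 0)
    return f1_score(y_true, y_pred, average='micro')

def macro_f1(y_true, y_pred, threshold=0.5):
    y_true = np.where(np.asarray(y_true) > threshold, 1, 0)
    y_pred = np.where(np.asarray(y_pred) > threshold, 1, 0)
    return f1_score(y_true, y_pred, average='macro')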
Example #6
def train_step(inp, tar):

    enc_padding_mask = create_padding_mask(inp)

    with tf.GradientTape() as tape:
        predictions = transformer(inp,
                                  training=True,
                                  enc_padding_mask=enc_padding_mask)
        loss = loss_function(tar, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar, predictions)

    mi_f1 = micro_f1(tar, predictions)
    ma_f1 = macro_f1(tar, predictions)
    return mi_f1, ma_f1
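`train_step` closes over several globals (`transformer`, `optimizer`, the loss and metric objects). A hedged setup sketch; the hyperparameters here are assumptions, although the truncated snippet at the end of this section shows a matching `epsilon=1e-9`:

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')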
Example #7
def dataset_generator(folder, batch_size=32, metadata=None):
    '''
    Custom generator that reads data from folder then yields batches.

    Params:
      * folder : str
      * batch_size (optional) : int, defaults to 32
      * metadata (optional) : dict, defaults to None
    '''
    filelist = get_list_files(folder)

    save_name = '{}_{}.pk'.format('metadata', folder.split('/')[-2])

    if metadata is None:
        metadata = load_metadata(save_name)

    batch = [[], []]
    num_els = len(filelist['features'])

    for i, filename in enumerate(filelist['features']):
        identity = filename.split('/')[-1].replace('.features.npy', '')

        encoder_input = torch.tensor(np.load(filename))
        decoder_input = metadata['id_2_doc'][identity]

        batch[0].append(encoder_input)
        batch[1].append(decoder_input)

        if len(batch[0]) == batch_size or i == num_els - 1:
            encoder_input_batch = pad_sequence(
                batch[0],
                batch_first=True,
                padding_value=metadata['PAD'].index)
            decoder_input_batch = torch.LongTensor(batch[1])
            padding_mask_batch = u.create_padding_mask(encoder_input_batch,
                                                       metadata['PAD'].index)

            yield encoder_input_batch, decoder_input_batch, padding_mask_batch
            batch = [[], []]
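A hypothetical way to drive the generator (the folder path is an assumption):

for encoder_batch, decoder_batch, padding_mask in dataset_generator('data/train/',
                                                                    batch_size=16):
    ...  # one training step per yielded batch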
Example #8
def preprocess(file, BATCH_SIZE, max_length, tokenizer):
    train_dataset = []
    input_vocab_size = len(tokenizer.vocab)
    f = open(file, 'r')

    words = f.read()

    words = words.replace('\n\n', '.')
    words = words.replace('\n', ' ')
    words = re.split('[;:.!?]', words)

    i = 0
    for _ in range(len(words) // BATCH_SIZE + 1):
        if i + 1 >= len(words):
            break
        input_ids_list = []
        segment_list = []
        is_masked_list = []
        is_next_list = []

        for j in range(BATCH_SIZE):
            if i + 1 >= len(words):
                break

            # decide whether the 2nd sentence should be the actual next sentence
            now = int(random.random() > 0.5)

            if now == 1:
                res = (["[CLS]"] + tokenizer.tokenize(words[i]) + ["[SEP]"] +
                       tokenizer.tokenize(words[i + 1]) + ["[SEP]"])
            else:
                rand_idx = random.randint(0, len(words) - 1)
                res = (["[CLS]"] + tokenizer.tokenize(words[i]) + ["[SEP]"] +
                       tokenizer.tokenize(words[rand_idx]) + ["[SEP]"])

            input_ids = get_ids(res, tokenizer, max_length)
            segment_list.append(get_segments(res, max_length))
            is_next_list.append(now)
            is_masked = [0] * max_length

            for ind in range(max_length):
                if input_ids[ind] == 0:  # if a padding token appears, stop
                    break
                if input_ids[ind] in (101, 102):  # don't mask [CLS] and [SEP] tokens
                    continue
                if random.random() < 0.15:  # mask 15% of tokens
                    is_masked[ind] = input_ids[ind]
                    if random.random() < 0.8:  # of those, 80% become [MASK] (id 103)
                        input_ids[ind] = 103
                    elif random.random() < 0.5:  # 10% become a random token
                        # upper bound fixed: randint is inclusive, so the original
                        # input_vocab_size could index past the vocabulary
                        input_ids[ind] = random.randint(1000, input_vocab_size - 1)
                    # the remaining 10% keep the original token
            input_ids_list.append(input_ids)
            is_masked_list.append(is_masked)
            if now == 1:
                i += 2
            else:
                i += 1

        input_ids_list = np.array(input_ids_list)
        is_masked_list = np.array(is_masked_list)
        masks = create_padding_mask(input_ids_list)
        segment_list = np.array(segment_list)
        is_next_list = np.array(is_next_list)
        is_next_list = np.reshape(is_next_list, (len(is_next_list), 1))
        train_dataset.append([
            input_ids_list, segment_list, masks, is_next_list, is_masked_list
        ])

    return train_dataset
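`get_ids` and `get_segments` are not shown. Plausible minimal versions, assuming a BERT WordPiece tokenizer where `[PAD]` is id 0 and segment ids flip after the first `[SEP]`:

def get_ids(tokens, tokenizer, max_length):
    # token ids, truncated and padded with 0 ([PAD]) to max_length
    ids = tokenizer.convert_tokens_to_ids(tokens)[:max_length]
    return ids + [0] * (max_length - len(ids))

def get_segments(tokens, max_length):
    # 0 for the first sentence (through its [SEP]), 1 afterwards
    segments, current = [], 0
    for token in tokens[:max_length]:
        segments.append(current)
        if token == "[SEP]":
            current = 1
    return segments + [0] * (max_length - len(segments))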
                                         epsilon=1e-9)

    for epoch in range(EPOCHS):
        start = time.time()

        train_loss.reset_states()
        train_accuracy.reset_states()

        print('Start Train......')
        for (batch, (inp, tar)) in enumerate(train_dataset):
            time1 = time.time()
            mic_f1, mac_f1 = train_step(inp, tar)

            if batch % 50 == 0:
                test_input, test_target = next(iter(valid_dataset))
                enc_padding_mask = create_padding_mask(test_input)
                val_mic_f1, val_mac_f1 = predict(test_input, test_target,
                                                 enc_padding_mask)

                print(
                    'Epoch {} Batch {} Loss {:.4f} micro_f1 {:.4f} macro_f1 {:.4f} val_micro_f1 {:.4f} val_macro_f1 {:.4f}'
                    .format(epoch + 1, batch, train_loss.result(), mic_f1,
                            mac_f1, val_mic_f1, val_mac_f1))
                print('Cost time:{}'.format(time.time() - time1))

        if (epoch + 1) % 5 == 0:
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(
                epoch + 1, ckpt_save_path))

        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(