        b is an embedding for ab-d
        c is an embedding for abc-
        later, the b embedding will be added to another b embedding from the
        previous residual input to predict c
        """
        eye = tf.eye(self.max_len + 1, batch_shape=[len(seqs)], dtype=tf.float32)[:, 1:, :-1]
        pad = tf.math.equal(seqs, self.padding_idx)
        mask = tf.where(pad[:, tf.newaxis, tf.newaxis, :], 1, eye[:, tf.newaxis, :, :])
        return mask  # [n, 1, step, step]


if __name__ == "__main__":
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4

    d = utils.MRPCData("./MRPC", 2000)
    print("num word: ", d.num_word)
    m = BERT(
        model_dim=MODEL_DIM, max_len=d.max_len - 1, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word,
        lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.pad_id)
    train(m, d, step=5000, name="bert_window_mask")
    export_attention(m, d, "bert_window_mask")
        b is an embedding for ab-d
        c is an embedding for abc-
        later, the b embedding will be added to another b embedding from the
        previous residual input to predict c
        """
        eye = tf.eye(self.max_len + 1, batch_shape=[len(seqs)], dtype=tf.float32)[:, 1:, :-1]
        pad = tf.math.equal(seqs, self.padding_idx)
        mask = tf.where(pad[:, tf.newaxis, tf.newaxis, :], 1, eye[:, tf.newaxis, :, :])
        return mask  # [n, 1, step, step]


if __name__ == "__main__":
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4

    d = utils.MRPCData("./MRPC", 2000)
    print("num word: ", d.num_word)
    m = BERT(
        model_dim=MODEL_DIM, max_len=d.max_len - 1, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word,
        lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.pad_id)
    train(m, d, step=5000, name="bert_next_mask")
    export_attention(m, d, "bert_next_mask")
        a010011
        b001011
        c000111
        d000011
        -000001
        -000000

        a is an embedding for a-cd
        b is an embedding for ab-d
        c is an embedding for abc-
        later, the b embedding will be added to another b embedding from the
        previous residual input to predict c
        """
        # shifted identity: row i carries a 1 (= blocked) at column i + 1,
        # so each position cannot attend to the token it has to predict
        eye = tf.eye(self.max_len + 1, batch_shape=[len(seqs)], dtype=tf.float32)[:, 1:, :-1]
        pad = tf.math.equal(seqs, self.padding_idx)
        # padding columns are blocked for every query position
        mask = tf.where(pad[:, tf.newaxis, tf.newaxis, :], 1, eye[:, tf.newaxis, :, :])
        return mask  # [n, 1, step, step]


if __name__ == "__main__":
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4

    d = utils.MRPCData("./MRPC", 2000)
    print("num word: ", d.num_word)
    m = BERT(
        model_dim=MODEL_DIM, max_len=d.max_len - 1, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word,
        lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.pad_id)
    train(m, d, step=5000, name="bert")
    export_attention(m, d, "bert")
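# --- Standalone sketch (not part of the file above): it rebuilds the same
# eye-based mask for a toy batch so the 0/1 matrix in the docstring can be
# checked by eye. The values max_len=6, pad_id=0 and the token ids in `toy`
# are hypothetical, chosen only for illustration.
import tensorflow as tf

max_len, pad_id = 6, 0
toy = tf.constant([[3, 4, 5, 6, 0, 0]])  # "a b c d - -": two pads at the end

eye = tf.eye(max_len + 1, batch_shape=[1], dtype=tf.float32)[:, 1:, :-1]
pad = tf.math.equal(toy, pad_id)
mask = tf.where(pad[:, tf.newaxis, tf.newaxis, :], 1., eye[:, tf.newaxis, :, :])
print(mask[0, 0].numpy().astype(int))
# rows a-d print as 010011, 001011, 000111, 000011, matching the docstring:
# a 1 at column i + 1 (the token position i predicts) and at every pad column.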
        b is an embedding for ab-d
        c is an embedding for abc-
        later, the b embedding will be added to another b embedding from the
        previous residual input to predict c
        """
        eye = tf.eye(self.max_len + 1, batch_shape=[len(seqs)], dtype=tf.float32)[:, 1:, :-1]
        pad = tf.math.equal(seqs, self.padding_idx)
        mask = tf.where(pad[:, tf.newaxis, tf.newaxis, :], 1, eye[:, tf.newaxis, :, :])
        return mask  # [n, 1, step, step]


if __name__ == "__main__":
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4

    d = utils.MRPCData("./MRPC", 2000)
    print("num word: ", d.num_word)
    m = BERT(
        model_dim=MODEL_DIM, max_len=d.max_len - 1, n_layer=N_LAYER, n_head=4, n_vocab=d.num_word,
        lr=LEARNING_RATE, max_seg=d.num_seg, drop_rate=0.2, padding_idx=d.pad_id)
    train(m, d, step=5000, name="bert_self_mask")
    export_attention(m, d, "bert_self_mask")
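# --- Standalone sketch of how a [n, 1, step, step] mask like the ones above is
# typically consumed inside scaled dot-product attention. The additive -1e9
# trick mirrors the usual transformer recipe; the repo's own attention layer is
# not shown in this section, so treat this pairing as an assumption rather than
# the exact code these models run.
import tensorflow as tf

def masked_attention(q, k, v, mask):
    # q, k, v: [n, n_head, step, head_dim]; mask: [n, 1, step, step], 1 = blocked
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    score = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(dk)
    score += mask * -1e9                       # blocked entries vanish in softmax
    attention = tf.nn.softmax(score, axis=-1)  # [n, n_head, step, step]
    return tf.matmul(attention, v)             # [n, n_head, step, head_dim]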