def main():
    # get and process data
    data = utils.MRPCData("./MRPC")
    print("num word: ", data.num_word)
    model = BERT(
        model_dim=MODEL_DIM, max_len=data.max_len, n_layer=N_LAYER, n_head=4,
        n_vocab=data.num_word, lr=LEARNING_RATE, max_seg=data.num_seg,
        drop_rate=0.1, padding_idx=data.v2i["<PAD>"])
    t0 = time.time()
    arange = np.arange(0, data.max_len)
    for t in range(10000):
        seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(data, arange)
        loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels)
        if t % 20 == 0:
            pred = pred[0].numpy().argmax(axis=1)
            t1 = time.time()
            print(
                "\n\nstep: ", t,
                "| time: %.2f" % (t1 - t0),
                "| loss: %.3f" % loss.numpy(),
                "\n| tgt: ", " ".join([data.i2v[i] for i in seqs[0][:xlen[0].sum() + 1]]),
                "\n| prd: ", " ".join([data.i2v[i] for i in pred[:xlen[0].sum() + 1]]),
                "\n| tgt word: ", [data.i2v[i] for i in seqs_[0] * loss_mask[0] if i != data.v2i["<PAD>"]],
                "\n| prd word: ", [data.i2v[i] for i in pred * loss_mask[0] if i != data.v2i["<PAD>"]],
            )
            t0 = t1
    os.makedirs("./visual_helper/bert", exist_ok=True)
    model.save_weights("./visual_helper/bert/model.ckpt")
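
# `random_mask_or_replace` is defined elsewhere in this repo; for reference, below is a
# minimal sketch of the standard BERT masked-LM corruption it implements, ASSUMING
# `data.sample(batch_size)` returns numpy arrays and the vocab contains a "<MASK>"
# token. The actual helper (which L5's PyTorch variant calls with a different
# signature) may differ in details.
import numpy as np


def random_mask_or_replace_sketch(data, arange, batch_size=32, mask_rate=0.15):
    seqs, segs, xlen, nsp_labels = data.sample(batch_size)
    seqs_ = seqs.copy()              # keep the uncorrupted tokens as targets
    loss_mask = np.zeros_like(seqs)  # 1 where the MLM loss will be computed
    for b in range(len(seqs)):
        valid = arange[seqs[b] != data.v2i["<PAD>"]]
        picked = np.random.choice(valid, max(1, int(len(valid) * mask_rate)), replace=False)
        loss_mask[b, picked] = 1
        for p in picked:
            r = np.random.rand()
            if r < 0.8:
                seqs[b, p] = data.v2i["<MASK>"]                   # 80%: mask out
            elif r < 0.9:
                seqs[b, p] = np.random.randint(0, data.num_word)  # 10%: random replace
            # else 10%: keep the original token
    return seqs, segs, seqs_, loss_mask, xlen, nsp_labels
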
def train():
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4
    dataset = utils.MRPCData("./MRPC", 2000)
    print("num word: ", dataset.num_word)
    model = GPT(
        model_dim=MODEL_DIM, max_len=dataset.max_len - 1, num_layer=N_LAYER, num_head=4,
        n_vocab=dataset.num_word, lr=LEARNING_RATE, max_seg=dataset.num_seg,
        drop_rate=0.2, padding_idx=dataset.pad_id)
    if torch.cuda.is_available():
        print("GPU train available")
        device = torch.device("cuda")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        model = model.cpu()
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    for epoch in range(100):
        for batch_idx, batch in enumerate(loader):
            seqs, segs, xlen, nsp_labels = batch
            seqs, segs, nsp_labels = (
                seqs.type(torch.LongTensor).to(device),
                segs.type(torch.LongTensor).to(device),
                nsp_labels.to(device),
            )
            # pred: [n, step, n_vocab]
            loss, pred = model.step(seqs=seqs[:, :-1], segs=segs[:, :-1], seqs_=seqs[:, 1:], nsp_labels=nsp_labels)
            if batch_idx % 100 == 0:
                pred = pred[0].cpu().data.numpy().argmax(axis=1)  # [step]
                print(
                    "Epoch: ", epoch,
                    "|batch: ", batch_idx,
                    "| loss: %.3f" % loss,
                    "\n| tgt: ", " ".join([dataset.i2v[i] for i in seqs[0, 1:].cpu().data.numpy()[:xlen[0].sum() + 1]]),
                    "\n| prd: ", " ".join([dataset.i2v[i] for i in pred[:xlen[0].sum() + 1]]),
                )
    os.makedirs("./visual/models/gpt", exist_ok=True)
    torch.save(model.state_dict(), "./visual/models/gpt/model.pth")
    export_attention(model, device, dataset)
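
# Why `seqs[:, :-1]` is the input and `seqs[:, 1:]` the target above: GPT is trained on
# next-token prediction, so the target is the same sequence shifted left by one. A toy
# illustration (hypothetical token ids):
import torch

seqs = torch.tensor([[101, 7, 42, 9, 102]])  # e.g. <GO> w1 w2 w3 <SEP>
inputs, targets = seqs[:, :-1], seqs[:, 1:]
# inputs : [[101,  7, 42,  9]]  -> the model sees tokens up to position t
# targets: [[  7, 42,  9, 102]] -> and must predict the token at position t+1
assert inputs.shape == targets.shape
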
def export_attention():
    data = utils.MRPCData("./MRPC")
    print("num word: ", data.num_word)
    model = BERT(
        model_dim=MODEL_DIM, max_len=data.max_len, n_layer=N_LAYER, n_head=4,
        n_vocab=data.num_word, lr=LEARNING_RATE, max_seg=data.num_seg,
        drop_rate=0.1, padding_idx=data.v2i["<PAD>"])
    model.load_weights("./visual_helper/bert/model.ckpt")

    # save attention matrix for visualization
    seqs, segs, xlen, nsp_labels = data.sample(1)
    model(seqs, segs, False)
    attn_data = {"src": [data.i2v[i] for i in seqs[0]], "attentions": model.attentions}
    with open("./visual_helper/bert_attention_matrix.pkl", "wb") as f:
        pickle.dump(attn_data, f)
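
# The exported pickle can be inspected offline. A minimal sketch, ASSUMING
# `model.attentions` stores one attention array per layer (the exact layout, e.g.
# [n, n_head, step, step], depends on the BERT implementation in this repo, so the
# code squeezes leading axes until a [step, step] matrix remains):
import pickle

import matplotlib.pyplot as plt
import numpy as np

with open("./visual_helper/bert_attention_matrix.pkl", "rb") as f:
    d = pickle.load(f)
tokens = d["src"]
attn = np.asarray(d["attentions"][0])  # first layer (assumed indexing)
while attn.ndim > 2:                   # drop batch/head axes down to [step, step]
    attn = attn[0]
plt.imshow(attn[:len(tokens), :len(tokens)], cmap="viridis")
plt.xticks(range(len(tokens)), tokens, rotation=90)
plt.yticks(range(len(tokens)), tokens)
plt.title("BERT attention (first layer, first head)")
plt.tight_layout()
plt.show()
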
def export_attention(model, data, name):  # signature inferred from the __main__ call below
    # save attention matrix for visualization
    seqs, segs, xlen, nsp_labels = data.sample(32)
    model.call(seqs[:, :-1], segs[:, :-1], False)
    attn_data = {
        "src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))],
        "attentions": model.attentions,
    }
    with open("./visual/tmp/%s_attention_matrix.pkl" % name, "wb") as f:
        pickle.dump(attn_data, f)


if __name__ == "__main__":
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4
    d = utils.MRPCData("./MRPC", 2000)
    print("num word: ", d.num_word)
    m = GPT(
        model_dim=MODEL_DIM, max_len=d.max_len - 1, n_layer=N_LAYER, n_head=4,
        n_vocab=d.num_word, lr=LEARNING_RATE, max_seg=d.num_seg,
        drop_rate=0.2, padding_idx=d.pad_id)
    train(m, d, step=5000, name="gpt")
    export_attention(m, d, name="gpt")
def train():
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4
    dataset = utils.MRPCData("./MRPC", 2000)
    print("num word: ", dataset.num_word)
    model = BERT(
        model_dim=MODEL_DIM, max_len=dataset.max_len, num_layer=N_LAYER, num_head=4,
        n_vocab=dataset.num_word, lr=LEARNING_RATE, max_seg=dataset.num_seg,
        drop_rate=0.2, padding_idx=dataset.pad_id)
    if torch.cuda.is_available():
        print("GPU train available")
        device = torch.device("cuda")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        model = model.cpu()
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    arange = np.arange(0, dataset.max_len)
    for epoch in range(500):
        for batch_idx, batch in enumerate(loader):
            seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(batch, arange, dataset)
            seqs, segs, seqs_, nsp_labels, loss_mask = (
                seqs.type(torch.LongTensor).to(device),
                segs.type(torch.LongTensor).to(device),
                seqs_.type(torch.LongTensor).to(device),
                nsp_labels.to(device),
                loss_mask.to(device),
            )
            loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels)
            if batch_idx % 100 == 0:
                pred = pred[0].cpu().data.numpy().argmax(axis=1)
                print(
                    "\n\nEpoch: ", epoch,
                    "|batch: ", batch_idx,
                    "| loss: %.3f" % loss,
                    "\n| tgt: ", " ".join([dataset.i2v[i] for i in seqs[0].cpu().data.numpy()[:xlen[0].sum() + 1]]),
                    "\n| prd: ", " ".join([dataset.i2v[i] for i in pred[:xlen[0].sum() + 1]]),
                    "\n| tgt word: ", [dataset.i2v[i] for i in (seqs_[0] * loss_mask[0].view(-1)).cpu().data.numpy() if i != dataset.v2i["<PAD>"]],
                    "\n| prd word: ", [dataset.i2v[i] for i in pred * (loss_mask[0].view(-1).cpu().data.numpy()) if i != dataset.v2i["<PAD>"]],
                )
    os.makedirs("./visual/models/bert", exist_ok=True)
    torch.save(model.state_dict(), "./visual/models/bert/model.pth")
    export_attention(model, device, dataset)
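
# Reloading the saved PyTorch weights later follows the standard state_dict pattern.
# A minimal sketch with a hypothetical helper name, mirroring the constructor
# arguments used in train() above (any deviation in those arguments would make the
# state_dict fail to load):
def load_trained_bert(dataset, device):
    model = BERT(
        model_dim=256, max_len=dataset.max_len, num_layer=4, num_head=4,
        n_vocab=dataset.num_word, lr=1e-4, max_seg=dataset.num_seg,
        drop_rate=0.2, padding_idx=dataset.pad_id)
    model.load_state_dict(torch.load("./visual/models/bert/model.pth", map_location=device))
    model.to(device)
    model.eval()  # disable dropout for inference / attention export
    return model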