Example no. 1
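Example no. 1 assumes that a tokenizer, a model, and a pair of tensors inputs/labels already exist in the session, together with an initial forward pass stored in pred. The setup below is only a minimal, hypothetical sketch that makes the snippet self-contained: the tokenizer checkpoint, the ReformerLM hyperparameters (copied from Example no. 2), the sample sentence, and the choice of using the unmasked input as its own label are all illustrative assumptions, not part of the original code.

import torch
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm
from transformers import BertTokenizer
from reformer_pytorch import ReformerLM

# Hypothetical tokenizer and model; hyperparameters mirror Example no. 2.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = ReformerLM(num_tokens=tokenizer.vocab_size,
                   dim=128,
                   depth=12,
                   max_seq_len=4096,
                   lsh_dropout=0.1,
                   causal=True,
                   full_attn_thres=128)

# Toy (inputs, labels) pair: labels are simply a copy of the inputs.
# A real masked-LM pipeline would replace some input tokens with [MASK] first.
encoded = tokenizer("the quick brown fox jumps over the lazy dog",
                    return_tensors='pt',
                    padding='max_length',
                    max_length=128,
                    truncation=True)
inputs = encoded['input_ids']
labels = inputs.clone()

pred = model(inputs)  # logits of shape (1, 128, vocab_size)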
pred.shape  # logits: (batch, seq_len, vocab_size)

tokenizer.decode(torch.argmax(pred, dim=-1).squeeze(0))  # greedy decoding of the predicted tokens

loss_fn = nn.CrossEntropyLoss()  # by default, targets equal to -100 are ignored

# Flatten to (batch * seq_len, vocab_size) vs (batch * seq_len,) for token-level cross-entropy
masked_lm_loss = loss_fn(pred.view(-1, tokenizer.vocab_size), labels.view(-1))
masked_lm_loss

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
total_loss = 0.0
model.train()

model.to(device)
inputs = inputs.to(device)
labels = labels.to(device)

loss = []  # per-step loss history
optimizer = AdamW(params=model.parameters())

# Every iteration reuses the same (inputs, labels) pair prepared above.
for _ in tqdm(range(100000)):
    pred = model(inputs)
    mlm_loss = loss_fn(pred.view(-1, tokenizer.vocab_size), labels.view(-1))

    total_loss += mlm_loss.item()
    loss.append(mlm_loss.item())

    mlm_loss.backward()
    optimizer.step()
    model.zero_grad()  # reset gradients for the next step (equivalent to optimizer.zero_grad() here)
Example no. 2
import os
import random

import torch
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm
from reformer_pytorch import ReformerLM
from reformer_pytorch.generative_tools import TrainingWrapper

# tokenizer (whose vocab_size is expected to match num_tokens=13137) and the
# get_data() helper are project-specific objects assumed to be defined elsewhere.


def train(device='cpu',
          output_dir='model',
          epochs=5,
          save_step=5,
          batch_size=4):

    model = ReformerLM(num_tokens=13137,
                       dim=128,
                       depth=12,
                       max_seq_len=4096,
                       lsh_dropout=0.1,
                       causal=True,
                       full_attn_thres=128)
    model = TrainingWrapper(model, ignore_index=0, pad_value=0).to(device)

    # Resume from an existing checkpoint in output_dir, if there is one.
    model_cpu_path = os.path.join(output_dir, 'model_cpu.pt')
    try:
        model.load_state_dict(torch.load(model_cpu_path))
    except (FileNotFoundError, RuntimeError):
        pass  # no usable checkpoint yet: start from scratch

    model.train()
    optimizer = AdamW(params=model.parameters())

    # Resume the optimizer state as well, if available.
    optimizer_path = os.path.join(output_dir, 'optimizer.pt')
    try:
        optimizer.load_state_dict(torch.load(optimizer_path))
    except (FileNotFoundError, RuntimeError):
        pass
    print(optimizer)
    total_loss = 0.0
    loss = []  # per-step loss history

    # Load the whole training set into memory.
    data = []
    for it in get_data("data/train.json", tokenizer):
        data.append(it)
    # data = data[:1000]  # optional: subsample for quick debugging runs

    loss_fn = nn.CrossEntropyLoss()  # default ignore_index is -100; note that the pad value above is 0
    for n in tqdm(range(epochs)):
        random.shuffle(data)
        inputs = []
        labels = []
        for i, it in enumerate(data):
            # keywords are the model input, text is the target sequence
            inputs.append(it['keywords'])
            labels.append(it['text'])
            # flush a full batch (any trailing partial batch is dropped)
            if len(inputs) == batch_size:
                inputs_batch = torch.tensor(inputs).long().to(device)
                labels_batch = torch.tensor(labels).long().to(device)
                inputs = []
                labels = []

                pred = model(inputs_batch)
                mlm_loss = loss_fn(pred.view(-1, tokenizer.vocab_size),
                                   labels_batch.view(-1))

                total_loss += mlm_loss.item()
                loss.append(mlm_loss.item())
                print('loss', mlm_loss.item())
                mlm_loss.backward()
                optimizer.step()
                model.zero_grad()
            # Periodic checkpoint, every save_step examples seen.
            if i % save_step == 0 and i != 0:
                model_cpu_path = os.path.join(output_dir, 'model_cpu.pt')
                optimizer_path = os.path.join(output_dir, 'optimizer.pt')
                torch.save(model.state_dict(), model_cpu_path)
                torch.save(optimizer.state_dict(), optimizer_path)
        # End-of-epoch checkpoint.
        model_cpu_path = os.path.join(output_dir, 'model_cpu.pt')
        optimizer_path = os.path.join(output_dir, 'optimizer.pt')
        torch.save(model.state_dict(), model_cpu_path)
        torch.save(optimizer.state_dict(), optimizer_path)
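
A hypothetical way to launch the training: the call below is only an illustrative sketch, the argument values are assumptions mirroring the function defaults, and it presumes the checkpoint directory can be created locally.

if __name__ == '__main__':
    os.makedirs('model', exist_ok=True)  # make sure the checkpoint directory exists
    train(device='cuda:0' if torch.cuda.is_available() else 'cpu',
          output_dir='model',
          epochs=5,
          save_step=5,
          batch_size=4)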