# Generic PyTorch training loop for a classifier.
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from sklearn.metrics import accuracy_score


def train(seq, dataloader, epochs=10):
    criterion = CrossEntropyLoss()          # the loss takes no model argument
    optimizer = Adam(seq.parameters())      # optimizer needs the parameters, not the module
    for epoch in range(epochs):
        epoch_loss = 0.0
        epoch_accuracy = 0.0
        n_batch = 0
        for batch, labels in dataloader:
            n_batch += 1
            optimizer.zero_grad()           # clear gradients from the previous step
            outputs = seq(batch)
            loss = criterion(outputs, labels)
            # accuracy_score expects (y_true, y_pred)
            accuracy = accuracy_score(labels, outputs.argmax(dim=1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()       # detach from the graph before accumulating
            epoch_accuracy += accuracy
        print("Epoch {}/{} - loss: {:.5f} accuracy: {:.5f}".format(
            epoch + 1, epochs, epoch_loss / n_batch, epoch_accuracy / n_batch))
    print("Finished training!")
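# Usage sketch for train() above. The model, data, and batch size here are
# illustrative assumptions, not from the source.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

model = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 3))
X = torch.randn(256, 20)             # 256 samples, 20 features
y = torch.randint(0, 3, (256,))      # 3 classes
loader = DataLoader(TensorDataset(X, y), batch_size=32, shuffle=True)
train(model, loader, epochs=10)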
# Character-level RNN training driver. parse_args, DataLoader, RNN,
# init_params, Adam, sample, and rnn_training_step are project helpers.
import numpy as np


def main():
    args = parse_args()
    with open(args.input, 'r') as fp:
        data_loader = DataLoader(fp.read(), batch_size=args.seq_length)
    rnn = RNN()
    params = init_params(data_loader.vocab_size, hidden_size=args.hidden_size)
    optimizer = Adam(params, lr=args.lr)
    it = 0
    for epoch in range(args.num_epochs):
        # reset the hidden state at the start of each epoch
        hidden_state = np.zeros((1, args.hidden_size))
        for x, y in data_loader:
            if it % args.sample_every == 0:
                # periodically generate text to monitor training progress
                one_hot = sample(rnn, hidden_state, x[0], params,
                                 args.sample_size)
                generated_text = data_loader.decode(one_hot)
                print(generated_text)
            loss, hidden_state, dparams = rnn_training_step(
                rnn, hidden_state, x, y, params)
            if it % args.print_every == 0:
                print('iteration: {}, loss: {}'.format(it, loss))
            optimizer.step(dparams)
            it += 1
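# Hypothetical sketch of init_params, which the source does not show. The
# names and shapes follow the classic min-char-rnn parameterization of a
# vanilla RNN and are assumptions.
import numpy as np

def init_params(vocab_size, hidden_size, scale=0.01):
    return {
        'Wxh': np.random.randn(hidden_size, vocab_size) * scale,   # input -> hidden
        'Whh': np.random.randn(hidden_size, hidden_size) * scale,  # hidden -> hidden
        'Why': np.random.randn(vocab_size, hidden_size) * scale,   # hidden -> output
        'bh': np.zeros((hidden_size, 1)),                          # hidden bias
        'by': np.zeros((vocab_size, 1)),                           # output bias
    }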
path = "./checkpoints/Linear_MINST_weights.sav" # model = load_weights(path) epochs = 6 for epoch in range(epochs): i = 0 for image, label in dataloader: if epoch == 5: model.graph() image = image/255 i = i + 1 print("Iteration no.", i) predicted = model(image) loss = model.loss(predicted, label) model.backward() optimizer.step() # print("loss= ", loss) print("===========") lr_schedular.step() # save_weights(model, path) e = Evaluation(10) for image, label in dataloader_test: image = image/255 predicted = model(image) probs = softMax(predicted) pred = np.argmax(probs,axis=0)
def main(args, local_rank):
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    vocabs = dict()
    vocabs['src'] = Vocab(args.src_vocab, 0, [BOS, EOS])
    vocabs['tgt'] = Vocab(args.tgt_vocab, 0, [BOS, EOS])

    if args.world_size == 1 or (dist.get_rank() == 0):
        logger.info(args)
        for name in vocabs:
            logger.info("vocab %s, size %d, coverage %.3f", name,
                        vocabs[name].size, vocabs[name].coverage)

    set_seed(19940117)
    #device = torch.device('cpu')
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)

    if args.resume_ckpt:
        model = MatchingModel.from_pretrained(vocabs, args.resume_ckpt)
    else:
        model = MatchingModel.from_params(vocabs, args.layers, args.embed_dim,
                                          args.ff_embed_dim, args.num_heads,
                                          args.dropout, args.output_dim,
                                          args.bow)

    if args.world_size > 1:
        set_seed(19940117 + dist.get_rank())

    model = model.to(device)

    if args.resume_ckpt:
        dev_data = DataLoader(vocabs, args.dev_data, args.dev_batch_size,
                              addition=args.additional_negs)
        acc = validate(model, dev_data, device)
        logger.info("initialize from %s, initial acc %.2f", args.resume_ckpt,
                    acc)

    optimizer = Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98),
                     eps=1e-9)
    lr_schedule = get_linear_schedule_with_warmup(optimizer,
                                                  args.warmup_steps,
                                                  args.total_train_steps)
    train_data = DataLoader(vocabs, args.train_data,
                            args.per_gpu_train_batch_size,
                            worddrop=args.worddrop,
                            addition=args.additional_negs)

    global_step, step, epoch = 0, 0, 0
    tr_stat = Statistics()
    logger.info("start training")
    model.train()
    while global_step <= args.total_train_steps:
        for batch in train_data:
            batch = move_to_device(batch, device)
            loss, acc, bsz = model(batch['src_tokens'], batch['tgt_tokens'],
                                   args.label_smoothing)
            tr_stat.update({
                'loss': loss.item() * bsz,
                'nsamples': bsz,
                'acc': acc * bsz
            })
            tr_stat.step()
            loss.backward()

            step += 1
            if not (step % args.gradient_accumulation_steps ==
                    -1 % args.gradient_accumulation_steps):
                continue
            if args.world_size > 1:
                average_gradients(model)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_schedule.step()
            optimizer.zero_grad()
            global_step += 1

            if args.world_size == 1 or (dist.get_rank() == 0):
                if global_step % args.print_every == -1 % args.print_every:
                    logger.info("epoch %d, step %d, loss %.3f, acc %.3f",
                                epoch, global_step,
                                tr_stat['loss'] / tr_stat['nsamples'],
                                tr_stat['acc'] / tr_stat['nsamples'])
                    tr_stat = Statistics()
                if global_step > args.warmup_steps and \
                        global_step % args.eval_every == -1 % args.eval_every:
                    dev_data = DataLoader(vocabs, args.dev_data,
                                          args.dev_batch_size,
                                          addition=args.additional_negs)
                    acc = validate(model, dev_data, device)
                    logger.info("epoch %d, step %d, dev, dev acc %.2f", epoch,
                                global_step, acc)
                    save_path = '%s/epoch%d_batch%d_acc%.2f' % (
                        args.ckpt, epoch, global_step, acc)
                    model.save(args, save_path)
                    model.train()
            if global_step > args.total_train_steps:
                break
        epoch += 1
    logger.info('rank %d, finish training after %d steps', local_rank,
                global_step)
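# A quick check of the gradient-accumulation test used above:
# `step % k == -1 % k` is the same as `step % k == k - 1`, i.e. it fires on
# every k-th batch, because -1 % k evaluates to k - 1 in Python.
k = 4
for step in range(1, 13):
    if step % k == -1 % k:
        print("optimizer update at step", step)  # steps 3, 7, 11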
def main(args, local_rank):
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    vocabs = dict()
    vocabs['src'] = Vocab(args.src_vocab, 0, [BOS, EOS])
    vocabs['tgt'] = Vocab(args.tgt_vocab, 0, [BOS, EOS])

    if args.world_size == 1 or (dist.get_rank() == 0):
        logger.info(args)
        for name in vocabs:
            logger.info("vocab %s, size %d, coverage %.3f", name,
                        vocabs[name].size, vocabs[name].coverage)

    set_seed(19940117)
    #device = torch.device('cpu')
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)

    if args.arch == 'vanilla':
        model = Generator(vocabs, args.embed_dim, args.ff_embed_dim,
                          args.num_heads, args.dropout, args.enc_layers,
                          args.dec_layers, args.label_smoothing)
    elif args.arch == 'mem':
        model = MemGenerator(vocabs, args.embed_dim, args.ff_embed_dim,
                             args.num_heads, args.dropout, args.mem_dropout,
                             args.enc_layers, args.dec_layers,
                             args.mem_enc_layers, args.label_smoothing,
                             args.use_mem_score)
    elif args.arch == 'rg':
        logger.info("start building model")
        logger.info("building retriever")
        retriever = Retriever.from_pretrained(
            args.num_retriever_heads, vocabs, args.retriever, args.nprobe,
            args.topk, local_rank,
            use_response_encoder=(args.rebuild_every > 0))
        logger.info("building retriever + generator")
        model = RetrieverGenerator(vocabs, retriever, args.share_encoder,
                                   args.embed_dim, args.ff_embed_dim,
                                   args.num_heads, args.dropout,
                                   args.mem_dropout, args.enc_layers,
                                   args.dec_layers, args.mem_enc_layers,
                                   args.label_smoothing)

    # always initialize the step counter, so it is defined on resume as well;
    # restore it from the checkpoint here if it was saved
    global_step = 0
    if args.resume_ckpt:
        model.load_state_dict(torch.load(args.resume_ckpt)['model'])

    if args.world_size > 1:
        set_seed(19940117 + dist.get_rank())

    model = model.to(device)

    retriever_params = [
        v for k, v in model.named_parameters() if k.startswith('retriever.')
    ]
    other_params = [
        v for k, v in model.named_parameters()
        if not k.startswith('retriever.')
    ]
    # the retriever gets a 10x smaller learning rate than the rest of the model
    optimizer = Adam([{
        'params': retriever_params,
        'lr': args.embed_dim**-0.5 * 0.1
    }, {
        'params': other_params,
        'lr': args.embed_dim**-0.5
    }], betas=(0.9, 0.98), eps=1e-9)
    lr_schedule = get_inverse_sqrt_schedule_with_warmup(
        optimizer, args.warmup_steps, args.total_train_steps)
    train_data = DataLoader(vocabs, args.train_data,
                            args.per_gpu_train_batch_size, for_train=True,
                            rank=local_rank, num_replica=args.world_size)

    model.eval()
    #dev_data = DataLoader(vocabs, cur_dev_data, args.dev_batch_size, for_train=False)
    #bleu = validate(device, model, dev_data, beam_size=5, alpha=0.6, max_time_step=10)

    step, epoch = 0, 0
    tr_stat = Statistics()
    logger.info("start training")
    model.train()
    best_dev_bleu = 0.
    while global_step <= args.total_train_steps:
        for batch in train_data:
            #step_start = time.time()
            batch = move_to_device(batch, device)
            if args.arch == 'rg':
                loss, acc = model(
                    batch,
                    update_mem_bias=(global_step > args.update_retriever_after))
            else:
                loss, acc = model(batch)
            tr_stat.update({
                'loss': loss.item() * batch['tgt_num_tokens'],
                'tokens': batch['tgt_num_tokens'],
                'acc': acc
            })
            tr_stat.step()
            loss.backward()
            #step_cost = time.time() - step_start
            #print ('step_cost', step_cost)

            step += 1
            if not (step % args.gradient_accumulation_steps ==
                    -1 % args.gradient_accumulation_steps):
                continue
            if args.world_size > 1:
                average_gradients(model)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_schedule.step()
            optimizer.zero_grad()
            global_step += 1

            if args.world_size == 1 or (dist.get_rank() == 0):
                if global_step % args.print_every == -1 % args.print_every:
                    logger.info("epoch %d, step %d, loss %.3f, acc %.3f",
                                epoch, global_step,
                                tr_stat['loss'] / tr_stat['tokens'],
                                tr_stat['acc'] / tr_stat['tokens'])
                    tr_stat = Statistics()
                if global_step % args.eval_every == -1 % args.eval_every:
                    model.eval()
                    max_time_step = 256 if global_step > 2 * args.warmup_steps else 5
                    bleus = []
                    for cur_dev_data in args.dev_data:
                        dev_data = DataLoader(vocabs, cur_dev_data,
                                              args.dev_batch_size,
                                              for_train=False)
                        bleu = validate(device, model, dev_data, beam_size=5,
                                        alpha=0.6, max_time_step=max_time_step)
                        bleus.append(bleu)
                    bleu = sum(bleus) / len(bleus)
                    logger.info("epoch %d, step %d, dev bleu %.2f", epoch,
                                global_step, bleu)
                    if bleu > best_dev_bleu:
                        testbleus = []
                        for cur_test_data in args.test_data:
                            test_data = DataLoader(vocabs, cur_test_data,
                                                   args.dev_batch_size,
                                                   for_train=False)
                            testbleu = validate(device, model, test_data,
                                                beam_size=5, alpha=0.6,
                                                max_time_step=max_time_step)
                            testbleus.append(testbleu)
                        testbleu = sum(testbleus) / len(testbleus)
                        logger.info("epoch %d, step %d, test bleu %.2f",
                                    epoch, global_step, testbleu)
                        torch.save({
                            'args': args,
                            'model': model.state_dict()
                        }, '%s/best.pt' % (args.ckpt, ))
                        if not args.only_save_best:
                            torch.save(
                                {
                                    'args': args,
                                    'model': model.state_dict()
                                },
                                '%s/epoch%d_batch%d_devbleu%.2f_testbleu%.2f'
                                % (args.ckpt, epoch, global_step, bleu,
                                   testbleu))
                        best_dev_bleu = bleu
                    model.train()
            if args.rebuild_every > 0 and (
                    global_step % args.rebuild_every == -1 % args.rebuild_every):
                model.retriever.drop_index()
                torch.cuda.empty_cache()
                next_index_dir = '%s/batch%d' % (args.ckpt, global_step)
                if args.world_size == 1 or (dist.get_rank() == 0):
                    model.retriever.rebuild_index(next_index_dir)
                    dist.barrier()
                else:
                    dist.barrier()
                model.retriever.update_index(next_index_dir, args.nprobe)
            if global_step > args.total_train_steps:
                break
        epoch += 1
    logger.info('rank %d, finish training after %d steps', local_rank,
                global_step)
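# Hypothetical sketch of get_inverse_sqrt_schedule_with_warmup, a project
# helper used above. Assumes linear warmup followed by inverse-square-root
# decay (the standard transformer schedule) with warmup_steps > 0; the
# project's real implementation may differ. total_steps is accepted for
# signature compatibility but unused in this sketch.
from torch.optim.lr_scheduler import LambdaLR

def get_inverse_sqrt_schedule_with_warmup(optimizer, warmup_steps, total_steps):
    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)  # linear warmup from 0
        return (warmup_steps / step) ** 0.5     # 1/sqrt(step) decay
    return LambdaLR(optimizer, lr_lambda)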
def run(hparams, model, train_dataloader, valid_dataloader, device,
        out_dir='checkpoints'):
    learning_rate = hparams['learning_rate']
    accumulate_step = hparams['accumulate_step']
    lr_schedule = hparams['lr_schedule']
    warmup_steps = hparams['warmup_steps']
    warmup_proportion = hparams['warmup_proportion']
    n_embd = hparams['n_embd']
    num_optim_steps = hparams['num_optim_steps']
    train_batch_size = hparams['train_batch_size']
    valid_step = hparams['valid_step']
    no_token_id = hparams['no_token_id']

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    total_params = sum([np.prod(p.size()) for p in model_parameters])
    logger.info('Number of parameters = {}'.format(total_params))

    # no weight decay for biases and LayerNorm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'ln']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = Adam(optimizer_grouped_parameters, learning_rate,
                     max_grad_norm=1.0)

    step = 0
    global_step = 0
    epoch = 0
    while True:
        model.train()
        (tr_loss, tr_ppl, mean_ppl, nb_tr_examples,
         nb_tr_steps) = 0.0, 0.0, 0.0, 0, 0
        n_token_real, n_token_total = 0, 0
        pbar = tqdm.tqdm(enumerate(train_dataloader),
                         total=len(train_dataloader))
        for i, batch in pbar:
            batch = tuple(t.cuda() for t in batch)
            input_ids, position_ids, token_type_ids, label_ids, *_ = batch
            if no_token_id:
                token_type_ids = None
            loss, ppl = model(input_ids, position_ids, token_type_ids,
                              label_ids)
            loss = loss.mean()
            # scale the loss so accumulated gradients average over the
            # effective batch
            loss = loss / (train_batch_size / input_ids.shape[0])
            loss.backward()

            nb_tr_steps += 1
            tr_loss += float(loss.sum().item()) * (train_batch_size /
                                                   input_ids.shape[0])
            if ppl.sum().item() < 1000000:
                tr_ppl += ppl.sum().item()
            else:
                # guard against perplexity overflow on degenerate batches
                tr_ppl += mean_ppl
            mean_loss = tr_loss / nb_tr_steps
            mean_ppl = tr_ppl / nb_tr_steps
            n_token_total += input_ids.shape[0] * input_ids.shape[1]
            n_token_real += (input_ids != 0).sum().item()

            # gradient update
            step += 1
            if step % accumulate_step == 0:
                set_lr(optimizer, global_step, lr_schedule, learning_rate,
                       warmup_steps, warmup_proportion, n_embd,
                       num_optim_steps)
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                print('epoch: {}, global_step: {}, step: {}, mean_loss: {}, '
                      'mean_ppl: {}'.format(epoch + 1, global_step + 1,
                                            step + 1, mean_loss, mean_ppl),
                      file=train_logger)
                if global_step % valid_step == 0:
                    print('Saving model...')
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'epoch': epoch,
                            'hparams': hparams,
                        },
                        os.path.join(out_dir,
                                     f'GPT2-pretrain-step-{global_step}.pkl'))
                    eval_loss, eval_ppl = valid(model, valid_dataloader,
                                                epoch, device)
                    print('{},{},{},{},{}'.format(epoch + 1, global_step + 1,
                                                  step + 1, eval_loss,
                                                  eval_ppl),
                          file=valid_logger)
                    logger.info('current learning rate: ' +
                                str(optimizer.param_groups[0]['lr']))
                    model.train()
                if global_step >= num_optim_steps:
                    break
            if (step + 1) % CACHE_EMPTY_STEP == 0:
                torch.cuda.empty_cache()
        if global_step >= num_optim_steps:
            break
        epoch += 1
    train_logger.close()
    valid_logger.close()
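# Hypothetical sketch of set_lr, a project helper used above. Assumes two
# schedules selected by name: 'noam' (inverse-sqrt with warmup, scaled by the
# embedding size, assuming warmup_steps > 0) and linear warmup-then-decay.
# The real helper may differ.
def set_lr(optimizer, global_step, lr_schedule, base_lr, warmup_steps,
           warmup_proportion, n_embd, total_steps):
    if lr_schedule == 'noam':
        lr = base_lr * n_embd ** -0.5 * min(
            (global_step + 1) ** -0.5,
            (global_step + 1) * warmup_steps ** -1.5)
    else:
        warmup = warmup_proportion * total_steps
        if global_step < warmup:
            lr = base_lr * (global_step + 1) / warmup
        else:
            lr = base_lr * max(0.0, (total_steps - global_step) /
                               (total_steps - warmup))
    for group in optimizer.param_groups:
        group['lr'] = lr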
# Train a Bayesian neural network by resampling weights each step; written
# against an old PyTorch API (Variable-style .data access). bnn, lossf,
# model, varify, X_, and Y_ are assumed to be defined elsewhere.
import numpy as np

L = 32
adam = Adam(bnn.params.parameters(), 0.001)
T = 2500
x1, x2 = -6, 6
y1, y2 = -100, 100
for i in range(T):
    adam.zero_grad()
    bnn.params.sample()   # draw a fresh weight sample before each step
    loss = lossf(X_, Y_)
    loss.backward()
    adam.step()
    if i % 100 == 0:
        print(i, loss.data.numpy()[0])

# posterior predictive: average 32 forward passes with resampled weights
N = 500
xx = varify(np.linspace(x1, x2, N).astype('float32').reshape(N, 1))
yys = list()
for i in range(32):
    bnn.params.sample()
    yys.append(model(xx).data.numpy())
xx = xx.data.numpy()[:, 0]
yys = np.concatenate(yys, axis=1)
yy = yys.mean(1)   # predictive mean
ss = yys.std(1)    # predictive std (epistemic uncertainty)
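# The snippet computes the predictive mean `yy` and std `ss` but never plots
# them; a hypothetical visualization of the uncertainty band (matplotlib and
# the 2-std band width are assumptions, reusing the x1/x2/y1/y2 bounds above).
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
plt.plot(xx, yy, label='predictive mean')
plt.fill_between(xx, yy - 2 * ss, yy + 2 * ss, alpha=0.3,
                 label='mean ± 2 std')
plt.xlim(x1, x2)
plt.ylim(y1, y2)
plt.legend()
plt.show()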