def get_optimizer(model, args):
    """Set up the optimizer."""
    # Build parameter groups (weight decay and non-decay).
    while isinstance(model, (DDP, FP16_Module)):
        model = model.module
    layers = model.model.bert.encoder.layer
    pooler = model.model.bert.pooler
    lmheads = model.model.cls.predictions
    nspheads = model.model.cls.seq_relationship
    embeddings = model.model.bert.embeddings
    param_groups = []
    param_groups += list(get_params_for_weight_decay_optimization(layers))
    param_groups += list(get_params_for_weight_decay_optimization(pooler))
    param_groups += list(get_params_for_weight_decay_optimization(nspheads))
    param_groups += list(get_params_for_weight_decay_optimization(embeddings))
    param_groups += list(
        get_params_for_weight_decay_optimization(lmheads.transform))
    param_groups[1]['params'].append(lmheads.bias)

    # Use Adam.
    optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay)

    # Wrap into fp16 optimizer.
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   dynamic_loss_args={
                                       'scale_window': args.loss_scale_window,
                                       'min_scale': args.min_scale,
                                       'delayed_shift': args.hysteresis
                                   })

    return optimizer
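# `get_params_for_weight_decay_optimization` above is assumed to return a
# (decay, no-decay) pair of parameter groups, which is why param_groups[1]
# (the first no-decay group) is the one the LM-head bias is appended to.
# Below is a minimal sketch of such a helper, assuming a Megatron-style policy
# where biases and LayerNorm parameters are exempt from weight decay; the
# actual implementation may differ.
import torch


def get_params_for_weight_decay_optimization_sketch(module):
    decay = {'params': []}
    no_decay = {'params': [], 'weight_decay': 0.0}
    for m in module.modules():
        for name, p in m._parameters.items():
            if p is None:
                continue
            # Biases and LayerNorm weights typically get no weight decay.
            if name == 'bias' or isinstance(m, torch.nn.LayerNorm):
                no_decay['params'].append(p)
            else:
                decay['params'].append(p)
    return decay, no_decay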
def main(n_aggregation, dim_feature, n_epochs, batch_size, eps):
    W = np.random.normal(0, 0.4, [dim_feature, dim_feature])
    A = np.random.normal(0, 0.4, dim_feature)
    b = np.array([0.])
    model = GraphNeuralNetwork(W, A, b, n_aggregation=n_aggregation)
    optimizer = Adam(model)

    dataset = util.get_train_data('../../datasets')
    train_data, valid_data = util.random_split(dataset, train_ratio=0.5)
    print('train_size: %d, valid_size: %d' % (len(train_data), len(valid_data)))

    for epoch in range(n_epochs):
        train_loss = util.AverageMeter()
        train_acc = util.AverageMeter()
        for graphs, labels in util.get_shuffled_batches(train_data, batch_size):
            grads_flat = 0
            for graph, label in zip(graphs, labels):
                x = np.zeros([len(graph), dim_feature])
                x[:, 0] = 1
                grads_flat += calc_grads(model, graph, x, label,
                                         bce_with_logit, eps) / batch_size
                outputs = model(graph, x)
                train_loss.update(bce_with_logit(outputs, label), 1)
                train_acc.update((sigmoid(outputs) > 0.5) == label, 1)
            optimizer.update(grads_flat)

        valid_loss, valid_acc = test(model, valid_data, dim_feature)
        print('epoch: %d, train_loss: %f, train_acc: %f, valid_loss: %f, valid_acc: %f'
              % (epoch, train_loss.avg, train_acc.avg, valid_loss, valid_acc))
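# `calc_grads` is not shown above; given the `eps` argument and the flat
# gradient vector fed to optimizer.update(), it presumably estimates gradients
# numerically. A minimal finite-difference sketch, assuming hypothetical
# get_flat_params()/set_flat_params() helpers on the model (the real code may
# expose parameters differently):
import numpy as np


def calc_grads_sketch(model, graph, x, label, loss_fn, eps):
    theta = model.get_flat_params()              # flat copy of all parameters
    base_loss = loss_fn(model(graph, x), label)
    grads = np.zeros_like(theta)
    for i in range(len(theta)):
        perturbed = theta.copy()
        perturbed[i] += eps                      # forward difference on one coordinate
        model.set_flat_params(perturbed)
        grads[i] = (loss_fn(model(graph, x), label) - base_loss) / eps
    model.set_flat_params(theta)                 # restore original parameters
    return grads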
def __init__(self, args):
    super(PPO, self).__init__()
    # saved path
    self.saved_path = args.saved_path
    if not os.path.exists(self.saved_path):
        os.makedirs(self.saved_path)
    # neural networks
    self.actor_critic = ActorCritic(action_size=args.action_size,
                                    hidden_size=args.hidden_size,
                                    extra_hidden=args.extra_hidden,
                                    enlargement=args.enlargement,
                                    recurrent=args.recurrent,
                                    device=args.device).to(args.device)
    # args
    self.rank = args.rank
    self.device = args.device
    self.num_steps = args.num_steps
    self.num_envs = args.num_envs
    self.num_rollouts = args.num_rollouts
    self.render = args.render
    self.action_size = args.action_size
    self.update_epochs = args.update_epochs
    self.batch_size = args.batch_size
    self.clip_range = args.clip_range
    self.max_grad_norm = args.max_grad_norm
    self.gamma = args.gamma
    self.lamda = args.lamda
    self.coeff_ent = args.coeff_ent
    # optimizer
    self.optimizer = Adam(self.actor_critic.parameters(), args.learning_rate)
    # batch
    self.sample_envs = self.batch_size // self.num_steps
def train():
    graphs, labels = load_data("datasets/train")
    train_inputs, train_targets, val_inputs, val_targets = utils.split_train_val(
        graphs, labels, val_rate=0.3)

    model = GNNModel(8)
    loss_func = BinaryCrossEntropy()
    optimizer = Adam()
    batch_generator = utils.BatchGenerator(batch_size=32)

    min_loss = float("inf")
    for epoch in range(50):
        print(f"Epoch {epoch + 1}")

        train_losses = []
        for inputs, targets in batch_generator.generator(train_inputs, train_targets):
            train_loss, loss_grad = loss_func(model, inputs, targets, is_grad=True)
            optimizer.update(model, loss_grad)
            train_losses.append(train_loss)
        train_mean_loss = np.mean(train_losses)
        pred = np.array([model.predict(input_) for input_ in train_inputs]).squeeze()
        train_accuracy = accuracy(pred, train_targets)

        val_losses = []
        for inputs, targets in batch_generator.generator(val_inputs, val_targets):
            val_loss, _ = loss_func(model, inputs, targets, is_grad=False)
            val_losses.append(val_loss)
        val_mean_loss = np.mean(val_losses)
        pred = np.array([model.predict(input_) for input_ in val_inputs]).squeeze()
        val_accuracy = accuracy(pred, val_targets)

        # Track the best validation loss seen so far.
        if val_mean_loss < min_loss:
            min_loss = val_mean_loss

        print(f"Train loss: {train_mean_loss}\tTrain accuracy: {train_accuracy}")
        print(f"Validation loss: {val_mean_loss}\tValidation accuracy: {val_accuracy}")
        print("")
def main(n_aggregation, dim_feature, n_epochs, batch_size, eps, outputfile):
    W = np.random.normal(0, 0.4, [dim_feature, dim_feature])
    A = np.random.normal(0, 0.4, dim_feature)
    b = np.array([0.])
    model = GraphNeuralNetwork(W, A, b, n_aggregation=n_aggregation)
    optimizer = Adam(model)

    # Training
    train_data = util.get_train_data('../../datasets')
    print('train_size: %d' % len(train_data))
    for epoch in range(n_epochs):
        train_loss = util.AverageMeter()
        train_acc = util.AverageMeter()
        for graphs, labels in util.get_shuffled_batches(train_data, batch_size):
            grads_flat = 0
            for graph, label in zip(graphs, labels):
                x = np.zeros([len(graph), dim_feature])
                x[:, 0] = 1
                grads_flat += calc_grads(model, graph, x, label,
                                         bce_with_logit, eps) / batch_size
                outputs = model(graph, x)
                train_loss.update(bce_with_logit(outputs, label), 1)
                train_acc.update((sigmoid(outputs) > 0.5) == label, 1)
            optimizer.update(grads_flat)
        print('epoch: %d, train_loss: %f, train_acc: %f' %
              (epoch, train_loss.avg, train_acc.avg))

    # Prediction
    test_data = util.get_test_data('../../datasets')
    with open(outputfile, 'w') as o:
        for graph in test_data:
            x = np.zeros([len(graph), dim_feature])
            x[:, 0] = 1
            logit = model(graph, x)
            pred = sigmoid(logit) > 0.5
            o.write(str(int(pred[0])) + '\n')
def train(seq, dataloader, epochs=10):
    criterion = CrossEntropyLoss(seq)
    optimizer = Adam(seq)
    for epoch in range(epochs):
        epoch_loss = 0.0
        epoch_accuracy = 0.0
        n_batch = 0
        for batch, labels in dataloader:
            n_batch += 1
            outputs = seq(batch)
            loss = criterion(outputs, labels)
            accuracy = accuracy_score(outputs.argmax(axis=1), labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss
            epoch_accuracy += accuracy
        print("Epoch {}/{} - loss: {:.5f} accuracy: {:.5f}".format(
            epoch + 1, epochs, epoch_loss / n_batch, epoch_accuracy / n_batch))
    print("Finished training!")
def main():
    args = parse_args()
    with open(args.input, 'r') as fp:
        data_loader = DataLoader(fp.read(), batch_size=args.seq_length)
    rnn = RNN()
    params = init_params(data_loader.vocab_size, hidden_size=args.hidden_size)
    optimizer = Adam(params, lr=args.lr)

    it = 0
    for epoch in range(args.num_epochs):
        hidden_state = np.zeros((1, args.hidden_size))
        for x, y in data_loader:
            if it % args.sample_every == 0:
                one_hot = sample(rnn, hidden_state, x[0], params, args.sample_size)
                generated_text = data_loader.decode(one_hot)
                print(generated_text)
            loss, hidden_state, dparams = rnn_training_step(
                rnn, hidden_state, x, y, params)
            if it % args.print_every == 0:
                print('iteration: {}, loss: {}'.format(it, loss))
            optimizer.step(dparams)
            it += 1
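# The plain-NumPy `Adam` used above (built from a dict of parameter arrays and
# stepped with explicit gradients) is not shown. A minimal sketch of such an
# optimizer, assuming `params` and `dparams` are dicts of same-shaped NumPy
# arrays; the real class may differ in naming and details.
import numpy as np


class AdamSketch:
    """Adam that updates a dict of parameter arrays in place."""

    def __init__(self, params, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
        self.params, self.lr = params, lr
        self.beta1, self.beta2, self.eps = beta1, beta2, eps
        self.m = {k: np.zeros_like(v) for k, v in params.items()}  # 1st moment
        self.v = {k: np.zeros_like(v) for k, v in params.items()}  # 2nd moment
        self.t = 0

    def step(self, dparams):
        self.t += 1
        for k, g in dparams.items():
            self.m[k] = self.beta1 * self.m[k] + (1 - self.beta1) * g
            self.v[k] = self.beta2 * self.v[k] + (1 - self.beta2) * g ** 2
            m_hat = self.m[k] / (1 - self.beta1 ** self.t)  # bias correction
            v_hat = self.v[k] / (1 - self.beta2 ** self.t)
            self.params[k] -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)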
    Reference: http://glowingpython.blogspot.jp/2012/02/convolution-with-numpy.html
    """
    window_len = 11
    s = np.r_[x[window_len - 1:0:-1], x, x[-1:-window_len:-1]]
    w = np.kaiser(window_len, 2)
    y = np.convolve(w / w.sum(), s, mode='valid')
    return y[5:len(y) - 5]


# Define the optimizers to compare
optimizers = OrderedDict()
optimizers['SGD'] = SGD()
optimizers['momentum'] = momentum()
optimizers['adagrad'] = adagrad()
optimizers['Adam'] = Adam()

# Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# Set hyperparameters and initialize one network per optimizer
train_size = x_train.shape[0]
batch_size = 100
iter_num = 1000
train_loss = {}
networks = {}
for key in optimizers.keys():
    networks[key] = backnet(input_size=784, hidden_size=50, output_size=10)
def main(args, local_rank):
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    vocabs = dict()
    vocabs['src'] = Vocab(args.src_vocab, 0, [BOS, EOS])
    vocabs['tgt'] = Vocab(args.tgt_vocab, 0, [BOS, EOS])

    if args.world_size == 1 or (dist.get_rank() == 0):
        logger.info(args)
        for name in vocabs:
            logger.info("vocab %s, size %d, coverage %.3f", name,
                        vocabs[name].size, vocabs[name].coverage)

    set_seed(19940117)
    #device = torch.device('cpu')
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)

    if args.resume_ckpt:
        model = MatchingModel.from_pretrained(vocabs, args.resume_ckpt)
    else:
        model = MatchingModel.from_params(vocabs, args.layers, args.embed_dim,
                                          args.ff_embed_dim, args.num_heads,
                                          args.dropout, args.output_dim,
                                          args.bow)

    if args.world_size > 1:
        set_seed(19940117 + dist.get_rank())

    model = model.to(device)
    if args.resume_ckpt:
        dev_data = DataLoader(vocabs, args.dev_data, args.dev_batch_size,
                              addition=args.additional_negs)
        acc = validate(model, dev_data, device)
        logger.info("initialize from %s, initial acc %.2f", args.resume_ckpt, acc)

    optimizer = Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
    lr_schedule = get_linear_schedule_with_warmup(optimizer, args.warmup_steps,
                                                  args.total_train_steps)
    train_data = DataLoader(vocabs, args.train_data,
                            args.per_gpu_train_batch_size,
                            worddrop=args.worddrop,
                            addition=args.additional_negs)

    global_step, step, epoch = 0, 0, 0
    tr_stat = Statistics()
    logger.info("start training")
    model.train()
    while global_step <= args.total_train_steps:
        for batch in train_data:
            batch = move_to_device(batch, device)
            loss, acc, bsz = model(batch['src_tokens'], batch['tgt_tokens'],
                                   args.label_smoothing)
            tr_stat.update({
                'loss': loss.item() * bsz,
                'nsamples': bsz,
                'acc': acc * bsz
            })
            tr_stat.step()
            loss.backward()

            step += 1
            if not (step % args.gradient_accumulation_steps ==
                    -1 % args.gradient_accumulation_steps):
                continue
            if args.world_size > 1:
                average_gradients(model)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_schedule.step()
            optimizer.zero_grad()
            global_step += 1

            if args.world_size == 1 or (dist.get_rank() == 0):
                if global_step % args.print_every == -1 % args.print_every:
                    logger.info("epoch %d, step %d, loss %.3f, acc %.3f", epoch,
                                global_step,
                                tr_stat['loss'] / tr_stat['nsamples'],
                                tr_stat['acc'] / tr_stat['nsamples'])
                    tr_stat = Statistics()
                if global_step > args.warmup_steps and \
                        global_step % args.eval_every == -1 % args.eval_every:
                    dev_data = DataLoader(vocabs, args.dev_data,
                                          args.dev_batch_size,
                                          addition=args.additional_negs)
                    acc = validate(model, dev_data, device)
                    logger.info("epoch %d, step %d, dev, dev acc %.2f", epoch,
                                global_step, acc)
                    save_path = '%s/epoch%d_batch%d_acc%.2f' % (
                        args.ckpt, epoch, global_step, acc)
                    model.save(args, save_path)
                    model.train()
            if global_step > args.total_train_steps:
                break
        epoch += 1
    logger.info('rank %d, finish training after %d steps', local_rank, global_step)
def main(args, local_rank):
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    vocabs = dict()
    vocabs['src'] = Vocab(args.src_vocab, 0, [BOS, EOS])
    vocabs['tgt'] = Vocab(args.tgt_vocab, 0, [BOS, EOS])

    if args.world_size == 1 or (dist.get_rank() == 0):
        logger.info(args)
        for name in vocabs:
            logger.info("vocab %s, size %d, coverage %.3f", name,
                        vocabs[name].size, vocabs[name].coverage)

    set_seed(19940117)
    #device = torch.device('cpu')
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)

    if args.arch == 'vanilla':
        model = Generator(vocabs, args.embed_dim, args.ff_embed_dim,
                          args.num_heads, args.dropout, args.enc_layers,
                          args.dec_layers, args.label_smoothing)
    elif args.arch == 'mem':
        model = MemGenerator(vocabs, args.embed_dim, args.ff_embed_dim,
                             args.num_heads, args.dropout, args.mem_dropout,
                             args.enc_layers, args.dec_layers,
                             args.mem_enc_layers, args.label_smoothing,
                             args.use_mem_score)
    elif args.arch == 'rg':
        logger.info("start building model")
        logger.info("building retriever")
        retriever = Retriever.from_pretrained(
            args.num_retriever_heads, vocabs, args.retriever, args.nprobe,
            args.topk, local_rank,
            use_response_encoder=(args.rebuild_every > 0))
        logger.info("building retriever + generator")
        model = RetrieverGenerator(vocabs, retriever, args.share_encoder,
                                   args.embed_dim, args.ff_embed_dim,
                                   args.num_heads, args.dropout,
                                   args.mem_dropout, args.enc_layers,
                                   args.dec_layers, args.mem_enc_layers,
                                   args.label_smoothing)

    # Initialize global_step before the (optional) resume so it is always
    # defined when the training loop below starts.
    global_step = 0
    if args.resume_ckpt:
        model.load_state_dict(torch.load(args.resume_ckpt)['model'])

    if args.world_size > 1:
        set_seed(19940117 + dist.get_rank())

    model = model.to(device)

    # Retriever parameters get a 10x smaller learning rate than the rest.
    retriever_params = [
        v for k, v in model.named_parameters() if k.startswith('retriever.')
    ]
    other_params = [
        v for k, v in model.named_parameters() if not k.startswith('retriever.')
    ]
    optimizer = Adam([{
        'params': retriever_params,
        'lr': args.embed_dim**-0.5 * 0.1
    }, {
        'params': other_params,
        'lr': args.embed_dim**-0.5
    }],
                     betas=(0.9, 0.98),
                     eps=1e-9)
    lr_schedule = get_inverse_sqrt_schedule_with_warmup(
        optimizer, args.warmup_steps, args.total_train_steps)
    train_data = DataLoader(vocabs, args.train_data,
                            args.per_gpu_train_batch_size, for_train=True,
                            rank=local_rank, num_replica=args.world_size)

    model.eval()
    #dev_data = DataLoader(vocabs, cur_dev_data, args.dev_batch_size, for_train=False)
    #bleu = validate(device, model, dev_data, beam_size=5, alpha=0.6, max_time_step=10)

    step, epoch = 0, 0
    tr_stat = Statistics()
    logger.info("start training")
    model.train()

    best_dev_bleu = 0.
    while global_step <= args.total_train_steps:
        for batch in train_data:
            #step_start = time.time()
            batch = move_to_device(batch, device)
            if args.arch == 'rg':
                loss, acc = model(
                    batch,
                    update_mem_bias=(global_step > args.update_retriever_after))
            else:
                loss, acc = model(batch)
            tr_stat.update({
                'loss': loss.item() * batch['tgt_num_tokens'],
                'tokens': batch['tgt_num_tokens'],
                'acc': acc
            })
            tr_stat.step()
            loss.backward()
            #step_cost = time.time() - step_start
            #print ('step_cost', step_cost)

            step += 1
            if not (step % args.gradient_accumulation_steps ==
                    -1 % args.gradient_accumulation_steps):
                continue
            if args.world_size > 1:
                average_gradients(model)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_schedule.step()
            optimizer.zero_grad()
            global_step += 1

            if args.world_size == 1 or (dist.get_rank() == 0):
                if global_step % args.print_every == -1 % args.print_every:
                    logger.info("epoch %d, step %d, loss %.3f, acc %.3f", epoch,
                                global_step,
                                tr_stat['loss'] / tr_stat['tokens'],
                                tr_stat['acc'] / tr_stat['tokens'])
                    tr_stat = Statistics()
                if global_step % args.eval_every == -1 % args.eval_every:
                    model.eval()
                    max_time_step = 256 if global_step > 2 * args.warmup_steps else 5
                    bleus = []
                    for cur_dev_data in args.dev_data:
                        dev_data = DataLoader(vocabs, cur_dev_data,
                                              args.dev_batch_size,
                                              for_train=False)
                        bleu = validate(device, model, dev_data, beam_size=5,
                                        alpha=0.6, max_time_step=max_time_step)
                        bleus.append(bleu)
                    bleu = sum(bleus) / len(bleus)
                    logger.info("epoch %d, step %d, dev bleu %.2f", epoch,
                                global_step, bleu)
                    if bleu > best_dev_bleu:
                        testbleus = []
                        for cur_test_data in args.test_data:
                            test_data = DataLoader(vocabs, cur_test_data,
                                                   args.dev_batch_size,
                                                   for_train=False)
                            testbleu = validate(device, model, test_data,
                                                beam_size=5, alpha=0.6,
                                                max_time_step=max_time_step)
                            testbleus.append(testbleu)
                        testbleu = sum(testbleus) / len(testbleus)
                        logger.info("epoch %d, step %d, test bleu %.2f", epoch,
                                    global_step, testbleu)
                        torch.save({
                            'args': args,
                            'model': model.state_dict()
                        }, '%s/best.pt' % (args.ckpt, ))
                        if not args.only_save_best:
                            torch.save(
                                {
                                    'args': args,
                                    'model': model.state_dict()
                                },
                                '%s/epoch%d_batch%d_devbleu%.2f_testbleu%.2f' %
                                (args.ckpt, epoch, global_step, bleu, testbleu))
                        best_dev_bleu = bleu
                    model.train()

            if args.rebuild_every > 0 and (global_step % args.rebuild_every ==
                                           -1 % args.rebuild_every):
                model.retriever.drop_index()
                torch.cuda.empty_cache()
                next_index_dir = '%s/batch%d' % (args.ckpt, global_step)
                if args.world_size == 1 or (dist.get_rank() == 0):
                    model.retriever.rebuild_index(next_index_dir)
                    dist.barrier()
                else:
                    dist.barrier()
                model.retriever.update_index(next_index_dir, args.nprobe)

            if global_step > args.total_train_steps:
                break
        epoch += 1
    logger.info('rank %d, finish training after %d steps', local_rank, global_step)
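# `get_inverse_sqrt_schedule_with_warmup` used above is not shown. One common
# formulation (linear warmup to the base lr, then decay proportional to
# 1/sqrt(step)) wrapped in a torch LambdaLR is sketched below; this is an
# assumption, not necessarily the exact schedule used in that code.
from torch.optim.lr_scheduler import LambdaLR


def inverse_sqrt_schedule_sketch(optimizer, warmup_steps, total_steps):
    # total_steps is kept only to mirror the call signature above; it is
    # unused in this sketch.
    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / max(1, warmup_steps)   # linear warmup
        return (warmup_steps / float(step)) ** 0.5      # inverse-sqrt decay

    return LambdaLR(optimizer, lr_lambda)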
# Initial position
init_pos = (-7.0, 2.0)
params = {}
params['x'] = init_pos[0]
params['y'] = init_pos[1]
grads = {}
grads['x'] = 0
grads['y'] = 0

# Ordered dict of the optimizers to compare
optimizers = OrderedDict()
optimizers['SGD'] = SGD(lr=0.95)
optimizers['momentum'] = momentum(lr=0.1)
optimizers['adagrad'] = adagrad(lr=1.5)
optimizers['Adam'] = Adam(lr=0.3)

idx = 1  # subplot index
for key in optimizers.keys():
    # Try each optimization method, starting from the same initial parameters.
    optimizer = optimizers[key]
    params['x'] = init_pos[0]
    params['y'] = init_pos[1]
    x_history = []
    y_history = []

    # Run the gradient-descent process and record the trajectory.
    for i in range(30):
        x_history.append(params['x'])
def foo(mod, op, d):
    if op[0] == "linear":
        xx = Linear(d)
    # rnncell, lstmcell, grucell
    elif (mod[0] in ["LSTMCell", "GRUCell"]) and (op[0] == "forward"):
        xx = RNNCell(d)
    elif op[0] in ["conv1d", "conv2d"]:
        xx = Conv(d)
    elif op[0] in Pointwise.ops:
        xx = Pointwise(d)
    elif op[0] in Convert.ops:
        xx = Convert(d)
    elif op[0] in ["__matmul__", "matmul"]:
        xx = Matmul(d)
    elif op[0] == "embedding":
        xx = Embedding(d)
    # reduction
    elif op[0] == "sum":
        xx = Sum(d)
    elif op[0] == "mean":
        xx = Mean(d)
    elif op[0] == "norm":
        xx = Norm(d)
    elif op[0] == "dropout":
        xx = Dropout(d)
    # index, slice, join, mutate
    elif op[0] == "cat":
        xx = Cat(d)
    elif op[0] == "reshape":
        xx = Reshape(d)
    elif op[0] == "masked_scatter_":
        xx = MaskedScatter(d)
    elif op[0] == "gather":
        xx = Gather(d)
    elif op[0] == "nonzero":
        xx = Nonzero(d)
    elif op[0] == "index_select":
        xx = IndexSelect(d)
    elif op[0] == "masked_select":
        xx = MaskedSelect(d)
    # blas
    elif op[0] in ["addmm", "addmm_"]:
        xx = Addmm(d)
    elif op[0] == "mm":
        xx = Mm(d)
    elif op[0] == "bmm":
        xx = Bmm(d)
    # softmax
    elif op[0] == "softmax":
        xx = Softmax(d)
    elif op[0] == "log_softmax":
        xx = LogSoftmax(d)
    # loss
    elif op[0] == "mse_loss":
        xx = MSELoss(d)
    # optimizers
    elif op[0] == "adam":
        xx = Adam(d)
    # normalization
    elif op[0] == "batch_norm":
        xx = BatchNorm(d)
    # random
    elif op[0] == "randperm":
        xx = RandPerm(d)
    # misc
    elif op[0] == "copy_":
        xx = Copy(d)
    elif op[0] == "clone":
        xx = Clone(d)
    elif op[0] == "contiguous":
        xx = Contiguous(d)
    elif op[0] == "any":
        xx = Any(d)
    elif op[0] in Activation.ops:
        xx = Activation(d)
    elif op[0] == "to":
        xx = Convert(d)
    else:
        xx = Foo(d)
    return xx
            scores, deprocess(targets, n_bits))
        # multinomial sampling needs to be processed to [-1,1] at generation
        generate_fn = partial(pixelcnn.generate_fn,
                              preprocess_fn=preprocess,
                              n_bits=args.n_bits)
        optimizer = RMSprop(model.parameters(), lr=args.lr, polyak=args.polyak)
        scheduler = None
    elif args.model == 'pixelcnnpp':
        from generative_models_toolbox.algos.pixelcnn import pixelcnnpp
        model = pixelcnnpp.PixelCNNpp(args.image_dims, args.n_channels,
                                      args.n_res_layers, args.n_logistic_mix,
                                      args.n_cond_classes).to(args.device)
        loss_fn = pixelcnnpp.loss_fn
        generate_fn = pixelcnnpp.generate_fn
        optimizer = Adam(model.parameters(), lr=args.lr, betas=(0.95, 0.9995),
                         polyak=args.polyak)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, args.lr_decay)
    elif args.model == 'pixelsnail':
        from generative_models_toolbox.algos.pixelcnn import pixelsnail, pixelcnnpp
        model = pixelsnail.PixelSNAIL(args.image_dims, args.n_channels,
                                      args.n_res_layers, args.attn_n_layers,
                                      args.attn_nh, args.attn_dq, args.attn_dv,
                                      args.attn_drop_rate, args.n_logistic_mix,
                                      args.n_cond_classes).to(args.device)
        loss_fn = pixelcnnpp.loss_fn
        generate_fn = pixelcnnpp.generate_fn
        optimizer = Adam(model.parameters(), lr=args.lr, betas=(0.95, 0.9995),
images = np.asarray(images)
img_viewer_examples(images, labels.tolist()[0], greyscale=True)

model = Model()
model.add(Dense(784, 90))
model.add(ReLU())
model.add(Dense(90, 45))
model.add(ReLU())
model.add(Dense(45, 10))
model.set_loss(CrossEntropyLoss())

optimizer = Adam(model.parameters(), learning_rate=0.01)
lr_schedular = StepLR(optimizer, step_size=1, gamma=0.1)

# weights path
path = "./checkpoints/Linear_MINST_weights.sav"
# model = load_weights(path)

epochs = 6
for epoch in range(epochs):
    i = 0
    for image, label in dataloader:
        if epoch == 5:
            model.graph()
        image = image / 255
        i = i + 1
def run(hparams, model, train_dataloader, valid_dataloader, device,
        out_dir='checkpoints'):
    learning_rate = hparams['learning_rate']
    accumulate_step = hparams['accumulate_step']
    lr_schedule = hparams['lr_schedule']
    warmup_steps = hparams['warmup_steps']
    warmup_proportion = hparams['warmup_proportion']
    n_embd = hparams['n_embd']
    num_optim_steps = hparams['num_optim_steps']
    train_batch_size = hparams['train_batch_size']
    valid_step = hparams['valid_step']
    no_token_id = hparams['no_token_id']

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    total_params = sum([np.prod(p.size()) for p in model_parameters])
    logger.info('Number of parameter = {}'.format(total_params))

    # Exclude biases and LayerNorm ('ln') parameters from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'ln']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = Adam(optimizer_grouped_parameters, learning_rate,
                     max_grad_norm=1.0)

    step = 0
    global_step = 0
    epoch = 0
    while True:
        model.train()
        (tr_loss, tr_ppl, mean_ppl, nb_tr_examples, nb_tr_steps) = 0.0, 0.0, 0.0, 0, 0
        n_token_real, n_token_total = 0, 0
        pbar = tqdm.tqdm(enumerate(train_dataloader), total=len(train_dataloader))
        for i, batch in pbar:
            batch = tuple(t.cuda() for t in batch)
            input_ids, position_ids, token_type_ids, label_ids, *_ = batch
            if no_token_id:
                token_type_ids = None
            loss, ppl = model(input_ids, position_ids, token_type_ids, label_ids)
            loss = loss.mean()
            loss = loss / (train_batch_size / input_ids.shape[0])
            loss.backward()

            nb_tr_steps += 1
            tr_loss += float(loss.sum().item()) * (train_batch_size / input_ids.shape[0])
            if ppl.sum().item() < 1000000:
                tr_ppl += ppl.sum().item()
            else:
                tr_ppl += mean_ppl
            mean_loss = tr_loss / nb_tr_steps
            mean_ppl = tr_ppl / nb_tr_steps
            n_token_total += input_ids.shape[0] * input_ids.shape[1]
            n_token_real += (input_ids != 0).sum().item()

            # gradient update
            step += 1
            if step % accumulate_step == 0:
                set_lr(optimizer, global_step, lr_schedule, learning_rate,
                       warmup_steps, warmup_proportion, n_embd, num_optim_steps)
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                print('epoch: {}, global_step: {}, step: {}, mean_loss: {}, mean_ppl: {}'
                      .format(epoch + 1, global_step + 1, step + 1, mean_loss, mean_ppl),
                      file=train_logger)

                if global_step % valid_step == 0:
                    print('Saving model...')
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'epoch': epoch,
                            'hparams': hparams,
                        },
                        os.path.join(out_dir, f'GPT2-pretrain-step-{global_step}.pkl'))
                    eval_loss, eval_ppl = valid(model, valid_dataloader, epoch, device)
                    print('{},{},{},{},{}'.format(epoch + 1, global_step + 1,
                                                  step + 1, eval_loss, eval_ppl),
                          file=valid_logger)
                    logger.info('current learning rate: ' +
                                str(optimizer.param_groups[0]['lr']))
                    model.train()
                if global_step >= num_optim_steps:
                    break

            if (step + 1) % CACHE_EMPTY_STEP == 0:
                torch.cuda.empty_cache()

        if global_step >= num_optim_steps:
            break
        epoch += 1

    train_logger.close()
    valid_logger.close()
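# The `set_lr` helper called above is not shown; it adjusts the learning rate
# of every parameter group based on the step count and schedule name. A rough
# sketch under the assumption of a Transformer-style "noam" schedule plus a
# simple linear-warmup fallback (the actual helper may implement different
# schedules):
def set_lr_sketch(optimizer, step, schedule, lr, warmup_steps,
                  warmup_proportion, n_embd, tot_steps):
    step = max(step, 1)
    if schedule == 'noam':
        # lr * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        lr_this_step = lr * n_embd ** -0.5 * min(step ** -0.5,
                                                 step * warmup_steps ** -1.5)
    else:
        # Linear warmup over a fraction of the total steps, then constant.
        lr_this_step = lr * min(1.0, step / max(1, warmup_proportion * tot_steps))
    for group in optimizer.param_groups:
        group['lr'] = lr_this_step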
"Please install apex from https://www.github.com/nvidia/apex " "to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True, verbose=False) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale, verbose=False) else: optimizer = Adam(optimizer_grouped_parameters, args.learning_rate, max_grad_norm=1.0) ######################################################################### # Training ! ########################################################################## if args.local_rank == -1 or get_rank() == 0: with open(join(log_dir, 'train_log.txt'), 'a+', buffering=1) as train_logger: print('epoch,global_step,step,mean_loss,mean_ppl,n_token_real,' 'n_token_total,epoch_time', file=train_logger) with open(join(log_dir, 'eval_log.txt'), 'a+', buffering=1) as eval_logger: print('epoch,global_step,step,eval_loss,eval_ppl', file=eval_logger) global_step = 0 step = 0
    return ll


def likelihood(x, y):
    out = model(x)
    #return -1*((y - out)**2).sum(1)
    return log_normal(y, out, zero + np.log(9)).sum(1)


def lossf(x, y):
    ll = likelihood(x, y).sum() + prior() + bnn.params.merged_sampler.logdet
    return -ll / float(n)


L = 32
adam = Adam(bnn.params.parameters(), 0.001)
T = 2500
x1, x2 = -6, 6
y1, y2 = -100, 100

for i in range(T):
    adam.zero_grad()
    bnn.params.sample()
    loss = lossf(X_, Y_)
    loss.backward()
    adam.step()
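# The `log_normal` helper used in likelihood() above is not defined in this
# fragment. A common log-variance parameterization (an assumption about its
# signature, consistent with the third argument zero + np.log(9), i.e.
# sigma = 3) is sketched below:
import math


def log_normal_sketch(x, mean, log_var):
    # Element-wise log N(x | mean, exp(log_var)) for torch tensors.
    return -0.5 * (math.log(2 * math.pi) + log_var
                   + (x - mean) ** 2 / log_var.exp())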