def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None):
    """Multi-sample dropout classification head on top of BERT.

    Runs the pooled BERT output through dropout `self.multi_drop` times,
    averaging both logits and (when labels are given) the loss.

    FIX: the original only called the classifier when `labels` was not None,
    so inference without labels crashed on `out / self.multi_drop` (out was
    still None). Logits are now computed unconditionally; labeled calls are
    numerically unchanged.

    Returns:
        (out, loss): averaged logits (sigmoid-flattened for 'binary') and the
        averaged loss (0.0 when no labels were supplied).
    """
    outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    pooled_output = outputs[1]  # pooled [CLS] representation
    out = None
    loss = 0
    for i in range(self.multi_drop):
        # Each pass samples a different dropout mask over the same pooled output.
        output = self.dropout(pooled_output)
        logits = self.classifier(output)
        out = logits if out is None else out + logits
        if labels is not None:
            loss = loss + compute_loss(logits, labels, loss_method=self.loss_method)
    # Average over the dropout samples (loss stays 0.0 when labels is None).
    loss = loss / self.multi_drop
    out = out / self.multi_drop
    if self.loss_method in ['binary']:
        out = torch.sigmoid(out).flatten()
    return out, loss
def build_model(self,):
    """Build the TF1 GAN graph: placeholders, generator/discriminator, losses.

    Side effects: prepares data on disk, loads GCN laplacians/features, and
    defines all graph tensors as attributes on self.
    """
    utils.prepare_data(data_file=self.data_file)
    self.lap_list, self.feature = utils.load_gcn_data(self.graph_file, self.num_support)
    self.num_feature = self.feature.shape[1]
    # Placeholders: x = real sequences, z = generator input, z_t = targets
    # used only by the accuracy op below.
    self.x = tf.placeholder(tf.float32, [None, self.d_input_step, self.d_input_size])
    self.z = tf.placeholder(tf.float32, [None, self.g_input_step, self.g_input_size])
    self.z_t = tf.placeholder(tf.float32, [None, self.g_input_step, self.g_input_size])
    self.lap = tf.placeholder(tf.float32, [self.num_support, self.d_input_size, self.d_input_size])
    self.fea = tf.placeholder(tf.float32, [self.d_input_size, self.num_feature])
    self.x_ = self.generator(self.z, self.g_input_step, self.g_input_size, self.g_hidden_size, self.g_batch_size)
    # Discriminator is applied to real and generated data with shared weights
    # (reuse=True on the second call).
    self.D = self.discriminator(self.x, self.d_input_step, self.d_input_size, self.d_hidden_size, 1, self.g_batch_size)
    self.D_ = self.discriminator(self.x_, self.d_input_step, self.d_input_size, self.d_hidden_size, 1, self.g_batch_size, reuse=True)
    if self.wgan == 1:
        # WGAN-style critic losses (no sigmoid cross-entropy).
        # NOTE(review): the signs here (g_loss = mean(D_), d_loss = real - fake)
        # depend on whether the trainer minimizes or maximizes these — confirm
        # against the optimizer setup elsewhere in the file.
        self.d_loss_real = tf.reduce_mean(self.D)
        self.d_loss_fake = tf.reduce_mean(self.D_)
        self.g_loss = self.d_loss_fake
        self.d_loss = self.d_loss_real - self.d_loss_fake
    else:
        # Standard GAN: discriminator pushes D->1 on real, D_->0 on fake;
        # generator pushes D_->1.
        self.d_loss_real = utils.compute_loss(self.D, tf.ones_like(self.D))
        self.d_loss_fake = utils.compute_loss(self.D_, tf.zeros_like(self.D_))
        self.g_loss = utils.compute_loss(self.D_, tf.ones_like(self.D_))
        self.d_loss = self.d_loss_real + self.d_loss_fake
    # NOTE(review): self.z_ is not defined in this method — presumably created
    # elsewhere (e.g. inside generator/another builder); verify it exists
    # before this graph is finalized.
    self.accuracy = utils.compute_accuracy(self.z_t, self.z_)
def test(epoch, model, test_loader, writer, sigma_0, lr_sigma, iters_sig):
    """Evaluate the classifier on clean and sigma-corrupted test inputs.

    For each batch, `get_sigma` optimizes a per-example noise level starting
    from sigma_0[idx] and returns the corrupted batch; sigma_0 is updated
    IN PLACE with the new values. Logs losses/accuracies and sample images
    to TensorBoard.

    Returns:
        (corrupted accuracy in percent, updated sigma_0 tensor).
    """
    model = model.eval()
    test_loss = 0
    test_loss_corrupted = 0
    total = 0
    correct = 0
    correct_corrupted = 0
    for _, (batch, targets, idx) in enumerate(test_loader):
        batch = batch.to(device)
        targets = targets.to(device)
        # Sigma optimization itself needs gradients, so it runs outside no_grad.
        sigma, batch_corrupted = get_sigma(model, batch, lr_sigma, sigma_0[idx], iters_sig, device)
        sigma_0[idx] = sigma  # update sigma
        with torch.no_grad():
            # forward pass through the base classifier
            outputs_softmax = model(batch)
            outputs_corrputed_softmax = model(batch_corrupted)
            loss = compute_loss(outputs_softmax, targets)
            loss_corrupted = compute_loss(outputs_corrputed_softmax, targets)
            # Weight by batch size so the final division by `total` is a
            # proper per-example mean even with a ragged last batch.
            test_loss += loss.item() * len(batch)
            test_loss_corrupted += loss_corrupted.item() * len(batch)
            _, predicted = outputs_softmax.max(1)
            _, predicted_corrupted = outputs_corrputed_softmax.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            correct_corrupted += predicted_corrupted.eq(targets).sum().item()
    print(
        '===> Test Loss: {}. Test Accuracy: {}. Test Loss Corrupted: {}. Test Accuracy Corrupted: {}'
        .format(test_loss / total, 100. * correct / total,
                test_loss_corrupted / total, 100. * correct_corrupted / total))
    # Visualize up to 8 clean/corrupted pairs from the LAST batch only.
    n = min(batch.size(0), 8)
    comparison = torch.cat([batch[:n], batch_corrupted[:n]])
    comparison = torch.clamp(comparison, min=0, max=1)
    fig = plot_samples(comparison.detach().cpu().numpy().transpose(
        0, 2, 3, 1).squeeze(), h=2, w=n)
    writer.add_figure('sample of noisy test examples', fig, epoch)
    writer.add_scalar('loss/test_loss', test_loss / total, epoch)
    writer.add_scalar('accuracy/test_accuracy', 100. * correct / total, epoch)
    writer.add_scalar('loss/test_loss_corrupted', test_loss_corrupted / total, epoch)
    writer.add_scalar('accuracy/test_accuracy_corrupted', 100. * correct_corrupted / total, epoch)
    writer.add_scalar('sigma/test_sigma_mean', sigma_0.mean().item(), epoch)
    writer.add_scalar('sigma/test_sigma_min', sigma_0.min().item(), epoch)
    writer.add_scalar('sigma/test_sigma_max', sigma_0.max().item(), epoch)
    return 100. * correct_corrupted / total, sigma_0
def validate_ori(args, model, criterion, test_data):
    """Run one validation pass and return the running-mean loss.

    FIX: the tqdm progress bar was created but never advanced or described;
    now updated per batch, consistent with validate().

    Returns:
        total_loss: incremental mean of per-batch avg_loss over the epoch.
    """
    # PREPARE DATA
    dataloader = torch.utils.data.DataLoader(test_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers)
    # VALIDATE
    model.eval()
    total_loss = 0.
    with tqdm(total=len(test_data) // args.batch_size) as pbar, torch.no_grad():
        for example_num, (x, targets) in enumerate(dataloader):
            if args.cuda:
                x = x.cuda()
                targets = targets.cuda()
            outputs, avg_loss = compute_loss(model, x, targets, criterion)
            # Incremental mean: mean_k = mean_{k-1} + (x_k - mean_{k-1}) / k.
            total_loss += (1. / float(example_num + 1)) * (avg_loss - total_loss)
            pbar.set_description("Current loss: " + str(total_loss))
            pbar.update(1)
    return total_loss
def train(args, n_actors, batch_queue, prios_queue, param_queue):
    """Ape-X style learner loop: consume batches, update the DQN, publish
    priorities and parameters back to the actors.

    Runs forever (no exit condition); intended to be killed externally.

    Args:
        args: namespace with env/optim/interval hyperparameters.
        n_actors: number of actor processes to wait for.
        batch_queue: incoming (batch..., idxes) tuples from replay.
        prios_queue: outgoing (idxes, priorities) tuples.
        param_queue: outgoing model state_dicts for actors.
    """
    env = wrapper.make_atari(args.env)
    env = wrapper.wrap_atari_dqn(env, args)
    utils.set_global_seeds(args.seed, use_torch=True)
    model = DuelingDQN(env, args).to(args.device)
    # model.load_state_dict(torch.load('model_30h.pth'))
    tgt_model = DuelingDQN(env, args).to(args.device)
    tgt_model.load_state_dict(model.state_dict())
    writer = SummaryWriter(comment="-{}-learner".format(args.env))
    optimizer = torch.optim.Adam(model.parameters(), args.lr)
    # optimizer = torch.optim.RMSprop(model.parameters(), args.lr, alpha=0.95, eps=1.5e-7, centered=True)
    check_connection(n_actors)
    param_queue.put(model.state_dict())
    learn_idx = 0
    ts = time.time()
    # Per-interval TensorBoard accumulators, flushed every args.tb_interval.
    tb_dict = {
        k: []
        for k in ['loss', 'grad_norm', 'max_q', 'mean_q', 'min_q']
    }
    while True:
        *batch, idxes = batch_queue.get()
        loss, prios, q_values = utils.compute_loss(model, tgt_model, batch,
                                                   args.n_steps, args.gamma)
        grad_norm = utils.update_parameters(loss, model, optimizer, args.max_norm)
        prios_queue.put((idxes, prios))
        # Drop references promptly so shared-memory batches can be reclaimed.
        batch, idxes, prios = None, None, None
        learn_idx += 1
        tb_dict["loss"].append(float(loss))
        tb_dict["grad_norm"].append(float(grad_norm))
        tb_dict["max_q"].append(float(torch.max(q_values)))
        tb_dict["mean_q"].append(float(torch.mean(q_values)))
        tb_dict["min_q"].append(float(torch.min(q_values)))
        if args.soft_target_update:
            # Polyak averaging: tgt <- (1 - tau) * tgt + tau * online.
            tau = args.tau
            for p_tgt, p in zip(tgt_model.parameters(), model.parameters()):
                p_tgt.data *= 1 - tau
                p_tgt.data += tau * p
        elif learn_idx % args.target_update_interval == 0:
            print("Updating Target Network..")
            tgt_model.load_state_dict(model.state_dict())
        if learn_idx % args.save_interval == 0:
            print("Saving Model..")
            torch.save(model.state_dict(), "model.pth")
        if learn_idx % args.publish_param_interval == 0:
            param_queue.put(model.state_dict())
        if learn_idx % args.tb_interval == 0:
            bps = args.tb_interval / (time.time() - ts)  # batches per second
            print("Step: {:8} / BPS: {:.2f}".format(learn_idx, bps))
            writer.add_scalar("learner/BPS", bps, learn_idx)
            for k, v in tb_dict.items():
                writer.add_scalar(f'learner/{k}', np.mean(v), learn_idx)
                v.clear()
            ts = time.time()
def forward(self, inputs=None, attention_mask=None, output_id=None, labels=None):
    """Joint MLM + classification forward pass over a RoBERTa encoder.

    FIX: removed an unreachable trailing `return out, loss` (dead code after
    the final return; `loss` was never defined in this scope).

    Args:
        inputs: dense features projected to embeddings via text_linear.
        output_id: MLM target ids; positions equal to -100 are ignored.
        labels: classification labels; -1 marks unlabeled examples.

    Returns:
        (mlm_loss, empty tensor) when no example is labeled, otherwise
        (mlm_loss + label_loss, flattened sigmoid scores).
    """
    inputs = torch.relu(self.text_linear(inputs))
    bert_outputs = self.roberta(inputs_embeds=inputs, attention_mask=attention_mask)
    # calculate mlm loss
    last_hidden_state = bert_outputs[0]
    mlm_mask = output_id.ne(-100)  # positions with a real MLM target
    output_id_tmp = output_id[mlm_mask]
    output_id_emb = last_hidden_state[mlm_mask]
    pre_score = self.vocab_layer(output_id_emb)
    loss_cro = CrossEntropyLoss()
    # NOTE(review): CrossEntropyLoss expects raw logits; applying sigmoid
    # first compresses them into (0, 1) — preserved as-is, but confirm this
    # is intentional.
    mlm_loss = loss_cro(torch.sigmoid(pre_score), output_id_tmp)
    labels_bool = labels.ne(-1)
    if labels_bool.sum().item() == 0:
        # No labeled examples in this batch: MLM loss only.
        return mlm_loss, torch.tensor([])
    # calculate label loss
    pooled_output = bert_outputs[1]
    out = self.classifier(pooled_output)
    out = out[labels_bool]
    labels_tmp = labels[labels_bool]
    label_loss = compute_loss(out, labels_tmp)
    out = torch.sigmoid(out).flatten()
    return mlm_loss + label_loss, out
def _main():
    """Restore a trained TrajectoryLearner checkpoint and report test losses.

    Builds the inference graph in 'loss' mode, iterates the test directory
    for ceil(samples / batch_size) steps, and prints aggregate std/MSE stats.
    """
    learner = TrajectoryLearner()
    learner.setup_inference(FLAGS, mode='loss')
    saver = tf.train.Saver([var for var in tf.trainable_variables()])
    # NOTE(review): `init` is created but never run — restore() below supplies
    # all variable values, so this op appears to be vestigial.
    init = tf.initialize_all_variables()
    test_generator = DirectoryIterator(FLAGS.test_dir,
                                       shuffle=False,
                                       target_size=(FLAGS.img_width, FLAGS.img_height),
                                       batch_size=FLAGS.batch_size)
    # Round up so the final partial batch is still evaluated.
    steps = int(math.ceil(test_generator.samples / FLAGS.batch_size))
    with tf.Session() as sess:
        saver.restore(sess, FLAGS.ckpt_file)
        print("--------------------------------------------------")
        print("Restored checkpoint file {}".format(FLAGS.ckpt_file))
        print("--------------------------------------------------")
        outs = compute_loss(sess, learner, test_generator, steps, verbose=1)
        # Logging
        print("Average Vel Std: {:.3f}".format(outs['vel_std']))
        print("Average Point Std: {:.3f}".format(outs['pnt_std']))
        print("Average Vel MSE: {:.3f}".format(outs['vel_mse']))
        print("Average Point MSE: {:.3f}".format(outs['pnt_mse']))
def _val(epoch):
    """Validate the text/image retrieval encoders for one epoch.

    Uses module-level globals: TxtEnc, ImgEnc, val_loader, device, writer,
    args, val_set, scheduler. Computes the epoch loss, embeds the whole
    validation set, reports median rank and recalls, and steps the
    (plateau-style) LR scheduler on the epoch loss.
    """
    print('=> val')
    TxtEnc.eval()
    ImgEnc.eval()
    loss_epoch = 0.0
    imgs = []
    rcps = []
    for batch in tqdm(val_loader):
        recipe = batch  # recipe[0] = text input, recipe[1] = image input
        recipe[0], recipe[1] = recipe[0].to(device), recipe[1].to(device)
        with torch.no_grad():
            txts_sub = TxtEnc(recipe[0])
            imgs_sub = ImgEnc(recipe[1])
            loss = compute_loss(txts_sub, imgs_sub, device)
            # Weight by batch size so the division by len(val_set) below
            # yields a per-sample mean.
            loss_epoch += loss.item() * recipe[1].shape[0]
            rcps.append(txts_sub.detach().cpu().numpy())
            imgs.append(imgs_sub.detach().cpu().numpy())
    rcps = np.concatenate(rcps, axis=0)
    imgs = np.concatenate(imgs, axis=0)
    print('=> computing ranks...')
    medR, medR_std, recalls = rank(rcps, imgs, args.retrieved_type, args.retrieved_range)
    print('=> val MedR: {:.4f}({:.4f})'.format(medR, medR_std))
    writer.add_scalar('medR', medR, epoch)
    writer.add_scalar('medR_std', medR_std, epoch)
    for k, v in recalls.items():
        writer.add_scalar('Recall@{}'.format(k), v, epoch)
    loss_epoch /= len(val_set)
    writer.add_scalar('loss_epoch_val', loss_epoch, epoch)
    scheduler.step(loss_epoch)
def validate(args, model, criterion, test_data):
    """Evaluate `model` on `test_data` and return the running-mean loss.

    Targets are a dict of tensors; each value is moved to the GPU when
    args.cuda is set. The returned loss is an incremental mean over batches.
    """
    loader = torch.utils.data.DataLoader(test_data,
                                         batch_size=args.batch_size,
                                         shuffle=False,
                                         num_workers=args.num_workers)
    model.eval()
    total_loss = 0.
    with tqdm(total=len(test_data) // args.batch_size) as pbar, torch.no_grad():
        for step, (x, targets) in enumerate(loader):
            if args.cuda:
                x = x.cuda()
                for key in list(targets.keys()):
                    targets[key] = targets[key].cuda()
            _, avg_loss = compute_loss(model, x, targets, criterion)
            # Incremental mean update: m_k = m_{k-1} + (x_k - m_{k-1}) / k.
            total_loss += (1. / float(step + 1)) * (avg_loss - total_loss)
            pbar.set_description("Current loss: " + str(total_loss))
            pbar.update(1)
    return total_loss
def evaluate_model(model, dataset, train, test, hyperparams):
    """Fit (or restore) `model` and score it on the test split.

    Cifar10CustomModel weights are cached on disk: if the weights file
    exists, the estimator is restored instead of refit and train_time is
    reported as -1; otherwise it is fit and the weights are saved.

    Returns:
        (score, train_time, evaluation_time) where score is the negated
        loss of the metric value (higher is better).
    """
    is_cifar_model = model.__name__ == "Cifar10CustomModel"
    cifar_model_weights_path = joinpath(RESULTS_DIR, "Cifar10CustomModel-weights.pkl")
    start_time = time.time()
    train_data, test_data = model.prepare_dataset(train, test, dataset.categorical_features)
    estimator = model.build_estimator(hyperparams, train_data)
    # Restore Cifar10CustomModel if weights have been saved
    if is_cifar_model and isfile(cifar_model_weights_path):
        estimator.initialize()
        estimator.load_params(f_params=cifar_model_weights_path)
        train_time = -1  # sentinel: no training happened this run
    else:
        X, y, *_ = train_data
        estimator.fit(X, y)
        train_time = time.time() - start_time
        if is_cifar_model:
            estimator.save_params(f_params=cifar_model_weights_path)
    start_time = time.time()
    X_test, y_test = test_data
    metric_value = compute_metric(y_test, estimator.predict(X_test), dataset.metric)
    # Negate the loss so that a larger score is always better.
    score = -compute_loss(dataset.metric, [metric_value])
    evaluation_time = time.time() - start_time
    return score, train_time, evaluation_time
def compute_td_loss(self, batch_size, beta):
    """Sample a prioritized batch, compute the TD loss, and optimize once.

    FIX: `self.scheduler.step()` was called between backward() and
    optimizer.step(); PyTorch >= 1.1 requires the optimizer step first
    (the old order skips the first LR value and triggers a warning).
    Also removed a large block of commented-out dead code that
    utils.compute_loss now implements.

    Args:
        batch_size: number of transitions to sample.
        beta: importance-sampling exponent for prioritized replay.

    Returns:
        loss: the (weighted) TD loss tensor after the update.
    """
    state, action, reward, next_state, done, weights, indices = \
        self.replay_buffer.sample(batch_size, beta)
    state = torch.FloatTensor(state).to(self.device)
    next_state = torch.FloatTensor(next_state).to(self.device)
    action = torch.LongTensor(action).to(self.device)
    reward = torch.FloatTensor(reward).to(self.device)
    done = torch.FloatTensor(done).to(self.device)
    weights = torch.FloatTensor(weights).to(self.device)
    batch = (state, action, reward, next_state, done, weights)
    loss, prios = utils.compute_loss(self.model, self.target_model, batch, 1)
    self.optimizer.zero_grad()
    loss.backward()
    # Refresh replay priorities with the new TD errors.
    self.replay_buffer.update_priorities(indices, prios)
    self.optimizer.step()
    self.scheduler.step()  # LR schedule advances only after the optimizer step
    return loss
def run_HetGNN(model, hg, het_graph, config):
    """Train HetGNN with skip-gram sampling and evaluate per epoch.

    Each epoch runs `config.batches_per_epoch` positive/negative graph
    batches through the model, then evaluates link prediction and node
    classification on the full het_graph embeddings.
    """
    # het_graph is used to sample neighbour
    hg = hg.to('cpu')
    category = config.category
    train_mask = hg.nodes[category].data.pop('train_mask')
    test_mask = hg.nodes[category].data.pop('test_mask')
    train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
    test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()
    labels = hg.nodes[category].data.pop('label')
    # NOTE(review): `emd` is read but never used in this function — presumably
    # a leftover; confirm before deleting.
    emd = hg.nodes[category].data['dw_embedding']
    train_batch = load_link_pred('./a_a_list_train.txt')
    test_batch = load_link_pred('./a_a_list_test.txt')
    # HetGNN Sampler
    batch_sampler = SkipGramBatchSampler(hg, config.batch_size, config.window_size)
    neighbor_sampler = NeighborSampler(het_graph, hg.ntypes, batch_sampler.num_nodes, config.device)
    collator = HetGNNCollator(neighbor_sampler, hg)
    dataloader = DataLoader(batch_sampler,
                            collate_fn=collator.collate_train,
                            num_workers=config.num_workers)
    opt = th.optim.Adam(model.parameters())
    pred = ScorePredictor()
    # The sampler is an (effectively) endless stream; pull batches manually.
    dataloader_it = iter(dataloader)
    for i in range(config.max_epoch):
        model.train()
        for batch_id in tqdm.trange(config.batches_per_epoch):
            positive_graph, negative_graph, blocks = next(dataloader_it)
            blocks = [b.to(config.device) for b in blocks]
            positive_graph = positive_graph.to(config.device)
            negative_graph = negative_graph.to(config.device)
            # we need extract multi-feature
            input_features = extract_feature(blocks[0], hg.ntypes)
            x = model(blocks[0], input_features)
            # Margin/contrastive-style loss between positive and negative edge scores.
            loss = compute_loss(pred(positive_graph, x), pred(negative_graph, x))
            opt.zero_grad()
            loss.backward()
            opt.step()
        print('Epoch {:05d} |Train - Loss: {:.4f}'.format(i, loss.item()))
        # End-of-epoch evaluation on the full graph (not mini-batched).
        input_features = extract_feature(het_graph, hg.ntypes)
        x = model(het_graph, input_features)
        author_link_prediction(x['author'].to('cpu').detach(), train_batch, test_batch)
        micro_f1, macro_f1 = Hetgnn_evaluate(
            x[config.category].to('cpu').detach(), labels, train_idx, test_idx)
        print('<Classification> Micro-F1 = %.4f, Macro-F1 = %.4f' % (micro_f1, macro_f1))
    pass
def train(args, n_actors, batch_queue, prios_queue, param_queue): env = RunTagEnv(width=5, height=5, number_of_subordinates=1, max_steps=1000) #env = wrapper.make_atari(args.env) #env = wrapper.wrap_atari_dqn(env, args) utils.set_global_seeds(args.seed, use_torch=True) model = DuelingDQN(env).to(args.device) tgt_model = DuelingDQN(env).to(args.device) tgt_model.load_state_dict(model.state_dict()) writer = SummaryWriter(comment="-{}-learner".format(args.env)) # optimizer = torch.optim.Adam(model.parameters(), args.lr) optimizer = torch.optim.RMSprop(model.parameters(), args.lr, alpha=0.95, eps=1.5e-7, centered=True) check_connection(n_actors) param_queue.put(model.state_dict()) learn_idx = 0 ts = time.time() while True: *batch, idxes = batch_queue.get() loss, prios = utils.compute_loss(model, tgt_model, batch, args.n_steps, args.gamma) grad_norm = utils.update_parameters(loss, model, optimizer, args.max_norm) print('Updated parameters!') prios_queue.put((idxes, prios)) batch, idxes, prios = None, None, None learn_idx += 1 writer.add_scalar("learner/loss", loss, learn_idx) writer.add_scalar("learner/grad_norm", grad_norm, learn_idx) if learn_idx % args.target_update_interval == 0: print("Updating Target Network..") tgt_model.load_state_dict(model.state_dict()) if learn_idx % args.save_interval == 0: print("Saving Model..") torch.save(model.state_dict(), "model.pth") if learn_idx % args.publish_param_interval == 0: param_queue.put(model.state_dict()) if learn_idx % args.bps_interval == 0: bps = args.bps_interval / (time.time() - ts) print("Step: {:8} / BPS: {:.2f}".format(learn_idx, bps)) writer.add_scalar("learner/BPS", bps, learn_idx) ts = time.time()
def train_step(x, y, model, optimizer):
    """Run one optimization step: forward pass, loss, gradients, update.

    Records the forward pass under a GradientTape, differentiates the loss
    with respect to the model's trainable variables, and applies the
    gradients through `optimizer`. Returns the loss tensor.
    """
    with tf.GradientTape() as tape:
        predictions = model(x)
        loss = compute_loss(y, predictions)
    # Differentiate the recorded computation and hand the result to the optimizer.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss
def test(self):
    """Evaluate the pointer network on a fresh batch using the latest
    training weights.

    Copies train_net's weights into test_net, generates one batch, and
    returns (pointer, loss, acc, label) without any gradient update.
    """
    # Sync evaluation weights with the training network first.
    self.test_net.load_state_dict(self.train_net.state_dict())
    raw = self.gen_batch()
    sequence = Variable(Tensor(raw[0]))
    sequence_mask = Variable(torch.LongTensor(raw[1].astype('int64')))
    target = Variable(Tensor(raw[2]))
    label = Variable(torch.LongTensor(raw[3].astype('int64')))
    pointer = self.test_net(sequence, sequence_mask, target)
    loss = utils.compute_loss(pointer, label, target)
    acc = utils.compute_acc(pointer, label)
    return pointer, loss, acc, label
def validation(model, criterion, evaluation_loader, converter, opt):
    """ validation or evaluation

    Runs the recognition model greedily over the evaluation loader and
    accumulates loss, word accuracy, and character accuracy (norm_ED).

    Returns:
        (avg loss, word accuracy %, char accuracy %, last batch's decoded
        predictions, last batch's labels, total inference time, #samples).
    """
    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()
    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)
        # For max length prediction
        length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device)
        text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device)
        text_for_loss, length_for_loss = converter.encode(
            labels, batch_max_length=opt.batch_max_length)
        start_time = time.time()
        preds, global_feature, local_feature, attention_weights, transformed_imgs, control_points = model(
            image, text_for_pred, is_train=False)
        forward_time = time.time() - start_time
        # Align prediction length with target length (target drops [GO]).
        preds = preds[:, :text_for_loss.shape[1] - 1, :]
        target = text_for_loss[:, 1:]  # without [GO] Symbol
        cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                         target.contiguous().view(-1))
        # select max probabilty (greedy decoding) then decode index to character
        preds_score, preds_index = preds.max(2)
        preds_str = converter.decode(preds_index, length_for_pred)
        labels = converter.decode(text_for_loss[:, 1:], length_for_loss)
        infer_time += forward_time
        valid_loss_avg.add(cost)
        # calculate accuracy.
        # NOTE(review): despite its name, compute_loss here returns
        # (correct count, char accuracy) — see its use below.
        batch_n_correct, batch_char_acc = compute_loss(preds_str, labels, opt)
        n_correct += batch_n_correct
        norm_ED += batch_char_acc
    accuracy = n_correct / float(length_of_data) * 100
    norm_ED = norm_ED / float(length_of_data) * 100
    return valid_loss_avg.val(
    ), accuracy, norm_ED, preds_str, labels, infer_time, length_of_data
def train(self):
    """Run one training step of the pointer network on a generated batch.

    Generates a batch, computes the pointer loss and accuracy, then
    backpropagates and steps the optimizer.

    Returns:
        (pointer, loss, acc, label) for the processed batch.
    """
    raw = self.gen_batch()
    sequence = Variable(Tensor(raw[0]))
    sequence_mask = Variable(torch.LongTensor(raw[1].astype('int64')))
    target = Variable(Tensor(raw[2]))
    label = Variable(torch.LongTensor(raw[3].astype('int64')))
    pointer = self.train_net(sequence, sequence_mask, target)
    loss = utils.compute_loss(pointer, label, target)
    acc = utils.compute_acc(pointer, label)
    # Standard zero-grad / backward / step update.
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return pointer, loss, acc, label
def train(self):
    """Learner loop with prioritized replay; runs until max_step.

    FIX: `self.scheduler.step()` was called BEFORE utils.update_parameters
    (which performs the optimizer step); PyTorch >= 1.1 requires the
    optimizer step first, otherwise the first LR value is skipped and a
    warning is emitted. The scheduler step now follows the parameter update.
    """
    utils.set_global_seeds(self.seed, use_torch=True)
    learn_idx = 0
    while True:
        beta = self.beta_by_frame(learn_idx)
        states, actions, rewards, next_states, dones, weights, idxes = self.buffer.sample(
            self.batch_size, beta)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        weights = torch.FloatTensor(weights).to(self.device)
        batch = (states, actions, rewards, next_states, dones, weights)
        loss, prios = utils.compute_loss(self.model, self.tgt_model, batch,
                                         self.n_step, self.gamma)
        grad_norm = utils.update_parameters(loss, self.model, self.optimizer,
                                            self.max_norm)
        self.scheduler.step()  # after the optimizer step inside update_parameters
        self.buffer.update_priorities(idxes, prios)
        # Drop references promptly so sampled batches can be reclaimed.
        batch, idxes, prios = None, None, None
        learn_idx += 1
        self.writer.add_scalar("learner/loss", loss, learn_idx)
        self.writer.add_scalar("learner/grad_norm", grad_norm, learn_idx)
        if learn_idx % self.target_update_interval == 0:
            print("Updating Target Network..")
            self.tgt_model.load_state_dict(self.model.state_dict())
        if learn_idx % self.save_interval == 0:
            print("Saving Model..")
            torch.save(self.model.state_dict(), "model{}.pth".format(learn_idx))
        if learn_idx % self.publish_param_interval == 0:
            self.batch_recorder.set_worker_weights(copy.deepcopy(self.model))
        if learn_idx >= self.max_step:
            torch.save(self.model.state_dict(), "model{}.pth".format(learn_idx))
            self.batch_recorder.cleanup()
            break
def find_close_point(X, Y, center, TARGET=2, k=50):
    """
    @topic: Find top k cloest points to given center.
    @input: X: dataset(2D); Y: label(1D); center: the center of target cluster;
            TARGET: the target number that we need to handle; k: top k cloest points.
    @return: the index of top k cloest points to given center in lable.
    """
    # Non-target points get +inf so they sort after every target point.
    distances = np.full(len(Y), float("inf"), dtype=float)
    for idx, lbl in enumerate(Y):
        if lbl == TARGET:
            distances[idx] = compute_loss(X[idx], center)
    return distances.argsort()[:k]
def train_epoch(model, epoch, train_data_loader, optimizer):
    """Train `model` for one epoch with a step-wise learning-rate schedule.

    FIX: the epoch >= 30 branches previously set the learning rate on a
    module-level `optimizer_backbone` (not a parameter of this function)
    while `optimizer` was the one being stepped — so the LR decay never
    applied to the optimizer actually in use. All branches now update
    `optimizer`.

    Schedule: 1e-2 for epochs < 30, 1e-3 for 30 <= epoch < 60, 1e-4 after.
    """
    model.train()
    for param in model.parameters():  # Setting complete model to be trainable
        param.requires_grad = True
    if epoch < 30:
        learning_rate = 1e-2
    elif epoch < 60:
        learning_rate = 1e-3
    else:
        learning_rate = 1e-4
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate
    print("epoch number --> " + str(epoch) + " learning rate ---> " + str(learning_rate))
    train_loss = 0
    for batch_idx, (data, target) in enumerate(train_data_loader):
        data = data.to(dev)
        for key in target.keys():
            target[key] = target[key].to(dev)
        optimizer.zero_grad()
        sempred, inspred, insreg = model(data)
        loss = compute_loss(sempred, inspred, insreg, target)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # NOTE(review): prints the SUM of batch losses under the label
    # "Average loss" — preserved as-is; divide by len(train_data_loader)
    # if a true average is wanted.
    print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, train_loss))
def objective(args):
    """Hyperopt objective: k-fold CV loss for one hyperparameter sample.

    Returns the loss across folds, or a {'status': 'fail'} dict when the
    hyperparameter combination makes training raise ValueError.
    """
    try:
        estimator = model.build_estimator(args, train)
        X, y, *_ = train
        fold_metrics = []
        for train_index, val_index in kfold.split(*train):
            estimator.fit(X[train_index], y[train_index])
            fold_metrics.append(
                compute_metric(y[val_index], estimator.predict(X[val_index]),
                               dataset.metric))
            # Datasets that opt out of k-fold only evaluate the first split.
            if not getattr(dataset, 'needs_k_fold', True):
                break
        return compute_loss(dataset.metric, fold_metrics)
    except ValueError:
        # Some hyper-parameter combinations raise ValueError during training
        # (in particular MLPRegressor); report a failed trial to hyperopt.
        return {'status': 'fail'}
def _validate(self, updates=0):
    """ Validate on development set.

    Tags the dev set, writes predictions to a CSV, logs loss/accuracy, and
    (when early stopping is enabled) tracks the best dev loss, saving the
    model on improvement and setting self._training_stopped after
    `self._early_stop_patience` validations without improvement.
    """
    tagged_dev_sentences, accuracy = self._tag_dataset(self.dev, train_mode=False)
    loss = utils.compute_loss(tagged_dev_sentences, self.dev, 'dev')
    iterator.write(sentences=tagged_dev_sentences,
                   ids=self._dev_iterator.ids,
                   file_name=self._outfile_prefix + '_devset.csv',
                   verbose=True)
    logger.info("Update %r: dev loss/sent=%.4f, acc=%.4f" % (updates, loss, accuracy))
    # Early stop here
    if self._early_stop:
        if loss < self._best_dev_loss:
            logger.info("Dev loss improved to %.4f" % loss)
            self._best_dev_loss = loss
            self._dev_loss_not_improved = 0  # reset patience counter
            # Save best model
            model_path = self._outfile_prefix + ".m"
            logger.info("Saving best model to '%s'." % model_path)
            self.model.save(model_path)
        else:
            self._dev_loss_not_improved += 1
            if self._dev_loss_not_improved > self._early_stop_patience:
                logger.info(
                    "Model has not improved for %d validation steps, stopping."
                    % self._dev_loss_not_improved)
                logger.info("Best dev loss: %.4f" % self._best_dev_loss)
                # Flag checked by the outer training loop to halt.
                self._training_stopped = True
            else:
                logger.info(
                    "Model has not improved for %d validation steps."
                    % self._dev_loss_not_improved)
def train_step(src_token_ids, tgt_token_ids):
    """Performs a single training step on a minibatch of source and target
    token ids.

    Closure note: reads `self`, `optimizer`, and `clip_norm` from the
    enclosing scope.

    Args:
        src_token_ids: int tensor of shape [batch_size, src_seq_len], lists of
            subtoken ids of batched source sequences ending with EOS_ID and
            zero-padded.
        tgt_token_ids: int tensor of shape [batch_size, src_seq_len], lists of
            subtoken ids of batched target sequences ending with EOS_ID and
            zero-padded.

    Returns:
        loss: float scalar tensor, the loss.
        step: int scalar tensor, the global step.
        lr: float scalar tensor, the learning rate.
    """
    with tf.GradientTape() as tape:
        # for each sequence of subtokens s1, s2, ..., sn, 1
        # prepend it with 0 (SOS_ID) and truncate it to the same length:
        # 0, s1, s2, ..., sn
        tgt_token_ids_input = tf.pad(tgt_token_ids, [[0, 0], [1, 0]])[:, :-1]
        logits = self._model(src_token_ids, tgt_token_ids_input, training=True)
        loss = compute_loss(tgt_token_ids, logits, self._label_smoothing,
                            self._model._vocab_size)
    gradients = tape.gradient(loss, self._model.trainable_variables)
    # Optional global-norm gradient clipping.
    if clip_norm is not None:
        gradients, norm = tf.clip_by_global_norm(gradients, clip_norm)
    optimizer.apply_gradients(zip(gradients, self._model.trainable_variables))
    step = optimizer.iterations
    lr = optimizer.learning_rate(step)
    # apply_gradients already incremented `iterations`, so report step - 1.
    return loss, step - 1, lr
def _train(epoch):
    """Train the text/image retrieval encoders for one epoch.

    Uses module-level globals: niter (batch counter across epochs), TxtEnc,
    ImgEnc, train_loader, device, optimizer, args, writer, train_set. Logs
    per-batch and per-epoch losses plus the current LR to TensorBoard.
    """
    global niter
    print('=> train')
    TxtEnc.train()
    ImgEnc.train()
    loss_epoch = 0.0
    for batch in tqdm(train_loader):
        recipe = batch  # recipe[0] = text input, recipe[1] = image input
        recipe[0], recipe[1] = recipe[0].to(device), recipe[1].to(device)
        txt = TxtEnc(recipe[0])
        img = ImgEnc(recipe[1])
        loss = compute_loss(txt, img, device)
        optimizer.zero_grad()
        loss.backward()
        # Clip each param group separately before the update.
        for group in optimizer.param_groups:
            torch.nn.utils.clip_grad_norm_(group['params'], args.grad_clip)
        optimizer.step()
        writer.add_scalar('loss_batch_train', loss.item(), niter)
        # Weight by batch size so the division by len(train_set) below
        # yields a per-sample mean.
        loss_epoch += loss.item() * recipe[1].shape[0]
        niter += 1
    loss_epoch /= len(train_set)
    writer.add_scalar('loss_epoch_train', loss_epoch, epoch)
    writer.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)
def run(args):
    """Train a conditional GAN on the Crocodile dataset and checkpoint it.

    Reads all hyperparameters from `args`, logs images/losses to
    TensorBoard, and writes generator checkpoints plus a full last_model.chk
    every epoch under OUTPUT_PATH.
    """
    if args.slurm:
        # Use the SLURM job/array ids as a stable experiment name.
        args.slurmid = "%s_%s" % (os.environ["SLURM_JOB_ID"],
                                  os.environ["SLURM_ARRAY_TASK_ID"])
    # NOTE(review): EMA is bound but not used in this function — confirm it
    # is consumed by code outside this view before removing.
    EMA = args.ema
    BATCH_SIZE = args.batch_size
    NUM_Z = args.num_latent
    NUM_FILTERS = args.num_filters
    LR_GEN = args.learning_rate_gen
    LR_DIS = args.learning_rate_dis
    NUM_EPOCHS = args.num_epochs
    SEED = args.seed
    RESOLUTION = args.resolution
    GRADIENT_PENALTY = args.gradient_penalty
    torch.manual_seed(SEED)
    ROOT = args.path_to_dataset
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    exp_name = "%i_%i" % (int(time.time()), np.random.randint(9999))
    if args.slurmid is not None:
        exp_name = args.slurmid
    OUTPUT_PATH = os.path.join(args.output_path, '%i/%s') % (RESOLUTION, exp_name)
    writer = SummaryWriter(log_dir=os.path.join(OUTPUT_PATH, 'runs'))
    print("Loading dataset...")
    # Images normalized to [-1, 1]; undone below with x/2 + 0.5 for saving.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    dataset = CrocodileDataset(root=ROOT, transform=transform,
                               resolution=RESOLUTION, one_hot=True)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True,
                            num_workers=4)
    print("Init...")
    if args.model == "small":
        # Generator input = latent + one-hot class; discriminator is class-conditional.
        gen = models.SmallGenerator(
            NUM_Z + dataset.num_cat, RESOLUTION, NUM_FILTERS, args.num_layers,
            spectral_norm=args.spectral_norm_gen).to(device)
        dis = models.ConditionalSmallDiscriminator(RESOLUTION, dataset.num_cat,
                                                   NUM_FILTERS, args.num_layers).to(device)
    gen_optimizer = optim.Adam(gen.parameters(), lr=LR_GEN, betas=(0.5, 0.999))
    dis_optimizer = optim.Adam(dis.parameters(), lr=LR_DIS, betas=(0.5, 0.999))
    # Fixed evaluation grid: the same 10 latents for every class.
    z_examples = torch.zeros(1, 10, NUM_Z).normal_().expand(dataset.num_cat, -1, -1)
    y_examples = torch.eye(dataset.num_cat).unsqueeze(1).expand(-1, 10, -1)
    z_examples = torch.cat([z_examples, y_examples], -1).view(-1, NUM_Z + dataset.num_cat).to(device)
    if not os.path.exists(os.path.join(OUTPUT_PATH, "gen")):
        os.makedirs(os.path.join(OUTPUT_PATH, "gen"))
    if not os.path.exists(os.path.join(OUTPUT_PATH, "img")):
        os.makedirs(os.path.join(OUTPUT_PATH, "img"))
    dataiter = iter(dataloader)
    # NOTE(review): dataiter.next() is the pre-PyTorch-1.9 iterator API
    # (next(dataiter) in newer versions), and the [:100] slices the
    # (images, labels) TUPLE (a no-op for a 2-tuple), not the first 100
    # images — likely not what was intended.
    x_examples, _ = dataiter.next()[:100]
    x_examples = x_examples / 2 + 0.5
    torchvision.utils.save_image(x_examples, os.path.join(OUTPUT_PATH, "examples.png"), nrow=10)
    with open(os.path.join(OUTPUT_PATH, 'config.json'), 'w') as f:
        json.dump(vars(args), f)
    print("Training...")
    init_epoch = 0
    for epoch in range(NUM_EPOCHS):
        t = time.time()
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            z = torch.zeros(len(x), NUM_Z).normal_().to(device)
            z = torch.cat([z, y], -1)
            x_gen = gen(z)
            score_true, score_gen = dis(x, y), dis(x_gen, y)
            loss_gen, loss_dis = utils.compute_loss(score_true, score_gen, mode="nsgan")
            if GRADIENT_PENALTY:
                loss_dis += GRADIENT_PENALTY * dis.get_penalty(x, x_gen)
            # Gradients for both nets are computed explicitly (retain_graph so
            # the shared graph survives the second autograd.grad call), then
            # assigned to .grad and stepped simultaneously.
            grad_gen = autograd.grad(loss_gen, gen.parameters(), retain_graph=True)
            grad_dis = autograd.grad(loss_dis, dis.parameters(), retain_graph=True)
            for p, g in zip(gen.parameters(), grad_gen):
                p.grad = g
            for p, g in zip(dis.parameters(), grad_dis):
                p.grad = g
            gen_optimizer.step()
            dis_optimizer.step()
        print("Epoch: %i, Loss dis: %.2e, Loss gen %.2e, Time: %i" %
              (init_epoch + epoch, loss_dis, loss_gen, time.time() - t))
        # Log samples from the last training batch...
        x_gen = x_gen / 2 + 0.5
        img = torchvision.utils.make_grid(x_gen, nrow=10)
        writer.add_image('gen_random', img, epoch)
        # ...and from the fixed per-class evaluation latents.
        x_gen = gen(z_examples)
        x_gen = x_gen / 2 + 0.5
        img = torchvision.utils.make_grid(
            x_gen, nrow=10)  # First dimension is row, second dimension is column
        writer.add_image('gen', img, epoch)
        torchvision.utils.save_image(
            x_gen,
            os.path.join(OUTPUT_PATH, "img/img_%.3i.png" % (init_epoch + epoch)),
            nrow=10)
        # Per-epoch generator snapshot plus a rolling full checkpoint.
        torch.save(
            {
                'epoch': init_epoch + epoch,
                'gen_state_dict': gen.state_dict()
            },
            os.path.join(OUTPUT_PATH, "gen/gen_%i.chk" % (init_epoch + epoch)))
        torch.save(
            {
                'epoch': init_epoch + epoch,
                'gen_state_dict': gen.state_dict(),
                'dis_state_dict': dis.state_dict(),
                'gen_optimizer_state_dict': gen_optimizer.state_dict(),
                'dis_optimizer_state_dict': dis_optimizer.state_dict()
            },
            os.path.join(OUTPUT_PATH, "last_model.chk"))
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Distributed GNMT (WMT16 en-de) train/validation loop.

    Args:
        run_id: Identifier passed to the metrics ``Tracker``.
        dataset_dir: Directory holding (or receiving) the WMT16 data.
        ckpt_run_dir: Directory for checkpoints.
        output_dir: Directory for evaluation JSON stats.
        validation_only: If True, skip training and evaluate existing
            checkpoints epoch-by-epoch instead.
        use_cuda: Move model/criterion/batches to GPU.
        light_target: Use the easier BLEU-20 goal instead of BLEU-24.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # ----- Hyper-parameters -----
    train_epochs = 8
    train_min_len, train_max_len = 0, 75
    val_min_len, val_max_len = 0, 150
    math_mode = "fp16"  # One of `fp16`, `fp32`
    lang = ("en", "de")

    # Training batch geometry: keep the *global* batch size fixed and derive
    # the gradient-accumulation factor from the per-device hardware limit.
    train_global_batch_size = 2048  # Global batch size
    max_bs = 128  # Max batch size for used hardware
    update_freq = int(max(1, train_global_batch_size // (max_bs * world_size)))
    train_batch_size = int(train_global_batch_size // (world_size * update_freq))
    val_batch_size = 64

    # Model attributes
    model_args = {
        "hidden_size": 1024,
        "num_layers": 4,
        "dropout": 0.2,
        "share_embedding": True,
        "fusion": True,
    }
    # Criterion
    criterion_args = {"smoothing": 0.1, "fast_xentropy": True}
    # Loss scaling (fp16)
    loss_scaling = {"init_scale": 1024, "upscale_interval": 128}
    # Optimizer
    optimizer_args = {
        "lr": 2e-3,
        "grad_clip": 5.0,
    }
    # Scheduler
    scheduler_args = {
        "warmup_steps": 200,
        "remain_steps": 0.4,
        "decay_interval": 0.05,
        "decay_steps": 4,
        "decay_factor": 0.5,
    }
    # Translator (beam search at validation time)
    translator_args = {
        "beam_size": 5,
        "len_norm_factor": 0.6,
        "cov_penalty_factor": 0.1,
        "len_norm_const": 5.0,
        "max_seq_len": 150,
    }

    # ----- Build train/val datasets -----
    train_set = WMT16Dataset(
        dataset_dir,
        math_precision=math_mode,
        lang=lang,
        train=True,
        download=True,
        preprocessed=True,
        min_len=train_min_len,
        max_len=train_max_len,
    )
    train_set.prepare()
    val_set = WMT16Dataset(
        dataset_dir,
        math_precision=math_mode,
        lang=lang,
        validation=True,
        download=False,
        min_len=val_min_len,
        max_len=val_max_len,
        sort=True,
    )
    tokenizer = train_set.tokenizer

    # Build model / loss / metrics.
    model = GNMT(vocab_size=train_set.vocab_size, **model_args)
    criterion = LabelSmoothing(padding_idx=wmt16_config.PAD, **criterion_args)
    metrics = [BLEUScore()]  # Bilingual Evaluation Understudy Score

    # Partition data across ranks.
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    collate_fn = build_collate_fn(sort=True)
    train_loader = DataLoader(
        train_set,
        batch_size=train_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        pin_memory=True,
        drop_last=False,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=val_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        pin_memory=True,
        drop_last=False,
    )

    # Validate roughly every 30% of an epoch, aligned to accumulation windows.
    validate_every = update_freq * round(
        len(train_loader) * 0.30 / update_freq
    )

    # Build optimizer & scheduler.
    total_train_iters = (len(train_loader) // update_freq) * train_epochs

    print("Number of batches per epoch {}".format(len(train_loader)))
    print("Train iterations per epoch {}".format(total_train_iters / train_epochs))

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    # Horovod is only used for fp16 over the MPI backend.
    use_horovod = math_mode == "fp16" and dist.get_backend() == dist.Backend.MPI
    if use_horovod:
        hvd.init()
        logger.info("Using horovod rank={}".format(hvd.rank()))
        # Sanity check: a sum-allreduce of 1 over all workers.
        tensor = torch.tensor([1])
        res = hvd.allreduce(tensor, op=hvd.Sum)
        assert res[0] == world_size

    fp_optimizer, optimizer, model = build_optimizer(
        model=model,
        math=math_mode,
        loss_scaling=loss_scaling,
        use_cuda=use_cuda,
        use_horovod=use_horovod,
        **optimizer_args
    )

    # Create a learning rate scheduler for an optimizer.
    scheduler = ExponentialWarmupMultiStepLR(
        optimizer, total_train_iters, **scheduler_args
    )

    # Translator
    translator = Translator(model=model, trg_tokenizer=tokenizer, **translator_args)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.BEST
    )

    if not validation_only:
        goal = task4_time_to_bleu_goal(20 if light_target else 24)

        num_batches_per_device_train = len(train_loader)
        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            model.train()
            tracker.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(data, target, use_cuda=use_cuda)
                tracker.record_batch_load()

                # FIX: was `batch_idx == len(train_loader)`, which enumerate
                # never produces (indices run 0..len-1), so the final partial
                # accumulation window never stepped the optimizer.
                is_last = batch_idx == len(train_loader) - 1
                update = (batch_idx % update_freq) == update_freq - 1
                init = (batch_idx % update_freq) == 0

                # Clear gradients in the optimizer at window start.
                if init:
                    fp_optimizer.zero_grad()
                    tracker.record_batch_init()

                # Compute the output
                output = compute_model_output(model, data, target)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss, loss_per_token = compute_loss(
                    data, target, output, criterion, update_freq
                )
                tracker.record_batch_comp_loss()

                # Backprop
                fp_optimizer.backward_loss(loss)
                tracker.record_batch_backprop()

                # Opt step at window end (or at the very last batch).
                if update or is_last:
                    # For this task, simply sum all gradients
                    updated = fp_optimizer.step(tracker=tracker, denom=1)

                    # Learning rate scheduler advances only on real updates.
                    if updated:
                        scheduler.step()

                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx=batch_idx,
                    loss=loss_per_token,
                    output=target[0],  # Use target just for the size
                    metric_results={},
                    tracker=tracker,
                    num_batches_per_device_train=num_batches_per_device_train,
                )

                # Validation during training
                if (batch_idx + 1) % validate_every == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    metrics_values, loss = validation_round(
                        val_loader,
                        metrics,
                        model,
                        criterion,
                        update_freq,
                        translator,
                        tracker=tracker,
                        use_cuda=use_cuda,
                    )
                    record_validation_stats(metrics_values, loss, tracker, rank)
                    if tracker.goal_reached:
                        break

                    model.train()
                    tracker.train()

            # End-of-epoch validation + checkpoint.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            metrics_values, loss = validation_round(
                val_loader,
                metrics,
                model,
                criterion,
                update_freq,
                translator,
                use_cuda=use_cuda,
            )
            is_best = record_validation_stats(metrics_values, loss, tracker, rank)

            checkpointer.save(
                tracker,
                model,
                fp_optimizer.optimizer,
                scheduler,
                tracker.current_epoch,
                is_best,
            )
            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                time.sleep(10)
                return
    else:
        # Evaluate saved checkpoints epoch-by-epoch and dump stats as JSON.
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=criterion,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
def main(args):
    """Train, validate and test a Wave-U-Net source-separation model.

    Builds the model from ``args``, trains with early stopping on the
    validation loss, then evaluates the best checkpoint on the test set
    and logs SDR/SIR metrics to TensorBoard.
    """
    #torch.backends.cudnn.benchmark=True # This makes dilated conv much faster for CuDNN 7.5

    # MODEL: per-level feature counts grow additively or by doubling.
    num_features = [args.features*i for i in range(1, args.levels+1)] if args.feature_growth == "add" else \
                   [args.features*2**i for i in range(0, args.levels)]
    target_outputs = int(args.output_size * args.sr)
    model = Waveunet(args.channels, num_features, args.channels, args.instruments,
                     kernel_size=args.kernel_size, target_output_size=target_outputs,
                     depth=args.depth, strides=args.strides, conv_type=args.conv_type,
                     res=args.res, separate=args.separate)

    if args.cuda:
        model = utils.DataParallel(model)
        print("move model to gpu")
        model.cuda()

    print('model: ', model)
    print('parameter count: ', str(sum(p.numel() for p in model.parameters())))

    writer = SummaryWriter(args.log_dir)

    ### DATASET
    musdb = get_musdb_folds(args.dataset_dir)
    # If not data augmentation, at least crop targets to fit model output shape
    crop_func = partial(crop, shapes=model.shapes)
    # Data augmentation function for training
    augment_func = partial(random_amplify, shapes=model.shapes, min=0.7, max=1.0)
    train_data = SeparationDataset(musdb, "train", args.instruments, args.sr, args.channels,
                                   model.shapes, True, args.hdf_dir, audio_transform=augment_func)
    val_data = SeparationDataset(musdb, "val", args.instruments, args.sr, args.channels,
                                 model.shapes, False, args.hdf_dir, audio_transform=crop_func)
    test_data = SeparationDataset(musdb, "test", args.instruments, args.sr, args.channels,
                                  model.shapes, False, args.hdf_dir, audio_transform=crop_func)

    dataloader = torch.utils.data.DataLoader(train_data,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.num_workers,
                                             worker_init_fn=utils.worker_init_fn)

    ##### TRAINING ####

    # Set up the loss function
    if args.loss == "L1":
        criterion = nn.L1Loss()
    elif args.loss == "L2":
        criterion = nn.MSELoss()
    else:
        raise NotImplementedError("Couldn't find this loss!")

    # Set up optimiser
    optimizer = Adam(params=model.parameters(), lr=args.lr)

    # Set up training state dict that will also be saved into checkpoints
    state = {"step" : 0,
             "worse_epochs" : 0,
             "epochs" : 0,
             "best_loss" : np.Inf}

    # LOAD MODEL CHECKPOINT IF DESIRED
    if args.load_model is not None:
        print("Continuing training full model from checkpoint " + str(args.load_model))
        state = utils.load_model(model, optimizer, args.load_model)

    print('TRAINING START')
    # Early stopping: keep training until `patience` epochs without val improvement.
    while state["worse_epochs"] < args.patience:
        print("Training one epoch from iteration " + str(state["step"]))
        avg_time = 0.  # running mean of per-batch wall time
        model.train()
        with tqdm(total=len(train_data) // args.batch_size) as pbar:
            np.random.seed()  # re-seed so augmentation differs across epochs/workers
            for example_num, (x, targets) in enumerate(dataloader):
                if args.cuda:
                    x = x.cuda()
                    for k in list(targets.keys()):
                        targets[k] = targets[k].cuda()

                t = time.time()

                # Set LR for this iteration
                utils.set_cyclic_lr(optimizer, example_num, len(train_data) // args.batch_size,
                                    args.cycles, args.min_lr, args.lr)
                writer.add_scalar("lr", utils.get_lr(optimizer), state["step"])

                # Compute loss for each instrument/model (backward happens inside)
                optimizer.zero_grad()
                outputs, avg_loss = utils.compute_loss(model, x, targets, criterion, compute_grad=True)
                optimizer.step()

                state["step"] += 1

                t = time.time() - t
                avg_time += (1. / float(example_num + 1)) * (t - avg_time)

                writer.add_scalar("train_loss", avg_loss, state["step"])

                if example_num % args.example_freq == 0:
                    input_centre = torch.mean(
                        x[0, :, model.shapes["output_start_frame"]:model.shapes["output_end_frame"]],
                        0)  # Stereo not supported for logs yet
                    writer.add_audio("input", input_centre, state["step"], sample_rate=args.sr)
                    for inst in outputs.keys():
                        writer.add_audio(inst + "_pred", torch.mean(outputs[inst][0], 0),
                                         state["step"], sample_rate=args.sr)
                        writer.add_audio(inst + "_target", torch.mean(targets[inst][0], 0),
                                         state["step"], sample_rate=args.sr)

                pbar.update(1)

        # VALIDATE
        val_loss = validate(args, model, criterion, val_data)
        print("VALIDATION FINISHED: LOSS: " + str(val_loss))
        writer.add_scalar("val_loss", val_loss, state["step"])

        # EARLY STOPPING CHECK
        checkpoint_path = os.path.join(args.checkpoint_dir, "checkpoint_" + str(state["step"]))
        if val_loss >= state["best_loss"]:
            state["worse_epochs"] += 1
        else:
            print("MODEL IMPROVED ON VALIDATION SET!")
            state["worse_epochs"] = 0
            state["best_loss"] = val_loss
            state["best_checkpoint"] = checkpoint_path

        # CHECKPOINT
        print("Saving model...")
        utils.save_model(model, optimizer, state, checkpoint_path)

        state["epochs"] += 1

    #### TESTING ####
    # Test loss
    print("TESTING")

    # Load best model based on validation loss
    state = utils.load_model(model, None, state["best_checkpoint"])
    test_loss = validate(args, model, criterion, test_data)
    print("TEST FINISHED: LOSS: " + str(test_loss))
    writer.add_scalar("test_loss", test_loss, state["step"])

    # Mir_eval metrics
    test_metrics = evaluate(args, musdb["test"], model, args.instruments)

    # Dump all metrics results into pickle file for later analysis if needed
    with open(os.path.join(args.checkpoint_dir, "results.pkl"), "wb") as f:
        pickle.dump(test_metrics, f)

    # Write most important metrics into Tensorboard log
    avg_SDRs = {inst : np.mean([np.nanmean(song[inst]["SDR"]) for song in test_metrics])
                for inst in args.instruments}
    avg_SIRs = {inst : np.mean([np.nanmean(song[inst]["SIR"]) for song in test_metrics])
                for inst in args.instruments}
    for inst in args.instruments:
        writer.add_scalar("test_SDR_" + inst, avg_SDRs[inst], state["step"])
        writer.add_scalar("test_SIR_" + inst, avg_SIRs[inst], state["step"])
    overall_SDR = np.mean([v for v in avg_SDRs.values()])
    # FIX: pass the global step like every other add_scalar call here;
    # previously this scalar was logged without a step.
    writer.add_scalar("test_SDR", overall_SDR, state["step"])
    print("SDR: " + str(overall_SDR))

    writer.close()
x_rep_combined, y_combined, x_rep_list, y_list = utils.get_rep( envs, rep_model) credit_before, absolute_weights = trainer.compute_variance_cheating( flags["l1_penalty"], linear_predictor, x_rep_list, y_list, x_rep_combined, y_combined, device) magnitude = list(linear_predictor.parameters())[0] max_v_arg = torch.argmax(credit_before.squeeze()) min_mag_arg = torch.argmin(magnitude.squeeze()) max_v_val_before = credit_before.squeeze()[max_v_arg] min_mag_val_before = magnitude.squeeze()[min_mag_arg] credit_before = torch.sum(torch.abs(credit_before)**2) loss_before = utils.compute_loss(linear_predictor, x_rep_combined, y_combined) if rep_steps % 10 == 0: rep_model.perturb_layer(0, 0.2 * random.random()) credit_before_real = copy.deepcopy(credit_before) else: rep_model.perturb_feature(min_mag_arg, 0.2 * random.random()) x_rep_combined, y_combined, x_rep_list, y_list = utils.get_rep( envs, rep_model) # Cheating to speed up the experiment; we could compute this online, but that would take longer. credit_after, weights_after = trainer.compute_variance_cheating( flags["l1_penalty"], linear_predictor, x_rep_list, y_list, x_rep_combined, y_combined, device)
def train(epoch, model, train_loader, optimizer, writer, sigma_0, lr_sigma,
          iters_sig, gaussian_num=1, lamda=0.0, gamma=0.0, gaussian_num_ds=1,
          num_classes=1000):
    """Run one training epoch with per-sample Gaussian noise (MACER-style).

    For each batch, optimizes a per-example noise level ``sigma`` via
    ``get_sigma``, adds Gaussian noise with that sigma, averages the model
    output over ``gaussian_num`` Monte-Carlo noise draws, and minimizes the
    classification loss plus ``lamda``-weighted robustness (macer) loss.

    Args:
        epoch: Current epoch index (used as the TensorBoard step).
        model: Classifier to train.
        train_loader: Yields ``(batch, targets, idx)`` where ``idx`` indexes
            rows of ``sigma_0``.
        optimizer: Optimizer for ``model``.
        writer: TensorBoard ``SummaryWriter``.
        sigma_0: Tensor of per-example sigmas, updated in place per batch.
        lr_sigma: Learning rate for the inner sigma optimization.
        iters_sig: Number of inner iterations for the sigma optimization.
        gaussian_num: Monte-Carlo noise draws per example for the loss.
        lamda: Weight of the macer robustness loss term.
        gamma: Margin parameter forwarded to ``macer_loss``.
        gaussian_num_ds: Noise draws used inside ``get_sigma``.
        num_classes: Output dimensionality of the model (default 1000, i.e.
            ImageNet; previously hard-coded).

    Returns:
        The updated ``sigma_0`` tensor.
    """
    model = model.train()
    train_loss = 0
    total = 0
    correct = 0
    for batch_idx, (batch, targets, idx) in enumerate(train_loader):
        optimizer.zero_grad()
        batch_size = len(idx)
        batch = batch.to(device)
        targets = targets.to(device)

        # Optimize the per-example noise level for this batch.
        sigma, _ = get_sigma(model, batch, lr_sigma, sigma_0[idx], iters_sig,
                             device, gaussian_num=gaussian_num_ds)
        sigma_0[idx] = sigma  # updating sigma

        # Repeat the input gaussian_num times for the Monte-Carlo estimate:
        # (B, C, H, W) -> (B * gaussian_num, C, H, W).
        new_shape = [batch_size * gaussian_num]
        new_shape.extend(batch[0].shape)
        batch = batch.repeat((1, gaussian_num, 1, 1)).view(new_shape)
        # Repeat sigmas to match the repeated batch.
        sigma_repeated = sigma.repeat(
            (1, gaussian_num, 1, 1)).view(-1, 1, 1, 1)

        noise = torch.randn_like(batch) * sigma_repeated
        batch_corrupted = batch + noise

        # Average the model output over the noise draws.
        outputs_softmax = model(batch_corrupted).reshape(
            batch_size, gaussian_num, num_classes).mean(1)

        total_loss = compute_loss(outputs_softmax, targets)
        if torch.isnan(outputs_softmax).any() or torch.isnan(total_loss).any():
            # Diagnostic only; training continues as in the original code.
            print('Warning: NaN detected in model outputs or loss')
        total_loss += lamda * macer_loss(outputs_softmax, targets, sigma, gamma)

        train_loss += total_loss.item() * len(batch)
        _, predicted = outputs_softmax.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        # update parameters
        total_loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(
                '+ Epoch: {}. Iter: [{}/{} ({:.0f}%)]. Loss: {}. Accuracy: {}'.
                format(epoch, batch_idx * len(batch),
                       len(train_loader.dataset),
                       100. * batch_idx / len(train_loader),
                       train_loss / total, 100. * correct / total))

    # Log a clean/corrupted sample grid from the last batch, once per epoch.
    n = min(batch.size(0), 8)
    comparison = torch.cat([batch[:n], batch_corrupted[:n]])
    comparison = torch.clamp(comparison, min=0, max=1)
    fig = plot_samples(comparison.detach().cpu().numpy().transpose(
        0, 2, 3, 1).squeeze(), h=2, w=n)
    writer.add_figure('sample of noisy trained examples', fig, epoch)
    writer.add_scalar('loss/train_loss', train_loss / total, epoch)
    writer.add_scalar('accuracy/train_accuracy', 100. * correct / total, epoch)
    writer.add_scalar('sigma/train_sigma_mean', sigma_0.mean().item(), epoch)
    writer.add_scalar('sigma/train_sigma_min', sigma_0.min().item(), epoch)
    writer.add_scalar('sigma/train_sigma_max', sigma_0.max().item(), epoch)
    return sigma_0
def learner(args):
    """Distributed DQN learner process.

    Receives replay batches over a queue, trains a DuelingDQN with a
    Horovod-distributed Adam optimizer, sends updated priorities and
    parameters back over queues, and logs stats to TensorBoard. Runs
    forever (the `while True` loop below never exits).
    """
    comm_cross = global_dict['comm_cross']
    hvd.init(comm=comm_cross)
    torch.cuda.set_device(hvd.local_rank())
    # The env is only needed to size the networks; deleted right after.
    env = wrap_atari_dqn(make_atari(args['env']), args)
    # utils.set_global_seeds(args['seed'], use_torch=True)

    device = args['device']
    model = DuelingDQN(env, args).to(device)
    if os.path.exists('model.pth'):
        # model.load_state_dict(torch.load('model.pth'))
        pass
    tgt_model = DuelingDQN(env, args).to(device)
    del env
    writer = SummaryWriter(log_dir=os.path.join(
        args['log_dir'], f'{global_dict["unit_idx"]}-learner'))

    # optimizer = torch.optim.SGD(model.parameters(), 1e-5 * args['num_units'], momentum=0.8)
    # optimizer = torch.optim.RMSprop(model.parameters(), args['lr'], alpha=0.95, eps=1.5e-7, centered=True)
    # LR is scaled by the number of units (linear scaling for distributed SGD).
    optimizer = torch.optim.Adam(model.parameters(),
                                 args['lr'] * args['num_units'])
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    # Sync initial weights across workers, then copy into the target net.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    tgt_model.load_state_dict(model.state_dict())

    if args['dynamic_gradient_clip']:
        # Running mean of the gradient norm, used as an adaptive clip level.
        grad_norm_running_mean = args['gradient_norm_running_mean']
        grad_norm_lambda = args['gradient_norm_lambda']

    # Bounded queues decouple network I/O from the training loop; the
    # worker threads below never terminate (daemon-less, like the loop).
    batch_queue = queue.Queue(maxsize=3)
    prios_queue = queue.Queue(maxsize=4)
    param_queue = queue.Queue(maxsize=3)
    threading.Thread(target=recv_batch, args=(batch_queue, )).start()
    threading.Thread(target=send_prios, args=(prios_queue, )).start()
    threading.Thread(target=send_param, args=(param_queue, )).start()
    if global_dict['unit_idx'] == 0:
        # Only unit 0 also publishes parameters to the evaluator.
        threading.Thread(target=send_param_evaluator,
                         args=(param_queue, )).start()

    prefetcher = data_prefetcher(batch_queue, args['cuda'])
    learn_idx = 0
    ts = time.time()
    # Per-interval stat buffers, flushed to TensorBoard every tb_interval.
    tb_dict = {
        k: []
        for k in [
            'loss', 'grad_norm', 'max_q', 'mean_q', 'min_q',
            'batch_queue_size', 'prios_queue_size'
        ]
    }
    first_rount = True
    while True:
        # Batch layout comes from the prefetcher; the last element is the
        # replay-buffer indexes used to report updated priorities.
        (*batch, idxes) = prefetcher.next()
        if first_rount:
            print("start training")
            sys.stdout.flush()
            first_rount = False
        loss, prios, q_values = utils.compute_loss(model, tgt_model, batch,
                                                   args['n_steps'],
                                                   args['gamma'])

        optimizer.zero_grad()
        loss.backward()
        if args['dynamic_gradient_clip']:
            # Clip at a multiple of the running-mean norm, then update the
            # running mean with the (clipped) observed norm.
            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(),
                grad_norm_running_mean * args['clipping_threshold'])
            grad_norm_running_mean = grad_norm_running_mean * grad_norm_lambda + \
                min(grad_norm, grad_norm_running_mean * args['clipping_threshold']) * (1-grad_norm_lambda)
        else:
            # No clipping: just measure the total L2 gradient norm for logging.
            grad_norm = torch.norm(
                torch.stack([
                    torch.norm(p.grad.detach(), 2)
                    for p in model.parameters()
                ]), 2)
        # global_prios_sum = np.array(prios_sum)
        # comm_cross.Allreduce(MPI.IN_PLACE, global_prios_sum.data)
        # global_prios_sum = float(global_prios_sum)
        # scale = prios_sum / global_prios_sum
        if args['dynamic_gradient_clip'] and args[
                'dropping_threshold'] and grad_norm > grad_norm_running_mean * args[
                    'dropping_threshold']:
            # Gradient outlier: drop this update entirely.
            pass
        else:
            optimizer.step()
        prios_queue.put((idxes, prios))
        learn_idx += 1
        tb_dict["loss"].append(float(loss))
        tb_dict["grad_norm"].append(float(grad_norm))
        tb_dict["max_q"].append(float(torch.max(q_values)))
        tb_dict["mean_q"].append(float(torch.mean(q_values)))
        tb_dict["min_q"].append(float(torch.min(q_values)))
        tb_dict["batch_queue_size"].append(batch_queue.qsize())
        tb_dict["prios_queue_size"].append(prios_queue.qsize())

        if learn_idx % args['target_update_interval'] == 0:
            tgt_model.load_state_dict(model.state_dict())
        if learn_idx % args['save_interval'] == 0 and global_dict[
                'unit_idx'] == 0:
            torch.save(model.state_dict(), "model.pth")
        if learn_idx % args['publish_param_interval'] == 0:
            param_queue.put(model.state_dict())
        if learn_idx % args['tb_interval'] == 0:
            # bps = batches (learning steps) per second over the interval.
            bps = args['tb_interval'] / (time.time() - ts)
            for i, (k, v) in enumerate(tb_dict.items()):
                writer.add_scalar(f'learner/{i+1}_{k}', np.mean(v), learn_idx)
                v.clear()
            # Uses the leftover loop variable `i` to number BPS as the next
            # scalar after the tb_dict entries.
            writer.add_scalar(f"learner/{i+2}_BPS", bps, learn_idx)
            ts = time.time()