def forward(self, images, labels, classes, reconstructions):
    """Combine the class margin loss with a down-weighted reconstruction loss.

    Args:
        images: Input batch; it is flattened to
            ``(reconstructions.size(0), -1)`` before being compared with
            ``reconstructions``.
        labels: Target indicator tensor combined element-wise with the
            margin terms (presumably one-hot -- confirm against the caller).
        classes: Per-class scores produced by the network.
        reconstructions: Reconstructed inputs; must contain exactly as many
            elements as ``images``.

    Returns:
        ``(margin_loss + 0.0005 * reconstruction_loss) / images.size(0)``,
        where ``images`` is the flattened batch.
    """
    # Margin terms: a labelled class is penalised for scoring below 0.9,
    # an unlabelled class for scoring above 0.1 (the latter weighted by 0.5).
    present_penalty = F.relu(0.9 - classes, inplace=True).pow(2)
    absent_penalty = F.relu(classes - 0.1, inplace=True).pow(2)
    per_class = labels * present_penalty + 0.5 * (1. - labels) * absent_penalty
    margin_total = per_class.sum()

    assert torch.numel(images) == torch.numel(reconstructions)
    batch_size = reconstructions.size()[0]
    flat_images = images.view(batch_size, -1)
    recon_term = self.reconstruction_loss(reconstructions, flat_images)

    return (margin_total + 0.0005 * recon_term) / flat_images.size(0)
def forward(self, x):
    """Apply conv1-conv3, flatten each sample, then apply fc1 and fc2."""
    for stage_name in ("conv1", "conv2", "conv3"):
        x = getattr(self, stage_name)(x)

    # The flattened width is read off the first sample, so it follows the
    # conv output shape automatically.
    features_per_sample = torch.numel(x[0])
    x = x.view(-1, features_per_sample)

    x = self.fc1(x)
    return self.fc2(x)
def forward(self, x):
    """Run the conv/ReLU/pool stack, flatten, then the dropout + fc head.

    conv1-conv3 are each followed by ReLU and pooling; conv4 is followed by
    ReLU only. The flattened features then pass through
    dropout_1 -> fc1 -> thres -> dropout_2 -> fc2.
    """
    for stage_name in ("conv1", "conv2", "conv3"):
        x = getattr(self, stage_name)(x)
        x = self.relu(x)
        x = self.pool(x)

    x = self.conv4(x)
    x = self.relu(x)

    # One row per sample; the width is taken from the first sample.
    features_per_sample = torch.numel(x[0])
    x = x.view(-1, features_per_sample)

    x = self.dropout_1(x)
    x = self.fc1(x)
    x = self.thres(x)
    x = self.dropout_2(x)
    return self.fc2(x)
def extract(m):
    """Append the weight element count of a Conv2d/Linear module to ``nums``.

    Modules of any other type are ignored. ``nums`` (and ``sparses``) are
    module-level globals defined elsewhere in the file.
    """
    global sparses, nums
    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return
    weight_count = torch.numel(m.weight.data)
    nums.append(weight_count)
def load(cls, model_path, sp_model_path, device, print_stats=True):
    """Build an instance from a saved Transformer-XL model and a SentencePiece model.

    Args:
        model_path: Directory containing ``params.json`` and
            ``valid_state_dict.pt``.
        sp_model_path: Path handed to ``SentencePieceProcessor.load``.
        device: Device spec used both as ``map_location`` when reading the
            weights and as the target of ``model.to``.
        print_stats: When true, print the element count of every tensor in
            the state dict and the total parameter count.

    Returns:
        ``cls(model_on_device, sp_model, device)``.
    """
    # NOTE(review): the original line structure was lost in this file view;
    # the "Total # params" print is assumed to belong to the print_stats
    # branch -- confirm against the formatted original.
    params_file = os.path.join(model_path, 'params.json')
    with open(params_file, 'r') as handle:
        xl_params = json.loads(handle.read())
    print(repr(xl_params))

    # Hyper-parameters are read from params.json in the order the
    # constructor receives them. Example values noted by the original
    # author: ntokens=50000, n_layer=16, n_head=10, d_model=410, d_head=41,
    # d_inner=2100, tie_weight=True, d_embed=410, div_val=1,
    # tie_projs=[False, True, True, True], pre_lnorm=False, tgt_len=150,
    # ext_len=0, mem_len=150, cutoffs=[3500, 7500, 37500],
    # same_length=False, attn_type=0, clamp_len=-1, sample_softmax=-1.
    positional_keys = ('ntokens', 'n_layer', 'n_head', 'd_model', 'd_head',
                       'd_inner')
    keyword_keys = ('tie_weight', 'd_embed', 'div_val', 'tie_projs',
                    'pre_lnorm', 'tgt_len', 'ext_len', 'mem_len', 'cutoffs',
                    'same_length', 'attn_type', 'clamp_len', 'sample_softmax')
    model = MemTransformerLM(
        *[xl_params[key] for key in positional_keys],
        0.0,  # no dropout
        0.0,  # no dropatt
        **{key: xl_params[key] for key in keyword_keys})

    weights_file = os.path.join(model_path, 'valid_state_dict.pt')
    print("loading weights %s ..." % weights_file)
    tensor_dict = torch.load(weights_file, map_location=torch.device(device))
    model.load_state_dict(tensor_dict)
    print("loading weights %s ... done." % weights_file)

    if print_stats:
        for layer_tensor_name, tensor in tensor_dict.items():
            print("Layer %-42s: %9d elements" % (layer_tensor_name,
                                                 torch.numel(tensor)))
        total_params = sum(p.numel() for p in model.parameters())
        print("Total # params: %d" % total_params)

    para_model = model.to(device)

    print("loading sp model from %s ..." % sp_model_path)
    sp_model = spm.SentencePieceProcessor()
    sp_model.load(sp_model_path)
    print("loading sp model from %s ... done." % sp_model_path)

    return cls(para_model, sp_model, device)
def trainc(self, net, args, iter_glob, user_epoch, net_ema=None, diff_w_old=None):
    """Train ``net`` locally with a classification plus consistency loss.

    For every batch of ``self.ldr_train`` the classification loss of ``net``
    on the first view and a consistency loss between ``net`` and the EMA
    model (or ``net`` itself when ``net_ema`` is None) on the second view
    are summed and optimised with SGD. After each step the EMA model, when
    given, is updated through ``update_ema_variables``.

    Args:
        net: Model being trained.
        args: Hyper-parameter namespace (``lr``, ``momentum``,
            ``weight_decay``, ``dataset``, ``iid``, ``local_ep``, ``phi_g``,
            ``ema_decay``, ``threshold`` are read here).
        iter_glob: Global round index, compared against ``args.phi_g``.
        user_epoch: Number of rounds this user has already trained; used to
            derive the global epoch index for the schedules.
        net_ema: Optional EMA model.
        diff_w_old: Optional reference value for the per-layer difference
            test; when given, only a selection of layers is returned.

    Returns:
        * ``self.args.test == 2``: ``(net_state, ema_state, mean_loss,
          mean_loss_ema, epoch_loss, epoch_loss_ema)``.
        * ``diff_w_old`` given: ``(w_dic, w_ema_dic, w_ema, mean_loss,
          mean_loss_ema, diff_w_ema, comu_w, comu_w_ema)``.
        * ``net_ema`` given: ``(net_state, ema_state, mean_loss,
          mean_loss_ema)``.
        * otherwise: ``(net_state, mean_loss, mean_loss_ema)``.
    """
    net.train()
    if net_ema is not None:
        net_ema.train()

    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=False)
    epoch_loss = []
    epoch_loss_ema = []

    # Summed (not averaged) loss; it is divided by the batch size below.
    class_criterion = nn.CrossEntropyLoss(size_average=False, ignore_index=-1)
    if args.dataset == 'cifar' and args.iid != 'noniid_tradition':
        consistency_criterion = softmax_kl_loss
    else:
        consistency_criterion = softmax_mse_loss

    for local_epoch in range(self.args.local_ep):
        # 1-based epoch index across all rounds of this user.
        global_epoch = user_epoch * args.local_ep + local_epoch + 1
        batch_loss = []
        batch_loss_ema = []
        for batch_idx, (img, label) in enumerate(self.ldr_train):
            # Each sample carries two views: one for net, one for the EMA pass.
            img, img_ema, label = img[0].to(self.args.device), img[1].to(
                self.args.device), label.to(self.args.device)
            adjust_learning_rate(optimizer, global_epoch, batch_idx,
                                 len(self.ldr_train), args)

            input_var = torch.autograd.Variable(img)
            # NOTE(review): `volatile` is a legacy flag; recent PyTorch
            # releases ignore it -- consider torch.no_grad() for this pass.
            ema_input_var = torch.autograd.Variable(img_ema, volatile=True)
            target_var = torch.autograd.Variable(label)
            minibatch_size = len(target_var)

            if net_ema is not None:
                ema_model_out = net_ema(ema_input_var)
            else:
                ema_model_out = net(ema_input_var)
            model_out = net(input_var)

            if isinstance(model_out, Variable):
                logit1 = model_out
                ema_logit = ema_model_out
            else:
                assert len(model_out) == 2
                assert len(ema_model_out) == 2
                logit1, _ = model_out
                ema_logit, _ = ema_model_out

            # The EMA output is a fixed target: no gradient flows through it.
            ema_logit = Variable(ema_logit.detach().data, requires_grad=False)

            class_logit, cons_logit = logit1, logit1
            classification_weight = 1
            class_loss = classification_weight * class_criterion(
                class_logit, target_var) / minibatch_size

            consistency_weight = get_current_consistency_weight(global_epoch)
            consistency_loss = consistency_weight * consistency_criterion(
                cons_logit, ema_logit) / minibatch_size

            loss = class_loss + consistency_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if net_ema is not None:
                # A decay of 0.0 is passed until the global round passes phi_g.
                decay = args.ema_decay if iter_glob > args.phi_g else 0.0
                update_ema_variables(net, net_ema, decay, global_epoch)

            batch_loss.append(class_loss.item())
            batch_loss_ema.append(consistency_loss.item())

        epoch_loss.append(sum(batch_loss) / len(batch_loss))
        epoch_loss_ema.append(sum(batch_loss_ema) / len(batch_loss_ema))

    mean_loss = sum(epoch_loss) / len(epoch_loss)
    mean_loss_ema = sum(epoch_loss_ema) / len(epoch_loss_ema)

    if self.args.test == 2:
        return (net.state_dict(), net_ema.state_dict(), mean_loss,
                mean_loss_ema, epoch_loss, epoch_loss_ema)

    # Fixed: the original test `int(diff_w_old) != None` was always true and
    # raised TypeError for the default None, so the returns further down
    # could never be reached.
    if diff_w_old is not None:
        w, w_ema = net.state_dict(), net_ema.state_dict()
        w_dic, w_ema_dic, diff_w_ema = {}, {}, {}
        comu_w, comu_w_ema = 0, 0
        w_keys = list(w.keys())

        # Per entry: (squared L2 distance to the EMA weights, squared L2 norm).
        for i in w_keys:
            diff_w_ema[i] = ((w[i] - w_ema[i]).float().norm(2)**2,
                             w[i].float().norm(2)**2)
        if len(diff_w_ema) == 33:
            diff_w_ema = cifar_add(diff_w_ema)
        else:
            diff_w_ema = mnist_add(diff_w_ema)

        # Select, per entry, which weights are returned and count their
        # elements in comu_w / comu_w_ema.
        for i in w_keys:
            if iter_glob < args.phi_g:
                w_ema_dic[i] = w_ema[i]
                comu_w_ema += torch.numel(w_ema_dic[i])
            elif diff_w_ema[i] >= args.threshold * diff_w_old:
                w_dic[i] = w[i]
                comu_w += torch.numel(w_dic[i])
            else:
                w_ema_dic[i] = w_ema[i]
                comu_w_ema += torch.numel(w_ema_dic[i])

        return (w_dic, w_ema_dic, w_ema, mean_loss, mean_loss_ema,
                diff_w_ema, comu_w, comu_w_ema)

    if net_ema is not None:
        return net.state_dict(), net_ema.state_dict(), mean_loss, mean_loss_ema
    return net.state_dict(), mean_loss, mean_loss_ema
def check_accuracy(args, loader, generator, discriminator, d_loss_fn, limit=False):
    """Evaluate the generator and discriminator on ``loader``.

    The generator is put in eval mode for the duration of the evaluation
    (run under ``torch.no_grad()``) and switched back to train mode before
    returning.

    Args:
        args: Namespace providing ``obs_len``, ``pred_len`` and
            ``num_samples_check``.
        loader: Iterable of batches; each batch holds eleven tensors in the
            order unpacked below and is moved to CUDA.
        generator: Called as ``generator(obs_traj, obs_traj_rel,
            seq_start_end, obs_traj_g)``.
        discriminator: Called as ``discriminator(traj, traj_rel,
            seq_start_end)``.
        d_loss_fn: Called as ``d_loss_fn(scores_real, scores_fake)``.
        limit: When true, stop once ``args.num_samples_check`` trajectories
            have been evaluated.

    Returns:
        dict with keys ``d_loss``, ``g_l2_loss_abs``, ``g_l2_loss_rel``,
        ``ade``, ``fde``, ``ade_l``, ``fde_l``, ``ade_nl``, ``fde_nl``.
    """
    d_losses = []
    metrics = {}
    # Every accumulator gets its own list. The previous `([],) * n` form
    # bound all names of a group to one shared list, so e.g. the ade, ade_l
    # and ade_nl values were all appended to (and summed from) the same list.
    g_l2_losses_abs, g_l2_losses_rel = [], []
    disp_error, disp_error_l, disp_error_nl = [], [], []
    f_disp_error, f_disp_error_l, f_disp_error_nl = [], [], []
    total_traj, total_traj_l, total_traj_nl = 0, 0, 0
    loss_mask_sum = 0

    generator.eval()
    with torch.no_grad():
        for batch in loader:
            batch = [tensor.cuda() for tensor in batch]
            # Batch layout modified by zyl 2020/12/14 10:13. The earlier
            # 7-field unpack was removed: it cannot succeed on the same
            # batch as this 11-field unpack, and obs_traj_g is required by
            # the generator call below.
            (obs_traj, pred_traj_gt, obs_traj_rel, pred_traj_gt_rel,
             obs_traj_rel_v, pred_traj_rel_v, obs_traj_g, pred_traj_g,
             non_linear_ped, loss_mask, seq_start_end) = batch
            linear_ped = 1 - non_linear_ped
            loss_mask = loss_mask[:, args.obs_len:]

            pred_traj_fake_rel = generator(obs_traj, obs_traj_rel,
                                           seq_start_end, obs_traj_g)
            pred_traj_fake = relative_to_abs(pred_traj_fake_rel, obs_traj[-1])

            g_l2_loss_abs, g_l2_loss_rel = cal_l2_losses(
                pred_traj_gt, pred_traj_gt_rel, pred_traj_fake,
                pred_traj_fake_rel, loss_mask
            )
            ade, ade_l, ade_nl = cal_ade(
                pred_traj_gt, pred_traj_fake, linear_ped, non_linear_ped
            )
            fde, fde_l, fde_nl = cal_fde(
                pred_traj_gt, pred_traj_fake, linear_ped, non_linear_ped
            )

            # Observed + predicted segments joined along dim 0.
            traj_real = torch.cat([obs_traj, pred_traj_gt], dim=0)
            traj_real_rel = torch.cat([obs_traj_rel, pred_traj_gt_rel], dim=0)
            traj_fake = torch.cat([obs_traj, pred_traj_fake], dim=0)
            traj_fake_rel = torch.cat([obs_traj_rel, pred_traj_fake_rel], dim=0)

            scores_fake = discriminator(traj_fake, traj_fake_rel, seq_start_end)
            scores_real = discriminator(traj_real, traj_real_rel, seq_start_end)

            d_loss = d_loss_fn(scores_real, scores_fake)
            d_losses.append(d_loss.item())

            g_l2_losses_abs.append(g_l2_loss_abs.item())
            g_l2_losses_rel.append(g_l2_loss_rel.item())
            disp_error.append(ade.item())
            disp_error_l.append(ade_l.item())
            disp_error_nl.append(ade_nl.item())
            f_disp_error.append(fde.item())
            f_disp_error_l.append(fde_l.item())
            f_disp_error_nl.append(fde_nl.item())

            loss_mask_sum += torch.numel(loss_mask.data)
            total_traj += pred_traj_gt.size(1)
            total_traj_l += torch.sum(linear_ped).item()
            total_traj_nl += torch.sum(non_linear_ped).item()
            if limit and total_traj >= args.num_samples_check:
                break

    metrics['d_loss'] = sum(d_losses) / len(d_losses)
    metrics['g_l2_loss_abs'] = sum(g_l2_losses_abs) / loss_mask_sum
    metrics['g_l2_loss_rel'] = sum(g_l2_losses_rel) / loss_mask_sum

    metrics['ade'] = sum(disp_error) / (total_traj * args.pred_len)
    metrics['fde'] = sum(f_disp_error) / total_traj
    # The linear / non-linear breakdowns fall back to 0 when no trajectory
    # of that kind was seen.
    if total_traj_l != 0:
        metrics['ade_l'] = sum(disp_error_l) / (total_traj_l * args.pred_len)
        metrics['fde_l'] = sum(f_disp_error_l) / total_traj_l
    else:
        metrics['ade_l'] = 0
        metrics['fde_l'] = 0
    if total_traj_nl != 0:
        metrics['ade_nl'] = sum(disp_error_nl) / (
            total_traj_nl * args.pred_len)
        metrics['fde_nl'] = sum(f_disp_error_nl) / total_traj_nl
    else:
        metrics['ade_nl'] = 0
        metrics['fde_nl'] = 0

    generator.train()
    return metrics
# launch: prepare the experiment directory and logging, then run the
# validation / training / post-processing phases of the experiment.
#
# Parameters:
#   resume          -- when false (or when the experiment directory does not
#                      exist) the checkpoint directory is (re)created; an
#                      existing experiment directory is removed with rmtree.
#                      `if resume:` also guards the call to self.load_state().
#   cfg_path_source -- path of the source configuration file; when it is not
#                      None and differs from <exp_dir>/cfg.yaml it is copied
#                      there as a backup.
#   test_only       -- the "[Phase 3]" training section is guarded by
#                      `if not test_only`.
#   tag             -- embedded in the result and log file names.
#
# What the visible code does, in order:
#   * derives checkpoints/, logs/, cfg.yaml and a time-stamped
#     results-<tag>-<time>.yaml path from self.exp_dir;
#   * before removing a previous experiment, copies cfg_path_source to the
#     system temp directory under a uuid-prefixed name and copies it back to
#     cfg_path_source after the directory has been recreated;
#   * replaces sys.stdout with a Logger writing to output-<tag>-<time>.log;
#   * prints self.__config__ as YAML when that attribute exists and creates
#     a SummaryWriter on the log directory;
#   * Phase 1: hands the train/test/mini-test loaders to the trainer;
#   * Phase 2: prints the network and its total/trainable element counts,
#     moves the network to CUDA when available and calls test_run_model();
#   * runs a validation pass, stores it as self.best_metrics and saves state;
#   * Phase 3: per epoch runs training, validation and mini-test passes,
#     logs them, and saves state whenever self.best_metrics_comparator
#     accepts the new validation result;
#   * final section: reloads state, filters the best metrics into `record`,
#     computes l_constant with l2_lipschitz_constant_checker, runs every
#     entry of self.post_steps inside a Streamline context and writes
#     `record` to self.res_dir with yaml.safe_dump.
#
# NOTE(review): this definition is stored with its line structure collapsed,
# so the nesting of the later phases (in particular which statement the
# `else:` in front of "[Phase 4]" belongs to, and where that branch ends)
# cannot be read from this text -- confirm against the formatted original.
# NOTE(review): acc_mini_test is assigned in the training loop but not used
# in the visible code.
def launch(self, resume=False, cfg_path_source=None, test_only=False, tag="norm"): time_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") self.ckpt_dir = os.path.join(self.exp_dir, "checkpoints") self.log_dir = os.path.join(self.exp_dir, "logs") self.cfg_path = os.path.join(self.exp_dir, "cfg.yaml") self.res_dir = os.path.join( self.exp_dir, "results-{tag}-{time}.yaml".format(tag=tag, time=time_str)) exists = os.path.exists(self.exp_dir) if not exists or not resume: if exists: print("Removing previous experiment...") id1 = uuid.uuid1() temp_dir = tempfile.gettempdir() copyfile(cfg_path_source, os.path.join(temp_dir, str(id1) + "cfg.yaml")) rmtree(self.exp_dir) os.makedirs(self.ckpt_dir) copyfile(os.path.join(temp_dir, str(id1) + "cfg.yaml"), cfg_path_source) else: os.makedirs(self.ckpt_dir) self.log_text_dir = os.path.join( self.exp_dir, "output-{tag}-{time}.log".format(tag=tag, time=time_str)) sys.stdout = Logger(self.log_text_dir) if hasattr(self, "__config__"): print("Launching experiment with the configuration description:") print(yaml.dump(self.__config__)) self.writer = SummaryWriter(self.log_dir) if cfg_path_source != None: # backup the config file in the log folder if cfg_path_source != self.cfg_path: copyfile(cfg_path_source, self.cfg_path) print("\n[Phase 1] : Data Preparation") self.trainer.set_data_loaders(self.trainloader, self.testloader, self.mini_testloader) print("\n[Phase 2] : Model setup") print(self.trainer.net) print("total # of parameters = {:,} ({:,} trainable)".format( sum([torch.numel(p) for p in self.trainer.net.parameters()]), sum([ torch.numel(p) for p in filter(lambda x: x.requires_grad, self.trainer.net.parameters()) ]), )) if torch.cuda.is_available(): self.trainer.net.cuda() # Test model: Temporary hack self.trainer.test_run_model() if resume: print("| Resuming from checkpoint...") self.load_state() print("Initial Validation...") acc_valid = self.trainer.run(self.epoch, self.num_epochs, is_training=False) # import pdb; 
pdb.set_trace() acc_valid.summarize() self.best_metrics = acc_valid print("\nSaving the Best Checkpoint...") # import pdb; pdb.set_trace() self.save_state() print("Best Metrics: {acc}\n".format( acc=self.best_metrics.summary_str(dtype="scalar", level=0))) if not test_only: print("\n[Phase 3] : Training model") print("| Training Epochs = " + str(self.num_epochs)) elapsed_time = 0 while self.epoch <= self.num_epochs: print("Running at [{}] ...".format(self.exp_dir)) start_time = time.time() acc_train = self.trainer.run(self.epoch, self.num_epochs, is_training=True) acc_valid = self.trainer.run(self.epoch, self.num_epochs, is_training=False) acc_mini_test = self.trainer.run(self.epoch, self.num_epochs, is_training=False, mini_test=True) acc_train.summarize() acc_valid.summarize() self.log_acc(acc_train, acc_valid) self.epoch += 1 if self.best_metrics_comparator(self.best_metrics, acc_valid): print("\nSaving the Best Checkpoint...") self.best_metrics = acc_valid self.save_state() print("Best Metrics: {acc}\n".format( acc=self.best_metrics.summary_str(dtype="scalar", level=0))) print(acc_valid) epoch_time = time.time() - start_time elapsed_time += epoch_time print("| Elapsed time : %d:%02d:%02d" % (get_hms(elapsed_time))) else: print("\n[Phase 4] : Final Performance") print("* Test results : {acc}".format(acc=self.best_metrics)) print("Restoring the Best Checkpoint...") self.load_state() # self.best_metrics.summarize() record = self.best_metrics.filter(dtype="scalar", op=lambda x: float(x)) offset = 5 self.trainer.test_run_model() l_constant = l2_lipschitz_constant_checker(self.trainer.net) # streamline the module during post_steps with Streamline(self.trainer.net, True, False): print("Current l_constant = {}".format(l_constant)) for index, post_step in enumerate(self.post_steps): print("\n[Phase {}] : ".format(index + offset), end="") post_step( self.trainer.net, (self.trainloader, self.testloader, self.mini_testloader), l_constant=l_constant, record=record, 
device=next(self.trainer.net.parameters()).device, ) print() print(yaml.safe_dump(record)) print("Saving results into a dictionary...") print(self.res_dir) with open(self.res_dir, "w") as f: yaml.safe_dump(record, f) print("finished")
def forward(self, pred, real):
    """Return the squared sum of ``real - pred`` divided by the squared element count."""
    residual = real - pred
    element_count = torch.numel(residual.data)
    return residual.sum().pow(2) / (element_count ** 2)
# Training configuration.
n_epoch = 50
PATH_model = "./model/modelAfinal.pt"

# Initialisation: build the network and move it to the GPU when available.
print(PATH_model)
model = CNN_A()
if torch.cuda.is_available():
    model = model.cuda()

# Report the size of every state_dict entry, then the overall total.
# NOTE(review): state_dict() can contain non-parameter entries (buffers), so
# the total may exceed the number of trainable parameters.
tensor_list = list(model.state_dict().items())
for layer_tensor_name, tensor in tensor_list:
    print('Layer {}: {} elements'.format(layer_tensor_name, torch.numel(tensor)))
number_parameter = sum(torch.numel(entry) for _, entry in tensor_list)
print('total amount of parameters : {}'.format(number_parameter))

# Adam optimizer (used for the loss backpropagation).
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=5e-4)

# Loss object: cross entropy.
crossentropy = nn.CrossEntropyLoss()

# Performance trackers, one independent empty array each.
train_loss, train_accuracy, val_loss = (np.array([]) for _ in range(3))
def num_params(self):
    """Return the total element count over ``self.model.module.parameters()``."""
    return sum(th.numel(param.data) for param in self.model.module.parameters())
def model_parameter_number(model):
    """Print the total element count over ``model.parameters()``.

    The count is only printed; the function itself always returns 0.
    """
    total_elements = sum(torch.numel(param.data) for param in model.parameters())
    print('model\'s parameter number is :', total_elements)
    return 0
loss.backward() loss_lst.append(loss.data) print('training the nmf layer') print(loss.data) for A in net.lsqnonneglst.parameters(): A.data = A.data.sub_(lr_nmf * A.grad.data) A.data = A.data.clamp(min=0) total_loss += loss.data # A.requires_grad = False # train the linear classifier print('training the classifier') for k in range(1000): net.zero_grad() pred = net.linear(S_lst[-1].data) loss = criterion(l_batch * pred, l_batch * label) loss = loss * torch.numel(l_batch) / torch.sum(l_batch) loss.backward() if (k + 1) % 100 == 0: print(loss.data) for A in net.linear.parameters(): A.data = A.data.sub_(lr_cl * A.grad.data) # for A in net.lsqnonneglst.parameters(): # A.requires_grad = True print('epoch = ', epo, '\n', total_loss) total_loss_lst.append(total_loss) # In[25]: # Doing forward propagation on the whole dataset, remember to SAVE S and prod! def get_whole_output(net, dataset, param_lst=None):
def dice_metric(input, target):
    """Per-sample Dice score with a small smoothing term in the denominator.

    Both tensors are summed over dims 1, 2 and 3, so the result holds one
    value per index of dim 0. The smoothing term equals one over the number
    of elements in ``input[0]``.
    """
    reduce_dims = (1, 2, 3)
    overlap = (input * target).sum(dim=reduce_dims)

    elements_per_sample = torch.numel(input[0])
    smooth = torch.ones_like(overlap) * (1 / elements_per_sample)

    denominator = input.sum(dim=reduce_dims) + target.sum(dim=reduce_dims) + smooth
    return (2. * overlap) / denominator
def sum_params(model):
    """Return the total number of elements over all of ``model.parameters()``."""
    return sum(torch.numel(param) for param in model.parameters())
def num_episodes(self):
    """Return the number of elements stored in ``self.total_rewards``."""
    rewards = self.total_rewards
    return torch.numel(rewards)
# main: train a DenseNet classifier described by `opt`, evaluate it after
# every epoch and finally report the checkpoint with the best validation F1.
#
# What the visible code does, in order:
#   * builds the output directory
#     <config.result_dir>/<model_name>_<optimizer><learning_rate>_<drop_rate>,
#     dumps opt.__dict__ to config.json there and creates the log/ and ckpts/
#     sub-directories;
#   * creates the 'trn', 'val' and 'tst' CustomDatasetDataLoader objects and
#     logs their lengths;
#   * builds the DenseNet from the opt fields, moves it to model.device and
#     logs its parameter count;
#   * when opt.is_test and opt.restore_checkpoint are both set, loads that
#     checkpoint into the model;
#   * uses Adam when opt.optimizer == 'adam', otherwise SGD with momentum /
#     nesterov / weight decay taken from opt, plus a StepLR scheduler;
#   * per epoch: runs model.set_input / forward / backward and an optimizer
#     step for every training batch (logging the loss every 100 batches),
#     evaluates on the validation and test sets, saves the model through
#     ModelSaver, tracks the best validation F1 and maintains a patience
#     counter that breaks the loop once it reaches zero;
#   * afterwards loads 'model_step_<best_eval_epoch>.pt' from the checkpoint
#     directory and logs WA / UAR / F1 and the confusion matrix for the
#     validation and test sets.
#
# NOTE(review): best_eval_epoch is only assigned when a validation F1 exceeds
# best_eval_f1 (initially 0); if that never happens the final section reads
# an unassigned name -- confirm whether this can occur in practice.
# NOTE(review): a missing best checkpoint is logged as an error but the
# process then leaves through exit(0).
# NOTE(review): the line structure of this definition was collapsed in this
# file view; exact nesting should be confirmed against the formatted original.
def main(opt): output_dir = join(config.result_dir, opt.model_name + '_{}{}_{}'.format(opt.optimizer, opt.learning_rate, opt.drop_rate)) make_path(output_dir) output_config = join(output_dir, 'config.json') with open(output_config, 'w') as f: optDict = opt.__dict__ json.dump(optDict, f) log_dir = join(output_dir, 'log') checkpoint_dir = join(output_dir, 'ckpts') make_path(log_dir) make_path(checkpoint_dir) logger = get_logger(log_dir, 'none') logger.info('[Output] {}'.format(output_dir)) ## create a dataset given opt.dataset_mode and other options, the trn_db neither Dataset nor Dataloader trn_db = CustomDatasetDataLoader(opt, config.data_dir, config.target_dir, setname='trn', is_train=True) val_db = CustomDatasetDataLoader(opt, config.data_dir, config.target_dir, setname='val', is_train=False) tst_db = CustomDatasetDataLoader(opt, config.data_dir, config.target_dir, setname='tst', is_train=False) logger.info('The number of training samples = {}'.format(len(trn_db))) logger.info('The number of validation samples = {}'.format(len(val_db))) logger.info('The number of testing samples = {}'.format(len(tst_db))) model_saver = ModelSaver(checkpoint_dir) model = DenseNet(opt.gpu_id, growth_rate=opt.growth_rate, block_config=opt.block_config, num_init_features=opt.num_init_features, bn_size=opt.bn_size, compression_rate=opt.reduction, drop_rate=opt.drop_rate, num_classes=opt.num_classes) # to gpu card model.to(model.device) num_parameters = sum(torch.numel(parameter) for parameter in model.parameters()) logger.info('[Model] parameters {}'.format(num_parameters)) # logger.info(model) # Prepare model if opt.is_test and opt.restore_checkpoint: logger.info('[Model] At testing stage and restore from {}'.format(opt.restore_checkpoint)) checkpoint = torch.load(opt.restore_checkpoint) model.load_state_dict(checkpoint) else: checkpoint = {} # initialized the optimizer if opt.optimizer == 'adam': optimizer = torch.optim.Adam(model.parameters(), lr=opt.learning_rate) else: optimizer 
= torch.optim.SGD(model.parameters(), lr=opt.learning_rate, momentum=opt.momentum, nesterov=opt.nesterov, weight_decay=opt.weight_decay) scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.reduce_half_lr_epoch, gamma=opt.reduce_half_lr_rate) best_eval_f1 = 0 # record the best eval UAR patience = opt.patience for epoch in range(opt.max_epoch): for i, batch in enumerate(trn_db): # inner loop within one epoch model.set_input(batch) model.forward() batch_loss = model.loss optimizer.zero_grad() model.backward() optimizer.step() if i % 100 == 0: logger.info('\t Cur train batch loss {}'.format(batch_loss)) # for evaluation if epoch % 1 == 0: logger.info("============ Evaluation Epoch {} ============".format(epoch)) logger.info("Cur learning rate {}".format(optimizer.state_dict()['param_groups'][0]['lr'])) val_log = evaluation(model, val_db) logger.info(f"[Validation] Loss: {val_log['loss']:.2f}," f"\t F1: {val_log['F1']*100:.2f}," f"\t WA: {val_log['WA']*100:.2f}," f"\t UA: {val_log['UA']*100:.2f},\n") test_log = evaluation(model, tst_db) logger.info(f"[Testing] Loss: {test_log['loss']:.2f}," f"\t F1: {test_log['F1']*100:.2f}," f"\t WA: {test_log['WA']*100:.2f}," f"\t UA: {test_log['UA']*100:.2f},\n") logger.info(test_log['cm']) logger.info('Save model at {} epoch'.format(epoch)) model_saver.save(model, epoch) # update the current best model based on validation results if val_log['F1'] > best_eval_f1: best_eval_epoch = epoch best_eval_f1 = val_log['F1'] # reset to init patience = opt.patience # for early stop if patience <= 0: break else: patience -= 1 # update the learning rate scheduler.step() # print best eval result logger.info('Loading best model found on val set: epoch-%d' % best_eval_epoch) checkpoint_path = os.path.join(checkpoint_dir, 'model_step_{}.pt'.format(best_eval_epoch)) if not os.path.exists(checkpoint_path): logger.error("Load checkpoint error, not exist such file") exit(0) ck = torch.load(checkpoint_path) model.load_state_dict(ck) val_log = 
evaluation(model, val_db, save_dir=log_dir, set_name='val') logger.info('[Val] result WA: %.4f UAR %.4f F1 %.4f' % (val_log['WA'], val_log['UA'], val_log['F1'])) logger.info('\n{}'.format(val_log['cm'])) tst_log = evaluation(model, tst_db, save_dir=log_dir, set_name='tst') logger.info('[Tst] result WA: %.4f UAR %.4f F1 %.4f' % (tst_log['WA'], tst_log['UA'], tst_log['F1'])) logger.info('\n{}'.format(tst_log['cm']))
def __call__(pred, y, hinge=0):
    """Mean hinge loss over every element of ``pred``.

    Each element contributes ``1 - pred * y`` clamped from below at
    ``hinge``; the sum is divided by the number of elements in ``pred``.
    """
    margins = 1 - pred * y
    clamped = torch.clamp(margins, min=hinge)
    return clamped.sum() / torch.numel(pred)
def vae_loss(x, mu, logsigma, recon_x, beta=1):
    """Return the mean-squared reconstruction error plus a ``beta``-weighted KL term.

    The KL term is ``-0.5 * sum(1 + logsigma - mu^2 - exp(logsigma))``
    divided by the number of elements in ``x``.
    """
    reconstruction = F.mse_loss(x, recon_x, reduction='mean')

    kl_elements = 1 + logsigma - mu.pow(2) - logsigma.exp()
    kl_total = -0.5 * torch.sum(kl_elements)
    kl_per_element = kl_total / torch.numel(x)

    return reconstruction + kl_per_element * beta
def density(tensor):
    """Return the fraction of entries of ``tensor`` whose absolute value exceeds zero."""
    nonzero_mask = tensor.abs().gt(0)
    nonzero_count = nonzero_mask.sum().item()
    return float(nonzero_count) / torch.numel(tensor)
return x if __name__ == '__main__': batch_size = 25 seq_len = 128 class_num = 5 ch_num = 4 filterbanks = torch.from_numpy(lin_tri_filter_shape( 32, 256, 100, 0, 50)).to(torch.float).cuda() # [129, 32] net = SeqSleepNet(filterbanks=filterbanks, ch_num=ch_num, seq_len=seq_len, class_num=class_num) #net = SeqSleepNet(filterbanks=filterbanks, seq_len=seq_len, class_num=class_num) net = net.cuda() inputs = torch.rand(batch_size, seq_len, ch_num, int(100 * 30)) # [bs, seq_len, 30*100] inputs = preprocessing(inputs) # [bs, seq_len, 29, 129] print(inputs.shape) inputs = inputs.cuda() outputs = net(inputs) # [bs, seq_len, class_num] params = list(net.parameters()) print(outputs.size()) print("total param num is: {}".format(sum(torch.numel(p) for p in params))) ''' for name, param in net.named_parameters(): print(name, param.shape) '''
# main: compute, for selected test examples, the influence of every training
# example on a fine-tuned BERT sequence classifier and pickle the results.
#
# What the visible code does, in order:
#   * parses the command line (data/model/output locations, --task, batch
#     sizes, damping, test-index selection, LiSSA repeat/depth, ...), picks
#     the device and seeds random, numpy and torch with --seed;
#   * raises ValueError unless --influence_on_decision is given, and raises
#     ValueError when --fp16 is given;
#   * creates the output directory when it is missing (an existing non-empty
#     directory only produces a log message);
#   * selects the label list from --task (SA, NLI, NLI_negation, NLI_natural),
#     loads the tokenizer from --bert_model and the classifier from
#     --trained_model_dir;
#   * collects into param_influence every parameter whose name contains none
#     of the `frozen` substrings; the remaining parameters get
#     requires_grad = False, except names containing
#     'bert.embeddings.word_embeddings.', which are left untouched so the
#     saliency map can be computed;
#   * converts the train and test examples to features and wraps them in
#     TensorDatasets / sequential DataLoaders (batch size 1 for the
#     per-example loaders);
#   * for each test example whose guid lies in the requested range: predicts
#     its label, takes the gradient of the test loss w.r.t. param_influence,
#     optionally builds a token saliency list, computes the inverse
#     Hessian-vector product with get_inverse_hvp_lissa, and stores for each
#     training example the dot product of that vector with the flattened
#     training-loss gradient;
#   * pickles the influence array (and, when saliency is enabled, the
#     saliency tuple) into the output directory, named by guid.
#
# NOTE(review): a --task value outside the four handled names leaves
# label_list (and later train_examples / test_examples) unassigned.
# NOTE(review): the pickle.dump calls pass open(..., "wb") directly, so the
# file objects are never closed explicitly.
# NOTE(review): `model.half()` follows an unconditional raise in the --fp16
# branch and therefore cannot run.
# NOTE(review): the line structure of this definition was collapsed in this
# file view; exact nesting should be confirmed against the formatted original.
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--task", default=None, type=str, required=True, help="Sentiment analysis or natural language inference? (SA or NLI)") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--trained_model_dir", default="", type=str, help= "Where is the fine-tuned (with the cloze-style LM objective) BERT model?" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. 
\n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--freeze_bert', action='store_true', help="Whether to freeze BERT") parser.add_argument('--full_bert', action='store_true', help="Whether to use full BERT") parser.add_argument('--num_train_samples', type=int, default=-1, help="-1 for full train set, otherwise please specify") parser.add_argument('--damping', type=float, default=0.0, help="probably need damping for deep models") parser.add_argument('--test_idx', type=int, default=1, help="test index we want to examine") parser.add_argument( '--influence_on_decision', action='store_true', help= "Whether to compute influence on decision (rather than influence on ground truth)" ) parser.add_argument("--if_compute_saliency", default=1, type=int) parser.add_argument('--start_test_idx', type=int, default=-1, help="when not -1, --test_idx will be disabled") parser.add_argument('--end_test_idx', type=int, default=-1, help="when not -1, --test_idx will be disabled") parser.add_argument("--lissa_repeat", default=1, type=int) parser.add_argument("--lissa_depth", default=1.0, type=float) args = parser.parse_args() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.influence_on_decision: raise ValueError( "To use loss function w.r.t. the ground truth, manually disable this error in the code." 
) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) logger.info( "WARNING: Output directory already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) mnli_processor = MnliProcessor() hans_processor = HansProcessor() sst_processor = Sst2Processor() if args.task == "SA": label_list = sst_processor.get_labels() elif args.task == "NLI": label_list = mnli_processor.get_labels() elif args.task == "NLI_negation": label_list = mnli_processor.get_labels() elif args.task == "NLI_natural": label_list = mnli_processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = MyBertForSequenceClassification.from_pretrained( args.trained_model_dir, num_labels=num_labels) if args.fp16: raise ValueError("Not sure if FP16 precision works yet.") model.half() model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) # for n, p in param_optimizer: # print(n) # sys.exit() if args.freeze_bert: frozen = ['bert'] elif args.full_bert: frozen = [] else: frozen = [ 'bert.embeddings.', 'bert.encoder.layer.0.', 'bert.encoder.layer.1.', 'bert.encoder.layer.2.', 'bert.encoder.layer.3.', 'bert.encoder.layer.4.', 'bert.encoder.layer.5.', 'bert.encoder.layer.6.', 'bert.encoder.layer.7.', ] # *** change here to filter out params we don't want to track *** param_influence = [] for n, p in param_optimizer: if (not any(fr in n for fr in frozen)): param_influence.append(p) elif 'bert.embeddings.word_embeddings.' 
in n: pass # need gradients through embedding layer for computing saliency map else: p.requires_grad = False param_shape_tensor = [] param_size = 0 for p in param_influence: tmp_p = p.clone().detach() param_shape_tensor.append(tmp_p) param_size += torch.numel(tmp_p) logger.info(" Parameter size = %d", param_size) if args.task == "SA": train_examples = sst_processor.get_train_examples( args.data_dir, args.num_train_samples) elif args.task == "NLI": train_examples = mnli_processor.get_train_examples( args.data_dir, args.num_train_samples) elif args.task == "NLI_negation": train_examples = mnli_processor.get_train_examples( args.data_dir, args.num_train_samples) elif args.task == "NLI_natural": train_examples = mnli_processor.get_train_examples( args.data_dir, args.num_train_samples) train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Train set *****") logger.info(" Num examples = %d", len(train_examples)) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_id = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_guids = torch.tensor([f.guid for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_id, all_guids) train_dataloader_wbatch = DataLoader(train_data, sampler=SequentialSampler(train_data), batch_size=args.train_batch_size) train_dataloader = DataLoader(train_data, sampler=SequentialSampler(train_data), batch_size=1) if args.task == "SA": test_examples = sst_processor.get_dev_examples(args.data_dir) elif args.task == "NLI": test_examples = hans_processor.get_test_examples(args.data_dir) elif args.task == "NLI_negation": test_examples = 
hans_processor.get_neg_test_examples(args.data_dir) elif args.task == "NLI_natural": test_examples = mnli_processor.get_dev_examples(args.data_dir) test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Test set *****") logger.info(" Num examples = %d", len(test_examples)) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_label_id = torch.tensor([f.label_id for f in test_features], dtype=torch.long) all_guids = torch.tensor([f.guid for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_id, all_guids) test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=1) damping = args.damping test_idx = args.test_idx start_test_idx = args.start_test_idx end_test_idx = args.end_test_idx for input_ids, input_mask, segment_ids, label_ids, guids in test_dataloader: model.eval() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) train_dataloader_lissa = DataLoader(train_data, batch_size=args.train_batch_size, shuffle=True, drop_last=True) guid = guids[0].item( ) # test set loader must have a batch size of 1 now if start_test_idx != -1 and end_test_idx != -1: if guid < start_test_idx: continue if guid > end_test_idx: break else: if guid < test_idx: continue if guid > test_idx: break input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) ######## GET TEST EXAMPLE DECISION ######## with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() outputs = np.argmax(logits, axis=1) pred_label_ids = torch.from_numpy(outputs).long().to(device) if 
label_ids.item() == pred_label_ids.item(): test_pred_status = "correct" else: test_pred_status = "wrong" if args.influence_on_decision: label_ids = torch.from_numpy(outputs).long().to(device) ################ ######## L_TEST GRADIENT ######## model.zero_grad() test_loss = model(input_ids, segment_ids, input_mask, label_ids) test_grads = autograd.grad(test_loss, param_influence) ################ ######## TEST EXAMPLE SALIENCY MAP ######## if args.if_compute_saliency: saliency_scores = saliency_map(model, input_ids, segment_ids, input_mask, pred_label_ids) test_tok_sal_list = [] for tok, sal in zip( tokenizer.convert_ids_to_tokens( input_ids.view(-1).cpu().numpy()), saliency_scores): if tok == '[PAD]': break test_tok_sal_list.append((tok, sal)) ################ ######## IHVP ######## model.train() logger.info("######## START COMPUTING IHVP ########") inverse_hvp = get_inverse_hvp_lissa( test_grads, model, device, param_influence, train_dataloader_lissa, damping=damping, num_samples=args.lissa_repeat, recursion_depth=int(len(train_examples) * args.lissa_depth)) logger.info("######## FINISHED COMPUTING IHVP ########") ################ influences = np.zeros(len(train_dataloader.dataset)) train_tok_sal_lists = [] for train_idx, (_input_ids, _input_mask, _segment_ids, _label_ids, _) in enumerate( tqdm(train_dataloader, desc="Train set index")): model.train() _input_ids = _input_ids.to(device) _input_mask = _input_mask.to(device) _segment_ids = _segment_ids.to(device) _label_ids = _label_ids.to(device) ######## L_TRAIN GRADIENT ######## model.zero_grad() train_loss = model(_input_ids, _segment_ids, _input_mask, _label_ids) train_grads = autograd.grad(train_loss, param_influence) influences[train_idx] = torch.dot( inverse_hvp, gather_flat_grad(train_grads)).item() ################ ######## TRAIN EXAMPLE SALIENCY MAP ######## # if args.if_compute_saliency: # with torch.no_grad(): # logits = model(_input_ids, _segment_ids, _input_mask) # logits = logits.detach().cpu().numpy() 
# outputs = np.argmax(logits, axis=1) # _pred_label_ids = torch.from_numpy(outputs).long().to(device) # saliency_scores = saliency_map(model, _input_ids, _segment_ids, _input_mask, _pred_label_ids) # train_tok_sal_list = [] # for tok, sal in zip(tokenizer.convert_ids_to_tokens(_input_ids.view(-1).cpu().numpy()), saliency_scores): # if tok == '[PAD]': # break # train_tok_sal_list.append((tok, sal)) # train_tok_sal_lists.append(train_tok_sal_list) ################ if args.influence_on_decision: pickle.dump( influences, open( os.path.join(args.output_dir, "influences_test_" + str(guid) + ".pkl"), "wb")) else: pickle.dump( influences, open( os.path.join(args.output_dir, "influences_on_x_test_" + str(guid) + ".pkl"), "wb")) if args.if_compute_saliency: pickle.dump( (test_tok_sal_list, train_tok_sal_lists, test_pred_status), open( os.path.join(args.output_dir, "saliency_test_" + str(guid) + ".pkl"), "wb"))
def attribute(
    self,
    inputs: TensorOrTupleOfTensorsGeneric,
    baselines: BaselineType = None,
    target: TargetType = None,
    additional_forward_args: Any = None,
    feature_mask: Union[None, Tensor, Tuple[Tensor, ...]] = None,
    perturbations_per_eval: int = 1,
    **kwargs: Any
) -> TensorOrTupleOfTensorsGeneric:
    r"""
    Args:

        inputs (tensor or tuple of tensors): Input for which ablation
                    attributions are computed. If forward_func takes a
                    single tensor as input, a single input tensor should
                    be provided. If forward_func takes multiple tensors as
                    input, a tuple of the input tensors should be provided.
                    It is assumed that for all given input tensors,
                    dimension 0 corresponds to the number of examples
                    (aka batch size), and if multiple input tensors are
                    provided, the examples must be aligned appropriately.
        baselines (scalar, tensor, tuple of scalars or tensors, optional):
                    Baselines define the reference value which replaces
                    each feature when ablated.
                    Baselines can be provided as:

                    - a single tensor, if inputs is a single tensor, with
                      exactly the same dimensions as inputs or
                      broadcastable to match the dimensions of inputs

                    - a single scalar, if inputs is a single tensor, which
                      will be broadcasted for each input value in the
                      input tensor.

                    - a tuple of tensors or scalars, the baseline
                      corresponding to each tensor in the inputs' tuple
                      can be:

                      - either a tensor with matching dimensions to the
                        corresponding tensor in the inputs' tuple, or the
                        first dimension is one and the remaining
                        dimensions match the corresponding input tensor.

                      - or a scalar, corresponding to a tensor in the
                        inputs' tuple. This scalar value is broadcasted
                        for the corresponding input tensor.

                    In the cases when `baselines` is not provided, we
                    internally use a zero scalar corresponding to each
                    input tensor.
                    Default: None
        target (int, tuple, tensor or list, optional): Output indices for
                    which gradients are computed (for classification
                    cases, this is usually the target class).
                    If the network returns a scalar value per example,
                    no target index is necessary.
                    For general 2D outputs, targets can be either:

                    - a single integer or a tensor containing a single
                      integer, which is applied to all input examples

                    - a list of integers or a 1D tensor, with length
                      matching the number of examples in inputs (dim 0).
                      Each integer is applied as the target for the
                      corresponding example.

                    For outputs with > 2 dimensions, targets can be
                    either:

                    - A single tuple, which contains #output_dims - 1
                      elements. This target index is applied to all
                      examples.

                    - A list of tuples with length equal to the number of
                      examples in inputs (dim 0), and each tuple
                      containing #output_dims - 1 elements. Each tuple is
                      applied as the target for the corresponding example.

                    Default: None
        additional_forward_args (any, optional): If the forward function
                    requires additional arguments other than the inputs
                    for which attributions should not be computed, this
                    argument can be provided. It must be either a single
                    additional argument of a Tensor or arbitrary
                    (non-tuple) type or a tuple containing multiple
                    additional arguments including tensors or any
                    arbitrary python types. These arguments are provided
                    to forward_func in order following the arguments in
                    inputs.
                    For a tensor, the first dimension of the tensor must
                    correspond to the number of examples. For all other
                    types, the given argument is used for all forward
                    evaluations.
                    Note that attributions are not computed with respect
                    to these arguments.
                    Default: None
        feature_mask (tensor or tuple of tensors, optional):
                    feature_mask defines a mask for the input, grouping
                    features which should be ablated together.
                    feature_mask should contain the same number of
                    tensors as inputs. Each tensor should be the same
                    size as the corresponding input or broadcastable to
                    match the input tensor. Each tensor should contain
                    integers in the range 0 to num_features - 1, and
                    indices corresponding to the same feature should have
                    the same value.
                    Note that features within each input tensor are
                    ablated independently (not across tensors).
                    If the forward function returns a single scalar per
                    batch, we enforce that the first dimension of each
                    mask must be 1, since attributions are returned
                    batch-wise rather than per example, so the
                    attributions must correspond to the same features
                    (indices) in each input example.
                    If None, then a feature mask is constructed which
                    assigns each scalar within a tensor as a separate
                    feature, which is ablated independently.
                    Default: None
        perturbations_per_eval (int, optional): Allows ablation of
                    multiple features to be processed simultaneously in
                    one call to forward_fn.
                    Each forward pass will contain a maximum of
                    perturbations_per_eval * #examples samples.
                    For DataParallel models, each batch is split among
                    the available devices, so evaluations on each
                    available device contain at most
                    (perturbations_per_eval * #examples) / num_devices
                    samples.
                    If the forward function returns a single scalar per
                    batch, perturbations_per_eval must be set to 1.
                    Default: 1
        **kwargs (Any, optional): Any additional arguments used by child
                    classes of FeatureAblation (such as Occlusion) to
                    construct ablations. These arguments are ignored when
                    using FeatureAblation directly.
                    Default: None

    Returns:
        *tensor* or tuple of *tensors* of **attributions**:
        - **attributions** (*tensor* or tuple of *tensors*):
                    The attributions with respect to each input feature.
                    If the forward function returns a scalar value per
                    example, attributions will be the same size as the
                    provided inputs, with each value providing the
                    attribution of the corresponding input index.
                    If the forward function returns a scalar per batch,
                    then attribution tensor(s) will have first dimension
                    1 and the remaining dimensions will match the input.
                    If a single tensor is provided as inputs, a single
                    tensor is returned. If a tuple of tensors is provided
                    for inputs, a tuple of corresponding sized tensors is
                    returned.

    Examples::

        >>> # SimpleClassifier takes a single input tensor of size Nx4x4,
        >>> # and returns an Nx3 tensor of class probabilities.
        >>> net = SimpleClassifier()
        >>> # Generating random input with size 2 x 4 x 4
        >>> input = torch.randn(2, 4, 4)
        >>> # Defining FeatureAblation interpreter
        >>> ablator = FeatureAblation(net)
        >>> # Computes ablation attribution, ablating each of the 16
        >>> # scalar input independently.
        >>> attr = ablator.attribute(input, target=1)

        >>> # Alternatively, we may want to ablate features in groups, e.g.
        >>> # grouping each 2x2 square of the inputs and ablating them together.
        >>> # This can be done by creating a feature mask as follows, which
        >>> # defines the feature groups, e.g.:
        >>> # +---+---+---+---+
        >>> # | 0 | 0 | 1 | 1 |
        >>> # +---+---+---+---+
        >>> # | 0 | 0 | 1 | 1 |
        >>> # +---+---+---+---+
        >>> # | 2 | 2 | 3 | 3 |
        >>> # +---+---+---+---+
        >>> # | 2 | 2 | 3 | 3 |
        >>> # +---+---+---+---+
        >>> # With this mask, all inputs with the same value are ablated
        >>> # simultaneously, and the attribution for each input in the same
        >>> # group (0, 1, 2, and 3) per example are the same.
        >>> # The attributions can be calculated as follows:
        >>> # feature mask has dimensions 1 x 4 x 4
        >>> feature_mask = torch.tensor([[[0,0,1,1],[0,0,1,1],
        >>>                             [2,2,3,3],[2,2,3,3]]])
        >>> attr = ablator.attribute(input, target=1, feature_mask=feature_mask)
    """
    # Record the caller's input form before everything is normalised to
    # tuples, so the result can be handed back in the same form.
    inputs_were_tuple = _is_tuple(inputs)
    inputs, baselines = _format_input_baseline(inputs, baselines)
    additional_forward_args = _format_additional_forward_args(
        additional_forward_args)
    batch_size = inputs[0].shape[0]
    if feature_mask is not None:
        feature_mask = _format_input(feature_mask)
    assert (
        isinstance(perturbations_per_eval, int)
        and perturbations_per_eval >= 1
    ), "Perturbations per evaluation must be an integer and at least 1."

    with torch.no_grad():
        # Reference evaluation with every feature present; each ablated
        # evaluation below is compared against it.
        initial_eval = _run_forward(self.forward_func, inputs, target,
                                    additional_forward_args)

        agg_output_mode = _find_output_mode_and_verify(
            initial_eval, batch_size, perturbations_per_eval, feature_mask)

        if not agg_output_mode:
            initial_eval = initial_eval.reshape(1, batch_size)

        # Accumulators take the dtype of the forward output (or the Python
        # type of the output when it is not a tensor).
        if isinstance(initial_eval, Tensor):
            attrib_type = cast(dtype, initial_eval.dtype)
        else:
            attrib_type = cast(dtype, type(initial_eval))

        def _accumulator_like(tensor):
            # One row per batch in aggregate mode, full input shape otherwise.
            return tensor[0:1] if agg_output_mode else tensor

        total_attrib = [
            torch.zeros_like(_accumulator_like(tensor), dtype=attrib_type)
            for tensor in inputs
        ]

        # Weights count how often each element was ablated; they matter
        # when ablations may overlap.
        if self.use_weights:
            weights = [
                torch.zeros_like(_accumulator_like(tensor)).float()
                for tensor in inputs
            ]

        # Ablate the features of one input tensor at a time.
        for idx, tensor in enumerate(inputs):
            # Nothing to ablate in an empty tensor.
            if tensor.numel() == 0:
                continue

            # Trailing singleton dims that make the per-example difference
            # broadcastable against this input tensor.
            trailing_ones = (1, ) * (len(tensor.shape) - 1)

            ablations = self._ablation_generator(
                idx, inputs, additional_forward_args, target, baselines,
                feature_mask, perturbations_per_eval, **kwargs)
            for cur_inputs, cur_add_args, cur_target, cur_mask in ablations:
                # modified_eval dimensions: 1D tensor with length
                # equal to #num_examples * #features in batch
                modified_eval = _run_forward(
                    self.forward_func,
                    cur_inputs,
                    cur_target,
                    cur_add_args,
                )

                # eval_diff dimensions:
                # (#features in batch, #num_examples, 1, ..., 1),
                # i.e. one more dimension than the input tensor.
                if agg_output_mode:
                    eval_diff = initial_eval - modified_eval
                else:
                    per_example = (
                        initial_eval - modified_eval.reshape(-1, batch_size))
                    eval_diff = per_example.reshape(
                        (-1, batch_size) + trailing_ones)

                if self.use_weights:
                    weights[idx] += cur_mask.float().sum(dim=0)
                total_attrib[idx] += (
                    eval_diff * cur_mask.to(attrib_type)).sum(dim=0)

        # Normalise by the ablation counts when weighting is enabled.
        if self.use_weights:
            attrib = tuple(
                summed.float() / count
                for summed, count in zip(total_attrib, weights))
        else:
            attrib = tuple(total_attrib)
        _result = _format_output(inputs_were_tuple, attrib)
    return _result
# Check whether an object is a PyTorch tensor
x = [12, 23, 34, 45, 56, 67, 78, 89]
# first: is the object a tensor? second: is it a storage object?
print(T.is_tensor(x), T.is_storage(x), sep="\n")

y = T.randn(2, 2, 3)
print(y)
print(T.is_tensor(y), T.is_storage(y), sep="\n")
# size of the tensor
print(y.shape)
# total number of elements in the tensor
print(y.numel())

# zeros()
z = T.zeros(4, 5)
print(z)
print(z.shape)
print(z.numel())

# eye()
w1 = T.eye(3, 4)
print(w1)
print(w1.shape)
print(w1.numel())

w2 = T.eye(5, 4)
print(w2)
print(w2.shape)
def forward(self, pred, real):
    """Return the sum of |real - pred| divided by the number of elements.

    Note: despite the historical ``mse`` naming, the value computed is the
    mean *absolute* difference, not a squared error.
    """
    residual = real - pred
    return residual.abs().sum() / residual.numel()
def train(paramdict):
    """Run a few batched maze episodes with a network and save an animation.

    Reads a pickled parameter dict from ``paramdict['file']``, builds a
    ``Network`` from it, optionally restores weights from
    ``./tmp/torchmodel_<suffix>.dat`` (when ``paramdict['initialize'] == 0``),
    then plays ``params['nbiter']`` episodes (forced to 3 below) of
    ``params['eplen']`` steps each on a grid maze. One matplotlib frame is
    captured per step for the first batch element and the frames are written
    to ``anim.gif``.

    Losses are accumulated but no backward pass or optimizer step is
    performed here (those calls are commented out).

    Relies on module-level names not defined in this function:
    ``Network``, ``TOTALNBINPUTS``, ``RFSIZE``, ``ADDINPUT``, ``fig``,
    ``plt``, ``animation``.

    NOTE(review): this function was reconstructed from a whitespace-flattened
    source; the nesting of the frame-capture code (inside the per-step loop)
    and of the final animation save (after all episodes) should be confirmed.
    """
    fname = paramdict['file']
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use with trusted parameter files.
    with open(fname, 'rb') as f:
        params = pickle.load(f)
    #params = dict(click.get_current_context().params)
    print("Passed params: ", params)
    print(platform.uname())
    #params['nbsteps'] = params['nbshots'] * ((params['prestime'] + params['interpresdelay']) * params['nbclasses']) + params['prestimetest']  # Total number of steps per episode
    # Build "key_value_key_value..." from the sorted params, skipping a few
    # bookkeeping keys; [:-1] drops the trailing underscore.
    suffix = "btchFixmod_" + "".join([
        str(x) + "_" if pair[0] != 'nbsteps' and pair[0] != 'rngseed'
        and pair[0] != 'save_every' and pair[0] != 'test_every'
        and pair[0] != 'pe' else ''
        for pair in sorted(zip(params.keys(), params.values()),
                           key=lambda x: x[0]) for x in pair
    ])[:-1] + "_rngseed_" + str(
        params['rngseed']
    )  # Turning the parameters into a nice suffix for filenames
    print("Reconstructed suffix:", suffix)

    # Overrides the loaded value (set after the suffix is built, so it does
    # not affect the filename).
    params['rsp'] = 1
    #params['rngseed'] = 3
    # Initialize random seeds (first two redundant?)
    print("Setting random seeds")
    np.random.seed(params['rngseed'])
    random.seed(params['rngseed'])
    torch.manual_seed(params['rngseed'])
    #print(click.get_current_context().params)

    net = Network(params)
    # YOU MAY NEED TO CHANGE THE DIRECTORY HERE:
    if paramdict['initialize'] == 0:
        net.load_state_dict(torch.load('./tmp/torchmodel_' + suffix + '.dat'))

    print("Shape of all optimized parameters:",
          [x.size() for x in net.parameters()])
    allsizes = [torch.numel(x.data.cpu()) for x in net.parameters()]
    print("Size (numel) of all optimized elements:", allsizes)
    print("Total size (numel) of all optimized elements:", sum(allsizes))

    BATCHSIZE = params['bs']
    LABSIZE = params['msize']
    # 1 = wall / pillar, 0 = free cell.
    lab = np.ones((LABSIZE, LABSIZE))
    CTR = LABSIZE // 2

    # Simple cross maze
    #lab[CTR, 1:LABSIZE-1] = 0
    #lab[1:LABSIZE-1, CTR] = 0

    # Double-T maze
    #lab[CTR, 1:LABSIZE-1] = 0
    #lab[1:LABSIZE-1, 1] = 0
    #lab[1:LABSIZE-1, LABSIZE - 2] = 0

    # Grid maze: clear the interior, then put a pillar on every cell whose
    # row and column are both even.
    lab[1:LABSIZE - 1, 1:LABSIZE - 1].fill(0)
    for row in range(1, LABSIZE - 1):
        for col in range(1, LABSIZE - 1):
            if row % 2 == 0 and col % 2 == 0:
                lab[row, col] = 1
    # Not strictly necessary, but cleaner since we start the agent at the
    # center for each episode; may help localization in some maze sizes
    # (including 13 and 9, but not 11) by introducing a detectable irregularity
    # in the center:
    lab[CTR, CTR] = 0

    all_losses = []
    all_grad_norms = []
    all_losses_objective = []
    all_total_rewards = []
    all_losses_v = []
    lossbetweensaves = 0
    nowtime = time.time()
    meanrewards = np.zeros((LABSIZE, LABSIZE))
    meanrewardstmp = np.zeros((LABSIZE, LABSIZE, params['eplen']))

    pos = 0
    hidden = net.initialZeroState()
    hebb = net.initialZeroHebb()
    pw = net.initialZeroPlasticWeights()

    #celoss = torch.nn.CrossEntropyLoss() # For supervised learning - not used here

    # Hard-coded number of episodes for the animation run.
    params['nbiter'] = 3
    # One single-artist frame per step, consumed by ArtistAnimation below.
    ax_imgs = []

    for numiter in range(params['nbiter']):

        PRINTTRACE = 0
        #if (numiter+1) % (1 + params['pe']) == 0:
        if (numiter + 1) % (params['pe']) == 0:
            PRINTTRACE = 1

        #lab = makemaze.genmaze(size=LABSIZE, nblines=4)
        #count = np.zeros((LABSIZE, LABSIZE))

        # Select the reward location for this episode - not on a wall!
        # And not on the center either! (though not sure how useful that restriction is...)
        # We always start the episode from the center (when hitting reward, we may teleport either to center or to a random location depending on params['rsp'])
        # Per-batch-element agent position (posr/posc) and reward position
        # (rposr/rposc), keyed by batch index.
        posr = {}
        posc = {}
        rposr = {}
        rposc = {}
        for nb in range(BATCHSIZE):
            # Note: it doesn't matter if the reward is on the center (see below). All we need is not to put it on a wall or pillar (lab=1)
            # (0, 0) is a wall, so the loop below always draws at least once.
            myrposr = 0
            myrposc = 0
            while lab[myrposr, myrposc] == 1 or (myrposr == CTR
                                                 and myrposc == CTR):
                myrposr = np.random.randint(1, LABSIZE - 1)
                myrposc = np.random.randint(1, LABSIZE - 1)
            rposr[nb] = myrposr
            rposc[nb] = myrposc
            #print("Reward pos:", rposr, rposc)
            # Agent always starts an episode from the center
            posc[nb] = CTR
            posr[nb] = CTR

        #optimizer.zero_grad()
        loss = 0
        lossv = 0
        hidden = net.initialZeroState()
        hebb = net.initialZeroHebb()
        et = net.initialZeroHebb(
        )  # Eligibility Trace is identical to Hebbian Trace in shape
        pw = net.initialZeroPlasticWeights()
        numactionchosen = 0

        reward = np.zeros(BATCHSIZE)
        sumreward = np.zeros(BATCHSIZE)
        rewards = []
        vs = []
        logprobs = []
        dist = 0
        numactionschosen = np.zeros(BATCHSIZE, dtype='int32')

        #reloctime = np.random.randint(params['eplen'] // 4, (3 * params['eplen']) // 4)

        #print("EPISODE ", numiter)
        for numstep in range(params['eplen']):

            inputs = np.zeros((BATCHSIZE, TOTALNBINPUTS), dtype='float32')

            labg = lab.copy()
            #labg[rposr, rposc] = -1  # The agent can see the reward if it falls within its RF
            for nb in range(BATCHSIZE):
                # Flattened RFSIZE x RFSIZE window of the maze centred on
                # the agent.
                inputs[nb, 0:RFSIZE * RFSIZE] = labg[
                    posr[nb] - RFSIZE // 2:posr[nb] + RFSIZE // 2 + 1,
                    posc[nb] - RFSIZE // 2:posc[nb] + RFSIZE // 2 +
                    1].flatten() * 1.0

                # Previous chosen action
                inputs[nb, RFSIZE * RFSIZE + 1] = 1.0  # Bias neuron
                inputs[nb, RFSIZE * RFSIZE + 2] = numstep / params['eplen']
                #inputs[0, RFSIZE * RFSIZE +3] = 1.0 * reward # Reward from previous time step
                inputs[nb, RFSIZE * RFSIZE + 3] = 1.0 * reward[nb]
                # One-hot encoding of the action taken at the previous step.
                inputs[nb, RFSIZE * RFSIZE + ADDINPUT +
                       numactionschosen[nb]] = 1
                #inputs = 100.0 * inputs  # input boosting : Very bad with clamp=0

            # Requires a CUDA device.
            inputsC = torch.from_numpy(inputs).cuda()
            # Might be better:
            #if rposr == posr and rposc = posc:
            #    inputs[0][-4] = 100.0
            #else:
            #    inputs[0][-4] = 0

            ## Running the network
            y, v, hidden, hebb, et, pw = net(
                Variable(inputsC, requires_grad=False), hidden, hebb, et,
                pw)  # y should output raw scores, not probas

            # For now:
            #numactionchosen = np.argmax(y.data[0])
            # But wait, this is bad, because the network needs to see the
            # reward signal to guide its own (within-episode) learning... and
            # argmax might not provide enough exploration for this!

            #ee = np.exp(y.data[0].cpu().numpy())
            #numactionchosen = np.random.choice(NBNONRESTACTIONS, p = ee / (1e-10 + np.sum(ee)))

            y = F.softmax(y, dim=1)
            # Must convert y to probas to use this !
            distrib = torch.distributions.Categorical(y)
            actionschosen = distrib.sample()
            logprobs.append(distrib.log_prob(actionschosen))
            numactionschosen = actionschosen.data.cpu().numpy(
            )  # Turn to scalar
            reward = np.zeros(BATCHSIZE, dtype='float32')
            #if numiter == 7 and numstep == 1:
            #    pdb.set_trace()

            for nb in range(BATCHSIZE):
                myreward = 0
                numactionchosen = numactionschosen[nb]

                tgtposc = posc[nb]
                tgtposr = posr[nb]
                if numactionchosen == 0:  # Up
                    tgtposr -= 1
                elif numactionchosen == 1:  # Down
                    tgtposr += 1
                elif numactionchosen == 2:  # Left
                    tgtposc -= 1
                elif numactionchosen == 3:  # Right
                    tgtposc += 1
                else:
                    raise ValueError("Wrong Action")

                reward[nb] = 0.0  # The reward for this step
                if lab[tgtposr][tgtposc] == 1:
                    # Bumped into a wall: penalty, position unchanged.
                    reward[nb] -= params['wp']
                else:
                    #dist += 1
                    posc[nb] = tgtposc
                    posr[nb] = tgtposr

                # Did we hit the reward location ? Increase reward and teleport!
                # Note that it doesn't matter if we teleport onto the reward, since reward hitting is only evaluated after the (obligatory) move
                if rposr[nb] == posr[nb] and rposc[nb] == posc[nb]:
                    reward[nb] += params['rew']
                    posr[nb] = np.random.randint(1, LABSIZE - 1)
                    posc[nb] = np.random.randint(1, LABSIZE - 1)
                    # Redraw until the agent is on a free cell other than
                    # the reward cell.
                    while lab[posr[nb], posc[nb]] == 1 or (
                            rposr[nb] == posr[nb]
                            and rposc[nb] == posc[nb]):
                        posr[nb] = np.random.randint(1, LABSIZE - 1)
                        posc[nb] = np.random.randint(1, LABSIZE - 1)

            rewards.append(reward)
            vs.append(v)
            sumreward += reward

            loss += (
                params['bent'] * y.pow(2).sum() / BATCHSIZE
            )  # We want to penalize concentration, i.e. encourage diversity; our version of PyTorch does not have an entropy() function for Distribution. Note: .2 may be too strong, .04 may be too weak.
            #lossentmean = .99 * lossentmean + .01 * ( params['bent'] * y.pow(2).sum() / BATCHSIZE ).data[0]

            if PRINTTRACE:
                #print("Step ", numstep, "- GI: ", goodinputs, ", GA: ", goodaction, " Inputs: ", inputsN, " - Outputs: ", y.data.cpu().numpy(), " - action chosen: ", numactionchosen,
                #        " - inputsthisstep:", inputsthisstep, " - mean abs pw: ", np.mean(np.abs(pw.data.cpu().numpy())), " -Rew: ", reward)
                print("Step ", numstep, " Inputs (to 1st in batch): ",
                      inputs[0, :TOTALNBINPUTS], " - Outputs(1st in batch): ",
                      y[0].data.cpu().numpy(),
                      " - action chosen(1st in batch): ", numactionschosen[0],
                      " - mean abs pw: ",
                      np.mean(np.abs(pw.data.cpu().numpy())),
                      " -Reward (this step, 1st in batch): ", reward[0])

            # Display the labyrinth
            #for numr in range(LABSIZE):
            #    s = ""
            #    for numc in range(LABSIZE):
            #        if posr == numr and posc == numc:
            #            s += "o"
            #        elif rposr == numr and rposc == numc:
            #            s += "X"
            #        elif lab[numr, numc] == 1:
            #            s += "#"
            #        else:
            #            s += " "
            #    print(s)
            #print("")
            #print("")

            # Animation frame for the first batch element: 2 marks the
            # reward cell, 3 the agent (the agent overwrites the reward
            # marker if they coincide).
            labg = lab.copy()
            labg[rposr[0], rposc[0]] = 2
            labg[posr[0], posc[0]] = 3
            fullimg = plt.imshow(labg, animated=True)
            ax_imgs.append([fullimg])

        # Episode is done, now let's do the actual computations
        R = Variable(torch.zeros(BATCHSIZE).cuda(), requires_grad=False)
        gammaR = params['gr']
        # Walk the episode backwards, accumulating the discounted return R.
        for numstepb in reversed(range(params['eplen'])):
            R = gammaR * R + Variable(torch.from_numpy(
                rewards[numstepb]).cuda(),
                                      requires_grad=False)
            # NOTE(review): indexes [0] of the stored value output; the
            # shape of v is not visible here - confirm this selects the
            # intended per-batch values.
            ctrR = R - vs[numstepb][0]
            lossv += ctrR.pow(2).sum() / BATCHSIZE
            loss -= (logprobs[numstepb] * ctrR.detach()
                     ).sum() / BATCHSIZE  # Need to check if detach() is OK
            #pdb.set_trace()

        #elif params['algo'] == 'REI':
        #    R = sumreward
        #    baseline = meanrewards[rposr, rposc]
        #    for numstepb in reversed(range(params['eplen'])) :
        #        loss -= logprobs[numstepb] * (R - baseline)
        #elif params['algo'] == 'REINOB':
        #    R = sumreward
        #    for numstepb in reversed(range(params['eplen'])) :
        #        loss -= logprobs[numstepb] * R
        #elif params['algo'] == 'REITMP':
        #    R = 0
        #    for numstepb in reversed(range(params['eplen'])) :
        #        R = gammaR * R + rewards[numstepb]
        #        loss -= logprobs[numstepb] * R
        #elif params['algo'] == 'REITMPB':
        #    R = 0
        #    for numstepb in reversed(range(params['eplen'])) :
        #        R = gammaR * R + rewards[numstepb]
        #        loss -= logprobs[numstepb] * (R - meanrewardstmp[rposr, rposc, numstepb])
        #else:
        #    raise ValueError("Which algo?")

        #meanrewards[rposr, rposc] = (1.0 - params['nu']) * meanrewards[rposr, rposc] + params['nu'] * sumreward
        #R = 0
        #for numstepb in reversed(range(params['eplen'])) :
        #    R = gammaR * R + rewards[numstepb]
        #    meanrewardstmp[rposr, rposc, numstepb] = (1.0 - params['nu']) * meanrewardstmp[rposr, rposc, numstepb] + params['nu'] * R

        loss += params['blossv'] * lossv
        loss /= params['eplen']

        if True:  #PRINTTRACE:
            if True:  #params['algo'] == 'A3C':
                print("lossv: ", float(lossv))
            print("Total reward for this episode:", sumreward, "Dist:", dist)

        # No parameter update is performed in this variant.
        #if numiter > 100:  # Burn-in period for meanrewards
        #    loss.backward()
        #    optimizer.step()

        #torch.cuda.empty_cache()

    print("Saving animation....")
    # `fig` is not defined in this function; it must exist at module level.
    anim = animation.ArtistAnimation(fig, ax_imgs, interval=200)
    anim.save('anim.gif', writer='imagemagick', fps=10)
def accuracy(yhat, y):
    """Compare predictions with targets element-wise.

    Returns:
        A 3-tuple ``(fraction_correct, num_correct, total)`` where
        ``num_correct`` is a float tensor counting positions at which
        ``yhat`` equals ``y`` and ``total`` is the element count of ``y``.
    """
    total = y.numel()
    hits = (yhat == y).sum().float()
    return hits / total, hits, total
def get_layer_param(model):
    """Return the total number of scalar elements over ``model.parameters()``."""
    return sum(weight.numel() for weight in model.parameters())
def forward(self, hidden_states, position_ids, attention_mask, memory_states=None, encoder_states=None,
            return_memory=False, detach_memory=True):
    """Run the layer stack and return the final-normed output plus memories.

    Args:
        hidden_states: input activations; the first two dimensions are read
            as ``(batch_size, query_length)``.
        position_ids: fed to ``self.position_embeddings`` when relative
            encoding is off. With ``self.block_position_encoding`` it is
            split along dim 1 into position ids and block position ids.
        attention_mask: either a separation position (a tensor with 1
            element, or with ``batch_size`` elements) from which a mask
            matrix is built, or any other tensor, which is indexed with four
            dimensions and cropped on the last one to the key length.
        memory_states: optional per-layer cached states; dim 1 of the first
            entry gives the memory length.
        encoder_states: passed to every layer when
            ``self.use_decoder_layer`` is set.
        return_memory: collect and return per-layer states even when
            ``self.max_memory_length`` is 0.
        detach_memory: detach collected states from the autograd graph.

    Returns:
        ``(output, mem_layers)`` where ``output`` is
        ``self.final_layernorm`` applied to the last hidden states and
        ``mem_layers`` is the (possibly empty) list of collected states,
        post-processed by ``self.update_mems`` when memory is in use.
    """
    batch_size, query_length = hidden_states.size()[:2]
    memory_length = memory_states[0].size(1) if memory_states else 0
    key_length = query_length + memory_length
    # attention mask is the beginning position of B region, \in [0, query_len)
    is_scalar = torch.numel(attention_mask) == 1
    is_sep = is_scalar or torch.numel(attention_mask) == batch_size
    if self.performer:
        assert is_scalar, 'attention_mask should be a scalar to indicate the seperation position.'
        assert memory_length == 0, 'Do not support transformer-xl.'
    if is_sep:
        sep = attention_mask.item() if is_scalar else attention_mask

        # conventional transformer
        def build_mask_matrix(seq_length, sep, memory_length=0):
            # Lower-triangular (causal) mask, then fully open the columns
            # before the separation position.
            m = hidden_states.new_ones((1, seq_length, seq_length))
            m = torch.tril(m)
            if is_scalar:
                m[0, :, :sep] = 1
            else:
                # Per-example separation positions: open columns < sep[b]
                # for each batch element b.
                m = m.expand(batch_size, -1, -1)
                ids = torch.arange(seq_length, device=sep.device, dtype=sep.dtype).view(1, -1)
                mask = ids < sep.view(-1, 1)
                m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1)
            if memory_length > 0:
                # Memory positions are prepended on the key axis and are
                # visible to every query.
                m = m.expand(batch_size, -1, -1)
                m = torch.cat((hidden_states.new_ones((batch_size, seq_length, memory_length)), m), dim=2)
            # Add a singleton dim at position 1.
            m = m.unsqueeze(1)
            return m

        # With performer attention the raw separation position is passed
        # through to the layers unchanged.
        if not self.performer:
            attention_mask = build_mask_matrix(query_length, sep, memory_length=memory_length)
    else:
        # A full mask was supplied: keep only the last key_length columns.
        attention_mask = attention_mask[:, :, :, -query_length - memory_length:]

    if self.relative_encoding:
        # Descending positions key_length-1 ... 0.
        position_sequence = torch.arange(key_length - 1, -1, -1.0, device=hidden_states.device,
                                         dtype=hidden_states.dtype)
        position_embeddings = self.position_embeddings(position_sequence)
        # Apply dropout
        position_embeddings = self.embedding_dropout(position_embeddings)
    else:
        if self.block_position_encoding:
            position_ids, block_position_ids = position_ids[:, 0], position_ids[:, 1]
        position_embeddings = self.position_embeddings(position_ids)
        hidden_states = hidden_states + position_embeddings
        if self.block_position_encoding:
            block_position_embeddings = self.block_position_embeddings(block_position_ids)
            hidden_states = hidden_states + block_position_embeddings
    hidden_states = self.embedding_dropout(hidden_states)

    def check_detach(_hidden_states):
        # Detach states stored as memory unless the caller asked otherwise.
        if detach_memory:
            return _hidden_states.detach()
        return _hidden_states

    # The embedding output is the first stored memory entry.
    if self.max_memory_length > 0 or return_memory:
        mem_layers = [check_detach(hidden_states)]
    else:
        mem_layers = []

    def custom(start, end):
        # Build a function running layers [start, end) for use with
        # activation checkpointing.
        def custom_forward(*inputs):
            layers_ = self.layers[start:end]
            x_, inputs = inputs[0], inputs[1:]
            # Split the remaining positional inputs into the shared layer
            # arguments and the per-layer memories.
            # NOTE(review): the split assumes 4 (relative encoding) or 1
            # shared arguments after the hidden states; with
            # self.use_decoder_layer the caller also passes encoder_states,
            # and with relative encoding it appends only 3 extras after the
            # mask - confirm the counts match.
            if self.relative_encoding:
                inputs, mems_ = inputs[:4], inputs[4:]
            else:
                inputs, mems_ = inputs[:1], inputs[1:]
            for i, layer in enumerate(layers_):
                mem_i_ = mems_[i] if mems_ else None
                x_ = layer(x_, *inputs, mem=mem_i_)
                # Side effect: appends to the enclosing mem_layers list.
                if self.max_memory_length > 0 or return_memory:
                    mem_layers.append(check_detach(x_))
            return x_

        return custom_forward

    if self.checkpoint_activations:
        # Process the stack in chunks of checkpoint_num_layers layers.
        l = 0
        num_layers = len(self.layers)
        chunk_length = self.checkpoint_num_layers
        while l < num_layers:
            args = [hidden_states, attention_mask] if not self.use_decoder_layer else [hidden_states,
                                                                                       encoder_states,
                                                                                       attention_mask]
            if self.relative_encoding:
                args += [position_embeddings, self.r_w_bias, self.r_r_bias]
            if memory_states:
                args += memory_states[l: l + chunk_length]
            hidden_states = checkpoint(custom(l, l + chunk_length), *args)
            l += chunk_length
    else:
        for i, layer in enumerate(self.layers):
            args = [hidden_states, attention_mask] if not self.use_decoder_layer else [hidden_states,
                                                                                       encoder_states,
                                                                                       attention_mask]
            if self.relative_encoding:
                args += [position_embeddings, self.r_w_bias, self.r_r_bias]
            mem_i = memory_states[i] if memory_states else None
            hidden_states = layer(*args, mem=mem_i)
            if self.max_memory_length > 0 or return_memory:
                mem_layers.append(check_detach(hidden_states))

    # Final layer norm.
    output = self.final_layernorm(hidden_states)
    if self.max_memory_length > 0 or return_memory:
        mem_layers = self.update_mems(mem_layers, memory_states, return_memory=return_memory)

    return (output, mem_layers)
def train(paramdict):
    """Play one batched maze episode and dump the per-step Hebbian traces.

    Loads a pre-trained ``Network`` from ``200k_5by5_hidden20_move.dat``,
    runs a single episode of ``params['eplen']`` steps on a grid maze in
    which the reward location of batch element ``nb`` is re-assigned at step
    ``nb + movebuffer``, records the Hebbian trace of every hidden node at
    every step, and writes all records to
    ``move_all_hebb_with_search.csv``.

    The batch size actually used is ``params['eplen'] - 2 * movebuffer``
    (one batch element per possible move time), not ``params['bs']``.
    Losses are computed but no backward pass or optimizer step is taken
    (``loss.backward()`` is commented out).

    Args:
        paramdict: mapping of run parameters. Keys read here: ``rngseed``,
            ``hs``, ``lr``, ``l2``, ``msize``, ``bs``, ``eplen``, ``wp``,
            ``rew``, ``bent``, ``gr``, ``blossv``, ``gc``.

    Relies on module-level names not defined in this function:
    ``Network``, ``TOTALNBINPUTS``, ``RFSIZE``, ``ADDITIONALINPUTS``.

    NOTE(review): this function was reconstructed from a whitespace-flattened
    source; the nesting of the ``search``/``moved`` bookkeeping inside the
    per-agent loop should be confirmed.
    """
    # params = dict(click.get_current_context().params)
    # One row per (batch element, hidden node, step):
    # [nb, node, numstep, search flag, *hebbian trace of that node].
    hebbian_trace = []
    # TOTALNBINPUTS = RFSIZE * RFSIZE + ADDITIONALINPUTS + NBNONRESTACTIONS
    print("Starting training...")
    params = {}
    # params.update(defaultParams)
    params.update(paramdict)
    print("Passed params: ", params)
    print(platform.uname())
    # params['nbsteps'] = params['nbshots'] * ((params['prestime'] + params['interpresdelay']) * params['nbclasses']) + params['prestimetest']  # Total number of steps per episode
    # Keys left out of the filename suffix. Compared by equality: the
    # previous `is not 'literal'` test compared object identity, which is
    # not guaranteed for equal strings.
    excluded_keys = ('nbsteps', 'rngseed', 'save_every', 'test_every', 'pe')
    suffix = "btchFixmod_" + "".join([
        str(x) + "_" if pair[0] not in excluded_keys else ''
        for pair in sorted(zip(params.keys(), params.values()),
                           key=lambda x: x[0]) for x in pair
    ])[:-1] + "_rngseed_" + str(
        params['rngseed']
    )  # Turning the parameters into a nice suffix for filenames

    # Initialize random seeds (first two redundant?)
    print("Setting random seeds")
    np.random.seed(params['rngseed'])
    random.seed(params['rngseed'])
    torch.manual_seed(params['rngseed'])

    print("Initializing network")
    # CPU is forced: the trace export below calls .numpy() on hebb directly.
    use_cuda = False  #torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    net = Network(TOTALNBINPUTS,
                  params['hs']).to(device)  # Creating the network
    #net.load_state_dict(torch.load('200k_trained_normal.dat'))
    #net.load_state_dict(torch.load('200k_trained_move_reward.dat'))
    net.load_state_dict(torch.load('200k_5by5_hidden20_move.dat'))
    #net.load_state_dict(torch.load('50k_trained_uncertainty.dat'))

    print("Shape of all optimized parameters:",
          [x.size() for x in net.parameters()])
    allsizes = [torch.numel(x.data.cpu()) for x in net.parameters()]
    print("Size (numel) of all optimized elements:", allsizes)
    print("Total size (numel) of all optimized elements:", sum(allsizes))

    # total_loss = 0.0
    print("Initializing optimizer")
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=1.0 * params['lr'],
                                 eps=1e-4,
                                 weight_decay=params['l2'])
    # optimizer = torch.optim.SGD(net.parameters(), lr=1.0*params['lr'])
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=params['gamma'], step_size=params['steplr'])

    LABSIZE = params['msize']
    # 1 = wall / pillar, 0 = free cell.
    lab = np.ones((LABSIZE, LABSIZE))
    CTR = LABSIZE // 2
    num_zeros = 0
    rwloc = []

    # Grid maze: clear the interior, put a pillar on every cell whose row
    # and column are both even, and remember every other interior cell.
    lab[1:LABSIZE - 1, 1:LABSIZE - 1].fill(0)
    for row in range(1, LABSIZE - 1):
        for col in range(1, LABSIZE - 1):
            if row % 2 == 0 and col % 2 == 0:
                lab[row, col] = 1
            else:
                rwloc.append([row, col])
                num_zeros = num_zeros + 1

    BATCHSIZE = params['bs']
    NUMMOVES = 1
    # Number of steps at each end of the episode in which no reward move is
    # scheduled.
    movebuffer = 5
    move = True
    #BATCHSIZE = num_zeros

    # Not strictly necessary, but cleaner since we start the agent at the
    # center for each episode; may help localization in some maze sizes
    # (including 13 and 9, but not 11) by introducing a detectable irregularity
    # in the center:
    lab[CTR, CTR] = 0

    all_losses = []
    all_grad_norms = []
    all_losses_objective = []
    all_total_rewards = []
    all_losses_v = []
    lossbetweensaves = 0
    nowtime = time.time()
    meanrewards = np.zeros((LABSIZE, LABSIZE))
    meanrewardstmp = np.zeros((LABSIZE, LABSIZE, params['eplen']))

    pos = 0
    hidden = net.initialZeroState(BATCHSIZE)
    hebb = net.initialZeroHebb(BATCHSIZE)
    # pw = net.initialZeroPlasticWeights()  # For eligibility traces

    # celoss = torch.nn.CrossEntropyLoss() # For supervised learning - not used here
    print(rwloc[1])
    print("Starting episodes!")

    PRINTTRACE = 0

    # lab = makemaze.genmaze(size=LABSIZE, nblines=4)
    # count = np.zeros((LABSIZE, LABSIZE))

    # Select the reward location for this episode - not on a wall!
    # And not on the center either! (though not sure how useful that restriction is...)
    # We always start the episode from the center
    posr = {}
    posc = {}
    reward_move_time = {}
    # buffer time between front and end for random movement of reward
    # for example: 50 for 200 episodes would move the reward between 50 and 150.
    rposr = {}
    rposc = {}
    rposr_old = {}
    rposc_old = {}
    # Search checks if it's in a searching mode or if it's found the reward
    search = {}
    moved = {}
    # One batch element per candidate move time; overrides params['bs'].
    BATCHSIZE = params['eplen'] - 2 * movebuffer
    for nb in range(BATCHSIZE):
        search[nb] = True
        moved[nb] = False
        # Note: it doesn't matter if the reward is on the center (see below). All we need is not to put it on a wall or pillar (lab=1)
        # (0, 0) is a wall, so the loop below always draws at least once.
        myrposr = 0
        myrposc = 0
        while lab[myrposr, myrposc] == 1 or (myrposr == CTR
                                             and myrposc == CTR):
            myrposr = np.random.randint(1, LABSIZE - 1)
            myrposc = np.random.randint(1, LABSIZE - 1)
        rposr[nb] = myrposr
        rposc[nb] = myrposc
        rposr_old[nb] = myrposr
        rposc_old[nb] = myrposc
        # print("Reward pos:", rposr, rposc)

        # Agent always starts an episode from the center
        posc[nb] = CTR
        posr[nb] = CTR
        #reward_move_time[nb] = random.randint(movebuffer,params['eplen']-movebuffer)
        reward_move_time[
            nb] = nb + movebuffer  #random.sample(range(movebuffer, params['eplen'] - movebuffer), NUMMOVES)

    print(reward_move_time)
    optimizer.zero_grad()
    loss = 0
    lossv = 0
    hidden = net.initialZeroState(BATCHSIZE).to(device)
    hebb = net.initialZeroHebb(BATCHSIZE).to(device)
    numactionchosen = 0

    reward = np.zeros(BATCHSIZE)
    # Reward collected up to and including the move step / after it.
    sumreward = np.zeros(BATCHSIZE)
    sumreward_after = np.zeros(BATCHSIZE)
    rewards = []
    vs = []
    logprobs = []
    dist = 0
    numactionschosen = np.zeros(BATCHSIZE, dtype='int32')

    # reloctime = np.random.randint(params['eplen'] // 4, (3 * params['eplen']) // 4)

    # print("EPISODE ", numiter)
    for numstep in range(params['eplen']):
        inputs = np.zeros((BATCHSIZE, TOTALNBINPUTS), dtype='float32')

        labg = lab.copy()
        for nb in range(BATCHSIZE):
            if numstep == reward_move_time[nb] and move:
                #myrposr = 3; #myrposc = 3
                # NOTE(review): myrposr/myrposc still hold the last location
                # drawn earlier, which is already a valid free cell, so this
                # loop body may never run and the "moved" reward is simply
                # set to that previous location - confirm this is intended.
                while lab[myrposr, myrposc] == 1 or (myrposr == CTR
                                                     and myrposc == CTR):
                    myrposr = np.random.randint(1, LABSIZE - 1)
                    myrposc = np.random.randint(1, LABSIZE - 1)
                rposr[nb] = myrposr
                rposc[nb] = myrposc
                moved[nb] = True
                # print("Reward pos:", rposr, rposc)

            # Flattened RFSIZE x RFSIZE window of the maze centred on the
            # agent.
            inputs[nb, 0:RFSIZE * RFSIZE] = labg[
                posr[nb] - RFSIZE // 2:posr[nb] + RFSIZE // 2 + 1,
                posc[nb] - RFSIZE // 2:posc[nb] + RFSIZE // 2 +
                1].flatten() * 1.0

            # Previous chosen action
            inputs[nb, RFSIZE * RFSIZE + 1] = 1.0  # Bias neuron
            inputs[nb, RFSIZE * RFSIZE + 2] = numstep / params['eplen']
            inputs[nb, RFSIZE * RFSIZE + 3] = 1.0 * reward[nb]
            # One-hot encoding of the action taken at the previous step.
            inputs[nb, RFSIZE * RFSIZE + ADDITIONALINPUTS +
                   numactionschosen[nb]] = 1

        inputsC = torch.from_numpy(inputs).to(device)

        ## Running the network
        y, v, (hidden, hebb) = net(
            inputsC,
            (hidden, hebb))  # y should output raw scores, not probas
        print(hebb.shape)
        # Record the trace of every hidden node for every batch element.
        for nb in range(BATCHSIZE):
            for node in range(params['hs']):
                hebbian_trace.append(
                    np.concatenate(
                        (np.array([nb, node, numstep, search[nb]]),
                         hebb[nb][node].detach().numpy())))

        y = torch.softmax(y, dim=1)
        distrib = torch.distributions.Categorical(y)
        actionschosen = distrib.sample()
        logprobs.append(distrib.log_prob(actionschosen))
        numactionschosen = actionschosen.data.cpu().numpy(
        )  # We want to break gradients
        reward = np.zeros(BATCHSIZE, dtype='float32')

        for nb in range(BATCHSIZE):
            myreward = 0
            numactionchosen = numactionschosen[nb]

            tgtposc = posc[nb]
            tgtposr = posr[nb]
            if numactionchosen == 0:  # Up
                tgtposr -= 1
            elif numactionchosen == 1:  # Down
                tgtposr += 1
            elif numactionchosen == 2:  # Left
                tgtposc -= 1
            elif numactionchosen == 3:  # Right
                tgtposc += 1
            else:
                raise ValueError("Wrong Action")

            reward[nb] = 0.0  # The reward for this step
            if lab[tgtposr][tgtposc] == 1:
                # Bumped into a wall: penalty, position unchanged.
                reward[nb] -= params['wp']
            else:
                posc[nb] = tgtposc
                posr[nb] = tgtposr

            # if it hits the old reward location then it reenters search mode
            if moved[nb] and posr[nb] == rposr_old[nb] and posc[
                    nb] == rposc_old[nb]:
                search[nb] = True

            # Did we hit the reward location ? Increase reward and teleport!
            # Note that it doesn't matter if we teleport onto the reward, since reward hitting is only evaluated after the (obligatory) move...
            # But we still avoid it.
            if rposr[nb] == posr[nb] and rposc[nb] == posc[nb]:
                if search[nb]:
                    search[nb] = False
                    moved[nb] = False
                reward[nb] += params['rew']
                posr[nb] = np.random.randint(1, LABSIZE - 1)
                posc[nb] = np.random.randint(1, LABSIZE - 1)
                # Redraw until the agent is on a free cell other than the
                # reward cell.
                while lab[posr[nb], posc[nb]] == 1 or (
                        rposr[nb] == posr[nb] and rposc[nb] == posc[nb]):
                    posr[nb] = np.random.randint(1, LABSIZE - 1)
                    posc[nb] = np.random.randint(1, LABSIZE - 1)

            if reward_move_time[nb] < numstep:
                sumreward_after[nb] += reward[nb]
            else:
                sumreward[nb] += reward[nb]

        rewards.append(reward)
        vs.append(v)
        #sumreward += reward

        # This is an "entropy penalty", implemented by the sum-of-squares of the probabilities because our version of PyTorch did not have an entropy() function.
        # The result is the same: to penalize concentration, i.e. encourage diversity in chosen actions.
        loss += (params['bent'] * y.pow(2).sum() / BATCHSIZE)

        # if PRINTTRACE:
        #     print("Step ", numstep, " Inputs (to 1st in batch): ", inputs[0, :TOTALNBINPUTS], " - Outputs(1st in batch): ", y[0].data.cpu().numpy(), " - action chosen(1st in batch): ", numactionschosen[0],
        #           #" - mean abs pw: ", np.mean(np.abs(pw.data.cpu().numpy())),
        #           " -Reward (this step, 1st in batch): ", reward[0])

    # Episode is done, now let's do the actual computations of rewards and losses for the A2C algorithm
    R = torch.zeros(BATCHSIZE).to(device)
    gammaR = params['gr']
    # Walk the episode backwards, accumulating the discounted return R.
    for numstepb in reversed(range(params['eplen'])):
        R = gammaR * R + torch.from_numpy(rewards[numstepb]).to(device)
        # NOTE(review): indexes [0] of the stored value output; the shape of
        # v is not visible here - confirm this selects the intended values.
        ctrR = R - vs[numstepb][0]
        lossv += ctrR.pow(2).sum() / BATCHSIZE
        loss -= (logprobs[numstepb] * ctrR.detach()).sum() / BATCHSIZE
        # pdb.set_trace()

    loss += params['blossv'] * lossv
    loss /= params['eplen']

    if PRINTTRACE:
        if True:  # params['algo'] == 'A3C':
            print("lossv: ", float(lossv))
        print("Total reward for this episode (all):", sumreward, "Dist:",
              dist)

    #loss.backward()
    # clip_grad_norm_ is the in-place, non-deprecated spelling of
    # clip_grad_norm; it returns the total gradient norm.
    all_grad_norms.append(
        torch.nn.utils.clip_grad_norm_(net.parameters(), params['gc']))

    lossnum = float(loss)
    lossbetweensaves += lossnum
    all_losses_objective.append(lossnum)
    all_total_rewards.append(sumreward.mean())
    # all_losses_v.append(lossv.data[0])
    # total_loss += lossnum

    lossbetweensaves = 0
    print("Rewards before move:", sumreward)
    print("Move times: ", reward_move_time)
    print("Rewards after move:", sumreward_after)
    #print("Reward Locs", rwloc)
    print("Mean reward (across batch): ",
          sumreward.mean() + sumreward_after.mean())
    previoustime = nowtime
    nowtime = time.time()
    print("Time spent: ", nowtime - previoustime)
    # print("ETA: ", net.eta.data.cpu().numpy(), " etaet: ", net.etaet.data.cpu().numpy())

    # if (numiter + 1) % params['save_every'] == 0:
    #     print("Saving files...")
    #     losslast100 = np.mean(all_losses_objective[-100:])
    #     print("Average loss over the last 100 episodes:", losslast100)
    #     print("Saving local files...")
    #     with open('grad_' + suffix + '.txt', 'w') as thefile:
    #         for item in all_grad_norms[::10]:
    #             thefile.write("%s\n" % item)
    #     with open('loss_' + suffix + '.txt', 'w') as thefile:
    #         for item in all_total_rewards[::10]:
    #             thefile.write("%s\n" % item)
    #     torch.save(net.state_dict(), 'torchmodel_' + suffix + '.dat')
    #     with open('params_' + suffix + '.dat', 'wb') as fo:
    #         pickle.dump(params, fo)
    #     if os.path.isdir('/mnt/share/tmiconi'):
    #         print("Transferring to NFS storage...")
    #         for fn in ['params_' + suffix + '.dat', 'loss_' + suffix + '.txt', 'torchmodel_' + suffix + '.dat']:
    #             result = os.system(
    #                 'cp {} {}'.format(fn, '/mnt/share/tmiconi/modulmaze/' + fn))
    print("Done!")
    np.savetxt("move_all_hebb_with_search.csv", hebbian_trace, delimiter=",")
# Per-batch episode setup: move the batch to the GPU, split it into "shot"
# and "query" samples, embed the shot samples with model_cnn, and, when some
# class labels exceed 79, generate additional prototypes with model_gen.
# NOTE(review): this block was reconstructed from a whitespace-collapsed
# source; the extent of the `if` body below is an assumption -- confirm.
data, lab = [_.cuda() for _ in batch]
# The first shot * train_way samples are the shot set; the rest are queries.
p = args.shot * args.train_way
data_shot = data[:p]
data_query = data[p:]
# Shot samples keep indices 0..2 of dim 1, query samples keep indices 3.. of
# dim 1.
# NOTE(review): presumably two images stacked along the channel axis -- confirm.
data_shot = data_shot[:, :3, :]
data_query = data_query[:, 3:, :]
# Labels of the first row of the (shot, train_way) layout: one per class slot.
train_gt = lab[:p].reshape(args.shot, args.train_way)[0, :]
#data_query = data_query[:,:3,:]
proto = model_cnn(data_shot)
proto = proto.reshape(args.shot, args.train_way, -1)
# which_novel is first a boolean mask of labels > 79, then is rebound to the
# number of labels <= 79, which is used as the split index along dim 1 below.
# NOTE(review): 79 looks like the highest base-class id, and the slicing
# assumes labels <= 79 occupy the leading class slots -- confirm.
which_novel = torch.gt(train_gt, 79)
which_novel = args.train_way - torch.numel(train_gt[which_novel])
if which_novel < args.train_way:
    # At least one label is > 79: split the prototypes at that index.
    proto_base = proto[:, :which_novel, :]
    proto_novel = proto[:, which_novel:, :]
    # One standard-normal noise row of length noise_dim per (shot, class slot)
    # pair in proto_novel, allocated directly on the CUDA device.
    noise = torch.cuda.FloatTensor(
        (args.train_way - which_novel) * args.shot, noise_dim).normal_()
    # The generator receives the flattened prototypes together with the noise;
    # its output is reshaped back to (shot, number of slots, -1).
    proto_novel_gen = model_gen(
        proto_novel.reshape(
            args.shot * (args.train_way - which_novel), -1), noise)
    proto_novel_gen = proto_novel_gen.reshape(
        args.shot, args.train_way - which_novel, -1)
    # torch.cat defaults to dim 0, so the result has 2 * shot rows: the
    # embedded prototypes followed by the generated ones.
    proto_novel_wgen = torch.cat([proto_novel, proto_novel_gen])
    # Random permutation of 0 .. 2 * shot - 1.
    # NOTE(review): presumably indexes dim 0 of proto_novel_wgen; its use is
    # outside this fragment.
    ind_gen = torch.randperm(2 * args.shot)
    # Random integer in [1, args.shot).
    # NOTE(review): np.random.randint raises ValueError when args.shot <= 1
    # because the range is empty -- confirm shot is always >= 2.
    train_num = np.random.randint(1, args.shot)
def forward(ctx, x, x_star, gamma):
    """Evaluate the regulariser on ``x - x_star``, normalised by the size of ``x``.

    Builds ``Smoothed1Norm(gamma=gamma)`` and ``Grad()``, applies the linear
    operator to ``x - x_star`` and returns the regulariser value divided by
    the number of elements of ``x``. The operator output and ``x`` are saved
    on ``ctx`` so the backward pass can reuse them.

    Args:
        ctx: autograd context; receives ``reg_func``, ``lin_op`` and the
            saved tensors.
        x: tensor being regularised.
        x_star: reference tensor subtracted from ``x``.
        gamma: forwarded to ``Smoothed1Norm`` (presumably its smoothing
            parameter -- confirm against that class).

    Returns:
        ``reg_func(lin_op(x - x_star)) / torch.numel(x)``.
    """
    ctx.reg_func = Smoothed1Norm(gamma=gamma)
    ctx.lin_op = Grad()
    # Apply the operator once; the same result feeds both the saved state for
    # backward and the returned value (it was previously computed twice).
    diff = ctx.lin_op(x - x_star)
    ctx.save_for_backward(diff, x)
    return ctx.reg_func(diff) / torch.numel(x)
def backward(ctx, grad_out):
    """Back-propagate through the normalised regulariser.

    The local gradient with respect to ``x`` is
    ``lin_op.T(reg_func.grad(diff)) / torch.numel(x)``, where ``diff`` and
    ``x`` are the tensors saved by the forward pass. The gradient with
    respect to ``x_star`` is its negation, because the forward pass only
    sees ``x - x_star``.

    Args:
        ctx: autograd context populated by the forward pass
            (``saved_tensors``, ``lin_op``, ``reg_func``).
        grad_out: gradient of the final objective with respect to the
            forward output.

    Returns:
        Tuple ``(grad_x, grad_x_star, None)``; the trailing ``None`` is the
        (non-differentiable) ``gamma`` argument of the forward pass.
    """
    diff, x = ctx.saved_tensors
    local_grad = ctx.lin_op.T(ctx.reg_func.grad(diff)) / torch.numel(x)
    # Chain rule: scale by the upstream gradient. Without this the result is
    # only correct when the forward output is used unscaled as the final loss.
    gradx = grad_out * local_grad
    gradx_star = -gradx
    return gradx, gradx_star, None
def forward(self, x):
    """Run the feature extractor, flatten each sample, then classify.

    The feature tensor is reshaped to one row per sample. The row length is
    the element count of the first sample, so for a non-empty batch the
    result has the same shape as ``view(x.size(0), -1)``.

    Args:
        x: input passed to ``self.features``.

    Returns:
        Output of ``self.classifier`` applied to the flattened features.
    """
    feats = self.features(x)
    # Number of elements in a single sample's feature map.
    per_sample = torch.numel(feats[0])
    flat = feats.view(-1, per_sample)
    return self.classifier(flat)