Example 1
    def forward(self, images, labels, classes, reconstructions):
        # Margin loss: present classes are pushed above 0.9, absent ones below 0.1.
        left = F.relu(0.9 - classes, inplace=True) ** 2
        right = F.relu(classes - 0.1, inplace=True) ** 2

        margin_loss = labels * left + 0.5 * (1. - labels) * right
        margin_loss = margin_loss.sum()

        # Reconstruction loss on the flattened images, scaled so it does not dominate.
        assert torch.numel(images) == torch.numel(reconstructions)
        images = images.view(reconstructions.size(0), -1)
        reconstruction_loss = self.reconstruction_loss(reconstructions, images)

        return (margin_loss + 0.0005 * reconstruction_loss) / images.size(0)
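A minimal, self-contained harness for this loss (a sketch: the `CapsuleLoss` wrapper name, the summed-MSE `reconstruction_loss`, and the MNIST-like shapes below are assumptions, not part of the original):

import torch
import torch.nn as nn
import torch.nn.functional as F

class CapsuleLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.reconstruction_loss = nn.MSELoss(reduction='sum')  # assumed

    def forward(self, images, labels, classes, reconstructions):
        left = F.relu(0.9 - classes) ** 2
        right = F.relu(classes - 0.1) ** 2
        margin_loss = (labels * left + 0.5 * (1. - labels) * right).sum()
        images = images.view(reconstructions.size(0), -1)
        reconstruction_loss = self.reconstruction_loss(reconstructions, images)
        return (margin_loss + 0.0005 * reconstruction_loss) / images.size(0)

# Illustrative call with MNIST-like shapes and 10 capsule classes.
loss_fn = CapsuleLoss()
images = torch.rand(8, 1, 28, 28)
labels = F.one_hot(torch.randint(0, 10, (8,)), num_classes=10).float()
classes = torch.rand(8, 10)            # capsule output lengths in [0, 1]
reconstructions = torch.rand(8, 784)   # flattened reconstructions
print(loss_fn(images, labels, classes, reconstructions))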
Example 2
 def forward(self, x):
     x = self.conv1(x)
     x = self.conv2(x)
     x = self.conv3(x)
     x = x.view(-1, torch.numel(x[0]))  # flatten automatically: features = numel of one sample
     x = self.fc1(x)
     x = self.fc2(x)
     return x
Example 3
    def forward(self, x): 
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.conv3(x)
        x = self.relu(x)
        x = self.pool(x)

        x = self.conv4(x)
        x = self.relu(x)
        #print(x.shape)
        x = x.view(-1, torch.numel(x[0]))
        #print(x.shape) 
        x = self.dropout_1(x)
        x = self.fc1(x)
        x = self.thres(x)
        x = self.dropout_2(x)
        x = self.fc2(x)
        return x
Example 4
def extract(m):
    global sparses
    global nums
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nums.append(torch.numel(m.weight.data))
Example 5
    @classmethod
    def load(cls, model_path, sp_model_path, device, print_stats=True):

        paramspath = os.path.join(model_path, 'params.json')
        with open(paramspath, 'r') as paramsf:
            xl_params = json.loads(paramsf.read())

        print(repr(xl_params))

        model = MemTransformerLM(
            xl_params['ntokens'],  # 50000,
            xl_params['n_layer'],  # 16,
            xl_params['n_head'],  # 10,
            xl_params['d_model'],  # 410,
            xl_params['d_head'],  # 41,
            xl_params['d_inner'],  # 2100,
            0.0,  # no dropout, 
            0.0,  # no dropatt,
            tie_weight=xl_params['tie_weight'],  # True, 
            d_embed=xl_params['d_embed'],  # 410, 
            div_val=xl_params['div_val'],  # 1,
            tie_projs=xl_params['tie_projs'],  # [False, True, True, True] 
            pre_lnorm=xl_params['pre_lnorm'],  # False, 
            tgt_len=xl_params['tgt_len'],  # 150,
            ext_len=xl_params['ext_len'],  # 0, 
            mem_len=xl_params['mem_len'],  # 150, 
            cutoffs=xl_params['cutoffs'],  # [3500, 7500, 37500],
            same_length=xl_params['same_length'],  # False,
            attn_type=xl_params['attn_type'],  # 0,
            clamp_len=xl_params['clamp_len'],  # -1, 
            sample_softmax=xl_params['sample_softmax'])  # -1

        state_dict_path = os.path.join(model_path, 'valid_state_dict.pt')
        print("loading weights %s ..." % state_dict_path)
        tensor_dict = torch.load(state_dict_path,
                                 map_location=torch.device(device))
        model.load_state_dict(tensor_dict)
        print("loading weights %s ... done." % state_dict_path)

        if print_stats:
            tensor_list = list(tensor_dict.items())
            for layer_tensor_name, tensor in tensor_list:
                print("Layer %-42s: %9d elements" %
                      (layer_tensor_name, torch.numel(tensor)))

            pytorch_total_params = sum(p.numel() for p in model.parameters())
            print("Total # params: %d" % pytorch_total_params)

        # with open(os.path.join(MODEL_PATH, 'model.pt'), 'rb') as f:
        #     model = torch.load(f)
        # model.apply(update_dropout)
        # model.apply(update_dropatt)

        para_model = model.to(device)

        # print ("loading model %s ... done." % MODEL_PATH)

        print("loading sp model from %s ..." % sp_model_path)
        sp_model = spm.SentencePieceProcessor()
        sp_model.load(sp_model_path)
        print("loading sp model from %s ... done." % sp_model_path)

        return cls(para_model, sp_model, device)
Example 6
 def trainc(self,
            net,
            args,
            iter_glob,
            user_epoch,
            net_ema=None,
            diff_w_old=None):
     net.train()
     if net_ema is not None:
         net_ema.train()
     optimizer = torch.optim.SGD(net.parameters(),
                                 lr=args.lr,
                                 momentum=args.momentum,
                                 weight_decay=args.weight_decay,
                                 nesterov=False)
     epoch_loss = []
     epoch_loss_ema = []
     w_t = []
     class_criterion = nn.CrossEntropyLoss(reduction='sum',
                                           ignore_index=-1)
     if args.dataset == 'cifar' and args.iid != 'noniid_tradition':
         consistency_criterion = softmax_kl_loss
     else:
         consistency_criterion = softmax_mse_loss
     residual_logit_criterion = symmetric_mse_loss
     for iter in range(self.args.local_ep):
         batch_loss = []
         batch_loss_ema = []
         for batch_idx, (img, label) in enumerate(self.ldr_train):
             img, img_ema, label = img[0].to(self.args.device), img[1].to(
                 self.args.device), label.to(self.args.device)
             adjust_learning_rate(optimizer,
                                  user_epoch * args.local_ep + iter + 1,
                                  batch_idx, len(self.ldr_train), args)
             input_var = torch.autograd.Variable(img)
             ema_input_var = torch.autograd.Variable(img_ema, volatile=True)
             target_var = torch.autograd.Variable(label)
             minibatch_size = len(target_var)
             labeled_minibatch_size = target_var.data.ne(-1).sum()
             if net_ema is not None:
                 ema_model_out = net_ema(ema_input_var)
             else:
                 ema_model_out = net(ema_input_var)
             model_out = net(input_var)
             if isinstance(model_out, Variable):
                 logit1 = model_out
                 ema_logit = ema_model_out
             else:
                 assert len(model_out) == 2
                 assert len(ema_model_out) == 2
                 logit1, logit2 = model_out
                 ema_logit, _ = ema_model_out
             ema_logit = Variable(ema_logit.detach().data,
                                  requires_grad=False)
             class_logit, cons_logit = logit1, logit1
             classification_weight = 1
             class_loss = classification_weight * class_criterion(
                 class_logit, target_var) / minibatch_size
             ema_class_loss = class_criterion(ema_logit,
                                              target_var) / minibatch_size
             consistency_weight = get_current_consistency_weight(
                 user_epoch * args.local_ep + iter + 1)
             consistency_loss = consistency_weight * consistency_criterion(
                 cons_logit, ema_logit) / minibatch_size
             loss = class_loss + consistency_loss
             optimizer.zero_grad()
             loss.backward()
             optimizer.step()
             if net_ema is not None:
                 if iter_glob > args.phi_g:
                     update_ema_variables(
                         net, net_ema, args.ema_decay,
                         user_epoch * args.local_ep + iter + 1)
                 else:
                     update_ema_variables(
                         net, net_ema, 0.0,
                         user_epoch * args.local_ep + iter + 1)
             batch_loss.append(class_loss.item())
             batch_loss_ema.append(consistency_loss.item())
         epoch_loss.append(sum(batch_loss) / len(batch_loss))
         epoch_loss_ema.append(sum(batch_loss_ema) / len(batch_loss_ema))
     if self.args.test == 2:
         return (net.state_dict(), net_ema.state_dict(),
                 sum(epoch_loss) / len(epoch_loss),
                 sum(epoch_loss_ema) / len(epoch_loss_ema),
                 epoch_loss, epoch_loss_ema)
     if diff_w_old is not None:
         w, w_ema = net.state_dict(), net_ema.state_dict()
         w_dic, w_ema_dic, diff_w_ema = {}, {}, {}
         comu_w, comu_w_ema = 0, 0
         w_keys = list(w.keys())
         for i in w_keys:
             diff_w_ema[i] = ((w[i] - w_ema[i]).float().norm(2)**2,
                              w[i].float().norm(2)**2)
         if len(diff_w_ema) == 33:
             diff_w_ema = cifar_add(diff_w_ema)
         else:
             diff_w_ema = mnist_add(diff_w_ema)
         for i in w_keys:
             if (iter_glob < args.phi_g):
                 w_ema_dic[i] = w_ema[i]
                 comu_w_ema += torch.numel(w_ema_dic[i])
             else:
                 if diff_w_ema[i] >= args.threshold * diff_w_old:
                     w_dic[i] = w[i]
                     comu_w += torch.numel(w_dic[i])
                 else:
                     w_ema_dic[i] = w_ema[i]
                     comu_w_ema += torch.numel(w_ema_dic[i])
         return (w_dic, w_ema_dic, w_ema,
                 sum(epoch_loss) / len(epoch_loss),
                 sum(epoch_loss_ema) / len(epoch_loss_ema),
                 diff_w_ema, comu_w, comu_w_ema)
     if net_ema is not None:
         return (net.state_dict(), net_ema.state_dict(),
                 sum(epoch_loss) / len(epoch_loss),
                 sum(epoch_loss_ema) / len(epoch_loss_ema))
     else:
         return (net.state_dict(),
                 sum(epoch_loss) / len(epoch_loss),
                 sum(epoch_loss_ema) / len(epoch_loss_ema))
Example 7
def check_accuracy(args, loader, generator, discriminator, d_loss_fn, limit=False):
    d_losses = []
    metrics = {}
    # Separate lists; ([],) * n would alias the same list object n times.
    g_l2_losses_abs, g_l2_losses_rel = [], []
    disp_error, disp_error_l, disp_error_nl = [], [], []
    f_disp_error, f_disp_error_l, f_disp_error_nl = [], [], []
    total_traj, total_traj_l, total_traj_nl = 0, 0, 0
    loss_mask_sum = 0
    generator.eval()
    with torch.no_grad():
        for batch in loader:
            batch = [tensor.cuda() for tensor in batch]
            # modified by zyl 2020/12/14 10:13 (obs_traj, pred_traj_gt, obs_traj_rel, pred_traj_gt_rel, non_linear_ped, loss_mask, seq_start_end) = batch
            (obs_traj, pred_traj_gt, obs_traj_rel, pred_traj_gt_rel, obs_traj_rel_v, pred_traj_rel_v, obs_traj_g, pred_traj_g,
             non_linear_ped, loss_mask, seq_start_end) = batch
            linear_ped = 1 - non_linear_ped
            loss_mask = loss_mask[:, args.obs_len:]

            pred_traj_fake_rel = generator(obs_traj, obs_traj_rel, seq_start_end, obs_traj_g)
            pred_traj_fake = relative_to_abs(pred_traj_fake_rel, obs_traj[-1])

            g_l2_loss_abs, g_l2_loss_rel = cal_l2_losses(
                pred_traj_gt, pred_traj_gt_rel, pred_traj_fake,
                pred_traj_fake_rel, loss_mask
            )
            ade, ade_l, ade_nl = cal_ade(
                pred_traj_gt, pred_traj_fake, linear_ped, non_linear_ped
            )

            fde, fde_l, fde_nl = cal_fde(
                pred_traj_gt, pred_traj_fake, linear_ped, non_linear_ped
            )

            traj_real = torch.cat([obs_traj, pred_traj_gt], dim=0)
            traj_real_rel = torch.cat([obs_traj_rel, pred_traj_gt_rel], dim=0)
            traj_fake = torch.cat([obs_traj, pred_traj_fake], dim=0)
            traj_fake_rel = torch.cat([obs_traj_rel, pred_traj_fake_rel], dim=0)

            scores_fake = discriminator(traj_fake, traj_fake_rel, seq_start_end)
            scores_real = discriminator(traj_real, traj_real_rel, seq_start_end)

            d_loss = d_loss_fn(scores_real, scores_fake)
            d_losses.append(d_loss.item())

            g_l2_losses_abs.append(g_l2_loss_abs.item())
            g_l2_losses_rel.append(g_l2_loss_rel.item())
            disp_error.append(ade.item())
            disp_error_l.append(ade_l.item())
            disp_error_nl.append(ade_nl.item())
            f_disp_error.append(fde.item())
            f_disp_error_l.append(fde_l.item())
            f_disp_error_nl.append(fde_nl.item())

            loss_mask_sum += torch.numel(loss_mask.data)
            total_traj += pred_traj_gt.size(1)
            total_traj_l += torch.sum(linear_ped).item()
            total_traj_nl += torch.sum(non_linear_ped).item()
            if limit and total_traj >= args.num_samples_check:
                break

    metrics['d_loss'] = sum(d_losses) / len(d_losses)
    metrics['g_l2_loss_abs'] = sum(g_l2_losses_abs) / loss_mask_sum
    metrics['g_l2_loss_rel'] = sum(g_l2_losses_rel) / loss_mask_sum

    metrics['ade'] = sum(disp_error) / (total_traj * args.pred_len)
    metrics['fde'] = sum(f_disp_error) / total_traj
    if total_traj_l != 0:
        metrics['ade_l'] = sum(disp_error_l) / (total_traj_l * args.pred_len)
        metrics['fde_l'] = sum(f_disp_error_l) / total_traj_l
    else:
        metrics['ade_l'] = 0
        metrics['fde_l'] = 0
    if total_traj_nl != 0:
        metrics['ade_nl'] = sum(disp_error_nl) / (
            total_traj_nl * args.pred_len)
        metrics['fde_nl'] = sum(f_disp_error_nl) / total_traj_nl
    else:
        metrics['ade_nl'] = 0
        metrics['fde_nl'] = 0

    generator.train()
    return metrics
Example 8
    def launch(self,
               resume=False,
               cfg_path_source=None,
               test_only=False,
               tag="norm"):
        time_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        self.ckpt_dir = os.path.join(self.exp_dir, "checkpoints")
        self.log_dir = os.path.join(self.exp_dir, "logs")
        self.cfg_path = os.path.join(self.exp_dir, "cfg.yaml")
        self.res_dir = os.path.join(
            self.exp_dir, "results-{tag}-{time}.yaml".format(tag=tag,
                                                             time=time_str))
        exists = os.path.exists(self.exp_dir)
        if not exists or not resume:
            if exists:
                print("Removing previous experiment...")
                id1 = uuid.uuid1()

                temp_dir = tempfile.gettempdir()
                copyfile(cfg_path_source,
                         os.path.join(temp_dir,
                                      str(id1) + "cfg.yaml"))
                rmtree(self.exp_dir)
                os.makedirs(self.ckpt_dir)
                copyfile(os.path.join(temp_dir,
                                      str(id1) + "cfg.yaml"), cfg_path_source)
            else:
                os.makedirs(self.ckpt_dir)

        self.log_text_dir = os.path.join(
            self.exp_dir, "output-{tag}-{time}.log".format(tag=tag,
                                                           time=time_str))
        sys.stdout = Logger(self.log_text_dir)
        if hasattr(self, "__config__"):
            print("Launching experiment with the configuration description:")
            print(yaml.dump(self.__config__))
        self.writer = SummaryWriter(self.log_dir)

        if cfg_path_source is not None:  # back up the config file in the experiment folder
            if cfg_path_source != self.cfg_path:
                copyfile(cfg_path_source, self.cfg_path)

        print("\n[Phase 1] : Data Preparation")
        self.trainer.set_data_loaders(self.trainloader, self.testloader,
                                      self.mini_testloader)

        print("\n[Phase 2] : Model setup")

        print(self.trainer.net)
        print("total # of parameters = {:,} ({:,} trainable)".format(
            sum([torch.numel(p) for p in self.trainer.net.parameters()]),
            sum([
                torch.numel(p) for p in filter(lambda x: x.requires_grad,
                                               self.trainer.net.parameters())
            ]),
        ))

        if torch.cuda.is_available():
            self.trainer.net.cuda()
        # Test model: Temporary hack
        self.trainer.test_run_model()

        if resume:
            print("| Resuming from checkpoint...")
            self.load_state()

        print("Initial Validation...")
        acc_valid = self.trainer.run(self.epoch,
                                     self.num_epochs,
                                     is_training=False)
        # import pdb; pdb.set_trace()
        acc_valid.summarize()
        self.best_metrics = acc_valid
        print("\nSaving the Best Checkpoint...")
        # import pdb; pdb.set_trace()
        self.save_state()
        print("Best Metrics: {acc}\n".format(
            acc=self.best_metrics.summary_str(dtype="scalar", level=0)))

        if not test_only:
            print("\n[Phase 3] : Training model")
            print("| Training Epochs = " + str(self.num_epochs))
            elapsed_time = 0
            while self.epoch <= self.num_epochs:
                print("Running at [{}] ...".format(self.exp_dir))
                start_time = time.time()
                acc_train = self.trainer.run(self.epoch,
                                             self.num_epochs,
                                             is_training=True)
                acc_valid = self.trainer.run(self.epoch,
                                             self.num_epochs,
                                             is_training=False)
                acc_mini_test = self.trainer.run(self.epoch,
                                                 self.num_epochs,
                                                 is_training=False,
                                                 mini_test=True)

                acc_train.summarize()
                acc_valid.summarize()
                self.log_acc(acc_train, acc_valid)

                self.epoch += 1
                if self.best_metrics_comparator(self.best_metrics, acc_valid):
                    print("\nSaving the Best Checkpoint...")
                    self.best_metrics = acc_valid
                    self.save_state()
                    print("Best Metrics: {acc}\n".format(
                        acc=self.best_metrics.summary_str(dtype="scalar",
                                                          level=0)))
                print(acc_valid)

                epoch_time = time.time() - start_time
                elapsed_time += epoch_time
                print("| Elapsed time : %d:%02d:%02d" %
                      (get_hms(elapsed_time)))

        else:
            print("\n[Phase 4] : Final Performance")
            print("* Test results : {acc}".format(acc=self.best_metrics))

            print("Restoring the Best Checkpoint...")
            self.load_state()
            # self.best_metrics.summarize()
            record = self.best_metrics.filter(dtype="scalar",
                                              op=lambda x: float(x))
            offset = 5

            self.trainer.test_run_model()
            l_constant = l2_lipschitz_constant_checker(self.trainer.net)
            # streamline the module during post_steps
            with Streamline(self.trainer.net, True, False):
                print("Current l_constant = {}".format(l_constant))
                for index, post_step in enumerate(self.post_steps):
                    print("\n[Phase {}] : ".format(index + offset), end="")
                    post_step(
                        self.trainer.net,
                        (self.trainloader, self.testloader,
                         self.mini_testloader),
                        l_constant=l_constant,
                        record=record,
                        device=next(self.trainer.net.parameters()).device,
                    )
            print()
            print(yaml.safe_dump(record))
            print("Saving results into a dictionary...")
            print(self.res_dir)
            with open(self.res_dir, "w") as f:
                yaml.safe_dump(record, f)
            print("finished")
Example 9
    def forward(self, pred, real):
        diffs = real - pred
        n = torch.numel(diffs)
        # SIMSE: squared sum of the errors, normalized by n^2.
        simse = torch.sum(diffs).pow(2) / (n ** 2)

        return simse
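For reference, a standalone sketch of the same scale-invariant MSE term with a quick sanity check (the function name `simse` is illustrative):

import torch

def simse(pred, real):
    diffs = real - pred
    n = torch.numel(diffs)
    # Squared sum of errors, normalized by n^2.
    return torch.sum(diffs).pow(2) / (n ** 2)

pred = torch.rand(4, 3)
real = torch.rand(4, 3)
print(simse(pred, real))   # small scalar tensor
print(simse(real, real))   # exactly zero for identical inputs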
Example 10
n_epoch = 50
PATH_model = "./model/modelAfinal.pt"

# Initialization
print(PATH_model)
model = CNN_A()

if torch.cuda.is_available():
    model = model.cuda()

# Get the number of parameters of the model
number_parameter = 0
tensor_list = list(model.state_dict().items())
for layer_tensor_name, tensor in tensor_list:
    print('Layer {}: {} elements'.format(layer_tensor_name,
                                         torch.numel(tensor)))
    number_parameter += torch.numel(tensor)

print('total number of parameters: {}'.format(number_parameter))

# Adam optimizer (used for loss backpropagation)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=5e-4)

# Loss: cross-entropy
crossentropy = nn.CrossEntropyLoss()

# Performance trackers
train_loss = np.array([])
train_accuracy = np.array([])

val_loss = np.array([])
Example 11
 def num_params(self):
     total = 0
     for p in self.model.module.parameters():
         total += th.numel(p.data)
     return total
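The `.module` indirection above suggests the model is wrapped in `nn.DataParallel`; a minimal sketch of that setup (the `Wrapper` class is hypothetical):

import torch as th
import torch.nn as nn

class Wrapper:
    def __init__(self, model):
        # DataParallel exposes the underlying network as .module.
        self.model = nn.DataParallel(model)

    def num_params(self):
        total = 0
        for p in self.model.module.parameters():
            total += th.numel(p.data)
        return total

w = Wrapper(nn.Linear(10, 5))
print(w.num_params())  # 10*5 weights + 5 biases = 55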
Example 12
def model_parameter_number(model):
    k = 0
    for p in model.parameters():
        k += torch.numel(p.data)
    print("model's parameter count:", k)
    return k
Example 13
            # (snippet begins mid-way through the NMF-layer training loop)
            loss.backward()
            loss_lst.append(loss.data)
            print('training the nmf layer')
            print(loss.data)
            for A in net.lsqnonneglst.parameters():
                A.data = A.data.sub_(lr_nmf * A.grad.data)
                A.data = A.data.clamp(min=0)
        total_loss += loss.data
        #             A.requires_grad = False
        # train the linear classifier
        print('training the classifier')
        for k in range(1000):
            net.zero_grad()
            pred = net.linear(S_lst[-1].data)
            loss = criterion(l_batch * pred, l_batch * label)
            loss = loss * torch.numel(l_batch) / torch.sum(l_batch)
            loss.backward()
            if (k + 1) % 100 == 0:
                print(loss.data)
            for A in net.linear.parameters():
                A.data = A.data.sub_(lr_cl * A.grad.data)
#         for A in net.lsqnonneglst.parameters():
#             A.requires_grad = True
    print('epoch = ', epo, '\n', total_loss)
    total_loss_lst.append(total_loss)


# Forward propagation over the whole dataset; remember to save S and prod!
def get_whole_output(net, dataset, param_lst=None):
    ...  # body truncated in the original snippet
Example 14
def dice_metric(input, target):
    intersection = (input * target).sum(dim=(1, 2, 3))
    smooth = torch.ones_like(intersection) * (1 / torch.numel(input[0]))
    return (2. * intersection) / (input.sum(dim=(1, 2, 3)) +
                                  target.sum(dim=(1, 2, 3)) + smooth)
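A quick sanity check for the metric above, assuming binary masks of shape [batch, channels, H, W]:

import torch

input = (torch.rand(2, 1, 8, 8) > 0.5).float()
target = (torch.rand(2, 1, 8, 8) > 0.5).float()
print(dice_metric(input, input))   # close to 1 for a perfect match
print(dice_metric(input, target))  # lower for random overlap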
Example 15
def sum_params(model):
    net_size = 0
    for param in model.parameters():  # accumulate the total parameter count
        net_size = net_size + torch.numel(param)
    return net_size
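This is equivalent to summing the built-in `Tensor.numel`; a quick check:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 5))
print(sum_params(model))                           # 10*20+20 + 20*5+5 = 325
print(sum(p.numel() for p in model.parameters()))  # 325, same count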
Example 16
 def num_episodes(self):
     return torch.numel(self.total_rewards)
Example 17
def main(opt):
    output_dir = join(config.result_dir, opt.model_name + '_{}{}_{}'.format(opt.optimizer, 
                                opt.learning_rate, opt.drop_rate))
    make_path(output_dir)

    output_config = join(output_dir, 'config.json')
    with open(output_config, 'w') as f:
        optDict = opt.__dict__
        json.dump(optDict, f)

    log_dir = join(output_dir, 'log')
    checkpoint_dir = join(output_dir, 'ckpts')
    make_path(log_dir)
    make_path(checkpoint_dir)
    logger = get_logger(log_dir, 'none')
    logger.info('[Output] {}'.format(output_dir))

    ## create a dataset given opt.dataset_mode and other options; note that trn_db is neither a Dataset nor a DataLoader
    trn_db = CustomDatasetDataLoader(opt, config.data_dir, config.target_dir, setname='trn', is_train=True)
    val_db = CustomDatasetDataLoader(opt, config.data_dir, config.target_dir, setname='val', is_train=False)
    tst_db = CustomDatasetDataLoader(opt, config.data_dir, config.target_dir, setname='tst', is_train=False)
    logger.info('The number of training samples = {}'.format(len(trn_db)))
    logger.info('The number of validation samples = {}'.format(len(val_db)))
    logger.info('The number of testing samples = {}'.format(len(tst_db)))

    model_saver = ModelSaver(checkpoint_dir)

    model = DenseNet(opt.gpu_id, growth_rate=opt.growth_rate, block_config=opt.block_config, 
                    num_init_features=opt.num_init_features, bn_size=opt.bn_size, 
                    compression_rate=opt.reduction, drop_rate=opt.drop_rate, 
                    num_classes=opt.num_classes)
    # to gpu card
    model.to(model.device)
    num_parameters = sum(torch.numel(parameter) for parameter in model.parameters())
    logger.info('[Model] parameters {}'.format(num_parameters))
    # logger.info(model)

    # Prepare model
    if opt.is_test and opt.restore_checkpoint:
        logger.info('[Model] At testing stage and restore from {}'.format(opt.restore_checkpoint))
        checkpoint = torch.load(opt.restore_checkpoint)
        model.load_state_dict(checkpoint)
    else:
        checkpoint = {}

    # initialized the optimizer
    if opt.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=opt.learning_rate)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=opt.learning_rate, 
                                momentum=opt.momentum, nesterov=opt.nesterov,
                                weight_decay=opt.weight_decay)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.reduce_half_lr_epoch, gamma=opt.reduce_half_lr_rate)

    best_eval_f1 = 0              # record the best eval F1
    best_eval_epoch = 0           # initialized so the final reload cannot hit a NameError
    patience = opt.patience
    
    for epoch in range(opt.max_epoch):
        for i, batch in enumerate(trn_db):  # inner loop within one epoch
            model.set_input(batch)   
            model.forward()
            batch_loss = model.loss
            optimizer.zero_grad()  
            model.backward()            
            optimizer.step()
            if i % 100 == 0:
                logger.info('\t Cur train batch loss {}'.format(batch_loss))
        # for evaluation
        if epoch % 1 == 0:
            logger.info("============ Evaluation Epoch {} ============".format(epoch))
            logger.info("Cur learning rate {}".format(optimizer.state_dict()['param_groups'][0]['lr']))
            val_log = evaluation(model, val_db)
            logger.info(f"[Validation] Loss: {val_log['loss']:.2f},"
                        f"\t F1: {val_log['F1']*100:.2f},"
                        f"\t WA: {val_log['WA']*100:.2f},"
                        f"\t UA: {val_log['UA']*100:.2f},\n")
            test_log = evaluation(model, tst_db)
            logger.info(f"[Testing] Loss: {test_log['loss']:.2f},"
                        f"\t F1: {test_log['F1']*100:.2f},"
                        f"\t WA: {test_log['WA']*100:.2f},"
                        f"\t UA: {test_log['UA']*100:.2f},\n")
            logger.info(test_log['cm'])
            logger.info('Save model at {} epoch'.format(epoch))
            model_saver.save(model, epoch)
            # update the current best model based on validation results
            if val_log['F1'] > best_eval_f1:
                best_eval_epoch = epoch
                best_eval_f1 = val_log['F1']
                # reset to init
                patience = opt.patience
        # for early stop
        if patience <= 0:            
            break
        else:
            patience -= 1
        # update the learning rate
        scheduler.step()

    # print best eval result
    logger.info('Loading best model found on val set: epoch-%d' % best_eval_epoch)
    checkpoint_path = os.path.join(checkpoint_dir, 'model_step_{}.pt'.format(best_eval_epoch))
    if not os.path.exists(checkpoint_path):
        logger.error("Load checkpoint error, not exist such file")
        exit(0)
    ck = torch.load(checkpoint_path)
    model.load_state_dict(ck)
    val_log = evaluation(model, val_db, save_dir=log_dir, set_name='val')
    logger.info('[Val] result WA: %.4f UAR %.4f F1 %.4f' % (val_log['WA'], val_log['UA'], val_log['F1']))
    logger.info('\n{}'.format(val_log['cm']))
    tst_log = evaluation(model, tst_db, save_dir=log_dir, set_name='tst')
    logger.info('[Tst] result WA: %.4f UAR %.4f F1 %.4f' % (tst_log['WA'], tst_log['UA'], tst_log['F1']))
    logger.info('\n{}'.format(tst_log['cm']))
Example 18
 def __call__(self, pred, y, hinge=0):
     # Compute the mean hinge loss over the whole tensor.
     loss = (torch.ones_like(pred) - pred * y).clamp(min=hinge).sum()
     return loss / torch.numel(pred)
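A minimal usage sketch, assuming the method above lives on a small loss class (the `HingeLoss` wrapper below is hypothetical) and labels are encoded as ±1:

import torch

class HingeLoss:
    def __call__(self, pred, y, hinge=0):
        loss = (torch.ones_like(pred) - pred * y).clamp(min=hinge).sum()
        return loss / torch.numel(pred)

criterion = HingeLoss()
pred = torch.tensor([0.8, -0.3, 1.2])   # raw scores
y = torch.tensor([1.0, -1.0, 1.0])      # labels in {-1, +1}
print(criterion(pred, y))               # tensor(0.3000)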
Example 19
def vae_loss(x, mu, logsigma, recon_x, beta=1):
    recon_loss = F.mse_loss(x, recon_x, reduction='mean')
    kl_loss = -0.5 * torch.sum(1 + logsigma - mu.pow(2) - logsigma.exp())
    kl_loss = kl_loss / torch.numel(x)
    return recon_loss + kl_loss * beta
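A usage sketch with illustrative shapes. Note the KL term is normalized by the number of elements in `x`, not by the latent dimension, which keeps `beta` on a scale comparable to the per-element reconstruction loss:

import torch
import torch.nn.functional as F

x = torch.rand(16, 3, 32, 32)        # input batch
recon_x = torch.rand(16, 3, 32, 32)  # decoder output
mu = torch.randn(16, 128)            # latent mean
logsigma = torch.randn(16, 128)      # latent log-variance
print(vae_loss(x, mu, logsigma, recon_x, beta=1))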
Example 20
def density(tensor):
    nonzero = tensor.abs().gt(0).sum()
    return float(nonzero.item()) / torch.numel(tensor)
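A usage sketch: after zeroing small entries (as magnitude pruning would), `density` reports the fraction of non-zero elements:

import torch

t = torch.randn(100, 100)
t[t.abs() < 0.5] = 0.0   # prune entries with small magnitude
print(density(t))        # roughly 0.62 for a standard normal tensor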
Example 21
        return x


if __name__ == '__main__':
    batch_size = 25
    seq_len = 128
    class_num = 5
    ch_num = 4
    filterbanks = torch.from_numpy(lin_tri_filter_shape(
        32, 256, 100, 0, 50)).to(torch.float).cuda()  # [129, 32]
    net = SeqSleepNet(filterbanks=filterbanks,
                      ch_num=ch_num,
                      seq_len=seq_len,
                      class_num=class_num)
    #net        = SeqSleepNet(filterbanks=filterbanks, seq_len=seq_len, class_num=class_num)
    net = net.cuda()
    inputs = torch.rand(batch_size, seq_len, ch_num,
                        int(100 * 30))  # [bs, seq_len, 30*100]
    inputs = preprocessing(inputs)  # [bs, seq_len, 29, 129]
    print(inputs.shape)
    inputs = inputs.cuda()
    outputs = net(inputs)  # [bs, seq_len, class_num]
    params = list(net.parameters())
    print(outputs.size())
    print("total param num is: {}".format(sum(torch.numel(p) for p in params)))
    '''
    for name, param in net.named_parameters():
        print(name, param.shape)

    '''
Example 22
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--task",
        default=None,
        type=str,
        required=True,
        help="Sentiment analysis or natural language inference? (SA or NLI)")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--trained_model_dir",
        default="",
        type=str,
        help=
        "Where is the fine-tuned (with the cloze-style LM objective) BERT model?"
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--freeze_bert',
                        action='store_true',
                        help="Whether to freeze BERT")
    parser.add_argument('--full_bert',
                        action='store_true',
                        help="Whether to use full BERT")
    parser.add_argument('--num_train_samples',
                        type=int,
                        default=-1,
                        help="-1 for full train set, otherwise please specify")
    parser.add_argument('--damping',
                        type=float,
                        default=0.0,
                        help="probably need damping for deep models")
    parser.add_argument('--test_idx',
                        type=int,
                        default=1,
                        help="test index we want to examine")
    parser.add_argument(
        '--influence_on_decision',
        action='store_true',
        help=
        "Whether to compute influence on decision (rather than influence on ground truth)"
    )
    parser.add_argument("--if_compute_saliency", default=1, type=int)
    parser.add_argument('--start_test_idx',
                        type=int,
                        default=-1,
                        help="when not -1, --test_idx will be disabled")
    parser.add_argument('--end_test_idx',
                        type=int,
                        default=-1,
                        help="when not -1, --test_idx will be disabled")
    parser.add_argument("--lissa_repeat", default=1, type=int)
    parser.add_argument("--lissa_depth", default=1.0, type=float)

    args = parser.parse_args()

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.influence_on_decision:
        raise ValueError(
            "To use loss function w.r.t. the ground truth, manually disable this error in the code."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
        logger.info(
            "WARNING: Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    mnli_processor = MnliProcessor()
    hans_processor = HansProcessor()
    sst_processor = Sst2Processor()
    if args.task == "SA":
        label_list = sst_processor.get_labels()
    elif args.task == "NLI":
        label_list = mnli_processor.get_labels()
    elif args.task == "NLI_negation":
        label_list = mnli_processor.get_labels()
    elif args.task == "NLI_natural":
        label_list = mnli_processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Prepare model
    model = MyBertForSequenceClassification.from_pretrained(
        args.trained_model_dir, num_labels=num_labels)
    if args.fp16:
        raise ValueError("Not sure if FP16 precision works yet.")
        model.half()  # unreachable until the guard above is removed
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    #     for n, p in param_optimizer:
    #         print(n)
    #     sys.exit()
    if args.freeze_bert:
        frozen = ['bert']
    elif args.full_bert:
        frozen = []
    else:
        frozen = [
            'bert.embeddings.',
            'bert.encoder.layer.0.',
            'bert.encoder.layer.1.',
            'bert.encoder.layer.2.',
            'bert.encoder.layer.3.',
            'bert.encoder.layer.4.',
            'bert.encoder.layer.5.',
            'bert.encoder.layer.6.',
            'bert.encoder.layer.7.',
        ]  # *** change here to filter out params we don't want to track ***

    param_influence = []
    for n, p in param_optimizer:
        if (not any(fr in n for fr in frozen)):
            param_influence.append(p)
        elif 'bert.embeddings.word_embeddings.' in n:
            pass  # need gradients through embedding layer for computing saliency map
        else:
            p.requires_grad = False

    param_shape_tensor = []
    param_size = 0
    for p in param_influence:
        tmp_p = p.clone().detach()
        param_shape_tensor.append(tmp_p)
        param_size += torch.numel(tmp_p)
    logger.info("  Parameter size = %d", param_size)

    if args.task == "SA":
        train_examples = sst_processor.get_train_examples(
            args.data_dir, args.num_train_samples)
    elif args.task == "NLI":
        train_examples = mnli_processor.get_train_examples(
            args.data_dir, args.num_train_samples)
    elif args.task == "NLI_negation":
        train_examples = mnli_processor.get_train_examples(
            args.data_dir, args.num_train_samples)
    elif args.task == "NLI_natural":
        train_examples = mnli_processor.get_train_examples(
            args.data_dir, args.num_train_samples)

    train_features = convert_examples_to_features(train_examples, label_list,
                                                  args.max_seq_length,
                                                  tokenizer)
    logger.info("***** Train set *****")
    logger.info("  Num examples = %d", len(train_examples))
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_id = torch.tensor([f.label_id for f in train_features],
                                dtype=torch.long)
    all_guids = torch.tensor([f.guid for f in train_features],
                             dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_id, all_guids)
    train_dataloader_wbatch = DataLoader(train_data,
                                         sampler=SequentialSampler(train_data),
                                         batch_size=args.train_batch_size)
    train_dataloader = DataLoader(train_data,
                                  sampler=SequentialSampler(train_data),
                                  batch_size=1)

    if args.task == "SA":
        test_examples = sst_processor.get_dev_examples(args.data_dir)
    elif args.task == "NLI":
        test_examples = hans_processor.get_test_examples(args.data_dir)
    elif args.task == "NLI_negation":
        test_examples = hans_processor.get_neg_test_examples(args.data_dir)
    elif args.task == "NLI_natural":
        test_examples = mnli_processor.get_dev_examples(args.data_dir)

    test_features = convert_examples_to_features(test_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)
    logger.info("***** Test set *****")
    logger.info("  Num examples = %d", len(test_examples))
    all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                   dtype=torch.long)
    all_label_id = torch.tensor([f.label_id for f in test_features],
                                dtype=torch.long)
    all_guids = torch.tensor([f.guid for f in test_features], dtype=torch.long)
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_id, all_guids)
    test_dataloader = DataLoader(test_data,
                                 sampler=SequentialSampler(test_data),
                                 batch_size=1)

    damping = args.damping

    test_idx = args.test_idx
    start_test_idx = args.start_test_idx
    end_test_idx = args.end_test_idx

    for input_ids, input_mask, segment_ids, label_ids, guids in test_dataloader:
        model.eval()
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        train_dataloader_lissa = DataLoader(train_data,
                                            batch_size=args.train_batch_size,
                                            shuffle=True,
                                            drop_last=True)

        guid = guids[0].item()  # test set loader must have a batch size of 1 now
        if start_test_idx != -1 and end_test_idx != -1:
            if guid < start_test_idx:
                continue
            if guid > end_test_idx:
                break
        else:
            if guid < test_idx:
                continue
            if guid > test_idx:
                break
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        ######## GET TEST EXAMPLE DECISION ########
        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            outputs = np.argmax(logits, axis=1)
            pred_label_ids = torch.from_numpy(outputs).long().to(device)
            if label_ids.item() == pred_label_ids.item():
                test_pred_status = "correct"
            else:
                test_pred_status = "wrong"
        if args.influence_on_decision:
            label_ids = torch.from_numpy(outputs).long().to(device)
        ################

        ######## L_TEST GRADIENT ########
        model.zero_grad()
        test_loss = model(input_ids, segment_ids, input_mask, label_ids)
        test_grads = autograd.grad(test_loss, param_influence)
        ################

        ######## TEST EXAMPLE SALIENCY MAP ########
        if args.if_compute_saliency:
            saliency_scores = saliency_map(model, input_ids, segment_ids,
                                           input_mask, pred_label_ids)
            test_tok_sal_list = []
            for tok, sal in zip(
                    tokenizer.convert_ids_to_tokens(
                        input_ids.view(-1).cpu().numpy()), saliency_scores):
                if tok == '[PAD]':
                    break
                test_tok_sal_list.append((tok, sal))
        ################

        ######## IHVP ########
        model.train()
        logger.info("######## START COMPUTING IHVP ########")
        inverse_hvp = get_inverse_hvp_lissa(
            test_grads,
            model,
            device,
            param_influence,
            train_dataloader_lissa,
            damping=damping,
            num_samples=args.lissa_repeat,
            recursion_depth=int(len(train_examples) * args.lissa_depth))
        logger.info("######## FINISHED COMPUTING IHVP ########")
        ################

        influences = np.zeros(len(train_dataloader.dataset))
        train_tok_sal_lists = []
        for train_idx, (_input_ids, _input_mask, _segment_ids, _label_ids,
                        _) in enumerate(
                            tqdm(train_dataloader, desc="Train set index")):
            model.train()
            _input_ids = _input_ids.to(device)
            _input_mask = _input_mask.to(device)
            _segment_ids = _segment_ids.to(device)
            _label_ids = _label_ids.to(device)

            ######## L_TRAIN GRADIENT ########
            model.zero_grad()
            train_loss = model(_input_ids, _segment_ids, _input_mask,
                               _label_ids)
            train_grads = autograd.grad(train_loss, param_influence)
            influences[train_idx] = torch.dot(
                inverse_hvp, gather_flat_grad(train_grads)).item()
            ################

            ######## TRAIN EXAMPLE SALIENCY MAP ########


#             if args.if_compute_saliency:
#                 with torch.no_grad():
#                     logits = model(_input_ids, _segment_ids, _input_mask)
#                     logits = logits.detach().cpu().numpy()
#                     outputs = np.argmax(logits, axis=1)
#                     _pred_label_ids = torch.from_numpy(outputs).long().to(device)

#                 saliency_scores = saliency_map(model, _input_ids, _segment_ids, _input_mask, _pred_label_ids)
#                 train_tok_sal_list = []
#                 for tok, sal in zip(tokenizer.convert_ids_to_tokens(_input_ids.view(-1).cpu().numpy()), saliency_scores):
#                     if tok == '[PAD]':
#                         break
#                     train_tok_sal_list.append((tok, sal))
#                 train_tok_sal_lists.append(train_tok_sal_list)
################

        if args.influence_on_decision:
            pickle.dump(
                influences,
                open(
                    os.path.join(args.output_dir,
                                 "influences_test_" + str(guid) + ".pkl"),
                    "wb"))
        else:
            pickle.dump(
                influences,
                open(
                    os.path.join(args.output_dir,
                                 "influences_on_x_test_" + str(guid) + ".pkl"),
                    "wb"))
        if args.if_compute_saliency:
            pickle.dump(
                (test_tok_sal_list, train_tok_sal_lists, test_pred_status),
                open(
                    os.path.join(args.output_dir,
                                 "saliency_test_" + str(guid) + ".pkl"), "wb"))
Example 23
    def attribute(self,
                  inputs: TensorOrTupleOfTensorsGeneric,
                  baselines: BaselineType = None,
                  target: TargetType = None,
                  additional_forward_args: Any = None,
                  feature_mask: Union[None, Tensor, Tuple[Tensor, ...]] = None,
                  perturbations_per_eval: int = 1,
                  **kwargs: Any) -> TensorOrTupleOfTensorsGeneric:
        r"""
        Args:

            inputs (tensor or tuple of tensors):  Input for which ablation
                        attributions are computed. If forward_func takes a single
                        tensor as input, a single input tensor should be provided.
                        If forward_func takes multiple tensors as input, a tuple
                        of the input tensors should be provided. It is assumed
                        that for all given input tensors, dimension 0 corresponds
                        to the number of examples (aka batch size), and if
                        multiple input tensors are provided, the examples must
                        be aligned appropriately.
            baselines (scalar, tensor, tuple of scalars or tensors, optional):
                        Baselines define reference value which replaces each
                        feature when ablated.
                        Baselines can be provided as:

                        - a single tensor, if inputs is a single tensor, with
                          exactly the same dimensions as inputs or
                          broadcastable to match the dimensions of inputs

                        - a single scalar, if inputs is a single tensor, which will
                          be broadcasted for each input value in input tensor.

                        - a tuple of tensors or scalars, the baseline corresponding
                          to each tensor in the inputs' tuple can be:

                          - either a tensor with matching dimensions to
                            corresponding tensor in the inputs' tuple
                            or the first dimension is one and the remaining
                            dimensions match with the corresponding
                            input tensor.

                          - or a scalar, corresponding to a tensor in the
                            inputs' tuple. This scalar value is broadcasted
                            for corresponding input tensor.
                        In the cases when `baselines` is not provided, we internally
                        use zero scalar corresponding to each input tensor.
                        Default: None
            target (int, tuple, tensor or list, optional):  Output indices for
                        which gradients are computed (for classification cases,
                        this is usually the target class).
                        If the network returns a scalar value per example,
                        no target index is necessary.
                        For general 2D outputs, targets can be either:

                        - a single integer or a tensor containing a single
                          integer, which is applied to all input examples

                        - a list of integers or a 1D tensor, with length matching
                          the number of examples in inputs (dim 0). Each integer
                          is applied as the target for the corresponding example.

                        For outputs with > 2 dimensions, targets can be either:

                        - A single tuple, which contains #output_dims - 1
                          elements. This target index is applied to all examples.

                        - A list of tuples with length equal to the number of
                          examples in inputs (dim 0), and each tuple containing
                          #output_dims - 1 elements. Each tuple is applied as the
                          target for the corresponding example.

                        Default: None
            additional_forward_args (any, optional): If the forward function
                        requires additional arguments other than the inputs for
                        which attributions should not be computed, this argument
                        can be provided. It must be either a single additional
                        argument of a Tensor or arbitrary (non-tuple) type or a
                        tuple containing multiple additional arguments including
                        tensors or any arbitrary python types. These arguments
                        are provided to forward_func in order following the
                        arguments in inputs.
                        For a tensor, the first dimension of the tensor must
                        correspond to the number of examples. For all other types,
                        the given argument is used for all forward evaluations.
                        Note that attributions are not computed with respect
                        to these arguments.
                        Default: None
            feature_mask (tensor or tuple of tensors, optional):
                        feature_mask defines a mask for the input, grouping
                        features which should be ablated together. feature_mask
                        should contain the same number of tensors as inputs.
                        Each tensor should
                        be the same size as the corresponding input or
                        broadcastable to match the input tensor. Each tensor
                        should contain integers in the range 0 to num_features
                        - 1, and indices corresponding to the same feature should
                        have the same value.
                        Note that features within each input tensor are ablated
                        independently (not across tensors).
                        If the forward function returns a single scalar per batch,
                        we enforce that the first dimension of each mask must be 1,
                        since attributions are returned batch-wise rather than per
                        example, so the attributions must correspond to the
                        same features (indices) in each input example.
                        If None, then a feature mask is constructed which assigns
                        each scalar within a tensor as a separate feature, which
                        is ablated independently.
                        Default: None
            perturbations_per_eval (int, optional): Allows ablation of multiple
                        features to be processed simultaneously in one call to
                        forward_fn.
                        Each forward pass will contain a maximum of
                        perturbations_per_eval * #examples samples.
                        For DataParallel models, each batch is split among the
                        available devices, so evaluations on each available
                        device contain at most
                        (perturbations_per_eval * #examples) / num_devices
                        samples.
                        If the forward function returns a single scalar per batch,
                        perturbations_per_eval must be set to 1.
                        Default: 1
            **kwargs (Any, optional): Any additional arguments used by child
                        classes of FeatureAblation (such as Occlusion) to construct
                        ablations. These arguments are ignored when using
                        FeatureAblation directly.
                        Default: None

        Returns:
            *tensor* or tuple of *tensors* of **attributions**:
            - **attributions** (*tensor* or tuple of *tensors*):
                        The attributions with respect to each input feature.
                        If the forward function returns
                        a scalar value per example, attributions will be
                        the same size as the provided inputs, with each value
                        providing the attribution of the corresponding input index.
                        If the forward function returns a scalar per batch, then
                        attribution tensor(s) will have first dimension 1 and
                        the remaining dimensions will match the input.
                        If a single tensor is provided as inputs, a single tensor is
                        returned. If a tuple of tensors is provided for inputs, a
                        tuple of corresponding sized tensors is returned.


        Examples::

            >>> # SimpleClassifier takes a single input tensor of size Nx4x4,
            >>> # and returns an Nx3 tensor of class probabilities.
            >>> net = SimpleClassifier()
            >>> # Generating random input with size 2 x 4 x 4
            >>> input = torch.randn(2, 4, 4)
            >>> # Defining FeatureAblation interpreter
            >>> ablator = FeatureAblation(net)
            >>> # Compute ablation attributions, ablating each of the 16
            >>> # scalar inputs independently.
            >>> attr = ablator.attribute(input, target=1)

            >>> # Alternatively, we may want to ablate features in groups, e.g.
            >>> # grouping each 2x2 square of the inputs and ablating them together.
            >>> # This can be done by creating a feature mask as follows, which
            >>> # defines the feature groups, e.g.:
            >>> # +---+---+---+---+
            >>> # | 0 | 0 | 1 | 1 |
            >>> # +---+---+---+---+
            >>> # | 0 | 0 | 1 | 1 |
            >>> # +---+---+---+---+
            >>> # | 2 | 2 | 3 | 3 |
            >>> # +---+---+---+---+
            >>> # | 2 | 2 | 3 | 3 |
            >>> # +---+---+---+---+
            >>> # With this mask, all inputs with the same value are ablated
            >>> # simultaneously, and the attributions for inputs in the same
            >>> # group (0, 1, 2, or 3) are the same within each example.
            >>> # The attributions can be calculated as follows:
            >>> # feature mask has dimensions 1 x 4 x 4
            >>> feature_mask = torch.tensor([[[0,0,1,1],[0,0,1,1],
            >>>                             [2,2,3,3],[2,2,3,3]]])
            >>> attr = ablator.attribute(input, target=1, feature_mask=feature_mask)
        """
        # Keep track of whether the original input is a tuple before
        # converting it into a tuple.
        is_inputs_tuple = _is_tuple(inputs)
        inputs, baselines = _format_input_baseline(inputs, baselines)
        additional_forward_args = _format_additional_forward_args(
            additional_forward_args)
        num_examples = inputs[0].shape[0]
        feature_mask = _format_input(
            feature_mask) if feature_mask is not None else None
        assert (
            isinstance(perturbations_per_eval, int)
            and perturbations_per_eval >= 1
        ), "Perturbations per evaluation must be an integer and at least 1."
        with torch.no_grad():
            # Computes initial evaluation with all features, which is compared
            # to each ablated result.
            initial_eval = _run_forward(self.forward_func, inputs, target,
                                        additional_forward_args)
            agg_output_mode = _find_output_mode_and_verify(
                initial_eval, num_examples, perturbations_per_eval,
                feature_mask)
            if not agg_output_mode:
                initial_eval = initial_eval.reshape(1, num_examples)

            # Initialize attribution totals and counts
            attrib_type = cast(
                dtype,
                initial_eval.dtype
                if isinstance(initial_eval, Tensor) else type(initial_eval),
            )
            total_attrib = [
                torch.zeros_like(input[0:1] if agg_output_mode else input,
                                 dtype=attrib_type) for input in inputs
            ]

            # Weights are used in cases where ablations may be overlapping.
            if self.use_weights:
                weights = [
                    torch.zeros_like(
                        input[0:1] if agg_output_mode else input).float()
                    for input in inputs
                ]

            # Iterate through each feature tensor for ablation
            for i in range(len(inputs)):
                # Skip any empty input tensors
                if torch.numel(inputs[i]) == 0:
                    continue
                for (
                        current_inputs,
                        current_add_args,
                        current_target,
                        current_mask,
                ) in self._ablation_generator(i, inputs,
                                              additional_forward_args, target,
                                              baselines, feature_mask,
                                              perturbations_per_eval,
                                              **kwargs):
                    # modified_eval dimensions: 1D tensor with length
                    # equal to #num_examples * #features in batch
                    modified_eval = _run_forward(
                        self.forward_func,
                        current_inputs,
                        current_target,
                        current_add_args,
                    )
                    # eval_diff dimensions: (#features in batch, #num_examples, 1,.. 1)
                    # (contains 1 more dimension than inputs). This adds extra
                    # dimensions of 1 to make the tensor broadcastable with the inputs
                    # tensor.
                    if agg_output_mode:
                        eval_diff = initial_eval - modified_eval
                    else:
                        eval_diff = (initial_eval -
                                     modified_eval.reshape(-1, num_examples)
                                     ).reshape((-1, num_examples) +
                                               (len(inputs[i].shape) - 1) *
                                               (1, ))
                    if self.use_weights:
                        weights[i] += current_mask.float().sum(dim=0)
                    total_attrib[i] += (eval_diff *
                                        current_mask.to(attrib_type)).sum(
                                            dim=0)

            # Divide total attributions by counts and return formatted attributions
            if self.use_weights:
                attrib = tuple(
                    single_attrib.float() / weight
                    for single_attrib, weight in zip(total_attrib, weights))
            else:
                attrib = tuple(total_attrib)
            _result = _format_output(is_inputs_tuple, attrib)
        return _result
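The accumulation above boils down to "evaluation with all features minus evaluation with one feature group ablated." A minimal standalone sketch of that idea, assuming a single input tensor, per-scalar features, and a forward function returning one scalar per example (manual_feature_ablation and the toy model are illustrative, not Captum API):

import torch

def manual_feature_ablation(forward_fn, inputs, baseline=0.0):
    # inputs: (N, F); forward_fn: (N, F) -> (N,)
    with torch.no_grad():
        initial_eval = forward_fn(inputs)
        attrib = torch.zeros_like(inputs)
        for f in range(inputs.shape[1]):
            ablated = inputs.clone()
            ablated[:, f] = baseline          # ablate one feature
            attrib[:, f] = initial_eval - forward_fn(ablated)
    return attrib

# Usage with a fixed linear model:
net = torch.nn.Linear(4, 1)
x = torch.randn(2, 4)
print(manual_feature_ablation(lambda t: net(t).squeeze(-1), x))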
Esempio n. 24
0
# Check whether an object is a PyTorch tensor
import torch as T  # missing from the original snippet; added so it runs

x = [12, 23, 34, 45, 56, 67, 78, 89]
# checks whether the object is a tensor (a plain list is not)
print(T.is_tensor(x))
# checks whether the object is a torch storage object
print(T.is_storage(x))

y = T.randn(2, 2, 3)
print(y)
print(T.is_tensor(y))
print(T.is_storage(y))
# size of tensor
print(y.size())
# the total number of elements in the input tensor
print(T.numel(y))

# The zeros function
z = T.zeros(4, 5)
print(z)
print(z.size())
print(T.numel(z))

# The eye function
w1 = T.eye(3, 4)
print(w1)
print(w1.size())
print(T.numel(w1))
w2 = T.eye(5, 4)
print(w2)
print(w2.size())
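For reference, torch.numel is simply the product of the dimensions reported by size(); a quick check of the values printed above (assumes Python 3.8+ for math.prod):

import math
import torch as T

t = T.zeros(4, 5)
assert T.numel(t) == math.prod(t.size()) == 20  # numel == product of dims
w = T.eye(5, 4)                                 # rectangular "identity"
assert T.numel(w) == 5 * 4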
Esempio n. 25
0
    def forward(self, pred, real):
        diffs = torch.add(real, -pred)
        n = torch.numel(diffs.data)
        # Despite the original variable name ("mse"), this is the mean
        # *absolute* error (L1 loss): it averages |diffs|, not diffs ** 2.
        mae = torch.sum(torch.abs(diffs)) / n

        return mae
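Dividing the summed absolute differences by torch.numel is a manual mean; a quick equivalence sketch using torch.mean and the built-in l1_loss (values illustrative):

import torch

pred = torch.randn(3, 4)
real = torch.randn(3, 4)
diffs = real - pred
manual = torch.sum(torch.abs(diffs)) / torch.numel(diffs)
assert torch.allclose(manual, torch.mean(torch.abs(diffs)))
assert torch.allclose(manual, torch.nn.functional.l1_loss(pred, real))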
Esempio n. 26
0
def train(paramdict):

    fname = paramdict['file']

    with open(fname, 'rb') as f:
        params = pickle.load(f)

    #params = dict(click.get_current_context().params)
    print("Passed params: ", params)
    print(platform.uname())
    #params['nbsteps'] = params['nbshots'] * ((params['prestime'] + params['interpresdelay']) * params['nbclasses']) + params['prestimetest']  # Total number of steps per episode

    suffix = "btchFixmod_" + "".join([
        str(x) + "_" if pair[0] != 'nbsteps' and pair[0] != 'rngseed'
        and pair[0] != 'save_every' and pair[0] != 'test_every'
        and pair[0] != 'pe' else ''
        for pair in sorted(zip(params.keys(), params.values()),
                           key=lambda x: x[0]) for x in pair
    ])[:-1] + "_rngseed_" + str(
        params['rngseed']
    )  # Turning the parameters into a nice suffix for filenames
    #suffix = "modRPDT_"+"".join([str(x)+"_" if pair[0] != 'nbsteps' and pair[0] != 'rngseed' and pair[0] != 'save_every' and pair[0] != 'test_every' else '' for pair in sorted(zip(params.keys(), params.values()), key=lambda x:x[0] ) for x in pair])[:-1] + "_rngseed_" + str(params['rngseed'])   # Turning the parameters into a nice suffix for filenames
    print("Reconstructed suffix:", suffix)

    params['rsp'] = 1

    #params['rngseed'] = 3
    # Initialize random seeds (first two redundant?)
    print("Setting random seeds")
    np.random.seed(params['rngseed'])
    random.seed(params['rngseed'])
    torch.manual_seed(params['rngseed'])
    #print(click.get_current_context().params)

    net = Network(params)
    # YOU MAY NEED TO CHANGE THE DIRECTORY HERE:
    if paramdict['initialize'] == 0:
        net.load_state_dict(torch.load('./tmp/torchmodel_' + suffix + '.dat'))

    print("Shape of all optimized parameters:",
          [x.size() for x in net.parameters()])
    allsizes = [torch.numel(x.data.cpu()) for x in net.parameters()]
    print("Size (numel) of all optimized elements:", allsizes)
    print("Total size (numel) of all optimized elements:", sum(allsizes))

    BATCHSIZE = params['bs']

    LABSIZE = params['msize']
    lab = np.ones((LABSIZE, LABSIZE))
    CTR = LABSIZE // 2

    # Simple cross maze
    #lab[CTR, 1:LABSIZE-1] = 0
    #lab[1:LABSIZE-1, CTR] = 0

    # Double-T maze
    #lab[CTR, 1:LABSIZE-1] = 0
    #lab[1:LABSIZE-1, 1] = 0
    #lab[1:LABSIZE-1, LABSIZE - 2] = 0

    # Grid maze
    lab[1:LABSIZE - 1, 1:LABSIZE - 1].fill(0)
    for row in range(1, LABSIZE - 1):
        for col in range(1, LABSIZE - 1):
            if row % 2 == 0 and col % 2 == 0:
                lab[row, col] = 1
    # Not strictly necessary, but cleaner since we start the agent at the
    # center for each episode; may help localization in some maze sizes
    # (including 13 and 9, but not 11) by introducing a detectable irregularity
    # in the center:
    lab[CTR, CTR] = 0

    all_losses = []
    all_grad_norms = []
    all_losses_objective = []
    all_total_rewards = []
    all_losses_v = []
    lossbetweensaves = 0
    nowtime = time.time()
    meanrewards = np.zeros((LABSIZE, LABSIZE))
    meanrewardstmp = np.zeros((LABSIZE, LABSIZE, params['eplen']))

    pos = 0
    hidden = net.initialZeroState()
    hebb = net.initialZeroHebb()
    pw = net.initialZeroPlasticWeights()

    #celoss = torch.nn.CrossEntropyLoss() # For supervised learning - not used here

    params['nbiter'] = 3
    ax_imgs = []

    for numiter in range(params['nbiter']):

        PRINTTRACE = 0
        #if (numiter+1) % (1 + params['pe']) == 0:
        if (numiter + 1) % (params['pe']) == 0:
            PRINTTRACE = 1

        #lab = makemaze.genmaze(size=LABSIZE, nblines=4)
        #count = np.zeros((LABSIZE, LABSIZE))

        # Select the reward location for this episode - not on a wall!
        # And not on the center either! (though not sure how useful that restriction is...)
        # We always start the episode from the center (when hitting reward, we may teleport either to center or to a random location depending on params['rsp'])
        posr = {}
        posc = {}
        rposr = {}
        rposc = {}
        for nb in range(BATCHSIZE):
            # Note: it doesn't matter if the reward is on the center (see below). All we need is not to put it on a wall or pillar (lab=1)
            myrposr = 0
            myrposc = 0
            while lab[myrposr, myrposc] == 1 or (myrposr == CTR
                                                 and myrposc == CTR):
                myrposr = np.random.randint(1, LABSIZE - 1)
                myrposc = np.random.randint(1, LABSIZE - 1)
            rposr[nb] = myrposr
            rposc[nb] = myrposc
            #print("Reward pos:", rposr, rposc)
            # Agent always starts an episode from the center
            posc[nb] = CTR
            posr[nb] = CTR

        #optimizer.zero_grad()
        loss = 0
        lossv = 0
        hidden = net.initialZeroState()
        hebb = net.initialZeroHebb()
        et = net.initialZeroHebb(
        )  # Eligibility Trace is identical to Hebbian Trace in shape
        pw = net.initialZeroPlasticWeights()
        numactionchosen = 0

        reward = np.zeros(BATCHSIZE)
        sumreward = np.zeros(BATCHSIZE)
        rewards = []
        vs = []
        logprobs = []
        dist = 0
        numactionschosen = np.zeros(BATCHSIZE, dtype='int32')

        #reloctime = np.random.randint(params['eplen'] // 4, (3 * params['eplen']) // 4)

        #print("EPISODE ", numiter)
        for numstep in range(params['eplen']):

            inputs = np.zeros((BATCHSIZE, TOTALNBINPUTS), dtype='float32')

            labg = lab.copy()
            #labg[rposr, rposc] = -1  # The agent can see the reward if it falls within its RF
            for nb in range(BATCHSIZE):
                inputs[nb, 0:RFSIZE *
                       RFSIZE] = labg[posr[nb] - RFSIZE // 2:posr[nb] +
                                      RFSIZE // 2 + 1,
                                      posc[nb] - RFSIZE // 2:posc[nb] +
                                      RFSIZE // 2 + 1].flatten() * 1.0

                inputs[nb, RFSIZE * RFSIZE + 1] = 1.0  # Bias neuron
                inputs[nb, RFSIZE * RFSIZE + 2] = numstep / params['eplen']
                #inputs[0, RFSIZE * RFSIZE +3] = 1.0 * reward # Reward from previous time step
                inputs[nb, RFSIZE * RFSIZE + 3] = 1.0 * reward[nb]
                # One-hot encoding of the previously chosen action
                inputs[nb,
                       RFSIZE * RFSIZE + ADDINPUT + numactionschosen[nb]] = 1
                #inputs = 100.0 * inputs  # input boosting : Very bad with clamp=0

            inputsC = torch.from_numpy(inputs).cuda()
            # Might be better:
            #if rposr == posr and rposc = posc:
            #    inputs[0][-4] = 100.0
            #else:
            #    inputs[0][-4] = 0

            # Running the network
            y, v, hidden, hebb, et, pw = net(
                Variable(inputsC, requires_grad=False), hidden, hebb, et,
                pw)  # y  should output raw scores, not probas

            # For now:
            #numactionchosen = np.argmax(y.data[0])
            # But wait, this is bad, because the network needs to see the
            # reward signal to guide its own (within-episode) learning... and
            # argmax might not provide enough exploration for this!

            #ee = np.exp(y.data[0].cpu().numpy())
            #numactionchosen = np.random.choice(NBNONRESTACTIONS, p = ee / (1e-10 + np.sum(ee)))

            y = F.softmax(y, dim=1)
            # y must be converted to probabilities before building the Categorical
            distrib = torch.distributions.Categorical(y)
            actionschosen = distrib.sample()
            logprobs.append(distrib.log_prob(actionschosen))
            numactionschosen = actionschosen.data.cpu().numpy(
            )  # Convert to a numpy array (also breaks the gradient graph)
            reward = np.zeros(BATCHSIZE, dtype='float32')
            #if numiter == 7 and numstep == 1:
            #    pdb.set_trace()

            for nb in range(BATCHSIZE):
                myreward = 0
                numactionchosen = numactionschosen[nb]

                tgtposc = posc[nb]
                tgtposr = posr[nb]
                if numactionchosen == 0:  # Up
                    tgtposr -= 1
                elif numactionchosen == 1:  # Down
                    tgtposr += 1
                elif numactionchosen == 2:  # Left
                    tgtposc -= 1
                elif numactionchosen == 3:  # Right
                    tgtposc += 1
                else:
                    raise ValueError("Wrong Action")

                reward[nb] = 0.0  # The reward for this step
                if lab[tgtposr][tgtposc] == 1:
                    reward[nb] -= params['wp']
                else:
                    #dist += 1
                    posc[nb] = tgtposc
                    posr[nb] = tgtposr

                # Did we hit the reward location? Increase reward and teleport!
                # Note that it doesn't matter if we teleport onto the reward, since reward hitting is only evaluated after the (obligatory) move
                if rposr[nb] == posr[nb] and rposc[nb] == posc[nb]:
                    reward[nb] += params['rew']
                    posr[nb] = np.random.randint(1, LABSIZE - 1)
                    posc[nb] = np.random.randint(1, LABSIZE - 1)
                    while lab[posr[nb],
                              posc[nb]] == 1 or (rposr[nb] == posr[nb]
                                                 and rposc[nb] == posc[nb]):
                        posr[nb] = np.random.randint(1, LABSIZE - 1)
                        posc[nb] = np.random.randint(1, LABSIZE - 1)

            rewards.append(reward)
            vs.append(v)
            sumreward += reward

            # Entropy-like penalty on the action distribution, implemented as
            # the sum of squared probabilities because this version of PyTorch
            # has no entropy() on Distribution: penalize concentration, i.e.
            # encourage diversity. Note: .2 may be too strong, .04 too weak.
            loss += params['bent'] * y.pow(2).sum() / BATCHSIZE
            #lossentmean  = .99 * lossentmean + .01 * ( params['bent'] * y.pow(2).sum() / BATCHSIZE ).data[0] # We want to penalize concentration, i.e. encourage diversity; our version of PyTorch does not have an entropy() function for Distribution. Note: .2 may be too strong, .04 may be too weak.

            if PRINTTRACE:
                #print("Step ", numstep, "- GI: ", goodinputs, ", GA: ", goodaction, " Inputs: ", inputsN, " - Outputs: ", y.data.cpu().numpy(), " - action chosen: ", numactionchosen,
                #        " - inputsthisstep:", inputsthisstep, " - mean abs pw: ", np.mean(np.abs(pw.data.cpu().numpy())), " -Rew: ", reward)
                print("Step ", numstep, " Inputs (to 1st in batch): ",
                      inputs[0, :TOTALNBINPUTS], " - Outputs(1st in batch): ",
                      y[0].data.cpu().numpy(),
                      " - action chosen(1st in batch): ",
                      numactionschosen[0], " - mean abs pw: ",
                      np.mean(np.abs(pw.data.cpu().numpy())),
                      " -Reward (this step, 1st in batch): ", reward[0])

            # Display the labyrinth

            #for numr in range(LABSIZE):
            #    s = ""
            #    for numc in range(LABSIZE):
            #        if posr == numr and posc == numc:
            #            s += "o"
            #        elif rposr == numr and rposc == numc:
            #            s += "X"
            #        elif lab[numr, numc] == 1:
            #            s += "#"
            #        else:
            #            s += " "
            #    print(s)
            #print("")
            #print("")

            labg = lab.copy()
            labg[rposr[0], rposc[0]] = 2
            labg[posr[0], posc[0]] = 3
            fullimg = plt.imshow(labg, animated=True)
            ax_imgs.append([fullimg])

        # Episode is done, now let's do the actual computations

        R = Variable(torch.zeros(BATCHSIZE).cuda(), requires_grad=False)
        gammaR = params['gr']
        for numstepb in reversed(range(params['eplen'])):
            R = gammaR * R + Variable(torch.from_numpy(
                rewards[numstepb]).cuda(),
                                      requires_grad=False)
            ctrR = R - vs[numstepb][0]
            lossv += ctrR.pow(2).sum() / BATCHSIZE
            loss -= (logprobs[numstepb] * ctrR.detach()
                     ).sum() / BATCHSIZE  # Need to check if detach() is OK
            #pdb.set_trace()

        #elif params['algo'] == 'REI':
        #    R = sumreward
        #    baseline = meanrewards[rposr, rposc]
        #    for numstepb in reversed(range(params['eplen'])) :
        #        loss -= logprobs[numstepb] * (R - baseline)
        #elif params['algo'] == 'REINOB':
        #    R = sumreward
        #    for numstepb in reversed(range(params['eplen'])) :
        #        loss -= logprobs[numstepb] * R
        #elif params['algo'] == 'REITMP':
        #    R = 0
        #    for numstepb in reversed(range(params['eplen'])) :
        #        R = gammaR * R + rewards[numstepb]
        #        loss -= logprobs[numstepb] * R
        #elif params['algo'] == 'REITMPB':
        #    R = 0
        #    for numstepb in reversed(range(params['eplen'])) :
        #        R = gammaR * R + rewards[numstepb]
        #        loss -= logprobs[numstepb] * (R - meanrewardstmp[rposr, rposc, numstepb])

        #else:
        #    raise ValueError("Which algo?")

        #meanrewards[rposr, rposc] = (1.0 - params['nu']) * meanrewards[rposr, rposc] + params['nu'] * sumreward
        #R = 0
        #for numstepb in reversed(range(params['eplen'])) :
        #    R = gammaR * R + rewards[numstepb]
        #    meanrewardstmp[rposr, rposc, numstepb] = (1.0 - params['nu']) * meanrewardstmp[rposr, rposc, numstepb] + params['nu'] * R

        loss += params['blossv'] * lossv
        loss /= params['eplen']

        if True:  #PRINTTRACE:
            if True:  #params['algo'] == 'A3C':
                print("lossv: ", float(lossv))
            print("Total reward for this episode:", sumreward, "Dist:", dist)

        #if numiter > 100:  # Burn-in period for meanrewards
        #    loss.backward()
        #    optimizer.step()

        #torch.cuda.empty_cache()

    print("Saving animation....")
    anim = animation.ArtistAnimation(fig, ax_imgs, interval=200)
    anim.save('anim.gif', writer='imagemagick', fps=10)
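The loop over reversed(range(params['eplen'])) above implements the discounted-return recursion R_t = r_t + gamma * R_{t+1}. A minimal numpy-only sketch of that recursion (function name and values are illustrative):

import numpy as np

def discounted_returns(rewards, gamma):
    R = 0.0
    out = np.zeros(len(rewards))
    for t in reversed(range(len(rewards))):
        R = gamma * R + rewards[t]  # accumulate reward-to-go, discounted
        out[t] = R
    return out

print(discounted_returns([0.0, 0.0, 1.0], gamma=0.9))  # [0.81, 0.9, 1.0]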
Esempio n. 27
0
def accuracy(yhat, y):
    num_correct = torch.eq(yhat, y).sum().float()
    return num_correct / torch.numel(y), num_correct, torch.numel(y)
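A usage sketch for the accuracy helper above, assuming predictions come from an argmax over logits (values illustrative):

import torch

logits = torch.tensor([[2.0, 0.1], [0.2, 1.5], [0.3, 0.9]])
y = torch.tensor([0, 1, 0])
yhat = logits.argmax(dim=1)            # tensor([0, 1, 1])
acc, correct, total = accuracy(yhat, y)
print(acc)                             # tensor(0.6667): 2 of 3 correct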
Esempio n. 28
0
def get_layer_param(model):
    return sum([torch.numel(param) for param in model.parameters()])
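get_layer_param above is the standard parameter count; a quick check on a small model, together with the equivalent generator form (the Linear layer is illustrative):

import torch
import torch.nn as nn

model = nn.Linear(10, 2)                       # weight 10*2 + bias 2
assert get_layer_param(model) == 22
assert sum(p.numel() for p in model.parameters()) == 22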
Esempio n. 29
0
    def forward(self, hidden_states, position_ids, attention_mask, memory_states=None, encoder_states=None,
                return_memory=False, detach_memory=True):
        batch_size, query_length = hidden_states.size()[:2]
        memory_length = memory_states[0].size(1) if memory_states else 0
        key_length = query_length + memory_length
        # attention_mask is the beginning position of the B region, in [0, query_len)
        is_scalar = torch.numel(attention_mask) == 1
        is_sep = is_scalar or torch.numel(attention_mask) == batch_size
        if self.performer:
            assert is_scalar, 'attention_mask should be a scalar to indicate the separation position.'
            assert memory_length == 0, 'Do not support transformer-xl.'
        if is_sep:
            sep = attention_mask.item() if is_scalar else attention_mask

            # conventional transformer
            def build_mask_matrix(seq_length, sep, memory_length=0):
                m = hidden_states.new_ones((1, seq_length, seq_length))
                m = torch.tril(m)
                if is_scalar:
                    m[0, :, :sep] = 1
                else:
                    m = m.expand(batch_size, -1, -1)
                    ids = torch.arange(seq_length, device=sep.device, dtype=sep.dtype).view(1, -1)
                    mask = ids < sep.view(-1, 1)
                    m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1)
                if memory_length > 0:
                    m = m.expand(batch_size, -1, -1)
                    m = torch.cat((hidden_states.new_ones((batch_size, seq_length, memory_length)), m), dim=2)
                m = m.unsqueeze(1)
                return m

            if not self.performer:
                attention_mask = build_mask_matrix(query_length, sep, memory_length=memory_length)
        else:
            attention_mask = attention_mask[:, :, :, -query_length - memory_length:]

        if self.relative_encoding:
            position_sequence = torch.arange(key_length - 1, -1, -1.0, device=hidden_states.device,
                                             dtype=hidden_states.dtype)
            position_embeddings = self.position_embeddings(position_sequence)
            # Apply dropout
            position_embeddings = self.embedding_dropout(position_embeddings)
        else:
            if self.block_position_encoding:
                position_ids, block_position_ids = position_ids[:, 0], position_ids[:, 1]
            position_embeddings = self.position_embeddings(position_ids)
            hidden_states = hidden_states + position_embeddings
            if self.block_position_encoding:
                block_position_embeddings = self.block_position_embeddings(block_position_ids)
                hidden_states = hidden_states + block_position_embeddings
        hidden_states = self.embedding_dropout(hidden_states)

        def check_detach(_hidden_states):
            if detach_memory:
                return _hidden_states.detach()
            return _hidden_states

        if self.max_memory_length > 0 or return_memory:
            mem_layers = [check_detach(hidden_states)]
        else:
            mem_layers = []

        def custom(start, end):
            def custom_forward(*inputs):
                layers_ = self.layers[start:end]
                x_, inputs = inputs[0], inputs[1:]
                if self.relative_encoding:
                    inputs, mems_ = inputs[:4], inputs[4:]
                else:
                    inputs, mems_ = inputs[:1], inputs[1:]
                for i, layer in enumerate(layers_):
                    mem_i_ = mems_[i] if mems_ else None
                    x_ = layer(x_, *inputs, mem=mem_i_)
                    if self.max_memory_length > 0 or return_memory:
                        mem_layers.append(check_detach(x_))
                return x_

            return custom_forward

        if self.checkpoint_activations:
            l = 0
            num_layers = len(self.layers)
            chunk_length = self.checkpoint_num_layers
            while l < num_layers:
                args = [hidden_states, attention_mask] if not self.use_decoder_layer else [hidden_states,
                                                                                           encoder_states,
                                                                                           attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                if memory_states:
                    args += memory_states[l: l + chunk_length]
                hidden_states = checkpoint(custom(l, l + chunk_length), *args)
                l += chunk_length
        else:
            for i, layer in enumerate(self.layers):
                args = [hidden_states, attention_mask] if not self.use_decoder_layer else [hidden_states,
                                                                                           encoder_states,
                                                                                           attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                mem_i = memory_states[i] if memory_states else None
                hidden_states = layer(*args, mem=mem_i)
                if self.max_memory_length > 0 or return_memory:
                    mem_layers.append(check_detach(hidden_states))

        # Final layer norm.
        output = self.final_layernorm(hidden_states)
        if self.max_memory_length > 0 or return_memory:
            mem_layers = self.update_mems(mem_layers, memory_states, return_memory=return_memory)

        return (output, mem_layers)
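build_mask_matrix above combines a causal (lower-triangular) mask with a fully visible prefix of length sep (the bidirectional "B region"). A minimal standalone sketch of the scalar-sep case:

import torch

seq_length, sep = 5, 2
m = torch.tril(torch.ones(1, seq_length, seq_length))
m[0, :, :sep] = 1          # prefix tokens are visible to every query
print(m[0])
# rows: query positions, cols: key positions;
# first `sep` columns are all ones, the rest stay causal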
Esempio n. 30
0
def train(paramdict):
    # params = dict(click.get_current_context().params)
    hebbian_trace = []
    # TOTALNBINPUTS =  RFSIZE * RFSIZE + ADDITIONALINPUTS + NBNONRESTACTIONS
    print("Starting training...")
    params = {}
    # params.update(defaultParams)
    params.update(paramdict)
    print("Passed params: ", params)
    print(platform.uname())
    # params['nbsteps'] = params['nbshots'] * ((params['prestime'] + params['interpresdelay']) * params['nbclasses']) + params['prestimetest']  # Total number of steps per episode
    suffix = "btchFixmod_" + "".join([
        str(x) + "_" if pair[0] is not 'nbsteps' and pair[0] is not 'rngseed'
        and pair[0] is not 'save_every' and pair[0] is not 'test_every'
        and pair[0] is not 'pe' else ''
        for pair in sorted(zip(params.keys(), params.values()),
                           key=lambda x: x[0]) for x in pair
    ])[:-1] + "_rngseed_" + str(
        params['rngseed']
    )  # Turning the parameters into a nice suffix for filenames

    # Initialize random seeds (first two redundant?)
    print("Setting random seeds")
    np.random.seed(params['rngseed'])
    random.seed(params['rngseed'])
    torch.manual_seed(params['rngseed'])

    print("Initializing network")
    use_cuda = False  #torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    net = Network(TOTALNBINPUTS,
                  params['hs']).to(device)  # Creating the network

    #net.load_state_dict(torch.load('200k_trained_normal.dat'))
    #net.load_state_dict(torch.load('200k_trained_move_reward.dat'))
    net.load_state_dict(torch.load('200k_5by5_hidden20_move.dat'))

    #net.load_state_dict(torch.load('50k_trained_uncertainty.dat'))

    print("Shape of all optimized parameters:",
          [x.size() for x in net.parameters()])
    allsizes = [torch.numel(x.data.cpu()) for x in net.parameters()]
    print("Size (numel) of all optimized elements:", allsizes)
    print("Total size (numel) of all optimized elements:", sum(allsizes))

    # total_loss = 0.0
    print("Initializing optimizer")
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=1.0 * params['lr'],
                                 eps=1e-4,
                                 weight_decay=params['l2'])
    # optimizer = torch.optim.SGD(net.parameters(), lr=1.0*params['lr'])
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=params['gamma'], step_size=params['steplr'])

    LABSIZE = params['msize']
    lab = np.ones((LABSIZE, LABSIZE))
    CTR = LABSIZE // 2
    num_zeros = 0
    rwloc = []
    # Grid maze
    lab[1:LABSIZE - 1, 1:LABSIZE - 1].fill(0)
    for row in range(1, LABSIZE - 1):
        for col in range(1, LABSIZE - 1):
            if row % 2 == 0 and col % 2 == 0:
                lab[row, col] = 1
            else:
                rwloc.append([row, col])
                num_zeros = num_zeros + 1
    BATCHSIZE = params['bs']
    NUMMOVES = 1
    movebuffer = 5
    move = True
    #BATCHSIZE = num_zeros
    # Not strictly necessary, but cleaner since we start the agent at the
    # center for each episode; may help localization in some maze sizes
    # (including 13 and 9, but not 11) by introducing a detectable irregularity
    # in the center:
    lab[CTR, CTR] = 0

    all_losses = []
    all_grad_norms = []
    all_losses_objective = []
    all_total_rewards = []
    all_losses_v = []
    lossbetweensaves = 0
    nowtime = time.time()
    meanrewards = np.zeros((LABSIZE, LABSIZE))
    meanrewardstmp = np.zeros((LABSIZE, LABSIZE, params['eplen']))

    pos = 0
    hidden = net.initialZeroState(BATCHSIZE)
    hebb = net.initialZeroHebb(BATCHSIZE)
    # pw = net.initialZeroPlasticWeights()  # For eligibility traces

    # celoss = torch.nn.CrossEntropyLoss() # For supervised learning - not used here
    print(rwloc[1])
    print("Starting episodes!")

    PRINTTRACE = 0
    # lab = makemaze.genmaze(size=LABSIZE, nblines=4)
    # count = np.zeros((LABSIZE, LABSIZE))

    # Select the reward location for this episode - not on a wall!
    # And not on the center either! (though not sure how useful that restriction is...)
    # We always start the episode from the center
    posr = {}
    posc = {}
    reward_move_time = {}
    #buffer time between front and end for random movement of reward
    #for example: 50 for 200 episodes would move the reward between 50 and 150.

    rposr = {}
    rposc = {}

    rposr_old = {}
    rposc_old = {}

    #Search checks if it's in a searching mode or if it's found the reward
    search = {}
    moved = {}
    BATCHSIZE = params['eplen'] - 2 * movebuffer
    for nb in range(BATCHSIZE):
        search[nb] = True
        moved[nb] = False
        # Note: it doesn't matter if the reward is on the center (see below). All we need is not to put it on a wall or pillar (lab=1)

        #For this we'll make the reward always in the same position
        myrposr = 0
        myrposc = 0
        while lab[myrposr, myrposc] == 1 or (myrposr == CTR
                                             and myrposc == CTR):
            myrposr = np.random.randint(1, LABSIZE - 1)
            myrposc = np.random.randint(1, LABSIZE - 1)
        rposr[nb] = myrposr
        rposc[nb] = myrposc

        rposr_old[nb] = myrposr
        rposc_old[nb] = myrposc
        # print("Reward pos:", rposr, rposc)
        # Agent always starts an episode from the center
        posc[nb] = CTR
        posr[nb] = CTR
        #reward_move_time[nb] = random.randint(movebuffer,params['eplen']-movebuffer)
        reward_move_time[nb] = nb + movebuffer
        #random.sample(range(movebuffer, params['eplen'] - movebuffer), NUMMOVES)
    print(reward_move_time)
    optimizer.zero_grad()
    loss = 0
    lossv = 0
    hidden = net.initialZeroState(BATCHSIZE).to(device)
    hebb = net.initialZeroHebb(BATCHSIZE).to(device)
    numactionchosen = 0

    reward = np.zeros(BATCHSIZE)
    sumreward = np.zeros(BATCHSIZE)
    sumreward_after = np.zeros(BATCHSIZE)
    rewards = []
    vs = []
    logprobs = []
    dist = 0
    numactionschosen = np.zeros(BATCHSIZE, dtype='int32')

    # reloctime = np.random.randint(params['eplen'] // 4, (3 * params['eplen']) // 4)

    # print("EPISODE ", numiter)
    for numstep in range(params['eplen']):
        inputs = np.zeros((BATCHSIZE, TOTALNBINPUTS), dtype='float32')

        labg = lab.copy()
        for nb in range(BATCHSIZE):
            if numstep == reward_move_time[nb] and move:
                # Reset so the while loop below actually redraws; otherwise
                # the stale myrposr/myrposc left over from the setup loop are
                # already valid and every batch element gets the same location.
                myrposr = 0
                myrposc = 0
                while lab[myrposr, myrposc] == 1 or (myrposr == CTR
                                                     and myrposc == CTR):
                    myrposr = np.random.randint(1, LABSIZE - 1)
                    myrposc = np.random.randint(1, LABSIZE - 1)
                rposr[nb] = myrposr
                rposc[nb] = myrposc
                moved[nb] = True
                # print("Reward pos:", rposr, rposc)
                # Agent always starts an episode from the center

            inputs[nb,
                   0:RFSIZE * RFSIZE] = labg[posr[nb] - RFSIZE // 2:posr[nb] +
                                             RFSIZE // 2 + 1,
                                             posc[nb] - RFSIZE // 2:posc[nb] +
                                             RFSIZE // 2 + 1].flatten() * 1.0

            inputs[nb, RFSIZE * RFSIZE + 1] = 1.0  # Bias neuron
            inputs[nb, RFSIZE * RFSIZE + 2] = numstep / params['eplen']
            inputs[nb, RFSIZE * RFSIZE + 3] = 1.0 * reward[nb]
            # One-hot encoding of the previously chosen action
            inputs[nb, RFSIZE * RFSIZE + ADDITIONALINPUTS +
                   numactionschosen[nb]] = 1

        inputsC = torch.from_numpy(inputs).to(device)

        ## Running the network
        y, v, (hidden, hebb) = net(
            inputsC, (hidden, hebb))  # y  should output raw scores, not probas
        print(hebb.shape)
        for nb in range(BATCHSIZE):
            for node in range(params['hs']):
                hebbian_trace.append(
                    np.concatenate((np.array([nb, node, numstep, search[nb]]),
                                    hebb[nb][node].detach().numpy())))
        y = torch.softmax(y, dim=1)
        distrib = torch.distributions.Categorical(y)
        actionschosen = distrib.sample()
        logprobs.append(distrib.log_prob(actionschosen))
        numactionschosen = actionschosen.data.cpu().numpy(
        )  # We want to break gradients
        reward = np.zeros(BATCHSIZE, dtype='float32')

        for nb in range(BATCHSIZE):
            myreward = 0
            numactionchosen = numactionschosen[nb]

            tgtposc = posc[nb]
            tgtposr = posr[nb]
            if numactionchosen == 0:  # Up
                tgtposr -= 1
            elif numactionchosen == 1:  # Down
                tgtposr += 1
            elif numactionchosen == 2:  # Left
                tgtposc -= 1
            elif numactionchosen == 3:  # Right
                tgtposc += 1
            else:
                raise ValueError("Wrong Action")

            reward[nb] = 0.0  # The reward for this step
            if lab[tgtposr][tgtposc] == 1:
                reward[nb] -= params['wp']
            else:
                posc[nb] = tgtposc
                posr[nb] = tgtposr

            #if it hits the old reward location then it reenters search mode
            if moved[nb] and posr[nb] == rposr_old[nb] and posc[
                    nb] == rposc_old[nb]:
                search[nb] = True
            # Did we hit the reward location? Increase reward and teleport!
            # Note that it doesn't matter if we teleport onto the reward, since reward hitting is only evaluated after the (obligatory) move...
            # But we still avoid it.
            if rposr[nb] == posr[nb] and rposc[nb] == posc[nb]:
                if search[nb]:
                    search[nb] = False
                    moved[nb] = False
                reward[nb] += params['rew']
                posr[nb] = np.random.randint(1, LABSIZE - 1)
                posc[nb] = np.random.randint(1, LABSIZE - 1)
                while lab[posr[nb],
                          posc[nb]] == 1 or (rposr[nb] == posr[nb]
                                             and rposc[nb] == posc[nb]):
                    posr[nb] = np.random.randint(1, LABSIZE - 1)
                    posc[nb] = np.random.randint(1, LABSIZE - 1)
            if reward_move_time[nb] < numstep:
                sumreward_after[nb] += reward[nb]
            else:
                sumreward[nb] += reward[nb]

        rewards.append(reward)
        vs.append(v)
        #sumreward += reward

        # This is an "entropy penalty", implemented by the sum-of-squares of the probabilities because our version of PyTorch did not have an entropy() function.
        # The result is the same: to penalize concentration, i.e. encourage diversity in chosen actions.
        loss += (params['bent'] * y.pow(2).sum() / BATCHSIZE)

        # if PRINTTRACE:
        #    print("Step ", numstep, " Inputs (to 1st in batch): ", inputs[0, :TOTALNBINPUTS], " - Outputs(1st in batch): ", y[0].data.cpu().numpy(), " - action chosen(1st in batch): ", numactionschosen[0],
        #            #" - mean abs pw: ", np.mean(np.abs(pw.data.cpu().numpy())),
        #            " -Reward (this step, 1st in batch): ", reward[0])

    # Episode is done, now let's do the actual computations of rewards and
    # losses for the A2C algorithm

    R = torch.zeros(BATCHSIZE).to(device)
    gammaR = params['gr']
    for numstepb in reversed(range(params['eplen'])):
        R = gammaR * R + torch.from_numpy(rewards[numstepb]).to(device)
        ctrR = R - vs[numstepb][0]
        lossv += ctrR.pow(2).sum() / BATCHSIZE
        loss -= (logprobs[numstepb] * ctrR.detach()).sum() / BATCHSIZE
        # pdb.set_trace()

    loss += params['blossv'] * lossv
    loss /= params['eplen']

    if PRINTTRACE:
        if True:  # params['algo'] == 'A3C':
            print("lossv: ", float(lossv))
        print("Total reward for this episode (all):", sumreward, "Dist:", dist)

    #loss.backward()
    # Note: with loss.backward() commented out above, no gradients are
    # populated, so the clipping call below has nothing to act on.
    all_grad_norms.append(
        torch.nn.utils.clip_grad_norm_(net.parameters(), params['gc']))

    lossnum = float(loss)
    lossbetweensaves += lossnum
    all_losses_objective.append(lossnum)
    all_total_rewards.append(sumreward.mean())
    # all_losses_v.append(lossv.data[0])
    # total_loss  += lossnum

    lossbetweensaves = 0
    print("Rewards before move:", sumreward)
    print("Move times: ", reward_move_time)
    print("Rewards after move:", sumreward_after)
    #print("Reward Locs", rwloc)
    print("Mean reward (across batch): ",
          sumreward.mean() + sumreward_after.mean())
    previoustime = nowtime
    nowtime = time.time()
    print("Time spent: ", nowtime - previoustime)
    # print("ETA: ", net.eta.data.cpu().numpy(), " etaet: ", net.etaet.data.cpu().numpy())

    # if (numiter + 1) % params['save_every'] == 0:
    #     print("Saving files...")
    #     losslast100 = np.mean(all_losses_objective[-100:])
    #     print("Average loss over the last 100 episodes:", losslast100)
    #     print("Saving local files...")
    #     with open('grad_' + suffix + '.txt', 'w') as thefile:
    #         for item in all_grad_norms[::10]:
    #             thefile.write("%s\n" % item)
    #     with open('loss_' + suffix + '.txt', 'w') as thefile:
    #         for item in all_total_rewards[::10]:
    #             thefile.write("%s\n" % item)
    #     torch.save(net.state_dict(), 'torchmodel_' + suffix + '.dat')
    #     with open('params_' + suffix + '.dat', 'wb') as fo:
    #         pickle.dump(params, fo)
    #     if os.path.isdir('/mnt/share/tmiconi'):
    #         print("Transferring to NFS storage...")
    #         for fn in ['params_' + suffix + '.dat', 'loss_' + suffix + '.txt', 'torchmodel_' + suffix + '.dat']:
    #             result = os.system(
    #                 'cp {} {}'.format(fn, '/mnt/share/tmiconi/modulmaze/' + fn))
    print("Done!")
    np.savetxt("move_all_hebb_with_search.csv", hebbian_trace, delimiter=",")
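For reference, clip_grad_norm_ only has something to clip once backward() has populated the .grad fields; a minimal sketch of the usual step order (the model and optimizer here are illustrative):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss = model(torch.randn(8, 4)).pow(2).mean()

optimizer.zero_grad()
loss.backward()                                           # populate .grad
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()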
Esempio n. 31
0
            data, lab = [_.cuda() for _ in batch]

            p = args.shot * args.train_way
            data_shot = data[:p]
            data_query = data[p:]
            data_shot = data_shot[:, :3, :]
            data_query = data_query[:, 3:, :]
            train_gt = lab[:p].reshape(args.shot, args.train_way)[0, :]

            #data_query = data_query[:,:3,:]

            proto = model_cnn(data_shot)
            proto = proto.reshape(args.shot, args.train_way, -1)

            which_novel = torch.gt(train_gt, 79)
            which_novel = args.train_way - torch.numel(train_gt[which_novel])

            if which_novel < args.train_way:
                proto_base = proto[:, :which_novel, :]
                proto_novel = proto[:, which_novel:, :]
                noise = torch.cuda.FloatTensor(
                    (args.train_way - which_novel) * args.shot,
                    noise_dim).normal_()
                proto_novel_gen = model_gen(
                    proto_novel.reshape(
                        args.shot * (args.train_way - which_novel), -1), noise)
                proto_novel_gen = proto_novel_gen.reshape(
                    args.shot, args.train_way - which_novel, -1)
                proto_novel_wgen = torch.cat([proto_novel, proto_novel_gen])
                ind_gen = torch.randperm(2 * args.shot)
                train_num = np.random.randint(1, args.shot)
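The which_novel computation above counts the elements selected by a boolean mask via torch.numel on the indexed tensor; a minimal sketch, with the equivalent (and cheaper) mask.sum() count (values illustrative):

import torch

train_gt = torch.tensor([3, 85, 92, 10, 80])
novel = torch.gt(train_gt, 79)               # labels above the base range
n_novel = torch.numel(train_gt[novel])       # count via indexing + numel
assert n_novel == int(novel.sum()) == 3      # same count, no copy needed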
Esempio n. 32
0
    def forward(ctx, x, x_star, gamma):
        ctx.reg_func = Smoothed1Norm(gamma=gamma)
        ctx.lin_op = Grad()
        diff = ctx.lin_op(x - x_star)
        ctx.save_for_backward(diff, x)
        # Reuse diff rather than recomputing ctx.lin_op(x - x_star)
        return ctx.reg_func(diff) / torch.numel(x)
Esempio n. 33
0
    def backward(ctx, grad_out):
        diff, x = ctx.saved_tensors
        gradx = ctx.lin_op.T(ctx.reg_func.grad(diff)) / torch.numel(x)
        gradx_star = -gradx
        return gradx, gradx_star, None
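The pair of methods above keeps forward and backward consistent by dividing both by torch.numel(x). Since Smoothed1Norm and Grad are not defined in this snippet, here is a minimal analogous custom Function with the same normalization pattern, verified with gradcheck (MeanSq is illustrative, not from the source):

import torch

class MeanSq(torch.autograd.Function):
    # toy: f(x) = sum(x**2) / numel(x); df/dx = 2*x / numel(x)
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x.pow(2).sum() / torch.numel(x)

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        # gradient must carry the same 1/numel factor as the forward pass
        return grad_out * 2.0 * x / torch.numel(x)

x = torch.randn(3, 4, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(MeanSq.apply, (x,))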
Esempio n. 34
0
    def forward(self, x):
        x = self.features(x)
        # Yes, this is correct: torch.numel(x[0]) is the number of features
        # per sample, so view(-1, ...) infers the batch dimension. It is
        # equivalent to x.view(x.size(0), -1).
        x = x.view(-1, torch.numel(x[0]))
        x = self.classifier(x)
        return x
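A quick standalone check of the equivalence claimed in the comment above (shapes illustrative):

import torch

x = torch.randn(8, 3, 4, 4)
a = x.view(-1, torch.numel(x[0]))   # infer batch dim, 48 features each
b = x.view(x.size(0), -1)           # infer feature dim instead
c = torch.flatten(x, start_dim=1)
assert a.shape == b.shape == c.shape == (8, 48)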