def test_model_debugger_pmap(self):
        """Test training for two epochs on MNIST with a small model."""

        rep_variables = set_up_cnn()

        pytree_path = os.path.join(self.test_dir, 'metrics')
        metrics_logger = utils.MetricLogger(pytree_path=pytree_path,
                                            events_dir=self.test_dir)
        debugger = model_debugger.ModelDebugger(use_pmap=True,
                                                metrics_logger=metrics_logger)

        # eval twice to test the concat
        extra_metrics = {'train_loss': 1.0}
        extra_metrics2 = {'train_loss': 1.0}
        metrics = debugger.full_eval(10,
                                     params=rep_variables['params'],
                                     grad=rep_variables['params'],
                                     extra_scalar_metrics=extra_metrics)
        metrics = debugger.full_eval(10,
                                     params=rep_variables['params'],
                                     grad=rep_variables['params'],
                                     extra_scalar_metrics=extra_metrics2)
        expected_keys = [
            'step',
            'global_param_norm_sql2',
            'param_norms_sql2',
            'grad_norms_sql2',
            'global_grad_norm_sql2',
            'train_loss',
        ]

        metrics_file = os.path.join(self.test_dir, 'metrics/training_metrics')

        loaded_metrics = checkpoint.load_checkpoint(metrics_file)['pytree']

        self.assertEqual(set(expected_keys), set(metrics.keys()))
        expected_shape = ()
        self.assertEqual(metrics['global_grad_norm_sql2'].shape,
                         expected_shape)
        # Test that stored metrics are concatenated.
        expected_shape = (2, )
        self.assertEqual(loaded_metrics['global_grad_norm_sql2'].shape,
                         expected_shape)

        # check param norms were saved correctly
        self.assertEqual(
            loaded_metrics['param_norms_sql2']['Conv_0']['kernel'].shape,
            (2, ))
        self.assertEqual(loaded_metrics['train_loss'][0], 1.0)

        # Test restore of prior metrics.
        new_debugger = model_debugger.ModelDebugger(
            use_pmap=True, metrics_logger=metrics_logger)
        metrics = new_debugger.full_eval(10,
                                         params=rep_variables['params'],
                                         grad=rep_variables['params'],
                                         extra_scalar_metrics=extra_metrics2)
        self.assertEqual(
            new_debugger.stored_metrics['param_norms_sql2']['Conv_0']
            ['kernel'].shape, (3, ))
def test_accuracy(dataloaders, checkpoint_name='ic-model.pth', gpu=False):
    # TODO: Do validation on the test set
    cuda = gpu
    model = loader.load_checkpoint(checkpoint_name, cuda)
    correct = 0
    total = 0
    model.eval()
    if cuda:
        model.to(device='cuda')
    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(dataloaders['test']):
            if cuda:
                inputs, labels = inputs.cuda(), labels.cuda()
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Correct: ' + str(correct))
    print('Total: ' + str(total))
    print('Accuracy of the network on the %d test images: %d %%' %
          (total, 100 * correct / total))

    # correct/total is a fraction, so compare against 0.9 rather than 90.
    if (correct / total) > 0.9:
        print('It was more than 90%')
    else:
        print('It was 90% or less')
def main():

    in_args = get_prediction_args()

    chk_suffix = '.pth'
    checkpoint_path = in_args.files[1] + chk_suffix

    model, optimizer = load_checkpoint(checkpoint_path)

    prob, labels = predict(in_args, model)
    print('Probabilities of each class: ', prob)
    print('Classes predicted: ', labels)
Example #4
    def eval(self, checkpoint_path):
        checkpoint = cp.load_checkpoint(address=checkpoint_path)
        self.model.load_state_dict(checkpoint['state_dict'])

        test = data_utils.TestSet(self.testpath, self.img_size,
                                  self.channel == 3)
        testdatas = test.loadtestdata()
        # astype() returns a new array rather than converting in place; np.float is deprecated.
        testdatas = testdatas.astype(np.float32)
        n = 0
        N = 16343
        batch_size = 8
        pre = np.array([])
        batch_site = []
        while n < N:
            n += batch_size
            if n < N:
                n1 = n - batch_size
                n2 = n
            else:
                n1 = n - batch_size  # n2 may not be defined yet on the first pass
                n2 = N

            batch_site.append([n1, n2])

        pred_choice = []
        for site in tqdm(batch_site):
            test_batch = testdatas[site[0]:site[1]]
            test_batch = torch.from_numpy(test_batch)
            datas = Variable(test_batch).float()
            datas = datas.view(-1, 1, 128, 128)
            outputs = self.model(datas)
            outputs = outputs.cpu()
            outputs = outputs.data.numpy()
            for out in outputs:
                K = 5
                index = np.argpartition(out, -K)[-K:]
                pred_choice.append(index)
        pre = np.array(pred_choice)
        predicts = []
        for k in range(self.testnumber):
            index = pre[k]
            predict5 = self.words[index]
            predict5 = "".join(predict5)
            predicts.append(predict5)

        dataframe = pd.DataFrame({
            'filename': self.filename,
            'label': predicts
        })
        dataframe.to_csv("test.csv", index=False, encoding='utf-8')
        return self.filename, predicts
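
# A minimal, self-contained sketch of the batch-range construction above: the
# same [start, end] pairs can be built with a plain range() step (N and
# batch_size below are made-up values, not the ones used by the class).
N, batch_size = 20, 8
batch_site = [[n1, min(n1 + batch_size, N)] for n1 in range(0, N, batch_size)]
# -> [[0, 8], [8, 16], [16, 20]]
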
def main():
    checkpoint = cp.load_checkpoint(address='parameters.pth')
    net.load_state_dict(checkpoint['state_dict'])
    outputs = net(x)
    outputs = outputs.cpu()
    outputs = outputs.data.numpy()

    pred_choice = []
    for out in outputs:
        K = 1
        index = np.argpartition(out, -K)[-K:]
        pred_choice.append(index)
    pre = np.array(pred_choice).flatten()  # flatten so it can be assigned as a single DataFrame column

    df['score'] = pre
    df.to_csv('predict.csv', encoding='gbk')
Example #6
def main():
    args = parse_args()
    model = load_checkpoint(args.checkpoint)
    categories = category_names(args.category_names)
    image_path = args.filepath
    model.eval()
    probs, classes = predict(image_path, model, topk=args.top_k)
    names = [categories[str(index)] for index in classes]
    print(probs)
    print(names)

    print('File selected: ' + image_path)
    for name, prob in zip(names, probs):
        print("{} with a probability of {}".format(name, prob))
def predict(image_path, model_name, topk=10, categories='', device='cuda'):
    ''' Predict the class (or classes) of an image using a trained deep learning model.
    '''

    if (not torch.cuda.is_available() and device == 'cuda'):
        device = 'cpu'

    # TODO: Implement the code to predict the class from an image file

    with open('cat_to_name.json', 'r') as f:
        label_mapper = json.load(f)

    # Fall back to the JSON mapping when no categories dict is passed in.
    if not categories:
        categories = label_mapper

    gpu = (device == 'cuda')

    model = loader.load_checkpoint(model_name, gpu=gpu)

    model.to('cpu')

    img = process_image(image_path)

    img = torch.from_numpy(img).type(torch.FloatTensor)

    inpt = img.unsqueeze(0)

    model_result = model(inpt)

    # The model returns log-probabilities, so exp() already yields probabilities;
    # re-applying softmax to only the top-k values would renormalise them incorrectly.
    exp_result = torch.exp(model_result)

    top_probs, top_classes = exp_result.topk(topk)

    probs = top_probs.detach().numpy().tolist()[0]
    classes = top_classes.detach().numpy().tolist()[0]

    # Convert indices to classes
    idx_to_class = {val: key for key, val in model.class_to_idx.items()}
    labels = [idx_to_class[y] for y in classes]
    flowers = [categories[label] for label in labels]

    return probs, flowers
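
# A minimal, self-contained sketch of the top-k lookup used in predict() above:
# exp() of log-probabilities already gives probabilities, topk() returns values
# and indices, and an inverted class_to_idx maps indices back to class labels
# (the toy logits and class_to_idx below are made up for illustration).
import torch

log_probs = torch.log_softmax(torch.tensor([[2.0, 0.5, 1.0, -1.0]]), dim=1)
probs_full = torch.exp(log_probs)
top_probs, top_idx = probs_full.topk(3)
class_to_idx = {'daisy': 0, 'rose': 1, 'tulip': 2, 'lily': 3}
idx_to_class = {v: k for k, v in class_to_idx.items()}
top_names = [idx_to_class[i] for i in top_idx[0].tolist()]
print(top_names, top_probs[0].tolist())
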
Example #8
def initialise(config, dataset, args):

    data_root = config.root
    log_root = args.log_dir or data_root

    model_args = struct(dataset=struct(classes=dataset.classes,
                                       input_channels=3),
                        model=args.model,
                        version=2)

    run = 0

    debug = struct(predictions=args.debug_predictions or args.debug_all,
                   boxes=args.debug_boxes or args.debug_all)

    output_path, log = logger.make_experiment(log_root,
                                              args.run_name,
                                              load=not args.no_load,
                                              dry_run=args.dry_run)
    model_path = os.path.join(output_path, "model.pth")

    model, encoder = models.create(model_args.model, model_args.dataset)

    set_bn_momentum(model, args.bn_momentum)

    best, current, resumed = checkpoint.load_checkpoint(
        model_path, model, model_args, args)
    model, epoch = current.model, current.epoch + 1

    pause_time = args.pause_epochs
    running_average = [] if epoch >= args.average_start else []

    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    # optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.weight_decay)

    device = torch.cuda.current_device()
    tests = args.tests.split(",")

    return struct(**locals())
Example #9
def main(test_img_path):
    options = parse_args()
    is_cuda = use_cuda and not options.no_cuda
    hardware = "cuda" if is_cuda else "cpu"
    device = torch.device(hardware)

    for checkpoint_path in options.checkpoint:
        checkpoint_name, _ = os.path.splitext(
            os.path.basename(checkpoint_path))
        checkpoint = (load_checkpoint(checkpoint_path, cuda=is_cuda)
                      if checkpoint_path else default_checkpoint)
        encoder_checkpoint = checkpoint["model"].get("encoder")
        decoder_checkpoint = checkpoint["model"].get("decoder")

        test_img = Image.open(test_img_path)
        test_img = test_img.convert("RGB")

        enc = Encoder(img_channels=3, checkpoint=encoder_checkpoint).to(device)
        dec = Decoder(
            1,
            low_res_shape,
            high_res_shape,
            checkpoint=decoder_checkpoint,
            device=device,
        ).to(device)
        enc.eval()
        dec.eval()

        result = evaluate(
            enc,
            dec,
            test_img=test_img,
            device=device,
            checkpoint=checkpoint,
            beam_width=options.beam_width,
            prefix=options.prefix,
        )
        print(result)
Example #10
def train(checkpoint_path):
    # Whether to load saved model parameters
    load = False

    if load:
        checkpoint = cp.load_checkpoint(address=checkpoint_path)
        net.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    else:
        start_epoch = 0

    for epoch in range(start_epoch, n_epoch):
        train_one_epoch()

        # Save checkpoint parameters
        checkpoint = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        cp.save_checkpoint(checkpoint, address=checkpoint_path)

        eval()
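
# A minimal, self-contained sketch of the save/resume pattern used above, with
# plain torch.save / torch.load standing in for cp.save_checkpoint /
# cp.load_checkpoint (the tiny model, optimizer and file path are made up).
import os
import torch
import torch.nn as nn
import torch.optim as optim

demo_net = nn.Linear(4, 2)
demo_optimizer = optim.SGD(demo_net.parameters(), lr=0.1)
demo_path = 'demo_checkpoint.pth'

if os.path.exists(demo_path):
    demo_ckpt = torch.load(demo_path)
    demo_net.load_state_dict(demo_ckpt['state_dict'])
    demo_optimizer.load_state_dict(demo_ckpt['optimizer'])
    demo_start_epoch = demo_ckpt['epoch'] + 1
else:
    demo_start_epoch = 0

for demo_epoch in range(demo_start_epoch, demo_start_epoch + 2):
    torch.save({'epoch': demo_epoch,
                'state_dict': demo_net.state_dict(),
                'optimizer': demo_optimizer.state_dict()}, demo_path)
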
Example #11
    def train(self, checkpoint_path):
        # Whether to load saved model parameters
        load = False

        if load:
            checkpoint = cp.load_checkpoint(address=checkpoint_path)
            self.model.load_state_dict(checkpoint['state_dict'])
            start_epoch = checkpoint['epoch'] + 1
        else:
            start_epoch = 0

        for epoch in range(start_epoch, self.n_epoch):
            self.train_one_epoch(epoch)

            # Save checkpoint parameters
            checkpoint = {
                'epoch': epoch,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict()
            }
            cp.save_checkpoint(checkpoint, address=checkpoint_path)

            if self.selftest:
                self.eval(epoch)
    def train_(self, criterion, logger, model, **pars):
        """
        grad_cache will be updated in-place
        *** only learning_rate and current_loader_ind need to be loaded at checkpoint
        K: the number of active clients
        """
        default_pars = dict(learning_rate=1e-2,
                            K=10,
                            num_its=5000,
                            lr_decay=0.5,
                            decay_step_size=1000,
                            print_every=50,
                            checkpoint_interval=1000)
        init_pars(default_pars, pars)
        pars = default_pars

        K = pars['K']
        learning_rate = pars['learning_rate']
        num_its = pars['num_its']
        lr_decay = pars['lr_decay']
        decay_step_size = pars['decay_step_size']
        print_every = pars['print_every']
        checkpoint_interval = pars['checkpoint_interval']

        I = self.pars['I']
        N = self.pars['N']

        checkpoint_dir = self.checkpoint_dir

        logger.add_meta_data(pars, 'training')
        logger.add_meta_data(self.pars, 'simulation')

        if use_cuda:
            model = model.to(torch.device('cuda'))
        else:
            model = model.to(torch.device('cpu'))

        if osp.exists(osp.join(checkpoint_dir, 'meta.pkl')):
            current_it = load_checkpoint(checkpoint_dir, model, logger)
        else:
            current_it = 0

        while True:
            current_lr = learning_rate * (lr_decay
                                          **(current_it // decay_step_size))
            print(f"current_it={current_it}, current_lr={current_lr}",
                  end='\r')

            global_model = deepcopy(model)
            zero_model(global_model)

            # sample the K active clients for this round
            idxs_users = np.random.choice(range(N), K, replace=False)
            for idx in idxs_users:
                worker = self.workers[idx]
                local_model = deepcopy(model)
                worker.train_(local_model,
                              criterion,
                              current_lr=current_lr,
                              num_its=I)
                aggregate_model(
                    global_model, local_model, 1,
                    N / K * (worker.num_train / self.num_total_samples))
            model = global_model
            logger.add_train_loss(
                list(model.parameters())[0][0][0][0][0], current_it,
                'model-par')

            if current_it % print_every == 0:
                # fedavg
                fed_acc_array = self.test_model(model)
                fed_acc = np.array(fed_acc_array).mean()
                print('%d fedavg test acc: %.3f%%' %
                      (current_it, fed_acc * 100.0))
                logger.add_test_acc(fed_acc, current_it, 'fedavg')

            if current_it % checkpoint_interval == 0:
                save_checkpoint(current_it, model, logger, checkpoint_dir)

            if current_it == num_its:
                print('Finished Training')
                return

            current_it += 1
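
# A minimal, self-contained sketch of the FedAvg-style aggregation performed in
# the loop above: zero_model / aggregate_model are project helpers, so the same
# idea is written here directly on state_dicts, with made-up client weights.
import torch
import torch.nn as nn
from copy import deepcopy

global_net = nn.Linear(4, 2)
local_nets = [deepcopy(global_net) for _ in range(3)]  # stand-ins for locally trained client models
client_weights = [0.5, 0.3, 0.2]  # e.g. proportional to each client's share of the data

agg_state = {k: torch.zeros_like(v) for k, v in global_net.state_dict().items()}
for w, local_net in zip(client_weights, local_nets):
    for k, v in local_net.state_dict().items():
        agg_state[k] += w * v
global_net.load_state_dict(agg_state)
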
Example #13
    model.dataset = args.dataset
    # Channels-first input layout.
    model.input_shape = (1, 3, args.image_size, args.image_size)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Leverage {} device to run this task.".format(device))
    if args.cpu:
        device = torch.device("cpu")
    model.to(device)

    optimizer = None
    compress_scheduler = None
    if args.train:
        if args.resume_from:
            # Load checkpoint for locally pre-trained model.
            try:
                model, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
                    model, args.model_path, model_device=device)
            except Exception:
                # Fall back to a plain state_dict if it is not a distiller checkpoint.
                model.load_state_dict(torch.load(args.model_path))

            # Discard any optimizer restored from the checkpoint and rebuild it below.
            optimizer = None
            if optimizer is None:
                optimizer = optim.SGD(model.parameters(),
                                      lr=args.lr_pretrain,
                                      momentum=0.9,
                                      weight_decay=args.weight_decay)
                print("Do build optimizer")

            store_mask = compress_scheduler.zeros_mask_dict
            compress_scheduler = None
            if compress_scheduler is None:
                if args.compress:
Example #14
    #***************************************************
    criterion = nn.CrossEntropyLoss().to(device)
    # Setting weight decay scheduler (?)
    #optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    optimizer = None
    if args.train:
        if args.resume_from:
            # Load checkpoint for post-training from the pre-trained model.
            """
            net, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
            net, "/home/bwtseng/Downloads/vww_mobilenetv1_distiller/model_save/image_net_mobilenetv1_saved_best.pth.tar", 
            model_device=device)
            """
            net, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
                net,  #os.path.join('/home/bwtseng/Downloads/', args.model_path, name), 
                "/home/bwtseng/Downloads/distiller/examples/ssl/checkpoints/checkpoint_trained_dense.pth.tar",
                model_device=device)

            optimizer = None
            print(optimizer)
            if optimizer is None:
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=0.9,
                                      weight_decay=args.weight_decay)
                print("Do optimizer")
            compress_scheduler = None
            if compress_scheduler is None:
                compress_scheduler = utl.file_config(net, optimizer,
                                                     args.compress, None, None)
                print("Do load compress")
def main():
    options = parse_args()
    torch.manual_seed(options.seed)
    is_cuda = use_cuda and not options.no_cuda
    hardware = "cuda" if is_cuda else "cpu"
    device = torch.device(hardware)

    checkpoint = (load_checkpoint(options.checkpoint, cuda=is_cuda)
                  if options.checkpoint else default_checkpoint)
    print("Running {} epochs on {}".format(options.num_epochs, hardware))
    encoder_checkpoint = checkpoint["model"].get("encoder")
    decoder_checkpoint = checkpoint["model"].get("decoder")
    if encoder_checkpoint is not None:
        print(("Resuming from - Epoch {}: "
               "Train Accuracy = {train_accuracy:.5f}, "
               "Train Loss = {train_loss:.5f}, "
               "Validation Accuracy = {validation_accuracy:.5f}, "
               "Validation Loss = {validation_loss:.5f}, ").format(
                   checkpoint["epoch"],
                   train_accuracy=checkpoint["train_accuracy"][-1],
                   train_loss=checkpoint["train_losses"][-1],
                   validation_accuracy=checkpoint["validation_accuracy"][-1],
                   validation_loss=checkpoint["validation_losses"][-1],
               ))

    train_dataset = CrohmeDataset(gt_train,
                                  tokensfile,
                                  root=root,
                                  crop=options.crop,
                                  transform=transformers)
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=options.batch_size,
        shuffle=True,
        num_workers=options.num_workers,
        collate_fn=collate_batch,
    )
    validation_dataset = CrohmeDataset(gt_validation,
                                       tokensfile,
                                       root=root,
                                       crop=options.crop,
                                       transform=transformers)
    validation_data_loader = DataLoader(
        validation_dataset,
        batch_size=options.batch_size,
        shuffle=True,
        num_workers=options.num_workers,
        collate_fn=collate_batch,
    )
    criterion = nn.CrossEntropyLoss().to(device)
    enc = Encoder(img_channels=3,
                  dropout_rate=options.dropout_rate,
                  checkpoint=encoder_checkpoint).to(device)
    dec = Decoder(
        len(train_dataset.id_to_token),
        low_res_shape,
        high_res_shape,
        checkpoint=decoder_checkpoint,
        device=device,
    ).to(device)
    enc.train()
    dec.train()

    enc_params_to_optimise = [
        param for param in enc.parameters() if param.requires_grad
    ]
    dec_params_to_optimise = [
        param for param in dec.parameters() if param.requires_grad
    ]
    params_to_optimise = [*enc_params_to_optimise, *dec_params_to_optimise]
    optimiser = optim.Adadelta(params_to_optimise,
                               lr=options.lr,
                               weight_decay=options.weight_decay)
    optimiser_state = checkpoint.get("optimiser")
    if optimiser_state:
        optimiser.load_state_dict(optimiser_state)
    # Set the learning rate instead of using the previous state.
    # The scheduler somehow overwrites the LR to the initial LR after loading,
    # which would always reset it to the first used learning rate instead of
    # the one from the previous checkpoint. So might as well set it manually.
    for param_group in optimiser.param_groups:
        param_group["initial_lr"] = options.lr
    # Decay learning rate by a factor of lr_factor (default: 0.1)
    # every lr_epochs (default: 3)
    lr_scheduler = optim.lr_scheduler.StepLR(optimiser,
                                             step_size=options.lr_epochs,
                                             gamma=options.lr_factor)

    train(
        enc,
        dec,
        optimiser,
        criterion,
        train_data_loader,
        validation_data_loader,
        teacher_forcing_ratio=options.teacher_forcing,
        lr_scheduler=lr_scheduler,
        print_epochs=options.print_epochs,
        device=device,
        num_epochs=options.num_epochs,
        checkpoint=checkpoint,
        prefix=options.prefix,
        max_grad_norm=options.max_grad_norm,
    )
Example #16
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    model = create_net(args)

    x = torch.randn(1, 3, 224, 224)
    flops, params = profile(model, inputs=(x, ))

    print("model [%s] - params: %.6fM" % (args.arch, params / 1e6))
    print("model [%s] - FLOPs: %.6fG" % (args.arch, flops / 1e9))

    log_file = os.path.join(args.ckpt, "log.txt")

    if os.path.exists(log_file):
        args.log_file = open(log_file, mode="a")
    else:
        args.log_file = open(log_file, mode="w")
        args.log_file.write("Network - " + args.arch + "\n")
        args.log_file.write("Attention Module - " + args.attention_type + "\n")
        args.log_file.write("Params - " % str(params) + "\n")
        args.log_file.write("FLOPs - " % str(flops) + "\n")
        args.log_file.write(
            "--------------------------------------------------" + "\n")

    args.log_file.close()

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.device)
        model = model.to(args.gpu[0])
        model = torch.nn.DataParallel(model, args.gpu)

    print(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    if args.resume:
        model, optimizer, best_acc1, start_epoch = load_checkpoint(
            args, model, optimizer)
        args.start_epoch = start_epoch

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.save_weights is not None:  # "deparallelize" saved weights
        print("=> saving 'deparallelized' weights [%s]" % args.save_weights)
        model = model.module
        model = model.cpu()
        torch.save({'state_dict': model.state_dict()},
                   args.save_weights,
                   _use_new_zipfile_serialization=False)
        return

    if args.evaluate:
        args.log_file = open(log_file, mode="a")
        validate(val_loader, model, criterion, args)
        args.log_file.close()
        return

    if args.cos_lr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.epochs)
        for epoch in range(args.start_epoch):
            scheduler.step()

    for epoch in range(args.start_epoch, args.epochs):

        args.log_file = open(log_file, mode="a")

        if args.distributed:
            train_sampler.set_epoch(epoch)

        if (not args.cos_lr):
            adjust_learning_rate(optimizer, epoch, args)
        else:
            scheduler.step()
            print('[%03d] %.5f' % (epoch, scheduler.get_lr()[0]))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        args.log_file.close()

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):

            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc": best_acc1,
                    "optimizer": optimizer.state_dict(),
                },
                is_best,
                epoch,
                save_path=args.ckpt)
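
# A minimal, self-contained sketch of the "fast-forward the scheduler" resume
# trick used above: after rebuilding CosineAnnealingLR, it is stepped once per
# already-completed epoch so the learning rate matches the checkpoint (the
# model, lr and epoch count below are made up).
import torch.nn as nn
import torch.optim as optim

resume_opt = optim.SGD(nn.Linear(2, 2).parameters(), lr=0.1)
resume_sched = optim.lr_scheduler.CosineAnnealingLR(resume_opt, T_max=10)
resumed_epochs = 3
for _ in range(resumed_epochs):
    resume_sched.step()  # advances only the LR schedule, no optimizer update
print(resume_sched.get_last_lr())
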
Example #17
def main():
    options = parse_args()
    is_cuda = use_cuda and not options.no_cuda
    hardware = "cuda" if is_cuda else "cpu"
    device = torch.device(hardware)

    for dataset_name in options.dataset:
        results = {"best": {}, "mean": {}, "highest_prob": {}}
        for checkpoint_path in options.checkpoint:
            checkpoint_name, _ = os.path.splitext(
                os.path.basename(checkpoint_path))
            checkpoint = (load_checkpoint(checkpoint_path, cuda=is_cuda)
                          if checkpoint_path else default_checkpoint)
            encoder_checkpoint = checkpoint["model"].get("encoder")
            decoder_checkpoint = checkpoint["model"].get("decoder")

            test_set = test_sets[dataset_name]
            dataset = CrohmeDataset(
                test_set["groundtruth"],
                tokensfile,
                root=test_set["root"],
                transform=transformers,
            )
            data_loader = DataLoader(
                dataset,
                batch_size=options.batch_size,
                shuffle=False,
                num_workers=options.num_workers,
                collate_fn=collate_batch,
            )

            enc = Encoder(img_channels=3,
                          checkpoint=encoder_checkpoint).to(device)
            dec = Decoder(
                len(dataset.id_to_token),
                low_res_shape,
                high_res_shape,
                checkpoint=decoder_checkpoint,
                device=device,
            ).to(device)
            enc.eval()
            dec.eval()

            result = evaluate(
                enc,
                dec,
                data_loader=data_loader,
                device=device,
                checkpoint=checkpoint,
                beam_width=options.beam_width,
                prefix=options.prefix,
            )
            results["best"][checkpoint_name] = result["best"]
            results["mean"][checkpoint_name] = result["mean"]
            results["highest_prob"][checkpoint_name] = result["highest_prob"]

        highest_prob_err_table, highest_prob_correct_table = create_markdown_tables(
            results["highest_prob"])
        best_err_table, best_correct_table = create_markdown_tables(
            results["best"])
        mean_err_table, mean_correct_table = create_markdown_tables(
            results["mean"])
        print(("\n# Dataset {name}\n\n"
               "Beam width: {beam_width}\n\n"
               "## Highest Probability\n\n{highest_prob_err_table}\n\n"
               "{highest_prob_correct_table}\n\n"
               "## Best\n\n{best_err_table}\n\n{best_correct_table}\n\n"
               "## Mean\n\n{mean_err_table}\n\n{mean_correct_table}").format(
                   name=dataset_name,
                   beam_width=options.beam_width,
                   highest_prob_err_table=highest_prob_err_table,
                   highest_prob_correct_table=highest_prob_correct_table,
                   best_err_table=best_err_table,
                   best_correct_table=best_correct_table,
                   mean_err_table=mean_err_table,
                   mean_correct_table=mean_correct_table,
               ))
Example #18
def train(args):

    # Set up directories ===========================================================
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(BUFFER_DIR, exist_ok=True)
    exp_name = "EXP_%04d" % (args.expID)
    exp_path = os.path.join(DATA_DIR, exp_name)
    rb_path = os.path.join(BUFFER_DIR, exp_name)
    os.makedirs(exp_path, exist_ok=True)
    os.makedirs(rb_path, exist_ok=True)
    # save arguments
    with open(os.path.join(exp_path, 'args.txt'), 'w+') as f:
        json.dump(args.__dict__, f, indent=2)

    # Retrieve MuJoCo XML files for training ========================================
    agent_name = args.agent_name
    envs_train_names = [agent_name]
    args.graphs = dict()
    # existing envs
    if not args.custom_xml:
        args.graphs[agent_name] = utils.getGraphStructure(
            os.path.join(XML_DIR, '{}.xml'.format(agent_name)))
    # custom envs

    num_envs_train = len(envs_train_names)
    print("#" * 50 + '\ntraining envs: {}\n'.format(envs_train_names) +
          "#" * 50)

    # Set up training env and policy ================================================
    args.limb_obs_size, args.max_action = utils.registerEnvs(
        envs_train_names, args.max_episode_steps, args.custom_xml)
    max_num_limbs = max(
        [len(args.graphs[env_name]) for env_name in envs_train_names])
    # create vectorized training env
    obs_max_len = max(
        [len(args.graphs[env_name])
         for env_name in envs_train_names]) * args.limb_obs_size
    envs_train = [
        utils.makeEnvWrapper(name, obs_max_len, args.seed)
        for name in envs_train_names
    ]
    # envs_train = SubprocVecEnv(envs_train)  # vectorized env
    # set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # determine the maximum number of children in all the training envs
    if args.max_children is None:
        args.max_children = utils.findMaxChildren(envs_train_names,
                                                  args.graphs)
    # setup agent policy
    policy = TD3.LifeLongTD3(args)

    # Create new training instance or load previous checkpoint ========================
    if cp.has_checkpoint(exp_path, rb_path):
        print("*** loading checkpoint from {} ***".format(exp_path))
        total_timesteps, episode_num, replay_buffer, num_samples, loaded_path = cp.load_checkpoint(
            exp_path, rb_path, policy, args)
        print("*** checkpoint loaded from {} ***".format(loaded_path))
    else:
        print("*** training from scratch ***")
        # init training vars
        total_timesteps = 0
        episode_num = 0
        num_samples = 0
        # different replay buffer for each env; avoid using too much memory if there are too many envs

    # Initialize training variables ================================================
    writer = SummaryWriter("%s/%s/" % (DATA_DIR, exp_name))
    s = time.time()
    # TODO: may have to move the following code into the loop
    timesteps_since_saving = 0
    this_training_timesteps = 0
    episode_timesteps = 0
    episode_reward = 0
    episode_reward_buffer = 0
    done = True

    # Start training ===========================================================
    for env_handle, env_name in zip(envs_train, envs_train_names):
        env = env_handle()
        obs = env.reset()
        replay_buffer = utils.ReplayBuffer(max_size=args.rb_max)
        policy.change_morphology(args.graphs[env_name])
        policy.graph = args.graphs[env_name]
        task_timesteps = 0
        done = False
        episode_timesteps = 0
        episode_reward = 0
        episode_reward_buffer = 0
        while task_timesteps < args.max_timesteps:
            # train and log after one episode for each env
            if done:
                # log updates and train policy
                if this_training_timesteps != 0:
                    policy.train(replay_buffer,
                                 episode_timesteps,
                                 args.batch_size,
                                 args.discount,
                                 args.tau,
                                 args.policy_noise,
                                 args.noise_clip,
                                 args.policy_freq,
                                 graphs=args.graphs,
                                 env_name=env_name)
                    # add to tensorboard display

                    writer.add_scalar('{}_episode_reward'.format(env_name),
                                      episode_reward, task_timesteps)
                    writer.add_scalar('{}_episode_len'.format(env_name),
                                      episode_timesteps, task_timesteps)
                    # print to console
                    print(
                        "-" * 50 +
                        "\nExpID: {}, FPS: {:.2f}, TotalT: {}, EpisodeNum: {}, SampleNum: {}, ReplayBSize: {}"
                        .format(args.expID, this_training_timesteps /
                                (time.time() -
                                 s), total_timesteps, episode_num, num_samples,
                                len(replay_buffer.storage)))
                    print("{} === EpisodeT: {}, Reward: {:.2f}".format(
                        env_name, episode_timesteps, episode_reward))
                    this_training_timesteps = 0
                    s = time.time()

                # save model and replay buffers
                if timesteps_since_saving >= args.save_freq:
                    print("!!!!!")
                    timesteps_since_saving = 0
                    model_saved_path = cp.save_model(exp_path, policy,
                                                     total_timesteps,
                                                     episode_num, num_samples,
                                                     {env_name: replay_buffer},
                                                     envs_train_names, args)
                    print("*** model saved to {} ***".format(model_saved_path))
                    rb_saved_path = cp.save_replay_buffer(
                        rb_path, {env_name: replay_buffer})
                    print("*** replay buffers saved to {} ***".format(
                        rb_saved_path))

                # reset training variables
                obs = env.reset()
                done = False
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1
                # create reward buffer to store reward for one sub-env when it is not done
                episode_reward_buffer = 0

            # start sampling ===========================================================
            # sample action randomly for sometime and then according to the policy
            if task_timesteps < args.start_timesteps:
                action = np.random.uniform(low=env.action_space.low[0],
                                           high=env.action_space.high[0],
                                           size=max_num_limbs)
            else:
                # remove 0 padding of obs before feeding into the policy (trick for vectorized env)
                obs = np.array(obs[:args.limb_obs_size *
                                   len(args.graphs[env_name])])
                policy_action = policy.select_action(obs)
                if args.expl_noise != 0:
                    policy_action = (policy_action + np.random.normal(
                        0, args.expl_noise, size=policy_action.size)).clip(
                            env.action_space.low[0], env.action_space.high[0])
                # add 0-padding to ensure that size is the same for all envs
                action = np.append(
                    policy_action,
                    np.array([
                        0 for i in range(max_num_limbs - policy_action.size)
                    ]))

            # perform action in the environment
            new_obs, reward, done, _ = env.step(action)

            # record if each env has ever been 'done'

            # add the instant reward to the cumulative buffer
            # if any sub-env is done at the moment, set the episode reward list to be the value in the buffer
            episode_reward_buffer += reward
            if done and episode_reward == 0:
                episode_reward = episode_reward_buffer
                episode_reward_buffer = 0
            writer.add_scalar('{}_instant_reward'.format(env_name), reward,
                              task_timesteps)
            done_bool = float(done)
            if episode_timesteps + 1 == args.max_episode_steps:
                done_bool = 0
                done = True
            # remove 0 padding before storing in the replay buffer (trick for vectorized env)
            num_limbs = len(args.graphs[env_name])
            obs = np.array(obs[:args.limb_obs_size * num_limbs])
            new_obs = np.array(new_obs[:args.limb_obs_size * num_limbs])
            action = np.array(action[:num_limbs])
            # insert transition in the replay buffer
            replay_buffer.add((obs, new_obs, action, reward, done_bool))
            num_samples += 1
            # do not increment episode_timesteps if the sub-env has been 'done'
            if not done:
                episode_timesteps += 1
                total_timesteps += 1
                task_timesteps += 1
                this_training_timesteps += 1
                timesteps_since_saving += 1

            obs = new_obs
        policy.next_task()

    # save checkpoint after training ===========================================================
    model_saved_path = cp.save_model(exp_path, policy, total_timesteps,
                                     episode_num, num_samples,
                                     {envs_train_names[-1]: replay_buffer},
                                     envs_train_names, args)
    print("*** training finished and model saved to {} ***".format(
        model_saved_path))
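
# A minimal, self-contained sketch of the zero-padding trick described in the
# comments above: observations are padded to a fixed obs_max_len for the
# vectorized env and stripped back to limb_obs_size * num_limbs before the
# policy and replay buffer see them (all sizes below are made up).
import numpy as np

limb_obs_size, num_limbs, obs_max_len = 4, 3, 20
real_obs = np.arange(limb_obs_size * num_limbs, dtype=np.float32)
padded_obs = np.append(real_obs, np.zeros(obs_max_len - real_obs.size))
stripped_obs = padded_obs[:limb_obs_size * num_limbs]
assert np.array_equal(real_obs, stripped_obs)
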
Example #19
def eval(**args):
    """
    Evaluate selected model 
    Args:
        seed       (Int):        Integer indicating set seed for random state
        save_dir   (String):     Top level directory to generate results folder
        model      (String):     Name of selected model 
        dataset    (String):     Name of selected dataset  
        exp        (String):     Name of experiment 
        load_type  (String):     Keyword indicator to evaluate the testing or validation set
        pretrained (Int/String): Int/String indicating loading of random, pretrained or saved weights
        
    Return:
        None
    """

    print("\n############################################################################\n")
    print("Experimental Setup: ", args)
    print("\n############################################################################\n")

    d          = datetime.datetime.today()
    date       = d.strftime('%Y%m%d-%H%M%S')
    result_dir = os.path.join(args['save_dir'], args['model'], '_'.join((args['dataset'],args['exp'],date)))
    log_dir    = os.path.join(result_dir, 'logs')
    save_dir   = os.path.join(result_dir, 'checkpoints')

    if not args['debug']:
        os.makedirs(result_dir, exist_ok=True)
        os.makedirs(log_dir,    exist_ok=True) 
        os.makedirs(save_dir,   exist_ok=True) 

        # Save copy of config file
        with open(os.path.join(result_dir, 'config.yaml'),'w') as outfile:
            yaml.dump(args, outfile, default_flow_style=False)

        # Tensorboard Element
        writer = SummaryWriter(log_dir)

    # Check if GPU is available (CUDA)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load Network
    model = create_model_object(**args).to(device)

    # Load Data
    loader = data_loader(**args, model_obj=model)

    if args['load_type'] == 'train_val':
        eval_loader = loader['valid']

    elif args['load_type'] == 'train':
        eval_loader = loader['train']

    elif args['load_type'] == 'test':
        eval_loader  = loader['test'] 

    else:
        sys.exit("load_type must be 'train_val', 'train', or 'test' for eval, exiting")

    # END IF

    if isinstance(args['pretrained'], str):
        ckpt = load_checkpoint(args['pretrained'])
        model.load_state_dict(ckpt)

    # Training Setup
    params     = [p for p in model.parameters() if p.requires_grad]

    acc_metric = Metrics(**args, result_dir=result_dir, ndata=len(eval_loader.dataset))
    acc = 0.0

    # Setup Model To Evaluate 
    model.eval()

    with torch.no_grad():
        for step, data in enumerate(eval_loader):
            x_input     = data['data']
            annotations = data['annots']

            if isinstance(x_input, torch.Tensor):
                outputs = model(x_input.to(device))
            else:
                for i, item in enumerate(x_input):
                    if isinstance(item, torch.Tensor):
                        x_input[i] = item.to(device)
                outputs = model(*x_input)

            # END IF


            acc = acc_metric.get_accuracy(outputs, annotations)

            if step % 100 == 0:
                print('Step: {}/{} | {} acc: {:.4f}'.format(step, len(eval_loader), args['load_type'], acc))

    print('Accuracy of the network on the {} set: {:.3f} %\n'.format(args['load_type'], 100.*acc))

    if not args['debug']:
        writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc)
        # Close Tensorboard Element
        writer.close()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     patience=3,
                                                     threshold=0.005,
                                                     verbose=True)

    n_epochs = 200
    if args.resume:
        start_epoch, model, optimizer, scheduler = load_checkpoint(
            model_path=args.model_path,
            ckpt_name=args.ckpt,
            device=device,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler)
        start_epoch -= 1
        print('Resumed checkpoint {} from {}. Starting at epoch {}.'.format(
            args.ckpt, args.model_path, start_epoch + 1))
        print('Current learning rate: {}'.format(get_current_lr(optimizer)))
        print('*' * 30)
    else:
        start_epoch = 0
        # model = init_weights(model)

    for epoch in range(start_epoch, n_epochs):
        print('Epoch: %d/%d' % (epoch + 1, n_epochs))
        train_loss = train_model(model,
                                 warmup=job_config.get_warmup_proportion(),
                                 t_total=job_config.get_total_training_steps())

    global_step = 0
    start_epoch = 0

    # if args.load_training_checkpoint is not None:
    if load_training_checkpoint != 'False':
        logger.info(f"Looking for previous training checkpoint.")
        latest_checkpoint_path = latest_checkpoint_file(
            args.load_training_checkpoint, no_cuda)

        logger.info(
            f"Restoring previous training checkpoint from {latest_checkpoint_path}"
        )
        start_epoch, global_step = load_checkpoint(model, optimizer,
                                                   latest_checkpoint_path)
        logger.info(
            f"The model is loaded from last checkpoint at epoch {start_epoch} when the global steps were at {global_step}"
        )

    logger.info("Training the model")

    best_loss = None
    for index in range(start_epoch, args.epochs):
        logger.info(f"Training epoch: {index + 1}")

        eval_loss = train(index)

        if check_write_log():
            if best_loss is None or eval_loss is None or eval_loss < best_loss * 0.99:
                best_loss = eval_loss
def run(gpu_id, options, distributed=False):
    if distributed:
        dist.init_process_group(
            backend="nccl",
            rank=gpu_id,
            world_size=options.num_gpus,
            init_method="env://",
        )
        torch.cuda.set_device(gpu_id)
    use_cuda = torch.cuda.is_available() and not options.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    for cp in options.checkpoint:
        checkpoint = load_checkpoint(os.path.join(cp, "stats.pt"))
        name = "evaluate/{}".format(cp)
        logger = lavd.Logger(name, disabled=gpu_id != 0)

        spinner = logger.spinner("Initialising")
        spinner.start()

        # All but the primary GPU wait here, so that only the primary process loads the
        # pre-trained model and the rest uses the cached version.
        if distributed and gpu_id != 0:
            torch.distributed.barrier()

        model_kind = checkpoint["model"].get("kind")
        use_special = True
        masked_lm = True
        add_space = False
        if model_kind == "bert" or model_kind == "bert-scratch":
            config = BertConfig.from_pretrained(cp)
            model = BertForMaskedLM.from_pretrained(cp, config=config)
            tokeniser = BertTokenizer.from_pretrained(cp)
        elif model_kind == "gpt2" or model_kind == "gpt2-scratch":
            config = GPT2Config.from_pretrained(cp)
            model = GPT2LMHeadModel.from_pretrained(cp, config=config)
            tokeniser = GPT2Tokenizer.from_pretrained(cp)
            masked_lm = False
            use_special = False
            add_space = True
        else:
            raise Exception("No model available for {}".format(model_kind))
        model = model.to(device)

        # Primary process has loaded the model and the other can now load the cached
        # version.
        if distributed and gpu_id == 0:
            torch.distributed.barrier()

        data_loaders = []
        for data_file in options.datasets:
            data = data_file.split("=", 1)
            if len(data) > 1:
                # Remove whitespace around the name
                name = data[0].strip()
                # Expand the ~ to the full path as it won't be done automatically since
                # it's not at the beginning of the word.
                file_path = os.path.expanduser(data[1])
            else:
                name = None
                file_path = data[0]
            dataset = TextDataset(file_path,
                                  tokeniser,
                                  name=name,
                                  use_special=use_special)
            sampler = (DistributedSampler(dataset,
                                          num_replicas=options.num_gpus,
                                          rank=gpu_id,
                                          shuffle=False)
                       if distributed else None)
            data_loader = DataLoader(
                dataset,
                batch_size=options.batch_size,
                shuffle=False,
                num_workers=options.num_workers,
                sampler=sampler,
                pin_memory=True,
            )
            data_loaders.append(data_loader)

        if distributed:
            model = DistributedDataParallel(model,
                                            device_ids=[gpu_id],
                                            find_unused_parameters=True)

        # Wait for all processes to load everything before starting evaluation.
        # Not strictly necessary, since they will wait once the actual model is run, but
        # this makes it nicer to show the spinner until all of them are ready.
        if distributed:
            torch.distributed.barrier()
        spinner.stop()

        start_time = time.time()
        logger.set_prefix("Evaluation - {}".format(cp))
        results = []
        for data_loader in data_loaders:
            data_name = data_loader.dataset.name
            logger.start(data_name)
            result = evaluate(
                data_loader,
                model,
                device=device,
                name=data_name,
                logger=logger,
                masked_lm=masked_lm,
            )
            result["name"] = data_name
            results.append(result)
            logger.end(data_name)

        time_difference = time.time() - start_time
        evaluation_results = [
            OrderedDict(
                name=result["name"],
                stats=OrderedDict(loss=result["loss"],
                                  perplexity=result["perplexity"]),
            ) for result in results
        ]
        log_epoch_stats(logger,
                        evaluation_results,
                        metrics,
                        time_elapsed=time_difference)
Example #23
def run(gpu_id, options, distributed=False):
    if distributed:
        dist.init_process_group(
            backend="nccl",
            rank=gpu_id,
            world_size=options.num_gpus,
            init_method="env://",
        )
        torch.cuda.set_device(gpu_id)
    torch.manual_seed(options.seed)
    use_cuda = torch.cuda.is_available() and not options.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    logger = lavd.Logger(options.name, disabled=gpu_id != 0)
    # Parser needs to be rebuilt, since it can't be serialised and it is needed to even
    # detect the number of GPUs, but here it's only used to log it.
    parser = build_parser() if gpu_id == 0 else None

    spinner = logger.spinner("Initialising")
    spinner.start()

    checkpoint = (default_checkpoint
                  if options.checkpoint is None else load_checkpoint(
                      os.path.join(options.checkpoint, "stats.pt")))
    # Either use the checkpoint directory as the configuration or use one of the
    # available pre-trained models.
    pre_trained = options.checkpoint or options.pre_trained

    # All but the primary GPU wait here, so that only the primary process loads the
    # pre-trained model and the rest uses the cached version.
    if distributed and gpu_id != 0:
        torch.distributed.barrier()

    model_kind = checkpoint["model"].get("kind") or options.model_kind
    use_special = True
    masked_lm = True
    if model_kind == "bert":
        if pre_trained is None:
            pre_trained = "bert-base-german-cased"
        config = BertConfig.from_pretrained(pre_trained)
        model = BertForMaskedLM.from_pretrained(pre_trained, config=config)
        tokeniser = BertTokenizer.from_pretrained(pre_trained)
    elif model_kind == "bert-scratch":
        # The pre_trained here is only for the configuration (num layers etc.)
        # But the weights are not loaded
        if pre_trained is None:
            pre_trained = "bert-base-german-cased"
        # Use either the provided vocabulary or the pre_trained one.
        vocab = options.vocab or pre_trained
        tokeniser = BertTokenizer.from_pretrained(vocab)
        config = BertConfig.from_pretrained(pre_trained)
        config.vocab_size = tokeniser.vocab_size
        model = BertForMaskedLM(config)
    elif model_kind == "gpt2":
        if pre_trained is None:
            pre_trained = "gpt2"
        config = GPT2Config.from_pretrained(pre_trained)
        model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config)
        tokeniser = GPT2Tokenizer.from_pretrained(pre_trained)
        masked_lm = False
        use_special = False
    elif model_kind == "gpt2-german":
        assert pre_trained is not None, "--pre-trained must be given for gpt2-german"
        config = GPT2Config.from_pretrained(pre_trained)
        model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config)
        # Using the XLNetTokenizer because the pre-trained German GPT-2 model uses
        # SentencePiece and that's the easiest way to use it.
        # That also means that the automatic tokenisation cannot be done, because XLNet
        # uses different placing of the special tokens.
        tokeniser = XLNetTokenizer.from_pretrained(
            pre_trained,
            keep_accents=True,
            unk_token="<unk>",
            # start and end of sequence use the same token
            bos_token="<endoftext>",
            eos_token="<endoftext>",
        )
        masked_lm = False
        use_special = False
    elif model_kind == "gpt2-scratch":
        # pre_trained here is only used for the configuration (num layers etc.);
        # the weights themselves are not loaded.
        if pre_trained is None:
            pre_trained = "gpt2"
        # Use either the provided vocabulary or the pre_trained one.
        vocab = options.vocab or pre_trained
        tokeniser = GPT2Tokenizer.from_pretrained(vocab)
        config = GPT2Config.from_pretrained(pre_trained)
        config.vocab_size = tokeniser.vocab_size
        model = GPT2LMHeadModel(config)
        masked_lm = False
        use_special = False
    else:
        raise Exception("No model available for {}".format(model_kind))
    model = model.to(device)

    # Primary process has loaded the model and the other can now load the cached
    # version.
    if distributed and gpu_id == 0:
        torch.distributed.barrier()

    train_dataset = TextDataset(
        options.train_text,
        tokeniser,
        use_special=use_special,
        manual_special=model_kind == "gpt2-german",
    )
    train_sampler = (DistributedSampler(train_dataset,
                                        num_replicas=options.num_gpus,
                                        rank=gpu_id) if distributed else None)
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=options.batch_size,
        # Only shuffle when not using a sampler
        shuffle=train_sampler is None,
        num_workers=options.actual_num_workers,
        sampler=train_sampler,
        pin_memory=True,
    )

    validation_data_loaders = []
    for val_file in options.validation_text:
        vals = val_file.split("=", 1)
        if len(vals) > 1:
            # Remove whitespace around the name
            name = vals[0].strip()
            # Expand the ~ to the full path as it won't be done automatically since it's
            # not at the beginning of the word.
            file_path = os.path.expanduser(vals[1])
        else:
            name = None
            file_path = vals[0]
        validation_dataset = TextDataset(
            file_path,
            tokeniser,
            name=name,
            use_special=use_special,
            manual_special=model_kind == "gpt2-german",
        )
        validation_sampler = (DistributedSampler(
            validation_dataset, num_replicas=options.num_gpus, rank=gpu_id)
                              if distributed else None)
        validation_data_loader = DataLoader(
            validation_dataset,
            batch_size=options.batch_size,
            # Only shuffle when not using a sampler
            shuffle=validation_sampler is None,
            num_workers=options.actual_num_workers,
            sampler=validation_sampler,
            pin_memory=True,
        )
        validation_data_loaders.append(validation_data_loader)

    initial_lr = options.lr
    # Only restore the learning rate if resuming from a checkpoint and not manually
    # resetting the learning rate.
    if len(checkpoint["train"]["lr"]) > 0 and not options.reset_lr:
        initial_lr = checkpoint["train"]["lr"][-1]

    no_decay = ["bias", "LayerNorm.weight"]
    optimiser_grouped_parameters = [
        {
            "params": [
                param for name, param in model.named_parameters()
                if not any(nd in name for nd in no_decay)
            ],
            "weight_decay":
            options.weight_decay,
        },
        {
            "params": [
                param for name, param in model.named_parameters()
                if any(nd in name for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimiser = AdamW(optimiser_grouped_parameters,
                      lr=initial_lr,
                      eps=options.adam_eps)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimiser,
        num_warmup_steps=options.lr_warmup,
        num_training_steps=options.num_epochs,
    )

    amp_scaler = amp.GradScaler() if use_cuda and options.fp16 else None

    if distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[gpu_id],
                                        find_unused_parameters=True)

    validation_details = [
        OrderedDict(
            name=data_loader.dataset.name,
            path=data_loader.dataset.path,
            size=len(data_loader.dataset),
        ) for data_loader in validation_data_loaders
    ]
    experiment = OrderedDict(
        model_kind=model_kind,
        train=OrderedDict(path=train_dataset.path, size=len(train_dataset)),
        validation=validation_details,
        options=options,
    )
    log_experiment(logger, experiment)

    logger.log_command(parser, options)

    # Wait for all processes to load everything before starting training.
    # Not strictly necessary, since they will wait once the actual model is run, but
    # this makes it nicer to show the spinner until all of them are ready.
    if distributed:
        torch.distributed.barrier()
    spinner.stop()

    if options.checkpoint is not None:
        resume_text = "Resuming from - Epoch {epoch}".format(
            epoch=checkpoint["epoch"])
        logger.set_prefix(resume_text)
        epoch_results = [
            OrderedDict(
                name="Train",
                stats=OrderedDict(
                    loss=checkpoint["train"]["stats"]["loss"][-1],
                    perplexity=checkpoint["train"]["stats"]["perplexity"][-1],
                ),
            )
        ] + [
            OrderedDict(
                name=val_name,
                stats=OrderedDict(
                    loss=val_result["stats"]["loss"][-1],
                    perplexity=val_result["stats"]["perplexity"][-1],
                ),
            ) for val_name, val_result in checkpoint["validation"].items()
        ]
        log_epoch_stats(logger, epoch_results, metrics)

    train(
        logger,
        model,
        optimiser,
        train_data_loader,
        validation_data_loaders,
        lr_scheduler=lr_scheduler,
        device=device,
        num_epochs=options.num_epochs,
        checkpoint=checkpoint,
        model_kind=model_kind,
        amp_scaler=amp_scaler,
        masked_lm=masked_lm,
    )
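
# For context, a minimal launch sketch for the run() function above, assuming an
# `options` namespace carrying `num_gpus` and that MASTER_ADDR/MASTER_PORT are set
# for the "env://" init method; the multi-process path uses the standard
# torch.multiprocessing.spawn API. The `launch` helper itself is hypothetical.
import torch.multiprocessing as mp

def launch(options):
    if options.num_gpus > 1:
        # One worker per GPU; each receives its gpu_id as the first positional
        # argument, matching run(gpu_id, options, distributed).
        mp.spawn(run, args=(options, True), nprocs=options.num_gpus)
    else:
        run(0, options, distributed=False)
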
def main():
    config = get_train_config()

    # device
    device, device_ids = setup_device(config.n_gpu)

    # tensorboard
    writer = TensorboardWriter(config.summary_dir, config.tensorboard)

    # metric tracker
    metric_names = ['loss', 'acc1', 'acc5']
    train_metrics = MetricTracker(*[metric for metric in metric_names],
                                  writer=writer)
    valid_metrics = MetricTracker(*[metric for metric in metric_names],
                                  writer=writer)

    # create model
    print("create model")
    model = VisionTransformer(image_size=(config.image_size,
                                          config.image_size),
                              patch_size=(config.patch_size,
                                          config.patch_size),
                              emb_dim=config.emb_dim,
                              mlp_dim=config.mlp_dim,
                              num_heads=config.num_heads,
                              num_layers=config.num_layers,
                              num_classes=config.num_classes,
                              attn_dropout_rate=config.attn_dropout_rate,
                              dropout_rate=config.dropout_rate)

    # load checkpoint
    if config.checkpoint_path:
        state_dict = load_checkpoint(config.checkpoint_path)
        if config.num_classes != state_dict['classifier.weight'].size(0):
            del state_dict['classifier.weight']
            del state_dict['classifier.bias']
            print("re-initialize fc layer")
            model.load_state_dict(state_dict, strict=False)
        else:
            model.load_state_dict(state_dict)
        print("Load pretrained weights from {}".format(config.checkpoint_path))

    # send model to device
    model = model.to(device)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)

    # create dataloader
    print("create dataloaders")
    train_dataloader = eval("{}DataLoader".format(config.dataset))(
        data_dir=os.path.join(config.data_dir, config.dataset),
        image_size=config.image_size,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        split='train')
    valid_dataloader = eval("{}DataLoader".format(config.dataset))(
        data_dir=os.path.join(config.data_dir, config.dataset),
        image_size=config.image_size,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        split='val')

    # training criterion
    print("create criterion and optimizer")
    criterion = nn.CrossEntropyLoss()

    # create optimizers and learning rate scheduler
    optimizer = torch.optim.SGD(params=model.parameters(),
                                lr=config.lr,
                                weight_decay=config.wd,
                                momentum=0.9)
    lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        max_lr=config.lr,
        pct_start=config.warmup_steps / config.train_steps,
        total_steps=config.train_steps)

    # start training
    print("start training")
    best_acc = 0.0
    epochs = config.train_steps // len(train_dataloader)
    for epoch in range(1, epochs + 1):
        log = {'epoch': epoch}

        # train the model
        model.train()
        result = train_epoch(epoch, model, train_dataloader, criterion,
                             optimizer, lr_scheduler, train_metrics, device)
        log.update(result)

        # validate the model
        model.eval()
        result = valid_epoch(epoch, model, valid_dataloader, criterion,
                             valid_metrics, device)
        log.update(**{'val_' + k: v for k, v in result.items()})

        # best acc
        best = False
        if log['val_acc1'] > best_acc:
            best_acc = log['val_acc1']
            best = True

        # save model
        save_model(config.checkpoint_dir, epoch, model, optimizer,
                   lr_scheduler, device_ids, best)

        # print logged informations to the screen
        for key, value in log.items():
            print('    {:15s}: {}'.format(str(key), value))
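
# The dataloader classes above are resolved with eval(); a sketch of a safer
# getattr-based lookup, assuming (hypothetically) that the *DataLoader classes are
# importable from a module called data_loaders.
import data_loaders  # hypothetical module exporting e.g. CIFAR10DataLoader

def get_dataloader_cls(dataset_name):
    # Resolve "<Dataset>DataLoader" without eval(); raises AttributeError if missing.
    return getattr(data_loaders, "{}DataLoader".format(dataset_name))
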
from checkpoint import load_checkpoint

checkpoint_name = "./checkpoints/xavier-dropout-bottleneckonly-teacher0.5-0380.pth"

checkpoint = load_checkpoint(checkpoint_name, cuda=True)

train_loss = checkpoint["model"].get("train_loss")
train_accuracy = checkpoint["model"].get("train_accuracy")
validation_loss = checkpoint["model"].get("validation_loss")
validation_accuracy = checkpoint["model"].get("validation_accuracy")
print(train_loss, train_accuracy, validation_loss, validation_accuracy)
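
# A small sketch for inspecting every stored stat instead of picking individual
# keys, assuming checkpoint["model"] is a plain dict of metric names to values,
# as the .get() calls above imply.
for key, value in checkpoint["model"].items():
    print(key, value)
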
Example #26
pred_test = False # Predict the test data

if LOAD_CHECKPOINT:
    # Modify this path.
    def get_path(i,j):
        out_dir = '/research/lyu1/cygao/workspace/data/checkpoints/'
        checkpoint_dirs = os.listdir(out_dir)
        # for idx, checkpoint_dir in enumerate(checkpoint_dirs):
        #     checkpoint_fns = os.listdir(os.path.join(out_dir, checkpoint_dir))
        checkpoint_fns = os.listdir(os.path.join(out_dir, checkpoint_dirs[i]))
        # for jdx, checkpoint_fn in enumerate(checkpoint_fns[i]):
        checkpoint_path = os.path.join(out_dir, checkpoint_dirs[i], checkpoint_fns[j])
        print("Current checkpoint path is ", checkpoint_path)
        return checkpoint_path
    checkpoint_path = get_path(7,17)
    checkpoint = load_checkpoint(checkpoint_path)
    opts = checkpoint['opts']
    print('=' * 100)
    print('Options log:')
    print('- Load from checkpoint: {}'.format(LOAD_CHECKPOINT))
    print('- Global step: {}'.format(checkpoint['global_step']))

else:
    opts = AttrDict()
    # Configure models
    opts.word_vec_size = 100
    opts.feature_vec_size = 90
    opts.rnn_type = 'GRU'
    opts.hidden_size = 200
    opts.batch_size = 32
    opts.max_vocab_size = 10000
Example #27
def train(**args):
    """
    Train the selected model
    Args:
        rerun        (Int):        Integer indicating number of repetitions for the selected experiment
        seed         (Int):        Integer indicating set seed for random state
        save_dir     (String):     Top level directory to generate results folder
        model        (String):     Name of selected model 
        dataset      (String):     Name of selected dataset  
        exp          (String):     Name of experiment 
        debug        (Int):        Debug state to avoid saving variables 
        load_type    (String):     Keyword indicating whether to validate on the training set ('train') or validation set ('train_val')
        pretrained   (Int/String): Int/String indicating loading of random, pretrained or saved weights
        opt          (String):     Name of the optimizer to use (e.g. sgd or adam)
        lr           (Float):      Learning rate 
        momentum     (Float):      Momentum in optimizer 
        weight_decay (Float):      Weight_decay value 
        final_shape  ([Int, Int]): Shape of data when passed into network
        
    Return:
        None
    """

    print(
        "\n############################################################################\n"
    )
    print("Experimental Setup: ", args)
    print(
        "\n############################################################################\n"
    )

    for total_iteration in range(args['rerun']):

        # Generate Results Directory
        d = datetime.datetime.today()
        date = d.strftime('%Y%m%d-%H%M%S')
        result_dir = os.path.join(
            args['save_dir'], args['model'], '_'.join(
                (args['dataset'], args['exp'], date)))
        log_dir = os.path.join(result_dir, 'logs')
        save_dir = os.path.join(result_dir, 'checkpoints')

        if not args['debug']:
            os.makedirs(result_dir, exist_ok=True)
            os.makedirs(log_dir, exist_ok=True)
            os.makedirs(save_dir, exist_ok=True)

            # Save copy of config file
            with open(os.path.join(result_dir, 'config.yaml'), 'w') as outfile:
                yaml.dump(args, outfile, default_flow_style=False)

            # Tensorboard Element
            writer = SummaryWriter(log_dir)

        # Check if GPU is available (CUDA)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Load Network
        model = create_model_object(**args).to(device)

        # Load Data
        loader = data_loader(model_obj=model, **args)

        if args['load_type'] == 'train':
            train_loader = loader['train']
            valid_loader = loader[
                'train']  # Run accuracy on train data if only `train` selected

        elif args['load_type'] == 'train_val':
            train_loader = loader['train']
            valid_loader = loader['valid']

        else:
            sys.exit('Invalid environment selection for training, exiting')

        # END IF

        # Training Setup
        params = [p for p in model.parameters() if p.requires_grad]

        if args['opt'] == 'sgd':
            optimizer = optim.SGD(params,
                                  lr=args['lr'],
                                  momentum=args['momentum'],
                                  weight_decay=args['weight_decay'])

        elif args['opt'] == 'adam':
            optimizer = optim.Adam(params,
                                   lr=args['lr'],
                                   weight_decay=args['weight_decay'])

        else:
            sys.exit('Unsupported optimizer selected. Exiting')

        # END IF

        scheduler = MultiStepLR(optimizer,
                                milestones=args['milestones'],
                                gamma=args['gamma'])

        if isinstance(args['pretrained'], str):
            ckpt = load_checkpoint(args['pretrained'])
            model.load_state_dict(ckpt)
            start_epoch = load_checkpoint(args['pretrained'],
                                          key_name='epoch') + 1
            optimizer.load_state_dict(
                load_checkpoint(args['pretrained'], key_name='optimizer'))

            for quick_looper in range(start_epoch):
                scheduler.step()

            # END FOR

        else:
            start_epoch = 0

        # END IF

        model_loss = Losses(device=device, **args)
        acc_metric = Metrics(**args)
        best_val_acc = 0.0

        ############################################################################################################################################################################

        # Start: Training Loop
        for epoch in range(start_epoch, args['epoch']):
            running_loss = 0.0
            print('Epoch: ', epoch)

            # Setup Model To Train
            model.train()

            # Start: Epoch
            for step, data in enumerate(train_loader):
                if step % args['pseudo_batch_loop'] == 0:
                    loss = 0.0
                    optimizer.zero_grad()

                # END IF

                x_input = data['data'].to(device)
                annotations = data['annots']

                assert args['final_shape'] == list(x_input.size(
                )[-2:]), "Input to model does not match final_shape argument"
                outputs = model(x_input)
                loss = model_loss.loss(outputs, annotations)
                loss = loss * args['batch_size']
                loss.backward()

                running_loss += loss.item()

                if np.isnan(running_loss):
                    import pdb
                    pdb.set_trace()

                # END IF

                if not args['debug']:
                    # Add Learning Rate Element
                    for param_group in optimizer.param_groups:
                        writer.add_scalar(
                            args['dataset'] + '/' + args['model'] +
                            '/learning_rate', param_group['lr'],
                            epoch * len(train_loader) + step)

                    # END FOR

                    # Add Loss Element
                    writer.add_scalar(
                        args['dataset'] + '/' + args['model'] +
                        '/minibatch_loss',
                        loss.item() / args['batch_size'],
                        epoch * len(train_loader) + step)

                # END IF

                if ((epoch * len(train_loader) + step + 1) % 100 == 0):
                    print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.
                          format(
                              epoch, args['epoch'], step + 1,
                              len(train_loader), running_loss /
                              float(step + 1) / args['batch_size']))

                # END IF

                if (epoch * len(train_loader) +
                    (step + 1)) % args['pseudo_batch_loop'] == 0 and step > 0:
                    # Apply large mini-batch normalization
                    for param in model.parameters():
                        param.grad *= 1. / float(
                            args['pseudo_batch_loop'] * args['batch_size'])
                    optimizer.step()

                # END IF

            # END FOR: Epoch

            if not args['debug']:
                # Save Current Model
                save_path = os.path.join(
                    save_dir, args['dataset'] + '_epoch' + str(epoch) + '.pkl')
                save_checkpoint(epoch, step, model, optimizer, save_path)

            # END IF: Debug

            scheduler.step(epoch=epoch)
            print('Scheduler lr: %f' % scheduler.get_lr()[0])

            ## START FOR: Validation Accuracy
            running_acc = []
            running_acc = valid(valid_loader, running_acc, model, device,
                                acc_metric)
            if not args['debug']:
                writer.add_scalar(
                    args['dataset'] + '/' + args['model'] +
                    '/validation_accuracy', 100. * running_acc[-1],
                    epoch * len(valid_loader) + step)
            print('Accuracy of the network on the validation set: %f %%\n' %
                  (100. * running_acc[-1]))

            # Save Best Validation Accuracy Model Separately
            if best_val_acc < running_acc[-1]:
                best_val_acc = running_acc[-1]

                if not args['debug']:
                    # Save Current Model
                    save_path = os.path.join(
                        save_dir, args['dataset'] + '_best_model.pkl')
                    save_checkpoint(epoch, step, model, optimizer, save_path)

                # END IF

            # END IF

        # END FOR: Training Loop

    ############################################################################################################################################################################

        if not args['debug']:
            # Close Tensorboard Element
            writer.close()
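
# The inner loop above accumulates gradients over args['pseudo_batch_loop']
# minibatches before stepping the optimizer. A stripped-down sketch of that
# pattern in isolation (model, loader, criterion, and optimizer are assumed to be
# provided by the caller; accum_steps plays the role of pseudo_batch_loop).
def train_with_grad_accumulation(model, loader, criterion, optimizer, accum_steps=4):
    optimizer.zero_grad()
    for step, (inputs, targets) in enumerate(loader):
        loss = criterion(model(inputs), targets)
        # Scale so the accumulated gradient matches one large batch.
        (loss / accum_steps).backward()
        if (step + 1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
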
Example #28
def train(_run):
    # Set up directories ===========================================================
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(BUFFER_DIR, exist_ok=True)
    exp_name = args.expID
    exp_path = os.path.join(DATA_DIR, exp_name)
    rb_path = os.path.join(BUFFER_DIR, exp_name)
    os.makedirs(exp_path, exist_ok=True)
    os.makedirs(rb_path, exist_ok=True)
    # save arguments
    with open(os.path.join(exp_path, "args.txt"), "w+") as f:
        json.dump(args.__dict__, f, indent=2)

    # Retrieve MuJoCo XML files for training ========================================
    envs_train_names = []
    args.graphs = dict()
    # existing envs
    if not args.custom_xml:
        for morphology in args.morphologies:
            envs_train_names += [
                name[:-4] for name in os.listdir(XML_DIR)
                if ".xml" in name and morphology in name
            ]
        for name in envs_train_names:
            args.graphs[name] = utils.getGraphStructure(
                os.path.join(XML_DIR, "{}.xml".format(name)),
                args.observation_graph_type,
            )
    # custom envs
    else:
        if os.path.isfile(args.custom_xml):
            assert ".xml" in os.path.basename(
                args.custom_xml), "No XML file found."
            name = os.path.basename(args.custom_xml)
            envs_train_names.append(name[:-4])  # truncate the .xml suffix
            args.graphs[name[:-4]] = utils.getGraphStructure(
                args.custom_xml, args.observation_graph_type)
        elif os.path.isdir(args.custom_xml):
            for name in os.listdir(args.custom_xml):
                if ".xml" in name:
                    envs_train_names.append(name[:-4])
                    args.graphs[name[:-4]] = utils.getGraphStructure(
                        os.path.join(args.custom_xml, name),
                        args.observation_graph_type)

    envs_train_names.sort()
    num_envs_train = len(envs_train_names)
    print("#" * 50 + "\ntraining envs: {}\n".format(envs_train_names) +
          "#" * 50)

    # Set up training env and policy ================================================
    args.limb_obs_size, args.max_action = utils.registerEnvs(
        envs_train_names, args.max_episode_steps, args.custom_xml)
    max_num_limbs = max(
        [len(args.graphs[env_name]) for env_name in envs_train_names])
    # create vectorized training env
    obs_max_len = (
        max([len(args.graphs[env_name])
             for env_name in envs_train_names]) * args.limb_obs_size)
    envs_train = [
        utils.makeEnvWrapper(name, obs_max_len, args.seed)
        for name in envs_train_names
    ]

    envs_train = SubprocVecEnv(envs_train)  # vectorized env
    # set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # determine the maximum number of children in all the training envs
    if args.max_children is None:
        args.max_children = utils.findMaxChildren(envs_train_names,
                                                  args.graphs)

    args.max_num_limbs = max_num_limbs
    # setup agent policy
    policy = TD3.TD3(args)

    # Create new training instance or load previous checkpoint ========================
    if cp.has_checkpoint(exp_path, rb_path):
        print("*** loading checkpoint from {} ***".format(exp_path))
        (
            total_timesteps,
            episode_num,
            replay_buffer,
            num_samples,
            loaded_path,
        ) = cp.load_checkpoint(exp_path, rb_path, policy, args)
        print("*** checkpoint loaded from {} ***".format(loaded_path))
    else:
        print("*** training from scratch ***")
        # init training vars
        total_timesteps = 0
        episode_num = 0
        num_samples = 0
        # different replay buffer for each env; avoid using too much memory if there are too many envs
        replay_buffer = dict()
        if num_envs_train > args.rb_max // 1e6:
            for name in envs_train_names:
                replay_buffer[name] = utils.ReplayBuffer(
                    max_size=args.rb_max // num_envs_train)
        else:
            for name in envs_train_names:
                replay_buffer[name] = utils.ReplayBuffer()

    # Initialize training variables ================================================
    writer = SummaryWriter("%s/%s/" % (DATA_DIR, exp_name))
    s = time.time()
    timesteps_since_saving = 0
    timesteps_since_saving_model_only = 0
    this_training_timesteps = 0
    collect_done = True
    episode_timesteps_list = [0 for i in range(num_envs_train)]
    done_list = [True for i in range(num_envs_train)]

    # Start training ===========================================================
    model_savings_so_far = 0
    while total_timesteps < args.max_timesteps:

        # train and log after one episode for each env
        if collect_done:
            # log updates and train policy
            if this_training_timesteps != 0:
                policy.train(
                    replay_buffer,
                    episode_timesteps_list,
                    args.batch_size,
                    args.discount,
                    args.tau,
                    args.policy_noise,
                    args.noise_clip,
                    args.policy_freq,
                    graphs=args.graphs,
                    envs_train_names=envs_train_names[:num_envs_train],
                )
                # add to tensorboard display
                for i in range(num_envs_train):
                    writer.add_scalar(
                        "{}_episode_reward".format(envs_train_names[i]),
                        episode_reward_list[i],
                        total_timesteps,
                    )
                    writer.add_scalar(
                        "{}_episode_len".format(envs_train_names[i]),
                        episode_timesteps_list[i],
                        total_timesteps,
                    )
                    if not args.debug:
                        ex.log_scalar(
                            f"{envs_train_names[i]}_episode_reward",
                            float(episode_reward_list[i]),
                            total_timesteps,
                        )
                        ex.log_scalar(
                            f"{envs_train_names[i]}_episode_len",
                            float(episode_timesteps_list[i]),
                            total_timesteps,
                        )
                if not args.debug:
                    ex.log_scalar(
                        "total_timesteps",
                        float(total_timesteps),
                        total_timesteps,
                    )
                # print to console
                print(
                    "-" * 50 +
                    "\nExpID: {}, FPS: {:.2f}, TotalT: {}, EpisodeNum: {}, SampleNum: {}, ReplayBSize: {}"
                    .format(
                        args.expID,
                        this_training_timesteps / (time.time() - s),
                        total_timesteps,
                        episode_num,
                        num_samples,
                        sum([
                            len(replay_buffer[name].storage)
                            for name in envs_train_names
                        ]),
                    ))
                for i in range(len(envs_train_names)):
                    print("{} === EpisodeT: {}, Reward: {:.2f}".format(
                        envs_train_names[i],
                        episode_timesteps_list[i],
                        episode_reward_list[i],
                    ))

            # save model and replay buffers
            if timesteps_since_saving >= args.save_freq:
                timesteps_since_saving = 0
                model_saved_path = cp.save_model(
                    exp_path,
                    policy,
                    total_timesteps,
                    episode_num,
                    num_samples,
                    replay_buffer,
                    envs_train_names,
                    args,
                    model_name=f"model_{model_savings_so_far}.pyth",
                )
                model_savings_so_far += 1
                print("*** model saved to {} ***".format(model_saved_path))
                if args.save_buffer:
                    rb_saved_path = cp.save_replay_buffer(
                        rb_path, replay_buffer)
                    print("*** replay buffers saved to {} ***".format(
                        rb_saved_path))

            # reset training variables
            obs_list = envs_train.reset()
            done_list = [False for i in range(num_envs_train)]
            episode_reward_list = [0 for i in range(num_envs_train)]
            episode_timesteps_list = [0 for i in range(num_envs_train)]
            episode_num += num_envs_train
            # create reward buffer to store reward for one sub-env when it is not done
            episode_reward_list_buffer = [0 for i in range(num_envs_train)]

        # start sampling ===========================================================
        # sample actions randomly for some time and then according to the policy
        if total_timesteps < args.start_timesteps * num_envs_train:
            action_list = [
                np.random.uniform(
                    low=envs_train.action_space.low[0],
                    high=envs_train.action_space.high[0],
                    size=max_num_limbs,
                ) for i in range(num_envs_train)
            ]
        else:
            action_list = []
            for i in range(num_envs_train):
                # dynamically change the graph structure of the modular policy
                policy.change_morphology(args.graphs[envs_train_names[i]])
                # remove 0 padding of obs before feeding into the policy (trick for vectorized env)
                obs = np.array(
                    obs_list[i][:args.limb_obs_size *
                                len(args.graphs[envs_train_names[i]])])
                policy_action = policy.select_action(obs)
                if args.expl_noise != 0:
                    policy_action = (policy_action + np.random.normal(
                        0, args.expl_noise, size=policy_action.size)).clip(
                            envs_train.action_space.low[0],
                            envs_train.action_space.high[0])
                # add 0-padding to ensure that size is the same for all envs
                policy_action = np.append(
                    policy_action,
                    np.array([
                        0 for i in range(max_num_limbs - policy_action.size)
                    ]),
                )
                action_list.append(policy_action)

        # perform action in the environment
        new_obs_list, reward_list, curr_done_list, _ = envs_train.step(
            action_list)

        # record if each env has ever been 'done'
        done_list = [
            done_list[i] or curr_done_list[i] for i in range(num_envs_train)
        ]

        for i in range(num_envs_train):
            # add the instant reward to the cumulative buffer
            # if any sub-env is done at the moment, set the episode reward list to be the value in the buffer
            episode_reward_list_buffer[i] += reward_list[i]
            if curr_done_list[i] and episode_reward_list[i] == 0:
                episode_reward_list[i] = episode_reward_list_buffer[i]
                episode_reward_list_buffer[i] = 0
            done_bool = float(curr_done_list[i])
            if episode_timesteps_list[i] + 1 == args.max_episode_steps:
                done_bool = 0
                done_list[i] = True
            # remove 0 padding before storing in the replay buffer (trick for vectorized env)
            num_limbs = len(args.graphs[envs_train_names[i]])
            obs = np.array(obs_list[i][:args.limb_obs_size * num_limbs])
            new_obs = np.array(new_obs_list[i][:args.limb_obs_size *
                                               num_limbs])
            action = np.array(action_list[i][:num_limbs])
            # insert transition in the replay buffer
            replay_buffer[envs_train_names[i]].add(
                (obs, new_obs, action, reward_list[i], done_bool))
            num_samples += 1
            # do not increment episode_timesteps if the sub-env has been 'done'
            if not done_list[i]:
                episode_timesteps_list[i] += 1
                total_timesteps += 1
                this_training_timesteps += 1
                timesteps_since_saving += 1
                timesteps_since_saving_model_only += 1

        obs_list = new_obs_list
        collect_done = all(done_list)

    # save checkpoint after training ===========================================================
    model_saved_path = cp.save_model(
        exp_path,
        policy,
        total_timesteps,
        episode_num,
        num_samples,
        replay_buffer,
        envs_train_names,
        args,
    )
    print("*** training finished and model saved to {} ***".format(
        model_saved_path))
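
# A small sketch of the 0-padding trick used above for the vectorized envs,
# assuming per-env observations and actions are flat numpy arrays of length
# limb_obs_size * num_limbs (or policy_action.size) padded out to a common length.
# The helper names are illustrative, not part of the original code.
import numpy as np

def pad_action(action, max_num_limbs):
    # Right-pad with zeros so every env receives an action vector of the same length.
    return np.append(action, np.zeros(max_num_limbs - action.size))

def unpad_obs(padded_obs, limb_obs_size, num_limbs):
    # Strip the padding before feeding the policy or the replay buffer.
    return np.array(padded_obs[:limb_obs_size * num_limbs])
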
Example #29
def train():
    """
    Naive Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get the next minibatch.
      * Execute forwardprop.
      * Set parameter gradients to zero.
      * Execute backprop.
      * AllReduce for gradients
      * Solver updates parameters by using gradients computed by backprop and all reduce.
      * Compute training error
    """
    # Parse args
    args = get_args()
    n_train_samples = 50000
    n_valid_samples = 10000
    bs_valid = args.batch_size

    # Create Communicator and Context
    extension_module = "cudnn"
    ctx = get_extension_context(extension_module, type_config=args.type_config)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    mpi_local_rank = comm.local_rank
    device_id = mpi_local_rank
    ctx.device_id = str(device_id)
    nn.set_default_context(ctx)

    # Model
    rng = np.random.RandomState(313)
    comm_syncbn = comm if args.sync_bn else None
    if args.net == "cifar10_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=10,
                                       nmaps=32,
                                       act=F.relu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar10
    if args.net == "cifar100_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=100,
                                       nmaps=384,
                                       act=F.elu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar100

    # Create training graphs
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    pred_train = prediction(image_train, test=False)
    pred_train.persistent = True
    loss_train = (loss_function(pred_train, label_train) /
                  n_devices).apply(persistent=True)
    error_train = F.mean(F.top_n_error(pred_train, label_train,
                                       axis=1)).apply(persistent=True)
    loss_error_train = F.sink(loss_train, error_train)
    input_image_train = {"image": image_train, "label": label_train}

    # Create validation graph
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    label_valid = nn.Variable((args.batch_size, 1))
    pred_valid = prediction(image_valid, test=True)
    error_valid = F.mean(F.top_n_error(pred_valid, label_valid, axis=1))
    input_image_valid = {"image": image_valid, "label": label_valid}

    # Solvers
    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())
    base_lr = args.learning_rate
    warmup_iter = int(
        1. * n_train_samples / args.batch_size / n_devices) * args.warmup_epoch
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter
    solver.set_learning_rate(base_lr)

    # Load checkpoint if a file exists.
    start_point = 0
    if args.use_latest_checkpoint:
        files = glob.glob(f'{args.model_save_path}/checkpoint_*.json')
        if len(files) != 0:
            index = max([
                int(n) for n in
                [re.sub(r'.*checkpoint_(\d+).json', '\\1', f) for f in files]
            ])
            # load weights and solver state info from specified checkpoint file.
            start_point = load_checkpoint(
                f'{args.model_save_path}/checkpoint_{index}.json', solver)

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=1)
    monitor_vtime = MonitorTimeElapsed("Validation time", monitor, interval=1)

    # Data Iterator

    # If the data does not exist, it will try to download it from the server
    # and prepare it. When executing multiple processes on the same host, it is
    # necessary to execute initial data preparation by the representative
    # process (local_rank is 0) on the host.

    # Prepare data only when rank is 0
    if mpi_rank == 0:
        rng = np.random.RandomState(device_id)
        _, tdata = data_iterator(args.batch_size, True, rng)
        vsource, vdata = data_iterator(args.batch_size, False)

    # Wait for data to be prepared without watchdog
    comm.barrier()

    # Prepare data when rank is not 0
    if mpi_rank != 0:
        rng = np.random.RandomState(device_id)
        _, tdata = data_iterator(args.batch_size, True, rng)
        vsource, vdata = data_iterator(args.batch_size, False)

    # loss_error_train.forward()

    # Training-loop
    ve = nn.Variable()
    model_save_interval = 0
    for i in range(start_point, int(args.max_iter / n_devices)):
        # Validation
        if i % int(n_train_samples / args.batch_size / n_devices) == 0:
            ve_local = 0.
            k = 0
            idx = np.random.permutation(n_valid_samples)
            val_images = vsource.images[idx]
            val_labels = vsource.labels[idx]
            for j in range(int(n_valid_samples / n_devices * mpi_rank),
                           int(n_valid_samples / n_devices * (mpi_rank + 1)),
                           bs_valid):
                image = val_images[j:j + bs_valid]
                label = val_labels[j:j + bs_valid]
                if len(image
                       ) != bs_valid:  # note that smaller batch is ignored
                    continue
                input_image_valid["image"].d = image
                input_image_valid["label"].d = label
                error_valid.forward(clear_buffer=True)
                ve_local += error_valid.d.copy()
                k += 1
            ve_local /= k
            ve.d = ve_local
            comm.all_reduce(ve.data, division=True, inplace=True)

            # Save model
            if mpi_rank == 0:
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)
                if model_save_interval <= 0:
                    nn.save_parameters(
                        os.path.join(args.model_save_path,
                                     'params_%06d.h5' % i))
                    save_checkpoint(args.model_save_path, i, solver)
                    model_save_interval += int(args.model_save_interval /
                                               n_devices)
        model_save_interval -= 1

        # Forward/Zerograd
        image, label = tdata.next()
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_error_train.forward(clear_no_need_grad=True)
        solver.zero_grad()

        # Backward/AllReduce
        backward_and_all_reduce(
            loss_error_train,
            comm,
            with_all_reduce_callback=args.with_all_reduce_callback)

        # Solvers update
        solver.update()

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        if mpi_rank == 0:  # loss and error locally, and elapsed time
            monitor_loss.add(i * n_devices, loss_train.d.copy())
            monitor_err.add(i * n_devices, error_train.d.copy())
            monitor_time.add(i * n_devices)

        # exit(0)

    if mpi_rank == 0:
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         'params_%06d.h5' % (args.max_iter / n_devices)))
    comm.barrier()
Example #30
    #***************************************************
    criterion = nn.CrossEntropyLoss().to(device)
    # Setting weight decay scheduler (?)
    #optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    optimizer = None
    if args.train:
        if args.resume_from:
            # Load checkpoint for post-training from the pre-trained model.
            """
            net, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
            net, "/home/bwtseng/Downloads/vww_mobilenetv1_distiller/model_save/image_net_mobilenetv1_saved_best.pth.tar", 
            model_device=device)
            """
            net, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
                net,
                os.path.join('/home/bwtseng/Downloads/', args.model_path),
                name,
                model_device=device)

            # Discard the optimizer restored from the checkpoint and rebuild it below.
            optimizer = None
            if optimizer is None:
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=0.9,
                                      weight_decay=args.weight_decay)
                print("Do bulid optimizer")

            if compress_scheduler is None:
                compress_scheduler = utl.file_config(net, optimizer,
                                                     args.compress, None, None)
                print("Do load compress")