def test_model_debugger_pmap(self):
    """Test ModelDebugger.full_eval with pmap on a small CNN."""
    rep_variables = set_up_cnn()
    pytree_path = os.path.join(self.test_dir, 'metrics')
    metrics_logger = utils.MetricLogger(pytree_path=pytree_path,
                                        events_dir=self.test_dir)
    debugger = model_debugger.ModelDebugger(use_pmap=True,
                                            metrics_logger=metrics_logger)
    # eval twice to test the concat
    extra_metrics = {'train_loss': 1.0}
    extra_metrics2 = {'train_loss': 1.0}
    metrics = debugger.full_eval(10,
                                 params=rep_variables['params'],
                                 grad=rep_variables['params'],
                                 extra_scalar_metrics=extra_metrics)
    metrics = debugger.full_eval(10,
                                 params=rep_variables['params'],
                                 grad=rep_variables['params'],
                                 extra_scalar_metrics=extra_metrics2)
    expected_keys = [
        'step',
        'global_param_norm_sql2',
        'param_norms_sql2',
        'grad_norms_sql2',
        'global_grad_norm_sql2',
        'train_loss',
    ]
    metrics_file = os.path.join(self.test_dir, 'metrics/training_metrics')
    loaded_metrics = checkpoint.load_checkpoint(metrics_file)['pytree']
    self.assertEqual(set(expected_keys), set(metrics.keys()))
    expected_shape = ()
    self.assertEqual(metrics['global_grad_norm_sql2'].shape, expected_shape)

    # Test that stored metrics are concatenated across evals.
    expected_shape = (2, )
    self.assertEqual(loaded_metrics['global_grad_norm_sql2'].shape,
                     expected_shape)

    # check param norms were saved correctly
    self.assertEqual(
        loaded_metrics['param_norms_sql2']['Conv_0']['kernel'].shape, (2, ))
    self.assertEqual(loaded_metrics['train_loss'][0], 1.0)

    # Test restore of prior metrics.
    new_debugger = model_debugger.ModelDebugger(
        use_pmap=True, metrics_logger=metrics_logger)
    metrics = new_debugger.full_eval(10,
                                     params=rep_variables['params'],
                                     grad=rep_variables['params'],
                                     extra_scalar_metrics=extra_metrics2)
    self.assertEqual(
        new_debugger.stored_metrics['param_norms_sql2']['Conv_0']
        ['kernel'].shape, (3, ))
def test_accuracy(dataloaders, checkpoint_name='ic-model.pth', gpu=False):
    # Run validation on the test set.
    cuda = gpu
    model = loader.load_checkpoint(checkpoint_name, cuda)
    correct = 0
    total = 0
    model.eval()
    if cuda:
        model.to(device='cuda')
    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(dataloaders['test']):
            if cuda:
                inputs, labels = inputs.cuda(), labels.cuda()
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Correct: ' + str(correct))
    print('Total: ' + str(total))
    print('Accuracy of the network on the %d test images: %d %%' %
          (total, 100 * correct / total))
    # correct/total is a fraction, so compare against 0.9, not 90.
    if (correct / total) > 0.9:
        print('It was more than 90%')
    else:
        print('It was 90% or less')
def main(): in_args = get_prediction_args() chk_suffix = '.pth' checkpoint_path = in_args.files[1] + chk_suffix model, optimizer = load_checkpoint(checkpoint_path) prob, labels = predict(in_args, model) print('Probabilities of each class: ', prob) print('Classes predicted: ', labels)
def eval(self, checkpoint_path):
    checkpoint = cp.load_checkpoint(address=checkpoint_path)
    self.model.load_state_dict(checkpoint['state_dict'])
    self.model.eval()
    test = data_utils.TestSet(self.testpath, self.img_size, self.channel == 3)
    testdatas = test.loadtestdata()
    # astype returns a new array, so the result must be assigned
    # (np.float is also deprecated in favour of an explicit dtype).
    testdatas = testdatas.astype(np.float32)
    n = 0
    N = 16343  # total number of test samples (should match self.testnumber)
    batch_size = 8
    batch_site = []
    while n < N:
        n += batch_size
        if n < N:
            n1 = n - batch_size
            n2 = n
        else:
            # Final (possibly partial) batch; the original `n1 = n2` relied on
            # the previous iteration's n2 and broke when N < batch_size.
            n1 = n - batch_size
            n2 = N
        batch_site.append([n1, n2])
    pred_choice = []
    with torch.no_grad():
        for site in tqdm(batch_site):
            test_batch = testdatas[site[0]:site[1]]
            test_batch = torch.from_numpy(test_batch)
            datas = test_batch.float().view(-1, 1, 128, 128)
            outputs = self.model(datas)
            outputs = outputs.cpu().data.numpy()
            for out in outputs:
                K = 5
                index = np.argpartition(out, -K)[-K:]
                pred_choice.append(index)
    pre = np.array(pred_choice)
    predicts = []
    for k in range(self.testnumber):
        index = pre[k]
        predict5 = self.words[index]
        predict5 = "".join(predict5)
        predicts.append(predict5)
    dataframe = pd.DataFrame({'filename': self.filename, 'label': predicts})
    dataframe.to_csv("test.csv", index=False, encoding='utf-8')
    return self.filename, predicts
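# Note on the top-K selection used above (and in the snippet below):
# np.argpartition(out, -K)[-K:] returns the indices of the K largest scores in
# arbitrary order. When a ranked top-K is needed, the partitioned set has to be
# sorted explicitly. A small self-contained helper (hypothetical, not part of
# the original code):
import numpy as np

def topk_indices_sorted(out, k):
    """Indices of the k largest entries of a 1-D array, highest score first."""
    idx = np.argpartition(out, -k)[-k:]      # top-k set, arbitrary order
    return idx[np.argsort(out[idx])[::-1]]   # order that set by score, descending

scores = np.array([0.1, 0.7, 0.3, 0.9, 0.2])
print(topk_indices_sorted(scores, 3))  # -> [3 1 2]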
def main(): checkpoint = cp.load_checkpoint(address='parameters.pth') net.load_state_dict(checkpoint['state_dict']) outputs = net(x) outputs = outputs.cpu() outputs = outputs.data.numpy() pred_choice = [] for out in outputs: K = 1 index = np.argpartition(out, -K)[-K:] pred_choice.append(index) pre = np.array(pred_choice) df['score'] = pre df.to_csv('predict.csv', encoding='gbk')
def main(): args = parse_args() model = load_checkpoint(args.checkpoint) categories = category_names(args.category_names) image_path = args.filepath model.eval() probs, classes = predict(image_path, model, topk=args.top_k) names = [categories[str(index)] for index in classes] print(probs) print(names) print('File selected: ' + image_path) i = 0 while i < len(names): print("{} with a probability of {}".format(names[i], probs[i])) i = i + 1
def predict(image_path, model_name, topk=10, categories=None, device='cuda'):
    '''Predict the class (or classes) of an image using a trained deep learning model.'''
    if not torch.cuda.is_available() and device == 'cuda':
        device = 'cpu'
    # Fall back to the default label mapping when no categories are given
    # (the original loaded cat_to_name.json but never used it).
    if not categories:
        with open('cat_to_name.json', 'r') as f:
            categories = json.load(f)
    gpu = (device == 'cuda')
    model = loader.load_checkpoint(model_name, gpu=gpu)
    model.to('cpu')
    img = process_image(image_path)
    img = torch.from_numpy(img).type(torch.FloatTensor)
    inpt = img.unsqueeze(0)
    model_result = model(inpt)
    # The model returns log-probabilities (hence the exp), so the top-k values
    # are already probabilities; applying another softmax to just the top-k
    # values would incorrectly renormalise over k.
    expResult = torch.exp(model_result)
    firstTopX, SecondTopX = expResult.topk(topk)
    probs = firstTopX.detach().numpy()[0]
    classes = SecondTopX.detach().numpy().tolist()[0]
    # Convert dataset indices back to class labels.
    idx_to_class = {val: key for key, val in model.class_to_idx.items()}
    labels = [idx_to_class[y] for y in classes]
    flowers = [categories[label] for label in labels]
    return probs, flowers
def initialise(config, dataset, args): data_root = config.root log_root = args.log_dir or data_root model_args = struct(dataset=struct(classes=dataset.classes, input_channels=3), model=args.model, version=2) run = 0 debug = struct(predictions=args.debug_predictions or args.debug_all, boxes=args.debug_boxes or args.debug_all) output_path, log = logger.make_experiment(log_root, args.run_name, load=not args.no_load, dry_run=args.dry_run) model_path = os.path.join(output_path, "model.pth") model, encoder = models.create(model_args.model, model_args.dataset) set_bn_momentum(model, args.bn_momentum) best, current, resumed = checkpoint.load_checkpoint( model_path, model, model_args, args) model, epoch = current.model, current.epoch + 1 pause_time = args.pause_epochs running_average = [] if epoch >= args.average_start else [] optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.weight_decay) device = torch.cuda.current_device() tests = args.tests.split(",") return struct(**locals())
def main(test_img_path): options = parse_args() is_cuda = use_cuda and not options.no_cuda hardware = "cuda" if is_cuda else "cpu" device = torch.device(hardware) for checkpoint_path in options.checkpoint: checkpoint_name, _ = os.path.splitext( os.path.basename(checkpoint_path)) checkpoint = (load_checkpoint(checkpoint_path, cuda=is_cuda) if checkpoint_path else default_checkpoint) encoder_checkpoint = checkpoint["model"].get("encoder") decoder_checkpoint = checkpoint["model"].get("decoder") test_img = Image.open(test_img_path) test_img = test_img.convert("RGB") enc = Encoder(img_channels=3, checkpoint=encoder_checkpoint).to(device) dec = Decoder( 1, low_res_shape, high_res_shape, checkpoint=decoder_checkpoint, device=device, ).to(device) enc.eval() dec.eval() result = evaluate( enc, dec, test_img=test_img, device=device, checkpoint=checkpoint, beam_width=options.beam_width, prefix=options.prefix, ) print(result)
def train(checkpoint_path):
    # Whether to load saved model parameters.
    load = False
    if load:
        checkpoint = cp.load_checkpoint(address=checkpoint_path)
        net.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    else:
        start_epoch = 0
    for epoch in range(start_epoch, n_epoch):
        train_one_epoch()
        # Save parameters.
        checkpoint = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        cp.save_checkpoint(checkpoint, address=checkpoint_path)
        eval()
def train(self, checkpoint_path):
    # Whether to load saved model parameters.
    load = False
    if load:
        checkpoint = cp.load_checkpoint(address=checkpoint_path)
        self.model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    else:
        start_epoch = 0
    for epoch in range(start_epoch, self.n_epoch):
        self.train_one_epoch(epoch)
        # Save parameters.
        checkpoint = {
            'epoch': epoch,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }
        cp.save_checkpoint(checkpoint, address=checkpoint_path)
        if self.selftest:
            self.eval(epoch)
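# Both training loops above assume a small `cp` helper module whose
# save_checkpoint/load_checkpoint take an `address` keyword. A minimal sketch of
# such a module, built on plain torch.save/torch.load (the module name and the
# keyword come from the calls above; the implementation is an assumption):
import torch

def save_checkpoint(checkpoint, address):
    """Serialise a checkpoint dict ('epoch', 'state_dict', 'optimizer') to disk."""
    torch.save(checkpoint, address)

def load_checkpoint(address, map_location='cpu'):
    """Load a checkpoint dict previously written by save_checkpoint."""
    return torch.load(address, map_location=map_location)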
def train_(self, criterion, logger, model, **pars): """ grad_cache will be updated in-place *** only learning_rate and current_loader_ind need to be loaded at checkpoint K: the number of active clients """ default_pars = dict(learning_rate=1e-2, K=10, num_its=5000, lr_decay=0.5, decay_step_size=1000, print_every=50, checkpoint_interval=1000) init_pars(default_pars, pars) pars = default_pars K = pars['K'] learning_rate = pars['learning_rate'] num_its = pars['num_its'] lr_decay = pars['lr_decay'] decay_step_size = pars['decay_step_size'] print_every = pars['print_every'] checkpoint_interval = pars['checkpoint_interval'] I = self.pars['I'] N = self.pars['N'] checkpoint_dir = self.checkpoint_dir logger.add_meta_data(pars, 'training') logger.add_meta_data(self.pars, 'simulation') if use_cuda: model = model.to(torch.device('cuda')) else: model = model.to(torch.device('cpu')) if osp.exists(osp.join(checkpoint_dir, 'meta.pkl')): current_it = load_checkpoint(checkpoint_dir, model, logger) else: current_it = 0 while True: current_lr = learning_rate * (lr_decay **(current_it // decay_step_size)) print(f"current_it={current_it}, current_lr={current_lr}", end='\r') global_model = deepcopy(model) zero_model(global_model) # set the number of active clients idxs_users = np.random.choice(range(N), K, replace=False) for idx in idxs_users: worker = self.workers[idx] local_model = deepcopy(model) worker.train_(local_model, criterion, current_lr=current_lr, num_its=I) aggregate_model( global_model, local_model, 1, N / K * (worker.num_train / self.num_total_samples)) model = global_model logger.add_train_loss( list(model.parameters())[0][0][0][0][0], current_it, 'model-par') if current_it % print_every == 0: # fedavg fed_acc_array = self.test_model(model) fed_acc = np.array(fed_acc_array).mean() print('%d fedavg test acc: %.3f%%' % (current_it, fed_acc * 100.0)) logger.add_test_acc(fed_acc, current_it, 'fedavg') if current_it % checkpoint_interval == 0: save_checkpoint(current_it, model, logger, checkpoint_dir) if current_it == num_its: print('Finished Training') return current_it += 1
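# The FedAvg step above relies on zero_model() and aggregate_model() helpers
# that are not shown. A minimal sketch of what they typically look like; the
# names and the call pattern come from the loop above, but these
# implementations are assumptions, not the original code:
import torch

def zero_model(model):
    """Zero all parameters in-place so the model can serve as an accumulator."""
    with torch.no_grad():
        for param in model.parameters():
            param.zero_()

def aggregate_model(global_model, local_model, scale, weight):
    """Accumulate scale * weight * local parameters into the global model.

    Called once per sampled client; with the weights summing to roughly one,
    this produces the FedAvg weighted average of the client models.
    """
    with torch.no_grad():
        for g_param, l_param in zip(global_model.parameters(),
                                    local_model.parameters()):
            g_param.add_(l_param, alpha=scale * weight)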
model.dataset = args.dataset
model.input_shape = (1, 3, args.image_size, args.image_size)  # For channel first.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if args.cpu:
    device = torch.device("cpu")
print("Using {} device to run this task.".format(device))
model.to(device)
optimizer = None
compress_scheduler = None
if args.train:
    if args.resume_from:
        # Load checkpoint for locally pre-trained model.
        try:
            model, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
                model, args.model_path, model_device=device)
        except Exception:
            # Fall back to a plain state_dict file.
            model.load_state_dict(torch.load(args.model_path))
            optimizer = None
    if optimizer is None:
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr_pretrain,
                              momentum=0.9,
                              weight_decay=args.weight_decay)
        print("Built optimizer")
    # Guard against the fallback path above, where no scheduler was loaded.
    store_mask = compress_scheduler.zeros_mask_dict if compress_scheduler is not None else None
    compress_scheduler = None
    if compress_scheduler is None:
        if args.compress:
# ***************************************************
criterion = nn.CrossEntropyLoss().to(device)
# Setting weight decay scheduler (?)
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer = None
if args.train:
    if args.resume_from:
        # Load checkpoint for post-training from the pre-trained model.
        """
        net, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
            net,
            "/home/bwtseng/Downloads/vww_mobilenetv1_distiller/model_save/image_net_mobilenetv1_saved_best.pth.tar",
            model_device=device)
        """
        net, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
            net,
            # os.path.join('/home/bwtseng/Downloads/', args.model_path, name),
            "/home/bwtseng/Downloads/distiller/examples/ssl/checkpoints/checkpoint_trained_dense.pth.tar",
            model_device=device)
        # Discard the restored optimizer so a fresh one is built below.
        optimizer = None
    if optimizer is None:
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=args.weight_decay)
        print("Built optimizer")
    compress_scheduler = None
    if compress_scheduler is None:
        compress_scheduler = utl.file_config(net, optimizer, args.compress, None, None)
        print("Loaded compression schedule")
def main(): options = parse_args() torch.manual_seed(options.seed) is_cuda = use_cuda and not options.no_cuda hardware = "cuda" if is_cuda else "cpu" device = torch.device(hardware) checkpoint = (load_checkpoint(options.checkpoint, cuda=is_cuda) if options.checkpoint else default_checkpoint) print("Running {} epochs on {}".format(options.num_epochs, hardware)) encoder_checkpoint = checkpoint["model"].get("encoder") decoder_checkpoint = checkpoint["model"].get("decoder") if encoder_checkpoint is not None: print(("Resuming from - Epoch {}: " "Train Accuracy = {train_accuracy:.5f}, " "Train Loss = {train_loss:.5f}, " "Validation Accuracy = {validation_accuracy:.5f}, " "Validation Loss = {validation_loss:.5f}, ").format( checkpoint["epoch"], train_accuracy=checkpoint["train_accuracy"][-1], train_loss=checkpoint["train_losses"][-1], validation_accuracy=checkpoint["validation_accuracy"][-1], validation_loss=checkpoint["validation_losses"][-1], )) train_dataset = CrohmeDataset(gt_train, tokensfile, root=root, crop=options.crop, transform=transformers) train_data_loader = DataLoader( train_dataset, batch_size=options.batch_size, shuffle=True, num_workers=options.num_workers, collate_fn=collate_batch, ) validation_dataset = CrohmeDataset(gt_validation, tokensfile, root=root, crop=options.crop, transform=transformers) validation_data_loader = DataLoader( validation_dataset, batch_size=options.batch_size, shuffle=True, num_workers=options.num_workers, collate_fn=collate_batch, ) criterion = nn.CrossEntropyLoss().to(device) enc = Encoder(img_channels=3, dropout_rate=options.dropout_rate, checkpoint=encoder_checkpoint).to(device) dec = Decoder( len(train_dataset.id_to_token), low_res_shape, high_res_shape, checkpoint=decoder_checkpoint, device=device, ).to(device) enc.train() dec.train() enc_params_to_optimise = [ param for param in enc.parameters() if param.requires_grad ] dec_params_to_optimise = [ param for param in dec.parameters() if param.requires_grad ] params_to_optimise = [*enc_params_to_optimise, *dec_params_to_optimise] optimiser = optim.Adadelta(params_to_optimise, lr=options.lr, weight_decay=options.weight_decay) optimiser_state = checkpoint.get("optimiser") if optimiser_state: optimiser.load_state_dict(optimiser_state) # Set the learning rate instead of using the previous state. # The scheduler somehow overwrites the LR to the initial LR after loading, # which would always reset it to the first used learning rate instead of # the one from the previous checkpoint. So might as well set it manually. for param_group in optimiser.param_groups: param_group["initial_lr"] = options.lr # Decay learning rate by a factor of lr_factor (default: 0.1) # every lr_epochs (default: 3) lr_scheduler = optim.lr_scheduler.StepLR(optimiser, step_size=options.lr_epochs, gamma=options.lr_factor) train( enc, dec, optimiser, criterion, train_data_loader, validation_data_loader, teacher_forcing_ratio=options.teacher_forcing, lr_scheduler=lr_scheduler, print_epochs=options.print_epochs, device=device, num_epochs=options.num_epochs, checkpoint=checkpoint, prefix=options.prefix, max_grad_norm=options.max_grad_norm, )
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    model = create_net(args)

    x = torch.randn(1, 3, 224, 224)
    flops, params = profile(model, inputs=(x, ))
    print("model [%s] - params: %.6fM" % (args.arch, params / 1e6))
    print("model [%s] - FLOPs: %.6fG" % (args.arch, flops / 1e9))

    log_file = os.path.join(args.ckpt, "log.txt")
    if os.path.exists(log_file):
        args.log_file = open(log_file, mode="a")
    else:
        args.log_file = open(log_file, mode="w")
    args.log_file.write("Network - " + args.arch + "\n")
    args.log_file.write("Attention Module - " + args.attention_type + "\n")
    # "%" needs a format specifier inside the string; "... - " % value raises TypeError.
    args.log_file.write("Params - %s\n" % str(params))
    args.log_file.write("FLOPs - %s\n" % str(flops))
    args.log_file.write("--------------------------------------------------" + "\n")
    args.log_file.close()

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu[0])
        model = model.to(args.gpu[0])
        model = torch.nn.DataParallel(model, args.gpu)
        print(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.resume:
        model, optimizer, best_acc1, start_epoch = load_checkpoint(
            args, model, optimizer)
        args.start_epoch = start_epoch

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.save_weights is not None:
        # "deparallelize" saved weights
        print("=> saving 'deparallelized' weights [%s]" % args.save_weights)
        model = model.module
        model = model.cpu()
        torch.save({'state_dict': model.state_dict()},
                   args.save_weights,
                   _use_new_zipfile_serialization=False)
        return

    if args.evaluate:
        args.log_file = open(log_file, mode="a")
        validate(val_loader, model, criterion, args)
        args.log_file.close()
        return

    if args.cos_lr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.epochs)
        # Fast-forward the scheduler to the resume epoch.
        for epoch in range(args.start_epoch):
            scheduler.step()

    for epoch in range(args.start_epoch, args.epochs):
        args.log_file = open(log_file, mode="a")
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if not args.cos_lr:
            adjust_learning_rate(optimizer, epoch, args)
        else:
            scheduler.step()
            print('[%03d] %.5f' % (epoch, scheduler.get_last_lr()[0]))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        args.log_file.close()

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc": best_acc1,
                    "optimizer": optimizer.state_dict(),
                },
                is_best,
                epoch,
                save_path=args.ckpt)
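# The loop above calls a project-level save_checkpoint(state, is_best, epoch,
# save_path=...). A minimal sketch with that signature; the call signature is
# taken from the code above, while the file naming and the copy-on-best
# behaviour are assumptions:
import os
import shutil
import torch

def save_checkpoint(state, is_best, epoch, save_path="."):
    """Write this epoch's checkpoint and keep a copy of the best model so far."""
    os.makedirs(save_path, exist_ok=True)
    filename = os.path.join(save_path, "checkpoint_%03d.pth.tar" % epoch)
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(save_path, "model_best.pth.tar"))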
def main(): options = parse_args() is_cuda = use_cuda and not options.no_cuda hardware = "cuda" if is_cuda else "cpu" device = torch.device(hardware) for dataset_name in options.dataset: results = {"best": {}, "mean": {}, "highest_prob": {}} for checkpoint_path in options.checkpoint: checkpoint_name, _ = os.path.splitext( os.path.basename(checkpoint_path)) checkpoint = (load_checkpoint(checkpoint_path, cuda=is_cuda) if checkpoint_path else default_checkpoint) encoder_checkpoint = checkpoint["model"].get("encoder") decoder_checkpoint = checkpoint["model"].get("decoder") test_set = test_sets[dataset_name] dataset = CrohmeDataset( test_set["groundtruth"], tokensfile, root=test_set["root"], transform=transformers, ) data_loader = DataLoader( dataset, batch_size=options.batch_size, shuffle=False, num_workers=options.num_workers, collate_fn=collate_batch, ) enc = Encoder(img_channels=3, checkpoint=encoder_checkpoint).to(device) dec = Decoder( len(dataset.id_to_token), low_res_shape, high_res_shape, checkpoint=decoder_checkpoint, device=device, ).to(device) enc.eval() dec.eval() result = evaluate( enc, dec, data_loader=data_loader, device=device, checkpoint=checkpoint, beam_width=options.beam_width, prefix=options.prefix, ) results["best"][checkpoint_name] = result["best"] results["mean"][checkpoint_name] = result["mean"] results["highest_prob"][checkpoint_name] = result["highest_prob"] highest_prob_err_table, highest_prob_correct_table = create_markdown_tables( results["highest_prob"]) best_err_table, best_correct_table = create_markdown_tables( results["best"]) mean_err_table, mean_correct_table = create_markdown_tables( results["mean"]) print(("\n# Dataset {name}\n\n" "Beam width: {beam_width}\n\n" "## Highest Probability\n\n{highest_prob_err_table}\n\n" "{highest_prob_correct_table}\n\n" "## Best\n\n{best_err_table}\n\n{best_correct_table}\n\n" "## Mean\n\n{mean_err_table}\n\n{mean_correct_table}").format( name=dataset_name, beam_width=options.beam_width, highest_prob_err_table=highest_prob_err_table, highest_prob_correct_table=highest_prob_correct_table, best_err_table=best_err_table, best_correct_table=best_correct_table, mean_err_table=mean_err_table, mean_correct_table=mean_correct_table, ))
def train(args): # Set up directories =========================================================== os.makedirs(DATA_DIR, exist_ok=True) os.makedirs(BUFFER_DIR, exist_ok=True) exp_name = "EXP_%04d" % (args.expID) exp_path = os.path.join(DATA_DIR, exp_name) rb_path = os.path.join(BUFFER_DIR, exp_name) os.makedirs(exp_path, exist_ok=True) os.makedirs(rb_path, exist_ok=True) # save arguments with open(os.path.join(exp_path, 'args.txt'), 'w+') as f: json.dump(args.__dict__, f, indent=2) # Retrieve MuJoCo XML files for training ======================================== agent_name = args.agent_name envs_train_names = [agent_name] args.graphs = dict() # existing envs if not args.custom_xml: args.graphs[agent_name] = utils.getGraphStructure( os.path.join(XML_DIR, '{}.xml'.format(agent_name))) # custom envs num_envs_train = len(envs_train_names) print("#" * 50 + '\ntraining envs: {}\n'.format(envs_train_names) + "#" * 50) # Set up training env and policy ================================================ args.limb_obs_size, args.max_action = utils.registerEnvs( envs_train_names, args.max_episode_steps, args.custom_xml) max_num_limbs = max( [len(args.graphs[env_name]) for env_name in envs_train_names]) # create vectorized training env obs_max_len = max( [len(args.graphs[env_name]) for env_name in envs_train_names]) * args.limb_obs_size envs_train = [ utils.makeEnvWrapper(name, obs_max_len, args.seed) for name in envs_train_names ] # envs_train = SubprocVecEnv(envs_train) # vectorized env # set random seeds torch.manual_seed(args.seed) np.random.seed(args.seed) # determine the maximum number of children in all the training envs if args.max_children is None: args.max_children = utils.findMaxChildren(envs_train_names, args.graphs) # setup agent policy policy = TD3.LifeLongTD3(args) # Create new training instance or load previous checkpoint ======================== if cp.has_checkpoint(exp_path, rb_path): print("*** loading checkpoint from {} ***".format(exp_path)) total_timesteps, episode_num, replay_buffer, num_samples, loaded_path = cp.load_checkpoint( exp_path, rb_path, policy, args) print("*** checkpoint loaded from {} ***".format(loaded_path)) else: print("*** training from scratch ***") # init training vars total_timesteps = 0 episode_num = 0 num_samples = 0 # different replay buffer for each env; avoid using too much memory if there are too many envs # Initialize training variables ================================================ writer = SummaryWriter("%s/%s/" % (DATA_DIR, exp_name)) s = time.time() # TODO: may have to change the following codes into the loop timesteps_since_saving = 0 this_training_timesteps = 0 episode_timesteps = 0 episode_reward = 0 episode_reward_buffer = 0 done = True # Start training =========================================================== for env_handle, env_name in zip(envs_train, envs_train_names): env = env_handle() obs = env.reset() replay_buffer = utils.ReplayBuffer(max_size=args.rb_max) policy.change_morphology(args.graphs[env_name]) policy.graph = args.graphs[env_name] task_timesteps = 0 done = False episode_timesteps = 0 episode_reward = 0 episode_reward_buffer = 0 while task_timesteps < args.max_timesteps: # train and log after one episode for each env if done: # log updates and train policy if this_training_timesteps != 0: policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau, args.policy_noise, args.noise_clip, args.policy_freq, graphs=args.graphs, env_name=env_name) # add to tensorboard display 
                writer.add_scalar('{}_episode_reward'.format(env_name),
                                  episode_reward, task_timesteps)
                writer.add_scalar('{}_episode_len'.format(env_name),
                                  episode_timesteps, task_timesteps)
                # print to console
                print(
                    "-" * 50 +
                    "\nExpID: {}, FPS: {:.2f}, TotalT: {}, EpisodeNum: {}, SampleNum: {}, ReplayBSize: {}"
                    .format(args.expID,
                            this_training_timesteps / (time.time() - s),
                            total_timesteps, episode_num, num_samples,
                            len(replay_buffer.storage)))
                print("{} === EpisodeT: {}, Reward: {:.2f}".format(
                    env_name, episode_timesteps, episode_reward))
                this_training_timesteps = 0
                s = time.time()

                # save model and replay buffers
                if timesteps_since_saving >= args.save_freq:
                    print("*** saving checkpoint ***")
                    timesteps_since_saving = 0
                    model_saved_path = cp.save_model(exp_path, policy,
                                                     total_timesteps,
                                                     episode_num, num_samples,
                                                     {env_name: replay_buffer},
                                                     envs_train_names, args)
                    print("*** model saved to {} ***".format(model_saved_path))
                    rb_saved_path = cp.save_replay_buffer(
                        rb_path, {env_name: replay_buffer})
                    print("*** replay buffers saved to {} ***".format(
                        rb_saved_path))

                # reset training variables
                obs = env.reset()
                done = False
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1
                # create reward buffer to store reward for one sub-env when it is not done
                episode_reward_buffer = 0

            # start sampling ===========================================================
            # sample action randomly for some time and then according to the policy
            if task_timesteps < args.start_timesteps:
                action = np.random.uniform(low=env.action_space.low[0],
                                           high=env.action_space.high[0],
                                           size=max_num_limbs)
            else:
                # remove 0 padding of obs before feeding into the policy (trick for vectorized env)
                obs = np.array(obs[:args.limb_obs_size *
                                   len(args.graphs[env_name])])
                policy_action = policy.select_action(obs)
                if args.expl_noise != 0:
                    policy_action = (policy_action + np.random.normal(
                        0, args.expl_noise, size=policy_action.size)).clip(
                            env.action_space.low[0], env.action_space.high[0])
                # add 0-padding to ensure that size is the same for all envs
                action = np.append(
                    policy_action,
                    np.array([
                        0 for i in range(max_num_limbs - policy_action.size)
                    ]))

            # perform action in the environment
            new_obs, reward, done, _ = env.step(action)

            # record if each env has ever been 'done'
            # add the instant reward to the cumulative buffer
            # if any sub-env is done at the moment, set the episode reward list to be the value in the buffer
            episode_reward_buffer += reward
            if done and episode_reward == 0:
                episode_reward = episode_reward_buffer
                episode_reward_buffer = 0
            writer.add_scalar('{}_instant_reward'.format(env_name), reward,
                              task_timesteps)
            done_bool = float(done)
            if episode_timesteps + 1 == args.max_episode_steps:
                done_bool = 0
                done = True

            # remove 0 padding before storing in the replay buffer (trick for vectorized env)
            num_limbs = len(args.graphs[env_name])
            obs = np.array(obs[:args.limb_obs_size * num_limbs])
            new_obs = np.array(new_obs[:args.limb_obs_size * num_limbs])
            action = np.array(action[:num_limbs])
            # insert transition in the replay buffer
            replay_buffer.add((obs, new_obs, action, reward, done_bool))
            num_samples += 1

            # do not increment episode_timesteps if the sub-env has been 'done'
            if not done:
                episode_timesteps += 1
            total_timesteps += 1
            task_timesteps += 1
            this_training_timesteps += 1
            timesteps_since_saving += 1

            obs = new_obs

        policy.next_task()

    # save checkpoint after training ===========================================================
    model_saved_path = cp.save_model(exp_path, policy, total_timesteps,
                                     episode_num, num_samples,
                                     {envs_train_names[-1]: replay_buffer},
                                     envs_train_names, args)
    print("*** training finished and model saved to {} ***".format(
        model_saved_path))
def eval(**args):
    """
    Evaluate selected model
    Args:
        seed       (Int):        Integer indicating set seed for random state
        save_dir   (String):     Top level directory to generate results folder
        model      (String):     Name of selected model
        dataset    (String):     Name of selected dataset
        exp        (String):     Name of experiment
        load_type  (String):     Keyword indicator to evaluate the testing or validation set
        pretrained (Int/String): Int/String indicating loading of random, pretrained or saved weights

    Return:
        None
    """
    print("\n############################################################################\n")
    print("Experimental Setup: ", args)
    print("\n############################################################################\n")

    d = datetime.datetime.today()
    date = d.strftime('%Y%m%d-%H%M%S')
    result_dir = os.path.join(args['save_dir'], args['model'],
                              '_'.join((args['dataset'], args['exp'], date)))
    log_dir = os.path.join(result_dir, 'logs')
    save_dir = os.path.join(result_dir, 'checkpoints')

    if not args['debug']:
        os.makedirs(result_dir, exist_ok=True)
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(save_dir, exist_ok=True)

        # Save copy of config file
        with open(os.path.join(result_dir, 'config.yaml'), 'w') as outfile:
            yaml.dump(args, outfile, default_flow_style=False)

    # Tensorboard Element
    writer = SummaryWriter(log_dir)

    # Check if GPU is available (CUDA)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load Network
    model = create_model_object(**args).to(device)

    # Load Data
    loader = data_loader(**args, model_obj=model)

    if args['load_type'] == 'train_val':
        eval_loader = loader['valid']
    elif args['load_type'] == 'train':
        eval_loader = loader['train']
    elif args['load_type'] == 'test':
        eval_loader = loader['test']
    else:
        sys.exit('load_type must be train, train_val or test for eval, exiting')
    # END IF

    if isinstance(args['pretrained'], str):
        ckpt = load_checkpoint(args['pretrained'])
        model.load_state_dict(ckpt)

    # Training Setup
    params = [p for p in model.parameters() if p.requires_grad]

    acc_metric = Metrics(**args, result_dir=result_dir,
                         ndata=len(eval_loader.dataset))
    acc = 0.0

    # Setup Model To Evaluate
    model.eval()

    with torch.no_grad():
        for step, data in enumerate(eval_loader):
            x_input = data['data']
            annotations = data['annots']

            if isinstance(x_input, torch.Tensor):
                outputs = model(x_input.to(device))
            else:
                for i, item in enumerate(x_input):
                    if isinstance(item, torch.Tensor):
                        x_input[i] = item.to(device)
                outputs = model(*x_input)
            # END IF

            acc = acc_metric.get_accuracy(outputs, annotations)

            if step % 100 == 0:
                print('Step: {}/{} | {} acc: {:.4f}'.format(
                    step, len(eval_loader), args['load_type'], acc))

    print('Accuracy of the network on the {} set: {:.3f} %\n'.format(
        args['load_type'], 100. * acc))

    if not args['debug']:
        writer.add_scalar(
            args['dataset'] + '/' + args['model'] + '/' +
            args['load_type'] + '_accuracy', 100. * acc)

    # Close Tensorboard Element
    writer.close()
criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, threshold=0.005, verbose=True) n_epochs = 200 if args.resume: start_epoch, model, optimizer, scheduler = load_checkpoint( model_path=args.model_path, ckpt_name=args.ckpt, device=device, model=model, optimizer=optimizer, scheduler=scheduler) start_epoch -= 1 print('Resumed checkpoint {} from {}. Starting at epoch {}.'.format( args.ckpt, args.model_path, start_epoch + 1)) print('Current learning rate: {}'.format(get_current_lr(optimizer))) print('*' * 30) else: start_epoch = 0 # model = init_weights(model) for epoch in range(start_epoch, n_epochs): print('Epoch: %d/%d' % (epoch + 1, n_epochs)) train_loss = train_model(model,
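# The resume path above expects load_checkpoint to restore the model, optimizer
# and scheduler together and to return the epoch to resume from. A minimal
# sketch of a helper with that signature; the argument names and return order
# come from the call above, while the checkpoint keys are assumptions:
import os
import torch

def load_checkpoint(model_path, ckpt_name, device, model, optimizer, scheduler):
    """Restore training state; returns (epoch, model, optimizer, scheduler)."""
    ckpt = torch.load(os.path.join(model_path, ckpt_name), map_location=device)
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    scheduler.load_state_dict(ckpt['scheduler'])
    return ckpt['epoch'], model, optimizer, scheduler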
warmup=job_config.get_warmup_proportion(), t_total=job_config.get_total_training_steps()) global_step = 0 start_epoch = 0 # if args.load_training_checkpoint is not None: if load_training_checkpoint != 'False': logger.info(f"Looking for previous training checkpoint.") latest_checkpoint_path = latest_checkpoint_file( args.load_training_checkpoint, no_cuda) logger.info( f"Restoring previous training checkpoint from {latest_checkpoint_path}" ) start_epoch, global_step = load_checkpoint(model, optimizer, latest_checkpoint_path) logger.info( f"The model is loaded from last checkpoint at epoch {start_epoch} when the global steps were at {global_step}" ) logger.info("Training the model") best_loss = None for index in range(start_epoch, args.epochs): logger.info(f"Training epoch: {index + 1}") eval_loss = train(index) if check_write_log(): if best_loss is None or eval_loss is None or eval_loss < best_loss * 0.99: best_loss = eval_loss
def run(gpu_id, options, distributed=False):
    if distributed:
        dist.init_process_group(
            backend="nccl",
            rank=gpu_id,
            world_size=options.num_gpus,
            init_method="env://",
        )
        torch.cuda.set_device(gpu_id)
    use_cuda = torch.cuda.is_available() and not options.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")

    for cp in options.checkpoint:
        checkpoint = load_checkpoint(os.path.join(cp, "stats.pt"))
        name = "evaluate/{}".format(cp)
        logger = lavd.Logger(name, disabled=gpu_id != 0)

        spinner = logger.spinner("Initialising")
        spinner.start()

        # All but the primary GPU wait here, so that only the primary process loads the
        # pre-trained model and the rest uses the cached version.
        if distributed and gpu_id != 0:
            torch.distributed.barrier()

        model_kind = checkpoint["model"].get("kind")
        use_special = True
        masked_lm = True
        add_space = False
        if model_kind == "bert" or model_kind == "bert-scratch":
            config = BertConfig.from_pretrained(cp)
            model = BertForMaskedLM.from_pretrained(cp, config=config)
            tokeniser = BertTokenizer.from_pretrained(cp)
        elif model_kind == "gpt2" or model_kind == "gpt2-scratch":
            config = GPT2Config.from_pretrained(cp)
            model = GPT2LMHeadModel.from_pretrained(cp, config=config)
            tokeniser = GPT2Tokenizer.from_pretrained(cp)
            masked_lm = False
            use_special = False
            add_space = True
        else:
            raise Exception("No model available for {}".format(model_kind))
        model = model.to(device)

        # Primary process has loaded the model and the other can now load the cached
        # version.
        if distributed and gpu_id == 0:
            torch.distributed.barrier()

        data_loaders = []
        for data_file in options.datasets:
            data = data_file.split("=", 1)
            if len(data) > 1:
                # Remove whitespace around the name
                name = data[0].strip()
                # Expand the ~ to the full path as it won't be done automatically since
                # it's not at the beginning of the word.
                file_path = os.path.expanduser(data[1])
            else:
                name = None
                file_path = data[0]
            dataset = TextDataset(file_path,
                                  tokeniser,
                                  name=name,
                                  use_special=use_special)
            sampler = (DistributedSampler(dataset,
                                          num_replicas=options.num_gpus,
                                          rank=gpu_id,
                                          shuffle=False)
                       if distributed else None)
            data_loader = DataLoader(
                dataset,
                batch_size=options.batch_size,
                shuffle=False,
                num_workers=options.num_workers,
                sampler=sampler,
                pin_memory=True,
            )
            data_loaders.append(data_loader)

        if distributed:
            model = DistributedDataParallel(model,
                                            device_ids=[gpu_id],
                                            find_unused_parameters=True)

        # Wait for all processes to load everything before starting training.
        # Not strictly necessary, since they will wait once the actual model is run, but
        # this makes it nicer to show the spinner until all of them are ready.
        if distributed:
            torch.distributed.barrier()
        spinner.stop()

        start_time = time.time()
        logger.set_prefix("Evaluation - {}".format(cp))
        results = []
        for data_loader in data_loaders:
            data_name = data_loader.dataset.name
            logger.start(data_name)
            result = evaluate(
                data_loader,
                model,
                device=device,
                name=data_name,
                logger=logger,
                masked_lm=masked_lm,
            )
            result["name"] = data_name
            results.append(result)
            logger.end(data_name)
        time_difference = time.time() - start_time

        evaluation_results = [
            OrderedDict(
                name=result["name"],
                stats=OrderedDict(loss=result["loss"],
                                  perplexity=result["perplexity"]),
            ) for result in results
        ]
        log_epoch_stats(logger,
                        evaluation_results,
                        metrics,
                        time_elapsed=time_difference)
def run(gpu_id, options, distributed=False):
    if distributed:
        dist.init_process_group(
            backend="nccl",
            rank=gpu_id,
            world_size=options.num_gpus,
            init_method="env://",
        )
        torch.cuda.set_device(gpu_id)
    torch.manual_seed(options.seed)
    use_cuda = torch.cuda.is_available() and not options.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")

    logger = lavd.Logger(options.name, disabled=gpu_id != 0)
    # Parser needs to be rebuilt, since it can't be serialised and it is needed to even
    # detect the number of GPUs, but here it's only used to log it.
    parser = build_parser() if gpu_id == 0 else None
    spinner = logger.spinner("Initialising")
    spinner.start()

    checkpoint = (default_checkpoint if options.checkpoint is None else
                  load_checkpoint(os.path.join(options.checkpoint, "stats.pt")))
    # Either use the checkpoint directory as the configuration or use one of the
    # available pre-trained models.
    pre_trained = options.checkpoint or options.pre_trained

    # All but the primary GPU wait here, so that only the primary process loads the
    # pre-trained model and the rest uses the cached version.
    if distributed and gpu_id != 0:
        torch.distributed.barrier()

    model_kind = checkpoint["model"].get("kind") or options.model_kind
    use_special = True
    masked_lm = True
    if model_kind == "bert":
        if pre_trained is None:
            pre_trained = "bert-base-german-cased"
        config = BertConfig.from_pretrained(pre_trained)
        model = BertForMaskedLM.from_pretrained(pre_trained, config=config)
        tokeniser = BertTokenizer.from_pretrained(pre_trained)
    elif model_kind == "bert-scratch":
        # The pre_trained here is only for the configuration (num layers etc.)
        # But the weights are not loaded
        if pre_trained is None:
            pre_trained = "bert-base-german-cased"
        # Use either the provided vocabulary or the pre_trained one.
        vocab = options.vocab or pre_trained
        tokeniser = BertTokenizer.from_pretrained(vocab)
        config = BertConfig.from_pretrained(pre_trained)
        config.vocab_size = tokeniser.vocab_size
        model = BertForMaskedLM(config)
    elif model_kind == "gpt2":
        if pre_trained is None:
            pre_trained = "gpt2"
        config = GPT2Config.from_pretrained(pre_trained)
        model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config)
        tokeniser = GPT2Tokenizer.from_pretrained(pre_trained)
        masked_lm = False
        use_special = False
    elif model_kind == "gpt2-german":
        assert pre_trained is not None, "--pre-trained must be given for gpt2-german"
        config = GPT2Config.from_pretrained(pre_trained)
        model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config)
        # Using the XLNetTokenizer because the pre-trained German GPT-2 model uses
        # SentencePiece and that's the easiest way to use it.
        # That also means that the automatic tokenisation cannot be done, because XLNet
        # uses different placing of the special tokens.
        tokeniser = XLNetTokenizer.from_pretrained(
            pre_trained,
            keep_accents=True,
            unk_token="<unk>",
            # start and end of sequence use the same token
            bos_token="<endoftext>",
            eos_token="<endoftext>",
        )
        masked_lm = False
        use_special = False
    elif model_kind == "gpt2-scratch":
        # The pre_trained here is only for the configuration (num layers etc.)
        # But the weights are not loaded
        if pre_trained is None:
            pre_trained = "gpt2"
        # Use either the provided vocabulary or the pre_trained one.
        vocab = options.vocab or pre_trained
        tokeniser = GPT2Tokenizer.from_pretrained(vocab)
        config = GPT2Config.from_pretrained(pre_trained)
        config.vocab_size = tokeniser.vocab_size
        model = GPT2LMHeadModel(config)
        masked_lm = False
        use_special = False
    else:
        raise Exception("No model available for {}".format(model_kind))
    model = model.to(device)

    # Primary process has loaded the model and the other can now load the cached
    # version.
    if distributed and gpu_id == 0:
        torch.distributed.barrier()

    train_dataset = TextDataset(
        options.train_text,
        tokeniser,
        use_special=use_special,
        manual_special=model_kind == "gpt2-german",
    )
    train_sampler = (DistributedSampler(train_dataset,
                                        num_replicas=options.num_gpus,
                                        rank=gpu_id)
                     if distributed else None)
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=options.batch_size,
        # Only shuffle when not using a sampler
        shuffle=train_sampler is None,
        num_workers=options.actual_num_workers,
        sampler=train_sampler,
        pin_memory=True,
    )

    validation_data_loaders = []
    for val_file in options.validation_text:
        vals = val_file.split("=", 1)
        if len(vals) > 1:
            # Remove whitespace around the name
            name = vals[0].strip()
            # Expand the ~ to the full path as it won't be done automatically since it's
            # not at the beginning of the word.
            file_path = os.path.expanduser(vals[1])
        else:
            name = None
            file_path = vals[0]
        validation_dataset = TextDataset(
            file_path,
            tokeniser,
            name=name,
            use_special=use_special,
            manual_special=model_kind == "gpt2-german",
        )
        validation_sampler = (DistributedSampler(
            validation_dataset, num_replicas=options.num_gpus, rank=gpu_id)
                              if distributed else None)
        validation_data_loader = DataLoader(
            validation_dataset,
            batch_size=options.batch_size,
            # Only shuffle when not using a sampler
            shuffle=validation_sampler is None,
            num_workers=options.actual_num_workers,
            sampler=validation_sampler,
            pin_memory=True,
        )
        validation_data_loaders.append(validation_data_loader)

    initial_lr = options.lr
    # Only restore the learning rate if resuming from a checkpoint and not manually
    # resetting the learning rate.
    if len(checkpoint["train"]["lr"]) > 0 and not options.reset_lr:
        initial_lr = checkpoint["train"]["lr"][-1]

    no_decay = ["bias", "LayerNorm.weight"]
    optimiser_grouped_parameters = [
        {
            "params": [
                param for name, param in model.named_parameters()
                if not any(nd in name for nd in no_decay)
            ],
            "weight_decay": options.weight_decay,
        },
        {
            "params": [
                param for name, param in model.named_parameters()
                if any(nd in name for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimiser = AdamW(optimiser_grouped_parameters,
                      lr=initial_lr,
                      eps=options.adam_eps)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimiser,
        num_warmup_steps=options.lr_warmup,
        num_training_steps=options.num_epochs,
    )
    amp_scaler = amp.GradScaler() if use_cuda and options.fp16 else None
    if distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[gpu_id],
                                        find_unused_parameters=True)

    validation_details = [
        OrderedDict(
            name=data_loader.dataset.name,
            path=data_loader.dataset.path,
            size=len(data_loader.dataset),
        ) for data_loader in validation_data_loaders
    ]
    experiment = OrderedDict(
        model_kind=model_kind,
        train=OrderedDict(path=train_dataset.path, size=len(train_dataset)),
        validation=validation_details,
        options=options,
    )
    log_experiment(logger, experiment)
    logger.log_command(parser, options)

    # Wait for all processes to load everything before starting training.
    # Not strictly necessary, since they will wait once the actual model is run, but
    # this makes it nicer to show the spinner until all of them are ready.
if distributed: torch.distributed.barrier() spinner.stop() if options.checkpoint is not None: resume_text = "Resuming from - Epoch {epoch}".format( epoch=checkpoint["epoch"]) logger.set_prefix(resume_text) epoch_results = [ OrderedDict( name="Train", stats=OrderedDict( loss=checkpoint["train"]["stats"]["loss"][-1], perplexity=checkpoint["train"]["stats"]["perplexity"][-1], ), ) ] + [ OrderedDict( name=val_name, stats=OrderedDict( loss=val_result["stats"]["loss"][-1], perplexity=val_result["stats"]["perplexity"][-1], ), ) for val_name, val_result in checkpoint["validation"].items() ] log_epoch_stats(logger, epoch_results, metrics) train( logger, model, optimiser, train_data_loader, validation_data_loaders, lr_scheduler=lr_scheduler, device=device, num_epochs=options.num_epochs, checkpoint=checkpoint, model_kind=model_kind, amp_scaler=amp_scaler, masked_lm=masked_lm, )
def main(): config = get_train_config() # device device, device_ids = setup_device(config.n_gpu) # tensorboard writer = TensorboardWriter(config.summary_dir, config.tensorboard) # metric tracker metric_names = ['loss', 'acc1', 'acc5'] train_metrics = MetricTracker(*[metric for metric in metric_names], writer=writer) valid_metrics = MetricTracker(*[metric for metric in metric_names], writer=writer) # create model print("create model") model = VisionTransformer(image_size=(config.image_size, config.image_size), patch_size=(config.patch_size, config.patch_size), emb_dim=config.emb_dim, mlp_dim=config.mlp_dim, num_heads=config.num_heads, num_layers=config.num_layers, num_classes=config.num_classes, attn_dropout_rate=config.attn_dropout_rate, dropout_rate=config.dropout_rate) # load checkpoint if config.checkpoint_path: state_dict = load_checkpoint(config.checkpoint_path) if config.num_classes != state_dict['classifier.weight'].size(0): del state_dict['classifier.weight'] del state_dict['classifier.bias'] print("re-initialize fc layer") model.load_state_dict(state_dict, strict=False) else: model.load_state_dict(state_dict) print("Load pretrained weights from {}".format(config.checkpoint_path)) # send model to device model = model.to(device) if len(device_ids) > 1: model = torch.nn.DataParallel(model, device_ids=device_ids) # create dataloader print("create dataloaders") train_dataloader = eval("{}DataLoader".format(config.dataset))( data_dir=os.path.join(config.data_dir, config.dataset), image_size=config.image_size, batch_size=config.batch_size, num_workers=config.num_workers, split='train') valid_dataloader = eval("{}DataLoader".format(config.dataset))( data_dir=os.path.join(config.data_dir, config.dataset), image_size=config.image_size, batch_size=config.batch_size, num_workers=config.num_workers, split='val') # training criterion print("create criterion and optimizer") criterion = nn.CrossEntropyLoss() # create optimizers and learning rate scheduler optimizer = torch.optim.SGD(params=model.parameters(), lr=config.lr, weight_decay=config.wd, momentum=0.9) lr_scheduler = torch.optim.lr_scheduler.OneCycleLR( optimizer=optimizer, max_lr=config.lr, pct_start=config.warmup_steps / config.train_steps, total_steps=config.train_steps) # start training print("start training") best_acc = 0.0 epochs = config.train_steps // len(train_dataloader) for epoch in range(1, epochs + 1): log = {'epoch': epoch} # train the model model.train() result = train_epoch(epoch, model, train_dataloader, criterion, optimizer, lr_scheduler, train_metrics, device) log.update(result) # validate the model model.eval() result = valid_epoch(epoch, model, valid_dataloader, criterion, valid_metrics, device) log.update(**{'val_' + k: v for k, v in result.items()}) # best acc best = False if log['val_acc1'] > best_acc: best_acc = log['val_acc1'] best = True # save model save_model(config.checkpoint_dir, epoch, model, optimizer, lr_scheduler, device_ids, best) # print logged informations to the screen for key, value in log.items(): print(' {:15s}: {}'.format(str(key), value))
from checkpoint import load_checkpoint

checkpoint_name = "./checkpoints/xavier-dropout-bottleneckonly-teacher0.5-0380.pth"
checkpoint = load_checkpoint(checkpoint_name, cuda=True)
# The training statistics live at the top level of the checkpoint
# (checkpoint["model"] holds the encoder/decoder weights), and the original
# encoder_checkpoint/decoder_checkpoint names were immediately overwritten by
# the later assignments, so name each value after what it actually holds.
train_losses = checkpoint.get("train_losses")
train_accuracy = checkpoint.get("train_accuracy")
validation_losses = checkpoint.get("validation_losses")
validation_accuracy = checkpoint.get("validation_accuracy")
print(train_losses)
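# When the layout of a saved checkpoint is unclear, inspecting it directly is
# often easier than guessing accessors. A small sketch (the path is a
# placeholder, and the listed keys are only examples):
import torch

ckpt = torch.load("./checkpoints/some-checkpoint.pth", map_location="cpu")
print(ckpt.keys())           # top-level entries, e.g. epoch, model, train_losses
print(ckpt["model"].keys())  # nested entries, e.g. encoder/decoder state dicts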
pred_test = False # Predict the test data if LOAD_CHECKPOINT: # Modify this path. def get_path(i,j): out_dir = '/research/lyu1/cygao/workspace/data/checkpoints/' checkpoint_dirs = os.listdir(out_dir) # for idx, checkpoint_dir in enumerate(checkpoint_dirs): # checkpoint_fns = os.listdir(os.path.join(out_dir, checkpoint_dir)) checkpoint_fns = os.listdir(os.path.join(out_dir, checkpoint_dirs[i])) # for jdx, checkpoint_fn in enumerate(checkpoint_fns[i]): checkpoint_path = os.path.join(out_dir, checkpoint_dirs[i], checkpoint_fns[j]) print("Current checkpoint path is ", checkpoint_path) return checkpoint_path checkpoint_path = get_path(7,17) checkpoint = load_checkpoint(checkpoint_path) opts = checkpoint['opts'] print('=' * 100) print('Options log:') print('- Load from checkpoint: {}'.format(LOAD_CHECKPOINT)) print('- Global step: {}'.format(checkpoint['global_step'])) else: opts = AttrDict() # Configure models opts.word_vec_size = 100 opts.feature_vec_size = 90 opts.rnn_type = 'GRU' opts.hidden_size = 200 opts.batch_size = 32 opts.max_vocab_size = 10000
def train(**args):
    """
    Train selected model
    Args:
        rerun        (Int):        Integer indicating number of repetitions for the select experiment
        seed         (Int):        Integer indicating set seed for random state
        save_dir     (String):     Top level directory to generate results folder
        model        (String):     Name of selected model
        dataset      (String):     Name of selected dataset
        exp          (String):     Name of experiment
        debug        (Int):        Debug state to avoid saving variables
        load_type    (String):     Keyword indicator to evaluate the testing or validation set
        pretrained   (Int/String): Int/String indicating loading of random, pretrained or saved weights
        opt          (String):     Name of optimizer to use (e.g. sgd or adam)
        lr           (Float):      Learning rate
        momentum     (Float):      Momentum in optimizer
        weight_decay (Float):      Weight_decay value
        final_shape  ([Int, Int]): Shape of data when passed into network

    Return:
        None
    """
    print("\n############################################################################\n")
    print("Experimental Setup: ", args)
    print("\n############################################################################\n")

    for total_iteration in range(args['rerun']):
        # Generate Results Directory
        d = datetime.datetime.today()
        date = d.strftime('%Y%m%d-%H%M%S')
        result_dir = os.path.join(
            args['save_dir'], args['model'],
            '_'.join((args['dataset'], args['exp'], date)))
        log_dir = os.path.join(result_dir, 'logs')
        save_dir = os.path.join(result_dir, 'checkpoints')

        if not args['debug']:
            os.makedirs(result_dir, exist_ok=True)
            os.makedirs(log_dir, exist_ok=True)
            os.makedirs(save_dir, exist_ok=True)

            # Save copy of config file
            with open(os.path.join(result_dir, 'config.yaml'), 'w') as outfile:
                yaml.dump(args, outfile, default_flow_style=False)

        # Tensorboard Element
        writer = SummaryWriter(log_dir)

        # Check if GPU is available (CUDA)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Load Network
        model = create_model_object(**args).to(device)

        # Load Data
        loader = data_loader(model_obj=model, **args)

        if args['load_type'] == 'train':
            train_loader = loader['train']
            valid_loader = loader['train']  # Run accuracy on train data if only `train` selected
        elif args['load_type'] == 'train_val':
            train_loader = loader['train']
            valid_loader = loader['valid']
        else:
            sys.exit('Invalid environment selection for training, exiting')
        # END IF

        # Training Setup
        params = [p for p in model.parameters() if p.requires_grad]

        if args['opt'] == 'sgd':
            optimizer = optim.SGD(params,
                                  lr=args['lr'],
                                  momentum=args['momentum'],
                                  weight_decay=args['weight_decay'])
        elif args['opt'] == 'adam':
            optimizer = optim.Adam(params,
                                   lr=args['lr'],
                                   weight_decay=args['weight_decay'])
        else:
            sys.exit('Unsupported optimizer selected. Exiting')
        # END IF

        scheduler = MultiStepLR(optimizer,
                                milestones=args['milestones'],
                                gamma=args['gamma'])

        if isinstance(args['pretrained'], str):
            ckpt = load_checkpoint(args['pretrained'])
            model.load_state_dict(ckpt)

            start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1
            optimizer.load_state_dict(
                load_checkpoint(args['pretrained'], key_name='optimizer'))

            # Fast-forward the scheduler to the resume epoch.
            for quick_looper in range(start_epoch):
                scheduler.step()
            # END FOR
        else:
            start_epoch = 0
        # END IF

        model_loss = Losses(device=device, **args)
        acc_metric = Metrics(**args)
        best_val_acc = 0.0

        ############################################################################################################################################################################
        # Start: Training Loop
        for epoch in range(start_epoch, args['epoch']):
            running_loss = 0.0
            print('Epoch: ', epoch)

            # Setup Model To Train
            model.train()

            # Start: Epoch
            for step, data in enumerate(train_loader):
                if step % args['pseudo_batch_loop'] == 0:
                    loss = 0.0
                    optimizer.zero_grad()
                # END IF

                x_input = data['data'].to(device)
                annotations = data['annots']

                assert args['final_shape'] == list(x_input.size()[-2:]), \
                    "Input to model does not match final_shape argument"
                outputs = model(x_input)
                loss = model_loss.loss(outputs, annotations)
                loss = loss * args['batch_size']
                loss.backward()

                running_loss += loss.item()

                if np.isnan(running_loss):
                    import pdb
                    pdb.set_trace()
                # END IF

                if not args['debug']:
                    # Add Learning Rate Element
                    for param_group in optimizer.param_groups:
                        writer.add_scalar(
                            args['dataset'] + '/' + args['model'] + '/learning_rate',
                            param_group['lr'],
                            epoch * len(train_loader) + step)
                    # END FOR

                    # Add Loss Element
                    writer.add_scalar(
                        args['dataset'] + '/' + args['model'] + '/minibatch_loss',
                        loss.item() / args['batch_size'],
                        epoch * len(train_loader) + step)
                # END IF

                if ((epoch * len(train_loader) + step + 1) % 100 == 0):
                    print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(
                        epoch, args['epoch'], step + 1, len(train_loader),
                        running_loss / float(step + 1) / args['batch_size']))
                # END IF

                if (epoch * len(train_loader) +
                        (step + 1)) % args['pseudo_batch_loop'] == 0 and step > 0:
                    # Apply large mini-batch normalization
                    for param in model.parameters():
                        param.grad *= 1. / float(
                            args['pseudo_batch_loop'] * args['batch_size'])
                    optimizer.step()
                # END IF
            # END FOR: Epoch

            if not args['debug']:
                # Save Current Model
                save_path = os.path.join(
                    save_dir, args['dataset'] + '_epoch' + str(epoch) + '.pkl')
                save_checkpoint(epoch, step, model, optimizer, save_path)
            # END IF: Debug

            scheduler.step(epoch=epoch)
            # %-format the value into the string; a comma would print a tuple.
            print('Scheduler lr: %f' % scheduler.get_lr()[0])

            ## START FOR: Validation Accuracy
            running_acc = []
            running_acc = valid(valid_loader, running_acc, model, device,
                                acc_metric)

            if not args['debug']:
                writer.add_scalar(
                    args['dataset'] + '/' + args['model'] + '/validation_accuracy',
                    100. * running_acc[-1],
                    epoch * len(valid_loader) + step)

            print('Accuracy of the network on the validation set: %f %%\n' %
                  (100. * running_acc[-1]))
            # Save Best Validation Accuracy Model Separately
            if best_val_acc < running_acc[-1]:
                best_val_acc = running_acc[-1]

                if not args['debug']:
                    # Save Current Model
                    save_path = os.path.join(
                        save_dir, args['dataset'] + '_best_model.pkl')
                    save_checkpoint(epoch, step, model, optimizer, save_path)
                # END IF
            # END IF
        # END FOR: Training Loop
        ############################################################################################################################################################################

        if not args['debug']:
            # Close Tensorboard Element
            writer.close()
def train(_run):
    # Set up directories ===========================================================
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(BUFFER_DIR, exist_ok=True)
    exp_name = args.expID
    exp_path = os.path.join(DATA_DIR, exp_name)
    rb_path = os.path.join(BUFFER_DIR, exp_name)
    os.makedirs(exp_path, exist_ok=True)
    os.makedirs(rb_path, exist_ok=True)
    # save arguments
    with open(os.path.join(exp_path, "args.txt"), "w+") as f:
        json.dump(args.__dict__, f, indent=2)

    # Retrieve MuJoCo XML files for training ========================================
    envs_train_names = []
    args.graphs = dict()
    # existing envs
    if not args.custom_xml:
        for morphology in args.morphologies:
            envs_train_names += [
                name[:-4] for name in os.listdir(XML_DIR)
                if ".xml" in name and morphology in name
            ]
        for name in envs_train_names:
            args.graphs[name] = utils.getGraphStructure(
                os.path.join(XML_DIR, "{}.xml".format(name)),
                args.observation_graph_type,
            )
    # custom envs
    else:
        if os.path.isfile(args.custom_xml):
            assert ".xml" in os.path.basename(
                args.custom_xml), "No XML file found."
            name = os.path.basename(args.custom_xml)
            envs_train_names.append(name[:-4])  # truncate the .xml suffix
            args.graphs[name[:-4]] = utils.getGraphStructure(
                args.custom_xml, args.observation_graph_type)
        elif os.path.isdir(args.custom_xml):
            for name in os.listdir(args.custom_xml):
                if ".xml" in name:
                    envs_train_names.append(name[:-4])
                    args.graphs[name[:-4]] = utils.getGraphStructure(
                        os.path.join(args.custom_xml, name),
                        args.observation_graph_type)

    envs_train_names.sort()
    num_envs_train = len(envs_train_names)
    print("#" * 50 + "\ntraining envs: {}\n".format(envs_train_names) + "#" * 50)

    # Set up training env and policy ================================================
    args.limb_obs_size, args.max_action = utils.registerEnvs(
        envs_train_names, args.max_episode_steps, args.custom_xml)
    max_num_limbs = max(
        [len(args.graphs[env_name]) for env_name in envs_train_names])
    # create vectorized training env
    obs_max_len = (
        max([len(args.graphs[env_name]) for env_name in envs_train_names]) *
        args.limb_obs_size)
    envs_train = [
        utils.makeEnvWrapper(name, obs_max_len, args.seed)
        for name in envs_train_names
    ]
    envs_train = SubprocVecEnv(envs_train)  # vectorized env
    # set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # determine the maximum number of children in all the training envs
    if args.max_children is None:
        args.max_children = utils.findMaxChildren(envs_train_names, args.graphs)
    args.max_num_limbs = max_num_limbs
    # setup agent policy
    policy = TD3.TD3(args)

    # Create new training instance or load previous checkpoint ======================
    if cp.has_checkpoint(exp_path, rb_path):
        print("*** loading checkpoint from {} ***".format(exp_path))
        (
            total_timesteps,
            episode_num,
            replay_buffer,
            num_samples,
            loaded_path,
        ) = cp.load_checkpoint(exp_path, rb_path, policy, args)
        print("*** checkpoint loaded from {} ***".format(loaded_path))
    else:
        print("*** training from scratch ***")
        # init training vars
        total_timesteps = 0
        episode_num = 0
        num_samples = 0
        # different replay buffer for each env; avoid using too much memory if there are too many envs
        replay_buffer = dict()
        if num_envs_train > args.rb_max // 1e6:
            for name in envs_train_names:
                replay_buffer[name] = utils.ReplayBuffer(
                    max_size=args.rb_max // num_envs_train)
        else:
            for name in envs_train_names:
                replay_buffer[name] = utils.ReplayBuffer()

    # Initialize training variables ================================================
    writer = SummaryWriter("%s/%s/" % (DATA_DIR, exp_name))
    s = time.time()
    timesteps_since_saving = 0
    timesteps_since_saving_model_only = 0
    this_training_timesteps = 0
    collect_done = True
    episode_timesteps_list = [0 for i in range(num_envs_train)]
    done_list = [True for i in range(num_envs_train)]

    # Start training ===========================================================
    model_savings_so_far = 0
    while total_timesteps < args.max_timesteps:
        # train and log after one episode for each env
        if collect_done:
            # log updates and train policy
            if this_training_timesteps != 0:
                policy.train(
                    replay_buffer,
                    episode_timesteps_list,
                    args.batch_size,
                    args.discount,
                    args.tau,
                    args.policy_noise,
                    args.noise_clip,
                    args.policy_freq,
                    graphs=args.graphs,
                    envs_train_names=envs_train_names[:num_envs_train],
                )
                # add to tensorboard display
                for i in range(num_envs_train):
                    writer.add_scalar(
                        "{}_episode_reward".format(envs_train_names[i]),
                        episode_reward_list[i],
                        total_timesteps,
                    )
                    writer.add_scalar(
                        "{}_episode_len".format(envs_train_names[i]),
                        episode_timesteps_list[i],
                        total_timesteps,
                    )
                    if not args.debug:
                        ex.log_scalar(
                            f"{envs_train_names[i]}_episode_reward",
                            float(episode_reward_list[i]),
                            total_timesteps,
                        )
                        ex.log_scalar(
                            f"{envs_train_names[i]}_episode_len",
                            float(episode_timesteps_list[i]),
                            total_timesteps,
                        )
                if not args.debug:
                    ex.log_scalar(
                        "total_timesteps",
                        float(total_timesteps),
                        total_timesteps,
                    )
                # print to console
                print(
                    "-" * 50 +
                    "\nExpID: {}, FPS: {:.2f}, TotalT: {}, EpisodeNum: {}, SampleNum: {}, ReplayBSize: {}"
                    .format(
                        args.expID,
                        this_training_timesteps / (time.time() - s),
                        total_timesteps,
                        episode_num,
                        num_samples,
                        sum([
                            len(replay_buffer[name].storage)
                            for name in envs_train_names
                        ]),
                    ))
                for i in range(len(envs_train_names)):
                    print("{} === EpisodeT: {}, Reward: {:.2f}".format(
                        envs_train_names[i],
                        episode_timesteps_list[i],
                        episode_reward_list[i],
                    ))

            # save model and replay buffers
            if timesteps_since_saving >= args.save_freq:
                timesteps_since_saving = 0
                model_saved_path = cp.save_model(
                    exp_path,
                    policy,
                    total_timesteps,
                    episode_num,
                    num_samples,
                    replay_buffer,
                    envs_train_names,
                    args,
                    model_name=f"model_{model_savings_so_far}.pyth",
                )
                model_savings_so_far += 1
                print("*** model saved to {} ***".format(model_saved_path))
                if args.save_buffer:
                    rb_saved_path = cp.save_replay_buffer(rb_path, replay_buffer)
                    print("*** replay buffers saved to {} ***".format(
                        rb_saved_path))

            # reset training variables
            obs_list = envs_train.reset()
            done_list = [False for i in range(num_envs_train)]
            episode_reward_list = [0 for i in range(num_envs_train)]
            episode_timesteps_list = [0 for i in range(num_envs_train)]
            episode_num += num_envs_train
            # create reward buffer to store reward for one sub-env when it is not done
            episode_reward_list_buffer = [0 for i in range(num_envs_train)]

        # start sampling ===========================================================
        # sample action randomly for some time and then according to the policy
        if total_timesteps < args.start_timesteps * num_envs_train:
            action_list = [
                np.random.uniform(
                    low=envs_train.action_space.low[0],
                    high=envs_train.action_space.high[0],
                    size=max_num_limbs,
                ) for i in range(num_envs_train)
            ]
        else:
            action_list = []
            for i in range(num_envs_train):
                # dynamically change the graph structure of the modular policy
                policy.change_morphology(args.graphs[envs_train_names[i]])
                # remove 0 padding of obs before feeding into the policy (trick for vectorized env)
                obs = np.array(
                    obs_list[i][:args.limb_obs_size *
                                len(args.graphs[envs_train_names[i]])])
                policy_action = policy.select_action(obs)
                if args.expl_noise != 0:
                    policy_action = (policy_action + np.random.normal(
                        0, args.expl_noise, size=policy_action.size)).clip(
                            envs_train.action_space.low[0],
                            envs_train.action_space.high[0])
                # add 0-padding to ensure that size is the same for all envs
                policy_action = np.append(
                    policy_action,
                    np.array([
                        0 for i in range(max_num_limbs - policy_action.size)
                    ]),
                )
                action_list.append(policy_action)

        # perform action in the environment
        new_obs_list, reward_list, curr_done_list, _ = envs_train.step(
            action_list)
        # record if each env has ever been 'done'
        done_list = [
            done_list[i] or curr_done_list[i] for i in range(num_envs_train)
        ]

        for i in range(num_envs_train):
            # add the instant reward to the cumulative buffer
            # if any sub-env is done at the moment, set the episode reward list to be the value in the buffer
            episode_reward_list_buffer[i] += reward_list[i]
            if curr_done_list[i] and episode_reward_list[i] == 0:
                episode_reward_list[i] = episode_reward_list_buffer[i]
                episode_reward_list_buffer[i] = 0
            done_bool = float(curr_done_list[i])
            if episode_timesteps_list[i] + 1 == args.max_episode_steps:
                done_bool = 0
                done_list[i] = True
            # remove 0 padding before storing in the replay buffer (trick for vectorized env)
            num_limbs = len(args.graphs[envs_train_names[i]])
            obs = np.array(obs_list[i][:args.limb_obs_size * num_limbs])
            new_obs = np.array(new_obs_list[i][:args.limb_obs_size * num_limbs])
            action = np.array(action_list[i][:num_limbs])
            # insert transition in the replay buffer
            replay_buffer[envs_train_names[i]].add(
                (obs, new_obs, action, reward_list[i], done_bool))
            num_samples += 1
            # do not increment episode_timesteps if the sub-env has been 'done'
            if not done_list[i]:
                episode_timesteps_list[i] += 1
                total_timesteps += 1
                this_training_timesteps += 1
                timesteps_since_saving += 1
                timesteps_since_saving_model_only += 1

        obs_list = new_obs_list
        collect_done = all(done_list)

    # save checkpoint after training ===========================================================
    model_saved_path = cp.save_model(
        exp_path,
        policy,
        total_timesteps,
        episode_num,
        num_samples,
        replay_buffer,
        envs_train_names,
        args,
    )
    print("*** training finished and model saved to {} ***".format(
        model_saved_path))
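# A minimal sketch of the zero-padding trick used above for vectorized envs with
# heterogeneous morphologies: per-env observations and actions are right-padded
# to a common length so SubprocVecEnv can stack them, and the padding is stripped
# again before the policy or the replay buffer sees them. The sizes and helper
# names below are illustrative placeholders, not part of the original script.
import numpy as np

limb_obs_size = 3
num_limbs_per_env = [2, 4]  # two envs with different morphologies
max_num_limbs = max(num_limbs_per_env)

def pad(vec, target_len):
    # right-pad with zeros so every env returns an equally sized array
    return np.append(vec, np.zeros(target_len - vec.size))

def unpad(vec, num_limbs, per_limb):
    # recover only the meaningful prefix for this env's morphology
    return vec[:num_limbs * per_limb]

obs = np.arange(num_limbs_per_env[0] * limb_obs_size, dtype=float)
padded = pad(obs, max_num_limbs * limb_obs_size)  # shape shared by all envs
restored = unpad(padded, num_limbs_per_env[0], limb_obs_size)
assert np.array_equal(obs, restored)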
def train():
    """
    Naive Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    * Parse command line arguments.
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Compute error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * AllReduce for gradients
      * Solver updates parameters by using gradients computed by backprop and all reduce.
      * Compute training error
    """
    # Parse args
    args = get_args()
    n_train_samples = 50000
    n_valid_samples = 10000
    bs_valid = args.batch_size

    # Create Communicator and Context
    extension_module = "cudnn"
    ctx = get_extension_context(extension_module, type_config=args.type_config)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    mpi_local_rank = comm.local_rank
    device_id = mpi_local_rank
    ctx.device_id = str(device_id)
    nn.set_default_context(ctx)

    # Model
    rng = np.random.RandomState(313)
    comm_syncbn = comm if args.sync_bn else None
    if args.net == "cifar10_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=10,
                                       nmaps=32,
                                       act=F.relu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar10
    if args.net == "cifar100_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=100,
                                       nmaps=384,
                                       act=F.elu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar100

    # Create training graphs
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    pred_train = prediction(image_train, test=False)
    pred_train.persistent = True
    loss_train = (loss_function(pred_train, label_train) /
                  n_devices).apply(persistent=True)
    error_train = F.mean(F.top_n_error(pred_train, label_train,
                                       axis=1)).apply(persistent=True)
    loss_error_train = F.sink(loss_train, error_train)
    input_image_train = {"image": image_train, "label": label_train}

    # Create validation graph
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    label_valid = nn.Variable((bs_valid, 1))
    pred_valid = prediction(image_valid, test=True)
    error_valid = F.mean(F.top_n_error(pred_valid, label_valid, axis=1))
    input_image_valid = {"image": image_valid, "label": label_valid}

    # Solvers
    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())
    base_lr = args.learning_rate
    warmup_iter = int(
        1. * n_train_samples / args.batch_size / n_devices) * args.warmup_epoch
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter
    solver.set_learning_rate(base_lr)

    # Load checkpoint if a checkpoint file exists.
    start_point = 0
    if args.use_latest_checkpoint:
        files = glob.glob(f'{args.model_save_path}/checkpoint_*.json')
        if len(files) != 0:
            index = max([
                int(n) for n in
                [re.sub(r'.*checkpoint_(\d+).json', '\\1', f) for f in files]
            ])
            # load weights and solver state info from the latest checkpoint file.
            start_point = load_checkpoint(
                f'{args.model_save_path}/checkpoint_{index}.json', solver)

    # Create monitor
    from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=1)
    monitor_vtime = MonitorTimeElapsed("Validation time", monitor, interval=1)

    # Data Iterator
    # If the data does not exist, it will try to download it from the server
    # and prepare it. When executing multiple processes on the same host, it is
    # necessary to execute initial data preparation by the representative
    # process (local_rank is 0) on the host.

    # Prepare data only when local_rank is 0
    if mpi_local_rank == 0:
        rng = np.random.RandomState(device_id)
        _, tdata = data_iterator(args.batch_size, True, rng)
        vsource, vdata = data_iterator(args.batch_size, False)

    # Wait for data to be prepared without watchdog
    comm.barrier()

    # Prepare data when local_rank is not 0
    if mpi_local_rank != 0:
        rng = np.random.RandomState(device_id)
        _, tdata = data_iterator(args.batch_size, True, rng)
        vsource, vdata = data_iterator(args.batch_size, False)

    # loss_error_train.forward()

    # Training-loop
    ve = nn.Variable()
    model_save_interval = 0
    for i in range(start_point, int(args.max_iter / n_devices)):
        # Validation
        if i % int(n_train_samples / args.batch_size / n_devices) == 0:
            ve_local = 0.
            k = 0
            idx = np.random.permutation(n_valid_samples)
            val_images = vsource.images[idx]
            val_labels = vsource.labels[idx]
            for j in range(int(n_valid_samples / n_devices * mpi_rank),
                           int(n_valid_samples / n_devices * (mpi_rank + 1)),
                           bs_valid):
                image = val_images[j:j + bs_valid]
                label = val_labels[j:j + bs_valid]
                if len(image) != bs_valid:  # note that smaller batch is ignored
                    continue
                input_image_valid["image"].d = image
                input_image_valid["label"].d = label
                error_valid.forward(clear_buffer=True)
                ve_local += error_valid.d.copy()
                k += 1
            ve_local /= k
            ve.d = ve_local
            comm.all_reduce(ve.data, division=True, inplace=True)

            # Save model
            if mpi_rank == 0:
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)
                if model_save_interval <= 0:
                    nn.save_parameters(
                        os.path.join(args.model_save_path,
                                     'params_%06d.h5' % i))
                    save_checkpoint(args.model_save_path, i, solver)
                    model_save_interval += int(args.model_save_interval /
                                               n_devices)
            model_save_interval -= 1

        # Forward/Zerograd
        image, label = tdata.next()
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_error_train.forward(clear_no_need_grad=True)
        solver.zero_grad()

        # Backward/AllReduce
        backward_and_all_reduce(
            loss_error_train,
            comm,
            with_all_reduce_callback=args.with_all_reduce_callback)

        # Solvers update
        solver.update()

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        if mpi_rank == 0:
            # loss and error locally, and elapsed time
            monitor_loss.add(i * n_devices, loss_train.d.copy())
            monitor_err.add(i * n_devices, error_train.d.copy())
            monitor_time.add(i * n_devices)
        # exit(0)

    if mpi_rank == 0:
        nn.save_parameters(
            os.path.join(args.model_save_path,
                         'params_%06d.h5' % (args.max_iter / n_devices)))
    comm.barrier()
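# A small standalone sketch of the "resume from latest checkpoint" lookup used
# above: glob for checkpoint_*.json files, extract each numeric index with a
# regex, and resume from the highest one. The directory and file names in the
# usage comment are placeholders.
import glob
import re

def latest_checkpoint(save_path):
    files = glob.glob(f'{save_path}/checkpoint_*.json')
    if not files:
        return None
    index = max(
        int(re.sub(r'.*checkpoint_(\d+).json', r'\1', f)) for f in files)
    return f'{save_path}/checkpoint_{index}.json'

# e.g. latest_checkpoint('./model_save') -> './model_save/checkpoint_001500.json'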
#***************************************************
criterion = nn.CrossEntropyLoss().to(device)
# Setting weight decay scheduler (?)
#optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer = None
if args.train:
    if args.resume_from:
        # Load checkpoint for post-training from the pre-trained model.
        """
        net, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
            net,
            "/home/bwtseng/Downloads/vww_mobilenetv1_distiller/model_save/image_net_mobilenetv1_saved_best.pth.tar",
            model_device=device)
        """
        net, compress_scheduler, optimizer, start_epoch = ckpt.load_checkpoint(
            net,
            os.path.join('/home/bwtseng/Downloads/', args.model_path),
            name,
            model_device=device)
        # Discard the optimizer restored from the checkpoint and rebuild it.
        optimizer = None
        if optimizer is None:
            optimizer = optim.SGD(net.parameters(),
                                  lr=args.lr,
                                  momentum=0.9,
                                  weight_decay=args.weight_decay)
            print("Built optimizer.")
        if compress_scheduler is None:
            compress_scheduler = utl.file_config(net, optimizer, args.compress,
                                                 None, None)
            print("Loaded compression scheduler.")
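# A hedged sketch of the resume-or-rebuild pattern above: state restored from a
# checkpoint is used when present, and any component the checkpoint does not
# carry (here, the optimizer) is rebuilt from scratch. `load_fn` and the
# hyperparameters are illustrative stand-ins, not the original ckpt/utl APIs.
import torch.nn as nn
import torch.optim as optim

def resume_or_rebuild(net, load_fn=None, lr=0.01, weight_decay=1e-4):
    # try to restore an optimizer from the checkpoint, if a loader is given
    optimizer = load_fn(net) if load_fn is not None else None
    if optimizer is None:
        # checkpoint had no usable optimizer state: build a fresh one
        optimizer = optim.SGD(net.parameters(),
                              lr=lr,
                              momentum=0.9,
                              weight_decay=weight_decay)
    return optimizer

optimizer = resume_or_rebuild(nn.Linear(4, 2))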