def main(): """Main function""" # Initialization args = parse_args() rank, n_ranks = init_workers(args.distributed_backend) # Load configuration config = load_config(args.config) # Prepare output directory output_dir = os.path.expandvars(args.output_dir if args.output_dir is not None else config['output_dir']) os.makedirs(output_dir, exist_ok=True) # Setup logging log_file = os.path.join(output_dir, 'out_%i.log' % rank) config_logging(verbose=args.verbose, log_file=log_file) logging.info('Initialized rank %i out of %i', rank, n_ranks) if rank == 0: logging.info('Configuration: %s' % config) # Load the datasets is_distributed = args.distributed_backend is not None train_data_loader, valid_data_loader = get_data_loaders( distributed=is_distributed, **config['data_config']) # Load the trainer gpu = (rank % args.ranks_per_node) if args.rank_gpu else args.gpu if gpu is not None: logging.info('Using GPU %i', gpu) trainer = get_trainer(name=config['trainer'], distributed=is_distributed, rank=rank, output_dir=output_dir, gpu=gpu) # Build the model trainer.build_model(**config['model_config']) if rank == 0: trainer.print_model_summary() # Run the training summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **config['train_config']) trainer.write_summaries() # Print some conclusions logging.info('Finished training') logging.info('Train samples %g time %g s rate %g samples/s', np.mean(summary['train_samples']), np.mean(summary['train_time']), np.mean(summary['train_rate'])) if valid_data_loader is not None: logging.info('Valid samples %g time %g s rate %g samples/s', np.mean(summary['valid_samples']), np.mean(summary['valid_time']), np.mean(summary['valid_rate'])) logging.info('All done!')
def main(): """Main function""" # Parse the command line args = parse_args() # Initialize distributed workers rank, n_ranks = init_workers(args.distributed) # Load configuration config = load_config(args.config, output_dir=args.output_dir, n_ranks=n_ranks, crayai=args.crayai) config = update_config(config, args) os.makedirs(config['output_dir'], exist_ok=True) # Setup logging config_logging(verbose=args.verbose, output_dir=config['output_dir'], append=args.resume, rank=rank) logging.info('Initialized rank %i out of %i', rank, n_ranks) if args.show_config and (rank == 0): logging.info('Command line config: %s' % args) if rank == 0: logging.info('Configuration: %s', config) logging.info('Saving job outputs to %s', config['output_dir']) if args.distributed is not None: logging.info('Using distributed mode: %s', args.distributed) # Reproducible training torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False np.random.seed(args.seed + 10) # Save configuration in the outptut directory if rank == 0: save_config(config) # Load the datasets is_distributed = (args.distributed is not None) # Workaround because multi-process I/O not working with MPI backend if args.distributed in ['ddp-mpi', 'cray']: if rank == 0: logging.info('Disabling I/O workers because of MPI issue') config['data']['n_workers'] = 0 train_data_loader, valid_data_loader = get_data_loaders( distributed=is_distributed, rank=rank, n_ranks=n_ranks, **config['data']) logging.info('Loaded %g training samples', len(train_data_loader.dataset)) if valid_data_loader is not None: logging.info('Loaded %g validation samples', len(valid_data_loader.dataset)) # Load the trainer gpu = (rank % args.ranks_per_node) if args.rank_gpu else args.gpu if gpu is not None: logging.info('Choosing GPU %s', gpu) trainer = get_trainer(distributed_mode=args.distributed, output_dir=config['output_dir'], rank=rank, n_ranks=n_ranks, gpu=gpu, pbt_checkpoint=args.pbt_checkpoint, **config['trainer']) # Build the model and optimizer model_config = config.get('model', {}) optimizer_config = config.get('optimizer', {}) logging.debug("Building model") trainer.build_model(optimizer_config=optimizer_config, **model_config) if rank == 0: trainer.print_model_summary() # Checkpoint resume if args.resume: trainer.load_checkpoint() # Run the training logging.debug("Training") summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **config['training']) # Print some conclusions n_train_samples = len(train_data_loader.sampler) logging.info('Finished training') train_time = summary.train_time.mean() logging.info('Train samples %g time %g s rate %g samples/s', n_train_samples, train_time, n_train_samples / train_time) if valid_data_loader is not None: n_valid_samples = len(valid_data_loader.sampler) valid_time = summary.valid_time.mean() logging.info('Valid samples %g time %g s rate %g samples/s', n_valid_samples, valid_time, n_valid_samples / valid_time) # Drop to IPython interactive shell if args.interactive and (rank == 0): logging.info('Starting IPython interactive session') import IPython IPython.embed() if rank == 0: if args.crayai: print("FoM: %e" % summary['valid_loss'][0]) logging.info('All done!')
def main(): """Main function""" # Parse the command line args = parse_args() # Setup logging log_format = '%(asctime)s %(levelname)s %(message)s' log_level = logging.DEBUG if args.verbose else logging.INFO logging.basicConfig(level=log_level, format=log_format) logging.info('Initializing') if args.show_config: logging.info('Command line config: %s' % args) # Initialize MPI if args.distributed: dist.init_process_group(backend='mpi') logging.info('MPI rank %i out of %i', dist.get_rank(), dist.get_world_size()) # Load configuration with open(args.config) as f: config = yaml.load(f) if not args.distributed or (dist.get_rank() == 0): logging.info('Configuration: %s' % config) data_config = config['data_config'] model_config = config.get('model_config', {}) train_config = config['train_config'] # Load the datasets train_data_loader, valid_data_loader = get_data_loaders( distributed=args.distributed, **data_config) logging.info('Loaded %g training samples', len(train_data_loader.dataset)) if valid_data_loader is not None: logging.info('Loaded %g validation samples', len(valid_data_loader.dataset)) print('train_data') print(train_data_loader.dataset) print('valid_data') print(valid_data_loader.dataset) # Load the trainer experiment_config = config['experiment_config'] output_dir = experiment_config.pop('output_dir', None) if args.distributed and dist.get_rank() != 0: output_dir = None trainer = get_trainer(distributed=args.distributed, output_dir=output_dir, device=args.device, **experiment_config) # Build the model trainer.build_model(**model_config) if not args.distributed or (dist.get_rank() == 0): trainer.print_model_summary() print('model') print(trainer) # Run the training summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **train_config) if not args.distributed or (dist.get_rank() == 0): trainer.write_summaries() print('summary') print(summary) # Print some conclusions n_train_samples = len(train_data_loader.sampler) logging.info('Finished training') train_time = np.mean(summary['train_time']) logging.info('Train samples %g time %gs rate %g samples/s', n_train_samples, train_time, n_train_samples / train_time) if valid_data_loader is not None: n_valid_samples = len(valid_data_loader.sampler) valid_time = np.mean(summary['valid_time']) logging.info('Valid samples %g time %g s rate %g samples/s', n_valid_samples, valid_time, n_valid_samples / valid_time) # Drop to IPython interactive shell if args.interactive: logging.info('Starting IPython interactive session') import IPython IPython.embed() logging.info('All done!')
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='SWD')

    # Mandatory Arguments
    parser.add_argument('--dataset', type=str, default="cifar10",
                        help="The dataset to consider")
    parser.add_argument('--lr', type=float, default=.1, metavar='LR',
                        help='learning rate (default: .1) (negative -> Adam)')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                        help='input batch size for training (default: 128)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=300, metavar='N',
                        help='number of epochs to train')
    parser.add_argument('--ft-epochs', type=int, default=150, metavar='N',
                        help='number of fine-tuning epochs after pruning')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=random.randint(0, 1000000),
                        help='random seed to initialize.')
    parser.add_argument('--cutout', action='store_true', help='perform cutout')
    parser.add_argument('--mixup', action='store_true', help='perform mixup')
    parser.add_argument('--cutmix', action='store_true', help='perform cutmix')
    parser.add_argument('--auto-augment', action='store_true',
                        help='perform auto_augment')
    parser.add_argument('--feature-maps', type=int, default=64,
                        help='Total feature_maps')
    parser.add_argument('--wd', default=5e-4, type=float, help='Weight decay')
    parser.add_argument('--a', default=-1., type=float, help='Parameter a')
    parser.add_argument('--width', default=0.1, type=float, help='parameter width')
    parser.add_argument('--half', action='store_true', help='Half precision')
    ## ----------------------------------------------------------------------------------------

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    torch.manual_seed(args.seed)

    train_loader, test_loader, metadata = get_data_loaders(args)

    model = ResNet20(args).to(device)
    if args.half:
        model.half()  # convert to half precision
        # keep BatchNorm in float32 for numerical stability
        for layer in model.modules():
            if isinstance(layer, torch.nn.BatchNorm2d):
                layer.float()

    n_params = torch.sum(torch.LongTensor(
        [elt.numel() for elt in model.parameters()])).item()
    print(str(n_params) + " parameters maximum with "
          + str(args.feature_maps) + " feature maps")

    if args.lr > 0:
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              momentum=0.9, weight_decay=0)
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, [args.epochs // 3, 2 * args.epochs // 3], gamma=0.1)
    else:
        optimizer = optim.Adam(model.parameters())
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, [2 * args.epochs // 3], gamma=0.1)

    for epoch in range(args.epochs):
        train_data = train(model, epoch, args, device, train_loader, optimizer)
        test_data = test(model, args, device, test_loader)
        display_progress(epoch, train_data, test_data)
        # save the latest model; it is reloaded as the starting point for pruning
        torch.save(model, "/tmp/best_model.pt")
        scheduler.step()

    # Collect the magnitudes of all weights, sorted, to derive pruning thresholds
    values = None
    for parameters in model.parameters():
        data = parameters.data.view(-1).half() if args.half else parameters.data.view(-1)
        if values is None:
            values = torch.abs(data)
        else:
            values = torch.cat([values, torch.abs(data)], dim=0)
    values = torch.sort(values)[0]
    print("sorted {:d} values".format(values.shape[0]))

    perfs = []
    perfs_ft = []
    ths = []
    prunes = [500, 800, 900, 950, 980, 990, 995, 998, 999]
    for i in prunes:
        print("Testing with pruning {:3d}/1000... ".format(i), end='')
        model = torch.load("/tmp/best_model.pt")
        th = values[(i * values.shape[0]) // 1000]
        ths.append(th.item())
        print(str(th.item()) + " ", end='')
        # zero out all weights below the magnitude threshold
        for parameters in model.parameters():
            keep = torch.abs(parameters.data) >= th
            parameters.data = parameters.data * (keep.half() if args.half else keep.float())
        masks = []
        res = test(model, args, device, test_loader)
        perfs.append(res["test_acc"])
        print(str(res["test_acc"]) + " ", end='')
        # remember which weights survived, so fine-tuning keeps the pruned ones at zero
        for parameters in model.parameters():
            keep = torch.abs(parameters.data) >= th
            masks.append(keep.half() if args.half else keep.float())
        print("tuning")
        if args.lr > 0:
            optimizer = optim.SGD(model.parameters(), lr=args.lr,
                                  momentum=0.9, weight_decay=0)
            scheduler = optim.lr_scheduler.MultiStepLR(
                optimizer, [args.ft_epochs // 3, 2 * args.ft_epochs // 3], gamma=0.1)
        else:
            optimizer = optim.Adam(model.parameters())
            scheduler = optim.lr_scheduler.MultiStepLR(
                optimizer, [2 * args.ft_epochs // 3], gamma=0.1)
        for epoch in range(args.epochs, args.epochs + args.ft_epochs):
            train_data = train(model, epoch, args, device, train_loader,
                               optimizer, masks=masks)
            test_data = test(model, args, device, test_loader)
            scheduler.step()
        res = test(model, args, device, test_loader)
        perfs_ft.append(res["test_acc"])
        print(" " + str(res["test_acc"]))

    results = {
        '"dataset": "{:s}"': args.dataset,
        '"wd": {:f}': args.wd,
        '"a": {:f}': args.a,
        '"width": {:f}': args.width,
        '"epochs": {:d}': args.epochs,
        '"ft-epochs": {:d}': args.ft_epochs,
        '"feature_maps": {:d}': args.feature_maps,
        '"auto_augment": {:b}': args.auto_augment,
        '"cutout": {:b}': args.cutout,
        '"mixup": {:b}': args.mixup,
        '"cutmix": {:b}': args.cutmix,
        '"seed": {:d}': args.seed,
        '"training_loss": {:f}': train_data["train_loss"],
        '"training_acc": {:f}': train_data["train_acc"],
        '"test_loss": {:f}': test_data["test_loss"],
        '"test_acc": {:f}': test_data["test_acc"],
        '"nparams": {:d}': n_params,
        '"ths": {:s}': str(ths),
        '"perfs": {:s}': str(perfs),
        '"perfs_ft": {:s}': str(perfs_ft),
    }
    with open("results.txt", "a") as file_output:
        file_output.write("results.append({")
        for key in results.keys():
            file_output.write(key.format(results[key]) + ", ")
        file_output.write("})\n")
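# The fine-tuning loop above passes `masks` into `train` so that pruned weights
# stay at zero. That `train` function is not shown; a minimal sketch of how such
# a masked update step could look (an assumption, not the actual code):

import torch

def masked_train_step_sketch(model, batch, target, optimizer, criterion, masks):
    """One optimizer step that re-applies pruning masks after the update."""
    optimizer.zero_grad()
    loss = criterion(model(batch), target)
    loss.backward()
    optimizer.step()
    # re-zero pruned weights so momentum and weight updates cannot revive them
    with torch.no_grad():
        for parameters, mask in zip(model.parameters(), masks):
            parameters.data.mul_(mask)
    return loss.item()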
parser.add_argument("-temperature", "--temperature", type=float, default=10.0) parser.add_argument("-distil-weight", "--distil-weight", type=float, default=10.0) args = parser.parse_args() print(args) args.no_shuffle = True device = torch.device("cuda" if torch.cuda.is_available() else "cpu") torch.manual_seed(args.seed) train_loader, test_loader, metadata = get_data_loaders(args) teacher_model = ResNet.fromFile(args.teacher) teacher_model.eval() model = get_model(args, metadata) print("N parameters : ", model.n_parameters) if args.resume is not None: model.load_state_dict(torch.load(args.resume)["state_dict"]) teacher_model = teacher_model.to(device) model = model.to(device) scheduler = None if args.optimizer == "sgd":
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='SWD')

    # Mandatory Arguments
    parser.add_argument('--dataset', type=str, default="cifar10",
                        help="The dataset to consider")
    parser.add_argument('--lr', type=float, default=.1, metavar='LR',
                        help='learning rate (default: .1) (negative -> Adam)')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                        help='input batch size for training (default: 128)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=300, metavar='N',
                        help='number of epochs to train')
    parser.add_argument('--model', type=str, default="resnet18",
                        choices=list(dict_models.keys()), help='model to train')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=random.randint(0, 1000000),
                        help='random seed to initialize.')
    parser.add_argument('--cutout', action='store_true', help='perform cutout')
    parser.add_argument('--mixup', action='store_true', help='perform mixup')
    parser.add_argument('--cutmix', action='store_true', help='perform cutmix')
    parser.add_argument('--auto-augment', action='store_true',
                        help='perform auto_augment')
    parser.add_argument('--feature-maps', type=int, default=64,
                        help='Total feature_maps')
    parser.add_argument('--wd', default=5e-4, type=float, help='Weight decay')
    parser.add_argument('--temp-init', default=1., type=float,
                        help='Initial importance of binarization')
    parser.add_argument('--temp-final', default=1e2, type=float,
                        help='Final importance of binarization')
    parser.add_argument('-l', default=-1, type=int, help='l')
    parser.add_argument('-c', default=-1, type=int, help='c')
    parser.add_argument('--output', default="results.txt", type=str,
                        help="Output file to write on")
    parser.add_argument('--half', action='store_true', help='Half precision')
    ## ----------------------------------------------------------------------------------------

    args = parser.parse_args()
    if args.c == -1 and args.l == -1:
        raise Exception("Set c or l")
    elif args.c != -1 and args.l != -1:
        raise Exception("Set only one of c or l")

    if not os.path.isdir("result"):
        os.makedirs("result")

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    torch.manual_seed(args.seed)

    train_loader, test_loader, metadata = get_data_loaders(args)

    model = dict_models[args.model](args, num_classes=metadata["n_classes"]).to(device)
    if args.half:
        model.half()  # convert to half precision
        # keep BatchNorm in float32 for numerical stability
        for layer in model.modules():
            if isinstance(layer, torch.nn.BatchNorm2d):
                layer.float()

    n_params = torch.sum(torch.LongTensor(
        [elt.numel() for elt in model.parameters()])).item()
    print(str(n_params) + " parameters maximum with "
          + str(args.feature_maps) + " feature maps")

    if args.lr > 0:
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              momentum=0.9, weight_decay=args.wd)
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, [args.epochs // 3, 2 * args.epochs // 3], gamma=0.1)
    else:
        optimizer = optim.Adam(model.parameters())
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, [2 * args.epochs // 3], gamma=0.1)

    for epoch in range(args.epochs):
        train_data = train(model, epoch, args, device, train_loader, optimizer)
        test_data = test(model, epoch, args, device, test_loader)
        display_progress(epoch, train_data, test_data)
        scheduler.step()

    values = {
        '"model": "{:s}"': args.model,
        '"dataset": "{:s}"': args.dataset,
        '"wd": {:f}': args.wd,
        '"epochs": {:d}': args.epochs,
        '"feature_maps": {:d}': args.feature_maps,
        '"auto_augment": {:b}': args.auto_augment,
        '"cutout": {:b}': args.cutout,
        '"mixup": {:b}': args.mixup,
        '"cutmix": {:b}': args.cutmix,
        '"seed": {:d}': args.seed,
        '"training_loss": {:f}': train_data["train_loss"],
        '"training_acc": {:f}': train_data["train_acc"],
        '"test_loss": {:f}': test_data["test_loss"],
        '"test_acc": {:f}': test_data["test_acc"],
        '"nb_ops": {:f}': params.nb_ops,
        '"nparams": {:d}': n_params,
        '"temp_init": {:f}': args.temp_init,
        '"temp_final": {:f}': args.temp_final,
        '"l": {:f}': args.l,
        '"c": {:f}': args.c,
    }
    with open(args.output, "a") as file_output:
        file_output.write("results.append({")
        for key in values.keys():
            file_output.write(key.format(values[key]) + ", ")
        file_output.write("})\n")

    # Build an output filename from the hyperparameters (skipping the metrics)
    filename = "result/"
    for key, value in sorted(values.items()):
        if ("training_loss" not in key and "training_acc" not in key
                and "test_loss" not in key and "test_acc" not in key
                and "nparams" not in key):
            filename += "{}_".format(value)
    filename = "{}.pt".format(filename[:-1])
    torch.save(model, filename)
os.makedirs(out_dir, exist_ok=True)
logging.basicConfig(filename=(out_dir + "/process_log.log"),
                    level=logging.INFO, format=logfilename)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
logging.info("=" * 20)
logging.info("=" * 20)
logging.info("=" * 20)
logging.info("\nStarting...")
logging.info("Config:")
logging.info(cfg)

train_data_loader, valid_data_loader = get_data_loaders(cfg['train'])
logging.info('Loaded %g training samples', len(train_data_loader.dataset))
if valid_data_loader is not None:
    logging.info('Loaded %g validation samples', len(valid_data_loader.dataset))

# Load the trainer
trainer = GNNTrainer(cfg['trainer'], output_dir=out_dir,
                     device=args_in.device, train_loader=train_data_loader)

# Build the model and optimizer
trainer.build_model(**cfg.get('model', {}))
trainer.print_model_summary()

# Run the training
summary = trainer.train(train_data_loader=train_data_loader,
def main(): """Main function""" # Initialization args = parse_args() rank, n_ranks = init_workers(args.distributed) # Load configuration config = load_config(args.config) data_config = config['data_config'] model_config = config.get('model_config', {}) train_config = config['train_config'] # Prepare output directory output_dir = config.get('output_dir', None) if output_dir is not None: output_dir = os.path.expandvars(output_dir) os.makedirs(output_dir, exist_ok=True) # Setup logging log_file = (os.path.join(output_dir, 'out_%i.log' % rank) if output_dir is not None else None) config_logging(verbose=args.verbose, log_file=log_file) logging.info('Initialized rank %i out of %i', rank, n_ranks) if rank == 0: logging.info('Configuration: %s' % config) # Load the datasets train_data_loader, valid_data_loader = get_data_loaders( distributed=args.distributed, **data_config) # Load the trainer trainer = get_trainer(name=config['trainer'], distributed=args.distributed, rank=rank, output_dir=output_dir, device=args.device) # Build the model trainer.build_model(**model_config) if rank == 0: trainer.print_model_summary() # Run the training summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **train_config) if output_dir is not None: trainer.write_summaries() # Print some conclusions n_train_samples = len(train_data_loader.sampler) logging.info('Finished training') train_time = np.mean(summary['train_time']) logging.info('Train samples %g time %g s rate %g samples/s', n_train_samples, train_time, n_train_samples / train_time) if valid_data_loader is not None: n_valid_samples = len(valid_data_loader.sampler) valid_time = np.mean(summary['valid_time']) logging.info('Valid samples %g time %g s rate %g samples/s', n_valid_samples, valid_time, n_valid_samples / valid_time) # Drop to IPython interactive shell if args.interactive and rank == 0: logging.info('Starting IPython interactive session') import IPython IPython.embed() logging.info('All done!')
def distributed_node(process_id, args):
    print('===>>> Process', process_id, 'in Node', args.node_rank)
    total_time = TimeLapse()
    total_time.start()

    args.process_id = process_id
    loggers = initialise_loggers(args)
    set_training_environment(args)
    train_loader, val_loader, test_loader = get_data_loaders(args)

    checkpoint = recover_saved_session(args)
    model, optimizer, criterion, saved_loggers = build_model(args, checkpoint)
    if saved_loggers is not None:
        loggers = saved_loggers
    else:
        get_model_metrics(model, args, loggers)

    # TODO: this is needed to add new fields to partially trained runs, because
    # saved_loggers drops keys that were not saved. It could perhaps be solved
    # automatically with a function that, after updating, compares a fresh empty
    # loggers against the existing one and adds the missing keys (or deletes
    # no-longer-used ones).
    if 'epoch_number' not in loggers:
        loggers['epoch_number'] = ListMeter()
    # print_loggers(loggers)

    print('===>>> Model has been trained for {} of the {} required epochs'.format(
        args.start_epoch, args.epochs))
    for epoch in range(args.start_epoch, args.epochs):
        # args.epoch = epoch
        loggers['last_epoch'] = epoch
        adjust_lr(optimizer, args, loggers)
        train(model, train_loader, optimizer, criterion, args, loggers)
        evaluate(model, val_loader, criterion, args, loggers)
        if test_loader is not None:
            evaluate(model, test_loader, criterion, args, loggers, validation=False)

        # Save checkpoint only in the master process
        if args.node_rank == 0 and args.process_id == 0:
            is_best = (loggers['epoch_val_performance'].val
                       > loggers['best_val_performance'])
            if is_best:
                loggers['best_val_performance'] = loggers['epoch_val_performance'].val
                loggers['train_best_val_performance'] = loggers['epoch_train_performance'].val
                if test_loader is not None:
                    loggers['test_best_val_performance'] = loggers['epoch_test_performance'].val
                loggers['epoch_best_val_performance'] = loggers['last_epoch']
            save_checkpoint({
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'amp': amp.state_dict(),
                'loggers': loggers,
            }, is_best, args)
            save_loggers(loggers, args)
            # print_loggers(loggers)

    total_time.stop()
    # TODO: figure out how to recover the elapsed time when a run dies before
    # reaching the final epoch, where the total time is computed.
    loggers['total_time'] = loggers['total_time'] + total_time.time()
    save_loggers(loggers, args)
    save_summary(loggers, args)
    # print('This session time:', total_time.time())
    print('Total time for', args.epochs, 'epochs:', loggers['total_time'])
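# `save_checkpoint` above is not shown. A common pattern (and only a guess at
# this codebase's version) saves the latest state and keeps a separate copy
# whenever it is the best so far:

import shutil
import torch

def save_checkpoint_sketch(state, is_best, path='checkpoint.pth.tar'):
    """Save the latest checkpoint; copy it aside when it is the best one."""
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, 'model_best.pth.tar')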
def main(): """Main function""" # Initialization args = parse_args() rank, n_ranks = init_workers(args.distributed_backend) # Load configuration config = load_config(args) # Prepare output directory output_dir = config.get('output_dir', None) if output_dir is not None: output_dir = os.path.expandvars(output_dir) os.makedirs(output_dir, exist_ok=True) # Setup logging log_file = (os.path.join(output_dir, 'out_%i.log' % rank) if output_dir is not None else None) config_logging(verbose=args.verbose, log_file=log_file, append=args.resume) logging.info('Initialized rank %i out of %i', rank, n_ranks) try_barrier() if rank == 0: logging.info('Configuration: %s' % config) # Load the datasets distributed = args.distributed_backend is not None train_data_loader, valid_data_loader = get_data_loaders( distributed=distributed, **config['data']) # Load the trainer gpu = (rank % args.ranks_per_node) if args.rank_gpu else args.gpu if gpu is not None: logging.info('Using GPU %i', gpu) trainer = get_trainer(name=config['trainer'], distributed=distributed, rank=rank, output_dir=output_dir, gpu=gpu) # Build the model and optimizer trainer.build(config) # Resume from checkpoint if args.resume: trainer.load_checkpoint() # Run the training summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **config['train']) # Print some conclusions try_barrier() n_train_samples = len(train_data_loader.sampler) logging.info('Finished training') train_time = np.mean(summary['train_time']) logging.info('Train samples %g time %g s rate %g samples/s', n_train_samples, train_time, n_train_samples / train_time) if valid_data_loader is not None: n_valid_samples = len(valid_data_loader.sampler) valid_time = np.mean(summary['valid_time']) logging.info('Valid samples %g time %g s rate %g samples/s', n_valid_samples, valid_time, n_valid_samples / valid_time) logging.info('All done!')
def main(): """Main function""" # Parse the command line args = parse_args() # Initialize MPI rank, n_ranks = init_workers(args.distributed) # Load configuration config = load_config(args.config) output_dir = os.path.expandvars(config.get('output_dir', None)) if rank == 0: os.makedirs(output_dir, exist_ok=True) else: output_dir = None # Setup logging config_logging(verbose=args.verbose, output_dir=output_dir) logging.info('Initialized rank %i out of %i', rank, n_ranks) if args.show_config and (rank == 0): logging.info('Command line config: %s' % args) if rank == 0: logging.info('Configuration: %s', config) logging.info('Saving job outputs to %s', output_dir) # Load the datasets train_data_loader, valid_data_loader = get_data_loaders( distributed=args.distributed, **config['data']) logging.info('Loaded %g training samples', len(train_data_loader.dataset)) if valid_data_loader is not None: logging.info('Loaded %g validation samples', len(valid_data_loader.dataset)) # Load the trainer trainer = get_trainer(distributed=args.distributed, output_dir=output_dir, device=args.device, **config['trainer']) # Build the model and optimizer trainer.build_model(**config.get('model', {})) if rank == 0: trainer.print_model_summary() # Run the training summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **config['training']) if rank == 0: trainer.write_summaries() # Print some conclusions n_train_samples = len(train_data_loader.sampler) logging.info('Finished training') train_time = np.mean(summary['train_time']) logging.info('Train samples %g time %g s rate %g samples/s', n_train_samples, train_time, n_train_samples / train_time) if valid_data_loader is not None: n_valid_samples = len(valid_data_loader.sampler) valid_time = np.mean(summary['valid_time']) logging.info('Valid samples %g time %g s rate %g samples/s', n_valid_samples, valid_time, n_valid_samples / valid_time) # Drop to IPython interactive shell if args.interactive and (rank == 0): logging.info('Starting IPython interactive session') import IPython IPython.embed() if rank == 0: logging.info('All done!')