def test_build_triplet():
    """Smoke test: every component of the triplet pipeline can be constructed.

    Only builds the datasets, model and trainer — no training is run
    (see test_build_arcface for the end-to-end variant).
    """
    triplet_cfg = read_yaml("./tests/config/triplet.yaml")
    mnist_cfg = read_yaml("./tests/config/mnist.yaml")
    train_set = get_dataset(triplet_cfg, mnist_cfg, mode="train")
    val_set = get_dataset(triplet_cfg, mnist_cfg, mode="valid")
    net = get_model(triplet_cfg, mnist_cfg)
    runner = get_trainer(triplet_cfg, mnist_cfg)
def main(): """Main function""" # Initialization args = parse_args() rank, n_ranks = init_workers(args.distributed_backend) # Load configuration config = load_config(args.config) # Prepare output directory output_dir = os.path.expandvars(args.output_dir if args.output_dir is not None else config['output_dir']) os.makedirs(output_dir, exist_ok=True) # Setup logging log_file = os.path.join(output_dir, 'out_%i.log' % rank) config_logging(verbose=args.verbose, log_file=log_file) logging.info('Initialized rank %i out of %i', rank, n_ranks) if rank == 0: logging.info('Configuration: %s' % config) # Load the datasets is_distributed = args.distributed_backend is not None train_data_loader, valid_data_loader = get_data_loaders( distributed=is_distributed, **config['data_config']) # Load the trainer gpu = (rank % args.ranks_per_node) if args.rank_gpu else args.gpu if gpu is not None: logging.info('Using GPU %i', gpu) trainer = get_trainer(name=config['trainer'], distributed=is_distributed, rank=rank, output_dir=output_dir, gpu=gpu) # Build the model trainer.build_model(**config['model_config']) if rank == 0: trainer.print_model_summary() # Run the training summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **config['train_config']) trainer.write_summaries() # Print some conclusions logging.info('Finished training') logging.info('Train samples %g time %g s rate %g samples/s', np.mean(summary['train_samples']), np.mean(summary['train_time']), np.mean(summary['train_rate'])) if valid_data_loader is not None: logging.info('Valid samples %g time %g s rate %g samples/s', np.mean(summary['valid_samples']), np.mean(summary['valid_time']), np.mean(summary['valid_rate'])) logging.info('All done!')
def test_build_arcface():
    """End-to-end smoke test: build the ArcFace pipeline on MNIST and train it."""
    arcface_cfg = read_yaml("./tests/config/arcface.yaml")
    mnist_cfg = read_yaml("./tests/config/mnist.yaml")
    train_set = get_dataset(arcface_cfg, mnist_cfg, mode="train")
    val_set = get_dataset(arcface_cfg, mnist_cfg, mode="valid")
    net = get_model(arcface_cfg, mnist_cfg)
    runner = get_trainer(arcface_cfg, mnist_cfg)
    runner.train(dataset=train_set, valid_dataset=val_set, model=net)
def main(cfg: DictConfig) -> None:
    """Main function.

    Builds model, loads data, trains and (optionally) evaluates, all
    delegated to the trainer constructed from the configuration.

    Args:
        cfg: Config object (OmegaConf DictConfig).

    Returns:
        None.
    """
    runner = get_trainer(cfg)
    runner.execute(eval=cfg.train.eval)
def get_edge_scores(result_dir, n_tasks, task):
    """
    - Takes config info for triplet training dataset (different from doublet
      training dataset),
    - Runs the dataset through the trained doublet network,
    - Returns edge scores with same indices as edge network input

    Args:
        result_dir: Directory holding the trained model's config and summaries.
        n_tasks: Total number of parallel tasks the dataset is sharded across.
        task: Index of this task's shard.

    Returns:
        Tuple of (test_preds, graph_dataset, graph_names).
    """
    # Load configs
    config = load_config_dir(result_dir)
    logging.info('Inferring triplets on model configuration:')
    logging.info(config)

    # Find the best epoch (lowest validation loss)
    summaries = load_summaries(config)
    best_idx = summaries.valid_loss.idxmin()
    # Fix: removed a no-op statement `summaries.loc[[best_idx]]` whose
    # result was discarded (dead code).

    # Build the trainer and load best checkpoint
    trainer = get_trainer(output_dir=config['output_dir'], gpu=0,
                          **config['trainer'])
    trainer.build_model(optimizer_config=config['optimizer'], **config['model'])
    best_epoch = summaries.epoch.loc[best_idx]
    trainer.load_checkpoint(checkpoint_id=best_epoch)
    logging.info("With weight system:")
    logging.info(trainer.model)
    logging.info("On device:")
    logging.info(trainer.device)

    # Load the test dataset (this task's shard)
    test_loader, filelist = get_seed_data_loader(config, n_tasks, task)

    # Apply the model (targets are not needed here)
    test_preds, _ = trainer.device_predict(test_loader)
    print("Graph prediction complete")

    # Get hit ID data and graph names matching the predictions
    graph_dataset, graph_names = load_triplets(test_loader, filelist)
    return test_preds, graph_dataset, graph_names
def main(config, dset_config):
    """Prepare a fresh run directory, then build, train and save the model.

    Args:
        config: Experiment configuration; its result_dir is redirected to the
            newly created run directory (side effect).
        dset_config: Dataset configuration.
    """
    # Create runs/<n> (auto-incremented) plus a weights/ subdirectory
    run_dir = Path(increment_path(os.path.join(config.result_dir, "runs")))
    (run_dir / "weights").mkdir(parents=True, exist_ok=True)
    config.result_dir = run_dir

    # Persist logs, config and host info into the run directory
    file_log = setup_logger.setFileHandler(
        filename=os.path.join(run_dir, "log.txt"))
    save_yaml(config)
    save_hostname(config)
    started_at = datetime.datetime.now()
    logger.info(f"\n Start: {started_at.strftime('%Y年%m月%d日 %H:%M:%S')}")

    # Build datasets, model and trainer, then run training
    train_set = get_dataset(config, dset_config, mode="train")
    val_set = get_dataset(config, dset_config, mode="valid")
    net = get_model(config, dset_config)
    runner = get_trainer(config, dset_config)
    runner.train(dataset=train_set, valid_dataset=val_set, model=net)
    runner.save()
def main(cfg):
    """Create the log directory, dump the config, and launch training."""
    print(cfg)
    print()

    # setup logdir, writer and logger
    logdir = os.path.join(cfg['root'], cfg['logdir'])
    os.makedirs(logdir, exist_ok=True)
    writer = SummaryWriter(log_dir=logdir)

    # Snapshot the configuration next to the logs as <trainer>.yml
    name = cfg['trainer']
    with open(os.path.join(logdir, name + '.yml'), 'w') as fp:
        yaml.dump(cfg, fp)
    logger = get_logger(logdir)

    trainer_cls = get_trainer(name)
    trainer = trainer_cls(cfg, writer, logger)
    print()

    # start training
    trainer.train()
def main():
    """Main function.

    Full distributed-training driver: seeds RNGs for reproducibility,
    handles an MPI I/O-worker workaround, supports checkpoint resume,
    PBT checkpoints and a Cray AI figure-of-merit printout.
    """
    # Parse the command line
    args = parse_args()
    # Initialize distributed workers
    rank, n_ranks = init_workers(args.distributed)

    # Load configuration
    config = load_config(args.config, output_dir=args.output_dir,
                         n_ranks=n_ranks, crayai=args.crayai)
    config = update_config(config, args)
    os.makedirs(config['output_dir'], exist_ok=True)

    # Setup logging (append when resuming so the old log is kept)
    config_logging(verbose=args.verbose, output_dir=config['output_dir'],
                   append=args.resume, rank=rank)
    logging.info('Initialized rank %i out of %i', rank, n_ranks)
    if args.show_config and (rank == 0):
        logging.info('Command line config: %s' % args)
    if rank == 0:
        logging.info('Configuration: %s', config)
        logging.info('Saving job outputs to %s', config['output_dir'])
    if args.distributed is not None:
        logging.info('Using distributed mode: %s', args.distributed)

    # Reproducible training: fixed seeds, deterministic cuDNN
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Offset so numpy streams differ from torch streams
    np.random.seed(args.seed + 10)

    # Save configuration in the output directory
    if rank == 0:
        save_config(config)

    # Load the datasets
    is_distributed = (args.distributed is not None)
    # Workaround because multi-process I/O not working with MPI backend
    if args.distributed in ['ddp-mpi', 'cray']:
        if rank == 0:
            logging.info('Disabling I/O workers because of MPI issue')
        config['data']['n_workers'] = 0
    train_data_loader, valid_data_loader = get_data_loaders(
        distributed=is_distributed, rank=rank, n_ranks=n_ranks,
        **config['data'])
    logging.info('Loaded %g training samples', len(train_data_loader.dataset))
    if valid_data_loader is not None:
        logging.info('Loaded %g validation samples',
                     len(valid_data_loader.dataset))

    # Load the trainer; --rank-gpu assigns GPUs round-robin within a node
    gpu = (rank % args.ranks_per_node) if args.rank_gpu else args.gpu
    if gpu is not None:
        logging.info('Choosing GPU %s', gpu)
    trainer = get_trainer(distributed_mode=args.distributed,
                          output_dir=config['output_dir'],
                          rank=rank, n_ranks=n_ranks, gpu=gpu,
                          pbt_checkpoint=args.pbt_checkpoint,
                          **config['trainer'])

    # Build the model and optimizer
    model_config = config.get('model', {})
    optimizer_config = config.get('optimizer', {})
    logging.debug("Building model")
    trainer.build_model(optimizer_config=optimizer_config, **model_config)
    if rank == 0:
        trainer.print_model_summary()

    # Checkpoint resume
    if args.resume:
        trainer.load_checkpoint()

    # Run the training
    logging.debug("Training")
    summary = trainer.train(train_data_loader=train_data_loader,
                            valid_data_loader=valid_data_loader,
                            **config['training'])

    # Print some conclusions (summary appears to be a DataFrame-like object
    # with train_time / valid_time columns — verify against the trainer)
    n_train_samples = len(train_data_loader.sampler)
    logging.info('Finished training')
    train_time = summary.train_time.mean()
    logging.info('Train samples %g time %g s rate %g samples/s',
                 n_train_samples, train_time, n_train_samples / train_time)
    if valid_data_loader is not None:
        n_valid_samples = len(valid_data_loader.sampler)
        valid_time = summary.valid_time.mean()
        logging.info('Valid samples %g time %g s rate %g samples/s',
                     n_valid_samples, valid_time, n_valid_samples / valid_time)

    # Drop to IPython interactive shell
    if args.interactive and (rank == 0):
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    if rank == 0:
        if args.crayai:
            # Figure of merit consumed by the Cray AI HPO framework
            print("FoM: %e" % summary['valid_loss'][0])

    logging.info('All done!')
def main():
    """Main function.

    MPI-capable training driver: configures logging, optionally initializes
    an MPI process group, loads the YAML config, builds data loaders and a
    trainer, trains, and logs throughput summaries.
    """
    # Parse the command line
    args = parse_args()

    # Setup logging
    log_format = '%(asctime)s %(levelname)s %(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level, format=log_format)
    logging.info('Initializing')
    if args.show_config:
        logging.info('Command line config: %s' % args)

    # Initialize MPI
    if args.distributed:
        dist.init_process_group(backend='mpi')
        logging.info('MPI rank %i out of %i', dist.get_rank(),
                     dist.get_world_size())

    # Load configuration.
    # Fix: yaml.load without an explicit Loader is deprecated (PyYAML >= 5.1)
    # and unsafe on untrusted input; FullLoader keeps prior behavior for
    # trusted configs while refusing arbitrary object construction.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if not args.distributed or (dist.get_rank() == 0):
        logging.info('Configuration: %s' % config)
    data_config = config['data_config']
    model_config = config.get('model_config', {})
    train_config = config['train_config']

    # Load the datasets
    train_data_loader, valid_data_loader = get_data_loaders(
        distributed=args.distributed, **data_config)
    logging.info('Loaded %g training samples', len(train_data_loader.dataset))
    if valid_data_loader is not None:
        logging.info('Loaded %g validation samples',
                     len(valid_data_loader.dataset))
    print('train_data')
    print(train_data_loader.dataset)
    print('valid_data')
    # Fix: the original printed valid_data_loader.dataset unconditionally,
    # raising AttributeError when no validation loader is configured.
    print(valid_data_loader.dataset if valid_data_loader is not None else None)

    # Load the trainer; only rank 0 writes outputs in distributed mode
    experiment_config = config['experiment_config']
    output_dir = experiment_config.pop('output_dir', None)
    if args.distributed and dist.get_rank() != 0:
        output_dir = None
    trainer = get_trainer(distributed=args.distributed,
                          output_dir=output_dir, device=args.device,
                          **experiment_config)

    # Build the model
    trainer.build_model(**model_config)
    if not args.distributed or (dist.get_rank() == 0):
        trainer.print_model_summary()
        print('model')
        print(trainer)

    # Run the training
    summary = trainer.train(train_data_loader=train_data_loader,
                            valid_data_loader=valid_data_loader,
                            **train_config)
    if not args.distributed or (dist.get_rank() == 0):
        trainer.write_summaries()
        print('summary')
        print(summary)

    # Print some conclusions
    n_train_samples = len(train_data_loader.sampler)
    logging.info('Finished training')
    train_time = np.mean(summary['train_time'])
    logging.info('Train samples %g time %gs rate %g samples/s',
                 n_train_samples, train_time, n_train_samples / train_time)
    if valid_data_loader is not None:
        n_valid_samples = len(valid_data_loader.sampler)
        valid_time = np.mean(summary['valid_time'])
        logging.info('Valid samples %g time %g s rate %g samples/s',
                     n_valid_samples, valid_time, n_valid_samples / valid_time)

    # Drop to IPython interactive shell
    if args.interactive:
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    logging.info('All done!')
valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, cores=args.cores) logging.info('Loaded %g validation samples', len(valid_dataset)) else: valid_data_loader = None if test_dataset.__len__() > 0: test_data_loader = DataLoader(test_dataset, batch_size=batch_size, cores=args.cores) logging.info('Loaded %g test samples', len(test_dataset)) else: test_data_loader = None # Load the trainer trainer = get_trainer(output_dir=output_dir, **experiment_config) # Build the model trainer.build_model(**model_config) trainer.print_model_summary() # Run the training summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, test_data_loader=test_data_loader, **train_config) trainer.write_summary(kfold_i=kfold_i) # Print some conclusions tf.keras.backend.clear_session() logging.info('All done!')
TestingDataSets) ################################################################################################### # Step 5: Setting up the neural network ################################################################################################### import torch.distributed as dist from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler # Locals from datasets import get_data_loaders from trainers import get_trainer trainer = get_trainer(distributed=args.distributed, output_dir=output_dir, device=args.device, **experiment_config) # Build the model trainer.build_model(**model_config) if not args.distributed or (dist.get_rank() == 0): trainer.print_model_summary() ################################################################################################### # Step 6: Training and evaluating the network ################################################################################################### summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **train_config) if not args.distributed or (dist.get_rank() == 0):
def main(): """Main function""" # Initialization args = parse_args() rank, n_ranks = init_workers(args.distributed) # Load configuration config = load_config(args.config) data_config = config['data_config'] model_config = config.get('model_config', {}) train_config = config['train_config'] # Prepare output directory output_dir = config.get('output_dir', None) if output_dir is not None: output_dir = os.path.expandvars(output_dir) os.makedirs(output_dir, exist_ok=True) # Setup logging log_file = (os.path.join(output_dir, 'out_%i.log' % rank) if output_dir is not None else None) config_logging(verbose=args.verbose, log_file=log_file) logging.info('Initialized rank %i out of %i', rank, n_ranks) if rank == 0: logging.info('Configuration: %s' % config) # Load the datasets train_data_loader, valid_data_loader = get_data_loaders( distributed=args.distributed, **data_config) # Load the trainer trainer = get_trainer(name=config['trainer'], distributed=args.distributed, rank=rank, output_dir=output_dir, device=args.device) # Build the model trainer.build_model(**model_config) if rank == 0: trainer.print_model_summary() # Run the training summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **train_config) if output_dir is not None: trainer.write_summaries() # Print some conclusions n_train_samples = len(train_data_loader.sampler) logging.info('Finished training') train_time = np.mean(summary['train_time']) logging.info('Train samples %g time %g s rate %g samples/s', n_train_samples, train_time, n_train_samples / train_time) if valid_data_loader is not None: n_valid_samples = len(valid_data_loader.sampler) valid_time = np.mean(summary['valid_time']) logging.info('Valid samples %g time %g s rate %g samples/s', n_valid_samples, valid_time, n_valid_samples / valid_time) # Drop to IPython interactive shell if args.interactive and rank == 0: logging.info('Starting IPython interactive session') import IPython IPython.embed() 
logging.info('All done!')
def main(): """Main function""" # Initialization args = parse_args() rank, n_ranks = init_workers(args.distributed_backend) # Load configuration config = load_config(args) # Prepare output directory output_dir = config.get('output_dir', None) if output_dir is not None: output_dir = os.path.expandvars(output_dir) os.makedirs(output_dir, exist_ok=True) # Setup logging log_file = (os.path.join(output_dir, 'out_%i.log' % rank) if output_dir is not None else None) config_logging(verbose=args.verbose, log_file=log_file, append=args.resume) logging.info('Initialized rank %i out of %i', rank, n_ranks) try_barrier() if rank == 0: logging.info('Configuration: %s' % config) # Load the datasets distributed = args.distributed_backend is not None train_data_loader, valid_data_loader = get_data_loaders( distributed=distributed, **config['data']) # Load the trainer gpu = (rank % args.ranks_per_node) if args.rank_gpu else args.gpu if gpu is not None: logging.info('Using GPU %i', gpu) trainer = get_trainer(name=config['trainer'], distributed=distributed, rank=rank, output_dir=output_dir, gpu=gpu) # Build the model and optimizer trainer.build(config) # Resume from checkpoint if args.resume: trainer.load_checkpoint() # Run the training summary = trainer.train(train_data_loader=train_data_loader, valid_data_loader=valid_data_loader, **config['train']) # Print some conclusions try_barrier() n_train_samples = len(train_data_loader.sampler) logging.info('Finished training') train_time = np.mean(summary['train_time']) logging.info('Train samples %g time %g s rate %g samples/s', n_train_samples, train_time, n_train_samples / train_time) if valid_data_loader is not None: n_valid_samples = len(valid_data_loader.sampler) valid_time = np.mean(summary['valid_time']) logging.info('Valid samples %g time %g s rate %g samples/s', n_valid_samples, valid_time, n_valid_samples / valid_time) logging.info('All done!')
def main():
    """Main function.

    Training driver where only rank 0 owns the output directory; other ranks
    run with output_dir=None so they don't write files.
    """
    # Parse the command line
    args = parse_args()
    # Initialize MPI
    rank, n_ranks = init_workers(args.distributed)

    # Load configuration.
    # Fix: the original called os.path.expandvars(config.get('output_dir',
    # None)) unconditionally, which raises TypeError when the config has no
    # output_dir. Guard the None case before expanding / creating the dir.
    config = load_config(args.config)
    output_dir = config.get('output_dir', None)
    if output_dir is not None:
        output_dir = os.path.expandvars(output_dir)
    if rank == 0:
        if output_dir is not None:
            os.makedirs(output_dir, exist_ok=True)
    else:
        # Non-zero ranks never write outputs
        output_dir = None

    # Setup logging
    config_logging(verbose=args.verbose, output_dir=output_dir)
    logging.info('Initialized rank %i out of %i', rank, n_ranks)
    if args.show_config and (rank == 0):
        logging.info('Command line config: %s' % args)
    if rank == 0:
        logging.info('Configuration: %s', config)
        logging.info('Saving job outputs to %s', output_dir)

    # Load the datasets
    train_data_loader, valid_data_loader = get_data_loaders(
        distributed=args.distributed, **config['data'])
    logging.info('Loaded %g training samples', len(train_data_loader.dataset))
    if valid_data_loader is not None:
        logging.info('Loaded %g validation samples',
                     len(valid_data_loader.dataset))

    # Load the trainer
    trainer = get_trainer(distributed=args.distributed,
                          output_dir=output_dir, device=args.device,
                          **config['trainer'])

    # Build the model and optimizer
    trainer.build_model(**config.get('model', {}))
    if rank == 0:
        trainer.print_model_summary()

    # Run the training
    summary = trainer.train(train_data_loader=train_data_loader,
                            valid_data_loader=valid_data_loader,
                            **config['training'])
    if rank == 0:
        trainer.write_summaries()

    # Print some conclusions
    n_train_samples = len(train_data_loader.sampler)
    logging.info('Finished training')
    train_time = np.mean(summary['train_time'])
    logging.info('Train samples %g time %g s rate %g samples/s',
                 n_train_samples, train_time, n_train_samples / train_time)
    if valid_data_loader is not None:
        n_valid_samples = len(valid_data_loader.sampler)
        valid_time = np.mean(summary['valid_time'])
        logging.info('Valid samples %g time %g s rate %g samples/s',
                     n_valid_samples, valid_time, n_valid_samples / valid_time)

    # Drop to IPython interactive shell
    if args.interactive and (rank == 0):
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    if rank == 0:
        logging.info('All done!')