def main(args):
    """Train (or evaluate) a seq2seq model from parsed command-line ``args``.

    Sets global seeds, initializes (optional) distributed training, logging,
    datasets, the model and a trainer, then either evaluates a checkpoint
    (``args.evaluate``) or runs the training loop until ``args.epochs``.
    """
    set_global_seeds(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    # a run is distributed when launched with a local rank or world size > 1
    args.distributed = args.local_rank >= 0 or args.world_size > 1
    if args.distributed:
        args.device_ids = args.local_rank
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
    else:
        args.device_ids = literal_eval(args.device_ids)
    # only rank 0 (or a non-distributed run) owns the save dir and real logs
    main_node = not (args.distributed and torch.distributed.get_rank() > 0)

    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':  # fixed: was "is ''" (identity test on a str literal)
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if main_node and not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'), dummy=not main_node)
    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    device = args.device
    if 'cuda' in args.device:
        # resolve the primary gpu from whichever shape device_ids takes
        main_gpu = 0
        if isinstance(args.device_ids, tuple):
            main_gpu = args.device_ids[0]
        elif isinstance(args.device_ids, int):
            main_gpu = args.device_ids
        elif isinstance(args.device_ids, dict):
            main_gpu = args.device_ids.get('input', 0)
        torch.cuda.set_device(main_gpu)
        cudnn.benchmark = True
        device = torch.device(device, main_gpu)

    dataset = getattr(datasets, args.dataset)
    args.data_config = literal_eval(args.data_config)
    args.grad_clip = literal_eval(args.grad_clip)
    train_data = dataset(args.dataset_dir, split='train', **args.data_config)
    val_data = dataset(args.dataset_dir, split='dev', **args.data_config)
    # NOTE(review): relies on tokenizers holding exactly two ordered entries
    # (source, target) -- verify against the dataset implementation
    src_tok, target_tok = train_data.tokenizers.values()

    regime = literal_eval(args.optimization_config)
    model_config = literal_eval(args.model_config)
    model_config.setdefault('encoder', {})
    model_config.setdefault('decoder', {})
    # encoder vocab only applies when the source tokenizer has one
    if hasattr(src_tok, 'vocab_size'):
        model_config['encoder']['vocab_size'] = src_tok.vocab_size
    model_config['decoder']['vocab_size'] = target_tok.vocab_size
    model_config['vocab_size'] = model_config['decoder']['vocab_size']
    args.model_config = model_config

    model = getattr(models, args.model)(**model_config)
    model.to(device)
    batch_first = getattr(model, 'batch_first', False)
    logging.info(model)
    pack_encoder_inputs = getattr(model.encoder, 'pack_inputs', False)

    # define data loaders
    if args.distributed:
        train_sampler = DistributedSampler(train_data)
    else:
        train_sampler = None
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=train_sampler is None,
                                         sampler=train_sampler,
                                         pack=pack_encoder_inputs,
                                         max_length=args.max_length,
                                         fixed_length=args.fixed_length,
                                         num_workers=args.workers,
                                         drop_last=True)
    val_loader = val_data.get_loader(
        batch_size=args.eval_batch_size or args.batch_size,
        batch_first=batch_first,
        shuffle=False,
        pack=pack_encoder_inputs,
        max_length=args.max_length,
        fixed_length=args.fixed_length,
        num_workers=args.workers)

    trainer_options = dict(
        grad_clip=args.grad_clip,
        embedding_grad_clip=args.embedding_grad_clip,
        label_smoothing=args.label_smoothing,
        save_path=save_path,
        save_info={'tokenizers': train_data.tokenizers, 'config': args},
        regime=regime,
        max_tokens=args.max_tokens,
        chunk_batch=args.chunk_batch,
        distributed=args.distributed,
        local_rank=args.local_rank,
        device_ids=args.device_ids,
        device=device,
        dtype=args.dtype,
        print_freq=args.print_freq,
        save_freq=args.save_freq,
        eval_freq=args.eval_freq)
    trainer_options['model'] = model
    trainer = getattr(trainers, args.trainer)(**trainer_options)

    def num_parameters(model):
        # total element count; 0 for a missing sub-module
        return 0 if model is None else sum(
            p.nelement() for p in model.parameters())

    logging.info("\nEncoder - number of parameters: %d",
                 num_parameters(getattr(model, 'encoder', None)))
    logging.info("Decoder - number of parameters: %d",
                 num_parameters(getattr(model, 'decoder', None)))
    logging.info("Total number of parameters: %d\n", num_parameters(model))

    if args.uniform_init is not None:
        # NOTE(review): uniform_(from, to) is called as (init, -init); recent
        # torch requires from <= to, so init is expected non-positive -- verify
        for param in model.parameters():
            param.data.uniform_(args.uniform_init, -args.uniform_init)

    # optionally resume from a checkpoint
    if args.evaluate:
        trainer.load(args.evaluate)
        trainer.evaluate(val_loader)
        return
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    logging.info('training regime: %s\n', regime)
    trainer.epoch = args.start_epoch
    while trainer.epoch < args.epochs:
        # train for one epoch
        trainer.run(train_loader, val_loader)
def main(args):
    """Evaluate a saved checkpoint on the dev split of a dataset."""
    # logs and results go under a timestamped directory in /tmp
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = os.path.join('/tmp', stamp)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    setup_logging(os.path.join(save_path, 'log.txt'))
    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    args.devices = literal_eval(args.devices)
    if 'cuda' in args.type:
        # pick the primary gpu out of whichever form `devices` takes
        spec = args.devices
        if isinstance(spec, tuple):
            gpu = spec[0]
        elif isinstance(spec, int):
            gpu = spec
        elif isinstance(spec, dict):
            gpu = spec.get('input', 0)
        else:
            gpu = 0
        torch.cuda.set_device(gpu)
        cudnn.benchmark = True

    # restore configuration, tokenizers and weights from the checkpoint (cpu)
    checkpoint = torch.load(args.checkpoint,
                            map_location=lambda storage, loc: storage)
    config = checkpoint['config']
    src_tok, target_tok = checkpoint['tokenizers'].values()

    args.data_config = literal_eval(args.data_config)
    dataset = getattr(datasets, args.dataset)
    args.data_config['tokenizers'] = checkpoint['tokenizers']
    val_data = dataset(args.dataset_dir, split='dev', **args.data_config)

    model = getattr(models, config.model)(**config.model_config)
    model.load_state_dict(checkpoint['state_dict'])
    batch_first = getattr(model, 'batch_first', False)
    logging.info(model)

    # define data loaders
    val_loader = val_data.get_loader(batch_size=args.batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     augment=False,
                                     pack=args.pack_encoder_inputs,
                                     max_length=args.max_length,
                                     max_tokens=args.max_tokens,
                                     num_workers=args.workers)

    trainer_options = dict(save_path=save_path,
                           devices=args.devices,
                           print_freq=args.print_freq)
    trainer_options['model'] = model
    trainer = getattr(trainers, args.trainer)(**trainer_options)

    param_count = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", param_count)

    model.type(args.type)
    trainer.evaluate(val_loader)
def main(args):
    """Load a checkpoint and run evaluation over the dev split."""
    run_dir = os.path.join(
        '/tmp', datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    setup_logging(os.path.join(run_dir, 'log.txt'))
    logging.info("saving to %s", run_dir)
    logging.debug("run arguments: %s", args)

    args.devices = literal_eval(args.devices)
    if 'cuda' in args.type:
        # `devices` may be a dict, a tuple of ids, or a single id
        devices = args.devices
        main_gpu = 0
        if isinstance(devices, dict):
            main_gpu = devices.get('input', 0)
        elif isinstance(devices, tuple):
            main_gpu = devices[0]
        elif isinstance(devices, int):
            main_gpu = devices
        torch.cuda.set_device(main_gpu)
        cudnn.benchmark = True

    # checkpoint carries config, tokenizers and weights; load onto cpu first
    ckpt = torch.load(args.checkpoint,
                      map_location=lambda storage, loc: storage)
    config = ckpt['config']
    src_tok, target_tok = ckpt['tokenizers'].values()

    args.data_config = literal_eval(args.data_config)
    dataset_cls = getattr(datasets, args.dataset)
    args.data_config['tokenizers'] = ckpt['tokenizers']
    val_data = dataset_cls(args.dataset_dir, split='dev', **args.data_config)

    model = getattr(models, config.model)(**config.model_config)
    model.load_state_dict(ckpt['state_dict'])
    batch_first = getattr(model, 'batch_first', False)
    logging.info(model)

    # dev-set loader: no shuffling or augmentation while evaluating
    val_loader = val_data.get_loader(batch_size=args.batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     augment=False,
                                     pack=args.pack_encoder_inputs,
                                     max_length=args.max_length,
                                     max_tokens=args.max_tokens,
                                     num_workers=args.workers)

    trainer = getattr(trainers, args.trainer)(model=model,
                                              save_path=run_dir,
                                              devices=args.devices,
                                              print_freq=args.print_freq)

    logging.info("number of parameters: %d",
                 sum(p.nelement() for p in model.parameters()))

    model.type(args.type)
    trainer.evaluate(val_loader)
def main(args):
    """Train a seq2seq model (or load a checkpoint to evaluate) per ``args``.

    Builds the save directory, logging, datasets, model and trainer, then
    runs the training loop from ``args.start_epoch`` to ``args.epochs``.
    """
    # set up save path
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':  # fixed: was "is ''" (identity test on a str literal)
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # set up logging
    setup_logging(os.path.join(save_path, 'log_%s.txt' % time_stamp))
    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    # set up cuda
    args.devices = literal_eval(args.devices)
    if 'cuda' in args.type:
        # resolve the primary gpu from whichever shape `devices` takes
        main_gpu = 0
        if isinstance(args.devices, tuple):
            main_gpu = args.devices[0]
        elif isinstance(args.devices, int):
            main_gpu = args.devices
        elif isinstance(args.devices, dict):
            main_gpu = args.devices.get('input', 0)
        torch.cuda.set_device(main_gpu)
        cudnn.benchmark = True

    # set dataset
    dataset = getattr(datasets, args.dataset)
    args.data_config = literal_eval(args.data_config)
    train_data = dataset(args.dataset_dir, split='train', **args.data_config)
    val_data = dataset(args.dataset_dir, split='dev', **args.data_config)
    # NOTE(review): relies on tokenizers holding exactly two ordered entries
    # (source, target) -- verify against the dataset implementation
    src_tok, target_tok = train_data.tokenizers.values()

    regime = literal_eval(args.optimization_config)
    model_config = literal_eval(args.model_config)
    model_config.setdefault('encoder', {})
    model_config.setdefault('decoder', {})
    # encoder vocab only applies when the source tokenizer has one
    if hasattr(src_tok, 'vocab_size'):
        model_config['encoder']['vocab_size'] = src_tok.vocab_size
    model_config['decoder']['vocab_size'] = target_tok.vocab_size
    model_config['vocab_size'] = model_config['decoder']['vocab_size']
    args.model_config = model_config

    model = getattr(models, args.model)(**model_config)
    batch_first = getattr(model, 'batch_first', False)
    logging.info(model)

    # define data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         pack=args.pack_encoder_inputs,
                                         max_length=args.max_length,
                                         num_workers=args.workers)
    val_loader = val_data.get_loader(batch_size=args.batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     pack=args.pack_encoder_inputs,
                                     max_length=args.max_length,
                                     num_workers=args.workers)

    trainer_options = dict(
        grad_clip=args.grad_clip,
        embedding_grad_clip=args.embedding_grad_clip,
        save_path=save_path,
        save_info={'tokenizers': train_data.tokenizers, 'config': args},
        regime=regime,
        devices=args.devices,
        print_freq=args.print_freq,
        save_freq=args.save_freq,
        eval_freq=args.eval_freq)
    trainer_options['model'] = model
    trainer = getattr(trainers, args.trainer)(**trainer_options)

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)
    model.type(args.type)

    if args.uniform_init is not None:
        # NOTE(review): uniform_(from, to) is called as (init, -init); recent
        # torch requires from <= to, so init is expected non-positive -- verify
        for param in model.parameters():
            param.data.uniform_(args.uniform_init, -args.uniform_init)

    # optionally resume from a checkpoint
    if args.evaluate:
        trainer.load(args.evaluate)
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            # NOTE(review): `results` is not defined in this function --
            # presumably a module-level results table; verify it exists
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    logging.info('training regime: %s', regime)
    trainer.epoch = args.start_epoch
    while trainer.epoch < args.epochs:
        # train for one epoch
        trainer.run(train_loader, val_loader)
def main(args):
    """Train a seq2seq model (or evaluate a checkpoint) per parsed ``args``.

    Sets up the save directory, logging, device, datasets, model and trainer,
    then evaluates (``args.evaluate``) or trains until ``args.epochs``.
    """
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':  # fixed: was "is ''" (identity test on a str literal)
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    args.device_ids = literal_eval(args.device_ids)
    device = args.device
    if 'cuda' in args.device:
        # resolve the primary gpu from whichever shape device_ids takes
        main_gpu = 0
        if isinstance(args.device_ids, tuple):
            main_gpu = args.device_ids[0]
        elif isinstance(args.device_ids, int):
            main_gpu = args.device_ids
        elif isinstance(args.device_ids, dict):
            main_gpu = args.device_ids.get('input', 0)
        torch.cuda.set_device(main_gpu)
        cudnn.benchmark = True
        device = torch.device(device, main_gpu)

    dataset = getattr(datasets, args.dataset)
    args.data_config = literal_eval(args.data_config)
    args.grad_clip = literal_eval(args.grad_clip)
    train_data = dataset(args.dataset_dir, split='train', **args.data_config)
    val_data = dataset(args.dataset_dir, split='dev', **args.data_config)
    # NOTE(review): relies on tokenizers holding exactly two ordered entries
    # (source, target) -- verify against the dataset implementation
    src_tok, target_tok = train_data.tokenizers.values()

    regime = literal_eval(args.optimization_config)
    model_config = literal_eval(args.model_config)
    model_config.setdefault('encoder', {})
    model_config.setdefault('decoder', {})
    # encoder vocab only applies when the source tokenizer has one
    if hasattr(src_tok, 'vocab_size'):
        model_config['encoder']['vocab_size'] = src_tok.vocab_size
    model_config['decoder']['vocab_size'] = target_tok.vocab_size
    model_config['vocab_size'] = model_config['decoder']['vocab_size']
    args.model_config = model_config

    model = getattr(models, args.model)(**model_config)
    model.to(device)
    batch_first = getattr(model, 'batch_first', False)
    logging.info(model)
    pack_encoder_inputs = getattr(model.encoder, 'pack_inputs', False)

    # define data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         augment=True,
                                         pack=pack_encoder_inputs,
                                         max_length=args.max_length,
                                         max_tokens=args.max_tokens,
                                         num_workers=args.workers)
    val_loader = val_data.get_loader(batch_size=args.batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     augment=False,
                                     pack=pack_encoder_inputs,
                                     max_length=args.max_length,
                                     max_tokens=args.max_tokens,
                                     num_workers=args.workers)

    trainer_options = dict(
        grad_clip=args.grad_clip,
        embedding_grad_clip=args.embedding_grad_clip,
        label_smoothing=args.label_smoothing,
        save_path=save_path,
        save_info={'tokenizers': train_data.tokenizers, 'config': args},
        regime=regime,
        limit_num_tokens=args.limit_num_tokens,
        device_ids=args.device_ids,
        device=device,
        dtype=args.dtype,
        print_freq=args.print_freq,
        save_freq=args.save_freq,
        eval_freq=args.eval_freq)
    trainer_options['model'] = model
    trainer = getattr(trainers, args.trainer)(**trainer_options)

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    if args.uniform_init is not None:
        # NOTE(review): uniform_(from, to) is called as (init, -init); recent
        # torch requires from <= to, so init is expected non-positive -- verify
        for param in model.parameters():
            param.data.uniform_(args.uniform_init, -args.uniform_init)

    # optionally resume from a checkpoint
    if args.evaluate:
        trainer.load(args.evaluate)
        trainer.evaluate(val_loader)
        return
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    logging.info('training regime: %s', regime)
    trainer.epoch = args.start_epoch
    while trainer.epoch < args.epochs:
        # train for one epoch
        trainer.run(train_loader, val_loader)