def main(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        mp = torch.multiprocessing.get_context('spawn')
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, ), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        for p in procs:
            p.join()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)

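# A minimal, self-contained sketch of the error-queue pattern used above, assuming
# generic workers rather than real training processes. It is NOT OpenNMT's actual
# ErrorHandler; it only illustrates the idea: each spawned child pushes its traceback
# onto a shared queue, and a listener thread in the parent reports the first failure.
import threading
import traceback
import torch.multiprocessing as torch_mp


def _worker(rank, error_queue):
    try:
        # stand-in for single_main(opt, rank); here we simulate a crash
        raise RuntimeError("simulated failure in rank %d" % rank)
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing
    except Exception:
        # propagate exception to parent process, keeping original traceback
        error_queue.put((rank, traceback.format_exc()))


def _listen(error_queue):
    # blocks until some child reports an error, then prints its traceback
    rank, tb = error_queue.get()
    print("child process for rank %d died with:\n%s" % (rank, tb))


if __name__ == "__main__":
    mp = torch_mp.get_context("spawn")
    error_queue = mp.SimpleQueue()
    listener = threading.Thread(target=_listen, args=(error_queue,), daemon=True)
    listener.start()
    procs = [mp.Process(target=_worker, args=(r, error_queue), daemon=True)
             for r in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    listener.join(timeout=5)
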
def run(opt, device_id, error_queue, batch_queue, semaphore):
    try:
        gpu_rank = onmt.utils.distributed.multi_init(opt, device_id)
        if gpu_rank != opt.gpu_ranks[device_id]:
            raise AssertionError("An error occurred in \
                  Distributed initialization")
        single_main(opt, device_id, batch_queue, semaphore)
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing

def main(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    if opt.gpu > -1:  # case GPU
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)

def main(opt, train_type):
    if opt.rnn_type == "SRU" and not opt.gpuid:
        raise AssertionError("Using SRU requires -gpuid set.")

    if opt.epochs:
        raise AssertionError("-epochs is deprecated please use -train_steps.")

    if len(opt.gpuid) > 1:
        multi_main(opt, train_type)
    else:
        single_main(opt, train_type)

def run(opt, error_queue):
    """ run process """
    try:
        opt.gpu_rank = onmt.utils.distributed.multi_init(opt)
        single_main(opt)
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing
    except Exception:
        # propagate exception to parent process, keeping original traceback
        import traceback
        error_queue.put((opt.gpu_rank, traceback.format_exc()))

def run(opt, device_id, error_queue, batch_queue, semaphore):
    """ run process """
    try:
        gpu_rank = onmt.utils.distributed.multi_init(opt, device_id)
        if gpu_rank != opt.gpu_ranks[device_id]:
            raise AssertionError("An error occurred in \
                  Distributed initialization")
        single_main(opt, device_id, batch_queue, semaphore)
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing
    except Exception:
        # propagate exception to parent process, keeping original traceback
        import traceback
        error_queue.put((opt.gpu_ranks[device_id], traceback.format_exc()))

def main(opt):
    if opt.rnn_type == "SRU" and not opt.gpuid:
        raise AssertionError("Using SRU requires -gpuid set.")

    if opt.epochs:
        raise AssertionError("-epochs is deprecated please use -train_steps.")

    if opt.truncated_decoder > 0 and opt.accum_count > 1:
        raise AssertionError("BPTT is not compatible with -accum > 1")

    if len(opt.gpuid) > 1:
        multi_main(opt)
    else:
        single_main(opt)

def main(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        data = torch.load(opt.data)

    single_main(opt, opt.gpu, data)

def main():
    opt = parse_args()

    print("[train.py] opt.model_mode: {}, opt.model_mode2: {}, opt.model_ffn_mode: {}"
          .format(opt.model_mode, opt.model_mode2, opt.model_ffn_mode))

    if opt.rnn_type == "SRU" and not opt.gpu_ranks:
        raise AssertionError("Using SRU requires -gpu_ranks set.")

    if opt.epochs:
        raise AssertionError("-epochs is deprecated please use -train_steps.")

    if opt.truncated_decoder > 0 and opt.accum_count > 1:
        raise AssertionError("BPTT is not compatible with -accum > 1")

    if opt.gpuid:
        raise AssertionError("gpuid is deprecated \
              see world_size and gpu_ranks")

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        mp = torch.multiprocessing.get_context('spawn')
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, ), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        for p in procs:
            p.join()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)

def main(opt): if opt.rnn_type == "SRU" and not opt.gpu_ranks: raise AssertionError("Using SRU requires -gpu_ranks set.") if opt.epochs: raise AssertionError("-epochs is deprecated please use -train_steps.") if opt.truncated_decoder > 0 and opt.accum_count > 1: raise AssertionError("BPTT is not compatible with -accum > 1") if opt.gpuid: raise AssertionError("gpuid is deprecated \ see world_size and gpu_ranks") # add by wchen # if opt.encoder_type == "hr_brnn": # assert not opt.copy_attn, "currently, no copy is available for hr_brnn encoder" # add by wchen assert opt.save_checkpoint_steps == opt.valid_steps nb_gpu = len(opt.gpu_ranks) if opt.world_size > 1: mp = torch.multiprocessing.get_context('spawn') # Create a thread to listen for errors in the child processes. error_queue = mp.SimpleQueue() error_handler = ErrorHandler(error_queue) # Train with multiprocessing. procs = [] for device_id in range(nb_gpu): procs.append(mp.Process(target=run, args=( opt, device_id, error_queue, ), daemon=True)) procs[device_id].start() logger.info(" Starting process pid: %d " % procs[device_id].pid) error_handler.add_child(procs[device_id].pid) for p in procs: p.join() elif nb_gpu == 1: # case 1 GPU only single_main(opt, 0) else: # case only CPU single_main(opt, -1)
def train(opt):
    ArgumentParser.validate_train_opts(opt)
    set_random_seed(opt.seed, False)

    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    if old_style_vocab(vocab):
        fields = load_old_vocab(vocab, opt.model_type,
                                dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    patch_fields(opt, fields)

    if len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:
        single_main(opt, 0)
    else:
        single_main(opt, -1)

def main(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)
    init_logger(opt.log_file)

    assert len(opt.accum_count) == len(opt.accum_steps), \
        'Number of accum_count values must match number of accum_steps'

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        # Load the model parameters onto the CPU
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = ArgumentParser.ckpt_model_opts(checkpoint["opt"])
        ArgumentParser.update_model_opts(model_opt)
        ArgumentParser.validate_model_opts(model_opt)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
    else:
        checkpoint = None
        model_opt = opt

    # Build Tokenizer
    ################################

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)

    # Report the number of parameters
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)

    # Check the model save path and create it if it does not exist
    _check_save_model_path(opt)

    # Build optimizer.
    optim = Optimizer.from_opt(model, opt, checkpoint=checkpoint)

    # Build model saver
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(
        opt, device_id, model, fields, optim, model_saver=model_saver)

    train_iterables = []
    if len(opt.data_ids) > 1:
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            iterable = build_dataset_iter(shard_base, fields, opt, multi=True)
            train_iterables.append(iterable)
        train_iter = MultipleDatasetIterator(train_iterables, device_id, opt)
    else:
        train_iter = build_dataset_iter("train", fields, opt)

    valid_iter = build_dataset_iter(
        "valid", fields, opt, is_train=False)

    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')
    train_steps = opt.train_steps
    if opt.single_pass and train_steps > 0:
        logger.warning("Option single_pass is enabled, ignoring train_steps.")
        train_steps = 0

    trainer.train(
        train_iter,
        train_steps,
        save_checkpoint_steps=opt.save_checkpoint_steps,
        valid_iter=valid_iter,
        valid_steps=opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()

    ########################
    if opt.use_gpu:
        if opt.gpus == []:
            pass
        else:
            pass
    else:
        # use cpu
        pass

    single_main(opt, 0)

def train(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    if opt.train_from != '':
        raise Exception('train_from will be set automatically to the latest model, '
                        'you should not set it manually')

    # set gpu ranks automatically if not specified
    if len(opt.gpu_ranks) == 0:
        opt.gpu_ranks = list(range(opt.world_size))

    # Set train_from to latest checkpoint if it exists
    file_list = glob.glob(opt.save_model + '*.pt')
    if len(os.listdir(os.path.dirname(opt.save_model))) > 0 and len(file_list) == 0:
        raise Exception('save_model directory is not empty but no pretrained models found')
    if len(file_list) > 0:
        ckpt_nos = list(map(lambda x: int(x.split('_')[-1].split('.')[0]), file_list))
        ckpt_no = max(ckpt_nos)
        opt.train_from = opt.save_model + '_' + str(ckpt_no) + '.pt'
        print(opt.train_from)
        assert os.path.exists(opt.train_from)

    set_random_seed(opt.seed, False)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(vocab, opt.model_type,
                                dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    if len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)

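# A stripped-down sketch of the queue/semaphore hand-off used by the multi-GPU
# variants above. It is NOT OpenNMT's actual batch_producer; integers stand in for
# batches and all names here are illustrative. The semaphore caps how many batches
# may be in flight (world_size * queue_size), the producer round-robins batches
# across per-GPU queues, and each consumer releases the semaphore after taking one.
import itertools
import torch.multiprocessing as torch_mp


def batch_producer_sketch(batch_iter, queues, semaphore):
    for batch, q in zip(batch_iter, itertools.cycle(queues)):
        semaphore.acquire()   # blocks once the in-flight limit is reached
        q.put(batch)


def consumer_sketch(device_id, q, semaphore, n_batches):
    for _ in range(n_batches):
        batch = q.get()       # in real code this batch would be fed to the trainer
        semaphore.release()   # let the producer enqueue another batch
        print("device %d got batch %s" % (device_id, batch))


if __name__ == "__main__":
    mp = torch_mp.get_context("spawn")
    world_size, queue_size, n_batches = 2, 4, 8
    semaphore = mp.Semaphore(world_size * queue_size)
    queues = [mp.Queue(queue_size) for _ in range(world_size)]
    consumers = [mp.Process(target=consumer_sketch,
                            args=(d, queues[d], semaphore, n_batches // world_size),
                            daemon=True)
                 for d in range(world_size)]
    for p in consumers:
        p.start()
    producer = mp.Process(target=batch_producer_sketch,
                          args=(iter(range(n_batches)), queues, semaphore),
                          daemon=True)
    producer.start()
    for p in consumers:
        p.join()
    producer.terminate()
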
def train(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)
    set_random_seed(opt.seed, False)

    # @Memray, check the dir existence beforehand to avoid path conflicting errors,
    # and set save_model, tensorboard_log_dir, wandb_log_dir if not exist
    train_single._check_save_model_path(opt)
    if not os.path.exists(opt.tensorboard_log_dir):
        os.makedirs(opt.tensorboard_log_dir)

    # Scan previous checkpoint to resume training
    latest_step = 0
    latest_ckpt = None
    for subdir, dirs, filenames in os.walk(opt.exp_dir):
        for filename in sorted(filenames):
            if not filename.endswith('.pt'):
                continue
            step = int(filename[filename.rfind('_') + 1: filename.rfind('.pt')])
            if step > latest_step:
                latest_ckpt = os.path.join(subdir, filename)
                latest_step = step
    # if not saved in the exp folder, check opt.save_model
    if latest_ckpt is None and opt.save_model is not None:
        save_model_dir = os.path.dirname(os.path.abspath(opt.save_model))
        model_prefix = opt.save_model[opt.save_model.rfind(os.path.sep) + 1:]
        for subdir, dirs, filenames in os.walk(save_model_dir):
            for filename in sorted(filenames):
                if not filename.endswith('.pt'):
                    continue
                if not filename.startswith(model_prefix):
                    continue
                step = int(filename[filename.rfind('_') + 1: filename.rfind('.pt')])
                if step > latest_step:
                    latest_ckpt = os.path.join(subdir, filename)
                    latest_step = step
    if latest_ckpt is not None:
        logger.info("A previous checkpoint is found, train from it: %s" % latest_ckpt)
        setattr(opt, 'train_from', latest_ckpt)
        setattr(opt, 'reset_optim', 'none')

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    elif opt.vocab and opt.vocab != 'none':
        # added by @memray for multiple datasets
        vocab = torch.load(opt.vocab)
        # check for code where vocab is saved instead of fields
        # (in the future this will be done in a smarter way)
        if old_style_vocab(vocab):
            vocab = load_old_vocab(vocab, opt.model_type,
                                   dynamic_dict=opt.copy_attn)
    elif opt.encoder_type == 'pretrained':
        vocab = None
    else:
        vocab = None

    fields = vocab

    # @memray: a temporary workaround, as well as train_single.py line 78
    if fields and opt.data_type == "keyphrase":
        if opt.tgt_type in ["one2one", "multiple"]:
            if 'sep_indices' in fields:
                del fields['sep_indices']
        else:
            if 'sep_indices' not in fields:
                sep_indices = Field(use_vocab=False, dtype=torch.long,
                                    postprocessing=make_tgt, sequential=False)
                fields["sep_indices"] = sep_indices
        if 'src_ex_vocab' not in fields:
            src_ex_vocab = RawField()
            fields["src_ex_vocab"] = src_ex_vocab

    # @memray reload fields for news dataset and pretrained models
    tokenizer = None
    if opt.pretrained_tokenizer is not None:
        tokenizer = load_pretrained_tokenizer(opt.pretrained_tokenizer,
                                              opt.cache_dir,
                                              opt.special_vocab_path)
        setattr(opt, 'vocab_size', len(tokenizer))
    if opt.data_type == 'news':
        fields = reload_news_fields(opt, tokenizer=tokenizer)
    # elif opt.data_type == 'keyphrase':
    #     fields = reload_keyphrase_fields(opt, tokenizer=tokenizer)

    if len(opt.data_ids) > 1:
        # added by @memray, for loading multiple datasets
        if opt.multi_dataset:
            shard_base = "train"
            train_iter = build_dataset_iter(shard_base, fields, opt, multi=True)
        else:
            train_shards = []
            for train_id in opt.data_ids:
                shard_base = "train_" + train_id
                train_shards.append(shard_base)
            train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)

def main(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    # Load checkpoint if we resume from a previous training.
    aux_vocab = None
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
        if opt.crosslingual:
            aux_vocab = checkpoint['aux_vocab']
    elif opt.crosslingual:
        assert opt.crosslingual in ['old', 'lm']
        vocab = torch.load(opt.data + '.vocab.pt')
        aux_vocab = torch.load(opt.aux_train_data + '.vocab.pt')
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    def get_fields(vocab):
        if old_style_vocab(vocab):
            return load_old_vocab(vocab, opt.model_type,
                                  dynamic_dict=opt.copy_attn)
        else:
            return vocab

    fields = get_fields(vocab)
    aux_fields = None
    if opt.crosslingual:
        aux_fields = get_fields(aux_vocab)

    if opt.crosslingual:
        if opt.crosslingual == 'old':
            aeq(len(opt.eat_formats), 3)
            fields_info = [
                ('train', fields, 'data', Eat2PlainMonoTask, 'base',
                 opt.eat_formats[0]),
                ('train', aux_fields, 'aux_train_data', Eat2PlainAuxMonoTask,
                 'aux', opt.eat_formats[1]),
                ('train', aux_fields, 'aux_train_data', Eat2PlainCrosslingualTask,
                 'crosslingual', opt.eat_formats[2])
            ]
        else:
            aeq(len(opt.eat_formats), 4)
            fields_info = [
                ('train', fields, 'data', Eat2PlainMonoTask, 'base',
                 opt.eat_formats[0]),
                ('train', fields, 'data', EatLMMonoTask, 'lm',
                 opt.eat_formats[1]),
                ('train', aux_fields, 'aux_train_data', Eat2PlainAuxMonoTask,
                 'aux', opt.eat_formats[2]),
                ('train', aux_fields, 'aux_train_data', EatLMCrosslingualTask,
                 'crosslingual', opt.eat_formats[3])
            ]
        train_iter = build_crosslingual_dataset_iter(fields_info, opt)
    elif len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()
    else:
        device_id = 0 if nb_gpu == 1 else -1
        # NOTE Only pass train_iter in my crosslingual mode.
        train_iter = train_iter if opt.crosslingual else None
        passed_fields = {'main': fields, 'crosslingual': aux_fields} \
            if opt.crosslingual else None
        single_main(opt, device_id, train_iter=train_iter,
                    passed_fields=passed_fields)

def train(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)
    set_random_seed(opt.seed, False)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        if 'vocab' in checkpoint:
            logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
            vocab = checkpoint['vocab']
        else:
            vocab = torch.load(opt.data + '.vocab.pt')
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(vocab, opt.model_type,
                                dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    if len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)

    transformer_ff=1024,
    position_encoding=True,
    max_generator_batches=2,
    train_steps=1000000,
    dropout=0.1,
    data="data/USP",
    save_model="models/transformer_model",
    batch_size=4096,
    gpu_ranks=[0],
    valid_steps=1000,
    valid_batch_size=8,
    optim="adam",
    learning_rate=1.,
    save_checkpoint_steps=1000,
    encoder_type="transformer",
    decoder_type="transformer",
    enc_layers=4,
    dec_layers=4,
    enc_rnn_size=256,
    dec_rnn_size=256,
    normalization="tokens",
    batch_type="tokens",
    adam_beta2=0.98,
    adam_beta1=0.9,
    accum_count=4,
    decay_method="noam",
    warmup_steps=8000,
    rnn_size=256,
)

single_main(opt, -1)

def main(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(
            vocab, opt.model_type, dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    # @memray: a temporary workaround, as well as train_single.py line 78
    if opt.model_type == "keyphrase":
        if opt.tgt_type in ["one2one", "multiple"]:
            del fields['sep_indices']
        else:
            if 'sep_indices' not in fields:
                sep_indices = Field(
                    use_vocab=False, dtype=torch.long,
                    postprocessing=make_tgt, sequential=False)
                fields["sep_indices"] = sep_indices
        if 'src_ex_vocab' not in fields:
            src_ex_vocab = RawField()
            fields["src_ex_vocab"] = src_ex_vocab

    if len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)
    print(os.environ['PATH'])

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)