def batch_producer(generator_to_serve, queues, semaphore, opt):
    init_logger(opt.log_file)
    set_random_seed(opt.seed, False)
    # generator_to_serve = iter(generator_to_serve)

    def pred(x):
        """
        Filters batches that belong only to gpu_ranks of current node
        """
        for rank in opt.gpu_ranks:
            if x[0] % opt.world_size == rank:
                return True

    generator_to_serve = filter(pred, enumerate(generator_to_serve))

    def next_batch(device_id):
        new_batch = next(generator_to_serve)
        semaphore.acquire()
        return new_batch[1]

    b = next_batch(0)

    for device_id, q in cycle(enumerate(queues)):
        b.dataset = None
        # hack to dodge unpicklable `dict_keys`
        b.fields = list(b.fields)
        q.put(b)
        b = next_batch(device_id)
def __init__(self, opt, model_id, tokenizer_opt=None, load=False,
             timeout=-1, on_timeout="to_cpu", model_root="./"):
    self.model_root = model_root
    self.opt = self.parse_opt(opt)
    if self.opt.n_best > 1:
        raise ValueError("Values of n_best > 1 are not supported")
    self.model_id = model_id
    self.tokenizer_opt = tokenizer_opt
    self.timeout = timeout
    self.on_timeout = on_timeout
    self.unload_timer = None
    self.user_opt = opt
    self.tokenizer = None

    if len(self.opt.log_file) > 0:
        log_file = os.path.join(model_root, self.opt.log_file)
    else:
        log_file = None
    self.logger = init_logger(log_file=log_file,
                              log_file_level=self.opt.log_file_level)

    self.loading_lock = threading.Event()
    self.loading_lock.set()
    self.running_lock = threading.Semaphore(value=1)

    set_random_seed(self.opt.seed, self.opt.cuda)

    if load:
        self.load()
def configure_process(opt, device_id):
    if device_id >= 0:
        torch.cuda.set_device(device_id)
    else:
        # CPU runs return early and are not seeded here
        return
    set_random_seed(opt.seed, device_id >= 0)
def batch_producer(generator_to_serve, queue, semaphore, opt, device_id):
    """Produce batches to `queue` from `generator_to_serve`."""
    log_level = "INFO" if opt.verbose or device_id == 0 else "WARNING"
    init_logger(opt.log_file, log_level=log_level)
    set_random_seed(opt.seed, False)

    def pred(x):
        """
        Filters batches that belong only to gpu_ranks of current node
        """
        for rank in opt.gpu_ranks:
            if x[0] % opt.world_size == rank:
                return True

    generator_to_serve = filter(pred, enumerate(generator_to_serve))

    def next_batch():
        # NOTE: stride (if needed) is handled at the
        # generator (train_iter) level
        new_batch = next(generator_to_serve)
        semaphore.acquire()
        return new_batch[1]

    b = next_batch()

    while True:
        b.dataset = None
        # The batch is moved to the consumer's device when the
        # consumer iterates over the queue.
        # hack to dodge unpicklable `dict_keys`
        b.fields = list(b.fields)
        queue.put(b)
        b = next_batch()
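# A minimal usage sketch (not from the original code) showing how this
# producer is typically wired to a consumer over torch.multiprocessing.
# The function name and queue size below are illustrative assumptions; the
# real wiring lives in the train() variants further down in this file.
import torch


def spawn_producer_sketch(train_iter, opt, device_id=0, queue_size=40):
    mp = torch.multiprocessing.get_context('spawn')
    q = mp.Queue(queue_size)              # batches travel over this queue
    semaphore = mp.Semaphore(queue_size)  # bounds the in-flight batch count
    producer = mp.Process(target=batch_producer,
                          args=(train_iter, q, semaphore, opt, device_id),
                          daemon=True)
    producer.start()
    # A consumer process would then loop:
    #     b = q.get(); semaphore.release(); ...train on b...
    return producer, q, semaphore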
def training_opt_postprocessing(opt, device_id):
    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    if opt.layers != -1:
        opt.enc_layers = opt.layers
        opt.dec_layers = opt.layers

    if opt.rnn_size != -1:
        opt.enc_rnn_size = opt.rnn_size
        opt.dec_rnn_size = opt.rnn_size

    # this check is here because audio allows the encoder and decoder to
    # be different sizes, but other model types do not yet
    same_size = opt.enc_rnn_size == opt.dec_rnn_size
    assert opt.model_type == 'audio' or same_size, \
        "The encoder and decoder rnns must be the same size for now"

    opt.brnn = opt.encoder_type == "brnn"

    assert opt.rnn_type != "SRU" or opt.gpu_ranks, \
        "Using SRU requires -gpu_ranks set."

    if torch.cuda.is_available() and not opt.gpu_ranks:
        logger.info("WARNING: You have a CUDA device, "
                    "should run with -gpu_ranks")

    if device_id >= 0:
        torch.cuda.set_device(device_id)

    set_random_seed(opt.seed, device_id >= 0)

    return opt
def __init__(self, opt, model_id, tokenizer_opt=None, load=False,
             timeout=-1, on_timeout="to_cpu", model_root="./"):
    """
    Args:
        opt: (dict) options for the Translator
        model_id: (int) model id
        tokenizer_opt: (dict) options for the tokenizer or None
        load: (bool) whether to load the model during __init__
        timeout: (int) seconds before running `do_timeout`
            A negative value means no timeout
        on_timeout: (str) in ["to_cpu", "unload"], what to do on
            timeout (see function `do_timeout`)
        model_root: (str) path to the model directory;
            it must contain the model and tokenizer files
    """
    self.model_root = model_root
    self.opt = self.parse_opt(opt)
    if self.opt.n_best > 1:
        raise ValueError("Values of n_best > 1 are not supported")
    self.model_id = model_id
    self.tokenizer_opt = tokenizer_opt
    self.timeout = timeout
    self.on_timeout = on_timeout
    self.unload_timer = None
    self.user_opt = opt
    self.tokenizer = None

    if len(self.opt.log_file) > 0:
        log_file = os.path.join(model_root, self.opt.log_file)
    else:
        log_file = None
    self.logger = init_logger(log_file=log_file,
                              log_file_level=self.opt.log_file_level)

    self.loading_lock = threading.Event()
    self.loading_lock.set()
    self.running_lock = threading.Semaphore(value=1)

    set_random_seed(self.opt.seed, self.opt.cuda)

    if load:
        self.load()
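# A hypothetical configuration for the server model above, in the spirit of
# the translation server's JSON config; every path and value here is an
# illustrative assumption, not taken from the original code.
example_model_conf = {
    "model_id": 100,
    "opt": {                                # options forwarded to parse_opt()
        "models": ["model_step_10000.pt"],
        "beam_size": 5,
        "gpu": 0,
    },
    "tokenizer_opt": {"type": "sentencepiece", "model": "sp.model"},
    "load": True,                           # load the model at __init__ time
    "timeout": 600,                         # run do_timeout after 10 idle min
    "on_timeout": "to_cpu",                 # move to CPU rather than unload
    "model_root": "./available_models",
}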
def batch_producer(generator_to_serve, queues, semaphore, opt):
    init_logger(opt.log_file)
    set_random_seed(opt.seed, False)
    # generator_to_serve = iter(generator_to_serve)

    def pred(x):
        """
        Filters batches that belong only to gpu_ranks of current node
        """
        for rank in opt.gpu_ranks:
            if x[0] % opt.world_size == rank:
                return True

    generator_to_serve = filter(pred, enumerate(generator_to_serve))

    def next_batch(device_id):
        new_batch = next(generator_to_serve)
        semaphore.acquire()
        return new_batch[1]

    b = next_batch(0)

    for device_id, q in cycle(enumerate(queues)):
        b.dataset = None
        if isinstance(b.src, tuple):
            b.src = tuple([_.to(torch.device(device_id)) for _ in b.src])
        else:
            b.src = b.src.to(torch.device(device_id))
        if isinstance(b.tt, tuple):
            b.tt = tuple([_.to(torch.device(device_id)) for _ in b.tt])
        else:
            b.tt = b.tt.to(torch.device(device_id))
        b.tgt = b.tgt.to(torch.device(device_id))
        b.indices = b.indices.to(torch.device(device_id))
        b.alignment = b.alignment.to(torch.device(device_id)) \
            if hasattr(b, 'alignment') else None
        b.src_map = b.src_map.to(torch.device(device_id)) \
            if hasattr(b, 'src_map') else None
        b.align = b.align.to(torch.device(device_id)) \
            if hasattr(b, 'align') else None
        # hack to dodge unpicklable `dict_keys`
        b.fields = list(b.fields)
        q.put(b)
        b = next_batch(device_id)
def __init__(self, opt, model_id, preprocess_opt=None, tokenizer_opt=None,
             postprocess_opt=None, load=False, timeout=-1,
             on_timeout="to_cpu", model_root="./", debug=True):
    self.model_root = model_root
    self.opt = self.parse_opt(opt)
    self.model_id = model_id
    self.preprocess_opt = preprocess_opt
    self.tokenizer_opt = tokenizer_opt
    self.postprocess_opt = postprocess_opt
    self.timeout = timeout
    self.on_timeout = on_timeout
    self.unload_timer = None
    self.user_opt = opt
    self.tokenizer = None
    self.logger = logging.getLogger(f'{__name__}.{model_id}')

    self.loading_lock = threading.Event()
    self.loading_lock.set()
    self.running_lock = threading.Semaphore(value=1)

    set_random_seed(self.opt.seed, self.opt.cuda)

    self.logger.info("Loading preprocessors and post processors")
    self.preprocessor = [
        SentObjProcessor(),
        MosesProcessor(),
        MorfessorProcessor()
    ]
    self.postprocessor = [
        DetokenizationProcessor(),
        NerProcessor(),
        AbbrevProcessor()
    ]
    self.postprocessor_after_merge = [PuncProcessor()]

    if load:
        self.load()
def build_vocab_main(opts):
    """Apply transforms to samples of specified data and build vocab from it.

    Transforms that need vocab will be disabled in this.
    Built vocab is saved in plain text format as following and can be passed
    as `-src_vocab` (and `-tgt_vocab`) when training:
    ```
    <tok_0>\t<count_0>
    <tok_1>\t<count_1>
    ```
    """

    ArgumentParser.validate_prepare_opts(opts, build_vocab_only=True)
    assert opts.n_sample == -1 or opts.n_sample > 1, \
        f"Illegal argument n_sample={opts.n_sample}."

    logger = init_logger()
    set_random_seed(opts.seed, False)
    transforms_cls = get_transforms_cls(opts._all_transform)
    fields = None

    transforms = make_transforms(opts, transforms_cls, fields)

    logger.info(f"Counter vocab from {opts.n_sample} samples.")
    src_counter, tgt_counter, src_feats_counter = build_vocab(
        opts, transforms, n_sample=opts.n_sample)

    logger.info(f"Counters src:{len(src_counter)}")
    logger.info(f"Counters tgt:{len(tgt_counter)}")
    for feat_name, feat_counter in src_feats_counter.items():
        logger.info(f"Counters {feat_name}:{len(feat_counter)}")

    def save_counter(counter, save_path):
        check_path(save_path, exist_ok=opts.overwrite, log=logger.warning)
        with open(save_path, "w", encoding="utf8") as fo:
            for tok, count in counter.most_common():
                fo.write(tok + "\t" + str(count) + "\n")

    if opts.share_vocab:
        src_counter += tgt_counter
        tgt_counter = src_counter
        logger.info(f"Counters after share:{len(src_counter)}")
        save_counter(src_counter, opts.src_vocab)
    else:
        save_counter(src_counter, opts.src_vocab)
        save_counter(tgt_counter, opts.tgt_vocab)

    for k, v in src_feats_counter.items():
        save_counter(v, opts.src_feats_vocab[k])
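# A small illustrative helper (not part of the original code) that reads the
# plain-text `<tok>\t<count>` vocab files written by save_counter() above
# back into a Counter; the default file name is an assumption.
from collections import Counter


def load_saved_vocab(path="vocab.src"):
    counter = Counter()
    with open(path, encoding="utf8") as fi:
        for line in fi:
            tok, count = line.rstrip("\n").split("\t")
            counter[tok] = int(count)
    return counter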
def __init__(self, opt, model_id, preprocess_opt=None, tokenizer_opt=None,
             postprocess_opt=None, custom_opt=None, load=False, timeout=-1,
             on_timeout="to_cpu", model_root="./", ct2_model=None):
    # `custom_opt` and `ct2_model` are used below, so they must be
    # parameters (as in the ct2 variant of this constructor further down)
    self.model_root = model_root
    self.opt = self.parse_opt(opt)
    self.custom_opt = custom_opt
    self.model_id = model_id
    self.preprocess_opt = preprocess_opt
    self.tokenizers_opt = tokenizer_opt
    self.postprocess_opt = postprocess_opt
    self.timeout = timeout
    self.on_timeout = on_timeout

    self.ct2_model = os.path.join(model_root, ct2_model) \
        if ct2_model is not None else None

    self.unload_timer = None
    self.user_opt = opt
    self.tokenizers = None

    if len(self.opt.log_file) > 0:
        log_file = os.path.join(model_root, self.opt.log_file)
    else:
        log_file = None
    self.logger = init_logger(log_file=log_file,
                              log_file_level=self.opt.log_file_level)

    self.loading_lock = threading.Event()
    self.loading_lock.set()
    self.running_lock = threading.Semaphore(value=1)

    set_random_seed(self.opt.seed, self.opt.cuda)

    if load:
        self.load(preload=True)
        self.stop_unload_timer()
def build_vocab_main(opts):
    """Apply transforms to samples of specified data and build vocab from it.

    Transforms that need vocab will be disabled in this.
    Built vocab is saved in plain text format as following and can be passed
    as `-src_vocab` (and `-tgt_vocab`) when training:
    ```
    <tok_0>\t<count_0>
    <tok_1>\t<count_1>
    ```
    """

    ArgumentParser.validate_prepare_opts(opts, build_vocab_only=True)
    assert opts.n_sample == -1 or opts.n_sample > 1, \
        f"Illegal argument n_sample={opts.n_sample}."

    logger = init_logger()
    set_random_seed(opts.seed, False)
    transforms_cls = get_transforms_cls(opts._all_transform)
    fields = None

    transforms = make_transforms(opts, transforms_cls, fields)

    logger.info(f"Counter vocab from {opts.n_sample} samples.")
    src_counter, tgt_counter = save_transformed_sample(
        opts, transforms, n_sample=opts.n_sample, build_vocab=True)

    logger.info(f"Counters src:{len(src_counter)}")
    logger.info(f"Counters tgt:{len(tgt_counter)}")
    if opts.share_vocab:
        src_counter += tgt_counter
        tgt_counter = src_counter
        logger.info(f"Counters after share:{len(src_counter)}")

    def save_counter(counter, save_path):
        with open(save_path, "w") as fo:
            for tok, count in counter.most_common():
                fo.write(tok + "\t" + str(count) + "\n")

    save_counter(src_counter, opts.save_data + '.vocab.src')
    save_counter(tgt_counter, opts.save_data + '.vocab.tgt')
def batch_producer(generator_to_serve, queues, semaphore, opt):
    init_logger(opt.log_file)
    set_random_seed(opt.seed, False)

    def pred(x):
        for rank in opt.gpu_ranks:
            if x[0] % opt.world_size == rank:
                return True

    generator_to_serve = filter(pred, enumerate(generator_to_serve))

    def next_batch(device_id):
        new_batch = next(generator_to_serve)
        semaphore.acquire()
        return new_batch[1]

    b = next_batch(0)

    for device_id, q in cycle(enumerate(queues)):
        b.dataset = None
        if isinstance(b.src, tuple):
            b.src = tuple([_.to(torch.device(device_id)) for _ in b.src])
        else:
            b.src = b.src.to(torch.device(device_id))
        b.tgt = b.tgt.to(torch.device(device_id))
        b.indices = b.indices.to(torch.device(device_id))
        b.alignment = b.alignment.to(torch.device(device_id)) \
            if hasattr(b, 'alignment') else None
        b.src_map = b.src_map.to(torch.device(device_id)) \
            if hasattr(b, 'src_map') else None
        b.align = b.align.to(torch.device(device_id)) \
            if hasattr(b, 'align') else None
        b.corpus_id = b.corpus_id.to(torch.device(device_id)) \
            if hasattr(b, 'corpus_id') else None
        b.fields = list(b.fields)
        q.put(b)
        b = next_batch(device_id)
def main(model="transformer", dataset="toy-ende"):
    init_logger()
    is_cuda = cuda.is_available()
    set_random_seed(1111, is_cuda)

    data = preprocess.setup_dataset(dataset)
    vocab = preprocess.setup_vocab(data)

    if model == "transformer":
        Model, loss, opt = transformer.SimpleTransformer(vocab)
    elif model == "lstm":
        Model, loss, opt = lstm.BaseLSTMModel(vocab)

    train, validate = training.training_iterator(data, vocab)
    TrainingSession = training.training_session(Model, loss, opt)
    report = TrainingSession.train(
        train_iter=train, valid_iter=validate, **defaults.training)

    evaluate.evaluation(model, data, vocab)
    return 0
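# A hypothetical command-line entry point for main() above; the flag names
# simply mirror the function's keyword arguments and are assumptions for
# illustration.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", choices=["transformer", "lstm"],
                        default="transformer")
    parser.add_argument("--dataset", default="toy-ende")
    args = parser.parse_args()
    raise SystemExit(main(model=args.model, dataset=args.dataset))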
def train(opt):
    ArgumentParser.validate_train_opts(opt)
    set_random_seed(opt.seed, False)

    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    if old_style_vocab(vocab):
        fields = load_old_vocab(vocab, opt.model_type,
                                dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    patch_fields(opt, fields)

    if len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:
        single_main(opt, 0)
    else:
        single_main(opt, -1)
def __init__(self, model, fields, opt, model_opt, global_scorer=None,
             out_file=None, report_score=True, logger=None):
    self.model = model
    self.fields = fields
    self.gpu = opt.gpu
    self.cuda = opt.gpu > -1

    self.n_best = opt.n_best
    self.max_length = opt.max_length

    if opt.beam_size != 1 and opt.random_sampling_topk != 1:
        raise ValueError('Can either do beam search OR random sampling.')

    self.beam_size = opt.beam_size
    self.random_sampling_temp = opt.random_sampling_temp
    self.sample_from_topk = opt.random_sampling_topk

    self.min_length = opt.min_length
    self.stepwise_penalty = opt.stepwise_penalty
    self.dump_beam = opt.dump_beam
    self.block_ngram_repeat = opt.block_ngram_repeat
    self.ignore_when_blocking = set(opt.ignore_when_blocking)
    self.sample_rate = opt.sample_rate
    self.window_size = opt.window_size
    self.window_stride = opt.window_stride
    self.window = opt.window
    self.image_channel_size = opt.image_channel_size
    self.replace_unk = opt.replace_unk
    self.data_type = opt.data_type
    self.verbose = opt.verbose
    self.report_bleu = opt.report_bleu
    self.report_rouge = opt.report_rouge
    self.report_time = opt.report_time
    self.fast = opt.fast

    self.copy_attn = model_opt.copy_attn

    self.global_scorer = global_scorer
    self.out_file = out_file
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []}

    set_random_seed(opt.seed, self.cuda)
def __init__(self, model, fields, n_best=1, min_length=4, max_length=200,
             ratio=0., beam_size=30, random_sampling_topk=1,
             random_sampling_temp=1, stepwise_penalty=None, dump_beam=False,
             block_ngram_repeat=0, ignore_when_blocking=frozenset(),
             replace_unk=False, phrase_table="", copy_attn=False,
             global_scorer=None, report_score=True, logger=None, seed=-1,
             device_id=0):
    self.model = model
    self.fields = fields
    # tgt_field = self.fields["tgt"]
    self._tgt_vocab = fields  # self.fields["tgt"]
    self._tgt_eos_idx = self._tgt_vocab.eos_idx
    self._tgt_pad_idx = self._tgt_vocab.pad_idx
    self._tgt_bos_idx = self._tgt_vocab.bos_idx
    self._tgt_unk_idx = self._tgt_vocab.unk_idx
    self._tgt_vocab_len = len(self._tgt_vocab)

    self._src_vocab = None  # dict(self.fields)["src"].base_field.vocab
    self._dev = torch.device("cuda", device_id)

    self.n_best = n_best
    self.max_length = max_length

    self.beam_size = beam_size
    self.random_sampling_temp = random_sampling_temp
    self.sample_from_topk = random_sampling_topk

    self.min_length = min_length
    self.ratio = ratio

    self.stepwise_penalty = stepwise_penalty
    self.dump_beam = dump_beam
    self.block_ngram_repeat = block_ngram_repeat
    self.ignore_when_blocking = ignore_when_blocking
    self._exclusion_idxs = {
        self._tgt_vocab.stoi[t] for t in self.ignore_when_blocking}
    self.replace_unk = replace_unk
    self.phrase_table = phrase_table
    self.copy_attn = copy_attn

    self.global_scorer = global_scorer
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False
    self._filter_pred = None
    self.device_id = device_id

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []}

    set_random_seed(seed, True)
def __init__(
        self,
        model,
        fields,
        src_reader,
        tgt_reader,
        # MMM
        length_model,
        length_penalty_a,
        length_penalty_b,
        length_model_loc,
        output,
        # /MMM
        gpu=-1,
        n_best=1,
        min_length=0,
        max_length=100,
        ratio=0.,
        beam_size=30,
        random_sampling_topk=1,
        random_sampling_temp=1,
        stepwise_penalty=None,
        dump_beam=False,
        block_ngram_repeat=0,
        ignore_when_blocking=frozenset(),
        replace_unk=False,
        phrase_table="",
        data_type="text",
        verbose=False,
        report_time=False,
        copy_attn=False,
        global_scorer=None,
        out_file=None,
        report_align=False,
        report_score=True,
        logger=None,
        seed=-1):
    self.model = model
    self.fields = fields
    tgt_field = dict(self.fields)["tgt"].base_field
    self._tgt_vocab = tgt_field.vocab
    self._tgt_eos_idx = self._tgt_vocab.stoi[tgt_field.eos_token]
    self._tgt_pad_idx = self._tgt_vocab.stoi[tgt_field.pad_token]
    self._tgt_bos_idx = self._tgt_vocab.stoi[tgt_field.init_token]
    self._tgt_unk_idx = self._tgt_vocab.stoi[tgt_field.unk_token]
    self._tgt_vocab_len = len(self._tgt_vocab)

    self._gpu = gpu
    self._use_cuda = gpu > -1
    self._dev = torch.device("cuda", self._gpu) \
        if self._use_cuda else torch.device("cpu")

    # MMM
    self.length_model = length_model
    self.length_penalty_a = length_penalty_a
    self.length_penalty_b = length_penalty_b
    if self.length_model == 'lstm':
        self.device = 'cuda' if self._use_cuda else 'cpu'
        length_model_loc = length_model_loc
        output_loc = output
        checkpoint = torch.load(length_model_loc)
        length_model_opt = checkpoint['opt']
        EMBEDDING_DIM = length_model_opt.embedding_dim
        HIDDEN_DIM = length_model_opt.hidden_dim
        src_vocab = dict(self.fields)["src"].base_field.vocab
        # Construct the model:
        self.l_model = onmt.utils.length_model.LSTMTagger(
            EMBEDDING_DIM, HIDDEN_DIM, src_vocab,
            self.device).to(self.device)
        self.l_model.load_state_dict(checkpoint['model_state_dict'])
        self.l_model.eval()
    # /MMM

    self.n_best = n_best
    self.max_length = max_length

    self.beam_size = beam_size
    self.random_sampling_temp = random_sampling_temp
    self.sample_from_topk = random_sampling_topk

    self.min_length = min_length
    self.ratio = ratio

    self.stepwise_penalty = stepwise_penalty
    self.dump_beam = dump_beam
    self.block_ngram_repeat = block_ngram_repeat
    self.ignore_when_blocking = ignore_when_blocking
    self._exclusion_idxs = {
        self._tgt_vocab.stoi[t] for t in self.ignore_when_blocking}
    self.src_reader = src_reader
    self.tgt_reader = tgt_reader
    self.replace_unk = replace_unk
    if self.replace_unk and not self.model.decoder.attentional:
        raise ValueError(
            "replace_unk requires an attentional decoder.")
    self.phrase_table = phrase_table
    self.data_type = data_type
    self.verbose = verbose
    self.report_time = report_time

    self.copy_attn = copy_attn

    self.global_scorer = global_scorer
    if self.global_scorer.has_cov_pen and \
            not self.model.decoder.attentional:
        raise ValueError(
            "Coverage penalty requires an attentional decoder.")
    self.out_file = out_file
    self.report_align = report_align
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False
    self._filter_pred = None

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []}

    set_random_seed(seed, self._use_cuda)
def train(opt):
    init_logger(opt.log_file)
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)
    set_random_seed(opt.seed, False)

    # Datasets and transformations (both dicts)
    checkpoint, fields, transforms_cls = _init_train(opt)
    train_process = partial(single_main,
                            fields=fields,
                            transforms_cls=transforms_cls,
                            checkpoint=checkpoint)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(
                target=consumer,
                args=(train_process, opt, device_id, error_queue, q,
                      semaphore),
                daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)

        producers = []
        # This does not work if we merge with the first loop, not sure why
        for device_id in range(nb_gpu):
            # Get the iterator to generate from
            train_iter = _build_train_iter(opt, fields, transforms_cls,
                                           stride=nb_gpu, offset=device_id)
            # batch_producer takes device_id as its fifth argument
            producer = mp.Process(target=batch_producer,
                                  args=(train_iter, queues[device_id],
                                        semaphore, opt, device_id),
                                  daemon=True)
            producers.append(producer)
            producers[device_id].start()
            logger.info(" Starting producer process pid: {} ".format(
                producers[device_id].pid))
            error_handler.add_child(producers[device_id].pid)

        for p in procs:
            p.join()
        # Once training is done, we can terminate the producers
        for p in producers:
            p.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        # TODO: make a custom GPU id possible. Also replace the assert
        # at utils/parse.py line 275
        train_process(opt, device_id=opt.gpu_ranks[0])
    else:   # case only CPU
        train_process(opt, device_id=-1)
def __init__(self, model, fields, opt, model_opt, global_scorer=None,
             out_file=None, report_score=True, logger=None):
    self.model = model
    self.fields = fields
    self.gpu = opt.gpu
    self.cuda = opt.gpu > -1

    self.n_best = opt.n_best
    self.max_length = opt.max_length

    if opt.beam_size != 1 and opt.random_sampling_topk != 1:
        raise ValueError('Can either do beam search OR random sampling.')

    self.beam_size = opt.beam_size
    self.random_sampling_temp = opt.random_sampling_temp
    self.sample_from_topk = opt.random_sampling_topk
    self.num_random_samples = opt.num_random_samples
    if self.num_random_samples > 1:
        self.n_best = self.num_random_samples
    self.hidden_state_noise = opt.hidden_state_noise
    self.do_stochastic_beam = opt.stochastic_beam

    self.min_length = opt.min_length
    self.stepwise_penalty = opt.stepwise_penalty
    self.dump_beam = opt.dump_beam
    self.block_ngram_repeat = opt.block_ngram_repeat
    self.ignore_when_blocking = set(opt.ignore_when_blocking)
    self.sample_rate = opt.sample_rate
    self.window_size = opt.window_size
    self.window_stride = opt.window_stride
    self.window = opt.window
    self.image_channel_size = opt.image_channel_size
    self.replace_unk = opt.replace_unk
    self.data_type = opt.data_type
    self.verbose = opt.verbose
    self.report_bleu = opt.report_bleu
    self.report_rouge = opt.report_rouge
    self.fast = opt.fast

    self.copy_attn = model_opt.copy_attn

    self.global_scorer = global_scorer
    self.out_file = out_file
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False

    # K per cand argument
    self.k_per_cand = opt.k_per_cand
    # Diverse beam search argument
    self.hamming_penalty = opt.hamming_penalty
    # Iterative beam search arguments
    self.hamming_dist = opt.hamming_dist
    self.beam_iters = opt.beam_iters
    # Clustered beam search arguments
    self.num_clusters = opt.num_clusters
    self.cluster_embeddings_file = opt.cluster_embeddings_file
    vocab = self.fields['tgt'][0][1].vocab
    self.cluster_embeddings = []
    if self.num_clusters > 1:
        self.cluster_embeddings = self.load_embeddings(
            self.cluster_embeddings_file, vocab)

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []}

    set_random_seed(opt.seed, self.cuda)
def configure_process(opt, device_id):
    if device_id >= 0:
        torch.cuda.set_device(device_id)
    set_random_seed(opt.seed, device_id >= 0)
def configure_process(opt, device_id):
    if device_id >= 0 and torch.cuda.is_available():
        torch.cuda.set_device(device_id)
    set_random_seed(opt.seed, device_id >= 0)
def configure_process(opt, device_id):
    if device_id >= 0:
        torch.cuda.set_device(opt.gpu_ranks[device_id])
        # torch.cuda.set_device(device_id)
    set_random_seed(opt.seed, device_id >= 0)
def __init__(self, model, fields, opt, model_opt, global_scorer=None,
             out_file=None, report_score=True, logger=None):
    self.model = model
    self.fields = fields
    tgt_field = self.fields["tgt"][0][1].base_field
    tgt_field.eos_token = '</s>'
    self._tgt_vocab = tgt_field.vocab
    self._tgt_eos_idx = self._tgt_vocab.stoi[tgt_field.eos_token]
    self._tgt_pad_idx = self._tgt_vocab.stoi[tgt_field.pad_token]
    self._tgt_bos_idx = self._tgt_vocab.stoi[tgt_field.init_token]
    self._tgt_unk_idx = self._tgt_vocab.stoi[tgt_field.unk_token]
    self._tgt_vocab_len = len(self._tgt_vocab)
    self.opt = opt
    self.gpu = opt.gpu
    self.cuda = opt.gpu > -1

    self.n_best = opt.n_best
    self.max_length = opt.max_length

    if opt.beam_size != 1 and opt.random_sampling_topk != 1:
        raise ValueError('Can either do beam search OR random sampling.')

    self.beam_size = opt.beam_size
    self.random_sampling_temp = opt.random_sampling_temp
    self.sample_from_topk = opt.random_sampling_topk

    self.min_length = opt.min_length
    self.stepwise_penalty = opt.stepwise_penalty
    self.dump_beam = opt.dump_beam
    self.block_ngram_repeat = opt.block_ngram_repeat
    self.ignore_when_blocking = set(opt.ignore_when_blocking)
    self._exclusion_idxs = {
        self._tgt_vocab.stoi[t] for t in self.ignore_when_blocking}
    self.src_reader = inputters.str2reader[opt.data_type].from_opt(opt)
    self.tgt_reader = inputters.str2reader["text"].from_opt(opt)
    self.replace_unk = opt.replace_unk
    self.data_type = opt.data_type
    self.verbose = opt.verbose
    self.report_bleu = opt.report_bleu
    self.report_rouge = opt.report_rouge
    self.report_time = opt.report_time
    self.fast = opt.fast

    self.copy_attn = model_opt.copy_attn

    self.global_scorer = global_scorer
    self.out_file = out_file
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []}

    set_random_seed(opt.seed, self.cuda)
def __init__(self, opt, model_id, preprocess_opt=None, tokenizer_opt=None,
             postprocess_opt=None, load=False, timeout=-1,
             on_timeout="to_cpu", model_root="./"):
    self.model_root = model_root
    self.opt = self.parse_opt(opt)
    self.model_id = model_id
    self.preprocess_opt = preprocess_opt
    self.tokenizer_opt = tokenizer_opt
    self.postprocess_opt = postprocess_opt
    self.timeout = timeout
    self.on_timeout = on_timeout
    self.unload_timer = None
    self.user_opt = opt
    self.tokenizer = None

    if len(self.opt.log_file) > 0:
        log_file = os.path.join(model_root, self.opt.log_file)
    else:
        log_file = None
    self.logger = init_logger(log_file=log_file,
                              log_file_level=self.opt.log_file_level)

    self.loading_lock = threading.Event()
    self.loading_lock.set()
    self.running_lock = threading.Semaphore(value=1)

    set_random_seed(self.opt.seed, self.opt.cuda)

    if self.preprocess_opt is not None:
        self.logger.info("Loading preprocessor")
        self.preprocessor = []
        for function_path in self.preprocess_opt:
            function = get_function_by_path(function_path)
            self.preprocessor.append(function)

    if self.tokenizer_opt is not None:
        self.logger.info("Loading tokenizer")
        if "type" not in self.tokenizer_opt:
            raise ValueError("Missing mandatory tokenizer option 'type'")
        if self.tokenizer_opt['type'] == 'sentencepiece':
            if "model" not in self.tokenizer_opt:
                raise ValueError(
                    "Missing mandatory tokenizer option 'model'")
            import sentencepiece as spm
            sp = spm.SentencePieceProcessor()
            model_path = os.path.join(self.model_root,
                                      self.tokenizer_opt['model'])
            sp.Load(model_path)
            self.tokenizer = sp
        elif self.tokenizer_opt['type'] == 'pyonmttok':
            if "params" not in self.tokenizer_opt:
                raise ValueError(
                    "Missing mandatory tokenizer option 'params'")
            import pyonmttok
            mode = self.tokenizer_opt.get("mode")
            # load can be called multiple times: modify a copy
            tokenizer_params = dict(self.tokenizer_opt["params"])
            for key, value in self.tokenizer_opt["params"].items():
                if key.endswith("path"):
                    tokenizer_params[key] = os.path.join(
                        self.model_root, value)
            tokenizer = pyonmttok.Tokenizer(mode, **tokenizer_params)
            self.tokenizer = tokenizer
        else:
            raise ValueError("Invalid value for tokenizer type")

    if self.postprocess_opt is not None:
        self.logger.info("Loading postprocessor")
        self.postprocessor = []
        for function_path in self.postprocess_opt:
            function = get_function_by_path(function_path)
            self.postprocessor.append(function)

    if load:
        self.load()
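# Illustrative tokenizer_opt dicts accepted by the loader above; the model
# and codes file names are assumptions for demonstration. Keys ending in
# "path" inside "params" are resolved relative to model_root.
example_sentencepiece_opt = {
    "type": "sentencepiece",
    "model": "wmtende.model",        # resolved relative to model_root
}
example_pyonmttok_opt = {
    "type": "pyonmttok",
    "mode": "conservative",
    "params": {
        "joiner_annotate": True,
        "bpe_model_path": "codes.bpe",
    },
}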
def train(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)
    set_random_seed(opt.seed, False)

    # @memray, check the dir existence beforehand to avoid path conflicting
    # errors, and set save_model, tensorboard_log_dir, wandb_log_dir if they
    # do not exist
    train_single._check_save_model_path(opt)
    if not os.path.exists(opt.tensorboard_log_dir):
        os.makedirs(opt.tensorboard_log_dir)

    # Scan previous checkpoint to resume training
    latest_step = 0
    latest_ckpt = None
    for subdir, dirs, filenames in os.walk(opt.exp_dir):
        for filename in sorted(filenames):
            if not filename.endswith('.pt'):
                continue
            step = int(filename[filename.rfind('_') + 1:
                                filename.rfind('.pt')])
            if step > latest_step:
                latest_ckpt = os.path.join(subdir, filename)
                latest_step = step
    # if not saved in the exp folder, check opt.save_model
    if latest_ckpt is None and opt.save_model is not None:
        save_model_dir = os.path.dirname(os.path.abspath(opt.save_model))
        model_prefix = opt.save_model[opt.save_model.rfind(os.path.sep) + 1:]
        for subdir, dirs, filenames in os.walk(save_model_dir):
            for filename in sorted(filenames):
                if not filename.endswith('.pt'):
                    continue
                if not filename.startswith(model_prefix):
                    continue
                step = int(filename[filename.rfind('_') + 1:
                                    filename.rfind('.pt')])
                if step > latest_step:
                    latest_ckpt = os.path.join(subdir, filename)
                    latest_step = step
    if latest_ckpt is not None:
        logger.info("A previous checkpoint is found, train from it: %s"
                    % latest_ckpt)
        setattr(opt, 'train_from', latest_ckpt)
        setattr(opt, 'reset_optim', 'none')

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    elif opt.vocab and opt.vocab != 'none':
        # added by @memray for multiple datasets
        vocab = torch.load(opt.vocab)
        # check for code where vocab is saved instead of fields
        # (in the future this will be done in a smarter way)
        if old_style_vocab(vocab):
            vocab = load_old_vocab(vocab, opt.model_type,
                                   dynamic_dict=opt.copy_attn)
    elif opt.encoder_type == 'pretrained':
        vocab = None
    else:
        vocab = None

    fields = vocab

    # @memray: a temporary workaround, as well as train_single.py line 78
    if fields and opt.data_type == "keyphrase":
        if opt.tgt_type in ["one2one", "multiple"]:
            if 'sep_indices' in fields:
                del fields['sep_indices']
        else:
            if 'sep_indices' not in fields:
                sep_indices = Field(use_vocab=False, dtype=torch.long,
                                    postprocessing=make_tgt,
                                    sequential=False)
                fields["sep_indices"] = sep_indices
        if 'src_ex_vocab' not in fields:
            src_ex_vocab = RawField()
            fields["src_ex_vocab"] = src_ex_vocab

    # @memray reload fields for news dataset and pretrained models
    tokenizer = None
    if opt.pretrained_tokenizer is not None:
        tokenizer = load_pretrained_tokenizer(opt.pretrained_tokenizer,
                                              opt.cache_dir,
                                              opt.special_vocab_path)
        setattr(opt, 'vocab_size', len(tokenizer))
    if opt.data_type == 'news':
        fields = reload_news_fields(opt, tokenizer=tokenizer)
    # elif opt.data_type == 'keyphrase':
    #     fields = reload_keyphrase_fields(opt, tokenizer=tokenizer)

    if len(opt.data_ids) > 1:
        # added by @memray, for loading multiple datasets
        if opt.multi_dataset:
            shard_base = "train"
            train_iter = build_dataset_iter(shard_base, fields, opt,
                                            multi=True)
        else:
            train_shards = []
            for train_id in opt.data_ids:
                shard_base = "train_" + train_id
                train_shards.append(shard_base)
            train_iter = build_dataset_iter_multiple(train_shards, fields,
                                                     opt)
    else:
        shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)
def __init__(
        self,
        model,
        fields,
        src_reader,
        tgt_reader,
        gpu=-1,
        n_best=1,
        min_length=0,
        max_length=100,
        ratio=0.,
        beam_size=30,
        random_sampling_topk=1,
        random_sampling_temp=1,
        stepwise_penalty=None,
        dump_beam=False,
        block_ngram_repeat=0,
        ignore_when_blocking=frozenset(),
        replace_unk=False,
        phrase_table="",
        data_type="text",
        verbose=False,
        report_bleu=False,
        report_rouge=False,
        report_time=False,
        copy_attn=False,
        global_scorer=None,
        out_file=None,
        report_score=True,
        logger=None,
        seed=-1):
    self.model = model
    self.fields = fields
    tgt_field = dict(self.fields)["tgt"].base_field
    self._tgt_vocab = tgt_field.vocab
    self._tgt_eos_idx = self._tgt_vocab.stoi[tgt_field.eos_token]
    self._tgt_pad_idx = self._tgt_vocab.stoi[tgt_field.pad_token]
    self._tgt_bos_idx = self._tgt_vocab.stoi[tgt_field.init_token]
    self._tgt_unk_idx = self._tgt_vocab.stoi[tgt_field.unk_token]
    self._tgt_vocab_len = len(self._tgt_vocab)

    self._gpu = gpu
    self._use_cuda = gpu > -1
    self._dev = torch.device("cuda", self._gpu) \
        if self._use_cuda else torch.device("cpu")

    self.n_best = n_best
    self.max_length = max_length

    self.beam_size = beam_size
    self.random_sampling_temp = random_sampling_temp
    self.sample_from_topk = random_sampling_topk

    self.min_length = min_length
    self.ratio = ratio

    self.stepwise_penalty = stepwise_penalty
    self.dump_beam = dump_beam
    self.block_ngram_repeat = block_ngram_repeat
    self.ignore_when_blocking = ignore_when_blocking
    self._exclusion_idxs = {
        self._tgt_vocab.stoi[t] for t in self.ignore_when_blocking}
    self.src_reader = src_reader
    self.tgt_reader = tgt_reader
    self.replace_unk = replace_unk
    if self.replace_unk and not self.model.decoder.attentional:
        raise ValueError(
            "replace_unk requires an attentional decoder.")
    self.phrase_table = phrase_table
    self.data_type = data_type
    self.verbose = verbose
    self.report_bleu = report_bleu
    self.report_rouge = report_rouge
    self.report_time = report_time

    self.copy_attn = copy_attn

    self.global_scorer = global_scorer
    if self.global_scorer.has_cov_pen and \
            not self.model.decoder.attentional:
        raise ValueError(
            "Coverage penalty requires an attentional decoder.")
    self.out_file = out_file
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False
    self._filter_pred = None

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []}

    set_random_seed(seed, self._use_cuda)
def __init__(self, opt, model_id, preprocess_opt=None, tokenizer_opt=None,
             postprocess_opt=None, custom_opt=None, load=False, timeout=-1,
             on_timeout="to_cpu", model_root="./", ct2_model=None):
    self.model_root = model_root
    self.opt = self.parse_opt(opt)
    self.custom_opt = custom_opt
    self.model_id = model_id
    self.preprocess_opt = preprocess_opt
    self.tokenizers_opt = tokenizer_opt
    self.postprocess_opt = postprocess_opt
    self.timeout = timeout
    self.on_timeout = on_timeout

    self.ct2_model = os.path.join(model_root, ct2_model) \
        if ct2_model is not None else None

    self.unload_timer = None
    self.user_opt = opt
    self.tokenizers = None

    if len(self.opt.log_file) > 0:
        log_file = os.path.join(model_root, self.opt.log_file)
    else:
        log_file = None
    self.logger = init_logger(log_file=log_file,
                              log_file_level=self.opt.log_file_level,
                              rotate=True)

    self.loading_lock = threading.Event()
    self.loading_lock.set()
    self.running_lock = threading.Semaphore(value=1)

    set_random_seed(self.opt.seed, self.opt.cuda)

    if self.preprocess_opt is not None:
        self.logger.info("Loading preprocessor")
        self.preprocessor = []
        for function_path in self.preprocess_opt:
            function = get_function_by_path(function_path)
            self.preprocessor.append(function)

    if self.tokenizers_opt is not None:
        if "src" in self.tokenizers_opt and "tgt" in self.tokenizers_opt:
            self.logger.info("Loading src & tgt tokenizer")
            self.tokenizers = {
                'src': self.build_tokenizer(tokenizer_opt['src']),
                'tgt': self.build_tokenizer(tokenizer_opt['tgt'])
            }
        else:
            self.logger.info("Loading tokenizer")
            self.tokenizers_opt = {
                'src': tokenizer_opt,
                'tgt': tokenizer_opt
            }
            tokenizer = self.build_tokenizer(tokenizer_opt)
            self.tokenizers = {'src': tokenizer, 'tgt': tokenizer}

    if self.postprocess_opt is not None:
        self.logger.info("Loading postprocessor")
        self.postprocessor = []
        for function_path in self.postprocess_opt:
            function = get_function_by_path(function_path)
            self.postprocessor.append(function)

    if load:
        self.load(preload=True)
        self.stop_unload_timer()
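# An illustrative tokenizer_opt for the src/tgt branch above, configuring a
# separate tokenizer per side; the file names are assumptions.
example_dual_tokenizer_opt = {
    "src": {"type": "sentencepiece", "model": "src_sp.model"},
    "tgt": {"type": "sentencepiece", "model": "tgt_sp.model"},
}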
def __init__(self, model, rl_model, optim, model_saver, fields, src_reader,
             tgt_reader, gpu=-1, n_best=1, min_length=0, max_length=100,
             ratio=0., beam_size=30, random_sampling_topk=1,
             random_sampling_temp=1, sta=False, stepwise_penalty=None,
             dump_beam=False, block_ngram_repeat=0,
             ignore_when_blocking=frozenset(), replace_unk=False,
             phrase_table="", data_type="text", verbose=False,
             report_bleu=False, report_rouge=False, report_time=False,
             copy_attn=False, global_scorer=None, out_file=None,
             report_score=True, logger=None, sample_method="freq",
             tag_mask_path="", mask_attn=False, later_mask=False,
             reward_alpha=1, epochs=4, samples_n=2, report_every=5,
             valid_steps=30, save_checkpoint_steps=100, random_steps=10000,
             seed=-1):
    self.model = model
    self.fields = fields
    tgt_field = dict(self.fields)["tgt"].base_field
    self._tgt_vocab = tgt_field.vocab
    self._tgt_eos_idx = self._tgt_vocab.stoi[tgt_field.eos_token]
    self._tgt_pad_idx = self._tgt_vocab.stoi[tgt_field.pad_token]
    self._tgt_bos_idx = self._tgt_vocab.stoi[tgt_field.init_token]
    self._tgt_unk_idx = self._tgt_vocab.stoi[tgt_field.unk_token]
    self._tgt_vocab_len = len(self._tgt_vocab)

    self._gpu = gpu
    self._use_cuda = gpu > -1
    self._dev = torch.device("cuda", self._gpu) \
        if self._use_cuda else torch.device("cpu")

    self.n_best = n_best
    self.max_length = max_length

    self.beam_size = beam_size
    self.random_sampling_temp = random_sampling_temp
    self.sample_from_topk = random_sampling_topk

    self.min_length = min_length
    self.ratio = ratio

    self.stepwise_penalty = stepwise_penalty
    self.dump_beam = dump_beam
    self.block_ngram_repeat = block_ngram_repeat
    self.ignore_when_blocking = ignore_when_blocking
    self._exclusion_idxs = {
        self._tgt_vocab.stoi[t] for t in self.ignore_when_blocking}
    self.src_reader = src_reader
    self.tgt_reader = tgt_reader
    self.replace_unk = replace_unk
    if self.replace_unk and not self.model.decoder.attentional:
        raise ValueError("replace_unk requires an attentional decoder.")
    self.phrase_table = phrase_table
    self.data_type = data_type
    self.verbose = verbose
    self.report_bleu = report_bleu
    self.report_rouge = report_rouge
    self.report_time = report_time

    self.copy_attn = copy_attn

    self.global_scorer = global_scorer
    if self.global_scorer.has_cov_pen and \
            not self.model.decoder.attentional:
        raise ValueError(
            "Coverage penalty requires an attentional decoder.")
    self.out_file = out_file
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False
    self._filter_pred = None

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []}

    # for rl
    self.later_mask = later_mask
    self.reward_alpha = reward_alpha
    self.mask_attn = mask_attn
    self.sta = sta
    self.epochs = 1 if sta else epochs
    self.report_every = report_every
    self.valid_steps = valid_steps
    self.save_checkpoint_steps = save_checkpoint_steps
    self.samples_n = samples_n
    self.samples_method = sample_method
    self.writer = SummaryWriter()
    self.rl_model, self.optim, self.model_saver = \
        rl_model, optim, model_saver
    self.criterion = torch.nn.NLLLoss(reduction='none')
    self.random_steps = random_steps
    self.tag_mask = load_json(tag_mask_path)
    for k in self.tag_mask:
        self.tag_mask[k] = torch.tensor(self.tag_mask[k], dtype=torch.float,
                                        device=self._dev).unsqueeze(0)
    self.tag_vocab = dict(self.fields)["tag_tgt"].base_field.vocab.stoi
    self.rl_model.train()
    self.model.eval()
    set_random_seed(seed, self._use_cuda)
def train(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)
    set_random_seed(opt.seed, False)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        if 'vocab' in checkpoint:
            logger.info('Loading vocab from checkpoint at %s.'
                        % opt.train_from)
            vocab = checkpoint['vocab']
        else:
            vocab = torch.load(opt.data + '.vocab.pt')
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(vocab, opt.model_type,
                                dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    if len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)
def __init__(self, model, fields, src_reader, uid_reader, dom_reader,
             tgt_reader, gpu=-1, n_best=1, min_length=0, max_length=100,
             beam_size=30, random_sampling_topk=1, random_sampling_temp=1,
             stepwise_penalty=None, dump_beam=False, block_ngram_repeat=0,
             ignore_when_blocking=frozenset(), replace_unk=False,
             data_type="text", verbose=False, report_bleu=False,
             report_rouge=False, report_time=False, copy_attn=False,
             global_scorer=None, out_file=None, report_score=True,
             logger=None, seed=-1, user_bias='none', user_cls=False,
             dom_cls=False, user_emb=False):
    self.model = model
    self.fields = fields
    tgt_field = dict(self.fields)["tgt"].base_field
    self._tgt_vocab = tgt_field.vocab
    self._tgt_eos_idx = self._tgt_vocab.stoi[tgt_field.eos_token]
    self._tgt_pad_idx = self._tgt_vocab.stoi[tgt_field.pad_token]
    self._tgt_bos_idx = self._tgt_vocab.stoi[tgt_field.init_token]
    self._tgt_unk_idx = self._tgt_vocab.stoi[tgt_field.unk_token]
    self._tgt_vocab_len = len(self._tgt_vocab)

    self._gpu = gpu
    self._use_cuda = gpu > -1
    self._dev = torch.device("cuda", self._gpu) \
        if self._use_cuda else torch.device("cpu")

    self.n_best = n_best
    self.max_length = max_length

    self.beam_size = beam_size
    self.random_sampling_temp = random_sampling_temp
    self.sample_from_topk = random_sampling_topk

    self.min_length = min_length
    self.stepwise_penalty = stepwise_penalty
    self.dump_beam = dump_beam
    self.block_ngram_repeat = block_ngram_repeat
    self.ignore_when_blocking = ignore_when_blocking
    self._exclusion_idxs = {
        self._tgt_vocab.stoi[t] for t in self.ignore_when_blocking}
    self.src_reader = src_reader
    self.uid_reader = uid_reader
    self.dom_reader = dom_reader
    self.tgt_reader = tgt_reader
    self.replace_unk = replace_unk
    self.data_type = data_type
    self.verbose = verbose
    self.report_bleu = report_bleu
    self.report_rouge = report_rouge
    self.report_time = report_time

    self.copy_attn = copy_attn

    self.global_scorer = global_scorer
    self.out_file = out_file
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False
    self._filter_pred = None

    self.user_bias = user_bias
    self.user_cls = user_cls
    self.dom_cls = dom_cls
    self.user_emb = user_emb

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []}

    set_random_seed(seed, self._use_cuda)
def __init__(self, model, fields, src_reader, tgt_reader, gpu=-1, n_best=1,
             min_length=0, max_length=100, ratio=0., beam_size=30,
             random_sampling_topk=1, random_sampling_temp=1,
             stepwise_penalty=None, dump_beam=False, block_ngram_repeat=0,
             ignore_when_blocking=frozenset(), replace_unk=False,
             phrase_table="", data_type="text", verbose=False,
             report_time=False, copy_attn=False, global_scorer=None,
             out_file=None, report_align=False, report_score=True,
             logger=None, seed=-1):
    self.model = model
    self.fields = fields
    tgt_field = dict(self.fields)["tgt"].base_field
    self._tgt_vocab = tgt_field.vocab
    self._tgt_eos_idx = self._tgt_vocab.stoi[tgt_field.eos_token]
    self._tgt_pad_idx = self._tgt_vocab.stoi[tgt_field.pad_token]
    self._tgt_bos_idx = self._tgt_vocab.stoi[tgt_field.init_token]
    self._tgt_unk_idx = self._tgt_vocab.stoi[tgt_field.unk_token]
    self._tgt_vocab_len = len(self._tgt_vocab)

    self._gpu = gpu
    self._use_cuda = gpu > -1
    self._dev = torch.device("cuda", self._gpu) \
        if self._use_cuda else torch.device("cpu")

    self.n_best = n_best
    self.max_length = max_length

    self.beam_size = beam_size
    self.random_sampling_temp = random_sampling_temp
    self.sample_from_topk = random_sampling_topk

    self.min_length = min_length
    self.ratio = ratio

    self.stepwise_penalty = stepwise_penalty
    self.dump_beam = dump_beam
    self.block_ngram_repeat = block_ngram_repeat
    self.ignore_when_blocking = ignore_when_blocking
    self._exclusion_idxs = {
        self._tgt_vocab.stoi[t] for t in self.ignore_when_blocking}
    self.src_reader = src_reader
    self.tgt_reader = tgt_reader
    self.replace_unk = replace_unk
    if self.replace_unk and not self.model.decoder.attentional:
        raise ValueError("replace_unk requires an attentional decoder.")
    self.phrase_table = phrase_table
    self.data_type = data_type
    self.verbose = verbose
    self.report_time = report_time

    self.copy_attn = copy_attn

    self.global_scorer = global_scorer
    if self.global_scorer.has_cov_pen and \
            not self.model.decoder.attentional:
        raise ValueError(
            "Coverage penalty requires an attentional decoder.")
    self.out_file = out_file
    self.report_align = report_align
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False
    self._filter_pred = None

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []}

    set_random_seed(seed, self._use_cuda)
def train(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    if opt.train_from != '':
        raise Exception(
            'train_from will be set automatically to the latest model, '
            'you should not set it manually')

    # set gpu ranks automatically if not specified
    if len(opt.gpu_ranks) == 0:
        opt.gpu_ranks = list(range(opt.world_size))

    # Set train_from to latest checkpoint if it exists
    file_list = glob.glob(opt.save_model + '*.pt')
    if len(os.listdir(os.path.dirname(opt.save_model))) > 0 \
            and len(file_list) == 0:
        raise Exception(
            'save_model directory is not empty but no pretrained models '
            'found')
    if len(file_list) > 0:
        ckpt_nos = list(
            map(lambda x: int(x.split('_')[-1].split('.')[0]), file_list))
        ckpt_no = max(ckpt_nos)
        opt.train_from = opt.save_model + '_' + str(ckpt_no) + '.pt'
        print(opt.train_from)
        assert os.path.exists(opt.train_from)

    set_random_seed(opt.seed, False)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(vocab, opt.model_type,
                                dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    if len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)
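# A tiny sanity check (illustrative, not from the original code) of the
# checkpoint naming scheme the resume logic above relies on: files are named
# `<save_model>_<step>.pt` and training resumes from the highest step found.
assert int("demo_step_5000.pt".split('_')[-1].split('.')[0]) == 5000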