def output(self, step, num_steps, learning_rate, start):
    """Write out statistics to stdout.

    Args:
        step (int): current step
        num_steps (int): total number of steps
        learning_rate (float): current learning rate
        start (int): start time of the step
    """
    t = self.elapsed_time()
    logger.info(
        ("Step %2d, %5d; acc: %6.2f; ppl: %6.2f; xent: %6.2f; " +
         "lr: %7.5f; %3.0f / %3.0f tok/s; %6.0f sec")
        % (step, num_steps,
           self.accuracy(),
           self.ppl(),
           self.xent(),
           learning_rate,
           self.n_src_words / (t + 1e-5),
           self.n_words / (t + 1e-5),
           time.time() - start))
    sys.stdout.flush()
def main():
    opt = parse_args()
    init_logger(opt.log_file)

    logger.info("Extracting features...")

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)
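# Illustrative sanity check (not part of the pipeline) of the artifacts the
# preprocessing main() above writes. The path "data/demo" is an assumed value
# of -save_data; file naming follows build_save_dataset / build_save_in_shards
# below, and "<save_data>.vocab.pt" is the file written by build_save_vocab
# and later loaded by _load_fields.
import glob
import torch

for pt_file in sorted(glob.glob("data/demo.train*.pt")):
    dataset = torch.load(pt_file)           # one (possibly sharded) dataset
    print(pt_file, "->", len(dataset), "examples")

vocab = torch.load("data/demo.vocab.pt")    # serialized fields / vocabularies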
def build_model(embedding_opt, memory_opts, discriminator_opt, generator_opt,
                opt, fields, checkpoint):
    """ Build the memory, discriminator & generator. """
    logger.info('Building memory & discriminator & generator ...')
    memory, discriminator, generator = build_base_model(
        embedding_opt, memory_opts, discriminator_opt, generator_opt,
        fields, use_gpu(opt), checkpoint)
    logger.info(memory)
    logger.info(discriminator)
    logger.info(generator)
    return memory, discriminator, generator
def build_save_dataset(corpus_type, fields, opt):
    """ Build and save the dataset. """
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        src_corpus = opt.train_src
        tgt_corpus = opt.train_tgt
    else:
        src_corpus = opt.valid_src
        tgt_corpus = opt.valid_tgt

    # Currently we only do preprocess sharding for text corpora
    # (data_type == 'text_single_turn').
    if opt.data_type == 'text_single_turn':
        return build_save_in_shards(src_corpus, tgt_corpus, fields,
                                    corpus_type, opt)

    # For data_type == 'img' or 'audio' we currently do not shard and only
    # build a monolithic dataset. Since the interfaces are uniform, it would
    # not be hard to add sharding should users need this feature.
    dataset = inputters.build_dataset(
        fields, opt.data_type,
        src_path=src_corpus,
        tgt_path=tgt_corpus,
        src_dir=opt.src_dir,
        src_seq_length=opt.src_seq_length,
        tgt_seq_length=opt.tgt_seq_length,
        src_seq_length_trunc=opt.src_seq_length_trunc,
        tgt_seq_length_trunc=opt.tgt_seq_length_trunc,
        dynamic_dict=opt.dynamic_dict,
        sample_rate=opt.sample_rate,
        window_size=opt.window_size,
        window_stride=opt.window_stride,
        window=opt.window)

    # We save fields in vocab.pt separately, so make it empty here.
    dataset.fields = []

    pt_file = "{:s}.{:s}.pt".format(opt.save_data, corpus_type)
    logger.info(" * saving %s dataset to %s." % (corpus_type, pt_file))
    torch.save(dataset, pt_file)

    return [pt_file]
def _save(self, step):
    discriminator_state_dict = self.discriminator.state_dict()
    generator_state_dict = self.generator.state_dict()

    checkpoint = {
        'discriminator': discriminator_state_dict,
        'generator': generator_state_dict,
        'vocab': ogan.inputters.save_fields_to_vocab(self.fields),
        'embedding_opt': self.embedding_opt,
        'memory_opt': self.memory_opt,
        'discriminator_opt': self.discriminator_opt,
        'generator_opt': self.generator_opt,
        'discriminator_optim': self.optim,
        'generator_optim': self.optim
    }

    logger.info("Saving checkpoint %s_step_%d.pt" % (self.base_path, step))
    checkpoint_path = '%s_step_%d.pt' % (self.base_path, step)
    torch.save(checkpoint, checkpoint_path)
    return checkpoint, checkpoint_path
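# Illustrative restore of a checkpoint written by _save above. The file name
# "demo_step_1000.pt" is an example; the keys mirror the dict built in _save.
import torch

checkpoint = torch.load("demo_step_1000.pt",
                        map_location=lambda storage, loc: storage)
print(sorted(checkpoint.keys()))
# -> ['discriminator', 'discriminator_opt', 'discriminator_optim',
#     'embedding_opt', 'generator', 'generator_opt', 'generator_optim',
#     'memory_opt', 'vocab']

# e.g. restore weights into freshly built modules:
# discriminator.load_state_dict(checkpoint['discriminator'])
# generator.load_state_dict(checkpoint['generator'])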
def training_opt_postprocessing(opt):
    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    if torch.cuda.is_available() and not opt.gpuid:
        logger.info("WARNING: You have a CUDA device, should run with -gpuid")

    if opt.gpuid:
        torch.cuda.set_device(opt.device_id)

    if opt.seed > 0:
        # this one is needed for torchtext random call (shuffled iterator)
        # in multi gpu it ensures datasets are read in the same order
        random.seed(opt.seed)
        # These ensure same initialization in multi gpu mode
        torch.manual_seed(opt.seed)
        torch.cuda.manual_seed(opt.seed)

    return opt
def _load_fields(dataset, data_type, opt, checkpoint):
    if checkpoint is not None:
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        fields = load_fields_from_vocab(checkpoint['vocab'], data_type)
    else:
        fields = load_fields_from_vocab(
            torch.load(opt.data + '.vocab.pt'), data_type)

    fields = dict([(k, f) for (k, f) in fields.items()
                   if k in dataset.examples[0].__dict__])

    if data_type == 'text':
        logger.info(' * vocabulary size. source = %d; target = %d' %
                    (len(fields['src'].vocab), len(fields['tgt'].vocab)))
    else:
        logger.info(' * vocabulary size. target = %d' %
                    (len(fields['tgt'].vocab)))

    return fields
def _lazy_dataset_loader(pt_file, corpus_type):
    dataset = torch.load(pt_file)
    logger.info('Loading %s dataset from %s, number of examples: %d' %
                (corpus_type, pt_file, len(dataset)))
    return dataset
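# Minimal sketch of a lazily_load_dataset helper like the one referenced in
# main() at the end of this section. The repo's actual implementation may
# differ; this sketch assumes the file naming used by build_save_dataset /
# build_save_in_shards ("<data>.<corpus_type>.pt" or
# "<data>.<corpus_type>.<N>.pt").
import glob

def lazily_load_dataset(corpus_type, opt):
    # Shards written by build_save_in_shards, loaded in index order.
    pts = glob.glob(opt.data + '.' + corpus_type + '.[0-9]*.pt')
    if pts:
        pts.sort(key=lambda p: int(p.rsplit('.', 2)[-2]))
        for pt_file in pts:
            yield _lazy_dataset_loader(pt_file, corpus_type)
    else:
        # Monolithic dataset written by build_save_dataset.
        yield _lazy_dataset_loader(opt.data + '.' + corpus_type + '.pt',
                                   corpus_type)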
def build_vocab(train_dataset_files, fields, data_type, share_vocab,
                src_vocab_path, src_vocab_size, src_words_min_frequency,
                tgt_vocab_path, tgt_vocab_size, tgt_words_min_frequency):
    """
    Args:
        train_dataset_files: a list of train dataset pt files.
        fields (dict): fields to build vocab for.
        data_type: "text", "img" or "audio".
        share_vocab (bool): share source and target vocabulary?
        src_vocab_path (string): path to the src vocabulary file.
        src_vocab_size (int): size of the source vocabulary.
        src_words_min_frequency (int): the minimum frequency needed to
            include a source word in the vocabulary.
        tgt_vocab_path (string): path to the tgt vocabulary file.
        tgt_vocab_size (int): size of the target vocabulary.
        tgt_words_min_frequency (int): the minimum frequency needed to
            include a target word in the vocabulary.

    Returns:
        Dict of Fields
    """
    counter = {}
    for k in fields:
        counter[k] = Counter()

    # Load an externally supplied source vocabulary, if given.
    src_vocab = None
    if src_vocab_path:
        src_vocab = set()
        logger.info('Loading source vocab from %s' % src_vocab_path)
        assert os.path.exists(src_vocab_path), \
            'src vocab %s not found!' % src_vocab_path
        with open(src_vocab_path) as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                word = line.strip().split()[0]
                src_vocab.add(word)

    # Load an externally supplied target vocabulary, if given.
    tgt_vocab = None
    if tgt_vocab_path:
        tgt_vocab = set()
        logger.info('Loading target vocab from %s' % tgt_vocab_path)
        assert os.path.exists(tgt_vocab_path), \
            'tgt vocab %s not found!' % tgt_vocab_path
        with open(tgt_vocab_path) as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                word = line.strip().split()[0]
                tgt_vocab.add(word)

    for path in train_dataset_files:
        dataset = torch.load(path)
        logger.info(" * reloading %s." % path)
        for ex in dataset.examples:
            for k in fields:
                val = getattr(ex, k, None)
                if val is not None and not fields[k].sequential:
                    val = [val]
                elif k == 'src' and src_vocab:
                    val = [item for item in val if item in src_vocab]
                elif k == 'tgt' and tgt_vocab:
                    val = [item for item in val if item in tgt_vocab]
                counter[k].update(val)

    _build_field_vocab(fields["tgt"], counter["tgt"],
                       max_size=tgt_vocab_size,
                       min_freq=tgt_words_min_frequency)
    logger.info(" * tgt vocab size: %d." % len(fields["tgt"].vocab))

    if data_type == 'text_single_turn':
        _build_field_vocab(fields["src"], counter["src"],
                           max_size=src_vocab_size,
                           min_freq=src_words_min_frequency)
        logger.info(" * src vocab size: %d." % len(fields["src"].vocab))

        # Merge the input and output vocabularies.
        if share_vocab:
            # `tgt_vocab_size` is ignored when sharing vocabularies
            logger.info(" * merging src and tgt vocab...")
            merged_vocab = merge_vocabs(
                [fields["src"].vocab, fields["tgt"].vocab],
                vocab_size=src_vocab_size)
            fields["src"].vocab = merged_vocab
            fields["tgt"].vocab = merged_vocab

    return fields
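# Illustrative sketch of a build_save_vocab wrapper like the one called from
# the preprocessing main() above (it is not defined in this section). The
# option attribute names (opt.src_vocab, opt.share_vocab, ...) are assumptions
# following the flags used elsewhere in this module; save_fields_to_vocab is
# the helper also used in _save, and the training-side _load_fields later
# loads the "<data>.vocab.pt" file written here.
def build_save_vocab(train_dataset_files, fields, opt):
    fields = build_vocab(train_dataset_files, fields, opt.data_type,
                         opt.share_vocab,
                         opt.src_vocab, opt.src_vocab_size,
                         opt.src_words_min_frequency,
                         opt.tgt_vocab, opt.tgt_vocab_size,
                         opt.tgt_words_min_frequency)

    # Save the vocab/fields separately from the dataset shards.
    vocab_file = opt.save_data + '.vocab.pt'
    torch.save(ogan.inputters.save_fields_to_vocab(fields), vocab_file)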
def log(self, *args, **kwargs):
    logger.info(*args, **kwargs)
def build_save_in_shards(src_corpus, tgt_corpus, fields,
                         corpus_type, opt):
    """
    Divide a big corpus into shards and build a dataset for each shard
    separately. This is currently only done for
    data_type == 'text_single_turn'.

    The motivation is to avoid using too much memory when reading in a huge
    corpus file. We only read a chunk of the corpus of roughly
    `max_shard_size` bytes (rounded up to a multiple of 64 bytes), process it
    into a dataset, and write it to disk before moving on. By focusing on one
    shard at a time we effectively reduce memory use; in our tests this
    reduces the memory footprint by ~50%.

    Note: previously processed shards may still sit in memory, but since
    nothing references them any more, that memory can be reclaimed when the
    system is under pressure.

    If `max_shard_size` is 0 or larger than the corpus size, the corpus is
    effectively preprocessed into one dataset, i.e. no sharding.

    NOTE: `max_shard_size` measures the input corpus size, not the output pt
    file size, so each shard pt file holds examples built from roughly
    2 * `max_shard_size` bytes of text (source + target).
    """
    corpus_size = os.path.getsize(src_corpus)
    if corpus_size > 10 * (1024 ** 2) and opt.max_shard_size == 0:
        logger.info("Warning. The corpus %s is larger than 10M bytes, "
                    "you can set '-max_shard_size' to process it by "
                    "small shards to use less memory." % src_corpus)

    if opt.max_shard_size != 0:
        logger.info(' * divide corpus into shards and build dataset '
                    'separately (shard_size = %d bytes).'
                    % opt.max_shard_size)

    ret_list = []
    src_iter = inputters.ShardedTextCorpusIterator(
        src_corpus, opt.src_seq_length_trunc,
        "src", opt.max_shard_size)
    tgt_iter = inputters.ShardedTextCorpusIterator(
        tgt_corpus, opt.tgt_seq_length_trunc,
        "tgt", opt.max_shard_size,
        assoc_iter=src_iter)

    index = 0
    while not src_iter.hit_end():
        index += 1
        dataset = inputters.SingleTurnDataset(
            fields, src_iter, tgt_iter,
            src_seq_length=opt.src_seq_length,
            tgt_seq_length=opt.tgt_seq_length)

        # We save fields in vocab.pt separately, so make it empty.
        dataset.fields = []

        pt_file = "{:s}.{:s}.{:d}.pt".format(
            opt.save_data, corpus_type, index)
        logger.info(" * saving %s data shard to %s."
                    % (corpus_type, pt_file))
        torch.save(dataset, pt_file)

        ret_list.append(pt_file)

    return ret_list
def train(self, train_iter_fct, valid_iter_fct, train_steps, valid_steps):
    """
    The main training loop: iterate over the training data
    (i.e. `train_iter_fct`) and periodically run validation
    (i.e. iterate over `valid_iter_fct`).

    Args:
        train_iter_fct (function): a function that returns the train
            iterator, e.g. something like
            train_iter_fct = lambda: generator(*args, **kwargs)
        valid_iter_fct (function): same as train_iter_fct, for valid data
        train_steps (int): total number of training steps
        valid_steps (int): run validation every `valid_steps` steps

    Returns:
        total training statistics
    """
    logger.info('Start training...')

    step = self.optim._step + 1
    true_batchs = []
    accum = 0
    normalization = 0
    train_iter = train_iter_fct()

    total_stats = ogan.utils.Statistics()
    report_stats = ogan.utils.Statistics()
    self._start_report_manager(start_time=total_stats.start_time)

    while step <= train_steps:

        reduce_counter = 0
        for i, batch in enumerate(train_iter):
            if self.n_gpu == 0 or (i % self.n_gpu == self.gpu_rank):

                if self.gpu_verbose_level > 1:
                    logger.info("GpuRank %d: index: %d accum: %d"
                                % (self.gpu_rank, i, accum))

                cur_dataset = train_iter.get_cur_dataset()
                self.train_loss.cur_dataset = cur_dataset

                true_batchs.append(batch)

                if self.norm_method == "tokens":
                    num_tokens = batch.tgt[1:].data.view(-1) \
                        .ne(self.train_loss.padding_idx).sum()
                    normalization += num_tokens
                else:
                    normalization += batch.batch_size
                accum += 1

                if accum == self.grad_accum_count:
                    reduce_counter += 1
                    if self.gpu_verbose_level > 0:
                        logger.info("GpuRank %d: reduce_counter: %d "
                                    "n_minibatch %d"
                                    % (self.gpu_rank, reduce_counter,
                                       len(true_batchs)))
                    if self.n_gpu > 1:
                        normalization = sum(ogan.utils.distributed
                                            .all_gather_list
                                            (normalization))

                    self._gradient_accumulation(
                        true_batchs, normalization, total_stats,
                        report_stats)

                    report_stats = self._maybe_report_training(
                        step, train_steps,
                        self.optim.learning_rate,
                        report_stats)

                    true_batchs = []
                    accum = 0
                    normalization = 0

                    if step % valid_steps == 0:
                        if self.gpu_verbose_level > 0:
                            logger.info('GpuRank %d: validate step %d'
                                        % (self.gpu_rank, step))
                        valid_iter = valid_iter_fct()
                        valid_stats = self.validate(valid_iter)
                        if self.gpu_verbose_level > 0:
                            logger.info('GpuRank %d: gather valid stat '
                                        'step %d' % (self.gpu_rank, step))
                        valid_stats = self._maybe_gather_stats(valid_stats)
                        if self.gpu_verbose_level > 0:
                            logger.info('GpuRank %d: report stat step %d'
                                        % (self.gpu_rank, step))
                        self._report_step(self.optim.learning_rate,
                                          step, valid_stats=valid_stats)

                    if self.gpu_rank == 0:
                        self._maybe_save(step)
                    step += 1
                    if step > train_steps:
                        break

        if self.gpu_verbose_level > 0:
            logger.info('GpuRank %d: we completed an epoch at step %d'
                        % (self.gpu_rank, step))
        train_iter = train_iter_fct()

    return total_stats
def main(opt):
    opt = training_opt_postprocessing(opt)

    # Load checkpoint if we resume from a previous training run.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        embedding_opt = checkpoint['embedding_opt']
        memory_opt = checkpoint['memory_opt']
        discriminator_opt = checkpoint['discriminator_opt']
        generator_opt = checkpoint['generator_opt']
    else:
        checkpoint = None
        embedding_opt = memory_opt = discriminator_opt = generator_opt = opt

    # Peek at the first dataset to determine the data_type.
    # (All datasets have the same data_type.)
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated during the preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Build memory, discriminator & generator.
    memory, discriminator, generator = build_model(
        embedding_opt, memory_opt, discriminator_opt, generator_opt,
        opt, fields, checkpoint)

    # n_params, enc, dec = _tally_parameters(discriminator)
    # n_params, enc, dec = _tally_parameters(generator)
    # logger.info('encoder: %d' % enc)
    # logger.info('decoder: %d' % dec)
    # logger.info('* number of parameters: %d' % n_params)
    # _check_save_model_path(opt)

    # Build optimizers.
    discriminator_optim = build_optim([discriminator, memory], opt, checkpoint)
    generator_optim = build_optim([generator, memory], opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(embedding_opt, memory_opt,
                                    discriminator_opt, generator_opt, opt,
                                    memory, discriminator, generator,
                                    fields, discriminator_optim)

    trainer = build_trainer(opt, memory, discriminator, generator, fields,
                            discriminator_optim, generator_optim,
                            data_type, model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt)

    # Do the training.
    trainer.train(train_iter_fct, valid_iter_fct,
                  opt.train_steps, opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
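# Hypothetical entry point for this training script. The actual option
# registration lives elsewhere in the repo, so the parser wiring here is an
# assumption; only main(opt) above is taken from this section.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='train.py')
    # assumed helpers that register model/training flags, e.g.:
    # ogan.opts.model_opts(parser)
    # ogan.opts.train_opts(parser)
    opt = parser.parse_args()
    main(opt)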