Example 1
    def output(self, step, num_steps, learning_rate, start):
        """Write out statistics to stdout.

        Args:
            step (int): current step
            num_steps (int): total number of steps
            learning_rate (float): current learning rate
            start (float): start time of the step
        """
        t = self.elapsed_time()
        logger.info(
            ("Step %2d, %5d; acc: %6.2f; ppl: %6.2f; xent: %6.2f; " +
             "lr: %7.5f; %3.0f / %3.0f tok/s; %6.0f sec") %
            (step, num_steps, self.accuracy(), self.ppl(), self.xent(),
             learning_rate, self.n_src_words / (t + 1e-5), self.n_words /
             (t + 1e-5), time.time() - start))
        sys.stdout.flush()
Example 2
def main():
    opt = parse_args()
    init_logger(opt.log_file)
    logger.info("Extracting features...")

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)
Example 3
def build_model(embedding_opt, memory_opt, discriminator_opt, generator_opt,
                opt, fields, checkpoint):
    """Build the memory, discriminator & generator."""
    logger.info('Building memory & discriminator & generator ...')
    memory, discriminator, generator = build_base_model(
        embedding_opt, memory_opt, discriminator_opt, generator_opt,
        fields, use_gpu(opt), checkpoint)
    logger.info(memory)
    logger.info(discriminator)
    logger.info(generator)

    return memory, discriminator, generator
Example 4
def build_save_dataset(corpus_type, fields, opt):
    """ Building and saving the dataset """
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        src_corpus = opt.train_src
        tgt_corpus = opt.train_tgt
    else:
        src_corpus = opt.valid_src
        tgt_corpus = opt.valid_tgt

    # Currently we only do preprocessing sharding for
    # data_type == 'text_single_turn'.
    if opt.data_type == 'text_single_turn':
        return build_save_in_shards(src_corpus, tgt_corpus, fields,
                                    corpus_type, opt)

    # For data_type == 'img' or 'audio', we currently do not shard during
    # preprocessing; we only build a monolithic dataset. Since the
    # interfaces are uniform, it would not be hard to add sharding should
    # users need it.
    dataset = inputters.build_dataset(
        fields,
        opt.data_type,
        src_path=src_corpus,
        tgt_path=tgt_corpus,
        src_dir=opt.src_dir,
        src_seq_length=opt.src_seq_length,
        tgt_seq_length=opt.tgt_seq_length,
        src_seq_length_trunc=opt.src_seq_length_trunc,
        tgt_seq_length_trunc=opt.tgt_seq_length_trunc,
        dynamic_dict=opt.dynamic_dict,
        sample_rate=opt.sample_rate,
        window_size=opt.window_size,
        window_stride=opt.window_stride,
        window=opt.window)

    # We save fields in vocab.pt separately, so make it empty.
    dataset.fields = []

    pt_file = "{:s}.{:s}.pt".format(opt.save_data, corpus_type)
    logger.info(" * saving %s dataset to %s." % (corpus_type, pt_file))
    torch.save(dataset, pt_file)

    return [pt_file]
Example 5
    def _save(self, step):
        discriminator_state_dict = self.discriminator.state_dict()
        generator_state_dict = self.generator.state_dict()
        checkpoint = {
            'discriminator': discriminator_state_dict,
            'generator': generator_state_dict,
            'vocab': ogan.inputters.save_fields_to_vocab(self.fields),
            'embedding_opt': self.embedding_opt,
            'memory_opt': self.memory_opt,
            'discriminator_opt': self.discriminator_opt,
            'generator_opt': self.generator_opt,
            'discriminator_optim': self.optim,
            'generator_optim': self.optim
        }

        logger.info("Saving checkpoint %s_step_%d.pt" % (self.base_path, step))
        checkpoint_path = '%s_step_%d.pt' % (self.base_path, step)
        torch.save(checkpoint, checkpoint_path)
        return checkpoint, checkpoint_path
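
A minimal sketch (not from the source) of how a checkpoint written by _save() might be loaded back; the file name "demo_step_1000.pt" is a hypothetical placeholder.

import torch

# Assumes a checkpoint file produced by _save() above exists at this path.
checkpoint = torch.load("demo_step_1000.pt",
                        map_location=lambda storage, loc: storage)
print(list(checkpoint.keys()))
# -> ['discriminator', 'generator', 'vocab', 'embedding_opt', 'memory_opt',
#     'discriminator_opt', 'generator_opt', 'discriminator_optim',
#     'generator_optim']
# The state dicts can then be restored with, e.g.:
# discriminator.load_state_dict(checkpoint['discriminator'])
# generator.load_state_dict(checkpoint['generator'])
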
Example 6
def training_opt_postprocessing(opt):
    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    if torch.cuda.is_available() and not opt.gpuid:
        logger.info("WARNING: You have a CUDA device, should run with -gpuid")

    if opt.gpuid:
        torch.cuda.set_device(opt.device_id)
        if opt.seed > 0:
            # this one is needed for torchtext random call (shuffled iterator)
            # in multi gpu it ensures datasets are read in the same order
            random.seed(opt.seed)
            # These ensure same initialization in multi gpu mode
            torch.manual_seed(opt.seed)
            torch.cuda.manual_seed(opt.seed)

    return opt
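
A small, self-contained illustration (not project code) of why the seed is set before building the shuffled torchtext iterator: with the same seed, every GPU process draws the same shuffle order, so the datasets are read in the same order across processes.

import random

random.seed(42)
order_a = random.sample(range(10), 10)

random.seed(42)
order_b = random.sample(range(10), 10)

# With the same seed, both "processes" see the data in the same order.
assert order_a == order_b
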
Example 7
def _load_fields(dataset, data_type, opt, checkpoint):
    if checkpoint is not None:
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        fields = load_fields_from_vocab(checkpoint['vocab'], data_type)
    else:
        fields = load_fields_from_vocab(torch.load(opt.data + '.vocab.pt'),
                                        data_type)
    fields = dict([(k, f) for (k, f) in fields.items()
                   if k in dataset.examples[0].__dict__])

    if data_type == 'text':
        logger.info(' * vocabulary size: source = %d; target = %d' %
                    (len(fields['src'].vocab), len(fields['tgt'].vocab)))
    else:
        logger.info(' * vocabulary size: target = %d' %
                    len(fields['tgt'].vocab))

    return fields
Example 8
def _lazy_dataset_loader(pt_file, corpus_type):
    dataset = torch.load(pt_file)
    logger.info('Loading %s dataset from %s, number of examples: %d' %
                (corpus_type, pt_file, len(dataset)))
    return dataset
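
A minimal sketch (not from the source) of how a loader like this might be driven lazily over the shard files written by build_save_in_shards(); the glob pattern "demo.train.*.pt" is a hypothetical example.

import glob

def lazily_load_shards(pattern, corpus_type):
    # Yield one shard at a time so only a single shard is resident in memory.
    # Note: sorted() orders the files lexicographically; a numeric sort key
    # would be needed if there are more than nine shards.
    for pt_file in sorted(glob.glob(pattern)):
        yield _lazy_dataset_loader(pt_file, corpus_type)

# for dataset in lazily_load_shards("demo.train.*.pt", "train"):
#     ...  # iterate over dataset.examples, then let the shard be collected
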
Example 9
def build_vocab(train_dataset_files, fields, data_type, share_vocab,
                src_vocab_path, src_vocab_size, src_words_min_frequency,
                tgt_vocab_path, tgt_vocab_size, tgt_words_min_frequency):
    """
    Args:
        train_dataset_files: a list of train dataset .pt files.
        fields (dict): fields to build vocab for.
        data_type (string): type of the corpus, e.g. "text_single_turn",
                "img" or "audio".
        share_vocab (bool): share source and target vocabulary?
        src_vocab_path (string): path to the source vocabulary file.
        src_vocab_size (int): size of the source vocabulary.
        src_words_min_frequency (int): the minimum frequency needed to
                include a source word in the vocabulary.
        tgt_vocab_path (string): path to the target vocabulary file.
        tgt_vocab_size (int): size of the target vocabulary.
        tgt_words_min_frequency (int): the minimum frequency needed to
                include a target word in the vocabulary.
    Returns:
        Dict of Fields
    """
    counter = {}
    for k in fields:
        counter[k] = Counter()

    # Load vocabulary
    src_vocab = None
    if src_vocab_path:
        src_vocab = set([])
        logger.info('Loading source vocab from %s' % src_vocab_path)

        assert os.path.exists(src_vocab_path), \
            'src vocab %s not found!' % src_vocab_path
        with open(src_vocab_path) as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                word = line.strip().split()[0]
                src_vocab.add(word)

    tgt_vocab = None
    if tgt_vocab_path:
        tgt_vocab = set([])
        logger.info('Loading target vocab from %s' % tgt_vocab_path)
        assert os.path.exists(tgt_vocab_path), \
            'tgt vocab %s not found!' % tgt_vocab_path
        with open(tgt_vocab_path) as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                word = line.strip().split()[0]
                tgt_vocab.add(word)

    for path in train_dataset_files:
        dataset = torch.load(path)
        logger.info(" * reloading %s." % path)
        for ex in dataset.examples:
            for k in fields:
                val = getattr(ex, k, None)
                if val is not None and not fields[k].sequential:
                    val = [val]
                elif k == 'src' and src_vocab:
                    val = [item for item in val if item in src_vocab]
                elif k == 'tgt' and tgt_vocab:
                    val = [item for item in val if item in tgt_vocab]
                counter[k].update(val)

    _build_field_vocab(fields["tgt"],
                       counter["tgt"],
                       max_size=tgt_vocab_size,
                       min_freq=tgt_words_min_frequency)
    logger.info(" * tgt vocab size: %d." % len(fields["tgt"].vocab))

    if data_type == 'text_single_turn':
        _build_field_vocab(fields["src"],
                           counter["src"],
                           max_size=src_vocab_size,
                           min_freq=src_words_min_frequency)
        logger.info(" * src vocab size: %d." % len(fields["src"].vocab))

        # Merge the input and output vocabularies.
        if share_vocab:
            # `tgt_vocab_size` is ignored when sharing vocabularies
            logger.info(" * merging src and tgt vocab...")
            merged_vocab = merge_vocabs(
                [fields["src"].vocab, fields["tgt"].vocab],
                vocab_size=src_vocab_size)
            fields["src"].vocab = merged_vocab
            fields["tgt"].vocab = merged_vocab

    return fields
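
A minimal sketch (hypothetical file names and option values) of how build_vocab() might be called on the shard files produced during preprocessing; `fields` is assumed to come from inputters.get_fields(), as in Example 2.

fields = inputters.get_fields("text_single_turn")
fields = build_vocab(
    train_dataset_files=["demo.train.1.pt", "demo.train.2.pt"],
    fields=fields,
    data_type="text_single_turn",
    share_vocab=True,          # merge src and tgt vocabularies
    src_vocab_path="", src_vocab_size=50000, src_words_min_frequency=0,
    tgt_vocab_path="", tgt_vocab_size=50000, tgt_words_min_frequency=0)
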
Example 10
    def log(self, *args, **kwargs):
        """Forward the message to the global logger."""
        logger.info(*args, **kwargs)
Example 11
def build_save_in_shards(src_corpus, tgt_corpus, fields, corpus_type, opt):
    """
    Divide the big corpus into shards, and build dataset separately.
    This is currently only for data_type=='text'.
    The reason we do this is to avoid taking up too much memory due
    to sucking in a huge corpus file.
    To tackle this, we only read in part of the corpus file of size
    `max_shard_size`(actually it is multiples of 64 bytes that equals
    or is slightly larger than this size), and process it into dataset,
    then write it to disk along the way. By doing this, we only focus on
    part of the corpus at any moment, thus effectively reducing memory use.
    According to test, this method can reduce memory footprint by ~50%.
    Note! As we process along the shards, previous shards might still
    stay in memory, but since we are done with them, and no more
    reference to them, if there is memory tight situation, the OS could
    easily reclaim these memory.
    If `max_shard_size` is 0 or is larger than the corpus size, it is
    effectively preprocessed into one dataset, i.e. no sharding.
    NOTE! `max_shard_size` is measuring the input corpus size, not the
    output pt file size. So a shard pt file consists of examples of size
    2 * `max_shard_size`(source + target).
    """

    corpus_size = os.path.getsize(src_corpus)
    if corpus_size > 10 * (1024**2) and opt.max_shard_size == 0:
        logger.info("Warning. The corpus %s is larger than 10M bytes, "
                    "you can set '-max_shard_size' to process it by "
                    "small shards to use less memory." % src_corpus)

    if opt.max_shard_size != 0:
        logger.info(' * divide corpus into shards and build dataset '
                    'separately (shard_size = %d bytes).' % opt.max_shard_size)

    ret_list = []
    src_iter = inputters.ShardedTextCorpusIterator(src_corpus,
                                                   opt.src_seq_length_trunc,
                                                   "src", opt.max_shard_size)
    tgt_iter = inputters.ShardedTextCorpusIterator(tgt_corpus,
                                                   opt.tgt_seq_length_trunc,
                                                   "tgt",
                                                   opt.max_shard_size,
                                                   assoc_iter=src_iter)

    index = 0
    while not src_iter.hit_end():
        index += 1
        dataset = inputters.SingleTurnDataset(
            fields,
            src_iter,
            tgt_iter,
            src_seq_length=opt.src_seq_length,
            tgt_seq_length=opt.tgt_seq_length)

        # We save fields in vocab.pt separately, so make it empty.
        dataset.fields = []

        pt_file = "{:s}.{:s}.{:d}.pt".format(opt.save_data, corpus_type, index)
        logger.info(" * saving %s data shard to %s." % (corpus_type, pt_file))
        torch.save(dataset, pt_file)

        ret_list.append(pt_file)

    return ret_list
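
A small, self-contained illustration (not project code) of the docstring's note that the effective shard size is rounded up to a multiple of 64 bytes.

requested = 10 * 1024 ** 2 + 1             # ask for 10 MB plus one byte
effective = ((requested + 63) // 64) * 64  # round up to a multiple of 64
print(effective - requested)               # 63 extra bytes in this case
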
Example 12
    def train(self, train_iter_fct, valid_iter_fct, train_steps, valid_steps):
        """
        The main training loops.
        by iterating over training data (i.e. `train_iter_fct`)
        and running validation (i.e. iterating over `valid_iter_fct`
        Args:
            train_iter_fct(function): a function that returns the train
                iterator. e.g. something like
                train_iter_fct = lambda: generator(*args, **kwargs)
            valid_iter_fct(function): same as train_iter_fct, for valid data
            train_steps(int):
            valid_steps(int):
            save_checkpoint_steps(int):
        Return:
            None
        """
        logger.info('Start training...')

        step = self.optim._step + 1
        true_batchs = []
        accum = 0
        normalization = 0
        train_iter = train_iter_fct()

        total_stats = ogan.utils.Statistics()
        report_stats = ogan.utils.Statistics()
        self._start_report_manager(start_time=total_stats.start_time)

        while step <= train_steps:

            reduce_counter = 0
            for i, batch in enumerate(train_iter):
                if self.n_gpu == 0 or (i % self.n_gpu == self.gpu_rank):
                    if self.gpu_verbose_level > 1:
                        logger.info("GpuRank %d: index: %d accum: %d"
                                    % (self.gpu_rank, i, accum))
                    cur_dataset = train_iter.get_cur_dataset()
                    self.train_loss.cur_dataset = cur_dataset

                    true_batchs.append(batch)

                    if self.norm_method == "tokens":
                        num_tokens = batch.tgt[1:].data.view(-1) \
                                     .ne(self.train_loss.padding_idx).sum()
                        normalization += num_tokens
                    else:
                        normalization += batch.batch_size

                    accum += 1
                    if accum == self.grad_accum_count:
                        reduce_counter += 1
                        if self.gpu_verbose_level > 0:
                            logger.info("GpuRank %d: reduce_counter: %d \
                                        n_minibatch %d"
                                        % (self.gpu_rank, reduce_counter,
                                           len(true_batchs)))
                        if self.n_gpu > 1:
                            normalization = sum(ogan.utils.distributed
                                                .all_gather_list
                                                (normalization))

                        self._gradient_accumulation(
                            true_batchs, normalization, total_stats,
                            report_stats)

                        report_stats = self._maybe_report_training(
                            step, train_steps,
                            self.optim.learning_rate,
                            report_stats)

                        true_batchs = []
                        accum = 0
                        normalization = 0
                        if step % valid_steps == 0:
                            if self.gpu_verbose_level > 0:
                                logger.info('GpuRank %d: validate step %d'
                                            % (self.gpu_rank, step))
                            valid_iter = valid_iter_fct()
                            valid_stats = self.validate(valid_iter)
                            if self.gpu_verbose_level > 0:
                                logger.info('GpuRank %d: gather valid stat '
                                            'step %d' % (self.gpu_rank, step))
                            valid_stats = self._maybe_gather_stats(valid_stats)
                            if self.gpu_verbose_level > 0:
                                logger.info('GpuRank %d: report stat step %d'
                                            % (self.gpu_rank, step))
                            self._report_step(self.optim.learning_rate,
                                              step, valid_stats=valid_stats)

                        if self.gpu_rank == 0:
                            self._maybe_save(step)
                        step += 1
                        if step > train_steps:
                            break
            if self.gpu_verbose_level > 0:
                logger.info('GpuRank %d: we completed an epoch '
                            'at step %d' % (self.gpu_rank, step))
            train_iter = train_iter_fct()

        return total_stats
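
A minimal sketch (not the project's real data pipeline) of the iterator factories the docstring describes: each call must return a fresh iterator, because the loop above rebuilds train_iter once it has been exhausted.

def make_iter_fct(batches):
    # A zero-argument factory; each call restarts iteration from scratch.
    return lambda: iter(batches)

train_iter_fct = make_iter_fct(list(range(100)))  # stand-in for training batches
valid_iter_fct = make_iter_fct(list(range(10)))   # stand-in for validation batches

# trainer.train(train_iter_fct, valid_iter_fct,
#               train_steps=10000, valid_steps=1000)
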
Example 13
def main(opt):
    opt = training_opt_postprocessing(opt)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        embedding_opt = checkpoint["embedding_opt"]
        memory_opt = checkpoint["memory_opt"]
        discriminator_opt = checkpoint['discriminator_opt']
        generator_opt = checkpoint['generator_opt']

    else:
        checkpoint = None
        embedding_opt = memory_opt = discriminator_opt = generator_opt = opt

    # Peek at the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Build discriminator & generator.
    # discriminator = build_discriminator(discriminator_opt, opt, fields, discriminator_checkpoint)
    # generator = build_generator(generator_opt, opt, fields, generator_checkpoint)
    memory, discriminator, generator = build_model(embedding_opt, memory_opt,
                                                   discriminator_opt,
                                                   generator_opt, opt, fields,
                                                   checkpoint)

    # n_params, enc, dec = _tally_parameters(discriminator)
    # n_params, enc, dec = _tally_parameters(generator)
    # logger.info('encoder: %d' % enc)
    # logger.info('decoder: %d' % dec)
    # logger.info('* number of parameters: %d' % n_params)
    # _check_save_model_path(opt)

    # Build optimizer.
    discriminator_optim = build_optim([discriminator, memory], opt, checkpoint)
    generator_optim = build_optim([generator, memory], opt, checkpoint)

    # Build model saver
    model_saver = build_model_saver(embedding_opt, memory_opt,
                                    discriminator_opt, generator_opt, opt,
                                    memory, discriminator, generator, fields,
                                    discriminator_optim)

    trainer = build_trainer(opt,
                            memory,
                            discriminator,
                            generator,
                            fields,
                            discriminator_optim,
                            generator_optim,
                            data_type,
                            model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(lazily_load_dataset("train", opt), fields,
                                  opt)

    def valid_iter_fct():
        return build_dataset_iter(lazily_load_dataset("valid", opt), fields,
                                  opt)

    # Do training.
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()