Example no. 1
    def load_dataset(self,
                     split,
                     src_bin_path,
                     tgt_bin_path,
                     weights_file=None):
        corpus = pytorch_translate_data.ParallelCorpusConfig(
            source=pytorch_translate_data.CorpusConfig(
                dialect=self.args.source_lang, data_file=src_bin_path),
            target=pytorch_translate_data.CorpusConfig(
                dialect=self.args.target_lang, data_file=tgt_bin_path),
            weights_file=weights_file,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)

        if not os.path.exists(corpus.source.data_file):
            raise ValueError(
                f"{corpus.source.data_file} for {split} not found!")
        if not os.path.exists(corpus.target.data_file):
            raise ValueError(
                f"{corpus.target.data_file} for {split} not found!")

        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file)
        weights_dataset = None
        if corpus.weights_file and os.path.exists(corpus.weights_file):
            weights_dataset = weighted_data.IndexedWeightsDataset(
                corpus.weights_file)
            assert len(dst_dataset) == len(weights_dataset)

        if self.char_source_dict is not None:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file)
            self.datasets[split] = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                weights=weights_dataset,
            )
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                corpus.source.data_file)
            self.datasets[split] = weighted_data.WeightedLanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                weights=weights_dataset,
            )

        if self.args.log_verbose:
            print("Finished loading dataset", flush=True)

        print(f"| {split} {len(self.datasets[split])} examples")
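
For orientation, a hypothetical call site for the method above; the task object, split name, and file paths are placeholders and not part of the original example.

# Hypothetical invocation sketch: `task`, the split name, and the paths below
# are placeholders, not part of the original example.
task.load_dataset(
    split="train",
    src_bin_path="/data/train.src.bin",
    tgt_bin_path="/data/train.tgt.bin",
    weights_file=None,  # optional per-sentence weights file
)
train_data = task.datasets["train"]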
Example no. 2
    def load_dataset(self,
                     split,
                     src_bin_path,
                     tgt_bin_path,
                     weights_file=None,
                     is_train=False):
        """
        Currently this method does not support character models.
        """
        corpus = pytorch_translate_data.ParallelCorpusConfig(
            source=pytorch_translate_data.CorpusConfig(
                dialect=self.args.source_lang, data_file=src_bin_path),
            target=pytorch_translate_data.CorpusConfig(
                dialect=self.args.target_lang, data_file=tgt_bin_path),
            weights_file=weights_file,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)
        data_utils.validate_corpus_exists(corpus=corpus, split=split)

        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file)
        src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file)
        if is_train:
            self.datasets[split] = TeacherDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.src_dict,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.tgt_dict,
                teacher_models=self.teacher_models,
                top_k_teacher_tokens=self.top_k_teacher_tokens,
                top_k_teacher_scores=self.top_k_teacher_scores,
                top_k_teacher_indices=self.top_k_teacher_indices,
                left_pad_source=False,
            )
        else:
            self.datasets[split] = weighted_data.WeightedLanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.src_dict,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.tgt_dict,
                weights=None,
                left_pad_source=False,
            )

        if self.args.log_verbose:
            print("Finished loading dataset", flush=True)

        print(f"| {split} {len(self.datasets[split])} examples")
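
Example no. 2 above delegates the existence checks to data_utils.validate_corpus_exists instead of inlining them. Below is a minimal sketch of such a helper, assuming it simply mirrors the inline checks from Example no. 1; the real implementation may differ.

import os


def validate_corpus_exists(corpus, split):
    """Raise if either side of the parallel corpus is missing on disk.

    Sketch only: mirrors the inline os.path.exists checks from Example
    no. 1; the actual data_utils.validate_corpus_exists may differ.
    """
    if not os.path.exists(corpus.source.data_file):
        raise ValueError(f"{corpus.source.data_file} for {split} not found!")
    if not os.path.exists(corpus.target.data_file):
        raise ValueError(f"{corpus.target.data_file} for {split} not found!")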
Example no. 3
def load_parallel_dataset(
    source_lang,
    target_lang,
    src_bin_path,
    tgt_bin_path,
    source_dictionary,
    target_dictionary,
    split,
    remove_eos_from_source,
    append_eos_to_target=True,
    char_source_dict=None,
    log_verbose=True,
):
    corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=source_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=target_lang, data_file=tgt_bin_path
        ),
        weights_file=None,
    )

    if log_verbose:
        print("Starting to load binarized data files.", flush=True)
    validate_corpus_exists(corpus=corpus, split=split)

    tgt_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    if char_source_dict is not None:
        src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
            corpus.source.data_file
        )
    else:
        src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file
        )
    parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=src_dataset,
        src_sizes=src_dataset.sizes,
        src_dict=source_dictionary,
        tgt=tgt_dataset,
        tgt_sizes=tgt_dataset.sizes,
        tgt_dict=target_dictionary,
        remove_eos_from_source=remove_eos_from_source,
        append_eos_to_target=append_eos_to_target,
    )
    return parallel_dataset, src_dataset, tgt_dataset
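
A hypothetical call to the free function above; the language codes, file paths, and preloaded dictionaries are placeholders assumed to be built elsewhere and are not part of the original example.

# Hypothetical usage sketch: paths, language codes, and the dictionaries are
# placeholders, not part of the original example.
parallel_dataset, src_dataset, tgt_dataset = load_parallel_dataset(
    source_lang="en",
    target_lang="de",
    src_bin_path="/data/train.en.bin",
    tgt_bin_path="/data/train.de.bin",
    source_dictionary=source_dictionary,  # assumed preloaded dictionary
    target_dictionary=target_dictionary,  # assumed preloaded dictionary
    split="train",
    remove_eos_from_source=False,
    append_eos_to_target=True,
    char_source_dict=None,  # word-level source; no character dictionary
    log_verbose=True,
)
print(f"| train {len(parallel_dataset)} examples")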
Example no. 4
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]

    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.train_source_binary_prefix),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.train_target_binary_prefix),
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.eval_source_binary_prefix),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.eval_target_binary_prefix),
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")

    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(
        f"| num. model params: "
        f"{sum(p.data.numel() for p in model.parameters())}"
    )

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    extra_state = load_existing_checkpoint(args.save_dir, args.restore_file,
                                           trainer)

    return extra_state, trainer, dataset
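
A hypothetical driver for the function above; only setup_training and its return values come from the example, while the argument-parser factory and the train() loop are assumed names.

# Hypothetical driver sketch: get_training_parser() and train() are assumed
# names, not part of the original example.
def main():
    parser = get_training_parser()  # assumed to define all args used above
    args = parser.parse_args()
    extra_state, trainer, dataset = setup_training(args)
    train(args, extra_state, trainer, dataset)  # assumed training loop


if __name__ == "__main__":
    main()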
Example no. 5
    def load_dataset(
        self, split, src_bin_path, tgt_bin_path, forward_model=None, backward_model=None
    ):
        """Load a dataset split."""

        corpus = ptt_data.ParallelCorpusConfig(
            source=ptt_data.CorpusConfig(
                dialect=self.source_lang, data_file=src_bin_path
            ),
            target=ptt_data.CorpusConfig(
                dialect=self.target_lang, data_file=tgt_bin_path
            ),
            weights_file=None,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)
        data_utils.validate_corpus_exists(corpus=corpus, split=split)

        forward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file
        )
        backward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file
        )
        forward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file
        )
        backward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file
        )
        forward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
            src=forward_src_dataset,
            src_sizes=forward_src_dataset.sizes,
            src_dict=self.source_dictionary,
            tgt=forward_tgt_dataset,
            tgt_sizes=forward_tgt_dataset.sizes,
            tgt_dict=self.target_dictionary,
            remove_eos_from_source=self.remove_eos_from_source,
            append_eos_to_target=True,
        )
        backward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
            src=backward_src_dataset,
            src_sizes=backward_src_dataset.sizes,
            src_dict=self.target_dictionary,
            tgt=backward_tgt_dataset,
            tgt_sizes=backward_tgt_dataset.sizes,
            tgt_dict=self.source_dictionary,
            remove_eos_from_source=self.remove_eos_from_source,
            append_eos_to_target=True,
        )

        dataset_map = OrderedDict(
            [
                (f"{self.source_lang}-{self.target_lang}", forward_parallel_dataset),
                (f"{self.target_lang}-{self.source_lang}", backward_parallel_dataset),
            ]
        )

        assert (forward_model and backward_model) or (
            forward_model is None and backward_model is None
        ), (
            "forward_model and backward_model must either both be provided"
            " or both be None"
        )
        if forward_model and backward_model:
            fwd_generator = beam_decode.SequenceGenerator(
                models=[forward_model], tgt_dict=self.source_dictionary
            )
            bwd_generator = beam_decode.SequenceGenerator(
                models=[backward_model], tgt_dict=self.target_dictionary
            )

            def monolingual_dataset(
                path,
                dictionary,
                is_source=False,
                num_examples_limit: Optional[int] = None,
            ):
                dataset = self.load_monolingual_dataset(
                    path, is_source=is_source, num_examples_limit=num_examples_limit
                )
                return LanguagePairDataset(
                    src=dataset,
                    src_sizes=dataset.sizes,
                    src_dict=dictionary,
                    tgt=None,
                    tgt_sizes=None,
                    tgt_dict=None,
                )

            monolingual_num_examples_limit = None
            if self.args.monolingual_ratio is not None:
                monolingual_num_examples_limit = int(
                    self.args.monolingual_ratio * len(forward_parallel_dataset)
                )

            src_dataset = monolingual_dataset(
                path=self.args.train_mono_source_binary_path,
                dictionary=self.source_dictionary,
                is_source=True,
                num_examples_limit=monolingual_num_examples_limit,
            )
            tgt_dataset = monolingual_dataset(
                path=self.args.train_mono_target_binary_path,
                dictionary=self.target_dictionary,
                is_source=False,
                num_examples_limit=monolingual_num_examples_limit,
            )

            dataset_map[
                f"{self.source_lang}-"
                f"{self.target_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = BacktranslationDataset(
                tgt_dataset=TransformEosDataset(
                    dataset=tgt_dataset,
                    eos=self.target_dictionary.eos(),
                    # Remove EOS from the input before backtranslation.
                    remove_eos_from_src=True,
                ),
                backtranslation_fn=bwd_generator.generate,
                max_len_a=self.args.max_len_a,
                max_len_b=self.args.max_len_b,
                output_collater=TransformEosDataset(
                    dataset=tgt_dataset,
                    eos=self.target_dictionary.eos(),
                    # The original input (now the target) doesn't have
                    # an EOS, so we need to add one. The generated
                    # backtranslation (now the source) will have an EOS,
                    # so we want to remove it.
                    append_eos_to_tgt=True,
                    remove_eos_from_src=True,
                ).collater,
            )
            dataset_map[
                f"{self.target_lang}-"
                f"{self.source_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = BacktranslationDataset(
                tgt_dataset=src_dataset,
                backtranslation_fn=fwd_generator.generate,
                max_len_a=self.args.max_len_a,
                max_len_b=self.args.max_len_b,
                output_collater=TransformEosDataset(
                    dataset=src_dataset,
                    eos=self.source_dictionary.eos(),
                    # The original input (now the target) doesn't have
                    # an EOS, so we need to add one. The generated
                    # backtranslation (now the source) will have an EOS,
                    # so we want to remove it.
                    append_eos_to_tgt=True,
                    remove_eos_from_src=True,
                ).collater,
            )

        # print before loading RoundRobinZipDatasets to help catch any bugs
        for dataset_key, dataset in dataset_map.items():
            print(f"| {split}: {dataset_key} {len(dataset)} examples in dataset")

        self.datasets[split] = RoundRobinZipDatasets(dataset_map)
        print(
            f"| {split} {len(self.datasets[split])} examples in RoundRobinZipDatasets"
        )

        if self.args.log_verbose:
            print("Finished loading dataset", flush=True)
Example no. 6
    def load_dataset(self,
                     split,
                     src_bin_path,
                     tgt_bin_path,
                     seed=None,
                     noiser=None):
        """
        Load a dataset split. Seed and noiser are only used for loading train
        data, not eval data.
        """

        corpus = pytorch_translate_data.ParallelCorpusConfig(
            source=pytorch_translate_data.CorpusConfig(
                dialect=self.source_lang, data_file=src_bin_path),
            target=pytorch_translate_data.CorpusConfig(
                dialect=self.target_lang, data_file=tgt_bin_path),
            weights_file=None,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)
        data_utils.validate_corpus_exists(corpus=corpus, split=split)

        tgt_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file)
        if self.char_source_dict is not None:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file)
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                corpus.source.data_file)
        parallel_dataset = weighted_data.WeightedLanguagePairDataset(
            src=src_dataset,
            src_sizes=src_dataset.sizes,
            src_dict=self.source_dictionary,
            tgt=tgt_dataset,
            tgt_sizes=tgt_dataset.sizes,
            tgt_dict=self.target_dictionary,
            remove_eos_from_source=not self.args.append_eos_to_source,
            append_eos_to_target=True,
        )
        dataset_map = OrderedDict([(f"{self.source_lang}-{self.target_lang}",
                                    parallel_dataset)])

        if noiser is not None:
            if getattr(self.args, "denoising_source_parallel", False):
                dataset_map[(
                    f"{self.source_lang}-{self.source_lang}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=src_dataset,
                        src_dict=self.source_dictionary,
                        seed=seed,
                        noiser=noiser,
                    ),
                    tgt=src_dataset,
                    src_sizes=src_dataset.sizes,
                    src_dict=self.source_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )
            if getattr(self.args, "denoising_target_parallel", False):
                dataset_map[(
                    f"{self.target_lang}-{self.target_lang}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=tgt_dataset,
                        src_dict=self.target_dictionary,
                        seed=seed,
                        noiser=noiser,
                    ),
                    tgt=tgt_dataset,
                    src_sizes=tgt_dataset.sizes,
                    src_dict=self.target_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )

            if getattr(self.args, "denoising_source_mono", False):
                source_mono_dataset = self.load_monolingual_dataset(
                    self.args.train_mono_source_binary_path)
                dataset_map[(
                    f"{self.source_lang}-{self.source_lang}_"
                    f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=source_mono_dataset,
                        src_dict=self.source_dictionary,
                        seed=seed,
                        noiser=noiser,
                    ),
                    tgt=source_mono_dataset,
                    src_sizes=source_mono_dataset.sizes,
                    src_dict=self.source_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )
            if getattr(self.args, "denoising_target_mono", False):
                target_mono_dataset = self.load_monolingual_dataset(
                    self.args.train_mono_target_binary_path)
                dataset_map[(
                    f"{self.target_lang}-{self.target_lang}_"
                    f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=target_mono_dataset,
                        src_dict=self.target_dictionary,
                        seed=seed,
                        noiser=noiser,
                    ),
                    tgt=target_mono_dataset,
                    src_sizes=target_mono_dataset.sizes,
                    src_dict=self.target_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )

        self.datasets[split] = RoundRobinZipDatasets(dataset_map)

        if self.args.log_verbose:
            print("Finished loading dataset", flush=True)

        print(f"| {split} {len(self.datasets[split])} datasets")
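
Each denoising entry above follows the same pattern: a noised view of a dataset becomes the source while the clean dataset is the target, so the model learns to reconstruct clean text from noised input. A possible factoring of that repeated construction, using only the arguments shown above; it is not part of the original code.

def build_denoising_dataset(clean_dataset, dictionary, seed, noiser,
                            remove_eos_from_source):
    # Sketch only: pairs a noised view of `clean_dataset` (source side) with
    # the clean dataset itself (target side), as done inline above.
    return weighted_data.WeightedLanguagePairDataset(
        src=noising.NoisingDataset(
            src_dataset=clean_dataset,
            src_dict=dictionary,
            seed=seed,
            noiser=noiser,
        ),
        tgt=clean_dataset,
        src_sizes=clean_dataset.sizes,
        src_dict=dictionary,
        remove_eos_from_source=remove_eos_from_source,
        append_eos_to_target=True,
    )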
Example no. 7
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]

    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.train_source_binary_path),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.train_target_binary_path),
        weights_file=args.train_weights_path if hasattr(
            args, "train_weights_path") else None,
    )

    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.eval_source_binary_path),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.eval_target_binary_path),
        weights_file=None,
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    use_char_source = args.arch == "char_source"
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
        use_char_source=use_char_source,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")

    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    print("building criterion")
    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(
        f"| num. model params: "
        f"{sum(p.numel() for p in model.parameters())}"
    )

    # Load pretrained model weights if applicable
    if args.pretrained_weights_file:
        utils.load_model_state(args.pretrained_weights_file,
                               model,
                               cuda_device=torch.cuda.current_device())

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(
            f"| Restoring individual models from {args.multi_model_restore_files}"
        )
        extra_state = multi_model.import_individual_models(
            args.multi_model_restore_files, trainer)
    else:
        extra_state = load_existing_checkpoint(checkpoint_path, trainer)
    return extra_state, trainer, dataset
Example no. 8
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]

    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.train_source_binary_path),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.train_target_binary_path),
        weights_file=args.train_weights_path if hasattr(
            args, "train_weights_path") else None,
    )

    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.eval_source_binary_path),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.eval_target_binary_path),
        weights_file=None,
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    use_char_source = args.arch == "char_source"
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
        use_char_source=use_char_source,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")

    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    print("building criterion")
    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(
        f"| num. model params: "
        f"{sum(p.numel() for p in model.parameters())}"
    )

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    os.makedirs(args.save_dir, exist_ok=True)

    # If --restore-file is already present under --save-dir, use that one
    # instead of the --restore-file that may be present under
    # --restore-checkpoint-dir. The idea is that --restore-checkpoint-dir
    # allows the user to specify restoring from a different run's
    # checkpoint (possibly with different training params), while not
    # polluting the previous run's checkpoint directory with new checkpoints.
    # However, if training gets interrupted and the user restarts training,
    # we want to resume from the checkpoints under --save-dir, instead of
    # restarting again from the old run's checkpoint under
    # --restore-checkpoint-dir.
    #
    # Note that if args.restore_file is an absolute path, os.path.join() will
    # ignore previous directory args and just use the absolute path as is.
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if os.path.exists(checkpoint_path):
        print(f"Using --save-dir={args.save_dir}, "
              f"--restore-file={args.restore_file}.")
    elif args.restore_checkpoint_dir:
        checkpoint_path = os.path.join(args.restore_checkpoint_dir,
                                       args.restore_file)
        print(f"Using --restore-checkpoint-dir={args.restore_checkpoint_dir}, "
              f"--restore-file={args.restore_file}.")

    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(
            f"| Restoring individual models from {args.multi_model_restore_files}"
        )
        extra_state = multi_model.import_individual_models(
            args.multi_model_restore_files, trainer)
    else:
        extra_state = load_existing_checkpoint(
            checkpoint_path=checkpoint_path,
            trainer=trainer,
            restore_state=args.restore_checkpoint_state,
        )
    return extra_state, trainer, dataset
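
The checkpoint-path resolution described in the long comment above can also be read as a small helper. The sketch below captures that logic; it is not part of the original code.

import os


def resolve_checkpoint_path(save_dir, restore_checkpoint_dir, restore_file):
    # Prefer a checkpoint that already exists under --save-dir; otherwise
    # fall back to --restore-checkpoint-dir when that directory is given.
    checkpoint_path = os.path.join(save_dir, restore_file)
    if os.path.exists(checkpoint_path):
        return checkpoint_path
    if restore_checkpoint_dir:
        return os.path.join(restore_checkpoint_dir, restore_file)
    return checkpoint_path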
Example no. 9
    def load_dataset(self,
                     split,
                     src_bin_path,
                     tgt_bin_path,
                     forward_model=None,
                     backward_model=None):
        """Load a dataset split."""

        corpus = ptt_data.ParallelCorpusConfig(
            source=ptt_data.CorpusConfig(dialect=self.source_lang,
                                         data_file=src_bin_path),
            target=ptt_data.CorpusConfig(dialect=self.target_lang,
                                         data_file=tgt_bin_path),
            weights_file=None,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)
        data_utils.validate_corpus_exists(corpus=corpus, split=split)

        forward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file)
        backward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file)
        forward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file)
        backward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file)
        forward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
            src=forward_src_dataset,
            src_sizes=forward_src_dataset.sizes,
            src_dict=self.source_dictionary,
            tgt=forward_tgt_dataset,
            tgt_sizes=forward_tgt_dataset.sizes,
            tgt_dict=self.target_dictionary,
            remove_eos_from_source=self.remove_eos_from_source,
            append_eos_to_target=True,
        )
        backward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
            src=backward_src_dataset,
            src_sizes=backward_src_dataset.sizes,
            src_dict=self.target_dictionary,
            tgt=backward_tgt_dataset,
            tgt_sizes=backward_tgt_dataset.sizes,
            tgt_dict=self.source_dictionary,
            remove_eos_from_source=self.remove_eos_from_source,
            append_eos_to_target=True,
        )

        dataset_map = OrderedDict([
            (f"{self.source_lang}-{self.target_lang}",
             forward_parallel_dataset),
            (f"{self.target_lang}-{self.source_lang}",
             backward_parallel_dataset),
        ])

        assert (forward_model and backward_model) or (
            forward_model is None and backward_model is None), (
                "forward_model and backward_model must either both be"
                " provided or both be None")
        if forward_model and backward_model:
            dataset_map[
                f"{self.source_lang}-"
                f"{self.target_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"] = BacktranslationDataset(
                    tgt_dataset=self.load_monolingual_dataset(
                        self.args.train_mono_target_binary_path),
                    tgt_dict=self.target_dictionary,
                    backtranslation_model=backward_model,
                    max_len_a=self.args.max_len_a,
                    max_len_b=self.args.max_len_b,
                    remove_eos_at_src=True,
                    generator_class=beam_decode.SequenceGenerator,
                )
            dataset_map[
                f"{self.target_lang}-"
                f"{self.source_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"] = BacktranslationDataset(
                    tgt_dataset=self.load_monolingual_dataset(
                        self.args.train_mono_source_binary_path),
                    tgt_dict=self.source_dictionary,
                    backtranslation_model=forward_model,
                    max_len_a=self.args.max_len_a,
                    max_len_b=self.args.max_len_b,
                    remove_eos_at_src=True,
                    generator_class=beam_decode.SequenceGenerator,
                )
        self.datasets[split] = RoundRobinZipDatasets(dataset_map)

        if self.args.log_verbose:
            print("Finished loading dataset", flush=True)

        print(f"| {split} {len(self.datasets[split])} datasets")
Example no. 10
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]

    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    assert_corpora_files_specified(args)
    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.train_source_text_file,
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.train_target_text_file,
        ),
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.eval_source_text_file,
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.eval_target_text_file,
        ),
    )

    if args.log_verbose:
        print('Starting to load raw text files.', flush=True)
    dataset = pytorch_translate_data.load_raw_text_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
    )
    if args.log_verbose:
        print('Finished loading dataset', flush=True)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(
        dataset.src,
        len(dataset.src_dict),
    ))
    print('| [{}] dictionary: {} types'.format(
        dataset.dst,
        len(dataset.dst_dict),
    ))
    for split in splits:
        print('| {} {} examples'.format(
            split,
            len(dataset.splits[split]),
        ))

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print('| model {}, criterion {}'.format(
        args.arch,
        criterion.__class__.__name__,
    ))
    print('| num. model params: {}'.format(
        sum(p.data.numel() for p in model.parameters())))

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print(
        '| max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args.max_tokens,
            args.max_sentences,
        ),
        flush=True,
    )

    extra_state = load_existing_checkpoint(
        args.save_dir,
        args.restore_file,
        trainer,
    )

    return extra_state, trainer, dataset