Example #1
    def load_dataset(
        self,
        split: str,
        src_bin_path: str,
        tgt_bin_path: str,
        weights_file=None,
        is_npz=True,
    ):
        src_bin_path = pytorch_translate_utils.maybe_parse_collection_argument(
            src_bin_path
        )
        tgt_bin_path = pytorch_translate_utils.maybe_parse_collection_argument(
            tgt_bin_path
        )
        # At most one of dataset_upsampling / dataset_relative_ratio may be
        # specified; this is enforced in _load_dataset_multi_path.
        if type(src_bin_path) is str:
            assert type(tgt_bin_path) is str
            self._load_dataset_single_path(
                split=split,
                src_bin_path=src_bin_path,
                tgt_bin_path=tgt_bin_path,
                weights_file=weights_file,
                is_npz=is_npz,
            )
        else:
            self._load_dataset_multi_path(
                split, src_bin_path, tgt_bin_path, is_npz=is_npz
            )

        if getattr(self.args, "log_verbose", False):
            print("Finished loading dataset", flush=True)

        print(f"| {split} {len(self.datasets[split])} examples")
Example #2
def preprocess_corpora(args, dictionary_cls=Dictionary):
    if (
        args.train_source_binary_path is not None
        and args.train_target_binary_path is not None
    ):
        parsed_source = utils.maybe_parse_collection_argument(
            args.train_source_binary_path
        )
        parsed_target = utils.maybe_parse_collection_argument(
            args.train_target_binary_path
        )
        # Only rewrite the train paths when both are single (string) paths;
        # collection arguments are left untouched.
        if isinstance(parsed_source, str) and isinstance(parsed_target, str):
            args.train_source_binary_path = maybe_generate_temp_file_path(
                args.train_source_binary_path
            )
            args.train_target_binary_path = maybe_generate_temp_file_path(
                args.train_target_binary_path
            )
    args.eval_source_binary_path = maybe_generate_temp_file_path(
        args.eval_source_binary_path
    )
    args.eval_target_binary_path = maybe_generate_temp_file_path(
        args.eval_target_binary_path
    )

    # Additional text preprocessing options could be added here before
    # binarizing.
    if pytorch_translate_data.is_multilingual(args):
        preprocess_corpora_multilingual(args)
    elif pytorch_translate_data.is_multilingual_many_to_one(args):
        preprocess_corpora_multilingual_many_to_one(args, dictionary_cls)
    elif pytorch_translate_data.is_latent_variable(args):
        preprocess_corpora_latent_variable(args)
    else:
        # Vocabs are built before preprocessing because we might need to use
        # both monolingual and bilingual corpora sources to build the vocab
        # (in the case of semisupervised training)
        source_dict, char_source_dict, target_dict = build_vocabs(
            args=args, dictionary_cls=dictionary_cls)

        preprocess_bilingual_corpora(
            args=args,
            source_dict=source_dict,
            char_source_dict=char_source_dict,
            target_dict=target_dict,
        )
        # Binarize additional monolingual corpora for the semisupervised translation
        # task
        if (args.task == constants.SEMI_SUPERVISED_TASK
                or args.task == constants.DENOISING_AUTOENCODER_TASK):
            args.train_mono_source_binary_path = maybe_generate_temp_file_path(
                output_path=getattr(args, "train_mono_source_binary_path",
                                    None))
            args.train_mono_target_binary_path = maybe_generate_temp_file_path(
                output_path=getattr(args, "train_mono_target_binary_path",
                                    None))
            preprocess_monolingual_corpora(
                args,
                source_dict=source_dict,
                char_source_dict=char_source_dict,
                target_dict=target_dict,
            )
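preprocess_corpora leans on maybe_generate_temp_file_path to make sure every corpus ends up with a concrete binary output path. The sketch below captures the contract this usage implies (keep an explicitly configured path, otherwise hand back a fresh temp file path); it is an assumption about the helper's behavior, not its actual source.

import os
import tempfile


def maybe_generate_temp_file_path_sketch(output_path=None):
    # Assumed contract: an explicitly configured path wins; otherwise create
    # a temporary file and return its path for the binarizer to fill in.
    if output_path:
        return output_path
    fd, path = tempfile.mkstemp(suffix=".npz")
    os.close(fd)  # only the path is needed here; the binarizer writes it later
    return path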
Example #3
    def _load_dataset_multi_path(
        self, split: str, src_bin_path: str, tgt_bin_path: str, is_npz: bool = True
    ):
        assert type(tgt_bin_path) is not str
        assert set(src_bin_path.keys()) == set(tgt_bin_path.keys())
        source_lang = self.args.source_lang or "src"
        target_lang = self.args.target_lang or "tgt"
        direction = source_lang + "-" + target_lang
        dataset_upsampling = (
            pytorch_translate_utils.maybe_parse_collection_argument(
                self.args.dataset_upsampling
            )[direction]
            if self.args.dataset_upsampling
            else None
        )
        dataset_relative_ratio = (
            pytorch_translate_utils.maybe_parse_collection_argument(
                self.args.dataset_relative_ratio
            )[direction]
            if self.args.dataset_relative_ratio
            else None
        )
        noiser = {}
        noise_options = [
            "word_dropout_prob",
            "max_word_shuffle_distance",
            "word_blanking_prob",
        ]
        for option in noise_options:
            option_map = getattr(self.args, option + "_map", None)
            if option_map:
                option_map = pytorch_translate_utils.maybe_parse_collection_argument(
                    option_map
                )[direction]
                for key in option_map:
                    if key not in noiser:
                        noiser[key] = {
                            noise_option: None for noise_option in noise_options
                        }
                    noiser[key][option] = option_map[key]

        for key in noiser:
            noiser[key] = UnsupervisedMTNoising(
                dictionary=self.src_dict,
                max_word_shuffle_distance=noiser[key]["max_word_shuffle_distance"] or 0,
                word_dropout_prob=noiser[key]["word_dropout_prob"] or 0,
                word_blanking_prob=noiser[key]["word_blanking_prob"] or 0,
            )

        if dataset_relative_ratio is not None:
            assert dataset_upsampling is None, (
                "dataset_upsampling and dataset_relative_ratio cannot be "
                "specified together."
            )
            assert dataset_relative_ratio[0] in src_bin_path.keys()
            self._load_dataset_multi_path_helper(
                split=split,
                src_multiple_bin_paths=src_bin_path,
                tgt_multiple_bin_paths=tgt_bin_path,
                dataset_relative_ratio=dataset_relative_ratio,
                seed=self.args.seed,
                noiser=noiser,
                is_npz=is_npz,
            )
        elif dataset_upsampling is not None:
            for key in dataset_upsampling.keys():
                assert key in src_bin_path.keys()
            self._load_dataset_multi_path_helper(
                split=split,
                src_multiple_bin_paths=src_bin_path,
                tgt_multiple_bin_paths=tgt_bin_path,
                dataset_upsampling=dataset_upsampling,
                seed=self.args.seed,
                noiser=noiser,
                is_npz=is_npz,
            )
        else:
            self._load_dataset_multi_path_helper(
                split=split,
                src_multiple_bin_paths=src_bin_path,
                tgt_multiple_bin_paths=tgt_bin_path,
                seed=self.args.seed,
                noiser=noiser,
                is_npz=is_npz,
            )
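The per-corpus noiser table built above is easy to exercise in isolation. The sketch below rebuilds it with plain Python, assuming each *_map argument parses into a {corpus_name: value} dict for the current direction; NoiserConfig is only a stand-in for fairseq's UnsupervisedMTNoising, which additionally needs a dictionary.

from dataclasses import dataclass


@dataclass
class NoiserConfig:
    # Stand-in for UnsupervisedMTNoising's noise hyperparameters.
    max_word_shuffle_distance: float = 0
    word_dropout_prob: float = 0
    word_blanking_prob: float = 0


def build_noisers(option_maps):
    noise_options = [
        "word_dropout_prob",
        "max_word_shuffle_distance",
        "word_blanking_prob",
    ]
    noiser = {}
    for option in noise_options:
        # Each map only lists the corpora that override this option.
        for corpus, value in option_maps.get(option, {}).items():
            noiser.setdefault(corpus, {opt: None for opt in noise_options})
            noiser[corpus][option] = value
    # Missing options default to 0, mirroring the "or 0" fallbacks above.
    return {
        corpus: NoiserConfig(**{opt: value or 0 for opt, value in opts.items()})
        for corpus, opts in noiser.items()
    }


print(build_noisers({"word_dropout_prob": {"web": 0.1}, "word_blanking_prob": {"web": 0.2}}))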
Example #4
def setup_training_model(args):
    """Parse args, load dataset, and build model with criterion."""
    if not torch.cuda.is_available():
        print("Warning: training without CUDA is likely to be slow!")
    else:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task and load dataset
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    print("| building criterion")
    criterion = task.build_criterion(args)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: \
        {sum(p.numel() for p in model.parameters())}")

    if args.task == constants.SEMI_SUPERVISED_TASK:
        # TODO(T35638969): hide this inside the task itself, just use self.args
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            forward_model=task.forward_model,
            backward_model=task.backward_model,
        )
    elif args.task == "pytorch_translate_denoising_autoencoder":
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            seed=args.seed,
            use_noiser=True,
        )
    elif args.task == "dual_learning_task":
        task.load_dataset(split=args.train_subset, seed=args.seed)
    elif args.task == "pytorch_translate_knowledge_distillation":
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            weights_file=getattr(args, "train_weights_path", None),
            is_train=True,
        )
    else:
        # Support both single and multi path loading for now
        dataset_upsampling = getattr(args, "dataset_upsampling", None)
        dataset_relative_ratio = getattr(args, "dataset_relative_ratio", None)
        source_lang = getattr(args, "source_lang", "src")
        target_lang = getattr(args, "target_lang", "tgt")
        direction = source_lang + "-" + target_lang

        if dataset_upsampling:
            dataset_upsampling = pytorch_translate_utils.maybe_parse_collection_argument(
                dataset_upsampling)[direction]
        if dataset_relative_ratio:
            dataset_relative_ratio = pytorch_translate_utils.maybe_parse_collection_argument(
                dataset_relative_ratio)[direction]

        noiser = {}
        noise_options = [
            "word_dropout_prob",
            "max_word_shuffle_distance",
            "word_blanking_prob",
        ]
        for option in noise_options:
            option_map = getattr(args, option + "_map", None)
            if option_map:
                option_map = pytorch_translate_utils.maybe_parse_collection_argument(
                    option_map)[direction]
                for key in option_map:
                    if key not in noiser:
                        noiser[key] = {
                            noise_option: None
                            for noise_option in noise_options
                        }
                    noiser[key][option] = option_map[key]

        for key in noiser:
            noiser[key] = UnsupervisedMTNoising(
                dictionary=task.src_dict,
                max_word_shuffle_distance=noiser[key]["max_word_shuffle_distance"]
                or 0,
                word_dropout_prob=noiser[key]["word_dropout_prob"] or 0,
                word_blanking_prob=noiser[key]["word_blanking_prob"] or 0,
            )
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=pytorch_translate_utils.maybe_parse_collection_argument(
                args.train_source_binary_path
            ),
            tgt_bin_path=pytorch_translate_utils.maybe_parse_collection_argument(
                args.train_target_binary_path
            ),
            weights_file=getattr(args, "train_weights_path", None),
            dataset_upsampling=dataset_upsampling,
            dataset_relative_ratio=dataset_relative_ratio,
            seed=args.seed,
            noiser=noiser,
        )

    if args.task == "dual_learning_task":
        task.load_dataset(split=args.valid_subset, seed=args.seed)
    else:
        task.load_dataset(
            split=args.valid_subset,
            src_bin_path=args.eval_source_binary_path,
            tgt_bin_path=args.eval_target_binary_path,
        )
    return task, model, criterion
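For orientation, here is an illustrative argparse.Namespace that would exercise the default (single-path) branch of setup_training_model. The attribute names mirror what the function reads above; the task and arch values and the .npz paths are placeholders, and the real CLI defines many more options.

from argparse import Namespace

args = Namespace(
    task="pytorch_translate",          # placeholder task name
    arch="rnn",                        # placeholder architecture
    device_id=0,
    seed=1,
    train_subset="train",
    valid_subset="valid",
    train_source_binary_path="train.src.npz",
    train_target_binary_path="train.tgt.npz",
    eval_source_binary_path="eval.src.npz",
    eval_target_binary_path="eval.tgt.npz",
    dataset_upsampling=None,
    dataset_relative_ratio=None,
    source_lang="src",
    target_lang="tgt",
)
# task, model, criterion = setup_training_model(args)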