Example #1
    def __init__(self,
                 tgt_dataset,
                 src_dict,
                 tgt_dict=None,
                 backtranslation_fn=None,
                 output_collater=None,
                 backward_output_collater=None,
                 cuda=True,
                 noising=False,
                 **kwargs):
        self.tgt_dataset = tgt_dataset
        self.backtranslation_fn = backtranslation_fn
        # Default to the target dataset's collater; reuse the forward collater
        # for the backward direction unless one is given explicitly.
        self.output_collater = output_collater if output_collater is not None \
            else tgt_dataset.collater
        self.backward_output_collater = backward_output_collater \
            if backward_output_collater is not None else self.output_collater
        # Only honor cuda=True when a GPU is actually available.
        self.cuda = cuda if torch.cuda.is_available() else False
        self.src_dict = src_dict
        self.tgt_dict = tgt_dict
        if noising:
            # "▁" is the SentencePiece subword marker.
            self.noising = UnsupervisedMTNoising(self.src_dict,
                                                 max_word_shuffle_distance=0,
                                                 word_dropout_prob=0.1,
                                                 word_blanking_prob=0.1,
                                                 bpe_cont_marker="▁")
            # Alternative, more aggressive configuration:
            # self.noising = UnsupervisedMTNoising(self.src_dict,
            #                                      max_word_shuffle_distance=5,
            #                                      word_dropout_prob=0.2,
            #                                      word_blanking_prob=0.2,
            #                                      bpe_cont_marker="▁")
        else:
            self.noising = None
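
For context, here is a minimal sketch of driving such a noiser directly. It assumes fairseq's fairseq.data.noising.UnsupervisedMTNoising, whose noising(x, lengths) method takes a time-major (T x B) LongTensor of token IDs plus per-sentence lengths; the toy vocabulary and sentence below are invented for illustration.

import torch
from fairseq.data import Dictionary
from fairseq.data.noising import UnsupervisedMTNoising

# Toy vocabulary; in the examples on this page it comes from preprocessing.
dictionary = Dictionary()
for word in ["hello", "world", "again"]:
    dictionary.add_symbol(word)

noiser = UnsupervisedMTNoising(
    dictionary=dictionary,
    max_word_shuffle_distance=3,
    word_dropout_prob=0.1,
    word_blanking_prob=0.1,
)

# One sentence ending in EOS, laid out time-major: shape (T=4, B=1).
tokens = torch.LongTensor(
    [[dictionary.index(w)] for w in ["hello", "world", "again"]]
    + [[dictionary.eos()]]
)
lengths = torch.LongTensor([4])

# Returns the shuffled/dropped/blanked tokens and their new lengths.
noisy_tokens, noisy_lengths = noiser.noising(tokens, lengths)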
Example #2
    def test_load_data_noising(self):
        test_args = test_utils.ModelParamsDict()
        test_args.source_lang = "en"
        test_args.target_lang = "fr"
        test_args.log_verbose = False
        src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
        num_paths = 4
        src_bin_path, tgt_bin_path = {}, {}
        # Binarize one source/target text-file pair per corpus index.
        for i in range(num_paths):
            src_text_file, tgt_text_file = test_utils.create_test_text_files()
            src_bin_path[i] = preprocess.binarize_text_file(
                text_file=src_text_file,
                dictionary=src_dict,
                output_path=tempfile.NamedTemporaryFile().name,
                append_eos=True,
                reverse_order=False,
            )
            tgt_bin_path[i] = preprocess.binarize_text_file(
                text_file=tgt_text_file,
                dictionary=tgt_dict,
                output_path=tempfile.NamedTemporaryFile().name,
                append_eos=True,
                reverse_order=False,
            )
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        split = "1"
        # Noise only corpus 0; the other three corpora are loaded unmodified.
        task.load_dataset(
            split,
            src_bin_path,
            tgt_bin_path,
            noiser={
                0: UnsupervisedMTNoising(
                    dictionary=src_dict,
                    max_word_shuffle_distance=3,
                    word_dropout_prob=0.2,
                    word_blanking_prob=0.2,
                )
            },
        )
        self.assertEqual(len(task.datasets[split]), 16)
        self.assertIsInstance(task.datasets[split].datasets[0].src, NoisingDataset)
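
The test attaches a noiser only to corpus 0. A variant that noises every corpus would build one map entry per key, reusing the names from the test above (a sketch, not part of the test):

noiser = {
    i: UnsupervisedMTNoising(
        dictionary=src_dict,
        max_word_shuffle_distance=3,
        word_dropout_prob=0.2,
        word_blanking_prob=0.2,
    )
    for i in range(num_paths)  # one noiser per binarized corpus
}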
Example #3
    def _load_dataset_multi_path(
        self, split: str, src_bin_path: dict, tgt_bin_path: dict, is_npz: bool = True
    ):
        # Both path arguments must be per-corpus mappings with matching keys,
        # not plain strings.
        assert not isinstance(tgt_bin_path, str)
        assert set(src_bin_path.keys()) == set(tgt_bin_path.keys())
        source_lang = self.args.source_lang or "src"
        target_lang = self.args.target_lang or "tgt"
        direction = source_lang + "-" + target_lang
        dataset_upsampling = (
            pytorch_translate_utils.maybe_parse_collection_argument(
                self.args.dataset_upsampling
            )[direction]
            if self.args.dataset_upsampling
            else None
        )
        dataset_relative_ratio = (
            pytorch_translate_utils.maybe_parse_collection_argument(
                self.args.dataset_relative_ratio
            )[direction]
            if self.args.dataset_relative_ratio
            else None
        )
        # Collect per-corpus noising options from the corresponding "*_map"
        # command-line arguments for this language direction.
        noiser = {}
        noise_options = [
            "word_dropout_prob",
            "max_word_shuffle_distance",
            "word_blanking_prob",
        ]
        for option in noise_options:
            option_map = getattr(self.args, option + "_map", None)
            if option_map:
                option_map = pytorch_translate_utils.maybe_parse_collection_argument(
                    option_map
                )[direction]
                for key in option_map:
                    if key not in noiser:
                        noiser[key] = {
                            noise_option: None for noise_option in noise_options
                        }
                    noiser[key][option] = option_map[key]

        # Materialize one UnsupervisedMTNoising per corpus; unset options
        # default to 0, which disables that noise type.
        for key in noiser:
            noiser[key] = UnsupervisedMTNoising(
                dictionary=self.src_dict,
                max_word_shuffle_distance=noiser[key]["max_word_shuffle_distance"] or 0,
                word_dropout_prob=noiser[key]["word_dropout_prob"] or 0,
                word_blanking_prob=noiser[key]["word_blanking_prob"] or 0,
            )

        if dataset_relative_ratio is not None:
            # dataset_upsampling and dataset_relative_ratio are mutually exclusive.
            assert dataset_upsampling is None, (
                "dataset_upsampling and dataset_relative_ratio "
                "cannot be specified together."
            )
            assert dataset_relative_ratio[0] in src_bin_path.keys()
            self._load_dataset_multi_path_helper(
                split=split,
                src_multiple_bin_paths=src_bin_path,
                tgt_multiple_bin_paths=tgt_bin_path,
                dataset_relative_ratio=dataset_relative_ratio,
                seed=self.args.seed,
                noiser=noiser,
                is_npz=is_npz,
            )
        elif dataset_upsampling is not None:
            for key in dataset_upsampling.keys():
                assert key in src_bin_path.keys()
            self._load_dataset_multi_path_helper(
                split=split,
                src_multiple_bin_paths=src_bin_path,
                tgt_multiple_bin_paths=tgt_bin_path,
                dataset_upsampling=dataset_upsampling,
                seed=self.args.seed,
                noiser=noiser,
                is_npz=is_npz,
            )
        else:
            self._load_dataset_multi_path_helper(
                split=split,
                src_multiple_bin_paths=src_bin_path,
                tgt_multiple_bin_paths=tgt_bin_path,
                seed=self.args.seed,
                noiser=noiser,
                is_npz=is_npz,
            )
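
To make the option-map merging above concrete, here is a self-contained rerun of the same loop with plain dicts standing in for the parsed *_map arguments; the corpus keys and probabilities are made up:

noise_options = [
    "word_dropout_prob",
    "max_word_shuffle_distance",
    "word_blanking_prob",
]
# Hypothetical parsed values for one language direction, keyed by corpus.
option_maps = {
    "word_dropout_prob": {0: 0.1, 1: 0.2},
    "word_blanking_prob": {1: 0.2},
}
noiser = {}
for option in noise_options:
    for key, value in option_maps.get(option, {}).items():
        if key not in noiser:
            noiser[key] = {opt: None for opt in noise_options}
        noiser[key][option] = value

# noiser is now:
# {0: {"word_dropout_prob": 0.1, "max_word_shuffle_distance": None,
#      "word_blanking_prob": None},
#  1: {"word_dropout_prob": 0.2, "max_word_shuffle_distance": None,
#      "word_blanking_prob": 0.2}}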
Example #4
def setup_training_model(args):
    """Parse args, load dataset, and build model with criterion."""
    if not torch.cuda.is_available():
        print("Warning: training without CUDA is likely to be slow!")
    else:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task and load dataset
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    print("| building criterion")
    criterion = task.build_criterion(args)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: \
        {sum(p.numel() for p in model.parameters())}")

    if args.task == constants.SEMI_SUPERVISED_TASK:
        # TODO(T35638969): hide this inside the task itself, just use self.args
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            forward_model=task.forward_model,
            backward_model=task.backward_model,
        )
    elif args.task == "pytorch_translate_denoising_autoencoder":
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            seed=args.seed,
            use_noiser=True,
        )
    elif args.task == "dual_learning_task":
        task.load_dataset(split=args.train_subset, seed=args.seed)
    elif args.task == "pytorch_translate_knowledge_distillation":
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            weights_file=getattr(args, "train_weights_path", None),
            is_train=True,
        )
    else:
        # Support both single and multi path loading for now
        dataset_upsampling = getattr(args, "dataset_upsampling", None)
        dataset_relative_ratio = getattr(args, "dataset_relative_ratio", None)
        source_lang = getattr(args, "source_lang", "src")
        target_lang = getattr(args, "target_lang", "tgt")
        direction = source_lang + "-" + target_lang

        if dataset_upsampling:
            dataset_upsampling = pytorch_translate_utils.maybe_parse_collection_argument(
                dataset_upsampling
            )[direction]
        if dataset_relative_ratio:
            dataset_relative_ratio = pytorch_translate_utils.maybe_parse_collection_argument(
                dataset_relative_ratio
            )[direction]

        # Build per-corpus noisers exactly as in _load_dataset_multi_path
        # (see the shared-helper sketch after this example).
        noiser = {}
        noise_options = [
            "word_dropout_prob",
            "max_word_shuffle_distance",
            "word_blanking_prob",
        ]
        for option in noise_options:
            option_map = getattr(args, option + "_map", None)
            if option_map:
                option_map = pytorch_translate_utils.maybe_parse_collection_argument(
                    option_map
                )[direction]
                for key in option_map:
                    if key not in noiser:
                        noiser[key] = {
                            noise_option: None for noise_option in noise_options
                        }
                    noiser[key][option] = option_map[key]

        for key in noiser:
            noiser[key] = UnsupervisedMTNoising(
                dictionary=task.src_dict,
                max_word_shuffle_distance=noiser[key]["max_word_shuffle_distance"] or 0,
                word_dropout_prob=noiser[key]["word_dropout_prob"] or 0,
                word_blanking_prob=noiser[key]["word_blanking_prob"] or 0,
            )
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=pytorch_translate_utils.maybe_parse_collection_argument(
                args.train_source_binary_path
            ),
            tgt_bin_path=pytorch_translate_utils.maybe_parse_collection_argument(
                args.train_target_binary_path
            ),
            weights_file=getattr(args, "train_weights_path", None),
            dataset_upsampling=dataset_upsampling,
            dataset_relative_ratio=dataset_relative_ratio,
            seed=args.seed,
            noiser=noiser,
        )

    if args.task == "dual_learning_task":
        task.load_dataset(split=args.valid_subset, seed=args.seed)
    else:
        task.load_dataset(
            split=args.valid_subset,
            src_bin_path=args.eval_source_binary_path,
            tgt_bin_path=args.eval_target_binary_path,
        )
    return task, model, criterion
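
Examples #3 and #4 build the per-corpus noiser map with identical code. Below is a hedged refactoring sketch both call sites could share; build_noisers is a hypothetical helper, not part of pytorch_translate, and the imports assume the module paths used elsewhere in the project.

from fairseq.data.noising import UnsupervisedMTNoising
from pytorch_translate import utils as pytorch_translate_utils

NOISE_OPTIONS = (
    "word_dropout_prob",
    "max_word_shuffle_distance",
    "word_blanking_prob",
)

def build_noisers(args, src_dict, direction):
    """Turn per-corpus *_map arguments for one direction into noisers."""
    collected = {}
    for option in NOISE_OPTIONS:
        option_map = getattr(args, option + "_map", None)
        if option_map:
            option_map = pytorch_translate_utils.maybe_parse_collection_argument(
                option_map
            )[direction]
            for key, value in option_map.items():
                collected.setdefault(key, dict.fromkeys(NOISE_OPTIONS))[option] = value
    # Unset options fall back to 0, disabling that noise type.
    return {
        key: UnsupervisedMTNoising(
            dictionary=src_dict,
            max_word_shuffle_distance=opts["max_word_shuffle_distance"] or 0,
            word_dropout_prob=opts["word_dropout_prob"] or 0,
            word_blanking_prob=opts["word_blanking_prob"] or 0,
        )
        for key, opts in collected.items()
    }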