Code example #1
    def _load_dataset_multi_path(
        self,
        split: str,
        src_multiple_bin_paths: Dict[str, str],
        tgt_multiple_bin_paths: Dict[str, str],
        dataset_upsampling: Optional[Dict[str, float]] = None,
        dataset_relative_ratio: Optional[Tuple[str, float]] = None,
        seed: Optional[int] = None,
        noiser: Optional[Dict[str, UnsupervisedMTNoising]] = None,
    ):
        corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
            src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths)
        datasets = OrderedDict()
        for key in corpora_map.src_files:
            src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
            src_dataset, tgt_dataset = (
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    src),
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    tgt),
            )
            src_sizes = src_dataset.sizes
            if noiser is not None and key in noiser:
                src_dataset = NoisingDataset(
                    src_dataset=src_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=noiser[key],
                )
            datasets[key] = LanguagePairDataset(
                src=src_dataset,
                src_sizes=src_sizes,
                src_dict=self.source_dictionary,
                tgt=tgt_dataset,
                tgt_sizes=tgt_dataset.sizes,
                tgt_dict=self.target_dictionary,
                left_pad_source=False,
            )
        total_line_count = sum(len(datasets[key]) for key in datasets)
        if dataset_relative_ratio is not None:
            ds, ratio = dataset_relative_ratio
            line_count = len(datasets[ds])
            # By definition ratio = u * line_count / sum(#lines of other datasets)
            u = (total_line_count - line_count) / line_count * ratio
            # Upsample only the corpus named by dataset_relative_ratio.
            dataset_upsampling = {ds: u}

        dataset_weights = {
            key: 1.0 * len(datasets[key]) / total_line_count
            for key in src_multiple_bin_paths.keys()
        }
        if dataset_upsampling is not None:
            for k, v in dataset_upsampling.items():
                dataset_weights[k] *= v
        print(f"|dataset_weights:{dataset_weights}")
        self.datasets[split] = MultiCorpusSampledDataset(
            datasets=datasets,
            default_key=list(dataset_weights.keys())[0],
            sampling_func=self._normalized_weighted_sampling(dataset_weights),
        )
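
A note on the relative-ratio branch above: u is derived so that, once the size-proportional base weights are scaled, the corpus named in dataset_relative_ratio is sampled against all remaining corpora at the requested ratio. A quick check of the arithmetic with made-up corpus sizes (the numbers are illustrative only, not from any real run):

    # Illustrative only: a 1,000-line corpus, 9,000 lines in the other corpora,
    # and a requested relative ratio of 1.0 (sample it as often as the rest combined).
    total_line_count = 10_000
    line_count = 1_000
    ratio = 1.0
    u = (total_line_count - line_count) / line_count * ratio            # 9.0
    base_weight = line_count / total_line_count                         # 0.1
    other_weight = (total_line_count - line_count) / total_line_count   # 0.9
    upsampled_weight = base_weight * u                                  # 0.9
    # After normalization the named corpus receives 0.9 / (0.9 + 0.9) = 0.5 of the
    # draws, i.e. a 1:1 ratio against the other corpora combined, as requested.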
Code example #2
    def _load_dataset_multi_path_helper(
        self,
        split: str,
        src_multiple_bin_paths: Dict[str, str],
        tgt_multiple_bin_paths: Dict[str, str],
        dataset_upsampling: Optional[Dict[str, float]] = None,
        dataset_relative_ratio: Optional[Tuple[str, float]] = None,
        seed: Optional[int] = None,
        noiser: Optional[Dict[str, UnsupervisedMTNoising]] = None,
    ):
        corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
            src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths)
        datasets = OrderedDict()
        for key in corpora_map.src_files:
            src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
            src_dataset, tgt_dataset = (
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    src),
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    tgt),
            )
            src_sizes = src_dataset.sizes
            if noiser is not None and key in noiser:
                src_dataset = NoisingDataset(
                    src_dataset=src_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=noiser[key],
                )
            datasets[key] = LanguagePairDataset(
                src=src_dataset,
                src_sizes=src_sizes,
                src_dict=self.source_dictionary,
                tgt=tgt_dataset,
                tgt_sizes=tgt_dataset.sizes,
                tgt_dict=self.target_dictionary,
                left_pad_source=False,
            )
        total_line_count = sum(len(datasets[key]) for key in datasets)
        if dataset_relative_ratio:
            ds, ratio = dataset_relative_ratio
            line_count = len(datasets[ds])
            # By definition ratio = u * line_count / sum(#lines of other datasets)
            u = (total_line_count - line_count) / line_count * ratio
            # Upsample only the corpus named by dataset_relative_ratio.
            dataset_upsampling = {ds: u}
        elif not dataset_upsampling:
            dataset_upsampling = {}

        print(f"|dataset upsampling:{dataset_upsampling}")
        # Keep the datasets and their per-corpus sample ratios aligned; corpora
        # without an explicit upsampling factor default to a ratio of 1.0.
        ds_list = []
        sample_ratios = []
        for key, val in datasets.items():
            ds_list.append(val)
            sample_ratios.append(dataset_upsampling.get(key, 1.0))

        self.datasets[split] = ConcatDataset(
            datasets=ds_list, sample_ratios=sample_ratios
        )
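
For context, a hypothetical invocation of the helper above; the corpus keys, file paths, and the 2.0 factor are placeholders, and task stands for an already-constructed translation task instance:

    # Hypothetical call; keys, paths and the upsampling factor are made up.
    task._load_dataset_multi_path_helper(
        split="train",
        src_multiple_bin_paths={"parallel": "parallel.src.npz", "bt": "bt.src.npz"},
        tgt_multiple_bin_paths={"parallel": "parallel.tgt.npz", "bt": "bt.tgt.npz"},
        dataset_upsampling={"bt": 2.0},  # draw the "bt" corpus roughly twice as often
    )
    # With ConcatDataset, a sample ratio above 1.0 effectively repeats that corpus's
    # examples within an epoch rather than reweighting a separate sampler.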
Code example #3
    def _load_dataset_multi_path(
        self,
        split: str,
        src_multiple_bin_paths: Dict[str, str],
        tgt_multiple_bin_paths: Dict[str, str],
        dataset_upsampling: Optional[Dict[str, float]],
    ):
        corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
            src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths)
        datasets = OrderedDict()
        for key in corpora_map.src_files:
            src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
            src_dataset, tgt_dataset = (
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    src),
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    tgt),
            )
            datasets[key] = LanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=tgt_dataset,
                tgt_sizes=tgt_dataset.sizes,
                tgt_dict=self.target_dictionary,
                left_pad_source=False,
            )
        dataset_weights = {
            key: 1.0 / len(src_multiple_bin_paths)
            for key in src_multiple_bin_paths.keys()
        }

        if dataset_upsampling is not None:
            for k, v in dataset_upsampling.items():
                dataset_weights[k] *= v

        self.datasets[split] = MultiCorpusSampledDataset(
            datasets=datasets,
            default_key=list(dataset_weights.keys())[0],
            sampling_func=self._normalized_weighted_sampling(dataset_weights),
        )
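
Examples #1 and #3 both pass self._normalized_weighted_sampling(dataset_weights) to MultiCorpusSampledDataset, but that helper is not shown here. The following is only a sketch of what such a helper could look like, assuming the sampling function receives the list of corpus keys and returns the key of the corpus to draw from; this is an assumption based on how the dataset is constructed above, not the actual implementation:

    # Assumes numpy is imported at module level: import numpy as np
    def _normalized_weighted_sampling(self, weights: Dict[str, float]):
        # Hypothetical sketch: normalize the per-corpus weights and return a
        # closure that picks one corpus key in proportion to its weight.
        def sampling_func(keys):
            probs = np.array([weights[k] for k in keys], dtype=np.float64)
            probs /= probs.sum()
            return np.random.choice(keys, p=probs).item()

        return sampling_func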