def split_exists(split, src, tgt, lang, data_path):
    filename = os.path.join(
        data_path, "{}.{}-{}.{}".format(split, src, tgt, lang))
    da_filename = os.path.join(
        data_path, "{}.da.{}-{}.{}".format(split, src, tgt, lang))
    return indexed_dataset.dataset_exists(filename, impl=dataset_impl) \
        and indexed_dataset.dataset_exists(da_filename, impl=dataset_impl)
def split_exists_self(split, src, data_path):
    filename = os.path.join(
        data_path, "{}.{}-{}.{}".format(split, src, src, src))
    logger.info(filename)
    return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
def split_exists(split, src, tgt, lang):
    if src is not None:
        filename = os.path.join(
            data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    else:
        filename = os.path.join(
            data_path, '{}.{}-None.{}'.format(split, src, tgt))
    return indexed_dataset.dataset_exists(filename, impl=self.args.dataset_impl)
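# Illustrative sketch (not part of the original code): how existence checks like the
# split_exists helpers above are typically used to infer the direction of a binarized
# language pair. Assumes the fairseq mmap/cached on-disk layout, where a dataset named
# <prefix> is stored as <prefix>.idx + <prefix>.bin; _binarized_exists and
# infer_langpair_prefix are hypothetical names introduced for this sketch only.
import os


def _binarized_exists(prefix):
    # Hypothetical stand-in for indexed_dataset.dataset_exists(prefix, impl=...).
    return os.path.exists(prefix + ".idx") and os.path.exists(prefix + ".bin")


def infer_langpair_prefix(data_path, split, src, tgt):
    """Return the file prefix for a split, trying both naming directions."""
    forward = os.path.join(data_path, "{}.{}-{}.".format(split, src, tgt))
    backward = os.path.join(data_path, "{}.{}-{}.".format(split, tgt, src))
    if _binarized_exists(forward + src):
        return forward
    if _binarized_exists(backward + src):
        return backward
    raise FileNotFoundError("Dataset not found: {} ({})".format(split, data_path))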
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, shuffle=True, pad_to_multiple=1, prepend_bos_src=None, bert_model_name=None, bart_model_name=None, electra_model_name=None, electra_pretrain=False, denoising=False, masking=False, extra_data=False, input_mapping=False, mask_ratio=None, random_ratio=None, insert_ratio=None, rotate_ratio=None, permute_sentence_ratio=None, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=False) if denoising: bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name, do_lower_case=False) #bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name, do_lower_case=False) if electra_pretrain: electra_tokenizer = ElectraTokenizer.from_pretrained( electra_model_name) srcbert_datasets = [] extra_datasets = [] extra_bert_datasets = [] extra_bert_mapping_datasets = [] extra_bart_datasets = [] extra_bart_mapping_datasets = [] if denoising: srcbart_datasets = [] if electra_pretrain: srcelectra_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt)) bertprefix = os.path.join( data_path, '{}.bert.{}-{}.'.format(split_k, src, tgt)) bert_mapping_prefix = os.path.join( data_path, '{}.bert.map.{}-{}.'.format(split_k, src, tgt)) if denoising: bartprefix = os.path.join( data_path, '{}.bart.{}-{}.'.format(split_k, src, tgt)) bart_mapping_prefix = os.path.join( data_path, '{}.bart.map.{}-{}.'.format(split_k, src, tgt)) if electra_pretrain: electraprefix = os.path.join( data_path, '{}.electra.{}-{}.'.format(split_k, src, tgt)) electra_mapping_prefix = os.path.join( data_path, '{}.electra.map.{}-{}.'.format(split_k, src, tgt)) if extra_data: extraprefix = os.path.join( data_path, '{}.extra.{}-{}.'.format(split_k, src, tgt)) extra_bert_prefix = os.path.join( data_path, '{}.extra.bert.{}-{}.'.format(split_k, src, tgt)) extra_bert_mapping_prefix = os.path.join( data_path, '{}.extra.bert.map.{}-{}.'.format(split_k, src, tgt)) extra_bart_prefix = os.path.join( data_path, '{}.extra.bart.{}-{}.'.format(split_k, src, tgt)) extra_bart_mapping_prefix = os.path.join( data_path, '{}.extra.bart.map.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src)) bertprefix = os.path.join( data_path, '{}.bert.{}-{}.'.format(split_k, tgt, src)) bert_mapping_prefix = os.path.join( data_path, '{}.bert.map.{}-{}.'.format(split_k, src, tgt)) if denoising: bartprefix = os.path.join( data_path, '{}.bart.{}-{}.'.format(split_k, tgt, src)) bart_mapping_prefix = os.path.join( data_path, '{}.bart.map.{}-{}.'.format(split_k, src, tgt)) if electra_pretrain: electraprefix = os.path.join( data_path, '{}.electra.{}-{}.'.format(split_k, src, tgt)) electra_mapping_prefix = os.path.join( data_path, '{}.electra.map.{}-{}.'.format(split_k, src, tgt)) if extra_data: extraprefix = os.path.join( data_path, '{}.extra.{}-{}.'.format(split_k, src, 
tgt)) extra_bert_prefix = os.path.join( data_path, '{}.extra.bert.{}-{}.'.format(split_k, src, tgt)) extra_bert_mapping_prefix = os.path.join( data_path, '{}.extra.bert.map.{}-{}.'.format(split_k, src, tgt)) extra_bart_prefix = os.path.join( data_path, '{}.extra.bart.{}-{}.'.format(split_k, src, tgt)) extra_bart_mapping_prefix = os.path.join( data_path, '{}.extra.bart.map.{}-{}.'.format(split_k, src, tgt)) else: if k > 0: break else: raise FileNotFoundError("Dataset not found: {} ({})".format( split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) # srcbert_datasets.append(indexed_dataset.make_dataset(bertprefix + src, impl=dataset_impl, # fix_lua_indexing=True, )) # if denoising: # srcbart_datasets.append(indexed_dataset.make_dataset(bartprefix + src, impl=dataset_impl, # fix_lua_indexing=True, )) # if extra_data: # extra_datasets.append(indexed_dataset.make_dataset(extraprefix + src, impl=dataset_impl, # fix_lua_indexing=True, )) srcbert_datasets.append( data_utils.load_indexed_dataset( bertprefix + src, dataset_impl=dataset_impl, )) if denoising: srcbart_datasets.append( data_utils.load_indexed_dataset( bartprefix + src, dataset_impl=dataset_impl, )) if electra_pretrain: srcelectra_datasets.append( data_utils.load_indexed_dataset( electraprefix + src, dataset_impl=dataset_impl, )) if extra_data and split == 'train': extra_datasets.append( data_utils.load_indexed_dataset( extraprefix + src, dataset_impl=dataset_impl, )) extra_bert_datasets.append( data_utils.load_indexed_dataset( extra_bert_prefix + src, dataset_impl=dataset_impl, )) extra_bert_mapping_datasets.append( data_utils.load_indexed_dataset( extra_bert_mapping_prefix + src, dataset_impl=dataset_impl, )) extra_bart_datasets.append( data_utils.load_indexed_dataset( extra_bart_prefix + src, dataset_impl=dataset_impl, )) extra_bart_mapping_datasets.append( data_utils.load_indexed_dataset( extra_bart_mapping_prefix + src, dataset_impl=dataset_impl, )) #import pdb; pdb.set_trace() assert extra_datasets != [] or extra_bert_datasets != [] or extra_bert_mapping_datasets != [] or extra_bart_datasets != [] or extra_bart_mapping_datasets != [] #extra_datasets = extra_datasets[0] #import pdb; pdb.set_trace() src_datasets[-1] = PrependTokenDataset(src_datasets[-1], token=src_dict.bos_index) if extra_data and split == 'train': extra_datasets[-1] = PrependTokenDataset(extra_datasets[-1], token=src_dict.bos_index) if denoising is True: if input_mapping is True and split == 'train': bart_mapping_dataset = data_utils.load_indexed_dataset( bart_mapping_prefix + src, dataset_impl=dataset_impl) else: bart_mapping_dataset = None src_datasets[-1] = DenoisingBartDataset( src_datasets[-1], src_datasets[-1].sizes, src_dict, srcbart_datasets[-1], srcbart_datasets[-1].sizes, bart_tokenizer, map_dataset=bart_mapping_dataset, mask_ratio=mask_ratio, random_ratio=random_ratio, insert_ratio=insert_ratio, rotate_ratio=rotate_ratio, permute_sentence_ratio=permute_sentence_ratio, ) if electra_pretrain is True: if input_mapping is True and split == 'train': electra_mapping_dataset = data_utils.load_indexed_dataset( electra_mapping_prefix + src, dataset_impl=dataset_impl) else: 
electra_mapping_dataset = None src_datasets[-1] = ElectrapretrainDataset( src_datasets[-1], src_datasets[-1].sizes, src_dict, srcelectra_datasets[-1], srcelectra_datasets[-1].sizes, electra_tokenizer, map_dataset=electra_mapping_dataset, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, ) if masking is True: if input_mapping is True and split == 'train': #bert_mapping_dataset = indexed_dataset.make_dataset(bert_mapping_prefix + src, impl=dataset_impl, fix_lua_indexing=True) bert_mapping_dataset = data_utils.load_indexed_dataset( bert_mapping_prefix + src, dataset_impl=dataset_impl) else: bert_mapping_dataset = None src_datasets[-1] = MaskingDataset( src_datasets[-1], src_datasets[-1].sizes, src_dict, srcbert_datasets[-1], srcbert_datasets[-1].sizes, bert_tokenizer, map_dataset=bert_mapping_dataset, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, ) if extra_data is True and split == 'train': assert input_mapping is True src_datasets[-1] = MaskingExtraDataset( src_datasets[-1], src_datasets[-1].sizes, src_dict, extra_datasets[-1], extra_datasets[-1].sizes, extra_bert_datasets[-1], extra_bert_datasets[-1].sizes, bert_tokenizer, map_dataset=extra_bert_mapping_datasets[-1], left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, ) src_datasets[-1] = DenoisingBartExtraDataset( src_datasets[-1], src_datasets[-1].sizes, src_dict, extra_datasets[-1], extra_datasets[-1].sizes, extra_bart_datasets[-1], extra_bart_datasets[-1].sizes, bart_tokenizer, map_dataset=extra_bart_mapping_datasets[-1], ) logger.info("{} {} {}-{} {} examples".format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None # srcbert_datasets = srcbert_datasets[0] # if denoising: # srcbart_datasets = srcbart_datasets[0] else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) elif prepend_bos_src is not None: logger.info(f"prepending src bos: {prepend_bos_src}") src_dataset = PrependTokenDataset(src_dataset, prepend_bos_src) eos = None if append_source_id: src_dataset = AppendTokenDataset(src_dataset, src_dict.index("[{}]".format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index("[{}]".format(tgt))) eos = tgt_dict.index("[{}]".format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join(data_path, "{}.align.{}-{}".format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None src_bart_dataset = None src_bert_dataset = None src_electra_dataset = None 
return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, masking, src_bert_dataset, denoising, src_bart_dataset, src_electra_dataset, #extra_datasets, left_pad_source=left_pad_source, left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, pad_to_multiple=pad_to_multiple, )
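# Illustrative, self-contained sketch (not the original implementation) of the kind of
# token-level noising the DenoisingBartDataset / MaskingDataset wrappers above apply;
# it only shows what the mask_ratio and random_ratio parameters mean. The function name
# noise_tokens and the concrete token ids are hypothetical.
import numpy as np


def noise_tokens(tokens, mask_idx, vocab_size, mask_ratio=0.3, random_ratio=0.1, seed=0):
    """Mask `mask_ratio` of the tokens; of those, replace a `random_ratio` fraction
    with a random vocabulary item instead of the <mask> symbol."""
    rng = np.random.default_rng(seed)
    tokens = np.array(tokens, dtype=np.int64)
    n_mask = int(round(len(tokens) * mask_ratio))
    positions = rng.choice(len(tokens), size=n_mask, replace=False)
    noised = tokens.copy()
    for pos in positions:
        if rng.random() < random_ratio:
            noised[pos] = rng.integers(4, vocab_size)  # skip special symbols 0-3
        else:
            noised[pos] = mask_idx
    return noised


print(noise_tokens([10, 11, 12, 13, 14, 15], mask_idx=3, vocab_size=100))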
def load_dataset(self, split, epoch=0, combine=False, **kwargs): """Load a given dataset split (e.g., train, valid, test).""" if self.cfg.data.endswith("1"): data_shard = (epoch - 1) % self.cfg.num_data_splits + 1 data_path = self.cfg.data[:-1] + str(data_shard) else: data_path = self.cfg.data def get_path(type, data_split): return os.path.join(data_path, str(type), data_split) def make_dataset(type, dictionary, data_split, combine): split_path = get_path(type, data_split) dataset = data_utils.load_indexed_dataset( split_path, dictionary, combine=combine, ) return dataset def load_split(data_split, metric): input_src = None if self.cfg.include_src: input_src = make_dataset("input_src", self.dictionary, data_split, combine=False) assert input_src is not None, "could not find dataset: {}".format( get_path("input_src", data_split)) input_tgt = make_dataset("input_tgt", self.dictionary, data_split, combine=False) assert input_tgt is not None, "could not find dataset: {}".format( get_path("input_tgt", data_split)) label_path = f"{get_path(metric, data_split)}.{metric}" assert os.path.exists( label_path), f"could not find dataset: {label_path}" np_labels = np.loadtxt(label_path) if self.cfg.target_metric == "ter": np_labels = -np_labels label = RawLabelDataset(np_labels) return input_src, input_tgt, label src_datasets = [] tgt_datasets = [] label_datasets = [] if split == self.cfg.train_subset: for k in itertools.count(): split_k = "train" + (str(k) if k > 0 else "") prefix = os.path.join(data_path, "input_tgt", split_k) if not indexed_dataset.dataset_exists(prefix, impl=None): if k > 0: break else: raise FileNotFoundError(f"Dataset not found: {prefix}") input_src, input_tgt, label = load_split( split_k, self.cfg.target_metric) src_datasets.append(input_src) tgt_datasets.append(input_tgt) label_datasets.append(label) else: input_src, input_tgt, label = load_split(split, self.cfg.target_metric) src_datasets.append(input_src) tgt_datasets.append(input_tgt) label_datasets.append(label) if len(tgt_datasets) == 1: input_tgt, label = tgt_datasets[0], label_datasets[0] if self.cfg.include_src: input_src = src_datasets[0] else: input_tgt = ConcatDataset(tgt_datasets) label = ConcatDataset(label_datasets) if self.cfg.include_src: input_src = ConcatDataset(src_datasets) input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions) if self.cfg.include_src: input_src = PrependTokenDataset(input_src, self.dictionary.bos()) input_src = TruncateDataset(input_src, self.cfg.max_positions) src_lengths = NumelDataset(input_src, reduce=False) src_tokens = ConcatSentencesDataset(input_src, input_tgt) else: src_tokens = PrependTokenDataset(input_tgt, self.dictionary.bos()) src_lengths = NumelDataset(src_tokens, reduce=False) dataset = { "id": IdDataset(), "net_input": { "src_tokens": RightPadDataset( src_tokens, pad_idx=self.source_dictionary.pad(), ), "src_lengths": src_lengths, }, "nsentences": NumSamplesDataset(), "ntokens": NumelDataset(src_tokens, reduce=True), "target": label, } dataset = NestedDictionaryDataset( dataset, sizes=[src_tokens.sizes], ) assert len(dataset) % self.cfg.mt_beam == 0, ( "dataset size (%d) is not a multiple of beam size (%d)" % (len(dataset), self.cfg.mt_beam)) # no need to shuffle valid/test sets if not self.cfg.no_shuffle and split == self.cfg.train_subset: # need to keep all hypothese together start_idx = np.arange(0, len(dataset), self.cfg.mt_beam) with data_utils.numpy_seed(self.cfg.seed + epoch): np.random.shuffle(start_idx) idx = np.arange(0, self.cfg.mt_beam) shuffle = 
np.tile(idx, (len(start_idx), 1)).reshape(-1) + np.tile( start_idx, (self.cfg.mt_beam, 1)).transpose().reshape(-1) dataset = SortDataset( dataset, sort_order=[shuffle], ) logger.info(f"Loaded {split} with #samples: {len(dataset)}") self.datasets[split] = dataset return self.datasets[split]
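# Illustrative, self-contained sketch of the beam-grouped shuffle computed above:
# the mt_beam hypotheses of the same source sentence occupy consecutive rows and must
# stay together, so only the group offsets are shuffled and then re-expanded. The sizes
# below are hypothetical.
import numpy as np

mt_beam, n_examples = 4, 12
start_idx = np.arange(0, n_examples, mt_beam)      # one offset per source sentence
np.random.default_rng(0).shuffle(start_idx)        # shuffle groups, not individual rows
within = np.arange(mt_beam)
order = (np.tile(within, (len(start_idx), 1)).reshape(-1)
         + np.tile(start_idx, (mt_beam, 1)).T.reshape(-1))
print(order.reshape(len(start_idx), mt_beam))      # each row is one intact beam group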
def split_exists(split, src, tgt, lang, data_path):
    logger.info(f"Dataset_impl -> {dataset_impl}")
    filename = os.path.join(
        data_path, "{}.{}-{}.{}".format(split, src, tgt, lang))
    logger.info(f"Dataset filename -> {filename}")
    return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
def load_langpair_dataset(data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, load_cls_labels=False, load_cls_indices=False, load_sample_weights=False, truncate_source=False, append_source_id=False, shuffle=True): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info('{} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None src_prepended_bos = False if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) src_prepended_bos = True eos = None if append_source_id: src_dataset = AppendTokenDataset(src_dataset, src_dict.index('[{}]'.format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index('[{}]'.format(tgt))) eos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None cls_dataset = None if load_cls_labels: cls_labels_path = os.path.join(data_path, '{}.cls'.format(split)) if indexed_dataset.dataset_exists(cls_labels_path, impl=dataset_impl): cls_dataset = data_utils.load_indexed_dataset( cls_labels_path, None, dataset_impl) if truncate_source: cls_dataset = AppendTokenDataset( TruncateDataset( TruncateLastElementDataset(cls_dataset), max_source_positions - 1, ), -1, # will ignore -1 label in training ) if src_prepended_bos: cls_dataset = PrependTokenDataset(cls_dataset, -1) else: print("cls_labels dataset NOT FOUND!", cls_labels_path) cls_indices_dataset = None if 
load_cls_indices: cls_indices_path = os.path.join(data_path, '{}.cls_ind'.format(split)) if indexed_dataset.dataset_exists(cls_indices_path, impl=dataset_impl): cls_indices_dataset = data_utils.load_indexed_dataset( cls_indices_path, None, dataset_impl) sample_weights = None if load_sample_weights: weights_file = os.path.join( data_path, '{}.{}-{}.weights.npy'.format(split, src, tgt)) assert os.path.exists(weights_file) with open(weights_file, 'rb') as f: sample_weights = np.load(f) logger.info('Loaded {} weights from {}'.format(len(sample_weights), weights_file)) return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, eos=eos, cls_dataset=cls_dataset, cls_indices_dataset=cls_indices_dataset, sample_weights=sample_weights, shuffle=shuffle, )
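# Illustrative, self-contained sketch of the sample-weight convention consumed by the
# load_sample_weights branch above: one float per sentence pair, stored with np.save
# under "{split}.{src}-{tgt}.weights.npy". The temporary directory and the de-en pair
# are hypothetical and exist only for this demo.
import os
import tempfile
import numpy as np

tmp = tempfile.mkdtemp()
weights_file = os.path.join(tmp, "train.de-en.weights.npy")
np.save(weights_file, np.array([1.0, 0.5, 2.0]))   # one weight per training example

with open(weights_file, "rb") as f:
    sample_weights = np.load(f)
assert len(sample_weights) == 3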
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, shuffle=True, pad_to_multiple=1, # Masked LM parameters. mask_idx: int = 0, seed: int = 1, mask_prob: float = 0.01, leave_unmasked_prob: float = 0.0, random_token_prob: float = 0.0, freq_weighted_replacement: bool = False, mask_whole_words: torch.Tensor = None, mask_multiple_length: int = 1, mask_stdev: float = 0.0, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join( data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join( data_path, "{}.{}-{}.".format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join( data_path, "{}.{}-{}.".format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, data_path) ) src_dataset = data_utils.load_indexed_dataset( prefix + src, src_dict, dataset_impl ) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset( prefix + tgt, tgt_dict, dataset_impl ) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info( "{} {} {}-{} {} examples".format( data_path, split_k, src, tgt, len(src_datasets[-1]) ) ) if not combine: break # logger.info('Length of Source DataSets: {}'.format(len(src_datasets))) assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset( src_dataset, src_dict.index("[{}]".format(src)) ) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index("[{}]".format(tgt)) ) eos = tgt_dict.index("[{}]".format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join( data_path, "{}.align.{}-{}".format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl ) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None # mask source dataset. 
src_dataset, masked_src_dataset = MaskTokensDataset.apply_mask( src_dataset, src_dict, pad_idx=src_dict.pad(), mask_idx=mask_idx, seed=seed, mask_prob=mask_prob, leave_unmasked_prob=leave_unmasked_prob, random_token_prob=random_token_prob, freq_weighted_replacement=freq_weighted_replacement, mask_whole_words=mask_whole_words, mask_multiple_length=mask_multiple_length, mask_stdev=mask_stdev, ) # Print samples. # if split == 'valid': # print(src_dataset[1]) # print(masked_src_dataset[1]) return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, # for Mask LM loss calculation. masked_src_dataset, masked_src_dataset.sizes, left_pad_source=left_pad_source, left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, pad_to_multiple=pad_to_multiple, )
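# Illustrative, self-contained sketch (assumed behaviour, not the fairseq internals) of
# the (masked input, MLM target) pair that MaskTokensDataset.apply_mask returns above:
# masked positions keep the original token in the target, all other target positions are
# <pad>. The special-symbol ids and the toy sentence are hypothetical.
import torch

pad_idx, mask_idx = 1, 4
src = torch.tensor([10, 11, 12, 13, 14])
mask = torch.tensor([0, 1, 0, 0, 1], dtype=torch.bool)     # two of five tokens masked

masked_src = src.masked_fill(mask, mask_idx)               # fed to the encoder
mlm_target = torch.full_like(src, pad_idx).masked_scatter_(mask, src[mask])
print(masked_src.tolist(), mlm_target.tolist())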
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, shuffle=True, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info('{} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None # these features are not yet implemented for the cluster code if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset( src_dataset, src_dict.index('[{}]'.format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index('[{}]'.format(tgt))) eos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, ) else: # sample_ratios = [1] * len(src_datasets) # sample_ratios[0] = upsample_primary # src_dataset = ConcatDataset(src_datasets, sample_ratios) # if len(tgt_datasets) > 0: # tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) # else: # tgt_dataset = None datasets = [] eos = None align_dataset = None for i in range(0, len(src_datasets)): src_dataset = src_datasets[i] tgt_dataset = tgt_datasets[i] tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None datasets.append( LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, 
left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, )) return datasets
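# Illustrative, self-contained sketch of what the commented-out
# ConcatDataset(src_datasets, sample_ratios) branch above would do: upsample_primary
# repeats the first (primary) shard relative to the others. concat_with_ratios is a
# minimal stand-in written for this sketch, not fairseq's ConcatDataset.
def concat_with_ratios(datasets, sample_ratios):
    # Each dataset contributes its items `ratio` times, in order.
    out = []
    for ds, ratio in zip(datasets, sample_ratios):
        out.extend(list(ds) * ratio)
    return out


primary, extra = ["a", "b"], ["c"]
print(concat_with_ratios([primary, extra], sample_ratios=[2, 1]))   # ['a', 'b', 'a', 'b', 'c']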
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, srcda=False, srcda_choice='uniform', tgtda=False, tgtda_choice='uniform' ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_datasets.append( data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) ) print('| {} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) if len(src_datasets) == 1: src_dataset, tgt_dataset = src_datasets[0], tgt_datasets[0] else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl) return LanguagePairDatasetDA( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset.sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, srcda=srcda, srcda_choice=srcda_choice, tgtda=tgtda, tgtda_choice=tgtda_choice )
def mono_split_exists(cls, split, lang, data_path, dataset_impl):
    filename = os.path.join(data_path, '{}.{}'.format(split, lang))
    return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, feature_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_features=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info('{} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset(src_dataset, src_dict.index('[{}]'.format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index('[{}]'.format(tgt))) eos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) # print("feature_dict", feature_dict.symbols, feature_dict.count) #feature_dict ['<s>', '<pad>', '</s>', '<unk>', '<ori>', '<rep>', 'madeupword0000', 'madeupword0001'] [1, 1, 1, 1, 18558611, 5354704, 0, 0] feature_dataset = None if load_features: feature_path = os.path.join( data_path, '{}.feature.{}-{}.{}'.format(split, src, tgt, src)) if indexed_dataset.dataset_exists(feature_path, impl=dataset_impl): feature_dataset = data_utils.load_indexed_dataset( feature_path, feature_dict, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, 
feature_dataset=feature_dataset, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, )
def split_cand_exists(split, src, tgt, lang, beamsize, data_path):
    filename = os.path.join(
        data_path, '{}.cand{}.{}-{}.{}'.format(split, beamsize, src, tgt, lang))
    return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, load_dependency=False, gold_dependency=False, dependency_with_input=False, truncate_source=False, remove_eos_from_source=True, append_source_id=False, num_buckets=0, shuffle=True, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info('{} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset(src_dataset, src_dict.index('[{}]'.format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index('[{}]'.format(tgt))) eos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) src_dep, tgt_dep = None, None if load_dependency: src_dep_path = os.path.join(data_path, '{}.dep.{}'.format(split, src)) tgt_dep_path = os.path.join(data_path, '{}.dep.{}'.format(split, tgt)) if os.path.exists(src_dep_path): src_deps = [] with open(src_dep_path, 'r') as src_dep_data: for h in src_dep_data: src_deps.append( torch.LongTensor( [[i, int(x) - 1] for i, x in enumerate(h.strip().split())])) src_dep = RawLabelDataset(src_deps) if os.path.exists(tgt_dep_path): tgt_deps = [] with open(tgt_dep_path, 'r') as tgt_dep_data: for h in tgt_dep_data: tgt_deps.append( torch.LongTensor( [[i, int(x) - 1] for i, x in enumerate(h.strip().split())])) tgt_dep = 
RawLabelDataset(tgt_deps) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None return LanguagePairDatasetWithDependency( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, remove_eos_from_source=remove_eos_from_source, align_dataset=align_dataset, eos=eos, src_dep=src_dep, tgt_dep=tgt_dep, dependency_with_input=dependency_with_input, gold_dependency=gold_dependency, num_buckets=num_buckets, shuffle=shuffle, )
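# Illustrative, self-contained sketch of the dependency file format parsed above:
# one sentence per line in "{split}.dep.{lang}", one 1-based head index per token,
# converted to 0-based (child, head) pairs. The example line is hypothetical.
import torch

line = "2 0 2"                      # heads for a 3-token sentence; 0 marks the root
pairs = torch.LongTensor(
    [[i, int(h) - 1] for i, h in enumerate(line.strip().split())]
)
print(pairs.tolist())               # [[0, 1], [1, -1], [2, 1]]  (-1 = root after the shift)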
def split_exists(split, src, tgt, lang, data_path):
    filename = os.path.join(
        data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    print('split_filename', filename)
    return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, shuffle=True, pad_to_multiple=1, plus_encoder_loss=False, add_langs=None, shuffle_lang_pair=None, args=None, word_trans_dict=None, word_align_dict=None, policy_ratio_dicts=None, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) def split_exists_valid(split, lang, data_path): filename = os.path.join(data_path, "{}.{}".format(split, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") if not "-" in split_k: # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({}) {} {}".format(split, data_path, src, tgt) ) else: # for multi-valid if split_exists_valid( split_k, src, data_path): prefix = os.path.join(data_path, split_k+".") else: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({}) ".format(split, data_path) ) src_dataset = data_utils.load_indexed_dataset( prefix + src, src_dict, dataset_impl ) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) # for monolingual instances. if src == tgt: tgt_dataset = copy.deepcopy(src_dataset) else: tgt_dataset = data_utils.load_indexed_dataset( prefix + tgt, tgt_dict, dataset_impl ) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None # add src and tag lang id on the biganing of sens. 
if add_langs: src_dataset = PrependTokenDataset( src_dataset, src_dict.index("[{}]".format(src)) ) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset( tgt_dataset, tgt_dict.index("[{}]".format(tgt)) ) align_dataset = None if load_alignments: align_path = os.path.join(data_path, "{}.align.{}-{}".format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl ) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None return DDenoisingPairDatasetDynaReplace( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, pad_to_multiple=pad_to_multiple, plus_encoder_loss=plus_encoder_loss, add_langs=add_langs, shuffle_lang_pair=shuffle_lang_pair, args=args , word_trans_dict=word_trans_dict , word_align_dict=word_align_dict, policy_ratio_dicts= policy_ratio_dicts, )
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, user_context_path, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info('{} {} {}-{} {} examples'.format( data_path, split_k, src, tgt, len(src_datasets[-1]) )) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset(src_dataset, src_dict.index('[{}]'.format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset(tgt_dataset, tgt_dict.index('[{}]'.format(tgt))) eos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None logger.info('Loading user big issues from {}'.format(split)) user_context = pickle.load(open(os.path.join(user_context_path, '{}_with_claim.users_big_issues.pkl'.format(split)), 'rb')) return ExtendedLanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, user_context, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, eos=eos )
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, shuffle=True, pad_to_multiple=1, add_lang_token=False, ): def split_exists(split, src, tgt, lang, data_path): logger.info( os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang))) filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) def split_exists_self(split, src, data_path): logger.info( os.path.join(data_path, "{}.{}-{}.{}".format(split, src, src, src))) filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, src, src)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) def split_exists_valid(split, lang, data_path): logger.info(os.path.join(data_path, "{}.{}".format(split, lang))) filename = os.path.join(data_path, "{}.{}".format(split, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") # print(split_k, src, tgt, src, data_path) prefix_src = None prefix_tgt = None if not "-" in split_k: # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({}) {} {}".format( split, data_path, src, tgt)) else: # infer langcode if split_exists_valid(split_k, src, data_path): prefix = os.path.join(data_path, split_k + ".") else: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({}) ".format(split, data_path)) if prefix_src != None: prefix = prefix_src src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) if prefix_tgt != None: prefix = prefix_tgt tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info("{} {} {}-{} {} examples".format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary logger.info("::::data sample_ratios:{}".format(sample_ratios)) src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset(src_dataset, src_dict.index("[{}]".format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( 
tgt_dataset, tgt_dict.index("[{}]".format(tgt))) eos = tgt_dict.index("[{}]".format(tgt)) eos = None if add_lang_token: src_dataset = PrependTokenDataset(src_dataset, src_dict.index("[{}]".format(src))) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset( tgt_dataset, tgt_dict.index("[{}]".format(tgt))) align_dataset = None if load_alignments: align_path = os.path.join(data_path, "{}.align.{}-{}".format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, pad_to_multiple=pad_to_multiple, )
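# Illustrative, self-contained sketch of the language-token handling above:
# append_source_id appends "[src]"/"[tgt]" markers at sentence end, while add_lang_token
# prepends them (mBART-style). The dictionary ids and the toy sentence are hypothetical.
lang_symbols = {"[en]": 250001, "[de]": 250002}
sentence = [17, 23, 42, 2]                        # ends with </s>

appended = sentence + [lang_symbols["[de]"]]      # append_source_id behaviour
prepended = [lang_symbols["[en]"]] + sentence     # add_lang_token behaviour
print(appended, prepended)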
def split_exists(split, src, tgt, feat, data_path):
    filename = os.path.join(
        data_path, '{}.{}-{}.{}'.format(split, src, tgt, feat))
    return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
def split_exists_valid(split, lang, data_path):
    filename = os.path.join(data_path, "{}.{}".format(split, lang))
    logger.info(filename)
    return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
def load_pos_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False ): # Check the existence of the file def split_exists(split, src, tgt, lang, data_path): filename = os.path.join( data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode (from a->b or from b->a) if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join( data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join( data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format(split, data_path)) src_dataset = data_utils.load_indexed_dataset( prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset( prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info('{} {} {}-{} {} examples'.format( data_path, split_k, src, tgt, len(src_datasets[-1]) )) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset( src_dataset, src_dict.index('[{}]'.format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index('[{}]'.format(tgt))) eos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join( data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None # Load POS Graph def graph_exist(data_path, split, src, tgt, lang): existence = True row_path = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, src)) + '.row' col_path = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, src)) + '.col' anchor_path = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, src)) + '.anchor' if(not os.path.exists(row_path)): existence = False elif(not os.path.exists(col_path)): existence = False elif(not os.path.exists(anchor_path)): existence = False return existence pos_graphs_l = [] pos_anchors_l = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') existence = 
graph_exist(data_path, split_k, src, tgt, src) if(not existence): if(k == 0): raise FileNotFoundError('POS Graph Dataset not found') if(k > 0): break pos_rows = codecs.open(os.path.join( data_path, '{}.{}-{}.{}'.format(split_k, src, tgt, src)) + '.row', 'r', 'utf-8').readlines() pos_cols = codecs.open(os.path.join( data_path, '{}.{}-{}.{}'.format(split_k, src, tgt, src)) + '.col', 'r', 'utf-8').readlines() pos_graphs = [] print('Loading graphs' + '.' * 50) assert len(pos_cols) == len(pos_rows) pbar = tqdm(total=len(pos_cols)) for n, (row, col) in enumerate(zip(pos_rows, pos_cols)): pos_row = [eval(i) for i in row.strip().split()] pos_col = [eval(i) for i in col.strip().split()] pos_graphs.append((pos_row, pos_col)) pbar.update() pbar.close() pos_anchors = codecs.open(os.path.join( data_path, '{}.{}-{}.{}'.format(split_k, src, tgt, src)) + '.anchor', 'r', 'utf-8').readlines() anchors = [] for line in pos_anchors: anchors.append([eval(i) for i in line.strip().split()]) pos_graphs_l.extend(pos_graphs) pos_anchors_l.extend(anchors) assert (len(pos_anchors_l) == len(pos_graphs_l)) and (len(src_dataset.sizes) == len(pos_anchors_l)) return POSGraphLanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, pos_anchors_l, pos_graphs_l, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, eos=eos )
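# Illustrative, self-contained sketch of the POS-graph file format read above: each
# sentence contributes one *.row and one *.col line listing the non-zero coordinates of
# its adjacency matrix, plus one *.anchor line of token indices. The sample lines are
# hypothetical; plain int() parsing is used here instead of eval().
row_line, col_line = "0 1 1 2", "1 0 2 1"
pos_row = [int(x) for x in row_line.strip().split()]
pos_col = [int(x) for x in col_line.strip().split()]
edges = list(zip(pos_row, pos_col))
print(edges)                        # [(0, 1), (1, 0), (1, 2), (2, 1)]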
def load_lang_dataset( self, data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, max_source_positions, prepend_bos=False, load_alignments=False, truncate_source=False, ): src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") # infer langcode if self.split_exists(split_k, src, tgt, src, data_path, dataset_impl): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt)) elif self.split_exists(split_k, tgt, src, src, data_path, dataset_impl): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src)) else: if k > 0: break else: logger.error( f"Dataset not found: {data_path}, {split_k}, {src}, {tgt}" ) raise FileNotFoundError( "Dataset not found: {} ({})".format(split, data_path) ) src_dataset = self.load_data(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_datasets.append(self.load_data(prefix + tgt, tgt_dict, dataset_impl)) logger.info( "{} {} {}-{} {} examples".format( data_path, split_k, src, tgt, len(src_datasets[-1]) ) ) if not combine: break assert len(src_datasets) == len(tgt_datasets) if len(src_datasets) == 1: src_dataset, tgt_dataset = src_datasets[0], tgt_datasets[0] else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) align_dataset = None if load_alignments: align_path = os.path.join( data_path, "{}.align.{}-{}".format(split, src, tgt) ) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl ) return src_dataset, tgt_dataset, align_dataset
def load_langpair_with_additional_data_dataset(
    data_path, split, src, src_dict, tgt, tgt_dict,
    combine, dataset_impl, upsample_primary,
    left_pad_source, left_pad_target,
    max_source_positions, max_target_positions,
    prepend_bos=False, load_alignments=False, truncate_source=False,
    add_dir=None, add_lang=None, add_dict=None, userdirname=None,
):
    def split_exists(split, src, tgt, lang, data_path):
        filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
        return indexed_dataset.dataset_exists(filename, impl=dataset_impl)

    src_datasets = []
    tgt_datasets = []

    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')

        # infer langcode
        if split_exists(split_k, src, tgt, src, data_path):
            prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt))
        elif split_exists(split_k, tgt, src, src, data_path):
            prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src))
        else:
            if k > 0:
                break
            else:
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(split, data_path))

        src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl)
        if truncate_source:
            src_dataset = AppendTokenDataset(
                TruncateDataset(
                    StripTokenDataset(src_dataset, src_dict.eos()),
                    max_source_positions - 1,
                ),
                src_dict.eos(),
            )
        src_datasets.append(src_dataset)
        tgt_datasets.append(
            data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl))

        print('| {} {} {}-{} {} examples'.format(
            data_path, split_k, src, tgt, len(src_datasets[-1])))

        if not combine:
            break

    assert len(src_datasets) == len(tgt_datasets)

    if len(src_datasets) == 1:
        src_dataset, tgt_dataset = src_datasets[0], tgt_datasets[0]
    else:
        sample_ratios = [1] * len(src_datasets)
        sample_ratios[0] = upsample_primary
        src_dataset = ConcatDataset(src_datasets, sample_ratios)
        tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)

    if prepend_bos:
        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
        tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())

    align_dataset = None
    if load_alignments:
        align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt))
        if indexed_dataset.dataset_exists(align_path, impl=dataset_impl):
            align_dataset = data_utils.load_indexed_dataset(
                align_path, None, dataset_impl)

    # Load the additional data from a sub-directory, using the "<lang>-None"
    # naming convention produced by the preprocessing step.
    add_datasets = []
    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')
        additional_data_path = f'{data_path}/{add_dir}'

        # infer langcode
        if split_exists(split_k, add_lang, 'None', add_lang, additional_data_path):
            prefix = os.path.join(
                additional_data_path, '{}.{}-{}.'.format(split_k, add_lang, 'None'))
        else:
            if k > 0:
                break
            else:
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(split, additional_data_path))

        add_dataset = data_utils.load_indexed_dataset(prefix + add_lang, add_dict, dataset_impl)
        if truncate_source:
            add_dataset = AppendTokenDataset(
                TruncateDataset(
                    StripTokenDataset(add_dataset, add_dict.eos()),
                    max_source_positions - 1,
                ),
                add_dict.eos(),
            )
        add_datasets.append(add_dataset)

        print('| {} {} {}-{} {} examples'.format(
            data_path, split_k, add_lang, 'None', len(add_datasets[-1])))

        if not combine:
            break

    if len(add_datasets) == 1:
        add_dataset = add_datasets[0]
    else:
        # Combining multiple additional datasets is not supported.
        raise Exception
        # sample_ratios = [1] * len(src_datasets)
        # sample_ratios[0] = upsample_primary
        # src_dataset = ConcatDataset(src_datasets, sample_ratios)
        # tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)

    if add_dataset:
        import sys
        module_parent, module_name = os.path.split(os.path.abspath(userdirname))
        add_user_module(userdirname)
        return sys.modules[module_name].data.LanguagePairWithAdditionalDataDataset(
            src_dataset, src_dataset.sizes, src_dict,
            tgt_dataset, tgt_dataset.sizes, tgt_dict,
            add_dataset, add_dataset.sizes, add_dict,
            left_pad_source=left_pad_source,
            left_pad_target=left_pad_target,
            max_source_positions=max_source_positions,
            max_target_positions=max_target_positions,
            align_dataset=align_dataset,
        )
    else:
        return LanguagePairDataset(
            src_dataset, src_dataset.sizes, src_dict,
            tgt_dataset, tgt_dataset.sizes, tgt_dict,
            left_pad_source=left_pad_source,
            left_pad_target=left_pad_target,
            max_source_positions=max_source_positions,
            max_target_positions=max_target_positions,
            align_dataset=align_dataset,
        )
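# Hedged usage sketch, not part of the original module: the data-bin layout,
# dictionary filenames, "mmap" dataset_impl, and the --user-dir path are all
# placeholder assumptions for illustration only.
def _example_load_with_additional_data():
    from fairseq.data import Dictionary

    src_dict = Dictionary.load("data-bin/dict.de.txt")
    tgt_dict = Dictionary.load("data-bin/dict.en.txt")
    add_dict = Dictionary.load("data-bin/extra/dict.en.txt")
    return load_langpair_with_additional_data_dataset(
        "data-bin", "train", "de", src_dict, "en", tgt_dict,
        combine=True, dataset_impl="mmap", upsample_primary=1,
        left_pad_source=True, left_pad_target=False,
        max_source_positions=1024, max_target_positions=1024,
        add_dir="extra", add_lang="en", add_dict=add_dict,
        userdirname="examples/my_user_dir",  # hypothetical user dir providing the custom dataset class
    )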
def load_generation_pair_dataset(
    data_path, split, tgt, src_dict, tgt_dict,
    combine, dataset_impl, upsample_primary,
    left_pad_source, left_pad_target,
    max_source_positions, max_target_positions,
    prepend_bos=False, load_alignments=False, truncate_source=False,
    append_source_id=False, common_eos=None,
):
    def split_exists(split, src, tgt, lang, data_path):
        filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
        return indexed_dataset.dataset_exists(filename, impl=dataset_impl)

    src_datasets = []
    tgt_datasets = []

    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')

        # infer langcode; the generation data uses the literal "src"/"tgt" language codes
        if split_exists(split_k, "src", "tgt", "src", data_path):
            prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, "src", "tgt"))
        elif split_exists(split_k, "tgt", "src", "src", data_path):
            prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, "tgt", "src"))
        else:
            if k > 0:
                break
            else:
                raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path))

        src_dataset = data_utils.load_indexed_dataset(prefix + "src", src_dict, dataset_impl)
        if truncate_source:
            src_dataset = AppendTokenDataset(
                TruncateDataset(
                    StripTokenDataset(src_dataset, src_dict.eos()),
                    max_source_positions - 1,
                ),
                src_dict.eos(),
            )
        src_datasets.append(src_dataset)

        tgt_dataset = data_utils.load_indexed_dataset(prefix + "tgt", tgt_dict, dataset_impl)
        if tgt_dataset is not None:
            tgt_datasets.append(tgt_dataset)

        logger.info('{} {} {}-{} {} examples'.format(
            data_path, split_k, "src", "tgt", len(src_datasets[-1])))

        if not combine:
            break

    assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0

    if len(src_datasets) == 1:
        src_dataset = src_datasets[0]
        tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None
    else:
        sample_ratios = [1] * len(src_datasets)
        sample_ratios[0] = upsample_primary
        src_dataset = ConcatDataset(src_datasets, sample_ratios)
        if len(tgt_datasets) > 0:
            tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
        else:
            tgt_dataset = None

    if prepend_bos:
        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
        if tgt_dataset is not None:
            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())

    eos = None
    bos = None
    if append_source_id:
        if common_eos is not None:
            src_dataset = AppendTokenDataset(
                src_dataset, src_dict.index('[{}]'.format(common_eos)))
            if tgt_dataset is not None:
                tgt_dataset = AppendTokenDataset(
                    tgt_dataset, tgt_dict.index('[{}]'.format(common_eos)))
            eos = tgt_dict.index('[{}]'.format(common_eos))
        bos = tgt_dict.index('[{}]'.format(tgt))

    align_dataset = None
    if load_alignments:
        align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, "src", "tgt"))
        if indexed_dataset.dataset_exists(align_path, impl=dataset_impl):
            align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl)

    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None
    return GenerationPairDataset(
        src_dataset, src_dataset.sizes, src_dict,
        tgt_dataset, tgt_dataset_sizes, tgt_dict,
        left_pad_source=left_pad_source,
        left_pad_target=left_pad_target,
        max_source_positions=max_source_positions,
        max_target_positions=max_target_positions,
        align_dataset=align_dataset,
        eos=eos,
        bos=bos,
    )
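# Hedged usage sketch, not from the original code: the data-bin path,
# dictionary filenames, and "mmap" dataset_impl are placeholder assumptions.
# append_source_id / common_eos are left at their defaults, so no special
# "[lang]" or common-eos symbols need to exist in the dictionaries.
def _example_load_generation_pair():
    from fairseq.data import Dictionary

    src_dict = Dictionary.load("data-bin/dict.src.txt")
    tgt_dict = Dictionary.load("data-bin/dict.tgt.txt")
    return load_generation_pair_dataset(
        "data-bin", "valid", "tgt", src_dict, tgt_dict,
        combine=False, dataset_impl="mmap", upsample_primary=1,
        left_pad_source=True, left_pad_target=False,
        max_source_positions=512, max_target_positions=512,
    )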
def split_exists(split, src, tgt, lang, data_path):
    filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    return indexed_dataset.dataset_exists(filename, impl=self.args.dataset_impl)