def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Reads the indexed ``input0`` dataset (required) and ``input1`` dataset
    (optional) from ``<cfg.data>/<key>/<split>``, joins them into a single
    token stream, attaches a target when one is available, and stores the
    result in ``self.datasets[split]``.

    Args:
        split (str): name of the split to load.
        combine (bool): forwarded to ``data_utils.load_indexed_dataset``.
        **kwargs: accepted and ignored.

    Returns:
        The dataset stored under ``self.datasets[split]``.
    """
    missing_marker = "StorageException: [404] Path not found"

    def get_path(key, split):
        return os.path.join(self.cfg.data, key, split)

    def make_dataset(key, dictionary):
        split_path = get_path(key, split)
        try:
            return data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                combine=combine,
            )
        except Exception as e:
            # Only the "path not found" storage error is tolerated; anything
            # else is propagated unchanged.
            if missing_marker not in str(e):
                raise e
            logger.warning(f"dataset {e} not found")
            return None

    first_input = make_dataset("input0", self.source_dictionary)
    assert first_input is not None, "could not find dataset: {}".format(
        get_path("input0", split))
    second_input = make_dataset("input1", self.source_dictionary)

    if self.cfg.init_token is not None:
        first_input = PrependTokenDataset(first_input, self.cfg.init_token)

    if second_input is not None:
        if self.cfg.separator_token is not None:
            second_input = PrependTokenDataset(
                second_input, self.cfg.separator_token)
        src_tokens = ConcatSentencesDataset(first_input, second_input)
    else:
        src_tokens = first_input

    # The permutation is drawn under a fixed seed, before shortening.
    with data_utils.numpy_seed(self.cfg.seed):
        shuffle = np.random.permutation(len(src_tokens))

    src_tokens = maybe_shorten_dataset(
        src_tokens,
        split,
        self.cfg.shorten_data_split_list,
        self.cfg.shorten_method,
        self.max_positions(),
        self.cfg.seed,
    )

    net_input = {
        "src_tokens": RightPadDataset(
            src_tokens,
            pad_idx=self.source_dictionary.pad(),
        ),
        "src_lengths": NumelDataset(src_tokens, reduce=False),
    }
    dataset = {
        "id": IdDataset(),
        "net_input": net_input,
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
    }

    if self.cfg.add_prev_output_tokens:
        net_input["prev_output_tokens"] = RightPadDataset(
            RollDataset(src_tokens, 1),
            pad_idx=self.dictionary.pad(),
        )

    if self.cfg.regression_target:
        # Regression targets come from a plain-text ``<split>.label`` file.
        label_path = "{0}.label".format(get_path("label", split))
        if os.path.exists(label_path):

            def parse_regression_target(i, line):
                values = line.split()
                assert (
                    len(values) == self.cfg.num_classes
                ), f'expected num_classes={self.cfg.num_classes} regression target values on line {i}, found: "{line}"'
                return [float(x) for x in values]

            with open(label_path) as h:
                targets = [
                    parse_regression_target(i, line.strip())
                    for i, line in enumerate(h.readlines())
                ]
            dataset.update(target=RawLabelDataset(targets))
    else:
        # Classification targets come from an indexed ``label`` dataset.
        label_dataset = make_dataset("label", self.label_dictionary)
        if label_dataset is not None:
            stripped = StripTokenDataset(
                label_dataset,
                id_to_strip=self.label_dictionary.eos(),
            )
            dataset.update(target=OffsetTokensDataset(
                stripped,
                offset=-self.label_dictionary.nspecial,
            ))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.cfg.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(nested_dataset, sort_order=[shuffle])

    logger.info("Loaded {0} with #samples: {1}".format(
        split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, epoch=1, combine=False, **kwargs):
    """Load a given dataset split for masked-token training.

    The data directory is chosen from ``self.args.data`` by ``epoch``; the
    indexed dataset is cut into token blocks, prefixed with the dictionary's
    bos token, masked, and stored in ``self.datasets[split]``.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        epoch (int): selects which entry of the split data paths is used.
        combine (bool): forwarded to ``data_utils.load_indexed_dataset``.

    Raises:
        FileNotFoundError: if no indexed dataset exists for the split.
    """
    args = self.args
    vocab = self.source_dictionary

    candidate_dirs = utils.split_paths(args.data)
    assert len(candidate_dirs) > 0
    data_path = candidate_dirs[(epoch - 1) % len(candidate_dirs)]
    split_path = os.path.join(data_path, split)

    raw = data_utils.load_indexed_dataset(
        split_path,
        vocab,
        args.dataset_impl,
        combine=combine,
    )
    if raw is None:
        raise FileNotFoundError("Dataset not found: {} ({})".format(
            split, split_path))

    shortened = maybe_shorten_dataset(
        raw,
        split,
        args.shorten_data_split_list,
        args.shorten_method,
        args.tokens_per_sample,
        args.seed,
    )

    # create continuous blocks of tokens
    blocks = TokenBlockDataset(
        shortened,
        shortened.sizes,
        args.tokens_per_sample - 1,  # one less for <s>
        pad=vocab.pad(),
        eos=vocab.eos(),
        break_mode=args.sample_break_mode,
    )
    logger.info("loaded {} blocks from: {}".format(len(blocks), split_path))

    # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
    blocks = PrependTokenDataset(blocks, vocab.bos())

    # create masked input and targets
    if args.mask_whole_words:
        mask_whole_words = get_whole_word_mask(args, vocab)
    else:
        mask_whole_words = None

    src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
        blocks,
        vocab,
        pad_idx=vocab.pad(),
        mask_idx=self.mask_idx,
        seed=args.seed,
        mask_prob=args.mask_prob,
        leave_unmasked_prob=args.leave_unmasked_prob,
        random_token_prob=args.random_token_prob,
        freq_weighted_replacement=args.freq_weighted_replacement,
        mask_whole_words=mask_whole_words,
        mask_multiple_length=args.mask_multiple_length,
        mask_stdev=args.mask_stdev,
        fix_mask_len=args.fix_mask_len)

    with data_utils.numpy_seed(args.seed):
        shuffle = np.random.permutation(len(src_dataset))

    sample_definition = {
        "id": IdDataset(),
        "net_input": {
            "src_tokens": RightPadDataset(
                src_dataset,
                pad_idx=vocab.pad(),
            ),
            "src_lengths": NumelDataset(src_dataset, reduce=False),
        },
        "target": RightPadDataset(
            tgt_dataset,
            pad_idx=vocab.pad(),
        ),
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_dataset, reduce=True),
    }
    nested = NestedDictionaryDataset(
        sample_definition,
        sizes=[src_dataset.sizes],
    )

    # Two sort keys are supplied: the seeded permutation and the sizes.
    self.datasets[split] = SortDataset(
        nested,
        sort_order=[shuffle, src_dataset.sizes],
    )
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Loads the indexed ``input0`` and ``input1`` datasets (both required and
    of equal length) from ``<args.data>/<key>/<split>``, exposes them as two
    separate network inputs, and attaches float targets read from
    ``<args.data>/label/<split>.label`` when that file exists.

    Args:
        split (str): name of the split to load.
        combine (bool): forwarded to ``data_utils.load_indexed_dataset``.
        **kwargs: accepted and ignored.

    Returns:
        The dataset stored under ``self.datasets[split]``.
    """

    def get_path(key, split):
        return os.path.join(self.args.data, key, split)

    def make_dataset(key, dictionary):
        split_path = get_path(key, split)
        return data_utils.load_indexed_dataset(
            split_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )

    input0 = make_dataset('input0', self.source_dictionary)
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path('input0', split))
    input1 = make_dataset('input1', self.source_dictionary)
    assert input1 is not None, 'could not find dataset: {}'.format(
        get_path('input1', split))
    assert len(input0) == len(input1), 'input pair different length'

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)
        input1 = PrependTokenDataset(input1, self.args.init_token)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(input0))

    if self.args.truncate_sequence:
        input0 = TruncateDataset(input0, self.args.max_positions)
        input1 = TruncateDataset(input1, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input0': {
            'src_tokens': RightPadDataset(
                input0,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(input0, reduce=False),
        },
        'net_input1': {
            'src_tokens': RightPadDataset(
                input1,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(input1, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens0': NumelDataset(input0, reduce=True),
        'ntokens1': NumelDataset(input1, reduce=True),
    }

    label_path = "{0}.label".format(get_path('label', split))
    if os.path.exists(label_path):
        # Read through a context manager so the file handle is always closed.
        with open(label_path) as h:
            labels = [float(x.strip()) for x in h]
        dataset.update(target=RawLabelDataset(labels))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        # per-sample size is the larger of the two inputs
        sizes=[np.maximum(input0.sizes, input1.sizes)],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(
        split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, epoch=1, combine=False, **kwargs):
    """Load a given dataset split.

    One indexed dataset is loaded per entry of ``configs.fields`` from
    ``<args.data>/<field>/<split>``. The field equal to
    ``configs.static_field`` is token-masked, fields in
    ``configs.byte_fields`` are value-masked, and every other field is only
    right-padded. The static field also supplies the sizes and sample count
    of the combined dataset stored in ``self.datasets[split]``.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        epoch (int): accepted but not used here.
        combine (bool): forwarded to ``data_utils.load_indexed_dataset``.

    Raises:
        FileNotFoundError: if a field's dataset cannot be found.
        ValueError: if ``configs.static_field`` is not in ``configs.fields``.
    """
    paths = utils.split_paths(self.args.data)
    assert len(paths) > 0

    src_tokens = {}
    tgt_tokens = {}
    tgt_values = {}
    # Bound only when the static field is processed; checked after the loop.
    src_dataset_code = None

    for field in configs.fields:
        split_path = os.path.join(self.args.data, field, split)
        dictionary = self.source_dictionary[field]

        dataset = data_utils.load_indexed_dataset(
            split_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        if dataset is None:
            raise FileNotFoundError(
                "Dataset not found: {} ({})".format(split, split_path)
            )

        dataset = maybe_shorten_dataset(
            dataset,
            split,
            self.args.shorten_data_split_list,
            self.args.shorten_method,
            self.args.tokens_per_sample,
            self.args.seed,
        )

        # create continuous blocks of tokens
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample - 1,  # one less for <s>
            pad=dictionary.pad(),
            eos=dictionary.eos(),
            break_mode=self.args.sample_break_mode,
        )
        logger.info("loaded {} blocks from: {}".format(len(dataset), split_path))

        # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
        dataset = PrependTokenDataset(dataset, dictionary.bos())

        if field == configs.static_field:
            src_dataset_code, tgt_dataset_code = MaskTokensDataset.apply_mask(
                dataset,
                dictionary,
                pad_idx=dictionary.pad(),
                mask_idx=self.mask_idx_dict[field],
                seed=self.args.seed,
                mask_prob=self.args.mask_prob,
                leave_unmasked_prob=self.args.leave_unmasked_prob,
                random_token_prob=self.args.random_token_prob,
                freq_weighted_replacement=self.args.freq_weighted_replacement,
            )
            src_tokens[field] = RightPadDataset(
                src_dataset_code, pad_idx=dictionary.pad()
            )
            tgt_tokens[field] = RightPadDataset(
                tgt_dataset_code, pad_idx=dictionary.pad()
            )
        elif field in configs.byte_fields:
            src_dataset_value, tgt_dataset_value = MaskValuesDataset.apply_mask(
                dataset,
                dictionary,
                pad_idx=dictionary.pad(),
                mask_idx=self.mask_idx_dict[field],
                seed=self.args.seed,
                mask_prob=self.args.mask_prob,
                leave_unmasked_prob=self.args.leave_unmasked_prob,
                random_token_prob=self.args.random_token_prob,
                freq_weighted_replacement=self.args.freq_weighted_replacement,
            )
            src_tokens[field] = RightPadDataset(
                src_dataset_value, pad_idx=dictionary.pad()
            )
            # dummy tokens are treated as 1
            # TODO: assert there should not be any dummy tokens here
            tgt_values[field] = BytevalueDataset(tgt_dataset_value, dictionary)
        else:
            src_tokens[field] = RightPadDataset(
                dataset, pad_idx=dictionary.pad()
            )

    if src_dataset_code is None:
        # Without the static field there is nothing to size or order the
        # combined dataset by; fail with a clear message rather than an
        # UnboundLocalError further down.
        raise ValueError(
            "static field {!r} is not present in configs.fields".format(
                configs.static_field)
        )

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_dataset_code))

    self.datasets[split] = SortDataset(
        NestedDictionaryDataset(
            {
                "id": IdDataset(),
                "net_input": {
                    "src_tokens": src_tokens,
                    "src_lengths": NumelDataset(src_dataset_code, reduce=False),
                },
                "target": {
                    "tgt_tokens": tgt_tokens,
                    "tgt_values": tgt_values
                },
                "nsentences": NumSamplesDataset(),
                "ntokens": NumelDataset(src_dataset_code, reduce=True),
            },
            sizes=[src_dataset_code.sizes],
        ),
        sort_order=[
            shuffle,
            src_dataset_code.sizes,
        ],
    )
def load_dataset(self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs):
    """Load a given dataset split.

    Each line of the ``.jsonl`` file is one multiple-choice question. For
    every choice the question and the choice are binarized and concatenated,
    giving one network input per choice (``net_input1`` ... ``net_inputN``).

    Args:
        split (str): name of the split (e.g., train, valid, test)
        data_path (str, optional): explicit file to read; defaults to
            ``<args.data>/<split>.jsonl``.
        epoch, combine, return_only: accepted but not used here.

    Returns:
        The dataset stored under ``self.datasets[split]``.

    Raises:
        FileNotFoundError: if the data file does not exist.
    """
    num_classes = self.args.num_classes

    def binarize(s, append_bos=False):
        if self.bpe is not None:
            s = self.bpe.encode(s)
        tokens = self.vocab.encode_line(
            s, append_eos=True, add_if_not_exist=False,
        ).long()
        if append_bos and self.args.init_token is not None:
            tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
        return tokens

    if data_path is None:
        data_path = os.path.join(self.args.data, split + '.jsonl')
    if not os.path.exists(data_path):
        raise FileNotFoundError('Cannot find data: {}'.format(data_path))

    per_choice_tokens = [[] for _ in range(num_classes)]
    per_choice_lengths = [[] for _ in range(num_classes)]
    labels = []

    with open(data_path) as h:
        for line in h:
            example = json.loads(line.strip())
            if 'answerKey' in example:
                # 'A' -> 0, 'B' -> 1, ...
                labels.append(ord(example['answerKey']) - ord('A'))

            choices = example['question']['choices']
            stem = example['question']['stem']
            assert len(choices) == num_classes

            # format: `<s> Q: Where would I not want a fox? </s> A: hen house </s>`
            question_toks = binarize('Q: ' + stem, append_bos=True)
            for idx, choice in enumerate(choices):
                answer_toks = binarize('A: ' + choice['text'])
                src_bin = torch.cat([question_toks, answer_toks])
                per_choice_tokens[idx].append(src_bin)
                per_choice_lengths[idx].append(len(src_bin))

    assert all(
        len(per_choice_tokens[0]) == len(per_choice_tokens[i])
        for i in range(num_classes))
    assert len(per_choice_tokens[0]) == len(per_choice_lengths[0])
    assert len(labels) == 0 or len(labels) == len(per_choice_tokens[0])

    length_arrays = [np.array(lengths) for lengths in per_choice_lengths]
    src_tokens = [
        ListDataset(toks, sizes)
        for toks, sizes in zip(per_choice_tokens, length_arrays)
    ]
    src_lengths = [ListDataset(sizes) for sizes in length_arrays]

    dataset = {
        'id': IdDataset(),
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens[0], reduce=True),
    }

    for i in range(num_classes):
        dataset['net_input{}'.format(i + 1)] = {
            'src_tokens': RightPadDataset(
                src_tokens[i],
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': src_lengths[i],
        }

    if len(labels) > 0:
        dataset['target'] = RawLabelDataset(labels)

    dataset = NestedDictionaryDataset(
        dataset,
        sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
    )

    with data_utils.numpy_seed(self.args.seed):
        order = np.random.permutation(len(dataset))
        dataset = SortDataset(
            dataset,
            # shuffle
            sort_order=[order],
        )

    print('| Loaded {} with {} samples'.format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Loads the indexed source tokens from ``<args.data>/<split>`` and the
    labelled spans from ``<args.data>/<split>.nonterm``, then raises
    ``NotImplementedError``.

    NOTE: the raise is unconditional, so the dataset assembly after it is
    currently unreachable; it is retained unchanged.

    Raises:
        NotImplementedError: always, once both datasets have been found.
    """
    data_dir = Path(self.args.data)

    inputs_path = data_dir / "{split}".format(split=split)
    src_tokens = data_utils.load_indexed_dataset(
        str(inputs_path),
        self.source_dictionary,
        self.args.dataset_impl,
        combine=combine,
    )
    assert src_tokens is not None, "could not find dataset: {}".format(
        inputs_path)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    src_tokens = PrependTokenDataset(src_tokens, self.source_dictionary.bos())

    targets_path = data_dir / "{}.nonterm".format(split)
    labelled_spans = data_utils.load_indexed_dataset(
        str(targets_path),
        self.label_dictionary,
        self.args.dataset_impl,
        combine=combine,
    )
    assert labelled_spans is not None, "could not find labels: {}".format(
        targets_path)

    raise NotImplementedError

    # ---- unreachable from here on (see docstring) ----
    source_pad = self.source_dictionary.pad()
    label_pad = self.label_dictionary.pad()

    target_spans = LabelledSpanDataset(labelled_spans, return_spans=True)
    labels = LabelledSpanDataset(labelled_spans, return_spans=False)

    # all possible word spans in each sequence
    word_spans = WordSpanDataset(src_tokens, self.source_dictionary,
                                 self.is_word_initial)
    all_spans = ProductSpanDataset(word_spans)

    dataset = {
        "id": IdDataset(),
        "net_input": {
            "src_tokens": RightPadDataset(src_tokens, pad_idx=source_pad),
            "nsrc_tokens": NumelDataset(src_tokens),
            "src_spans": RightPadDataset(all_spans, pad_idx=label_pad),
            "nsrc_spans": NumSpanDataset(all_spans),
        },
        "targets": RightPadDataset(labels, pad_idx=label_pad),
        "target_spans": RightPadDataset(target_spans, pad_idx=label_pad),
        "ntargets": NumelDataset(labels),
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
        "nwords": NumWordsDataset(src_tokens, self.dictionary,
                                  self.is_word_initial),
        "word_spans": RightPadDataset(word_spans, pad_idx=label_pad),
    }

    nested_dataset = NestedDictionaryDatasetFix(dataset,
                                                sizes=[src_tokens.sizes])

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(nested_dataset, sort_order=[shuffle])

    logger.info("Loaded {0} with #samples: {1}".format(
        split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    ``input0`` (question) and ``input1`` (context) are loaded from
    ``<args.data>/<key>/<split>`` and concatenated into one token stream.
    When ``<args.data>/label/<split>.label`` exists, each line is read as a
    whitespace-separated pair of integer positions and shifted so that it
    indexes into the concatenated stream.

    Args:
        split (str): name of the split to load.
        combine (bool): forwarded to ``data_utils.load_indexed_dataset``.
        **kwargs: accepted and ignored.

    Returns:
        The dataset stored under ``self.datasets[split]``.
    """

    def get_path(key, split):
        return os.path.join(self.args.data, key, split)

    def make_dataset(key, dictionary):
        split_path = get_path(key, split)
        # Use the dictionary the caller passed in (previously this argument
        # was ignored in favour of self.source_dictionary).
        return data_utils.load_indexed_dataset(
            split_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )

    # inputs are loaded similarly to sentence_prediction
    input0 = make_dataset("input0", self.source_dictionary)  # question
    input1 = make_dataset("input1", self.source_dictionary)  # context

    # src_tokens: <init_token> input0 <separator_token> input1 <eos_token>
    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)
    if self.args.separator_token is not None:
        input1 = PrependTokenDataset(input1, self.args.separator_token)
    if self.args.max_context_length is not None:
        # Truncate with the same setting the guard checks; the previous code
        # tested max_context_length but truncated to max_option_length.
        input1 = TruncateDataset(input1, self.args.max_context_length)
    src_tokens = ConcatSentencesDataset(input0, input1)
    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    # question length (init_token possibly included)
    input0_lengths = NumelDataset(input0, reduce=False)

    dataset = {
        "id": IdDataset(),
        "net_input": {
            "src_tokens": RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            "src_lengths": NumelDataset(src_tokens, reduce=False),
            "input0_lengths": input0_lengths,
        },
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
    }

    # labels (spans) are loaded similarly to sentence_ranking
    label_path = "{}.label".format(get_path("label", split))

    def _process_label(positions, input0_length, truncate_sequence,
                       max_positions):
        """Process a span [start:end] to the input range.

        After processing, tokens can be accessed by tokens[start:end+1].
        TODO: change inputs to reflect this change in the first place.
        """
        # Shift past the question, plus one if a separator was prepended.
        shift = input0_length + (self.args.separator_token is not None)
        start, end = [pos + shift for pos in positions]
        end -= 1  # make the end position inclusive
        # [0, 511]
        if truncate_sequence:
            if start >= max_positions:
                # not predictable
                start, end = max_positions - 1, max_positions - 1
            elif end >= max_positions:
                end = max_positions - 1
        return start, end

    if os.path.exists(label_path):
        with open(label_path) as h:
            # each line: (start_position, end_position)
            targets = [
                _process_label(
                    tuple(int(pos) for pos in x.split()),
                    input0_lengths[i],
                    self.args.truncate_sequence,
                    self.max_positions(),
                )
                for i, x in enumerate(h)
            ]
        dataset.update(target=RawLabelDataset(targets))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(
        split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Source tokens come from ``<args.data>/data/<split>`` and, when present,
    targets from the indexed dataset at ``<args.data>/label/<split>``.

    Args:
        split (str): name of the split to load.
        combine (bool): forwarded to ``data_utils.load_indexed_dataset``.
        **kwargs: accepted and ignored.

    Returns:
        The dataset stored under ``self.datasets[split]``.
    """
    args = self.args

    def get_path(type, split):
        return os.path.join(args.data, type, split)

    def make_dataset(type, dictionary):
        return data_utils.load_indexed_dataset(
            get_path(type, split),
            dictionary,
            args.dataset_impl,
            combine=combine,
        )

    src_tokens = make_dataset('data', self.source_dictionary)
    if args.init_token is not None:
        src_tokens = PrependTokenDataset(src_tokens, args.init_token)

    # Draw the permutation under a fixed seed, before any truncation.
    with data_utils.numpy_seed(args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, args.max_positions)

    padded_tokens = RightPadDataset(
        src_tokens,
        pad_idx=self.source_dictionary.pad(),
    )
    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': padded_tokens,
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    label_dataset = make_dataset('label', self.label_dictionary)
    if label_dataset is not None:
        # Drop eos and shift label ids down by the number of special symbols.
        without_eos = StripTokenDataset(
            label_dataset,
            id_to_strip=self.label_dictionary.eos(),
        )
        dataset['target'] = OffsetTokensDataset(
            without_eos,
            offset=-self.label_dictionary.nspecial,
        )

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(nested_dataset, sort_order=[shuffle])

    logger.info("Loaded {0} with #samples: {1}".format(
        split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, epoch=1, combine=False, **kwargs):
    """Load a given dataset split.

    The token dataset is obtained from ``self._load_dataset_split`` and then
    masked; the masked source and its targets are stored in
    ``self.datasets[split]``.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        epoch (int): forwarded to ``self._load_dataset_split``.
        combine (bool): forwarded to ``self._load_dataset_split``.
    """
    cfg = self.cfg
    vocab = self.source_dictionary

    dataset = self._load_dataset_split(split, epoch, combine)

    # create masked input and targets
    # NOTE(review): this is the only place that reads ``self.args``; the rest
    # of the method uses ``self.cfg`` -- confirm which one is intended.
    if cfg.mask_whole_words:
        mask_whole_words = get_whole_word_mask(self.args, vocab)
    else:
        mask_whole_words = None

    src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
        dataset,
        vocab,
        pad_idx=vocab.pad(),
        mask_idx=self.mask_idx,
        seed=cfg.seed,
        mask_prob=cfg.mask_prob,
        leave_unmasked_prob=cfg.leave_unmasked_prob,
        random_token_prob=cfg.random_token_prob,
        freq_weighted_replacement=cfg.freq_weighted_replacement,
        mask_whole_words=mask_whole_words,
        mask_multiple_length=cfg.mask_multiple_length,
        mask_stdev=cfg.mask_stdev,
    )

    with data_utils.numpy_seed(cfg.seed):
        shuffle = np.random.permutation(len(src_dataset))

    target_dataset = RightPadDataset(
        tgt_dataset,
        pad_idx=vocab.pad(),
    )

    input_dict = {
        "src_tokens": RightPadDataset(
            src_dataset,
            pad_idx=vocab.pad(),
        ),
        "src_lengths": NumelDataset(src_dataset, reduce=False),
    }
    if cfg.include_target_tokens:
        # The same padded target object is shared with the "target" entry.
        input_dict["target_tokens"] = target_dataset

    nested = NestedDictionaryDataset(
        {
            "id": IdDataset(),
            "net_input": input_dict,
            "target": target_dataset,
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_dataset, reduce=True),
        },
        sizes=[src_dataset.sizes],
    )

    self.datasets[split] = SortDataset(
        nested,
        sort_order=[shuffle, src_dataset.sizes],
    )
def load_dataset(self, split, epoch=1, combine=False, **kwargs):
    """Load a given dataset split.

    Loads a source dataset and a tag dataset that share the prefix
    ``<data_path>/<split>.<src>-<tgt>.``, strips eos from both, and stores a
    nested dataset in ``self.datasets[split]``. The first sample is logged
    as source symbols and as tag symbols.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        epoch (int): selects which data path is used for the training subset.
        combine (bool): accepted but not used here.
    """
    cfg = self.cfg

    paths = utils.split_paths(cfg.data)
    assert len(paths) > 0
    is_train_subset = split == getattr(cfg, "train_subset", None)
    if not is_train_subset:
        # if not training data set, use the first shard for valid and test
        paths = paths[:1]
    data_path = paths[(epoch - 1) % len(paths)]

    # infer langcode
    src = cfg.source_lang
    tgt = cfg.target_lang
    prefix = os.path.join(data_path, '{}.{}-{}.'.format(split, src, tgt))

    src_dataset = data_utils.load_indexed_dataset(
        prefix + src, self.src_dict, cfg.dataset_impl)
    tag_dataset = data_utils.load_indexed_dataset(
        prefix + tgt, self.tag_dict, cfg.dataset_impl)

    src_dataset = StripTokenDataset(
        src_dataset, id_to_strip=self.source_dictionary.eos())
    tag_dataset = StripTokenDataset(
        tag_dataset, id_to_strip=self.tag_dictionary.eos())

    # Targets are padded with the source pad index; tag ids are shifted so
    # that they start right after it.
    tag_pad = self.source_dictionary.pad()
    tag_offset = tag_pad + 1
    shifted_tags = OffsetTokensDataset(
        tag_dataset,
        offset=-self.tag_dictionary.nspecial + tag_offset,
    )

    definition = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_dataset, pad_idx=self.source_dictionary.pad()),
            'src_lengths': NumelDataset(src_dataset, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_dataset, reduce=True),
        'target': RightPadDataset(shifted_tags, pad_idx=tag_pad),
    }
    dataset = NestedDictionaryDataset(
        definition,
        sizes=[src_dataset.sizes],
    )

    # Log the first sample, decoded back to symbols.
    first_source = [
        self.src_dict[k] for k in dataset[0]['net_input.src_tokens']
    ]
    logger.info(str(first_source))
    first_tags = [
        self.tag_dict[k + self.tag_dictionary.nspecial - tag_offset]
        for k in dataset[0]['target']
    ]
    logger.info(str(first_tags))

    self.datasets[split] = dataset
def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs):
    """Load a given dataset split.

    Inputs and answers are read from the CSV files registered in
    ``self.data_path_table`` under ``<split>_input`` and ``<split>_answer``.
    For each input row, one network input is built per class from a
    statement column followed by ``' Context: '`` and its evidence column.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        epoch, combine, data_path, return_only: accepted but not used here.

    Returns:
        The dataset stored under ``self.datasets[split]``.

    Raises:
        KeyError: if ``split`` has no entry in ``self.data_path_table``.
        FileNotFoundError: if either CSV file is missing.
    """
    def binarize(s, append_bos=False):
        if self.bpe is not None:
            s = self.bpe.encode(s)
        tokens = self.vocab.encode_line(
            s,
            append_eos=True,
            add_if_not_exist=False,
        ).long()
        if append_bos and self.args.init_token is not None:
            tokens = torch.cat(
                [tokens.new([self.args.init_token]), tokens])
        return tokens

    data_root = self.args.data
    self.data_path_table = {
        'train_input': os.path.join(
            data_root, 'trainvaldev', 'subtaskA_data_all_plusplus.csv'),
        'train_answer': os.path.join(
            data_root, 'trainvaldev', 'subtaskA_answers_all.csv'),
        'valid_input': os.path.join(
            data_root, 'Dev Data', 'subtaskA_dev_data_plusplus.csv'),
        'valid_answer': os.path.join(
            data_root, 'Dev Data', 'subtaskA_gold_answers.csv'),
    }

    data_path_input = self.data_path_table[split + '_input']
    data_path_answer = self.data_path_table[split + '_answer']

    if not os.path.exists(data_path_input):
        raise FileNotFoundError(
            'Cannot find data: {}'.format(data_path_input))
    if not os.path.exists(data_path_answer):
        raise FileNotFoundError(
            'Cannot find data: {}'.format(data_path_answer))

    num_classes = self.args.num_classes
    src_tokens = [[] for _ in range(num_classes)]
    src_lengths = [[] for _ in range(num_classes)]
    src_ids = []
    labels = []
    label_ids = []

    with open(data_path_input) as f:
        reader = csv.reader(f)
        # The first row is skipped (presumably a header -- confirm).
        for row in islice(reader, 1, None):
            src_ids.append(row[0])
            for i in range(num_classes):
                src = row[i + 1]
                evidence = row[i + 3]
                if src.isupper():
                    src = src.capitalize()
                src = src + ' Context: ' + evidence
                src_bin = binarize(src, append_bos=True)
                src_tokens[i].append(src_bin)
                src_lengths[i].append(len(src_bin))

    assert all(
        len(src_tokens[0]) == len(src_tokens[i])
        for i in range(num_classes))
    assert len(src_tokens[0]) == len(src_lengths[0])

    with open(data_path_answer) as f:
        # Unlike the input file, no row is skipped here.
        for row in csv.reader(f):
            label_ids.append(row[0])
            labels.append(1 - int(row[1]))

    assert len(labels) == 0 or len(labels) == len(src_tokens[0])
    if labels:
        # Only compare ids when answers were read; an empty answers file is
        # explicitly allowed above and previously triggered an IndexError.
        assert src_ids == label_ids

    for i in range(num_classes):
        src_lengths[i] = np.array(src_lengths[i])
        src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i])
        src_lengths[i] = ListDataset(src_lengths[i])

    dataset = {
        'id': IdDataset(),
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens[0], reduce=True),
    }

    for i in range(num_classes):
        dataset.update({
            'net_input{}'.format(i + 1): {
                'src_tokens': RightPadDataset(
                    src_tokens[i],
                    pad_idx=self.source_dictionary.pad(),
                ),
                'src_lengths': src_lengths[i],
            }
        })

    if len(labels) > 0:
        dataset.update({'target': RawLabelDataset(labels)})

    dataset = NestedDictionaryDataset(
        dataset,
        sizes=[
            np.maximum.reduce(
                [src_token.sizes for src_token in src_tokens])
        ],
    )

    with data_utils.numpy_seed(self.args.seed):
        dataset = SortDataset(
            dataset,
            # shuffle
            sort_order=[np.random.permutation(len(dataset))],
        )

    print('| Loaded {} with {} samples'.format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def build_dataset_for_inference(self, src_tokens, src_lengths):
    """Wrap ``src_tokens`` in a ``RightPadDataset`` for inference.

    Padding uses the source dictionary's pad index. ``src_lengths`` is
    accepted but not used here.
    """
    pad_idx = self.source_dictionary.pad()
    return RightPadDataset(src_tokens, pad_idx=pad_idx)
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    ``args.input0`` (source, required), ``args.input1`` (synthetic target,
    optional) and ``args.input2`` (reference, optional) name the indexed
    datasets under ``<args.data>/<name>/<split>``. The source and synthetic
    target are concatenated into ``src_tokens``; outside the ``valid`` split
    and when ``self.add_ref_prob > 0`` the reference is passed to the
    concatenation as well. With ``args.add_tran_loss`` a masked copy of the
    reference is added as ``parallel_src_tokens`` / ``parallel_target``.

    Args:
        split (str): name of the split to load.
        combine (bool): forwarded to ``data_utils.load_indexed_dataset``.
        **kwargs: accepted and ignored.

    Returns:
        The dataset stored under ``self.datasets[split]``.
    """
    def get_path(key, split):
        return os.path.join(self.args.data, key, split)

    def make_dataset(key, dictionary):
        split_path = get_path(key, split)
        return data_utils.load_indexed_dataset(
            split_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )

    pad_idx = self.source_dictionary.pad()

    # input0 is source, input1 is synthetic target, input2 is reference
    input0 = make_dataset(self.args.input0, self.source_dictionary)
    # Report the path that was actually looked up. The previous message
    # passed the builtin ``type`` to get_path, which raised a TypeError
    # instead of showing this assertion.
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path(self.args.input0, split))
    input1 = make_dataset(self.args.input1, self.source_dictionary)

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    has_reference = self.args.input2 is not None
    if has_reference:
        input2 = make_dataset(self.args.input2, self.source_dictionary)

    # The reference is mixed into the source only outside validation.
    use_reference = (
        has_reference and self.add_ref_prob > 0 and split != 'valid')
    use_tran_loss = has_reference and self.args.add_tran_loss

    if use_reference:
        input3 = PrependTokenDataset(input2, self.args.separator_token)
    else:
        input3 = None

    if input1 is None:
        src_tokens = input0
    else:
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)
        if use_reference:
            src_tokens = ConcatSentencesDataset(
                input0, input3, input1,
                add_ref_prob=self.add_ref_prob,
                drop_ref_rate=self.args.dropout_ref,
                pad_idx=pad_idx,
                eos_idx=self.source_dictionary.eos(),
                bos_idx=self.source_dictionary.bos())
        else:
            src_tokens = ConcatSentencesDataset(input0, input1)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    if use_tran_loss:
        # create masked input and targets
        mask_whole_words = (
            get_whole_word_mask(self.args, self.source_dictionary)
            if self.args.mask_whole_words else None)

        ref_dataset, ref_target_dataset = MaskTokensDataset.apply_mask(
            input2,
            self.source_dictionary,
            pad_idx=pad_idx,
            mask_idx=self.mask_idx,
            seed=self.args.seed,
            mask_prob=self.args.mask_prob,
            leave_unmasked_prob=self.args.leave_unmasked_prob,
            random_token_prob=self.args.random_token_prob,
            freq_weighted_replacement=self.args.freq_weighted_replacement,
            mask_whole_words=mask_whole_words,
        )

        # NOTE(review): without a separator token the *unmasked* reference
        # is concatenated below -- confirm this is intended.
        if self.args.separator_token is not None:
            input2 = PrependTokenDataset(
                ref_dataset, self.args.separator_token)
        parallel_src_tokens = ConcatSentencesDataset(input0, input2)
        if self.args.truncate_sequence:
            parallel_src_tokens = TruncateDataset(
                parallel_src_tokens, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=pad_idx,
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if use_tran_loss:
        dataset['net_input']['parallel_src_tokens'] = RightPadDataset(
            parallel_src_tokens,
            pad_idx=pad_idx,
        )

    if self.args.add_prev_output_tokens:
        prev_tokens_dataset = RightPadDataset(
            RollDataset(src_tokens, 1),
            pad_idx=self.dictionary.pad(),
        )
        dataset['net_input'].update(
            prev_output_tokens=prev_tokens_dataset,
        )

    if not self.args.regression_target:
        label_dataset = make_dataset('label', self.label_dictionary)
        if label_dataset is not None:
            dataset.update(target=OffsetTokensDataset(
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=self.label_dictionary.eos(),
                ),
                offset=-self.label_dictionary.nspecial,
            ))
        # NOTE(review): placed beside (not inside) the label check; the
        # nesting was ambiguous in the original layout -- confirm.
        if use_tran_loss:
            # used as translation target when calculating loss
            dataset.update(parallel_target=RightPadDataset(
                ref_target_dataset,
                pad_idx=pad_idx,
            ))
    else:
        label_path = "{0}.label".format(get_path('label', split))
        if os.path.exists(label_path):

            def parse_regression_target(i, line):
                values = line.split()
                assert len(values) == self.args.num_classes, \
                    f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"'
                return [float(x) for x in values]

            # Read through a context manager so the handle is closed.
            with open(label_path) as h:
                targets = [
                    parse_regression_target(i, line.strip())
                    for i, line in enumerate(h)
                ]
            dataset.update(target=RawLabelDataset(targets))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
        all_sizes=(src_tokens.all_sizes
                   if self.args.add_target_num_tokens else None),
        padding_idx=pad_idx,
        add_ref_prob=self.add_ref_prob if split != 'valid' else 0.,
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(
        split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Reads ``input0`` and ``input1`` .. ``input<num_classes>`` from below
    ``self.args.data``, pairs every option with ``input0`` and stores the
    assembled dataset in ``self.datasets[split]``.

    Args:
        split: name of the split to load.
        combine: forwarded to ``data_utils.load_indexed_dataset``.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The dataset stored in ``self.datasets[split]``.
    """

    def get_path(key, split):
        return os.path.join(self.args.data, key, split)

    def make_dataset(key, dictionary):
        # Use the dictionary the caller asked for instead of silently
        # falling back to the source dictionary.
        return data_utils.load_indexed_dataset(
            get_path(key, split),
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )

    input0 = make_dataset('input0', self.source_dictionary)
    input_options = [
        make_dataset('input{idx}'.format(idx=idx + 1), self.source_dictionary)
        for idx in range(self.args.num_classes)
    ]

    if self.args.separator_token is not None:
        input0 = PrependTokenDataset(input0, self.args.separator_token)

    # One concatenated "option + input0" dataset per candidate option.
    src_tokens = []
    for input_option in input_options:
        if self.args.init_token is not None:
            input_option = PrependTokenDataset(input_option, self.args.init_token)
        if self.args.max_option_length is not None:
            input_option = TruncateDataset(input_option, self.args.max_option_length)
        src_token = ConcatSentencesDataset(input_option, input0)
        src_token = maybe_shorten_dataset(
            src_token,
            split,
            self.args.shorten_data_split_whitelist,
            self.args.shorten_method,
            self.args.max_positions,
            self.args.seed,
        )
        src_tokens.append(src_token)

    # Seeded so the permutation is reproducible for a given args.seed.
    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens[0]))

    dataset = {
        'id': IdDataset(),
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens[0], reduce=True),
    }

    # Options are exposed as net_input1, net_input2, ... (1-based keys).
    for option_number, src_token in enumerate(src_tokens, start=1):
        dataset.update({
            'net_input{idx}'.format(idx=option_number): {
                'src_tokens': RightPadDataset(
                    src_token,
                    pad_idx=self.source_dictionary.pad(),
                ),
                'src_lengths': NumelDataset(src_token, reduce=False),
            }
        })

    # Targets are optional: they are only attached when the label file exists.
    label_path = '{}.label'.format(get_path('label', split))
    if os.path.exists(label_path):
        with open(label_path) as h:
            dataset.update(target=RawLabelDataset(
                [int(x.strip()) for x in h]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        # Per-example size is the element-wise maximum over all options.
        sizes=[
            np.maximum.reduce(
                [src_token.sizes for src_token in src_tokens])
        ],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(
        split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Builds CoQA features for ``split`` through ``get_CoQA_features``, keeps
    the raw examples/features on ``self.examples`` / ``self.features`` and
    stores a shuffled nested dataset in ``self.datasets[split]``, which is
    also returned. ``combine`` and ``**kwargs`` are not used here.
    """
    # Create the BPE encoder object.
    bpe_encoder = MultiprocessingEncoder(self.args.encoder_json, self.args.vocab_bpe)
    bpe_encoder.initializer()

    # Run the CoQA preprocessing.
    examples, features = get_CoQA_features(
        self.args,
        bpe_encoder,
        self.args.init_token,
        self.args.separator_token,
        self.dictionary.pad(),
        split=split,
    )
    self.examples[split] = examples
    self.features[split] = features

    # Per-feature scalar fields; each dataset key equals the attribute name.
    label_fields = (
        "start_position",
        "end_position",
        "is_unk",
        "is_yes",
        "is_no",
        "number",
        "option",
    )
    labels = {name: [] for name in label_fields}
    qas_ids = []
    token_tensors = []
    token_counts = []
    p_masks = []

    for feature in features:
        tokens = torch.IntTensor(feature.input_tokens).long()
        mask = torch.IntTensor(feature.p_mask).long()
        token_tensors.append(tokens)
        token_counts.append(len(tokens))
        p_masks.append(mask)
        qas_ids.append(feature.qas_id)
        for name in label_fields:
            labels[name].append(getattr(feature, name))

    src_tokens = ListDataset(token_tensors, token_counts)
    src_lengths = ListDataset(token_counts)

    dataset = {
        "id": IdDataset(),
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
        "qas_id": RawLabelDataset(qas_ids),
        "net_input": {
            "src_tokens": RightPadDataset(src_tokens, pad_idx=self.dictionary.pad()),
            "src_lengths": src_lengths,
            "start_position": RawLabelDataset(labels["start_position"]),
            "p_mask": RightPadDataset(p_masks, pad_idx=self.dictionary.pad()),
        },
    }
    # Top-level label entries, in the same order as label_fields.
    for name in label_fields:
        dataset[name] = RawLabelDataset(labels[name])

    dataset = NestedDictionaryDataset(
        dataset,
        sizes=[np.maximum.reduce([src_tokens.sizes])],
    )

    # Seeded permutation so the ordering is reproducible for args.seed.
    with data_utils.numpy_seed(self.args.seed):
        order = np.random.permutation(len(dataset))
        dataset = SortDataset(
            dataset,
            sort_order=[order],
        )

    print("| Loaded {} with {} samples".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, epoch=0, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    When ``self.cfg.data`` ends in "1" the trailing digit is replaced by a
    shard number derived from ``epoch``. For the training subset every
    ``train``, ``train1``, ``train2`` ... shard that exists is loaded and
    concatenated. The result is stored in ``self.datasets[split]`` and
    returned. The ``combine`` argument is not forwarded: the helpers below
    always load with ``combine=False``.
    """
    data_path = self.cfg.data
    if data_path.endswith("1"):
        data_shard = (epoch - 1) % self.cfg.num_data_splits + 1
        data_path = self.cfg.data[:-1] + str(data_shard)

    def get_path(key, data_split):
        return os.path.join(data_path, str(key), data_split)

    def make_dataset(key, dictionary, data_split, combine):
        return data_utils.load_indexed_dataset(
            get_path(key, data_split),
            dictionary,
            combine=combine,
        )

    def load_split(data_split, metric):
        """Return (input_src or None, input_tgt, label) for one split name."""
        input_src = None
        if self.cfg.include_src:
            input_src = make_dataset("input_src", self.dictionary, data_split, combine=False)
            assert input_src is not None, "could not find dataset: {}".format(
                get_path("input_src", data_split))

        input_tgt = make_dataset("input_tgt", self.dictionary, data_split, combine=False)
        assert input_tgt is not None, "could not find dataset: {}".format(
            get_path("input_tgt", data_split))

        label_path = f"{get_path(metric, data_split)}.{metric}"
        assert os.path.exists(
            label_path), f"could not find dataset: {label_path}"

        np_labels = np.loadtxt(label_path)
        if self.cfg.target_metric == "ter":
            # Labels are negated when the target metric is "ter".
            np_labels = -np_labels
        return input_src, input_tgt, RawLabelDataset(np_labels)

    src_datasets, tgt_datasets, label_datasets = [], [], []

    def collect(data_split):
        loaded_src, loaded_tgt, loaded_label = load_split(
            data_split, self.cfg.target_metric)
        src_datasets.append(loaded_src)
        tgt_datasets.append(loaded_tgt)
        label_datasets.append(loaded_label)

    is_train = split == self.cfg.train_subset
    if is_train:
        # Walk train, train1, train2, ... until a shard is missing.
        for k in itertools.count():
            split_k = "train" + (str(k) if k > 0 else "")
            prefix = os.path.join(data_path, "input_tgt", split_k)
            if not indexed_dataset.dataset_exists(prefix, impl=None):
                if k == 0:
                    raise FileNotFoundError(f"Dataset not found: {prefix}")
                break
            collect(split_k)
    else:
        collect(split)

    if len(tgt_datasets) == 1:
        input_tgt, label = tgt_datasets[0], label_datasets[0]
        if self.cfg.include_src:
            input_src = src_datasets[0]
    else:
        input_tgt = ConcatDataset(tgt_datasets)
        label = ConcatDataset(label_datasets)
        if self.cfg.include_src:
            input_src = ConcatDataset(src_datasets)

    input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions)
    if self.cfg.include_src:
        input_src = PrependTokenDataset(input_src, self.dictionary.bos())
        input_src = TruncateDataset(input_src, self.cfg.max_positions)
        # src_lengths counts only the source part here.
        src_lengths = NumelDataset(input_src, reduce=False)
        src_tokens = ConcatSentencesDataset(input_src, input_tgt)
    else:
        src_tokens = PrependTokenDataset(input_tgt, self.dictionary.bos())
        src_lengths = NumelDataset(src_tokens, reduce=False)

    dataset = NestedDictionaryDataset(
        {
            "id": IdDataset(),
            "net_input": {
                "src_tokens": RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                "src_lengths": src_lengths,
            },
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
            "target": label,
        },
        sizes=[src_tokens.sizes],
    )

    assert len(dataset) % self.cfg.mt_beam == 0, (
        "dataset size (%d) is not a multiple of beam size (%d)" %
        (len(dataset), self.cfg.mt_beam))

    # no need to shuffle valid/test sets
    if is_train and not self.cfg.no_shuffle:
        # need to keep all hypotheses together: shuffle the group start
        # offsets, then expand each start into mt_beam consecutive indices
        start_idx = np.arange(0, len(dataset), self.cfg.mt_beam)
        with data_utils.numpy_seed(self.cfg.seed + epoch):
            np.random.shuffle(start_idx)

        within_group = np.arange(0, self.cfg.mt_beam)
        shuffle = (start_idx[:, np.newaxis] + within_group[np.newaxis, :]).reshape(-1)

        dataset = SortDataset(
            dataset,
            sort_order=[shuffle],
        )

    logger.info(f"Loaded {split} with #samples: {len(dataset)}")

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Loads ``input0`` (required) and ``input1`` (optional) from below
    ``self.args.data``, attaches a ``CSRLabelDataset`` target when
    ``label/<split>.npz`` exists, and stores the result in
    ``self.datasets[split]``, which is also returned.
    """

    def get_path(key, split):
        return os.path.join(self.args.data, key, split)

    def make_dataset(key, dictionary):
        return data_utils.load_indexed_dataset(
            get_path(key, split),
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )

    first = make_dataset("input0", self.source_dictionary)
    assert first is not None, "could not find dataset: {}".format(
        get_path("input0", split))
    second = make_dataset("input1", self.source_dictionary)

    if self.args.init_token is not None:
        first = PrependTokenDataset(first, self.args.init_token)

    if second is not None:
        if self.args.separator_token is not None:
            second = PrependTokenDataset(second, self.args.separator_token)
        src_tokens = ConcatSentencesDataset(first, second)
    else:
        src_tokens = first

    # The permutation is drawn before any shortening is applied.
    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    src_tokens = maybe_shorten_dataset(
        src_tokens,
        split,
        self.args.shorten_data_split_list,
        self.args.shorten_method,
        self.args.max_positions,
        self.args.seed,
    )

    net_input = {
        "src_tokens": RightPadDataset(
            src_tokens,
            pad_idx=self.source_dictionary.pad(),
        ),
        "src_lengths": NumelDataset(src_tokens, reduce=False),
    }
    dataset = {
        "id": IdDataset(),
        "net_input": net_input,
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
    }

    if self.args.add_prev_output_tokens:
        # prev_output_tokens is src_tokens passed through RollDataset(.., 1).
        net_input["prev_output_tokens"] = RightPadDataset(
            RollDataset(src_tokens, 1),
            pad_idx=self.dictionary.pad(),
        )

    # Targets are optional: only attached when the .npz label file exists.
    label_path = "{0}.npz".format(get_path("label", split))
    if os.path.exists(label_path):
        dataset["target"] = CSRLabelDataset(load_npz(label_path))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    dataset = nested_dataset
    if not self.args.no_shuffle:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(
        split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Loads ``input0`` (required) and ``input1`` (optional) from below
    ``self.args.data`` and attaches a target when one is found: an indexed
    ``label`` dataset for classification, or a ``label/<split>.label`` text
    file when ``self.args.regression_target`` is set.

    Args:
        split: name of the split to load.
        combine: forwarded to ``data_utils.load_indexed_dataset``.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The dataset stored in ``self.datasets[split]``.

    Raises:
        AssertionError: if ``input0`` cannot be loaded, or a regression
            label line does not contain ``self.args.num_classes`` values.
    """

    def get_path(key, split):
        return os.path.join(self.args.data, key, split)

    def make_dataset(key, dictionary):
        return data_utils.load_indexed_dataset(
            get_path(key, split),
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )

    input0 = make_dataset('input0', self.source_dictionary)
    # The message must name the dataset key explicitly; passing the builtin
    # `type` here would make os.path.join raise TypeError instead.
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path('input0', split))
    input1 = make_dataset('input1', self.source_dictionary)

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    if input1 is None:
        src_tokens = input0
    else:
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)

        src_tokens = ConcatSentencesDataset(input0, input1)

    # The permutation is drawn before any shortening is applied.
    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    src_tokens = maybe_shorten_dataset(
        src_tokens,
        split,
        self.args.shorten_data_split_list,
        self.args.shorten_method,
        self.args.max_positions,
        self.args.seed,
    )

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if self.args.add_prev_output_tokens:
        prev_tokens_dataset = RightPadDataset(
            RollDataset(src_tokens, 1),
            pad_idx=self.dictionary.pad(),
        )
        dataset['net_input'].update(
            prev_output_tokens=prev_tokens_dataset,
        )

    if not self.args.regression_target:
        label_dataset = make_dataset('label', self.label_dictionary)
        if label_dataset is not None:
            dataset.update(target=OffsetTokensDataset(
                # Strip the label dictionary's eos id, then shift ids down
                # by the number of special symbols.
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=self.label_dictionary.eos(),
                ),
                offset=-self.label_dictionary.nspecial,
            ))
    else:
        label_path = "{0}.label".format(get_path('label', split))
        if os.path.exists(label_path):

            def parse_regression_target(i, line):
                values = line.split()
                assert len(values) == self.args.num_classes, \
                    f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"'
                return [float(x) for x in values]

            # Context manager so the label file is closed deterministically.
            with open(label_path) as h:
                dataset.update(target=RawLabelDataset([
                    parse_regression_target(i, line.strip())
                    for i, line in enumerate(h.readlines())
                ]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(
        split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, epoch=1, combine=False, **kwargs):
    """Load a given dataset split.

    The data directory is picked from ``utils.split_paths(self.args.data)``
    using ``epoch``; the loaded tokens are grouped into blocks, masked, and
    stored in ``self.datasets[split]``. Nothing is returned.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """
    data_dirs = utils.split_paths(self.args.data)
    assert len(data_dirs) > 0
    chosen_dir = data_dirs[(epoch - 1) % len(data_dirs)]
    split_path = os.path.join(chosen_dir, split)

    tokens = data_utils.load_indexed_dataset(
        split_path,
        self.source_dictionary,
        self.args.dataset_impl,
        combine=combine,
    )
    if tokens is None:
        raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path))

    tokens = maybe_shorten_dataset(
        tokens,
        split,
        self.args.shorten_data_split_list,
        self.args.shorten_method,
        self.args.tokens_per_sample,
        self.args.seed,
    )

    # create continuous blocks of tokens
    blocks = TokenBlockDataset(
        tokens,
        tokens.sizes,
        self.args.tokens_per_sample,
        pad=self.source_dictionary.pad(),
        eos=self.source_dictionary.eos(),
        break_mode=self.args.sample_break_mode,
    )
    logger.info('loaded {} blocks from: {}'.format(len(blocks), split_path))

    # remove tail
    blocks = RemoveTailDataset(blocks)

    # create masked input and targets
    if self.args.mask_whole_words:
        mask_whole_words = get_whole_word_mask(self.args, self.source_dictionary)
    else:
        mask_whole_words = None

    src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
        blocks,
        self.source_dictionary,
        pad_idx=self.source_dictionary.pad(),
        mask_idx=self.mask_idx,
        seed=self.args.seed,
        mask_prob=self.args.mask_prob,
        leave_unmasked_prob=self.args.leave_unmasked_prob,
        random_token_prob=self.args.random_token_prob,
        freq_weighted_replacement=self.args.freq_weighted_replacement,
        mask_whole_words=mask_whole_words,
    )

    # The permutation seed depends on the epoch as well as args.seed.
    with data_utils.numpy_seed(self.args.seed + epoch):
        shuffle = np.random.permutation(len(src_dataset))

    nested = NestedDictionaryDataset(
        {
            'id': IdDataset(),
            'net_input': {
                'src_tokens': RightPadDataset(
                    src_dataset,
                    pad_idx=self.source_dictionary.pad(),
                ),
                'src_lengths': NumelDataset(src_dataset, reduce=False),
            },
            'target': RightPadDataset(
                tgt_dataset,
                pad_idx=self.source_dictionary.pad(),
            ),
            'nsentences': NumSamplesDataset(),
            'ntokens': NumelDataset(src_dataset, reduce=True),
        },
        sizes=[src_dataset.sizes],
    )

    # Two sort keys are supplied: the random permutation and the sizes.
    self.datasets[split] = SortDataset(
        nested,
        sort_order=[
            shuffle,
            src_dataset.sizes,
        ],
    )
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Loads ``input0`` (required) and ``input1`` (optional) from below
    ``self.args.data`` and attaches a target when one is found: an indexed
    ``label`` dataset for classification, or one float per line from
    ``label/<split>.label`` when ``self.args.regression_target`` is set.

    Args:
        split: name of the split to load.
        combine: forwarded to ``data_utils.load_indexed_dataset``.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The dataset stored in ``self.datasets[split]``.

    Raises:
        AssertionError: if ``input0`` cannot be loaded.
    """

    def get_path(key, split):
        return os.path.join(self.args.data, key, split)

    def make_dataset(key, dictionary):
        # Honour the dictionary argument so the label dataset is loaded
        # with the target dictionary rather than the source dictionary.
        return data_utils.load_indexed_dataset(
            get_path(key, split),
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )

    input0 = make_dataset('input0', self.source_dictionary)
    # The message must name the dataset key explicitly; passing the builtin
    # `type` here would make os.path.join raise TypeError instead.
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path('input0', split))
    input1 = make_dataset('input1', self.source_dictionary)

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    if input1 is None:
        src_tokens = input0
    else:
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)

        src_tokens = ConcatSentencesDataset(input0, input1)

    # The permutation is drawn before any truncation is applied.
    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if not self.args.regression_target:
        label_dataset = make_dataset('label', self.target_dictionary)
        if label_dataset is not None:
            dataset.update(target=OffsetTokensDataset(
                # Strip the target dictionary's eos id, then shift ids down
                # by the number of special symbols.
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=self.target_dictionary.eos(),
                ),
                offset=-self.target_dictionary.nspecial,
            ))
    else:
        label_path = "{0}.label".format(get_path('label', split))
        if os.path.exists(label_path):
            # Context manager so the label file is closed deterministically.
            with open(label_path) as h:
                dataset.update(target=RawLabelDataset(
                    [float(x.strip()) for x in h]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    print("| Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test).

    Loads ``input0`` as the source tokens and ``label`` as per-token
    targets from below ``self.args.data``; both are required.

    Args:
        split: name of the split to load.
        combine: forwarded to ``data_utils.load_indexed_dataset``.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The dataset stored in ``self.datasets[split]``.

    Raises:
        AssertionError: if either ``input0`` or ``label`` cannot be loaded.
    """

    def get_path(key, split):
        return os.path.join(self.args.data, key, split)

    def make_dataset(key, dictionary):
        dataset = data_utils.load_indexed_dataset(
            get_path(key, split),
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        assert dataset is not None, "could not find dataset: {}".format(
            get_path(key, split))
        return dataset

    src_tokens = make_dataset("input0", self.source_dictionary)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    label_dataset = make_dataset("label", self.label_dictionary)

    dataset = {
        "id": IdDataset(),
        "net_input": {
            "src_tokens": RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            "src_lengths": NumelDataset(src_tokens, reduce=False),
        },
        "target": RightPadDataset(
            # use -1 as padding, will be used to mask out padding when calculating loss
            ReplaceDataset(
                # replace eos and existing padding (used when some tokens should not be predicted) with -1
                OffsetTokensDataset(
                    # offset tokens to get the targets to the correct range (0,1,2,...)
                    label_dataset,
                    offset=-self.label_dictionary.nspecial,
                ),
                replace_map={
                    self.label_dictionary.eos() - self.label_dictionary.nspecial: -1,
                    self.label_dictionary.pad() - self.label_dictionary.nspecial: -1,
                },
                # `np.int` was only an alias of the builtin int and no
                # longer exists in current NumPy; the builtin is equivalent.
                offsets=np.zeros(len(label_dataset), dtype=int),
            ),
            pad_idx=-1,
        ),
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
    }

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]