def load_split(data_split, metric):
    input_src = None
    if self.cfg.include_src:
        input_src = make_dataset(
            "input_src", self.dictionary, data_split, combine=False
        )
        assert input_src is not None, "could not find dataset: {}".format(
            get_path("input_src", data_split)
        )

    input_tgt = make_dataset(
        "input_tgt", self.dictionary, data_split, combine=False
    )
    assert input_tgt is not None, "could not find dataset: {}".format(
        get_path("input_tgt", data_split)
    )

    label_path = f"{get_path(metric, data_split)}.{metric}"
    assert os.path.exists(label_path), f"could not find dataset: {label_path}"

    np_labels = np.loadtxt(label_path)
    if self.cfg.target_metric == "ter":
        np_labels = -np_labels
    label = RawLabelDataset(np_labels)

    return input_src, input_tgt, label
def load_label(path, label_name):
    if label_name is None:
        return None
    path = os.path.join(path, label_name)
    labels = []
    for line in open(path).readlines():
        labels.append(
            torch.FloatTensor([int(label.strip()) for label in line.strip().split()])
        )
    label_dataset = RawLabelDataset(labels)
    return label_dataset
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(type, split):
        return os.path.join(self.args.data, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.source_dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    input0 = make_dataset('input0', self.source_dictionary)
    input_options = [
        make_dataset('input{idx}'.format(idx=idx + 1), self.source_dictionary)
        for idx in range(self.args.num_classes)
    ]

    if self.args.separator_token is not None:
        input0 = PrependTokenDataset(input0, self.args.separator_token)

    src_tokens = []
    for input_option in input_options:
        if self.args.init_token is not None:
            input_option = PrependTokenDataset(input_option, self.args.init_token)
        if self.args.max_option_length is not None:
            input_option = TruncateDataset(input_option, self.args.max_option_length)
        src_token = ConcatSentencesDataset(input_option, input0)
        if self.args.truncate_sequence:
            src_token = TruncateDataset(src_token, self.args.max_positions)
        src_tokens.append(src_token)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens[0]))

    dataset = {
        'id': IdDataset(),
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens[0], reduce=True),
    }

    for src_token_idx in range(len(src_tokens)):
        dataset.update({
            'net_input{idx}'.format(idx=src_token_idx + 1): {
                'src_tokens': RightPadDataset(
                    src_tokens[src_token_idx],
                    pad_idx=self.source_dictionary.pad(),
                ),
                'src_lengths': NumelDataset(src_tokens[src_token_idx], reduce=False),
            }
        })

    label_path = '{}.label'.format(get_path('label', split))
    if os.path.exists(label_path):
        with open(label_path) as h:
            dataset.update(target=RawLabelDataset(
                [int(x.strip()) for x in h.readlines()]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(type, split):
        return os.path.join(self.args.data, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.source_dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    input0 = make_dataset('input0', self.source_dictionary)
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path('input0', split))
    input1 = make_dataset('input1', self.source_dictionary)

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    if input1 is None:
        src_tokens = input0
    else:
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)
        src_tokens = ConcatSentencesDataset(input0, input1)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if not self.args.regression_target:
        label_dataset = make_dataset('label', self.target_dictionary)
        if label_dataset is not None:
            dataset.update(target=OffsetTokensDataset(
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=self.target_dictionary.eos(),
                ),
                offset=-self.target_dictionary.nspecial,
            ))
    else:
        label_path = f"{get_path('label', split)}.label"
        if os.path.exists(label_path):
            dataset.update(target=RawLabelDataset(
                [float(x.strip()) for x in open(label_path).readlines()]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    print(f"| Loaded {split} with #samples: {len(dataset)}")

    self.datasets[split] = dataset
    return self.datasets[split]
def load_KE_dataset(self, split, kedata_path, epoch=0, combine=False):
    paths = kedata_path.split(':')
    assert len(paths) > 0
    data_path = paths[epoch % len(paths)]

    def get_path(type):
        return os.path.join(data_path, type, split)

    def desc_dataset(type, dictionary, relation_desc=None):
        now_path = get_path(type)
        # print(now_path)
        dataset = data_utils.load_indexed_dataset(
            now_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        if self.args.init_token is not None:
            dataset = PrependTokenDataset(dataset, self.args.init_token)
        if relation_desc is not None:
            dataset = ConcatSentencesDataset(dataset, relation_desc)
        dataset = TruncateDataset(dataset, self.args.tokens_per_sample)  # ???
        dataset = RightPadDataset(dataset, pad_idx=self.source_dictionary.pad())
        return dataset

    assert not (self.args.relation_desc and self.args.relemb_from_desc)

    if self.args.relation_desc or self.args.relemb_from_desc:
        now_path = get_path('relation_desc')
        relation_desc = data_utils.load_indexed_dataset(
            now_path,
            self.source_dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        if self.args.relation_desc:
            if self.args.separator_token is not None:
                relation_desc = PrependTokenDataset(relation_desc, self.args.separator_token)
            else:
                raise Exception("separator_token is None")
        elif self.args.relemb_from_desc:
            relation_desc = PrependTokenDataset(relation_desc, self.args.init_token)
            relation_desc = TruncateDataset(relation_desc, self.args.tokens_per_sample // 8)  # 64
            relation_desc = RightPadDataset(relation_desc, pad_idx=self.source_dictionary.pad())
    else:
        relation_desc = None

    head = desc_dataset("head", self.source_dictionary)
    tail = desc_dataset("tail", self.source_dictionary)
    nHead = desc_dataset("negHead", self.source_dictionary)
    nTail = desc_dataset("negTail", self.source_dictionary)
    head_r = desc_dataset("head", self.source_dictionary,
                          relation_desc if self.args.relation_desc else None)
    tail_r = desc_dataset("tail", self.source_dictionary,
                          relation_desc if self.args.relation_desc else None)

    assert len(nHead) % len(head) == 0, "check the KE positive and negative instances' number"
    self.negative_sample_size = len(nHead) / len(head)

    relation = np.load(get_path("relation") + ".npy")
    sizes = np.load(get_path("sizes") + ".npy")

    with data_utils.numpy_seed(self.args.seed + epoch):
        shuffle = np.random.permutation(len(head))

    net_input = {
        'heads': head,
        'tails': tail,
        'nHeads': KeNegDataset(nHead, self.args),
        'nTails': KeNegDataset(nTail, self.args),
        'heads_r': head_r,
        'tails_r': tail_r,
        'src_lengths': FakeNumelDataset(sizes, reduce=False),
    }
    if self.args.relemb_from_desc:
        net_input['relation_desc'] = relation_desc

    dataset = SortDataset(
        NestedDictionaryDataset(
            {
                'id': IdDataset(),
                'net_input': net_input,
                'target': RawLabelDataset(relation),
                'nsentences': NumSamplesDataset(),
                'ntokens': FakeNumelDataset(sizes, reduce=True),
            },
            sizes=[sizes],
        ),
        sort_order=[shuffle],
    )
    return dataset
def load_dataset(self, split, epoch=0, combine=False, data_selector=None):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """
    print('Loading dataset')

    data_path = os.path.join(self.args.data)
    dataset_inst = data_utils.load_indexed_dataset(
        os.path.join(data_path, 'insts', split),
        self.instruction_dictionary,
        self.args.dataset_impl,
        combine=combine,
    )
    dataset_state = data_utils.load_indexed_dataset(
        os.path.join(data_path, 'states', split),
        self.state_dictionary,
        self.args.dataset_impl,
        combine=combine,
    )

    if dataset_inst is None or dataset_state is None:
        raise FileNotFoundError('Dataset not found: {}'.format(split))

    dataset_inst = SeqOfSeqDataset(dataset_inst, self.instruction_dictionary)
    dataset_state = SeqOfSeqDataset(dataset_state, self.state_dictionary)
    dataset_pos = IRPositionDataset(os.path.join(data_path, 'pos', split))
    dataset = IRDataset(dataset_inst, dataset_state, dataset_pos)

    block_size = self.args.function_length

    dataset = IRPadDataset(
        dataset,
        inst_pad_idx=self.instruction_dictionary.pad(),
        state_pad_idx=self.state_dictionary.pad(),
        inst_mask_idx=self.inst_mask_idx,
        state_mask_idx=self.state_mask_idx,
        inst_cls_idx=self.instruction_dictionary.index('<t>'),
        state_cls_idx=self.state_dictionary.index('<t>'),
        smallbert_insts_per_input=self.args.smallbert_insts_per_group,
        smallbert_states_per_input=self.args.smallbert_insts_per_group,
        max_length=block_size,
        inst_pad_length=32,
        state_pad_length=16,
        pair=True,
    )

    labels_str = list(map(json.loads, open(os.path.join(data_path, 'label', split + '.txt'))))
    labels = torch.tensor([
        x - 1 if isinstance(x, int) else int(x.strip()) - 1 for x in labels_str
    ])

    print('| loaded {} batches from: {} and {}'.format(
        len(dataset),
        os.path.join(data_path, 'insts', split),
        os.path.join(data_path, 'states', split)))

    with data_utils.numpy_seed(self.args.seed + epoch):
        shuffle = np.random.permutation(len(dataset))

    self.labels[split] = SortDataset(RawLabelDataset(labels), sort_order=[shuffle])

    self.datasets[split] = SortDataset(
        NestedDictionaryDataset(
            {
                'id': IdDataset(),
                'net_input': {
                    'src': dataset,
                },
                'label': RawLabelDataset(labels),
            },
            sizes=[dataset.sizes],
        ),
        sort_order=[
            shuffle,
            # dataset.sizes,
        ],
    )
def load_dataset(self, split, epoch=1, combine=False, **kwargs):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """
    paths = utils.split_paths(self.args.data)
    assert len(paths) > 0
    data_path = paths[(epoch - 1) % len(paths)]

    languages = sorted(
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
    )

    logger.info("Training on {0} languages: {1}".format(len(languages), languages))
    logger.info("Language to id mapping: {}".format(
        {lang: id for id, lang in enumerate(languages)}))

    mask_whole_words = self._get_whole_word_mask()
    lang_datasets = []
    for lang_id, language in enumerate(languages):
        split_path = os.path.join(data_path, language, split)

        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.source_dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        if dataset is None:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path))

        # create continuous blocks of tokens
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample - 1,  # one less for <s>
            pad=self.source_dictionary.pad(),
            eos=self.source_dictionary.eos(),
            break_mode=self.args.sample_break_mode,
        )
        logger.info('loaded {} blocks from: {}'.format(len(dataset), split_path))

        # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
        dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())

        src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
            dataset,
            self.source_dictionary,
            pad_idx=self.source_dictionary.pad(),
            mask_idx=self.mask_idx,
            seed=self.args.seed,
            mask_prob=self.args.mask_prob,
            leave_unmasked_prob=self.args.leave_unmasked_prob,
            random_token_prob=self.args.random_token_prob,
            freq_weighted_replacement=self.args.freq_weighted_replacement,
            mask_whole_words=mask_whole_words,
        )

        lang_dataset = NestedDictionaryDataset(
            {
                'net_input': {
                    'src_tokens': PadDataset(
                        src_dataset,
                        pad_idx=self.source_dictionary.pad(),
                        left_pad=False,
                    ),
                    'src_lengths': NumelDataset(src_dataset, reduce=False),
                },
                'target': PadDataset(
                    tgt_dataset,
                    pad_idx=self.source_dictionary.pad(),
                    left_pad=False,
                ),
                'nsentences': NumSamplesDataset(),
                'ntokens': NumelDataset(src_dataset, reduce=True),
                'lang_id': RawLabelDataset([lang_id] * src_dataset.sizes.shape[0]),
            },
            sizes=[src_dataset.sizes],
        )
        lang_datasets.append(lang_dataset)

    dataset_lengths = np.array(
        [len(d) for d in lang_datasets],
        dtype=float,
    )
    logger.info('loaded total {} blocks for all languages'.format(dataset_lengths.sum()))

    if split == self.args.train_subset:
        # For train subset, additionally up or down sample languages.
        sample_probs = self._get_sample_prob(dataset_lengths)
        logger.info("Sample probability by language: {}".format(
            {lang: "{0:.4f}".format(sample_probs[id]) for id, lang in enumerate(languages)}))
        size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths
        logger.info("Up/Down Sampling ratio by language: {}".format(
            {lang: "{0:.2f}".format(size_ratio[id]) for id, lang in enumerate(languages)}))

        resampled_lang_datasets = [
            ResamplingDataset(
                lang_datasets[i],
                size_ratio=size_ratio[i],
                seed=self.args.seed,
                epoch=epoch,
                replace=size_ratio[i] >= 1.0,
            )
            for i, d in enumerate(lang_datasets)
        ]
        dataset = ConcatDataset(resampled_lang_datasets)
    else:
        dataset = ConcatDataset(lang_datasets)
        lang_splits = [split]
        for lang_id, lang_dataset in enumerate(lang_datasets):
            split_name = split + '_' + languages[lang_id]
            lang_splits.append(split_name)
            self.datasets[split_name] = lang_dataset

        # [TODO]: This is hacky for now to print validation ppl for each
        # language individually. Maybe need task API changes to allow it
        # in more generic ways.
        if split in self.args.valid_subset:
            self.args.valid_subset = self.args.valid_subset.replace(
                split, ','.join(lang_splits))

    with data_utils.numpy_seed(self.args.seed + epoch):
        shuffle = np.random.permutation(len(dataset))

    self.datasets[split] = SortDataset(
        dataset,
        sort_order=[
            shuffle,
            dataset.sizes,
        ],
    )
def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """

    def binarize(s, append_bos=False):
        if self.bpe is not None:
            s = self.bpe.encode(s)
        tokens = self.vocab.encode_line(
            s, append_eos=True, add_if_not_exist=False,
        ).long()
        if append_bos and self.args.init_token is not None:
            tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
        return tokens

    # self.data_path_table={'train_input':os.path.join(self.args.data,'Training Data','subtaskA_data_all.csv'),
    #                       'train_answer':os.path.join(self.args.data,'Training Data','subtaskA_answers_all.csv'),
    #                       'valid_input':os.path.join(self.args.data,'Trial Data','taskA_trial_data.csv'),
    #                       'valid_answer':os.path.join(self.args.data,'Trial Data','taskA_trial_answer.csv')}

    # self.data_path_table={'train_input':os.path.join(self.args.data,'trainval','subtaskA_data_all.csv'),
    #                       'train_answer':os.path.join(self.args.data,'trainval','subtaskA_answers_all.csv'),
    #                       'valid_input':os.path.join(self.args.data,'Dev Data','subtaskA_dev_data.csv'),
    #                       'valid_answer':os.path.join(self.args.data,'Dev Data','subtaskA_gold_answers.csv')}

    self.data_path_table = {
        'train_input': os.path.join(self.args.data, 'trainvaldev', 'subtaskA_data_all_plusplus.csv'),
        'train_answer': os.path.join(self.args.data, 'trainvaldev', 'subtaskA_answers_all.csv'),
        'valid_input': os.path.join(self.args.data, 'Dev Data', 'subtaskA_dev_data_plusplus.csv'),
        'valid_answer': os.path.join(self.args.data, 'Dev Data', 'subtaskA_gold_answers.csv'),
    }

    # self.data_path_table={'train_input':os.path.join(self.args.data,'subtaskA_data_all.csv'),
    #                       'train_answer':os.path.join(self.args.data,'subtaskA_answers_all.csv'),
    #                       'valid_input':os.path.join(self.args.data,'taskA_trial_data.csv'),
    #                       'valid_answer':os.path.join(self.args.data,'taskA_trial_answer.csv')}

    data_path_input = self.data_path_table[split + '_input']
    data_path_answer = self.data_path_table[split + '_answer']

    if not os.path.exists(data_path_input):
        raise FileNotFoundError('Cannot find data: {}'.format(data_path_input))
    if not os.path.exists(data_path_answer):
        raise FileNotFoundError('Cannot find data: {}'.format(data_path_answer))

    src_tokens = [[] for i in range(self.args.num_classes)]
    src_lengths = [[] for i in range(self.args.num_classes)]
    src_ids = []
    labels = []
    label_ids = []

    with open(data_path_input) as f:
        reader = csv.reader(f)
        for row in islice(reader, 1, None):
            src_ids.append(row[0])
            for i in range(self.args.num_classes):
                src = row[i + 1]
                evidence = row[i + 3]
                if src.isupper():
                    src = src.capitalize()
                src = src + ' Context: ' + evidence
                src_bin = binarize(src, append_bos=True)
                src_tokens[i].append(src_bin)
                src_lengths[i].append(len(src_bin))

    assert all(len(src_tokens[0]) == len(src_tokens[i]) for i in range(self.args.num_classes))
    assert len(src_tokens[0]) == len(src_lengths[0])

    with open(data_path_answer) as f:
        reader = csv.reader(f)
        for row in reader:
            label_ids.append(row[0])
            label = 1 - int(row[1])
            labels.append(label)

    assert len(labels) == 0 or len(labels) == len(src_tokens[0])
    assert all(src_ids[i] == label_ids[i] for i in range(len(src_ids)))

    for i in range(self.args.num_classes):
        src_lengths[i] = np.array(src_lengths[i])
        src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i])
        src_lengths[i] = ListDataset(src_lengths[i])

    dataset = {
        'id': IdDataset(),
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens[0], reduce=True),
    }

    for i in range(self.args.num_classes):
        dataset.update({
            'net_input{}'.format(i + 1): {
                'src_tokens': RightPadDataset(
                    src_tokens[i],
                    pad_idx=self.source_dictionary.pad(),
                ),
                'src_lengths': src_lengths[i],
            }
        })

    if len(labels) > 0:
        dataset.update({'target': RawLabelDataset(labels)})

    dataset = NestedDictionaryDataset(
        dataset,
        sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
    )

    with data_utils.numpy_seed(self.args.seed):
        dataset = SortDataset(
            dataset,
            # shuffle
            sort_order=[np.random.permutation(len(dataset))],
        )

    print('| Loaded {} with {} samples'.format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """
    print("Split type --> " + str(split))

    def binarize(s, append_bos=False):
        if self.bpe is not None:
            s = self.bpe.encode(s)
        tokens = self.vocab.encode_line(
            s, append_eos=True, add_if_not_exist=False,
        ).long()
        if append_bos and self.args.init_token is not None:
            tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
        return tokens

    if data_path is None:
        data_path = os.path.join(self.args.data, split + '.jsonl')
    if not os.path.exists(data_path):
        raise FileNotFoundError('Cannot find data: {}'.format(data_path))

    src_tokens = [[] for i in range(self.args.num_classes)]
    src_lengths = [[] for i in range(self.args.num_classes)]
    labels = []

    with open(data_path) as h:
        for line in h:
            example = json.loads(line.strip())
            if 'answerKey' in example:
                label = ord(example['answerKey']) - ord('A')
                labels.append(label)
            question = example['question']['stem']
            if self.args.num_classes != len(example['question']['choices']):
                print("Class size = " + str(self.args.num_classes) +
                      ". Length of sample size = " + str(len(example['question']['choices'])))
            assert len(example['question']['choices']) == self.args.num_classes
            # format: `<s> Q: Where would I not want a fox? </s> A: hen house </s>`
            question = 'Q: ' + question
            question_toks = binarize(question, append_bos=True)
            for i, choice in enumerate(example['question']['choices']):
                src = 'A: ' + choice['text']
                src_bin = torch.cat([question_toks, binarize(src)])
                src_tokens[i].append(src_bin)
                src_lengths[i].append(len(src_bin))

    assert all(len(src_tokens[0]) == len(src_tokens[i]) for i in range(self.args.num_classes))
    assert len(src_tokens[0]) == len(src_lengths[0])
    assert len(labels) == 0 or len(labels) == len(src_tokens[0])

    for i in range(self.args.num_classes):
        src_lengths[i] = np.array(src_lengths[i])
        src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i])
        src_lengths[i] = ListDataset(src_lengths[i])

    dataset = {
        'id': IdDataset(),
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens[0], reduce=True),
    }

    for i in range(self.args.num_classes):
        dataset.update({
            'net_input{}'.format(i + 1): {
                'src_tokens': RightPadDataset(
                    src_tokens[i],
                    pad_idx=self.source_dictionary.pad(),
                ),
                'src_lengths': src_lengths[i],
            }
        })

    if len(labels) > 0:
        dataset.update({'target': RawLabelDataset(labels)})

    dataset = NestedDictionaryDataset(
        dataset,
        sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
    )

    with data_utils.numpy_seed(self.args.seed):
        dataset = SortDataset(
            dataset,
            # shuffle
            sort_order=[np.random.permutation(len(dataset))],
        )

    print('| Loaded {} with {} samples'.format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_glue_data(task_path, mydict, mode='train'):
    # One big list; each item is a document matrix, and each item in the matrix
    # is a node's value (for token_id and ...).
    # dataset = data_utils.load_indexed_dataset(path,mydict,'mmap',combine=False,)
    # dataset = TokenBlockDataset(dataset,dataset.sizes,512 - 1,pad=mydict.pad(),eos=mydict.eos(), break_mode='complete',)
    # dataset = PrependTokenDataset(dataset, mydict.bos())
    # dataset=[]
    # input1=open(input_path1,'r').readlines()#[:10000]
    # label=open(label_path,'r').readlines()

    input0 = data_utils.load_indexed_dataset(
        os.path.join(task_path, 'input0', mode),
        mydict,
        'mmap',
        combine=False,
    )
    assert input0 is not None, 'could not find dataset: {}'.format(
        os.path.join(task_path, 'input0', mode))
    input1 = data_utils.load_indexed_dataset(
        os.path.join(task_path, 'input1', mode),
        mydict,
        'mmap',
        combine=False,
    )

    input0 = PrependTokenDataset(input0, mydict.bos())
    if input1 is None:
        src_tokens = input0
    else:
        input1 = PrependTokenDataset(input1, mydict.eos())
        src_tokens = ConcatSentencesDataset(input0, input1)

    if 'STS-B' not in task_path:
        label_dictionary = Dictionary.load(os.path.join(task_path, 'label', 'dict.txt'))
        label_dictionary.add_symbol('<mask>')
        # label_dataset = make_dataset('label', label_dictionary)
        label_dataset = data_utils.load_indexed_dataset(
            os.path.join(task_path, 'label', mode),
            label_dictionary,
            'mmap',
            combine=False,
        )
        if label_dataset is not None:
            label = OffsetTokensDataset(
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=label_dictionary.eos(),
                ),
                offset=-label_dictionary.nspecial,
            )
    else:
        label_path = "{0}.label".format(os.path.join(task_path, 'label', mode))
        if os.path.exists(label_path):

            def parse_regression_target(i, line):
                values = line.split()
                assert len(values) == 1, \
                    f'expected a single regression target value on line {i}, found: "{line}"'
                return [float(x) for x in values]

            with open(label_path) as h:
                label = RawLabelDataset([
                    parse_regression_target(i, line.strip())
                    for i, line in enumerate(h.readlines())
                ])

    print('data size: ', len(src_tokens), len(label))
    assert len(src_tokens) == len(label)

    # with data_utils.numpy_seed(self.args.seed):
    #     shuffle = np.random.permutation(len(src_tokens))
    # src_tokens = maybe_shorten_dataset(
    #     src_tokens,
    #     split,
    #     self.args.shorten_data_split_list,
    #     self.args.shorten_method,
    #     self.args.max_positions,
    #     self.args.seed,
    # )
    # input_data1=[]
    # input_data2=[]
    # label_list=[]
    # for line in input1:
    #     if len(line.strip())==0:
    #         input_data1.append([])
    #     else:
    #         line = line.strip().split(' ')
    #         input_data1.append([int(x) for x in line])
    # if input_path2:
    #     input2=open(input_path2,'r').readlines()
    #     for line in input2:
    #         if len(line.strip())==0:
    #             input_data2.append([])
    #         else:
    #             line = line.strip().split(' ')
    #             input_data2.append([int(x) for x in line])
    # if task=='QNLI':
    #     for line in label:
    #         line = line.strip()
    #         if line=='entailment':
    #             label_list.append(int(1))
    #         else:
    #             assert line=='not_entailment'
    #             label_list.append(int(0))
    # else:
    #     for line in label:
    #         line = line.strip()
    #         label_list.append(int(line))
    # print('data length: ',len(input_data1),len(input_data2))
    # assert len(input_data1)==len(label_list)
    # if len(input_data2)!=0:
    #     assert len(input_data1)==len(input_data2)

    return src_tokens, label
def lang_dataset(lang):
    input0 = make_dataset('input0', lang, self.source_dictionary)
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path('input0', lang, split))
    input1 = make_dataset('input1', lang, self.source_dictionary)

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    if input1 is None:
        src_tokens = input0
    else:
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)
        src_tokens = ConcatSentencesDataset(input0, input1)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if not self.args.regression_target:
        label_dataset = make_dataset('label', lang, self.target_dictionary)
        if label_dataset is not None:
            dataset.update(target=OffsetTokensDataset(
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=self.target_dictionary.eos(),
                ),
                offset=-self.target_dictionary.nspecial,
            ))
    else:
        label_path = "{0}.label".format(get_path('label', lang, split))
        if os.path.exists(label_path):
            dataset.update(target=RawLabelDataset([
                float(x.strip()) for x in open(label_path).readlines()
            ]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    print("| Loaded {0} with #samples: {1}".format(split, len(dataset)))
    return dataset
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""
    # create the BPE encoder object
    bpe_encoder = MultiprocessingEncoder(self.args.encoder_json, self.args.vocab_bpe)
    bpe_encoder.initializer()

    # run the CoQA preprocessing
    examples, features = get_CoQA_features(
        self.args, bpe_encoder, self.args.init_token, self.args.separator_token,
        self.dictionary.pad(), split=split)

    self.examples[split] = examples
    self.features[split] = features

    qas_idx = []
    src_tokens = []
    src_lengths = []
    padding_mask = []
    start_pos = []
    end_pos = []
    is_unk = []
    is_yes = []
    is_no = []
    number = []
    option = []

    for feature in features:
        src = torch.IntTensor(feature.input_tokens).long()
        p_mask = torch.IntTensor(feature.p_mask).long()
        src_tokens.append(src)
        src_lengths.append(len(src))
        padding_mask.append(p_mask)

        qas_idx.append(feature.qas_id)
        start_pos.append(feature.start_position)
        end_pos.append(feature.end_position)
        is_unk.append(feature.is_unk)
        is_yes.append(feature.is_yes)
        is_no.append(feature.is_no)
        number.append(feature.number)
        option.append(feature.option)

    src_tokens = ListDataset(src_tokens, src_lengths)
    src_lengths = ListDataset(src_lengths)

    dataset = {
        "id": IdDataset(),
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
        "qas_id": RawLabelDataset(qas_idx),
        "net_input": {
            "src_tokens": RightPadDataset(src_tokens, pad_idx=self.dictionary.pad()),
            "src_lengths": src_lengths,
            "start_position": RawLabelDataset(start_pos),
            "p_mask": RightPadDataset(padding_mask, pad_idx=self.dictionary.pad()),
        },
        "start_position": RawLabelDataset(start_pos),
        "end_position": RawLabelDataset(end_pos),
        "is_unk": RawLabelDataset(is_unk),
        "is_yes": RawLabelDataset(is_yes),
        "is_no": RawLabelDataset(is_no),
        "number": RawLabelDataset(number),
        "option": RawLabelDataset(option),
    }

    dataset = NestedDictionaryDataset(
        dataset,
        sizes=[np.maximum.reduce([src_tokens.sizes])],
    )

    with data_utils.numpy_seed(self.args.seed):
        dataset = SortDataset(
            dataset,
            sort_order=[np.random.permutation(len(dataset))],
        )

    print("| Loaded {} with {} samples".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(type, split):
        return os.path.join(self.args.data, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.source_dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    # inputs are loaded similarly to sentence_prediction
    input0 = make_dataset("input0", self.source_dictionary)  # question
    input1 = make_dataset("input1", self.source_dictionary)  # context

    # src_tokens: <init_token> input0 <separator_token> input1 <eos_token>
    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)
    if self.args.separator_token is not None:
        input1 = PrependTokenDataset(input1, self.args.separator_token)
    if self.args.max_context_length is not None:
        input1 = TruncateDataset(input1, self.args.max_context_length)
    src_tokens = ConcatSentencesDataset(input0, input1)
    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    dataset = {
        "id": IdDataset(),
        "net_input": {
            "src_tokens": RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            "src_lengths": NumelDataset(src_tokens, reduce=False),
            "input0_lengths": NumelDataset(
                input0, reduce=False
            ),  # question length (init_token possibly included)
        },
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
    }

    # labels (spans) are loaded similarly to sentence_ranking
    label_path = "{}.label".format(get_path("label", split))

    def _process_label(positions, input0_length, truncate_sequence, max_positions):
        """Process a span [start:end] to the input range.

        After processing, tokens can be accessed by tokens[start:end+1].
        TODO: change inputs to reflect this change in the first place.
        """
        start, end = [
            pos + input0_length + (self.args.separator_token is not None)
            for pos in positions
        ]
        end -= 1  # [0, 511]
        if truncate_sequence:
            if start >= max_positions:
                start, end = max_positions - 1, max_positions - 1  # not predictable
            elif end >= max_positions:
                end = max_positions - 1
        return start, end

    if os.path.exists(label_path):
        with open(label_path) as h:
            dataset.update(target=RawLabelDataset([
                _process_label(
                    tuple(int(pos) for pos in x.split()),
                    dataset["net_input"]["input0_lengths"][i],
                    self.args.truncate_sequence,
                    self.max_positions(),
                )
                for i, x in enumerate(h.readlines())  # (start_position, end_position)
            ]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(split):
        return os.path.join(self.args.data, split)

    def make_dataset(split_path, dictionary):
        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.source_dictionary,
            self.args.dataset_impl,
            combine=combine)
        return dataset

    input0 = make_dataset(os.path.join(self.args.data, split), self.source_dictionary)
    assert input0 is not None, 'could not find dataset: {}'.format(
        os.path.join(self.args.data, split))

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    src_tokens = input0

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad()),
            'src_lengths': NumelDataset(src_tokens, reduce=False)},
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if self.args.add_prev_output_tokens:
        prev_tokens_dataset = RightPadDataset(
            RollDataset(src_tokens, 1),
            pad_idx=self.dictionary.pad())
        dataset['net_input'].update(prev_output_tokens=prev_tokens_dataset)

    if not self.args.regression_target:
        label_dataset = make_dataset(
            os.path.join(self.args.data, split + '.label'),
            self.target_dictionary)
        if label_dataset is not None:
            dataset.update(
                target=OffsetTokensDataset(
                    StripTokenDataset(
                        label_dataset,
                        id_to_strip=self.target_dictionary.eos()),
                    offset=-self.target_dictionary.nspecial,
                )
            )
    else:
        label_path = os.path.join(self.args.data, split + '.label')
        if os.path.exists(label_path):
            dataset.update(target=RawLabelDataset([
                float(x.strip()) for x in open(label_path).readlines()]))

    nested_dataset = NestedDictionaryDataset(dataset, sizes=[src_tokens.sizes])

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(nested_dataset, sort_order=[shuffle])  # shuffle

    print("| Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_langpair_dataset(
    data_path, split,
    src, src_dict,
    tgt, tgt_dict,
    combine, dataset_impl, upsample_primary,
    left_pad_source, left_pad_target,
    max_source_positions, max_target_positions,
    prepend_bos=False,
    load_alignments=False,
    load_dependency=False,
    gold_dependency=False,
    dependency_with_input=False,
    truncate_source=False,
    remove_eos_from_source=True,
    append_source_id=False,
    num_buckets=0,
    shuffle=True,
):
    def split_exists(split, src, tgt, lang, data_path):
        filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
        return indexed_dataset.dataset_exists(filename, impl=dataset_impl)

    src_datasets = []
    tgt_datasets = []

    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')

        # infer langcode
        if split_exists(split_k, src, tgt, src, data_path):
            prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt))
        elif split_exists(split_k, tgt, src, src, data_path):
            prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src))
        else:
            if k > 0:
                break
            else:
                raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path))

        src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl)
        if truncate_source:
            src_dataset = AppendTokenDataset(
                TruncateDataset(
                    StripTokenDataset(src_dataset, src_dict.eos()),
                    max_source_positions - 1,
                ),
                src_dict.eos(),
            )
        src_datasets.append(src_dataset)

        tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl)
        if tgt_dataset is not None:
            tgt_datasets.append(tgt_dataset)

        logger.info('{} {} {}-{} {} examples'.format(
            data_path, split_k, src, tgt, len(src_datasets[-1])))

        if not combine:
            break

    assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0

    if len(src_datasets) == 1:
        src_dataset = src_datasets[0]
        tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None
    else:
        sample_ratios = [1] * len(src_datasets)
        sample_ratios[0] = upsample_primary
        src_dataset = ConcatDataset(src_datasets, sample_ratios)
        if len(tgt_datasets) > 0:
            tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
        else:
            tgt_dataset = None

    if prepend_bos:
        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
        if tgt_dataset is not None:
            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())

    eos = None
    if append_source_id:
        src_dataset = AppendTokenDataset(src_dataset, src_dict.index('[{}]'.format(src)))
        if tgt_dataset is not None:
            tgt_dataset = AppendTokenDataset(tgt_dataset, tgt_dict.index('[{}]'.format(tgt)))
        eos = tgt_dict.index('[{}]'.format(tgt))

    align_dataset = None
    if load_alignments:
        align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt))
        if indexed_dataset.dataset_exists(align_path, impl=dataset_impl):
            align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl)

    src_dep, tgt_dep = None, None
    if load_dependency:
        src_dep_path = os.path.join(data_path, '{}.dep.{}'.format(split, src))
        tgt_dep_path = os.path.join(data_path, '{}.dep.{}'.format(split, tgt))
        if os.path.exists(src_dep_path):
            src_deps = []
            with open(src_dep_path, 'r') as src_dep_data:
                for h in src_dep_data:
                    src_deps.append(torch.LongTensor(
                        [[i, int(x) - 1] for i, x in enumerate(h.strip().split())]))
            src_dep = RawLabelDataset(src_deps)
        if os.path.exists(tgt_dep_path):
            tgt_deps = []
            with open(tgt_dep_path, 'r') as tgt_dep_data:
                for h in tgt_dep_data:
                    tgt_deps.append(torch.LongTensor(
                        [[i, int(x) - 1] for i, x in enumerate(h.strip().split())]))
            tgt_dep = RawLabelDataset(tgt_deps)

    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None
    return LanguagePairDatasetWithDependency(
        src_dataset, src_dataset.sizes, src_dict,
        tgt_dataset, tgt_dataset_sizes, tgt_dict,
        left_pad_source=left_pad_source,
        left_pad_target=left_pad_target,
        remove_eos_from_source=remove_eos_from_source,
        align_dataset=align_dataset,
        eos=eos,
        src_dep=src_dep,
        tgt_dep=tgt_dep,
        dependency_with_input=dependency_with_input,
        gold_dependency=gold_dependency,
        num_buckets=num_buckets,
        shuffle=shuffle,
    )
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(type, field, split):
        return os.path.join(self.args.data, type, field, split)

    def make_dataset(type, field, dictionary):
        split_path = get_path(type, field, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    input0 = {}
    input1 = {}
    for field in configs.fields:
        input0[field] = make_dataset('input0', field, self.source_dictionary[field])
        assert input0[field] is not None, 'could not find dataset: {}'.format(
            get_path('input0', field, split))
        input1[field] = make_dataset('input1', field, self.source_dictionary[field])
        assert input1[field] is not None, 'could not find dataset: {}'.format(
            get_path('input1', field, split))
        assert len(input0[field]) == len(input1[field]), 'input pair different length'

        if self.args.init_token is not None:
            input0[field] = PrependTokenDataset(input0[field], self.args.init_token)
            input1[field] = PrependTokenDataset(input1[field], self.args.init_token)

        if self.args.truncate_sequence:
            input0[field] = TruncateDataset(input0[field], self.args.max_positions)
            input1[field] = TruncateDataset(input1[field], self.args.max_positions)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(input0[field]))

    dataset = {
        'id': IdDataset(),
        'net_input0': {
            'src_tokens': {
                field: RightPadDataset(
                    input0[field],
                    pad_idx=self.source_dictionary[field].pad())
                for field in configs.fields
            },
            'src_lengths': NumelDataset(input0[field], reduce=False),
        },
        'net_input1': {
            'src_tokens': {
                field: RightPadDataset(
                    input1[field],
                    pad_idx=self.source_dictionary[field].pad())
                for field in configs.fields
            },
            'src_lengths': NumelDataset(input1[field], reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens0': NumelDataset(input0[field], reduce=True),
        'ntokens1': NumelDataset(input1[field], reduce=True),
    }

    label_path = "{0}.label".format(get_path('label', '', split))
    if os.path.exists(label_path):
        dataset.update(target=RawLabelDataset(
            [float(x.strip()) for x in open(label_path).readlines()]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[np.maximum(input0[field].sizes, input1[field].sizes)],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(type, split):
        return os.path.join(self.args.data, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    input0 = make_dataset('input0', self.source_dictionary)
    assert input0 is not None, 'could not find dataset: {}'.format(get_path('input0', split))
    input1 = make_dataset('input1', self.source_dictionary)

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    if input1 is None:
        src_tokens = input0
    else:
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)
        src_tokens = ConcatSentencesDataset(input0, input1)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    src_tokens = maybe_shorten_dataset(
        src_tokens,
        split,
        self.args.shorten_data_split_whitelist,
        self.args.shorten_method,
        self.args.max_positions,
        self.args.seed,
    )

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if self.args.add_prev_output_tokens:
        prev_tokens_dataset = RightPadDataset(
            RollDataset(src_tokens, 1),
            pad_idx=self.dictionary.pad(),
        )
        dataset['net_input'].update(
            prev_output_tokens=prev_tokens_dataset,
        )

    if not self.args.regression_target:
        label_dataset = make_dataset('label', self.label_dictionary)
        if label_dataset is not None:
            dataset.update(
                target=OffsetTokensDataset(
                    StripTokenDataset(
                        label_dataset,
                        id_to_strip=self.label_dictionary.eos(),
                    ),
                    offset=-self.label_dictionary.nspecial,
                )
            )
    else:
        label_path = "{0}.label".format(get_path('label', split))
        if os.path.exists(label_path):

            def parse_regression_target(i, line):
                values = line.split()
                assert len(values) == self.args.num_classes, \
                    f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"'
                return [float(x) for x in values]

            dataset.update(
                target=RawLabelDataset([
                    parse_regression_target(i, line.strip())
                    for i, line in enumerate(open(label_path).readlines())
                ])
            )

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """

    def getIns(bped, bpeTokens, tokens, L, R):
        resL = 0
        tkL = " ".join(tokens[:L])
        bped_tkL = self.bpe.encode(tkL)
        if bped.find(bped_tkL) == 0:
            resL = len(bped_tkL.split())
        else:
            tkL += " "
            bped_tkL = self.bpe.encode(tkL)
            if bped.find(bped_tkL) == 0:
                resL = len(bped_tkL.split())
        resR = 0
        tkR = " ".join(tokens[R:])
        bped_tkR = self.bpe.encode(tkR)
        if bped.rfind(bped_tkR) + len(bped_tkR) == len(bped):
            resR = len(bpeTokens) - len(bped_tkR.split())
        else:
            tkR = " " + tkR
            bped_tkR = self.bpe.encode(tkR)
            if bped.rfind(bped_tkR) + len(bped_tkR) == len(bped):
                resR = len(bpeTokens) - len(bped_tkR.split())
        return resL, resR

    def getExample(a, bias):
        s = " ".join(a["token"])
        ss = self.bpe.encode(s)
        sst = ss.split()
        headL = a['h']['pos'][0]
        headR = a['h']['pos'][1]
        hiL, hiR = getIns(ss, sst, a["token"], headL, headR)
        tailL = a['t']['pos'][0]
        tailR = a['t']['pos'][1]
        tiL, tiR = getIns(ss, sst, a["token"], tailL, tailR)
        E1b = '1'
        E1e = '2'
        E2b = '3'
        E2e = '4'
        ins = [(hiL, E1b), (hiR, E1e), (tiL, E2b), (tiR, E2e)]
        ins = sorted(ins)
        pE1 = 0
        pE2 = 0
        pE1_ = 0
        pE2_ = 0
        for i in range(0, 4):
            sst.insert(ins[i][0] + i, ins[i][1])
            if ins[i][1] == E1b:
                pE1 = ins[i][0] + i
            elif ins[i][1] == E2b:
                pE2 = ins[i][0] + i
            elif ins[i][1] == E1e:
                pE1_ = ins[i][0] + i
            else:
                pE2_ = ins[i][0] + i
        if pE1_ - pE1 == 1 or pE2_ - pE2 == 1:
            return "???", -1, -1
        else:
            return " ".join(sst), pE1 + bias, pE2 + bias

    def get_example_bert(item):
        if 'text' in item:
            sentence = item['text']
            is_token = False
        else:
            sentence = item['token']
            is_token = True
        pos_head = item['h']['pos']
        pos_tail = item['t']['pos']

        pos_min = pos_head
        pos_max = pos_tail
        if pos_head[0] > pos_tail[0]:
            pos_min = pos_tail
            pos_max = pos_head
            rev = True
        else:
            rev = False

        if not is_token:
            sent0 = self.tokenizer.tokenize(sentence[:pos_min[0]])
            ent0 = self.tokenizer.tokenize(sentence[pos_min[0]:pos_min[1]])
            sent1 = self.tokenizer.tokenize(sentence[pos_min[1]:pos_max[0]])
            ent1 = self.tokenizer.tokenize(sentence[pos_max[0]:pos_max[1]])
            sent2 = self.tokenizer.tokenize(sentence[pos_max[1]:])
        else:
            sent0 = self.tokenizer.tokenize(' '.join(sentence[:pos_min[0]]))
            ent0 = self.tokenizer.tokenize(' '.join(sentence[pos_min[0]:pos_min[1]]))
            sent1 = self.tokenizer.tokenize(' '.join(sentence[pos_min[1]:pos_max[0]]))
            ent1 = self.tokenizer.tokenize(' '.join(sentence[pos_max[0]:pos_max[1]]))
            sent2 = self.tokenizer.tokenize(' '.join(sentence[pos_max[1]:]))

        ent0 = ['[unused0]'] + ent0 + ['[unused1]'] if not rev else ['[unused2]'] + ent0 + ['[unused3]']
        ent1 = ['[unused2]'] + ent1 + ['[unused3]'] if not rev else ['[unused0]'] + ent1 + ['[unused1]']

        re_tokens = ['[CLS]'] + sent0 + ent0 + sent1 + ent1 + sent2 + ['[SEP]']
        pos1 = 1 + len(sent0) if not rev else 1 + len(sent0 + ent0 + sent1)
        pos2 = 1 + len(sent0 + ent0 + sent1) if not rev else 1 + len(sent0)
        # pos1 = min(self.max_length - 1, pos1)
        # pos2 = min(self.max_length - 1, pos2)

        indexed_tokens = self.tokenizer.convert_tokens_to_ids(re_tokens)
        avai_len = len(indexed_tokens)

        # Position
        # pos1 = torch.tensor([[pos1]]).long()
        # pos2 = torch.tensor([[pos2]]).long()

        # indexed_tokens = indexed_tokens[:self.max_length]
        indexed_tokens = torch.tensor(indexed_tokens).long()

        return indexed_tokens, pos1, pos2

    def binarize(s, append_bos=False):
        # if self.bpe is not None:
        #     s = self.bpe.encode(s)
        tokens = self.vocab.encode_line(
            s, append_eos=True, add_if_not_exist=False,
        ).long()
        if append_bos and self.args.init_token is not None:
            tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
        return tokens

    if data_path is None:
        data_path = os.path.join(self.args.data, split + '.jsonl')
    rel2id_path = os.path.join(self.args.data, "rel2id.json")
    if not os.path.exists(data_path):
        raise FileNotFoundError('Cannot find data: {}'.format(data_path))
    if not os.path.exists(rel2id_path):
        raise FileNotFoundError('Cannot find rel2id: {}'.format(rel2id_path))
    rel2id = json.load(open(rel2id_path, "r"))

    labels = []
    src_tokens = []
    src_lengths = []
    src_idx = []

    with open(data_path) as h:
        for line in h:
            example = json.loads(line.strip())
            if 'relation' in example:
                label = rel2id[example['relation']]
                labels.append(label)
            # bped=self.bpe.encode(" ".join(example["token"]))
            if getattr(self.args, 'bert', False):
                src_bin, pE1, pE2 = get_example_bert(example)
            else:
                bped, pE1, pE2 = getExample(example, 1)
                if pE1 == -1:
                    continue
                src_bin = binarize(bped, append_bos=True)
            src_tokens.append(src_bin)
            src_lengths.append(len(src_bin))
            # pE1=0
            # pE2=0
            src_idx.append([[pE1 for i in range(0, self.args.encoder_embed_dim)],
                            [pE2 for i in range(0, self.args.encoder_embed_dim)]])

    src_lengths = np.array(src_lengths)
    src_tokens = ListDataset(src_tokens, src_lengths)
    src_lengths = ListDataset(src_lengths)

    print("src_len", len(src_lengths))
    print("src_tokens", len(src_tokens))

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad()
            ),
            'src_lengths': src_lengths,
        },
        'index': RawLabelDataset(src_idx),
        'target': RawLabelDataset(labels),
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    dataset = NestedDictionaryDataset(
        dataset,
        sizes=src_tokens.sizes,
    )

    with data_utils.numpy_seed(self.args.seed + epoch):
        dataset = SortDataset(
            dataset,
            # shuffle
            sort_order=[np.random.permutation(len(dataset))],
        )

    print('| Loaded {} with {} samples'.format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(key, split):
        return os.path.join(self.cfg.data, key, split)

    def make_dataset(key, dictionary):
        split_path = get_path(key, split)
        try:
            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                combine=combine,
            )
        except Exception as e:
            if "StorageException: [404] Path not found" in str(e):
                logger.warning(f"dataset {e} not found")
                dataset = None
            else:
                raise e
        return dataset

    input0 = make_dataset("input0", self.source_dictionary)
    assert input0 is not None, "could not find dataset: {}".format(get_path("input0", split))
    input1 = make_dataset("input1", self.source_dictionary)

    if self.cfg.init_token is not None:
        input0 = PrependTokenDataset(input0, self.cfg.init_token)

    if input1 is None:
        src_tokens = input0
    else:
        if self.cfg.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.cfg.separator_token)
        src_tokens = ConcatSentencesDataset(input0, input1)

    with data_utils.numpy_seed(self.cfg.seed):
        shuffle = np.random.permutation(len(src_tokens))

    src_tokens = maybe_shorten_dataset(
        src_tokens,
        split,
        self.cfg.shorten_data_split_list,
        self.cfg.shorten_method,
        self.max_positions(),
        self.cfg.seed,
    )

    dataset = {
        "id": IdDataset(),
        "net_input": {
            "src_tokens": RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            "src_lengths": NumelDataset(src_tokens, reduce=False),
        },
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
    }

    if self.cfg.add_prev_output_tokens:
        prev_tokens_dataset = RightPadDataset(
            RollDataset(src_tokens, 1),
            pad_idx=self.dictionary.pad(),
        )
        dataset["net_input"].update(
            prev_output_tokens=prev_tokens_dataset,
        )

    if not self.cfg.regression_target:
        label_dataset = make_dataset("label", self.label_dictionary)
        if label_dataset is not None:
            dataset.update(target=OffsetTokensDataset(
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=self.label_dictionary.eos(),
                ),
                offset=-self.label_dictionary.nspecial,
            ))
    else:
        label_path = "{0}.label".format(get_path("label", split))
        if os.path.exists(label_path):

            def parse_regression_target(i, line):
                values = line.split()
                assert (
                    len(values) == self.cfg.num_classes
                ), f'expected num_classes={self.cfg.num_classes} regression target values on line {i}, found: "{line}"'
                return [float(x) for x in values]

            with open(label_path) as h:
                dataset.update(target=RawLabelDataset([
                    parse_regression_target(i, line.strip())
                    for i, line in enumerate(h.readlines())
                ]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.cfg.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(type, split):
        return os.path.join(self.args.data, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    # input0 is source, input1 is synthetic target, input2 is reference
    input0 = make_dataset(self.args.input0, self.source_dictionary)
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path(self.args.input0, split))
    input1 = make_dataset(self.args.input1, self.source_dictionary)

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    if self.args.input2 is not None:
        input2 = make_dataset(self.args.input2, self.source_dictionary)

    if self.args.input2 is not None and self.add_ref_prob > 0 and split != 'valid':
        input3 = PrependTokenDataset(input2, self.args.separator_token)
    else:
        input3 = None

    if input1 is None:
        src_tokens = input0
    else:
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)

        if self.args.input2 is not None and self.add_ref_prob > 0. and split != 'valid':
            src_tokens = ConcatSentencesDataset(
                input0, input3, input1,
                add_ref_prob=self.add_ref_prob,
                drop_ref_rate=self.args.dropout_ref,
                pad_idx=self.source_dictionary.pad(),
                eos_idx=self.source_dictionary.eos(),
                bos_idx=self.source_dictionary.bos())
        else:
            src_tokens = ConcatSentencesDataset(input0, input1)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    if self.args.input2 is not None and self.args.add_tran_loss:
        # create masked input and targets
        mask_whole_words = get_whole_word_mask(self.args, self.source_dictionary) \
            if self.args.mask_whole_words else None
        ref_dataset, ref_target_dataset = MaskTokensDataset.apply_mask(
            input2,
            self.source_dictionary,
            pad_idx=self.source_dictionary.pad(),
            mask_idx=self.mask_idx,
            seed=self.args.seed,
            mask_prob=self.args.mask_prob,
            leave_unmasked_prob=self.args.leave_unmasked_prob,
            random_token_prob=self.args.random_token_prob,
            freq_weighted_replacement=self.args.freq_weighted_replacement,
            mask_whole_words=mask_whole_words,
        )

        if self.args.separator_token is not None:
            input2 = PrependTokenDataset(ref_dataset, self.args.separator_token)
        parallel_src_tokens = ConcatSentencesDataset(input0, input2)
        if self.args.truncate_sequence:
            parallel_src_tokens = TruncateDataset(parallel_src_tokens, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if self.args.input2 is not None and self.args.add_tran_loss:
        dataset['net_input']['parallel_src_tokens'] = RightPadDataset(
            parallel_src_tokens,
            pad_idx=self.source_dictionary.pad(),
        )

    if self.args.add_prev_output_tokens:
        prev_tokens_dataset = RightPadDataset(
            RollDataset(src_tokens, 1),
            pad_idx=self.dictionary.pad(),
        )
        dataset['net_input'].update(
            prev_output_tokens=prev_tokens_dataset,
        )

    if not self.args.regression_target:
        label_dataset = make_dataset('label', self.label_dictionary)
        if label_dataset is not None:
            dataset.update(target=OffsetTokensDataset(
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=self.label_dictionary.eos(),
                ),
                offset=-self.label_dictionary.nspecial,
            ))
        if self.args.input2 is not None and self.args.add_tran_loss:
            # used as translation target when calculating loss
            dataset.update(parallel_target=RightPadDataset(
                ref_target_dataset,
                pad_idx=self.source_dictionary.pad(),
            ))
    else:
        label_path = "{0}.label".format(get_path('label', split))
        if os.path.exists(label_path):

            def parse_regression_target(i, line):
                values = line.split()
                assert len(values) == self.args.num_classes, \
                    f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"'
                return [float(x) for x in values]

            dataset.update(target=RawLabelDataset([
                parse_regression_target(i, line.strip())
                for i, line in enumerate(open(label_path).readlines())
            ]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
        all_sizes=src_tokens.all_sizes if self.args.add_target_num_tokens else None,
        padding_idx=self.source_dictionary.pad(),
        add_ref_prob=self.add_ref_prob if split != 'valid' else 0.,
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]