def indexed_dataset(path, dictionary):
    """Open the dataset at ``path``, or return ``None`` when it is absent.

    In raw-text mode (``self.args.raw_text``) the data is wrapped in an
    ``IndexedRawTextDataset`` built with ``dictionary``; otherwise an
    ``IndexedDataset`` is returned.
    """
    use_raw = self.args.raw_text
    if use_raw and IndexedRawTextDataset.exists(path):
        return IndexedRawTextDataset(path, dictionary)
    # The existence probe goes through IndexedInMemoryDataset although the
    # object handed back is a plain IndexedDataset.
    if not use_raw and IndexedInMemoryDataset.exists(path):
        return IndexedDataset(path, fix_lua_indexing=False)
    return None
def load_dataset(self, split, combine=False):
    """Load a dataset split and store it in ``self.datasets[split]``.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        combine (bool): when true, keep loading the numbered shards
            ``split1``, ``split2``, ... until one is missing; otherwise
            only the first shard is loaded

    Raises:
        FileNotFoundError: if the first shard of the split does not exist
    """
    shards = []
    for k in itertools.count():
        shard_name = split if k == 0 else split + str(k)
        path = os.path.join(self.args.data, shard_name)

        raw_text = self.args.raw_text
        if raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
            # flatten one level of nesting into a single token sequence
            tokens = [tok for seq in ds.tokens_list for tok in seq]
        elif not raw_text and IndexedInMemoryDataset.exists(path):
            ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
            tokens = ds.buffer
        elif k > 0:
            # ran out of numbered shards
            break
        else:
            raise FileNotFoundError(
                'Dataset not found: {} ({})'.format(split, self.args.data))

        # block construction runs under a per-shard numpy seed
        with data_utils.numpy_seed(self.seed + k):
            block_ds = BlockDataset(
                tokens,
                ds.sizes,
                self.args.tokens_per_sample,
            )
            shards.append(block_ds)

        print('| {} {} {} examples'.format(self.args.data, shard_name,
                                           len(block_ds)))

        if not combine:
            break

    if len(shards) != 1:
        dataset = ConcatDataset(shards)
        sizes = np.concatenate([shard.sizes for shard in shards])
    else:
        dataset = shards[0]
        sizes = dataset.sizes

    # only the validation split uses a fixed seed
    fix_seed = split == 'valid'

    self.datasets[split] = BertDataset(
        dataset, sizes, self.dictionary, self.args.shuffle_instance,
        self.seed, fix_seed, self.args.token_mask_ratio,
        self.args.token_noise_prob, self.args.token_clean_prob,
        self.args.sent_pos_mask_ratio, self.args.sent_pos_noise_prob,
        self.args.sent_pos_clean_prob)
def load_dataset(self, split, combine=False, **kwargs):
    """Load a dataset split and store it in ``self.datasets[split]``.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        combine (bool): when true, keep loading the numbered shards
            ``split1``, ``split2``, ... until one is missing
        **kwargs: accepted but not used by this method

    Raises:
        FileNotFoundError: if the first shard of the split does not exist
    """
    shards = []
    for k in itertools.count():
        shard_name = split if k == 0 else split + str(k)
        path = os.path.join(self.args.data, shard_name)

        raw_text = self.args.raw_text
        if raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
        elif not raw_text and IndexedDataset.exists(path):
            ds = IndexedDataset(path, fix_lua_indexing=True)
        elif k > 0:
            break
        else:
            raise FileNotFoundError(
                'Dataset not found: {} ({})'.format(split, self.args.data))

        block_ds = TokenBlockDataset(
            ds,
            self.args.tokens_per_sample,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            include_targets=True,
        )
        shards.append(block_ds)

        print('| {} {} {} examples'.format(self.args.data, shard_name,
                                           len(block_ds)))

        if not combine:
            break

    if len(shards) != 1:
        dataset = ConcatDataset(shards)
        sizes = np.concatenate([shard.sizes for shard in shards])
    else:
        dataset = shards[0]
        sizes = dataset.sizes

    break_mode = self.args.sample_break_mode
    add_eos_for_other_targets = break_mode is not None and break_mode != 'none'

    self.datasets[split] = MonolingualDataset(
        dataset,
        sizes,
        self.dictionary,
        self.output_dictionary,
        add_eos_for_other_targets=add_eos_for_other_targets,
        shuffle=True,
        targets=self.targets,
    )
def load_dataset(self, split, combine=False):
    """Load a labelled split and store it in ``self.datasets[split]``.

    Each shard's labels are read from ``<shard path>.lbl``, one integer per
    line.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        combine (bool): when true, keep loading the numbered shards
            ``split1``, ``split2``, ... until one is missing

    Raises:
        FileNotFoundError: if the first shard of the split does not exist
    """
    shards = []
    labels = []
    for k in itertools.count():
        shard_name = split if k == 0 else split + str(k)
        path = os.path.join(self.args.data, shard_name)

        raw_text = self.args.raw_text
        if raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
        elif not raw_text and IndexedInMemoryDataset.exists(path):
            ds = IndexedDataset(path, fix_lua_indexing=False)
        elif k > 0:
            break
        else:
            raise FileNotFoundError(
                'Dataset not found: {} ({})'.format(split, self.args.data))

        block_ds = TokenBlockDataset(
            ds,
            0,
            pad=self.dictionary.pad(),
            break_mode='eos',
            include_targets=False,
        )
        shards.append(block_ds)

        with open(path + '.lbl', 'r') as lbl_f:
            labels.extend(int(line) for line in lbl_f)

        print('| {} {} {} examples'.format(self.args.data, shard_name,
                                           len(block_ds)))

        if not combine:
            break

    if len(shards) != 1:
        dataset = ConcatDataset(shards)
        sizes = np.concatenate([shard.sizes for shard in shards])
    else:
        dataset = shards[0]
        sizes = dataset.sizes

    self.datasets[split] = SentenceClassificationDataset(
        dataset,
        labels,
        sizes,
        self.dictionary,
    )
def load_dataset(self, split, combine=False):
    """Load a dataset split and store it in ``self.datasets[split]``.

    Args:
        split (str): name of the split
        combine (bool): when true, keep loading the numbered shards
            ``split1``, ``split2``, ... until one is missing

    Raises:
        FileNotFoundError: if the first shard of the split does not exist
    """
    shards = []
    for k in itertools.count():
        shard_name = split if k == 0 else split + str(k)
        path = os.path.join(self.args.data, shard_name)

        raw_text = self.args.raw_text
        if raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
            # flatten one level of nesting into a single token sequence
            tokens = [tok for seq in ds.tokens_list for tok in seq]
        elif not raw_text and IndexedInMemoryDataset.exists(path):
            ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
            tokens = ds.buffer
        elif k > 0:
            break
        else:
            raise FileNotFoundError(
                'Dataset not found: {} ({})'.format(split, self.args.data))

        # The book-title index is only passed on when that break mode is
        # selected and the marker does not resolve to the unknown symbol.
        cbt_booktitle_idx = None
        if self.args.sample_break_mode == 'cbt_booktitle':
            title_idx = self.dictionary.index('_BOOK_TITLE_')
            if title_idx != self.dictionary.unk():
                cbt_booktitle_idx = title_idx

        block_ds = TokenBlockDataset(
            tokens,
            ds.sizes,
            self.args.tokens_per_sample,
            self.args.sample_break_mode,
            include_targets=True,
            cbt_booktitle_idx=cbt_booktitle_idx,
        )
        shards.append(block_ds)

        print('| {} {} {} examples'.format(self.args.data, shard_name,
                                           len(block_ds)))

        if not combine:
            break

    if len(shards) != 1:
        dataset = ConcatDataset(shards)
        sizes = np.concatenate([shard.sizes for shard in shards])
    else:
        dataset = shards[0]
        sizes = dataset.sizes

    self.datasets[split] = MonolingualDataset(dataset, sizes,
                                              self.dictionary,
                                              shuffle=False)
def load_dataset(self, split, combine=False):
    """Load one split for every language and store the sampled mixture.

    One ``MaskedLMDataset`` is built per key of ``self.langs2id`` from the
    file ``<split>.<lang>`` under ``self.args.data``; the results are wrapped
    in a ``MultiCorpusSampledDataset`` stored in ``self.datasets[split]``.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        combine (bool): accepted but not used by this method

    Raises:
        FileNotFoundError: if the data for any language is missing
    """
    per_language = OrderedDict()

    for lang in self.langs2id.keys():
        # the first language seen becomes the default key
        if self.default_key is None:
            self.default_key = lang

        # Datasets are expected in "split.lang" format (e.g. train.en)
        language_split = '{}.{}'.format(split, lang)
        path = os.path.join(self.args.data, language_split)

        raw_text = self.args.raw_text
        if raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
        elif not raw_text and IndexedDataset.exists(path):
            loader = IndexedDataset if self.args.lazy_load else IndexedCachedDataset
            ds = loader(path, fix_lua_indexing=True)
        else:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(
                language_split, self.args.data))

        # Each block is later extended with the classification token, so
        # blocks are built one token shorter than tokens_per_sample.
        block_dataset = TokenBlockDataset(
            dataset=ds,
            sizes=ds.sizes,
            block_size=self.args.tokens_per_sample - 1,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos())

        per_language[lang] = MaskedLMDataset(
            dataset=block_dataset,
            sizes=block_dataset.sizes,
            vocab=self.dictionary,
            pad_idx=self.dictionary.pad(),
            mask_idx=self.dictionary.mask(),
            classif_token_idx=self.dictionary.eos(),
            sep_token_idx=self.dictionary.eos(),
            shuffle=getattr(self.args, 'shuffle', False),
            has_pairs=False,
            segment_id=self.langs2id[lang],
            seed=self.seed,
        )

    combined = MultiCorpusSampledDataset(
        per_language, default_key=self.default_key)
    self.datasets[split] = combined

    print('| {} {} {} examples'.format(self.args.data, split,
                                       len(self.datasets[split])))
def split_exists(split, src, tgt, lang):
    """Return True when ``<split>.<src>-<tgt>.<lang>`` exists under ``self.args.data``."""
    basename = '{}.{}-{}.{}'.format(split, src, tgt, lang)
    filename = os.path.join(self.args.data, basename)
    # raw-text mode probes the raw-text format, otherwise the in-memory one
    if self.args.raw_text:
        probe = IndexedRawTextDataset
    else:
        probe = IndexedInMemoryDataset
    return bool(probe.exists(filename))
def split_exists(split, data_path):
    """Return True when the dataset ``split`` exists inside ``data_path``."""
    filename = os.path.join(data_path, split)
    # raw-text mode probes the raw-text format, otherwise the in-memory one
    if self.args.raw_text:
        probe = IndexedRawTextDataset
    else:
        probe = IndexedInMemoryDataset
    return bool(probe.exists(filename))
def split_exists(split, src, tgt, lang, data_path):
    """Return True when ``<split>.<src>-<tgt>.<lang>`` exists inside ``data_path``.

    The format probed depends on ``self.args.dataset_impl``: ``'raw'`` checks
    for a raw-text dataset, anything else for an ``IndexedDataset``.
    """
    basename = '{}.{}-{}.{}'.format(split, src, tgt, lang)
    filename = os.path.join(data_path, basename)
    is_raw = self.args.dataset_impl == 'raw'
    probe = IndexedRawTextDataset if is_raw else IndexedDataset
    return bool(probe.exists(filename))
def indexed_dataset(path, dictionary):
    """Open the dataset at ``path``; return ``None`` if no indexed data exists.

    In raw-text mode a tokenizer is built from ``self.args`` and passed to
    ``IndexedRawTextDataset`` without any existence check.
    """
    if not self.args.raw_text:
        if not IndexedInMemoryDataset.exists(path):
            return None
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)

    tokenizer_tool = tokenizer.build_tokenizer(self.args)
    return IndexedRawTextDataset(tokenizer_tool, path, dictionary)
def load_sentence(self, split, sentence):
    """Build a dataset from one sentence and store it in ``self.datasets[split]``.

    Args:
        split (str): key under which the resulting dataset is stored
        sentence (str): text to load; it is split on single spaces and the
            resulting word list is handed to ``IndexedRawTextDataset``
    """
    words = sentence.split(' ')
    ds = IndexedRawTextDataset(words, self.dictionary)

    # Exactly one dataset is ever built here, so no concatenation step is
    # needed (the previous single-element list made that branch unreachable).
    dataset = TokenBlockDataset(
        ds,
        ds.sizes,
        self.args.tokens_per_sample,
        pad=self.dictionary.pad(),
        eos=self.dictionary.eos(),
        break_mode=self.args.sample_break_mode,
        include_targets=True,
    )
    sizes = dataset.sizes

    break_mode = self.args.sample_break_mode
    add_eos_for_other_targets = break_mode is not None and break_mode != 'none'

    self.datasets[split] = MonolingualDataset(
        dataset,
        sizes,
        self.dictionary,
        self.output_dictionary,
        add_eos_for_other_targets=add_eos_for_other_targets,
        shuffle=True,
        targets=self.targets,
    )
def split_exists(split, src, tgt, lang):
    """Return True when ``<split>.<src>-<tgt>.<lang>`` exists under ``self.args.data``."""
    basename = '{}.{}-{}.{}'.format(split, src, tgt, lang)
    filename = os.path.join(self.args.data, basename)
    # raw-text mode probes the raw-text format, otherwise the indexed one
    probe = IndexedRawTextDataset if self.args.raw_text else IndexedDataset
    return bool(probe.exists(filename))
def split_exists(split, data_type, data_path):
    """Return True when ``<split>.<data_type>`` exists inside ``data_path``."""
    filename = os.path.join(data_path, f'{split}.{data_type}')
    # raw-text mode probes the raw-text format, otherwise the indexed one
    if self.args.raw_text:
        found = IndexedRawTextDataset.exists(filename)
    else:
        found = IndexedDataset.exists(filename)
    return bool(found)
def indexed_dataset(path, dictionary):
    """Open the dataset at ``path``; return ``None`` if no indexed data exists.

    Raw-text mode returns an ``IndexedRawTextDataset`` without checking that
    the file exists.
    """
    if not self.args.raw_text:
        if not IndexedInMemoryDatasetStruct.exists(path):
            return None
        return IndexedInMemoryDatasetStruct(path, fix_lua_indexing=True)
    return IndexedRawTextDataset(path, dictionary)
def indexed_dataset(path, dictionary):
    """Log the load, then open the dataset at ``path``.

    Returns ``None`` when raw-text mode is off and no in-memory dataset
    exists at ``path``. ``is_training`` is read from the enclosing scope.
    """
    message = "| ---- loading data from {}, is_training={}".format(
        path, is_training)
    print(message)

    if not self.args.raw_text:
        if not IndexedInMemoryDataset.exists(path):
            return None
        return IndexedInMemoryDataset(path)
    return IndexedRawTextDataset(path, dictionary)
def indexed_dataset(path, dictionary, debug=False):
    """Open the dataset at ``path``, forwarding ``debug`` to the loader.

    Returns ``None`` when raw-text mode is off and no in-memory dataset
    exists at ``path``.
    """
    if not self.args.raw_text:
        if not IndexedInMemoryDataset.exists(path):
            return None
        return IndexedInMemoryDataset(path, fix_lua_indexing=True,
                                      debug=debug)
    return IndexedRawTextDataset(path, dictionary, debug=debug)
def split_para_exists(split, key, lang):
    """Return True when ``<split>.<key>.<lang>`` exists under ``self.args.data``.

    The candidate filename and the raw-text flag are printed on every call.
    """
    basename = '{}.{}.{}'.format(split, key, lang)
    filename = os.path.join(self.args.data, basename)
    print(filename)
    print(self.args.raw_text)
    # raw-text mode probes the raw-text format, otherwise the indexed one
    probe = IndexedRawTextDataset if self.args.raw_text else IndexedDataset
    return bool(probe.exists(filename))
def load_dataset_ordering(self, input_ordered_file, input_shuffled_file):
    """Load ``input_shuffled_file`` as raw text into ``self.datasets['test']``.

    Args:
        input_ordered_file (str): accepted for interface compatibility; this
            method does not read it
        input_shuffled_file (str): path of the raw-text file to load

    Raises:
        AssertionError: if raw-text mode is disabled or the file is missing
    """
    assert self.args.raw_text and IndexedRawTextDataset.exists(
        input_shuffled_file), (
            'raw-text dataset required but not found: {}'.format(
                input_shuffled_file))

    ds = IndexedRawTextDataset(input_shuffled_file, self.dictionary)
    # flatten one level of nesting into a single token sequence
    tokens = [tok for seq in ds.tokens_list for tok in seq]

    # Only one file is ever loaded, so the dataset is used directly instead
    # of going through a single-element list.
    dataset = TokenBlockDataset(
        tokens,
        ds.sizes,
        self.args.tokens_per_sample,
        pad=self.dictionary.pad(),
        eos=self.dictionary.eos(),
        break_mode=self.args.sample_break_mode,
        include_targets=True,
    )
    sizes = dataset.sizes

    print('| {} {} examples'.format(input_shuffled_file, len(dataset)))

    break_mode = self.args.sample_break_mode
    add_eos_for_other_targets = break_mode is not None and break_mode != 'none'

    self.datasets['test'] = MonolingualDataset(
        dataset,
        sizes,
        self.dictionary,
        self.output_dictionary,
        add_eos_for_other_targets=add_eos_for_other_targets,
        shuffle=False,
        targets=self.targets,
    )
def load_dataset(self, split):
    """Load a dataset split and store it in ``self.datasets[split]``.

    Args:
        split (str): name of the split; resolved relative to
            ``self.args.data``

    Raises:
        FileNotFoundError: if no dataset in the selected format exists
    """
    path = os.path.join(self.args.data, split)

    if self.args.raw_text and IndexedRawTextDataset.exists(path):
        ds = IndexedRawTextDataset(path, self.dictionary)
        # TokenBlockDataset receives a flat token buffer in the in-memory
        # branch below, so flatten the nested token lists here to give both
        # branches the same shape.
        tokens = [tok for seq in ds.tokens_list for tok in seq]
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(path):
        ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
        tokens = ds.buffer
    else:
        raise FileNotFoundError(
            'Dataset not found: {} ({})'.format(split, self.args.data))

    dataset = TokenBlockDataset(
        tokens,
        ds.sizes,
        self.args.tokens_per_sample,
        self.args.sample_break_mode,
        include_targets=True,  # return next tokens as targets
    )
    self.datasets[split] = MonolingualDataset(
        dataset, dataset.sizes, self.dictionary, shuffle=False)
def _load_single_lang_dataset(self, split):
    """Load every shard of ``split`` and return ``(dataset, sizes)``.

    Shards are named ``split``, ``split1``, ``split2``, ...; loading stops
    at the first missing numbered shard.

    Raises:
        FileNotFoundError: if the first shard of the split does not exist
    """
    shards = []

    for k in itertools.count():
        shard_name = split if k == 0 else split + str(k)
        path = os.path.join(self.args.data, shard_name)

        raw_text = self.args.raw_text
        if raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
        elif not raw_text and IndexedDataset.exists(path):
            loader = IndexedDataset if self.args.lazy_load else IndexedCachedDataset
            ds = loader(path, fix_lua_indexing=True)
        elif k > 0:
            break
        else:
            raise FileNotFoundError(
                'Dataset not found: {} ({})'.format(split, self.args.data))

        # Each block is later extended with the classification token, so
        # blocks are built one token shorter than tokens_per_sample.
        block_ds = TokenBlockDataset(
            ds,
            ds.sizes,
            self.args.tokens_per_sample - 1,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
        )
        shards.append(block_ds)

        print('| {} {} {} examples'.format(self.args.data, shard_name,
                                           len(block_ds)))

    if len(shards) != 1:
        dataset = ConcatDataset(shards)
        sizes = np.concatenate([shard.sizes for shard in shards])
    else:
        dataset = shards[0]
        sizes = dataset.sizes

    return dataset, sizes
def indexed_dataset(path, dictionary):
    """Open the dataset at ``path``; return ``None`` if no indexed data exists.

    With ``self.args.lazy_load`` set an ``IndexedDataset`` is returned,
    otherwise an ``IndexedCachedDataset``.
    """
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    if not IndexedDataset.exists(path):
        return None
    loader = IndexedDataset if self.args.lazy_load else IndexedCachedDataset
    return loader(path, fix_lua_indexing=True)
def indexed_dataset(path, dictionary, copy_ext_dict=False, src_dataset=None):
    """Open the dataset at ``path``; return ``None`` if no indexed data exists.

    ``copy_ext_dict`` and ``src_dataset`` are only forwarded in raw-text
    mode; the indexed loaders do not receive them.
    """
    if self.args.raw_text:
        return IndexedRawTextDataset(
            path, dictionary,
            copy_ext_dict=copy_ext_dict, src_dataset=src_dataset)
    if not IndexedDataset.exists(path):
        return None
    loader = IndexedDataset if self.args.lazy_load else IndexedCachedDataset
    return loader(path, fix_lua_indexing=True)
def indexed_dataset(path, dictionary, ex_dict=None, is_tgt=False):
    """Open the dataset at ``path``; return ``None`` if no indexed data exists.

    When ``self.args.segment`` is set, the segmented raw-text loader is
    used regardless of ``self.args.raw_text``; ``ex_dict`` and ``is_tgt``
    are only forwarded on that path.
    """
    if self.args.segment:
        return IndexedRawTextSegDataset(path, dictionary, ex_dict, is_tgt)
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    if not IndexedDataset.exists(path):
        return None
    return IndexedCachedDataset(path, fix_lua_indexing=True)
def split_exists(split, src, tgt, lang, data_path):
    """Return True when the dataset file for the given split exists in ``data_path``.

    With a source language the file is ``<split>.<src>-<tgt>.<lang>``;
    without one the name is built from ``split``, ``src`` and ``tgt`` only.
    """
    if src is None:
        # NOTE(review): `lang` is not used on this branch and `src` (None)
        # is formatted into the name -- confirm this is intended.
        basename = '{}.{}-None.{}'.format(split, src, tgt)
    else:
        basename = '{}.{}-{}.{}'.format(split, src, tgt, lang)
    filename = os.path.join(data_path, basename)

    # raw-text mode probes the raw-text format, otherwise the indexed one
    probe = IndexedRawTextDataset if self.args.raw_text else IndexedDataset
    return bool(probe.exists(filename))
def indexed_dataset(path, dictionary, cached=True, audio=False):
    """Open the dataset at ``path``; return ``None`` if no indexed data exists.

    ``cached`` selects ``IndexedCachedDataset`` over ``IndexedDataset``;
    ``audio`` is forwarded to whichever indexed loader is chosen and is not
    used in raw-text mode.
    """
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    if not IndexedDataset.exists(path):
        return None
    loader = IndexedCachedDataset if cached else IndexedDataset
    return loader(path, fix_lua_indexing=True, audio=audio)
def load_dataset(self, split, combine=False):
    """Load a sentence-pair split and store it in ``self.datasets[split]``.

    Each shard consists of ``<shard>_s1`` and ``<shard>_s2`` datasets plus a
    ``<shard>.lbl`` file with one label per line. Labels are parsed as int
    when ``self.num_labels > 1`` and as float otherwise; with exactly two
    labels, every value other than 1 is mapped to 0.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        combine (bool): when true, keep loading the numbered shards
            ``split1``, ``split2``, ... until one is missing

    Raises:
        FileNotFoundError: if either side of the first shard does not exist
    """
    loaded_datasets = [[], []]
    loaded_labels = []

    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')
        base_path = os.path.join(self.args.data, split_k)

        # Resolve both sides of the pair before recording either one, so a
        # shard with only one side present can never leave the two lists
        # with different lengths.
        pair = []
        for path in (base_path + '_s1', base_path + '_s2'):
            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedDataset(path, fix_lua_indexing=False)
            else:
                pair = None
                break
            pair.append(
                TokenBlockDataset(
                    ds,
                    0,
                    pad=self.dictionary.pad(),
                    break_mode='eos',
                    include_targets=False,
                ))

        if pair is None:
            if k > 0:
                # ran out of numbered shards
                break
            raise FileNotFoundError(
                'Dataset not found: {} ({})'.format(split, self.args.data))

        for datasets, block_ds in zip(loaded_datasets, pair):
            datasets.append(block_ds)

        with open(base_path + '.lbl', 'r') as lbl_f:
            cast = int if self.num_labels > 1 else float
            loaded_labels.extend(cast(line.rstrip()) for line in lbl_f)

        print('| {} {} {} examples'.format(self.args.data, split_k,
                                           len(loaded_datasets[0][-1])))

        if not combine:
            break

    if self.num_labels == 2:
        # binary task: anything that is not label 1 becomes 0
        loaded_labels = [l if l == 1 else 0 for l in loaded_labels]

    if len(loaded_datasets[0]) == 1:
        dataset1 = loaded_datasets[0][0]
        dataset2 = loaded_datasets[1][0]
        sizes1 = dataset1.sizes
        sizes2 = dataset2.sizes
    else:
        dataset1 = ConcatDataset(loaded_datasets[0])
        dataset2 = ConcatDataset(loaded_datasets[1])
        sizes1 = np.concatenate([ds.sizes for ds in loaded_datasets[0]])
        sizes2 = np.concatenate([ds.sizes for ds in loaded_datasets[1]])

    self.datasets[split] = SentencePairClassificationDataset(
        dataset1, dataset2, loaded_labels, sizes1, sizes2, self.dictionary)
def load_dataset(self, split, combine=False):
    """Load a dataset split and store it in ``self.datasets[split]``.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        combine (bool): when true, keep loading the numbered shards
            ``split1``, ``split2``, ... until one is missing

    Raises:
        FileNotFoundError: if the first shard of the split does not exist
    """
    shards = []

    for k in itertools.count():
        shard_name = split if k == 0 else split + str(k)
        path = os.path.join(self.args.data, shard_name)

        raw_text = self.args.raw_text
        if raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
        elif not raw_text and IndexedDataset.exists(path):
            loader = IndexedDataset if self.args.lazy_load else IndexedCachedDataset
            ds = loader(path, fix_lua_indexing=True)
        elif k > 0:
            break
        else:
            raise FileNotFoundError(
                'Dataset not found: {} ({})'.format(split, self.args.data))

        # block-pair construction runs under a per-shard numpy seed
        with data_utils.numpy_seed(self.seed + k):
            pair_ds = BlockPairDataset(
                ds,
                self.dictionary,
                ds.sizes,
                self.args.tokens_per_sample,
                break_mode=self.args.break_mode,
            )
            shards.append(pair_ds)

        logger.info('{} {} {} examples'.format(self.args.data, shard_name,
                                               len(pair_ds)))

        if not combine:
            break

    if len(shards) != 1:
        dataset = ConcatDataset(shards)
        sizes = np.concatenate([shard.sizes for shard in shards])
    else:
        dataset = shards[0]
        sizes = dataset.sizes

    self.datasets[split] = MaskedLMDataset(
        dataset=dataset,
        sizes=sizes,
        vocab=self.dictionary,
        pad_idx=self.dictionary.pad(),
        mask_idx=self.dictionary.mask(),
        classif_token_idx=self.dictionary.cls(),
        sep_token_idx=self.dictionary.sep(),
        shuffle=False,
        seed=self.seed,
    )
def load_dataset(self, split, combine=False):
    """Load a dataset split and store it in ``self.datasets[split]``.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        combine (bool): when true, keep loading the numbered shards
            ``split1``, ``split2``, ... until one is missing

    Raises:
        FileNotFoundError: if the first shard of the split does not exist
    """
    shards = []

    for k in itertools.count():
        shard_name = split if k == 0 else split + str(k)
        path = os.path.join(self.args.data, shard_name)

        raw_text = self.args.raw_text
        if raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
            # flatten one level of nesting into a single token sequence
            tokens = [tok for seq in ds.tokens_list for tok in seq]
        elif not raw_text and IndexedInMemoryDataset.exists(path):
            ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
            tokens = ds.buffer
        elif k > 0:
            break
        else:
            raise FileNotFoundError(
                'Dataset not found: {} ({})'.format(split, self.args.data))

        # block-pair construction runs under a per-shard numpy seed
        with data_utils.numpy_seed(self.seed + k):
            pair_ds = ModifiedBlockPairDataset(
                tokens,
                ds.sizes,
                self.args.tokens_per_sample,
                pad=self.dictionary.pad(),
                class_positive=self.dictionary.class_positive(),
                class_negative=self.dictionary.class_negative(),
                sep=self.dictionary.sep(),
                vocab=self.dictionary,
                break_mode=self.args.break_mode,
                short_seq_prob=self.args.short_seq_prob,
            )
            shards.append(pair_ds)

        print('| {} {} {} examples'.format(self.args.data, shard_name,
                                           len(pair_ds)))

        if not combine:
            break

    if len(shards) != 1:
        dataset = ConcatDataset(shards)
        sizes = np.concatenate([shard.sizes for shard in shards])
    else:
        dataset = shards[0]
        sizes = dataset.sizes

    self.datasets[split] = ModifiedBertDataset(
        dataset,
        sizes,
        self.dictionary,
        shuffle=self.args.shuffle_instance,
        seed=self.seed,
        mask_ratio=self.args.mask_ratio,
        lower=self.args.span_lower,
        upper=self.args.span_upper,
        geometric_p=self.args.geometric_p)
def load_dataset(self, split, combine=False):
    """Load a dataset split and store it in ``self.datasets[split]``.

    When ``self.args.tag_bitmap_file_prefix`` is set, a bitmap is read from
    ``<prefix><split>`` and passed to the block dataset as ``tag_map``.
    ``self.no_nsp`` selects ``BlockDataset``/``NoNSPSpanBertDataset`` over
    ``BlockPairDataset``/``SpanBertDataset``.

    Args:
        split (str): name of the split (e.g., train, valid, test)
        combine (bool): when true, keep loading the numbered shards
            ``split1``, ``split2``, ... until one is missing

    Raises:
        FileNotFoundError: if the first shard of the split does not exist
    """
    loaded_datasets = []

    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')
        path = os.path.join(self.args.data, split_k)

        if self.args.raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
            # flatten one level of nesting into a single token sequence
            tokens = [t for l in ds.tokens_list for t in l]
        elif not self.args.raw_text and IndexedInMemoryDataset.exists(path):
            ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
            tokens = ds.buffer
        elif k > 0:
            break
        else:
            raise FileNotFoundError(
                'Dataset not found: {} ({})'.format(split, self.args.data))

        tag_map = None
        if self.args.tag_bitmap_file_prefix is not None:
            print("self.args.tag_bitmap_file_prefix is not None")
            tag_map = bitarray()
            # NOTE(review): the bitmap path uses `split`, not `split_k`, so
            # every shard reads the same file -- confirm this is intended.
            # The context manager closes the handle, which was previously
            # left open.
            with open(self.args.tag_bitmap_file_prefix + split,
                      'rb') as tag_file:
                tag_map.fromfile(tag_file)

        block_cls = BlockPairDataset if not self.no_nsp else BlockDataset
        # block construction runs under a per-shard numpy seed
        with data_utils.numpy_seed(self.seed + k):
            loaded_datasets.append(
                block_cls(tokens,
                          ds.sizes,
                          self.args.tokens_per_sample,
                          pad=self.dictionary.pad(),
                          cls=self.dictionary.cls(),
                          mask=self.dictionary.mask(),
                          sep=self.dictionary.sep(),
                          break_mode=self.args.break_mode,
                          short_seq_prob=self.short_seq_prob,
                          tag_map=tag_map))

        print('| {} {} {} examples'.format(self.args.data, split_k,
                                           len(loaded_datasets[-1])))

        if not combine:
            break

    if len(loaded_datasets) == 1:
        dataset = loaded_datasets[0]
        sizes = dataset.sizes
    else:
        dataset = ConcatDataset(loaded_datasets)
        sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

    dataset_cls = SpanBertDataset if not self.no_nsp else NoNSPSpanBertDataset
    self.datasets[split] = dataset_cls(dataset,
                                       sizes,
                                       self.dictionary,
                                       shuffle=self.args.shuffle_instance,
                                       seed=self.seed,
                                       args=self.args)
def indexed_dataset(path, dictionary, src_tokens=None):
    """Open the dataset at ``path``; return ``None`` if no indexed data exists.

    ``src_tokens`` and ``self.args.reverse_order`` are only forwarded in
    raw-text mode.
    """
    if not self.args.raw_text:
        if not IndexedInMemoryDataset.exists(path):
            return None
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return IndexedRawTextDataset(
        path, dictionary,
        src_tokens=src_tokens, reverse_order=self.args.reverse_order)
def indexed_dataset(path, dictionary):
    """Open the dataset at ``path``; return ``None`` if no indexed data exists.

    Raw-text mode returns an ``IndexedRawTextDataset`` without checking that
    the file exists.
    """
    if not self.args.raw_text:
        if not IndexedInMemoryDataset.exists(path):
            return None
        return IndexedInMemoryDataset(path)
    return IndexedRawTextDataset(path, dictionary)