def load_indexed_dataset(path, dictionary=None, dataset_impl=None, combine=False, default="cached"): """A helper function for loading indexed datasets. Args: path (str): path to indexed dataset (e.g., 'data-bin/train') dictionary (~fairseq.data.Dictionary): data dictionary dataset_impl (str, optional): which dataset implementation to use. If not provided, it will be inferred automatically. For legacy indexed data we use the 'cached' implementation by default. combine (bool, optional): automatically load and combine multiple datasets. For example, if *path* is 'data-bin/train', then we will combine 'data-bin/train', 'data-bin/train1', ... and return a single ConcatDataset instance. """ import fairseq.data.indexed_dataset as indexed_dataset from fairseq.data.concat_dataset import ConcatDataset datasets = [] for k in itertools.count(): path_k = path + (str(k) if k > 0 else "") try: path_k = indexed_dataset.get_indexed_dataset_to_local(path_k) except Exception as e: if "StorageException: [404] Path not found" in str(e): logger.warning(f"path_k: {e} not found") else: raise e dataset_impl_k = dataset_impl if dataset_impl_k is None: dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k) dataset = indexed_dataset.make_dataset( path_k, impl=dataset_impl_k or default, fix_lua_indexing=True, dictionary=dictionary, ) if dataset is None: break logger.info("loaded {:,} examples from: {}".format( len(dataset), path_k)) datasets.append(dataset) if not combine: break if len(datasets) == 0: return None elif len(datasets) == 1: return datasets[0] else: return ConcatDataset(datasets)
def load_indexed_dataset(path, dictionary, dataset_impl=None, combine=False, default='cached'): """A helper function for loading indexed datasets. Args: path (str): path to indexed dataset (e.g., 'data-bin/train') dictionary (~fairseq.data.Dictionary): data dictionary dataset_impl (str, optional): which dataset implementation to use. If not provided, it will be inferred automatically. For legacy indexed data we use the 'cached' implementation by default. combine (bool, optional): automatically load and combine multiple datasets. For example, if *path* is 'data-bin/train', then we will combine 'data-bin/train', 'data-bin/train1', ... and return a single ConcatDataset instance. """ from fairseq.data.concat_dataset import ConcatDataset import fairseq.data.indexed_dataset as indexed_dataset datasets = [] for k in itertools.count(): ## 从0开始,无限加1遍历,用于存在多个训练集的情况 path_k = path + (str(k) if k > 0 else '') ##k=0时,名字不加入id,用于存在多个训练集的情况 dataset_impl_k = dataset_impl if dataset_impl_k is None: dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k) #dataset_impl_k==lazy -->IndexedDataset类, #调用make_dataset函数构建IndexedDataset对象,并读入数据索引相关信息,即读入.idx后缀的文件。 #之后可以如同list按id索引数据,每次索引都是直接从二进制文件读取 dataset = indexed_dataset.make_dataset( path_k, impl=dataset_impl_k or default, fix_lua_indexing=True, dictionary=dictionary, ) if dataset is None: break print('| loaded {} examples from: {}'.format(len(dataset), path_k)) datasets.append(dataset) if not combine: break if len(datasets) == 0: return None elif len(datasets) == 1: return datasets[0] else: return ConcatDataset(datasets)
def load_indexed_dataset(path, dictionary, dataset_impl=None, combine=False, default='cached', path_xml=None): """A helper function for loading indexed datasets. Args: path (str): path to indexed dataset (e.g., 'data-bin/train') dictionary (~fairseq.data.Dictionary): data dictionary dataset_impl (str, optional): which dataset implementation to use. If not provided, it will be inferred automatically. For legacy indexed data we use the 'cached' implementation by default. combine (bool, optional): automatically load and combine multiple datasets. For example, if *path* is 'data-bin/train', then we will combine 'data-bin/train', 'data-bin/train1', ... and return a single ConcatDataset instance. """ from fairseq.data.concat_dataset import ConcatDataset import fairseq.data.indexed_dataset as indexed_dataset datasets = [] for k in itertools.count(): path_k = path + (str(k) if k > 0 else '') dataset_impl_k = dataset_impl if dataset_impl_k is None: dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k) dataset = indexed_dataset.make_dataset( path_k, impl=dataset_impl_k or default, fix_lua_indexing=True, dictionary=dictionary, path_xml=path_xml, ) if dataset is None: break print('| loaded {} examples from: {}'.format(len(dataset), path_k)) datasets.append(dataset) if not combine: break if len(datasets) == 0: return None elif len(datasets) == 1: return datasets[0] else: return ConcatDataset(datasets)