def main(_):
  # Legacy datasets
  urls = set(tfds.core.download.checksums.get_all_url_infos().keys())

  # Dataset-as-folder datasets
  # Could keep track of the dataset name, so the report clearly indicates
  # which dataset should be updated.
  url_infos = {
      name: tfds.builder_cls(name).url_infos
      for name in tfds.list_builders(with_community_datasets=False)
  }
  for url_info in url_infos.values():
    if url_info:
      urls |= url_info.keys()

  urls = sorted(urls)
  with futures.ThreadPoolExecutor(max_workers=100) as executor:
    all_codes = executor.map(_get_status_code, urls)

  print('\n************ Summary ************\n')
  total_errors = 0
  for url, code in zip(urls, all_codes):
    if code == requests.codes.ok:
      continue
    total_errors += 1
    print(f'{url} - status code: {code}')
  print(f'{total_errors} URLs had issues')
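
# `_get_status_code` is referenced above but not defined in this snippet.
# A minimal sketch of such a helper (an assumption, not necessarily the
# actual implementation): issue a HEAD request and return a sentinel code
# on connection failure, so it never compares equal to `requests.codes.ok`.
def _get_status_code(url: str) -> int:
  """Returns the HTTP status code of `url`, or -1 on connection failure."""
  try:
    return requests.head(url, timeout=10, allow_redirects=True).status_code
  except requests.exceptions.RequestException:
    return -1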
def _get_builder_cls(
    ds_to_build: str,
) -> Tuple[Type[tfds.core.DatasetBuilder], Dict[str, str]]:
  """Infer the builder class to build.

  Args:
    ds_to_build: Dataset argument.

  Returns:
    builder_cls: The dataset class to download and prepare.
    kwargs: Additional builder kwargs (e.g. config, version) parsed from
      the dataset name.
  """
  # 1st case: Requested dataset is a path to a `.py` script
  path = _search_script_path(ds_to_build)
  if path is not None:
    logging.info(f'Loading dataset {ds_to_build} from path: {path}')
    # Dynamically load the user dataset script
    with tfds.core.utils.add_sys_path(path.parent):
      builder_cls = tfds.core.community.builder_cls_from_module(path.stem)
    return builder_cls, {}

  # 2nd case: Dataset is registered through imports.
  # Extract `name/config:version`
  name, builder_kwargs = tfds.core.naming.parse_builder_name_kwargs(
      ds_to_build)
  builder_cls = tfds.builder_cls(str(name))
  logging.info(
      f'Loading dataset {ds_to_build} from imports: {builder_cls.__module__}'
  )
  builder_kwargs = typing.cast(Dict[str, str], builder_kwargs)
  return builder_cls, builder_kwargs
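
# Hypothetical usage, for illustration only (`my_dataset/my_config:1.0.0`
# is a made-up name):
#   builder_cls, builder_kwargs = _get_builder_cls('my_dataset/my_config:1.0.0')
#   builder = builder_cls(**builder_kwargs)  # kwargs carry config/version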
def _load_builder_from_code(
    name: str,
) -> BuilderToDocument:
  """Load the builder, config,... to document."""
  builder_cls = tfds.builder_cls(name)
  section = _get_section(builder_cls)

  if builder_cls.BUILDER_CONFIGS:  # Builder with configs

    def get_config_builder(config) -> tfds.core.DatasetBuilder:
      return tfds.builder(builder_cls.name, config=config)

    with futures.ThreadPoolExecutor(
        max_workers=_WORKER_COUNT_CONFIGS) as tpool:
      config_builders = list(
          tpool.map(get_config_builder, builder_cls.BUILDER_CONFIGS),
      )
    return BuilderToDocument(
        section=section,
        namespace=None,
        builder=config_builders[0],
        config_builders=config_builders,
    )
  else:  # Builder without configs
    return BuilderToDocument(
        section=section,
        namespace=None,
        builder=builder_cls(),  # pytype: disable=not-instantiable
        config_builders=[],
    )
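
# `_get_section` is used above but defined elsewhere. A plausible sketch
# (an assumption about its behavior): derive the catalog section from the
# builder's module path, e.g. `tensorflow_datasets.image.mnist` -> 'image'.
def _get_section(builder_cls: Type[tfds.core.DatasetBuilder]) -> str:
  """Returns the catalog section a builder belongs to."""
  return builder_cls.__module__.split('.')[-2]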
def _document_single_builder_inner(
    name: str,
    visu_doc_util: doc_utils.VisualizationDocUtil,
    df_doc_util: doc_utils.DataframeDocUtil,
    nightly_doc_util: doc_utils.NightlyDocUtil,
) -> Optional[BuilderDocumentation]:
  """Generates the documentation for a single builder, with or without configs."""
  builder_cls = tfds.builder_cls(name)
  section = _get_section(builder_cls)
  tqdm.tqdm.write(f'Document builder {name}...')
  builder, config_builders = _load_builder(builder_cls)
  out_str = dataset_markdown_builder.get_markdown_string(
      builder=builder,
      config_builders=config_builders,
      visu_doc_util=visu_doc_util,
      df_doc_util=df_doc_util,
      nightly_doc_util=nightly_doc_util,
  )
  is_nightly = bool(
      nightly_doc_util and nightly_doc_util.is_builder_nightly(name)
  )
  return BuilderDocumentation(
      name=name,
      content=out_str,
      section=section,
      is_manual=bool(builder_cls.MANUAL_DOWNLOAD_INSTRUCTIONS),
      is_nightly=is_nightly,
  )
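
# `_load_builder` is not shown here. Mirroring the logic of
# `_load_builder_from_code` above, a minimal sketch (an assumption) would
# return the builder to document together with one builder per config:
def _load_builder(
    builder_cls: Type[tfds.core.DatasetBuilder],
) -> Tuple[tfds.core.DatasetBuilder, List[tfds.core.DatasetBuilder]]:
  """Returns `(builder, config_builders)` for the given builder class."""
  if builder_cls.BUILDER_CONFIGS:
    config_builders = [
        tfds.builder(builder_cls.name, config=config)
        for config in builder_cls.BUILDER_CONFIGS
    ]
    return config_builders[0], config_builders
  return builder_cls(), []  # pytype: disable=not-instantiable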
def _make_builders(
    args: argparse.Namespace,
    ds_to_build: str,
) -> Iterator[tfds.core.DatasetBuilder]:
  """Yields builders to generate."""
  # TODO(tfds): Infer the dataset format.
  # And make sure --record_checksums works.
  # * From file (`.py`), dataset-as-folder (`my_dataset/`):
  # * Nothing (use current directory)
  # * From module `tensorflow_datasets.text.my_dataset`
  # * Community datasets: `namespace/my_dataset`

  # If no dataset selected, use current directory
  if not ds_to_build:
    raise NotImplementedError(
        'Calling without a dataset name is not supported yet.')

  # Extract `name/config:version`
  extract_name_and_kwargs = tfds.core.load.dataset_name_and_kwargs_from_name_str
  builder_name, builder_kwargs = extract_name_and_kwargs(ds_to_build)
  builder_cls = tfds.builder_cls(builder_name)

  # Optionally overwrite the version
  if args.experimental_latest_version:
    if 'version' in builder_kwargs:
      raise ValueError(
          'Can\'t have both `--experimental_latest` and version set (`:1.0.0`)'
      )
    builder_kwargs['version'] = 'experimental_latest'

  # Optionally overwrite the config
  builder_kwargs['config'] = _get_config_name(
      builder_cls=builder_cls,
      config_kwarg=builder_kwargs.get('config'),
      config_name=args.config,
      config_idx=args.config_idx,
  )

  make_builder = functools.partial(
      _make_builder,
      builder_cls,
      overwrite=args.overwrite,
      data_dir=args.data_dir,
      **builder_kwargs,
  )

  # Generate all configs if no config requested.
  if builder_cls.BUILDER_CONFIGS and builder_kwargs['config'] is None:
    for config in builder_cls.BUILDER_CONFIGS:
      yield make_builder(config=config.name)
  # Generate only the dataset
  else:
    yield make_builder()
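
# `_get_config_name` is defined elsewhere. A simplified sketch of its
# contract (an assumption about the real implementation): at most one of
# the three config sources may be set, and an index is resolved to a name.
def _get_config_name(
    builder_cls: Type[tfds.core.DatasetBuilder],
    config_kwarg: Optional[str],
    config_name: Optional[str],
    config_idx: Optional[int],
) -> Optional[str]:
  """Resolves the requested config name, or None to build all configs."""
  num_given = sum(
      c is not None for c in (config_kwarg, config_name, config_idx))
  if num_given > 1:
    raise ValueError('Config should only be defined once.')
  if config_idx is not None:
    return builder_cls.BUILDER_CONFIGS[config_idx].name
  return config_kwarg if config_kwarg is not None else config_name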
def _collect_path_to_url_infos(
) -> Dict[tfds.core.ReadWritePath, Dict[Url, checksums.UrlInfo]]:
  """Collects the mapping from checksums paths to their url_infos."""
  # Collect legacy checksums paths
  url_info_paths = list(checksums._checksum_paths().values())  # pylint: disable=protected-access
  # Collect dataset-as-folder checksums paths
  for name in tfds.list_builders():
    url_info_path = tfds.builder_cls(name)._checksums_path  # pylint: disable=protected-access
    if url_info_path.exists():
      url_info_paths.append(url_info_path)
  url_info_paths = [tfds.core.utils.to_write_path(p) for p in url_info_paths]
  return {
      path: typing.cast(
          Dict[Url, checksums.UrlInfo], checksums.load_url_infos(path))
      for path in url_info_paths
  }
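
# Illustrative usage: count the registered URLs per checksums file.
#   for path, url_infos in _collect_path_to_url_infos().items():
#     print(f'{path}: {len(url_infos)} URLs')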
def refactor_dataset(ds_name: str) -> None:
  """Refactor a single dataset."""
  code_info = BuilderCodeInfo.from_builder_cls(tfds.builder_cls(ds_name))
  print(f'Refactoring {code_info.name} in {code_info.dst}')

  # Clean up any previous refactoring first.
  if code_info.dst.exists():
    print(f'Cleanup existing {code_info.dst}')
    shutil.rmtree(code_info.dst)
  code_info.dst.mkdir()

  # Copy all files and folders
  _add_init_file(code_info)
  _mv_code(code_info)
  _mv_code_test(code_info)
  _mv_checksums(code_info)
  _mv_fake_data_dir(code_info)
  _mv_create_fake_data(code_info)
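
# `_add_init_file` is one of the helpers invoked above. A minimal sketch
# (an assumption about what it does): create an empty `__init__.py` so the
# destination folder becomes a Python package.
def _add_init_file(code_info: BuilderCodeInfo) -> None:
  """Creates the `__init__.py` of the dataset-as-folder package."""
  (code_info.dst / '__init__.py').touch()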