def metadata(self):
    paths = self.paths
    fs = self.fs
    if len(paths) > 1:
        # This is a list of files
        dataset = pq.ParquetDataset(paths, filesystem=fs, validate_schema=False)
        base, fns = _analyze_paths(paths, fs)
    elif fs.isdir(paths[0]):
        # This is a directory
        dataset = pq.ParquetDataset(paths[0], filesystem=fs, validate_schema=False)
        allpaths = fs.glob(paths[0] + fs.sep + "*")
        base, fns = _analyze_paths(allpaths, fs)
    else:
        # This is a single file
        dataset = pq.ParquetDataset(paths[0], filesystem=fs)
        base = paths[0]
        fns = [None]

    if dataset.metadata:
        # We have a metadata file
        return dataset.metadata, base
    else:
        # Collect proper metadata manually
        metadata = None
        for piece, fn in zip(dataset.pieces, fns):
            md = piece.get_metadata()
            if fn:
                md.set_file_path(fn)
            if metadata:
                metadata.append_row_groups(md)
            else:
                metadata = md
        return metadata, base
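# A minimal standalone sketch of the same aggregation pattern using plain
# pyarrow, for building a global "_metadata" footer outside of this engine.
# The file names below are illustrative assumptions, not part of this module:
#
#     import pyarrow.parquet as pq
#
#     metadata = None
#     for fn in ["part.0.parquet", "part.1.parquet"]:  # hypothetical files
#         md = pq.ParquetFile(fn).metadata
#         md.set_file_path(fn)
#         if metadata is None:
#             metadata = md
#         else:
#             metadata.append_row_groups(md)
#     with open("_metadata", "wb") as f:
#         metadata.write_metadata_file(f)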
def validate_dataset(
    self,
    add_metadata_file=False,
    require_metadata_file=True,
    row_group_max_size=None,
    file_min_size=None,
):
    """Validate ParquetDatasetEngine object for efficient processing.

    The purpose of this method is to validate that the raw dataset
    meets the minimal requirements for efficient NVTabular processing.
    Warnings are raised if any of the following conditions are not met:

        - The raw dataset directory should contain a global "_metadata"
          file. If this file is missing, ``add_metadata_file=True`` can
          be passed to generate a new one.
        - If there is no _metadata file, the parquet schema must be
          consistent for all row-groups/files in the raw dataset.
          Otherwise, a new _metadata file must be generated to avoid
          errors at IO time.
        - The row-groups should be no larger than the maximum size limit
          (``row_group_max_size``).
        - For multi-file datasets, the files should be no smaller than
          the minimum size limit (``file_min_size``).

    Parameters
    ----------
    add_metadata_file : bool, default False
        Whether to add a global _metadata file to the dataset if one
        is missing.
    require_metadata_file : bool, default True
        Whether to require the existence of a _metadata file to pass
        the dataset validation.
    row_group_max_size : int or str, default None
        Maximum size (in bytes) of each parquet row-group in the
        dataset. If None, the minimum of ``self.part_size`` and 500MB
        will be used.
    file_min_size : int or str, default None
        Minimum size (in bytes) of each parquet file in the dataset.
        This limit is only applied if there are >1 file in the
        dataset. If None, ``self.part_size`` will be used.

    Returns
    -------
    valid : bool
        Whether or not the input dataset is valid for efficient
        NVTabular processing.
    """
    meta_valid = True  # Parquet format and _metadata exists
    size_valid = False  # Row-group sizes are appropriate

    # Check for user-specified row-group size limit.
    # Otherwise we use the smaller of the dataset partition
    # size and 500MB.
    if row_group_max_size is None:
        row_group_max_size = min(self.part_size, 500_000_000)
    else:
        row_group_max_size = parse_bytes(row_group_max_size)

    # Check for user-specified file size limit.
    # Otherwise we use the dataset partition size.
    if file_min_size is None:
        file_min_size = self.part_size
    else:
        file_min_size = parse_bytes(file_min_size)

    # Get dataset and path list
    pa_dataset = self._legacy_dataset
    paths = [p.path for p in pa_dataset.pieces]
    root_dir, fns = _analyze_paths(paths, self.fs)

    # Collect dataset metadata
    metadata_file_exists = bool(pa_dataset.metadata)
    schema_errors = defaultdict(set)
    if metadata_file_exists:
        # We have a metadata file
        metadata = pa_dataset.metadata
    else:
        # No metadata file - Collect manually
        metadata = None
        for piece, fn in zip(pa_dataset.pieces, fns):
            md = piece.get_metadata()
            md.set_file_path(fn)
            if metadata:
                _append_row_groups(metadata, md, schema_errors, piece.path)
            else:
                metadata = md

        # Check for inconsistent schemas.
        # This is not a problem if a _metadata file exists
        for field in schema_errors:
            msg = f"Schema mismatch detected in column: '{field}'."
            warnings.warn(msg)
            for item in schema_errors[field]:
                msg = f"[{item[0]}] Expected {item[1]}, got {item[2]}."
                warnings.warn(msg)

        # If there is a schema mismatch, urge the user to add a _metadata file
        if len(schema_errors):
            meta_valid = False  # There are schema-mismatch errors

            # Check that the Dask version supports `create_metadata_file`
            if LooseVersion(dask.__version__) <= "2.30.0":
                msg = (
                    "\nThe installed version of Dask is too old to handle "
                    "schema mismatch. Try installing the latest version."
                )
                warnings.warn(msg)
                return meta_valid and size_valid  # Early return

            # Collect the metadata with dask_cudf and then convert to pyarrow
            metadata_bytes = dask_cudf.io.parquet.create_metadata_file(
                paths,
                out_dir=False,
            )
            with BytesIO() as myio:
                myio.write(memoryview(metadata_bytes))
                myio.seek(0)
                metadata = pq.ParquetFile(myio).metadata

            if not add_metadata_file:
                msg = (
                    "\nPlease pass add_metadata_file=True to add a global "
                    "_metadata file, or use the regenerate_dataset utility to "
                    "rewrite your dataset. Without a _metadata file, the schema "
                    "mismatch may cause errors at read time."
                )
                warnings.warn(msg)

    # Record the total byte size of all row groups and files
    max_rg_size = 0
    max_rg_size_path = None
    file_sizes = defaultdict(int)
    for rg in range(metadata.num_row_groups):
        row_group = metadata.row_group(rg)
        path = row_group.column(0).file_path
        total_byte_size = row_group.total_byte_size
        if total_byte_size > max_rg_size:
            max_rg_size = total_byte_size
            max_rg_size_path = path
        file_sizes[path] += total_byte_size

    # Check if any row groups are prohibitively large.
    # Also check if any row groups are larger than recommended.
    if max_rg_size > row_group_max_size:
        # One or more row-groups are above the "required" limit
        msg = (
            f"Excessive row_group size ({max_rg_size}) detected in file "
            f"{max_rg_size_path}. Please use the regenerate_dataset utility "
            f"to rewrite your dataset."
        )
        warnings.warn(msg)
    else:
        # The only way size_valid==True is if we get here
        size_valid = True

    # Check if any files are smaller than the desired size.
    # We only warn if there are >1 files in the dataset.
    for path, size in file_sizes.items():
        if size < file_min_size and len(pa_dataset.pieces) > 1:
            msg = (
                f"File {path} is smaller than the desired dataset "
                f"partition size ({self.part_size}). Consider using the "
                f"regenerate_dataset utility to rewrite your dataset with a smaller "
                f"number of (larger) files."
            )
            warnings.warn(msg)
            size_valid = False

    # If the _metadata file is missing, we need to write
    # it (or inform the user that it is missing)
    if not metadata_file_exists:
        if add_metadata_file:
            # Write missing _metadata file
            fs = self.fs
            metadata_path = fs.sep.join([root_dir, "_metadata"])
            with fs.open(metadata_path, "wb") as fil:
                metadata.write_metadata_file(fil)
            meta_valid = True
        else:
            # Inform user that the _metadata file is missing
            msg = (
                "For best performance with NVTabular, there should be a "
                "global _metadata file located in the root directory of the "
                "dataset. Please pass add_metadata_file=True to add the "
                "missing file."
            )
            warnings.warn(msg)
            if require_metadata_file:
                meta_valid = False

    # Return True if we have a parquet dataset with a _metadata file (meta_valid)
    # and the row-groups and files are appropriate sizes (size_valid)
    return meta_valid and size_valid
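# Hedged usage sketch: assuming this method lives on the engine behind an
# NVTabular Dataset (the docstring refers to a ParquetDatasetEngine), a caller
# might validate and repair a dataset as shown below. The Dataset construction
# and the `engine` attribute access are illustrative assumptions:
#
#     dataset = nvt.Dataset("/data/criteo/parquet", engine="parquet", part_size="128MB")
#     valid = dataset.engine.validate_dataset(
#         add_metadata_file=True,       # write a global _metadata file if missing
#         row_group_max_size="128MB",   # warn on oversized row-groups
#     )
#     if not valid:
#         # the regenerate_dataset utility mentioned in the warnings above can be
#         # used to rewrite the dataset with better-sized files/row-groups
#         ...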
def convert_criteo_to_parquet(
    input_path: str,
    output_path: str,
    client,
    gpu_mem_frac: float = 0.05,
):
    print("Converting tsv to parquet files")
    if not output_path:
        raise RuntimeError("Intermediate directory must be defined if the dataset is tsv.")
    os.makedirs(output_path, exist_ok=True)

    # split last day into two parts
    number_of_lines = int(
        subprocess.check_output(
            (f'wc -l {os.path.join(input_path, "day_23")}').split()
        ).split()[0]
    )
    valid_set_size = number_of_lines // 2
    test_set_size = number_of_lines - valid_set_size

    with open(os.path.join(input_path, "day_23.part1"), "w") as f:
        subprocess.run(
            ['head', '-n', str(test_set_size), str(os.path.join(input_path, "day_23"))],
            stdout=f,
        )

    with open(os.path.join(input_path, "day_23.part2"), "w") as f:
        subprocess.run(
            ['tail', '-n', str(valid_set_size), str(os.path.join(input_path, "day_23"))],
            stdout=f,
        )

    fs = get_fs_token_paths(input_path, mode="rb")[0]
    file_list = [
        x for x in fs.glob(fs.sep.join([input_path, "day_*"]))
        if not x.endswith("parquet")
    ]
    file_list = sorted(file_list, key=natural_sort_key)
    name_list = _analyze_paths(file_list, fs)[1]

    cols = CRITEO_CLICK_COLUMNS + CRITEO_CONTINUOUS_COLUMNS + CRITEO_CATEGORICAL_COLUMNS

    dtypes = {}
    dtypes[CRITEO_CLICK_COLUMNS[0]] = np.int64
    for x in CRITEO_CONTINUOUS_COLUMNS:
        dtypes[x] = np.int64
    for x in CRITEO_CATEGORICAL_COLUMNS:
        dtypes[x] = "hex"

    dsk = {}
    token = tokenize(file_list, name_list, output_path, gpu_mem_frac, fs, cols, dtypes)
    convert_file_name = "convert_file-" + token
    for i, (path, name) in enumerate(zip(file_list, name_list)):
        key = (convert_file_name, i)
        dsk[key] = (_convert_file, path, name, output_path, gpu_mem_frac, fs, cols, dtypes)

    write_meta_name = "write-metadata-" + token
    dsk[write_meta_name] = (
        _write_metadata,
        [(convert_file_name, i) for i in range(len(file_list))],
        fs,
        output_path,
    )
    graph = HighLevelGraph.from_collections(write_meta_name, dsk, dependencies=[])
    conversion_delayed = Delayed(write_meta_name, graph)

    if client:
        conversion_delayed.compute()
    else:
        conversion_delayed.compute(scheduler="synchronous")

    print("Converted")
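# Hedged usage sketch for the conversion step. The raw/intermediate paths and
# the Dask client setup are illustrative assumptions, not part of this module:
#
#     from dask.distributed import Client
#
#     client = Client()  # or None to fall back to the synchronous scheduler
#     convert_criteo_to_parquet(
#         input_path="/data/criteo/raw",        # directory holding the day_0 ... day_23 tsv files
#         output_path="/data/criteo/parquet",   # intermediate parquet output directory
#         client=client,
#     )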