def validate_platform(cls, value: str, values: Dict[str, Any]) -> Optional[str]:
    # An explicitly configured platform wins; otherwise infer it from the base path.
    if value != "":
        return value
    if is_s3_uri(values["base_path"]):
        return "s3"
    return "file"
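
# For illustration, a minimal standalone sketch of the fallback above.
# `_resolve_platform` and `_is_s3_uri_stub` are hypothetical names; the stub
# is a simplified stand-in for the real is_s3_uri helper, which may accept
# more URI forms.
def _is_s3_uri_stub(uri: str) -> bool:
    return uri.startswith("s3://")

def _resolve_platform(platform: str, base_path: str) -> str:
    if platform != "":
        return platform
    return "s3" if _is_s3_uri_stub(base_path) else "file"

assert _resolve_platform("", "s3://bucket/prefix") == "s3"
assert _resolve_platform("", "/mnt/data") == "file"
assert _resolve_platform("glue", "s3://bucket/prefix") == "glue"
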
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    with PerfTimer() as timer:
        # check if file is an s3 object
        if is_s3_uri(self.source_config.base_path):
            yield from self.get_workunits_s3()
        else:
            yield from self.get_workunits_local()

        if not self.source_config.profiling.enabled:
            return

        total_time_taken = timer.elapsed_seconds()

        logger.info(
            f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
        )

        time_percentiles: Dict[str, float] = {}

        if len(self.profiling_times_taken) > 0:
            percentiles = [50, 75, 95, 99]
            percentile_values = stats.calculate_percentiles(
                self.profiling_times_taken, percentiles
            )

            time_percentiles = {
                f"table_time_taken_p{percentile}": 10
                ** int(log10(percentile_values[percentile] + 1))
                for percentile in percentiles
            }

        telemetry.telemetry_instance.ping(
            "data_lake_profiling_summary",
            # bucket by taking floor of log of time taken
            {
                "total_time_taken": 10 ** int(log10(total_time_taken + 1)),
                "count": 10 ** int(log10(len(self.profiling_times_taken) + 1)),
                "platform": self.source_config.platform,
                **time_percentiles,
            },
        )
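
# A worked sketch of the log-bucketing used in the telemetry ping above:
# 10 ** int(log10(x + 1)) rounds a value down to a power of ten, so raw
# timings and counts are never reported, only their order of magnitude.
# `_log_bucket` is a hypothetical name used here for illustration.
from math import log10

def _log_bucket(value: float) -> int:
    return 10 ** int(log10(value + 1))

# 0 -> 1, 3 -> 1, 9 -> 10 (the +1 tips it over), 42 -> 10, 123.4 -> 100, 5000 -> 1000
for sample in [0, 3, 9, 42, 123.4, 5000]:
    print(sample, "->", _log_bucket(sample))
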
def validate_path_spec(cls, values: Dict) -> Dict[str, Any]:
    if "**" in values["include"]:
        raise ValueError("path_spec.include cannot contain '**'")

    if values.get("file_types") is None:
        values["file_types"] = SUPPORTED_FILE_TYPES
    else:
        for file_type in values["file_types"]:
            if file_type not in SUPPORTED_FILE_TYPES:
                raise ValueError(
                    f"file type {file_type} not in supported file types. Please specify one from {SUPPORTED_FILE_TYPES}"
                )

    if values.get("default_extension") is not None:
        if values.get("default_extension") not in SUPPORTED_FILE_TYPES:
            raise ValueError(
                f"default extension {values.get('default_extension')} is not a supported file extension. Please specify one from {SUPPORTED_FILE_TYPES}"
            )

    include_ext = os.path.splitext(values["include"])[1].strip(".")
    if (
        include_ext not in values["file_types"]
        and include_ext != "*"
        # .get() so a missing default_extension does not raise KeyError
        and not values.get("default_extension")
    ):
        raise ValueError(
            f"file type specified ({include_ext}) in path_spec.include is not in specified file "
            f'types. Please select one from {values.get("file_types")} or specify ".*" to allow all types'
        )

    values["_parsable_include"] = PathSpec.get_parsable_include(values["include"])
    logger.debug(f'Setting _parsable_include: {values.get("_parsable_include")}')
    compiled_include_tmp = parse.compile(values["_parsable_include"])
    values["_compiled_include"] = compiled_include_tmp
    logger.debug(f'Setting _compiled_include: {values["_compiled_include"]}')

    # Raw string so "\{" and "\}" are not invalid escape sequences (the W605 warning).
    values["_glob_include"] = re.sub(r"\{[^}]+\}", "*", values["include"])
    logger.debug(f'Setting _glob_include: {values.get("_glob_include")}')

    if values.get("table_name") is None:
        if "{table}" in values["include"]:
            values["table_name"] = "{table}"
    else:
        logger.debug(f"include fields: {compiled_include_tmp.named_fields}")
        logger.debug(
            f"table_name fields: {parse.compile(values['table_name']).named_fields}"
        )
        if not all(
            x in values["_compiled_include"].named_fields
            for x in parse.compile(values["table_name"]).named_fields
        ):
            raise ValueError(
                "Not all named variables used in path_spec.table_name are specified in "
                "path_spec.include"
            )

    if values.get("exclude") is not None:
        for exclude_path in values["exclude"]:
            if len(parse.compile(exclude_path).named_fields) != 0:
                raise ValueError(
                    "path_spec.exclude should not contain any named variables"
                )

    values["_is_s3"] = is_s3_uri(values["include"])
    logger.debug(f'Setting _is_s3: {values.get("_is_s3")}')
    return values
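
# A minimal sketch of the two derived templates computed above, using an
# assumed include pattern (not from the source): re.sub turns every {var}
# into a glob wildcard for listing files, while parse.compile recovers the
# named fields from a concrete path. For simplicity this compiles the include
# directly; the validator actually compiles the derived _parsable_include.
import re
import parse

include = "s3://my-bucket/{dept}/{table}/{partition}/data.csv"

glob_include = re.sub(r"\{[^}]+\}", "*", include)
print(glob_include)  # s3://my-bucket/*/*/*/data.csv

compiled_include = parse.compile(include)
print(compiled_include.named_fields)  # ['dept', 'table', 'partition']

match = compiled_include.parse("s3://my-bucket/sales/orders/2021/data.csv")
print(match.named)  # {'dept': 'sales', 'table': 'orders', 'partition': '2021'}
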