def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Scan the configured path spec (S3 or local filesystem), ingest the
    newest version of each discovered table, then — if profiling is enabled —
    emit a bucketed profiling-telemetry summary.

    Yields:
        MetadataWorkUnit objects produced by ``ingest_table``.
    """
    self.processed_containers = []
    with PerfTimer() as timer:
        # Pick the browser implementation based on where the path spec points.
        file_browser = (
            self.s3_browser()
            if self.source_config.path_spec.is_s3()
            else self.local_browser()
        )
        table_dict: Dict[str, TableData] = {}
        for file, timestamp in file_browser:
            if not self.source_config.path_spec.allowed(file):
                continue
            table_data = self.extract_table_data(file, timestamp)
            # Keep only the most recently modified file seen per table path.
            d_table_data = table_dict.setdefault(table_data.table_path, table_data)
            if d_table_data.timestamp < table_data.timestamp:
                table_dict[table_data.table_path] = table_data
        # Fix: the dict key (guid) was never used — iterate values directly
        # instead of unpacking .items().
        for table_data in table_dict.values():
            yield from self.ingest_table(table_data)

        if not self.source_config.profiling.enabled:
            return

        total_time_taken = timer.elapsed_seconds()

        logger.info(
            f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
        )

        time_percentiles: Dict[str, float] = {}

        if len(self.profiling_times_taken) > 0:
            percentiles = [50, 75, 95, 99]
            percentile_values = stats.calculate_percentiles(
                self.profiling_times_taken, percentiles
            )

            # Bucket each percentile by taking the floor of its log so the
            # telemetry payload never carries precise timings.
            time_percentiles = {
                f"table_time_taken_p{percentile}": 10
                ** int(log10(percentile_values[percentile] + 1))
                for percentile in percentiles
            }

        telemetry.telemetry_instance.ping(
            "data_lake_profiling_summary",
            # bucket by taking floor of log of time taken
            {
                "total_time_taken": 10 ** int(log10(total_time_taken + 1)),
                "count": 10 ** int(log10(len(self.profiling_times_taken) + 1)),
                "platform": self.source_config.platform,
                **time_percentiles,
            },
        )
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Dispatch workunit generation to the S3 or local implementation, then
    (when profiling is enabled) log the elapsed time and ping a bucketed
    telemetry summary.

    Yields:
        MetadataWorkUnit objects from the chosen generator.
    """
    with PerfTimer() as timer:
        # check if file is an s3 object
        if is_s3_uri(self.source_config.base_path):
            yield from self.get_workunits_s3()
        else:
            yield from self.get_workunits_local()

        if not self.source_config.profiling.enabled:
            return

        total_time_taken = timer.elapsed_seconds()
        logger.info(
            f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
        )

        # Percentile timings, bucketed to powers of ten so the ping carries
        # no precise values.
        time_percentiles: Dict[str, float] = {}
        if self.profiling_times_taken:
            wanted = [50, 75, 95, 99]
            values = stats.calculate_percentiles(self.profiling_times_taken, wanted)
            for percentile in wanted:
                time_percentiles[f"table_time_taken_p{percentile}"] = 10 ** int(
                    log10(values[percentile] + 1)
                )

        # bucket by taking floor of log of time taken
        payload = {
            "total_time_taken": 10 ** int(log10(total_time_taken + 1)),
            "count": 10 ** int(log10(len(self.profiling_times_taken) + 1)),
            "platform": self.source_config.platform,
        }
        payload.update(time_percentiles)
        telemetry.telemetry_instance.ping("data_lake_profiling_summary", payload)
def generate_profiles(
    self, requests: List[GEProfilerRequest], max_workers: int
) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
    """Profile every request on a thread pool (results yielded in request
    order), then log and ping a bucketed summary of profiling times.

    Args:
        requests: one profiling request per table.
        max_workers: upper bound on concurrent profiling threads.

    Yields:
        (request, profile) pairs; profile may be None when profiling fails.
    """
    # Fix: clamp the worker count BEFORE the executor is constructed. The
    # original clamped inside the `with`, after ThreadPoolExecutor had
    # already been created with the unclamped value, so the clamp only
    # affected the log message. Keep at least one worker so an empty
    # request list does not make ThreadPoolExecutor raise ValueError.
    max_workers = max(1, min(max_workers, len(requests)))
    with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
        max_workers=max_workers
    ) as async_executor, SQLAlchemyQueryCombiner(
        enabled=self.config.query_combiner_enabled,
        catch_exceptions=self.config.catch_exceptions,
        is_single_row_query_method=_is_single_row_query_method,
        serial_execution_fallback_enabled=True,
    ).activate() as query_combiner:
        logger.info(
            f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
        )
        # Patch GE internals with optimized implementations while the
        # profiling futures run.
        with unittest.mock.patch(
            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
            get_column_unique_count_patch,
        ):
            with unittest.mock.patch(
                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
                _get_column_quantiles_bigquery_patch,
            ):
                async_profiles = [
                    async_executor.submit(
                        self._generate_profile_from_request,
                        query_combiner,
                        request,
                    )
                    for request in requests
                ]

                # Avoid using as_completed so that the results are yielded in the
                # same order as the requests.
                # for async_profile in concurrent.futures.as_completed(async_profiles):
                for async_profile in async_profiles:
                    yield async_profile.result()

        total_time_taken = timer.elapsed_seconds()

        logger.info(
            f"Profiling {len(requests)} table(s) finished in {total_time_taken:.3f} seconds"
        )

        time_percentiles: Dict[str, float] = {}

        if len(self.times_taken) > 0:
            percentiles = [50, 75, 95, 99]
            percentile_values = stats.calculate_percentiles(
                self.times_taken, percentiles
            )

            # Bucket each percentile by the floor of its log so the
            # telemetry payload never carries precise timings.
            time_percentiles = {
                f"table_time_taken_p{percentile}": 10
                ** int(log10(percentile_values[percentile] + 1))
                for percentile in percentiles
            }

        telemetry.telemetry_instance.ping(
            "sql_profiling_summary",
            # bucket by taking floor of log of time taken
            {
                "total_time_taken": 10 ** int(log10(total_time_taken + 1)),
                "count": 10 ** int(log10(len(self.times_taken) + 1)),
                "platform": self.platform,
                **time_percentiles,
            },
        )

        self.report.report_from_query_combiner(query_combiner.report)