def _check_usage_date_ranges(self, engine: Engine) -> Any:
    query = """
        select
            min(query_start_time) as min_time,
            max(query_start_time) as max_time
        from snowflake.account_usage.access_history
        """
    with PerfTimer() as timer:
        try:
            for db_row in engine.execute(query):
                if len(db_row) < 2 or db_row[0] is None or db_row[1] is None:
                    self.warn(
                        logger,
                        "check-usage-data",
                        f"Missing data for access_history {db_row} - Check if using Enterprise edition of Snowflake",
                    )
                    continue
                self.report.min_access_history_time = db_row[0].astimezone(
                    tz=timezone.utc
                )
                self.report.max_access_history_time = db_row[1].astimezone(
                    tz=timezone.utc
                )
                self.report.access_history_range_query_secs = round(
                    timer.elapsed_seconds(), 2
                )
        except Exception as e:
            self.error(logger, "check-usage-data", f"Error was {e}")
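# The snippets in this section time their work with a PerfTimer context manager.
# The class below is an illustrative, assumption-based stand-in (not the
# project's actual implementation) capturing only the behavior these calls rely
# on: elapsed_seconds() is usable both inside and after the `with` block.
import time
from typing import Any, Optional


class PerfTimerSketch:
    """Minimal sketch of a context-manager wall-clock timer (assumed behavior)."""

    def __init__(self) -> None:
        self.start_time: Optional[float] = None
        self.end_time: Optional[float] = None

    def __enter__(self) -> "PerfTimerSketch":
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, *exc: Any) -> None:
        self.end_time = time.perf_counter()

    def elapsed_seconds(self) -> float:
        # inside the `with` block this reports time since start; afterwards,
        # the total duration of the block
        assert self.start_time is not None
        end = self.end_time if self.end_time is not None else time.perf_counter()
        return end - self.start_time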
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    self.processed_containers = []
    with PerfTimer() as timer:
        file_browser = (
            self.s3_browser()
            if self.source_config.path_spec.is_s3()
            else self.local_browser()
        )
        table_dict: Dict[str, TableData] = {}
        for file, timestamp in file_browser:
            if not self.source_config.path_spec.allowed(file):
                continue
            table_data = self.extract_table_data(file, timestamp)
            # keep only the most recently modified file per table path
            d_table_data = table_dict.setdefault(table_data.table_path, table_data)
            if d_table_data.timestamp < table_data.timestamp:
                table_dict[table_data.table_path] = table_data
        for guid, table_data in table_dict.items():
            yield from self.ingest_table(table_data)

        if not self.source_config.profiling.enabled:
            return

        total_time_taken = timer.elapsed_seconds()
        logger.info(
            f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
        )

        time_percentiles: Dict[str, float] = {}
        if len(self.profiling_times_taken) > 0:
            percentiles = [50, 75, 95, 99]
            percentile_values = stats.calculate_percentiles(
                self.profiling_times_taken, percentiles
            )
            time_percentiles = {
                f"table_time_taken_p{percentile}": 10
                ** int(log10(percentile_values[percentile] + 1))
                for percentile in percentiles
            }

        telemetry.telemetry_instance.ping(
            "data_lake_profiling_summary",
            # bucket by taking floor of log of time taken
            {
                "total_time_taken": 10 ** int(log10(total_time_taken + 1)),
                "count": 10 ** int(log10(len(self.profiling_times_taken) + 1)),
                "platform": self.source_config.platform,
                **time_percentiles,
            },
        )
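# The telemetry payload above anonymizes timings by rounding them to an order
# of magnitude via 10 ** int(log10(x + 1)), per the "bucket by taking floor of
# log of time taken" comment. A small self-contained illustration of that
# bucketing (purely for intuition; not part of the source):
from math import log10


def log_bucket(value: float) -> int:
    """Collapse a non-negative value into a power-of-ten bucket."""
    return 10 ** int(log10(value + 1))


if __name__ == "__main__":
    for sample in (0, 3, 9, 42, 250, 7_500):
        # integer inputs 0-8 -> 1, 9-98 -> 10, 99-998 -> 100, ...
        print(sample, "->", log_bucket(sample))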
def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]:
    engine = self._make_sql_engine()

    logger.info("Checking usage date ranges")
    self._check_usage_date_ranges(engine)

    logger.info("Getting usage history")
    with PerfTimer() as timer:
        query = self._make_usage_query()
        # only query execution is timed; rows are streamed from the result below
        results = engine.execute(query)
        self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2)

    for row in results:
        yield from self._process_snowflake_history_row(row)
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    with PerfTimer() as timer:
        # check if file is an s3 object
        if is_s3_uri(self.source_config.base_path):
            yield from self.get_workunits_s3()
        else:
            yield from self.get_workunits_local()

        if not self.source_config.profiling.enabled:
            return

        total_time_taken = timer.elapsed_seconds()
        logger.info(
            f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
        )

        time_percentiles: Dict[str, float] = {}
        if len(self.profiling_times_taken) > 0:
            percentiles = [50, 75, 95, 99]
            percentile_values = stats.calculate_percentiles(
                self.profiling_times_taken, percentiles
            )
            time_percentiles = {
                f"table_time_taken_p{percentile}": 10
                ** int(log10(percentile_values[percentile] + 1))
                for percentile in percentiles
            }

        telemetry.telemetry_instance.ping(
            "data_lake_profiling_summary",
            # bucket by taking floor of log of time taken
            {
                "total_time_taken": 10 ** int(log10(total_time_taken + 1)),
                "count": 10 ** int(log10(len(self.profiling_times_taken) + 1)),
                "platform": self.source_config.platform,
                **time_percentiles,
            },
        )
def generate_profiles(
    self, requests: List[GEProfilerRequest], max_workers: int
) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
    with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
        max_workers=max_workers
    ) as async_executor, SQLAlchemyQueryCombiner(
        enabled=self.config.query_combiner_enabled,
        catch_exceptions=self.config.catch_exceptions,
        is_single_row_query_method=_is_single_row_query_method,
        serial_execution_fallback_enabled=True,
    ).activate() as query_combiner:
        max_workers = min(max_workers, len(requests))
        logger.info(
            f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
        )
        with unittest.mock.patch(
            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
            get_column_unique_count_patch,
        ):
            with unittest.mock.patch(
                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
                _get_column_quantiles_bigquery_patch,
            ):
                async_profiles = [
                    async_executor.submit(
                        self._generate_profile_from_request,
                        query_combiner,
                        request,
                    )
                    for request in requests
                ]

                # Avoid using as_completed so that the results are yielded in the
                # same order as the requests.
                # for async_profile in concurrent.futures.as_completed(async_profiles):
                for async_profile in async_profiles:
                    yield async_profile.result()

                logger.info(
                    f"Profiling {len(requests)} table(s) finished in {timer.elapsed_seconds():.3f} seconds"
                )

                self.report.report_from_query_combiner(query_combiner.report)
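# The loop above deliberately iterates the futures in submission order rather
# than using concurrent.futures.as_completed, so yielded results stay aligned
# with `requests`. A small self-contained demonstration of the difference
# (illustrative only; unrelated to the profiler itself):
import concurrent.futures
import time


def _slow_echo(value: int) -> int:
    # later submissions finish first, so as_completed would reorder them
    time.sleep(0.05 * (3 - value))
    return value


with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(_slow_echo, i) for i in range(3)]
    in_submit_order = [f.result() for f in futures]  # [0, 1, 2]

    futures = [executor.submit(_slow_echo, i) for i in range(3)]
    in_completion_order = [
        f.result() for f in concurrent.futures.as_completed(futures)
    ]  # likely [2, 1, 0]

print(in_submit_order, in_completion_order)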
def _generate_single_profile(
    self,
    query_combiner: SQLAlchemyQueryCombiner,
    pretty_name: str,
    schema: Optional[str] = None,
    table: Optional[str] = None,
    **kwargs: Any,
) -> Optional[DatasetProfileClass]:
    with self._ge_context() as ge_context, PerfTimer() as timer:
        try:
            logger.info(f"Profiling {pretty_name}")

            batch = self._get_ge_dataset(
                ge_context,
                {
                    "schema": schema,
                    "table": table,
                    "limit": self.config.limit,
                    "offset": self.config.offset,
                    **kwargs,
                },
                pretty_name=pretty_name,
            )
            profile = _SingleDatasetProfiler(
                batch, pretty_name, self.config, self.report, query_combiner
            ).generate_dataset_profile()

            logger.info(
                f"Finished profiling {pretty_name}; took {timer.elapsed_seconds():.3f} seconds"
            )
            return profile
        except Exception as e:
            if not self.config.catch_exceptions:
                raise e
            logger.exception(f"Encountered exception while profiling {pretty_name}")
            self.report.report_failure(pretty_name, f"Profiling exception {e}")
            return None
def ingest_table(
    self, full_path: str, relative_path: str, is_aws: bool
) -> Iterable[MetadataWorkUnit]:
    table_name = self.get_table_name(relative_path, full_path)

    # yield the table schema first
    logger.debug(
        f"Ingesting {full_path}: making table schemas {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
    )
    yield from self.get_table_schema(full_path, table_name, is_aws)

    # If profiling is not enabled, skip the rest
    if not self.source_config.profiling.enabled:
        return

    # read in the whole table with Spark for profiling
    table = self.read_file_spark(full_path, is_aws)

    # if table is not readable, skip
    if table is None:
        self.report.report_warning(
            table_name, f"unable to read table {table_name} from file {full_path}"
        )
        return

    with PerfTimer() as timer:
        # init PySpark analysis object
        logger.debug(
            f"Profiling {full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler = _SingleTableProfiler(
            table,
            self.spark,
            self.source_config.profiling,
            self.report,
            full_path,
        )

        logger.debug(
            f"Profiling {full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        # instead of computing each profile individually, we run them all in a single analyzer.run() call
        # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
        # since multiple profiles reuse computations, this saves a lot of time
        table_profiler.prepare_table_profiles()

        # compute the profiles
        logger.debug(
            f"Profiling {full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        analysis_result = table_profiler.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
            self.spark, analysis_result
        )

        logger.debug(
            f"Profiling {full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler.extract_table_profiles(analysis_metrics)

        time_taken = timer.elapsed_seconds()
        logger.info(f"Finished profiling {full_path}; took {time_taken:.3f} seconds")
        self.profiling_times_taken.append(time_taken)

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=make_dataset_urn(
            self.source_config.platform, table_name, self.source_config.env
        ),
        changeType=ChangeTypeClass.UPSERT,
        aspectName="datasetProfile",
        aspect=table_profiler.profile,
    )
    wu = MetadataWorkUnit(
        id=f"profile-{self.source_config.platform}-{full_path}", mcp=mcp
    )
    self.report.report_workunit(wu)
    yield wu
def _generate_single_profile(
    self,
    query_combiner: SQLAlchemyQueryCombiner,
    pretty_name: str,
    schema: Optional[str] = None,
    table: Optional[str] = None,
    partition: Optional[str] = None,
    custom_sql: Optional[str] = None,
    **kwargs: Any,
) -> Optional[DatasetProfileClass]:
    bigquery_temp_table: Optional[str] = None

    ge_config = {
        "schema": schema,
        "table": table,
        "limit": self.config.limit,
        "offset": self.config.offset,
        **kwargs,
    }

    # On BigQuery, we have to create a temporary table if limit, offset, or custom SQL is set
    if custom_sql or self.config.limit or self.config.offset:
        if self.config.bigquery_temp_table_schema:
            bigquery_temp_table = (
                f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
            )
            ge_config["bigquery_temp_table"] = bigquery_temp_table
        else:
            assert table
            table_parts = table.split(".")
            if len(table_parts) == 2:
                bigquery_temp_table = (
                    f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
                )
                ge_config["bigquery_temp_table"] = bigquery_temp_table

    if custom_sql is not None:
        ge_config["query"] = custom_sql

    with self._ge_context() as ge_context, PerfTimer() as timer:
        try:
            logger.info(f"Profiling {pretty_name}")

            batch = self._get_ge_dataset(
                ge_context,
                ge_config,
                pretty_name=pretty_name,
            )
            profile = _SingleDatasetProfiler(
                batch,
                pretty_name,
                partition,
                self.config,
                self.report,
                query_combiner,
            ).generate_dataset_profile()

            time_taken = timer.elapsed_seconds()
            logger.info(
                f"Finished profiling {pretty_name}; took {time_taken:.3f} seconds"
            )
            self.times_taken.append(time_taken)

            return profile
        except Exception as e:
            if not self.config.catch_exceptions:
                raise e
            logger.exception(f"Encountered exception while profiling {pretty_name}")
            self.report.report_failure(pretty_name, f"Profiling exception {e}")
            return None
        finally:
            self._drop_bigquery_temp_table(ge_config)
def generate_profiles(
    self, requests: List[GEProfilerRequest], max_workers: int
) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
    with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
        max_workers=max_workers
    ) as async_executor, SQLAlchemyQueryCombiner(
        enabled=self.config.query_combiner_enabled,
        catch_exceptions=self.config.catch_exceptions,
        is_single_row_query_method=_is_single_row_query_method,
        serial_execution_fallback_enabled=True,
    ).activate() as query_combiner:
        max_workers = min(max_workers, len(requests))
        logger.info(
            f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
        )
        with unittest.mock.patch(
            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
            get_column_unique_count_patch,
        ):
            with unittest.mock.patch(
                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
                _get_column_quantiles_bigquery_patch,
            ):
                async_profiles = [
                    async_executor.submit(
                        self._generate_profile_from_request,
                        query_combiner,
                        request,
                    )
                    for request in requests
                ]

                # Avoid using as_completed so that the results are yielded in the
                # same order as the requests.
                # for async_profile in concurrent.futures.as_completed(async_profiles):
                for async_profile in async_profiles:
                    yield async_profile.result()

                total_time_taken = timer.elapsed_seconds()
                logger.info(
                    f"Profiling {len(requests)} table(s) finished in {total_time_taken:.3f} seconds"
                )

                time_percentiles: Dict[str, float] = {}
                if len(self.times_taken) > 0:
                    percentiles = [50, 75, 95, 99]
                    percentile_values = stats.calculate_percentiles(
                        self.times_taken, percentiles
                    )
                    time_percentiles = {
                        f"table_time_taken_p{percentile}": 10
                        ** int(log10(percentile_values[percentile] + 1))
                        for percentile in percentiles
                    }

                telemetry.telemetry_instance.ping(
                    "sql_profiling_summary",
                    # bucket by taking floor of log of time taken
                    {
                        "total_time_taken": 10 ** int(log10(total_time_taken + 1)),
                        "count": 10 ** int(log10(len(self.times_taken) + 1)),
                        "platform": self.platform,
                        **time_percentiles,
                    },
                )

                self.report.report_from_query_combiner(query_combiner.report)
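# The percentile summary above assumes stats.calculate_percentiles returns a
# mapping keyed by the requested percentile (it is indexed as
# percentile_values[percentile]). The helper below is a minimal,
# assumption-based sketch of that behavior using nearest-rank percentiles, not
# the project's actual implementation:
from typing import Dict, List, Sequence


def calculate_percentiles_sketch(
    values: Sequence[float], percentiles: List[int]
) -> Dict[int, float]:
    """Nearest-rank percentiles; returns {percentile: value}."""
    if not values:
        return {}
    ordered = sorted(values)
    result: Dict[int, float] = {}
    for p in percentiles:
        # nearest-rank index, clamped to the valid range
        index = min(len(ordered) - 1, max(0, round(p / 100 * len(ordered)) - 1))
        result[p] = ordered[index]
    return result


# e.g. calculate_percentiles_sketch([1.2, 3.4, 0.8, 9.9], [50, 95]) -> {50: 1.2, 95: 9.9}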
def get_table_profile(
    self, table_data: TableData, dataset_urn: str
) -> Iterable[MetadataWorkUnit]:
    # read in the whole table with Spark for profiling
    table = None
    try:
        table = self.read_file_spark(
            table_data.table_path, os.path.splitext(table_data.full_path)[1]
        )
    except Exception as e:
        logger.error(e)

    # if table is not readable, skip
    if table is None:
        self.report.report_warning(
            table_data.display_name,
            f"unable to read table {table_data.display_name} from file {table_data.full_path}",
        )
        return

    with PerfTimer() as timer:
        # init PySpark analysis object
        logger.debug(
            f"Profiling {table_data.full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler = _SingleTableProfiler(
            table,
            self.spark,
            self.source_config.profiling,
            self.report,
            table_data.full_path,
        )

        logger.debug(
            f"Profiling {table_data.full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        # instead of computing each profile individually, we run them all in a single analyzer.run() call
        # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
        # since multiple profiles reuse computations, this saves a lot of time
        table_profiler.prepare_table_profiles()

        # compute the profiles
        logger.debug(
            f"Profiling {table_data.full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        analysis_result = table_profiler.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
            self.spark, analysis_result
        )

        logger.debug(
            f"Profiling {table_data.full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler.extract_table_profiles(analysis_metrics)

        time_taken = timer.elapsed_seconds()
        logger.info(
            f"Finished profiling {table_data.full_path}; took {time_taken:.3f} seconds"
        )
        self.profiling_times_taken.append(time_taken)

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="datasetProfile",
        aspect=table_profiler.profile,
    )
    wu = MetadataWorkUnit(
        id=f"profile-{self.source_config.platform}-{table_data.table_path}", mcp=mcp
    )
    self.report.report_workunit(wu)
    yield wu