Example #1
    def _check_usage_date_ranges(self, engine: Engine) -> Any:

        query = """
            select
                min(query_start_time) as min_time,
                max(query_start_time) as max_time
            from snowflake.account_usage.access_history
        """
        with PerfTimer() as timer:
            try:
                for db_row in engine.execute(query):
                    if len(db_row) < 2 or db_row[0] is None or db_row[1] is None:
                        self.warn(
                            logger,
                            "check-usage-data",
                            f"Missing data for access_history {db_row} - Check if using Enterprise edition of Snowflake",
                        )
                        continue
                    self.report.min_access_history_time = db_row[0].astimezone(
                        tz=timezone.utc
                    )
                    self.report.max_access_history_time = db_row[1].astimezone(
                        tz=timezone.utc
                    )
                    self.report.access_history_range_query_secs = round(
                        timer.elapsed_seconds(), 2
                    )
            except Exception as e:
                self.error(logger, "check-usage-data", f"Error checking usage date ranges: {e}")
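
Every example in this listing leans on PerfTimer as a context manager that exposes elapsed_seconds(). For reference, here is a minimal sketch of such a timer, assuming it only needs to measure wall-clock time; the real DataHub utility may differ in its details.

import time
from typing import Any, Optional


class PerfTimer:
    """Minimal wall-clock timer usable as a context manager (sketch, not the real utility)."""

    def __init__(self) -> None:
        self.start_time: Optional[float] = None
        self.end_time: Optional[float] = None

    def __enter__(self) -> "PerfTimer":
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, *args: Any) -> None:
        self.end_time = time.perf_counter()

    def elapsed_seconds(self) -> float:
        # Report the time elapsed so far if called inside the `with` block,
        # otherwise the total time between __enter__ and __exit__.
        assert self.start_time is not None
        end = self.end_time if self.end_time is not None else time.perf_counter()
        return end - self.start_time
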
Example #2
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        self.processed_containers = []
        with PerfTimer() as timer:

            file_browser = (self.s3_browser()
                            if self.source_config.path_spec.is_s3() else
                            self.local_browser())
            table_dict: Dict[str, TableData] = {}
            for file, timestamp in file_browser:
                if not self.source_config.path_spec.allowed(file):
                    continue
                table_data = self.extract_table_data(file, timestamp)
                d_table_data = table_dict.setdefault(table_data.table_path,
                                                     table_data)
                if d_table_data.timestamp < table_data.timestamp:
                    table_dict[table_data.table_path] = table_data
            for table_data in table_dict.values():
                yield from self.ingest_table(table_data)

            if not self.source_config.profiling.enabled:
                return

            total_time_taken = timer.elapsed_seconds()

            logger.info(
                f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
            )

            time_percentiles: Dict[str, float] = {}

            if len(self.profiling_times_taken) > 0:
                percentiles = [50, 75, 95, 99]
                percentile_values = stats.calculate_percentiles(
                    self.profiling_times_taken, percentiles)

                time_percentiles = {
                    f"table_time_taken_p{percentile}":
                    10**int(log10(percentile_values[percentile] + 1))
                    for percentile in percentiles
                }

            telemetry.telemetry_instance.ping(
                "data_lake_profiling_summary",
                # bucket by taking floor of log of time taken
                {
                    "total_time_taken": 10**int(log10(total_time_taken + 1)),
                    "count": 10**int(
                        log10(len(self.profiling_times_taken) + 1)),
                    "platform": self.source_config.platform,
                    **time_percentiles,
                },
            )
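
The telemetry payload above buckets every timing and count by order of magnitude, as the "# bucket by taking floor of log of time taken" comment notes. Pulled out as a standalone helper (hypothetical name, shown only to illustrate the arithmetic used in the payload):

from math import log10


def log_bucket(value: float) -> int:
    # Floor of log10 with a +1 shift so that zero is a valid input, e.g.
    # 0 -> 1, 7 -> 1, 42 -> 10, 750 -> 100, 12000 -> 10000.
    return 10 ** int(log10(value + 1))
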
Example #3
    def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]:
        engine = self._make_sql_engine()

        logger.info("Checking usage date ranges")
        self._check_usage_date_ranges(engine)

        logger.info("Getting usage history")
        with PerfTimer() as timer:
            query = self._make_usage_query()
            results = engine.execute(query)
            self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2)
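            # Note: only the execute() call is inside the timer; the rows are
            # iterated below, outside the timed block.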

        for row in results:
            yield from self._process_snowflake_history_row(row)
Example #4
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:

        with PerfTimer() as timer:

            # check if file is an s3 object
            if is_s3_uri(self.source_config.base_path):
                yield from self.get_workunits_s3()

            else:
                yield from self.get_workunits_local()

            if not self.source_config.profiling.enabled:
                return

            total_time_taken = timer.elapsed_seconds()

            logger.info(
                f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
            )

            time_percentiles: Dict[str, float] = {}

            if len(self.profiling_times_taken) > 0:
                percentiles = [50, 75, 95, 99]
                percentile_values = stats.calculate_percentiles(
                    self.profiling_times_taken, percentiles)

                time_percentiles = {
                    f"table_time_taken_p{percentile}":
                    10**int(log10(percentile_values[percentile] + 1))
                    for percentile in percentiles
                }

            telemetry.telemetry_instance.ping(
                "data_lake_profiling_summary",
                # bucket by taking floor of log of time taken
                {
                    "total_time_taken": 10**int(log10(total_time_taken + 1)),
                    "count": 10**int(
                        log10(len(self.profiling_times_taken) + 1)),
                    "platform": self.source_config.platform,
                    **time_percentiles,
                },
            )
Example #5
    def generate_profiles(
        self, requests: List[GEProfilerRequest], max_workers: int
    ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
        with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
                max_workers=max_workers
        ) as async_executor, SQLAlchemyQueryCombiner(
                enabled=self.config.query_combiner_enabled,
                catch_exceptions=self.config.catch_exceptions,
                is_single_row_query_method=_is_single_row_query_method,
                serial_execution_fallback_enabled=True,
        ).activate() as query_combiner:
            max_workers = min(max_workers, len(requests))
            logger.info(
                f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
            )
            with unittest.mock.patch(
                    "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
                    get_column_unique_count_patch,
            ):
                with unittest.mock.patch(
                        "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
                        _get_column_quantiles_bigquery_patch,
                ):
                    async_profiles = [
                        async_executor.submit(
                            self._generate_profile_from_request,
                            query_combiner,
                            request,
                        ) for request in requests
                    ]

                    # Avoid using as_completed so that the results are yielded in the
                    # same order as the requests.
                    # for async_profile in concurrent.futures.as_completed(async_profiles):
                    for async_profile in async_profiles:
                        yield async_profile.result()

                    logger.info(
                        f"Profiling {len(requests)} table(s) finished in {(timer.elapsed_seconds()):.3f} seconds"
                    )

                    self.report.report_from_query_combiner(
                        query_combiner.report)
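
The in-code comment explains the ordering choice: futures are consumed in submission order rather than via concurrent.futures.as_completed, so results come back in the same order as the requests. The same pattern in isolation, as a small self-contained sketch:

import concurrent.futures
from typing import Iterable, List


def process(item: int) -> int:
    return item * item


def run_in_order(items: List[int]) -> Iterable[int]:
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Submit everything up front, then consume the futures in submission
        # order; .result() blocks on each future in turn, so outputs are
        # yielded in input order even if later items finish first.
        futures = [executor.submit(process, item) for item in items]
        for future in futures:
            yield future.result()
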
Example #6
    def _generate_single_profile(
        self,
        query_combiner: SQLAlchemyQueryCombiner,
        pretty_name: str,
        schema: Optional[str] = None,
        table: Optional[str] = None,
        **kwargs: Any,
    ) -> Optional[DatasetProfileClass]:
        with self._ge_context() as ge_context, PerfTimer() as timer:
            try:
                logger.info(f"Profiling {pretty_name}")

                batch = self._get_ge_dataset(
                    ge_context,
                    {
                        "schema": schema,
                        "table": table,
                        "limit": self.config.limit,
                        "offset": self.config.offset,
                        **kwargs,
                    },
                    pretty_name=pretty_name,
                )
                profile = _SingleDatasetProfiler(
                    batch, pretty_name, self.config, self.report,
                    query_combiner).generate_dataset_profile()

                logger.info(
                    f"Finished profiling {pretty_name}; took {(timer.elapsed_seconds()):.3f} seconds"
                )
                return profile
            except Exception as e:
                if not self.config.catch_exceptions:
                    raise e
                logger.exception(
                    f"Encountered exception while profiling {pretty_name}")
                self.report.report_failure(pretty_name,
                                           f"Profiling exception {e}")
                return None
Example #7
    def ingest_table(self, full_path: str, relative_path: str,
                     is_aws: bool) -> Iterable[MetadataWorkUnit]:

        table_name = self.get_table_name(relative_path, full_path)

        # yield the table schema first
        logger.debug(
            f"Ingesting {full_path}: making table schemas {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        yield from self.get_table_schema(full_path, table_name, is_aws)

        # If profiling is not enabled, skip the rest
        if not self.source_config.profiling.enabled:
            return

        # read in the whole table with Spark for profiling
        table = self.read_file_spark(full_path, is_aws)

        # if table is not readable, skip
        if table is None:
            self.report.report_warning(
                table_name,
                f"unable to read table {table_name} from file {full_path}")
            return

        with PerfTimer() as timer:
            # init PySpark analysis object
            logger.debug(
                f"Profiling {full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler = _SingleTableProfiler(
                table,
                self.spark,
                self.source_config.profiling,
                self.report,
                full_path,
            )

            logger.debug(
                f"Profiling {full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            # instead of computing each profile individually, we run them all in a single analyzer.run() call
            # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
            # since multiple profiles reuse computations, this saves a lot of time
            table_profiler.prepare_table_profiles()

            # compute the profiles
            logger.debug(
                f"Profiling {full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            analysis_result = table_profiler.analyzer.run()
            analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
                self.spark, analysis_result)

            logger.debug(
                f"Profiling {full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler.extract_table_profiles(analysis_metrics)

            time_taken = timer.elapsed_seconds()

            logger.info(
                f"Finished profiling {full_path}; took {time_taken:.3f} seconds"
            )

            self.profiling_times_taken.append(time_taken)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=make_dataset_urn(self.source_config.platform, table_name,
                                       self.source_config.env),
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProfile",
            aspect=table_profiler.profile,
        )
        wu = MetadataWorkUnit(
            id=f"profile-{self.source_config.platform}-{full_path}", mcp=mcp)
        self.report.report_workunit(wu)
        yield wu
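
The comments above call out the key design choice: all metrics are registered on a single analyzer and computed in one analyzer.run() call so that overlapping computations are shared. Assuming the profiler is built on pydeequ (which the AnalyzerContext usage suggests), the batching pattern looks roughly like the sketch below; spark is an existing SparkSession configured with the deequ jars and df is the DataFrame to profile.

from pydeequ.analyzers import (
    AnalysisRunner,
    AnalyzerContext,
    ApproxCountDistinct,
    Completeness,
)

# Sketch only: `spark` and `df` are assumed to exist, and the column names are
# placeholders; the real _SingleTableProfiler builds its analyzer list from the
# table schema and the profiling config.
analysis_result = (
    AnalysisRunner(spark)
    .onData(df)
    .addAnalyzer(Completeness("user_id"))
    .addAnalyzer(ApproxCountDistinct("user_id"))
    .addAnalyzer(Completeness("email"))
    .run()
)
analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(spark, analysis_result)
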
Example #8
    def _generate_single_profile(
        self,
        query_combiner: SQLAlchemyQueryCombiner,
        pretty_name: str,
        schema: Optional[str] = None,
        table: Optional[str] = None,
        partition: Optional[str] = None,
        custom_sql: Optional[str] = None,
        **kwargs: Any,
    ) -> Optional[DatasetProfileClass]:
        bigquery_temp_table: Optional[str] = None

        ge_config = {
            "schema": schema,
            "table": table,
            "limit": self.config.limit,
            "offset": self.config.offset,
            **kwargs,
        }

        # We have to create temporary tables if offset or limit or custom sql is set on Bigquery
        if custom_sql or self.config.limit or self.config.offset:
            if self.config.bigquery_temp_table_schema:
                bigquery_temp_table = (
                    f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
                )
                ge_config["bigquery_temp_table"] = bigquery_temp_table
            else:
                assert table
                table_parts = table.split(".")
                if len(table_parts) == 2:
                    bigquery_temp_table = (
                        f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}")
                    ge_config["bigquery_temp_table"] = bigquery_temp_table

        if custom_sql is not None:
            ge_config["query"] = custom_sql

        with self._ge_context() as ge_context, PerfTimer() as timer:
            try:
                logger.info(f"Profiling {pretty_name}")

                batch = self._get_ge_dataset(
                    ge_context,
                    ge_config,
                    pretty_name=pretty_name,
                )

                profile = _SingleDatasetProfiler(
                    batch,
                    pretty_name,
                    partition,
                    self.config,
                    self.report,
                    query_combiner,
                ).generate_dataset_profile()

                time_taken = timer.elapsed_seconds()
                logger.info(
                    f"Finished profiling {pretty_name}; took {time_taken:.3f} seconds"
                )
                self.times_taken.append(time_taken)

                return profile
            except Exception as e:
                if not self.config.catch_exceptions:
                    raise e
                logger.exception(
                    f"Encountered exception while profiling {pretty_name}")
                self.report.report_failure(pretty_name,
                                           f"Profiling exception {e}")
                return None
            finally:
                self._drop_bigquery_temp_table(ge_config)
Example #9
    def generate_profiles(
        self, requests: List[GEProfilerRequest], max_workers: int
    ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
        with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
                max_workers=max_workers
        ) as async_executor, SQLAlchemyQueryCombiner(
                enabled=self.config.query_combiner_enabled,
                catch_exceptions=self.config.catch_exceptions,
                is_single_row_query_method=_is_single_row_query_method,
                serial_execution_fallback_enabled=True,
        ).activate() as query_combiner:
            max_workers = min(max_workers, len(requests))
            logger.info(
                f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
            )
            with unittest.mock.patch(
                    "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
                    get_column_unique_count_patch,
            ):
                with unittest.mock.patch(
                        "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
                        _get_column_quantiles_bigquery_patch,
                ):
                    async_profiles = [
                        async_executor.submit(
                            self._generate_profile_from_request,
                            query_combiner,
                            request,
                        ) for request in requests
                    ]

                    # Avoid using as_completed so that the results are yielded in the
                    # same order as the requests.
                    # for async_profile in concurrent.futures.as_completed(async_profiles):
                    for async_profile in async_profiles:
                        yield async_profile.result()

                    total_time_taken = timer.elapsed_seconds()

                    logger.info(
                        f"Profiling {len(requests)} table(s) finished in {total_time_taken:.3f} seconds"
                    )

                    time_percentiles: Dict[str, float] = {}

                    if len(self.times_taken) > 0:
                        percentiles = [50, 75, 95, 99]
                        percentile_values = stats.calculate_percentiles(
                            self.times_taken, percentiles)

                        time_percentiles = {
                            f"table_time_taken_p{percentile}":
                            10**int(log10(percentile_values[percentile] + 1))
                            for percentile in percentiles
                        }

                    telemetry.telemetry_instance.ping(
                        "sql_profiling_summary",
                        # bucket by taking floor of log of time taken
                        {
                            "total_time_taken":
                            10**int(log10(total_time_taken + 1)),
                            "count":
                            10**int(log10(len(self.times_taken) + 1)),
                            "platform":
                            self.platform,
                            **time_percentiles,
                        },
                    )

                    self.report.report_from_query_combiner(
                        query_combiner.report)
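
stats.calculate_percentiles is used the same way in each telemetry block: it is handed the raw list of per-table timings plus the percentiles of interest, and the result is indexed by percentile. A rough sketch of such a helper using the nearest-rank method (the actual DataHub implementation may compute this differently):

from typing import Dict, List, TypeVar

T = TypeVar("T", int, float)


def calculate_percentiles(values: List[T], percentiles: List[int]) -> Dict[int, T]:
    # Nearest-rank percentiles over a sorted copy of the input, returned as a
    # dict keyed by the requested percentile (matching the
    # percentile_values[percentile] lookups above). Empty input -> empty dict.
    if not values:
        return {}
    ordered = sorted(values)
    result: Dict[int, T] = {}
    for p in percentiles:
        rank = max(0, min(len(ordered) - 1, round(p * len(ordered) / 100) - 1))
        result[p] = ordered[rank]
    return result
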
Example #10
    def get_table_profile(self, table_data: TableData,
                          dataset_urn: str) -> Iterable[MetadataWorkUnit]:
        # read in the whole table with Spark for profiling
        table = None
        try:
            table = self.read_file_spark(
                table_data.table_path,
                os.path.splitext(table_data.full_path)[1])
        except Exception as e:
            logger.error(e)

        # if table is not readable, skip
        if table is None:
            self.report.report_warning(
                table_data.display_name,
                f"unable to read table {table_data.display_name} from file {table_data.full_path}",
            )
            return

        with PerfTimer() as timer:
            # init PySpark analysis object
            logger.debug(
                f"Profiling {table_data.full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler = _SingleTableProfiler(
                table,
                self.spark,
                self.source_config.profiling,
                self.report,
                table_data.full_path,
            )

            logger.debug(
                f"Profiling {table_data.full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            # instead of computing each profile individually, we run them all in a single analyzer.run() call
            # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
            # since multiple profiles reuse computations, this saves a lot of time
            table_profiler.prepare_table_profiles()

            # compute the profiles
            logger.debug(
                f"Profiling {table_data.full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            analysis_result = table_profiler.analyzer.run()
            analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
                self.spark, analysis_result)

            logger.debug(
                f"Profiling {table_data.full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler.extract_table_profiles(analysis_metrics)

            time_taken = timer.elapsed_seconds()

            logger.info(
                f"Finished profiling {table_data.full_path}; took {time_taken:.3f} seconds"
            )

            self.profiling_times_taken.append(time_taken)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProfile",
            aspect=table_profiler.profile,
        )
        wu = MetadataWorkUnit(
            id=f"profile-{self.source_config.platform}-{table_data.table_path}",
            mcp=mcp)
        self.report.report_workunit(wu)
        yield wu