Code Example #1
File: __init__.py Project: arunvasudevan/datahub
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        self.processed_containers = []
        with PerfTimer() as timer:

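            # Walk S3 or the local filesystem, depending on the configured path spec.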
            file_browser = (self.s3_browser()
                            if self.source_config.path_spec.is_s3() else
                            self.local_browser())
            table_dict: Dict[str, TableData] = {}
            for file, timestamp in file_browser:
                if not self.source_config.path_spec.allowed(file):
                    continue
                table_data = self.extract_table_data(file, timestamp)
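                # Keep only the most recently modified file per table path.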
                d_table_data = table_dict.setdefault(table_data.table_path,
                                                     table_data)
                if d_table_data.timestamp < table_data.timestamp:
                    table_dict[table_data.table_path] = table_data
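            # Emit metadata workunits for each deduplicated table.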
            for table_data in table_dict.values():
                yield from self.ingest_table(table_data)

            if not self.source_config.profiling.enabled:
                return

            total_time_taken = timer.elapsed_seconds()

            logger.info(
                f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
            )

            time_percentiles: Dict[str, float] = {}

            if len(self.profiling_times_taken) > 0:
                percentiles = [50, 75, 95, 99]
                percentile_values = stats.calculate_percentiles(
                    self.profiling_times_taken, percentiles)

                time_percentiles = {
                    f"table_time_taken_p{percentile}":
                    10**int(log10(percentile_values[percentile] + 1))
                    for percentile in percentiles
                }

            telemetry.telemetry_instance.ping(
                "data_lake_profiling_summary",
                # bucket by taking floor of log of time taken
                {
                    "total_time_taken": 10**int(log10(total_time_taken + 1)),
                    "count": 10**int(
                        log10(len(self.profiling_times_taken) + 1)),
                    "platform": self.source_config.platform,
                    **time_percentiles,
                },
            )
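
Note: all three examples coarsen timing telemetry before sending it: 10**int(log10(x + 1)) rounds a non-negative value down to the nearest power of ten, so only an order of magnitude leaves the process. A minimal sketch of that bucketing (the log_bucket helper name is ours, not DataHub's):

from math import log10

def log_bucket(value: float) -> int:
    # Floor of log10, re-exponentiated: [0, 9) -> 1, [9, 99) -> 10,
    # [99, 999) -> 100. The +1 guards against log10(0).
    return 10 ** int(log10(value + 1))

assert log_bucket(0) == 1
assert log_bucket(42.7) == 10
assert log_bucket(950) == 100
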
Code Example #2
File: __init__.py Project: swaroopjagadish/datahub
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:

        with PerfTimer() as timer:

            # dispatch on whether base_path is an S3 URI or a local path
            if is_s3_uri(self.source_config.base_path):
                yield from self.get_workunits_s3()

            else:
                yield from self.get_workunits_local()

            if not self.source_config.profiling.enabled:
                return

            total_time_taken = timer.elapsed_seconds()

            logger.info(
                f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
            )

            time_percentiles: Dict[str, float] = {}

            if len(self.profiling_times_taken) > 0:
                percentiles = [50, 75, 95, 99]
                percentile_values = stats.calculate_percentiles(
                    self.profiling_times_taken, percentiles)

                time_percentiles = {
                    f"table_time_taken_p{percentile}":
                    10**int(log10(percentile_values[percentile] + 1))
                    for percentile in percentiles
                }

            telemetry.telemetry_instance.ping(
                "data_lake_profiling_summary",
                # bucket by taking floor of log of time taken
                {
                    "total_time_taken": 10**int(log10(total_time_taken + 1)),
                    "count": 10**int(
                        log10(len(self.profiling_times_taken) + 1)),
                    "platform": self.source_config.platform,
                    **time_percentiles,
                },
            )
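
Note: stats.calculate_percentiles is a DataHub utility; the call sites above imply it takes the sample list plus a list of percentiles and returns a mapping from percentile to value. A rough, self-contained stand-in using the nearest-rank method (the implementation here is our assumption, not DataHub's actual code):

from typing import Dict, List

def calculate_percentiles(values: List[float],
                          percentiles: List[int]) -> Dict[int, float]:
    # Nearest-rank percentile over the sorted sample.
    ordered = sorted(values)
    return {
        p: ordered[min(len(ordered) - 1, int(len(ordered) * p / 100))]
        for p in percentiles
    }

times = [0.4, 1.2, 2.5, 3.1, 8.0, 14.9, 31.0, 55.2]
print(calculate_percentiles(times, [50, 75, 95, 99]))
# {50: 8.0, 75: 31.0, 95: 55.2, 99: 55.2}
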
Code Example #3
    def generate_profiles(
        self, requests: List[GEProfilerRequest], max_workers: int
    ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
        with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
                max_workers=max_workers
        ) as async_executor, SQLAlchemyQueryCombiner(
                enabled=self.config.query_combiner_enabled,
                catch_exceptions=self.config.catch_exceptions,
                is_single_row_query_method=_is_single_row_query_method,
                serial_execution_fallback_enabled=True,
        ).activate() as query_combiner:
            max_workers = min(max_workers, len(requests))
            logger.info(
                f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
            )
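            # Temporarily patch two Great Expectations SqlAlchemyDataset methods
            # with DataHub's replacements while profiling runs.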
            with unittest.mock.patch(
                    "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
                    get_column_unique_count_patch,
            ):
                with unittest.mock.patch(
                        "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
                        _get_column_quantiles_bigquery_patch,
                ):
                    async_profiles = [
                        async_executor.submit(
                            self._generate_profile_from_request,
                            query_combiner,
                            request,
                        ) for request in requests
                    ]

                    # Avoid using as_completed so that the results are yielded in the
                    # same order as the requests.
                    # for async_profile in concurrent.futures.as_completed(async_profiles):
                    for async_profile in async_profiles:
                        yield async_profile.result()

                    total_time_taken = timer.elapsed_seconds()

                    logger.info(
                        f"Profiling {len(requests)} table(s) finished in {total_time_taken:.3f} seconds"
                    )

                    time_percentiles: Dict[str, float] = {}

                    if len(self.times_taken) > 0:
                        percentiles = [50, 75, 95, 99]
                        percentile_values = stats.calculate_percentiles(
                            self.times_taken, percentiles)

                        time_percentiles = {
                            f"table_time_taken_p{percentile}":
                            10**int(log10(percentile_values[percentile] + 1))
                            for percentile in percentiles
                        }

                    telemetry.telemetry_instance.ping(
                        "sql_profiling_summary",
                        # bucket by taking floor of log of time taken
                        {
                            "total_time_taken": 10**int(
                                log10(total_time_taken + 1)),
                            "count": 10**int(
                                log10(len(self.times_taken) + 1)),
                            "platform": self.platform,
                            **time_percentiles,
                        },
                    )

                    self.report.report_from_query_combiner(
                        query_combiner.report)
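
Note: iterating over the submitted futures in list order, rather than via concurrent.futures.as_completed, is what keeps the yielded profiles aligned with the incoming requests; the trade-off is head-of-line blocking on a slow table. A minimal sketch of the pattern with placeholder work (square is a hypothetical task, not DataHub code):

import concurrent.futures
import time

def square(n: int) -> int:
    time.sleep(0.01 * (5 - n))  # later submissions finish first
    return n * n

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(square, n) for n in range(5)]
    # Consuming the list in submission order (not completion order)
    # blocks on each future in turn but preserves request order.
    print([f.result() for f in futures])  # [0, 1, 4, 9, 16]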