Beispiel #1
0
    def load(self, record) -> None:
        """
        Write the metadata in `record` to the markdown file of its table.

        Creates the table stub (and any missing parent directories) if it does
        not exist yet, updates the stub with the information in `record`, and,
        when `self.tmp_manifest_path` is set, appends the table to the
        temporary manifest.

        :param record: metadata record to persist. `MetricValue` and
            `Watermark` records carry the table name in `record.table`; any
            other record carries it in `record.name`. Falsy records are
            ignored.
        :return: None
        """
        if not record:
            return

        # isinstance (rather than an exact type comparison) so that subclasses
        # of MetricValue/Watermark are also read through `.table`.
        if isinstance(record, (MetricValue, Watermark)):
            table = record.table
        else:
            table = record.name

        schema = record.schema
        cluster = record.cluster

        # TODO: In general, we should always use self.database_name, unless we
        # override the amundsen extractor and add subdirectories ...
        record_database = record.database
        if record_database and "/" in record_database:
            # A database containing "/" is taken from the record as-is.
            database = record_database
        else:  # ... so we have to do this.
            database = self.database_name or record_database

        if cluster == "None":  # edge case for Hive Metastore
            cluster = None

        table_file_path_base = get_table_file_path_base(
            database=database,
            cluster=cluster,
            schema=schema,
            table=table,
            base_directory=self.conf.get("base_directory"),
        )

        file_path = table_file_path_base + ".md"
        # Make sure the directory that will hold the stub exists.
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)

        if not os.path.exists(file_path):
            create_base_table_stub(
                file_path=file_path,
                database=database,
                cluster=cluster,
                schema=schema,
                table=table,
            )

        update_markdown(file_path, record)

        if self.tmp_manifest_path is not None:
            _append_to_temp_manifest(
                database=database,
                cluster=cluster,
                schema=schema,
                table=table,
                tmp_manifest_path=self.tmp_manifest_path,
            )
    def _get_extract_iter(self):
        """
        Yield metadata (and optionally stats) for every table that already has
        a markdown stub.

        Runs `self._sql_stmt_schemas` to list schemas, keeps those that pass
        the include/exclude filters, lists their tables with
        `show tables in <schema>`, and processes only the tables whose `.md`
        file exists at the path returned by `get_table_file_path_base`.

        :return: generator yielding, per processed table, the result of
            `get_table_metadata` (when table metadata is enabled) followed by
            the items produced by `get_stats` (when stats are enabled).
        """
        schemas = self.execute(self._sql_stmt_schemas)
        for schema_row in schemas:
            schema = schema_row[0]
            LOGGER.info('Fetching all tables in {}.'.format(schema))

            if schema in self._excluded_schemas:
                continue
            # An empty include-list means "no restriction"; check emptiness
            # first so that a missing (None) include-list is tolerated.
            if self._included_schemas \
                    and schema not in self._included_schemas:
                continue

            # Prefix the schema with the cluster only when a cluster is set.
            full_schema_address = \
                '.'.join(filter(None, [self._cluster, schema]))
            # Materialized so the table count is known for progress logging.
            tables = list(
                self.execute(
                    'show tables in {}'.format(full_schema_address)))
            n_tables = len(tables)
            LOGGER.info('There are {} tables in {}.'.format(
                n_tables, schema))

            for i, table_row in enumerate(tables):
                # Log progress every 10th table and on the last one.
                if (i % 10 == 0) or (i == n_tables - 1):
                    LOGGER.info('On table {} of {}'.format(
                        i + 1, n_tables))
                table = table_row[0]
                file_name = get_table_file_path_base(
                    database=self._database,
                    cluster=self._cluster,
                    schema=schema,
                    table=table,
                )

                # Only update if the stub already exists
                if not os.path.exists(file_name + '.md'):
                    LOGGER.info(
                        'Skipping {}.{} because the stub file does not exist.'.
                        format(schema, table))
                    continue

                if self._is_table_metadata_enabled:
                    table_metadata = \
                        self.get_table_metadata(
                            schema,
                            table,
                            cluster=self._cluster,
                            is_view_query_enabled=self._is_view_query_enabled)  # noqa: E501
                    yield table_metadata

                if self._is_analyze_enabled:
                    # Return value is not used here.
                    self.get_analyze(schema, table, self._cluster)

                if self._is_stats_enabled:
                    yield from self.get_stats(schema, table, self._cluster)
Beispiel #3
0
    def load(self, record) -> None:
        """
        Persist `record` into the markdown file of the table it describes.

        The table stub and its parent directories are created when missing,
        the stub is then updated with the information in `record`, and the
        table is appended to the temporary manifest when
        `self.tmp_manifest_path` is set.

        :param record: metadata record to persist. The table name is read
            from `record.table` for `MetricValue`/`Watermark` records and from
            `record.name` otherwise. Falsy records are ignored.
        :return: None
        """
        if not record:
            return

        # isinstance instead of an exact type match, so subclasses of
        # MetricValue/Watermark are handled like their parents.
        is_table_scoped = isinstance(record, (MetricValue, Watermark))
        table = record.table if is_table_scoped else record.name

        schema = record.schema
        # The configured database name takes precedence over the record's.
        database = self.database_name or record.database

        # edge case for Hive Metastore: the cluster arrives as the string
        # "None" and is mapped to a real None.
        cluster = None if record.cluster == "None" else record.cluster

        table_file_path_base = get_table_file_path_base(
            database=database,
            cluster=cluster,
            schema=schema,
            table=table,
            base_directory=self.conf.get('base_directory'))

        file_path = table_file_path_base + '.md'
        # Ensure the directory that will contain the stub exists.
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)

        if not os.path.exists(file_path):
            create_base_table_stub(file_path=file_path,
                                   database=database,
                                   cluster=cluster,
                                   schema=schema,
                                   table=table)

        update_markdown(file_path, record)

        if self.tmp_manifest_path is not None:
            _append_to_temp_manifest(database=database,
                                     cluster=cluster,
                                     schema=schema,
                                     table=table,
                                     tmp_manifest_path=self.tmp_manifest_path)