Example 1
    def __init__(
        self,
        name: str,
        description: Union[str, None],
        col_type: str,
        sort_order: int,
        badges: Union[List[str], None] = None,
    ) -> None:
        """
        TODO: Add stats
        :param name:
        :param description:
        :param col_type:
        :param sort_order:
        :param badges: Optional. Column level badges
        """
        self.name = name
        self.description = DescriptionMetadata.create_description_metadata(
            source=None, text=description)
        self.type = col_type
        self.sort_order = sort_order
        formatted_badges = _format_as_list(badges)
        self.badges = [Badge(badge, 'column') for badge in formatted_badges]

        # The following fields are populated by the ComplexTypeTransformer
        self._column_key: Optional[str] = None
        self._type_metadata: Optional[TypeMetadata] = None
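A minimal usage sketch for this constructor. The class name (ColumnMetadata), the import paths, and the example values are assumptions inferred from Examples 3 and 7; the snippet above is the ground truth for the parameter list:

    from databuilder.models.badge import Badge
    from databuilder.models.table_metadata import ColumnMetadata

    # Badges are passed as plain strings and wrapped into column-level Badge objects.
    column = ColumnMetadata(
        name='user_id',                            # hypothetical column name
        description='Primary key of the table',   # hypothetical description
        col_type='bigint',
        sort_order=0,
        badges=['pk'],
    )
    # column.badges == [Badge('pk', 'column')], matching the assertion in Example 3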
Example 2
    def _load_csv(self) -> None:
        with open(self.badge_file_location, 'r') as fin:
            self.badges = [dict(i) for i in csv.DictReader(fin)]
        # print("BADGES: " + str(self.badges))

        parsed_badges = defaultdict(list)
        for badge_dict in self.badges:
            db = badge_dict['database']
            cluster = badge_dict['cluster']
            schema = badge_dict['schema']
            table_name = badge_dict['table_name']
            id = self._get_key(db, cluster, schema, table_name)
            badge = Badge(name=badge_dict['name'],
                          category=badge_dict['category'])
            parsed_badges[id].append(badge)

        with open(self.table_file_location, 'r') as fin:
            tables = [dict(i) for i in csv.DictReader(fin)]

        results = []
        for table_dict in tables:
            db = table_dict['database']
            cluster = table_dict['cluster']
            schema = table_dict['schema']
            table_name = table_dict['name']
            id = self._get_key(db, cluster, schema, table_name)
            badges = parsed_badges[id]

            if badges is None:
                badges = []
            badge_metadata = BadgeMetadata(start_label=TableMetadata.TABLE_NODE_LABEL,
                                           start_key=id,
                                           badges=badges)
            results.append(badge_metadata)
        self._iter = iter(results)
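Based only on the dictionary keys accessed above, the badge CSV this loader reads needs roughly the following layout; the file contents and values shown here are hypothetical:

    # badge_file_location contents (header row plus one row per badge):
    #
    #   database,cluster,schema,table_name,name,category
    #   hive,gold,test_schema,test_table1,pk,column
    #   hive,gold,test_schema,test_table1,pii,column
    #
    # Each row becomes Badge(name=..., category=...) and is grouped under the
    # table key produced by self._get_key(db, cluster, schema, table_name).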
Example 3
    def test_extraction_of_tablecolumn_badges(self) -> None:
        """
        Test Extraction using the combined CsvTableModel model class
        """
        config_dict = {
            f'extractor.csvtablecolumn.{CsvTableColumnExtractor.TABLE_FILE_LOCATION}':
            'example/sample_data/sample_table.csv',
            f'extractor.csvtablecolumn.{CsvTableColumnExtractor.COLUMN_FILE_LOCATION}':
            'example/sample_data/sample_col.csv',
        }
        self.conf = ConfigFactory.from_dict(config_dict)

        extractor = CsvTableColumnExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertEqual(result.name, 'test_table1')
        self.assertEqual(result.columns[0].badges, [Badge('pk', 'column')])
        self.assertEqual(result.columns[1].badges, [Badge('pii', 'column')])
Example 4
    def __init__(self,
                 name: str,
                 description: Union[str, None],
                 col_type: str,
                 sort_order: int,
                 badges: Union[List[str], None] = None,
                 ) -> None:
        """
        TODO: Add stats
        :param name:
        :param description:
        :param col_type:
        :param sort_order:
        :param badges: Optional. Column level badges
        """
        self.name = name
        self.description = DescriptionMetadata.create_description_metadata(
            source=None, text=description)
        self.type = col_type
        self.sort_order = sort_order
        formatted_badges = _format_as_list(badges)
        self.badges = [Badge(badge, 'column') for badge in formatted_badges]
Example 5
    def __init__(self,
                 name: str,
                 description: Union[str, None],
                 col_type: str,
                 sort_order: int,
                 badges: Union[List[str], None] = None
                 ) -> None:
        """
        TODO: Add stats
        :param name:
        :param description:
        :param col_type:
        :param sort_order:
        :param badges: Optional. Column level badges
        """
        self.name = name
        self.description = DescriptionMetadata.create_description_metadata(
            source=None, text=description)
        self.type = col_type
        self.sort_order = sort_order
        if badges:
            self.badges = [Badge(badge, 'column') for badge in badges]
        else:
            self.badges = []
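The _format_as_list helper called in Examples 1 and 4 is not shown in this listing. A minimal sketch of what it plausibly does, inferred from the equivalent if/else branch in Example 5; the real helper may handle additional cases:

    def _format_as_list(badges):
        # Normalize the optional badges argument into a list:
        # None -> [], a single string -> [that string], an existing list is kept.
        if badges is None:
            return []
        if isinstance(badges, str):
            return [badges]
        return list(badges)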
Example 6
from databuilder.serializers.neptune_serializer import (
    NEPTUNE_CREATION_TYPE_JOB,
    NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT,
    NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT,
    NEPTUNE_HEADER_ID,
    NEPTUNE_HEADER_LABEL,
    NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT,
    NEPTUNE_RELATIONSHIP_HEADER_FROM,
    NEPTUNE_RELATIONSHIP_HEADER_TO,
)

db = 'hive'
SCHEMA = 'BASE'
TABLE = 'TEST'
CLUSTER = 'DEFAULT'
badge1 = Badge('badge1', 'column')
badge2 = Badge('badge2', 'column')


class TestBadge(unittest.TestCase):
    def setUp(self) -> None:
        super(TestBadge, self).setUp()
        self.badge_metadata = BadgeMetadata(
            start_label='Column',
            start_key='hive://default.base/test/ds',
            badges=[badge1, badge2])

    def test_get_badge_key(self) -> None:
        badge_key = self.badge_metadata.get_badge_key(badge1.name)
        self.assertEqual(badge_key, badge1.name)
Example 7
    def _get_extract_iter(self) -> Iterator[Union[TableMetadata, BadgeMetadata, TableSource, TableLineage]]:
        """
        Generates the extract iterator for all of the model types created by the dbt files.
        """
        dbt_id_to_table_key = {}
        for tbl_node, manifest_content in self._dbt_manifest['nodes'].items():

            if manifest_content['resource_type'] == DBT_MODEL_TYPE and tbl_node in self._dbt_catalog['nodes']:
                LOGGER.info(
                    'Extracting dbt {}.{}'.format(manifest_content['schema'], manifest_content[self._model_name_key])
                )

                catalog_content = self._dbt_catalog['nodes'][tbl_node]

                tbl_columns: List[ColumnMetadata] = self._get_column_values(
                    manifest_columns=manifest_content['columns'], catalog_columns=catalog_content['columns']
                )

                desc, desc_src = self._get_table_descriptions(manifest_content)
                tags, tbl_badges = self._get_table_tags_badges(manifest_content)

                tbl_metadata = TableMetadata(
                    database=self._default_sanitize(self._database_name),
                    # The dbt "database" is the cluster here
                    cluster=self._default_sanitize(manifest_content['database']),
                    schema=self._default_sanitize(manifest_content['schema']),
                    name=self._default_sanitize(manifest_content[self._model_name_key]),
                    is_view=catalog_content['metadata']['type'] == 'view',
                    columns=tbl_columns,
                    tags=tags,
                    description=desc,
                    description_source=desc_src
                )
                # Keep track for Lineage
                dbt_id_to_table_key[tbl_node] = tbl_metadata._get_table_key()

                # Optionally filter schemas in the output
                yield_schema = self._can_yield_schema(manifest_content['schema'])

                if self._extract_tables and yield_schema:
                    yield tbl_metadata

                if self._extract_tags and tbl_badges and yield_schema:
                    yield BadgeMetadata(start_label=TableMetadata.TABLE_NODE_LABEL,
                                        start_key=tbl_metadata._get_table_key(),
                                        badges=[Badge(badge, 'table') for badge in tbl_badges])

                if self._source_url and yield_schema:
                    yield TableSource(db_name=tbl_metadata.database,
                                      cluster=tbl_metadata.cluster,
                                      schema=tbl_metadata.schema,
                                      table_name=tbl_metadata.name,
                                      source=os.path.join(self._source_url, manifest_content.get('original_file_path')))

        if self._extract_lineage:
            for upstream, downstreams in self._dbt_manifest['child_map'].items():
                if upstream not in dbt_id_to_table_key:
                    continue
                valid_downstreams = [
                    dbt_id_to_table_key[k] for k in downstreams
                    if k.startswith(DBT_MODEL_PREFIX) and dbt_id_to_table_key.get(k)
                ]
                if valid_downstreams:
                    yield TableLineage(
                        table_key=dbt_id_to_table_key[upstream],
                        downstream_deps=valid_downstreams
                    )
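For orientation, a minimal sketch of the manifest and catalog shapes this loop reads. Only the keys accessed above are shown, the node id and values are hypothetical, and real dbt artifacts (plus the description/tag helpers called above) use many more fields:

    dbt_manifest = {
        'nodes': {
            'model.project.my_table': {
                'resource_type': 'model',            # compared against DBT_MODEL_TYPE
                'database': 'dbt_demo',              # used as the cluster
                'schema': 'public',
                'name': 'my_table',                  # read via self._model_name_key
                'columns': {},
                'original_file_path': 'models/my_table.sql',
            },
        },
        'child_map': {'model.project.my_table': []},  # upstream id -> downstream ids
    }
    dbt_catalog = {
        'nodes': {
            'model.project.my_table': {
                'metadata': {'type': 'table'},        # 'view' marks the table as a view
                'columns': {},
            },
        },
    }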
Example 8
    def test_badge_name_category_are_lower_cases(self) -> None:
        uppercase_badge = Badge('BadGe3', 'COLUMN_3')
        self.assertEqual(uppercase_badge.name, 'badge3')
        self.assertEqual(uppercase_badge.category, 'column_3')
Example 9
    def test_extraction_with_model_class(self) -> None:
        """
        Test Extraction using model class
        """
        config_dict = {
            f'extractor.dbt.{DbtExtractor.DATABASE_NAME}': self.database_name,
            f'extractor.dbt.{DbtExtractor.CATALOG_JSON}':
            self.catalog_file_loc,
            f'extractor.dbt.{DbtExtractor.MANIFEST_JSON}': self.manifest_data,
            f'extractor.dbt.{DbtExtractor.SOURCE_URL}': self.source_url
        }
        self.conf = ConfigFactory.from_dict(config_dict)
        extractor = DbtExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=self.conf,
                                   scope=extractor.get_scope()))

        # One block of tests for each type of model created
        extracted_classes = []

        result = extractor.extract()
        self.assertTrue(isinstance(result, TableMetadata))
        self.assertEqual(result.name, 'fact_third_party_performance')
        self.assertEqual(
            result.description.text,
            'the performance for third party vendors loss rate by day.')
        self.assertEqual(result.database, self.database_name)
        self.assertEqual(result.cluster, 'dbt_demo')
        self.assertEqual(result.schema, 'public')
        self.assertEqual(result.tags, [])
        self.assertEqual(result.is_view, True)
        extracted_classes.append(TableMetadata)

        result2 = _extract_until_not_these(extractor, extracted_classes)
        self.assertTrue(isinstance(result2, TableSource))
        self.assertEqual(result2.db, self.database_name)
        self.assertEqual(result2.cluster, 'dbt_demo')
        self.assertEqual(result2.schema, 'public')
        self.assertEqual(result2.table, 'fact_third_party_performance')
        self.assertEqual(
            result2.source,
            'test_url/models/call_center/fact_third_party_performance.sql')
        extracted_classes.append(TableSource)

        result3 = _extract_until_not_these(extractor, extracted_classes)
        self.assertTrue(isinstance(result3, BadgeMetadata))
        self.assertEqual(
            result3.badges,
            [Badge('finance', 'table'),
             Badge('certified', 'table')])
        extracted_classes.append(BadgeMetadata)

        result4 = _extract_until_not_these(extractor, extracted_classes)
        self.assertTrue(isinstance(result4, TableLineage))
        self.assertEqual(result4.table_key,
                         'snowflake://dbt_demo.public/fact_catalog_returns')
        self.assertEqual(
            result4.downstream_deps,
            ['snowflake://dbt_demo.public/fact_third_party_performance'])
        extracted_classes.append(TableLineage)

        # Should not be any other unique models created
        result5 = _extract_until_not_these(extractor, extracted_classes)
        self.assertEqual(result5, None)
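The helper _extract_until_not_these used throughout this test is not shown in the listing. A plausible minimal sketch based purely on how it is called above; the actual helper may differ:

    def _extract_until_not_these(extractor, extracted_classes):
        # Keep pulling records until we reach a type that has not been
        # asserted on yet, or until the extractor is exhausted and returns None.
        record = extractor.extract()
        while record is not None and type(record) in extracted_classes:
            record = extractor.extract()
        return record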