def _get_table_watermarks(
            self, entity: EntityUniqueAttribute) -> List[Watermark]:
        partition_value_format = '%Y-%m-%d %H:%M:%S'

        _partitions = entity.get('relationshipAttributes',
                                 dict()).get('partitions', list())

        names = [
            _partition.get('displayText') for _partition in _partitions
            if _partition.get('entityStatus') == Status.ACTIVE
            and _partition.get('relationshipStatus') == Status.ACTIVE
        ]

        if not names:
            return []

        partition_key = AtlasProxy._render_partition_key_name(entity)
        watermark_date_format = AtlasProxy._select_watermark_format(names)

        partitions = {}

        for _partition in _partitions:
            partition_name = _partition.get('displayText')
            if partition_name and watermark_date_format:
                partition_date, _ = AtlasProxy._validate_date(
                    partition_name, watermark_date_format)

                if partition_date:
                    common_values = {
                        'partition_value':
                        datetime.datetime.strftime(partition_date,
                                                   partition_value_format),
                        'create_time':
                        0,
                        'partition_key':
                        partition_key
                    }

                    partitions[partition_date] = common_values

        if partitions:
            low_watermark_date = min(partitions.keys())
            high_watermark_date = max(partitions.keys())

            low_watermark = Watermark(watermark_type='low_watermark',
                                      **partitions.get(low_watermark_date))
            high_watermark = Watermark(watermark_type='high_watermark',
                                       **partitions.get(high_watermark_date))

            return [low_watermark, high_watermark]
        else:
            return []
Example #2
0
    def test_get_table_view_only(self) -> None:
        col_usage_return_value = copy.deepcopy(self.col_usage_return_value)
        for col in col_usage_return_value:
            col['tbl']['is_view'] = True

        with patch.object(GraphDatabase, 'driver'), patch.object(Neo4jProxy, '_execute_cypher_query') as mock_execute:
            mock_execute.side_effect = [col_usage_return_value, [], self.table_level_return_value]

            neo4j_proxy = Neo4jProxy(host='DOES_NOT_MATTER', port=0000)
            table = neo4j_proxy.get_table(table_uri='dummy_uri')

            expected = Table(database='hive', cluster='gold', schema='foo_schema', name='foo_table',
                             tags=[Tag(tag_name='test', tag_type='default')],
                             badges=[Badge(badge_name='golden', category='table_status')],
                             table_readers=[], description='foo description',
                             watermarks=[Watermark(watermark_type='high_watermark',
                                                   partition_key='ds',
                                                   partition_value='fake_value',
                                                   create_time='fake_time'),
                                         Watermark(watermark_type='low_watermark',
                                                   partition_key='ds',
                                                   partition_value='fake_value',
                                                   create_time='fake_time')],
                             columns=[Column(name='bar_id_1', description='bar col description', col_type='varchar',
                                             sort_order=0, stats=[Stat(start_epoch=1,
                                                                       end_epoch=1,
                                                                       stat_type='avg',
                                                                       stat_val='1')], badges=[]),
                                      Column(name='bar_id_2', description='bar col2 description', col_type='bigint',
                                             sort_order=1, stats=[Stat(start_epoch=2,
                                                                       end_epoch=2,
                                                                       stat_type='avg',
                                                                       stat_val='2')],
                                             badges=[Badge(badge_name='primary key', category='column')])],
                             owners=[User(email='*****@*****.**')],
                             table_writer=Application(application_url=self.table_writer['application_url'],
                                                      description=self.table_writer['description'],
                                                      name=self.table_writer['name'],
                                                      id=self.table_writer['id']),
                             last_updated_timestamp=1,
                             source=Source(source='/source_file_loc',
                                           source_type='github'),
                             is_view=True,
                             programmatic_descriptions=[
                                 ProgrammaticDescription(source='quality_report',
                                                         text='Test Test'),
                                 ProgrammaticDescription(source='s3_crawler',
                                                         text='Test Test Test')
                             ])

            self.assertEqual(str(expected), str(table))
Example #3
0
    def test_get_table(self, mock_rds_client: Any) -> None:
        database = RDSDatabase(name='hive')
        cluster = RDSCluster(name='gold')
        schema = RDSSchema(name='foo_schema')
        schema.cluster = cluster
        cluster.database = database

        table = RDSTable(name='foo_table')
        table.schema = schema
        table.description = RDSTableDescription(description='foo description')

        col1 = RDSColumn(name='bar_id_1', type='varchar', sort_order=0)
        col1.description = RDSColumnDescription(
            description='bar col description')
        col1.stats = [
            RDSColumnStat(stat_type='avg',
                          start_epoch='1',
                          end_epoch='1',
                          stat_val='1')
        ]

        col2 = RDSColumn(name='bar_id_2', type='bigint', sort_order=1)
        col2.description = RDSColumnDescription(
            description='bar col2 description')
        col2.stats = [
            RDSColumnStat(stat_type='avg',
                          start_epoch='2',
                          end_epoch='2',
                          stat_val='2')
        ]
        col2.badges = [RDSBadge(rk='primary key', category='column')]
        columns = [col1, col2]

        table.watermarks = [
            RDSTableWatermark(
                rk='hive://gold.test_schema/test_table/high_watermark/',
                partition_key='ds',
                partition_value='fake_value',
                create_time='fake_time'),
            RDSTableWatermark(
                rk='hive://gold.test_schema/test_table/low_watermark/',
                partition_key='ds',
                partition_value='fake_value',
                create_time='fake_time')
        ]

        table.application = RDSApplication(
            application_url='airflow_host/admin/airflow/tree?dag_id=test_table',
            description='DAG generating a table',
            name='Airflow',
            id='dag/task_id')
        table.timestamp = RDSTableTimestamp(last_updated_timestamp=1)

        table.owners = [
            RDSUser(rk='*****@*****.**', email='*****@*****.**')
        ]
        table.tags = [RDSTag(rk='test', tag_type='default')]
        table.badges = [RDSBadge(rk='golden', category='table_status')]
        table.source = RDSTableSource(rk='some key',
                                      source_type='github',
                                      source='/source_file_loc')
        table.programmatic_descriptions = [
            RDSTableProgrammaticDescription(description_source='s3_crawler',
                                            description='Test Test Test'),
            RDSTableProgrammaticDescription(
                description_source='quality_report', description='Test Test')
        ]

        readers = [RDSTableUsage(user_rk='*****@*****.**', read_count=5)]

        mock_client = MagicMock()
        mock_rds_client.return_value = mock_client

        mock_create_session = MagicMock()
        mock_client.create_session.return_value = mock_create_session

        mock_session = MagicMock()
        mock_create_session.__enter__.return_value = mock_session

        mock_session_query = MagicMock()
        mock_session.query.return_value = mock_session_query

        mock_session_query_filter = MagicMock()
        mock_session_query.filter.return_value = mock_session_query_filter
        mock_session_query_filter.first.return_value = table

        mock_session_query_filter_orderby = MagicMock()
        mock_session_query_filter.order_by.return_value = mock_session_query_filter_orderby

        mock_session_query_filter_orderby_limit = MagicMock()
        mock_session_query_filter_orderby.limit.return_value = mock_session_query_filter_orderby_limit
        mock_session_query_filter_orderby_limit.all.return_value = readers

        mock_session_query_filter_options = MagicMock()
        mock_session_query_filter.options.return_value = mock_session_query_filter_options
        mock_session_query_filter_options.all.return_value = columns

        proxy = MySQLProxy()
        actual_table = proxy.get_table(table_uri='dummy_uri')

        expected = Table(
            database='hive',
            cluster='gold',
            schema='foo_schema',
            name='foo_table',
            tags=[Tag(tag_name='test', tag_type='default')],
            badges=[Badge(badge_name='golden', category='table_status')],
            table_readers=[
                Reader(user=User(email='*****@*****.**'), read_count=5)
            ],
            description='foo description',
            watermarks=[
                Watermark(watermark_type='high_watermark',
                          partition_key='ds',
                          partition_value='fake_value',
                          create_time='fake_time'),
                Watermark(watermark_type='low_watermark',
                          partition_key='ds',
                          partition_value='fake_value',
                          create_time='fake_time')
            ],
            columns=[
                Column(name='bar_id_1',
                       description='bar col description',
                       col_type='varchar',
                       sort_order=0,
                       stats=[
                           Stat(start_epoch=1,
                                end_epoch=1,
                                stat_type='avg',
                                stat_val='1')
                       ],
                       badges=[]),
                Column(name='bar_id_2',
                       description='bar col2 description',
                       col_type='bigint',
                       sort_order=1,
                       stats=[
                           Stat(start_epoch=2,
                                end_epoch=2,
                                stat_type='avg',
                                stat_val='2')
                       ],
                       badges=[
                           Badge(badge_name='primary key', category='column')
                       ])
            ],
            owners=[User(email='*****@*****.**')],
            table_writer=Application(
                application_url=
                'airflow_host/admin/airflow/tree?dag_id=test_table',
                description='DAG generating a table',
                name='Airflow',
                id='dag/task_id'),
            last_updated_timestamp=1,
            source=Source(source='/source_file_loc', source_type='github'),
            is_view=False,
            programmatic_descriptions=[
                ProgrammaticDescription(source='quality_report',
                                        text='Test Test'),
                ProgrammaticDescription(source='s3_crawler',
                                        text='Test Test Test')
            ])

        self.assertEqual(str(expected), str(actual_table))
    def _exec_table_query(self, table_uri: str) -> Tuple:
        """
        Queries one Cypher record with watermark list, Application,
        ,timestamp, owner records and tag records.
        """

        # Return Value: (Watermark Results, Table Writer, Last Updated Timestamp, owner records, tag records)

        table_level_query = textwrap.dedent("""\
        MATCH (tbl:Table {key: $tbl_key})
        OPTIONAL MATCH (wmk:Watermark)-[:BELONG_TO_TABLE]->(tbl)
        OPTIONAL MATCH (application:Application)-[:GENERATES]->(tbl)
        OPTIONAL MATCH (tbl)-[:LAST_UPDATED_AT]->(t:Timestamp)
        OPTIONAL MATCH (owner:User)<-[:OWNER]-(tbl)
        OPTIONAL MATCH (tbl)-[:TAGGED_BY]->(tag:Tag{tag_type: $tag_normal_type})
        OPTIONAL MATCH (tbl)-[:TAGGED_BY]->(badge:Tag{tag_type: $tag_badge_type})
        OPTIONAL MATCH (tbl)-[:SOURCE]->(src:Source)
        OPTIONAL MATCH (tbl)-[:DESCRIPTION]->(prog_descriptions:Programmatic_Description)
        RETURN collect(distinct wmk) as wmk_records,
        application,
        t.last_updated_timestamp as last_updated_timestamp,
        collect(distinct owner) as owner_records,
        collect(distinct tag) as tag_records,
        collect(distinct badge) as badge_records,
        src,
        collect(distinct prog_descriptions) as prog_descriptions
        """)

        table_records = self._execute_cypher_query(statement=table_level_query,
                                                   param_dict={'tbl_key': table_uri,
                                                               'tag_normal_type': 'default',
                                                               'tag_badge_type': 'badge'})

        table_records = table_records.single()

        wmk_results = []
        table_writer = None

        wmk_records = table_records['wmk_records']

        for record in wmk_records:
            if record['key'] is not None:
                watermark_type = record['key'].split('/')[-2]
                wmk_result = Watermark(watermark_type=watermark_type,
                                       partition_key=record['partition_key'],
                                       partition_value=record['partition_value'],
                                       create_time=record['create_time'])
                wmk_results.append(wmk_result)

        tags = []
        if table_records.get('tag_records'):
            tag_records = table_records['tag_records']
            for record in tag_records:
                tag_result = Tag(tag_name=record['key'],
                                 tag_type=record['tag_type'])
                tags.append(tag_result)

        badges = []
        if table_records.get('badge_records'):
            badge_records = table_records['badge_records']
            for record in badge_records:
                badge_result = Tag(tag_name=record['key'],
                                   tag_type=record['tag_type'])
                badges.append(badge_result)

        application_record = table_records['application']
        if application_record is not None:
            table_writer = Application(
                application_url=application_record['application_url'],
                description=application_record['description'],
                name=application_record['name'],
                id=application_record.get('id', '')
            )

        timestamp_value = table_records['last_updated_timestamp']

        owner_record = []

        for owner in table_records.get('owner_records', []):
            owner_record.append(User(email=owner['email']))

        src = None

        if table_records['src']:
            src = Source(source_type=table_records['src']['source_type'],
                         source=table_records['src']['source'])

        prog_descriptions = self._extract_programmatic_descriptions_from_query(
            table_records.get('prog_descriptions', [])
        )

        return wmk_results, table_writer, timestamp_value, owner_record, tags, src, badges, prog_descriptions
Example #5
0
    def _get_table_watermarks(
            self, entity: EntityUniqueAttribute) -> List[Watermark]:
        partition_value_format = '%Y-%m-%d %H:%M:%S'

        _partitions = entity.get('relationshipAttributes',
                                 dict()).get('partitions', list())

        guids = [
            _partition.get('guid') for _partition in _partitions
            if _partition.get('entityStatus') == Status.ACTIVE
            and _partition.get('relationshipStatus') == Status.ACTIVE
        ]

        if not guids:
            return []

        partition_key = AtlasProxy._render_partition_key_name(entity)

        full_partitions = extract_entities(
            self._driver.entity_bulk(guid=list(guids),
                                     ignoreRelationships=True))
        watermark_date_format = AtlasProxy._select_watermark_format(
            [p.attributes.get('name') for p in full_partitions])

        partitions = {}

        for partition in full_partitions:
            partition_name = partition.attributes.get('name')

            if partition_name and watermark_date_format:
                partition_date, _ = AtlasProxy._validate_date(
                    partition_name, watermark_date_format)

                if partition_date:
                    _partition_create_time = self._parse_date(
                        partition.createTime) or 0.0

                    partition_create_time = datetime.datetime.fromtimestamp(
                        _partition_create_time).strftime(
                            partition_value_format)

                    common_values = {
                        'partition_value':
                        datetime.datetime.strftime(partition_date,
                                                   partition_value_format),
                        'create_time':
                        partition_create_time,
                        'partition_key':
                        partition_key
                    }

                    partitions[partition_date] = common_values

        if partitions:
            low_watermark_date = min(partitions.keys())
            high_watermark_date = max(partitions.keys())

            low_watermark = Watermark(watermark_type='low_watermark',
                                      **partitions.get(low_watermark_date))
            high_watermark = Watermark(watermark_type='high_watermark',
                                       **partitions.get(high_watermark_date))

            return [low_watermark, high_watermark]
        else:
            return []
    def test_get_table(self) -> None:
        with patch.object(GraphDatabase, 'driver'), patch.object(
                Neo4jProxy, '_execute_cypher_query') as mock_execute:
            mock_execute.side_effect = [
                self.col_usage_return_value, [], self.table_level_return_value
            ]

            neo4j_proxy = Neo4jProxy(host='DOES_NOT_MATTER', port=0000)
            table = neo4j_proxy.get_table(table_uri='dummy_uri')

            expected = Table(
                database='hive',
                cluster='gold',
                schema='foo_schema',
                name='foo_table',
                tags=[Tag(tag_name='test', tag_type='default')],
                badges=[Tag(tag_name='golden', tag_type='badge')],
                table_readers=[],
                description='foo description',
                watermarks=[
                    Watermark(watermark_type='high_watermark',
                              partition_key='ds',
                              partition_value='fake_value',
                              create_time='fake_time'),
                    Watermark(watermark_type='low_watermark',
                              partition_key='ds',
                              partition_value='fake_value',
                              create_time='fake_time')
                ],
                columns=[
                    Column(name='bar_id_1',
                           description='bar col description',
                           col_type='varchar',
                           sort_order=0,
                           stats=[
                               Statistics(start_epoch=1,
                                          end_epoch=1,
                                          stat_type='avg',
                                          stat_val='1')
                           ]),
                    Column(name='bar_id_2',
                           description='bar col2 description',
                           col_type='bigint',
                           sort_order=1,
                           stats=[
                               Statistics(start_epoch=2,
                                          end_epoch=2,
                                          stat_type='avg',
                                          stat_val='2')
                           ])
                ],
                owners=[User(email='*****@*****.**')],
                table_writer=Application(
                    application_url=self.table_writer['application_url'],
                    description=self.table_writer['description'],
                    name=self.table_writer['name'],
                    id=self.table_writer['id']),
                last_updated_timestamp=1,
                source=Source(source='/source_file_loc', source_type='github'),
                is_view=False)

            self.assertEqual(str(expected), str(table))