Ejemplo n.º 1
0
    def test_get_readers(self) -> None:
        """_get_readers yields Readers, Users, or nothing depending on model."""
        bulk_result = MagicMock()
        bulk_result.entities = self.reader_entities
        self.proxy.client.entity.get_entities_by_guids = MagicMock(
            return_value=bulk_result)

        def make_entity() -> dict:
            # One active reader relationship, rebuilt fresh per invocation.
            return {
                'relationshipAttributes': {
                    'readers': [{
                        'guid': 1,
                        'entityStatus': 'ACTIVE',
                        'relationshipStatus': 'ACTIVE',
                    }]
                }
            }

        actual = self.proxy._get_readers(make_entity(), Reader, 1)
        self.assertEqual(
            [Reader(user=User(email='test_user_2', user_id='test_user_2'),
                    read_count=150)],
            actual)

        actual = self.proxy._get_readers(make_entity(), User, 1)
        self.assertEqual(
            [User(email='test_user_1', user_id='test_user_1')], actual)

        # An unsupported model falls through to an empty result.
        actual = self.proxy._get_readers(make_entity(), 'WRONG_MODEL', 1)
        self.assertEqual([], actual)
Ejemplo n.º 2
0
    def test_get_readers(self) -> None:
        """Both fixture readers should come back with their read counts."""
        search_result = MagicMock()
        search_result.entities = self.reader_entities
        self.proxy._driver.search_basic.create = MagicMock(
            return_value=search_result)

        bulk_result = MagicMock()
        bulk_result.entities = self.reader_entities
        self.proxy._driver.entity_bulk = MagicMock(
            return_value=[bulk_result])

        actual = self.proxy._get_readers('dummy', 1)

        expected = [
            Reader(user=User(email='test_user_1', user_id='test_user_1'),
                   read_count=5),
            Reader(user=User(email='test_user_2', user_id='test_user_2'),
                   read_count=150),
        ]

        self.assertEqual(actual, expected)
Ejemplo n.º 3
0
    def _get_table(self, custom_stats_format: bool = False) -> None:
        """Shared assertion helper: fetch a table via the proxy and compare it
        against an expected ``Table`` assembled from the test fixtures.

        :param custom_stats_format: when True, compare against the formatted
            column-stat fixture instead of the raw one.
        """
        if custom_stats_format:
            test_exp_col = self.test_exp_col_stats_formatted
        else:
            test_exp_col = self.test_exp_col_stats_raw
        ent_attrs = cast(dict, self.entity1['attributes'])
        self._mock_get_table_entity()
        self._create_mocked_report_entities_collection()
        # Stub collaborators so only get_table's assembly logic is exercised.
        self.proxy._get_owners = MagicMock(
            return_value=[User(email=ent_attrs['owner'])])  # type: ignore
        self.proxy._driver.entity_bulk = MagicMock(
            return_value=self.report_entity_collection)
        response = self.proxy.get_table(table_uri=self.table_uri)

        classif_name = self.classification_entity['classifications'][0][
            'typeName']

        col_attrs = cast(dict, self.test_column['attributes'])
        exp_col_stats = list()

        # Build the expected per-column statistics from the chosen fixture.
        for stats in test_exp_col:
            exp_col_stats.append(
                Stat(
                    stat_type=stats['attributes']['stat_name'],
                    stat_val=stats['attributes']['stat_val'],
                    start_epoch=stats['attributes']['start_epoch'],
                    end_epoch=stats['attributes']['end_epoch'],
                ))

        exp_col = Column(name=col_attrs['name'],
                         description='column description',
                         col_type='Managed',
                         sort_order=col_attrs['position'],
                         stats=exp_col_stats)
        expected = Table(
            database=self.entity_type,
            cluster=self.cluster,
            schema=self.db,
            name=ent_attrs['name'],
            tags=[Tag(tag_name=classif_name, tag_type="default")],
            description=ent_attrs['description'],
            owners=[User(email=ent_attrs['owner'])],
            resource_reports=[
                ResourceReport(name='test_report', url='http://test'),
                ResourceReport(name='test_report3', url='http://test3')
            ],
            # presumably updateTime is epoch millis; [:10] keeps the
            # seconds part — TODO confirm against the fixture.
            last_updated_timestamp=int(str(self.entity1['updateTime'])[:10]),
            columns=[exp_col] * self.active_columns,
            programmatic_descriptions=[
                ProgrammaticDescription(source='test parameter key a',
                                        text='testParameterValueA'),
                ProgrammaticDescription(source='test parameter key b',
                                        text='testParameterValueB')
            ],
            is_view=False)

        # Compare string forms to sidestep nested equality quirks.
        self.assertEqual(str(expected), str(response))
    def _get_owners(self, data_owners: list,
                    fallback_owner: str) -> List[User]:
        """Resolve the active owners of an entity into ``User`` objects.

        Only owners whose entity status and relationship status are both
        ACTIVE are considered. When none qualify, a single User built from
        ``fallback_owner`` is returned instead.
        """
        resolved = []

        for entry in data_owners:
            # Skip soft-deleted owners and stale relationships.
            if (entry['entityStatus'] != Status.ACTIVE
                    or entry['relationshipStatus'] != Status.ACTIVE):
                continue
            details = self._get_user_details(entry['displayText'])
            resolved.append(User(**details))

        if resolved:
            return resolved
        return [User(email=fallback_owner, user_id=fallback_owner)]
    def _get_readers(self, entity: AtlasEntityWithExtInfo, top: Optional[int] = 15) -> List[Reader]:
        """Collect the most frequent readers of an Atlas entity.

        Keeps only reader relationships that are ACTIVE on both ends, fetches
        the reader entities in bulk, drops any below the configured minimum
        read count, and returns at most ``top`` readers sorted by read count
        in descending order.
        """
        relationship_attrs = entity.get('relationshipAttributes', dict())

        guids = []
        for rel in relationship_attrs.get('readers', list()):
            if (rel.get('entityStatus', 'INACTIVE') == Status.ACTIVE
                    and rel.get('relationshipStatus', 'INACTIVE') == Status.ACTIVE):
                guids.append(rel.get('guid'))

        if not guids:
            return []

        bulk = self.client.entity.get_entities_by_guids(guids=list(guids), ignore_relationships=False)

        minimum = int(app.config['POPULAR_TABLE_MINIMUM_READER_COUNT'])
        readers = []

        for reader_entity in bulk.entities or list():
            count = reader_entity.attributes['count']

            if count < minimum:
                continue

            user_qn = reader_entity.relationshipAttributes['user']['displayText']
            user_details = self._get_user_details(user_qn)
            readers.append(Reader(user=User(**user_details), read_count=count))

        readers.sort(key=attrgetter('read_count'), reverse=True)

        return readers[:top]
Ejemplo n.º 6
0
    def _get_readers(self,
                     qualified_name: str,
                     top: Optional[int] = 15) -> List[Reader]:
        """Search Atlas for the top readers of the table ``qualified_name``.

        Issues a basic search for reader entities whose qualified name starts
        with the table's, filtered to the configured minimum read count, then
        bulk-fetches those entities to resolve the reading users.
        """
        # Drop the cluster suffix ("...@cluster") and match reader entities
        # of this table by qualified-name prefix.
        prefix = qualified_name.split('@')[0] + '.'
        min_count = f'{app.config["POPULAR_TABLE_MINIMUM_READER_COUNT"]}'

        params = {
            'typeName': self.READER_TYPE,
            'offset': '0',
            'limit': top,
            'excludeDeletedEntities': True,
            'entityFilters': {
                'condition': 'AND',
                'criterion': [
                    {
                        'attributeName': self.QN_KEY,
                        'operator': 'STARTSWITH',
                        'attributeValue': prefix,
                    },
                    {
                        'attributeName': 'count',
                        'operator': 'gte',
                        'attributeValue': min_count,
                    },
                ],
            },
            'attributes': ['count', self.QN_KEY],
            'sortBy': 'count',
            'sortOrder': 'DESCENDING',
        }

        search_results = self._driver.search_basic.create(
            data=params, ignoreRelationships=False)

        guids = [record.guid for record in search_results.entities]

        results: List[Reader] = []

        if guids:
            read_entities = extract_entities(
                self._driver.entity_bulk(guid=guids,
                                         ignoreRelationships=False))

            for read_entity in read_entities:
                user_qn = read_entity.relationshipAttributes['user'][
                    'displayText']
                # Fall back to a minimal user built from the qualified name
                # when the detail hook yields nothing.
                user_details = self.user_detail_method(user_qn) or {
                    'email': user_qn,
                    'user_id': user_qn
                }
                results.append(
                    Reader(user=User(**user_details),
                           read_count=read_entity.attributes['count']))

        return results
Ejemplo n.º 7
0
    def test_get_table(self) -> None:
        """get_table should assemble a Table that matches the mocked fixtures."""
        self._mock_get_table_entity()
        response = self.proxy.get_table(table_uri=self.table_uri)

        classif_name = self.classification_entity['classifications'][0]['typeName']
        ent_attrs = cast(dict, self.entity1['attributes'])

        col_attrs = cast(dict, self.test_column['attributes'])
        col_metadata_attrs = cast(dict, self.column_metadata_entity['attributes'])
        exp_col_stats = list()

        # Expected column statistics come from the column-metadata fixture.
        for stats in col_metadata_attrs['statistics']:
            exp_col_stats.append(
                Statistics(
                    stat_type=stats['attributes']['stat_name'],
                    stat_val=stats['attributes']['stat_val'],
                    start_epoch=stats['attributes']['start_epoch'],
                    end_epoch=stats['attributes']['end_epoch'],
                )
            )
        exp_col = Column(name=col_attrs['name'],
                         description='column description',
                         col_type='Managed',
                         sort_order=col_attrs['position'],
                         stats=exp_col_stats)
        expected = Table(database=self.entity_type,
                         cluster=self.cluster,
                         schema=self.db,
                         name=ent_attrs['name'],
                         tags=[Tag(tag_name=classif_name, tag_type="default")],
                         description=ent_attrs['description'],
                         owners=[User(email=ent_attrs['owner'])],
                         columns=[exp_col],
                         last_updated_timestamp=cast(int, self.entity1['updateTime']))
        # Compare string forms to sidestep nested equality quirks.
        self.assertEqual(str(expected), str(response))
    def get_table(self, *, table_uri: str) -> Table:
        """
        Gathers all the information needed for the Table Detail Page.
        :param table_uri: identifier used to resolve the table entity
        :return: A Table object with all the information available
        or gathered from different entities.
        :raises BadRequest: when a required attribute is missing on the entity
        """
        entity = self._get_table_entity(table_uri=table_uri)
        table_details = entity.entity

        try:
            attrs = table_details[self.ATTRS_KEY]

            programmatic_descriptions = self._get_programmatic_descriptions(
                attrs.get('parameters'))

            # Cluster/db/table names are encoded in the qualified name.
            table_qn = parse_table_qualified_name(
                qualified_name=attrs.get(self.QN_KEY))

            tags = []
            # Using or in case, if the key 'classifications' is there with a None
            for classification in table_details.get(
                    "classifications") or list():
                tags.append(
                    Tag(tag_name=classification.get('typeName'),
                        tag_type="default"))

            columns = self._serialize_columns(entity=entity)

            reports_guids = [
                report.get("guid")
                for report in attrs.get("reports") or list()
            ]

            table = Table(
                database=table_details.get('typeName'),
                cluster=table_qn.get('cluster_name', ''),
                schema=table_qn.get('db_name', ''),
                # Fall back to the qualified-name-derived table name.
                name=attrs.get('name') or table_qn.get("table_name", ''),
                tags=tags,
                description=attrs.get('description') or attrs.get('comment'),
                owners=[User(email=attrs.get('owner'))],
                resource_reports=self._get_reports(guids=reports_guids),
                columns=columns,
                table_readers=self._get_readers(attrs.get(self.QN_KEY)),
                last_updated_timestamp=self._parse_date(
                    table_details.get('updateTime')),
                programmatic_descriptions=programmatic_descriptions)

            return table
        except KeyError as ex:
            # Missing required attributes surface as a client error rather
            # than a 500; the original exception is logged for diagnosis.
            LOGGER.exception(
                'Error while accessing table information. {}'.format(str(ex)))
            raise BadRequest(
                'Some of the required attributes '
                'are missing in : ( {table_uri} )'.format(table_uri=table_uri))
    def _get_owners(self, data_owners: list, fallback_owner: Optional[str] = None) -> List[User]:
        """Resolve active owners into ``User`` objects, optionally adding a fallback.

        :param data_owners: raw owner relationship dicts from the metadata store.
        :param fallback_owner: owner qualified name appended as an extra owner
            when it is not already among the active owners (avoids duplicates).
            Annotated Optional[str] — the previous ``str = None`` was an
            implicit-Optional, invalid under PEP 484.
        :return: list of Users; may be empty when there are no active owners
            and no fallback is given.
        """
        owners_detail: List[User] = []
        seen_owner_qns = set()  # O(1) duplicate check for the fallback owner

        for owner in data_owners:
            # Skip soft-deleted owners and stale relationships.
            if (owner['entityStatus'] != Status.ACTIVE
                    or owner['relationshipStatus'] != Status.ACTIVE):
                continue
            owner_qn = owner['displayText']
            owners_detail.append(User(**self._get_user_details(owner_qn)))
            seen_owner_qns.add(owner_qn)

        # To avoid duplication, only add the fallback when it is not already
        # present among the resolved active owners.
        if fallback_owner and fallback_owner not in seen_owner_qns:
            owners_detail.append(User(**self._get_user_details(fallback_owner)))

        return owners_detail
Ejemplo n.º 10
0
 def test_get_owners_details_only_fallback(self) -> None:
     """With no owner data, _get_owners returns only the fallback user."""
     self.app.config['USER_DETAIL_METHOD'] = None
     fallback = "*****@*****.**"
     owners = self.proxy._get_owners(data_owners=[], fallback_owner=fallback)
     self.assertEqual(1, len(owners))
     self.assertListEqual(owners, [User(email=fallback, user_id=fallback)])
Ejemplo n.º 11
0
    def test_get_table_view_only(self) -> None:
        """A table flagged as a view must surface is_view=True on the Table."""
        col_usage_return_value = copy.deepcopy(self.col_usage_return_value)
        for col in col_usage_return_value:
            col['tbl']['is_view'] = True

        with patch.object(GraphDatabase, 'driver'), patch.object(Neo4jProxy, '_execute_cypher_query') as mock_execute:
            # Queries fire in order: column usage, (empty) usage, table level.
            mock_execute.side_effect = [col_usage_return_value, [], self.table_level_return_value]

            neo4j_proxy = Neo4jProxy(host='DOES_NOT_MATTER', port=0000)
            table = neo4j_proxy.get_table(table_uri='dummy_uri')

            expected = Table(database='hive', cluster='gold', schema='foo_schema', name='foo_table',
                             tags=[Tag(tag_name='test', tag_type='default')],
                             badges=[Badge(badge_name='golden', category='table_status')],
                             table_readers=[], description='foo description',
                             watermarks=[Watermark(watermark_type='high_watermark',
                                                   partition_key='ds',
                                                   partition_value='fake_value',
                                                   create_time='fake_time'),
                                         Watermark(watermark_type='low_watermark',
                                                   partition_key='ds',
                                                   partition_value='fake_value',
                                                   create_time='fake_time')],
                             columns=[Column(name='bar_id_1', description='bar col description', col_type='varchar',
                                             sort_order=0, stats=[Stat(start_epoch=1,
                                                                       end_epoch=1,
                                                                       stat_type='avg',
                                                                       stat_val='1')], badges=[]),
                                      Column(name='bar_id_2', description='bar col2 description', col_type='bigint',
                                             sort_order=1, stats=[Stat(start_epoch=2,
                                                                       end_epoch=2,
                                                                       stat_type='avg',
                                                                       stat_val='2')],
                                             badges=[Badge(badge_name='primary key', category='column')])],
                             owners=[User(email='*****@*****.**')],
                             table_writer=Application(application_url=self.table_writer['application_url'],
                                                      description=self.table_writer['description'],
                                                      name=self.table_writer['name'],
                                                      id=self.table_writer['id']),
                             last_updated_timestamp=1,
                             source=Source(source='/source_file_loc',
                                           source_type='github'),
                             is_view=True,
                             programmatic_descriptions=[
                                 ProgrammaticDescription(source='quality_report',
                                                         text='Test Test'),
                                 ProgrammaticDescription(source='s3_crawler',
                                                         text='Test Test Test')
                             ])

            # Compare string forms to sidestep nested equality quirks.
            self.assertEqual(str(expected), str(table))
Ejemplo n.º 12
0
    def test_get_readers(self) -> None:
        """_get_readers keeps only readers above the minimum read count."""
        search_result = MagicMock()
        search_result.entities = self.reader_entities

        self.proxy._driver.search_basic.create = MagicMock(return_value=search_result)

        bulk_result = MagicMock()
        bulk_result.entities = self.reader_entities
        self.proxy._driver.entity_bulk = MagicMock(return_value=[bulk_result])

        entity = {
            'relationshipAttributes': {
                'readers': [
                    {'guid': 1, 'entityStatus': 'ACTIVE', 'relationshipStatus': 'ACTIVE'},
                ],
            },
        }
        actual = self.proxy._get_readers(entity, 1)

        expected = [Reader(user=User(email='test_user_2', user_id='test_user_2'), read_count=150)]

        self.assertEqual(expected, actual)
    def _exec_usage_query(self, table_uri: str) -> List[Reader]:
        """Fetch the top-5 readers of a table, ordered by read count."""
        usage_query = textwrap.dedent("""\
        MATCH (user:User)-[read:READ]->(table:Table {key: $tbl_key})
        RETURN user.email as email, read.read_count as read_count, table.name as table_name
        ORDER BY read.read_count DESC LIMIT 5;
        """)

        records = self._execute_cypher_query(statement=usage_query,
                                             param_dict={'tbl_key': table_uri})

        return [
            Reader(user=User(email=record['email']),
                   read_count=record['read_count'])
            for record in records
        ]
Ejemplo n.º 14
0
    def test_get_dashboard(self) -> None:
        """get_dashboard should map the Atlas fixtures to a DashboardDetail."""
        self.proxy.client.entity.get_entity_by_attribute = MagicMock(
            return_value=self.dashboard_data)  # type: ignore
        self.proxy._get_dashboard_group = MagicMock(
            return_value=self.dashboard_group_data)  # type: ignore
        self.proxy.client.entity.get_entities_by_guids = MagicMock(
            return_value=DottedDict({'entities': [DottedDict(self.entity1)]}))

        # Expected detail mirrors the dashboard/group/entity fixtures above.
        expected = DashboardDetail(
            uri='superset_dashboard://datalab.prod/1',
            cluster='datalab',
            group_name='prod superset',
            group_url='https://superset.prod',
            product='superset',
            name='Prod Usage',
            url='https://prod.superset/dashboards/1',
            description='Robs famous dashboard',
            created_timestamp=1619517099,
            updated_timestamp=1619626531,
            last_successful_run_timestamp=1619517099,
            last_run_timestamp=1619517150,
            last_run_state='failed',
            owners=[
                User(user_id='lisa_salinas',
                     email='lisa_salinas',
                     first_name=None,
                     last_name=None,
                     full_name=None,
                     display_name=None,
                     is_active=True,
                     github_username=None,
                     team_name=None,
                     slack_id=None,
                     employee_type=None,
                     manager_fullname=None,
                     manager_email=None,
                     manager_id=None,
                     role_name=None,
                     profile_url=None,
                     other_key_values={})
            ],
            frequent_users=[],
            chart_names=['Count Users by Time', 'Total Count'],
            query_names=['User Count By Time', 'Total Count'],
            queries=[
                DashboardQuery(
                    name='User Count By Time',
                    url='https://prod.superset/dashboards/1/query/1',
                    query_text='SELECT date, COUNT(1) FROM db.table GROUP BY 1'
                ),
                DashboardQuery(
                    name='Total Count',
                    url='https://prod.superset/dashboards/1/query/2',
                    query_text='SELECT COUNT(1) FROM db.table')
            ],
            tables=[
                PopularTable(database='hive_table',
                             cluster='TEST_CLUSTER',
                             schema='TEST_DB',
                             name='Table1',
                             description='Dummy Description')
            ],
            tags=[],
            badges=[],
            recent_view_count=0)

        result = self.proxy.get_dashboard(
            id='superset_dashboard://datalab.prod/1')

        self.assertEqual(expected, result)
Ejemplo n.º 15
0
    def test_get_dashboard(self) -> None:
        """Two scenarios: a fully-populated dashboard and a minimal one.

        The mocked cypher query returns a rich record first and a sparse
        record (empty collections, None fields) second; each must map to
        the corresponding DashboardDetail.
        """
        with patch.object(GraphDatabase, 'driver'), patch.object(
                Neo4jProxy, '_execute_cypher_query') as mock_execute:
            # First side_effect item: fully populated dashboard record.
            mock_execute.return_value.single.side_effect = [{
                'cluster_name':
                'cluster_name',
                'uri':
                'foo_dashboard://gold.bar/dashboard_id',
                'url':
                'http://www.foo.bar/dashboard_id',
                'product':
                'foobar',
                'name':
                'dashboard name',
                'created_timestamp':
                123456789,
                'description':
                'description',
                'group_name':
                'group_name',
                'group_url':
                'http://www.group_url.com',
                'last_successful_run_timestamp':
                9876543210,
                'last_run_timestamp':
                987654321,
                'last_run_state':
                'good_state',
                'updated_timestamp':
                123456654321,
                'recent_view_count':
                100,
                'owners': [{
                    'employee_type': 'teamMember',
                    'full_name': 'test_full_name',
                    'is_active': 'True',
                    'github_username': '******',
                    'slack_id': 'test_id',
                    'last_name': 'test_last_name',
                    'first_name': 'test_first_name',
                    'team_name': 'test_team',
                    'email': 'test_email',
                }, {
                    'employee_type': 'teamMember',
                    'full_name': 'test_full_name2',
                    'is_active': 'True',
                    'github_username': '******',
                    'slack_id': 'test_id2',
                    'last_name': 'test_last_name2',
                    'first_name': 'test_first_name2',
                    'team_name': 'test_team2',
                    'email': 'test_email2',
                }],
                'tags': [{
                    'key': 'tag_key1',
                    'tag_type': 'tag_type1'
                }, {
                    'key': 'tag_key2',
                    'tag_type': 'tag_type2'
                }],
                'charts': [{
                    'name': 'chart1'
                }, {
                    'name': 'chart2'
                }],
                'queries': [{
                    'name': 'query1'
                }, {
                    'name': 'query2'
                }],
                'tables': [{
                    'database': 'db1',
                    'name': 'table1',
                    'description': 'table description 1',
                    'cluster': 'cluster1',
                    'schema': 'schema1'
                }, {
                    'database': 'db2',
                    'name': 'table2',
                    'description': None,
                    'cluster': 'cluster2',
                    'schema': 'schema2'
                }]
            }, {
                # Second side_effect item: minimal record with None/empty fields.
                'cluster_name':
                'cluster_name',
                'uri':
                'foo_dashboard://gold.bar/dashboard_id',
                'url':
                'http://www.foo.bar/dashboard_id',
                'product':
                'foobar',
                'name':
                'dashboard name',
                'created_timestamp':
                123456789,
                'description':
                None,
                'group_name':
                'group_name',
                'group_url':
                'http://www.group_url.com',
                'last_run_timestamp':
                None,
                'last_run_state':
                None,
                'updated_timestamp':
                None,
                'recent_view_count':
                0,
                'owners': [],
                'tags': [],
                'charts': [],
                'queries': [],
                'tables': []
            }]
            neo4j_proxy = Neo4jProxy(host='DOES_NOT_MATTER', port=0000)
            dashboard = neo4j_proxy.get_dashboard(id='dashboard_id')
            # Expected mapping of the fully populated record.
            expected = DashboardDetail(
                uri='foo_dashboard://gold.bar/dashboard_id',
                cluster='cluster_name',
                group_name='group_name',
                group_url='http://www.group_url.com',
                product='foobar',
                name='dashboard name',
                url='http://www.foo.bar/dashboard_id',
                description='description',
                created_timestamp=123456789,
                last_successful_run_timestamp=9876543210,
                updated_timestamp=123456654321,
                last_run_timestamp=987654321,
                last_run_state='good_state',
                owners=[
                    User(email='test_email',
                         first_name='test_first_name',
                         last_name='test_last_name',
                         full_name='test_full_name',
                         is_active='True',
                         github_username='******',
                         team_name='test_team',
                         slack_id='test_id',
                         employee_type='teamMember',
                         manager_fullname=''),
                    User(email='test_email2',
                         first_name='test_first_name2',
                         last_name='test_last_name2',
                         full_name='test_full_name2',
                         is_active='True',
                         github_username='******',
                         team_name='test_team2',
                         slack_id='test_id2',
                         employee_type='teamMember',
                         manager_fullname='')
                ],
                frequent_users=[],
                chart_names=['chart1', 'chart2'],
                query_names=['query1', 'query2'],
                tables=[
                    PopularTable(database='db1',
                                 name='table1',
                                 description='table description 1',
                                 cluster='cluster1',
                                 schema='schema1'),
                    PopularTable(database='db2',
                                 name='table2',
                                 cluster='cluster2',
                                 schema='schema2'),
                ],
                tags=[
                    Tag(tag_type='tag_type1', tag_name='tag_key1'),
                    Tag(tag_type='tag_type2', tag_name='tag_key2')
                ],
                recent_view_count=100)

            self.assertEqual(expected, dashboard)

            # Second call consumes the minimal record.
            dashboard2 = neo4j_proxy.get_dashboard(id='dashboard_id')
            expected2 = DashboardDetail(
                uri='foo_dashboard://gold.bar/dashboard_id',
                cluster='cluster_name',
                group_name='group_name',
                group_url='http://www.group_url.com',
                product='foobar',
                name='dashboard name',
                url='http://www.foo.bar/dashboard_id',
                description=None,
                created_timestamp=123456789,
                updated_timestamp=None,
                last_run_timestamp=None,
                last_run_state=None,
                owners=[],
                frequent_users=[],
                chart_names=[],
                query_names=[],
                tables=[],
                tags=[],
                last_successful_run_timestamp=None,
                recent_view_count=0)

            self.assertEqual(expected2, dashboard2)
    def test_get_table(self) -> None:
        """get_table should assemble a Table from the mocked cypher results."""
        with patch.object(GraphDatabase, 'driver'), patch.object(
                Neo4jProxy, '_execute_cypher_query') as mock_execute:
            # Queries fire in order: column usage, (empty) usage, table level.
            mock_execute.side_effect = [
                self.col_usage_return_value, [], self.table_level_return_value
            ]

            neo4j_proxy = Neo4jProxy(host='DOES_NOT_MATTER', port=0000)
            table = neo4j_proxy.get_table(table_uri='dummy_uri')

            expected = Table(
                database='hive',
                cluster='gold',
                schema='foo_schema',
                name='foo_table',
                tags=[Tag(tag_name='test', tag_type='default')],
                badges=[Tag(tag_name='golden', tag_type='badge')],
                table_readers=[],
                description='foo description',
                watermarks=[
                    Watermark(watermark_type='high_watermark',
                              partition_key='ds',
                              partition_value='fake_value',
                              create_time='fake_time'),
                    Watermark(watermark_type='low_watermark',
                              partition_key='ds',
                              partition_value='fake_value',
                              create_time='fake_time')
                ],
                columns=[
                    Column(name='bar_id_1',
                           description='bar col description',
                           col_type='varchar',
                           sort_order=0,
                           stats=[
                               Statistics(start_epoch=1,
                                          end_epoch=1,
                                          stat_type='avg',
                                          stat_val='1')
                           ]),
                    Column(name='bar_id_2',
                           description='bar col2 description',
                           col_type='bigint',
                           sort_order=1,
                           stats=[
                               Statistics(start_epoch=2,
                                          end_epoch=2,
                                          stat_type='avg',
                                          stat_val='2')
                           ])
                ],
                owners=[User(email='*****@*****.**')],
                table_writer=Application(
                    application_url=self.table_writer['application_url'],
                    description=self.table_writer['description'],
                    name=self.table_writer['name'],
                    id=self.table_writer['id']),
                last_updated_timestamp=1,
                source=Source(source='/source_file_loc', source_type='github'),
                is_view=False)

            # Compare string forms to sidestep nested equality quirks.
            self.assertEqual(str(expected), str(table))
Ejemplo n.º 17
0
    def test_get_dashboard(self, mock_rds_client: Any) -> None:
        """get_dashboard should translate an RDS dashboard record, together
        with its queries and linked tables, into a DashboardDetail."""
        # --- dashboard fixture ---------------------------------------
        group = RDSDashboardGroup(
            name='group_name', dashboard_group_url='http://www.group_url.com')
        group.cluster = RDSCluster(name='cluster_name')

        dashboard = RDSDashboard(
            rk='foo_dashboard://gold.bar/dashboard_id',
            name='dashboard name',
            dashboard_url='http://www.foo.bar/dashboard_id',
            created_timestamp=123456789)
        dashboard.group = group
        dashboard.description = RDSDashboardDescription(
            description='description')
        dashboard.execution = [
            RDSDashboardExecution(rk='dashboard_last_successful_execution',
                                  timestamp=9876543210),
            RDSDashboardExecution(rk='dashboard_last_execution',
                                  timestamp=987654321,
                                  state='good_state'),
        ]
        dashboard.timestamp = RDSDashboardTimestamp(timestamp=123456654321)
        dashboard.tags = [
            RDSTag(rk='tag_key1', tag_type='default'),
            RDSTag(rk='tag_key2', tag_type='default'),
        ]
        dashboard.badges = [RDSBadge(rk='golden', category='table_status')]
        dashboard.owners = [
            RDSUser(email='test_email',
                    first_name='test_first_name',
                    last_name='test_last_name',
                    full_name='test_full_name',
                    is_active=True,
                    github_username='******',
                    team_name='test_team',
                    slack_id='test_id',
                    employee_type='teamMember'),
            RDSUser(email='test_email2',
                    first_name='test_first_name2',
                    last_name='test_last_name2',
                    full_name='test_full_name2',
                    is_active=True,
                    github_username='******',
                    team_name='test_team2',
                    slack_id='test_id2',
                    employee_type='teamMember'),
        ]
        dashboard.usage = [RDSDashboardUsage(read_count=100)]

        # --- query / chart fixtures ----------------------------------
        query1 = RDSDashboardQuery(name='query1')
        query1.charts = [RDSDashboardChart(name='chart1')]
        query2 = RDSDashboardQuery(name='query2',
                                   url='http://foo.bar/query',
                                   query_text='SELECT * FROM foo.bar')
        query2.charts = [RDSDashboardChart(name='chart2')]

        # --- table fixtures ------------------------------------------
        cluster1 = RDSCluster(name='cluster1')
        cluster1.database = RDSDatabase(name='db1')
        schema1 = RDSSchema(name='schema1')
        schema1.cluster = cluster1
        table1 = RDSTable(name='table1')
        table1.schema = schema1
        table1.description = RDSTableDescription(
            description='table description 1')

        cluster2 = RDSCluster(name='cluster2')
        cluster2.database = RDSDatabase(name='db2')
        schema2 = RDSSchema(name='schema2')
        schema2.cluster = cluster2
        table2 = RDSTable(name='table2')
        table2.schema = schema2

        # --- mocked session chain ------------------------------------
        # client.create_session() is used as a context manager; lean on
        # MagicMock's auto-created children instead of wiring each link.
        session = MagicMock()
        client = MagicMock()
        client.create_session.return_value.__enter__.return_value = session
        mock_rds_client.return_value = client

        filtered = session.query.return_value.filter.return_value
        # The first query returns the dashboard row itself ...
        filtered.first.return_value = dashboard
        # ... then .options().all() is invoked twice: queries, then tables.
        filtered.options.return_value.all.side_effect = [
            [query1, query2],
            [table1, table2],
        ]

        expected = DashboardDetail(
            uri='foo_dashboard://gold.bar/dashboard_id',
            cluster='cluster_name',
            group_name='group_name',
            group_url='http://www.group_url.com',
            product='foo',
            name='dashboard name',
            url='http://www.foo.bar/dashboard_id',
            description='description',
            created_timestamp=123456789,
            last_successful_run_timestamp=9876543210,
            updated_timestamp=123456654321,
            last_run_timestamp=987654321,
            last_run_state='good_state',
            owners=[
                User(email='test_email',
                     first_name='test_first_name',
                     last_name='test_last_name',
                     full_name='test_full_name',
                     is_active=True,
                     github_username='******',
                     team_name='test_team',
                     slack_id='test_id',
                     employee_type='teamMember',
                     manager_fullname=''),
                User(email='test_email2',
                     first_name='test_first_name2',
                     last_name='test_last_name2',
                     full_name='test_full_name2',
                     is_active=True,
                     github_username='******',
                     team_name='test_team2',
                     slack_id='test_id2',
                     employee_type='teamMember',
                     manager_fullname=''),
            ],
            frequent_users=[],
            chart_names=['chart1', 'chart2'],
            query_names=['query1', 'query2'],
            queries=[
                DashboardQuery(name='query1'),
                DashboardQuery(name='query2',
                               url='http://foo.bar/query',
                               query_text='SELECT * FROM foo.bar'),
            ],
            tables=[
                PopularTable(database='db1',
                             name='table1',
                             description='table description 1',
                             cluster='cluster1',
                             schema='schema1'),
                PopularTable(database='db2',
                             name='table2',
                             cluster='cluster2',
                             schema='schema2'),
            ],
            tags=[
                Tag(tag_type='default', tag_name='tag_key1'),
                Tag(tag_type='default', tag_name='tag_key2'),
            ],
            badges=[Badge(badge_name='golden', category='table_status')],
            recent_view_count=100)

        proxy = MySQLProxy()
        actual = proxy.get_dashboard(id='dashboard_id')

        self.assertEqual(expected, actual)
Ejemplo n.º 18
0
    def test_get_table(self, mock_rds_client: Any) -> None:
        """get_table should convert an RDS table row (schema chain, columns,
        watermarks, usage, tags, badges, source, programmatic descriptions)
        into the Table API model."""
        # --- table fixture -------------------------------------------
        cluster = RDSCluster(name='gold')
        cluster.database = RDSDatabase(name='hive')
        schema = RDSSchema(name='foo_schema')
        schema.cluster = cluster

        table = RDSTable(name='foo_table')
        table.schema = schema
        table.description = RDSTableDescription(description='foo description')

        col1 = RDSColumn(name='bar_id_1', type='varchar', sort_order=0)
        col1.description = RDSColumnDescription(
            description='bar col description')
        col1.stats = [
            RDSColumnStat(stat_type='avg',
                          start_epoch='1',
                          end_epoch='1',
                          stat_val='1')
        ]

        col2 = RDSColumn(name='bar_id_2', type='bigint', sort_order=1)
        col2.description = RDSColumnDescription(
            description='bar col2 description')
        col2.stats = [
            RDSColumnStat(stat_type='avg',
                          start_epoch='2',
                          end_epoch='2',
                          stat_val='2')
        ]
        col2.badges = [RDSBadge(rk='primary key', category='column')]

        table.watermarks = [
            RDSTableWatermark(
                rk='hive://gold.test_schema/test_table/high_watermark/',
                partition_key='ds',
                partition_value='fake_value',
                create_time='fake_time'),
            RDSTableWatermark(
                rk='hive://gold.test_schema/test_table/low_watermark/',
                partition_key='ds',
                partition_value='fake_value',
                create_time='fake_time'),
        ]
        table.application = RDSApplication(
            application_url='airflow_host/admin/airflow/tree?dag_id=test_table',
            description='DAG generating a table',
            name='Airflow',
            id='dag/task_id')
        table.timestamp = RDSTableTimestamp(last_updated_timestamp=1)
        table.owners = [
            RDSUser(rk='*****@*****.**', email='*****@*****.**')
        ]
        table.tags = [RDSTag(rk='test', tag_type='default')]
        table.badges = [RDSBadge(rk='golden', category='table_status')]
        table.source = RDSTableSource(rk='some key',
                                      source_type='github',
                                      source='/source_file_loc')
        table.programmatic_descriptions = [
            RDSTableProgrammaticDescription(description_source='s3_crawler',
                                            description='Test Test Test'),
            RDSTableProgrammaticDescription(
                description_source='quality_report', description='Test Test'),
        ]

        # --- mocked session chain ------------------------------------
        session = MagicMock()
        client = MagicMock()
        client.create_session.return_value.__enter__.return_value = session
        mock_rds_client.return_value = client

        filtered = session.query.return_value.filter.return_value
        filtered.first.return_value = table
        # Usage rows come back through .order_by().limit().all() ...
        filtered.order_by.return_value.limit.return_value.all.return_value = [
            RDSTableUsage(user_rk='*****@*****.**', read_count=5)
        ]
        # ... while columns come back through .options().all().
        filtered.options.return_value.all.return_value = [col1, col2]

        proxy = MySQLProxy()
        actual_table = proxy.get_table(table_uri='dummy_uri')

        expected = Table(
            database='hive',
            cluster='gold',
            schema='foo_schema',
            name='foo_table',
            tags=[Tag(tag_name='test', tag_type='default')],
            badges=[Badge(badge_name='golden', category='table_status')],
            table_readers=[
                Reader(user=User(email='*****@*****.**'), read_count=5)
            ],
            description='foo description',
            watermarks=[
                Watermark(watermark_type='high_watermark',
                          partition_key='ds',
                          partition_value='fake_value',
                          create_time='fake_time'),
                Watermark(watermark_type='low_watermark',
                          partition_key='ds',
                          partition_value='fake_value',
                          create_time='fake_time'),
            ],
            columns=[
                Column(name='bar_id_1',
                       description='bar col description',
                       col_type='varchar',
                       sort_order=0,
                       stats=[
                           Stat(start_epoch=1,
                                end_epoch=1,
                                stat_type='avg',
                                stat_val='1')
                       ],
                       badges=[]),
                Column(name='bar_id_2',
                       description='bar col2 description',
                       col_type='bigint',
                       sort_order=1,
                       stats=[
                           Stat(start_epoch=2,
                                end_epoch=2,
                                stat_type='avg',
                                stat_val='2')
                       ],
                       badges=[
                           Badge(badge_name='primary key', category='column')
                       ]),
            ],
            owners=[User(email='*****@*****.**')],
            table_writer=Application(
                application_url=
                'airflow_host/admin/airflow/tree?dag_id=test_table',
                description='DAG generating a table',
                name='Airflow',
                id='dag/task_id'),
            last_updated_timestamp=1,
            source=Source(source='/source_file_loc', source_type='github'),
            is_view=False,
            programmatic_descriptions=[
                ProgrammaticDescription(source='quality_report',
                                        text='Test Test'),
                ProgrammaticDescription(source='s3_crawler',
                                        text='Test Test Test'),
            ])

        # Compare on string form, matching the suite's convention.
        self.assertEqual(str(expected), str(actual_table))
    def _exec_table_query(self, table_uri: str) -> Tuple:
        """
        Runs a single Cypher query for the table-level metadata of
        *table_uri* and unpacks the one resulting record.

        Returns a tuple of (watermarks, table-writer application,
        last-updated timestamp, owners, tags, source, badges,
        programmatic descriptions).
        """
        table_level_query = textwrap.dedent("""\
        MATCH (tbl:Table {key: $tbl_key})
        OPTIONAL MATCH (wmk:Watermark)-[:BELONG_TO_TABLE]->(tbl)
        OPTIONAL MATCH (application:Application)-[:GENERATES]->(tbl)
        OPTIONAL MATCH (tbl)-[:LAST_UPDATED_AT]->(t:Timestamp)
        OPTIONAL MATCH (owner:User)<-[:OWNER]-(tbl)
        OPTIONAL MATCH (tbl)-[:TAGGED_BY]->(tag:Tag{tag_type: $tag_normal_type})
        OPTIONAL MATCH (tbl)-[:TAGGED_BY]->(badge:Tag{tag_type: $tag_badge_type})
        OPTIONAL MATCH (tbl)-[:SOURCE]->(src:Source)
        OPTIONAL MATCH (tbl)-[:DESCRIPTION]->(prog_descriptions:Programmatic_Description)
        RETURN collect(distinct wmk) as wmk_records,
        application,
        t.last_updated_timestamp as last_updated_timestamp,
        collect(distinct owner) as owner_records,
        collect(distinct tag) as tag_records,
        collect(distinct badge) as badge_records,
        src,
        collect(distinct prog_descriptions) as prog_descriptions
        """)

        record = self._execute_cypher_query(
            statement=table_level_query,
            param_dict={'tbl_key': table_uri,
                        'tag_normal_type': 'default',
                        'tag_badge_type': 'badge'}).single()

        # A watermark key looks like ".../<watermark_type>/"; the type is
        # the second-to-last path segment. Skip keyless placeholder rows.
        wmk_results = [
            Watermark(watermark_type=wmk['key'].split('/')[-2],
                      partition_key=wmk['partition_key'],
                      partition_value=wmk['partition_value'],
                      create_time=wmk['create_time'])
            for wmk in record['wmk_records']
            if wmk['key'] is not None
        ]

        # Both tags and badges are Tag nodes; they differ only by tag_type.
        tags = [Tag(tag_name=tag['key'], tag_type=tag['tag_type'])
                for tag in record.get('tag_records') or []]
        badges = [Tag(tag_name=badge['key'], tag_type=badge['tag_type'])
                  for badge in record.get('badge_records') or []]

        table_writer = None
        app_record = record['application']
        if app_record is not None:
            table_writer = Application(
                application_url=app_record['application_url'],
                description=app_record['description'],
                name=app_record['name'],
                id=app_record.get('id', ''))

        owner_record = [User(email=owner['email'])
                        for owner in record.get('owner_records', [])]

        src = None
        if record['src']:
            src = Source(source_type=record['src']['source_type'],
                         source=record['src']['source'])

        prog_descriptions = self._extract_programmatic_descriptions_from_query(
            record.get('prog_descriptions', []))

        return (wmk_results, table_writer, record['last_updated_timestamp'],
                owner_record, tags, src, badges, prog_descriptions)