Example #1
    def test_get_table_view_only(self) -> None:
        col_usage_return_value = copy.deepcopy(self.col_usage_return_value)
        for col in col_usage_return_value:
            col['tbl']['is_view'] = True

        with patch.object(GraphDatabase, 'driver'), patch.object(Neo4jProxy, '_execute_cypher_query') as mock_execute:
            # Results returned, in order, by the consecutive _execute_cypher_query calls made inside get_table
            mock_execute.side_effect = [col_usage_return_value, [], self.table_level_return_value]

            neo4j_proxy = Neo4jProxy(host='DOES_NOT_MATTER', port=0000)
            table = neo4j_proxy.get_table(table_uri='dummy_uri')

            expected = Table(database='hive', cluster='gold', schema='foo_schema', name='foo_table',
                             tags=[Tag(tag_name='test', tag_type='default')],
                             badges=[Badge(badge_name='golden', category='table_status')],
                             table_readers=[], description='foo description',
                             watermarks=[Watermark(watermark_type='high_watermark',
                                                   partition_key='ds',
                                                   partition_value='fake_value',
                                                   create_time='fake_time'),
                                         Watermark(watermark_type='low_watermark',
                                                   partition_key='ds',
                                                   partition_value='fake_value',
                                                   create_time='fake_time')],
                             columns=[Column(name='bar_id_1', description='bar col description', col_type='varchar',
                                             sort_order=0, stats=[Stat(start_epoch=1,
                                                                       end_epoch=1,
                                                                       stat_type='avg',
                                                                       stat_val='1')], badges=[]),
                                      Column(name='bar_id_2', description='bar col2 description', col_type='bigint',
                                             sort_order=1, stats=[Stat(start_epoch=2,
                                                                       end_epoch=2,
                                                                       stat_type='avg',
                                                                       stat_val='2')],
                                             badges=[Badge(badge_name='primary key', category='column')])],
                             owners=[User(email='*****@*****.**')],
                             table_writer=Application(application_url=self.table_writer['application_url'],
                                                      description=self.table_writer['description'],
                                                      name=self.table_writer['name'],
                                                      id=self.table_writer['id']),
                             last_updated_timestamp=1,
                             source=Source(source='/source_file_loc',
                                           source_type='github'),
                             is_view=True,
                             programmatic_descriptions=[
                                 ProgrammaticDescription(source='quality_report',
                                                         text='Test Test'),
                                 ProgrammaticDescription(source='s3_crawler',
                                                         text='Test Test Test')
                             ])

            self.assertEqual(str(expected), str(table))
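
The col_usage_return_value fixture is defined elsewhere in the test class; the loop at the top of the test only assumes that each row carries a 'tbl' mapping with an 'is_view' flag. A minimal, hypothetical row shape (every field other than 'tbl'/'is_view' is illustrative, not from the source) could be:

    # Hypothetical row shape for col_usage_return_value
    col_usage_row = {
        'tbl': {'is_view': False},  # flipped to True by the loop above
        # ... column and usage fields consumed by Neo4jProxy.get_table ...
    }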
Example #2
    def _get_table(self, custom_stats_format: bool = False) -> None:
        if custom_stats_format:
            test_exp_col = self.test_exp_col_stats_formatted
        else:
            test_exp_col = self.test_exp_col_stats_raw
        ent_attrs = cast(dict, self.entity1['attributes'])
        self._mock_get_table_entity()
        self._create_mocked_report_entities_collection()
        self.proxy._get_owners = MagicMock(
            return_value=[User(email=ent_attrs['owner'])])  # type: ignore
        self.proxy._driver.entity_bulk = MagicMock(
            return_value=self.report_entity_collection)
        response = self.proxy.get_table(table_uri=self.table_uri)

        classif_name = self.classification_entity['classifications'][0][
            'typeName']

        col_attrs = cast(dict, self.test_column['attributes'])
        exp_col_stats = list()

        for stats in test_exp_col:
            exp_col_stats.append(
                Stat(
                    stat_type=stats['attributes']['stat_name'],
                    stat_val=stats['attributes']['stat_val'],
                    start_epoch=stats['attributes']['start_epoch'],
                    end_epoch=stats['attributes']['end_epoch'],
                ))

        exp_col = Column(name=col_attrs['name'],
                         description='column description',
                         col_type='Managed',
                         sort_order=col_attrs['position'],
                         stats=exp_col_stats)
        expected = Table(
            database=self.entity_type,
            cluster=self.cluster,
            schema=self.db,
            name=ent_attrs['name'],
            tags=[Tag(tag_name=classif_name, tag_type="default")],
            description=ent_attrs['description'],
            owners=[User(email=ent_attrs['owner'])],
            resource_reports=[
                ResourceReport(name='test_report', url='http://test'),
                ResourceReport(name='test_report3', url='http://test3')
            ],
            last_updated_timestamp=int(str(self.entity1['updateTime'])[:10]),
            columns=[exp_col] * self.active_columns,
            programmatic_descriptions=[
                ProgrammaticDescription(source='test parameter key a',
                                        text='testParameterValueA'),
                ProgrammaticDescription(source='test parameter key b',
                                        text='testParameterValueB')
            ],
            is_view=False)

        self.assertEqual(str(expected), str(response))
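
This helper is parameterized rather than being a test on its own; presumably it is driven by thin wrappers along these lines (names are hypothetical, not from the source):

    def test_get_table(self) -> None:
        self._get_table()

    def test_get_table_with_custom_stats_format(self) -> None:
        # would also need whatever proxy configuration actually produces the
        # formatted statistics that test_exp_col_stats_formatted expects
        self._get_table(custom_stats_format=True)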
Example #3
    def test_15_just_execute_next_columns(self) -> None:
        columns = next_columns(table_key='not_important')
        self.assertEqual(1, len(columns))
        self.assertEqual([
            Column(name='coopqrstuv001140',
                   key='not_important/coopqrstuv001140',
                   description='coopqrstuv001140 description',
                   col_type='int',
                   sort_order=0,
                   stats=[
                       Stat(stat_type='num_rows',
                            stat_val='114200',
                            start_epoch=None,
                            end_epoch=None)
                   ])
        ], columns)
Example #4
    def _serialize_columns(self, *, entity: AtlasEntityWithExtInfo) -> \
            Union[List[Column], List]:
        """
        Helper function to fetch the columns from entity and serialize them
        using Column and Stat model.
        :param entity: AtlasEntityWithExtInfo object,
        along with relationshipAttributes
        :return: A list of Column objects, if there are any columns available,
        else an empty list.
        """
        columns = list()
        for column in entity.entity[self.REL_ATTRS_KEY].get('columns') or list():
            column_status = column.get('entityStatus', 'inactive').lower()

            if column_status != 'active':
                continue

            col_entity = entity.referredEntities[column[self.GUID_KEY]]
            col_attrs = col_entity[self.ATTRS_KEY]
            statistics = list()

            badges = list()
            for column_classification in col_entity.get('classifications') or list():
                if column_classification.get('entityStatus') == Status.ACTIVE:
                    name = column_classification.get('typeName')

                    badges.append(Badge(badge_name=name, category='default'))

            for stats in col_attrs.get('statistics') or list():
                stats_attrs = stats['attributes']

                stat_type = stats_attrs.get('stat_name')

                stat_format = self.STATISTICS_FORMAT_SPEC.get(stat_type, dict())

                if not stat_format.get('drop', False):
                    stat_type = stat_format.get('new_name', stat_type)

                    stat_val = stats_attrs.get('stat_val')

                    format_val = stat_format.get('format')

                    if format_val:
                        stat_val = format_val.format(stat_val)
                    else:
                        stat_val = str(stat_val)

                    start_epoch = stats_attrs.get('start_epoch')
                    end_epoch = stats_attrs.get('end_epoch')

                    statistics.append(
                        Stat(
                            stat_type=stat_type,
                            stat_val=stat_val,
                            start_epoch=start_epoch,
                            end_epoch=end_epoch,
                        )
                    )

            columns.append(
                Column(
                    name=col_attrs.get('name'),
                    description=col_attrs.get('description') or col_attrs.get('comment'),
                    col_type=col_attrs.get('type') or col_attrs.get('dataType') or col_attrs.get('data_type'),
                    sort_order=col_attrs.get('position') or 9999,
                    stats=statistics,
                    badges=badges
                )
            )
        return sorted(columns, key=lambda item: item.sort_order)
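
The statistics handling above consults a STATISTICS_FORMAT_SPEC attribute that is not shown here. Judging only from the keys the code reads ('drop', 'new_name', 'format'), a minimal hypothetical spec could look like this (stat names and rules are illustrative):

    # Hypothetical spec: maps an incoming stat_name to optional rewrite rules
    STATISTICS_FORMAT_SPEC = {
        'stat_histogram': {'drop': True},                        # skipped entirely
        'distinctValuesCount': {'new_name': 'distinct_values'},  # renamed
        'nullProportion': {'format': '{:.2%}'},                  # applied via str.format to stat_val
    }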
Example #5
    def test_get_table(self, mock_rds_client: Any) -> None:
        database = RDSDatabase(name='hive')
        cluster = RDSCluster(name='gold')
        schema = RDSSchema(name='foo_schema')
        schema.cluster = cluster
        cluster.database = database

        table = RDSTable(name='foo_table')
        table.schema = schema
        table.description = RDSTableDescription(description='foo description')

        col1 = RDSColumn(name='bar_id_1', type='varchar', sort_order=0)
        col1.description = RDSColumnDescription(
            description='bar col description')
        col1.stats = [
            RDSColumnStat(stat_type='avg',
                          start_epoch='1',
                          end_epoch='1',
                          stat_val='1')
        ]

        col2 = RDSColumn(name='bar_id_2', type='bigint', sort_order=1)
        col2.description = RDSColumnDescription(
            description='bar col2 description')
        col2.stats = [
            RDSColumnStat(stat_type='avg',
                          start_epoch='2',
                          end_epoch='2',
                          stat_val='2')
        ]
        col2.badges = [RDSBadge(rk='primary key', category='column')]
        columns = [col1, col2]

        table.watermarks = [
            RDSTableWatermark(
                rk='hive://gold.test_schema/test_table/high_watermark/',
                partition_key='ds',
                partition_value='fake_value',
                create_time='fake_time'),
            RDSTableWatermark(
                rk='hive://gold.test_schema/test_table/low_watermark/',
                partition_key='ds',
                partition_value='fake_value',
                create_time='fake_time')
        ]

        table.application = RDSApplication(
            application_url='airflow_host/admin/airflow/tree?dag_id=test_table',
            description='DAG generating a table',
            name='Airflow',
            id='dag/task_id')
        table.timestamp = RDSTableTimestamp(last_updated_timestamp=1)

        table.owners = [
            RDSUser(rk='*****@*****.**', email='*****@*****.**')
        ]
        table.tags = [RDSTag(rk='test', tag_type='default')]
        table.badges = [RDSBadge(rk='golden', category='table_status')]
        table.source = RDSTableSource(rk='some key',
                                      source_type='github',
                                      source='/source_file_loc')
        table.programmatic_descriptions = [
            RDSTableProgrammaticDescription(description_source='s3_crawler',
                                            description='Test Test Test'),
            RDSTableProgrammaticDescription(
                description_source='quality_report', description='Test Test')
        ]

        readers = [RDSTableUsage(user_rk='*****@*****.**', read_count=5)]

        mock_client = MagicMock()
        mock_rds_client.return_value = mock_client

        mock_create_session = MagicMock()
        mock_client.create_session.return_value = mock_create_session

        # Session yielded by the client's context manager
        mock_session = MagicMock()
        mock_create_session.__enter__.return_value = mock_session

        mock_session_query = MagicMock()
        mock_session.query.return_value = mock_session_query

        # query(...).filter(...).first() -> the table record
        mock_session_query_filter = MagicMock()
        mock_session_query.filter.return_value = mock_session_query_filter
        mock_session_query_filter.first.return_value = table

        # query(...).filter(...).order_by(...).limit(...).all() -> the readers
        mock_session_query_filter_orderby = MagicMock()
        mock_session_query_filter.order_by.return_value = mock_session_query_filter_orderby

        mock_session_query_filter_orderby_limit = MagicMock()
        mock_session_query_filter_orderby.limit.return_value = mock_session_query_filter_orderby_limit
        mock_session_query_filter_orderby_limit.all.return_value = readers

        # query(...).filter(...).options(...).all() -> the columns
        mock_session_query_filter_options = MagicMock()
        mock_session_query_filter.options.return_value = mock_session_query_filter_options
        mock_session_query_filter_options.all.return_value = columns

        proxy = MySQLProxy()
        actual_table = proxy.get_table(table_uri='dummy_uri')

        expected = Table(
            database='hive',
            cluster='gold',
            schema='foo_schema',
            name='foo_table',
            tags=[Tag(tag_name='test', tag_type='default')],
            badges=[Badge(badge_name='golden', category='table_status')],
            table_readers=[
                Reader(user=User(email='*****@*****.**'), read_count=5)
            ],
            description='foo description',
            watermarks=[
                Watermark(watermark_type='high_watermark',
                          partition_key='ds',
                          partition_value='fake_value',
                          create_time='fake_time'),
                Watermark(watermark_type='low_watermark',
                          partition_key='ds',
                          partition_value='fake_value',
                          create_time='fake_time')
            ],
            columns=[
                Column(name='bar_id_1',
                       description='bar col description',
                       col_type='varchar',
                       sort_order=0,
                       stats=[
                           Stat(start_epoch=1,
                                end_epoch=1,
                                stat_type='avg',
                                stat_val='1')
                       ],
                       badges=[]),
                Column(name='bar_id_2',
                       description='bar col2 description',
                       col_type='bigint',
                       sort_order=1,
                       stats=[
                           Stat(start_epoch=2,
                                end_epoch=2,
                                stat_type='avg',
                                stat_val='2')
                       ],
                       badges=[
                           Badge(badge_name='primary key', category='column')
                       ])
            ],
            owners=[User(email='*****@*****.**')],
            table_writer=Application(
                application_url=
                'airflow_host/admin/airflow/tree?dag_id=test_table',
                description='DAG generating a table',
                name='Airflow',
                id='dag/task_id'),
            last_updated_timestamp=1,
            source=Source(source='/source_file_loc', source_type='github'),
            is_view=False,
            programmatic_descriptions=[
                ProgrammaticDescription(source='quality_report',
                                        text='Test Test'),
                ProgrammaticDescription(source='s3_crawler',
                                        text='Test Test Test')
            ])

        self.assertEqual(str(expected), str(actual_table))
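
The explicit intermediate mocks above make each step of the fluent query chain visible. Because MagicMock caches its auto-created children, the same stubbing can be written more compactly; an equivalent sketch, assuming the proxy issues exactly the chains the test wires up:

    query_after_filter = mock_session.query.return_value.filter.return_value
    query_after_filter.first.return_value = table
    query_after_filter.order_by.return_value.limit.return_value.all.return_value = readers
    query_after_filter.options.return_value.all.return_value = columns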