def test_get_table_view_only(self) -> None:
    """
    get_table should report is_view=True when every record returned by the
    first Cypher call carries tbl.is_view = True.
    """
    # deepcopy leaves self.col_usage_return_value itself unmodified.
    view_col_records = copy.deepcopy(self.col_usage_return_value)
    for col_record in view_col_records:
        col_record['tbl']['is_view'] = True

    with patch.object(GraphDatabase, 'driver'), \
            patch.object(Neo4jProxy, '_execute_cypher_query') as mock_execute:
        # One canned result per _execute_cypher_query call, consumed in order.
        mock_execute.side_effect = [
            view_col_records,
            [],
            self.table_level_return_value,
        ]

        proxy = Neo4jProxy(host='DOES_NOT_MATTER', port=0000)
        actual = proxy.get_table(table_uri='dummy_uri')

        expected_watermarks = [
            Watermark(watermark_type=wmk_type,
                      partition_key='ds',
                      partition_value='fake_value',
                      create_time='fake_time')
            for wmk_type in ('high_watermark', 'low_watermark')
        ]
        first_column = Column(
            name='bar_id_1',
            description='bar col description',
            col_type='varchar',
            sort_order=0,
            stats=[Stat(start_epoch=1, end_epoch=1, stat_type='avg', stat_val='1')],
            badges=[],
        )
        second_column = Column(
            name='bar_id_2',
            description='bar col2 description',
            col_type='bigint',
            sort_order=1,
            stats=[Stat(start_epoch=2, end_epoch=2, stat_type='avg', stat_val='2')],
            badges=[Badge(badge_name='primary key', category='column')],
        )
        writer = Application(
            application_url=self.table_writer['application_url'],
            description=self.table_writer['description'],
            name=self.table_writer['name'],
            id=self.table_writer['id'],
        )
        expected = Table(
            database='hive',
            cluster='gold',
            schema='foo_schema',
            name='foo_table',
            tags=[Tag(tag_name='test', tag_type='default')],
            badges=[Badge(badge_name='golden', category='table_status')],
            table_readers=[],
            description='foo description',
            watermarks=expected_watermarks,
            columns=[first_column, second_column],
            owners=[User(email='*****@*****.**')],
            table_writer=writer,
            last_updated_timestamp=1,
            source=Source(source='/source_file_loc', source_type='github'),
            is_view=True,
            programmatic_descriptions=[
                ProgrammaticDescription(source='quality_report', text='Test Test'),
                ProgrammaticDescription(source='s3_crawler', text='Test Test Test'),
            ],
        )

        # Compared through str(), exactly as the model objects render themselves.
        self.assertEqual(str(expected), str(actual))
def test_get_table(self, mock_rds_client: Any) -> None:
    """
    MySQLProxy.get_table should turn the RDS model objects handed back by the
    mocked session into the expected Table.
    """
    # --- RDS fixtures: database -> cluster -> schema -> table ---------------
    rds_database = RDSDatabase(name='hive')
    rds_cluster = RDSCluster(name='gold')
    rds_schema = RDSSchema(name='foo_schema')
    rds_schema.cluster = rds_cluster
    rds_cluster.database = rds_database

    rds_table = RDSTable(name='foo_table')
    rds_table.schema = rds_schema
    rds_table.description = RDSTableDescription(description='foo description')

    first_col = RDSColumn(name='bar_id_1', type='varchar', sort_order=0)
    first_col.description = RDSColumnDescription(description='bar col description')
    first_col.stats = [
        RDSColumnStat(stat_type='avg', start_epoch='1', end_epoch='1', stat_val='1'),
    ]

    second_col = RDSColumn(name='bar_id_2', type='bigint', sort_order=1)
    second_col.description = RDSColumnDescription(description='bar col2 description')
    second_col.stats = [
        RDSColumnStat(stat_type='avg', start_epoch='2', end_epoch='2', stat_val='2'),
    ]
    second_col.badges = [RDSBadge(rk='primary key', category='column')]

    columns = [first_col, second_col]

    rds_table.watermarks = [
        RDSTableWatermark(rk='hive://gold.test_schema/test_table/high_watermark/',
                          partition_key='ds',
                          partition_value='fake_value',
                          create_time='fake_time'),
        RDSTableWatermark(rk='hive://gold.test_schema/test_table/low_watermark/',
                          partition_key='ds',
                          partition_value='fake_value',
                          create_time='fake_time'),
    ]
    rds_table.application = RDSApplication(
        application_url='airflow_host/admin/airflow/tree?dag_id=test_table',
        description='DAG generating a table',
        name='Airflow',
        id='dag/task_id',
    )
    rds_table.timestamp = RDSTableTimestamp(last_updated_timestamp=1)
    rds_table.owners = [RDSUser(rk='*****@*****.**', email='*****@*****.**')]
    rds_table.tags = [RDSTag(rk='test', tag_type='default')]
    rds_table.badges = [RDSBadge(rk='golden', category='table_status')]
    rds_table.source = RDSTableSource(rk='some key',
                                      source_type='github',
                                      source='/source_file_loc')
    rds_table.programmatic_descriptions = [
        RDSTableProgrammaticDescription(description_source='s3_crawler',
                                        description='Test Test Test'),
        RDSTableProgrammaticDescription(description_source='quality_report',
                                        description='Test Test'),
    ]

    readers = [RDSTableUsage(user_rk='*****@*****.**', read_count=5)]

    # --- Mock wiring ---------------------------------------------------------
    # client.create_session() is used as a context manager yielding the session.
    client = MagicMock()
    mock_rds_client.return_value = client
    session = MagicMock()
    client.create_session.return_value.__enter__.return_value = session

    # session.query(...).filter(...) is the common prefix of all three lookups.
    filtered = session.query.return_value.filter.return_value
    filtered.first.return_value = rds_table
    filtered.order_by.return_value.limit.return_value.all.return_value = readers
    filtered.options.return_value.all.return_value = columns

    # --- Exercise ------------------------------------------------------------
    proxy = MySQLProxy()
    actual_table = proxy.get_table(table_uri='dummy_uri')

    # --- Expected model ------------------------------------------------------
    expected_watermarks = [
        Watermark(watermark_type=wmk_type,
                  partition_key='ds',
                  partition_value='fake_value',
                  create_time='fake_time')
        for wmk_type in ('high_watermark', 'low_watermark')
    ]
    expected_columns = [
        Column(name='bar_id_1',
               description='bar col description',
               col_type='varchar',
               sort_order=0,
               stats=[Stat(start_epoch=1, end_epoch=1, stat_type='avg', stat_val='1')],
               badges=[]),
        Column(name='bar_id_2',
               description='bar col2 description',
               col_type='bigint',
               sort_order=1,
               stats=[Stat(start_epoch=2, end_epoch=2, stat_type='avg', stat_val='2')],
               badges=[Badge(badge_name='primary key', category='column')]),
    ]
    expected_writer = Application(
        application_url='airflow_host/admin/airflow/tree?dag_id=test_table',
        description='DAG generating a table',
        name='Airflow',
        id='dag/task_id',
    )
    expected = Table(
        database='hive',
        cluster='gold',
        schema='foo_schema',
        name='foo_table',
        tags=[Tag(tag_name='test', tag_type='default')],
        badges=[Badge(badge_name='golden', category='table_status')],
        table_readers=[Reader(user=User(email='*****@*****.**'), read_count=5)],
        description='foo description',
        watermarks=expected_watermarks,
        columns=expected_columns,
        owners=[User(email='*****@*****.**')],
        table_writer=expected_writer,
        last_updated_timestamp=1,
        source=Source(source='/source_file_loc', source_type='github'),
        is_view=False,
        programmatic_descriptions=[
            ProgrammaticDescription(source='quality_report', text='Test Test'),
            ProgrammaticDescription(source='s3_crawler', text='Test Test Test'),
        ],
    )

    self.assertEqual(str(expected), str(actual_table))
def _exec_table_query(self, table_uri: str) -> Tuple:
    """
    Run the table-level Cypher query and unpack the single record it yields.

    :param table_uri: value bound to ``$tbl_key`` to select the Table node.
    :return: 8-tuple of (watermarks, table writer, last updated timestamp,
        owners, tags, source, badges, programmatic descriptions).
    """
    table_level_query = textwrap.dedent("""\
    MATCH (tbl:Table {key: $tbl_key})
    OPTIONAL MATCH (wmk:Watermark)-[:BELONG_TO_TABLE]->(tbl)
    OPTIONAL MATCH (application:Application)-[:GENERATES]->(tbl)
    OPTIONAL MATCH (tbl)-[:LAST_UPDATED_AT]->(t:Timestamp)
    OPTIONAL MATCH (owner:User)<-[:OWNER]-(tbl)
    OPTIONAL MATCH (tbl)-[:TAGGED_BY]->(tag:Tag{tag_type: $tag_normal_type})
    OPTIONAL MATCH (tbl)-[:TAGGED_BY]->(badge:Tag{tag_type: $tag_badge_type})
    OPTIONAL MATCH (tbl)-[:SOURCE]->(src:Source)
    OPTIONAL MATCH (tbl)-[:DESCRIPTION]->(prog_descriptions:Programmatic_Description)
    RETURN collect(distinct wmk) as wmk_records,
    application,
    t.last_updated_timestamp as last_updated_timestamp,
    collect(distinct owner) as owner_records,
    collect(distinct tag) as tag_records,
    collect(distinct badge) as badge_records,
    src,
    collect(distinct prog_descriptions) as prog_descriptions
    """)

    query_params = {
        'tbl_key': table_uri,
        'tag_normal_type': 'default',
        'tag_badge_type': 'badge',
    }
    row = self._execute_cypher_query(statement=table_level_query,
                                     param_dict=query_params).single()

    # The watermark type is the second-to-last '/'-separated piece of the key;
    # entries whose key is None are dropped.
    watermarks = [
        Watermark(watermark_type=wmk['key'].split('/')[-2],
                  partition_key=wmk['partition_key'],
                  partition_value=wmk['partition_value'],
                  create_time=wmk['create_time'])
        for wmk in row['wmk_records']
        if wmk['key'] is not None
    ]

    tag_rows = row['tag_records'] if row.get('tag_records') else []
    tags = [Tag(tag_name=tag['key'], tag_type=tag['tag_type']) for tag in tag_rows]

    # Badges are stored as Tag nodes too, so they are returned as Tag objects.
    badge_rows = row['badge_records'] if row.get('badge_records') else []
    badges = [Tag(tag_name=badge['key'], tag_type=badge['tag_type']) for badge in badge_rows]

    application = row['application']
    table_writer = None if application is None else Application(
        application_url=application['application_url'],
        description=application['description'],
        name=application['name'],
        id=application.get('id', ''),
    )

    last_updated = row['last_updated_timestamp']

    owners = [User(email=owner['email']) for owner in row.get('owner_records', [])]

    source_row = row['src']
    source = Source(source_type=source_row['source_type'],
                    source=source_row['source']) if source_row else None

    descriptions = self._extract_programmatic_descriptions_from_query(
        row.get('prog_descriptions', [])
    )

    return watermarks, table_writer, last_updated, owners, tags, source, badges, descriptions
def test_get_table(self) -> None:
    """
    get_table should assemble the expected Table from the canned results
    returned by the patched _execute_cypher_query.
    """
    with patch.object(GraphDatabase, 'driver'), \
            patch.object(Neo4jProxy, '_execute_cypher_query') as mock_execute:
        # One canned result per _execute_cypher_query call, consumed in order.
        mock_execute.side_effect = [
            self.col_usage_return_value,
            [],
            self.table_level_return_value,
        ]

        proxy = Neo4jProxy(host='DOES_NOT_MATTER', port=0000)
        actual = proxy.get_table(table_uri='dummy_uri')

        expected_watermarks = [
            Watermark(watermark_type=wmk_type,
                      partition_key='ds',
                      partition_value='fake_value',
                      create_time='fake_time')
            for wmk_type in ('high_watermark', 'low_watermark')
        ]
        expected_columns = [
            Column(name='bar_id_1',
                   description='bar col description',
                   col_type='varchar',
                   sort_order=0,
                   stats=[Statistics(start_epoch=1, end_epoch=1,
                                     stat_type='avg', stat_val='1')]),
            Column(name='bar_id_2',
                   description='bar col2 description',
                   col_type='bigint',
                   sort_order=1,
                   stats=[Statistics(start_epoch=2, end_epoch=2,
                                     stat_type='avg', stat_val='2')]),
        ]
        writer = Application(
            application_url=self.table_writer['application_url'],
            description=self.table_writer['description'],
            name=self.table_writer['name'],
            id=self.table_writer['id'],
        )
        expected = Table(
            database='hive',
            cluster='gold',
            schema='foo_schema',
            name='foo_table',
            tags=[Tag(tag_name='test', tag_type='default')],
            badges=[Tag(tag_name='golden', tag_type='badge')],
            table_readers=[],
            description='foo description',
            watermarks=expected_watermarks,
            columns=expected_columns,
            owners=[User(email='*****@*****.**')],
            table_writer=writer,
            last_updated_timestamp=1,
            source=Source(source='/source_file_loc', source_type='github'),
            is_view=False,
        )

        # Compared through str(), exactly as the model objects render themselves.
        self.assertEqual(str(expected), str(actual))