def test_table_without_partitions(self, mock_build: Any) -> None:
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, None)
    extractor = BigQueryWatermarkExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))
    result = extractor.extract()
    self.assertIsNone(result)
def init(self, conf: ConfigTree) -> None:
    conf = conf.with_fallback(VerticaMetadataExtractor.DEFAULT_CONFIG)
    self._cluster = '{}'.format(conf.get_string(VerticaMetadataExtractor.CLUSTER_KEY))

    if conf.get_bool(VerticaMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
        cluster_source = "c.table_catalog"
    else:
        cluster_source = "'{}'".format(self._cluster)

    self._database = conf.get_string(VerticaMetadataExtractor.DATABASE_KEY, default='vertica')

    self.sql_stmt = VerticaMetadataExtractor.SQL_STATEMENT.format(
        where_clause_suffix=conf.get_string(VerticaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
        cluster_source=cluster_source
    )

    self._alchemy_extractor = SQLAlchemyExtractor()
    sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
        .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

    self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL)

    LOGGER.info('SQL for vertica metadata: {}'.format(self.sql_stmt))

    self._alchemy_extractor.init(sql_alch_conf)
    self._extract_iter: Union[None, Iterator] = None
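# A hedged usage sketch for the init() above: the extractor's own keys and the nested
# SQLAlchemyExtractor connection string are supplied through one scoped config. The scope
# string 'extractor.vertica_metadata', the import path of VerticaMetadataExtractor, and the
# DSN below are assumptions for illustration, not confirmed by this file.
from pyhocon import ConfigFactory

from databuilder import Scoped
from databuilder.extractor.vertica_metadata_extractor import VerticaMetadataExtractor  # path assumed

_scope = 'extractor.vertica_metadata'  # assumed scope returned by extractor.get_scope()
job_config = ConfigFactory.from_dict({
    f'{_scope}.{VerticaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
        "WHERE c.table_schema NOT IN ('v_catalog')",
    # nested SQLAlchemyExtractor key, matching the 'extractor.sqlalchemy.conn_string' pattern
    # used elsewhere in these snippets; the DSN is hypothetical
    f'{_scope}.extractor.sqlalchemy.conn_string':
        'vertica+vertica_python://user:password@host:5433/vertica_db',
})

extractor = VerticaMetadataExtractor()
extractor.init(Scoped.get_scoped_conf(conf=job_config, scope=extractor.get_scope()))
record = extractor.extract()  # one metadata record per call, None when exhausted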
def init(self, conf):
    # type: (ConfigTree) -> None
    self.conf = conf.with_fallback(PrestoTableMetadataExtractor.DEFAULT_CONFIG)
    self._database = "{}".format(
        self.conf.get_string(PrestoTableMetadataExtractor.DATABASE_KEY)
    )
    self._cluster = self.conf.get(PrestoTableMetadataExtractor.CLUSTER_KEY, None)
    LOGGER.info("Cluster name: {}".format(self._cluster))

    if self._cluster is not None:
        cluster_prefix = self._cluster + "."
    else:
        cluster_prefix = ""

    self.sql_stmt = PrestoTableMetadataExtractor.SQL_STATEMENT.format(
        cluster_prefix=cluster_prefix,
        where_clause_suffix=self.conf.get_string(
            PrestoTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY
        ) or "",
    )
    LOGGER.info("SQL for presto: {}".format(self.sql_stmt))

    self._alchemy_extractor = SQLAlchemyExtractor()
    sql_alch_conf = Scoped.get_scoped_conf(
        self.conf, self._alchemy_extractor.get_scope()
    ).with_fallback(
        ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt})
    )
    self._alchemy_extractor.init(sql_alch_conf)
    self._extract_iter = None  # type: Union[None, Iterator]
def test_dashboard_metadata_extractor(self) -> None:
    config = ConfigFactory.from_dict({
        'extractor.tableau_dashboard_metadata.tableau_host': 'tableau_host',
        'extractor.tableau_dashboard_metadata.api_version': 'tableau_api_version',
        'extractor.tableau_dashboard_metadata.site_name': 'tableau_site_name',
        'extractor.tableau_dashboard_metadata.tableau_personal_access_token_name':
            'tableau_personal_access_token_name',
        'extractor.tableau_dashboard_metadata.tableau_personal_access_token_secret':
            'tableau_personal_access_token_secret',
        'extractor.tableau_dashboard_metadata.excluded_projects': [],
        'extractor.tableau_dashboard_metadata.cluster': 'tableau_dashboard_cluster',
        'extractor.tableau_dashboard_metadata.database': 'tableau_dashboard_database',
        'extractor.tableau_dashboard_metadata.transformer.timestamp_str_to_epoch.timestamp_format':
            '%Y-%m-%dT%H:%M:%SZ',
    })
    extractor = TableauDashboardExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=config, scope=extractor.get_scope()))

    record = extractor.extract()
    self.assertEqual(record.dashboard_id, 'Test Workbook')
    self.assertEqual(record.dashboard_name, 'Test Workbook')
    self.assertEqual(record.dashboard_group_id, 'Test Project')
    self.assertEqual(record.dashboard_group, 'Test Project')
    self.assertEqual(record.product, 'tableau')
    self.assertEqual(record.cluster, 'tableau_dashboard_cluster')
    self.assertEqual(record.created_timestamp, 1586323921)
def test_loading_with_different_object(self):
    # type: () -> None
    """
    Test Loading functionality with a python Dict object
    """
    loader = FSElasticsearchJSONLoader()
    loader.init(conf=Scoped.get_scoped_conf(conf=self.conf, scope=loader.get_scope()))

    data = dict(database='test_database',
                cluster='test_cluster',
                schema_name='test_schema',
                table_name='test_table',
                table_key='test_table_key',
                table_last_updated_epoch=123456789,
                table_description='test_description',
                column_names=['test_col1', 'test_col2'],
                column_descriptions=['test_comment1', 'test_comment2'],
                total_usage=10,
                unique_usage=5,
                tag_names=['test_tag1', 'test_tag2'])

    with self.assertRaises(Exception) as context:
        loader.load(data)  # type: ignore
    self.assertTrue("Record not of type 'ElasticsearchDocument'!" in context.exception)

    loader.close()
def test_key_path(self, mock_build):
    """
    Test key_path can be used
    """
    with tempfile.NamedTemporaryFile() as keyfile:
        # There are many github scanners looking for API / cloud keys, so in order not to get a
        # false positive triggering everywhere, I base64 encoded the key.
        # This is written to a tempfile as part of this test and then used.
        keyfile.write(base64.b64decode(KEYFILE_DATA))
        keyfile.flush()
        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                'your-project-here',
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.KEY_PATH_KEY):
                keyfile.name,
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockLoggingClient(CORRECT_DATA)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

        args, kwargs = mock_build.call_args
        creds = kwargs['http'].credentials
        self.assertEqual(creds.project_id, 'your-project-here')
        self.assertEqual(creds.service_account_email, '*****@*****.**')
def init(self, conf: ConfigTree) -> None:
    conf = conf.with_fallback(SnowflakeTableLastUpdatedExtractor.DEFAULT_CONFIG)

    if conf.get_bool(SnowflakeTableLastUpdatedExtractor.USE_CATALOG_AS_CLUSTER_NAME):
        cluster_source = "t.table_catalog"
    else:
        cluster_source = "'{}'".format(conf.get_string(SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY))

    self._database = conf.get_string(SnowflakeTableLastUpdatedExtractor.DATABASE_KEY)
    self._snowflake_database = conf.get_string(SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY)

    self.sql_stmt = SnowflakeTableLastUpdatedExtractor.SQL_STATEMENT.format(
        where_clause_suffix=conf.get_string(SnowflakeTableLastUpdatedExtractor.WHERE_CLAUSE_SUFFIX_KEY),
        cluster_source=cluster_source,
        database=self._snowflake_database
    )

    LOGGER.info('SQL for snowflake table last updated timestamp: {}'.format(self.sql_stmt))

    # use an sql_alchemy_extractor to execute sql
    self._alchemy_extractor = SQLAlchemyExtractor()
    sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
        .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
    self._alchemy_extractor.init(sql_alch_conf)
    self._extract_iter: Union[None, Iterator] = None
def test_table_part_of_table_date_range(self, mock_build):
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, TABLE_DATE_RANGE, None)
    extractor = BigQueryWatermarkExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))

    result = extractor.extract()
    self.assertEqual(result.part_type, 'low_watermark')
    self.assertEqual(result.database, 'bigquery')
    self.assertEqual(result.schema, 'fdgdfgh')
    self.assertEqual(result.table, 'date_range_')
    self.assertEqual(result.cluster, 'your-project-here')
    self.assertEqual(result.create_time,
                     datetime.fromtimestamp(1557577779).strftime('%Y-%m-%d %H:%M:%S'))
    self.assertEqual(result.parts, [('__table__', '20190101')])

    result = extractor.extract()
    self.assertEqual(result.part_type, 'high_watermark')
    self.assertEqual(result.database, 'bigquery')
    self.assertEqual(result.schema, 'fdgdfgh')
    self.assertEqual(result.table, 'date_range_')
    self.assertEqual(result.cluster, 'your-project-here')
    self.assertEqual(result.create_time,
                     datetime.fromtimestamp(1557577779).strftime('%Y-%m-%d %H:%M:%S'))
    self.assertEqual(result.parts, [('__table__', '20190102')])
def test_publish_with_data_and_old_index(self) -> None:
    """
    Test Publish functionality with data and with old_index in place
    """
    mock_data = json.dumps({'KEY_DOESNOT_MATTER': 'NO_VALUE',
                            'KEY_DOESNOT_MATTER2': 'NO_VALUE2'})
    self.mock_es_client.indices.get_alias.return_value = {'test_old_index': 'DOES_NOT_MATTER'}

    with patch('builtins.open', mock_open(read_data=mock_data)) as mock_file:
        publisher = ElasticsearchPublisher()
        publisher.init(conf=Scoped.get_scoped_conf(conf=self.conf, scope=publisher.get_scope()))

        # assert mock was called with test_file_path and test_file_mode
        mock_file.assert_called_once_with(self.test_file_path, self.test_file_mode)

        publisher.publish()

        # ensure indices create endpoint was called
        default_mapping = ElasticsearchPublisher.DEFAULT_ELASTICSEARCH_INDEX_MAPPING
        self.mock_es_client.indices.create.assert_called_once_with(index=self.test_es_new_index,
                                                                   body=default_mapping)

        # bulk endpoint called once
        self.mock_es_client.bulk.assert_called_once_with(
            [{'index': {'_index': self.test_es_new_index}},
             {'KEY_DOESNOT_MATTER': 'NO_VALUE',
              'KEY_DOESNOT_MATTER2': 'NO_VALUE2',
              'resource_type': 'test_doc_type'}]
        )

        # update alias endpoint called once
        self.mock_es_client.indices.update_aliases.assert_called_once_with(
            {'actions': [{"add": {"index": self.test_es_new_index, "alias": self.test_es_alias}},
                         {"remove_index": {"index": 'test_old_index'}}]}
        )
def _get_non_partitioned_table_sql_alchemy_extractor(self):
    # type: () -> Extractor
    """
    Getting an SQLAlchemy extractor that extracts storage location for non-partitioned table
    for further probing last updated timestamp

    :return: SQLAlchemyExtractor
    """
    if HiveTableLastUpdatedExtractor.NON_PARTITIONED_TABLE_WHERE_CLAUSE_SUFFIX_KEY in self._conf:
        where_clause_suffix = """
        {}
        AND {}
        """.format(
            self._conf.get_string(
                HiveTableLastUpdatedExtractor.NON_PARTITIONED_TABLE_WHERE_CLAUSE_SUFFIX_KEY),
            HiveTableLastUpdatedExtractor.ADDTIONAL_WHERE_CLAUSE)
    else:
        where_clause_suffix = 'WHERE {}'.format(HiveTableLastUpdatedExtractor.ADDTIONAL_WHERE_CLAUSE)

    sql_stmt = HiveTableLastUpdatedExtractor.NON_PARTITIONED_TABLE_SQL_STATEMENT.format(
        where_clause_suffix=where_clause_suffix)

    LOGGER.info('SQL for non-partitioned table against Hive metastore: {}'.format(sql_stmt))

    sql_alchemy_extractor = SQLAlchemyExtractor()
    sql_alchemy_conf = Scoped.get_scoped_conf(self._conf, sql_alchemy_extractor.get_scope()) \
        .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: sql_stmt}))
    sql_alchemy_extractor.init(sql_alchemy_conf)
    return sql_alchemy_extractor
def test_table_with_field_partitions(self, mock_build):
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, TIME_PARTITIONED_WITH_FIELD, PARTITION_DATA)
    extractor = BigQueryWatermarkExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))

    result = extractor.extract()
    self.assertEqual(result.part_type, 'low_watermark')
    self.assertEqual(result.database, 'bigquery')
    self.assertEqual(result.schema, 'fdgdfgh')
    self.assertEqual(result.table, 'other')
    self.assertEqual(result.cluster, 'your-project-here')
    self.assertEqual(result.create_time,
                     datetime.fromtimestamp(1547512241).strftime('%Y-%m-%d %H:%M:%S'))
    self.assertEqual(result.parts, [('processed_date', '20180802')])

    result = extractor.extract()
    self.assertEqual(result.part_type, 'high_watermark')
    self.assertEqual(result.database, 'bigquery')
    self.assertEqual(result.schema, 'fdgdfgh')
    self.assertEqual(result.table, 'other')
    self.assertEqual(result.cluster, 'your-project-here')
    self.assertEqual(result.create_time,
                     datetime.fromtimestamp(1547512241).strftime('%Y-%m-%d %H:%M:%S'))
    self.assertEqual(result.parts, [('processed_date', '20180804')])
def test_extraction_one_object(self, mock_salesforce: Any) -> None:
    mock_salesforce.return_value = MockSalesForce()
    config_dict: Dict = {
        f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}": ["Account"],
        **self.config,
    }
    conf = ConfigFactory.from_dict(config_dict)

    mock_salesforce.return_value = MockSalesForce()
    extractor = SalesForceExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    result = extractor.extract()
    self.assertIsInstance(result, TableMetadata)

    expected = TableMetadata(
        "salesforce",
        "gold",
        "default",
        "Account",
        None,
        [
            ColumnMetadata("Id", "The Account Id", "id", 0, []),
            ColumnMetadata("isDeleted", "Deleted?", "bool", 1, []),
        ],
        False,
        [],
    )

    self.assertEqual(expected.__repr__(), result.__repr__())

    self.assertIsNone(extractor.extract())
def init(self, conf: ConfigTree) -> None: self._conf = conf self.query = """query { workbooks { name projectName upstreamTables { name schema database { name connectionType } } } }""" self._extractor = self._build_extractor() transformers = [] dict_to_model_transformer = DictToModel() dict_to_model_transformer.init(conf=Scoped.get_scoped_conf( self._conf, dict_to_model_transformer.get_scope() ).with_fallback( ConfigFactory.from_dict({ MODEL_CLASS: 'databuilder.models.dashboard.dashboard_table.DashboardTable' }))) transformers.append(dict_to_model_transformer) self._transformer = ChainedTransformer(transformers=transformers)
def test_extraction_with_model_class(self) -> None:
    """
    Test Extraction using model class
    """
    config_dict = {
        f'extractor.csv.{CsvExtractor.FILE_LOCATION}': 'example/sample_data/sample_table.csv',
        'extractor.csv.model_class': 'databuilder.models.table_metadata.TableMetadata',
    }
    self.conf = ConfigFactory.from_dict(config_dict)
    extractor = CsvExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))

    result = extractor.extract()
    self.assertEqual(result.name, 'test_table1')
    self.assertEqual(result.description.text, '1st test table')
    self.assertEqual(result.database, 'hive')
    self.assertEqual(result.cluster, 'gold')
    self.assertEqual(result.schema, 'test_schema')
    self.assertEqual(result.tags, ['tag1', 'tag2'])
    self.assertEqual(result.is_view, 'false')

    result2 = extractor.extract()
    self.assertEqual(result2.name, 'test_table2')
    self.assertEqual(result2.is_view, 'false')

    result3 = extractor.extract()
    self.assertEqual(result3.name, 'test_view1')
    self.assertEqual(result3.is_view, 'true')
def test_amundsen_dataset_key(self) -> None:
    """
    Test _amundsen_dataset_key method
    """
    config_dict = {
        f'extractor.openlineage_tablelineage.{OpenLineageTableLineageExtractor.TABLE_LINEAGE_FILE_LOCATION}':
            'example/sample_data/openlineage/sample_openlineage_events.ndjson',
        f'extractor.openlineage_tablelineage.{OpenLineageTableLineageExtractor.CLUSTER_NAME}': 'datalab',
    }
    self.conf = ConfigFactory.from_dict(config_dict)
    extractor = OpenLineageTableLineageExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))

    mock_dataset = {
        'name': 'mock_table',
        'namespace': 'postgresql',
        'database': 'testdb',
    }

    self.assertEqual('postgresql://datalab.testdb/mock_table',
                     extractor._amundsen_dataset_key(mock_dataset))
    extractor.ol_namespace_override = 'hive'
    self.assertEqual('hive://datalab.testdb/mock_table',
                     extractor._amundsen_dataset_key(mock_dataset))
def init(self, conf):
    # type: (ConfigTree) -> None
    conf = Scoped.get_scoped_conf(conf, self.get_scope()) \
        .with_fallback(conf) \
        .with_fallback(DEFAULT_CONFIG)
    self.target_nodes = set(conf.get_list(TARGET_NODES))
    self.target_relations = set(conf.get_list(TARGET_RELATIONS))
    self.batch_size = conf.get_int(BATCH_SIZE)
    self.dry_run = conf.get_bool(DRY_RUN)
    self.staleness_pct = conf.get_int(STALENESS_MAX_PCT)
    self.staleness_pct_dict = conf.get(STALENESS_PCT_MAX_DICT)

    if JOB_PUBLISH_TAG in conf and MS_TO_EXPIRE in conf:
        raise Exception('Cannot have both {} and {} in job config'.format(JOB_PUBLISH_TAG, MS_TO_EXPIRE))

    self.ms_to_expire = None
    if MS_TO_EXPIRE in conf:
        self.ms_to_expire = conf.get_int(MS_TO_EXPIRE)
        if self.ms_to_expire < conf.get_int(MIN_MS_TO_EXPIRE):
            raise Exception('{} is too small'.format(MS_TO_EXPIRE))
        self.marker = '(timestamp() - {})'.format(conf.get_int(MS_TO_EXPIRE))
    else:
        self.marker = conf.get_string(JOB_PUBLISH_TAG)

    self._driver = \
        GraphDatabase.driver(conf.get_string(NEO4J_END_POINT_KEY),
                             max_connection_life_time=conf.get_int(NEO4J_MAX_CONN_LIFE_TIME_SEC),
                             auth=(conf.get_string(NEO4J_USER), conf.get_string(NEO4J_PASSWORD)))
def test_basic_extraction(self, mock_build):
    """
    Test Extraction using mock class
    """
    config_dict = {
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
    }
    conf = ConfigFactory.from_dict(config_dict)

    mock_build.return_value = MockLoggingClient(CORRECT_DATA)
    extractor = BigQueryTableUsageExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
    result = extractor.extract()
    self.assertIsInstance(result, tuple)

    (key, value) = result
    self.assertIsInstance(key, TableColumnUsageTuple)
    self.assertIsInstance(value, int)

    self.assertEqual(key.database, 'bigquery')
    self.assertEqual(key.cluster, 'bigquery-public-data')
    self.assertEqual(key.schema, 'austin_incidents')
    self.assertEqual(key.table, 'incidents_2008')
    self.assertEqual(key.email, '*****@*****.**')
    self.assertEqual(value, 1)
def _get_extractor(self, index_names: List[str]) -> Any:
    extractor = ElasticsearchWatermarkExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=self._get_config(index_names), scope=extractor.get_scope()))
    return extractor
def test_timestamp_pagesize_settings(self, mock_build):
    """
    Test timestamp and pagesize can be set
    """
    TIMESTAMP = '2019-01-01T00:00:00.00Z'
    PAGESIZE = 215

    config_dict = {
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.TIMESTAMP_KEY):
            TIMESTAMP,
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PAGE_SIZE_KEY):
            PAGESIZE,
    }
    conf = ConfigFactory.from_dict(config_dict)

    client = MockLoggingClient(CORRECT_DATA)
    mock_build.return_value = client
    extractor = BigQueryTableUsageExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    args, kwargs = client.b.list.call_args
    body = kwargs['body']

    self.assertEqual(body['pageSize'], PAGESIZE)
    self.assertEqual(TIMESTAMP in body['filter'], True)
def init(self, conf):
    conf = conf.with_fallback(self.DEFAULT_CONFIG)
    self._cluster = "{}".format(conf.get_string(self.CLUSTER_KEY))
    self._database = conf.get_string(self.DATABASE_KEY)

    self.sql_stmt = self._get_sql_statement(
        use_catalog_as_cluster_name=conf.get_bool(self.USE_CATALOG_AS_CLUSTER_NAME),
        where_clause_suffix=conf.get_string(self.WHERE_CLAUSE_SUFFIX_KEY),
    )

    self._alchemy_extractor = SQLAlchemyExtractor()
    sql_alch_conf = Scoped.get_scoped_conf(conf, SQLALCHEMY_ENGINE_SCOPE).with_fallback(
        ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
    self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL)

    LOGGER.info("SQL for postgres metadata: %s", self.sql_stmt)

    self._alchemy_extractor.init(sql_alch_conf)
    self._extract_iter: Union[None, Iterator] = None
def init(self, conf: ConfigTree) -> None: self._conf = conf self.query = """query externalTables($externalTableTypes: [String]) { databases (filter: {connectionTypeWithin: $externalTableTypes}) { name connectionType description tables { name } } }""" self.query_variables = { 'externalTableTypes': self._conf.get_list( TableauDashboardExternalTableExtractor.EXTERNAL_TABLE_TYPES) } self._extractor = self._build_extractor() transformers = [] dict_to_model_transformer = DictToModel() dict_to_model_transformer.init(conf=Scoped.get_scoped_conf( self._conf, dict_to_model_transformer.get_scope()).with_fallback( ConfigFactory.from_dict({ MODEL_CLASS: 'databuilder.models.table_metadata.TableMetadata' }))) transformers.append(dict_to_model_transformer) self._transformer = ChainedTransformer(transformers=transformers)
def test_extraction_with_model_class(self: Any, mock_method: Any) -> None:
    """
    Test Extraction using model class
    """
    config_dict = {
        'extractor.sqlalchemy.conn_string': 'TEST_CONNECTION',
        'extractor.sqlalchemy.extract_sql': 'SELECT 1 FROM TEST_TABLE;',
        'extractor.sqlalchemy.model_class':
            'tests.unit.extractor.test_sql_alchemy_extractor.TableMetadataResult',
    }
    self.conf = ConfigFactory.from_dict(config_dict)

    extractor = SQLAlchemyExtractor()
    extractor.results = [
        dict(database='test_database',
             schema='test_schema',
             name='test_table',
             description='test_description',
             column_name='test_column_name',
             column_type='test_column_type',
             column_comment='test_column_comment',
             owner='test_owner')
    ]
    extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))

    result = extractor.extract()

    self.assertIsInstance(result, TableMetadataResult)
    self.assertEqual(result.name, 'test_table')
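# For context on the 'model_class' option exercised above: the extractor typically instantiates
# the configured class with each result row's columns passed as keyword arguments. The real
# tests.unit.extractor.test_sql_alchemy_extractor.TableMetadataResult may differ; the class below
# is only a hypothetical equivalent showing the shape such a row holder needs.
class TableMetadataResult:
    """Plain holder for one row of the metadata query (illustrative sketch only)."""

    def __init__(self,
                 database: str,
                 schema: str,
                 name: str,
                 description: str,
                 column_name: str,
                 column_type: str,
                 column_comment: str,
                 owner: str) -> None:
        self.database = database
        self.schema = schema
        self.name = name
        self.description = description
        self.column_name = column_name
        self.column_type = column_type
        self.column_comment = column_comment
        self.owner = owner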
def init(self, conf: ConfigTree) -> None:
    conf = conf.with_fallback(BasePostgresMetadataExtractor.DEFAULT_CONFIG)
    self._cluster = conf.get_string(BasePostgresMetadataExtractor.CLUSTER_KEY)

    self._database = conf.get_string(BasePostgresMetadataExtractor.DATABASE_KEY, default='postgres')

    self.sql_stmt = self.get_sql_statement(
        use_catalog_as_cluster_name=conf.get_bool(BasePostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME),
        where_clause_suffix=conf.get_string(BasePostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
    )

    self._alchemy_extractor = SQLAlchemyExtractor()
    sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
        .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
    self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL)

    LOGGER.info('SQL for postgres metadata: %s', self.sql_stmt)

    self._alchemy_extractor.init(sql_alch_conf)
    self._extract_iter: Union[None, Iterator] = None
def init(self, conf: ConfigTree) -> None:
    conf = conf.with_fallback(SqliteMetadataExtractor.DEFAULT_CONFIG)
    self._cluster = conf.get_string(SqliteMetadataExtractor.CLUSTER_KEY)

    self._database = conf.get_string(SqliteMetadataExtractor.DATABASE_KEY, default="sqlite")

    self.sql_stmt = SqliteMetadataExtractor.SQL_STATEMENT.format(
        where_clause_suffix=conf.get_string(SqliteMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
        cluster_source=self._cluster,
    )

    self._alchemy_extractor = SQLAlchemyExtractor()
    sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()).with_fallback(
        ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
    self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL)

    LOGGER.info("SQL for sqlite metadata: %s", self.sql_stmt)

    self._alchemy_extractor.init(sql_alch_conf)
    self._extract_iter: Union[None, Iterator] = None
def test_loading_with_single_object(self):
    # type: () -> None
    """
    Test Loading functionality with single python object
    """
    loader = FSElasticsearchJSONLoader()
    loader.init(conf=Scoped.get_scoped_conf(conf=self.conf, scope=loader.get_scope()))

    data = TableESDocument(database='test_database',
                           cluster='test_cluster',
                           schema_name='test_schema',
                           table_name='test_table',
                           table_key='test_table_key',
                           table_last_updated_epoch=123456789,
                           table_description='test_description',
                           column_names=['test_col1', 'test_col2'],
                           column_descriptions=['test_comment1', 'test_comment2'],
                           total_usage=10,
                           unique_usage=5,
                           tag_names=['test_tag1', 'test_tag2'])
    loader.load(data)
    loader.close()

    expected = [(
        '{"table_key": "test_table_key", "column_descriptions": ["test_comment1", "test_comment2"], '
        '"schema_name": "test_schema", "database": "test_database", "cluster": "test_cluster", '
        '"column_names": ["test_col1", "test_col2"], "table_name": "test_table", '
        '"table_last_updated_epoch": 123456789,'
        '"table_description": "test_description", "unique_usage": 5, "total_usage": 10, '
        '"tag_names": ["test_tag1", "test_tag2"]}'
    )]

    self._check_results_helper(expected=expected)
def init(self, conf):
    # type: (ConfigTree) -> None
    conf = conf.with_fallback(SnowflakeMetadataExtractor.DEFAULT_CONFIG)
    self._cluster = '{}'.format(conf.get_string(SnowflakeMetadataExtractor.CLUSTER_KEY))

    if conf.get_bool(SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
        cluster_source = "c.table_catalog"
    else:
        cluster_source = "'{}'".format(self._cluster)

    self._database = conf.get_string(SnowflakeMetadataExtractor.DATABASE_KEY)
    if six.PY2:
        self._database = self._database.encode('utf-8', 'ignore')

    self.sql_stmt = SnowflakeMetadataExtractor.SQL_STATEMENT.format(
        where_clause_suffix=conf.get_string(SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
        cluster_source=cluster_source,
        database=self._database
    )

    LOGGER.info('SQL for snowflake metadata: {}'.format(self.sql_stmt))

    self._alchemy_extractor = SQLAlchemyExtractor()
    sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
        .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

    self._alchemy_extractor.init(sql_alch_conf)
    self._extract_iter = None  # type: Union[None, Iterator]
def init(self, conf: ConfigTree) -> None:
    conf = Scoped.get_scoped_conf(conf, self.get_scope()) \
        .with_fallback(conf) \
        .with_fallback(DEFAULT_CONFIG)
    self.target_nodes = set(conf.get_list(TARGET_NODES))
    self.target_relations = set(conf.get_list(TARGET_RELATIONS))
    self.batch_size = conf.get_int(BATCH_SIZE)
    self.dry_run = conf.get_bool(DRY_RUN)
    self.staleness_pct = conf.get_int(STALENESS_MAX_PCT)
    self.staleness_pct_dict = conf.get(STALENESS_PCT_MAX_DICT)
    self.retain_data_with_no_publisher_metadata = conf.get_bool(RETAIN_DATA_WITH_NO_PUBLISHER_METADATA)

    if JOB_PUBLISH_TAG in conf and MS_TO_EXPIRE in conf:
        raise Exception(f'Cannot have both {JOB_PUBLISH_TAG} and {MS_TO_EXPIRE} in job config')

    self.ms_to_expire = None
    if MS_TO_EXPIRE in conf:
        self.ms_to_expire = conf.get_int(MS_TO_EXPIRE)
        if self.ms_to_expire < conf.get_int(MIN_MS_TO_EXPIRE):
            raise Exception(f'{MS_TO_EXPIRE} is too small')
        self.marker = self.ms_to_expire
    else:
        self.marker = conf.get_string(JOB_PUBLISH_TAG)

    trust = neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES if conf.get_bool(NEO4J_VALIDATE_SSL) \
        else neo4j.TRUST_ALL_CERTIFICATES
    self._driver = \
        GraphDatabase.driver(conf.get_string(NEO4J_END_POINT_KEY),
                             max_connection_life_time=conf.get_int(NEO4J_MAX_CONN_LIFE_TIME_SEC),
                             auth=(conf.get_string(NEO4J_USER), conf.get_string(NEO4J_PASSWORD)),
                             encrypted=conf.get_bool(NEO4J_ENCRYPTED),
                             trust=trust)
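# A hedged configuration sketch for the init() above. The keys reuse the module-level constants
# referenced in init(); the task class name (Neo4jStalenessRemovalTask), the node labels,
# relation types, endpoint, and credentials are illustrative assumptions. Note the rule enforced
# above: supply either JOB_PUBLISH_TAG or MS_TO_EXPIRE; providing both makes init() raise.
task_conf = ConfigFactory.from_dict({
    TARGET_NODES: ['Table', 'Column'],               # node labels to scan for staleness
    TARGET_RELATIONS: ['DESCRIPTION'],               # relation types to scan for staleness
    MS_TO_EXPIRE: 7 * 24 * 60 * 60 * 1000,           # expire anything older than one week
    NEO4J_END_POINT_KEY: 'bolt://localhost:7687',    # hypothetical endpoint
    NEO4J_USER: 'neo4j',
    NEO4J_PASSWORD: 'test',
})

task = Neo4jStalenessRemovalTask()  # assumed class name for the init() shown above
task.init(task_conf)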
def test_extraction_with_multiple_query_result(self):
    # type: (Any) -> None
    """
    Test Extraction with multiple result from query
    """
    with patch.object(Neo4jExtractor, '_get_driver'):
        extractor = Neo4jExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))

        extractor.results = ['test_result1', 'test_result2', 'test_result3']

        result = extractor.extract()
        self.assertEqual(result, 'test_result1')

        result = extractor.extract()
        self.assertEqual(result, 'test_result2')

        result = extractor.extract()
        self.assertEqual(result, 'test_result3')

        # Ensure next result is None
        result = extractor.extract()
        self.assertIsNone(result)
def init(self, conf): # type: (ConfigTree) -> None """ Initialize Neo4jExtractor object from configuration and use that for extraction """ self.conf = conf self.entity = conf.get_string(Neo4jSearchDataExtractor.ENTITY_TYPE, default='table').lower() # extract cypher query from conf, if specified, else use default query if Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY in conf: self.cypher_query = conf.get_string( Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY) else: default_query = Neo4jSearchDataExtractor.DEFAULT_QUERY_BY_ENTITY[ self.entity] self.cypher_query = self._add_publish_tag_filter( conf.get_string(JOB_PUBLISH_TAG, ''), cypher_query=default_query) self.neo4j_extractor = Neo4jExtractor() # write the cypher query in configs in Neo4jExtractor scope key = self.neo4j_extractor.get_scope( ) + '.' + Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY self.conf.put(key, self.cypher_query) # initialize neo4j_extractor from configs self.neo4j_extractor.init( Scoped.get_scoped_conf(self.conf, self.neo4j_extractor.get_scope()))
def test_empty_dataset(self, mock_build: Any) -> None:
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, NO_TABLES, None)
    extractor = BigQueryWatermarkExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))
    result = extractor.extract()
    self.assertIsNone(result)