def test_from_dict_with_ordered_dict(self):
    d = OrderedDict()
    d["banana"] = 3
    d["apple"] = 4
    d["pear"] = 1
    d["orange"] = 2
    config = ConfigFactory.from_dict(d)
    assert config == d

def test_from_dict_with_dict(self):
    d = {
        'banana': 3,
        'apple': 4,
        'pear': 1,
        'orange': 2,
    }
    config = ConfigFactory.from_dict(d)
    assert config == d

def test_from_dict_with_nested_dict(self):
    d = OrderedDict()
    d['banana'] = 3
    d['apple'] = 4
    d['pear'] = 1
    d['tree'] = {
        'a': 'abc\ntest\n',
        'b': [1, 2, 3]
    }
    config = ConfigFactory.from_dict(d)
    assert config == d

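# Companion sketch (not one of the tests above): ConfigFactory.from_dict returns
# a pyhocon ConfigTree, so nested values are reachable both dict-style and via
# dotted paths, which is what makes the equality asserts above meaningful.
from pyhocon import ConfigFactory

config = ConfigFactory.from_dict({'tree': {'a': 'abc', 'b': [1, 2, 3]}})
assert config.get('tree.a') == 'abc'      # dotted-path access
assert config['tree']['b'] == [1, 2, 3]   # plain mapping access
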
def update_parameters(parameters, template_variables, instance_path, param_type):
    updated_parameters = deepcopy(parameters)
    info_variables = set(updated_parameters.keys())
    for variable in info_variables.union(template_variables):
        if variable not in info_variables:
            print("Adding missing variable '%s' to the parameterInfos of instance %s"
                  % (variable, instance_path))
            updated_parameters[variable] = ConfigFactory.from_dict({"type": param_type})
        elif "type" not in updated_parameters[variable]:
            print("Adding missing type for '%s' to the parameterInfos of instance %s"
                  % (variable, instance_path))
            updated_parameters[variable]["type"] = param_type
    return updated_parameters

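# Hypothetical usage of update_parameters; the parameter names and instance path
# below are illustrative only, not taken from any real pipeline.
from copy import deepcopy  # update_parameters above relies on this import

from pyhocon import ConfigFactory

existing = {'threshold': ConfigFactory.from_dict({'type': 'int'})}
updated = update_parameters(
    parameters=existing,
    template_variables={'threshold', 'timeout'},
    instance_path='instances/example',
    param_type='string',
)
# 'timeout' was missing and is added with type 'string';
# 'threshold' keeps its explicit 'int' type.
assert updated['timeout']['type'] == 'string'
assert updated['threshold']['type'] == 'int'
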
def configure_splice_machine_extractors(connection: ConnectionConfigSchema):
    Extractor = SpliceMachineMetadataExtractor
    extractor = Extractor()
    scope = extractor.get_scope()
    conf = ConfigFactory.from_dict(
        {
            f"{scope}.{Extractor.HOST_KEY}": connection.uri,
            f"{scope}.{Extractor.USERNAME_KEY}": connection.username,
            f"{scope}.{Extractor.PASSWORD_KEY}": connection.password,
            f"{scope}.{Extractor.WHERE_CLAUSE_SUFFIX_KEY}": connection.where_clause_suffix,
        }
    )
    extractors = [extractor]
    # extractors, conf = add_ugc_runner(extractors, conf, connection)
    return extractors, conf

def test_hive_sql_statement_with_custom_sql(self) -> None:
    """
    Test extraction when a custom SQL statement is provided.
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        config_dict = {
            HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix,
            'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): 'TEST_CONNECTION',
            HiveTableMetadataExtractor.EXTRACT_SQL: 'select sth for test {where_clause_suffix}'
        }
        conf = ConfigFactory.from_dict(config_dict)
        extractor = HiveTableMetadataExtractor()
        extractor.init(conf)
        self.assertTrue('select sth for test' in extractor.sql_stmt)

def configure_hive_metastore_extractors(connection: ConnectionConfigSchema):
    Extractor = HiveTableMetadataExtractor
    extractor = Extractor()
    scope = extractor.get_scope()
    conn_string_key = get_sql_alchemy_conn_string_key(scope)
    conf = ConfigFactory.from_dict({
        conn_string_key: connection.conn_string,
        f"{scope}.{Extractor.CLUSTER_KEY}": connection.cluster,
        # f"{scope}.{Extractor.DATABASE_KEY}": connection.name,  # TODO: Modify metastore connector to work
        f"{scope}.{Extractor.WHERE_CLAUSE_SUFFIX_KEY}": connection.where_clause_suffix,
    })
    extractors = [extractor]
    return extractors, conf

def test_keypath_and_pagesize_can_be_set(self, mock_build: Any) -> None:
    config_dict = {
        'extractor.bigquery_table_metadata.{}'.format(
            BigQueryMetadataExtractor.PROJECT_ID_KEY): 'your-project-here',
        'extractor.bigquery_table_metadata.{}'.format(
            BigQueryMetadataExtractor.PAGE_SIZE_KEY): 200,
        'extractor.bigquery_table_metadata.{}'.format(
            BigQueryMetadataExtractor.KEY_PATH_KEY): '/tmp/doesnotexist',
    }
    conf = ConfigFactory.from_dict(config_dict)

    mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
    extractor = BigQueryMetadataExtractor()

    with self.assertRaises(FileNotFoundError):
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

def _create_snowflake_extractor(
    source: CatSource,
) -> Tuple[SnowflakeMetadataExtractor, Any]:
    extractor = SnowflakeMetadataExtractor()
    scope = extractor.get_scope()
    conn_string_key = f"{scope}.{SQLAlchemyExtractor().get_scope()}.{SQLAlchemyExtractor.CONN_STRING}"
    conf = ConfigFactory.from_dict(
        {
            conn_string_key: source.conn_string,
            f"{scope}.{SnowflakeMetadataExtractor.CLUSTER_KEY}": source.cluster,
            f"{scope}.{SnowflakeMetadataExtractor.DATABASE_KEY}": source.database,
            f"{scope}.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}": source.database,
            # f"{scope}.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}": connection.where_clause_suffix,
        }
    )
    return extractor, conf

def test_collapse_config_env_vars_empty_env(self):
    # empty env
    config = ConfigFactory.from_dict({
        "facade": {
            "base_url": "http://base.url",
            "base_url~qa": "http://base-qa.url"
        }
    })

    config = configs.collapse_environment(config, env="")
    config.get("facade.base_url").should.be.equal("http://base.url")
    config.get("facade.base_url~qa").should.be.equal("http://base-qa.url")

    config = configs.collapse_environment(config, env=None)
    config.get("facade.base_url").should.be.equal("http://base.url")
    config.get("facade.base_url~qa").should.be.equal("http://base-qa.url")

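# Companion sketch for the non-empty case. Assumption: collapse_environment folds
# a matching `key~env` override onto the base key, as the `~qa` suffix above
# suggests; this is inferred from the test, not confirmed from the library.
config = ConfigFactory.from_dict({
    "facade": {
        "base_url": "http://base.url",
        "base_url~qa": "http://base-qa.url"
    }
})
config = configs.collapse_environment(config, env="qa")
config.get("facade.base_url").should.be.equal("http://base-qa.url")
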
def test_conversion(self):
    # type: () -> None
    transformer = RemoveFieldTransformer()
    config = ConfigFactory.from_dict({
        FIELD_NAMES: ['foo', 'bar'],
    })
    transformer.init(conf=config)

    actual = transformer.transform({
        'foo': 'foo_val',
        'bar': 'bar_val',
        'baz': 'baz_val',
    })
    expected = {
        'baz': 'baz_val'
    }
    self.assertDictEqual(expected, actual)

def init(self, conf):
    # type: (ConfigTree) -> None
    conf = conf.with_fallback(PrestoViewMetadataExtractor.DEFAULT_CONFIG)
    self._cluster = '{}'.format(
        conf.get_string(PrestoViewMetadataExtractor.CLUSTER_KEY))

    self.sql_stmt = PrestoViewMetadataExtractor.SQL_STATEMENT.format(
        where_clause_suffix=conf.get_string(
            PrestoViewMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY))

    LOGGER.info('SQL for hive metastore: {}'.format(self.sql_stmt))

    self._alchemy_extractor = SQLAlchemyExtractor()
    sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
        .with_fallback(ConfigFactory.from_dict(
            {SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
    self._alchemy_extractor.init(sql_alch_conf)
    self._extract_iter = None  # type: Union[None, Iterator]

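# Minimal sketch of the scoping-plus-fallback pattern used in init() above.
# Assumes the databuilder constants SQLAlchemyExtractor.CONN_STRING ==
# 'conn_string' and SQLAlchemyExtractor.EXTRACT_SQL == 'extract_sql'.
from databuilder import Scoped
from pyhocon import ConfigFactory

outer = ConfigFactory.from_dict({
    'extractor.sqlalchemy.conn_string': 'presto://host:8080/hive',
})
# get_scoped_conf strips the scope prefix; with_fallback supplies the generated SQL.
inner = Scoped.get_scoped_conf(outer, 'extractor.sqlalchemy') \
    .with_fallback(ConfigFactory.from_dict({'extract_sql': 'SELECT 1'}))
assert inner.get_string('conn_string') == 'presto://host:8080/hive'
assert inner.get_string('extract_sql') == 'SELECT 1'  # came from the fallback
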
def configure_neo4j_extractors(connection: ConnectionConfigSchema):
    extractor = AmundsenNeo4jMetadataExtractor()
    scope = extractor.get_scope()
    conf = ConfigFactory.from_dict(
        {
            f"{scope}.graph_url": connection.conn_string,
            f"{scope}.neo4j_auth_user": connection.username,
            f"{scope}.neo4j_auth_pw": connection.password,
            f"{scope}.included_keys": connection.included_keys,
            f"{scope}.excluded_keys": connection.excluded_keys,
            f"{scope}.included_key_regex": connection.included_key_regex,
            f"{scope}.excluded_key_regex": connection.excluded_key_regex,
        }
    )
    extractors = [extractor]
    return extractors, conf

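# Hedged sketch of how the returned pair might be consumed downstream, assuming
# `connection` is an already-built ConnectionConfigSchema; the Scoped pattern
# mirrors the extractor.init calls visible elsewhere in these snippets.
extractors, conf = configure_neo4j_extractors(connection)
for extractor in extractors:
    extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
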
def create_sample_dremio_job():
    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    extractor = DremioMetadataExtractor()
    loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor=extractor, loader=loader)

    job_config = ConfigFactory.from_dict({
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_USER_KEY}': DREMIO_USER,
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_PASSWORD_KEY}': DREMIO_PASSWORD,
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_HOST_KEY}': DREMIO_HOST,
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4J_USER,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4J_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job

def run_mssql_job():
    where_clause_suffix = textwrap.dedent("""
        ('dbo')
    """)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(
        tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        # MSSQL loader
        'extractor.mssql_metadata.{}'.format(
            MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause_suffix,
        'extractor.mssql_metadata.{}'.format(
            MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): True,
        'extractor.mssql_metadata.extractor.sqlalchemy.{}'.format(
            SQLAlchemyExtractor.CONN_STRING): connection_string(),
        # Neo4j loader
        'loader.filesystem_csv_neo4j.{}'.format(
            FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(
            FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'publisher.neo4j.{}'.format(
            neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(
            neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(
            neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        # should use a unique tag here, e.g. {ds}
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',
    })

    job = DefaultJob(
        conf=job_config,
        task=DefaultTask(
            extractor=MSSQLMetadataExtractor(),
            loader=FsNeo4jCSVLoader()),
        publisher=Neo4jCsvPublisher())
    return job

def test_validation_threshold_override(self) -> None:
    with patch.object(GraphDatabase, 'driver'):
        task = Neo4jStalenessRemovalTask()
        job_config = ConfigFactory.from_dict({
            'job.identifier': 'remove_stale_data_job',
            '{}.{}'.format(task.get_scope(),
                           neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): 'foobar',
            '{}.{}'.format(task.get_scope(),
                           neo4j_staleness_removal_task.NEO4J_USER): 'foo',
            '{}.{}'.format(task.get_scope(),
                           neo4j_staleness_removal_task.NEO4J_PASSWORD): 'bar',
            '{}.{}'.format(task.get_scope(),
                           neo4j_staleness_removal_task.STALENESS_MAX_PCT): 5,
            '{}.{}'.format(task.get_scope(),
                           neo4j_staleness_removal_task.STALENESS_PCT_MAX_DICT): {'foo': 51},
            neo4j_csv_publisher.JOB_PUBLISH_TAG: 'foo'
        })
        task.init(job_config)

        # 'foo' is 50% stale, under its per-type override of 51%;
        # 'bar' is 3% stale, under the global maximum of 5% — so neither raises.
        total_records = [{'type': 'foo', 'count': 100},
                         {'type': 'bar', 'count': 100}]
        stale_records = [{'type': 'foo', 'count': 50},
                         {'type': 'bar', 'count': 3}]
        targets = {'foo', 'bar'}
        task._validate_staleness_pct(total_records=total_records,
                                     stale_records=stale_records,
                                     types=targets)

def run_bq_tu_job(job_name):
    # where_clause_suffix = " "
    gcloud_project = "bpy---pedidosya"
    # label_filter = ""
    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    bq_usage_extractor = BigQueryTableUsageExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_usage_extractor,
                       loader=csv_loader,
                       transformer=BigqueryUsageTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_usage.{}'.format(
            BigQueryTableUsageExtractor.PROJECT_ID_KEY): gcloud_project,
        'loader.filesystem_csv_neo4j.{}'.format(
            FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(
            FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(
            FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        # should use a unique tag here, e.g. {ds}
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()

class MetaframeLoader(Loader):
    """
    Loader class that formats metadata as a markdown doc for metaframe.
    """
    DEFAULT_CONFIG = ConfigFactory.from_dict(
        {'base_directory': os.path.join(Path.home(), '.metaframe/metadata/')})

    def init(self, conf: ConfigTree):
        self.conf = conf.with_fallback(MetaframeLoader.DEFAULT_CONFIG)
        self.base_directory = self.conf.get_string('base_directory')
        self.database_name = self.conf.get_string('database_name', None)
        Path(self.base_directory).mkdir(parents=True, exist_ok=True)

    def load(self, record):
        # type: (Any) -> None
        """
        Write the record object as a markdown file.
        :param record:
        :return:
        """
        if not record:
            return

        table_file_path_base = get_table_file_path_base(
            database=self.database_name or record.database,
            cluster=record.cluster,
            schema=record.schema,
            table=record.name)

        file_path = table_file_path_base + '.md'
        file_path_docs = table_file_path_base + '.docs.md'
        subdirectory = '/'.join(file_path.split('/')[:-1])
        Path(subdirectory).mkdir(parents=True, exist_ok=True)
        Path(file_path_docs).touch()

        with open(file_path, 'w') as f:
            f.write(record.markdown_blob)

    def close(self):
        pass

    def get_scope(self):
        # type: () -> str
        return "loader.metaframe"

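# Illustrative wiring for MetaframeLoader, following the Scoped pattern used by
# the other components in these snippets; the base directory and database name
# below are made up for the example.
loader = MetaframeLoader()
conf = ConfigFactory.from_dict({
    f'{loader.get_scope()}.base_directory': '/tmp/metaframe/metadata/',
    f'{loader.get_scope()}.database_name': 'postgres',
})
loader.init(Scoped.get_scoped_conf(conf=conf, scope=loader.get_scope()))
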
def test_preprocessor(self) -> None:
    with patch.object(GraphDatabase, 'driver') as mock_driver:
        mock_session = MagicMock()
        mock_driver.return_value.session.return_value = mock_session

        mock_transaction = MagicMock()
        mock_session.begin_transaction.return_value = mock_transaction

        mock_run = MagicMock()
        mock_transaction.run = mock_run
        mock_commit = MagicMock()
        mock_transaction.commit = mock_commit

        mock_preprocessor = MagicMock()
        mock_preprocessor.is_perform_preprocess.return_value = MagicMock(
            return_value=True)
        mock_preprocessor.preprocess_cypher.return_value = (
            'MATCH (f:Foo) RETURN f', {})

        publisher = Neo4jCsvPublisher()

        conf = ConfigFactory.from_dict({
            neo4j_csv_publisher.NEO4J_END_POINT_KEY: 'dummy://999.999.999.999:7687/',
            neo4j_csv_publisher.NODE_FILES_DIR: f'{self._resource_path}/nodes',
            neo4j_csv_publisher.RELATION_FILES_DIR: f'{self._resource_path}/relations',
            neo4j_csv_publisher.RELATION_PREPROCESSOR: mock_preprocessor,
            neo4j_csv_publisher.NEO4J_USER: '******',
            neo4j_csv_publisher.NEO4J_PASSWORD: '******',
            neo4j_csv_publisher.JOB_PUBLISH_TAG: str(uuid.uuid4())
        })
        publisher.init(conf)
        publisher.publish()

        self.assertEqual(mock_run.call_count, 8)  # 2 node files, 1 relation file
        self.assertEqual(mock_commit.call_count, 1)

def test_extraction_with_model_class(self):
    # type: (Any) -> None
    """
    Test extraction using a model class.
    """
    config_dict = {
        'extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): 'TEST_GRAPH_URL',
        'extractor.neo4j.{}'.format(Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY): 'TEST_QUERY',
        'extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): 'TEST_USER',
        'extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): 'TEST_PW',
        'extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY):
            'databuilder.models.table_elasticsearch_document.TableESDocument'
    }
    self.conf = ConfigFactory.from_dict(config_dict)

    with patch.object(Neo4jExtractor, '_get_driver'):
        extractor = Neo4jExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))

        result_dict = dict(database='test_database',
                           cluster='test_cluster',
                           schema='test_schema',
                           name='test_table_name',
                           display_name='test_schema.test_table_name',
                           key='test_table_key',
                           description='test_table_description',
                           last_updated_timestamp=123456789,
                           column_names=['test_col1', 'test_col2', 'test_col3'],
                           column_descriptions=['test_description1', 'test_description2', ''],
                           total_usage=100,
                           unique_usage=5,
                           tags=['hive'],
                           badges=['badge1'],
                           schema_description='schema_description',
                           programmatic_descriptions=['TEST'])

        extractor.results = [result_dict]
        result_obj = extractor.extract()
        self.assertIsInstance(result_obj, TableESDocument)
        self.assertDictEqual(vars(result_obj), result_dict)

def create_sample_job(table_name, model_name):
    sql = textwrap.dedent("""
        select * from {table_name};
    """).format(table_name=table_name)

    tmp_folder = '/var/tmp/amundsen/{table_name}'.format(table_name=table_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    sql_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): SQLITE_CONN_STRING,
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.EXTRACT_SQL): sql,
        'extractor.sqlalchemy.model_class': model_name,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job

def test_accepts_dataset_filter_by_label(self, mock_datacatalogue, mock_bigquery):
    config_dict = {
        "extractor.bigquery_table_metadata.{}".format(
            BigQueryMetadataExtractor.PROJECT_ID_KEY): "your-project-here",
        "extractor.bigquery_table_metadata.{}".format(
            BigQueryMetadataExtractor.FILTER_KEY): "label.key:value",
    }
    conf = ConfigFactory.from_dict(config_dict)

    mock_bigquery.return_value = MockBigQueryClient(
        ONE_DATASET, ONE_TABLE, TABLE_DATA)
    mock_datacatalogue.DataCatalogClient.return_value = MockDataCatalogClient(
        ENTRY, TAGS)

    extractor = BigQueryMetadataExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
    result = extractor.extract()

    self.assertIsInstance(result, TableMetadata)

def test_static_data(self):
    # type: (...) -> None
    conf = ConfigFactory.from_dict({
        REST_API_QUERY: RestApiQuerySeed(seed_record=[{'foo': 'bar'}]),
        STATIC_RECORD_DICT: {'john': 'doe'}
    })

    extractor = RestAPIExtractor()
    extractor.init(conf=conf)
    record = extractor.extract()

    expected = {'foo': 'bar', 'john': 'doe'}
    self.assertDictEqual(expected, record)

def configure_redshift_extractors(connection: ConnectionConfigSchema):
    Extractor = RedshiftMetadataExtractor
    extractor = Extractor()
    scope = extractor.get_scope()
    conn_string_key = get_sql_alchemy_conn_string_key(scope)
    conf = ConfigFactory.from_dict(
        {
            conn_string_key: connection.conn_string,
            f"{scope}.{Extractor.CLUSTER_KEY}": connection.cluster,
            f"{scope}.{Extractor.DATABASE_KEY}": connection.name,
            f"{scope}.{Extractor.WHERE_CLAUSE_SUFFIX_KEY}": connection.where_clause_suffix,
        }
    )
    extractors = [extractor]
    extractors, conf = add_metrics(extractors, conf, connection)
    return extractors, conf

def create_es_publisher_sample_job():
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # Elasticsearch client instance
    elasticsearch_client = es
    # unique name of the new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    elasticsearch_new_index_key_type = 'table'
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    elasticsearch_index_alias = 'table_search_index'

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY):
            'databuilder.models.table_elasticsearch_document.TableESDocument',
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY):
            extracted_search_data_path,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY):
            extracted_search_data_path,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY):
            elasticsearch_client,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY):
            elasticsearch_new_index_key,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY):
            elasticsearch_new_index_key_type,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY):
            elasticsearch_index_alias
    })

    job = DefaultJob(conf=job_config, task=task, publisher=ElasticsearchPublisher())
    job.launch()

def test_init_not_called(self) -> None:
    mock_transformer1 = MagicMock()
    mock_transformer1.transform.return_value = "foo"
    mock_transformer2 = MagicMock()
    mock_transformer2.transform.return_value = "bar"

    chained_transformer = ChainedTransformer(
        transformers=[mock_transformer1, mock_transformer2])

    config = ConfigFactory.from_dict({})
    chained_transformer.init(conf=config)
    next(chained_transformer.transform({"foo": "bar"}))

    mock_transformer1.init.assert_not_called()
    mock_transformer1.transform.assert_called_once()
    mock_transformer2.init.assert_not_called()
    mock_transformer2.transform.assert_called_once()

def test_extraction_with_filter_conf(self, mock_columns, mock_tables, mock_keyspaces):
    # type: () -> None
    mock_keyspaces.return_value = {'test_schema': None}
    mock_tables.return_value = {'test_table': None}
    columns_dict = OrderedDict()
    columns_dict['id'] = CassandraColumnMetadata(None, 'id', 'int')
    columns_dict['txt'] = CassandraColumnMetadata(None, 'txt', 'text')
    mock_columns.return_value = columns_dict

    # filter out any keyspace or table whose name contains 'test'
    def filter_function(k, t):
        return False if 'test' in k or 'test' in t else True

    conf = ConfigFactory.from_dict(
        {CassandraExtractor.FILTER_FUNCTION_KEY: filter_function})

    extractor = CassandraExtractor()
    extractor.init(conf)
    self.assertIsNone(extractor.extract())

def setUp(self) -> None:
    self.test_file_path = 'test_publisher_file.json'
    self.test_file_mode = 'r'
    self.mock_es_client = MagicMock()
    self.test_es_new_index = 'test_new_index'
    self.test_es_alias = 'test_index_alias'
    self.test_doc_type = 'test_doc_type'

    config_dict = {
        'publisher.elasticsearch.file_path': self.test_file_path,
        'publisher.elasticsearch.mode': self.test_file_mode,
        'publisher.elasticsearch.client': self.mock_es_client,
        'publisher.elasticsearch.new_index': self.test_es_new_index,
        'publisher.elasticsearch.alias': self.test_es_alias,
        'publisher.elasticsearch.doc_type': self.test_doc_type
    }
    self.conf = ConfigFactory.from_dict(config_dict)

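# Hedged sketch of how self.conf above would typically be consumed in the tests
# that follow this setUp, assuming ElasticsearchPublisher exposes get_scope()
# like the other components in these snippets:
publisher = ElasticsearchPublisher()
publisher.init(Scoped.get_scoped_conf(conf=self.conf, scope=publisher.get_scope()))
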
def test_default_search_query(self: Any) -> None:
    with patch.object(Neo4jExtractor, '_get_driver'):
        extractor = Neo4jSearchDataExtractor()
        conf = ConfigFactory.from_dict({
            f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'test-endpoint',
            f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': 'test-user',
            f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': 'test-passwd',
            f'extractor.search_data.{Neo4jSearchDataExtractor.ENTITY_TYPE}': 'dashboard',
        })
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

        self.assertEqual(
            extractor.cypher_query,
            Neo4jSearchDataExtractor.DEFAULT_NEO4J_DASHBOARD_CYPHER_QUERY.format(
                publish_tag_filter=''))

def test_merge_defaults_reference_conf_empty(self):
    application_conf = ConfigFactory.from_dict({
        "facade": {
            "base_url": "http://config1.url"
        },
        "logging": {
            "config": "config1.yaml"
        }
    })
    reference_conf = ConfigFactory.parse_file("unexist_referece_conf_file", required=False)

    config = configs.merge_configs(reference_conf, application_conf)

    # facade.base_url overridden by application_conf
    config.get("facade.base_url").should.be.equal("http://config1.url")
    # logging.config merged
    config.get("logging.config").should.be.equal("config1.yaml")

def _config_dict_to_text(config):
    if not isinstance(config, dict):
        raise ValueError("Model configuration only supports dictionary objects")
    try:
        # Hack: pyhocon is not very good with dict conversion, so round-trip through JSON.
        try:
            import json
            text = json.dumps(config)
            text = HOCONConverter.convert(ConfigFactory.parse_string(text), 'hocon')
        except Exception:
            # fall back to pyhocon's direct dict conversion
            text = HOCONConverter.convert(ConfigFactory.from_dict(config), 'hocon')
    except Exception:
        raise ValueError("Could not serialize configuration dictionary:\n", config)
    return text

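# Minimal usage sketch for _config_dict_to_text; the exact layout of the output
# is whatever pyhocon's HOCONConverter emits.
text = _config_dict_to_text({'model': {'layers': 4, 'activation': 'relu'}})
print(text)
# Expected shape (approximate):
#   model {
#     layers = 4
#     activation = "relu"
#   }
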
def configure_spanner_extractors(connection: ConnectionConfigSchema):
    Extractor = SpannerMetadataExtractor
    extractor = Extractor()
    scope = extractor.get_scope()
    conf = ConfigFactory.from_dict(
        {
            f"{scope}.{Extractor.CONNECTION_NAME_KEY}": connection.name,
            f"{scope}.{Extractor.DATABASE_ID_KEY}": connection.database,
            f"{scope}.{Extractor.INSTANCE_ID_KEY}": connection.instance,
            f"{scope}.{Extractor.KEY_PATH_KEY}": connection.key_path,
            f"{scope}.{Extractor.PROJECT_ID_KEY}": connection.project_id,
        }
    )
    extractors = [extractor]
    extractors, conf = add_metrics(extractors, conf, connection)
    return extractors, conf

def test_extraction_table_lineage(self) -> None:
    """
    Test table lineage extraction using the model class.
    """
    config_dict = {
        f'extractor.csvtablelineage.{CsvTableLineageExtractor.TABLE_LINEAGE_FILE_LOCATION}':
            'example/sample_data/sample_table_lineage.csv'
    }
    self.conf = ConfigFactory.from_dict(config_dict)

    extractor = CsvTableLineageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))

    result = extractor.extract()
    self.assertEqual(result.table_key, 'hive://gold.test_schema/test_table1')
    self.assertEqual(result.downstream_deps, ['dynamo://gold.test_schema/test_table2'])

def test_model_construction(self) -> None:
    conf = ConfigFactory.from_dict(
        {
            REST_API_QUERY: RestApiQuerySeed(
                seed_record=[{'dashboard_group': 'foo',
                              'dashboard_name': 'bar',
                              'description': 'john',
                              'dashboard_group_description': 'doe'}]),
            MODEL_CLASS: 'databuilder.models.dashboard.dashboard_metadata.DashboardMetadata',
        }
    )
    extractor = RestAPIExtractor()
    extractor.init(conf=conf)
    record = extractor.extract()

    expected = DashboardMetadata(dashboard_group='foo',
                                 dashboard_name='bar',
                                 description='john',
                                 dashboard_group_description='doe')
    self.assertEqual(expected.__repr__(), record.__repr__())