def test_from_dict_with_ordered_dict(self):
    """ConfigFactory.from_dict should round-trip an OrderedDict unchanged."""
    pairs = [("banana", 3), ("apple", 4), ("pear", 1), ("orange", 2)]
    source = OrderedDict(pairs)
    assert ConfigFactory.from_dict(source) == source
Example #2
0
 def test_from_dict_with_ordered_dict(self):
     """Round-trip an OrderedDict through ConfigFactory.from_dict."""
     source = OrderedDict()
     for key, value in (('banana', 3), ('apple', 4), ('pear', 1), ('orange', 2)):
         source[key] = value
     result = ConfigFactory.from_dict(source)
     assert result == source
Example #3
0
 def test_from_dict_with_dict(self):
     """A plain dict converts to an equal config."""
     source = dict(banana=3, apple=4, pear=1, orange=2)
     result = ConfigFactory.from_dict(source)
     assert result == source
Example #4
0
 def test_from_dict_with_nested_dict(self):
     """Nested dict and list values must survive the conversion."""
     source = OrderedDict([
         ('banana', 3),
         ('apple', 4),
         ('pear', 1),
         ('tree', {'a': 'abc\ntest\n', 'b': [1, 2, 3]}),
     ])
     result = ConfigFactory.from_dict(source)
     assert result == source
def update_parameters(parameters, template_variables, instance_path, param_type):
    """Return a copy of *parameters* covering every template variable with a type.

    Variables listed in *template_variables* but absent from *parameters* are
    added as ``{"type": param_type}``; existing entries missing a ``type`` key
    get *param_type* filled in.  The input mapping is never mutated.
    """
    result = deepcopy(parameters)
    known = set(result)
    for name in known | set(template_variables):
        if name not in known:
            print("Adding missing variable '%s' to the parameterInfos of instance %s" % (name, instance_path))
            result[name] = ConfigFactory.from_dict({"type": param_type})
        elif "type" not in result[name]:
            print("Adding missing type for '%s' to the parameterInfos of instance %s" % (name, instance_path))
            result[name]["type"] = param_type
    return result
Example #6
0
def configure_splice_machine_extractors(connection: ConnectionConfigSchema):
    """Build a Splice Machine metadata extractor and its scoped config."""
    extractor = SpliceMachineMetadataExtractor()
    scope = extractor.get_scope()
    cls = SpliceMachineMetadataExtractor

    settings = {
        f"{scope}.{cls.HOST_KEY}": connection.uri,
        f"{scope}.{cls.USERNAME_KEY}": connection.username,
        f"{scope}.{cls.PASSWORD_KEY}": connection.password,
        f"{scope}.{cls.WHERE_CLAUSE_SUFFIX_KEY}": connection.where_clause_suffix,
    }
    conf = ConfigFactory.from_dict(settings)

    # extractors, conf = add_ugc_runner(extractors, conf, connection)
    return [extractor], conf
 def test_hive_sql_statement_with_custom_sql(self) -> None:
     """
     A user-supplied EXTRACT_SQL must override the default statement.
     :return:
     """
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         conf = ConfigFactory.from_dict({
             HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix,
             'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): 'TEST_CONNECTION',
             HiveTableMetadataExtractor.EXTRACT_SQL: 'select sth for test {where_clause_suffix}',
         })
         extractor = HiveTableMetadataExtractor()
         extractor.init(conf)
         self.assertTrue('select sth for test' in extractor.sql_stmt)
Example #8
0
def configure_hive_metastore_extractors(connection: ConnectionConfigSchema):
    """Create a Hive metastore table-metadata extractor plus its config."""
    extractor = HiveTableMetadataExtractor()
    scope = extractor.get_scope()

    settings = {
        get_sql_alchemy_conn_string_key(scope): connection.conn_string,
        f"{scope}.{HiveTableMetadataExtractor.CLUSTER_KEY}": connection.cluster,
        # f"{scope}.{HiveTableMetadataExtractor.DATABASE_KEY}": connection.name,  # TODO: Modify metastore connector to work
        f"{scope}.{HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}": connection.where_clause_suffix,
    }
    return [extractor], ConfigFactory.from_dict(settings)
Example #9
0
    def test_keypath_and_pagesize_can_be_set(self, mock_build: Any) -> None:
        """A nonexistent key path must surface as FileNotFoundError in init()."""
        prefix = 'extractor.bigquery_table_metadata'
        conf = ConfigFactory.from_dict({
            '{}.{}'.format(prefix, BigQueryMetadataExtractor.PROJECT_ID_KEY): 'your-project-here',
            '{}.{}'.format(prefix, BigQueryMetadataExtractor.PAGE_SIZE_KEY): 200,
            '{}.{}'.format(prefix, BigQueryMetadataExtractor.KEY_PATH_KEY): '/tmp/doesnotexist',
        })

        mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
        extractor = BigQueryMetadataExtractor()

        with self.assertRaises(FileNotFoundError):
            extractor.init(
                Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
Example #10
0
File: db.py Project: vrajat/dbcat
    def _create_snowflake_extractor(
        source: CatSource,
    ) -> Tuple[SnowflakeMetadataExtractor, Any]:
        """Build a Snowflake metadata extractor and its pyhocon config."""
        extractor = SnowflakeMetadataExtractor()
        scope = extractor.get_scope()
        sqlalchemy_scope = SQLAlchemyExtractor().get_scope()
        conn_string_key = f"{scope}.{sqlalchemy_scope}.{SQLAlchemyExtractor.CONN_STRING}"

        settings = {
            conn_string_key: source.conn_string,
            f"{scope}.{SnowflakeMetadataExtractor.CLUSTER_KEY}": source.cluster,
            f"{scope}.{SnowflakeMetadataExtractor.DATABASE_KEY}": source.database,
            f"{scope}.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}": source.database,
            # f"{scope}.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}": connection.where_clause_suffix,
        }
        return extractor, ConfigFactory.from_dict(settings)
Example #11
0
    def test_collapse_config_env_vars_empty_env(self):
        """Neither "" nor None as env should alter env-qualified keys."""
        config = ConfigFactory.from_dict({
            "facade": {
                "base_url": "http://base.url",
                "base_url~qa": "http://base-qa.url"
            }
        })

        # Collapse twice, once per degenerate env value, asserting after each.
        for env in ("", None):
            config = configs.collapse_environment(config, env=env)
            config.get("facade.base_url").should.be.equal("http://base.url")
            config.get("facade.base_url~qa").should.be.equal("http://base-qa.url")
    def test_conversion(self):
        # type: () -> None
        """RemoveFieldTransformer drops the configured keys and keeps the rest."""
        transformer = RemoveFieldTransformer()
        transformer.init(conf=ConfigFactory.from_dict({FIELD_NAMES: ['foo', 'bar']}))

        record = {'foo': 'foo_val', 'bar': 'bar_val', 'baz': 'baz_val'}
        actual = transformer.transform(record)

        self.assertDictEqual({'baz': 'baz_val'}, actual)
    def init(self, conf):
        # type: (ConfigTree) -> None
        """Prepare the view-extraction SQL and the wrapped SQLAlchemy extractor."""
        conf = conf.with_fallback(PrestoViewMetadataExtractor.DEFAULT_CONFIG)
        cluster = conf.get_string(PrestoViewMetadataExtractor.CLUSTER_KEY)
        self._cluster = '{}'.format(cluster)

        suffix = conf.get_string(PrestoViewMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY)
        self.sql_stmt = PrestoViewMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=suffix)
        LOGGER.info('SQL for hive metastore: {}'.format(self.sql_stmt))

        self._alchemy_extractor = SQLAlchemyExtractor()
        fallback = ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt})
        scoped = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())
        self._alchemy_extractor.init(scoped.with_fallback(fallback))
        self._extract_iter = None  # type: Union[None, Iterator]
Example #14
0
def configure_neo4j_extractors(connection: ConnectionConfigSchema):
    """Wire up an Amundsen Neo4j metadata extractor from a connection config."""
    extractor = AmundsenNeo4jMetadataExtractor()
    scope = extractor.get_scope()

    settings = {
        "graph_url": connection.conn_string,
        "neo4j_auth_user": connection.username,
        "neo4j_auth_pw": connection.password,
        "included_keys": connection.included_keys,
        "excluded_keys": connection.excluded_keys,
        "included_key_regex": connection.included_key_regex,
        "excluded_key_regex": connection.excluded_key_regex,
    }
    conf = ConfigFactory.from_dict(
        {f"{scope}.{key}": value for key, value in settings.items()}
    )

    return [extractor], conf
Example #15
0
def create_sample_dremio_job():
    """Assemble a Dremio-metadata -> CSV -> Neo4j ingestion job."""
    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=DremioMetadataExtractor(),
                       loader=FsNeo4jCSVLoader())

    # Config is assembled per component, then merged for the job.
    extractor_conf = {
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_USER_KEY}': DREMIO_USER,
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_PASSWORD_KEY}': DREMIO_PASSWORD,
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_HOST_KEY}': DREMIO_HOST,
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY}': True,
    }
    loader_conf = {
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
    }
    publisher_conf = {
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4J_USER,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4J_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    }
    job_config = ConfigFactory.from_dict(
        {**extractor_conf, **loader_conf, **publisher_conf})

    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
Example #16
0
def run_mssql_job():
    """Run table-metadata extraction from MSSQL into Neo4j via CSV files."""
    where_clause_suffix = textwrap.dedent("""
        ('dbo')
    """)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        # MSSQL Loader
        f'extractor.mssql_metadata.{MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'extractor.mssql_metadata.{MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'extractor.mssql_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        # NEO4J Loader
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })

    task = DefaultTask(extractor=MSSQLMetadataExtractor(),
                       loader=FsNeo4jCSVLoader())
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    def test_validation_threshold_override(self) -> None:
        """Per-type staleness overrides must win over the global threshold."""
        with patch.object(GraphDatabase, 'driver'):
            task = Neo4jStalenessRemovalTask()
            scope = task.get_scope()
            job_config = ConfigFactory.from_dict({
                'job.identifier': 'remove_stale_data_job',
                f'{scope}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar',
                f'{scope}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo',
                f'{scope}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar',
                f'{scope}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5,
                f'{scope}.{neo4j_staleness_removal_task.STALENESS_PCT_MAX_DICT}': {'foo': 51},
                neo4j_csv_publisher.JOB_PUBLISH_TAG: 'foo',
            })
            task.init(job_config)

            # 'foo' is 50% stale (allowed by the 51% override);
            # 'bar' is 3% stale (under the 5% global max).
            total_records = [
                {'type': 'foo', 'count': 100},
                {'type': 'bar', 'count': 100},
            ]
            stale_records = [
                {'type': 'foo', 'count': 50},
                {'type': 'bar', 'count': 3},
            ]
            task._validate_staleness_pct(total_records=total_records,
                                         stale_records=stale_records,
                                         types={'foo', 'bar'})
def run_bq_tu_job(job_name):
    """Launch a BigQuery table-usage extraction job that publishes into Neo4j."""
    gcloud_project = "bpy---pedidosya"

    tmp_folder = f'/var/tmp/amundsen/{job_name}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=BigQueryTableUsageExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=BigqueryUsageTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': gcloud_project,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })

    DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher()).launch()
Example #19
0
class MetaframeLoader(Loader):
    """
    Loader class to format metadata as a markdown doc for metaframe.
    """
    DEFAULT_CONFIG = ConfigFactory.from_dict(
        {'base_directory': os.path.join(Path.home(), '.metaframe/metadata/')})

    def init(self, conf: ConfigTree) -> None:
        """Resolve config (with defaults) and ensure the output directory exists."""
        self.conf = conf.with_fallback(MetaframeLoader.DEFAULT_CONFIG)
        self.base_directory = self.conf.get_string('base_directory')
        # Optional override; falls back to each record's own database name.
        self.database_name = self.conf.get_string('database_name', None)
        Path(self.base_directory).mkdir(parents=True, exist_ok=True)

    def load(self, record):
        # type: (Any) -> None
        """
        Write record object as a markdown file (plus an empty ``.docs.md`` stub).
        :param record: table record; presumably exposes database/cluster/schema/name
            and a ``markdown_blob`` attribute — confirm against the extractor
        :return:
        """
        if not record:
            return

        table_file_path_base = get_table_file_path_base(
            database=self.database_name or record.database,
            cluster=record.cluster,
            schema=record.schema,
            table=record.name)

        file_path = table_file_path_base + '.md'
        file_path_docs = table_file_path_base + '.docs.md'
        # os.path.dirname instead of splitting on '/': correct on any platform.
        subdirectory = os.path.dirname(file_path)
        Path(subdirectory).mkdir(parents=True, exist_ok=True)

        Path(file_path_docs).touch()
        # Explicit encoding: markdown blobs may contain non-ASCII characters and
        # the platform default encoding is not guaranteed to be UTF-8.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(record.markdown_blob)

    def close(self):
        # type: () -> None
        """Nothing to release."""
        pass

    def get_scope(self):
        # type: () -> str
        return "loader.metaframe"
Example #20
0
    def test_preprocessor(self) -> None:
        """A configured relation preprocessor's cypher must run during publish."""
        with patch.object(GraphDatabase, 'driver') as mock_driver:
            mock_session = MagicMock()
            mock_driver.return_value.session.return_value = mock_session

            mock_transaction = MagicMock()
            mock_session.begin_transaction.return_value = mock_transaction

            mock_run = MagicMock()
            mock_commit = MagicMock()
            mock_transaction.run = mock_run
            mock_transaction.commit = mock_commit

            mock_preprocessor = MagicMock()
            mock_preprocessor.is_perform_preprocess.return_value = MagicMock(
                return_value=True)
            mock_preprocessor.preprocess_cypher.return_value = (
                'MATCH (f:Foo) RETURN f', {})

            conf = ConfigFactory.from_dict({
                neo4j_csv_publisher.NEO4J_END_POINT_KEY: 'dummy://999.999.999.999:7687/',
                neo4j_csv_publisher.NODE_FILES_DIR: f'{self._resource_path}/nodes',
                neo4j_csv_publisher.RELATION_FILES_DIR: f'{self._resource_path}/relations',
                neo4j_csv_publisher.RELATION_PREPROCESSOR: mock_preprocessor,
                neo4j_csv_publisher.NEO4J_USER: '******',
                neo4j_csv_publisher.NEO4J_PASSWORD: '******',
                neo4j_csv_publisher.JOB_PUBLISH_TAG: str(uuid.uuid4()),
            })

            publisher = Neo4jCsvPublisher()
            publisher.init(conf)
            publisher.publish()

            self.assertEqual(mock_run.call_count, 8)
            # 2 node files, 1 relation file
            self.assertEqual(mock_commit.call_count, 1)
Example #21
0
    def test_extraction_with_model_class(self):
        # type: (Any) -> None
        """
        Test Extraction using model class
        """
        prefix = 'extractor.neo4j'
        self.conf = ConfigFactory.from_dict({
            f'{prefix}.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'TEST_GRAPH_URL',
            f'{prefix}.{Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY}': 'TEST_QUERY',
            f'{prefix}.{Neo4jExtractor.NEO4J_AUTH_USER}': 'TEST_USER',
            f'{prefix}.{Neo4jExtractor.NEO4J_AUTH_PW}': 'TEST_PW',
            f'{prefix}.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}':
                'databuilder.models.table_elasticsearch_document.TableESDocument',
        })

        with patch.object(Neo4jExtractor, '_get_driver'):
            extractor = Neo4jExtractor()
            extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                                  scope=extractor.get_scope()))

            result_dict = {
                'database': 'test_database',
                'cluster': 'test_cluster',
                'schema': 'test_schema',
                'name': 'test_table_name',
                'display_name': 'test_schema.test_table_name',
                'key': 'test_table_key',
                'description': 'test_table_description',
                'last_updated_timestamp': 123456789,
                'column_names': ['test_col1', 'test_col2', 'test_col3'],
                'column_descriptions': ['test_description1', 'test_description2', ''],
                'total_usage': 100,
                'unique_usage': 5,
                'tags': ['hive'],
                'badges': ['badge1'],
                'schema_description': 'schema_description',
                'programmatic_descriptions': ['TEST'],
            }

            extractor.results = [result_dict]
            result_obj = extractor.extract()

            self.assertIsInstance(result_obj, TableESDocument)
            self.assertDictEqual(vars(result_obj), result_dict)
Example #22
0
def create_sample_job(table_name, model_name):
    """Build a job that extracts rows of *table_name* from sqlite into Neo4j."""
    sql = textwrap.dedent("""
    select * from {table_name};
    """).format(table_name=table_name)

    tmp_folder = f'/var/tmp/amundsen/{table_name}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=SQLAlchemyExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': SQLITE_CONN_STRING,
        f'extractor.sqlalchemy.{SQLAlchemyExtractor.EXTRACT_SQL}': sql,
        'extractor.sqlalchemy.model_class': model_name,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
    })
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
Example #23
0
    def test_accepts_dataset_filter_by_label(self, mock_datacatalogue,
                                             mock_bigquery):
        """Extraction succeeds when a label filter is configured."""
        prefix = "extractor.bigquery_table_metadata"
        conf = ConfigFactory.from_dict({
            "{}.{}".format(prefix, BigQueryMetadataExtractor.PROJECT_ID_KEY): "your-project-here",
            "{}.{}".format(prefix, BigQueryMetadataExtractor.FILTER_KEY): "label.key:value",
        })

        mock_bigquery.return_value = MockBigQueryClient(
            ONE_DATASET, ONE_TABLE, TABLE_DATA)
        mock_datacatalogue.DataCatalogClient.return_value = MockDataCatalogClient(
            ENTRY, TAGS)

        extractor = BigQueryMetadataExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
        self.assertIsInstance(extractor.extract(), TableMetadata)
    def test_static_data(self):
        # type: (...) -> None
        """Static record fields are merged into each extracted record."""
        seed = RestApiQuerySeed(seed_record=[{'foo': 'bar'}])
        conf = ConfigFactory.from_dict({
            REST_API_QUERY: seed,
            STATIC_RECORD_DICT: {'john': 'doe'},
        })
        extractor = RestAPIExtractor()
        extractor.init(conf=conf)

        self.assertDictEqual({'foo': 'bar', 'john': 'doe'}, extractor.extract())
Example #25
0
def configure_redshift_extractors(connection: ConnectionConfigSchema):
    """Create a Redshift metadata extractor (plus metrics extractors) and config."""
    extractor = RedshiftMetadataExtractor()
    scope = extractor.get_scope()

    settings = {
        get_sql_alchemy_conn_string_key(scope): connection.conn_string,
        f"{scope}.{RedshiftMetadataExtractor.CLUSTER_KEY}": connection.cluster,
        f"{scope}.{RedshiftMetadataExtractor.DATABASE_KEY}": connection.name,
        f"{scope}.{RedshiftMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}": connection.where_clause_suffix,
    }
    conf = ConfigFactory.from_dict(settings)

    extractors, conf = add_metrics([extractor], conf, connection)
    return extractors, conf
def create_es_publisher_sample_job():
    """Build and launch a job that indexes Neo4j table data into Elasticsearch."""
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    elasticsearch_new_index_key_type = 'table'
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    elasticsearch_index_alias = 'table_search_index'

    neo4j_scope = 'extractor.search_data.extractor.neo4j'
    loader_scope = 'loader.filesystem.elasticsearch'
    publisher_scope = 'publisher.elasticsearch'

    job_config = ConfigFactory.from_dict({
        f'{neo4j_scope}.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
        f'{neo4j_scope}.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}':
            'databuilder.models.table_elasticsearch_document.TableESDocument',
        f'{neo4j_scope}.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
        f'{neo4j_scope}.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
        f'{loader_scope}.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'{loader_scope}.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'{publisher_scope}.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'{publisher_scope}.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'{publisher_scope}.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client,
        f'{publisher_scope}.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key,
        f'{publisher_scope}.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_new_index_key_type,
        f'{publisher_scope}.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias,
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    job.launch()
    def test_init_not_called(self) -> None:
        """Transformers handed to ChainedTransformer must not be re-initialized."""
        first = MagicMock()
        first.transform.return_value = "foo"
        second = MagicMock()
        second.transform.return_value = "bar"

        chained = ChainedTransformer(transformers=[first, second])
        chained.init(conf=ConfigFactory.from_dict({}))

        next(chained.transform({"foo": "bar"}))

        for mock_transformer in (first, second):
            mock_transformer.init.assert_not_called()
            mock_transformer.transform.assert_called_once()
    def test_extraction_with_filter_conf(self, mock_columns, mock_tables,
                                         mock_keyspaces):
        # type: () -> None
        """With a filter that rejects everything, extract() must yield nothing."""
        mock_keyspaces.return_value = {'test_schema': None}
        mock_tables.return_value = {'test_table': None}
        columns_dict = OrderedDict()
        columns_dict['id'] = CassandraColumnMetadata(None, 'id', 'int')
        columns_dict['txt'] = CassandraColumnMetadata(None, 'txt', 'text')
        mock_columns.return_value = columns_dict

        def filter_function(k, t):
            # Reject every keyspace/table.  The original ternary
            # (`False if ... else False`) evaluated to False on every path,
            # making the condition dead code; this states the intent directly.
            return False

        conf = ConfigFactory.from_dict(
            {CassandraExtractor.FILTER_FUNCTION_KEY: filter_function})

        extractor = CassandraExtractor()
        extractor.init(conf)
        self.assertIsNone(extractor.extract())
Example #29
0
    def setUp(self) -> None:
        """Create the shared Elasticsearch publisher config used by the tests."""
        self.test_file_path = 'test_publisher_file.json'
        self.test_file_mode = 'r'

        self.mock_es_client = MagicMock()
        self.test_es_new_index = 'test_new_index'
        self.test_es_alias = 'test_index_alias'
        self.test_doc_type = 'test_doc_type'

        self.conf = ConfigFactory.from_dict({
            'publisher.elasticsearch.file_path': self.test_file_path,
            'publisher.elasticsearch.mode': self.test_file_mode,
            'publisher.elasticsearch.client': self.mock_es_client,
            'publisher.elasticsearch.new_index': self.test_es_new_index,
            'publisher.elasticsearch.alias': self.test_es_alias,
            'publisher.elasticsearch.doc_type': self.test_doc_type,
        })
Example #30
0
 def test_default_search_query(self: Any) -> None:
     """Entity type 'dashboard' selects the default dashboard cypher query."""
     with patch.object(Neo4jExtractor, '_get_driver'):
         extractor = Neo4jSearchDataExtractor()
         neo4j_prefix = 'extractor.search_data.extractor.neo4j'
         conf = ConfigFactory.from_dict({
             f'{neo4j_prefix}.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'test-endpoint',
             f'{neo4j_prefix}.{Neo4jExtractor.NEO4J_AUTH_USER}': 'test-user',
             f'{neo4j_prefix}.{Neo4jExtractor.NEO4J_AUTH_PW}': 'test-passwd',
             f'extractor.search_data.{Neo4jSearchDataExtractor.ENTITY_TYPE}': 'dashboard',
         })
         extractor.init(
             Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
         expected = Neo4jSearchDataExtractor.DEFAULT_NEO4J_DASHBOARD_CYPHER_QUERY.format(
             publish_tag_filter='')
         self.assertEqual(extractor.cypher_query, expected)
Example #31
0
    def test_merge_defaults_reference_conf_empty(self):
        """Merging with a missing reference file keeps the application config intact."""
        application_conf = ConfigFactory.from_dict({
            "facade": {"base_url": "http://config1.url"},
            "logging": {"config": "config1.yaml"},
        })
        reference_conf = ConfigFactory.parse_file("unexist_referece_conf_file",
                                                  required=False)

        config = configs.merge_configs(reference_conf, application_conf)

        # facade.base_url rewrite by application_conf
        config.get("facade.base_url").should.be.equal("http://config1.url")
        # logging.config merged
        config.get("logging.config").should.be.equal("config1.yaml")
Example #32
0
 def _config_dict_to_text(config):
     """Serialize a configuration dict to HOCON text.

     Goes through JSON first because pyhocon's direct dict conversion is
     unreliable; falls back to pyhocon's own ``from_dict`` if that fails.

     :raises ValueError: if *config* is not a dict or cannot be serialized.
     """
     if not isinstance(config, dict):
         raise ValueError(
             "Model configuration only supports dictionary objects")
     try:
         # hack, pyhocon is not very good with dict conversion so we pass through json
         try:
             import json
             text = HOCONConverter.convert(
                 ConfigFactory.parse_string(json.dumps(config)), 'hocon')
         except Exception:
             # fallback pyhocon
             text = HOCONConverter.convert(ConfigFactory.from_dict(config),
                                           'hocon')
     except Exception as exc:
         # Fix: the original passed the dict as a second ValueError arg, which
         # produces a tuple-style message; format one string and chain the cause.
         raise ValueError(
             "Could not serialize configuration dictionary:\n{}".format(config)
         ) from exc
     return text
Example #33
0
def configure_spanner_extractors(connection: ConnectionConfigSchema):
    """Create a Spanner metadata extractor (plus metrics extractors) and config."""
    extractor = SpannerMetadataExtractor()
    scope = extractor.get_scope()

    settings = {
        f"{scope}.{SpannerMetadataExtractor.CONNECTION_NAME_KEY}": connection.name,
        f"{scope}.{SpannerMetadataExtractor.DATABASE_ID_KEY}": connection.database,
        f"{scope}.{SpannerMetadataExtractor.INSTANCE_ID_KEY}": connection.instance,
        f"{scope}.{SpannerMetadataExtractor.KEY_PATH_KEY}": connection.key_path,
        f"{scope}.{SpannerMetadataExtractor.PROJECT_ID_KEY}": connection.project_id,
    }
    conf = ConfigFactory.from_dict(settings)

    extractors, conf = add_metrics([extractor], conf, connection)
    return extractors, conf
Example #34
0
    def test_extraction_table_lineage(self) -> None:
        """
        Test table lineage extraction using model class
        """
        key = f'extractor.csvtablelineage.{CsvTableLineageExtractor.TABLE_LINEAGE_FILE_LOCATION}'
        self.conf = ConfigFactory.from_dict({
            key: 'example/sample_data/sample_table_lineage.csv',
        })
        extractor = CsvTableLineageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=self.conf,
                                   scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertEqual('hive://gold.test_schema/test_table1',
                         result.table_key)
        self.assertEqual(['dynamo://gold.test_schema/test_table2'],
                         result.downstream_deps)
    def test_model_construction(self) -> None:
        """Extracted records are turned into the configured model class."""
        seed = RestApiQuerySeed(seed_record=[{
            'dashboard_group': 'foo',
            'dashboard_name': 'bar',
            'description': 'john',
            'dashboard_group_description': 'doe',
        }])
        conf = ConfigFactory.from_dict({
            REST_API_QUERY: seed,
            MODEL_CLASS: 'databuilder.models.dashboard.dashboard_metadata.DashboardMetadata',
        })
        extractor = RestAPIExtractor()
        extractor.init(conf=conf)

        record = extractor.extract()
        expected = DashboardMetadata(dashboard_group='foo', dashboard_name='bar', description='john',
                                     dashboard_group_description='doe')

        self.assertEqual(repr(expected), repr(record))
 def test_from_dict_with_dict(self):
     """A plain dict round-trips through ConfigFactory.from_dict."""
     source = dict(banana=3, apple=4, pear=1, orange=2)
     assert ConfigFactory.from_dict(source) == source