def test_sql_statement(self) -> None:
    """
    Check that the extractor's SQL statement does not contain the database key.

    ``SQLAlchemyExtractor._get_connection`` is replaced by a mock for the
    duration of the test, then ``extractor.sql_stmt`` is inspected after
    ``init(self.conf)``.
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        # assertNotIn prints both operands on failure; assertFalse(a in b)
        # would only report "True is not false".
        self.assertNotIn(self.database_key, extractor.sql_stmt)
 def test_sql_statement(self):
     # type: () -> None
     """
     Check that the configured where-clause suffix appears in the SQL statement.

     ``SQLAlchemyExtractor._get_connection`` is replaced by a mock for the
     duration of the test, then ``extractor.sql_stmt`` is inspected after
     ``init(self.conf)``.
     """
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         extractor = SnowflakeMetadataExtractor()
         extractor.init(self.conf)
         # assertIn prints both operands on failure; assertTrue(a in b)
         # would only report "False is not true".
         self.assertIn(self.where_clause_suffix, extractor.sql_stmt)
 def test_sql_statement(self) -> None:
     """
     Check that the default cluster name appears in the SQL statement.

     ``SQLAlchemyExtractor._get_connection`` is replaced by a mock for the
     duration of the test, then ``extractor.sql_stmt`` is inspected after
     ``init(self.conf)``.
     """
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         extractor = SnowflakeMetadataExtractor()
         extractor.init(self.conf)
         # assertIn prints both operands on failure; assertTrue(a in b)
         # would only report "False is not true".
         self.assertIn(SnowflakeMetadataExtractor.DEFAULT_CLUSTER_NAME,
                       extractor.sql_stmt)
 def test_sql_statement(self):
     # type: () -> None
     """
     Check which cluster expression ends up in the SQL statement.

     After ``init(self.conf)`` the statement must contain the literal
     ``table_catalog`` and must not contain ``self.cluster_key``.
     ``SQLAlchemyExtractor._get_connection`` is replaced by a mock for the
     duration of the test.
     """
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         extractor = SnowflakeMetadataExtractor()
         extractor.init(self.conf)
         # assertIn/assertNotIn print both operands on failure, which the
         # assertTrue/assertFalse(a in b) form does not.
         self.assertIn('table_catalog', extractor.sql_stmt)
         self.assertNotIn(self.cluster_key, extractor.sql_stmt)
    def test_extraction_with_empty_query_result(self) -> None:
        """
        Test extraction when the (mocked) connection yields no usable rows.

        ``SQLAlchemyExtractor._get_connection`` is replaced by a mock for the
        duration of the test; ``extract()`` is then expected to return ``None``.
        """
        with patch.object(SQLAlchemyExtractor, '_get_connection'):
            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)

            results = extractor.extract()
            # Identity check: an object that merely compares equal to None
            # must not satisfy this test.
            self.assertIsNone(results)
def create_snowflake_table_metadata_job():
    """
    Build and immediately launch a databuilder job that extracts table and
    column metadata from a Snowflake database and publishes it to Neo4j.

    The job pairs a ``SnowflakeMetadataExtractor`` with an ``FsNeo4jCSVLoader``
    and a ``Neo4jCsvPublisher``; node and relationship CSV files are staged
    under ``/var/tmp/amundsen/table_metadata``. Nothing is returned.
    """
    # The SQL literal is kept verbatim; dedent() removes the shared indentation.
    schema_filter = textwrap.dedent("""
            WHERE c.TABLE_SCHEMA IN {schemas}
            AND lower(c.COLUMN_NAME) not like 'dw_%';
    """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE)

    staging_root = '/var/tmp/amundsen/table_metadata'
    nodes_dir = f'{staging_root}/nodes/'
    relationships_dir = f'{staging_root}/relationships/'

    # Configuration scopes shared by the keys below.
    extractor_scope = 'extractor.snowflake'
    loader_scope = 'loader.filesystem_csv_neo4j'
    publisher_scope = 'publisher.neo4j'

    settings = {
        f'{extractor_scope}.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
            connection_string(),
        f'{extractor_scope}.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}':
            SNOWFLAKE_DATABASE_KEY,
        f'{extractor_scope}.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
            schema_filter,
        f'{loader_scope}.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'{loader_scope}.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationships_dir,
        f'{publisher_scope}.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'{publisher_scope}.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationships_dir,
        f'{publisher_scope}.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'{publisher_scope}.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'{publisher_scope}.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # TO-DO unique tag must be added
        f'{publisher_scope}.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'some_unique_tag',
    }
    job_config = ConfigFactory.from_dict(settings)

    task = DefaultTask(extractor=SnowflakeMetadataExtractor(),
                       loader=FsNeo4jCSVLoader())
    publisher = Neo4jCsvPublisher()
    DefaultJob(conf=job_config, task=task, publisher=publisher).launch()
Beispiel #7
0
    def _create_snowflake_extractor(
        source: CatSource,
    ) -> Tuple[SnowflakeMetadataExtractor, Any]:
        """
        Create a Snowflake metadata extractor and the config tree to initialise it.

        Every config key is prefixed with the extractor's own scope. The
        connection string is nested one level deeper, under the scope reported
        by ``SQLAlchemyExtractor``. The extractor is returned uninitialised:
        ``init`` is not called here.

        :param source: provides ``conn_string``, ``cluster`` and ``database``
        :return: ``(extractor, conf)``
        """
        extractor = SnowflakeMetadataExtractor()
        scope = extractor.get_scope()
        sqlalchemy_scope = SQLAlchemyExtractor().get_scope()

        settings = {
            f"{scope}.{sqlalchemy_scope}.{SQLAlchemyExtractor.CONN_STRING}": source.conn_string,
        }
        # ``source.database`` feeds both database-related keys.
        # WHERE_CLAUSE_SUFFIX_KEY is not configured here.
        scoped_options = (
            (SnowflakeMetadataExtractor.CLUSTER_KEY, source.cluster),
            (SnowflakeMetadataExtractor.DATABASE_KEY, source.database),
            (SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY, source.database),
        )
        for option_key, option_value in scoped_options:
            settings[f"{scope}.{option_key}"] = option_value

        return extractor, ConfigFactory.from_dict(settings)
    def test_extraction_with_database_specified(self):
        # type: () -> None
        """
        Extract a single-column table from one mocked result row.

        The resulting ``TableMetadata`` is expected to carry
        ``self.database_key`` as its first argument and the cluster name found
        in the row. A second ``extract()`` call must return ``None``.
        """
        mocked_rows = [{
            'schema': 'test_schema',
            'name': 'test_table',
            'description': 'a table for testing',
            'cluster': 'MY_CLUSTER',
            'is_view': 'false',
            'col_name': 'ds',
            'col_type': 'varchar',
            'col_description': None,
            'col_sort_order': 0
        }]

        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            # The patched connection factory hands out a connection whose
            # execute() returns the canned rows.
            connection = MagicMock()
            mock_connection.return_value = connection
            connection.execute = MagicMock(return_value=mocked_rows)

            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected_columns = [ColumnMetadata('ds', None, 'varchar', 0)]
            expected = TableMetadata(
                self.database_key, 'MY_CLUSTER', 'test_schema', 'test_table',
                'a table for testing', expected_columns)

            # Compared through their string representations.
            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
Beispiel #9
0
def create_sample_snowflake_job():
    """
    Assemble (without launching) a sample Snowflake metadata job.

    The extractor is given a WHERE clause that drops every schema listed in
    ``IGNORED_SCHEMAS``, schemas named ``STAGE_%``, ``HIST_%`` or ``SNAP_%``,
    and columns whose lower-cased name matches ``dw_%``. CSV files are staged
    under ``/var/tmp/amundsen/tables``.

    :return: the configured ``DefaultJob``
    """
    ignored_schemas_csv = ','.join(IGNORED_SCHEMAS)
    # The literal is kept verbatim, including the whitespace that the
    # backslash continuations embed in the SQL text.
    schema_filter = "WHERE c.TABLE_SCHEMA not in ({0}) \
            AND c.TABLE_SCHEMA not like 'STAGE_%' \
            AND c.TABLE_SCHEMA not like 'HIST_%' \
            AND c.TABLE_SCHEMA not like 'SNAP_%' \
            AND lower(c.COLUMN_NAME) not like 'dw_%';".format(ignored_schemas_csv)

    staging_root = '/var/tmp/amundsen/{}'.format('tables')
    nodes_dir = staging_root + '/nodes'
    relationships_dir = staging_root + '/relationships'

    task = DefaultTask(extractor=SnowflakeMetadataExtractor(),
                       loader=FsNeo4jCSVLoader())

    # Bound ``str.format`` methods act as small key builders, one per scope.
    extractor_key = 'extractor.snowflake.{}'.format
    loader_key = 'loader.filesystem_csv_neo4j.{}'.format
    publisher_key = 'publisher.neo4j.{}'.format

    settings = {
        extractor_key('extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING)):
            SNOWFLAKE_CONN_STRING,
        extractor_key(SnowflakeMetadataExtractor.DATABASE_KEY): 'YourSnowflakeDbName',
        extractor_key(SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): schema_filter,
        loader_key(FsNeo4jCSVLoader.NODE_DIR_PATH): nodes_dir,
        loader_key(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationships_dir,
        loader_key(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        loader_key(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True,
        publisher_key(neo4j_csv_publisher.NODE_FILES_DIR): nodes_dir,
        publisher_key(neo4j_csv_publisher.RELATION_FILES_DIR): relationships_dir,
        publisher_key(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        publisher_key(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        publisher_key(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        publisher_key(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',
    }

    return DefaultJob(conf=ConfigFactory.from_dict(settings),
                      task=task,
                      publisher=Neo4jCsvPublisher())
def create_sample_snowflake_job():
    """
    Assemble (without launching) a sample Snowflake metadata job.

    The extractor is given a WHERE clause that drops every schema listed in
    ``IGNORED_SCHEMAS``, schemas named ``STAGE_%``, ``HIST_%`` or ``SNAP_%``,
    and columns whose lower-cased name matches ``dw_%``. CSV files are staged
    under ``/var/tmp/amundsen/tables``.

    :return: the configured ``DefaultJob``
    """
    ignored_schemas_csv = ','.join(IGNORED_SCHEMAS)
    # The literal is kept verbatim, including the whitespace that the
    # backslash continuations embed in the SQL text.
    schema_filter = f"WHERE c.TABLE_SCHEMA not in ({ignored_schemas_csv}) \
            AND c.TABLE_SCHEMA not like 'STAGE_%' \
            AND c.TABLE_SCHEMA not like 'HIST_%' \
            AND c.TABLE_SCHEMA not like 'SNAP_%' \
            AND lower(c.COLUMN_NAME) not like 'dw_%';"

    staging_root = '/var/tmp/amundsen/tables'
    nodes_dir = f'{staging_root}/nodes'
    relationships_dir = f'{staging_root}/relationships'

    task = DefaultTask(extractor=SnowflakeMetadataExtractor(),
                       loader=FsNeo4jCSVLoader())

    # Options are grouped by component and prefixed with that component's scope.
    extractor_options = {
        f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY: SNOWFLAKE_DATABASE_KEY,
        SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: schema_filter,
    }
    loader_options = {
        FsNeo4jCSVLoader.NODE_DIR_PATH: nodes_dir,
        FsNeo4jCSVLoader.RELATION_DIR_PATH: relationships_dir,
        FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
        FsNeo4jCSVLoader.FORCE_CREATE_DIR: True,
    }
    publisher_options = {
        neo4j_csv_publisher.NODE_FILES_DIR: nodes_dir,
        neo4j_csv_publisher.RELATION_FILES_DIR: relationships_dir,
        neo4j_csv_publisher.NEO4J_END_POINT_KEY: neo4j_endpoint,
        neo4j_csv_publisher.NEO4J_USER: neo4j_user,
        neo4j_csv_publisher.NEO4J_PASSWORD: neo4j_password,
        neo4j_csv_publisher.JOB_PUBLISH_TAG: 'unique_tag',
    }

    settings = {}
    for scope, options in (
        ('extractor.snowflake', extractor_options),
        ('loader.filesystem_csv_neo4j', loader_options),
        ('publisher.neo4j', publisher_options),
    ):
        for option_key, option_value in options.items():
            settings[f'{scope}.{option_key}'] = option_value

    return DefaultJob(conf=ConfigFactory.from_dict(settings),
                      task=task,
                      publisher=Neo4jCsvPublisher())
Beispiel #11
0
def create_snowflake_metadata_job(*, database, ignore_schemas, conn_string,
                                  host, neo4j, **kwargs):
    """
    Assemble (without launching) a Snowflake metadata extraction job.

    :param database: value stored under ``SNOWFLAKE_DATABASE_KEY``
    :param ignore_schemas: iterable of unquoted schema names to exclude; each
        name is wrapped in single quotes inside the generated ``NOT IN`` list
    :param conn_string: SQLAlchemy connection string for Snowflake
    :param host: mapping with ``"node_files_folder"`` and
        ``"relationship_files_folder"`` entries
    :param neo4j: mapping with ``"endpoint"``, ``"user"`` and ``"password"``
        entries
    :param kwargs: accepted and ignored
    :return: the configured ``DefaultJob``
    """
    node_files_folder = host["node_files_folder"]
    relationship_files_folder = host["relationship_files_folder"]
    # Quote every schema on its own. Joining the bare names with ", " inside a
    # single pair of quotes produced one literal ('A, B'), which matches no
    # schema, so nothing was excluded when more than one name was supplied.
    # Output for zero or one schema is unchanged.
    quoted_schemas = "', '".join(ignore_schemas)
    where_clause = f"WHERE c.TABLE_SCHEMA not in ('{quoted_schemas}')"
    task = DefaultTask(extractor=SnowflakeMetadataExtractor(),
                       loader=FsNeo4jCSVLoader())
    job_config = ConfigFactory.from_dict({
        f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
        conn_string,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}':
        database,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
        where_clause,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
        relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}':
        True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}':
        True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
        node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
        neo4j["endpoint"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
        neo4j["user"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
        neo4j["password"],
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
        'unique_tag'
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
    def test_extraction_with_single_result(self):
        # type: () -> None
        """
        Extract one table whose six columns arrive as six mocked result rows.

        The first ``extract()`` call must yield a ``TableMetadata`` whose
        ``repr`` equals that of the expected object; the second call must
        return ``None``.
        """
        # (col_name, col_type, col_description); list position is the sort order.
        column_specs = [
            ('col_id1', 'number', 'description of id1'),
            ('col_id2', 'number', 'description of id2'),
            ('is_active', 'boolean', None),
            ('source', 'varchar', 'description of source'),
            ('etl_created_at', 'timestamp_ltz', 'description of etl_created_at'),
            ('ds', 'varchar', None),
        ]

        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection

            cluster_conf_key = 'extractor.snowflake_metadata.{}'.format(
                SnowflakeMetadataExtractor.CLUSTER_KEY)
            # Table-level fields shared by every row.
            table = {
                'schema': 'test_schema',
                'name': 'test_table',
                'description': 'a table for testing',
                'cluster': self.conf[cluster_conf_key],
                'is_view': 'false',
            }

            # One result row per column, each merged with the table fields.
            connection.execute = MagicMock(return_value=[
                self._union(
                    {
                        'col_name': col_name,
                        'col_type': col_type,
                        'col_description': col_description,
                        'col_sort_order': sort_order,
                    }, table)
                for sort_order, (col_name, col_type, col_description)
                in enumerate(column_specs)
            ])

            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected = TableMetadata(
                'snowflake', 'MY_CLUSTER', 'test_schema', 'test_table',
                'a table for testing',
                [
                    ColumnMetadata(col_name, col_description, col_type, sort_order)
                    for sort_order, (col_name, col_type, col_description)
                    in enumerate(column_specs)
                ])

            # Compared through their string representations.
            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
    def test_extraction_with_multiple_result(self) -> None:
        """
        Extract three tables from one mocked result set of ten rows.

        Rows for ``test_table1`` (six columns), ``test_table2`` (two columns)
        and ``test_table3`` (two columns, ``is_view`` of ``'true'``) are returned
        by the mocked connection in that order. Each ``extract()`` call must
        yield the next table, compared by ``repr``; once all three have been
        produced, further calls must return ``None``.
        """
        # (col_name, col_type, col_description); list position is the sort order.
        columns_table1 = [
            ('col_id1', 'number', 'description of col_id1'),
            ('col_id2', 'number', 'description of col_id2'),
            ('is_active', 'boolean', None),
            ('source', 'varchar', 'description of source'),
            ('etl_created_at', 'timestamp_ltz', 'description of etl_created_at'),
            ('ds', 'varchar', None),
        ]
        columns_table2 = [
            ('col_name', 'varchar', 'description of col_name'),
            ('col_name2', 'varchar', 'description of col_name2'),
        ]
        columns_table3 = [
            ('col_id3', 'varchar', 'description of col_id3'),
            ('col_name3', 'varchar', 'description of col_name3'),
        ]

        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection

            cluster = self.conf[
                f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}']

            def table_fields(schema, name, description, is_view):
                """Return the table-level part of a result row."""
                return {
                    'schema': schema,
                    'name': name,
                    'description': description,
                    'cluster': cluster,
                    'is_view': is_view,
                }

            def result_rows(column_specs, table):
                """Return one merged result row per column of ``table``."""
                return [
                    self._union(
                        {
                            'col_name': col_name,
                            'col_type': col_type,
                            'col_description': col_description,
                            'col_sort_order': sort_order,
                        }, table)
                    for sort_order, (col_name, col_type, col_description)
                    in enumerate(column_specs)
                ]

            def expected_columns(column_specs):
                """Return the ``ColumnMetadata`` list matching ``column_specs``."""
                return [
                    ColumnMetadata(col_name, col_description, col_type, sort_order)
                    for sort_order, (col_name, col_type, col_description)
                    in enumerate(column_specs)
                ]

            table = table_fields('test_schema1', 'test_table1', 'test table 1', 'nottrue')
            table1 = table_fields('test_schema1', 'test_table2', 'test table 2', 'false')
            table2 = table_fields('test_schema2', 'test_table3', 'test table 3', 'true')

            connection.execute = MagicMock(return_value=(
                result_rows(columns_table1, table)
                + result_rows(columns_table2, table1)
                + result_rows(columns_table3, table2)
            ))

            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)

            expected = TableMetadata(
                'snowflake', cluster, 'test_schema1', 'test_table1',
                'test table 1', expected_columns(columns_table1))
            self.assertEqual(repr(expected), repr(extractor.extract()))

            expected = TableMetadata(
                'snowflake', cluster, 'test_schema1', 'test_table2',
                'test table 2', expected_columns(columns_table2))
            self.assertEqual(repr(expected), repr(extractor.extract()))

            # Only this table is expected to be flagged as a view (trailing True).
            expected = TableMetadata(
                'snowflake', cluster, 'test_schema2', 'test_table3',
                'test table 3', expected_columns(columns_table3), True)
            self.assertEqual(repr(expected), repr(extractor.extract()))

            # The extractor must keep returning None once it is exhausted.
            self.assertIsNone(extractor.extract())
            self.assertIsNone(extractor.extract())