Example #1
0
 def test_sql_statement(self) -> None:
     """
     Check that the configured database key does NOT appear in the SQL
     statement built by the extractor.
     """
     # `_get_connection` is replaced by a mock for the duration of the test.
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         extractor = SnowflakeTableLastUpdatedExtractor()
         extractor.init(self.conf)
         # assertNotIn prints both operands on failure, unlike
         # assertFalse(x in y) which only reports "True is not false".
         self.assertNotIn(self.database_key, extractor.sql_stmt)
Example #2
0
 def test_sql_statement(self) -> None:
     """
     Check that the configured cluster key appears in the SQL statement
     built by the extractor.
     """
     # `_get_connection` is replaced by a mock for the duration of the test.
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         extractor = SnowflakeTableLastUpdatedExtractor()
         extractor.init(self.conf)
         # assertIn prints both operands on failure, unlike
         # assertTrue(x in y) which only reports "False is not true".
         self.assertIn(self.cluster_key, extractor.sql_stmt)
Example #3
0
 def test_sql_statement(self) -> None:
     """
     Check that the configured where-clause suffix appears in the SQL
     statement built by the extractor.
     """
     # `_get_connection` is replaced by a mock for the duration of the test.
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         extractor = SnowflakeTableLastUpdatedExtractor()
         extractor.init(self.conf)
         # assertIn prints both operands on failure, unlike
         # assertTrue(x in y) which only reports "False is not true".
         self.assertIn(self.where_clause_suffix, extractor.sql_stmt)
Example #4
0
 def test_sql_statement(self) -> None:
     """
     Check that the SQL statement uses the catalog as the cluster:
     'table_catalog' must be present and the configured cluster key absent.
     """
     # `_get_connection` is replaced by a mock for the duration of the test.
     with patch.object(SQLAlchemyExtractor, '_get_connection'):
         extractor = SnowflakeTableLastUpdatedExtractor()
         extractor.init(self.conf)
         statement = extractor.sql_stmt
         # assertIn/assertNotIn print both operands on failure, unlike
         # assertTrue/assertFalse wrapped around an `in` expression.
         self.assertIn('table_catalog', statement)
         self.assertNotIn(self.cluster_key, statement)
Example #5
0
    def test_extraction_with_empty_query_result(self) -> None:
        """
        Extraction should yield None when the query produces no records.
        """
        # `_get_connection` is swapped for an unconfigured mock, so no row
        # data is supplied to the extractor.
        with patch.object(SQLAlchemyExtractor, '_get_connection'):
            empty_extractor = SnowflakeTableLastUpdatedExtractor()
            empty_extractor.init(self.conf)

            self.assertIsNone(empty_extractor.extract())
Example #6
0
    def test_extraction_with_single_result(self) -> None:
        """
        Extract one table row using the default cluster and database, then
        confirm that a second extract() call returns None.
        """
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection

            cluster_conf_key = 'extractor.snowflake_table_last_updated.{}'.format(
                SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)
            # The mocked query returns exactly one row; its cluster value is
            # read from the test configuration.
            connection.execute.return_value = [{
                'schema': 'test_schema',
                'table_name': 'test_table',
                'last_updated_time': 1000,
                'cluster': self.conf[cluster_conf_key],
            }]

            extractor = SnowflakeTableLastUpdatedExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected = TableLastUpdated(schema='test_schema',
                                        table_name='test_table',
                                        last_updated_time_epoch=1000,
                                        db='snowflake',
                                        cluster='MY_CLUSTER')
            # Records are compared through their repr; use the builtin
            # rather than calling the dunder method directly.
            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
Example #7
0
    def test_extraction_with_database_specified(self) -> None:
        """
        Check that the extracted record carries the configured database key
        as its `db`, then confirm that a second extract() call returns None.
        """
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection

            # The mocked query returns exactly one row.
            connection.execute.return_value = [{
                'schema': 'test_schema',
                'table_name': 'test_table',
                'last_updated_time': 1000,
                'cluster': 'MY_CLUSTER',
            }]

            extractor = SnowflakeTableLastUpdatedExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected = TableLastUpdated(schema='test_schema',
                                        table_name='test_table',
                                        last_updated_time_epoch=1000,
                                        db=self.database_key,
                                        cluster='MY_CLUSTER')
            # Records are compared through their repr; use the builtin
            # rather than calling the dunder method directly.
            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
Example #8
0
def create_sample_snowflake_last_updated_job():
    """
    Build a job that extracts Snowflake table last-updated records, writes
    them as CSV node/relationship files and publishes them to Neo4j.

    :return: a configured ``DefaultJob`` (not yet launched)
    """
    # Only tables with a recorded alteration time are extracted.  An earlier
    # schema-exclusion clause was assigned here and immediately overwritten,
    # so it never took effect; that dead assignment has been removed.
    where_clause = ' WHERE t.last_altered IS NOT NULL '

    tmp_folder = '/var/tmp/amundsen/{}'.format('tables')
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    sql_extractor = SnowflakeTableLastUpdatedExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor,
                       loader=csv_loader)

    job_config = ConfigFactory.from_dict({
        # Extractor settings.
        'extractor.snowflake_table_last_updated.{}'.format(SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY): SNOWFLAKE_DATABASE_KEY,
        'extractor.snowflake_table_last_updated.{}'.format(SnowflakeTableLastUpdatedExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause,
        'extractor.snowflake_table_last_updated.{}'.format(SnowflakeTableLastUpdatedExtractor.USE_CATALOG_AS_CLUSTER_NAME): True,
        'extractor.snowflake_table_last_updated.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): connection_string(),
        # Loader settings: the loader writes into the same folders the
        # publisher reads from.
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True,
        # Publisher settings.
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag'
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())

    return job
Example #9
0
def create_snowflake_last_update_job(*, database, ignore_schemas, conn_string,
                                     host, neo4j, **kwargs):
    """
    Build a job that extracts Snowflake table last-updated records, writes
    them as CSV node/relationship files and publishes them to Neo4j.

    :param database: value stored under the extractor's SNOWFLAKE_DATABASE_KEY
    :param ignore_schemas: iterable of schema names to exclude via the
        WHERE clause suffix
    :param conn_string: SQLAlchemy connection string for the extractor
    :param host: mapping providing "node_files_folder" and
        "relationship_files_folder"
    :param neo4j: mapping providing "endpoint", "user" and "password"
    :param kwargs: accepted and ignored
    :return: a configured ``DefaultJob`` (not yet launched)
    """
    node_files_folder = host["node_files_folder"]
    relationship_files_folder = host["relationship_files_folder"]

    # Quote every schema on its own so the NOT IN list holds one literal per
    # schema ('A', 'B').  Joining first and quoting once produced the single
    # literal 'A, B', which excludes nothing when several schemas are given.
    # An empty input keeps the previous output of a single empty literal.
    quoted_schemas = ", ".join(f"'{schema}'" for schema in ignore_schemas) or "''"
    where_clause = f"WHERE t.TABLE_SCHEMA not in ({quoted_schemas})"

    task = DefaultTask(extractor=SnowflakeTableLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader())
    job_config = ConfigFactory.from_dict({
        f'extractor.snowflake_table_last_updated.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
        conn_string,
        f'extractor.snowflake_table_last_updated.{SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY}':
        database,
        f'extractor.snowflake_table_last_updated.{SnowflakeTableLastUpdatedExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
        where_clause,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
        relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}':
        True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}':
        True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
        node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
        neo4j["endpoint"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
        neo4j["user"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
        neo4j["password"],
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
        'unique_tag'
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
Example #10
0
    def test_extraction_with_multiple_result(self) -> None:
        """
        Extract several table rows using the default cluster and database,
        checking that records come back one per extract() call in query
        order and that the extractor then returns None.
        """
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection

            default_cluster = self.conf[
                'extractor.snowflake_table_last_updated.{}'.format(
                    SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)]

            # (schema, table_name, last_updated_time) for each mocked row;
            # the same tuples drive both the fake query result and the
            # expected records, so the two cannot drift apart.
            rows = [
                ('test_schema1', 'test_table1', 1000),
                ('test_schema1', 'test_table2', 2000),
                ('test_schema2', 'test_table3', 3000),
            ]

            connection.execute.return_value = [
                {
                    'schema': schema,
                    'table_name': table_name,
                    'last_updated_time': last_updated_time,
                    'cluster': default_cluster,
                }
                for schema, table_name, last_updated_time in rows
            ]

            extractor = SnowflakeTableLastUpdatedExtractor()
            extractor.init(self.conf)

            for schema, table_name, last_updated_time in rows:
                expected = TableLastUpdated(schema=schema,
                                            table_name=table_name,
                                            last_updated_time_epoch=last_updated_time,
                                            db='snowflake',
                                            cluster='MY_CLUSTER')
                # Records are compared through their repr; use the builtin
                # rather than calling the dunder method directly.
                self.assertEqual(repr(expected), repr(extractor.extract()))

            # All rows consumed: the next call signals exhaustion.
            self.assertIsNone(extractor.extract())