def test_sql_statement(self) -> None:
    """
    Test that the configured where clause suffix appears in the generated SQL statement
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = PostgresMetadataExtractor()
        extractor.init(self.conf)
        self.assertIn(self.where_clause_suffix, extractor.sql_stmt)
def test_sql_statement(self) -> None:
    """
    Test that the default cluster name appears in the generated SQL statement
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = PostgresMetadataExtractor()
        extractor.init(self.conf)
        self.assertIn(PostgresMetadataExtractor.DEFAULT_CLUSTER_NAME, extractor.sql_stmt)
def test_sql_statement(self) -> None:
    """
    Test that table_catalog is used as the cluster source in the generated SQL
    statement instead of the configured cluster key
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = PostgresMetadataExtractor()
        extractor.init(self.conf)
        self.assertIn('table_catalog', extractor.sql_stmt)
        self.assertNotIn(self.cluster_key, extractor.sql_stmt)
def test_extraction_with_empty_query_result(self) -> None:
    """
    Test Extraction with empty result from query
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = PostgresMetadataExtractor()
        extractor.init(self.conf)

        results = extractor.extract()
        self.assertEqual(results, None)
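# The tests in this section reference a fixture (self.conf, self.where_clause_suffix,
# self.cluster_key) and a _union helper that are defined elsewhere in the test class.
# A minimal sketch of what that setup might look like: the config keys and values
# below are illustrative placeholders, not the repository's actual fixture.
import unittest
from typing import Any, Dict

from mock import MagicMock, patch
from pyhocon import ConfigFactory

from databuilder.extractor.postgres_metadata_extractor import PostgresMetadataExtractor
from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor


class TestPostgresMetadataExtractor(unittest.TestCase):
    def setUp(self) -> None:
        self.cluster_key = 'MY_CLUSTER'
        self.where_clause_suffix = "where table_schema = 'public'"
        self.conf = ConfigFactory.from_dict({
            'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.CLUSTER_KEY):
                self.cluster_key,
            'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY):
                self.where_clause_suffix,
            'extractor.postgres_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
                'TEST_CONNECTION',
        })

    def _union(self, target: Dict[Any, Any], extra: Dict[Any, Any]) -> Dict[Any, Any]:
        # Merge table-level fields into the column-level dict, mimicking a single
        # row of the metadata query result.
        target.update(extra)
        return target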
def create_table_extract_job(**kwargs):
    where_clause_suffix = textwrap.dedent("""
        where table_schema in {schemas}
    """.format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE))

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY):
            where_clause_suffix,
        'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            True,
        'extractor.postgres_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
            connection_string(),
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    job.launch()
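# The job functions in this section call a connection_string() helper that is not
# shown. A minimal sketch, assuming a local Postgres instance; the credentials and
# database name below are placeholders to substitute with your own:
def connection_string() -> str:
    user = 'username'
    password = 'password'
    host = 'localhost'
    port = 5432
    db = 'postgres'
    return f'postgresql://{user}:{password}@{host}:{port}/{db}'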
def run_postgres_job():
    where_clause_suffix = textwrap.dedent("""
        where schemaname = 'public'
    """)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    return job
def create_table_extract_job():
    where_clause_suffix = f"where st.schemaname in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}"

    tmp_folder = "/var/tmp/amundsen/table_metadata"
    node_files_folder = f"{tmp_folder}/nodes/"
    relationship_files_folder = f"{tmp_folder}/relationships/"

    job_config = ConfigFactory.from_dict(
        {
            f"extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}": where_clause_suffix,
            f"extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}": True,
            f"extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}": connection_string(),
            f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}": node_files_folder,
            f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}": relationship_files_folder,
            f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}": node_files_folder,
            f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}": relationship_files_folder,
            f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}": neo4j_endpoint,
            f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}": neo4j_user,
            f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}": neo4j_password,
            f"publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}": "unique_tag",  # should use unique tag here like {ds}
        }
    )
    job = DefaultJob(
        conf=job_config,
        task=DefaultTask(
            extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader()
        ),
        publisher=Neo4jCsvPublisher(),
    )
    job.launch()
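# The Neo4j-publishing jobs above also assume module-level connection settings.
# Placeholder values, assuming a local Neo4j instance:
neo4j_endpoint = 'bolt://localhost:7687'
neo4j_user = 'neo4j'
neo4j_password = 'test'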
def test_extraction_with_single_result(self) -> None:
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        table = {'schema': 'test_schema',
                 'name': 'test_table',
                 'description': 'a table for testing',
                 'cluster':
                     self.conf['extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.CLUSTER_KEY)]}

        sql_execute.return_value = [
            self._union(
                {'col_name': 'col_id1',
                 'col_type': 'bigint',
                 'col_description': 'description of id1',
                 'col_sort_order': 0}, table),
            self._union(
                {'col_name': 'col_id2',
                 'col_type': 'bigint',
                 'col_description': 'description of id2',
                 'col_sort_order': 1}, table),
            self._union(
                {'col_name': 'is_active',
                 'col_type': 'boolean',
                 'col_description': None,
                 'col_sort_order': 2}, table),
            self._union(
                {'col_name': 'source',
                 'col_type': 'varchar',
                 'col_description': 'description of source',
                 'col_sort_order': 3}, table),
            self._union(
                {'col_name': 'etl_created_at',
                 'col_type': 'timestamp',
                 'col_description': 'description of etl_created_at',
                 'col_sort_order': 4}, table),
            self._union(
                {'col_name': 'ds',
                 'col_type': 'varchar',
                 'col_description': None,
                 'col_sort_order': 5}, table)
        ]

        extractor = PostgresMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata('postgres', 'MY_CLUSTER', 'test_schema', 'test_table', 'a table for testing',
                                 [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                                  ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                                  ColumnMetadata('is_active', None, 'boolean', 2),
                                  ColumnMetadata('source', 'description of source', 'varchar', 3),
                                  ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
                                  ColumnMetadata('ds', None, 'varchar', 5)])

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def run_mysql_job() -> DefaultJob:
    where_clause_suffix = textwrap.dedent(f"""
        where c.table_schema = '{DATABASE_SCHEMA}'
    """)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY):
            where_clause_suffix,
        'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            True,
        'extractor.postgres_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
            POSTGRES_CONN_STRING,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'publisher.awssqs.{}'.format(aws_sqs_csv_puiblisher.NODE_FILES_DIR): node_files_folder,
        'publisher.awssqs.{}'.format(aws_sqs_csv_puiblisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.awssqs.{}'.format(aws_sqs_csv_puiblisher.AWS_SQS_REGION): AWS_SQS_REGION,
        'publisher.awssqs.{}'.format(aws_sqs_csv_puiblisher.AWS_SQS_URL): AWS_SQS_URL,
        'publisher.awssqs.{}'.format(aws_sqs_csv_puiblisher.AWS_SQS_ACCESS_KEY_ID): AWS_SQS_ACCESS_KEY_ID,
        'publisher.awssqs.{}'.format(aws_sqs_csv_puiblisher.AWS_SQS_SECRET_ACCESS_KEY): AWS_SQS_SECRET_ACCESS_KEY,
        'publisher.awssqs.{}'.format(aws_sqs_csv_puiblisher.JOB_PUBLISH_TAG): 'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader()),
                     publisher=AWSSQSCsvPublisher())
    return job
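# run_mysql_job likewise assumes module-level constants (DATABASE_SCHEMA,
# POSTGRES_CONN_STRING, AWS_SQS_REGION, AWS_SQS_URL, AWS_SQS_ACCESS_KEY_ID,
# AWS_SQS_SECRET_ACCESS_KEY) defined elsewhere in the script. Unlike the variants
# above that launch inline, it returns the configured job, so the caller launches it:
if __name__ == '__main__':
    job = run_mysql_job()
    job.launch()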
def run_postgres_job(job_name):
    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()

    where_clause_suffix = textwrap.dedent(
        """
        where table_schema = '{}'
        """.format(postgres_schema)
    )

    job_config = ConfigFactory.from_dict({
        f'extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
            NeptuneCSVPublisher.AWS_REGION: AWS_REGION
        },
    })

    DefaultJob(
        conf=job_config,
        task=DefaultTask(extractor=PostgresMetadataExtractor(), loader=loader),
        publisher=publisher).launch()
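# The Neptune job assumes module-level settings for the S3 staging area and the
# Neptune cluster (postgres_schema, S3_BUCKET_NAME, S3_DATA_PATH, NEPTUNE_ENDPOINT,
# neptune_iam_role_name, AWS_REGION). Hypothetical placeholder values for illustration:
postgres_schema = 'public'
S3_BUCKET_NAME = 'my-amundsen-bucket'    # hypothetical bucket name
S3_DATA_PATH = 'amundsen'                # hypothetical key prefix
NEPTUNE_ENDPOINT = 'my-cluster.cluster-abc123.us-east-1.neptune.amazonaws.com:8182'  # hypothetical
neptune_iam_role_name = 'NeptuneLoadRole'  # hypothetical IAM role
AWS_REGION = 'us-east-1'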
def test_extraction_with_multiple_result(self) -> None:
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        table = {'schema': 'test_schema1',
                 'name': 'test_table1',
                 'description': 'test table 1',
                 'cluster': self.conf[PostgresMetadataExtractor.CLUSTER_KEY]}

        table1 = {'schema': 'test_schema1',
                  'name': 'test_table2',
                  'description': 'test table 2',
                  'cluster': self.conf[PostgresMetadataExtractor.CLUSTER_KEY]}

        table2 = {'schema': 'test_schema2',
                  'name': 'test_table3',
                  'description': 'test table 3',
                  'cluster': self.conf[PostgresMetadataExtractor.CLUSTER_KEY]}

        sql_execute.return_value = [
            self._union(
                {'col_name': 'col_id1',
                 'col_type': 'bigint',
                 'col_description': 'description of col_id1',
                 'col_sort_order': 0}, table),
            self._union(
                {'col_name': 'col_id2',
                 'col_type': 'bigint',
                 'col_description': 'description of col_id2',
                 'col_sort_order': 1}, table),
            self._union(
                {'col_name': 'is_active',
                 'col_type': 'boolean',
                 'col_description': None,
                 'col_sort_order': 2}, table),
            self._union(
                {'col_name': 'source',
                 'col_type': 'varchar',
                 'col_description': 'description of source',
                 'col_sort_order': 3}, table),
            self._union(
                {'col_name': 'etl_created_at',
                 'col_type': 'timestamp',
                 'col_description': 'description of etl_created_at',
                 'col_sort_order': 4}, table),
            self._union(
                {'col_name': 'ds',
                 'col_type': 'varchar',
                 'col_description': None,
                 'col_sort_order': 5}, table),
            self._union(
                {'col_name': 'col_name',
                 'col_type': 'varchar',
                 'col_description': 'description of col_name',
                 'col_sort_order': 0}, table1),
            self._union(
                {'col_name': 'col_name2',
                 'col_type': 'varchar',
                 'col_description': 'description of col_name2',
                 'col_sort_order': 1}, table1),
            self._union(
                {'col_name': 'col_id3',
                 'col_type': 'varchar',
                 'col_description': 'description of col_id3',
                 'col_sort_order': 0}, table2),
            self._union(
                {'col_name': 'col_name3',
                 'col_type': 'varchar',
                 'col_description': 'description of col_name3',
                 'col_sort_order': 1}, table2)
        ]

        extractor = PostgresMetadataExtractor()
        extractor.init(self.conf)

        expected = TableMetadata('postgres', self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
                                 'test_schema1', 'test_table1', 'test table 1',
                                 [ColumnMetadata('col_id1', 'description of col_id1', 'bigint', 0),
                                  ColumnMetadata('col_id2', 'description of col_id2', 'bigint', 1),
                                  ColumnMetadata('is_active', None, 'boolean', 2),
                                  ColumnMetadata('source', 'description of source', 'varchar', 3),
                                  ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
                                  ColumnMetadata('ds', None, 'varchar', 5)])
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableMetadata('postgres', self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
                                 'test_schema1', 'test_table2', 'test table 2',
                                 [ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
                                  ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)])
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableMetadata('postgres', self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
                                 'test_schema2', 'test_table3', 'test table 3',
                                 [ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
                                  ColumnMetadata('col_name3', 'description of col_name3', 'varchar', 1)])
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        self.assertIsNone(extractor.extract())