Beispiel #1
0
    def test_extraction_with_partition_badge(self) -> None:
        """
        Test extraction when a partition badge label is configured

        The extractor is initialised with PARTITION_BADGE_LABEL_KEY set to
        "partition_key"; the expected metadata carries that label on the
        'partition_key1' column only.
        """
        badge_conf = ConfigFactory.from_dict({
            GlueExtractor.PARTITION_BADGE_LABEL_KEY: "partition_key",
        })

        expected_columns = [
            ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
            ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
            ColumnMetadata('is_active', None, 'boolean', 2),
            ColumnMetadata('source', 'description of source', 'varchar', 3),
            ColumnMetadata('etl_created_at', 'description of etl_created_at',
                           'timestamp', 4),
            ColumnMetadata('ds', None, 'varchar', 5),
            # Only the partition column is expected to carry the label.
            ColumnMetadata('partition_key1', 'description of partition_key1',
                           'string', 6, ["partition_key"]),
        ]
        expected = TableMetadata('glue', 'gold', 'test_schema', 'test_table',
                                 'a table for testing', expected_columns,
                                 False)

        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [test_table]

            extractor = GlueExtractor()
            extractor.init(conf=badge_conf)
            actual = extractor.extract()

            # Compare the textual representations of both objects.
            self.assertEqual(repr(expected), repr(actual))
Beispiel #2
0
    def test_extraction_with_empty_query_result(self) -> None:
        """
        Test Extraction with empty result from query

        The table search is patched to return no tables, so the very first
        extract() call is expected to return None.
        """
        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            # Spell out the empty result instead of relying on whatever the
            # default MagicMock return value yields when it is iterated.
            mock_search.return_value = []

            extractor = GlueExtractor()
            extractor.init(self.conf)

            results = extractor.extract()
            # Identity check, matching the other extractor tests.
            self.assertIsNone(results)
Beispiel #3
0
def create_glue_extractor_job():
    """
    Build a job that runs a GlueExtractor through FsNeo4jCSVLoader and
    publishes the result with Neo4jCsvPublisher.

    The 'nodes' and 'relationships' folders under
    /var/tmp/amundsen/table_metadata are configured both as the loader
    directories and as the publisher file directories. Connection settings
    come from the module-level GLUE_CLUSTER_KEY, NEO4J_ENDPOINT,
    NEO4j_USERNAME and NEO4j_PASSWORD names.

    :return: the configured DefaultJob
    """
    # Local import: the rest of the file is not visible from here, so the
    # function brings in the one extra name it needs itself.
    from datetime import timezone

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = Path(tmp_folder, 'nodes')
    relationship_files_folder = Path(tmp_folder, 'relationships')

    # Use an aware UTC "now". datetime.utcnow() returns a naive value and
    # .timestamp() interprets naive values as local time, which would shift
    # the tag by the host's UTC offset.
    publish_tag = str(int(datetime.now(timezone.utc).timestamp()))

    extractor_key = 'extractor.glue.{}'
    loader_key = 'loader.filesystem_csv_neo4j.{}'
    publisher_key = 'publisher.neo4j.{}'

    job_config = ConfigFactory.from_dict({
        extractor_key.format(GlueExtractor.CLUSTER_KEY):
        GLUE_CLUSTER_KEY,
        extractor_key.format(GlueExtractor.FILTER_KEY): [],
        loader_key.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        loader_key.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        publisher_key.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        publisher_key.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        publisher_key.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        NEO4J_ENDPOINT,
        publisher_key.format(neo4j_csv_publisher.NEO4J_USER):
        NEO4j_USERNAME,
        publisher_key.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        NEO4j_PASSWORD,
        publisher_key.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        publish_tag,
    })

    return DefaultJob(conf=job_config,
                      task=DefaultTask(extractor=GlueExtractor(),
                                       loader=FsNeo4jCSVLoader(),
                                       transformer=NoopTransformer()),
                      publisher=Neo4jCsvPublisher())
Beispiel #4
0
    def _create_glue_extractor(source: CatSource) -> Tuple[GlueExtractor, Any]:
        """
        Create a GlueExtractor together with the configuration it should be
        initialised with.

        :param source: catalog source; not read by the current implementation
        :return: tuple of (extractor, config) where the config holds an empty
            cluster key and an empty filter list under the extractor.glue scope
        """
        glue_scope = "extractor.glue"
        settings = {
            f"{glue_scope}.{GlueExtractor.CLUSTER_KEY}": "",  # TODO Setup Glue Config correctly
            f"{glue_scope}.{GlueExtractor.FILTER_KEY}": [],
        }
        return GlueExtractor(), ConfigFactory.from_dict(settings)
Beispiel #5
0
    def test_extraction_with_single_result(self):
        # type: () -> None
        """
        Test extraction when the table search yields exactly one table

        The first extract() call is expected to return the table's metadata,
        the second one None.
        """
        glue_table = {
            'Name': 'test_table',
            'DatabaseName': 'test_schema',
            'Description': 'a table for testing',
            'StorageDescriptor': {
                'Columns': [
                    {
                        'Name': 'col_id1',
                        'Type': 'bigint',
                        'Comment': 'description of id1',
                    },
                    {
                        'Name': 'col_id2',
                        'Type': 'bigint',
                        'Comment': 'description of id2',
                    },
                    {
                        'Name': 'is_active',
                        'Type': 'boolean',
                    },
                    {
                        'Name': 'source',
                        'Type': 'varchar',
                        'Comment': 'description of source',
                    },
                    {
                        'Name': 'etl_created_at',
                        'Type': 'timestamp',
                        'Comment': 'description of etl_created_at',
                    },
                    {
                        'Name': 'ds',
                        'Type': 'varchar',
                    },
                ],
            },
        }

        # Columns without a 'Comment' entry are expected with a None
        # description.
        expected_columns = [
            ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
            ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
            ColumnMetadata('is_active', None, 'boolean', 2),
            ColumnMetadata('source', 'description of source', 'varchar', 3),
            ColumnMetadata('etl_created_at', 'description of etl_created_at',
                           'timestamp', 4),
            ColumnMetadata('ds', None, 'varchar', 5),
        ]
        expected = TableMetadata('glue', 'gold', 'test_schema', 'test_table',
                                 'a table for testing', expected_columns)

        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [glue_table]

            extractor = GlueExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            self.assertEqual(repr(expected), repr(actual))
            # Nothing is left after the only table has been returned.
            self.assertIsNone(extractor.extract())
Beispiel #6
0
    def test_extraction_with_resource_link_result(self) -> None:
        """
        Test extraction when the table search also yields a resource link

        The search returns test_table followed by an entry that only points
        at a 'TargetTable'. Only test_table is expected to be extracted; the
        next extract() call is expected to return None.
        """
        resource_link = {
            "Name": "test_resource_link",
            "DatabaseName": "test_schema",
            "TargetTable": {
                "CatalogId": "111111111111",
                "DatabaseName": "test_schema_external",
                "Name": "test_table",
            },
            "CatalogId": "222222222222",
        }

        expected_columns = [
            ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
            ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
            ColumnMetadata('is_active', None, 'boolean', 2),
            ColumnMetadata('source', 'description of source', 'varchar', 3),
            ColumnMetadata('etl_created_at', 'description of etl_created_at',
                           'timestamp', 4),
            ColumnMetadata('ds', None, 'varchar', 5),
            ColumnMetadata('partition_key1', 'description of partition_key1',
                           'string', 6),
        ]
        expected = TableMetadata('glue', 'gold', 'test_schema', 'test_table',
                                 'a table for testing', expected_columns,
                                 False)

        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [test_table, resource_link]

            extractor = GlueExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            self.assertEqual(repr(expected), repr(actual))
            # The resource link entry must not produce a second record.
            self.assertIsNone(extractor.extract())
Beispiel #7
0
    def test_extraction_with_multiple_result(self) -> None:
        """
        Test extraction when the table search yields several tables

        Each extract() call is expected to return the next table in search
        order. The 'VIRTUAL_VIEW' entry is expected with True as its last
        argument, all other tables with False. Once every table has been
        returned, extract() is expected to keep returning None.
        """
        external_table = {
            'Name': 'test_table2',
            'DatabaseName': 'test_schema1',
            'Description': 'test table 2',
            'StorageDescriptor': {
                'Columns': [
                    {
                        'Name': 'col_name',
                        'Type': 'varchar',
                        'Comment': 'description of col_name',
                    },
                    {
                        'Name': 'col_name2',
                        'Type': 'varchar',
                        'Comment': 'description of col_name2',
                    },
                ],
            },
            'TableType': 'EXTERNAL_TABLE',
        }
        # No 'Description' here: the expected description below matches the
        # 'comment' entry under 'Parameters'.
        commented_table = {
            'Name': 'test_table3',
            'DatabaseName': 'test_schema2',
            'StorageDescriptor': {
                'Columns': [
                    {
                        'Name': 'col_id3',
                        'Type': 'varchar',
                        'Comment': 'description of col_id3',
                    },
                    {
                        'Name': 'col_name3',
                        'Type': 'varchar',
                        'Comment': 'description of col_name3',
                    },
                ],
            },
            'Parameters': {
                'comment': 'description of test table 3 from comment',
            },
            'TableType': 'EXTERNAL_TABLE',
        }
        view = {
            'Name': 'test_view1',
            'DatabaseName': 'test_schema1',
            'Description': 'test view 1',
            'StorageDescriptor': {
                'Columns': [
                    {
                        'Name': 'col_id3',
                        'Type': 'varchar',
                        'Comment': 'description of col_id3',
                    },
                    {
                        'Name': 'col_name3',
                        'Type': 'varchar',
                        'Comment': 'description of col_name3',
                    },
                ],
            },
            'TableType': 'VIRTUAL_VIEW',
        }

        # Expected records, in the order the mocked search returns them.
        expected_tables = [
            TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing',
                [
                    ColumnMetadata('col_id1', 'description of id1',
                                   'bigint', 0),
                    ColumnMetadata('col_id2', 'description of id2',
                                   'bigint', 1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5),
                    ColumnMetadata('partition_key1',
                                   'description of partition_key1',
                                   'string', 6),
                ],
                False,
            ),
            TableMetadata(
                'glue', 'gold', 'test_schema1', 'test_table2',
                'test table 2',
                [
                    ColumnMetadata('col_name', 'description of col_name',
                                   'varchar', 0),
                    ColumnMetadata('col_name2', 'description of col_name2',
                                   'varchar', 1),
                ],
                False,
            ),
            TableMetadata(
                'glue', 'gold', 'test_schema2', 'test_table3',
                'description of test table 3 from comment',
                [
                    ColumnMetadata('col_id3', 'description of col_id3',
                                   'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3',
                                   'varchar', 1),
                ],
                False,
            ),
            TableMetadata(
                'glue', 'gold', 'test_schema1', 'test_view1', 'test view 1',
                [
                    ColumnMetadata('col_id3', 'description of col_id3',
                                   'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3',
                                   'varchar', 1),
                ],
                True,
            ),
        ]

        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [
                test_table,
                external_table,
                commented_table,
                view,
            ]

            extractor = GlueExtractor()
            extractor.init(self.conf)

            # One extract() call per expected record, in order.
            for expected in expected_tables:
                self.assertEqual(repr(expected), repr(extractor.extract()))

            # Exhausted: repeated calls keep returning None.
            self.assertIsNone(extractor.extract())
            self.assertIsNone(extractor.extract())