    def test_keypath_and_pagesize_can_be_set(self, mock_build: Any) -> None:
        config_dict = {
            f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': 'your-project-here',
            f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PAGE_SIZE_KEY}': 200,
            f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.KEY_PATH_KEY}': '/tmp/doesnotexist',
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
        extractor = BigQueryMetadataExtractor()

        with self.assertRaises(FileNotFoundError):
            extractor.init(Scoped.get_scoped_conf(conf=conf,
                                                  scope=extractor.get_scope()))
Example #2
File: db.py, Project: vrajat/dbcat
    def _create_big_query_extractor(
        source: CatSource,
    ) -> Tuple[BigQueryMetadataExtractor, Any]:
        extractor = BigQueryMetadataExtractor()
        scope = extractor.get_scope()

        conf = ConfigFactory.from_dict(
            {
                f"{scope}.connection_name": source.name,
                f"{scope}.key_path": source.key_path,
                f"{scope}.project_id": source.project_id,
                f"{scope}.project_credentials": source.project_credentials,
                f"{scope}.page_size": source.page_size,
                f"{scope}.filter_key": source.filter_key,
                f"{scope}.included_tables_regex": source.included_tables_regex,
            }
        )

        return extractor, conf
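
A minimal sketch of how the returned pair might be consumed (illustrative only: 'source' stands in for a CatSource instance, and the helper is shown as a direct call although db.py defines it inside a class; the scoped-init pattern mirrors the test methods above):

# Hypothetical caller wiring the extractor/config pair together.
extractor, conf = _create_big_query_extractor(source)
extractor.init(Scoped.get_scoped_conf(conf=conf,
                                      scope=extractor.get_scope()))
record = extractor.extract()  # a TableMetadata record, or None when exhausted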
    def test_empty_dataset(self, mock_build: Any) -> None:
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, NO_TABLES, None)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsNone(result)

    def test_can_handle_datasets(self, mock_build: Any) -> None:
        mock_build.return_value = MockBigQueryClient(NO_DATASETS, None, None)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsNone(result)
Example #5
def create_extractor(metadata_type):
    if metadata_type == MetadataType.DSL:
        extractor = BigQueryMetadataExtractor()
        extractor_key = 'extractor.bigquery_table_metadata.{}'.format(
            BigQueryMetadataExtractor.PROJECT_ID_KEY)
    elif metadata_type == MetadataType.USAGE:
        extractor = BigQueryTableUsageExtractor()
        extractor_key = 'extractor.bigquery_table_usage.{}'.format(
            BigQueryTableUsageExtractor.PROJECT_ID_KEY)
    else:
        raise ValueError('Invalid metadata_type')

    return extractor, extractor_key
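
A hedged usage sketch for create_extractor, following the config pattern used in the tests above ('your-project-here' is a placeholder project id):

# Illustrative wiring: combine the returned key with a project id.
extractor, extractor_key = create_extractor(MetadataType.DSL)
conf = ConfigFactory.from_dict({extractor_key: 'your-project-here'})
extractor.init(Scoped.get_scoped_conf(conf=conf,
                                      scope=extractor.get_scope()))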
    def test_view(self, mock_build: Any) -> None:
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_VIEW, VIEW_DATA)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsInstance(result, TableMetadata)
        self.assertEqual(result.is_view, True)
    def test_table_part_of_table_date_range(self, mock_build: Any) -> None:
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, TABLE_DATE_RANGE, TABLE_DATA)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))

        count = 0
        result = extractor.extract()
        table_name = result.name
        while result:
            count += 1
            result = extractor.extract()

        self.assertEqual(count, 1)
        self.assertEqual(table_name, 'date_range_')
    def test_accepts_dataset_filter_by_label(self, mock_build: Any) -> None:
        config_dict = {
            f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': 'your-project-here',
            f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.FILTER_KEY}': 'label.key:value'
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsInstance(result, TableMetadata)
    def test_table_without_columns(self, mock_build: Any) -> None:
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, NO_COLS)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()

        self.assertEqual(result.database, 'bigquery')
        self.assertEqual(result.cluster, 'your-project-here')
        self.assertEqual(result.schema, 'fdgdfgh')
        self.assertEqual(result.name, 'nested_recs')
        self.assertEqual(result.description.text, "")
        self.assertEqual(result.columns, [])
        self.assertEqual(result.is_view, False)
Example #10
def create_table_extract_job(**kwargs):

    tmp_folder = '/var/tmp/amundsen/{metadata_type}'.format(
        metadata_type=kwargs['metadata_type'])
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    bq_meta_extractor = BigQueryMetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_meta_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY):
            kwargs['PROJECT_ID_KEY'],
        'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.FILTER_KEY):
            'labels.set_label:data_platform',  # filter to the desired datasets only
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
            node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
            relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
            True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
            node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
            neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
            neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
            neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())

    job.launch()
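
create_table_extract_job reads everything from **kwargs, so a hypothetical invocation might look like the following (placeholder values; both keys are looked up inside the function):

# Illustrative only: the function expects 'metadata_type' and 'PROJECT_ID_KEY'.
create_table_extract_job(metadata_type='table_metadata',
                         PROJECT_ID_KEY='your-project-here')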
    def test_table_with_nested_records(self, mock_build: Any) -> None:
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, NESTED_DATA)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()

        first_col = result.columns[0]
        self.assertEqual(first_col.name, 'nested')
        self.assertEqual(first_col.type, 'RECORD')
        second_col = result.columns[1]
        self.assertEqual(second_col.name, 'nested.nested2')
        self.assertEqual(second_col.type, 'RECORD')
        third_col = result.columns[2]
        self.assertEqual(third_col.name, 'nested.nested2.ahah')
        self.assertEqual(third_col.type, 'STRING')
    def test_normal_table(self, mock_build: Any) -> None:
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()

        self.assertEqual(result.database, 'bigquery')
        self.assertEqual(result.cluster, 'your-project-here')
        self.assertEqual(result.schema, 'fdgdfgh')
        self.assertEqual(result.name, 'nested_recs')
        self.assertEqual(result.description.text, "")

        first_col = result.columns[0]
        self.assertEqual(first_col.name, 'test')
        self.assertEqual(first_col.type, 'STRING')
        self.assertEqual(first_col.description.text, 'some_description')
        self.assertEqual(result.is_view, False)
Example #13
def create_bq_job(metadata_type, gcloud_project):
    tmp_folder = f'/var/tmp/amundsen/{metadata_type}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    bq_meta_extractor = BigQueryMetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_meta_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}':
        gcloud_project,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
        relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}':
        True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
        node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
        neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
        neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
        neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
        'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
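
Unlike create_table_extract_job above, create_bq_job returns the job rather than launching it, leaving scheduling to the caller; a minimal sketch with placeholder arguments:

# Placeholder values; launch() runs the extract -> load -> publish pipeline.
job = create_bq_job(metadata_type='table_metadata',
                    gcloud_project='your-gcloud-project')
job.launch()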