def test_timestamp_pagesize_settings(self, mock_build: Any) -> None:
        """
        Test timestamp and pagesize can be set
        """
        TIMESTAMP = '2019-01-01T00:00:00.00Z'
        PAGESIZE = 215

        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.TIMESTAMP_KEY):
            TIMESTAMP,
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PAGE_SIZE_KEY):
            PAGESIZE,
        }
        conf = ConfigFactory.from_dict(config_dict)

        client = MockLoggingClient(CORRECT_DATA)
        mock_build.return_value = client
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

        args, kwargs = client.b.list.call_args
        body = kwargs['body']

        self.assertEqual(body['pageSize'], PAGESIZE)
        self.assertEqual(TIMESTAMP in body['filter'], True)
    def test_key_path(self, mock_build: Any) -> None:
        """
        Test key_path can be used
        """

        with tempfile.NamedTemporaryFile() as keyfile:
            # There are many github scanners looking for API / cloud keys, so in order not to get a
            # false positive triggering everywhere, I base64 encoded the key.
            # This is written to a tempfile as part of this test and then used.
            keyfile.write(base64.b64decode(KEYFILE_DATA))
            keyfile.flush()
            config_dict = {
                'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                'your-project-here',
                'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.KEY_PATH_KEY):
                keyfile.name,
            }
            conf = ConfigFactory.from_dict(config_dict)

            mock_build.return_value = MockLoggingClient(CORRECT_DATA)
            extractor = BigQueryTableUsageExtractor()
            extractor.init(
                Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

            args, kwargs = mock_build.call_args
            creds = kwargs['http'].credentials
            self.assertEqual(creds.project_id, 'your-project-here')
            self.assertEqual(
                creds.service_account_email,
                '*****@*****.**')
    def test_email_filter_counted(self, mock_build: Any) -> None:
        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.EMAIL_PATTERN):
            '.*@test.com.*',
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockLoggingClient(CORRECT_DATA)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
        result = extractor.extract()
        assert result is not None
        self.assertIsInstance(result, tuple)

        (key, value) = result
        self.assertIsInstance(key, TableColumnUsageTuple)
        self.assertIsInstance(value, int)

        self.assertEqual(key.database, 'bigquery')
        self.assertEqual(key.cluster, 'bigquery-public-data')
        self.assertEqual(key.schema, 'austin_incidents')
        self.assertEqual(key.table, 'incidents_2008')
        self.assertEqual(key.email, '*****@*****.**')
        self.assertEqual(value, 1)
Exemple #4
0
    def test_basic_extraction(self, mock_build):
        """
        Test Extraction using mock class
        """
        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockLoggingClient(CORRECT_DATA)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsInstance(result, tuple)

        (key, value) = result
        self.assertIsInstance(key, TableColumnUsageTuple)
        self.assertIsInstance(value, int)

        self.assertEqual(key.database, 'bigquery')
        self.assertEqual(key.cluster, 'bigquery-public-data')
        self.assertEqual(key.schema, 'austin_incidents')
        self.assertEqual(key.table, 'incidents_2008')
        self.assertEqual(key.email, '*****@*****.**')
        self.assertEqual(value, 1)
Exemple #5
0
    def test_counting_referenced_table_belonging_to_different_project(
            self, mock_build: Any) -> None:
        """
        Test result when referenced table belongs to a project different from the PROJECT_ID_KEY of the extractor
        and COUNT_READS_ONLY_FROM_PROJECT is set to False
        """
        config_dict = {
            f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}':
            'your-project-here',
            f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.COUNT_READS_ONLY_FROM_PROJECT_ID_KEY}':
            False,
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockLoggingClient(CORRECT_DATA)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

        result = extractor.extract()
        assert result is not None
        self.assertIsInstance(result, tuple)

        (key, value) = result
        self.assertIsInstance(key, TableColumnUsageTuple)
        self.assertIsInstance(value, int)

        self.assertEqual(key.database, 'bigquery')
        self.assertEqual(key.cluster, 'bigquery-public-data')
        self.assertEqual(key.schema, 'austin_incidents')
        self.assertEqual(key.table, 'incidents_2008')
        self.assertEqual(key.email, '*****@*****.**')
        self.assertEqual(value, 1)
Exemple #6
0
def create_extractor(metadata_type):
    if metadata_type == MetadataType.DSL:
        extractor = BigQueryMetadataExtractor()
        extractor_key = 'extractor.bigquery_table_metadata.{}'.format(
            BigQueryMetadataExtractor.PROJECT_ID_KEY)
    elif metadata_type == MetadataType.USAGE:
        extractor = BigQueryTableUsageExtractor()
        extractor_key = 'extractor.bigquery_table_usage.{}'.format(
            BigQueryTableUsageExtractor.PROJECT_ID_KEY)
    else:
        raise ValueError('Invalid metadata_type')

    return extractor, extractor_key
    def test_no_entries(self, mock_build: Any) -> None:
        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockLoggingClient(NO_ENTRIES)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsNone(result)
    def test_email_filter_not_counted(self, mock_build: Any) -> None:
        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.EMAIL_PATTERN):
            'emailFilter',
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockLoggingClient(CORRECT_DATA)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsNone(result)
Exemple #9
0
    def test_failed_jobs_should_not_be_counted(self, mock_build: Any) -> None:
        config_dict = {
            f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}':
            'bigquery-public-data',
        }
        conf = ConfigFactory.from_dict(config_dict)

        client = MockLoggingClient(FAILURE)
        mock_build.return_value = client
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertIsNone(result)
Exemple #10
0
    def test_failed_jobs_should_not_be_counted(self, mock_build):

        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
        }
        conf = ConfigFactory.from_dict(config_dict)

        client = MockLoggingClient(FAILURE)
        mock_build.return_value = client
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertIsNone(result)
Exemple #11
0
def create_bq_job(metadata_type, gcloud_project):
    tmp_folder = '/var/tmp/amundsen/{metadata_type}'.format(
        metadata_type=metadata_type)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    bq_usage_extractor = BigQueryTableUsageExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_usage_extractor,
                       loader=csv_loader,
                       transformer=BigqueryUsageTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
        gcloud_project,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
        True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
Exemple #12
0
    def test_not_counting_referenced_table_belonging_to_different_project(
            self, mock_build: Any) -> None:
        """
        Test result when referenced table belongs to a project different from the PROJECT_ID_KEY of the extractor
        """
        config_dict = {
            f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}':
            'your-project-here',
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockLoggingClient(CORRECT_DATA)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

        result = extractor.extract()
        assert result is None