Example #1
0
    def test_run_given_entry_group_id_and_entry_id_should_enrich_a_single_entry(
        self,
        get_manually_created_fileset_entries,
        get_entry,
        parse_gcs_file_patterns,
        create_filtered_data_for_single_bucket,
        create_filtered_data_for_multiple_buckets,
        create_stats_from_dataframe,
        create_tag_from_stats
    ):
        # Scenario: run() receives both an entry group id and an entry id.
        # Expectation: exactly one entry is looked up through get_entry, the
        # bulk listing of manually created fileset entries is skipped, and the
        # single-bucket filtering path is used.
        # NOTE(review): every argument besides self is presumably a mock
        # injected by patch decorators that are outside this view - confirm.

        # A single, wildcard-free bucket pattern.
        parse_gcs_file_patterns.return_value = [
            {'bucket_name': 'my_bucket', 'file_regex': '.*'},
        ]

        # The looked-up entry comes from the class's private fixture helper.
        get_entry.return_value = self.__make_fake_fileset_entry()

        # Empty placeholders are enough: this test only checks which
        # collaborators get called, not what they are called with.
        create_filtered_data_for_single_bucket.return_value = (
            pd.DataFrame(), {})
        create_stats_from_dataframe.return_value = {}

        DatacatalogFilesetEnricher('test_project').run(
            'entry_group_id', 'entry_id')

        # Entry discovery: direct lookup only, no bulk listing.
        get_manually_created_fileset_entries.assert_not_called()
        get_entry.assert_called_once()
        parse_gcs_file_patterns.assert_called_once()

        # Bucket filtering: single-bucket path only.
        create_filtered_data_for_single_bucket.assert_called_once()
        create_filtered_data_for_multiple_buckets.assert_not_called()

        # Stats and tag creation each happen exactly once.
        create_stats_from_dataframe.assert_called_once()
        create_tag_from_stats.assert_called_once()
Example #2
0
    def test_run_given_bucket_with_wildcard_and_multiple_gcs_patterns_should_call_retrieve_multiple_buckets(  # noqa: E501
        self,
        get_manually_created_fileset_entries,
        get_entry,
        parse_gcs_file_patterns,
        create_filtered_data_for_single_bucket,
        create_filtered_data_for_multiple_buckets,
        create_stats_from_dataframe,
        create_tag_from_stats
    ):
        # Scenario: the entry carries an extra file pattern and both parsed
        # patterns use a wildcard bucket name ('my_bucket*').
        # Expectation: the multiple-buckets filtering path runs once per
        # parsed pattern (twice) and the single-bucket path is never used.
        # NOTE(review): every argument besides self is presumably a mock
        # injected by patch decorators that are outside this view - confirm.

        # Start from the class's fixture entry and add a second GCS pattern.
        fake_entry = self.__make_fake_fileset_entry()
        fake_entry.gcs_fileset_spec.file_patterns.append(
            'gs://my_bucket*/*csv')
        get_entry.return_value = fake_entry

        # Two parsed patterns sharing the same wildcard bucket name.
        parse_gcs_file_patterns.return_value = [
            {'bucket_name': 'my_bucket*', 'file_regex': file_regex}
            for file_regex in ('.*', '.*csv')
        ]

        # Empty placeholders are enough: this test only checks which
        # collaborators get called, not what they are called with.
        create_filtered_data_for_multiple_buckets.return_value = (
            pd.DataFrame(), {})
        create_stats_from_dataframe.return_value = {}

        DatacatalogFilesetEnricher('test_project').run(
            'entry_group_id', 'entry_id')

        # Entry discovery: direct lookup only, no bulk listing.
        get_manually_created_fileset_entries.assert_not_called()
        get_entry.assert_called_once()
        parse_gcs_file_patterns.assert_called_once()

        # Bucket filtering: multiple-buckets path, invoked twice.
        create_filtered_data_for_single_bucket.assert_not_called()
        self.assertEqual(
            2, create_filtered_data_for_multiple_buckets.call_count)

        # Stats and tag creation each happen exactly once.
        create_stats_from_dataframe.assert_called_once()
        create_tag_from_stats.assert_called_once()