    def test_load_group_by_none(self, _):
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore',
                                    'The compiler package is deprecated')
            import_pipeline.run([
                '--load_time=', '--input=tests/data/resource.json',
                '--group_by=NONE', '--stage={}'.format(STAGE_PATH),
                '--dataset=test_resource'
            ])

            rows = []
            export_files = 0
            for fn in glob.glob(os.path.join(STAGE_PATH, '*.json')):
                export_files += 1
                with open(fn) as f:
                    for line in f:
                        rows.append(json.loads(line))
            self.assertEqual(export_files, 1)
            found_assets = {}
            found_names = {}
            for row in rows:
                found_assets[row['asset_type']] = row
                found_names[row['name']] = row
            self.assertEqual(len(found_names), 2)
            self.assertEqual(len(found_assets), 2)
            instance_row = found_assets['google.compute.Instance']
            resource_properties = instance_row['resource']['json_data']
            self.assertIsInstance(resource_properties, string_types)
            self.assertNotIn('data', instance_row['resource'])

    def test_resources(self, _):
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore',
                                    'The compiler package is deprecated')
            import_pipeline.run([
                '--load_time=', '--input=tests/data/resource.json',
                '--group_by=ASSET_TYPE', '--stage={}'.format(STAGE_PATH),
                '--dataset=test_iam_resource'
            ])

            rows = []
            export_files = 0
            for fn in glob.glob(os.path.join(STAGE_PATH, 'google.compute.*')):
                export_files += 1
                with open(fn) as f:
                    for line in f:
                        rows.append(json.loads(line))
            self.assertEqual(export_files, 2)
            found_assets = {}
            found_names = {}
            for row in rows:
                found_assets[row['asset_type']] = row
                found_names[row['name']] = row
            self.assertEqual(len(found_names), 2)
            self.assertEqual(len(found_assets), 2)
            instance_row = found_assets['google.compute.Instance']
            instance_labels = instance_row['resource']['data']['labels']
            self.assertIsInstance(instance_labels, list)
            self.assertEqual(len(instance_labels), 1)

    def test_assets(self, _):
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore',
                                    'The compiler package is deprecated')
            import_pipeline.run([
                '--load_time=', '--input=tests/data/iam_policy.json',
                '--group_by=ASSET_TYPE', '--stage={}'.format(STAGE_PATH),
                '--dataset=test_iam_policy'
            ])

            rows = []
            for fn in glob.glob(os.path.join(STAGE_PATH, 'google.*')):
                with open(fn) as f:
                    for line in f:
                        rows.append(json.loads(line))
            self.assertEqual(len(rows), 2)
            found_names = {}
            for row in rows:
                found_names[row['name']] = row
                self.assertEqual(row['asset_type'],
                                 'google.cloud.billing.BillingAccount')
            self.assertEqual(len(found_names), 2)
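
The test methods above are excerpted from a larger test module: they rely on module-level imports, a STAGE_PATH constant, and a unittest.TestCase subclass decorated with a mock patch (hence the unused second argument). A minimal sketch of that scaffolding follows; the patch target and the STAGE_PATH value are assumptions, not taken from the excerpt.

# Sketch of the surrounding test module. The patch target and the
# STAGE_PATH value below are assumptions.
import glob
import json
import os
import tempfile
import unittest
import warnings

from unittest import mock

from six import string_types  # assumed source of string_types

from asset_inventory import import_pipeline

# Hypothetical staging directory for the generated load files.
STAGE_PATH = tempfile.mkdtemp()


@mock.patch('asset_inventory.import_pipeline.load_to_bigquery')  # assumed target
class TestImportPipeline(unittest.TestCase):
    # ... the test methods shown above belong here ...
    pass
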
Example #4

def run_pipeline_beam_runner(pipeline_runner, dataflow_project, input_location,
                             group_by, write_disposition, dataset, stage,
                             load_time, num_shards, pipeline_arguments):
    """Invokes the pipeline with a beam runner.

    Only tested with the dataflow and direct runners.

    Args:
        pipeline_runner: The Beam runner to use.
        dataflow_project: Project to run the dataflow job in.
        input_location: GCS path to load JSON documents from.
        group_by: How to split assets into tables.
        write_disposition: Whether to append to or overwrite BigQuery tables.
        dataset: BigQuery dataset to write to.
        stage: GCS path to write the BigQuery load files to.
        load_time: Timestamp to add to the data during the BigQuery load.
        num_shards: Number of shards to use for each asset type.
        pipeline_arguments: List of additional runner arguments.
    Returns:
        The end state of the pipeline run (a string), and PipelineResult.
    """

    # pylint: disable=import-error
    # Import on demand: we don't want to depend on pipeline code (which
    # imports Apache Beam) unless we are using a Beam runner rather than
    # invoking a template.
    from asset_inventory import import_pipeline
    job_name = get_job_name(load_time)

    # Copy the caller's arguments so the list isn't mutated in place below.
    pipeline_parameters = list(pipeline_arguments)

    parameters = {
        '--load_time': load_time,
        '--job_name': job_name,
        '--project': dataflow_project,
        '--input': input_location,
        '--group_by': group_by,
        '--write_disposition': write_disposition,
        '--num_shards': num_shards,
        '--dataset': dataset,
        '--stage': stage,
        '--runner': pipeline_runner
    }
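    # Add each default flag only if it has a value and the caller has not
    # already supplied it.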
    for arg_name, value in parameters.items():
        if value and arg_name not in pipeline_parameters:
            pipeline_parameters += [arg_name, value]
    pipeline_result = import_pipeline.run(pipeline_parameters)
    logging.info('waiting on pipeline : %s', pprint.pformat(pipeline_result))
    state = pipeline_result.wait_until_finish()
    logging.info('final pipeline state: %s', state)
    return pipeline_result.state, pipeline_result
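
For illustration, a hedged usage sketch of run_pipeline_beam_runner with the Beam direct runner; every value below is a placeholder, and the expected format of num_shards is an assumption.

# Hypothetical invocation with the DirectRunner; all values are placeholders.
state, result = run_pipeline_beam_runner(
    pipeline_runner='DirectRunner',
    dataflow_project='my-project',          # not needed by the direct runner
    input_location='gs://my-bucket/export/*.json',
    group_by='ASSET_TYPE',
    write_disposition='WRITE_APPEND',
    dataset='asset_inventory',
    stage='gs://my-bucket/stage',
    load_time='2021-01-01T00:00:00Z',
    num_shards='*=1',                       # format assumed
    pipeline_arguments=[],
)
if state != 'DONE':
    raise RuntimeError('Pipeline finished in state {}'.format(state))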