def test_load_group_by_none(self, _):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore',
                                'The compiler package is deprecated')
        import_pipeline.run([
            '--load_time=', '--input=tests/data/resource.json',
            '--group_by=NONE', '--stage={}'.format(STAGE_PATH),
            '--dataset=test_resource'
        ])
    rows = []
    export_files = 0
    for fn in glob.glob(os.path.join(STAGE_PATH, '*.json')):
        export_files += 1
        with open(fn) as f:
            for line in f:
                rows.append(json.loads(line))
    self.assertEqual(export_files, 1)
    found_assets = {}
    found_names = {}
    for row in rows:
        found_assets[row['asset_type']] = row
        found_names[row['name']] = row
    self.assertEqual(len(found_names), 2)
    self.assertEqual(len(found_assets), 2)
    instance_row = found_assets['google.compute.Instance']
    resource_properties = instance_row['resource']['json_data']
    self.assertIsInstance(resource_properties, string_types)
    self.assertNotIn('data', instance_row['resource'])
def test_resources(self, _):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore',
                                'The compiler package is deprecated')
        import_pipeline.run([
            '--load_time=', '--input=tests/data/resource.json',
            '--group_by=ASSET_TYPE', '--stage={}'.format(STAGE_PATH),
            '--dataset=test_iam_resource'
        ])
    rows = []
    export_files = 0
    for fn in glob.glob(os.path.join(STAGE_PATH, 'google.compute.*')):
        export_files += 1
        with open(fn) as f:
            for line in f:
                rows.append(json.loads(line))
    self.assertEqual(export_files, 2)
    found_assets = {}
    found_names = {}
    for row in rows:
        found_assets[row['asset_type']] = row
        found_names[row['name']] = row
    self.assertEqual(len(found_names), 2)
    self.assertEqual(len(found_assets), 2)
    instance_row = found_assets['google.compute.Instance']
    instance_labels = instance_row['resource']['data']['labels']
    self.assertIsInstance(instance_labels, list)
    self.assertEqual(len(instance_labels), 1)
def test_assets(self, _):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore',
                                'The compiler package is deprecated')
        import_pipeline.run([
            '--load_time=', '--input=tests/data/iam_policy.json',
            '--group_by=ASSET_TYPE', '--stage={}'.format(STAGE_PATH),
            '--dataset=test_iam_policy'
        ])
    rows = []
    for fn in glob.glob(os.path.join(STAGE_PATH, 'google.*')):
        with open(fn) as f:
            for line in f:
                rows.append(json.loads(line))
    self.assertEqual(len(rows), 2)
    found_names = {}
    for row in rows:
        found_names[row['name']] = row
        self.assertEqual(row['asset_type'],
                         'google.cloud.billing.BillingAccount')
    self.assertEqual(len(found_names), 2)
def run_pipeline_beam_runner(pipeline_runner, dataflow_project, input_location,
                             group_by, write_disposition, dataset, stage,
                             load_time, num_shards, pipeline_arguments):
    """Invokes the pipeline with a beam runner.

    Only tested with the dataflow and direct runners.

    Args:
        pipeline_runner: The Beam runner to use.
        dataflow_project: Project to run the dataflow job in.
        input_location: GCS path to load json documents from.
        group_by: How to split assets into tables.
        write_disposition: Whether to append to or overwrite BigQuery tables.
        dataset: BigQuery dataset to write to.
        stage: GCS path to write BigQuery load files.
        load_time: Timestamp to add to data during BigQuery load.
        num_shards: Shards for each asset type.
        pipeline_arguments: List of additional runner arguments.

    Returns:
        The end state of the pipeline run (a string), and PipelineResult.
    """
    # pylint: disable=import-error
    # Import on demand: we don't want to depend on pipeline code, which imports
    # apache beam code, unless we are using a beam runner and not invoking a
    # template.
    from asset_inventory import import_pipeline
    job_name = get_job_name(load_time)
    pipeline_parameters = pipeline_arguments
    parameters = {
        '--load_time': load_time,
        '--job_name': job_name,
        '--project': dataflow_project,
        '--input': input_location,
        '--group_by': group_by,
        '--write_disposition': write_disposition,
        '--num_shards': num_shards,
        '--dataset': dataset,
        '--stage': stage,
        '--runner': pipeline_runner
    }
    for arg_name, value in parameters.items():
        if value and arg_name not in pipeline_parameters:
            pipeline_parameters += [arg_name, value]
    pipeline_result = import_pipeline.run(pipeline_parameters)
    logging.info('waiting on pipeline : %s', pprint.pformat(pipeline_result))
    state = pipeline_result.wait_until_finish()
    logging.info('final pipeline state: %s', state)
    return pipeline_result.state, pipeline_result
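
# A minimal usage sketch of run_pipeline_beam_runner with the direct runner.
# The project, bucket, dataset, timestamp, and shard values below are
# illustrative placeholders, not values defined in this repository:
#
#   final_state, result = run_pipeline_beam_runner(
#       pipeline_runner='DirectRunner',
#       dataflow_project='my-gcp-project',
#       input_location='gs://my-bucket/exports/*.json',
#       group_by='ASSET_TYPE',
#       write_disposition='WRITE_APPEND',
#       dataset='asset_inventory',
#       stage='gs://my-bucket/stage',
#       load_time='2019-01-01T00:00:00Z',
#       num_shards='*=1',
#       pipeline_arguments=['--temp_location=gs://my-bucket/tmp'])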