def test_parse_pipeline_options(self):
    """Check `_parse_pipeline_options` against nested and URN-keyed JSON.

    The first two assertions compare the full option set with options built
    by hand. The remaining ones feed keys that only resemble the
    ``beam:option:<name>:v1`` form and expect them back unchanged.
    """
    mock_cls = SdkWorkerMainTest.MockOptions
    expected_options = PipelineOptions([])
    expected_options.view_as(mock_cls).m_m_option = ['beam_fn_api']
    expected_options.view_as(mock_cls).m_option = '/tmp/requirements.txt'
    expected_all = expected_options.get_all_options()

    # Options nested under the "options" key.
    nested_json = (
        '{"options": {'
        '"m_option": "/tmp/requirements.txt", '
        '"m_m_option":["beam_fn_api"]'
        '}}')
    parsed = sdk_worker_main._parse_pipeline_options(nested_json)
    self.assertEqual(expected_all, parsed.get_all_options())

    # Options given as top-level URN-style keys.
    urn_json = (
        '{"beam:option:m_option:v1": "/tmp/requirements.txt", '
        '"beam:option:m_m_option:v1":["beam_fn_api"]}')
    parsed = sdk_worker_main._parse_pipeline_options(urn_json)
    self.assertEqual(expected_all, parsed.get_all_options())

    # Near-miss URN keys must come back exactly as they were supplied.
    near_miss_keys = (
        'beam:option:m_option:v',
        'eam:option:m_option:v1',
        'eam:option:m_option:v',
    )
    for key in near_miss_keys:
        payload = '{"options": {"' + key + '":"mock_val"}}'
        parsed = sdk_worker_main._parse_pipeline_options(payload)
        self.assertEqual(
            {key: 'mock_val'}, parsed.get_all_options(drop_default=True))
def test_override_options(self):
    """Setting an option through a view is reflected in `get_all_options`."""
    options = PipelineOptions(['--num_workers', '5'])

    # State straight after parsing the flags.
    initial = options.get_all_options()
    self.assertEqual(initial['num_workers'], 5)
    self.assertEqual(initial['mock_flag'], False)

    options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True

    # The override is visible and the parsed flag is untouched.
    overridden = options.get_all_options()
    self.assertEqual(overridden['num_workers'], 5)
    self.assertTrue(overridden['mock_flag'])
def test_override_options(self):
    """A value assigned via `view_as` overrides the parsed default."""
    flags = ['--num_workers', '5']
    options = PipelineOptions(flags)

    snapshot = options.get_all_options()
    self.assertEqual(snapshot['num_workers'], 5)
    self.assertEqual(snapshot['mock_flag'], False)

    mock_view = options.view_as(PipelineOptionsTest.MockOptions)
    mock_view.mock_flag = True

    snapshot = options.get_all_options()
    self.assertEqual(snapshot['num_workers'], 5)
    self.assertTrue(snapshot['mock_flag'])
def test_extra_package(self):
    """`--extra_package` and `--extra_packages` accumulate into one list."""
    flags = [
        '--extra_package', 'abc',
        '--extra_packages', 'def',
        '--extra_packages', 'ghi',
    ]
    collected = PipelineOptions(flags).get_all_options()['extra_packages']
    self.assertEqual(sorted(collected), ['abc', 'def', 'ghi'])

    # Without either flag the option stays unset.
    unset = PipelineOptions(flags=['']).get_all_options()
    self.assertEqual(unset['extra_packages'], None)
def test_extra_package(self):
    """Both spellings of the extra-package flag feed `extra_packages`."""
    options = PipelineOptions(
        ['--extra_package', 'abc']
        + ['--extra_packages', 'def']
        + ['--extra_packages', 'ghi'])
    packages = options.get_all_options()['extra_packages']
    self.assertEqual(sorted(packages), ['abc', 'def', 'ghi'])

    # An empty flag list leaves the option at None.
    options = PipelineOptions(flags=[''])
    packages = options.get_all_options()['extra_packages']
    self.assertEqual(packages, None)
def test_experiments(self):
    """`--experiment` and `--experiments` both populate `experiments`."""
    for flag in ('--experiment', '--experiments'):
        options = PipelineOptions([flag, 'abc', flag, 'def'])
        experiments = options.get_all_options()['experiments']
        self.assertEqual(sorted(experiments), ['abc', 'def'])

    # No flag given: the option is None rather than an empty list.
    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['experiments'], None)
def test_experiments(self):
    """Singular and plural experiment flags collect into the same option."""
    singular = PipelineOptions(['--experiment', 'abc', '--experiment', 'def'])
    plural = PipelineOptions(['--experiments', 'abc', '--experiments', 'def'])
    for options in (singular, plural):
        all_options = options.get_all_options()
        self.assertEqual(sorted(all_options['experiments']), ['abc', 'def'])

    # Nothing passed on the command line leaves the option unset.
    empty = PipelineOptions(flags=[''])
    self.assertEqual(empty.get_all_options()['experiments'], None)
def test_service_options(self):
    """`--service_option` and `--service_options` fill `service_options`."""
    for flag in ('--service_option', '--service_options'):
        options = PipelineOptions([flag, 'whizz=bang', flag, 'beep=boop'])
        service_options = options.get_all_options()['service_options']
        self.assertEqual(sorted(service_options), ['beep=boop', 'whizz=bang'])

    # With no flag supplied the option is None.
    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['service_options'], None)
def load(
    self,
    use_apache_beam: bool = False,
    pipeline_options: "Optional[PipelineOptions]" = None,
) -> pd.DataFrame:
    """Returns a DataFrame of all results from this ExperimentalDesign.

    Args:
      use_apache_beam: When True, trials are evaluated through
        `_evaluate_all_trials_using_apache_beam` and the result is read back
        from a temporary CSV; otherwise each trial is evaluated directly and
        the per-trial frames are concatenated.
      pipeline_options: Options forwarded to the Beam evaluation and used to
        look up `temp_location`. When omitted, a fresh `PipelineOptions()` is
        created for this call.

    Returns:
      A DataFrame with the results of every trial.
    """
    # A default of `PipelineOptions()` in the signature is built once at
    # definition time and shared by every call; resolve it per call instead.
    # The annotation is a string so no extra name is needed at definition time.
    if pipeline_options is None:
        pipeline_options = PipelineOptions()

    if not self._all_trials:
        self.generate_trials()

    temp_location = pipeline_options.get_all_options().get(
        "temp_location", "")
    # With no temp_location the file lands at the filesystem root.
    temp_result_path = (
        temp_location + "/temp_result.csv"
        if temp_location else "/temp_result.csv")

    if use_apache_beam:
        self._evaluate_all_trials_using_apache_beam(
            pipeline_options, temp_result_path)
        with self._filesystem.open(temp_result_path) as file:
            return pd.read_csv(file)

    if self._cores != 1:
        self._evaluate_all_trials_in_parallel()
    return pd.concat(
        trial.evaluate(self._seed, self._filesystem)
        for trial in self._all_trials)
def main(argv):
    """Parse the command line, then build and run the user-score pipeline.

    Flags not recognised here are handed to `PipelineOptions`.
    """
    parser = argparse.ArgumentParser()
    string_flags = (
        ('--input', 'Path to the data file(s) containing game data.'),
        ('--output_dataset',
         'The BigQuery dataset name where to write all the data.'),
        ('--output_table_name',
         'The BigQuery table name where to write all the data.'),
    )
    for flag, help_text in string_flags:
        parser.add_argument(flag, type=str, default='', help=help_text)

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)

    # The context manager runs the pipeline when the block exits.
    with beam.Pipeline(options=options) as p:
        lines = p | 'ReadInputText' >> beam.io.ReadFromText(args.input)
        events = lines | 'ParseGameEvent' >> ParDo(ParseEventFn())
        scores = events | 'ExtractUserScore' >> ExtractAndSumScore()
        rows = scores | 'FormatUserScoreSums' >> ParDo(
            FormatUserScoreSumsFn())
        rows | 'WriteTeamScoreSums' >> WriteToBigQuery(
            args.output_table_name,
            args.output_dataset,
            options.get_all_options().get("project"),
            table_schema())
def test_parse_pipeline_options(self):
    """Check `_parse_pipeline_options` against nested and URN-keyed JSON.

    Each input form is checked twice: with `drop_default=True` against a
    literal dict, and in full against options built by hand. The last three
    assertions feed keys that only resemble the ``beam:option:<name>:v1``
    form and expect them back unchanged.
    """
    mock_cls = SdkWorkerMainTest.MockOptions
    expected_options = PipelineOptions([])
    expected_options.view_as(mock_cls).m_m_option = [
        'worker_threads=1', 'beam_fn_api'
    ]
    expected_options.view_as(mock_cls).m_option = '/tmp/requirements.txt'
    expected_all = expected_options.get_all_options()
    only_threads = {'m_m_option': ['worker_threads=1']}
    parse = sdk_worker_main._parse_pipeline_options

    # Options nested under the "options" key.
    self.assertEqual(
        only_threads,
        parse('{"options": {"m_m_option":["worker_threads=1"]}}')
        .get_all_options(drop_default=True))
    nested_json = (
        '{"options": {'
        '"m_option": "/tmp/requirements.txt", '
        '"m_m_option":["worker_threads=1", "beam_fn_api"]'
        '}}')
    self.assertEqual(expected_all, parse(nested_json).get_all_options())

    # Options given as top-level URN-style keys.
    self.assertEqual(
        only_threads,
        parse('{"beam:option:m_m_option:v1":["worker_threads=1"]}')
        .get_all_options(drop_default=True))
    urn_json = (
        '{"beam:option:m_option:v1": "/tmp/requirements.txt", '
        '"beam:option:m_m_option:v1":["worker_threads=1", '
        '"beam_fn_api"]}')
    self.assertEqual(expected_all, parse(urn_json).get_all_options())

    # Near-miss URN keys must come back exactly as they were supplied.
    near_miss_keys = (
        'beam:option:m_option:v',
        'eam:option:m_option:v1',
        'eam:option:m_option:v',
    )
    for key in near_miss_keys:
        payload = '{"options": {"' + key + '":"mock_val"}}'
        self.assertEqual(
            {key: 'mock_val'},
            parse(payload).get_all_options(drop_default=True))
def run(pipeline_options, known_args):
    """Build and run the embedding-extraction pipeline.

    Args:
      pipeline_options: A `PipelineOptions` instance, or a list/tuple of
        flags to build one from. Any other value (including None) falls back
        to options parsed from `sys.argv`.
      known_args: Parsed arguments; `output_dir` is used as the TFRecord
        file path prefix.
    """
    global force_tf_compat_v1

    # Previously the argument was always replaced by PipelineOptions(None),
    # silently discarding whatever the caller had configured.
    if isinstance(pipeline_options, (list, tuple)):
        pipeline_options = PipelineOptions(list(pipeline_options))
    elif not isinstance(pipeline_options, PipelineOptions):
        # PipelineOptions(None) reads its flags from sys.argv.
        pipeline_options = PipelineOptions(None)

    pipeline = beam.Pipeline(options=pipeline_options)

    if "universal-sentence-encoder" in MODEL_URL and int(
            MODEL_URL.split("/")[-1]) <= 2:
        # https://github.com/tensorflow/transform/issues/160
        force_tf_compat_v1 = True

    with tft_beam.Context(temp_dir=tempfile.mkdtemp(),
                          force_tf_compat_v1=force_tf_compat_v1):
        print("Context force_tf_compat_v1: {}".format(
            tft_beam.Context.get_use_tf_compat_v1()))

        articles = (
            pipeline
            | beam.Create([
                {
                    "id": "01",
                    "text": "To be, or not to be: that is the question: "
                },
                {
                    "id": "02",
                    "text": "Whether 'tis nobler in the mind to suffer "
                },
                {
                    "id": "03",
                    "text": "The slings and arrows of outrageous fortune, "
                },
                {
                    "id": "04",
                    "text": "Or to take arms against a sea of troubles, "
                },
            ]))

        articles_dataset = (articles, get_metadata())

        transformed_dataset, transform_fn = (
            articles_dataset
            | "Extract embeddings" >>
            tft_beam.AnalyzeAndTransformDataset(preprocess_fn))

        transformed_data, transformed_metadata = transformed_dataset

        _ = (transformed_data
             | "Print embeddings" >> beam.Map(print_pass)
             | "Write embeddings to TFRecords" >>
             beam.io.tfrecordio.WriteToTFRecord(
                 file_path_prefix="{0}".format(known_args.output_dir),
                 file_name_suffix=".tfrecords",
                 coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                     transformed_metadata.schema),
                 num_shards=1))

    job = pipeline.run()

    # Block for local runs. An unset runner (None) is treated like an
    # explicit DirectRunner, since Beam falls back to it when none is given.
    runner = pipeline_options.get_all_options().get("runner")
    if runner in (None, "DirectRunner"):
        job.wait_until_finish()
def run(argv=None):
    """Create and run the streaming pipeline that copies Pub/Sub messages.

    Messages read from `--input_subscription` are passed to
    `DataCopier.parse_method` together with the project id, a GCS filesystem
    handle and the `--output` bucket name. The results of that step are
    written as text under ``gs://<--log bucket>/logs``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_subscription',
        required=True,
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    parser.add_argument(
        '--output', required=True, help='Output bucket for data', default='')
    parser.add_argument('--log', required=True, help='log bucket', default='')

    # Anything argparse does not recognise is forwarded to Beam.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # save_main_session is needed because one or more DoFn's in this workflow
    # rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    # Values captured by the copy step below.
    project_id = pipeline_options.get_all_options()['project']
    output_bucket_name = known_args.output
    log_file_path = 'gs://{}/logs'.format(known_args.log)
    fs = GCSFileSystem(pipeline_options=pipeline_options)
    data_copier = DataCopier()

    # The pipeline options carry the settings given on the command line.
    p = beam.Pipeline(options=pipeline_options)
    messages = p | beam.io.ReadFromPubSub(
        subscription=known_args.input_subscription)
    copied = (
        messages
        | 'Copying customer data to the final data-bucket/customer-id' >>
        beam.Map(lambda m: data_copier.parse_method(
            m, project_id, fs, output_bucket_name)))
    copied | 'Write results to the output bucket' >> WriteToText(
        file_path_prefix=log_file_path)

    p.run().wait_until_finish()
def test_option_with_space(self):
    """Option names and values containing spaces survive a dict round trip."""
    name = 'option with space'
    value = ' value with space'
    mock_cls = PipelineOptionsTest.MockOptions

    options = PipelineOptions(flags=['--option with space= value with space'])
    self.assertEqual(getattr(options.view_as(mock_cls), name), value)

    # Rebuild from the dictionary form and check the same attribute.
    options_from_dict = PipelineOptions.from_dictionary(
        options.get_all_options())
    self.assertEqual(getattr(options_from_dict.view_as(mock_cls), name), value)
def test_get_all_options(self):
    """Every TEST_CASES entry parses to at least its expected options.

    For each case the expected key/value pairs must all appear in
    `get_all_options()`, and `mock_flag` / `mock_option` must read back
    through the `MockOptions` view.
    """
    mock_cls = PipelineOptionsTest.MockOptions
    for case in PipelineOptionsTest.TEST_CASES:
        expected = case['expected']
        options = PipelineOptions(flags=case['flags'])
        all_options = options.get_all_options()

        # assertDictContainsSubset is deprecated and was removed in
        # Python 3.12. Overlaying `expected` on the actual dict leaves it
        # unchanged exactly when `expected` is a subset of it.
        self.assertEqual(all_options, {**all_options, **expected})

        self.assertEqual(
            options.view_as(mock_cls).mock_flag, expected['mock_flag'])
        self.assertEqual(
            options.view_as(mock_cls).mock_option, expected['mock_option'])
def test_should_create_pipeline_from_pipeline_options(
    self,
    _create_pipeline_mock: mock.Mock,
):
    """`BeamJob.execute` hands the configured options to pipeline creation.

    Args:
      _create_pipeline_mock: Mock of the pipeline factory; its recorded
        `options` keyword argument is compared with the options built here.
    """
    # given
    _create_pipeline_mock.return_value.run.return_value = RunnerResult(
        'DONE', None)
    driver = CountWordsDriver()

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    gcp = options.view_as(GoogleCloudOptions)
    gcp.project = 'gcp_project_id'
    gcp.job_name = 'beam-wordcount-uuid'
    gcp.staging_location = "gs://staging_location"
    gcp.temp_location = "gs://temp_location"
    gcp.region = 'region'
    gcp.service_account_email = 'service-account'

    workers = options.view_as(WorkerOptions)
    workers.machine_type = 'n2-standard-8'
    workers.max_num_workers = 2
    workers.autoscaling_algorithm = 'THROUGHPUT_BASED'

    options.view_as(SetupOptions).setup_file = "/path/to/setup.py"

    job = BeamJob(
        id='count_words',
        entry_point=driver.nope,
        pipeline_options=options,
        execution_timeout_sec=10,
    )

    # when
    job.execute(JobContext.make())

    # then
    # (A bare `options.get_all_options()` call whose result was discarded
    # used to sit here; it has been removed.)
    passed_options = _create_pipeline_mock.call_args[1]['options']
    self.assertDictEqual(
        options.get_all_options(),
        passed_options.get_all_options(),
    )
def test_option_with_space(self):
    """A flag whose name and value contain spaces is parsed and preserved."""

    def read_option(pipeline_options):
        # The attribute name contains spaces, so it needs getattr.
        view = pipeline_options.view_as(PipelineOptionsTest.MockOptions)
        return getattr(view, 'option with space')

    options = PipelineOptions(flags=['--option with space= value with space'])
    self.assertEqual(read_option(options), ' value with space')

    # The same value must come back after a from_dictionary round trip.
    all_options = options.get_all_options()
    options_from_dict = PipelineOptions.from_dictionary(all_options)
    self.assertEqual(read_option(options_from_dict), ' value with space')
def test_from_dictionary(self):
    """Options rebuilt with `from_dictionary` match each case's expectation.

    For every TEST_CASES entry the flags are parsed, dumped with
    `get_all_options()`, rebuilt through `PipelineOptions.from_dictionary`,
    and the rebuilt object is checked for `mock_flag` and `mock_option`.
    """
    mock_cls = PipelineOptionsTest.MockOptions
    for case in PipelineOptionsTest.TEST_CASES:
        options = PipelineOptions(flags=case['flags'])
        all_options_dict = options.get_all_options()
        options_from_dict = PipelineOptions.from_dictionary(all_options_dict)

        self.assertEqual(
            options_from_dict.view_as(mock_cls).mock_flag,
            case['expected']['mock_flag'])
        # This used to inspect the original `options`, so mock_option was
        # never verified on the round-tripped object.
        self.assertEqual(
            options_from_dict.view_as(mock_cls).mock_option,
            case['expected']['mock_option'])
def test_get_all_options(self):
    """Parsed options contain each case's expected values.

    Checks, per TEST_CASES entry, that the expected mapping is a subset of
    `get_all_options()` and that `mock_flag` / `mock_option` are readable
    through the `MockOptions` view.
    """
    for case in PipelineOptionsTest.TEST_CASES:
        options = PipelineOptions(flags=case['flags'])
        actual = options.get_all_options()

        # Subset check without assertDictContainsSubset, which is deprecated
        # and removed in Python 3.12: merging the expected pairs into the
        # actual dict is a no-op only if they are already present and equal.
        merged = dict(actual)
        merged.update(case['expected'])
        self.assertEqual(actual, merged)

        mock_view = options.view_as(PipelineOptionsTest.MockOptions)
        self.assertEqual(mock_view.mock_flag, case['expected']['mock_flag'])
        self.assertEqual(
            mock_view.mock_option, case['expected']['mock_option'])
def test_from_dictionary(self):
    """`from_dictionary` reproduces `mock_flag` and `mock_option`.

    Each TEST_CASES entry is parsed from flags, converted to a dictionary
    and rebuilt; both assertions then inspect the rebuilt options.
    """
    for case in PipelineOptionsTest.TEST_CASES:
        expected = case['expected']
        original = PipelineOptions(flags=case['flags'])
        options_from_dict = PipelineOptions.from_dictionary(
            original.get_all_options())
        rebuilt_view = options_from_dict.view_as(
            PipelineOptionsTest.MockOptions)

        self.assertEqual(rebuilt_view.mock_flag, expected['mock_flag'])
        # The second check previously read from the original options, which
        # left the round trip of mock_option untested.
        self.assertEqual(rebuilt_view.mock_option, expected['mock_option'])
def pipeline_constructor(options: PipelineOptions) -> TestPipeline:
    """Return a `TestPipeline` after validating the non-default options.

    Raises:
      ValueError: If the non-default options are anything other than the
        expected `project` and `save_main_session` values.
    """
    expected = {
        "project": project_id,
        "save_main_session": True,
    }
    actual = options.get_all_options(drop_default=True)
    if expected != actual:
        raise ValueError(
            f"Expected non-default options [{expected}] do not match actual "
            f"non-default options [{actual}]")
    return TestPipeline()
def test_redefine_options(self):
    """Defining the same options class twice still yields a usable flag."""

    # First definition. It is deliberately shadowed by the second one below,
    # hence the pylint suppression.
    class TestRedefinedOptions(PipelineOptions):  # pylint: disable=unused-variable
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--redefined_flag', action='store_true')

    # Second definition: same name, same flag.
    class TestRedefinedOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--redefined_flag', action='store_true')

    all_options = PipelineOptions(['--redefined_flag']).get_all_options()
    self.assertTrue(all_options['redefined_flag'])
def test_extra_args(self):
    """`add_extra_args_fn` lets `get_all_options` parse an ad-hoc flag."""
    flags = [
        '--extra_arg', 'val1',
        '--extra_arg', 'val2',
        '--extra_arg=val3',
        '--unknown_arg', 'val4',
    ]
    options = PipelineOptions(flags)

    def register_extra_arg(parser):
        # Collect every occurrence of --extra_arg into a list.
        parser.add_argument("--extra_arg", action='append')

    all_options = options.get_all_options(
        add_extra_args_fn=register_extra_arg)
    self.assertEqual(all_options['extra_arg'], ['val1', 'val2', 'val3'])
def test_extra_args(self):
    """A flag registered via `add_extra_args_fn` is collected in order."""

    def add_extra_options(parser):
        # action='append' gathers repeated uses of the flag.
        parser.add_argument("--extra_arg", action='append')

    options = PipelineOptions(
        ['--extra_arg', 'val1']
        + ['--extra_arg', 'val2']
        + ['--extra_arg=val3']
        + ['--unknown_arg', 'val4'])

    parsed = options.get_all_options(add_extra_args_fn=add_extra_options)
    extra_values = parsed['extra_arg']
    self.assertEqual(extra_values, ['val1', 'val2', 'val3'])
def main(argv):
    """Parse the command line, then build and run the team-score pipeline.

    Game events are read by `ReadGameEvents(args)`, windowed, formatted and
    appended to the BigQuery table given by `--output_dataset` and
    `--output_table_name`. Flags not recognised here are forwarded to
    `PipelineOptions`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        type=str,
        default='',
        help=
        'Path to the data file(s) containing game data (use either this parameter or --topic but not both).'
    )
    parser.add_argument(
        '--topic',
        type=str,
        default='',
        help=
        'Topic to subscribe to (use either this parameter or --input but not both).'
    )
    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')
    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    # True when a topic was supplied. This replaces the
    # `!= None and != ""` comparison with the equivalent truthiness test.
    has_topic = bool(args.topic)

    with beam.Pipeline(options=options) as p:
        (p | 'ReadGameEvents' >> ReadGameEvents(args)
         | 'WindowedTeamScore' >> WindowedTeamScore(30)
         | 'FormatTeamScoreSums' >> ParDo(FormatTeamScoreSumsFn(has_topic))
         | 'WriteTeamScoreSums' >> WriteToBigQuery(
             args.output_table_name,
             args.output_dataset,
             options.get_all_options().get("project"),
             table_schema(),
             BigQueryDisposition.CREATE_IF_NEEDED,
             BigQueryDisposition.WRITE_APPEND))
def test_redefine_options(self):
    """A flag declared by a twice-defined options class can still be set."""

    # Defined twice on purpose under the same name; the first class object
    # is shadowed immediately, hence the pylint suppression.
    class TestRedefinedOptios(PipelineOptions):  # pylint: disable=unused-variable
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--redefined_flag', action='store_true')

    class TestRedefinedOptios(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--redefined_flag', action='store_true')

    options = PipelineOptions(['--redefined_flag'])
    flag_value = options.get_all_options()['redefined_flag']
    self.assertTrue(flag_value)
def run(known_args, argv):
    """Main function that creates the pipeline and runs it.

    Args:
      known_args: Parsed arguments; `output` is the text sink prefix.
      argv: Flags used to build the `PipelineOptions`.
    """
    options = PipelineOptions(argv)
    pipeline = beam.Pipeline(options=options)

    words = "asdf asdf asdf asdf asdf asdf"
    lines = pipeline | 'create words' >> beam.Create(words.split(" "))

    # The write step is applied for its effect on the pipeline graph. Its
    # output used to be bound to `result`, which was then immediately
    # rebound to the run handle below.
    (lines
     | 'count words' >> beam.ParDo(count)
     | 'sum' >> beam.CombineGlobally(sum)
     | 'save' >> beam.io.WriteToText(known_args.output))

    result = pipeline.run()

    # Block for local runs. An unset runner (None) is treated like an
    # explicit DirectRunner, since Beam falls back to it when none is given.
    runner = options.get_all_options().get('runner')
    if runner in (None, "DirectRunner"):
        result.wait_until_finish()
def main(argv):
    """Parse arguments, choose a filesystem wrapper and run the experiment.

    The intermediates directory and core count are forwarded to Beam as
    `--temp_location` and `--direct_num_workers`.
    """
    known_args, pipeline_args = create_arg_parser().parse_known_args(argv)

    experiment_driver = ExperimentDriver(
        known_args.data_design_dir,
        known_args.experimental_design,
        known_args.output_file,
        known_args.intermediates_dir,
        known_args.seed,
        known_args.cores,
        known_args.analysis_type,
    )

    pipeline_args += [
        f"--temp_location={known_args.intermediates_dir}",
        f"--direct_num_workers={known_args.cores}",
    ]
    pipeline_options = PipelineOptions(pipeline_args)

    # Set up a filesystem object according to the runner mode.
    # Currently, only GCS is supported for data storage with the Dataflow
    # runner; every other runner uses the pathlib-based wrapper.
    runner = pipeline_options.get_all_options()["runner"]
    if runner in ("dataflow", "DataflowRunner"):
        filesystem = FsCloudPathWrapper()
        filesystem.set_default_client_to_gs_client()
    else:
        filesystem = FsPathlibWrapper()

    experiment_driver.execute(
        known_args.use_apache_beam,
        pipeline_options,
        filesystem,
    )
def run(argv=None):
    """Main entry point; defines and runs the pipeline.

    Rows produced by `MatSource` for 'imdb.mat' are fed to two branches: one
    applies `ProcessImgFn`, the other applies `ConvertToStr` and writes the
    result as text under `GS_UPPATH`.
    """
    arg_parser = argparse.ArgumentParser()
    known_args, pipeline_args = arg_parser.parse_known_args(argv)

    # save_main_session is needed because one or more DoFn's in this workflow
    # rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # 1. Build an array from the .mat file and use it as the pipeline input.
    runner = pipeline_options.get_all_options()['runner']
    source_rows = MatSource(GS_PATH, 'imdb.mat', runner)()
    rows = p | 'new rows' >> beam.Create(source_rows)

    # Both branches are applied for their effect on the pipeline graph.
    p_img = rows | 'process image file path' >> beam.ParDo(ProcessImgFn())
    csv_lines = rows | 'produce csv' >> beam.ParDo(ConvertToStr())
    p_csv = csv_lines | 'write to csv' >> beam.io.WriteToText(
        GS_UPPATH + '/csv/path_age.csv')

    p.run().wait_until_finish()
def main(argv):
    """Parse the command line, then build and run the windowed score pipeline.

    Flags not recognised here are forwarded to `PipelineOptions`.
    """
    parser = argparse.ArgumentParser()
    string_flags = (
        ('--input', 'Path to the data file(s) containing game data.'),
        ('--output_dataset',
         'The BigQuery dataset name where to write all the data.'),
        ('--output_table_name',
         'The BigQuery table name where to write all the data.'),
    )
    for flag, help_text in string_flags:
        parser.add_argument(flag, type=str, default='', help=help_text)

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    # The context manager runs the pipeline when the block exits.
    with beam.Pipeline(options=options) as p:
        lines = p | 'ReadInputText' >> beam.io.ReadFromText(args.input)
        events = lines | 'ParseGameEvent' >> ParDo(ParseEventFn())
        stamped = events | 'AddEventTimestamps' >> beam.Map(
            lambda element: TimestampedValue(element, element['timestamp']))
        # 1 hour = 3600 seconds = 3600000 milliseconds
        windowed = stamped | 'WindowedTeamScore' >> WindowedTeamScore(3600000)
        rows = windowed | 'FormatTeamScoreSums' >> ParDo(
            FormatTeamScoreSumsFn())
        rows | 'WriteTeamScoreSums' >> WriteToBigQuery(
            args.output_table_name,
            args.output_dataset,
            options.get_all_options().get("project"),
            table_schema())
def generate_statistics_from_tfrecord(
        pipeline_args,  # type: List[str]
        data_location,  # type: str
        output_path,  # type: str
        stats_options  # type: StatsOptions
):
    # type: (...) -> statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate a stats file from a tfrecord dataset using TFDV.

    A job name and a setup file are filled in when the pipeline arguments
    do not provide them.

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: options passed through to TFDV
    :return a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    pipeline_options = PipelineOptions(
        flags=clean_up_pipeline_args(pipeline_args))
    all_options = pipeline_options.get_all_options()

    # Default the job name to one suffixed with the current Unix time.
    if all_options["job_name"] is None:
        pipeline_options.view_as(GoogleCloudOptions).job_name = (
            "generatestats-%s" % str(int(time.time())))

    # Create a setup file when the caller did not supply one.
    if all_options["setup_file"] is None:
        pipeline_options.view_as(SetupOptions).setup_file = create_setup_file()

    return tfdv.generate_statistics_from_tfrecord(
        data_location=os.path.join(data_location, "*.tfrecords*"),
        output_path=output_path,
        stats_options=stats_options,
        pipeline_options=pipeline_options)
def main(argv):
    """Stream Pub/Sub messages from `--topic` into a BigQuery table.

    Each message is wrapped as ``{"data": <message>}`` and appended to the
    table named by `--output_dataset` / `--output_table_name`. Flags not
    recognised here are forwarded to `PipelineOptions`.
    """
    parser = argparse.ArgumentParser()
    string_flags = (
        ('--topic',
         'Topic to subscribe to (use either this parameter or --input but not both).'),
        ('--output_dataset',
         'The BigQuery dataset name where to write all the data.'),
        ('--output_table_name',
         'The BigQuery table name where to write all the data.'),
    )
    for flag, help_text in string_flags:
        parser.add_argument(flag, type=str, default='', help=help_text)

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    # The context manager runs the pipeline when the block exits.
    with beam.Pipeline(options=options) as p:
        messages = p | 'ReadMessages' >> ReadFromPubSub(args.topic)
        records = messages | 'FormatRecord' >> beam.Map(
            lambda element: {"data": element})
        records | 'WriteDataElementBQ' >> WriteToBigQuery(
            args.output_table_name,
            args.output_dataset,
            options.get_all_options().get("project"),
            table_schema(),
            BigQueryDisposition.CREATE_IF_NEEDED,
            BigQueryDisposition.WRITE_APPEND)
def test_template_location(self):
    """`--template_location` is stored when given and None otherwise."""
    given = PipelineOptions(['--template_location', 'abc']).get_all_options()
    self.assertEqual(given['template_location'], 'abc')

    missing = PipelineOptions(flags=['']).get_all_options()
    self.assertEqual(missing['template_location'], None)
def test_unknown_duplicate_args_converted_to_list(self):
    """A repeated flag is gathered into a list in command-line order."""
    flags = ['--dup_arg', 'val1', '--dup_arg', 'val2', '--dup_arg=val3']
    all_options = PipelineOptions(flags).get_all_options()
    self.assertEqual(all_options['dup_arg'], ['val1', 'val2', 'val3'])
def test_create_test_pipeline_options(self):
    """Args from `TestPipeline` parse back to the expected options.

    The options taken from `get_full_options_as_args()` are re-parsed with
    `PipelineOptions` and must contain every pair in
    `TEST_CASE['expected_dict']`.
    """
    test_pipeline = TestPipeline(argv=self.TEST_CASE['options'])
    test_options = PipelineOptions(test_pipeline.get_full_options_as_args())
    actual = test_options.get_all_options()

    # assertDictContainsSubset is deprecated and was removed in Python 3.12.
    # Overlaying the expected pairs on the actual dict leaves it unchanged
    # exactly when they are already present with equal values.
    self.assertEqual(actual, {**actual, **self.TEST_CASE['expected_dict']})
def test_dataflow_job_file(self):
    """`--dataflow_job_file` is stored when given and None otherwise."""
    given = PipelineOptions(['--dataflow_job_file', 'abc']).get_all_options()
    self.assertEqual(given['dataflow_job_file'], 'abc')

    missing = PipelineOptions(flags=['']).get_all_options()
    self.assertEqual(missing['dataflow_job_file'], None)
def test_retain_unknown_options_unary_missing_prefix(self):
    """A bare token without a dash prefix makes option parsing exit."""
    options = PipelineOptions(['bad_option'])
    self.assertRaises(
        SystemExit, options.get_all_options, retain_unknown_options=True)
def test_retain_unknown_options_unary_single_dash_store_true(self):
    """An unknown single-dash flag with no value is retained as True."""
    retained = PipelineOptions(['-i']).get_all_options(
        retain_unknown_options=True)
    self.assertEqual(retained['i'], True)
def test_invalid_override_init_options(self):
    """An unknown keyword override leaves the parsed options untouched."""
    options = PipelineOptions(['--num_workers', '5'], mock_invalid_flag=True)

    all_options = options.get_all_options()
    self.assertEqual(all_options['num_workers'], 5)
    self.assertEqual(all_options['mock_flag'], False)
def test_dataflow_job_file(self):
    """The job-file option reflects the flag, defaulting to None."""
    option_name = 'dataflow_job_file'

    options = PipelineOptions(['--dataflow_job_file', 'abc'])
    self.assertEqual(options.get_all_options()[option_name], 'abc')

    # No flag supplied.
    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()[option_name], None)
def test_template_location(self):
    """The template-location option reflects the flag, defaulting to None."""
    option_name = 'template_location'

    options = PipelineOptions(['--template_location', 'abc'])
    self.assertEqual(options.get_all_options()[option_name], 'abc')

    # No flag supplied.
    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()[option_name], None)
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]) -> None:
    """Runs the supervision calculation pipeline.

    Builds a Beam pipeline that loads person-level entities, joins them with
    several reference tables read from BigQuery, classifies supervision time
    buckets, computes supervision metrics and, unless a person filter is
    active, appends each metric type to its BigQuery table.

    Args:
        apache_beam_pipeline_options: Options the pipeline is constructed
            with. `save_main_session` is forced to True on it, and its
            'project' option prefixes both dataset names.
        data_input: Dataset name (without project) the entities are loaded
            from.
        reference_input: Dataset name (without project) the reference views
            are queried from.
        output: Passed as the `dataset` of every `WriteToBigQuery` step.
        calculation_month_count: Forwarded to `GetSupervisionMetrics`.
        metric_types: Metric types to calculate; converted to a set and
            forwarded to `GetSupervisionMetrics`.
        state_code: Forwarded to every entity load and reference query. When
            it is None or 'US_MO' the MO sentence status table is read;
            otherwise an empty collection stands in for it.
        calculation_end_month: Forwarded to `GetSupervisionMetrics`.
        person_filter_ids: Optional person ids to restrict the run to. When
            non-empty, no metrics are written (see the early return below).
    """

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    # Both datasets are qualified with the project taken from the options.
    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    # An empty or missing filter list means "no filter" (None).
    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load SupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load IncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        # (loaded with build_related_entities=False, unlike the loads above)
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionContacts (also without related entities)
        supervision_contacts = (
            p | 'Load StateSupervisionContacts' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionContact,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents
        ssvr_to_agent_association_query = select_all_by_person_query(
            reference_dataset, SSVR_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code, person_id_filter_set)

        ssvr_to_agent_associations = (
            p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=ssvr_to_agent_association_query, use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (
            ssvr_to_agent_associations | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo(
                ConvertDictToKVTuple(), 'supervision_violation_response_id'))

        # Bring in the table that associates supervision periods with agents
        supervision_period_to_agent_association_query = select_all_by_person_query(
            reference_dataset, SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code, person_id_filter_set)

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME, state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p | "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query, use_standard_sql=True)))
        else:
            # Any other state: substitute an empty collection so the
            # downstream grouping step still has an input under this key.
            us_mo_sentence_statuses = (
                p | f"Generate empty MO statuses list for non-MO state run: {state_code} " >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group both kinds of sentences with the sentence statuses
        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        # Two tagged outputs, read back below as attributes of
        # sentences_converted.
        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_query = select_all_by_person_query(
            reference_dataset, SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME, state_code,
            person_id_filter_set)

        sp_to_judicial_district_kv = (
            p | "Read supervision_period to judicial_district associations from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=sp_to_judicial_district_query, use_standard_sql=True))
            | "Convert supervision_period to judicial_district association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            }
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >> beam.ParDo(
                  SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses': violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >> beam.ParDo(
                  SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person': persons,
                'assessments': assessments,
                'incarceration_periods': incarceration_periods_with_source_violations,
                'supervision_periods': supervision_periods,
                'supervision_sentences': sentences_converted.supervision_sentences,
                'incarceration_sentences': sentences_converted.incarceration_sentences,
                'violation_responses': violation_responses_with_hydrated_violations,
                'supervision_contacts': supervision_contacts,
                'supervision_period_judicial_district_association': sp_to_judicial_district_kv
            }
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        # The two agent association tables are passed as dict side inputs.
        person_time_buckets = (
            person_entities
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            # Filtered runs never write to BigQuery.
            # NOTE(review): this return leaves the `with` block, so the steps
            # built so far presumably still execute on exit - confirm.
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        # One tagged output per metric type; each is written separately below.
        writable_metrics = (
            supervision_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value))

        # Write the metrics to the output tables in BigQuery
        # Table ids are looked up by metric class; `.get` yields None for a
        # class that has no entry in the mapping.
        terminations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionTerminationMetric)
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionCaseComplianceMetric)
        populations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionPopulationMetric)
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationMetric)
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationAnalysisMetric)
        revocation_violation_type_analysis_table_id = \
            DATAFLOW_METRICS_TO_TABLES.get(SupervisionRevocationViolationTypeAnalysisMetric)
        successes_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionSuccessMetric)
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SuccessfulSupervisionSentenceDaysServedMetric)

        # Every write below uses CREATE_NEVER + WRITE_APPEND with the
        # FILE_LOADS method, differing only in its source output and table.
        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=populations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
               f" table: {successful_sentence_lengths_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successful_sentence_lengths_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             | f"Write termination metrics to BQ table: {terminations_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=terminations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (
            writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
            | f"Write revocation analyses metrics to BQ table: {revocation_analysis_table_id}" >>
            beam.io.WriteToBigQuery(
                table=revocation_analysis_table_id,
                dataset=output,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS
             | f"Write revocation violation type analyses metrics to BQ table: "
               f"{revocation_violation_type_analysis_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=compliance_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
def run(argv=None):
    """Runs the recidivism calculation pipeline.

    Parses the command line, loads persons, incarceration periods and
    supervision violations/responses, classifies release events, computes
    recidivism metrics and appends the rate, count and liberty metrics to
    their BigQuery tables.

    Args:
        argv: Command-line arguments handed to `parse_arguments`, which
            splits them into known arguments (`input`, `reference_input`,
            `output` are read here) and the remaining pipeline arguments.
    """

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    # Both datasets are qualified with the project taken from the options.
    query_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = (
        all_pipeline_options['project'] + '.' + known_args.reference_input)

    # Build the pipeline from the options object configured above. Passing
    # `argv=pipeline_args` instead would make Beam parse a fresh
    # PipelineOptions and silently drop the save_main_session setting.
    with beam.Pipeline(options=pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=query_dataset,
            data_dict=None,
            root_schema_class=schema.StatePerson,
            root_entity_class=entities.StatePerson,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationPeriod,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionViolations
        supervision_violations = (
            p | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionViolation,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field='person_id',
                build_related_entities=True))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionViolationResponse,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            }
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >> beam.ParDo(
                  SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                    violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >> beam.ParDo(
                  SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {
                'person': persons,
                'incarceration_periods':
                    incarceration_periods_with_source_violations
            }
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of
        # residence
        person_id_to_county_query = (
            f"SELECT * FROM "
            f"`{reference_dataset}.persons_to_recent_county_of_residence`")

        person_id_to_county_kv = (
            p | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify ReleaseEvents events from the StatePerson's
        # StateIncarcerationPeriods. The county table is a dict side input.
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, methodologies = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get recidivism metrics
        recidivism_metrics = (
            person_events
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions))

        filter_metrics_kwargs = {'methodologies': methodologies}

        # Filter out unneeded metrics
        final_recidivism_metrics = (
            recidivism_metrics
            | 'Filter out unwanted metrics' >> beam.ParDo(
                FilterMetrics(), **filter_metrics_kwargs))

        # Convert the metrics into a format that's writable to BQ
        # Three tagged outputs, each written to its own table below.
        writable_metrics = (
            final_recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivismMetricWritableDict()).with_outputs(
                    'rates', 'counts', 'liberties'))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table = known_args.output + '.recidivism_rate_metrics'
        counts_table = known_args.output + '.recidivism_count_metrics'
        liberty_table = known_args.output + '.recidivism_liberty_metrics'

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.liberties
             | f"Write liberty metrics to BQ table: {liberty_table}" >>
             beam.io.WriteToBigQuery(
                 table=liberty_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))