Example #1
 def test_parse_pipeline_options(self):
     expected_options = PipelineOptions([])
     expected_options.view_as(
         SdkWorkerMainTest.MockOptions).m_m_option = ['beam_fn_api']
     expected_options.view_as(
         SdkWorkerMainTest.MockOptions).m_option = '/tmp/requirements.txt'
     self.assertEqual(
         expected_options.get_all_options(),
         sdk_worker_main._parse_pipeline_options(
             '{"options": {' + '"m_option": "/tmp/requirements.txt", ' +
             '"m_m_option":["beam_fn_api"]' + '}}').get_all_options())
     self.assertEqual(
         expected_options.get_all_options(),
         sdk_worker_main._parse_pipeline_options(
             '{"beam:option:m_option:v1": "/tmp/requirements.txt", ' +
             '"beam:option:m_m_option:v1":["beam_fn_api"]}').
         get_all_options())
     self.assertEqual(
         {'beam:option:m_option:v': 'mock_val'},
         sdk_worker_main._parse_pipeline_options(
             '{"options": {"beam:option:m_option:v":"mock_val"}}').
         get_all_options(drop_default=True))
     self.assertEqual(
         {'eam:option:m_option:v1': 'mock_val'},
         sdk_worker_main._parse_pipeline_options(
             '{"options": {"eam:option:m_option:v1":"mock_val"}}').
         get_all_options(drop_default=True))
     self.assertEqual(
         {'eam:option:m_option:v': 'mock_val'},
         sdk_worker_main._parse_pipeline_options(
             '{"options": {"eam:option:m_option:v":"mock_val"}}').
         get_all_options(drop_default=True))
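
For orientation, here is a minimal standalone sketch (not the internal sdk_worker_main helper) of how an options dictionary round-trips through PipelineOptions; the bucket path is made up for illustration:

from apache_beam.options.pipeline_options import PipelineOptions

# Build options from a plain dictionary and read everything back.
options = PipelineOptions.from_dictionary({
    'num_workers': 5,
    'temp_location': 'gs://my-bucket/tmp',  # hypothetical path
})
all_options = options.get_all_options()
assert all_options['num_workers'] == 5
assert all_options['temp_location'] == 'gs://my-bucket/tmp'
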
  def test_override_options(self):
    base_flags = ['--num_workers', '5']
    options = PipelineOptions(base_flags)
    self.assertEqual(options.get_all_options()['num_workers'], 5)
    self.assertEqual(options.get_all_options()['mock_flag'], False)

    options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True
    self.assertEqual(options.get_all_options()['num_workers'], 5)
    self.assertTrue(options.get_all_options()['mock_flag'])
  def test_extra_package(self):
    options = PipelineOptions(['--extra_package', 'abc',
                               '--extra_packages', 'def',
                               '--extra_packages', 'ghi'])
    self.assertEqual(
        sorted(options.get_all_options()['extra_packages']),
        ['abc', 'def', 'ghi'])

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['extra_packages'], None)
Example #6
  def test_experiments(self):
    options = PipelineOptions(['--experiment', 'abc', '--experiment', 'def'])
    self.assertEqual(
        sorted(options.get_all_options()['experiments']), ['abc', 'def'])

    options = PipelineOptions(['--experiments', 'abc', '--experiments', 'def'])
    self.assertEqual(
        sorted(options.get_all_options()['experiments']), ['abc', 'def'])

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['experiments'], None)
  def test_service_options(self):
    options = PipelineOptions(
        ['--service_option', 'whizz=bang', '--service_option', 'beep=boop'])
    self.assertEqual(
        sorted(options.get_all_options()['service_options']),
        ['beep=boop', 'whizz=bang'])

    options = PipelineOptions(
        ['--service_options', 'whizz=bang', '--service_options', 'beep=boop'])
    self.assertEqual(
        sorted(options.get_all_options()['service_options']),
        ['beep=boop', 'whizz=bang'])

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['service_options'], None)
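
The paired flag spellings above (--experiment/--experiments, --service_option/--service_options) rely on argparse's append action; a standalone sketch of that aggregation behavior:

import argparse

# Repeated flags accumulate into a list via action='append'.
parser = argparse.ArgumentParser()
parser.add_argument('--experiment', dest='experiments', action='append')
args = parser.parse_args(['--experiment', 'abc', '--experiment', 'def'])
print(args.experiments)  # ['abc', 'def']
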
    def load(
        self,
        use_apache_beam: bool = False,
        pipeline_options: PipelineOptions = PipelineOptions(),
    ) -> pd.DataFrame:
        """Returns a DataFrame of all results from this ExperimentalDesign."""
        if not self._all_trials:
            self.generate_trials()

        temp_location = pipeline_options.get_all_options().get(
            "temp_location", "")
        temp_result_path = (temp_location + "/temp_result.csv"
                            if temp_location else "/temp_result.csv")

        if use_apache_beam:
            self._evaluate_all_trials_using_apache_beam(
                pipeline_options, temp_result_path)
        elif self._cores != 1:
            self._evaluate_all_trials_in_parallel()

        result = None
        if use_apache_beam:
            with self._filesystem.open(temp_result_path) as file:
                result = pd.read_csv(file)
        else:
            result = pd.concat(
                trial.evaluate(self._seed, self._filesystem)
                for trial in self._all_trials)

        return result
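
load() reads temp_location defensively because an unset option simply comes back empty; the same lookup in isolation (the bucket path is hypothetical):

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--temp_location', 'gs://my-bucket/tmp'])
# dict.get() supplies a fallback if the key is absent.
temp_location = options.get_all_options().get('temp_location', '')
print(temp_location)  # gs://my-bucket/tmp
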
Example #10
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='',
                        help='Path to the data file(s) containing game data.')

    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)

    # Create and run the pipeline
    with beam.Pipeline(options=options) as p:
        (p | 'ReadInputText' >> beam.io.ReadFromText(args.input)
         | 'ParseGameEvent' >> ParDo(ParseEventFn())
         | 'ExtractUserScore' >> ExtractAndSumScore()
         | 'FormatUserScoreSums' >> ParDo(FormatUserScoreSumsFn())
         | 'WriteTeamScoreSums' >> WriteToBigQuery(
             args.output_table_name, args.output_dataset,
             options.get_all_options().get("project"), table_schema()))
 def test_parse_pipeline_options(self):
   expected_options = PipelineOptions([])
   expected_options.view_as(
       SdkWorkerMainTest.MockOptions).m_m_option = [
           'worker_threads=1', 'beam_fn_api'
       ]
   expected_options.view_as(
       SdkWorkerMainTest.MockOptions).m_option = '/tmp/requirements.txt'
   self.assertEqual(
       {'m_m_option': ['worker_threads=1']},
       sdk_worker_main._parse_pipeline_options(
           '{"options": {"m_m_option":["worker_threads=1"]}}')
       .get_all_options(drop_default=True))
   self.assertEqual(
       expected_options.get_all_options(),
       sdk_worker_main._parse_pipeline_options(
           '{"options": {' +
           '"m_option": "/tmp/requirements.txt", ' +
           '"m_m_option":["worker_threads=1", "beam_fn_api"]' +
           '}}').get_all_options())
   self.assertEqual(
       {'m_m_option': ['worker_threads=1']},
       sdk_worker_main._parse_pipeline_options(
           '{"beam:option:m_m_option:v1":["worker_threads=1"]}')
       .get_all_options(drop_default=True))
   self.assertEqual(
       expected_options.get_all_options(),
       sdk_worker_main._parse_pipeline_options(
           '{"beam:option:m_option:v1": "/tmp/requirements.txt", ' +
           '"beam:option:m_m_option:v1":["worker_threads=1", ' +
           '"beam_fn_api"]}').get_all_options())
   self.assertEqual(
       {'beam:option:m_option:v': 'mock_val'},
       sdk_worker_main._parse_pipeline_options(
           '{"options": {"beam:option:m_option:v":"mock_val"}}')
       .get_all_options(drop_default=True))
   self.assertEqual(
       {'eam:option:m_option:v1': 'mock_val'},
       sdk_worker_main._parse_pipeline_options(
           '{"options": {"eam:option:m_option:v1":"mock_val"}}')
       .get_all_options(drop_default=True))
   self.assertEqual(
       {'eam:option:m_option:v': 'mock_val'},
       sdk_worker_main._parse_pipeline_options(
           '{"options": {"eam:option:m_option:v":"mock_val"}}')
       .get_all_options(drop_default=True))
Example #12
def run(pipeline_options, known_args):
    global force_tf_compat_v1
    argv = None  # if None, uses sys.argv
    pipeline_options = PipelineOptions(argv)
    pipeline = beam.Pipeline(options=pipeline_options)

    if "universal-sentence-encoder" in MODEL_URL and int(
            MODEL_URL.split("/")[-1]) <= 2:
        # https://github.com/tensorflow/transform/issues/160
        force_tf_compat_v1 = True

    with tft_beam.Context(temp_dir=tempfile.mkdtemp(),
                          force_tf_compat_v1=force_tf_compat_v1):
        print("Context force_tf_compat_v1: {}".format(
            tft_beam.Context.get_use_tf_compat_v1()))
        articles = (
            pipeline
            | beam.Create([
                {
                    "id": "01",
                    "text": "To be, or not to be: that is the question: "
                },
                {
                    "id": "02",
                    "text": "Whether 'tis nobler in the mind to suffer "
                },
                {
                    "id": "03",
                    "text": "The slings and arrows of outrageous fortune, "
                },
                {
                    "id": "04",
                    "text": "Or to take arms against a sea of troubles, "
                },
            ]))

        articles_dataset = (articles, get_metadata())

        transformed_dataset, transform_fn = (
            articles_dataset
            | "Extract embeddings" >>
            tft_beam.AnalyzeAndTransformDataset(preprocess_fn))

        transformed_data, transformed_metadata = transformed_dataset

        _ = (transformed_data
             | "Print embeddings" >> beam.Map(print_pass)
             | "Write embeddings to TFRecords" >>
             beam.io.tfrecordio.WriteToTFRecord(
                 file_path_prefix="{0}".format(known_args.output_dir),
                 file_name_suffix=".tfrecords",
                 coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                     transformed_metadata.schema),
                 num_shards=1))

    job = pipeline.run()
    if pipeline_options.get_all_options()["runner"] == "DirectRunner":
        job.wait_until_finish()
Example #13
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # Specifically, we have the input PubSub subscription to read from and
    # the output and log buckets to write to.

    parser.add_argument(
        '--input_subscription',
        required=True,
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))

    parser.add_argument('--output',
                        required=True,
                        help='Output bucket for data',
                        default='')
    parser.add_argument('--log', required=True, help='log bucket', default='')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    #import pprint
    #pprint.pprint(known_args)
    #pprint.pprint(pipeline_args)
    #pprint.pprint(pipeline_options.get_all_options())

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    # get options
    project_id = pipeline_options.get_all_options()['project']
    output_bucket_name = known_args.output
    log_bucket_name = known_args.log
    log_file_path = 'gs://{}/logs'.format(log_bucket_name)

    fs = GCSFileSystem(pipeline_options=pipeline_options)

    # DataIngestion is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_copier = DataCopier()
    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information including where Dataflow should
    # store temp files, and what the project id is

    p = beam.Pipeline(options=pipeline_options)

    (p | beam.io.ReadFromPubSub(subscription=known_args.input_subscription)
     | 'Copying customer data to the final data-bucket/customer-id' >>
     beam.Map(lambda m: data_copier.parse_method(m, project_id, fs,
                                                 output_bucket_name))
     | 'Write results to the output bucket' >>
     WriteToText(file_path_prefix=log_file_path))

    p.run().wait_until_finish()
 def test_option_with_space(self):
   options = PipelineOptions(flags=['--option with space= value with space'])
   self.assertEqual(
       getattr(options.view_as(PipelineOptionsTest.MockOptions),
               'option with space'), ' value with space')
   options_from_dict = PipelineOptions.from_dictionary(
       options.get_all_options())
   self.assertEqual(
       getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
               'option with space'), ' value with space')
Example #16
    def test_should_create_pipeline_from_pipeline_options(
        self,
        _create_pipeline_mock: mock.Mock,
    ):
        # given
        _create_pipeline_mock.return_value.run.return_value = RunnerResult(
            'DONE', None)

        driver = CountWordsDriver()

        options = PipelineOptions()
        options.view_as(StandardOptions).runner = 'DataflowRunner'
        options.view_as(GoogleCloudOptions).project = 'gcp_project_id'
        options.view_as(GoogleCloudOptions).job_name = 'beam-wordcount-uuid'
        options.view_as(
            GoogleCloudOptions).staging_location = "gs://staging_location"
        options.view_as(
            GoogleCloudOptions).temp_location = "gs://temp_location"
        options.view_as(GoogleCloudOptions).region = 'region'
        options.view_as(
            GoogleCloudOptions).service_account_email = 'service-account'
        options.view_as(WorkerOptions).machine_type = 'n2-standard-8'
        options.view_as(WorkerOptions).max_num_workers = 2
        options.view_as(
            WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
        options.view_as(SetupOptions).setup_file = "/path/to/setup.py"

        job = BeamJob(
            id='count_words',
            entry_point=driver.nope,
            pipeline_options=options,
            execution_timeout_sec=10,
        )

        # when
        job.execute(JobContext.make())

        # then
        self.assertDictEqual(
            options.get_all_options(),
            _create_pipeline_mock.call_args[1]['options'].get_all_options(),
        )
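
view_as(), used throughout this test, returns a typed view over one shared options object, so values set through any view show up in get_all_options(); a minimal sketch (the project id is hypothetical):

from apache_beam.options.pipeline_options import (
    GoogleCloudOptions, PipelineOptions, StandardOptions)

options = PipelineOptions()
options.view_as(StandardOptions).runner = 'DataflowRunner'
options.view_as(GoogleCloudOptions).project = 'my-gcp-project'
all_options = options.get_all_options()
print(all_options['runner'])   # DataflowRunner
print(all_options['project'])  # my-gcp-project
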
 def test_from_dictionary(self):
   for case in PipelineOptionsTest.TEST_CASES:
     options = PipelineOptions(flags=case['flags'])
     all_options_dict = options.get_all_options()
     options_from_dict = PipelineOptions.from_dictionary(all_options_dict)
     self.assertEqual(options_from_dict.view_as(
         PipelineOptionsTest.MockOptions).mock_flag,
                      case['expected']['mock_flag'])
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_option,
                      case['expected']['mock_option'])
Example #19
 def test_get_all_options(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         self.assertDictContainsSubset(case['expected'],
                                       options.get_all_options())
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_flag,
             case['expected']['mock_flag'])
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_option,
             case['expected']['mock_option'])
    def pipeline_constructor(options: PipelineOptions) -> TestPipeline:
        non_default_options = options.get_all_options(drop_default=True)
        expected_non_default_options = {
            "project": project_id,
            "save_main_session": True,
        }

        if expected_non_default_options != non_default_options:
            raise ValueError(
                f"Expected non-default options [{expected_non_default_options}] do not match actual "
                f"non-default options [{non_default_options}]")
        return TestPipeline()
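
drop_default=True is what makes the comparison above tractable: only options that differ from their declared defaults are returned. A sketch (project id hypothetical):

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--project', 'my-gcp-project'])
# Only explicitly-set options survive drop_default=True.
print(options.get_all_options(drop_default=True))
# expected: {'project': 'my-gcp-project'}
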
    def test_redefine_options(self):
        class TestRedefinedOptions(PipelineOptions):  # pylint: disable=unused-variable
            @classmethod
            def _add_argparse_args(cls, parser):
                parser.add_argument('--redefined_flag', action='store_true')

        class TestRedefinedOptions(PipelineOptions):
            @classmethod
            def _add_argparse_args(cls, parser):
                parser.add_argument('--redefined_flag', action='store_true')

        options = PipelineOptions(['--redefined_flag'])
        self.assertTrue(options.get_all_options()['redefined_flag'])
  def test_extra_args(self):
    options = PipelineOptions([
        '--extra_arg', 'val1',
        '--extra_arg', 'val2',
        '--extra_arg=val3',
        '--unknown_arg', 'val4'])

    def add_extra_options(parser):
      parser.add_argument("--extra_arg", action='append')

    self.assertEqual(options.get_all_options(
        add_extra_args_fn=add_extra_options)
                     ['extra_arg'], ['val1', 'val2', 'val3'])
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        type=str,
        default='',
        help=
        'Path to the data file(s) containing game data (use either this parameter or --topic but not both).'
    )

    parser.add_argument(
        '--topic',
        type=str,
        default='',
        help=
        'Topic to subscribe to (use either this parameter or --input but not both).'
    )

    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        (p | 'ReadGameEvents' >> ReadGameEvents(args)
         | 'WindowedTeamScore' >> WindowedTeamScore(30)
         | 'FormatTeamScoreSums' >> ParDo(
             FormatTeamScoreSumsFn(
                 (args.topic != None) and (args.topic != "")))
         | 'WriteTeamScoreSums' >>
         WriteToBigQuery(args.output_table_name, args.output_dataset,
                         options.get_all_options().get("project"),
                         table_schema(), BigQueryDisposition.CREATE_IF_NEEDED,
                         BigQueryDisposition.WRITE_APPEND))
Example #27
def run(known_args, argv):
    """ Main funtion that create pipeline and run it"""
    options = PipelineOptions(argv)

    pipeline = beam.Pipeline(options=options)

    words = "asdf asdf asdf asdf asdf asdf"
    lines = pipeline | 'create words' >> beam.Create(words.split(" "))

    result = (lines
              | 'count words' >> beam.ParDo(count)
              | 'sum' >> beam.CombineGlobally(sum)
              | 'save' >> beam.io.WriteToText(known_args.output))

    result = pipeline.run()
    if options.get_all_options()['runner'] == "DirectRunner":
        result.wait_until_finish()
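
Several examples here gate wait_until_finish() on the configured runner, since only a local DirectRunner result is worth blocking on; the pattern in isolation:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--runner', 'DirectRunner'])
pipeline = beam.Pipeline(options=options)
result = pipeline.run()
# Block only for local runs; remote runners are left to run asynchronously.
if options.get_all_options()['runner'] == 'DirectRunner':
    result.wait_until_finish()
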
Example #28
def main(argv):
    parser = create_arg_parser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    experiment_driver = ExperimentDriver(
        known_args.data_design_dir,
        known_args.experimental_design,
        known_args.output_file,
        known_args.intermediates_dir,
        known_args.seed,
        known_args.cores,
        known_args.analysis_type,
    )

    pipeline_args.extend(
        [
            f"--temp_location={known_args.intermediates_dir}",
            f"--direct_num_workers={known_args.cores}",
        ]
    )
    pipeline_options = PipelineOptions(pipeline_args)

    # Set up a filesystem object according to the runner mode
    # Currently, we only support GCS for the data storage for the Dataflow runner.
    filesystem = None
    if pipeline_options.get_all_options()["runner"] in [
        "dataflow",
        "DataflowRunner",
    ]:
        filesystem = FsCloudPathWrapper()
        filesystem.set_default_client_to_gs_client()
    else:
        filesystem = FsPathlibWrapper()

    experiment_driver.execute(
        known_args.use_apache_beam,
        pipeline_options,
        filesystem,
    )
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # 1. Generate an array from the .mat file and set it as the pipeline input
    rows = (p
            | 'new rows' >> beam.Create(
                MatSource(GS_PATH, 'imdb.mat',
                          pipeline_options.get_all_options()['runner'])()))

    p_img = rows | 'process image file path' >> beam.ParDo(ProcessImgFn())
    p_csv = (rows
             | 'produce csv' >> beam.ParDo(ConvertToStr())
             | 'write to csv' >> beam.io.WriteToText(GS_UPPATH + '/csv/path_age.csv'))
    result = p.run()
    result.wait_until_finish()
def main(argv):
    """Main entry point"""

    # Define and parse command line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='',
                        help='Path to the data file(s) containing game data.')

    parser.add_argument('--output_dataset',
                        type=str,
                        default='',
                        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument('--output_table_name',
                        type=str,
                        default='',
                        help='The BigQuery table name where to write all the data.')

    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    # Create and run the pipeline
    with beam.Pipeline(options=options) as p:
        (p | 'ReadInputText'          >> beam.io.ReadFromText(args.input)
           | 'ParseGameEvent'         >> ParDo(ParseEventFn())
           | 'AddEventTimestamps'     >> beam.Map(lambda element: TimestampedValue(element, element['timestamp']))
           | 'WindowedTeamScore'      >> WindowedTeamScore(3600000) # 1 hour = 3600 seconds = 3600000 milliseconds
           | 'FormatTeamScoreSums'    >> ParDo(FormatTeamScoreSumsFn())
           | 'WriteTeamScoreSums'     >> WriteToBigQuery(
                    args.output_table_name,
                    args.output_dataset,
                    options.get_all_options().get("project"),
                    table_schema()
            )
        )
Example #31
def generate_statistics_from_tfrecord(
        pipeline_args,  # type: List[str]
        data_location,  # type: str
        output_path,  # type: str
        stats_options  # type: StatsOptions
):
    # type: (...) ->  statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate stats file from a tfrecord dataset using TFDV

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions controlling stats generation
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(
        data_location=input_files,
        output_path=output_path,
        stats_options=stats_options,
        pipeline_options=pipeline_options)
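
The helper above relies on unset options coming back as None from get_all_options(), and fills them in through the corresponding view; the same check in isolation (the fallback name here is illustrative):

from apache_beam.options.pipeline_options import (
    GoogleCloudOptions, PipelineOptions)

options = PipelineOptions([])
if options.get_all_options()['job_name'] is None:
    # Fall back to a generated name, mirroring the helper above.
    options.view_as(GoogleCloudOptions).job_name = 'generatestats-12345'
print(options.get_all_options()['job_name'])
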
def main(argv):
    parser = argparse.ArgumentParser()

    parser.add_argument('--topic',
                        type=str,
                        default='',
                        help='Topic to subscribe to.')

    parser.add_argument('--output_dataset',
                        type=str,
                        default='',
                        help='The BigQuery dataset name where to write all the data.')

    parser.add_argument('--output_table_name',
                        type=str,
                        default='',
                        help='The BigQuery table name where to write all the data.')


    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        (p | 'ReadMessages'           >> ReadFromPubSub(args.topic)             
           | 'FormatRecord'           >> beam.Map(lambda element: {"data": element})  
#          | "PrintBeforeInsert"      >> beam.Map(lambda record: print str(element))
           | 'WriteDataElementBQ'     >> WriteToBigQuery(                                                        
                     args.output_table_name,
                     args.output_dataset,
                     options.get_all_options().get("project"),
                     table_schema(),
                     BigQueryDisposition.CREATE_IF_NEEDED,
                     BigQueryDisposition.WRITE_APPEND
                 )
           )
  def test_template_location(self):
    options = PipelineOptions(['--template_location', 'abc'])
    self.assertEqual(options.get_all_options()['template_location'], 'abc')

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['template_location'], None)
 def test_unknown_duplicate_args_converted_to_list(self):
   options = PipelineOptions(['--dup_arg', 'val1',
                              '--dup_arg', 'val2',
                              '--dup_arg=val3'])
   self.assertEqual(options.get_all_options()['dup_arg'],
                    ['val1', 'val2', 'val3'])
 def test_create_test_pipeline_options(self):
   test_pipeline = TestPipeline(argv=self.TEST_CASE['options'])
   test_options = PipelineOptions(test_pipeline.get_full_options_as_args())
   self.assertDictContainsSubset(self.TEST_CASE['expected_dict'],
                                 test_options.get_all_options())
  def test_dataflow_job_file(self):
    options = PipelineOptions(['--dataflow_job_file', 'abc'])
    self.assertEqual(options.get_all_options()['dataflow_job_file'], 'abc')

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['dataflow_job_file'], None)
 def test_retain_unknown_options_unary_missing_prefix(self):
     options = PipelineOptions(['bad_option'])
     with self.assertRaises(SystemExit):
         options.get_all_options(retain_unknown_options=True)
 def test_retain_unknown_options_unary_single_dash_store_true(self):
     options = PipelineOptions(['-i'])
     result = options.get_all_options(retain_unknown_options=True)
     self.assertEqual(result['i'], True)
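
retain_unknown_options=True, exercised by the two tests above, keeps flags that no registered options class declares instead of discarding them; a sketch with a made-up flag name:

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--some_unknown_flag', 'value'])
result = options.get_all_options(retain_unknown_options=True)
print(result['some_unknown_flag'])  # value
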
 def test_invalid_override_init_options(self):
     base_flags = ['--num_workers', '5']
     options = PipelineOptions(base_flags, mock_invalid_flag=True)
     self.assertEqual(options.get_all_options()['num_workers'], 5)
     self.assertEqual(options.get_all_options()['mock_flag'], False)
Example #42
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load SupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load IncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        supervision_contacts = (
            p | 'Load StateSupervisionContacts' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionContact,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents
        ssvr_to_agent_association_query = select_all_by_person_query(
            reference_dataset, SSVR_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code,
            person_id_filter_set)

        ssvr_to_agent_associations = (
            p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=ssvr_to_agent_association_query,
                                       use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (
            ssvr_to_agent_associations
            | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo(
                ConvertDictToKVTuple(), 'supervision_violation_response_id'))

        supervision_period_to_agent_association_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code,
            person_id_filter_set)

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p |
                "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            state_code, person_id_filter_set)

        sp_to_judicial_district_kv = (
            p |
            "Read supervision_period to judicial_district associations from BigQuery"
            >> beam.io.Read(
                beam.io.BigQuerySource(query=sp_to_judicial_district_query,
                                       use_standard_sql=True))
            |
            "Convert supervision_period to judicial_district association table to KV"
            >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'assessments':
                assessments,
                'incarceration_periods':
                incarceration_periods_with_source_violations,
                'supervision_periods':
                supervision_periods,
                'supervision_sentences':
                sentences_converted.supervision_sentences,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'violation_responses':
                violation_responses_with_hydrated_violations,
                'supervision_contacts':
                supervision_contacts,
                'supervision_period_judicial_district_association':
                sp_to_judicial_district_kv
            }
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))
        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.
                SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.
                SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value))

        # Write the metrics to the output tables in BigQuery
        terminations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionTerminationMetric)
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionCaseComplianceMetric)
        populations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionPopulationMetric)
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationMetric)
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationAnalysisMetric)
        revocation_violation_type_analysis_table_id = \
            DATAFLOW_METRICS_TO_TABLES.get(SupervisionRevocationViolationTypeAnalysisMetric)
        successes_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionSuccessMetric)
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SuccessfulSupervisionSentenceDaysServedMetric)

        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=populations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=revocations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
             f" table: {successful_sentence_lengths_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successful_sentence_lengths_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             |
             f"Write termination metrics to BQ table: {terminations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=terminations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (
            writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
            |
            f"Write revocation analyses metrics to BQ table: {revocation_analysis_table_id}"
            >> beam.io.WriteToBigQuery(
                table=revocation_analysis_table_id,
                dataset=output,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS
             |
             f"Write revocation violation type analyses metrics to BQ table: "
             f"{revocation_violation_type_analysis_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=compliance_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
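
Both of these calculation pipelines read job details out of get_all_options() and decorate the resulting dict before passing it on; that dict-augmentation step in isolation (the job_timestamp key is these pipelines' own convention):

import datetime

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--job_name', 'supervision-calculations'])
all_pipeline_options = options.get_all_options()
# get_all_options() returns a plain dict, so extra bookkeeping keys can be
# attached without affecting the PipelineOptions object itself.
all_pipeline_options['job_timestamp'] = datetime.datetime.now().strftime(
    '%Y-%m-%d_%H_%M_%S.%f')
print(all_pipeline_options['job_name'])
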
Example #43
def run(argv=None):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + \
                        known_args.reference_input

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >> BuildRootEntity(
                       dataset=query_dataset,
                       data_dict=None,
                       root_schema_class=schema.StatePerson,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field='person_id',
                       build_related_entities=True))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p
            | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationPeriod,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionViolations
        supervision_violations = \
            (p
             | 'Load SupervisionViolations' >>
             BuildRootEntity(
                 dataset=query_dataset,
                 data_dict=None,
                 root_schema_class=schema.StateSupervisionViolation,
                 root_entity_class=entities.StateSupervisionViolation,
                 unifying_id_field='person_id',
                 build_related_entities=True
             ))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = \
            (p
             | 'Load SupervisionViolationResponses' >>
             BuildRootEntity(
                 dataset=query_dataset,
                 data_dict=None,
                 root_schema_class=schema.StateSupervisionViolationResponse,
                 root_entity_class=entities.StateSupervisionViolationResponse,
                 unifying_id_field='person_id',
                 build_related_entities=True
             ))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {
                'person':
                persons,
                'incarceration_periods':
                incarceration_periods_with_source_violations
            }
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of
        # residence
        person_id_to_county_query = \
            f"SELECT * FROM " \
            f"`{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify ReleaseEvents events from the StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, methodologies = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get recidivism metrics
        recidivism_metrics = (
            person_events
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options, inclusions=inclusions))

        filter_metrics_kwargs = {'methodologies': methodologies}

        # Filter out unneeded metrics
        final_recidivism_metrics = (
            recidivism_metrics
            | 'Filter out unwanted metrics' >> beam.ParDo(
                FilterMetrics(), **filter_metrics_kwargs))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            final_recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivismMetricWritableDict()).with_outputs(
                    'rates', 'counts', 'liberties'))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table = known_args.output + '.recidivism_rate_metrics'
        counts_table = known_args.output + '.recidivism_count_metrics'
        liberty_table = known_args.output + '.recidivism_liberty_metrics'

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.liberties
             | f"Write liberty metrics to BQ table: {liberty_table}" >>
             beam.io.WriteToBigQuery(
                 table=liberty_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))