Example #1
def main(argv=None):
    options = PipelineOptions(argv)
    options.view_as(SetupOptions).save_main_session = True

    p = Pipeline(options=options)

    input1 = p | 'Input1' >> beam.Create([1, 2, 3], reshuffle=False)
    input2 = p | 'Input2' >> beam.Create([4, 5, 6], reshuffle=False)

    output_a, output_b = (
        (input1, input2)
        | 'Flatten' >> beam.Flatten()
        | 'Split' >> beam.ParDo(MultiOutputDoFn()).with_outputs(
            MultiOutputDoFn.OUTPUT_TAG_B, main=MultiOutputDoFn.OUTPUT_TAG_A))

    # IdentityA and IdentityB set the output types so that the Dataflow runner
    # picks the right coders. Without them you may see a type inference error
    # (BEAM-4132).

    (output_a
     | 'IdentityA' >> beam.Map(lambda x: x).with_output_types(Tuple[str, int])
     | 'PrintA' >> beam.ParDo(StatefulPrintDoFn('PrintA')))

    (output_b
     | 'IdentityB' >> beam.Map(lambda x: x).with_output_types(Tuple[str, int])
     | 'PrintB' >> beam.ParDo(StatefulPrintDoFn('PrintB')))

    p.run()
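MultiOutputDoFn and StatefulPrintDoFn are defined elsewhere in the source; a minimal sketch of the multi-output part, assuming a hypothetical even/odd routing rule and the tag constants used above:

import apache_beam as beam

class MultiOutputDoFn(beam.DoFn):
    OUTPUT_TAG_A = 'tag_a'
    OUTPUT_TAG_B = 'tag_b'

    def process(self, element):
        # Hypothetical routing: even elements go to the main output (tag A),
        # odd elements to the additional tagged output (tag B).
        if element % 2 == 0:
            yield ('a', element)
        else:
            yield beam.pvalue.TaggedOutput(self.OUTPUT_TAG_B, ('b', element))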
Example #2
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('input_topic',
                        type=str,
                        help="Input Pub/Sub topic: projects/<project>/topics/<topic>.")
    parser.add_argument(
        'output_table',
        type=str,
        help="Output BigQuery table name. Example: dataset.table_name")
    parser.add_argument('--model_project',
                        type=str,
                        help="Google Project ID with model.")
    parser.add_argument('--model_name',
                        type=str,
                        help="Name of the AI Platform model.")
    parser.add_argument('--model_region',
                        type=str,
                        help="AI Platform region name.")
    parser.add_argument('--model_version',
                        type=str,
                        help="AI Platform model version.")

    known_args, pipeline_args = parser.parse_known_args(argv)

    _topic_comp = known_args.input_topic.split('/')
    if (len(_topic_comp) != 4 or _topic_comp[0] != 'projects'
            or _topic_comp[2] != 'topics'):
        raise ValueError(
            "Pub/Sub topic name has an invalid format. "
            "Expected: projects/<project>/topics/<topic>.")

    if len(known_args.output_table.split('.')) != 2:
        raise ValueError(
            "Output table name has an invalid format. Expected: <dataset>.<table>.")

    inf_args = [
        known_args.model_project, known_args.model_name,
        known_args.model_region, known_args.model_version
    ]
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)
    _ = (p | 'read from pub/sub' >> ReadFromPubSub(
        known_args.input_topic).with_output_types(bytes)
         | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
         | 'convert to dict' >> Map(json.loads)
         | 'pre processing' >> PreProcessing()
         | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
         | 'format message' >> Map(formatter)
         | 'write to BQ' >> WriteToBigQuery(
             table=known_args.output_table,
             schema=build_bq_schema(),
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_APPEND))
    if os.environ.get('DEPLOY'):
        # Use p.run() instead of the `with Pipeline() as p` context manager
        # because the process needs to exit without waiting for the job to finish.
        p.run()
    else:
        p.run().wait_until_finish()
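build_bq_schema, formatter, PreProcessing and MakeRemoteInferenceDoFn are project-specific; a minimal sketch of the two formatting helpers, assuming the inference step emits dicts with 'input' and 'prediction' fields (all names here are hypothetical):

import datetime
import json

def build_bq_schema():
    # Schema string accepted by WriteToBigQuery; must match formatter() below.
    return 'input:STRING, prediction:STRING, processed_at:TIMESTAMP'

def formatter(element):
    # Turn one inference result into a BigQuery row dict.
    return {
        'input': json.dumps(element.get('input')),
        'prediction': str(element.get('prediction')),
        'processed_at': datetime.datetime.utcnow().isoformat(),
    }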
Example #3
def main(argv=None):
    options = PipelineOptions(argv)
    p = Pipeline(options=options)

    (p
     | GenerateSequence(
         0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
     | Map(lambda x: logging.info(x)))

    p.run()
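BEAM_IO_EXPANSION_SERVICE is defined elsewhere; a plausible definition, assuming the standard Beam Java IO expansion service jar (the Gradle target is an assumption):

from apache_beam.transforms.external import BeamJarExpansionService

# Assumption: use the Java IO expansion service bundled with Beam for
# cross-language transforms such as GenerateSequence.
BEAM_IO_EXPANSION_SERVICE = BeamJarExpansionService(
    'sdks:java:io:expansion-service:shadowJar')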
Example #4
def main(argv=None):
  options = PipelineOptions(argv)

  p = Pipeline(options=options)

  (p
   | Create(["a", "b", "c", "d", "e"], reshuffle=False)
   | Print("hello", expansion_service(options)))

  p.run()
Example #5
    def handle_return(self, pipeline: beam.Pipeline) -> None:
        """Appends a beam.io.WriteToParquet at the end of a beam pipeline
        and therefore persists the results.

        Args:
            pipeline: A beam.pipeline object.
        """
        # TODO [ENG-139]: Implement beam writing
        super().handle_return(pipeline)
        # pipeline | beam.ParDo()  # placeholder; beam.ParDo() needs a DoFn, so
        # the write step is left commented out until it is implemented.
        pipeline.run()
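For the TODO above, a minimal sketch of a Parquet write step, assuming the elements are flat dicts and a pyarrow schema is known (schema and field names are hypothetical):

import apache_beam as beam
import pyarrow as pa

def write_parquet(records, output_path):
    # Append a WriteToParquet sink to an existing PCollection of dicts.
    schema = pa.schema([('id', pa.int64()), ('value', pa.float64())])
    return (records
            | 'WriteParquet' >> beam.io.WriteToParquet(
                file_path_prefix=output_path, schema=schema))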
Example #6
def main(argv=None):
    options = PipelineOptions(argv)
    p = Pipeline(options=options)

    input = p | 'Input' >> beam.Create([1, 2, 3], reshuffle=False)
    output1 = input | 'Output1' >> beam.Map(lambda x, side:
                                            (x, side), AsList(input))
    input | 'Output2' >> beam.Map(
        lambda x, side: logging.info('x: %s, side: %s', x, side),
        AsList(output1))

    p.run()
Example #7
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True
    project = options.view_as(GoogleCloudOptions).project

    p = Pipeline(options=options)
    (p
     | Create(EN_TEXTS)
     | ParDo(TranslateDoFn(project, SOURCE_LANGUAGE_CODE,
                           TARGET_LANGUAGE_CODE))
     | Map(print_translation))

    p.run()
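TranslateDoFn, print_translation and the *_LANGUAGE_CODE constants are defined elsewhere; a minimal sketch of the DoFn, assuming the Cloud Translation v3 client (google-cloud-translate):

import apache_beam as beam
from google.cloud import translate

class TranslateDoFn(beam.DoFn):
    def __init__(self, project, source_code, target_code):
        self._project = project
        self._source = source_code
        self._target = target_code

    def setup(self):
        self._client = translate.TranslationServiceClient()

    def process(self, text):
        # Translate one element and emit (original, translated) pairs.
        response = self._client.translate_text(
            parent='projects/{}/locations/global'.format(self._project),
            contents=[text],
            source_language_code=self._source,
            target_language_code=self._target)
        yield text, response.translations[0].translated_text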
Example #8
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    p = Pipeline(options=options)
    (p
     | Create(list(range(NUM_SHARDS)))
     | FlatMap(lambda _:
               (bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
     | WithKeys('')
     | ParDo(BigBagDoFn()))

    p.run()
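The constants and BigBagDoFn are defined elsewhere; a minimal sketch, assuming the DoFn just accumulates every value for a key in a bag state to exercise large per-key state (sizes are hypothetical):

import apache_beam as beam
from apache_beam.coders import BytesCoder
from apache_beam.transforms.userstate import BagStateSpec

NUM_SHARDS = 10                # hypothetical
NUM_ELEMENTS_PER_SHARD = 1000  # hypothetical
ELEMENT_BYTES = 1024           # hypothetical

class BigBagDoFn(beam.DoFn):
    BAG = BagStateSpec('bag', BytesCoder())

    def process(self, kv, bag=beam.DoFn.StateParam(BAG)):
        # Only fills the bag; the point is to build up large per-key state.
        _, value = kv
        bag.add(value)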
Example #9
def main(argv=None):
    options = PipelineOptions(argv)
    kafka_options = options.view_as(KafkaReadOptions)

    p = Pipeline(options=options)
    (p
     | ReadFromKafka(consumer_config={
         'bootstrap.servers': kafka_options.bootstrap_servers
     },
                     topics=[kafka_options.topic])
     | Map(lambda x: logging.info('kafka element: %s', x)))

    p.run()
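KafkaReadOptions is a custom options class; a minimal sketch (flag names and defaults are assumptions):

from apache_beam.options.pipeline_options import PipelineOptions

class KafkaReadOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--bootstrap_servers', default='localhost:9092')
        parser.add_argument('--topic', default='beam-input')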
Example #10
def pipeline_options_remote(argv):
    """Creating a Pipeline using a PipelineOptions object for remote execution."""

    from apache_beam import Pipeline
    from apache_beam.options.pipeline_options import PipelineOptions

    # [START pipeline_options_create]
    options = PipelineOptions(flags=argv)

    # [END pipeline_options_create]

    # [START pipeline_options_define_custom]
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input')
            parser.add_argument('--output')

    # [END pipeline_options_define_custom]

    from apache_beam.options.pipeline_options import GoogleCloudOptions
    from apache_beam.options.pipeline_options import StandardOptions

    # [START pipeline_options_dataflow_service]
    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=argv)

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://my-bucket/binaries'
    google_cloud_options.temp_location = 'gs://my-bucket/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    # [END pipeline_options_dataflow_service]

    my_options = options.view_as(MyOptions)
    my_input = my_options.input
    my_output = my_options.output

    p = TestPipeline()  # Use TestPipeline for testing.

    lines = p | beam.io.ReadFromText(my_input)
    lines | beam.io.WriteToText(my_output)

    p.run()
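A hypothetical invocation that populates the custom MyOptions flags through argv:

# Equivalent to: python snippet.py --input gs://... --output gs://...
pipeline_options_remote([
    '--input=gs://my-bucket/input/*.txt',
    '--output=gs://my-bucket/output/results',
])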
Example #11
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
Example #12
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    p = Pipeline(options=options)

    start = 1
    end = 100
    (p
     | 'From {} to {}'.format(start, end) >> Create(list(range(start, end + 1)))
     | 'Sum' >> CombineGlobally(sum)
     | 'Print' >> ParDo(
         lambda total: logging.info('Sum from 1 to 100 is %s', total)))

    p.run()
Example #13
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    project = options.view_as(GoogleCloudOptions).project
    assert project is not None, '"project" is not specified.'

    source_code = 'en-US'
    target_code = 'ja'
    texts = ['Hello', 'Thank you', 'Goodbye']

    p = Pipeline(options=options)
    (p
     | 'Texts' >> Create(texts)
     | 'Translate' >> ParDo(Translate(project, source_code, target_code))
     | 'Print' >> Map(lambda pair: logging.info('%s -> %s', pair[0], pair[1])))

    p.run()
Example #14
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  p = Pipeline(options=options)

  start = 1
  end = 10

  (p
   | 'From {} to {}'.format(start, end)
   >> Create(list(range(start, end + 1)))
   | 'ToXml' >> ParDo(ToXmlDoFn())
   # If a job finishes too quickly, worker VMs can be shut down before they
   # ship logs from local files to Cloud Logging. A 30s sleep avoids this.
   | 'Sleep30s' >> ParDo(Sleep(30))
   | 'Print' >> ParDo(lambda xml: logging.info(xml))
   )

  p.run()
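ToXmlDoFn and Sleep are defined elsewhere; a minimal sketch of Sleep, assuming it simply delays each element before re-emitting it:

import time
import apache_beam as beam

class Sleep(beam.DoFn):
    def __init__(self, seconds):
        self._seconds = seconds

    def process(self, element):
        # Delay so worker logs can reach Cloud Logging before VM shutdown.
        time.sleep(self._seconds)
        yield element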
Example #15
def main(argv=None):
    options = PipelineOptions(argv)
    topic = options.view_as(PubSubTopicOptions).topic

    p = Pipeline(options=options)
    (p
     # This is an external transform
     # `apache_beam.io.external.gcp.pubsub.ReadFromPubSub`. This is different from
     # `apache_beam.io.gcp.pubsub.ReadFromPubSub` which is native transform used
     # for most cases.
     #
     # If you set expansion_service as BeamJarExpansionService(
     # 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar'), it will
     # fail as the beam jar has no dependency for DirectRunner. As a workaround,
     # specify custom expansion service jar in this project.
     | ReadFromPubSub(topic=topic,
                      with_attributes=True,
                      expansion_service=expansion_service(options))
     | Map(lambda message: logging.info("message: %s", message)))
    p.run()
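PubSubTopicOptions and expansion_service are project-specific; a minimal sketch of the options class (the flag name is an assumption):

from apache_beam.options.pipeline_options import PipelineOptions

class PubSubTopicOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument(
            '--topic', help='Pub/Sub topic: projects/<project>/topics/<topic>')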
Example #16
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    opt = options.view_as(_Options)
    inputs = opt.inputs
    output_prefix = opt.output_prefix or os.path.join(
        options.view_as(GoogleCloudOptions).temp_location, 'output')
    shards = opt.shards

    p = Pipeline(options=options)

    def generate(n):
        yield from range(n * _ELEMENTS_PER_INPUT,
                         (n + 1) * _ELEMENTS_PER_INPUT)

    (p
     | Create(range(inputs))
     | ParDo(generate).with_output_types(int)
     | WriteToText(output_prefix, num_shards=shards))

    p.run()
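_Options and _ELEMENTS_PER_INPUT are defined elsewhere; a minimal sketch (flag names and defaults are assumptions):

from apache_beam.options.pipeline_options import PipelineOptions

_ELEMENTS_PER_INPUT = 1000  # hypothetical

class _Options(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--inputs', type=int, default=10)
        parser.add_argument('--output_prefix', default=None)
        parser.add_argument('--shards', type=int, default=0)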
Example #17
def pipeline_options_local(argv):
    """Creating a Pipeline using a PipelineOptions object for local execution."""

    from apache_beam import Pipeline
    from apache_beam.options.pipeline_options import PipelineOptions

    options = PipelineOptions(flags=argv)

    # [START pipeline_options_define_custom_with_help_and_default]
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input',
                                help='Input for the pipeline',
                                default='gs://my-bucket/input')
            parser.add_argument('--output',
                                help='Output for the pipeline',
                                default='gs://my-bucket/output')

    # [END pipeline_options_define_custom_with_help_and_default]

    my_options = options.view_as(MyOptions)

    my_input = my_options.input
    my_output = my_options.output

    # [START pipeline_options_local]
    # Create and set your Pipeline Options.
    options = PipelineOptions()
    p = Pipeline(options=options)
    # [END pipeline_options_local]

    p = TestPipeline()  # Use TestPipeline for testing.
    lines = p | beam.io.ReadFromText(my_input)
    lines | beam.io.WriteToText(my_output)
    p.run()
Example #18
def pipeline_options_local(argv):
  """Creating a Pipeline using a PipelineOptions object for local execution."""

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the pipeline',
                          default='gs://my-bucket/output')
  # [END pipeline_options_define_custom_with_help_and_default]

  my_options = options.view_as(MyOptions)

  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  options = PipelineOptions()
  p = Pipeline(options=options)
  # [END pipeline_options_local]

  p = TestPipeline()  # Use TestPipeline for testing.
  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)
  p.run()
Example #19
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | GroupIntoBatchesWithMultiBags(BATCH_SIZE, BUFFERING_SECS)  # large batch size, 10-minute buffering trigger
   | Map(lambda kv: logging.info('key: %s, value count: %s',
                                 kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()
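make_large_elements and GroupIntoBatchesWithMultiBags are project-specific; a minimal sketch of the former, matching the 128 KiB comment (the padding scheme is hypothetical):

def make_large_elements(i):
    # Pad every element to roughly 128 KiB so batches grow large quickly.
    return b'x' * (128 * 1024)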
Example #20
def tensorize_sql_fields(pipeline: Pipeline, output_path: str,
                         sql_dataset: str, tensor_type: str):

    if tensor_type == 'categorical':
        query = _get_categorical_query(sql_dataset)
    elif tensor_type == 'continuous':
        query = _get_continuous_query(sql_dataset)
    elif tensor_type == 'icd':
        query = _get_icd_query(sql_dataset)
    elif tensor_type == 'disease':
        query = _get_disease_query(sql_dataset)
    elif tensor_type == 'phecode_disease':
        query = _get_phecode_query(sql_dataset)
    elif tensor_type == 'death':
        query = _get_death_and_censor_query(sql_dataset)
    else:
        raise ValueError(
            "Unsupported tensor_type {!r}; expected one of: categorical, "
            "continuous, icd, disease, phecode_disease, death.".format(tensor_type))

    bigquery_source = beam.io.BigQuerySource(query=query,
                                             use_standard_sql=True)
    # Query table in BQ
    steps = (
        pipeline
        | 'QueryTables' >> beam.io.Read(bigquery_source)

        # Each row is a dictionary where the keys are the BigQuery columns
        | 'CreateKey' >> beam.Map(lambda row: (row['sample_id'], row))

        # Group by key
        | 'GroupByKey' >> beam.GroupByKey()

        # Format into hd5 files and upload to GCS
        | 'CreateHd5sAndUploadToGCS' >> beam.Map(write_tensor_from_sql,
                                                 output_path, tensor_type))

    result = pipeline.run()
    result.wait_until_finish()
Example #21
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    BATCH_SIZE = 1000000
    BUFFERING_SECS = 600

    p = Pipeline(options=options)
    (p
     | Create(range(100), reshuffle=True)
     | ParDo(make_large_elements)  # 128 KiB
     | WithKeys('')
     | WindowInto(GlobalWindows(),
                  trigger=Repeatedly(
                      AfterAny(AfterCount(BATCH_SIZE),
                               AfterProcessingTime(BUFFERING_SECS))),
                  accumulation_mode=AccumulationMode.DISCARDING)
     | GroupByKey()
     | Map(lambda kv: logging.info('key: %s, value count: %s', kv[0],
                                   len(kv[1]))))

    run = p.run()
    run.wait_until_finish()
Example #22
class PerformanceRuntimeTypeCheckTest(unittest.TestCase):
    def setUp(self):
        self.p = Pipeline(options=PipelineOptions(
            performance_runtime_type_check=True, pipeline_type_check=False))

    def assertStartswith(self, msg, prefix):
        self.assertTrue(msg.startswith(prefix),
                        '"%s" does not start with "%s"' % (msg, prefix))

    def test_simple_input_error(self):
        with self.assertRaises(TypeCheckError) as e:
            (self.p
             | beam.Create([1, 1])
             | beam.FlatMap(lambda x: [int(x)]).with_input_types(
                 str).with_output_types(int))
            self.p.run()

        self.assertIn(
            "Type-hint for argument: 'x' violated. "
            "Expected an instance of {}, "
            "instead found 1, an instance of {}".format(str, int),
            e.exception.args[0])

    def test_simple_output_error(self):
        with self.assertRaises(TypeCheckError) as e:
            (self.p
             | beam.Create(['1', '1'])
             | beam.FlatMap(lambda x: [int(x)]).with_input_types(
                 int).with_output_types(int))
            self.p.run()

        self.assertIn(
            "Type-hint for argument: 'x' violated. "
            "Expected an instance of {}, "
            "instead found 1, an instance of {}.".format(int, str),
            e.exception.args[0])

    def test_simple_input_error_with_kwarg_typehints(self):
        @with_input_types(element=int)
        @with_output_types(int)
        class ToInt(beam.DoFn):
            def process(self, element, *args, **kwargs):
                yield int(element)

        with self.assertRaises(TypeCheckError) as e:
            (self.p | beam.Create(['1', '1']) | beam.ParDo(ToInt()))
            self.p.run()

        self.assertStartswith(
            e.exception.args[0], "Runtime type violation detected within "
            "ParDo(ToInt): Type-hint for argument: "
            "'element' violated. Expected an instance of "
            "{}, instead found 1, "
            "an instance of {}.".format(int, str))

    def test_do_fn_returning_non_iterable_throws_error(self):
        # This function is incorrect because it returns a non-iterable object
        def incorrect_par_do_fn(x):
            return x + 5

        with self.assertRaises(TypeError) as cm:
            (self.p | beam.Create([1, 1]) | beam.FlatMap(incorrect_par_do_fn))
            self.p.run()

        self.assertStartswith(cm.exception.args[0],
                              "'int' object is not iterable ")

    def test_simple_type_satisfied(self):
        @with_input_types(int, int)
        @with_output_types(int)
        class AddWithNum(beam.DoFn):
            def process(self, element, num):
                return [element + num]

        results = (self.p
                   | 'T' >> beam.Create([1, 2, 3]).with_output_types(int)
                   | 'Add' >> beam.ParDo(AddWithNum(), 1))

        assert_that(results, equal_to([2, 3, 4]))
        self.p.run()

    def test_simple_type_violation(self):
        self.p._options.view_as(TypeOptions).pipeline_type_check = False

        @with_output_types(str)
        @with_input_types(x=int)
        def int_to_string(x):
            return str(x)

        (self.p
         | 'Create' >> beam.Create(['some_string'])
         | 'ToStr' >> beam.Map(int_to_string))
        with self.assertRaises(TypeCheckError) as e:
            self.p.run()

        self.assertStartswith(
            e.exception.args[0],
            "Runtime type violation detected within ParDo(ToStr): "
            "Type-hint for argument: 'x' violated. "
            "Expected an instance of {}, "
            "instead found some_string, an instance of {}.".format(int, str))

    def test_pipeline_checking_satisfied_but_run_time_types_violate(self):
        self.p._options.view_as(TypeOptions).pipeline_type_check = False

        @with_output_types(Tuple[bool, int])
        @with_input_types(a=int)
        def is_even_as_key(a):
            # Simulate a programming error, should be: return (a % 2 == 0, a)
            # However this returns Tuple[int, int]
            return (a % 2, a)

        (self.p
         | 'Nums' >> beam.Create(range(1)).with_output_types(int)
         | 'IsEven' >> beam.Map(is_even_as_key)
         | 'Parity' >> beam.GroupByKey())

        with self.assertRaises(TypeCheckError) as e:
            self.p.run()

        self.assertStartswith(
            e.exception.args[0],
            "Runtime type violation detected within ParDo(IsEven): "
            "Type-hint for return type violated: "
            "Tuple[bool, int] hint type-constraint violated. "
            "The type of element #0 in the passed tuple is incorrect. "
            "Expected an instance of type bool, "
            "instead received an instance of type int. ")

    def test_pipeline_runtime_checking_violation_composite_type_output(self):
        self.p._options.view_as(TypeOptions).pipeline_type_check = False

        # The type hint applied via 'with_output_types' indicates the ParDo
        # should return an instance of type int. However, an instance of
        # float will be produced instead.
        with self.assertRaises(TypeCheckError) as e:
            (self.p
             | beam.Create([(1, 3.0)])
             | ('Swap' >> beam.FlatMap(lambda x_y1: [x_y1[0] + x_y1[1]]).
                with_input_types(Tuple[int, float]).with_output_types(int)))
            self.p.run()

        self.assertStartswith(
            e.exception.args[0],
            "Runtime type violation detected within ParDo(Swap): "
            "Type-hint for return type violated. "
            "Expected an instance of {}, "
            "instead found 4.0, an instance of {}.".format(int, float))

    def test_downstream_input_type_hint_error_has_descriptive_error_msg(self):
        @with_input_types(int)
        @with_output_types(int)
        class IntToInt(beam.DoFn):
            def process(self, element, *args, **kwargs):
                yield element

        @with_input_types(str)
        @with_output_types(int)
        class StrToInt(beam.DoFn):
            def process(self, element, *args, **kwargs):
                yield int(element)

        # This will raise a type check error in IntToInt even though the actual
        # type check error won't happen until StrToInt. The user will be told that
        # StrToInt's input type hints were not satisfied while running IntToInt.
        with self.assertRaises(TypeCheckError) as e:
            (self.p
             | beam.Create([9])
             | beam.ParDo(IntToInt())
             | beam.ParDo(StrToInt()))
            self.p.run()

        self.assertStartswith(
            e.exception.args[0],
            "Runtime type violation detected within ParDo(StrToInt): "
            "Type-hint for argument: 'element' violated. "
            "Expected an instance of {}, "
            "instead found 9, an instance of {}. "
            "[while running 'ParDo(IntToInt)']".format(str, int))
Example #23
def run_pipeline(self, context: JobContext, pipeline: Pipeline):
    logger.info("Run pipeline, context %s", context)
    return pipeline.run()