Esempi in Python per Pipeline.Pipeline, esempi in Python per apache_beam.Pipeline.Pipeline

Esempio n. 1

0

Mostra file

File: main_got_dataflow.py Progetto: andrewlarimer/streaming_sentiment

def main():
    """Stream Pub/Sub messages into per-entity scores appended to BigQuery.

    Messages read from ``PUBSUB_TOPIC`` go through ``FixedWindows(60)``, are
    expanded by ``emit_values`` (with ``entity_map`` as extra argument),
    combined per key by ``EntityScoreCombine``, passed through
    ``AddWindowTimestampFn`` and ``format_for_write``, and finally appended to
    the ``streaming_scores`` table.
    """
    # ReadFromPubSub is called with only the topic: the earlier
    # ``with_attributes=False`` / ``timestamp_attribute='created_at'``
    # arguments were removed, as was an older BigQuerySource-based input.
    with Pipeline(options=options) as p:
        messages = p | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
        windowed = messages | 'Window' >> WindowInto(window.FixedWindows(60))
        emitted = windowed | 'Emit_needed_values' >> FlatMap(
            emit_values, entity_map)
        combined = emitted | 'Combine' >> CombinePerKey(EntityScoreCombine())
        stamped = combined | 'Add Window Timestamp' >> beam.ParDo(
            AddWindowTimestampFn())
        rows = stamped | 'FormatForWrite' >> Map(format_for_write)

        # Table is created on first use and rows are always appended.
        _ = rows | 'Write' >> WriteToBigQuery(
            'streaming_scores',
            dataset=BQ_DATASET,
            project=PROJECT_ID,
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_APPEND',
            batch_size=20)

Esempio n. 2

0

Mostra file

File: snippets.py Progetto: wanwanzhu/beam

def pipeline_options_local(argv):
  """Creating a Pipeline using a PipelineOptions object for local execution.

  The ``[START ...]`` / ``[END ...]`` comments look like snippet-region
  markers, so statements are kept exactly where they are relative to them.

  Args:
    argv: Command-line flags, passed to ``PipelineOptions(flags=argv)``.
  """

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  # Options parsed from the caller-supplied flags; used below to read the
  # custom --input/--output values.
  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the pipeline',
                          default='gs://my-bucket/output')
  # [END pipeline_options_define_custom_with_help_and_default]

  # View the flag-derived options through MyOptions to get --input/--output.
  my_options = options.view_as(MyOptions)

  my_input = my_options.input
  my_output = my_options.output

  # NOTE: ``options`` is rebound below to a flag-less PipelineOptions(); the
  # values read above are unaffected.
  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  options = PipelineOptions()
  with Pipeline(options=options) as p:
    # [END pipeline_options_local]

    # NOTE: this inner ``with`` rebinds ``p``, so the read/write transforms
    # below are applied to the TestPipeline, not to the outer Pipeline.
    # ``TestPipeline`` and ``beam`` are not imported in this function and are
    # presumably provided at module level -- TODO confirm.
    with TestPipeline() as p:  # Use TestPipeline for testing.
      lines = p | beam.io.ReadFromText(my_input)
      lines | beam.io.WriteToText(my_output)

Esempio n. 3

0

Mostra file

def main(argv=None):
    """Flatten two small inputs, split them into two tagged outputs and print.

    Args:
        argv: Optional argument list handed to ``PipelineOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    first = pipeline | 'Input1' >> beam.Create([1, 2, 3], reshuffle=False)
    second = pipeline | 'Input2' >> beam.Create([4, 5, 6], reshuffle=False)

    merged = (first, second) | 'Flatten' >> beam.Flatten()
    split = merged | 'Split' >> beam.ParDo(MultiOutputDoFn()).with_outputs(
        MultiOutputDoFn.OUTPUT_TAG_B, main=MultiOutputDoFn.OUTPUT_TAG_A)
    tagged_a, tagged_b = split

    # The identity Map steps exist only to declare output types so that the
    # Dataflow runner picks the right coders; without them a type inference
    # error (BEAM-4132) may show up.
    typed_a = tagged_a | 'IdentityA' >> beam.Map(
        lambda x: x).with_output_types(Tuple[str, int])
    _ = typed_a | 'PrintA' >> beam.ParDo(StatefulPrintDoFn('PrintA'))

    typed_b = tagged_b | 'IdentityB' >> beam.Map(
        lambda x: x).with_output_types(Tuple[str, int])
    _ = typed_b | 'PrintB' >> beam.ParDo(StatefulPrintDoFn('PrintB'))

    pipeline.run()

Esempio n. 4

0

Mostra file

def run(argv=None):
  """Run ``connect_and_query`` once, driven by a single ``None`` element.

  Args:
    argv: Optional argument list handed to ``PipelineOptions``.
  """
  pipeline_options = PipelineOptions(argv)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  with Pipeline(options=pipeline_options) as pipeline:
    # One placeholder element so the ParDo is invoked.
    seed = pipeline | beam.Create([None])
    _ = seed | beam.ParDo(connect_and_query)

Esempio n. 5

0

Mostra file

def run(argv=None):
    """Build and run the streaming Pub/Sub -> inference -> BigQuery pipeline.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read
            ``sys.argv``. Arguments not declared here are forwarded to
            ``PipelineOptions``.

    Raises:
        ValueError: If the topic is not of the form
            ``projects/<project>/topics/<topic>`` or the output table name
            does not contain exactly one ``.``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_topic',
                        type=str,
                        help="Input Pub/Sub topic name.")
    # The example matches the validation below (exactly one '.'); the previous
    # example "project.db.name" was rejected by this very function.
    parser.add_argument(
        'output_table',
        type=str,
        help="Output BigQuery table name. Example: project:dataset.table")
    parser.add_argument('--model_project',
                        type=str,
                        help="Google Project ID with model.")
    parser.add_argument('--model_name',
                        type=str,
                        help="Name of the Google AI Platform model name.")
    parser.add_argument('--model_region',
                        type=str,
                        help="AI Platform region name.")
    parser.add_argument('--model_version',
                        type=str,
                        help="AI Platform model version.")

    known_args, pipeline_args = parser.parse_known_args(argv)

    # Validate before any pipeline object is created so bad input fails fast.
    topic_parts = known_args.input_topic.split('/')
    if (len(topic_parts) != 4 or topic_parts[0] != 'projects'
            or topic_parts[2] != 'topics'):
        raise ValueError("Topic name has inappropriate format.")

    if known_args.output_table.count('.') != 1:
        raise ValueError("Table name has inappropriate format.")

    inf_args = [
        known_args.model_project, known_args.model_name,
        known_args.model_region, known_args.model_version
    ]
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)
    _ = (p | 'read from pub/sub' >> ReadFromPubSub(
        known_args.input_topic).with_output_types(bytes)
         | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
         | 'convert to dict' >> Map(json.loads)
         | 'pre processing' >> PreProcessing()
         | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
         | 'format message' >> Map(formatter)
         | 'write to BQ' >> WriteToBigQuery(
             table=known_args.output_table,
             schema=build_bq_schema(),
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_APPEND))
    if os.environ.get('DEPLOY'):
        # When deploying, submit the job and return right away. A
        # ``with Pipeline() as p`` block is deliberately not used here because
        # this process needs to exit after starting the run.
        p.run()
    else:
        p.run().wait_until_finish()

Esempio n. 6

0

Mostra file

def main(argv=None):
    """Log every element produced by ``GenerateSequence(0, stop=100)``.

    Args:
        argv: Optional argument list handed to ``PipelineOptions``.
    """
    pipeline = Pipeline(options=PipelineOptions(argv))

    sequence = pipeline | GenerateSequence(
        0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
    _ = sequence | Map(lambda x: logging.info(x))

    pipeline.run()

Esempio n. 7

0

Mostra file

def main(argv=None):
  """Feed five letters into the ``Print`` transform.

  Args:
    argv: Optional argument list handed to ``PipelineOptions``.
  """
  pipeline_options = PipelineOptions(argv)
  pipeline = Pipeline(options=pipeline_options)

  letters = pipeline | Create(["a", "b", "c", "d", "e"], reshuffle=False)
  # The expansion service is obtained from the same options object.
  _ = letters | Print("hello", expansion_service(pipeline_options))

  pipeline.run()

Esempio n. 8

0

Mostra file

def main(argv=None):
    """Demonstrate list side inputs, including one derived from another.

    Args:
        argv: Optional argument list handed to ``PipelineOptions``.
    """
    pipeline = Pipeline(options=PipelineOptions(argv))

    numbers = pipeline | 'Input' >> beam.Create([1, 2, 3], reshuffle=False)

    # Pair each element with the whole input collection as a list.
    paired = numbers | 'Output1' >> beam.Map(
        lambda x, side: (x, side), AsList(numbers))

    # Log each element together with the paired collection as side input.
    _ = numbers | 'Output2' >> beam.Map(
        lambda x, side: logging.info('x: %s, side: %s', x, side),
        AsList(paired))

    pipeline.run()

Esempio n. 9

0

Mostra file

def main():
    """Translate ``EN_TEXTS`` with ``TranslateDoFn`` and print the results."""
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True
    # The project comes from the Google Cloud options of this run.
    project = pipeline_options.view_as(GoogleCloudOptions).project

    pipeline = Pipeline(options=pipeline_options)

    texts = pipeline | Create(EN_TEXTS)
    translated = texts | ParDo(
        TranslateDoFn(project, SOURCE_LANGUAGE_CODE, TARGET_LANGUAGE_CODE))
    _ = translated | Map(print_translation)

    pipeline.run()

Esempio n. 10

0

Mostra file

def main():
    """Generate byte blobs per shard, key them with ``''`` and run BigBagDoFn.

    Each of the ``NUM_SHARDS`` seed elements is expanded into
    ``NUM_ELEMENTS_PER_SHARD`` values of ``bytes(ELEMENT_BYTES)``.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    shards = pipeline | Create(list(range(NUM_SHARDS)))
    blobs = shards | FlatMap(
        lambda _: (bytes(ELEMENT_BYTES)
                   for _ in range(NUM_ELEMENTS_PER_SHARD)))
    # Every element gets the same empty-string key.
    keyed = blobs | WithKeys('')
    _ = keyed | ParDo(BigBagDoFn())

    pipeline.run()

Esempio n. 11

0

Mostra file

def main(argv=None):
    """Read from a Kafka topic and log every element.

    Args:
        argv: Optional argument list handed to ``PipelineOptions``; the
            bootstrap servers and topic are read via ``KafkaReadOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    kafka_options = pipeline_options.view_as(KafkaReadOptions)

    pipeline = Pipeline(options=pipeline_options)

    records = pipeline | ReadFromKafka(
        consumer_config={
            'bootstrap.servers': kafka_options.bootstrap_servers,
        },
        topics=[kafka_options.topic])
    _ = records | Map(lambda x: logging.info('kafka element: %s', x))

    pipeline.run()

Esempio n. 12

0

Mostra file

File: snippets.py Progetto: zoyahav/beam

def pipeline_options_remote(argv):
    """Creating a Pipeline using a PipelineOptions object for remote execution.

    The ``[START ...]`` / ``[END ...]`` comments look like snippet-region
    markers, so statements are kept exactly where they are relative to them.

    Args:
        argv: Command-line flags, passed to ``PipelineOptions(flags=argv)``.
    """

    from apache_beam import Pipeline
    from apache_beam.options.pipeline_options import PipelineOptions

    # [START pipeline_options_create]
    options = PipelineOptions(flags=argv)

    # [END pipeline_options_create]

    # [START pipeline_options_define_custom]
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input')
            parser.add_argument('--output')

    # [END pipeline_options_define_custom]

    from apache_beam.options.pipeline_options import GoogleCloudOptions
    from apache_beam.options.pipeline_options import StandardOptions

    # [START pipeline_options_dataflow_service]
    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=argv)

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://my-bucket/binaries'
    google_cloud_options.temp_location = 'gs://my-bucket/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    # [END pipeline_options_dataflow_service]

    # Read the custom --input/--output values from the same options object.
    my_options = options.view_as(MyOptions)
    my_input = my_options.input
    my_output = my_options.output

    # NOTE: ``p`` is rebound here, so the Pipeline created above is never run;
    # the transforms below are applied to, and run on, the TestPipeline.
    # ``TestPipeline`` and ``beam`` are not imported in this function and are
    # presumably provided at module level -- TODO confirm.
    p = TestPipeline()  # Use TestPipeline for testing.

    lines = p | beam.io.ReadFromText(my_input)
    lines | beam.io.WriteToText(my_output)

    p.run()

Esempio n. 13

0

Mostra file

def main():
    """Sum the integers 1..100 in a pipeline and log the total."""
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    first, last = 1, 100
    # The Create step is labelled with the range it produces.
    create_label = 'From {} to {}'.format(first, last)

    numbers = pipeline | create_label >> Create(list(range(first, last + 1)))
    total = numbers | 'Sum' >> CombineGlobally(sum)
    _ = total | 'Print' >> ParDo(
        lambda total: logging.info('Sum from 1 to 100 is %s', total))

    pipeline.run()

Esempio n. 14

0

Mostra file

def main():
  """Batch large keyed elements with ``GroupIntoBatchesWithMultiBags``.

  Blocks until the pipeline run has finished.
  """
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(SetupOptions).save_main_session = True

  # Arguments handed to GroupIntoBatchesWithMultiBags below.
  batch_size = 1000000
  buffering_secs = 600

  pipeline = Pipeline(options=pipeline_options)

  seeds = pipeline | Create(range(100), reshuffle=True)
  # Original note on this step: elements are 128 KiB.
  large = seeds | ParDo(make_large_elements)
  # Every element gets the same empty-string key.
  keyed = large | WithKeys('')
  batches = keyed | GroupIntoBatchesWithMultiBags(batch_size, buffering_secs)
  _ = batches | Map(lambda kv: logging.info('key: %s, value count: %s',
                                            kv[0], len(kv[1])))

  pipeline.run().wait_until_finish()

Esempio n. 15

0

Mostra file

def main():
    """Translate three English phrases to Japanese and log each pair.

    Raises:
        AssertionError: If no ``project`` is set in the Google Cloud options.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    assert project is not None, '"project" is not specified.'

    source_code, target_code = 'en-US', 'ja'
    texts = ['Hello', 'Thank you', 'Goodbye']

    pipeline = Pipeline(options=pipeline_options)

    phrases = pipeline | 'Texts' >> Create(texts)
    translated = phrases | 'Translate' >> ParDo(
        Translate(project, source_code, target_code))
    _ = translated | 'Print' >> Map(
        lambda pair: logging.info('%s -> %s', pair[0], pair[1]))

    pipeline.run()

Esempio n. 16

0

Mostra file

def main():
  """Convert the integers 1..10 with ``ToXmlDoFn`` and log the results."""
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline = Pipeline(options=pipeline_options)

  first, last = 1, 10
  create_label = 'From {} to {}'.format(first, last)

  numbers = pipeline | create_label >> Create(list(range(first, last + 1)))
  documents = numbers | 'ToXml' >> ParDo(ToXmlDoFn())
  # A job that finishes too quickly can have its worker VMs shut down before
  # their local log files reach Cloud Logging; the 30s sleep avoids that.
  delayed = documents | 'Sleep30s' >> ParDo(Sleep(30))
  _ = delayed | 'Print' >> ParDo(lambda xml: logging.info(xml))

  pipeline.run()

Esempio n. 17

0

Mostra file

File: Beam.py Progetto: Vamsi-Bandi/GCP

def run(input_topic, num_shards, window_size):
    """Read a Pub/Sub topic, group messages by fixed windows, write to GCS.

    Args:
        input_topic: Pub/Sub topic passed to ``io.ReadFromPubSub``.
        num_shards: Second argument of ``GroupMessagesByFixedWindows``.
        window_size: First argument of ``GroupMessagesByFixedWindows``.

    Note:
        ``pipeline_args`` is not a parameter; it is taken from the enclosing
        scope.
    """
    # ``save_main_session=True`` lets DoFns use globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True)

    custom_options = pipeline_options.view_as(CustomPipelineOptions)

    with Pipeline(options=custom_options) as pipeline:
        # ``timestamp_attribute`` is not given to ``ReadFromPubSub``, so Beam
        # binds the publish time returned by the Pub/Sub server to each
        # element's timestamp, accessible via ``DoFn.TimestampParam``.
        # https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.pubsub.html#apache_beam.io.gcp.pubsub.ReadFromPubSub
        messages = pipeline | "Read from Pub/Sub" >> io.ReadFromPubSub(
            topic=input_topic)
        grouped = messages | "Window into" >> GroupMessagesByFixedWindows(
            window_size, num_shards)
        _ = grouped | "Write to GCS" >> ParDo(
            WriteToGCS(custom_options.output_path))

Esempio n. 18

0

Mostra file

def main(argv=None):
    """Log messages read through the external Pub/Sub transform.

    Args:
        argv: Optional argument list handed to ``PipelineOptions``; the topic
            is read via ``PubSubTopicOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    topic = pipeline_options.view_as(PubSubTopicOptions).topic

    pipeline = Pipeline(options=pipeline_options)

    # ``ReadFromPubSub`` here is the external transform
    # ``apache_beam.io.external.gcp.pubsub.ReadFromPubSub``, not the native
    # ``apache_beam.io.gcp.pubsub.ReadFromPubSub`` used in most cases.
    #
    # Setting expansion_service to BeamJarExpansionService(
    # 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar') fails
    # because that beam jar has no DirectRunner dependency. As a workaround, a
    # custom expansion service jar from this project is used instead.
    messages = pipeline | ReadFromPubSub(
        topic=topic,
        with_attributes=True,
        expansion_service=expansion_service(pipeline_options))
    _ = messages | Map(lambda message: logging.info("message: %s", message))

    pipeline.run()

Esempio n. 19

0

Mostra file

def main():
    """Generate integer ranges per input and write them as sharded text.

    The output prefix falls back to ``<temp_location>/output`` when the
    ``output_prefix`` option is empty.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    custom = pipeline_options.view_as(_Options)
    inputs = custom.inputs
    output_prefix = custom.output_prefix
    if not output_prefix:
        temp_location = pipeline_options.view_as(
            GoogleCloudOptions).temp_location
        output_prefix = os.path.join(temp_location, 'output')
    shards = custom.shards

    pipeline = Pipeline(options=pipeline_options)

    def generate(n):
        # Input n yields the n-th block of _ELEMENTS_PER_INPUT integers.
        lower = n * _ELEMENTS_PER_INPUT
        upper = (n + 1) * _ELEMENTS_PER_INPUT
        yield from range(lower, upper)

    seeds = pipeline | Create(range(inputs))
    numbers = seeds | ParDo(generate).with_output_types(int)
    _ = numbers | WriteToText(output_prefix, num_shards=shards)

    pipeline.run()

Esempio n. 20

0

Mostra file

def main():
    """Group large keyed elements in the global window with a repeated trigger.

    The trigger is ``Repeatedly(AfterAny(AfterCount(1000000),
    AfterProcessingTime(600)))`` with discarding accumulation. Blocks until
    the pipeline run has finished.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    batch_size = 1000000
    buffering_secs = 600

    pipeline = Pipeline(options=pipeline_options)

    batch_trigger = Repeatedly(
        AfterAny(AfterCount(batch_size),
                 AfterProcessingTime(buffering_secs)))

    seeds = pipeline | Create(range(100), reshuffle=True)
    # Original note on this step: elements are 128 KiB.
    large = seeds | ParDo(make_large_elements)
    # Every element gets the same empty-string key.
    keyed = large | WithKeys('')
    windowed = keyed | WindowInto(
        GlobalWindows(),
        trigger=batch_trigger,
        accumulation_mode=AccumulationMode.DISCARDING)
    grouped = windowed | GroupByKey()
    _ = grouped | Map(lambda kv: logging.info(
        'key: %s, value count: %s', kv[0], len(kv[1])))

    pipeline.run().wait_until_finish()

Esempio n. 21

0

Mostra file

File: flink_runner_test.py Progetto: yjshen/beam

    def test_metrics(self):
      """Run a simple DoFn that increments a counter, and verify that its
      expected value is written to a temporary file by the FileReporter.

      The metrics file at ``self.test_metrics_path`` must contain exactly one
      line mentioning the counter, and that line must contain
      ``"<counter_name>: <n>"``.
      """

      counter_name = 'elem_counter'

      class DoFn(beam.DoFn):
        def __init__(self):
          self.counter = Metrics.counter(self.__class__, counter_name)
          logging.info('counter: %s', self.counter.metric_name)

        def process(self, v):
          self.counter.inc()

      options = self.create_options()
      # Test only supports parallelism of 1
      options._all_options['parallelism'] = 1
      n = 100
      with Pipeline(self.get_runner(), options) as p:
        # pylint: disable=expression-not-assigned
        (p
         | beam.Create(list(range(n)))
         | beam.ParDo(DoFn()))

      with open(self.test_metrics_path, 'r') as f:
        lines = [line for line in f if counter_name in line]
        self.assertEqual(
            len(lines), 1,
            msg='Expected 1 line matching "{}":\n{}'.format(
                counter_name, '\n'.join(lines))
        )
        line = lines[0]
        # The previous check was assertTrue(<non-empty string>), which can
        # never fail; check that the counter value is really on the line.
        self.assertIn(
            '{}: {}'.format(counter_name, n), line,
            msg='Failed to find expected counter {} in line {}'.format(
                counter_name, line)
        )

Esempio n. 22

0

Mostra file

File: dataflow.py Progetto: ThomasStencel/bigflow

 def _create_pipeline(self, options):
     """Return a new ``Pipeline`` constructed with the given ``options``."""
     pipeline = Pipeline(options=options)
     return pipeline

Esempio n. 23

0

Mostra file

def run(argv=None):
    """Run ``MysqlDoFn`` once, triggered by ``beam.Impulse()``.

    Args:
        argv: Optional argument list handed to ``PipelineOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with Pipeline(options=pipeline_options) as pipeline:
        impulse = pipeline | beam.Impulse()
        _ = impulse | beam.ParDo(MysqlDoFn())

Esempio n. 24

0

Mostra file

File: typecheck_test_py3.py Progetto: AfterShip/aftership-beam

 def setUp(self):
     """Create ``self.p`` with runtime type checks on, pipeline checks off."""
     type_check_options = PipelineOptions(
         performance_runtime_type_check=True, pipeline_type_check=False)
     self.p = Pipeline(options=type_check_options)

Esempio n. 25

0

Mostra file

def run():
    '''Entry point, it defines and runs the pipeline.

    Parses the database connection settings and the application-numbers file
    path from the command line, reads the application numbers, queries their
    publications, maps them to patent dicts and upserts them as one batch.

    Returns:
        The output of the ``UpsertPatentsToDB`` step.
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--db-user',
        dest='db_user',
        help='Username for Postgresql instance.',
        required=True
    )
    parser.add_argument(
        '--db-password',
        dest='db_password',
        help='Password for Postgresql instance.',
        required=True
    )
    parser.add_argument(
        '--db-name',
        dest='db_name',
        help='Patents database name.',
        required=True
    )
    parser.add_argument(
        '--db-host',
        dest='db_host',
        help='Hostname for Postgresql instance.',
        required=True
    )
    parser.add_argument(
        '--db-port',
        dest='db_port',
        help='Port number for Postgresql instance.',
        required=True
    )
    parser.add_argument(
        '--application-numbers-filepath',
        dest='application_numbers_filepath',
        help='Local or ``gs://`` path to file with application numbers.',
        required=True
    )

    known_args, pipeline_args = parser.parse_known_args()

    # Never write the database password to the logs: log a copy of the parsed
    # arguments with the secret masked.
    loggable_args = dict(vars(known_args))
    loggable_args['db_password'] = '***'
    logging.info(
        'Starting onboarding pipeline (args=%s)',
        argparse.Namespace(**loggable_args),
    )

    # We use the `save_main_session` option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with Pipeline(options=pipeline_options) as pipeline:
        # Read the text file[pattern] into a PCollection.
        lines = pipeline | 'ReadTextFile' >> ReadFromText(
            known_args.application_numbers_filepath
        )

        # Transform PCollection of text lines into PCollection of one element
        # which is slice of application numbers.
        app_numbers = lines | 'CombineApplicationNumbers' >> ToList()

        publications = (
            app_numbers
            | 'QueryPublications' >> ParDo(QueryPublications())
        )

        patents = (
            publications |
            'MapPublicationsToPatents' >> Map(
                lambda pb: {
                    'application_number': pb.application_number,
                    'application_kind': pb.application_kind,
                    'grant_date': datetime.datetime.fromtimestamp(pb.grant_date),
                },
            )
        )

        # Transform PCollection of patents into PCollection of one element which
        # is slice of patents.
        batch = patents | 'CombinePatentsToBatch' >> ToList()
        result = batch | 'UpsertPatentsToDB' >> ParDo(
            UpsertPatentsToDB(
                known_args.db_user,
                known_args.db_password,
                known_args.db_name,
                known_args.db_host,
                known_args.db_port,
            ),
        )

        return result

Esempio n. 26

0

Mostra file

        def test_metrics(self):
            """Check counter and state-cache metrics written to the metrics file.

            Runs a stateful DoFn that reads and appends to bag state while
            incrementing a counter, then verifies that every expected metric
            string occurs in the file at ``self.test_metrics_path``.
            """

            counter_name = 'elem_counter'
            state_spec = userstate.BagStateSpec('state', VarIntCoder())

            class DoFn(beam.DoFn):
                def __init__(self):
                    self.counter = Metrics.counter(self.__class__,
                                                   counter_name)
                    _LOGGER.info('counter: %s' % self.counter.metric_name)

                def process(self, kv, state=beam.DoFn.StateParam(state_spec)):
                    # Reading the state forces it to be materialized.
                    list(state.read())
                    state.add(1)
                    self.counter.inc()

            options = self.create_options()
            # Only a parallelism of 1 is supported by this test.
            options._all_options['parallelism'] = 1
            # Several bundles are needed to exercise the cache metrics.
            options._all_options['max_bundle_size'] = 10
            options._all_options['max_bundle_time_millis'] = 95130590130
            experiment_flags = options.view_as(DebugOptions).experiments or []
            experiment_flags.append('state_cache_size=123')
            options.view_as(DebugOptions).experiments = experiment_flags

            with Pipeline(self.get_runner(), options) as p:
                # pylint: disable=expression-not-assigned
                elements = p | "create" >> beam.Create(list(range(0, 110)))
                keyed = elements | "mapper" >> beam.Map(
                    lambda x: (x % 10, 'val'))
                keyed | "stateful" >> beam.ParDo(DoFn())

            lines_expected = {'counter: 110'}
            if streaming:
                lines_expected |= {
                    # Gauges for the last finished bundle
                    'stateful.beam.metric:statecache:capacity: 123',
                    # Off by 10: the first bundle contains every key once and
                    # caching only starts after that bundle, because it
                    # depends on the cache token which the Runner's
                    # StateRequestHandlers initialize lazily.
                    'stateful.beam.metric:statecache:size: 10',
                    'stateful.beam.metric:statecache:get: 10',
                    'stateful.beam.metric:statecache:miss: 0',
                    'stateful.beam.metric:statecache:hit: 10',
                    'stateful.beam.metric:statecache:put: 0',
                    'stateful.beam.metric:statecache:extend: 10',
                    'stateful.beam.metric:statecache:evict: 0',
                    # Counters: get/hit totals are off by 10 for the same
                    # reason (cache initialized only after the first bundle).
                    'stateful.beam.metric:statecache:get_total: 100',
                    'stateful.beam.metric:statecache:miss_total: 10',
                    'stateful.beam.metric:statecache:hit_total: 90',
                    'stateful.beam.metric:statecache:put_total: 10',
                    'stateful.beam.metric:statecache:extend_total: 100',
                    'stateful.beam.metric:statecache:evict_total: 0',
                }
            else:
                # Batch uses a different processing model: all values of a
                # key are processed at once.
                lines_expected |= {
                    # Gauges
                    'stateful).beam.metric:statecache:capacity: 123',
                    # The cache token is not set yet for the first key; it is
                    # lazily initialized after first access in
                    # StateRequestHandlers.
                    'stateful).beam.metric:statecache:size: 9',
                    # 11 because there are 110 / 10 elements per key
                    'stateful).beam.metric:statecache:get: 11',
                    'stateful).beam.metric:statecache:miss: 1',
                    'stateful).beam.metric:statecache:hit: 10',
                    # State is flushed back once per key
                    'stateful).beam.metric:statecache:put: 1',
                    'stateful).beam.metric:statecache:extend: 1',
                    'stateful).beam.metric:statecache:evict: 0',
                    # Counters
                    'stateful).beam.metric:statecache:get_total: 99',
                    'stateful).beam.metric:statecache:miss_total: 9',
                    'stateful).beam.metric:statecache:hit_total: 90',
                    'stateful).beam.metric:statecache:put_total: 9',
                    'stateful).beam.metric:statecache:extend_total: 9',
                    'stateful).beam.metric:statecache:evict_total: 0',
                }

            # Collect every expected metric string found on any file line.
            with open(self.test_metrics_path, 'r') as f:
                lines_actual = {
                    metric_str
                    for line in f
                    for metric_str in lines_expected
                    if metric_str in line
                }
            self.assertSetEqual(lines_actual, lines_expected)

Esempio n. 27

0

Mostra file

File: main.py Progetto: mjsampson/Beam_Testing

import apache_beam as beam
from apache_beam import Pipeline
from apache_beam.options.pipeline_options import PipelineOptions
from pipelines.bitcoin.service.blocks import ReadBitcoinBlocks
import time
# Wall-clock start, used for the elapsed-time report at the end.
start_time = time.time()
print("Reading Blocks")

# Leaving the ``with`` block is what runs the pipeline.
with Pipeline(options=PipelineOptions()) as p:
    numbers = p | 'GetBlocks' >> ReadBitcoinBlocks(100)
    numbers | "WriteToText" >> beam.io.textio.WriteToText("test.txt")

print("Writing Blocks done. Total Time {} seconds".format(
    time.time() - start_time))

Esempio n. 28

0

Mostra file

 def new_pipeline(self, context: JobContext) -> Pipeline:
     """Return a new ``Pipeline`` using the options built for ``context``."""
     logger.debug("Create new pipline for context %s", context)
     pipeline_options = self.create_pipeline_options(context)
     pipeline = Pipeline(options=pipeline_options)
     return pipeline