Example 1
  def test_gbk_execution(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)]))

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p
               | test_stream
               | beam.WindowInto(FixedWindows(15))
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey())
    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.
    assert_that(records, equal_to([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])]))
    p.run()
Example 2
def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""

  import re
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):

    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):

    def expand(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
              # Count the number of times each word occurs.
              | beam.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  with TestPipeline() as p:  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (p
     # Read the lines of the input text.
     | 'ReadLines' >> beam.io.ReadFromText(options.input)
     # Count the words.
     | CountWords()
     # Write the formatted word counts to output.
     | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    p.visit(SnippetUtils.RenameFiles(renames))
Example 3
def run(argv=None):

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      required=True,
                      help='Input file to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:
    def format_result(prefix_candidates):
      (prefix, candidates) = prefix_candidates
      return '%s: %s' % (prefix, candidates)

    (p  # pylint: disable=expression-not-assigned
     | 'read' >> ReadFromText(known_args.input)
     | 'split' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
     | 'TopPerPrefix' >> TopPerPrefix(5)
     | 'format' >> beam.Map(format_result)
     | 'write' >> WriteToText(known_args.output))
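
The TopPerPrefix composite transform used above is not part of this snippet. A minimal sketch of what it might look like, modeled on the Beam autocomplete example (the extract_prefixes helper and step labels here are illustrative assumptions, not the original code):

import apache_beam as beam


class TopPerPrefix(beam.PTransform):
  """Yields the most frequent words for every prefix (illustrative sketch)."""

  def __init__(self, count):
    super(TopPerPrefix, self).__init__()
    self._count = count

  def expand(self, words):
    return (words
            | 'CountWords' >> beam.combiners.Count.PerElement()
            | 'GenPrefixes' >> beam.FlatMap(extract_prefixes)
            | 'TopPerPrefix' >> beam.combiners.Top.LargestPerKey(self._count))


def extract_prefixes(word_count):
  # Emit (prefix, (count, word)) so that Top.LargestPerKey ranks by count.
  word, count = word_count
  for i in range(1, len(word) + 1):
    yield word[:i], (count, word)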
Example 4
  def test_basic_execution_sideinputs(self):
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    main_stream = (p
                   | 'main TestStream' >> TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['e']))
    side_stream = (p
                   | 'side TestStream' >> TestStream()
                   .add_elements([window.TimestampedValue(2, 2)])
                   .add_elements([window.TimestampedValue(1, 1)])
                   .add_elements([window.TimestampedValue(7, 7)])
                   .add_elements([window.TimestampedValue(4, 4)])
                  )

    class RecordFn(beam.DoFn):
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (main_stream        # pylint: disable=unused-variable
               | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    assert_that(records, equal_to([('e', Timestamp(10), [2, 1, 7, 4])]))

    p.run()
Example 5
def model_pcollection(argv):
  """Creating a PCollection from data in local memory."""
  from apache_beam.options.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  with beam.Pipeline(options=pipeline_options) as p:

    lines = (p
             | beam.Create([
                 'To be, or not to be: that is the question: ',
                 'Whether \'tis nobler in the mind to suffer ',
                 'The slings and arrows of outrageous fortune, ',
                 'Or to take arms against a sea of troubles, ']))
    # [END model_pcollection]

    (lines
     | beam.io.WriteToText(my_options.output))
Example 6
def run(argv=None):
  """Main entry point; defines and runs the tfidf pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--uris',
                      required=True,
                      help='URIs to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read documents specified by the uris command line option.
  pcoll = read_documents(p, glob.glob(known_args.uris))
  # Compute TF-IDF information for each word.
  output = pcoll | TfIdf()
  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)
  # Execute the pipeline and wait until it is completed.
  p.run().wait_until_finish()
Example 7
  def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
    data = b'data'
    attributes = {}
    publish_time_secs = 1520861821
    publish_time_nanos = 234567000
    publish_time = '2018-03-12T13:37:01.234567Z'
    ack_id = 'ack_id'
    pull_response = test_utils.create_pull_response([
        test_utils.PullResponseMessage(
            data, attributes, publish_time_secs, publish_time_nanos, ack_id)
    ])
    expected_elements = [
        TestWindowedValue(
            PubsubMessage(data, attributes),
            timestamp.Timestamp.from_rfc3339(publish_time),
            [window.GlobalWindow()]),
    ]
    mock_pubsub.return_value.pull.return_value = pull_response

    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    pcoll = (p
             | ReadFromPubSub(
                 'projects/fakeprj/topics/a_topic', None, None,
                 with_attributes=True, timestamp_attribute='nonexistent'))
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
    p.run()
    mock_pubsub.return_value.acknowledge.assert_has_calls([
        mock.call(mock.ANY, [ack_id])])
Example 8
  def test_with_requirements_file(self):
    try:
      staging_dir = tempfile.mkdtemp()
      requirements_cache_dir = tempfile.mkdtemp()
      source_dir = tempfile.mkdtemp()

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).requirements_cache = requirements_cache_dir
      options.view_as(SetupOptions).requirements_file = os.path.join(
          source_dir, dependency.REQUIREMENTS_FILE)
      self.create_temp_file(
          os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
      self.assertEqual(
          sorted([dependency.REQUIREMENTS_FILE,
                  'abc.txt', 'def.txt']),
          sorted(dependency.stage_job_resources(
              options,
              populate_requirements_cache=self.populate_requirements_cache)))
      self.assertTrue(
          os.path.isfile(
              os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
    finally:
      shutil.rmtree(staging_dir)
      shutil.rmtree(requirements_cache_dir)
      shutil.rmtree(source_dir)
Example 9
def run(argv=None):
  """Main entry point; defines and runs the user_score pipeline."""
  parser = argparse.ArgumentParser()

  # The default maps to two large Google Cloud Storage files (each ~12GB)
  # holding roughly two subsequent days' worth of data.
  parser.add_argument('--input',
                      type=str,
                      default='gs://apache-beam-samples/game/gaming_data*.csv',
                      help='Path to the data file(s) containing game data.')
  parser.add_argument('--output',
                      type=str,
                      required=True,
                      help='Path to the output file(s).')

  args, pipeline_args = parser.parse_known_args(argv)

  options = PipelineOptions(pipeline_args)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = True

  with beam.Pipeline(options=options) as p:
    def format_user_score_sums(user_score):
      (user, score) = user_score
      return 'user: %s, total_score: %s' % (user, score)

    (p  # pylint: disable=expression-not-assigned
     | 'ReadInputText' >> beam.io.ReadFromText(args.input)
     | 'UserScore' >> UserScore()
     | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
     | 'WriteUserScoreSums' >> beam.io.WriteToText(args.output))
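
UserScore is a composite transform defined elsewhere in the example; a simplified sketch with the shape this pipeline expects is shown below (the CSV field layout is an assumption, not the original parsing code):

import csv

import apache_beam as beam


class UserScore(beam.PTransform):
  """Sums the per-user score from raw game events (illustrative sketch)."""

  def expand(self, pcoll):
    def parse_event(line):
      # Assumed CSV layout: user, team, score, timestamp.
      fields = list(csv.reader([line]))[0]
      return fields[0], int(fields[2])

    return (pcoll
            | 'ParseGameEvent' >> beam.Map(parse_event)
            | 'SumScoresPerUser' >> beam.CombinePerKey(sum))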
Example 10
  def test_write_messages_unsupported_features(self, mock_pubsub):
    data = b'data'
    attributes = {'key': 'value'}
    payloads = [PubsubMessage(data, attributes)]

    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    _ = (p
         | Create(payloads)
         | WriteToPubSub('projects/fakeprj/topics/a_topic',
                         id_label='a_label'))
    with self.assertRaisesRegexp(NotImplementedError,
                                 r'id_label is not supported'):
      p.run()
    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    _ = (p
         | Create(payloads)
         | WriteToPubSub('projects/fakeprj/topics/a_topic',
                         timestamp_attribute='timestamp'))
    with self.assertRaisesRegexp(NotImplementedError,
                                 r'timestamp_attribute is not supported'):
      p.run()
Example 11
def run(argv=None):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_topic', required=True,
      help=('Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  parser.add_argument(
      '--output_topic', required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)
  options = PipelineOptions(pipeline_args)
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:

    # Read from PubSub into a PCollection.
    lines = p | beam.io.ReadStringsFromPubSub(known_args.input_topic)

    # Count the occurrences of each word in fixed windows.
    transformed = (lines
                   # Use a pre-defined function that imports the re package.
                   | 'Split' >> (
                       beam.FlatMap(split_fn).with_output_types(unicode))
                   | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                   | beam.WindowInto(window.FixedWindows(15, 0))
                   | 'Group' >> beam.GroupByKey()
                   | 'Count' >> beam.Map(
                       lambda word_ones: (word_ones[0], sum(word_ones[1])))
                   | 'Format' >> beam.Map(lambda tup: '%s: %d' % tup))

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    transformed | beam.io.WriteStringsToPubSub(known_args.output_topic)
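
The pre-defined split_fn referenced in the 'Split' step is not included in this snippet; given the comment about importing the re package, it is presumably along these lines (a sketch, not the original definition):

def split_fn(line):
  # Import inside the function so the dependency is available on the workers.
  import re
  return re.findall(r'[A-Za-z\']+', line)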
Example 12
  def test_with_setup_file(self):
    staging_dir = self.make_temp_dir()
    source_dir = self.make_temp_dir()
    self.create_temp_file(os.path.join(source_dir, 'setup.py'), 'notused')

    options = PipelineOptions()
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = os.path.join(
        source_dir, 'setup.py')

    self.assertEqual(
        [stager.WORKFLOW_TARBALL_FILE],
        self.stager.stage_job_resources(
            options,
            # We replace the build setup command because a realistic one would
            # require the setuptools package to be installed. Note that we can't
            # use "touch" here to create the expected output tarball file, since
            # touch is not available on Windows, so we invoke python to produce
            # equivalent behavior.
            build_setup_args=[
                'python', '-c', 'open(__import__("sys").argv[1], "a")',
                os.path.join(source_dir, stager.WORKFLOW_TARBALL_FILE)
            ],
            temp_dir=source_dir,
            staging_location=staging_dir))
    self.assertTrue(
        os.path.isfile(os.path.join(staging_dir, stager.WORKFLOW_TARBALL_FILE)))
Example 13
  def test_sdk_location_http(self):
    staging_dir = self.make_temp_dir()
    sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    def file_download(_, to_folder):
      tarball_path = os.path.join(to_folder, 'sdk-tarball')
      with open(tarball_path, 'w') as f:
        f.write('Package content.')
      return tarball_path

    with mock.patch('apache_beam.runners.dataflow.internal.'
                    'dependency._dependency_file_download', file_download):
      self.assertEqual(
          [names.DATAFLOW_SDK_TARBALL_FILE],
          dependency.stage_job_resources(options))

    tarball_path = os.path.join(
        staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(tarball_path) as f:
      self.assertEqual(f.read(), 'Package content.')
Example 14
def model_pipelines(argv):
  """A wordcount snippet as a simple pipeline example."""
  # [START model_pipelines]
  import re

  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          dest='input',
                          default='gs://dataflow-samples/shakespeare/kinglear'
                          '.txt',
                          help='Input file to process.')
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  with beam.Pipeline(options=pipeline_options) as p:

    (p
     | beam.io.ReadFromText(my_options.input)
     | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
     | beam.Map(lambda x: (x, 1))
     | beam.combiners.Count.PerKey()
     | beam.io.WriteToText(my_options.output))
Example 15
  def test_model_composite_triggers(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (TestStream()
                     .advance_watermark_to(10)
                     .add_elements(['a', 'a', 'a', 'b', 'b'])
                     .advance_watermark_to(70)
                     .add_elements([TimestampedValue('a', 10),
                                    TimestampedValue('a', 10),
                                    TimestampedValue('c', 10),
                                    TimestampedValue('c', 10)])
                     .advance_processing_time(600))
      pcollection = (p
                     | test_stream
                     | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

      counts = (
          # [START model_composite_triggers]
          pcollection | WindowInto(
              FixedWindows(1 * 60),
              trigger=AfterWatermark(
                  late=AfterProcessingTime(10 * 60)),
              accumulation_mode=AccumulationMode.DISCARDING)
          # [END model_composite_triggers]
          | 'group' >> beam.GroupByKey()
          | 'count' >> beam.Map(
              lambda word_ones: (word_ones[0], sum(word_ones[1]))))
      assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 2), ('c', 2)]))
Example 16
def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table exported '
      'as json.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--sampling_threshold',
                      type=float,
                      default=0.1,
                      help='Fraction of entries used for session tracking')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:

    (p  # pylint: disable=expression-not-assigned
     | ReadFromText(known_args.input)
     | ComputeTopSessions(known_args.sampling_threshold)
     | WriteToText(known_args.output))
Example 17
def run(pipeline_args, input_file, output_file):

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(input_file)

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(bytes))
            | 'count' >> beam.ExternalTransform(
                'pytest:beam:transforms:count', None, EXPANSION_SERVICE_ADDR))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(output_file)

  result = p.run()
  result.wait_until_finish()
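
WordExtractingDoFn is defined elsewhere in this example. Since the 'split' step declares bytes as its output type, a plausible sketch looks like this (illustrative, not the original class):

import re

import apache_beam as beam


class WordExtractingDoFn(beam.DoFn):
  """Splits a line of text into words, emitted as byte strings (sketch)."""

  def process(self, element):
    for word in re.findall(r'[A-Za-z\']+', element):
      # The cross-language count transform above is fed byte strings.
      yield word.encode('utf-8')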
Example 18
  def test_basic_execution(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)]))

    class RecordFn(beam.DoFn):
      def process(self, element=beam.DoFn.ElementParam,
                  timestamp=beam.DoFn.TimestampParam):
        yield (element, timestamp)

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    my_record_fn = RecordFn()
    records = p | test_stream | beam.ParDo(my_record_fn)
    assert_that(records, equal_to([
        ('a', timestamp.Timestamp(10)),
        ('b', timestamp.Timestamp(10)),
        ('c', timestamp.Timestamp(10)),
        ('d', timestamp.Timestamp(20)),
        ('e', timestamp.Timestamp(20)),
        ('late', timestamp.Timestamp(12)),
        ('last', timestamp.Timestamp(310)),]))
    p.run()
Example 19
def run(argv=None):
  """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline
  that transforms bitcoin transactions."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://beam-avro-test/bitcoin/txns/*',
                      help='Input file(s) to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--compress',
                      dest='compress',
                      required=False,
                      action='store_true',
                      help='When set, compress the output data')
  parser.add_argument('--fastavro',
                      dest='use_fastavro',
                      required=False,
                      action='store_true',
                      help='When set, use fastavro for Avro I/O')

  opts, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the avro file[pattern] into a PCollection.
  records = \
      p | 'read' >> ReadFromAvro(opts.input, use_fastavro=opts.use_fastavro)

  measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn())

  # pylint: disable=expression-not-assigned
  measured | 'write' >> \
      WriteToAvro(
          opts.output,
          schema=SCHEMA,
          codec=('deflate' if opts.compress else 'null'),
          use_fastavro=opts.use_fastavro
      )

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')        # direct runner
      or result.has_job):               # not just a template creation
    metrics = result.metrics().query()

    for counter in metrics['counters']:
      logging.info("Counter: %s", counter)

    for dist in metrics['distributions']:
      logging.info("Distribution: %s", dist)
Example 20
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      # CHANGE 1/5: The Google Cloud Storage path is required
                      # for outputting the results.
                      default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend([
      # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
      # run your pipeline on the Google Cloud Dataflow Service.
      '--runner=DirectRunner',
      # CHANGE 3/5: Your project ID is required in order to run your pipeline on
      # the Google Cloud Dataflow Service.
      '--project=SET_YOUR_PROJECT_ID_HERE',
      # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
      # files.
      '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
      # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
      # files.
      '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
      '--job_name=your-wordcount-job',
  ])

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection.
    lines = p | ReadFromText(known_args.input)

    # Count the occurrences of each word.
    counts = (
        lines
        | 'Split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                      .with_output_types(unicode))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %s' % (word, count)

    output = counts | 'Format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | WriteToText(known_args.output)
Example 21
def run(argv=None):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic', required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument(
      '--input_topic',
      help=('Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  group.add_argument(
      '--input_subscription',
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  p = beam.Pipeline(options=pipeline_options)

  # Read from PubSub into a PCollection.
  if known_args.input_subscription:
    lines = p | beam.io.ReadStringsFromPubSub(
        subscription=known_args.input_subscription)
  else:
    lines = p | beam.io.ReadStringsFromPubSub(topic=known_args.input_topic)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.FixedWindows(15, 0))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write to PubSub.
  # pylint: disable=expression-not-assigned
  output | beam.io.WriteStringsToPubSub(known_args.output_topic)

  result = p.run()
  result.wait_until_finish()
Example 22
def main(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  PROJECT = 'project-id'
  BUCKET = "cloud storage bucker"

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='input path',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      # CHANGE 1/5: The Google Cloud Storage path is required
                      # for outputting the results.
                      default='output path',
                      help='Output file to write results to.')

  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend([
      '--runner=DataflowRunner',
      '--project=' + PROJECT,
      '--staging_location=gs://somepath/tweets',
      '--temp_location=gs://somepath/TEMP',
      '--job_name=tweet-wordcount-job',
  ])

  # Pipeline creation.

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:


    # Read the text file[pattern] into a PCollection.

    lines = p | ReadFromText(known_args.input)

    # Count the occurrences of each word.
    counts = (
        lines
        | 'Split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                      .with_output_types(unicode))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %s' % (word, count)

    output = counts | 'Format' >> beam.Map(format_result)
    #counts | 'Print' >> beam.ParDo(lambda (w, c): print('%s: %s' % (w, c)))
    # Write the output using a "Write" transform that has side effects.
    output | WriteToText(known_args.output)
Example 23
  def test_default_resources(self):
    staging_dir = self.make_temp_dir()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)

    self.assertEqual(
        [],
        dependency.stage_job_resources(options))
Example 24
 def test_read_message_id_label_unsupported(self, unused_mock_pubsub):
   # id_label is unsupported in DirectRunner.
   options = PipelineOptions([])
   options.view_as(StandardOptions).streaming = True
   p = TestPipeline(options=options)
   _ = (p | ReadFromPubSub('projects/fakeprj/topics/a_topic', None, 'a_label'))
   with self.assertRaisesRegexp(NotImplementedError,
                                r'id_label is not supported'):
     p.run()
Example 25
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets."""
  import re

  import apache_beam as beam

  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = beam.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | beam.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | beam.Map(lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | beam.io.WriteToText('gs://my-bucket/counts.txt')
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  result = p.run()
  # [END examples_wordcount_minimal_run]
  result.wait_until_finish()
Example 26
  def test_with_extra_packages(self):
    staging_dir = self.make_temp_dir()
    source_dir = self.make_temp_dir()
    self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(os.path.join(source_dir, 'xyz2.tar'), 'nothing')
    self.create_temp_file(os.path.join(source_dir, 'whl.whl'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, stager.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        os.path.join(source_dir, 'xyz2.tar'),
        os.path.join(source_dir, 'whl.whl'), '/tmp/remote/remote_file.tar.gz'
    ]

    remote_copied_files = []

    # We cannot rely on actual remote file system paths, hence we treat
    # '/tmp/remote/' as a remote path.
    def is_remote_path(path):
      return path.startswith('/tmp/remote/')

    def file_copy(from_path, to_path):
      if is_remote_path(from_path):
        remote_copied_files.append(from_path)
        _, from_name = os.path.split(from_path)
        if os.path.isdir(to_path):
          to_path = os.path.join(to_path, from_name)
        self.create_temp_file(to_path, 'nothing')
        logging.info('Fake copied remote file: %s to %s', from_path, to_path)
      elif is_remote_path(to_path):
        logging.info('Faking upload_file(%s, %s)', from_path, to_path)
      else:
        shutil.copyfile(from_path, to_path)

    with mock.patch(
        'apache_beam.runners.portability.stager_test'
        '.stager.Stager._download_file', staticmethod(file_copy)):
      with mock.patch(
          'apache_beam.runners.portability.stager_test'
          '.stager.Stager._is_remote_path', staticmethod(is_remote_path)):
        self.assertEqual([
            'abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl',
            'remote_file.tar.gz', stager.EXTRA_PACKAGES_FILE
        ], self.stager.stage_job_resources(
            options, staging_location=staging_dir))
    with open(os.path.join(staging_dir, stager.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual([
          'abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n', 'whl.whl\n',
          'remote_file.tar.gz\n'
      ], f.readlines())
    self.assertEqual(['/tmp/remote/remote_file.tar.gz'], remote_copied_files)
Example 27
  def test_extra_package(self):
    options = PipelineOptions(['--extra_package', 'abc',
                               '--extra_packages', 'def',
                               '--extra_packages', 'ghi'])
    self.assertEqual(
        sorted(options.get_all_options()['extra_packages']),
        ['abc', 'def', 'ghi'])

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['extra_packages'], None)
Example 28
  def test_no_main_session(self):
    staging_dir = self.make_temp_dir()
    options = PipelineOptions()

    options.view_as(SetupOptions).save_main_session = False
    self.update_options(options)

    self.assertEqual([],
                     self.stager.stage_job_resources(
                         options, staging_location=staging_dir))
Example 29
 def test_option_with_space(self):
   options = PipelineOptions(flags=['--option with space= value with space'])
   self.assertEqual(
       getattr(options.view_as(PipelineOptionsTest.MockOptions),
               'option with space'), ' value with space')
   options_from_dict = PipelineOptions.from_dictionary(
       options.get_all_options())
   self.assertEqual(
       getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
               'option with space'), ' value with space')
Example 30
 def test_get_all_options(self):
   for case in PipelineOptionsTest.TEST_CASES:
     options = PipelineOptions(flags=case['flags'])
     self.assertDictContainsSubset(case['expected'], options.get_all_options())
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_flag,
                      case['expected']['mock_flag'])
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_option,
                      case['expected']['mock_option'])
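
These option tests refer to PipelineOptionsTest.MockOptions (and TEST_CASES), which are defined elsewhere in the test module. A sketch of an options subclass declaring the flags the assertions expect (illustrative, not the original class):

from apache_beam.options.pipeline_options import PipelineOptions


class MockOptions(PipelineOptions):
  """Declares the mock flags exercised by the tests above (sketch)."""

  @classmethod
  def _add_argparse_args(cls, parser):
    parser.add_argument('--mock_flag', action='store_true', help='mock flag')
    parser.add_argument('--mock_option', help='mock option')
    parser.add_argument('--option with space', help='mock option with space')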
Example 31
 def test_create_application_client(self):
     pipeline_options = PipelineOptions()
     apiclient.DataflowApplicationClient(pipeline_options)
Example 32
    def test_recordings_record(self):
        """Tests that recording pipeline succeeds."""

        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)

        # Create a pipeline with an arbitrary amount of elements.
        p = beam.Pipeline(ir.InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        # pylint: disable=unused-variable
        _ = (p
             | TestStream()
                 .advance_watermark_to(0)
                 .advance_processing_time(1)
                 .add_elements(list(range(10)))
                 .advance_processing_time(1))  # yapf: disable
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Assert that the pipeline starts in a good state.
        self.assertEqual(
            ib.recordings.describe(p)['state'], PipelineState.STOPPED)
        self.assertEqual(ib.recordings.describe(p)['size'], 0)

        # Create a limiter that stops the background caching job when something
        # is written to cache. This is used to ensure that the pipeline is
        # functioning properly and that there are no data races with the test.
        class SizeLimiter(Limiter):
            def __init__(self, pipeline):
                self.pipeline = pipeline
                self.should_trigger = False

            def is_triggered(self):
                return (ib.recordings.describe(self.pipeline)['size'] > 0
                        and self.should_trigger)

        limiter = SizeLimiter(p)
        ib.options.capture_control.set_limiters_for_test([limiter])

        # Assert that a recording can be started only once.
        self.assertTrue(ib.recordings.record(p))
        self.assertFalse(ib.recordings.record(p))
        self.assertEqual(
            ib.recordings.describe(p)['state'], PipelineState.RUNNING)

        # Wait for the pipeline to start and write something to cache.
        limiter.should_trigger = True
        for _ in range(60):
            if limiter.is_triggered():
                break
            time.sleep(1)
        self.assertTrue(
            limiter.is_triggered(),
            'Test timed out waiting for limiter to be triggered. This indicates '
            'that the BackgroundCachingJob did not cache anything.')

        # Assert that a recording can be stopped and can't be started again until
        # after the cache is cleared.
        ib.recordings.stop(p)
        self.assertEqual(
            ib.recordings.describe(p)['state'], PipelineState.STOPPED)
        self.assertFalse(ib.recordings.record(p))
        ib.recordings.clear(p)
        self.assertTrue(ib.recordings.record(p))
        ib.recordings.stop(p)
Example 33
 def test_interpreter_version_check_passes_with_experiment(self):
     pipeline_options = PipelineOptions(
         ["--experiment=use_unsupported_python_version"])
     apiclient._verify_interpreter_version_is_supported(pipeline_options)
Example 34
def main(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/6: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/6: (OPTIONAL) Your project ID is required in order to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1)
        # is required in order to run your pipeline on the Google Cloud
        # Dataflow Service.
        '--region=SET_REGION_HERE',
        # CHANGE 5/6: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 6/6: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input)

        # Count the occurrences of each word.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                      r'[A-Za-z\']+', x)).with_output_types(str))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output)
Example 35
    def testIncarcerationPipeline(self):
        fake_person_id = 12345

        fake_person = schema.StatePerson(
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT)

        persons_data = [normalized_database_base_dict(fake_person)]

        race_1 = schema.StatePersonRace(person_race_id=111,
                                        state_code='CA',
                                        race=Race.BLACK,
                                        person_id=fake_person_id)

        race_2 = schema.StatePersonRace(person_race_id=111,
                                        state_code='ND',
                                        race=Race.WHITE,
                                        person_id=fake_person_id)

        races_data = normalized_database_base_dict_list([race_1, race_2])

        ethnicity = schema.StatePersonEthnicity(person_ethnicity_id=111,
                                                state_code='CA',
                                                ethnicity=Ethnicity.HISPANIC,
                                                person_id=fake_person_id)

        ethnicity_data = normalized_database_base_dict_list([ethnicity])

        sentence_group = schema.StateSentenceGroup(sentence_group_id=111,
                                                   person_id=fake_person_id)

        initial_incarceration = schema.StateIncarcerationPeriod(
            incarceration_period_id=1111,
            status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
            state_code='CA',
            county_code='124',
            facility='San Quentin',
            facility_security_level=StateIncarcerationFacilitySecurityLevel.
            MAXIMUM,
            admission_reason=StateIncarcerationPeriodAdmissionReason.
            NEW_ADMISSION,
            projected_release_reason=StateIncarcerationPeriodReleaseReason.
            CONDITIONAL_RELEASE,
            admission_date=date(2008, 11, 20),
            release_date=date(2010, 12, 4),
            release_reason=StateIncarcerationPeriodReleaseReason.
            SENTENCE_SERVED,
            person_id=fake_person_id,
        )

        first_reincarceration = schema.StateIncarcerationPeriod(
            incarceration_period_id=2222,
            status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
            state_code='CA',
            county_code='124',
            facility='San Quentin',
            facility_security_level=StateIncarcerationFacilitySecurityLevel.
            MAXIMUM,
            admission_reason=StateIncarcerationPeriodAdmissionReason.
            NEW_ADMISSION,
            projected_release_reason=StateIncarcerationPeriodReleaseReason.
            CONDITIONAL_RELEASE,
            admission_date=date(2011, 4, 5),
            release_date=date(2014, 4, 14),
            release_reason=StateIncarcerationPeriodReleaseReason.
            SENTENCE_SERVED,
            person_id=fake_person_id)

        subsequent_reincarceration = schema.StateIncarcerationPeriod(
            incarceration_period_id=3333,
            status=StateIncarcerationPeriodStatus.IN_CUSTODY,
            state_code='CA',
            county_code='124',
            facility='San Quentin',
            facility_security_level=StateIncarcerationFacilitySecurityLevel.
            MAXIMUM,
            admission_reason=StateIncarcerationPeriodAdmissionReason.
            NEW_ADMISSION,
            projected_release_reason=StateIncarcerationPeriodReleaseReason.
            CONDITIONAL_RELEASE,
            admission_date=date(2017, 1, 4),
            person_id=fake_person_id)

        incarceration_sentence = schema.StateIncarcerationSentence(
            incarceration_sentence_id=1111,
            sentence_group_id=sentence_group.sentence_group_id,
            incarceration_periods=[
                initial_incarceration, first_reincarceration,
                subsequent_reincarceration
            ],
            person_id=fake_person_id)

        supervision_sentence = schema.StateSupervisionSentence(
            supervision_sentence_id=123, person_id=fake_person_id)

        sentence_group.incarceration_sentences = [incarceration_sentence]

        sentence_group_data = [normalized_database_base_dict(sentence_group)]

        incarceration_sentence_data = [
            normalized_database_base_dict(incarceration_sentence)
        ]

        supervision_sentence_data = [
            normalized_database_base_dict(supervision_sentence)
        ]

        incarceration_periods_data = [
            normalized_database_base_dict(initial_incarceration),
            normalized_database_base_dict(first_reincarceration),
            normalized_database_base_dict(subsequent_reincarceration)
        ]

        state_incarceration_sentence_incarceration_period_association = [
            {
                'incarceration_period_id':
                initial_incarceration.incarceration_period_id,
                'incarceration_sentence_id':
                incarceration_sentence.incarceration_sentence_id,
            },
            {
                'incarceration_period_id':
                first_reincarceration.incarceration_period_id,
                'incarceration_sentence_id':
                incarceration_sentence.incarceration_sentence_id,
            },
            {
                'incarceration_period_id':
                subsequent_reincarceration.incarceration_period_id,
                'incarceration_sentence_id':
                incarceration_sentence.incarceration_sentence_id,
            },
        ]

        data_dict = {
            schema.StatePerson.__tablename__:
            persons_data,
            schema.StatePersonRace.__tablename__:
            races_data,
            schema.StatePersonEthnicity.__tablename__:
            ethnicity_data,
            schema.StateSentenceGroup.__tablename__:
            sentence_group_data,
            schema.StateIncarcerationSentence.__tablename__:
            incarceration_sentence_data,
            schema.StateSupervisionSentence.__tablename__:
            supervision_sentence_data,
            schema.StateIncarcerationPeriod.__tablename__:
            incarceration_periods_data,
            schema.state_incarceration_sentence_incarceration_period_association_table.name:
            state_incarceration_sentence_incarceration_period_association,
            schema.state_supervision_sentence_incarceration_period_association_table.name:
            [{}]
        }

        test_pipeline = TestPipeline()

        # Get StatePersons
        persons = (test_pipeline
                   | 'Load Persons' >> extractor_utils.BuildRootEntity(
                       dataset=None,
                       data_dict=data_dict,
                       root_schema_class=schema.StatePerson,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field='person_id',
                       build_related_entities=True))

        # Get StateSentenceGroups
        sentence_groups = (
            test_pipeline
            | 'Load StateSentencegroups' >> extractor_utils.BuildRootEntity(
                dataset=None,
                data_dict=data_dict,
                root_schema_class=schema.StateSentenceGroup,
                root_entity_class=entities.StateSentenceGroup,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            test_pipeline | 'Load StateIncarcerationSentences' >>
            extractor_utils.BuildRootEntity(
                dataset=None,
                data_dict=data_dict,
                root_schema_class=schema.StateIncarcerationSentence,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionSentences
        supervision_sentences = (
            test_pipeline | 'Load StateSupervisionSentences' >>
            extractor_utils.BuildRootEntity(
                dataset=None,
                data_dict=data_dict,
                root_schema_class=schema.StateSupervisionSentence,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Group each StatePerson with their related entities
        person_and_sentence_groups = (
            {
                'person': persons,
                'sentence_groups': sentence_groups_with_hydrated_sentences
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents from the StatePerson's
        # StateIncarcerationPeriods
        fake_person_id_to_county_query_result = [{
            'person_id':
            fake_person_id,
            'county_of_residence':
            _COUNTY_OF_RESIDENCE
        }]
        person_id_to_county_kv = (
            test_pipeline
            | "Read person id to county associations from BigQuery" >>
            beam.Create(fake_person_id_to_county_query_result)
            |
            "Convert to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        person_events = (person_and_sentence_groups
                         | 'Classify Incarceration Events' >> beam.ParDo(
                             pipeline.ClassifyIncarcerationEvents(),
                             AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> pipeline.GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=ALL_INCLUSIONS_DICT,
                calculation_month_limit=-1))

        assert_that(incarceration_metrics,
                    AssertMatchers.validate_metric_type())

        test_pipeline.run()
Example 36
 def setUp(self):
     pipeline_options = PipelineOptions()
     self.fs = gcsfilesystem.GCSFileSystem(
         pipeline_options=pipeline_options)
Example 37
def run(argv=None):
    """Main entry point; defines and runs the hourly_team_value pipeline."""
    parser = argparse.ArgumentParser()

    parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
    parser.add_argument('--subscription',
                        type=str,
                        help='Pub/Sub subscription to read from')
    parser.add_argument('--dataset',
                        type=str,
                        required=True,
                        help='BigQuery Dataset to write tables to. '
                        'Must already exist.')
    parser.add_argument(
        '--table_name',
        default='high_values',
        help='The BigQuery table name. Should not already exist.')
    parser.add_argument(
        '--group_window_duration',
        type=int,
        default=60,
        # default=360,
        help='Numeric value of fixed window duration for group '
        'analysis, in minutes')
    parser.add_argument(
        '--allowed_lateness',
        type=int,
        default=120,
        # default=720,
        help='Numeric value of allowed data lateness, in minutes')

    args, pipeline_args = parser.parse_known_args(argv)

    logging.info(40 * "#")
    logging.info(datetime.now())
    logging.info(40 * "#")

    if args.topic is None and args.subscription is None:
        parser.print_usage()
        print(sys.argv[0] +
              ': error: one of --topic or --subscription is required')
        sys.exit(1)

    options = PipelineOptions(pipeline_args)

    # We also require the --project option to access --dataset
    if options.view_as(GoogleCloudOptions).project is None:
        parser.print_usage()
        print(sys.argv[0] + ': error: argument --project is required')
        sys.exit(1)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = True

    # Enforce that this pipeline is always run in streaming mode
    options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=options)

    # Read from PubSub into a PCollection.
    if args.subscription:
        values = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
            subscription=args.subscription)
    else:
        values = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(topic=args.topic)

    events = (values
              | 'ParseEventFn' >> beam.ParDo(ParseEventFn())
              | 'AddEventTimestamps' >>
              beam.Map(lambda elem: beam.window.TimestampedValue(
                  elem, elem['timestamp'])))

    # Get group values and write the results to BigQuery
    (events  # pylint: disable=expression-not-assigned
     | 'CalculateGroupValues' >> CalculateGroupValues(
         args.group_window_duration, args.allowed_lateness)
     | 'GroupValuesDict' >> beam.ParDo(GroupValuesDict())
     | 'WriteGroupValueSums' >> WriteToBigQuery(
         args.table_name + '_groups', args.dataset, {
             'group': 'STRING',
             'total_value': 'INTEGER',
             'window_start': 'STRING',
             'processing_time': 'STRING',
         },
         options.view_as(GoogleCloudOptions).project))

    def format_user_value_sums(user_value):
        (user, value) = user_value
        t = timestamp2str(int(time.time()))
        return {'user': user, 'total_value': value, 'update_time': t}

    # Get user values and write the results to BigQuery
    (events  # pylint: disable=expression-not-assigned
     | 'CalculateUserValues' >> CalculateUserValues(args.allowed_lateness)
     | 'FormatUserValueSums' >> beam.Map(format_user_value_sums)
     | 'WriteUserValueSums' >> WriteToBigQuery(
         args.table_name + '_users', args.dataset, {
             'user': 'STRING',
             'total_value': 'INTEGER',
             'update_time': 'STRING',
         },
         options.view_as(GoogleCloudOptions).project))

    p.run()
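The WriteToBigQuery used above is not beam.io.WriteToBigQuery itself but a project-level wrapper that accepts a plain field-name-to-type dict plus a dataset and project. Below is a minimal sketch of such a wrapper, assuming it only formats the schema string and delegates to beam.io.WriteToBigQuery; the real helper may differ.

import apache_beam as beam

class WriteToBigQuery(beam.PTransform):
    """Sketch of a convenience wrapper: turns {'field': 'TYPE', ...} into a
    'field:TYPE,...' schema string and writes dict rows to the given table."""
    def __init__(self, table_name, dataset, schema, project):
        super().__init__()
        self.table_name = table_name
        self.dataset = dataset
        self.schema = schema
        self.project = project

    def get_schema(self):
        return ','.join('%s:%s' % (name, t) for name, t in self.schema.items())

    def expand(self, pcoll):
        return (pcoll
                | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
                    table=self.table_name,
                    dataset=self.dataset,
                    project=self.project,
                    schema=self.get_schema(),
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))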
Esempio n. 38
0
class Pipeline(object):
    """A pipeline object that manages a DAG of
  :class:`~apache_beam.pvalue.PValue` s and their
  :class:`~apache_beam.transforms.ptransform.PTransform` s.

  Conceptually the :class:`~apache_beam.pvalue.PValue` s are the DAG's nodes and
  the :class:`~apache_beam.transforms.ptransform.PTransform` s computing
  the :class:`~apache_beam.pvalue.PValue` s are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If the same transform instance needs to be applied more than once, the right
  shift operator should be used to designate a new name
  (e.g. ``input | "label" >> my_transform``).
  """
    def __init__(self, runner=None, options=None, argv=None):
        """Initialize a pipeline object.

    Args:
      runner (~apache_beam.runners.runner.PipelineRunner): An object of
        type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
        used to execute the pipeline. For registered runners, the runner name
        can be specified, otherwise a runner object must be supplied.
      options (~apache_beam.options.pipeline_options.PipelineOptions):
        A configured
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
        containing arguments that should be used for running the Beam job.
      argv (List[str]): a list of arguments (such as :data:`sys.argv`)
        to be used for building a
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
        This will only be used if argument **options** is :data:`None`.

    Raises:
      ~exceptions.ValueError: if either the runner or options argument is not
        of the expected type.
    """
        if options is not None:
            if isinstance(options, PipelineOptions):
                self._options = options
            else:
                raise ValueError(
                    'Parameter options, if specified, must be of type PipelineOptions. '
                    'Received : %r' % options)
        elif argv is not None:
            if isinstance(argv, list):
                self._options = PipelineOptions(argv)
            else:
                raise ValueError(
                    'Parameter argv, if specified, must be a list. Received : %r'
                    % argv)
        else:
            self._options = PipelineOptions([])

        FileSystems.set_options(self._options)

        if runner is None:
            runner = self._options.view_as(StandardOptions).runner
            if runner is None:
                runner = StandardOptions.DEFAULT_RUNNER
                logging.info(
                    ('Missing pipeline option (runner). Executing pipeline '
                     'using the default runner: %s.'), runner)

        if isinstance(runner, str):
            runner = create_runner(runner)
        elif not isinstance(runner, PipelineRunner):
            raise TypeError('Runner must be a PipelineRunner object or the '
                            'name of a registered runner.')

        # Validate pipeline options
        errors = PipelineOptionsValidator(self._options, runner).validate()
        if errors:
            raise ValueError('Pipeline has validation errors: \n' +
                             '\n'.join(errors))

        # Default runner to be used.
        self.runner = runner
        # Stack of transforms generated by nested apply() calls. The stack will
        # contain a root node as an enclosing (parent) node for top transforms.
        self.transforms_stack = [AppliedPTransform(None, None, '', None)]
        # Set of transform labels (full labels) applied to the pipeline.
        # If a transform is applied and the full label is already in the set
        # then the transform will have to be cloned with a new label.
        self.applied_labels = set()

    @property
    @deprecated(since='First stable release',
                extra_message='References to <pipeline>.options'
                ' will not be supported')
    def options(self):
        return self._options

    def _current_transform(self):
        """Returns the transform currently on the top of the stack."""
        return self.transforms_stack[-1]

    def _root_transform(self):
        """Returns the root transform of the transform stack."""
        return self.transforms_stack[0]

    def _remove_labels_recursively(self, applied_transform):
        for part in applied_transform.parts:
            if part.full_label in self.applied_labels:
                self.applied_labels.remove(part.full_label)
                self._remove_labels_recursively(part)

    def _replace(self, override):

        assert isinstance(override, PTransformOverride)
        matcher = override.get_matcher()

        output_map = {}
        output_replacements = {}
        input_replacements = {}

        class TransformUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
            """"A visitor that replaces the matching PTransforms."""
            def __init__(self, pipeline):
                self.pipeline = pipeline

            def _replace_if_needed(self, original_transform_node):
                if matcher(original_transform_node):
                    assert isinstance(original_transform_node,
                                      AppliedPTransform)
                    replacement_transform = override.get_replacement_transform(
                        original_transform_node.transform)

                    replacement_transform_node = AppliedPTransform(
                        original_transform_node.parent, replacement_transform,
                        original_transform_node.full_label,
                        original_transform_node.inputs)

                    # Transform execution could depend on order in which nodes are
                    # considered. Hence we insert the replacement transform node to same
                    # index as the original transform node. Note that this operation
                    # removes the original transform node.
                    if original_transform_node.parent:
                        assert isinstance(original_transform_node.parent,
                                          AppliedPTransform)
                        parent_parts = original_transform_node.parent.parts
                        parent_parts[parent_parts.index(
                            original_transform_node)] = (
                                replacement_transform_node)
                    else:
                        # Original transform has to be a root.
                        roots = self.pipeline.transforms_stack[0].parts
                        assert original_transform_node in roots
                        roots[roots.index(original_transform_node)] = (
                            replacement_transform_node)

                    inputs = replacement_transform_node.inputs
                    # TODO:  Support replacing PTransforms with multiple inputs.
                    if len(inputs) > 1:
                        raise NotImplementedError(
                            'PTransform overriding is only supported for PTransforms that '
                            'have a single input. Tried to replace input of '
                            'AppliedPTransform %r that has %d inputs' %
                            (original_transform_node, len(inputs)))

                    # We have to add the new AppliedTransform to the stack before expand()
                    # and pop it out later to make sure that parts get added correctly.
                    self.pipeline.transforms_stack.append(
                        replacement_transform_node)

                    # Keeping the same label for the replaced node but recursively
                    # removing labels of child transforms of original transform since they
                    # will be replaced during the expand below. This is needed in case
                    # the replacement contains children that have labels that conflicts
                    # with labels of the children of the original.
                    self.pipeline._remove_labels_recursively(
                        original_transform_node)

                    new_output = replacement_transform.expand(inputs[0])
                    replacement_transform_node.add_output(new_output)

                    # We only support replacing transforms with a single output with
                    # another transform that produces a single output.
                    # TODO: Support replacing PTransforms with multiple outputs.
                    if (len(original_transform_node.outputs) > 1
                            or not isinstance(
                                original_transform_node.outputs[None],
                                PCollection)
                            or not isinstance(new_output, PCollection)):
                        raise NotImplementedError(
                            'PTransform overriding is only supported for PTransforms that '
                            'have a single output. Tried to replace output of '
                            'AppliedPTransform %r with %r.' %
                            (original_transform_node, new_output))

                    # Recording updated outputs. This cannot be done in the same visitor
                    # since if we dynamically update output type here, we'll run into
                    # errors when visiting child nodes.
                    output_map[
                        original_transform_node.outputs[None]] = new_output

                    self.pipeline.transforms_stack.pop()

            def enter_composite_transform(self, transform_node):
                self._replace_if_needed(transform_node)

            def visit_transform(self, transform_node):
                self._replace_if_needed(transform_node)

        self.visit(TransformUpdater(self))

        # Adjusting inputs and outputs
        class InputOutputUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
            """"A visitor that records input and output values to be replaced.

      Input and output values that should be updated are recorded in maps
      input_replacements and output_replacements respectively.

      We cannot update input and output values while visiting since that results
      in validation errors.
      """
            def __init__(self, pipeline):
                self.pipeline = pipeline

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                if (None in transform_node.outputs
                        and transform_node.outputs[None] in output_map):
                    output_replacements[transform_node] = (
                        output_map[transform_node.outputs[None]])

                replace_input = False
                for input in transform_node.inputs:
                    if input in output_map:
                        replace_input = True
                        break

                if replace_input:
                    new_input = [
                        input if not input in output_map else output_map[input]
                        for input in transform_node.inputs
                    ]
                    input_replacements[transform_node] = new_input

        self.visit(InputOutputUpdater(self))

        for transform in output_replacements:
            transform.replace_output(output_replacements[transform])

        for transform in input_replacements:
            transform.inputs = input_replacements[transform]

    def _check_replacement(self, override):
        matcher = override.get_matcher()

        class ReplacementValidator(PipelineVisitor):
            def visit_transform(self, transform_node):
                if matcher(transform_node):
                    raise RuntimeError(
                        'Transform node %r was not replaced as expected.' %
                        transform_node)

        self.visit(ReplacementValidator())

    def replace_all(self, replacements):
        """ Dynamically replaces PTransforms in the currently populated hierarchy.

    Currently this only works for replacements where input and output types
    are exactly the same.

    TODO: Update this to also work for transform overrides where input and
    output types are different.

    Args:
      replacements (List[~apache_beam.pipeline.PTransformOverride]): a list of
        :class:`~apache_beam.pipeline.PTransformOverride` objects.
    """
        for override in replacements:
            assert isinstance(override, PTransformOverride)
            self._replace(override)

        # Checking if the PTransforms have been successfully replaced. This will
        # result in a failure if a PTransform that was replaced in a given override
        # gets re-added in a subsequent override. This is not allowed and ordering
        # of PTransformOverride objects in 'replacements' is important.
        for override in replacements:
            self._check_replacement(override)

    def run(self, test_runner_api=True):
        """Runs the pipeline. Returns whatever our runner returns after running."""

        # When possible, invoke a round trip through the runner API.
        if test_runner_api and self._verify_runner_api_compatible():
            return Pipeline.from_runner_api(self.to_runner_api(), self.runner,
                                            self._options).run(False)

        if self._options.view_as(SetupOptions).save_main_session:
            # If this option is chosen, verify we can pickle the main session early.
            tmpdir = tempfile.mkdtemp()
            try:
                pickler.dump_session(
                    os.path.join(tmpdir, 'main_session.pickle'))
            finally:
                shutil.rmtree(tmpdir)
        return self.runner.run_pipeline(self)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not exc_type:
            self.run().wait_until_finish()

    def visit(self, visitor):
        """Visits depth-first every node of a pipeline's DAG.

    Runner-internal implementation detail; no backwards-compatibility guarantees

    Args:
      visitor (~apache_beam.pipeline.PipelineVisitor):
        :class:`~apache_beam.pipeline.PipelineVisitor` object whose callbacks
        will be called for each node visited. See
        :class:`~apache_beam.pipeline.PipelineVisitor` comments.

    Raises:
      ~exceptions.TypeError: if node is specified and is not a
        :class:`~apache_beam.pvalue.PValue`.
      ~apache_beam.error.PipelineError: if node is specified and does not
        belong to this pipeline instance.
    """

        visited = set()
        self._root_transform().visit(visitor, self, visited)

    def apply(self, transform, pvalueish=None, label=None):
        """Applies a custom transform using the pvalueish specified.

    Args:
      transform (~apache_beam.transforms.ptransform.PTransform): the
        :class:`~apache_beam.transforms.ptransform.PTransform` to apply.
      pvalueish (~apache_beam.pvalue.PCollection): the input for the
        :class:`~apache_beam.transforms.ptransform.PTransform` (typically a
        :class:`~apache_beam.pvalue.PCollection`).
      label (str): label of the
        :class:`~apache_beam.transforms.ptransform.PTransform`.

    Raises:
      ~exceptions.TypeError: if the transform object extracted from the
        argument list is not a
        :class:`~apache_beam.transforms.ptransform.PTransform`.
      ~exceptions.RuntimeError: if the transform object was already applied to
        this pipeline and needs to be cloned in order to apply again.
    """
        if isinstance(transform, ptransform._NamedPTransform):
            return self.apply(transform.transform, pvalueish, label
                              or transform.label)

        if not isinstance(transform, ptransform.PTransform):
            raise TypeError("Expected a PTransform object, got %s" % transform)

        if label:
            # Fix self.label as it is inspected by some PTransform operations
            # (e.g. to produce error messages for type hint violations).
            try:
                old_label, transform.label = transform.label, label
                return self.apply(transform, pvalueish)
            finally:
                transform.label = old_label

        full_label = '/'.join(
            [self._current_transform().full_label, label
             or transform.label]).lstrip('/')
        if full_label in self.applied_labels:
            raise RuntimeError(
                'Transform "%s" does not have a stable unique label. '
                'This will prevent updating of pipelines. '
                'To apply a transform with a specified label write '
                'pvalue | "label" >> transform' % full_label)
        self.applied_labels.add(full_label)

        pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
        try:
            inputs = tuple(inputs)
            for leaf_input in inputs:
                if not isinstance(leaf_input, pvalue.PValue):
                    raise TypeError
        except TypeError:
            raise NotImplementedError(
                'Unable to extract PValue inputs from %s; either %s does not accept '
                'inputs of this format, or it does not properly override '
                '_extract_input_pvalues' % (pvalueish, transform))

        current = AppliedPTransform(self._current_transform(), transform,
                                    full_label, inputs)
        self._current_transform().add_part(current)
        self.transforms_stack.append(current)

        type_options = self._options.view_as(TypeOptions)
        if type_options.pipeline_type_check:
            transform.type_check_inputs(pvalueish)

        pvalueish_result = self.runner.apply(transform, pvalueish)

        if type_options is not None and type_options.pipeline_type_check:
            transform.type_check_outputs(pvalueish_result)

        for result in ptransform.get_nested_pvalues(pvalueish_result):
            assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

            # Make sure we set the producer only for a leaf node in the transform DAG.
            # This way we preserve the last transform of a composite transform as
            # being the real producer of the result.
            if result.producer is None:
                result.producer = current
            # TODO(robertwb): Multi-input, multi-output inference.
            # TODO(robertwb): Ideally we'd do intersection here.
            if (type_options is not None and type_options.pipeline_type_check
                    and isinstance(result, pvalue.PCollection)
                    and not result.element_type):
                input_element_type = (inputs[0].element_type
                                      if len(inputs) == 1 else typehints.Any)
                type_hints = transform.get_type_hints()
                declared_output_type = type_hints.simple_output_type(
                    transform.label)
                if declared_output_type:
                    input_types = type_hints.input_types
                    if input_types and input_types[0]:
                        declared_input_type = input_types[0][0]
                        result.element_type = typehints.bind_type_variables(
                            declared_output_type,
                            typehints.match_type_variables(
                                declared_input_type, input_element_type))
                    else:
                        result.element_type = declared_output_type
                else:
                    result.element_type = transform.infer_output_type(
                        input_element_type)

            assert isinstance(result.producer.inputs, tuple)
            current.add_output(result)

        if (type_options is not None
                and type_options.type_check_strictness == 'ALL_REQUIRED'
                and transform.get_type_hints().output_types is None):
            ptransform_name = '%s(%s)' % (transform.__class__.__name__,
                                          full_label)
            raise TypeCheckError(
                'Pipeline type checking is enabled, however no '
                'output type-hint was found for the '
                'PTransform %s' % ptransform_name)

        current.update_input_refcounts()
        self.transforms_stack.pop()
        return pvalueish_result

    def __reduce__(self):
        # Some transforms contain a reference to their enclosing pipeline,
        # which in turn reference all other transforms (resulting in quadratic
        # time/space to pickle each transform individually).  As we don't
        # require pickled pipelines to be executable, break the chain here.
        return str, ('Pickled pipeline stub.', )

    def _verify_runner_api_compatible(self):
        if self._options.view_as(TypeOptions).runtime_type_check:
            # This option is incompatible with the runner API as it requires
            # the runner to inspect non-serialized hints on the transform
            # itself.
            return False

        class Visitor(PipelineVisitor):  # pylint: disable=used-before-assignment
            ok = True  # Really a nonlocal.

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                try:
                    # Transforms must be picklable.
                    pickler.loads(pickler.dumps(transform_node.transform,
                                                enable_trace=False),
                                  enable_trace=False)
                except Exception:
                    Visitor.ok = False

            def visit_value(self, value, _):
                if isinstance(value, pvalue.PDone):
                    Visitor.ok = False

        self.visit(Visitor())
        return Visitor.ok

    def to_runner_api(self, return_context=False):
        """For internal use only; no backwards-compatibility guarantees."""
        from apache_beam.runners import pipeline_context
        from apache_beam.portability.api import beam_runner_api_pb2
        context = pipeline_context.PipelineContext()
        # Mutates context; placing inline would force dependence on
        # argument evaluation order.
        root_transform_id = context.transforms.get_id(self._root_transform())
        proto = beam_runner_api_pb2.Pipeline(
            root_transform_ids=[root_transform_id],
            components=context.to_runner_api())
        if return_context:
            return proto, context
        else:
            return proto

    @staticmethod
    def from_runner_api(proto, runner, options, return_context=False):
        """For internal use only; no backwards-compatibility guarantees."""
        p = Pipeline(runner=runner, options=options)
        from apache_beam.runners import pipeline_context
        context = pipeline_context.PipelineContext(proto.components)
        root_transform_id, = proto.root_transform_ids
        p.transforms_stack = [context.transforms.get_by_id(root_transform_id)]
        # TODO(robertwb): These are only needed to continue construction. Omit?
        p.applied_labels = set(
            [t.unique_name for t in proto.components.transforms.values()])
        for id in proto.components.pcollections:
            pcollection = context.pcollections.get_by_id(id)
            pcollection.pipeline = p
            if not pcollection.producer:
                raise ValueError('No producer for %s' % id)

        # Inject PBegin input where necessary.
        from apache_beam.io.iobase import Read
        from apache_beam.transforms.core import Create
        has_pbegin = [Read, Create]
        for id in proto.components.transforms:
            transform = context.transforms.get_by_id(id)
            if not transform.inputs and transform.transform.__class__ in has_pbegin:
                transform.inputs = (pvalue.PBegin(p), )

        if return_context:
            return p, context
        else:
            return p
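Two behaviours of the Pipeline class above are worth seeing in miniature: distinct full labels are required when the same kind of transform is applied twice, and __exit__ shows that a pipeline used as a context manager runs itself and waits for completion. A small sketch, assuming DirectRunner defaults:

import apache_beam as beam

# run() and wait_until_finish() are called automatically when the with-block
# exits without an exception (see __exit__ above).
with beam.Pipeline() as p:
    words = p | 'CreateWords' >> beam.Create(['a', 'b', 'a'])
    # Applying the same kind of transform twice requires distinct labels;
    # otherwise apply() raises the "does not have a stable unique label" error.
    upper = words | 'ToUpper' >> beam.Map(str.upper)
    lower = words | 'ToLower' >> beam.Map(str.lower)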
Esempio n. 39
0
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateProgramAssignments
        program_assignments = (
            p | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        supervision_period_to_agent_association_query = \
            f"SELECT * FROM `{reference_dataset}.supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = (
            {
                'person': persons,
                'program_assignments': program_assignments,
                'assessments': assessments,
                'supervision_periods': supervision_periods
            }
            | 'Group StatePerson to StateProgramAssignments and' >>
            beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                ProgramMetricWritableDict()).with_outputs('referrals'))

        # Write the metrics to the output tables in BigQuery
        referrals_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ProgramReferralMetric)

        _ = (writable_metrics.referrals
             | f"Write referral metrics to BQ table: {referrals_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
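The metric-conversion step above uses .with_outputs('referrals') and then reads writable_metrics.referrals, i.e. Beam's tagged-output mechanism. A minimal sketch of that mechanism with made-up tags:

import apache_beam as beam
from apache_beam import pvalue

class RouteByParity(beam.DoFn):
    """Illustrative DoFn: even numbers go to the 'even' tag, the rest to the
    main output."""
    def process(self, element):
        if element % 2 == 0:
            yield pvalue.TaggedOutput('even', element)
        else:
            yield element

with beam.Pipeline() as p:
    results = (p
               | beam.Create([1, 2, 3, 4])
               | beam.ParDo(RouteByParity()).with_outputs('even', main='odd'))
    # Tagged outputs are read as attributes, like writable_metrics.referrals above.
    evens = results.even
    odds = results.odd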
Esempio n. 40
0
    def _create_pipeline(
        cls,
        algorithm,
        algorithm_transform,
        algorithm_options,
        input_fn=None,
        map_fn=None,
        solve_fn=None,
        unmap_fn=None,
        output_fn=None,
        solver=LocalSolver(exact=False),  # default solver
        initial_mtype=sawatabi.constants.MODEL_ISING,
        pipeline_args=None,
    ):
        if pipeline_args is None:
            pipeline_args = ["--runner=DirectRunner"]
        cls._check_argument_type("initial_mtype", initial_mtype, str)
        valid_initial_mtypes = [sawatabi.constants.MODEL_ISING, sawatabi.constants.MODEL_QUBO]
        if initial_mtype not in valid_initial_mtypes:
            raise ValueError(f"'initial_mtype' must be one of {valid_initial_mtypes}.")

        pipeline_options = PipelineOptions(pipeline_args)
        p = beam.Pipeline(options=pipeline_options)

        # fmt: off

        # --------------------------------
        # Input part
        # --------------------------------

        inputs = p
        if input_fn is not None:
            inputs = (p
                | "Input" >> input_fn)

        with_indices = (inputs
            | "Prepare key" >> beam.Map(lambda element: (None, element))
            | "Assign global index for Ising variables" >> beam.ParDo(AbstractAlgorithm.IndexAssigningStatefulDoFn()))

        if "input.reassign_timestamp" in algorithm_options:
            # Add (Re-assign) event timestamp based on the index
            # - element[0]: index
            # - element[1]: data
            with_indices = (with_indices
                | "Assign timestamp by index" >> beam.Map(lambda element: beam.window.TimestampedValue(element, element[0])))

        # --------------------------------
        # Algorithm part
        # --------------------------------

        algorithm_transformed = with_indices | algorithm_transform

        # --------------------------------
        # Solving part
        # --------------------------------

        solved = (algorithm_transformed
            | "Make windows to key-value pairs for stateful DoFn" >> beam.Map(lambda element: (None, element))
            | "Solve" >> beam.ParDo(
                sawatabi.algorithm.Window.SolveDoFn(),
                algorithm=algorithm,
                algorithm_options=algorithm_options,
                map_fn=map_fn,
                solve_fn=solve_fn,
                unmap_fn=unmap_fn,
                solver=solver,
                initial_mtype=initial_mtype,
            ))

        # --------------------------------
        # Output part
        # --------------------------------

        if "output.with_timestamp" in algorithm_options:
            solved = (solved
                | "With timestamp for each window" >> beam.ParDo(AbstractAlgorithm.WithTimestampStrFn()))

        if "output.prefix" in algorithm_options:
            solved = (solved
                | "Add output prefix" >> beam.Map(lambda element: algorithm_options["output.prefix"] + element))
        if "output.suffix" in algorithm_options:
            solved = (solved
                | "Add output suffix" >> beam.Map(lambda element: element + algorithm_options["output.suffix"]))

        if output_fn is not None:
            outputs = (solved  # noqa: F841
                | "Output" >> output_fn)

        # fmt: on

        return p
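The input part above keys every element to None and applies an IndexAssigningStatefulDoFn so that each element receives a monotonically increasing index (used later to re-assign timestamps). That DoFn is not shown in this listing; a minimal sketch using Beam's user-state API, offered as an illustration rather than sawatabi's actual code:

import apache_beam as beam
from apache_beam.transforms.userstate import CombiningValueStateSpec

class IndexAssigningStatefulDoFn(beam.DoFn):
    """Sketch: keeps a per-key running count in state and pairs each value
    with the next index."""
    INDEX_STATE = CombiningValueStateSpec('index', sum)

    def process(self, element, index=beam.DoFn.StateParam(INDEX_STATE)):
        _, value = element            # elements arrive keyed, e.g. (None, value)
        current_index = index.read()  # 0 the first time this key is seen
        index.add(1)
        yield current_index, value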
Esempio n. 41
0
 def test_transform_name_mapping(self):
   options = PipelineOptions(['--transform_name_mapping={\"from\":\"to\"}'])
   mapping = options.view_as(GoogleCloudOptions).transform_name_mapping
   self.assertEqual(mapping['from'], 'to')
Esempio n. 42
0
  def test_template_location(self):
    options = PipelineOptions(['--template_location', 'abc'])
    self.assertEqual(options.get_all_options()['template_location'], 'abc')

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['template_location'], None)
Esempio n. 43
0
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import ReadFromText, ReadAllFromText


from dsba6155project.constants import Constants
import os
import re

files = [ os.path.abspath(os.path.join(Constants.DATA_PATH,f)) for f in os.listdir(Constants.DATA_PATH)]

options = PipelineOptions()
pipeline = beam.Pipeline(options=options)

books = pipeline | beam.Create(files[:10])

import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def POS(line):
    text = nltk.word_tokenize(line.lower())
    filtered_sentence = [w for w in text if not w in stop_words]
    tags = nltk.pos_tag(filtered_sentence)
    return tags


def count_ones(word_ones):
    print(word_ones)
    (word, tag), ones = word_ones
    # Sum the per-(word, tag) ones, mirroring the count_ones used later in this listing.
    return ((word, tag), sum(ones))
Esempio n. 44
0
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.   Specifically
    # we have the input file to load and the output table to write to.
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read.  This can be a local file or '
        'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for quickly debugging on a small set of data
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    # The output defaults to the lake dataset in your BigQuery project.  You'll have
    # to create the lake dataset yourself using this command:
    # bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_enriched')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataIngestion is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataIngestion()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information like where Dataflow should store
    #  temp files, and what the project id is
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    # This function adds in a full state name by looking up the
    # full name in the short_to_long_name_map.  The short_to_long_name_map
    # comes from a read from BigQuery in the next few lines
    def add_full_state_name(row, short_to_long_name_map):
        row['state_full_name'] = short_to_long_name_map[row['state']]
        return row

    # This is a second source of data.  The source is from BigQuery.
    # This will come into our pipeline a side input.

    read_query = """
    SELECT
    state_name,
    state_abbreviation
    FROM
    `python-dataflow-example.example_data.state_abbreviations`"""

    state_abbreviations = (
        p
        | 'Read from BigQuery' >> beam.io.Read(
            beam.io.BigQuerySource(query=read_query, use_standard_sql=True))
        # We must create a python tuple of key to value pairs here in order to
        # use the data as a side input.  Dataflow will use the keys to distribute the
        # work to the correct worker.
        | 'Abbreviation to Full Name' >>
        beam.Map(lambda row: (row['state_abbreviation'], row['state_name'])))

    (p
     # Read the file.  This is the source of the pipeline.  All further
     # processing starts with lines read from the file.  We use the input
     # argument from the command line.  We also skip the first line which is
     # a header row.
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     # Translates from the raw string data in the CSV to a dictionary.
     # The dictionary is a keyed by column names with the values being the values
     # we want to store in BigQuery.
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     # Here we pass in a side input, which is data that comes from outside our
     # CSV source.  The side input contains a map of states to their full name.
     |
     'Join Data' >> beam.Map(add_full_state_name, AsDict(state_abbreviations))
     # This is the final stage of the pipeline, where we define the destination
     #  of the data.  In this case we are writing to BigQuery.
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
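The pipeline above relies on DataIngestion.parse_method to turn each CSV line into a BigQuery-ready dict; the class itself is not reproduced here. Below is a minimal sketch of what such a method usually does; the column names are assumptions based on the usa_names sample data referenced in --input, not the project's definitive schema.

import csv

class DataIngestion(object):
    """Sketch: maps one CSV line to a dict keyed by BigQuery column names."""
    def parse_method(self, string_input):
        # Assumed column order for the head_usa_names.csv sample file.
        values = next(csv.reader([string_input]))
        return dict(zip(('state', 'gender', 'year', 'name', 'number', 'created_date'),
                        values))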
Esempio n. 45
0
 def test_default_ip_configuration(self):
     pipeline_options = PipelineOptions(
         ['--temp_location', 'gs://any-location/temp'])
     env = apiclient.Environment([], pipeline_options, '2.0.0',
                                 FAKE_PIPELINE_URL)
     self.assertEqual(env.proto.workerPools[0].ipConfiguration, None)
Esempio n. 46
0
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_topic',
                        required=True,
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topic/<TOPIC>".'))
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--input_topic',
                       help=('Input PubSub topic of the form '
                             '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
        lines = p | beam.io.ReadStringsFromPubSub(
            subscription=known_args.input_subscription)
    else:
        lines = p | beam.io.ReadStringsFromPubSub(topic=known_args.input_topic)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (
        lines
        | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn())
        | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn'))
        | 'Split' >>
        (beam.ParDo(WordExtractingDoFn()).with_output_types(six.text_type))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.FixedWindows(5, 0))
        | 'GroupByKey' >> beam.GroupByKey()
        | 'CountOnes' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    output | beam.io.WriteStringsToPubSub(known_args.output_topic)

    def check_gbk_format():
        # A matcher that checks that the output of GBK is of the form word: count.
        def matcher(elements):
            # pylint: disable=unused-variable
            actual_elements_in_window, window = elements
            for elm in actual_elements_in_window:
                assert re.match(r'\S+:\s+\d+', elm) is not None

        return matcher

    # Check that the format of the output is correct.
    assert_that(output,
                check_gbk_format(),
                use_global_window=False,
                label='Assert word:count format.')

    # Check also that elements are output in the right window.
    # This expects exactly 1 occurrence of any subset of the elements
    # 150, 151, 152, 153, 154 in the window [150, 155)
    # or exactly 1 occurrence of any subset of the elements
    # 210, 211, 212, 213, 214 in the window [210, 215).
    expected_window_to_elements = {
        window.IntervalWindow(150, 155): [
            ('150: 1'),
            ('151: 1'),
            ('152: 1'),
            ('153: 1'),
            ('154: 1'),
        ],
        window.IntervalWindow(210, 215): [
            ('210: 1'),
            ('211: 1'),
            ('212: 1'),
            ('213: 1'),
            ('214: 1'),
        ],
    }

    # To make it pass, publish numbers in [150-155) or [210-215) with no repeats.
    # To make it fail, publish a repeated number in one of the ranges above.
    # For example: '210 213 151 213'
    assert_that(output,
                equal_to_per_window(expected_window_to_elements),
                use_global_window=False,
                label='Assert correct streaming windowing.')

    result = p.run()
    result.wait_until_finish()
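AddTimestampFn, PrintFn and WordExtractingDoFn are referenced above but not defined in this listing. For the window assertions at the end to line up, AddTimestampFn has to derive each element's event timestamp from the published message itself; a minimal sketch, assuming messages are plain numeric strings such as '150':

import apache_beam as beam

class AddTimestampFn(beam.DoFn):
    """Sketch: treats the published string as a number of seconds and attaches
    it to the element as its event timestamp."""
    def process(self, element):
        yield beam.window.TimestampedValue(element, int(element))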
Esempio n. 47
0
 def test_interpreter_version_check_passes_py38(self):
     pipeline_options = PipelineOptions([])
     apiclient._verify_interpreter_version_is_supported(pipeline_options)
Esempio n. 48
0
 def test_display_data(self):
   for case in PipelineOptionsTest.TEST_CASES:
     options = PipelineOptions(flags=case['flags'])
     dd = DisplayData.create_from(options)
     hc.assert_that(dd.items, hc.contains_inanyorder(*case['display_data']))
Esempio n. 49
0
    def __init__(self, runner=None, options=None, argv=None):
        """Initialize a pipeline object.

    Args:
      runner (~apache_beam.runners.runner.PipelineRunner): An object of
        type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
        used to execute the pipeline. For registered runners, the runner name
        can be specified, otherwise a runner object must be supplied.
      options (~apache_beam.options.pipeline_options.PipelineOptions):
        A configured
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
        containing arguments that should be used for running the Beam job.
      argv (List[str]): a list of arguments (such as :data:`sys.argv`)
        to be used for building a
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
        This will only be used if argument **options** is :data:`None`.

    Raises:
      ~exceptions.ValueError: if either the runner or options argument is not
        of the expected type.
    """
        if options is not None:
            if isinstance(options, PipelineOptions):
                self._options = options
            else:
                raise ValueError(
                    'Parameter options, if specified, must be of type PipelineOptions. '
                    'Received : %r' % options)
        elif argv is not None:
            if isinstance(argv, list):
                self._options = PipelineOptions(argv)
            else:
                raise ValueError(
                    'Parameter argv, if specified, must be a list. Received : %r'
                    % argv)
        else:
            self._options = PipelineOptions([])

        FileSystems.set_options(self._options)

        if runner is None:
            runner = self._options.view_as(StandardOptions).runner
            if runner is None:
                runner = StandardOptions.DEFAULT_RUNNER
                logging.info(
                    ('Missing pipeline option (runner). Executing pipeline '
                     'using the default runner: %s.'), runner)

        if isinstance(runner, str):
            runner = create_runner(runner)
        elif not isinstance(runner, PipelineRunner):
            raise TypeError('Runner must be a PipelineRunner object or the '
                            'name of a registered runner.')

        # Validate pipeline options
        errors = PipelineOptionsValidator(self._options, runner).validate()
        if errors:
            raise ValueError('Pipeline has validation errors: \n' +
                             '\n'.join(errors))

        # Default runner to be used.
        self.runner = runner
        # Stack of transforms generated by nested apply() calls. The stack will
        # contain a root node as an enclosing (parent) node for top transforms.
        self.transforms_stack = [AppliedPTransform(None, None, '', None)]
        # Set of transform labels (full labels) applied to the pipeline.
        # If a transform is applied and the full label is already in the set
        # then the transform will have to be cloned with a new label.
        self.applied_labels = set()
Esempio n. 50
0
 def test_interpreter_version_check_fails_on_not_yet_supported_version(
         self):
     pipeline_options = PipelineOptions([])
     self.assertRaises(Exception,
                       apiclient._verify_interpreter_version_is_supported,
                       pipeline_options)
Esempio n. 51
0
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # This defaults the output table in your BigQuery; you'll have
    # to create the example_data dataset yourself using bq mk
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.orders_denormalized_sideinput')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataLakeToDataMart is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_lake_to_data_mart = DataLakeToDataMart()

    # A single pipeline object is enough; the side input below must come from
    # the same pipeline as the main read, otherwise the join cannot run.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_lake_to_data_mart.schema_str)

    # This query returns details about the account, normalized into a
    # different table.  We will be joining the data in to the main orders dataset in order
    # to create a denormalized table.
    account_details_source = (
        p
        | 'Read Account Details from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(
                query="""
                SELECT
                  acct_number,
                  acct_company_name,
                  acct_group_name,
                  acct_name,
                  acct_org_name,
                  address,
                  city,
                  state,
                  zip_code,
                  country
                FROM
                  `python-dataflow-example.example_data.account`""",
                # This next stage of the pipeline maps the acct_number to a single row of
                # results from BigQuery.  Mapping this way helps Dataflow move your data around
                # to different workers.  When later stages of the pipeline run, all results from
                # a given account number will run on one worker.
                use_standard_sql=True))
        | 'Account Details' >> beam.Map(lambda row: (row['acct_number'], row)))

    orders_query = data_lake_to_data_mart.get_orders_query()
    (p
     # Read the orders from BigQuery.  This is the source of the pipeline.  All further
     # processing starts with rows read from the query results here.
     | 'Read Orders from BigQuery ' >> beam.io.Read(
         beam.io.BigQuerySource(query=orders_query, use_standard_sql=True))
     # Here we pass in a side input, which is data that comes from outside our
     # main source.  The side input contains a map of states to their full name
     | 'Join Data with sideInput' >> beam.Map(
         data_lake_to_data_mart.add_account_details,
         AsDict(account_details_source))
     # This is the final stage of the pipeline, where we define the destination
     # of the data.  In this case we are writing to BigQuery.
     | 'Write Data to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
Esempio n. 52
0
def run(argv=None):
    """Main entry point; defines and runs the hourly_team_score pipeline."""
    parser = argparse.ArgumentParser()

    # The default maps to two large Google Cloud Storage files (each ~12GB)
    # holding two subsequent day's worth (roughly) of data.
    parser.add_argument(
        '--input',
        type=str,
        default='gs://apache-beam-samples/game/gaming_data*.csv',
        help='Path to the data file(s) containing game data.')
    parser.add_argument('--dataset',
                        type=str,
                        required=True,
                        help='BigQuery Dataset to write tables to. '
                        'Must already exist.')
    parser.add_argument(
        '--table_name',
        default='leader_board',
        help='The BigQuery table name. Should not already exist.')
    parser.add_argument(
        '--window_duration',
        type=int,
        default=60,
        help='Numeric value of fixed window duration, in minutes')
    parser.add_argument('--start_min',
                        type=str,
                        default='1970-01-01-00-00',
                        help='String representation of the first minute after '
                        'which to generate results in the format: '
                        'yyyy-MM-dd-HH-mm. Any input data timestamped '
                        'prior to that minute won\'t be included in the '
                        'sums.')
    parser.add_argument('--stop_min',
                        type=str,
                        default='2100-01-01-00-00',
                        help='String representation of the first minute after '
                        'which to stop generating results, in the format: '
                        'yyyy-MM-dd-HH-mm. Any input data timestamped '
                        'after that minute won\'t be included in the '
                        'sums.')

    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)

    # We also require the --project option to access --dataset
    if options.view_as(GoogleCloudOptions).project is None:
        parser.print_usage()
        print(sys.argv[0] + ': error: argument --project is required')
        sys.exit(1)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        (p  # pylint: disable=expression-not-assigned
         | 'ReadInputText' >> beam.io.ReadFromText(args.input)
         | 'HourlyTeamScore' >> HourlyTeamScore(args.start_min, args.stop_min,
                                                args.window_duration)
         | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
         | 'WriteTeamScoreSums' >> WriteToBigQuery(
             args.table_name, args.dataset, {
                 'team': 'STRING',
                 'total_score': 'INTEGER',
                 'window_start': 'STRING',
             }))
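The HourlyTeamScore transform used above is not defined in this snippet; it is expected to filter events against the --start_min/--stop_min bounds. A hedged sketch of converting those 'yyyy-MM-dd-HH-mm' strings into Unix timestamps for such a filter (the function name is an assumption):

# Hypothetical sketch: parse the 'yyyy-MM-dd-HH-mm' format used by
# --start_min/--stop_min into seconds since the epoch (UTC).
import calendar
from datetime import datetime

def minute_string_to_timestamp(minute_string):
    dt = datetime.strptime(minute_string, '%Y-%m-%d-%H-%M')
    return calendar.timegm(dt.timetuple())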
Esempio n. 53
0
def printing(x):
    print(x)
    print(type(x))
    return x


class StreamPubSubToElasticsearch(GApplication):
    def __init__(self, args):
        GApplication.__init__(self)
        self.logger.info("this is the arguments {}".format(args))
        self.pipeline_args = args
        self.sink_opts = eval(args.sink_opts)
        self.parser_type = args.parser_type
        self.input_topic = args.input_topic

    def run(self):
        with beam.Pipeline(options=self.pipeline_args) as p:
            pcoll = p | beam.io.ReadFromPubSub(self.input_topic)
            pcoll = pcoll | beam.ParDo(Parser[self.parser_type]())
            pcoll = pcoll | beam.Map(printing)
            sink = ElasticSearchWriter(pcoll, self.sink_opts)
            sink.parse()
            sink.write()


if __name__ == '__main__':
    pipeline_args = PipelineOptions(flags=None)
    args = pipeline_args.view_as(AppOptions)
    stream_app = StreamPubSubToElasticsearch(args=args)
    stream_app.start()
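AppOptions is referenced above but not defined in this snippet. A minimal sketch of how such a custom options class is usually declared (the help texts are assumptions):

# Hypothetical sketch of the AppOptions class used above.
from apache_beam.options.pipeline_options import PipelineOptions

class AppOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--input_topic',
                            help='Pub/Sub topic to read from')
        parser.add_argument('--parser_type',
                            help='Key selecting a Parser implementation')
        parser.add_argument('--sink_opts',
                            help='Elasticsearch sink options as a dict literal')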
Esempio n. 54
0
import json
import os
import apache_beam as beam
# from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
# import pandas as pd
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions

# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './credentils.json'
# Dflow_option = ['--project = iucc-assaf-anderson', '--job_name = bqetl4', '--temp_location = gs://dataflow-iucc-assaf-anderson/temp','--staging_location = gs://dataflow-iucc-assaf-anderson/staging']
options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'iucc-assaf-anderson'
google_cloud_options.job_name = 'bqetl4'
google_cloud_options.staging_location = 'gs://dataflow-iucc-assaf-anderson/staging'
google_cloud_options.temp_location = 'gs://dataflow-iucc-assaf-anderson/temp'
options.view_as(StandardOptions).runner = 'DataflowRunner'

infile = 'gs://firebase2bigquery/fb_Samples_items.json'
outfile = 'gs://dataflow-iucc-assaf-anderson/extracted_data'


class JsonCoder(object):
    """A JSON coder interpreting each line as a JSON string."""
    def encode(self, x):
        return json.dumps(x).encode('utf-8')

    def decode(self, x):
        return json.loads(x)
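The snippet stops after defining JsonCoder, infile and outfile; no pipeline is built. A hedged sketch of how these pieces would typically be wired together (the step names are assumptions):

# Hypothetical sketch: read JSON lines and write them back out, using the
# JsonCoder defined above so each element is a parsed object rather than a string.
with beam.Pipeline(options=options) as p:
    (p
     | 'ReadJson' >> beam.io.ReadFromText(infile, coder=JsonCoder())
     | 'WriteJson' >> beam.io.WriteToText(outfile, coder=JsonCoder()))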
Esempio n. 55
0
def model_custom_sink(simplekv, KVs, final_table_name_no_ptransform,
                      final_table_name_with_ptransform):
    """Demonstrates creating a new custom sink and using it in a pipeline.

  Defines a new sink ``SimpleKVSink`` that demonstrates writing to a simple
  key-value based storage system which has following API.

    simplekv.connect(url) -
        connects to the storage system and returns an access token which can be
        used to perform further operations
    simplekv.open_table(access_token, table_name) -
        creates a table named 'table_name'. Returns a table object.
    simplekv.write_to_table(access_token, table, key, value) -
        writes a key-value pair to the given table.
    simplekv.rename_table(access_token, old_name, new_name) -
        renames the table named 'old_name' to 'new_name'.

  Uses the new sink in an example pipeline.

  Additionally demonstrates how a sink should be implemented using a
  ``PTransform``. This is the recommended way to develop sinks that are to be
  distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``SimpleKVSink`` directly using the ``df.Write``
      transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``SimpleKVSink``.

  Args:
    simplekv: an object that mocks the key-value storage.

    KVs: the set of key-value pairs to be written in the example pipeline.

    final_table_name_no_ptransform: the prefix of final set of tables to be
                                    created by the example pipeline that uses
                                    ``SimpleKVSink`` directly.

    final_table_name_with_ptransform: the prefix of final set of tables to be
                                      created by the example pipeline that uses
                                      a ``PTransform`` that wraps
                                      ``SimpleKVSink``.
  """

    import apache_beam as beam
    from apache_beam.io import iobase
    from apache_beam.transforms.core import PTransform
    from apache_beam.options.pipeline_options import PipelineOptions

    # Defining the new sink.
    # [START model_custom_sink_new_sink]
    class SimpleKVSink(iobase.Sink):
        def __init__(self, url, final_table_name):
            self._url = url
            self._final_table_name = final_table_name

        def initialize_write(self):
            access_token = simplekv.connect(self._url)
            return access_token

        def open_writer(self, access_token, uid):
            table_name = 'table' + uid
            return SimpleKVWriter(access_token, table_name)

        def finalize_write(self, access_token, table_names):
            for i, table_name in enumerate(table_names):
                simplekv.rename_table(access_token, table_name,
                                      self._final_table_name + str(i))

    # [END model_custom_sink_new_sink]

    # Defining a writer for the new sink.
    # [START model_custom_sink_new_writer]
    class SimpleKVWriter(iobase.Writer):
        def __init__(self, access_token, table_name):
            self._access_token = access_token
            self._table_name = table_name
            self._table = simplekv.open_table(access_token, table_name)

        def write(self, record):
            key, value = record

            simplekv.write_to_table(self._access_token, self._table, key,
                                    value)

        def close(self):
            return self._table_name

    # [END model_custom_sink_new_writer]

    final_table_name = final_table_name_no_ptransform

    # Using the new sink in an example pipeline.
    # [START model_custom_sink_use_new_sink]
    with beam.Pipeline(options=PipelineOptions()) as p:
        kvs = p | 'CreateKVs' >> beam.Create(KVs)

        kvs | 'WriteToSimpleKV' >> beam.io.Write(
            SimpleKVSink('http://url_to_simple_kv/', final_table_name))
        # [END model_custom_sink_use_new_sink]

    # We recommend users to start Sink class names with an underscore to
    # discourage using the Sink class directly when a PTransform for the sink is
    # available. We simulate that here by simply extending the previous Sink
    # class.
    class _SimpleKVSink(SimpleKVSink):
        pass

    # [START model_custom_sink_new_ptransform]
    class WriteToKVSink(PTransform):
        def __init__(self, url, final_table_name, **kwargs):
            super(WriteToKVSink, self).__init__(**kwargs)
            self._url = url
            self._final_table_name = final_table_name

        def expand(self, pcoll):
            return pcoll | iobase.Write(
                _SimpleKVSink(self._url, self._final_table_name))

    # [END model_custom_sink_new_ptransform]

    final_table_name = final_table_name_with_ptransform

    # [START model_custom_sink_use_ptransform]
    with beam.Pipeline(options=PipelineOptions()) as p:
        kvs = p | 'CreateKVs' >> beam.core.Create(KVs)
        kvs | 'WriteToSimpleKV' >> WriteToKVSink('http://url_to_simple_kv/',
                                                 final_table_name)
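The simplekv object is only described by its API in the docstring above. For experimenting with this example, an in-memory stand-in could look like the following sketch (entirely an assumption, not part of the original snippet):

# Hypothetical in-memory stand-in for the simplekv storage described above.
class InMemorySimpleKV(object):
    def __init__(self):
        self._tables = {}

    def connect(self, url):
        # Returns an access token; here the URL itself is enough.
        return 'token:' + url

    def open_table(self, access_token, table_name):
        # Creates a table and returns a table handle (here, just its name).
        self._tables[table_name] = {}
        return table_name

    def write_to_table(self, access_token, table, key, value):
        self._tables[table][key] = value

    def rename_table(self, access_token, old_name, new_name):
        self._tables[new_name] = self._tables.pop(old_name)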
Esempio n. 56
0
 def build_container_image(cls, pipeline_options: PipelineOptions) -> str:
     setup_options = pipeline_options.view_as(SetupOptions)
     container_build_engine = setup_options.prebuild_sdk_container_engine
     builder_cls = cls._get_subclass_by_key(container_build_engine)
     builder = builder_cls(pipeline_options)
     return builder._build()
Esempio n. 57
0
 def setUp(self):
     pipeline_options = PipelineOptions()
     self.fs = s3filesystem.S3FileSystem(pipeline_options=pipeline_options)
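A hedged follow-up to this fixture, checking only the scheme the filesystem registers under (the assertions rely on the public FileSystem.scheme API):

# Hypothetical test method using the filesystem built in setUp above.
def test_scheme(self):
    self.assertEqual(self.fs.scheme(), 's3')
    self.assertEqual(s3filesystem.S3FileSystem.scheme(), 's3')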
Esempio n. 58
0
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # Specifically we have the input file to load and the output table to
    # write to.
    parser.add_argument(
        '--input', dest='input', required=False,
        help='Input file to read.  This can be a local file or '
             'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for developing on a small set of data
        default='gs://erik-dataflow/test_data')
    # This defaults to the poc dataset in your BigQuery project.  You'll have
    # to create that dataset yourself before running, for example with:
    # bq mk poc

    parser.add_argument('--output', dest='output', required=False,
                        help='Output BQ table to write results to.',
                        default='poc.test_table')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    print(known_args)

    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DataflowRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=idyllic-kit-191017',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://erik-dataflow/stg',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://erik-dataflow/tmp',
        '--job_name=poc-job',
    ])


    # DataIngestion is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataIngestion()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information including where Dataflow should
    # store temp files, and what the project id is

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    (p
     # Read the file.  This is the source of the pipeline.  All further
     # processing starts with lines read from the file.  We use the input
     # argument from the command line.  We also skip the first line which is a
     # header row.
     | 'Read from a File' >> beam.io.ReadFromText(known_args.input,
                                                  skip_header_lines=1)
     # This stage of the pipeline translates from a CSV file single row
     # input as a string, to a dictionary object consumable by BigQuery.
     # It refers to a function we have written.  This function will
     # be run in parallel on different workers using input from the
     # previous stage of the pipeline.
     | 'String To BigQuery Row' >> beam.Map(lambda s:
                                            data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
                beam.io.BigQuerySink(
                    # The table name is a required argument for the BigQuery sink.
                    # In this case we use the value passed in from the command line.
                    known_args.output,
                    # Here we use the simplest way of defining a schema:
                    # fieldName:fieldType
                    schema='id:INTEGER,random_data:STRING',
                    # Creates the table in BigQuery if it does not yet exist.
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    # Deletes all data in the BigQuery table before writing.
                    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
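The DataIngestion class used above is not included in this snippet. A minimal sketch of a parse_method matching the 'id:INTEGER,random_data:STRING' schema (the field order is an assumption):

# Hypothetical sketch of the DataIngestion helper referenced above.
import csv

class DataIngestion(object):
    def parse_method(self, string_input):
        # Split one CSV line into its fields and map them onto the schema.
        fields = next(csv.reader([string_input]))
        return {'id': int(fields[0]), 'random_data': fields[1]}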
Esempio n. 59
0
  def test_dataflow_job_file(self):
    options = PipelineOptions(['--dataflow_job_file', 'abc'])
    self.assertEqual(options.get_all_options()['dataflow_job_file'], 'abc')

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['dataflow_job_file'], None)
Esempio n. 60
0
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

p = beam.Pipeline(options=PipelineOptions())

# <class 'apache_beam.pvalue.PCollection'>
lines = p | 'ReadMyFile' >> beam.io.ReadFromText(file_pattern='./inputs.txt')
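The snippet above only builds the read step; a minimal way to finish and run the pipeline, as a sketch (the output path and step name are assumptions):

# Minimal sketch: write the lines back out and run the pipeline.
lines | 'WriteMyFile' >> beam.io.WriteToText('./outputs')
result = p.run()
result.wait_until_finish()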