def test_with_setup_file(self):
    staging_dir = self.make_temp_dir()
    source_dir = self.make_temp_dir()
    self.create_temp_file(
        os.path.join(source_dir, 'setup.py'), 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = os.path.join(
        source_dir, 'setup.py')

    self.assertEqual(
        [dependency.WORKFLOW_TARBALL_FILE],
        dependency.stage_job_resources(
            options,
            # We replace the build setup command because a realistic one would
            # require the setuptools package to be installed. Note that we can't
            # use "touch" here to create the expected output tarball file, since
            # touch is not available on Windows, so we invoke python to produce
            # equivalent behavior.
            build_setup_args=[
                'python', '-c', 'open(__import__("sys").argv[1], "a")',
                os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
            temp_dir=source_dir))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example #2
  def test_write_messages_unsupported_features(self, mock_pubsub):
    data = b'data'
    attributes = {'key': 'value'}
    payloads = [PubsubMessage(data, attributes)]

    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    _ = (p
         | Create(payloads)
         | WriteToPubSub('projects/fakeprj/topics/a_topic',
                         id_label='a_label'))
    with self.assertRaisesRegexp(NotImplementedError,
                                 r'id_label is not supported'):
      p.run()
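
    # Repeat with timestamp_attribute, which is also unsupported for writes.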
    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    _ = (p
         | Create(payloads)
         | WriteToPubSub('projects/fakeprj/topics/a_topic',
                         timestamp_attribute='timestamp'))
    with self.assertRaisesRegexp(NotImplementedError,
                                 r'timestamp_attribute is not supported'):
      p.run()
  def test_with_requirements_file(self):
    try:
      staging_dir = tempfile.mkdtemp()
      requirements_cache_dir = tempfile.mkdtemp()
      source_dir = tempfile.mkdtemp()

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).requirements_cache = requirements_cache_dir
      options.view_as(SetupOptions).requirements_file = os.path.join(
          source_dir, dependency.REQUIREMENTS_FILE)
      self.create_temp_file(
          os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
      self.assertEqual(
          sorted([dependency.REQUIREMENTS_FILE,
                  'abc.txt', 'def.txt']),
          sorted(dependency.stage_job_resources(
              options,
              populate_requirements_cache=self.populate_requirements_cache)))
      self.assertTrue(
          os.path.isfile(
              os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
    finally:
      shutil.rmtree(staging_dir)
      shutil.rmtree(requirements_cache_dir)
      shutil.rmtree(source_dir)
  def test_sdk_location_http(self):
    staging_dir = self.make_temp_dir()
    sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

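    # Stand-in for _dependency_file_download: instead of fetching over HTTP,
    # it writes a small placeholder tarball into to_folder and returns its path.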
    def file_download(_, to_folder):
      tarball_path = os.path.join(to_folder, 'sdk-tarball')
      with open(tarball_path, 'w') as f:
        f.write('Package content.')
      return tarball_path

    with mock.patch('apache_beam.runners.dataflow.internal.'
                    'dependency._dependency_file_download', file_download):
      self.assertEqual(
          [names.DATAFLOW_SDK_TARBALL_FILE],
          dependency.stage_job_resources(options))

    tarball_path = os.path.join(
        staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(tarball_path) as f:
      self.assertEqual(f.read(), 'Package content.')
Example #5
def run(argv=None):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic', required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument(
      '--input_topic',
      help=('Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  group.add_argument(
      '--input_subscription',
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  p = beam.Pipeline(options=pipeline_options)

  # Read from PubSub into a PCollection.
  if known_args.input_subscription:
    lines = p | beam.io.ReadStringsFromPubSub(
        subscription=known_args.input_subscription)
  else:
    lines = p | beam.io.ReadStringsFromPubSub(topic=known_args.input_topic)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.FixedWindows(15, 0))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write to PubSub.
  # pylint: disable=expression-not-assigned
  output | beam.io.WriteStringsToPubSub(known_args.output_topic)

  result = p.run()
  result.wait_until_finish()
Example #6
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets."""
  import re

  import apache_beam as beam

  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = beam.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | beam.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | beam.Map(lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | beam.io.WriteToText('gs://my-bucket/counts.txt')
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  result = p.run()
  # [END examples_wordcount_minimal_run]
  result.wait_until_finish()
  def test_get_all_options(self):
    for case in PipelineOptionsTest.TEST_CASES:
      options = PipelineOptions(flags=case['flags'])
      self.assertDictContainsSubset(case['expected'], options.get_all_options())
      self.assertEqual(options.view_as(
          PipelineOptionsTest.MockOptions).mock_flag,
                       case['expected']['mock_flag'])
      self.assertEqual(options.view_as(
          PipelineOptionsTest.MockOptions).mock_option,
                       case['expected']['mock_option'])
  def test_unknown_option_prefix(self):
    # Test that the "ambiguous option" error is suppressed.
    options = PipelineOptions(['--profi', 'val1'])
    options.view_as(ProfilingOptions)

    # Test that valid errors are not suppressed.
    with self.assertRaises(SystemExit):
      # Invalid option choice.
      options = PipelineOptions(['--type_check_strictness', 'blahblah'])
      options.view_as(TypeOptions)
  def test_no_main_session(self):
    staging_dir = self.make_temp_dir()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = False
    self.update_options(options)

    self.assertEqual(
        [],
        dependency.stage_job_resources(options))
  def test_sdk_location_gcs(self):
    staging_dir = self.make_temp_dir()
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
    self.override_file_copy(sdk_location, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
  def test_requirements_file_not_present(self):
    staging_dir = self.make_temp_dir()
    with self.assertRaises(RuntimeError) as cm:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).requirements_file = 'nosuchfile'
      dependency.stage_job_resources(
          options, populate_requirements_cache=self.populate_requirements_cache)
    self.assertEqual(
        cm.exception.message,
        'The file %s cannot be found. It was specified in the '
        '--requirements_file command line option.' % 'nosuchfile')
def run_pipeline(argv, with_attributes, id_label, timestamp_attribute):
  """Build and run the pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic', required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  parser.add_argument(
      '--input_subscription', required=True,
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  p = beam.Pipeline(options=pipeline_options)

  # Read from PubSub into a PCollection.
  messages = p | beam.io.ReadFromPubSub(
      subscription=known_args.input_subscription,
      id_label=id_label,
      with_attributes=with_attributes,
      timestamp_attribute=timestamp_attribute)

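  # Annotate each message: append '-seen' to the payload, mark it as processed,
  # and echo the timestamp attribute back out when it is present.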
  def add_attribute(msg, timestamp=beam.DoFn.TimestampParam):
    msg.data += '-seen'
    msg.attributes['processed'] = 'IT'
    if timestamp_attribute in msg.attributes:
      msg.attributes[timestamp_attribute + '_out'] = timestamp.to_rfc3339()
    return msg

  def modify_data(data):
    return data + '-seen'

  if with_attributes:
    output = messages | 'add_attribute' >> beam.Map(add_attribute)
  else:
    output = messages | 'modify_data' >> beam.Map(modify_data)

  # Write to PubSub.
  _ = output | beam.io.WriteToPubSub(known_args.output_topic,
                                     id_label=id_label,
                                     with_attributes=with_attributes,
                                     timestamp_attribute=timestamp_attribute)

  result = p.run()
  result.wait_until_finish()
  def test_with_main_session(self):
    staging_dir = self.make_temp_dir()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = True
    self.update_options(options)

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
  def test_setup_file_not_present(self):
    staging_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = 'nosuchfile'

    with self.assertRaises(RuntimeError) as cm:
      dependency.stage_job_resources(options)
    self.assertEqual(
        cm.exception.message,
        'The file %s cannot be found. It was specified in the '
        '--setup_file command line option.' % 'nosuchfile')
  def test_with_extra_packages_missing_files(self):
    staging_dir = self.make_temp_dir()
    with self.assertRaises(RuntimeError) as cm:

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

      dependency.stage_job_resources(options)
    self.assertEqual(
        cm.exception.message,
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
  def test_sdk_location_gcs_source_file(self):
    staging_dir = self.make_temp_dir()
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    with mock.patch('apache_beam.runners.dataflow.internal.'
                    'dependency._dependency_file_copy'):
      self.assertEqual(
          [names.DATAFLOW_SDK_TARBALL_FILE],
          dependency.stage_job_resources(options))
Example #17
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
  def test_with_extra_packages(self):
    staging_dir = self.make_temp_dir()
    source_dir = self.make_temp_dir()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz2.tar'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'whl.whl'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        os.path.join(source_dir, 'xyz2.tar'),
        os.path.join(source_dir, 'whl.whl'),
        'gs://my-gcs-bucket/gcs.tar.gz']

    gcs_copied_files = []

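    # Fake copy helper: downloads from gs:// create placeholder local files,
    # uploads to gs:// are only logged, and local copies are performed for real.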
    def file_copy(from_path, to_path):
      if from_path.startswith('gs://'):
        gcs_copied_files.append(from_path)
        _, from_name = os.path.split(from_path)
        if os.path.isdir(to_path):
          to_path = os.path.join(to_path, from_name)
        self.create_temp_file(to_path, 'nothing')
        logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
      elif to_path.startswith('gs://'):
        logging.info('Faking file_copy(%s, %s)', from_path, to_path)
      else:
        shutil.copyfile(from_path, to_path)

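    # Monkey-patch the module-level copy helper so staging uses the fake above.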
    dependency._dependency_file_copy = file_copy

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl', 'gcs.tar.gz',
         dependency.EXTRA_PACKAGES_FILE],
        dependency.stage_job_resources(options))
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n',
                        'whl.whl\n', 'gcs.tar.gz\n'], f.readlines())
    self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
  def test_sdk_location_local_not_present(self):
    staging_dir = self.make_temp_dir()
    sdk_location = 'nosuchdir'
    with self.assertRaises(RuntimeError) as cm:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).sdk_location = sdk_location

      dependency.stage_job_resources(options)
    self.assertEqual(
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location,
        cm.exception.message)
  def test_sdk_location_gcs_wheel_file(self):
    staging_dir = self.make_temp_dir()
    sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
    sdk_location = 'gs://my-gcs-bucket/' + sdk_filename

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    with mock.patch('apache_beam.runners.dataflow.internal.'
                    'dependency._dependency_file_copy'):
      self.assertEqual(
          [sdk_filename],
          dependency.stage_job_resources(options))
Example #21
def run(argv=None):
  """Main entry point; defines and runs the user_score pipeline."""
  parser = argparse.ArgumentParser()

  # The default maps to two large Google Cloud Storage files (each ~12GB)
  # holding roughly two consecutive days' worth of data.
  parser.add_argument('--input',
                      type=str,
                      default='gs://apache-beam-samples/game/gaming_data*.csv',
                      help='Path to the data file(s) containing game data.')
  parser.add_argument('--output',
                      type=str,
                      required=True,
                      help='Path to the output file(s).')

  args, pipeline_args = parser.parse_known_args(argv)

  options = PipelineOptions(pipeline_args)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = True

  with beam.Pipeline(options=options) as p:
    def format_user_score_sums(user_score):
      (user, score) = user_score
      return 'user: %s, total_score: %s' % (user, score)

    (p  # pylint: disable=expression-not-assigned
     | 'ReadInputText' >> beam.io.ReadFromText(args.input)
     | 'UserScore' >> UserScore()
     | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
     | 'WriteUserScoreSums' >> beam.io.WriteToText(args.output))
Example #22
  def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
    data = b'data'
    attributes = {}
    publish_time_secs = 1520861821
    publish_time_nanos = 234567000
    publish_time = '2018-03-12T13:37:01.234567Z'
    ack_id = 'ack_id'
    pull_response = test_utils.create_pull_response([
        test_utils.PullResponseMessage(
            data, attributes, publish_time_secs, publish_time_nanos, ack_id)
    ])
    expected_elements = [
        TestWindowedValue(
            PubsubMessage(data, attributes),
            timestamp.Timestamp.from_rfc3339(publish_time),
            [window.GlobalWindow()]),
    ]
    mock_pubsub.return_value.pull.return_value = pull_response

    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    pcoll = (p
             | ReadFromPubSub(
                 'projects/fakeprj/topics/a_topic', None, None,
                 with_attributes=True, timestamp_attribute='nonexistent'))
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
    p.run()
    mock_pubsub.return_value.acknowledge.assert_has_calls([
        mock.call(mock.ANY, [ack_id])])
  def test_model_composite_triggers(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (TestStream()
                     .advance_watermark_to(10)
                     .add_elements(['a', 'a', 'a', 'b', 'b'])
                     .advance_watermark_to(70)
                     .add_elements([TimestampedValue('a', 10),
                                    TimestampedValue('a', 10),
                                    TimestampedValue('c', 10),
                                    TimestampedValue('c', 10)])
                     .advance_processing_time(600))
      pcollection = (p
                     | test_stream
                     | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

      counts = (
          # [START model_composite_triggers]
          pcollection | WindowInto(
              FixedWindows(1 * 60),
              trigger=AfterWatermark(
                  late=AfterProcessingTime(10 * 60)),
              accumulation_mode=AccumulationMode.DISCARDING)
          # [END model_composite_triggers]
          | 'group' >> beam.GroupByKey()
          | 'count' >> beam.Map(
              lambda word_ones: (word_ones[0], sum(word_ones[1]))))
      assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 2), ('c', 2)]))
  def test_with_extra_packages_invalid_file_name(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tgz'), 'nothing')
    with self.assertRaises(RuntimeError) as cm:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).extra_packages = [
          os.path.join(source_dir, 'abc.tgz')]
      dependency.stage_job_resources(options)
    self.assertEqual(
        cm.exception.message,
        'The --extra_package option expects a full path ending with ".tar" or '
        '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example #25
def run(pipeline_args, input_file, output_file):

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(input_file)

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(bytes))
            | 'count' >> beam.ExternalTransform(
                'pytest:beam:transforms:count', None, EXPANSION_SERVICE_ADDR))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(output_file)

  result = p.run()
  result.wait_until_finish()
def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table exported '
      'as json.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--sampling_threshold',
                      type=float,
                      default=0.1,
                      help='Fraction of entries used for session tracking')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:

    (p  # pylint: disable=expression-not-assigned
     | ReadFromText(known_args.input)
     | ComputeTopSessions(known_args.sampling_threshold)
     | WriteToText(known_args.output))
Example #27
def model_pipelines(argv):
  """A wordcount snippet as a simple pipeline example."""
  # [START model_pipelines]
  import re

  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          dest='input',
                          default='gs://dataflow-samples/shakespeare/kinglear'
                          '.txt',
                          help='Input file to process.')
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  with beam.Pipeline(options=pipeline_options) as p:

    (p
     | beam.io.ReadFromText(my_options.input)
     | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
     | beam.Map(lambda x: (x, 1))
     | beam.combiners.Count.PerKey()
     | beam.io.WriteToText(my_options.output))
Example #28
def model_pcollection(argv):
  """Creating a PCollection from data in local memory."""
  from apache_beam.options.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  with beam.Pipeline(options=pipeline_options) as p:

    lines = (p
             | beam.Create([
                 'To be, or not to be: that is the question: ',
                 'Whether \'tis nobler in the mind to suffer ',
                 'The slings and arrows of outrageous fortune, ',
                 'Or to take arms against a sea of troubles, ']))
    # [END model_pcollection]

    (lines
     | beam.io.WriteToText(my_options.output))
Example #29
def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""

  import re
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='output for the pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):

    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):

    def expand(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
              # Count the number of times each word occurs.
              | beam.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  with TestPipeline() as p:  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (p
     # Read the lines of the input text.
     | 'ReadLines' >> beam.io.ReadFromText(options.input)
     # Count the words.
     | CountWords()
     # Write the formatted word counts to output.
     | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    p.visit(SnippetUtils.RenameFiles(renames))
def run(argv=None):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_topic', required=True,
      help=('Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  parser.add_argument(
      '--output_topic', required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)
  options = PipelineOptions(pipeline_args)
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:

    # Read from PubSub into a PCollection.
    lines = p | beam.io.ReadStringsFromPubSub(known_args.input_topic)

    # Count the occurrences of each word.
    transformed = (lines
                   # Use a pre-defined function that imports the re package.
                   | 'Split' >> (
                       beam.FlatMap(split_fn).with_output_types(unicode))
                   | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                   | beam.WindowInto(window.FixedWindows(15, 0))
                   | 'Group' >> beam.GroupByKey()
                   | 'Count' >> beam.Map(lambda (word, ones): (word, sum(ones)))
                   | 'Format' >> beam.Map(lambda tup: '%s: %d' % tup))

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    transformed | beam.io.WriteStringsToPubSub(known_args.output_topic)
Example #31
class NexmarkLauncher(object):
    def __init__(self):
        self.parse_args()
        self.uuid = str(uuid.uuid4())
        self.topic_name = self.args.topic_name + self.uuid
        self.subscription_name = self.args.subscription_name + self.uuid

    def parse_args(self):
        parser = argparse.ArgumentParser()

        parser.add_argument('--query',
                            '-q',
                            type=int,
                            action='append',
                            required=True,
                            choices=[0, 1, 2],
                            help='Query to run')

        parser.add_argument('--subscription_name',
                            type=str,
                            help='Pub/Sub subscription to read from')

        parser.add_argument('--topic_name',
                            type=str,
                            help='Pub/Sub topic to read from')

        parser.add_argument(
            '--loglevel',
            choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
            default='INFO',
            help='Set the logging level')
        parser.add_argument(
            '--input',
            type=str,
            required=True,
            help='Path to the data file containing nexmark events.')

        self.args, self.pipeline_args = parser.parse_known_args()
        logging.basicConfig(level=getattr(logging, self.args.loglevel, None),
                            format='(%(threadName)-10s) %(message)s')

        self.pipeline_options = PipelineOptions(self.pipeline_args)
        logging.debug('args, pipeline_args: %s, %s', self.args,
                      self.pipeline_args)

        # Usage with Dataflow requires a project to be supplied.
        self.project = self.pipeline_options.view_as(
            GoogleCloudOptions).project
        if self.project is None:
            parser.print_usage()
            print(sys.argv[0] + ': error: argument --project is required')
            sys.exit(1)

        # Pub/Sub is currently available for use only in streaming pipelines.
        self.streaming = self.pipeline_options.view_as(
            StandardOptions).streaming
        if self.streaming is None:
            parser.print_usage()
            print(sys.argv[0] + ': error: argument --streaming is required')
            sys.exit(1)

        # wait_until_finish ensures that the streaming job is canceled.
        self.wait_until_finish_duration = (self.pipeline_options.view_as(
            TestOptions).wait_until_finish_duration)
        if self.wait_until_finish_duration is None:
            parser.print_usage()
            print(sys.argv[0] + ': error: argument --wait_until_finish_duration is required')  # pylint: disable=line-too-long
            sys.exit(1)

        # We use the save_main_session option because one or more DoFn's in this
        # workflow rely on global context (e.g., a module imported at module level).
        self.pipeline_options.view_as(SetupOptions).save_main_session = True

    def generate_events(self):
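        # Recreate the Pub/Sub topic and subscription so each run starts clean.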
        publish_client = pubsub.Client(project=self.project)
        topic = publish_client.topic(self.topic_name)
        if topic.exists():
            topic.delete()
        topic.create()
        sub = topic.subscription(self.subscription_name)
        if sub.exists():
            sub.delete()
        sub.create()

        logging.info('Generating auction events to topic %s', topic.name)

        if self.args.input.startswith('gs://'):
            from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
            fs = GCSFileSystem(self.pipeline_options)
            with fs.open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)
        else:
            with open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)

        logging.info('Finished event generation.')

        # Read from PubSub into a PCollection.
        if self.args.subscription_name:
            raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription=sub.full_name)
        else:
            raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic=topic.full_name)

        return raw_events

    def run_query(self, query, query_errors):
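        # Build a fresh pipeline for this query, feed it the generated Pub/Sub
        # events, run it, and wait for completion (cancelling on Dataflow once
        # wait_until_finish_duration has elapsed).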
        try:
            self.pipeline = beam.Pipeline(options=self.pipeline_options)
            raw_events = self.generate_events()
            query.load(raw_events)
            result = self.pipeline.run()
            job_duration = (self.pipeline_options.view_as(
                TestOptions).wait_until_finish_duration)
            if self.pipeline_options.view_as(StandardOptions).runner == 'DataflowRunner':  # pylint: disable=line-too-long
                result.wait_until_finish(duration=job_duration)
                result.cancel()
            else:
                result.wait_until_finish()
        except Exception as exc:
            query_errors.append(str(exc))
            raise

    def cleanup(self):
        publish_client = pubsub.Client(project=self.project)
        topic = publish_client.topic(self.topic_name)
        if topic.exists():
            topic.delete()
        sub = topic.subscription(self.subscription_name)
        if sub.exists():
            sub.delete()

    def run(self):
        queries = {
            0: query0,
            # TODO(mariagh): Add more queries.
        }

        query_errors = []
        for i in self.args.query:
            self.parse_args()
            logging.info('Running query %d', i)

            # The DirectRunner is the default runner, and it needs
            # special handling to cancel streaming jobs.
            launch_from_direct_runner = self.pipeline_options.view_as(
                StandardOptions).runner in [None, 'DirectRunner']

            if launch_from_direct_runner:
                command = Command(self.run_query,
                                  args=[queries[i], query_errors])
                query_duration = self.pipeline_options.view_as(TestOptions).wait_until_finish_duration  # pylint: disable=line-too-long
                command.run(timeout=query_duration // 1000)
            else:
                try:
                    self.run_query(queries[i], query_errors=None)
                except Exception as exc:
                    query_errors.append(str(exc))

        if query_errors:
            logging.error('Query failed with %s', ', '.join(query_errors))
        else:
            logging.info('Queries run: %s', self.args.query)
Example #32
class Pipeline(object):
  """A pipeline object that manages a DAG of PValues and their PTransforms.

  Conceptually the PValues are the DAG's nodes and the PTransforms computing
  the PValues are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If the same transform instance needs to be applied again, a clone should be
  created with a new label (e.g., transform.clone('new label')).
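
  In practice this is done by applying with explicit labels, e.g.
  pvalue | 'First' >> transform and pvalue | 'Second' >> transform apply the
  same transform under two distinct labels.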
  """

  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if either the runner or options argument is not of the
      expected type.
    """
    if options is not None:
      if isinstance(options, PipelineOptions):
        self._options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r' % options)
    elif argv is not None:
      if isinstance(argv, list):
        self._options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. '
            'Received : %r' % argv)
    else:
      self._options = PipelineOptions([])

    if runner is None:
      runner = self._options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info(('Missing pipeline option (runner). Executing pipeline '
                      'using the default runner: %s.'), runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')

    # Validate pipeline options
    errors = PipelineOptionsValidator(self._options, runner).validate()
    if errors:
      raise ValueError(
          'Pipeline has validation errors: \n' + '\n'.join(errors))

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()

  @property
  @deprecated(since='First stable release',
              extra_message='References to <pipeline>.options'
              ' will not be supported')
  def options(self):
    return self._options

  def _current_transform(self):
    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def run(self, test_runner_api=True):
    """Runs the pipeline. Returns whatever our runner returns after running."""

    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
      return Pipeline.from_runner_api(
          self.to_runner_api(), self.runner, self._options).run(False)

    if self._options.view_as(SetupOptions).save_main_session:
      # If this option is chosen, verify we can pickle the main session early.
      tmpdir = tempfile.mkdtemp()
      try:
        pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
      finally:
        shutil.rmtree(tmpdir)
    return self.runner.run(self)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_val, exc_tb):
    if not exc_type:
      self.run().wait_until_finish()

  def visit(self, visitor):
    """Visits depth-first every node of a pipeline's DAG.

    Args:
      visitor: PipelineVisitor object whose callbacks will be called for each
        node visited. See PipelineVisitor comments.

    Raises:
      TypeError: if node is specified and is not a PValue.
      pipeline.PipelineError: if node is specified and does not belong to this
        pipeline instance.
    """

    visited = set()
    self._root_transform().visit(visitor, self, visited)

  def apply(self, transform, pvalueish=None, label=None):
    """Applies a custom transform using the pvalueish specified.

    Args:
      transform: the PTransform to apply.
      pvalueish: the input for the PTransform (typically a PCollection).
      label: label of the PTransform.

    Raises:
      TypeError: if the transform object extracted from the argument list is
        not a PTransform.
      RuntimeError: if the transform object was already applied to this pipeline
        and needs to be cloned in order to apply again.
    """
    if isinstance(transform, ptransform._NamedPTransform):
      return self.apply(transform.transform, pvalueish,
                        label or transform.label)

    if not isinstance(transform, ptransform.PTransform):
      raise TypeError("Expected a PTransform object, got %s" % transform)

    if label:
      # Fix self.label as it is inspected by some PTransform operations
      # (e.g. to produce error messages for type hint violations).
      try:
        old_label, transform.label = transform.label, label
        return self.apply(transform, pvalueish)
      finally:
        transform.label = old_label

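    # Compose the full label by joining the enclosing transform's full label
    # with this transform's label, e.g. 'OuterComposite/InnerTransform'.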
    full_label = '/'.join([self._current_transform().full_label,
                           label or transform.label]).lstrip('/')
    if full_label in self.applied_labels:
      raise RuntimeError(
          'Transform "%s" does not have a stable unique label. '
          'This will prevent updating of pipelines. '
          'To apply a transform with a specified label write '
          'pvalue | "label" >> transform'
          % full_label)
    self.applied_labels.add(full_label)

    pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
    try:
      inputs = tuple(inputs)
      for leaf_input in inputs:
        if not isinstance(leaf_input, pvalue.PValue):
          raise TypeError
    except TypeError:
      raise NotImplementedError(
          'Unable to extract PValue inputs from %s; either %s does not accept '
          'inputs of this format, or it does not properly override '
          '_extract_input_pvalues' % (pvalueish, transform))

    current = AppliedPTransform(
        self._current_transform(), transform, full_label, inputs)
    self._current_transform().add_part(current)
    self.transforms_stack.append(current)

    type_options = self._options.view_as(TypeOptions)
    if type_options.pipeline_type_check:
      transform.type_check_inputs(pvalueish)

    pvalueish_result = self.runner.apply(transform, pvalueish)

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_outputs(pvalueish_result)

    for result in ptransform.GetPValues().visit(pvalueish_result):
      assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

      # Make sure we set the producer only for a leaf node in the transform DAG.
      # This way we preserve the last transform of a composite transform as
      # being the real producer of the result.
      if result.producer is None:
        result.producer = current
      # TODO(robertwb): Multi-input, multi-output inference.
      # TODO(robertwb): Ideally we'd do intersection here.
      if (type_options is not None and type_options.pipeline_type_check
          and isinstance(result, pvalue.PCollection)
          and not result.element_type):
        input_element_type = (
            inputs[0].element_type
            if len(inputs) == 1
            else typehints.Any)
        type_hints = transform.get_type_hints()
        declared_output_type = type_hints.simple_output_type(transform.label)
        if declared_output_type:
          input_types = type_hints.input_types
          if input_types and input_types[0]:
            declared_input_type = input_types[0][0]
            result.element_type = typehints.bind_type_variables(
                declared_output_type,
                typehints.match_type_variables(declared_input_type,
                                               input_element_type))
          else:
            result.element_type = declared_output_type
        else:
          result.element_type = transform.infer_output_type(input_element_type)

      assert isinstance(result.producer.inputs, tuple)
      current.add_output(result)

    if (type_options is not None and
        type_options.type_check_strictness == 'ALL_REQUIRED' and
        transform.get_type_hints().output_types is None):
      ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label)
      raise TypeCheckError('Pipeline type checking is enabled, however no '
                           'output type-hint was found for the '
                           'PTransform %s' % ptransform_name)

    current.update_input_refcounts()
    self.transforms_stack.pop()
    return pvalueish_result

  def _verify_runner_api_compatible(self):
    class Visitor(PipelineVisitor):  # pylint: disable=used-before-assignment
      ok = True  # Really a nonlocal.

      def visit_transform(self, transform_node):
        if transform_node.side_inputs:
          # No side inputs (yet).
          Visitor.ok = False
        try:
          # Transforms must be picklable.
          pickler.loads(pickler.dumps(transform_node.transform,
                                      enable_trace=False),
                        enable_trace=False)
        except Exception:
          Visitor.ok = False

      def visit_value(self, value, _):
        if isinstance(value, pvalue.PDone):
          Visitor.ok = False

    self.visit(Visitor())
    return Visitor.ok

  def to_runner_api(self):
    from apache_beam.runners import pipeline_context
    from apache_beam.runners.api import beam_runner_api_pb2
    context = pipeline_context.PipelineContext()
    # Mutates context; placing inline would force dependence on
    # argument evaluation order.
    root_transform_id = context.transforms.get_id(self._root_transform())
    proto = beam_runner_api_pb2.Pipeline(
        root_transform_ids=[root_transform_id],
        components=context.to_runner_api())
    return proto

  @staticmethod
  def from_runner_api(proto, runner, options):
    p = Pipeline(runner=runner, options=options)
    from apache_beam.runners import pipeline_context
    context = pipeline_context.PipelineContext(proto.components)
    root_transform_id, = proto.root_transform_ids
    p.transforms_stack = [
        context.transforms.get_by_id(root_transform_id)]
    # TODO(robertwb): These are only needed to continue construction. Omit?
    p.applied_labels = set([
        t.unique_name for t in proto.components.transforms.values()])
    for id in proto.components.pcollections:
      context.pcollections.get_by_id(id).pipeline = p
    return p
Example #33
def preprocess():
    """
    Arguments:
        -RUNNER: "DirectRunner" or "DataflowRunner". Specfy to run the pipeline locally or on Google Cloud respectively.
    Side-effects:
        -Creates and executes dataflow pipeline.
        See https://beam.apache.org/documentation/programming-guide/#creating-a-pipeline
    """
    job_name = ('stackoverflow-raphael-' +
                datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
    project = os.environ['PROJECT_ID']
    region = os.environ['REGION']
    output_dir = "gs://{0}/".format(os.environ['BUCKET_NAME'])

    # Pipeline options.
    options = PipelineOptions()

    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = project
    google_cloud_options.region = region
    google_cloud_options.job_name = job_name
    google_cloud_options.staging_location = os.path.join(
        output_dir, 'beam', 'stage')
    google_cloud_options.temp_location = os.path.join(output_dir, 'beam',
                                                      'temp')

    worker_options = options.view_as(WorkerOptions)
    worker_options.max_num_workers = 100
    worker_options.zone = 'europe-west6-b'
    worker_options.use_public_ips = False
    worker_options.network = 'default'
    # worker_options.disk_size_gb = 50

    #options.view_as(StandardOptions).runner = RUNNER
    options.view_as(
        SetupOptions).setup_file = os.environ['DIR_PROJ'] + '/setup.py'

    # Instantiate the Pipeline object using PipelineOptions.
    print('Launching Dataflow job {} ... hang on'.format(job_name))

    # Output table reference.
    new_table = beam.io.gcp.internal.clients.bigquery.TableReference(
        projectId='nlp-text-classification',
        datasetId='stackoverflow',
        tableId='posts_preprocessed')

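    # Build the preprocessing pipeline: read posts from BigQuery, clean the
    # text, then write the results to both BigQuery and GCS as CSV.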
    with beam.Pipeline(options=options) as p:
        post_table = p | "Read Posts from BigQuery" >> beam.io.Read(
            beam.io.BigQuerySource(query=data_query(), use_standard_sql=True))
        # tag_table = p | "Read Tags from BigQuery" >> beam.io.Read(
        #     beam.io.BigQuerySource(query=tag_query(), use_standard_sql=True))
        clean_text = post_table | "Preprocessing" >> beam.ParDo(pp.NLP())
        clean_text | "Write Posts to BigQuery" >> beam.io.WriteToBigQuery(
            new_table,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
        str_values = clean_text | "Post Records to Text" >> beam.ParDo(
            pp.CSV())

        str_values | "Write Posts to GCS" >> beam.io.WriteToText(
            output_dir + 'results/posts',
            file_name_suffix='.csv',
            header='id, title, text_body, code_body, tags')

    if options.view_as(StandardOptions).runner == 'DataflowRunner':
        print('DataflowRunner')
        p.run()
    else:
        print('Default: DirectRunner')
        result = p.run()
        result.wait_until_finish()
    print('Done')
class UserOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # These parameters are used when the template is executed; they should
        # not be supplied when the template is created.
        parser.add_value_provider_argument("--url_raw", type=str)
        parser.add_value_provider_argument("--url_trn", type=str)
        # These parameters are used when the template is created and hold
        # values that remain static inside it.
        parser.add_value_provider_argument("--rename_columns", type=str)
        parser.add_value_provider_argument("--schema_source", type=str)


pipeline_options = PipelineOptions()

with beam.Pipeline(options=pipeline_options) as p:
    print("Start Pipeline")
    user_options = pipeline_options.view_as(UserOptions)

    # This function renames the "columns" and returns the renamed record.
    def reColumns(row, rename_cols=None):
        for col in rename_cols:
            dict_rename = {
                value: row[key]
                for (key, value) in ast.literal_eval(col).items()
            }
        return dict_rename

    # This function takes the rename_columns parameter and builds a dictionary
    # with the column-name pairs.
    def mapRenameCols(row,
                      rename_cols=ast.literal_eval(
                          user_options.rename_columns.get())):
        cols_before = list(row)
Example #35
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the hourly_team_score pipeline."""
    parser = argparse.ArgumentParser()

    parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
    parser.add_argument('--output_team',
                        type=str,
                        required=True,
                        help='Pub/Sub topic to write team score')
    parser.add_argument('--output_user',
                        type=str,
                        required=True,
                        help='Pub/Sub topic to write user score.')
    parser.add_argument('--subscription',
                        type=str,
                        help='Pub/Sub subscription to read from')
    parser.add_argument('--team_window_duration',
                        type=int,
                        default=3,
                        help='Numeric value of fixed window duration for team '
                        'analysis, in minutes')
    parser.add_argument(
        '--allowed_lateness',
        type=int,
        default=6,
        help='Numeric value of allowed data lateness, in minutes')

    args, pipeline_args = parser.parse_known_args(argv)

    if args.topic is None and args.subscription is None:
        parser.print_usage()
        print(sys.argv[0] +
              ': error: one of --topic or --subscription is required')
        sys.exit(1)

    options = PipelineOptions(pipeline_args)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = save_main_session

    # Enforce that this pipeline is always run in streaming mode
    options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=options) as p:
        # Read game events from Pub/Sub using custom timestamps, which are extracted
        # from the pubsub data elements, and parse the data.

        # Read from PubSub into a PCollection.
        if args.subscription:
            scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription=args.subscription)
        else:
            scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic=args.topic)

        events = (scores
                  | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
                  | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
                  | 'AddEventTimestamps' >>
                  beam.Map(lambda elem: beam.window.TimestampedValue(
                      elem, elem['timestamp'])))

        def format_team_score_sums(team_score):
            team = team_score['team']
            score = team_score['total_score']
            print(team_score)
            return '%s: %d' % (team, score)

        # Get team scores and write the results to the topic output_team
        (  # pylint: disable=expression-not-assigned
            events
            | 'CalculateTeamScores' >> CalculateTeamScores(
                args.team_window_duration, args.allowed_lateness)
            | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
            | 'FormatTeamScoreSums' >> beam.Map(format_team_score_sums)
            | 'EncodeTeamScoreSums' >>
            beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)
            | 'WriteTeamScoreSums' >> beam.io.WriteToPubSub(args.output_team))

        def format_user_score_sums(user_score):
            (user, score) = user_score
            print(user_score)
            return '%s: %d' % (user, score)

        # Get user scores and write the results to the topic output_user
        (  # pylint: disable=expression-not-assigned
            events
            |
            'CalculateUserScores' >> CalculateUserScores(args.allowed_lateness)
            | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
            | 'EncodeUserScoreSums' >>
            beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)
            | 'WriteUserScoreSums' >> beam.io.WriteToPubSub(args.output_user))
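
# A typical entry point for the run() function above (a hedged sketch; the
# original snippet is truncated before any __main__ guard):
#
#   if __name__ == '__main__':
#       logging.getLogger().setLevel(logging.INFO)
#       run()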
Exemple #36
0
                        required=True,
                        help="Month for input data")
    parser.add_argument("--input.day",
                        dest="input_day",
                        required=True,
                        help="Day for input data")
    parser.add_argument("--input.hour",
                        dest="input_hour",
                        required=True,
                        help="Hour for input data")

    parser.add_argument("--bq.project",
                        dest="bq_project",
                        required=True,
                        help="Project Name for Bigquery")
    parser.add_argument("--bq.dataset",
                        dest="bq_dataset",
                        required=True,
                        help="Dataset Name for Bigquery")
    parser.add_argument("--bq.table",
                        dest="bq_table",
                        required=True,
                        help="Table Name for Bigquery")

    app_args, pipeline_args = parser.parse_known_args()
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    logging.getLogger().setLevel(logging.INFO)

    main(pipeline_options, app_args)
def run(argv=None):
  """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline
  that transforms bitcoin transactions"""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://beam-avro-test/bitcoin/txns/*',
      help='Input file(s) to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  parser.add_argument(
      '--compress',
      dest='compress',
      required=False,
      action='store_true',
      help='When set, compress the output data')
  parser.add_argument(
      '--fastavro',
      dest='use_fastavro',
      required=False,
      action='store_true',
      help='When set, use fastavro for Avro I/O')

  opts, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the avro file[pattern] into a PCollection.
  records = \
      p | 'read' >> ReadFromAvro(opts.input)

  measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn())

  # pylint: disable=expression-not-assigned
  measured | 'write' >> \
      WriteToAvro(
          opts.output,
          schema=SCHEMA,
          codec=('deflate' if opts.compress else 'null'),
      )

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    metrics = result.metrics().query()

    for counter in metrics['counters']:
      logging.info("Counter: %s", counter)

    for dist in metrics['distributions']:
      logging.info("Distribution: %s", dist)
Exemple #38
0

class FilteringDoFn(beam.DoFn):
    def __init__(self, filter_val):
        self.filter_val = filter_val

    def process(self, element):
        if element['gender'] == self.filter_val.get():
            yield element
        else:
            return  # Return nothing


logging.getLogger().setLevel(logging.INFO)
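
# DataflowExample (used below via pipeline_options.view_as) is a custom
# PipelineOptions subclass defined elsewhere in this example. A plausible sketch
# (an assumption, not original source) that would make filter_val a ValueProvider:
#
#   class DataflowExample(PipelineOptions):
#       @classmethod
#       def _add_argparse_args(cls, parser):
#           parser.add_value_provider_argument('--filter_val', type=str)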

pipeline_options = PipelineOptions()
# Create pipeline.
with beam.Pipeline(options=pipeline_options) as p:

    def print_row(element):
        logging.info("the count is ", element)

    my_options = pipeline_options.view_as(DataflowExample)
    select_query = (p | 'QueryTableStdSQL' >> beam.io.Read(beam.io.BigQuerySource(
        query='SELECT gender FROM ' \
              '`startgcp-268623.lake.usa_names`',
        use_standard_sql=True)))
    (select_query
     | beam.ParDo(FilteringDoFn(my_options.filter_val))
     | beam.combiners.Count.Globally()
     | 'Print result' >> beam.Map(print_row))
    # The with-block runs the pipeline and waits for completion on exit, so an
    # explicit p.run().wait_until_finish() here would run the pipeline twice.
Exemple #39
0
from __future__ import absolute_import
import logging
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions


output_table = 'automatic-asset-253215:CORE.IM_CUSTOMER_ATTRIBUTE_REF'
dataflow_options = ['--project=automatic-asset-253215',
                    '--job_name=xfm-vustgclic-custsorgext-to-imcustomerattributeref',
                    '--temp_location=gs://raw_source_files/Customers/temp',
                    '--staging_location=gs://raw_source_files/Customers/temp/stg']
options = PipelineOptions(dataflow_options)
gcloud_options = options.view_as(GoogleCloudOptions)
options.view_as(StandardOptions).runner = 'DataflowRunner'


class LeftJoin(beam.PTransform):
    """This PTransform performs a left join given source_pipeline_name, source_data,
     join_pipeline_name, join_data, common_key constructors"""

    def __init__(self, src_pipeline, CustOrg_ID, join_pipeline, IMCust_ID, common_key):
        self.join_pipeline = join_pipeline
        self.CustOrg_ID = CustOrg_ID
        self.src_pipeline = src_pipeline
        self.IMCust_ID = IMCust_ID
        self.common_key = common_key

    def expand(self, pcolls):
        def _format_as_common_key_tuple(data_dict, common_key):
Exemple #40
0
class Pipeline(object):
    """A pipeline object that manages a DAG of
  :class:`~apache_beam.pvalue.PValue` s and their
  :class:`~apache_beam.transforms.ptransform.PTransform` s.

  Conceptually the :class:`~apache_beam.pvalue.PValue` s are the DAG's nodes and
  the :class:`~apache_beam.transforms.ptransform.PTransform` s computing
  the :class:`~apache_beam.pvalue.PValue` s are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If the same transform instance needs to be applied more than once, the right
  shift operator should be used to designate new names
  (e.g. ``input | "label" >> my_transform``).
  """
    def __init__(self, runner=None, options=None, argv=None):
        """Initialize a pipeline object.

    Args:
      runner (~apache_beam.runners.runner.PipelineRunner): An object of
        type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
        used to execute the pipeline. For registered runners, the runner name
        can be specified, otherwise a runner object must be supplied.
      options (~apache_beam.options.pipeline_options.PipelineOptions):
        A configured
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
        containing arguments that should be used for running the Beam job.
      argv (List[str]): a list of arguments (such as :data:`sys.argv`)
        to be used for building a
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
        This will only be used if argument **options** is :data:`None`.

    Raises:
      ~exceptions.ValueError: if either the runner or options argument is not
        of the expected type.
    """
        if options is not None:
            if isinstance(options, PipelineOptions):
                self._options = options
            else:
                raise ValueError(
                    'Parameter options, if specified, must be of type PipelineOptions. '
                    'Received : %r' % options)
        elif argv is not None:
            if isinstance(argv, list):
                self._options = PipelineOptions(argv)
            else:
                raise ValueError(
                    'Parameter argv, if specified, must be a list. Received : %r'
                    % argv)
        else:
            self._options = PipelineOptions([])

        FileSystems.set_options(self._options)

        if runner is None:
            runner = self._options.view_as(StandardOptions).runner
            if runner is None:
                runner = StandardOptions.DEFAULT_RUNNER
                logging.info(
                    ('Missing pipeline option (runner). Executing pipeline '
                     'using the default runner: %s.'), runner)

        if isinstance(runner, str):
            runner = create_runner(runner)
        elif not isinstance(runner, PipelineRunner):
            raise TypeError('Runner %s is not a PipelineRunner object or the '
                            'name of a registered runner.' % runner)

        # Validate pipeline options
        errors = PipelineOptionsValidator(self._options, runner).validate()
        if errors:
            raise ValueError('Pipeline has validation errors: \n' +
                             '\n'.join(errors))

        # set default experiments for portable runners
        # (needs to occur prior to pipeline construction)
        if runner.is_fnapi_compatible():
            experiments = (self._options.view_as(DebugOptions).experiments
                           or [])
            if 'beam_fn_api' not in experiments:
                experiments.append('beam_fn_api')
                self._options.view_as(DebugOptions).experiments = experiments

        # Default runner to be used.
        self.runner = runner
        # Stack of transforms generated by nested apply() calls. The stack will
        # contain a root node as an enclosing (parent) node for top transforms.
        self.transforms_stack = [AppliedPTransform(None, None, '', None)]
        # Set of transform labels (full labels) applied to the pipeline.
        # If a transform is applied and the full label is already in the set
        # then the transform will have to be cloned with a new label.
        self.applied_labels = set()

    @property
    @deprecated(since='First stable release',
                extra_message='References to <pipeline>.options'
                ' will not be supported')
    def options(self):
        return self._options

    def _current_transform(self):
        """Returns the transform currently on the top of the stack."""
        return self.transforms_stack[-1]

    def _root_transform(self):
        """Returns the root transform of the transform stack."""
        return self.transforms_stack[0]

    def _remove_labels_recursively(self, applied_transform):
        for part in applied_transform.parts:
            if part.full_label in self.applied_labels:
                self.applied_labels.remove(part.full_label)
                self._remove_labels_recursively(part)

    def _replace(self, override):

        assert isinstance(override, PTransformOverride)

        output_map = {}
        output_replacements = {}
        input_replacements = {}
        side_input_replacements = {}

        class TransformUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
            """"A visitor that replaces the matching PTransforms."""
            def __init__(self, pipeline):
                self.pipeline = pipeline

            def _replace_if_needed(self, original_transform_node):
                if override.matches(original_transform_node):
                    assert isinstance(original_transform_node,
                                      AppliedPTransform)
                    replacement_transform = override.get_replacement_transform(
                        original_transform_node.transform)
                    if replacement_transform is original_transform_node.transform:
                        return

                    replacement_transform_node = AppliedPTransform(
                        original_transform_node.parent, replacement_transform,
                        original_transform_node.full_label,
                        original_transform_node.inputs)

                    # Transform execution could depend on order in which nodes are
                    # considered. Hence we insert the replacement transform node to same
                    # index as the original transform node. Note that this operation
                    # removes the original transform node.
                    if original_transform_node.parent:
                        assert isinstance(original_transform_node.parent,
                                          AppliedPTransform)
                        parent_parts = original_transform_node.parent.parts
                        parent_parts[parent_parts.index(
                            original_transform_node)] = (
                                replacement_transform_node)
                    else:
                        # Original transform has to be a root.
                        roots = self.pipeline.transforms_stack[0].parts
                        assert original_transform_node in roots
                        roots[roots.index(original_transform_node)] = (
                            replacement_transform_node)

                    inputs = replacement_transform_node.inputs
                    # TODO:  Support replacing PTransforms with multiple inputs.
                    if len(inputs) > 1:
                        raise NotImplementedError(
                            'PTransform overriding is only supported for PTransforms that '
                            'have a single input. Tried to replace input of '
                            'AppliedPTransform %r that has %d inputs' %
                            (original_transform_node, len(inputs)))
                    elif len(inputs) == 1:
                        input_node = inputs[0]
                    elif len(inputs) == 0:
                        input_node = pvalue.PBegin(self)

                    # We have to add the new AppliedTransform to the stack before expand()
                    # and pop it out later to make sure that parts get added correctly.
                    self.pipeline.transforms_stack.append(
                        replacement_transform_node)

                    # Keeping the same label for the replaced node but recursively
                    # removing labels of child transforms of original transform since they
                    # will be replaced during the expand below. This is needed in case
                    # the replacement contains children that have labels that conflicts
                    # with labels of the children of the original.
                    self.pipeline._remove_labels_recursively(
                        original_transform_node)

                    new_output = replacement_transform.expand(input_node)

                    new_output.element_type = None
                    self.pipeline._infer_result_type(replacement_transform,
                                                     inputs, new_output)

                    replacement_transform_node.add_output(new_output)
                    if not new_output.producer:
                        new_output.producer = replacement_transform_node

                    # We only support replacing transforms with a single output with
                    # another transform that produces a single output.
                    # TODO: Support replacing PTransforms with multiple outputs.
                    if (len(original_transform_node.outputs) > 1
                            or not isinstance(
                                original_transform_node.outputs[None],
                                (PCollection, PDone))
                            or not isinstance(new_output,
                                              (PCollection, PDone))):
                        raise NotImplementedError(
                            'PTransform overriding is only supported for PTransforms that '
                            'have a single output. Tried to replace output of '
                            'AppliedPTransform %r with %r.' %
                            (original_transform_node, new_output))

                    # Recording updated outputs. This cannot be done in the same visitor
                    # since if we dynamically update output type here, we'll run into
                    # errors when visiting child nodes.
                    output_map[
                        original_transform_node.outputs[None]] = new_output

                    self.pipeline.transforms_stack.pop()

            def enter_composite_transform(self, transform_node):
                self._replace_if_needed(transform_node)

            def visit_transform(self, transform_node):
                self._replace_if_needed(transform_node)

        self.visit(TransformUpdater(self))

        # Adjusting inputs and outputs
        class InputOutputUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
            """"A visitor that records input and output values to be replaced.

      Input and output values that should be updated are recorded in maps
      input_replacements and output_replacements respectively.

      We cannot update input and output values while visiting since that results
      in validation errors.
      """
            def __init__(self, pipeline):
                self.pipeline = pipeline

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                if (None in transform_node.outputs
                        and transform_node.outputs[None] in output_map):
                    output_replacements[transform_node] = (
                        output_map[transform_node.outputs[None]])

                replace_input = False
                for input in transform_node.inputs:
                    if input in output_map:
                        replace_input = True
                        break

                replace_side_inputs = False
                for side_input in transform_node.side_inputs:
                    if side_input.pvalue in output_map:
                        replace_side_inputs = True
                        break

                if replace_input:
                    new_input = [
                        input if input not in output_map else output_map[input]
                        for input in transform_node.inputs
                    ]
                    input_replacements[transform_node] = new_input

                if replace_side_inputs:
                    new_side_inputs = []
                    for side_input in transform_node.side_inputs:
                        if side_input.pvalue in output_map:
                            side_input.pvalue = output_map[side_input.pvalue]
                        new_side_inputs.append(side_input)
                    side_input_replacements[transform_node] = new_side_inputs

        self.visit(InputOutputUpdater(self))

        for transform in output_replacements:
            transform.replace_output(output_replacements[transform])

        for transform in input_replacements:
            transform.inputs = input_replacements[transform]

        for transform in side_input_replacements:
            transform.side_inputs = side_input_replacements[transform]

    def _check_replacement(self, override):
        class ReplacementValidator(PipelineVisitor):
            def visit_transform(self, transform_node):
                if override.matches(transform_node):
                    raise RuntimeError(
                        'Transform node %r was not replaced as expected.' %
                        transform_node)

        self.visit(ReplacementValidator())

    def replace_all(self, replacements):
        """ Dynamically replaces PTransforms in the currently populated hierarchy.

    Currently this only works for replacements where input and output types
    are exactly the same.

    TODO: Update this to also work for transform overrides where input and
    output types are different.

    Args:
      replacements (List[~apache_beam.pipeline.PTransformOverride]): a list of
        :class:`~apache_beam.pipeline.PTransformOverride` objects.
    """
        for override in replacements:
            assert isinstance(override, PTransformOverride)
            self._replace(override)

        # Checking if the PTransforms have been successfully replaced. This will
        # result in a failure if a PTransform that was replaced in a given override
        # gets re-added in a subsequent override. This is not allowed and ordering
        # of PTransformOverride objects in 'replacements' is important.
        for override in replacements:
            self._check_replacement(override)

    def run(self, test_runner_api=True):
        """Runs the pipeline. Returns whatever our runner returns after running."""

        # When possible, invoke a round trip through the runner API.
        if test_runner_api and self._verify_runner_api_compatible():
            return Pipeline.from_runner_api(
                self.to_runner_api(use_fake_coders=True), self.runner,
                self._options).run(False)

        if self._options.view_as(TypeOptions).runtime_type_check:
            from apache_beam.typehints import typecheck
            self.visit(typecheck.TypeCheckVisitor())

        if self._options.view_as(SetupOptions).save_main_session:
            # If this option is chosen, verify we can pickle the main session early.
            tmpdir = tempfile.mkdtemp()
            try:
                pickler.dump_session(
                    os.path.join(tmpdir, 'main_session.pickle'))
            finally:
                shutil.rmtree(tmpdir)
        return self.runner.run_pipeline(self, self._options)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not exc_type:
            self.run().wait_until_finish()
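
    # Context-manager sketch (illustrative, not part of the original source):
    #   with Pipeline(options=PipelineOptions()) as p:
    #       p | beam.Create([1, 2, 3]) | 'Print' >> beam.Map(print)
    #   # run() and wait_until_finish() are invoked automatically on normal exit.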

    def visit(self, visitor):
        """Visits depth-first every node of a pipeline's DAG.

    Runner-internal implementation detail; no backwards-compatibility guarantees

    Args:
      visitor (~apache_beam.pipeline.PipelineVisitor):
        :class:`~apache_beam.pipeline.PipelineVisitor` object whose callbacks
        will be called for each node visited. See
        :class:`~apache_beam.pipeline.PipelineVisitor` comments.

    Raises:
      ~exceptions.TypeError: if node is specified and is not a
        :class:`~apache_beam.pvalue.PValue`.
      ~apache_beam.error.PipelineError: if node is specified and does not
        belong to this pipeline instance.
    """

        visited = set()
        self._root_transform().visit(visitor, self, visited)

    def apply(self, transform, pvalueish=None, label=None):
        """Applies a custom transform using the pvalueish specified.

    Args:
      transform (~apache_beam.transforms.ptransform.PTransform): the
        :class:`~apache_beam.transforms.ptransform.PTransform` to apply.
      pvalueish (~apache_beam.pvalue.PCollection): the input for the
        :class:`~apache_beam.transforms.ptransform.PTransform` (typically a
        :class:`~apache_beam.pvalue.PCollection`).
      label (str): label of the
        :class:`~apache_beam.transforms.ptransform.PTransform`.

    Raises:
      ~exceptions.TypeError: if the transform object extracted from the
        argument list is not a
        :class:`~apache_beam.transforms.ptransform.PTransform`.
      ~exceptions.RuntimeError: if the transform object was already applied to
        this pipeline and needs to be cloned in order to apply again.
    """
        if isinstance(transform, ptransform._NamedPTransform):
            return self.apply(transform.transform, pvalueish, label
                              or transform.label)

        if not isinstance(transform, ptransform.PTransform):
            raise TypeError("Expected a PTransform object, got %s" % transform)

        if label:
            # Fix self.label as it is inspected by some PTransform operations
            # (e.g. to produce error messages for type hint violations).
            try:
                old_label, transform.label = transform.label, label
                return self.apply(transform, pvalueish)
            finally:
                transform.label = old_label

        full_label = '/'.join(
            [self._current_transform().full_label, label
             or transform.label]).lstrip('/')
        if full_label in self.applied_labels:
            raise RuntimeError(
                'A transform with label "%s" already exists in the pipeline. '
                'To apply a transform with a specified label write '
                'pvalue | "label" >> transform' % full_label)
        self.applied_labels.add(full_label)

        pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
        try:
            inputs = tuple(inputs)
            for leaf_input in inputs:
                if not isinstance(leaf_input, pvalue.PValue):
                    raise TypeError
        except TypeError:
            raise NotImplementedError(
                'Unable to extract PValue inputs from %s; either %s does not accept '
                'inputs of this format, or it does not properly override '
                '_extract_input_pvalues' % (pvalueish, transform))

        current = AppliedPTransform(self._current_transform(), transform,
                                    full_label, inputs)
        self._current_transform().add_part(current)
        self.transforms_stack.append(current)

        type_options = self._options.view_as(TypeOptions)
        if type_options.pipeline_type_check:
            transform.type_check_inputs(pvalueish)

        pvalueish_result = self.runner.apply(transform, pvalueish,
                                             self._options)

        if type_options is not None and type_options.pipeline_type_check:
            transform.type_check_outputs(pvalueish_result)

        for result in ptransform.get_nested_pvalues(pvalueish_result):
            assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

            # Make sure we set the producer only for a leaf node in the transform DAG.
            # This way we preserve the last transform of a composite transform as
            # being the real producer of the result.
            if result.producer is None:
                result.producer = current

            self._infer_result_type(transform, inputs, result)

            assert isinstance(result.producer.inputs, tuple)
            current.add_output(result)

        if (type_options is not None
                and type_options.type_check_strictness == 'ALL_REQUIRED'
                and transform.get_type_hints().output_types is None):
            ptransform_name = '%s(%s)' % (transform.__class__.__name__,
                                          full_label)
            raise TypeCheckError(
                'Pipeline type checking is enabled, however no '
                'output type-hint was found for the '
                'PTransform %s' % ptransform_name)

        self.transforms_stack.pop()
        return pvalueish_result

    def _infer_result_type(self, transform, inputs, result_pcollection):
        # TODO(robertwb): Multi-input, multi-output inference.
        type_options = self._options.view_as(TypeOptions)
        if (type_options is not None and type_options.pipeline_type_check
                and isinstance(result_pcollection, pvalue.PCollection)
                and (not result_pcollection.element_type
                     # TODO(robertwb): Ideally we'd do intersection here.
                     or result_pcollection.element_type == typehints.Any)):
            input_element_type = (inputs[0].element_type
                                  if len(inputs) == 1 else typehints.Any)
            type_hints = transform.get_type_hints()
            declared_output_type = type_hints.simple_output_type(
                transform.label)
            if declared_output_type:
                input_types = type_hints.input_types
                if input_types and input_types[0]:
                    declared_input_type = input_types[0][0]
                    result_pcollection.element_type = typehints.bind_type_variables(
                        declared_output_type,
                        typehints.match_type_variables(declared_input_type,
                                                       input_element_type))
                else:
                    result_pcollection.element_type = declared_output_type
            else:
                result_pcollection.element_type = transform.infer_output_type(
                    input_element_type)

    def __reduce__(self):
        # Some transforms contain a reference to their enclosing pipeline,
        # which in turn reference all other transforms (resulting in quadratic
        # time/space to pickle each transform individually).  As we don't
        # require pickled pipelines to be executable, break the chain here.
        return str, ('Pickled pipeline stub.', )

    def _verify_runner_api_compatible(self):
        if self._options.view_as(TypeOptions).runtime_type_check:
            # This option is incompatible with the runner API as it requires
            # the runner to inspect non-serialized hints on the transform
            # itself.
            return False

        class Visitor(PipelineVisitor):  # pylint: disable=used-before-assignment
            ok = True  # Really a nonlocal.

            def enter_composite_transform(self, transform_node):
                pass

            def visit_transform(self, transform_node):
                try:
                    # Transforms must be picklable.
                    pickler.loads(pickler.dumps(transform_node.transform,
                                                enable_trace=False),
                                  enable_trace=False)
                except Exception:
                    Visitor.ok = False

            def visit_value(self, value, _):
                if isinstance(value, pvalue.PDone):
                    Visitor.ok = False

        self.visit(Visitor())
        return Visitor.ok

    def to_runner_api(self,
                      return_context=False,
                      context=None,
                      use_fake_coders=False,
                      default_environment=None):
        """For internal use only; no backwards-compatibility guarantees."""
        from apache_beam.runners import pipeline_context
        from apache_beam.portability.api import beam_runner_api_pb2
        if context is None:
            context = pipeline_context.PipelineContext(
                use_fake_coders=use_fake_coders,
                default_environment=default_environment)
        elif default_environment is not None:
            raise ValueError(
                'Only one of context or default_environment may be specified.')

        # The RunnerAPI spec requires certain transforms and side-inputs to have KV
        # inputs (and corresponding outputs).
        # Currently we only upgrade to KV pairs.  If there is a need for more
        # general shapes, potential conflicts will have to be resolved.
        # We also only handle single-input, and (for fixing the output) single
        # output, which is sufficient.
        class ForceKvInputTypes(PipelineVisitor):
            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                if not transform_node.transform:
                    return
                if transform_node.transform.runner_api_requires_keyed_input():
                    pcoll = transform_node.inputs[0]
                    pcoll.element_type = typehints.coerce_to_kv_type(
                        pcoll.element_type, transform_node.full_label)
                    if len(transform_node.outputs) == 1:
                        # The runner often has expectations about the output types as well.
                        output, = transform_node.outputs.values()
                        if not output.element_type:
                            output.element_type = transform_node.transform.infer_output_type(
                                pcoll.element_type)
                for side_input in transform_node.transform.side_inputs:
                    if side_input.requires_keyed_input():
                        side_input.pvalue.element_type = typehints.coerce_to_kv_type(
                            side_input.pvalue.element_type,
                            transform_node.full_label,
                            side_input_producer=side_input.pvalue.producer.
                            full_label)

        self.visit(ForceKvInputTypes())

        # Mutates context; placing inline would force dependence on
        # argument evaluation order.
        root_transform_id = context.transforms.get_id(self._root_transform())
        proto = beam_runner_api_pb2.Pipeline(
            root_transform_ids=[root_transform_id],
            components=context.to_runner_api())
        proto.components.transforms[root_transform_id].unique_name = (
            root_transform_id)
        if return_context:
            return proto, context
        else:
            return proto

    @staticmethod
    def from_runner_api(proto,
                        runner,
                        options,
                        return_context=False,
                        allow_proto_holders=False):
        """For internal use only; no backwards-compatibility guarantees."""
        p = Pipeline(runner=runner, options=options)
        from apache_beam.runners import pipeline_context
        context = pipeline_context.PipelineContext(
            proto.components, allow_proto_holders=allow_proto_holders)
        root_transform_id, = proto.root_transform_ids
        p.transforms_stack = [context.transforms.get_by_id(root_transform_id)]
        # TODO(robertwb): These are only needed to continue construction. Omit?
        p.applied_labels = set(
            [t.unique_name for t in proto.components.transforms.values()])
        for id in proto.components.pcollections:
            pcollection = context.pcollections.get_by_id(id)
            pcollection.pipeline = p
            if not pcollection.producer:
                raise ValueError('No producer for %s' % id)

        # Inject PBegin input where necessary.
        from apache_beam.io.iobase import Read
        from apache_beam.transforms.core import Create
        has_pbegin = [Read, Create]
        for id in proto.components.transforms:
            transform = context.transforms.get_by_id(id)
            if not transform.inputs and transform.transform.__class__ in has_pbegin:
                transform.inputs = (pvalue.PBegin(p), )

        if return_context:
            return p, context
        else:
            return p
Exemple #41
0
def run(argv=None):
  """Main entry point; defines and runs the hourly_team_score pipeline."""
  parser = argparse.ArgumentParser()

  parser.add_argument('--topic',
                      type=str,
                      required=True,
                      help='Pub/Sub topic to read from')
  parser.add_argument('--dataset',
                      type=str,
                      required=True,
                      help='BigQuery Dataset to write tables to. '
                      'Must already exist.')
  parser.add_argument('--table_name',
                      type=str,
                      default='game_stats',
                      help='The BigQuery table name. Should not already exist.')
  parser.add_argument('--fixed_window_duration',
                      type=int,
                      default=60,
                      help='Numeric value of fixed window duration for user '
                           'analysis, in minutes')
  parser.add_argument('--session_gap',
                      type=int,
                      default=5,
                      help='Numeric value of gap between user sessions, '
                           'in minutes')
  parser.add_argument('--user_activity_window_duration',
                      type=int,
                      default=30,
                      help='Numeric value of fixed window for finding mean of '
                           'user session duration, in minutes')

  args, pipeline_args = parser.parse_known_args(argv)

  options = PipelineOptions(pipeline_args)

  # We also require the --project option to access --dataset
  if options.view_as(GoogleCloudOptions).project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  fixed_window_duration = args.fixed_window_duration * 60
  session_gap = args.session_gap * 60
  user_activity_window_duration = args.user_activity_window_duration * 60

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = True

  # Enforce that this pipeline is always run in streaming mode
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read events from Pub/Sub using custom timestamps
    raw_events = (
        p
        | 'ReadPubSub' >> beam.io.gcp.pubsub.ReadStringsFromPubSub(args.topic)
        | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
        | 'AddEventTimestamps' >> beam.Map(
            lambda elem: beam.window.TimestampedValue(elem, elem['timestamp'])))

    # Extract username/score pairs from the event stream
    user_events = (
        raw_events
        | 'ExtractUserScores' >> beam.Map(
            lambda elem: (elem['user'], elem['score'])))

    # Calculate the total score per user over fixed windows, and cumulative
    # updates for late data
    spammers_view = (
        user_events
        | 'UserFixedWindows' >> beam.WindowInto(
            beam.window.FixedWindows(fixed_window_duration))

        # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
        # These might be robots/spammers.
        | 'CalculateSpammyUsers' >> CalculateSpammyUsers()

        # Derive a view from the collection of spammer users. It will be used as
        # a side input in calculating the team score sums, below
        | 'CreateSpammersView' >> beam.CombineGlobally(
            beam.combiners.ToDictCombineFn()).as_singleton_view())

    # [START filter_and_calc]
    # Calculate the total score per team over fixed windows, and emit cumulative
    # updates for late data. Uses the side input derived above --the set of
    # suspected robots-- to filter out scores from those users from the sum.
    # Write the results to BigQuery.
    (raw_events  # pylint: disable=expression-not-assigned
     | 'WindowIntoFixedWindows' >> beam.WindowInto(
         beam.window.FixedWindows(fixed_window_duration))

     # Filter out the detected spammer users, using the side input derived above
     | 'FilterOutSpammers' >> beam.Filter(
         lambda elem, spammers: elem['user'] not in spammers,
         spammers_view)
     # Extract and sum teamname/score pairs from the event data.
     | 'ExtractAndSumScore' >> ExtractAndSumScore('team')
     # [END filter_and_calc]
     | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
     | 'WriteTeamScoreSums' >> WriteToBigQuery(
         args.table_name + '_teams', args.dataset, {
             'team': 'STRING',
             'total_score': 'INTEGER',
             'window_start': 'STRING',
             'processing_time': 'STRING',
         }))

    # [START session_calc]
    # Detect user sessions-- that is, a burst of activity separated by a gap
    # from further activity. Find and record the mean session lengths.
    # This information could help the game designers track the changing user
    # engagement as their set of game changes.
    (user_events  # pylint: disable=expression-not-assigned
     | 'WindowIntoSessions' >> beam.WindowInto(
         beam.window.Sessions(session_gap),
         timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW)

     # For this use, we care only about the existence of the session, not any
     # particular information aggregated over it, so we can just group by key
     # and assign a "dummy value" of None.
     | beam.CombinePerKey(lambda _: None)

     # Get the duration of the session
     | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity())
     # [END session_calc]

     # [START rewindow]
     # Re-window to process groups of session sums according to when the
     # sessions complete
     | 'WindowToExtractSessionMean' >> beam.WindowInto(
         beam.window.FixedWindows(user_activity_window_duration))

     # Find the mean session duration in each window
     | beam.CombineGlobally(beam.combiners.MeanCombineFn()).without_defaults()
     | 'FormatAvgSessionLength' >> beam.Map(
         lambda elem: {'mean_duration': float(elem)})
     | 'WriteAvgSessionLength' >> WriteToBigQuery(
         args.table_name + '_sessions', args.dataset, {
             'mean_duration': 'FLOAT',
         }))
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      # CHANGE 1/5: The Google Cloud Storage path is required
      # for outputting the results.
      default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend([
      # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
      # run your pipeline on the Google Cloud Dataflow Service.
      '--runner=DirectRunner',
      # CHANGE 3/5: Your project ID is required in order to run your pipeline on
      # the Google Cloud Dataflow Service.
      '--project=SET_YOUR_PROJECT_ID_HERE',
      # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
      # files.
      '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
      # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
      # files.
      '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
      '--job_name=your-wordcount-job',
  ])

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection.
    lines = p | ReadFromText(known_args.input)

    # Count the occurrences of each word.
    counts = (
        lines
        | 'Split' >> (
            beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)).
            with_output_types(unicode))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %s' % (word, count)

    output = counts | 'Format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | WriteToText(known_args.output)
Exemple #43
0
#                         help='Input for the pipeline',
#                         default='gs://cxr-to-chest-ct/')
#     parser.add_argument('--output',
#                         help='Output for the pipeline',
#                         default='gs://cxr-to-chest-ct2/resampled/')
#     parser.add_argument('--project',
#                         dest='project',
#                         help='Project',
#                         default='x-ray-reconstruction')
#     parser.add_argument('--temp_location',
#                         dest='temp_location',
#                         help='temp_location',
#                         default='gs://cxr-to-chest-ct2/tmp/')

options = PipelineOptions(flags=sys.argv)
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'x-ray-reconstruction'
google_cloud_options.job_name = 'numpy-highmem-int16-with-rotation'
google_cloud_options.staging_location = 'gs://cxr-to-chest-ct2/binaries'
google_cloud_options.temp_location = 'gs://cxr-to-chest-ct2/temp'
# google_cloud_options.machine_type = 'n1-highmem-2'
options.view_as(SetupOptions).save_main_session = True

with beam.Pipeline(options=options) as p:
    # embed()
    # dicom_urls = p | 'read csv data' >> beam.io.Read(CsvFileSource('gs://cxr-to-chest-ct/datasets/LIDC-IDRI Dataset/ct_scan_urls.csv'))

    dicom_urls = p | 'read csv file' >> beam.io.textio.ReadFromText(
        'gs://cxr-to-chest-ct/datasets/LIDC-IDRI Dataset/ct_scan_urls.csv'
    ) | 'split stuff' >> beam.ParDo(Split())
Exemple #44
0
import apache_beam as beam

import config

import json
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.io.gcp.internal.clients import bigquery
from textblob import TextBlob

options = PipelineOptions()

google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = config.PROJECT_ID
google_cloud_options.staging_location = 'gs://dod-mwja-project1/staging'
google_cloud_options.temp_location = 'gs://dod-mwja-project1/temp'
options.view_as(StandardOptions).runner = 'DataflowRunner'
options.view_as(StandardOptions).streaming = True


def compute_sentiment(line):
    import os
    os.system('sudo pip install textblob')
    from textblob import TextBlob
    templist = line.split('-=-')
    for j, item in enumerate(templist):
        templist[j] = item.replace(',', '')
    tweet = templist[1]
    sent = TextBlob(tweet).sentiment.polarity
Exemple #45
0
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the translate pipeline."""
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Define pipeline options.
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    p = beam.Pipeline(options=pipeline_options)

    def translate_text(text_dict, target="en"):
        """Translates text into the target language. Target must be an ISO 639-1 language code.
        See https://g.co/cloud/translate/v2/translate-reference#supported_languages
        
        Args:
          text_dict: Dictionary format input.
        """
        import six
        from google.cloud import translate_v2 as translate

        text = text_dict[text_column_in]

        translate_client = translate.Client()

        if isinstance(text, six.binary_type):
            text = text.decode("utf-8")

        # Text can also be a sequence of strings, in which case this method
        # will return a sequence of results for each text.
        result = translate_client.translate(text, target_language=target)

        result_str = result['translatedText']

        # Construct dict matching output table schema table_schema_out.
        return {text_column_out: result_str, text_column_in: text}

    # Debug. Test translate_text fn.
    text_dict = {}
    text_dict[text_column_in] = '寿司は美味しです'
    print(translate_text(text_dict))

    translate_jp2en = (
        p
        | 'Read table from BQ' >> beam.io.ReadFromBigQuery(table=table_spec_in)

        # Debug.
        # | 'Create dict' >> beam.Create([
        #       {
        #         'td_title': '魚も美味しいです'
        #       },
        #       {
        #         'td_title': '寿司は美味しいです'
        #       },
        #   ])

        # Each row is a dictionary where the keys are the BigQuery columns
        | 'Translating' >> beam.Map(translate_text))

    # Debug. Print translated jp texts.
    # translate_jp2en | 'Print' >> beam.Map(print)

    # Write translated data back to a new table in BigQuery.
    translate_jp2en | 'Write back to BQ' >> beam.io.WriteToBigQuery(
        table_spec_out,
        schema=table_schema_out,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

    p.run()
Exemple #46
0
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--kind',
                        dest='kind',
                        required=True,
                        help='Datastore Kind')
    parser.add_argument('--namespace',
                        dest='namespace',
                        help='Datastore Namespace')
    parser.add_argument('--ancestor',
                        dest='ancestor',
                        default='root',
                        help='The ancestor key name for all entities.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--read_only',
                        action='store_true',
                        help='Read an existing dataset, do not write first')
    parser.add_argument(
        '--num_shards',
        dest='num_shards',
        type=int,
        # If the system should choose automatically.
        default=0,
        help='Number of output shards')

    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)

    # Write to Datastore if `read_only` options is not specified.
    if not known_args.read_only:
        write_to_datastore(gcloud_options.project, known_args,
                           pipeline_options)

    # Read entities from Datastore.
    result = read_from_datastore(gcloud_options.project, known_args,
                                 pipeline_options)

    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
        empty_lines_counter = query_result['counters'][0]
        logging.info('number of empty lines: %d',
                     empty_lines_counter.committed)
    else:
        logging.warning('unable to retrieve counter metrics from runner')

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
        word_lengths_dist = query_result['distributions'][0]
        logging.info('average word length: %d',
                     word_lengths_dist.committed.mean)
    else:
        logging.warning('unable to retrieve distribution metrics from runner')
Exemple #47
0
def run():
    import pickle
    import sys

    import math

    import numpy as np

    reload(sys)
    sys.setdefaultencoding('utf8')

    from gensim.models import KeyedVectors
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
    from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
    from google.cloud.proto.datastore.v1 import query_pb2
    from apache_beam.io.textio import WriteToText
    import nltk.data
    import re
    import uuid
    import perceptron

    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])

    _sentence_tokenizer._params.abbrev_types = abbreviations

    model_file = "perceptron_word2vec_stemmed_normalized.pickle"
    with open(model_file, 'rb') as model:
        w, b = pickle.load(model)

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return sentence.split(" ") # nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (
            sentences_from_text(obj["title"]) +
            sentences_from_text(obj["description"]) +
            sentences_from_text(obj["content"])
        )

        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)

        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]


    def convertToObject(jsonObj):
        x = jsonObj

        link = x.properties.get('link', None)
        link = link.string_value if link else ""

        title = x.properties.get('title', None)
        title = title.string_value if title else ""

        description = x.properties.get("description", None)
        description = description.string_value if description else ""

        content = x.properties.get("text", "")
        content = content.string_value if content else ""

        published = x.properties.get("published")
        published = published.string_value if published else ""

        obj = {
            "link": link,
            "title": title,
            "description": description,
            "content": content,
            "published": published
        }

        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())

        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])

        return obj

    def tokenize_to_sentences(obj):

        obj["sentences"] = (
            sentences_from_text(obj["title"]) +
            sentences_from_text(obj["description"]) +
            sentences_from_text(obj["content"])
        )

        return obj

    def tokenize_to_words(obj):

        obj["tokens"] = []

        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)

        for token in obj["tokens"]:
            yield (obj["key"], token)

    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'news-197916'
    google_cloud_options.job_name = 'sentiment-analysis'
    google_cloud_options.staging_location = 'gs://news-197916.appspot.com/word_count/'
    google_cloud_options.temp_location = 'gs://news-197916.appspot.com/df_tmp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    setup_options = options.view_as(SetupOptions)
    setup_options.requirements_file = "requirements.txt"
    setup_options.save_main_session = True

    p = beam.Pipeline(options=options)
    query = query_pb2.Query()
    query.kind.add().name = "News_Entry"

    pairs = (p
            | 'Read From Datastore' >> ReadFromDatastore(project = google_cloud_options.project, query=query)
        #     | "Read From Text" >> ReadFromText("news.json", coder=beam.coders.coders.StrUtf8Coder()) # line by line
        #     | "Convert to Json Object" >> beam.Map(convertToJsonObj)
             | "Convert to Python Object" >> beam.Map(convertToObject)
             | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(removeHTMLFromStrings)
    )

    tokens_1gram = (pairs
                    | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
                    | 'Word Tokenization' >> beam.FlatMap(tokenize_to_words)  # also convert to key value pairs
                    )
    """
    tokens_2gram = (pairs
            | "Create 2-grams" >> beam.FlatMap(lambda obj: [(obj["key"], token) for token in ngrams(obj, 2)])
        )
    """

    tokens = tokens_1gram

    """
    vocabulary = (tokens
                  | "Get words only" >> beam.Values()
                  | "Remove duplicate words" >> beam.RemoveDuplicates()
                  )
    vocabulary_size = (vocabulary
            | "Count Vocabulary elements" >> beam.combiners.Count.Globally()
        )

    doc_total_words = (tokens
            | "Count Words of Doc" >> beam.combiners.Count.PerKey()
    )
    """

    tokens_paired_with_1 = (tokens
                            | "Pair with 1" >> beam.Map(lambda (doc, token): ((doc, token), 1))
                            )
    """
    token_counts_per_doc = (tokens_paired_with_1
            | "Group by Doc,Word" >> beam.GroupByKey()
            | "Count ones" >> beam.Map(lambda ((doc, token), counts): (doc, (token, sum(counts))))
            | "Group by Doc" >> beam.GroupByKey()
        )



    num_docs = (token_counts_per_doc
            | "Get Docs" >> beam.Keys()
            | "Count Docs" >> beam.combiners.Count.Globally()
    )


    word_tf_pre = (
        { 'total_tokens': doc_total_words, 'token_counts_per_doc': token_counts_per_doc }
        | "CoGroup By Document" >> beam.CoGroupByKey()
    )

    def calc_tf((doc, count)):
        [token_count] = count['token_counts_per_doc']

        [tokens_total] = count['total_tokens']

        for token, cnt in token_count:
            yield token, (doc, float(cnt) / tokens_total)


    doc_word_tf = (word_tf_pre
        | "Compute Term Frequencies" >> beam.FlatMap(calc_tf)
        )

    word_occurrences = (tokens
        | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates()
        | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1))
        | "Group by Word" >> beam.GroupByKey()
        | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts)))
    )

    token_df = (
        word_occurrences
        | "Compute Document Frequency">> beam.Map(lambda (token, count), total: (token, float(count) / total), AsSingleton(num_docs)))

    token_tf_df = (
        { 'term_frequency': doc_word_tf, 'document_frequency': token_df}
        | "CoGroup By Token" >> beam.CoGroupByKey())

    def calc_tfidf((token, tfdf)):
      [df] = tfdf['document_frequency']
      for doc, tf in tfdf['term_frequency']:
        yield (doc, token), tf * math.log(1.0 / df)

    token_tf_idf = (token_tf_df
        | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf)
    )
    """

    word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

    def get_vec(word2vec, token):
        if word2vec is None:
            word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

        try:
            x = word2vec.get_vector(token)
            x = x.reshape(400)
        except (KeyError, ValueError):  # token not in the word2vec vocabulary
            x = np.zeros(400)

        return x

    def analyze_sentiment(x):

        res = perceptron.f(x, w, b)

        return res

    doc_sentiment = (tokens_paired_with_1
                     | "Create Word2Vec Vector" >> beam.Map(lambda ((doc, token), cnt): (doc, get_vec(word2vec, token)))
                     | "Group Word2Vec Vectors By Document" >> beam.GroupByKey()
                     | "Sum Word2Vec Vectors" >> beam.Map(
        lambda (doc, vecs): (doc, analyze_sentiment(np.sum(vecs, axis=0))[0]))
                     )

    result = (doc_sentiment |
              "Format  Results" >> beam.Map(lambda (doc, tokens): '%s %s' % (doc, tokens))
              )

    (result
     | "Write Results" >> WriteToText("sentiments")
     )

    p.run()
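A note on portability: this example is Python 2 only. reload(sys) and sys.setdefaultencoding('utf8') do not exist in Python 3, and lambdas such as lambda (doc, token): ((doc, token), 1) use tuple parameter unpacking, which PEP 3113 removed. A hedged sketch of the Python 3 equivalents of the two main pair-handling steps, reusing the names defined above:

# Python 3 equivalents of the tuple-unpacking lambdas used above.
tokens_paired_with_1 = (tokens
                        | "Pair with 1" >> beam.Map(lambda kv: (kv, 1)))  # kv is (doc, token)

doc_sentiment = (tokens_paired_with_1
                 | "Create Word2Vec Vector" >> beam.Map(
                     lambda kv: (kv[0][0], get_vec(word2vec, kv[0][1])))
                 | "Group Word2Vec Vectors By Document" >> beam.GroupByKey()
                 | "Sum Word2Vec Vectors" >> beam.Map(
                     lambda kv: (kv[0], analyze_sentiment(np.sum(list(kv[1]), axis=0))[0])))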
  def _setup_pipeline(self):
    options = PipelineOptions(self.pipeline.get_full_options_as_args())
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True
    self.pipeline = TestPipeline(options=options)
Example #49
0
def main():
  project = 'chromeperf'
  options = PipelineOptions()
  options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
  options.view_as(GoogleCloudOptions).project = project
  bq_export_options = options.view_as(BqExportOptions)

  p = beam.Pipeline(options=options)
  entities_read = Metrics.counter('main', 'entities_read')
  failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')

  # Read 'Anomaly' entities from datastore.
  entities = (
      p
      | 'ReadFromDatastore(Anomaly)' >> ReadTimestampRangeFromDatastore(
          {'project': project, 'kind': 'Anomaly'},
          time_range_provider=bq_export_options.GetTimeRangeProvider()))

  def AnomalyEntityToRowDict(entity):
    entities_read.inc()
    try:
      # Convert the nullable timestamps to ISO format separately, since either may be absent.
      earliest_input_timestamp = entity.get('earliest_input_timestamp')
      if earliest_input_timestamp:
        earliest_input_timestamp = earliest_input_timestamp.isoformat()
      latest_input_timestamp = entity.get('latest_input_timestamp')
      if latest_input_timestamp:
        latest_input_timestamp = latest_input_timestamp.isoformat()
      d = {
          'id': entity.key.id,
          # TODO: 'sheriff'
          # 'subscriptions' omitted; subscription_names is sufficient
          'subscription_names': entity.get('subscription_names', []),
          'test': TestPath(entity['test']),
          'start_revision': entity['start_revision'],
          'end_revision': entity['end_revision'],
          'display_start': entity.get('display_start'),
          'display_end': entity.get('display_end'),
          # TODO: 'ownership'
          'statistic': entity['statistic'],
          'bug_id': entity['bug_id'],
          'internal_only': entity['internal_only'],
          'timestamp': entity['timestamp'].isoformat(),
          'segment_size_before': entity.get('segment_size_before'),
          'segment_size_after': entity.get('segment_size_after'),
          'median_before_anomaly': entity.get('median_before_anomaly'),
          'median_after_anomaly': entity.get('median_after_anomaly'),
          'std_dev_before_anomaly': entity.get('std_dev_before_anomaly'),
          'window_end_revision': entity.get('window_end_revision'),
          't_statistic': FloatHack(entity.get('t_statistic')),
          'degrees_of_freedom': entity.get('degrees_of_freedom'),
          'p_value': entity.get('p_value'),
          'is_improvement': entity.get('is_improvement', False),
          'recovered': entity.get('recovered', False),
          # TODO: 'ref_test'
          'units': entity.get('units'),
          # TODO: 'recipe_bisects'
          'pinpoint_bisects': entity.get('pinpoint_bisects', []),
          # These are critical to "time-to-culprit" calculations.
          'earliest_input_timestamp': earliest_input_timestamp,
          'latest_input_timestamp': latest_input_timestamp,
      }
      if d['statistic'] is None:
        # Some years-old anomalies lack this.
        raise UnconvertibleAnomalyError()
      return [d]
    except (KeyError, UnconvertibleAnomalyError):
      failed_entity_transforms.inc()
      return []
  anomaly_dicts = (
      entities
      | 'ConvertEntityToRow(Anomaly)' >> beam.FlatMap(AnomalyEntityToRowDict))

  """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.anomalies`
  (id INT64 NOT NULL,
   `timestamp` TIMESTAMP NOT NULL,
   subscription_names ARRAY<STRING>,
   `test` STRING NOT NULL,
   start_revision INT64 NOT NULL,
   end_revision INT64 NOT NULL,
   display_start INT64,
   display_end INT64,
   statistic STRING NOT NULL,
   bug_id INT64,
   internal_only BOOLEAN NOT NULL,
   segment_size_before INT64,
   segment_size_after INT64,
   median_before_anomaly FLOAT64,
   median_after_anomaly FLOAT64,
   std_dev_before_anomaly FLOAT64,
   window_end_revision INT64,
   t_statistic FLOAT64,
   degrees_of_freedom FLOAT64,
   p_value FLOAT64,
   is_improvement BOOLEAN NOT NULL,
   recovered BOOLEAN NOT NULL,
   units STRING,
   pinpoint_bisects ARRAY<STRING>,
   earliest_input_timestamp TIMESTAMP,
   latest_input_timestamp TIMESTAMP)
  PARTITION BY DATE(`timestamp`);
  """  # pylint: disable=pointless-string-statement
  bq_anomaly_schema = {
      'fields': [
          {
              'name': 'id',
              'type': 'INT64',
              'mode': 'REQUIRED'
          },
          {
              'name': 'subscription_names',
              'type': 'STRING',
              'mode': 'REPEATED'
          },
          {
              'name': 'test',
              'type': 'STRING',
              'mode': 'REQUIRED'
          },
          {
              'name': 'start_revision',
              'type': 'INT64',
              'mode': 'REQUIRED'
          },
          {
              'name': 'end_revision',
              'type': 'INT64',
              'mode': 'REQUIRED'
          },
          {
              'name': 'display_start',
              'type': 'INT64',
              'mode': 'NULLABLE'
          },
          {
              'name': 'display_end',
              'type': 'INT64',
              'mode': 'NULLABLE'
          },
          {
              'name': 'statistic',
              'type': 'STRING',
              'mode': 'REQUIRED'
          },
          {
              'name': 'bug_id',
              'type': 'INT64',
              'mode': 'NULLABLE'
          },
          {
              'name': 'internal_only',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'timestamp',
              'type': 'TIMESTAMP',
              'mode': 'REQUIRED'
          },
          {
              'name': 'segment_size_before',
              'type': 'INT64',
              'mode': 'NULLABLE'
          },
          {
              'name': 'segment_size_after',
              'type': 'INT64',
              'mode': 'NULLABLE'
          },
          {
              'name': 'median_before_anomaly',
              'type': 'FLOAT',
              'mode': 'NULLABLE'
          },
          {
              'name': 'median_after_anomaly',
              'type': 'FLOAT',
              'mode': 'NULLABLE'
          },
          {
              'name': 'std_dev_before_anomaly',
              'type': 'FLOAT',
              'mode': 'NULLABLE'
          },
          {
              'name': 'window_end_revision',
              'type': 'INT64',
              'mode': 'NULLABLE'
          },
          {
              'name': 't_statistic',
              'type': 'FLOAT',
              'mode': 'NULLABLE'
          },
          {
              'name': 'degrees_of_freedom',
              'type': 'FLOAT',
              'mode': 'NULLABLE'
          },
          {
              'name': 'p_value',
              'type': 'FLOAT',
              'mode': 'NULLABLE'
          },
          {
              'name': 'is_improvement',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'recovered',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'units',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
          {
              'name': 'pinpoint_bisects',
              'type': 'STRING',
              'mode': 'REPEATED'
          },
          {
              'name': 'earliest_input_timestamp',
              'type': 'TIMESTAMP',
              'mode': 'NULLABLE'
          },
          {
              'name': 'latest_input_timestamp',
              'type': 'TIMESTAMP',
              'mode': 'NULLABLE'
          },
      ]
  }

  # 'dataset' may be a RuntimeValueProvider, so we have to defer calculating
  # the table name until runtime.  The simplest way to do this is by passing a
  # function for the table name rather than a string.
  def TableNameFn(unused_element):
    return '{}:{}.anomalies{}'.format(project, bq_export_options.dataset.get(),
                                      bq_export_options.table_suffix)
  _ = (
      anomaly_dicts | 'WriteToBigQuery(anomalies)' >>
      WriteToPartitionedBigQuery(TableNameFn, bq_anomaly_schema))

  result = p.run()
  result.wait_until_finish()
  PrintCounters(result)
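WriteToPartitionedBigQuery is a chromeperf-specific wrapper, but the same deferral works with stock Beam: beam.io.WriteToBigQuery also accepts a callable for the table argument and evaluates it at runtime, element by element. A hedged sketch reusing anomaly_dicts and bq_anomaly_schema from above (the table spec string is a placeholder):

import apache_beam as beam

def anomalies_table_fn(unused_element):
    # Evaluated at runtime, so ValueProvider-backed options can be resolved
    # here rather than at pipeline-construction time.
    return 'my-project:my_dataset.anomalies'  # placeholder table spec

_ = (anomaly_dicts
     | 'WriteToBigQuery(anomalies, stock sink)' >> beam.io.WriteToBigQuery(
         table=anomalies_table_fn,
         schema=bq_anomaly_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))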
Example #50
0
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--questions',
                        dest='questions',
                        required=True,
                        help='Questions file.')
    parser.add_argument('--users',
                        dest='users',
                        required=True,
                        help='Users file.')
    parser.add_argument('--from-ts',
                        dest='from_ts',
                        required=True,
                        type=int,
                        help='Start of the time range.')
    parser.add_argument('--to-ts',
                        dest='to_ts',
                        required=True,
                        type=int,
                        help='End of the time range.')
    parser.add_argument(
        '--engagement-range',
        dest='engagement_range',
        default=10,
        type=int,
        help=
        'Maximum number of days from first step to the last step of an engagement.'
    )
    parser.add_argument('--giap-es-index',
                        dest='giap_es_index',
                        required=True,
                        help='GIAP ES index.')
    parser.add_argument('--giap-es-username',
                        dest='giap_es_username',
                        required=True,
                        help='GIAP ES username.')
    parser.add_argument('--giap-es-password',
                        dest='giap_es_password',
                        required=True,
                        help='GIAP ES password.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:
        recent_questions = get_recent_questions(p, known_args.questions)

        recently_active_users = get_recently_active_users(p, known_args.users)

        latest_engagements = get_latest_engagements(
            p,
            from_ts=known_args.from_ts,
            to_ts=known_args.to_ts,
            engagement_range=known_args.engagement_range)

        question_engagement_pairs = ({
            'questions': recent_questions,
            'engagements': latest_engagements,
            'users': recently_active_users
        }) | "Group by uid" >> beam.CoGroupByKey()

        calculateAskEngagement = CalculateAskEngagement()
        calculateAskEngagement.engagement_range = known_args.engagement_range
        calculateAskEngagement.from_ts = known_args.from_ts
        calculateAskEngagement.to_ts = known_args.to_ts
        calculateAskEngagement.giap_es_index = known_args.giap_es_index
        calculateAskEngagement.giap_es_username = known_args.giap_es_username
        calculateAskEngagement.giap_es_password = known_args.giap_es_password

        engagement_table_spec = bigquery.TableReference(
            projectId='gotit-analytics',
            datasetId='study_pn_campaign',
            tableId='engagement')

        new_engagements = (
            question_engagement_pairs
            |
            "Calculate 'ask' engagements" >> beam.ParDo(calculateAskEngagement)
            | 'Write result to BQ' >> beam.io.WriteToBigQuery(
                engagement_table_spec,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            ))
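Assigning the CalculateAskEngagement settings one attribute at a time works because the DoFn instance is pickled and shipped to the workers with that state, but passing the configuration through the constructor is the more common pattern and keeps the dependency explicit. A hedged sketch of a constructor-based variant (this constructor is an assumption, not the project's actual class, and the processing body is omitted):

class CalculateAskEngagement(beam.DoFn):
    def __init__(self, engagement_range, from_ts, to_ts,
                 giap_es_index, giap_es_username, giap_es_password):
        self.engagement_range = engagement_range
        self.from_ts = from_ts
        self.to_ts = to_ts
        self.giap_es_index = giap_es_index
        self.giap_es_username = giap_es_username
        self.giap_es_password = giap_es_password

    def process(self, element):
        # Real engagement logic omitted in this sketch.
        yield element

calculateAskEngagement = CalculateAskEngagement(
    engagement_range=known_args.engagement_range,
    from_ts=known_args.from_ts,
    to_ts=known_args.to_ts,
    giap_es_index=known_args.giap_es_index,
    giap_es_username=known_args.giap_es_username,
    giap_es_password=known_args.giap_es_password)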
Example #51
0
    """Main function"""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--topic',
                        default='molecules-predictions',
                        help='PubSub topic to subscribe for predictions.')

    args, pipeline_args = parser.parse_known_args()

    beam_options = PipelineOptions(
        pipeline_args,
        save_main_session=True,
        streaming=True,
    )

    project = beam_options.view_as(GoogleCloudOptions).project
    if not project:
        parser.print_usage()
        print('error: argument --project is required')
        sys.exit(1)

    # We'll just log the results
    logging.basicConfig(level=logging.INFO)
    logging.info('Listening...')
    topic_path = 'projects/{}/topics/{}'.format(project, args.topic)
    with beam.Pipeline(options=beam_options) as p:
        _ = (p
             | 'Read predictions' >> beam.io.ReadFromPubSub(topic=topic_path)
             | 'Log' >> beam.Map(logging.info))
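ReadFromPubSub emits the raw message payload as bytes, so logging the elements directly prints byte strings. Other examples in this collection decode the payload first; a hedged variant of the same pipeline with that extra step:

with beam.Pipeline(options=beam_options) as p:
    _ = (p
         | 'Read predictions' >> beam.io.ReadFromPubSub(topic=topic_path)
         | 'Decode' >> beam.Map(lambda payload: payload.decode('utf-8'))
         | 'Log' >> beam.Map(logging.info))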
Example #52
0
def run(argv=None):
  """Main entry point; defines and runs the hourly_team_score pipeline."""
  parser = argparse.ArgumentParser()

  # The default maps to two large Google Cloud Storage files (each ~12GB)
  # holding roughly two consecutive days' worth of data.
  parser.add_argument('--input',
                      type=str,
                      default='gs://apache-beam-samples/game/gaming_data*.csv',
                      help='Path to the data file(s) containing game data.')
  parser.add_argument('--dataset',
                      type=str,
                      required=True,
                      help='BigQuery Dataset to write tables to. '
                      'Must already exist.')
  parser.add_argument('--table_name',
                      default='leader_board',
                      help='The BigQuery table name. Should not already exist.')
  parser.add_argument('--window_duration',
                      type=int,
                      default=60,
                      help='Numeric value of fixed window duration, in minutes')
  parser.add_argument('--start_min',
                      type=str,
                      default='1970-01-01-00-00',
                      help='String representation of the first minute after '
                           'which to generate results in the format: '
                           'yyyy-MM-dd-HH-mm. Any input data timestamped '
                           'prior to that minute won\'t be included in the '
                           'sums.')
  parser.add_argument('--stop_min',
                      type=str,
                      default='2100-01-01-00-00',
                      help='String representation of the first minute for '
                           'which to generate results in the format: '
                           'yyyy-MM-dd-HH-mm. Any input data timestamped '
                            'after that minute won\'t be included in the '
                           'sums.')

  args, pipeline_args = parser.parse_known_args(argv)

  options = PipelineOptions(pipeline_args)

  # We also require the --project option to access --dataset
  if options.view_as(GoogleCloudOptions).project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = True

  with beam.Pipeline(options=options) as p:
    (p  # pylint: disable=expression-not-assigned
     | 'ReadInputText' >> beam.io.ReadFromText(args.input)
     | 'HourlyTeamScore' >> HourlyTeamScore(
         args.start_min, args.stop_min, args.window_duration)
     | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
     | 'WriteTeamScoreSums' >> WriteToBigQuery(
         args.table_name, args.dataset, {
             'team': 'STRING',
             'total_score': 'INTEGER',
             'window_start': 'STRING',
         }))
Example #53
0
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the hourly_team_score pipeline."""
    parser = argparse.ArgumentParser()

    parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
    parser.add_argument('--subscription',
                        type=str,
                        help='Pub/Sub subscription to read from')
    parser.add_argument('--dataset',
                        type=str,
                        required=True,
                        help='BigQuery Dataset to write tables to. '
                        'Must already exist.')
    parser.add_argument(
        '--table_name',
        default='leader_board',
        help='The BigQuery table name. Should not already exist.')
    parser.add_argument('--team_window_duration',
                        type=int,
                        default=60,
                        help='Numeric value of fixed window duration for team '
                        'analysis, in minutes')
    parser.add_argument(
        '--allowed_lateness',
        type=int,
        default=120,
        help='Numeric value of allowed data lateness, in minutes')

    args, pipeline_args = parser.parse_known_args(argv)

    if args.topic is None and args.subscription is None:
        parser.print_usage()
        print(sys.argv[0] +
              ': error: one of --topic or --subscription is required')
        sys.exit(1)

    options = PipelineOptions(pipeline_args)

    # We also require the --project option to access --dataset
    if options.view_as(GoogleCloudOptions).project is None:
        parser.print_usage()
        print(sys.argv[0] + ': error: argument --project is required')
        sys.exit(1)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = save_main_session

    # Enforce that this pipeline is always run in streaming mode
    options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=options) as p:
        # Read game events from Pub/Sub using custom timestamps, which are extracted
        # from the pubsub data elements, and parse the data.

        # Read from PubSub into a PCollection.
        if args.subscription:
            scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription=args.subscription)
        else:
            scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic=args.topic)

        events = (scores
                  | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
                  | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
                  | 'AddEventTimestamps' >>
                  beam.Map(lambda elem: beam.window.TimestampedValue(
                      elem, elem['timestamp'])))

        # Get team scores and write the results to BigQuery
        (events  # pylint: disable=expression-not-assigned
         | 'CalculateTeamScores' >> CalculateTeamScores(
             args.team_window_duration, args.allowed_lateness)
         | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
         | 'WriteTeamScoreSums' >> WriteToBigQuery(
             args.table_name + '_teams', args.dataset, {
                 'team': 'STRING',
                 'total_score': 'INTEGER',
                 'window_start': 'STRING',
                 'processing_time': 'STRING',
             },
             options.view_as(GoogleCloudOptions).project))

        def format_user_score_sums(user_score):
            (user, score) = user_score
            return {'user': user, 'total_score': score}

        # Get user scores and write the results to BigQuery
        (events  # pylint: disable=expression-not-assigned
         | 'CalculateUserScores' >> CalculateUserScores(args.allowed_lateness)
         | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
         | 'WriteUserScoreSums' >> WriteToBigQuery(
             args.table_name + '_users', args.dataset, {
                 'user': 'STRING',
                 'total_score': 'INTEGER',
             },
             options.view_as(GoogleCloudOptions).project))
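CalculateTeamScores and CalculateUserScores are composite transforms defined elsewhere in the leader_board example; broadly, they window the events before summing scores. A hedged sketch of the kind of windowing such a transform applies to the events PCollection above (durations converted from minutes to seconds; the early-firing trigger shown is an assumption, not necessarily the exact one used):

from apache_beam.transforms import trigger, window

windowed_events = (
    events
    | 'TeamFixedWindows' >> beam.WindowInto(
        window.FixedWindows(args.team_window_duration * 60),
        trigger=trigger.AfterWatermark(
            early=trigger.AfterProcessingTime(10 * 60)),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        allowed_lateness=args.allowed_lateness * 60))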
Example #54
0
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    # parser.add_argument('--project', type=str, required=False, help='project')
    parser.add_argument(
        '--records',
        dest='records',
        type=int,
        default=10,
        help='Number of records to generate.')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='./',
                        help='Output file to write results to.')
    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Store the CLI arguments to variables
    # project_id = known_args.project

    # Setup the dataflow pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = True
    # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    # google_cloud_options.project = project_id

    save_main_session = True
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    # SCHEMA_STRING = '''
    # {"namespace": "example.avro",
    # "type": "record",
    # "name": "User",
    # "fields": [
    #     {"name": "ACNO", "type": "int"},
    #     {"name": "PRIN_BAL", "type": "int"},
    #     {"name": "FEE_ANT", "default": null, "type": ["null", "double"]},
    #     {"name": "GENDER",  "default": null, "type": ["null", {"logicalType": "char", "type": "string", "maxLength": 1}]}

    # ]
    # }
    # '''

    SCHEMA = {
        "namespace":
        "example.avro",
        "type":
        "record",
        "name":
        "User",
        "fields": [{
            "name":
            "ACNO",
            "type": [
                "null", {
                    "logicalType": "char",
                    "type": "string",
                    "maxLength": 20
                }
            ]
        }, {
            "name":
            "FIELD_1",
            "type": [
                "null", {
                    "logicalType": "char",
                    "type": "float",
                    "maxLength": 20
                }
            ]
        }, {
            "name":
            "FIELD_2",
            "type": [
                "null", {
                    "logicalType": "char",
                    "type": "float",
                    "maxLength": 20
                }
            ]
        }]
    }

    # {"name": "GENDER', "type": "string"}

    # {"name": "FEE_ANT", "type": "long"}

    # p = beam.Pipeline(options=pipeline_options)
    rec_cnt = known_args.records
    with beam.Pipeline(options=pipeline_options) as p:
        left_pcol_name = 'p1'
        file = p | 'read_source' >> beam.io.ReadFromAvro(
            "./data/account_id_schema_new.avro")
        p1 = file | beam.Map(lambda x: {
            'ACNO': x['ACNO'],
            'FIELD_1': x["FIELD_1"]
        })
        p2 = file | beam.Map(lambda x: {
            'ACNO': x['ACNO'],
            'FIELD_2': x["FIELD_2"]
        })

        P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv')
        P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv')

        right_pcol_name = 'p2'

        join_keys = {
            left_pcol_name: [
                'ACNO'
                # 't1_col_B'
            ],
            right_pcol_name: [
                'ACNO'
                # 't2_col_B'
            ]
        }

        pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2}
        test_pipeline = pipelines_dictionary | 'left join' >> Join(
            left_pcol_name=left_pcol_name,
            left_pcol=p1,
            right_pcol_name=right_pcol_name,
            right_pcol=p2,
            join_type='left',
            join_keys=join_keys)
        print(type(test_pipeline))
        test_pipeline | "print" >> beam.io.WriteToText('./test.csv')

        compressIdc = True
        use_fastavro = True
        #

        test_pipeline | 'write_fastavro' >> WriteToAvro(
            known_args.output,
            # '/tmp/dataflow/{}/{}'.format(
            #     'demo', 'output'),
            # parse_schema(json.loads(SCHEMA_STRING)),
            parse_schema(SCHEMA),
            use_fastavro=use_fastavro,
            file_name_suffix='.avro',
            codec=('deflate' if compressIdc else 'null'),
        )
    # The `with` block above already runs the pipeline and waits for it to
    # finish, so no explicit p.run() is needed here.
Example #55
0
def main():
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main',
                                               'failed_entity_transforms')

    # Read 'Job' entities from datastore.
    job_entities = (
        p
        | 'ReadFromDatastore(Job)' >> ReadTimestampRangeFromDatastore(
            {
                'project': project,
                'kind': 'Job'
            },
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            timestamp_property='created'))

    def ConvertEntity(entity):
        entities_read.inc()
        try:
            row_dict = JobEntityToRowDict(entity)
        except UnconvertibleJobError:
            logging.getLogger().exception('Failed to convert Job')
            failed_entity_transforms.inc()
            return []
        return [row_dict]

    job_dicts = (job_entities
                 | 'ConvertEntityToRow(Job)' >> beam.FlatMap(ConvertEntity))

    """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.jobs`
  (id INT64 NOT NULL,
   arguments STRING NOT NULL,
   bug_id INT64,
   comparison_mode STRING,
   gerrit STRUCT<server STRING, change_id STRING>,
   name STRING,
   tags STRING,
   user_email STRING,
   create_time TIMESTAMP NOT NULL,
   start_time TIMESTAMP,
   update_time TIMESTAMP NOT NULL,
   started BOOLEAN NOT NULL,
   done BOOLEAN NOT NULL,
   cancelled BOOLEAN NOT NULL,
   cancel_reason STRING,
   task STRING,
   exception STRING,
   exception_details STRING,
   difference_count INT64,
   retry_count INT64 NOT NULL,
   benchmark_arguments STRUCT<benchmark STRING, story STRING,
                              story_tags STRING, chart STRING,
                              statistic STRING>,
   use_execution_engine BOOLEAN NOT NULL,
   completed BOOLEAN NOT NULL,
   failed BOOLEAN NOT NULL,
   running BOOLEAN NOT NULL,
   configuration STRING)
  PARTITION BY DATE(`create_time`);
  """  # pylint: disable=pointless-string-statement
    bq_job_schema = {
        'fields': [
            {
                'name': 'id',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'arguments',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'bug_id',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'comparison_mode',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name':
                'gerrit',
                'type':
                'RECORD',
                'mode':
                'NULLABLE',
                'fields': [
                    {
                        'name': 'server',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'change_id',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                ]
            },
            {
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'tags',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'user_email',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'create_time',
                'type': 'TIMESTAMP',
                'mode': 'REQUIRED'
            },
            {
                'name': 'start_time',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            },
            {
                'name': 'update_time',
                'type': 'TIMESTAMP',
                'mode': 'REQUIRED'
            },
            {
                'name': 'started',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'done',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'cancelled',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'cancel_reason',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'task',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'exception',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'exception_details',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'difference_count',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'retry_count',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name':
                'benchmark_arguments',
                'type':
                'RECORD',
                'mode':
                'NULLABLE',
                'fields': [
                    {
                        'name': 'benchmark',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'story',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'story_tags',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'chart',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'statistic',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                ]
            },
            {
                'name': 'use_execution_engine',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'completed',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'failed',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'running',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'configuration',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
        ]
    }

    # 'dataset' may be a RuntimeValueProvider, so we have to defer calculating
    # the table name until runtime.  The simplest way to do this is by passing a
    # function for the table name rather than a string.
    def TableNameFn(unused_element):
        return '{}:{}.jobs{}'.format(project, bq_export_options.dataset.get(),
                                     bq_export_options.table_suffix)

    _ = job_dicts | 'WriteToBigQuery(jobs)' >> WriteToPartitionedBigQuery(
        TableNameFn, bq_job_schema, element_to_yyyymmdd_fn=_JobToYYYYMMDD)

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
def run(argv=None):
  # Avoid a mutable default argument; repeated calls would otherwise keep
  # appending to the shared default list.
  argv = list(argv) if argv else []
  project_id = 'grass-clump-479'
  instance_id = 'python-write-2'
  DEFAULT_TABLE_PREFIX = "python-test"
  #table_id = DEFAULT_TABLE_PREFIX + "-" + str(uuid.uuid4())[:8]
  #table_id = 'testmillionb38c02c4'
  #table_id = 'testmillioned113e20'
  #table_id = 'testmillion2ee87b99'
  guid = str(uuid.uuid4())[:8]
  table_id = 'testboth' + guid
  jobname = 'testmillion-both-' + guid
  

  argv.extend([
    '--experiments=beam_fn_api',
    '--project={}'.format(project_id),
    '--instance={}'.format(instance_id),
    '--table={}'.format(table_id),
    '--projectId={}'.format(project_id),
    '--instanceId={}'.format(instance_id),
    '--tableId={}'.format(table_id),
    '--job_name={}'.format(jobname),
    '--requirements_file=requirements.txt',
    '--disk_size_gb=100',
    '--region=us-central1',
    '--runner=dataflow',
    #'--runner=directRunner',
    '--autoscaling_algorithm=NONE',
    '--num_workers=100',
    '--staging_location=gs://juantest/stage',
    '--temp_location=gs://juantest/temp',
    '--setup_file=C:\\Users\\Juan\\Project\\python\\example_bigtable_beam\\beam_bigtable_package\\setup.py',
#    '--setup_file=/usr/src/app/example_bigtable_beam/beam_bigtable_package/setup.py',
    '--extra_package=C:\\Users\\Juan\\Project\\python\\example_bigtable_beam\\beam_bigtable_package\\dist\\beam_bigtable-0.3.106.tar.gz'
#    '--extra_package=/usr/src/app/example_bigtable_beam/beam_bigtable_package/dist/beam_bigtable-0.3.30.tar.gz'
  ])
  parser = argparse.ArgumentParser(argv)
  parser.add_argument('--projectId')
  parser.add_argument('--instanceId')
  parser.add_argument('--tableId')
  (known_args, pipeline_args) = parser.parse_known_args(argv)

  create_table = CreateAll(project_id, instance_id, table_id)

  print('ProjectID:',project_id)
  print('InstanceID:',instance_id)
  print('TableID:',table_id)
  print('JobID:', jobname)
  create_table.create_table()

  pipeline_options = PipelineOptions(argv)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  row_count = 10000
  row_limit = 100
  row_step = row_count if row_count <= row_limit else row_count/row_limit

  with beam.Pipeline(options=pipeline_options) as p:
    second_step = (p
                   | 'Ranges' >> beam.Create([(str(i),str(i+row_step)) for i in xrange(0, row_count, row_step)])
                   | 'Group' >> beam.GroupByKey()
                   | 'Generate' >> beam.ParDo(GenerateRow())
                   | 'Write' >> WriteToBigTable(project_id=project_id,
                                                instance_id=instance_id,
                                                table_id=table_id)
                   | 'BigtableFromRead' >> ReadFromBigTable_Read(project_id=project_id,
                                                                 instance_id=instance_id,
                                                                 table_id=table_id))
    count = (second_step
             | 'Count' >> beam.combiners.Count.Globally())
    row_count = 10000
    assert_that(count, equal_to([row_count]))

    # No explicit p.run() is needed here; the `with` block runs the pipeline
    # and waits for it to finish on exit.
Example #57
0
def run(argv=None, save_main_session=True):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_topic',
                        required=True,
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topics/<TOPIC>".'))
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--input_topic',
                       help=('Input PubSub topic of the form '
                             '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    pipeline_options.view_as(StandardOptions).streaming = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read from PubSub into a PCollection.
        if known_args.input_subscription:
            messages = (p
                        | beam.io.ReadFromPubSub(
                            subscription=known_args.input_subscription).
                        with_output_types(bytes))
        else:
            messages = (
                p
                | beam.io.ReadFromPubSub(
                    topic=known_args.input_topic).with_output_types(bytes))

        lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))

        # Count the occurrences of each word.
        def count_ones(word_ones):
            (word, ones) = word_ones
            return (word, sum(ones))

        counts = (lines
                  | 'split' >>
                  (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | beam.WindowInto(window.FixedWindows(15, 0))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(count_ones))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %d' % (word, count)

        output = (
            counts
            | 'format' >> beam.Map(format_result)
            | 'encode' >>
            beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes))

        # Write to PubSub.
        # pylint: disable=expression-not-assigned
        output | beam.io.WriteToPubSub(known_args.output_topic)
Example #58
0
def main(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template, which doesn't run the pipeline.
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
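Several examples above repeat the comment about save_main_session without showing the failure it guards against. A minimal, hedged illustration (the DoFn and regex are made up): a module-level import that a DoFn later relies on is only available on remote workers when the main session is pickled and shipped with the job.

import re  # module-level import, i.e. part of the main session

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions


class ExtractWordsFn(beam.DoFn):
    def process(self, element):
        # Relies on the module-level `re` import; without save_main_session
        # this global would be missing when the DoFn runs on a remote worker.
        return re.findall(r"[\w']+", element)


options = PipelineOptions()
options.view_as(SetupOptions).save_main_session = True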
Example #59
0
def run(argv=None):
    """Runs the workflow."""
    known_args, pipeline_args = parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    input_info = known_args.input

    with TestPipeline(options=pipeline_options) as p:
        source = SyntheticSource(input_info)

        # pylint: disable=expression-not-assigned
        barrier = known_args.barrier

        pc_list = []
        num_roots = 2**(len(known_args.steps) -
                        1) if (barrier == 'merge-gbk'
                               or barrier == 'merge-side-input') else 1
        for read_no in range(num_roots):
            pc_list.append((p | ('Read %d' % read_no) >> beam.io.Read(source)))

        for step_no, steps in enumerate(known_args.steps):
            if step_no != 0:
                new_pc_list = []
                for pc_no, pc in enumerate(pc_list):
                    if barrier == 'shuffle':
                        new_pc_list.append(
                            (pc | ('shuffle %d.%d' %
                                   (step_no, pc_no)) >> ShuffleBarrier()))
                    elif barrier == 'side-input':
                        new_pc_list.append(
                            (pc | ('side-input %d.%d' %
                                   (step_no, pc_no)) >> SideInputBarrier()))
                    elif barrier == 'expand-gbk':
                        new_pc_list.extend(
                            expand_using_gbk(
                                ('expand-gbk %d.%d' % (step_no, pc_no)), pc))
                    elif barrier == 'expand-second-output':
                        new_pc_list.extend(
                            expand_using_second_output(
                                ('expand-second-output %d.%d' %
                                 (step_no, pc_no)), pc))
                    elif barrier == 'merge-gbk':
                        if pc_no % 2 == 0:
                            new_pc_list.append(
                                merge_using_gbk(
                                    ('merge-gbk %d.%d' % (step_no, pc_no)), pc,
                                    pc_list[pc_no + 1]))
                        else:
                            continue
                    elif barrier == 'merge-side-input':
                        if pc_no % 2 == 0:
                            new_pc_list.append(
                                merge_using_side_input(
                                    ('merge-side-input %d.%d' %
                                     (step_no, pc_no)), pc,
                                    pc_list[pc_no + 1]))
                        else:
                            continue

                pc_list = new_pc_list

            new_pc_list = []
            for pc_no, pc in enumerate(pc_list):
                new_pc = pc | 'SyntheticStep %d.%d' % (
                    step_no, pc_no) >> beam.ParDo(
                        SyntheticStep(
                            per_element_delay_sec=steps['per_element_delay'],
                            per_bundle_delay_sec=steps['per_bundle_delay'],
                            output_records_per_input_record=steps[
                                'output_records_per_input_record'],
                            output_filter_ratio=steps['output_filter_ratio']))
                new_pc_list.append(new_pc)
            pc_list = new_pc_list

        if known_args.output:
            # If an output location is provided we format and write output.
            if len(pc_list) == 1:
                (pc_list[0]
                 | 'FormatOutput' >> beam.Map(lambda elm: (elm[0] + elm[1]))
                 | 'WriteOutput' >> WriteToText(known_args.output))

    logging.info('Pipeline run completed.')
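ShuffleBarrier, SideInputBarrier and the expand_/merge_ helpers come from Beam's synthetic-pipeline utilities. As a rough, hedged sketch (not the actual implementation), a shuffle barrier over keyed records amounts to a GroupByKey that is immediately flattened back out, which forces the runner to materialize and redistribute the data between steps:

import apache_beam as beam

class ShuffleBarrierSketch(beam.PTransform):
    """Forces a shuffle: group the (key, value) records, then re-emit them."""

    def expand(self, pcoll):
        return (pcoll
                | 'GroupForShuffle' >> beam.GroupByKey()
                | 'Ungroup' >> beam.FlatMap(
                    lambda kv: [(kv[0], value) for value in kv[1]]))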
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, metric_types: List[str],
        state_code: Optional[str], person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (
            p
            | 'Load Persons' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p
            | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = \
            (p
             | 'Load SupervisionViolations' >>
             BuildRootEntity(dataset=query_dataset, root_entity_class=entities.StateSupervisionViolation,
                             unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code
                             ))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = \
            (p
             | 'Load SupervisionViolationResponses' >>
             BuildRootEntity(dataset=query_dataset, root_entity_class=entities.StateSupervisionViolationResponse,
                             unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code
                             ))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))
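        # At this point each element is presumably still keyed by person_id,
        # and each StateIncarcerationPeriod carries its relevant
        # StateSupervisionViolationResponses (which in turn carry the hydrated
        # StateSupervisionViolations set above).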

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {
                'person': persons,
                'incarceration_periods':
                    incarceration_periods_with_source_violations
            }
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = select_all_by_person_query(
            reference_dataset,
            PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            # TODO(3602): Once we put state_code on StatePerson objects, we can update the
            # persons_to_recent_county_of_residence query to have a state_code field, allowing us to also filter the
            # output by state_code.
            state_code_filter=None,
            person_id_filter_set=person_id_filter_set)

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))
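        # ConvertDictToKVTuple is defined elsewhere; based on its usage here it
        # presumably takes each BigQuery row dict plus the name of the key
        # field and re-emits roughly:
        #   yield (element[key_field], element)
        # so the association table can be consumed as a keyed side input below.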

        # Identify ReleaseEvents from each StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))
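        # AsDict materializes the (person_id, county-of-residence) pairs as a
        # dict-valued side input, so ClassifyReleaseEvents can look up a
        # person's county of residence by person_id while processing each
        # grouped element.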

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the types of metrics to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_events
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
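            # Because we are inside the `with beam.Pipeline(...)` block,
            # returning here still runs the transforms constructed so far when
            # the block exits; only the BigQuery writes below are skipped.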
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivismMetricWritableDict()).with_outputs(
                    'rates', 'counts'))
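        # with_outputs('rates', 'counts') returns a DoOutputsTuple whose tagged
        # outputs are accessed as writable_metrics.rates and
        # writable_metrics.counts; RecidivismMetricWritableDict presumably
        # routes each metric dict to the tag matching its metric type.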

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismCountMetric)

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
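        # CREATE_NEVER means both output tables must already exist in the
        # `output` dataset, and FILE_LOADS writes via BigQuery load jobs rather
        # than streaming inserts.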