Code Example #1
File: snippets.py Project: vikkyrk/incubator-beam
def model_pcollection(argv):
  """Creating a PCollection from data in local memory."""
  from apache_beam.utils.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  p = beam.Pipeline(options=pipeline_options)

  (p
   | beam.Create([
       'To be, or not to be: that is the question: ',
       'Whether \'tis nobler in the mind to suffer ',
       'The slings and arrows of outrageous fortune, ',
       'Or to take arms against a sea of troubles, '])
   | beam.io.WriteToText(my_options.output))

  result = p.run()
  # [END model_pcollection]
  result.wait_until_finish()
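
The pattern above (subclassing PipelineOptions, registering flags in _add_argparse_args, and reading them back through view_as) is how pipeline-specific flags are defined throughout these examples. Below is a minimal, self-contained sketch of that pattern with a made-up flag value; the apache_beam.utils.pipeline_options import path matches the older SDK used on this page, while newer releases expose the same classes under apache_beam.options.pipeline_options.

from apache_beam.utils.pipeline_options import PipelineOptions

class MyOptions(PipelineOptions):

  @classmethod
  def _add_argparse_args(cls, parser):
    parser.add_argument('--output', required=True,
                        help='Output file to write results to.')

# Hypothetical flag value; unrecognized flags are kept on the options object
# so that any PipelineOptions subclass can interpret them through view_as().
options = PipelineOptions(['--output', '/tmp/counts.txt'])
my_options = options.view_as(MyOptions)
print(my_options.output)  # '/tmp/counts.txt'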
Code Example #2
  def test_with_requirements_file(self):
    try:
      staging_dir = tempfile.mkdtemp()
      requirements_cache_dir = tempfile.mkdtemp()
      source_dir = tempfile.mkdtemp()

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).requirements_cache = requirements_cache_dir
      options.view_as(SetupOptions).requirements_file = os.path.join(
          source_dir, dependency.REQUIREMENTS_FILE)
      self.create_temp_file(
          os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
      self.assertEqual(
          sorted([dependency.REQUIREMENTS_FILE,
                  'abc.txt', 'def.txt']),
          sorted(dependency.stage_job_resources(
              options,
              populate_requirements_cache=self.populate_requirements_cache)))
      self.assertTrue(
          os.path.isfile(
              os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
    finally:
      shutil.rmtree(staging_dir)
      shutil.rmtree(requirements_cache_dir)
      shutil.rmtree(source_dir)
Code Example #3
  def test_with_setup_file(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'setup.py'), 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = os.path.join(
        source_dir, 'setup.py')

    self.assertEqual(
        [dependency.WORKFLOW_TARBALL_FILE],
        dependency.stage_job_resources(
            options,
            # We replace the build setup command because a realistic one would
            # require the setuptools package to be installed. Note that we can't
            # use "touch" here to create the expected output tarball file, since
            # touch is not available on Windows, so we invoke python to produce
            # equivalent behavior.
            build_setup_args=[
                'python', '-c', 'open(__import__("sys").argv[1], "a")',
                os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
            temp_dir=source_dir))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Code Example #4
def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table exported '
      'as json.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--sampling_threshold',
                      type=float,
                      default=0.1,
                      help='Fraction of entries used for session tracking')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  (p  # pylint: disable=expression-not-assigned
   | ReadFromText(known_args.input)
   | ComputeTopSessions(known_args.sampling_threshold)
   | WriteToText(known_args.output))

  p.run()
Code Example #5
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_locally', dest='run_locally', default='', help='Run data subset and do not save.')
    known_args, pipeline_args = parser.parse_known_args()
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    delete_from_datastore('dancedeets-hrd', gcloud_options, known_args.run_locally)
Code Example #6
  def test_default_resources(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)

    self.assertEqual(
        [],
        dependency.stage_job_resources(options))
Code Example #7
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      # CHANGE 1/5: The Google Cloud Storage path is required
                      # for outputting the results.
                      default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend([
      # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
      # run your pipeline on the Google Cloud Dataflow Service.
      '--runner=DirectRunner',
      # CHANGE 3/5: Your project ID is required in order to run your pipeline on
      # the Google Cloud Dataflow Service.
      '--project=SET_YOUR_PROJECT_ID_HERE',
      # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
      # files.
      '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
      # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
      # files.
      '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
      '--job_name=your-wordcount-job',
  ])

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  counts = (lines
            | 'split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  # Actually run the pipeline (all operations above are deferred).
  p.run().wait_until_finish()
Code Example #8
File: snippets.py Project: vikkyrk/incubator-beam
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets."""
  import re

  import apache_beam as beam

  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = beam.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | beam.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | beam.Map(lambda (word, count): '%s: %s' % (word, count))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | beam.io.WriteToText('gs://my-bucket/counts.txt')
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  result = p.run()
  # [END examples_wordcount_minimal_run]
  result.wait_until_finish()
Code Example #9
  def test_extra_package(self):
    options = PipelineOptions(['--extra_package', 'abc',
                               '--extra_packages', 'def',
                               '--extra_packages', 'ghi'])
    self.assertEqual(
        sorted(options.get_all_options()['extra_packages']),
        ['abc', 'def', 'ghi'])

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['extra_packages'], None)
Code Example #10
 def test_get_all_options(self):
   for case in PipelineOptionsTest.TEST_CASES:
     options = PipelineOptions(flags=case['flags'])
     self.assertDictContainsSubset(case['expected'], options.get_all_options())
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_flag,
                      case['expected']['mock_flag'])
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_option,
                      case['expected']['mock_option'])
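
PipelineOptionsTest.MockOptions and TEST_CASES are fixtures defined elsewhere in the options test module and are not reproduced on this page. A hypothetical minimal version of what this test (and the test_from_dictionary variants below) relies on could look like the following; the flag names come from the assertions above, everything else is assumed.

from apache_beam.utils.pipeline_options import PipelineOptions

class MockOptions(PipelineOptions):

  @classmethod
  def _add_argparse_args(cls, parser):
    parser.add_argument('--mock_flag', action='store_true')
    parser.add_argument('--mock_option')

# One assumed test case: command-line flags and the values they should parse to.
TEST_CASES = [
    {'flags': ['--mock_flag', '--mock_option', 'abc'],
     'expected': {'mock_flag': True, 'mock_option': 'abc'}},
]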
Code Example #11
 def test_option_with_space(self):
   options = PipelineOptions(flags=['--option with space= value with space'])
   self.assertEqual(
       getattr(options.view_as(PipelineOptionsTest.MockOptions),
               'option with space'), ' value with space')
   options_from_dict = PipelineOptions.from_dictionary(
       options.get_all_options())
   self.assertEqual(
       getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
               'option with space'), ' value with space')
Code Example #12
def run(argv=None):
  """Main entry point; defines and runs the hourly_team_score pipeline."""
  parser = argparse.ArgumentParser()

  # The default maps to two large Google Cloud Storage files (each ~12GB)
  # holding two subsequent days' worth (roughly) of data.
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/game/gaming_data*.csv',
                      help='Path to the data file(s) containing game data.')
  parser.add_argument('--dataset',
                      dest='dataset',
                      required=True,
                      help='BigQuery Dataset to write tables to. '
                           'Must already exist.')
  parser.add_argument('--table_name',
                      dest='table_name',
                      default='hourly_team_score',
                      help='The BigQuery table name. Should not already exist.')
  parser.add_argument('--window_duration',
                      type=int,
                      default=60,
                      help='Numeric value of fixed window duration, in minutes')
  parser.add_argument('--start_min',
                      dest='start_min',
                      default='1970-01-01-00-00',
                      help='String representation of the first minute after '
                           'which to generate results in the format: '
                           'yyyy-MM-dd-HH-mm. Any input data timestamped '
                           'prior to that minute won\'t be included in the '
                           'sums.')
  parser.add_argument('--stop_min',
                      dest='stop_min',
                      default='2100-01-01-00-00',
                      help='String representation of the first minute for '
                           'which to generate results in the format: '
                           'yyyy-MM-dd-HH-mm. Any input data timestamped '
                           'after that minute won\'t be included in the '
                           'sums.')

  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  p = beam.Pipeline(options=pipeline_options)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  (p  # pylint: disable=expression-not-assigned
   | ReadFromText(known_args.input)
   | HourlyTeamScore(
       known_args.start_min, known_args.stop_min, known_args.window_duration)
   | WriteWindowedToBigQuery(
       known_args.table_name, known_args.dataset, configure_bigquery_write()))

  result = p.run()
  result.wait_until_finish()
Code Example #13
  def test_no_main_session(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = False
    self.update_options(options)

    self.assertEqual(
        [],
        dependency.stage_job_resources(options))
Code Example #14
 def test_from_dictionary(self):
   for case in PipelineOptionsTest.TEST_CASES:
     options = PipelineOptions(flags=case['flags'])
     all_options_dict = options.get_all_options()
     options_from_dict = PipelineOptions.from_dictionary(all_options_dict)
     self.assertEqual(options_from_dict.view_as(
         PipelineOptionsTest.MockOptions).mock_flag,
                      case['expected']['mock_flag'])
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_option,
                      case['expected']['mock_option'])
Code Example #15
 def test_no_temp_location(self):
   staging_dir = tempfile.mkdtemp()
   options = PipelineOptions()
   google_cloud_options = options.view_as(GoogleCloudOptions)
   google_cloud_options.staging_location = staging_dir
   self.update_options(options)
   google_cloud_options.temp_location = None
   with self.assertRaises(RuntimeError) as cm:
     dependency.stage_job_resources(options)
   self.assertEqual('The --temp_location option must be specified.',
                    cm.exception.message)
Code Example #16
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--kind',
                      dest='kind',
                      required=True,
                      help='Datastore Kind')
  parser.add_argument('--namespace',
                      dest='namespace',
                      help='Datastore Namespace')
  parser.add_argument('--ancestor',
                      dest='ancestor',
                      default='root',
                      help='The ancestor key name for all entities.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--read_only',
                      action='store_true',
                      help='Read an existing dataset, do not write first')
  parser.add_argument('--num_shards',
                      dest='num_shards',
                      type=int,
                      # If the system should choose automatically.
                      default=0,
                      help='Number of output shards')

  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)

  # Write to Datastore if the `read_only` option is not specified.
  if not known_args.read_only:
    write_to_datastore(gcloud_options.project, known_args, pipeline_options)

  # Read entities from Datastore.
  result = read_from_datastore(gcloud_options.project, known_args,
                               pipeline_options)

  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  query_result = result.metrics().query(empty_lines_filter)
  if query_result['counters']:
    empty_lines_counter = query_result['counters'][0]
    logging.info('number of empty lines: %d', empty_lines_counter.committed)
Code Example #17
File: bigshuffle.py Project: amitsela/incubator-beam
def run(argv=None):
  # pylint: disable=expression-not-assigned

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      required=True,
                      help='Input file pattern to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file pattern to write results to.')
  parser.add_argument('--checksum_output',
                      help='Checksum output file pattern.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | ReadFromText(known_args.input, coder=beam.coders.BytesCoder())

  # Count the occurrences of each word.
  output = (lines
            | 'split' >> beam.Map(
                lambda x: (x[:10], x[10:99]))
            .with_output_types(beam.typehints.KV[str, str])
            | 'group' >> beam.GroupByKey()
            | 'format' >> beam.FlatMap(
                lambda (key, vals): ['%s%s' % (key, val) for val in vals]))

  # Write the output using a "Write" transform that has side effects.
  output | WriteToText(known_args.output)

  # Optionally write the input and output checksums.
  if known_args.checksum_output:
    input_csum = (lines
                  | 'input-csum' >> beam.Map(crc32line)
                  | 'combine-input-csum' >> beam.CombineGlobally(sum)
                  | 'hex-format' >> beam.Map(lambda x: '%x' % x))
    input_csum | 'write-input-csum' >> WriteToText(
        known_args.checksum_output + '-input')

    output_csum = (output
                   | 'output-csum' >> beam.Map(crc32line)
                   | 'combine-output-csum' >> beam.CombineGlobally(sum)
                   | 'hex-format-output' >> beam.Map(lambda x: '%x' % x))
    output_csum | 'write-output-csum' >> WriteToText(
        known_args.checksum_output + '-output')

  # Actually run the pipeline (all operations above are deferred).
  return p.run()
Code Example #18
def run(argv=None):
  """Runs the workflow counting the long words and short words separately."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output prefix for files to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  lines = p | ReadFromText(known_args.input)

  # with_outputs allows accessing the side outputs of a DoFn.
  split_lines_result = (lines
                        | beam.ParDo(SplitLinesToWordsFn()).with_outputs(
                            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS,
                            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
                            main='words'))

  # split_lines_result is an object of type DoOutputsTuple. It supports
  # accessing result in alternative ways.
  words, _, _ = split_lines_result
  short_words = split_lines_result[
      SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS]
  character_count = split_lines_result.tag_character_count

  # pylint: disable=expression-not-assigned
  (character_count
   | 'pair_with_key' >> beam.Map(lambda x: ('chars_temp_key', x))
   | beam.GroupByKey()
   | 'count chars' >> beam.Map(lambda (_, counts): sum(counts))
   | 'write chars' >> WriteToText(known_args.output + '-chars'))

  # pylint: disable=expression-not-assigned
  (short_words
   | 'count short words' >> CountWords()
   | 'write short words' >> WriteToText(
       known_args.output + '-short-words'))

  # pylint: disable=expression-not-assigned
  (words
   | 'count words' >> CountWords()
   | 'write words' >> WriteToText(known_args.output + '-words'))

  return p.run()
Code Example #19
File: dependency_test.py Project: qq840873731/beam-1
    def test_with_main_session(self):
        staging_dir = tempfile.mkdtemp()
        options = PipelineOptions()

        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        options.view_as(SetupOptions).save_main_session = True
        self.update_options(options)

        self.assertEqual([names.PICKLED_MAIN_SESSION_FILE],
                         dependency.stage_job_resources(options))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
Code Example #20
 def test_requirements_file_not_present(self):
   staging_dir = tempfile.mkdtemp()
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).requirements_file = 'nosuchfile'
     dependency.stage_job_resources(
         options, populate_requirements_cache=self.populate_requirements_cache)
   self.assertEqual(
       cm.exception.message,
       'The file %s cannot be found. It was specified in the '
       '--requirements_file command line option.' % 'nosuchfile')
Code Example #21
  def test_sdk_location_gcs(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
    self.override_file_copy(sdk_location, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
Code Example #22
 def test_from_dictionary(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         all_options_dict = options.get_all_options()
         options_from_dict = PipelineOptions.from_dictionary(
             all_options_dict)
         self.assertEqual(
             options_from_dict.view_as(
                 PipelineOptionsTest.MockOptions).mock_flag,
             case['expected']['mock_flag'])
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_option,
             case['expected']['mock_option'])
Code Example #23
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' %
                                           (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.committed)
Code Example #24
  def test_with_extra_packages_missing_files(self):
    staging_dir = tempfile.mkdtemp()
    with self.assertRaises(RuntimeError) as cm:

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

      dependency.stage_job_resources(options)
    self.assertEqual(
        cm.exception.message,
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
Code Example #25
    def test_sdk_location_local_not_present(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = 'nosuchdir'
        with self.assertRaises(RuntimeError) as cm:
            options = PipelineOptions()
            options.view_as(GoogleCloudOptions).staging_location = staging_dir
            self.update_options(options)
            options.view_as(SetupOptions).sdk_location = sdk_location

            dependency.stage_job_resources(options)
        self.assertEqual(
            'The file "%s" cannot be found. Its '
            'location was specified by the --sdk_location command-line option.'
            % sdk_location, cm.exception.message)
Code Example #26
  def test_with_main_session(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = True
    self.update_options(options)

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
Code Example #27
    def test_setup_file_not_present(self):
        staging_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = 'nosuchfile'

        with self.assertRaises(RuntimeError) as cm:
            dependency.stage_job_resources(options)
        self.assertEqual(
            cm.exception.message,
            'The file %s cannot be found. It was specified in the '
            '--setup_file command line option.' % 'nosuchfile')
Code Example #28
  def test_experiments(self):
    options = PipelineOptions(['--experiment', 'abc', '--experiment', 'def'])
    self.assertEqual(
        sorted(options.get_all_options()['experiments']), ['abc', 'def'])

    options = PipelineOptions(['--experiments', 'abc', '--experiments', 'def'])
    self.assertEqual(
        sorted(options.get_all_options()['experiments']), ['abc', 'def'])

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['experiments'], None)
Code Example #29
File: datastore_wordcount.py Project: deeareyou/beam
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--kind',
                      dest='kind',
                      required=True,
                      help='Datastore Kind')
  parser.add_argument('--namespace',
                      dest='namespace',
                      help='Datastore Namespace')
  parser.add_argument('--ancestor',
                      dest='ancestor',
                      default='root',
                      help='The ancestor key name for all entities.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--read_only',
                      action='store_true',
                      help='Read an existing dataset, do not write first')
  parser.add_argument('--num_shards',
                      dest='num_shards',
                      type=int,
                      # If the system should choose automatically.
                      default=0,
                      help='Number of output shards')

  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)

  # Write to Datastore if the `read_only` option is not specified.
  if not known_args.read_only:
    write_to_datastore(gcloud_options.project, known_args, pipeline_options)

  # Read entities from Datastore.
  result = read_from_datastore(gcloud_options.project, known_args,
                               pipeline_options)

  result.metrics().query()
Code Example #30
File: snippets.py Project: vikkyrk/incubator-beam
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""

  from apache_beam import Pipeline
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
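
The Dataflow settings that the snippet above assigns programmatically through view_as can equally be supplied as command-line flags when constructing PipelineOptions, which is the style most of the run() entry points on this page use. A minimal sketch with placeholder project and bucket names:

from apache_beam.utils.pipeline_options import PipelineOptions

# Placeholder values for illustration only.
options = PipelineOptions([
    '--runner=DataflowRunner',
    '--project=my-project-id',
    '--job_name=myjob',
    '--staging_location=gs://my-bucket/binaries',
    '--temp_location=gs://my-bucket/temp',
])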
Code Example #31
File: wordcount.py Project: amitsela/incubator-beam
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  class WordcountOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_value_provider_argument(
          '--input',
          dest='input',
          default='gs://dataflow-samples/shakespeare/kinglear.txt',
          help='Input file to process.')
      parser.add_value_provider_argument(
          '--output',
          dest='output',
          required=True,
          help='Output file to write results to.')
  pipeline_options = PipelineOptions(argv)
  wordcount_options = pipeline_options.view_as(WordcountOptions)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(wordcount_options.input)

  # Count the occurrences of each word.
  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(wordcount_options.output)

  # Actually run the pipeline (all operations above are deferred).
  result = p.run()
  result.wait_until_finish()
  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  query_result = result.metrics().query(empty_lines_filter)
  if query_result['counters']:
    empty_lines_counter = query_result['counters'][0]
    logging.info('number of empty lines: %d', empty_lines_counter.committed)
Code Example #32
  def test_sdk_location_local_not_present(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'nosuchdir'
    with self.assertRaises(RuntimeError) as cm:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).sdk_location = sdk_location

      dependency.stage_job_resources(options)
    self.assertEqual(
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location,
        cm.exception.message)
Code Example #33
  def test_redefine_options(self):

    class TestRedefinedOptios(PipelineOptions):  # pylint: disable=unused-variable

      @classmethod
      def _add_argparse_args(cls, parser):
        parser.add_argument('--redefined_flag', action='store_true')

    class TestRedefinedOptios(PipelineOptions):

      @classmethod
      def _add_argparse_args(cls, parser):
        parser.add_argument('--redefined_flag', action='store_true')

    options = PipelineOptions(['--redefined_flag'])
    self.assertTrue(options.get_all_options()['redefined_flag'])
Code Example #34
    def test_full_completion(self):
        # Create dummy file and close it.  Note that we need to do this because
        # Windows does not allow NamedTemporaryFiles to be reopened elsewhere
        # before the temporary file is closed.
        dummy_file = tempfile.NamedTemporaryFile(delete=False)
        dummy_file_name = dummy_file.name
        dummy_file.close()

        dummy_dir = tempfile.mkdtemp()

        remote_runner = DataflowRunner()
        pipeline = Pipeline(
            remote_runner,
            options=PipelineOptions([
                '--dataflow_endpoint=ignored',
                '--sdk_location=' + dummy_file_name, '--job_name=test-job',
                '--project=test-project', '--staging_location=' + dummy_dir,
                '--temp_location=/dev/null',
                '--template_location=' + dummy_file_name, '--no_auth=True'
            ]))

        pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
        pipeline.run().wait_until_finish()
        with open(dummy_file_name) as template_file:
            saved_job_dict = json.load(template_file)
            self.assertEqual(
                saved_job_dict['environment']['sdkPipelineOptions']['options']
                ['project'], 'test-project')
            self.assertEqual(
                saved_job_dict['environment']['sdkPipelineOptions']['options']
                ['job_name'], 'test-job')
Code Example #35
 def test_dataflow_job_file_and_template_location_mutually_exclusive(self):
     runner = MockRunners.OtherRunner()
     options = PipelineOptions(
         ['--template_location', 'abc', '--dataflow_job_file', 'def'])
     validator = PipelineOptionsValidator(options, runner)
     errors = validator.validate()
     self.assertTrue(errors)
Code Example #36
File: preprocess.py Project: larry-fuy/kaggle-fish
def run(in_args=None):
    """Runs the pre-processing pipeline."""

    pipeline_options = PipelineOptions.from_dictionary(vars(in_args))
    p = beam.Pipeline(options=pipeline_options)
    configure_pipeline(p, in_args)
    p.run()
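
PipelineOptions.from_dictionary, used above with vars(in_args) (the attributes of an argparse Namespace), rebuilds an options object from a plain mapping of option names to values. A small sketch with assumed values:

from apache_beam.utils.pipeline_options import PipelineOptions

# Assumed option values for illustration.
pipeline_options = PipelineOptions.from_dictionary({
    'runner': 'DirectRunner',
    'temp_location': '/tmp/beam-temp',
})
print(pipeline_options.get_all_options()['runner'])  # 'DirectRunner'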
Code Example #37
  def test_streaming(self):
    pipeline_options = PipelineOptions(['--streaming'])
    runner = MockRunners.TestDataflowRunner()
    validator = PipelineOptionsValidator(pipeline_options, runner)
    errors = validator.validate()

    self.assertIn('Streaming pipelines are not supported.', errors)
Code Example #38
  def test_redefine_options(self):

    class TestRedefinedOptios(PipelineOptions):  # pylint: disable=unused-variable

      @classmethod
      def _add_argparse_args(cls, parser):
        parser.add_argument('--redefined_flag', action='store_true')

    class TestRedefinedOptios(PipelineOptions):

      @classmethod
      def _add_argparse_args(cls, parser):
        parser.add_argument('--redefined_flag', action='store_true')

    options = PipelineOptions(['--redefined_flag'])
    self.assertEqual(options.get_all_options()['redefined_flag'], True)
Code Example #39
def run(argv=None):
    """Main entry point; defines and runs the user_score pipeline."""
    parser = argparse.ArgumentParser()

    # The default maps to two large Google Cloud Storage files (each ~12GB)
    # holding two subsequent days' worth (roughly) of data.
    parser.add_argument('--input',
                        dest='input',
                        default='gs://dataflow-samples/game/gaming_data*.csv',
                        help='Path to the data file(s) containing game data.')
    parser.add_argument('--dataset',
                        dest='dataset',
                        required=True,
                        help='BigQuery Dataset to write tables to. '
                        'Must already exist.')
    parser.add_argument(
        '--table_name',
        dest='table_name',
        default='user_score',
        help='The BigQuery table name. Should not already exist.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=pipeline_options)

    (p  # pylint: disable=expression-not-assigned
     | ReadFromText(
         known_args.input)  # Read events from a file and parse them.
     | UserScore()
     | WriteToBigQuery(known_args.table_name, known_args.dataset,
                       configure_bigquery_write()))

    result = p.run()
    result.wait_until_finish()
Code Example #40
 def test_with_extra_packages_invalid_file_name(self):
   staging_dir = tempfile.mkdtemp()
   source_dir = tempfile.mkdtemp()
   self.create_temp_file(
       os.path.join(source_dir, 'abc.tgz'), 'nothing')
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).extra_packages = [
         os.path.join(source_dir, 'abc.tgz')]
     dependency.stage_job_resources(options)
   self.assertEqual(
       cm.exception.message,
       'The --extra_package option expects a full path ending with ".tar" or '
       '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Code Example #41
 def test_with_extra_packages_invalid_file_name(self):
     staging_dir = tempfile.mkdtemp()
     source_dir = tempfile.mkdtemp()
     self.create_temp_file(os.path.join(source_dir, 'abc.tgz'), 'nothing')
     with self.assertRaises(RuntimeError) as cm:
         options = PipelineOptions()
         options.view_as(GoogleCloudOptions).staging_location = staging_dir
         self.update_options(options)
         options.view_as(SetupOptions).extra_packages = [
             os.path.join(source_dir, 'abc.tgz')
         ]
         dependency.stage_job_resources(options)
     self.assertEqual(
         cm.exception.message,
         'The --extra_package option expects a full path ending with ".tar" or '
         '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Code Example #42
    def test_sdk_location_default(self):
        staging_dir = tempfile.mkdtemp()
        expected_from_url = 'pypi'
        expected_from_path = self.override_pypi_download(
            expected_from_url, staging_dir)
        self.override_file_copy(expected_from_path, staging_dir)

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = 'default'

        self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
                         dependency.stage_job_resources(
                             options,
                             file_copy=dependency._dependency_file_copy))
Code Example #43
    def test_with_extra_packages_missing_files(self):
        staging_dir = tempfile.mkdtemp()
        with self.assertRaises(RuntimeError) as cm:

            options = PipelineOptions()
            options.view_as(GoogleCloudOptions).staging_location = staging_dir
            self.update_options(options)
            options.view_as(SetupOptions).extra_packages = [
                'nosuchfile.tar.gz'
            ]

            dependency.stage_job_resources(options)
        self.assertEqual(
            cm.exception.message,
            'The file %s cannot be found. It was specified in the '
            '--extra_packages command line option.' % 'nosuchfile.tar.gz')
Code Example #44
    def __init__(self,
                 runner=None,
                 options=None,
                 argv=None,
                 is_integration_test=False,
                 blocking=True):
        """Initialize a pipeline object for test.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the pipeline job.
      argv: A list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.
      is_integration_test: True if the test is an integration test, False
        otherwise.
      blocking: Run method will wait until pipeline execution is completed.

    Raises:
      ValueError: if either the runner or options argument is not of the
      expected type.
    """
        self.is_integration_test = is_integration_test
        self.options_list = self._parse_test_option_args(argv)
        self.blocking = blocking
        if options is None:
            options = PipelineOptions(self.options_list)
        super(TestPipeline, self).__init__(runner, options)
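
A hypothetical usage sketch for the constructor documented above. When no options object is given, TestPipeline builds one from the argv list, and extra pipeline flags can be passed through the --test-pipeline-options argument; the import path below matches the SDK generation used on this page and may differ in later releases.

import apache_beam as beam
from apache_beam.test_pipeline import TestPipeline  # moved under apache_beam.testing in later SDKs

# Everything after --test-pipeline-options is parsed into PipelineOptions.
p = TestPipeline(argv=['--test-pipeline-options=--runner=DirectRunner'])
(p | beam.Create([1, 2, 3])
   | beam.Map(lambda x: x * x))  # pylint: disable=expression-not-assigned
p.run()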
Code Example #45
File: snippets.py Project: qq840873731/beam-1
def model_textio(renames):
    """Using a Read and Write transform to read/write text files."""
    def filter_words(x):
        import re
        return re.findall(r'[A-Za-z\']+', x)

    import apache_beam as beam
    from apache_beam.utils.pipeline_options import PipelineOptions

    # [START model_textio_read]
    p = beam.Pipeline(options=PipelineOptions())
    # [START model_pipelineio_read]
    lines = p | 'ReadFromText' >> beam.io.ReadFromText('path/to/input-*.csv')
    # [END model_pipelineio_read]
    # [END model_textio_read]

    # [START model_textio_write]
    filtered_words = lines | 'FilterWords' >> beam.FlatMap(filter_words)
    # [START model_pipelineio_write]
    filtered_words | 'WriteToText' >> beam.io.WriteToText(
        '/path/to/numbers', file_name_suffix='.csv')
    # [END model_pipelineio_write]
    # [END model_textio_write]

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run().wait_until_finish()
Code Example #46
File: bigquery_test.py Project: wikier/beam
 def test_table_schema_without_project(self):
     # Writer should pick executing project by default.
     sink = beam.io.BigQuerySink(table='mydataset.mytable')
     options = PipelineOptions(flags=['--project', 'myproject'])
     sink.pipeline_options = options
     writer = sink.writer()
     self.assertEquals('myproject', writer.project_id)
Code Example #47
File: snippets_test.py Project: wileeam/beam
    def test_bad_types(self):
        # [START type_hints_missing_define_numbers]
        p = TestPipeline(options=PipelineOptions(pipeline_type_check=True))

        numbers = p | beam.Create(['1', '2', '3'])
        # [END type_hints_missing_define_numbers]

        # Consider the following code.
        # pylint: disable=expression-not-assigned
        # pylint: disable=unused-variable
        # [START type_hints_missing_apply]
        evens = numbers | beam.Filter(lambda x: x % 2 == 0)
        # [END type_hints_missing_apply]

        # Now suppose numbers was defined as [snippet above].
        # When running this pipeline, you'd get a runtime error,
        # possibly on a remote machine, possibly very late.

        with self.assertRaises(TypeError):
            p.run()

        # To catch this early, we can assert what types we expect.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_takes]
            evens = numbers | beam.Filter(
                lambda x: x % 2 == 0).with_input_types(int)
            # [END type_hints_takes]

        # Type hints can be declared on DoFns and callables as well, rather
        # than where they're used, to be more self contained.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_do_fn]
            @beam.typehints.with_input_types(int)
            class FilterEvensDoFn(beam.DoFn):
                def process(self, element):
                    if element % 2 == 0:
                        yield element

            evens = numbers | beam.ParDo(FilterEvensDoFn())
            # [END type_hints_do_fn]

        words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
        # One can assert outputs and apply them to transforms as well.
        # Helps document the contract and checks it at pipeline construction time.
        # [START type_hints_transform]
        T = beam.typehints.TypeVariable('T')

        @beam.typehints.with_input_types(T)
        @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
        class MyTransform(beam.PTransform):
            def expand(self, pcoll):
                return pcoll | beam.Map(lambda x: (len(x), x))

        words_with_lens = words | MyTransform()
        # [END type_hints_transform]

        # pylint: disable=expression-not-assigned
        with self.assertRaises(typehints.TypeCheckError):
            words_with_lens | beam.Map(lambda x: x).with_input_types(
                beam.typehints.Tuple[int, int])
Code Example #48
File: display_test.py Project: sbilac/incubator-beam
 def test_create_list_display_data(self):
   flags = ['--extra_package', 'package1', '--extra_package', 'package2']
   pipeline_options = PipelineOptions(flags=flags)
   items = DisplayData.create_from_options(pipeline_options).items
   hc.assert_that(items, hc.contains_inanyorder(
       DisplayDataItemMatcher('extra_packages',
                              str(['package1', 'package2']))))
Code Example #49
def run(argv=None):
    known_args, pipeline_args = get_args(argv)
    options = PipelineOptions(pipeline_args)

    run_count1(known_args, options)
    run_count2(known_args, options)
    run_count3(known_args, options)
Code Example #50
def run(argv=None):
    """Runs the debugging wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection, count the occurrences of
    # each word and filter by a list of words.
    filtered_words = (
        p | 'read' >> ReadFromText(known_args.input)
        | CountWords()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

    # assert_that is a convenient PTransform that checks a PCollection has an
    # expected value. Asserts are best used in unit tests with small data sets but
    # are demonstrated here as a teaching tool.
    #
    # Note assert_that does not provide any output and that successful completion
    # of the Pipeline implies that the expectations were met. Learn more at
    # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to
    # test your pipeline.
    beam.assert_that(filtered_words,
                     beam.equal_to([('Flourish', 3), ('stomach', 1)]))

    # Format the counts into a PCollection of strings and write the output using a
    # "Write" transform that has side effects.
    # pylint: disable=unused-variable
    output = (filtered_words
              | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))
              | 'write' >> WriteToText(known_args.output))

    # Actually run the pipeline (all operations above are deferred).
    p.run().wait_until_finish()
Code Example #51
  def test_with_extra_packages(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz2.tar'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'whl.whl'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        os.path.join(source_dir, 'xyz2.tar'),
        os.path.join(source_dir, 'whl.whl'),
        'gs://my-gcs-bucket/gcs.tar.gz']

    gcs_copied_files = []

    def file_copy(from_path, to_path):
      if from_path.startswith('gs://'):
        gcs_copied_files.append(from_path)
        _, from_name = os.path.split(from_path)
        self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
        logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
      elif to_path.startswith('gs://'):
        logging.info('Faking file_copy(%s, %s)', from_path, to_path)
      else:
        shutil.copyfile(from_path, to_path)

    dependency._dependency_file_copy = file_copy

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl', 'gcs.tar.gz',
         dependency.EXTRA_PACKAGES_FILE],
        dependency.stage_job_resources(options))
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n',
                        'whl.whl\n', 'gcs.tar.gz\n'], f.readlines())
    self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
Code Example #52
def run(argv=None):
  """Runs the debugging wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection, count the occurrences of
  # each word and filter by a list of words.
  filtered_words = (
      p | 'read' >> ReadFromText(known_args.input)
      | CountWords()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

  # assert_that is a convenient PTransform that checks a PCollection has an
  # expected value. Asserts are best used in unit tests with small data sets but
  # are demonstrated here as a teaching tool.
  #
  # Note assert_that does not provide any output and that successful completion
  # of the Pipeline implies that the expectations were met. Learn more at
  # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to
  # test your pipeline.
  beam.assert_that(
      filtered_words, beam.equal_to([('Flourish', 3), ('stomach', 1)]))

  # Format the counts into a PCollection of strings and write the output using a
  # "Write" transform that has side effects.
  # pylint: disable=unused-variable
  output = (filtered_words
            | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))
            | 'write' >> WriteToText(known_args.output))

  # Actually run the pipeline (all operations above are deferred).
  p.run().wait_until_finish()
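CountWords and FilterTextFn are assumed to be defined elsewhere in the same module and are not shown in this snippet. As a simplified, illustrative sketch only (the example's real FilterTextFn also records matched/unmatched word counts), a filtering DoFn of this shape could be written as follows, using the DoFn.process(element) signature:

import re

import apache_beam as beam


class FilterTextFn(beam.DoFn):
  """Simplified sketch: keep (word, count) pairs whose word matches pattern."""

  def __init__(self, pattern):
    super(FilterTextFn, self).__init__()
    self.pattern = pattern

  def process(self, element):
    word, _ = element
    if re.match(self.pattern, word):
      # Emit only the matching pairs; all other elements are dropped.
      yield element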
Code Example #53
  def test_validate_dataflow_job_file(self):
    runner = MockRunners.OtherRunner()
    options = PipelineOptions([
        '--dataflow_job_file', 'abc'
    ])
    validator = PipelineOptionsValidator(options, runner)
    errors = validator.validate()
    self.assertFalse(errors)
Code Example #54
File: bigquery_test.py Project: wikier/beam
  def test_table_schema_without_project(self):
    # The reader should pick up the executing project by default.
    source = beam.io.BigQuerySource(table='mydataset.mytable')
    options = PipelineOptions(flags=['--project', 'myproject'])
    source.pipeline_options = options
    reader = source.reader()
    self.assertEqual('SELECT * FROM [myproject:mydataset.mytable];',
                     reader.query)
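By contrast, a table reference that already names its project does not depend on the pipeline's --project option. A small sketch under the same assumptions as the test above, with 'otherproject' as a placeholder:

import apache_beam as beam

# Fully qualified 'project:dataset.table' form; the reader would then query
# [otherproject:mydataset.mytable] regardless of the executing project.
source = beam.io.BigQuerySource(table='otherproject:mydataset.mytable')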
Code Example #55
  def test_sdk_location_default(self):
    staging_dir = tempfile.mkdtemp()
    expected_from_url = 'pypi'
    expected_from_path = self.override_pypi_download(
        expected_from_url, staging_dir)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    self.assertEqual(
        [names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(
            options,
            file_copy=dependency._dependency_file_copy))
Code Example #56
  def test_validate_template_location(self):
    runner = MockRunners.OtherRunner()
    options = PipelineOptions([
        '--template_location', 'abc',
    ])
    validator = PipelineOptionsValidator(options, runner)
    errors = validator.validate()
    self.assertFalse(errors)
Code Example #57
File: snippets_test.py Project: wileeam/beam
  def test_runtime_checks_on(self):
    # pylint: disable=expression-not-assigned
    p = TestPipeline(options=PipelineOptions(runtime_type_check=True))
    with self.assertRaises(typehints.TypeCheckError):
      # [START type_hints_runtime_on]
      p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
      p.run()
      # [END type_hints_runtime_on]
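The test above expects a TypeCheckError because the Map emits ints while declaring str output. A passing counterpart, sketched here rather than taken from the test file, simply declares an output type that matches what the lambda returns:

import apache_beam as beam
from apache_beam.utils.pipeline_options import PipelineOptions

# pylint: disable=expression-not-assigned
p = beam.Pipeline(options=PipelineOptions(runtime_type_check=True))
# len() returns an int, matching the declared output type, so the runtime
# type check succeeds.
p | beam.Create(['a']) | beam.Map(lambda x: len(x)).with_output_types(int)
p.run()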
Code Example #58
def model_bigqueryio():
    """Using a Read and Write transform to read/write to BigQuery."""
    import apache_beam as beam
    from apache_beam.utils.pipeline_options import PipelineOptions

    # [START model_bigqueryio_read]
    p = beam.Pipeline(options=PipelineOptions())
    weather_data = p | beam.io.Read(
        'ReadWeatherStations',
        beam.io.BigQuerySource(
            'clouddataflow-readonly:samples.weather_stations'))
    # [END model_bigqueryio_read]

    # [START model_bigqueryio_query]
    p = beam.Pipeline(options=PipelineOptions())
    weather_data = p | beam.io.Read(
        'ReadYearAndTemp',
        beam.io.BigQuerySource(
            query='SELECT year, mean_temp FROM samples.weather_stations'))
    # [END model_bigqueryio_query]

    # [START model_bigqueryio_query_standard_sql]
    p = beam.Pipeline(options=PipelineOptions())
    weather_data = p | beam.io.Read(
        'ReadYearAndTemp',
        beam.io.BigQuerySource(
            query='SELECT year, mean_temp FROM `samples.weather_stations`',
            use_standard_sql=True))
    # [END model_bigqueryio_query_standard_sql]

    # [START model_bigqueryio_schema]
    schema = 'source:STRING, quote:STRING'
    # [END model_bigqueryio_schema]

    # [START model_bigqueryio_write]
    quotes = p | beam.Create([{
        'source': 'Mahatma Gandhi',
        'quote': 'My life is my message.'
    }])
    quotes | beam.io.Write(
        'Write',
        beam.io.BigQuerySink(
            'my-project:output.output_table',
            schema=schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
    # [END model_bigqueryio_write]
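The schema above uses the compact 'name:TYPE' string form. When fields need explicit modes or nesting, BigQuerySink also accepts a TableSchema object; the sketch below assumes the apache_beam.io.gcp.internal.clients.bigquery module path of later Beam releases, so it may not match this snippet's SDK version exactly.

from apache_beam.io.gcp.internal.clients import bigquery

table_schema = bigquery.TableSchema()

# One TableFieldSchema per column; 'mode' marks a field NULLABLE or REQUIRED.
source_field = bigquery.TableFieldSchema()
source_field.name = 'source'
source_field.type = 'STRING'
source_field.mode = 'NULLABLE'
table_schema.fields.append(source_field)

quote_field = bigquery.TableFieldSchema()
quote_field.name = 'quote'
quote_field.type = 'STRING'
quote_field.mode = 'REQUIRED'
table_schema.fields.append(quote_field)

# table_schema could then replace the string above as the schema argument of
# beam.io.BigQuerySink.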
Code Example #59
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFns in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' %
                                           (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    result.wait_until_finish()
    empty_line_values = result.aggregated_values(empty_line_aggregator)
    logging.info('number of empty lines: %d', sum(empty_line_values.values()))
    word_length_values = result.aggregated_values(average_word_size_aggregator)
    logging.info('average word lengths: %s', word_length_values.values())
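empty_line_aggregator and average_word_size_aggregator are module-level objects that this snippet does not define. In the wordcount example of this SDK generation they were declared with the since-removed beam.Aggregator API, roughly as sketched below (later SDKs replaced aggregators with the Metrics API):

import apache_beam as beam

# Historical sketch of the module-level aggregator declarations used by the
# example; treat the exact names and signature as an assumption.
empty_line_aggregator = beam.Aggregator('emptyLines')
average_word_size_aggregator = beam.Aggregator('averageWordLength',
                                               beam.combiners.MeanCombineFn(),
                                               float)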
Code Example #60
    def test_sdk_location_local(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = tempfile.mkdtemp()
        self.create_temp_file(
            os.path.join(sdk_location, names.DATAFLOW_SDK_TARBALL_FILE),
            'contents')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
                         dependency.stage_job_resources(options))
        tarball_path = os.path.join(staging_dir,
                                    names.DATAFLOW_SDK_TARBALL_FILE)
        with open(tarball_path) as f:
            self.assertEqual(f.read(), 'contents')