import apache_beam as beam
from apache_beam.options.pipeline_options import (
    GoogleCloudOptions,
    PipelineOptions,
    SetupOptions,
    StandardOptions,
)


def run():
    options = PipelineOptions()

    input_ = 'gs://tempgcpbucket1/entries.csv'
    output_ = 'gs://tempgcpbucket1/counts/'
    # options.input = input_
    # options.output = output_

    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'rk-playground'
    google_cloud_options.job_name = 'entriesjob'
    google_cloud_options.staging_location = 'gs://tempgcpbucket1/binaries'
    google_cloud_options.temp_location = 'gs://tempgcpbucket1/tmp'

    # We use the save_main_session option because one or more DoFns in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = True

    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = beam.Pipeline(options=options)

    (p  # pylint: disable=expression-not-assigned
        | 'ReadInputText' >> beam.io.ReadFromText(input_)
        | 'UserAggregate' >> UserAggregate()
        | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
        | 'WriteUserScoreSums' >> beam.io.WriteToText(output_))

    p.run()
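
# The composite transform `UserAggregate` and the formatter
# `format_user_score_sums` used above are presumably defined elsewhere in the
# original module. The sketch below is a hypothetical reconstruction, assuming
# each input line has the form "user,score"; the names and parsing logic are
# assumptions, not the author's actual code.
class UserAggregate(beam.PTransform):
    """Hypothetical composite transform: sums scores per user from CSV lines."""

    def expand(self, lines):
        return (
            lines
            | 'SplitFields' >> beam.Map(lambda line: line.split(','))
            | 'ToKeyValue' >> beam.Map(lambda fields: (fields[0], int(fields[1])))
            | 'SumPerUser' >> beam.CombinePerKey(sum))


def format_user_score_sums(user_score):
    """Hypothetical formatter: renders a (user, total_score) pair as a text line."""
    user, total = user_score
    return '%s: %d' % (user, total)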
Example #2
  def test_model_bigqueryio(self):
    # We cannot test BigQueryIO functionality in unit tests, so we limit
    # ourselves to making sure that a pipeline containing BigQuery sources and
    # sinks can be built.
    #
    # To run locally, set `run_locally` to `True`. You will also have to point
    # `project`, `dataset`, and `table` at the BigQuery table the test should
    # write to.
    run_locally = False
    if run_locally:
      project = 'my-project'
      dataset = 'samples'  # this must already exist
      table = 'model_bigqueryio'  # this will be created if needed

      options = PipelineOptions().view_as(GoogleCloudOptions)
      options.project = project
      with beam.Pipeline(options=options) as p:
        snippets.model_bigqueryio(p, project, dataset, table)
    else:
      p = TestPipeline()
      snippets.model_bigqueryio(p)
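
  # The test above only checks that a pipeline containing BigQuery sources and
  # sinks can be built. `snippets.model_bigqueryio` is part of the Beam example
  # snippets; the method below is a hypothetical stand-in (not its actual
  # implementation) sketching the kind of pipeline such a snippet wires up. It
  # assumes `import apache_beam as beam` is in scope, uses placeholder query and
  # schema values, and relies on beam.io.ReadFromBigQuery / WriteToBigQuery as
  # available in recent Beam releases.
  def model_bigqueryio_sketch(self, pipeline, project, dataset, table):
    table_spec = '%s:%s.%s' % (project, dataset, table)
    rows = (
        pipeline
        | 'ReadFromBigQuery' >> beam.io.ReadFromBigQuery(
            # Placeholder query over a hypothetical source table.
            query='SELECT user, score FROM `%s.%s.source_table`' % (
                project, dataset),
            use_standard_sql=True))
    _ = rows | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
        table_spec,
        schema='user:STRING,score:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)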