import apache_beam as beam
from apache_beam.options.pipeline_options import (
    GoogleCloudOptions,
    PipelineOptions,
    SetupOptions,
    StandardOptions,
)


def run():
    options = PipelineOptions()
    input_ = 'gs://tempgcpbucket1/entries.csv'
    output_ = 'gs://tempgcpbucket1/counts/'
    # options.input = input_
    # options.output = output_

    # Configure the Google Cloud-specific options on a dedicated view so
    # that `options` itself still refers to the full option set below.
    gcloud_options = options.view_as(GoogleCloudOptions)
    gcloud_options.project = 'rk-playground'
    gcloud_options.job_name = 'entriesjob'
    gcloud_options.staging_location = 'gs://tempgcpbucket1/binaries'
    gcloud_options.temp_location = 'gs://tempgcpbucket1/tmp'

    # We use the save_main_session option because one or more DoFns in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = beam.Pipeline(options=options)
    (p  # pylint: disable=expression-not-assigned
     | 'ReadInputText' >> beam.io.ReadFromText(input_)
     | 'UserAggregate' >> UserAggregate()
     | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
     | 'WriteUserScoreSums' >> beam.io.WriteToText(output_))
    p.run()
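# UserAggregate and format_user_score_sums are defined elsewhere in the
# pipeline module. A minimal sketch of what they might look like, assuming a
# hypothetical entries.csv layout of `user,score` per line:


class UserAggregate(beam.PTransform):
    """Hypothetical composite transform: sums the score column per user."""

    def expand(self, pcoll):
        return (pcoll
                | 'ParseEntry' >> beam.Map(
                    lambda line: (line.split(',')[0], int(line.split(',')[1])))
                | 'SumPerUser' >> beam.CombinePerKey(sum))


def format_user_score_sums(user_score):
    # Render one (user, total_score) pair as a line of output text.
    (user, total_score) = user_score
    return '%s: %d' % (user, total_score)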
import apache_beam as beam
from apache_beam.options.pipeline_options import (
    GoogleCloudOptions,
    PipelineOptions,
    SetupOptions,
)


def main():
    # Create the options; the setup file is shipped to the workers so that
    # the pipeline's dependencies install there.
    options = PipelineOptions()
    options.view_as(SetupOptions).setup_file = './setup.py'
    options.view_as(GoogleCloudOptions).job_name = 'gcs2gdrive'
    template_options = options.view_as(TemplateOptions)

    p = beam.Pipeline(options=options)
    (p
     | 'Read' >> beam.io.ReadFromText(template_options.input_csv)
     | 'Write' >> beam.ParDo(
         CopyFile(template_options.gdrive_directory_id,
                  template_options.service_account_file)))
    p.run()
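# TemplateOptions and CopyFile are defined elsewhere. A minimal sketch of the
# custom options class, assuming the pipeline is staged as a Dataflow template
# and therefore resolves its arguments through runtime value providers (the
# argument names mirror the attributes used above; the help texts are guesses):


class TemplateOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
        # add_value_provider_argument defers resolution to template
        # execution time instead of template creation time.
        parser.add_value_provider_argument(
            '--input_csv',
            help='GCS path of the CSV listing the files to copy.')
        parser.add_value_provider_argument(
            '--gdrive_directory_id',
            help='ID of the target Google Drive folder.')
        parser.add_value_provider_argument(
            '--service_account_file',
            help='Path to the service account JSON key file.')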