def get_replacement_transform_for_applied_ptransform(self, applied_ptransform):
  from apache_beam import pvalue
  from apache_beam.io import iobase

  transform = applied_ptransform.transform

  class Read(iobase.Read):
    override = True

    def expand(self, pbegin):
      return pvalue.PCollection(
          self.pipeline, is_bounded=self.source.is_bounded())

  return Read(transform.source).with_output_types(
      transform.get_type_hints().simple_output_type('Read'))
def test_read(self):
  schema = 'struct<a:int,b:struct<x:string,y:boolean>>'
  files = []
  with tempfile.NamedTemporaryFile() as f1, \
      tempfile.NamedTemporaryFile() as f2:
    files.append(f1.name)
    with pyorc.Writer(f1, schema) as writer:
      writer.write((1, ('x', True)))
    files.append(f2.name)
    with pyorc.Writer(f2, schema) as writer:
      writer.write((2, ('y', False)))
      writer.write((3, ('z', False)))
    with TestPipeline() as p:
      pc = (
          p
          | Read(
              FileSource(
                  file_patterns=files,
                  reader=OrcReader(
                      pyorc_options={'struct_repr': pyorc.StructRepr.DICT}))))
      assert_that(
          pc,
          equal_to([
              {'a': 1, 'b': {'x': 'x', 'y': True}},
              {'a': 2, 'b': {'x': 'y', 'y': False}},
              {'a': 3, 'b': {'x': 'z', 'y': False}},
          ]))
def test(self):
  SCHEMA = parse_table_schema_from_json(
      '{"fields": [{"name": "data", "type": "BYTES"}]}')

  def format_record(record):
    # SyntheticSource returns each record as a pair, so only the value part
    # is kept here.
    return {'data': base64.b64encode(record[1])}

  # pylint: disable=expression-not-assigned
  (
      self.pipeline
      | 'ProduceRows' >> Read(SyntheticSource(self.parseTestPipelineOptions()))
      | 'Format' >> Map(format_record)
      | 'WriteToBigQuery' >> WriteToBigQuery(
          self.output_dataset + '.' + self.output_table,
          schema=SCHEMA,
          create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=BigQueryDisposition.WRITE_EMPTY))
def get_replacement_transform(self, ptransform):
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import pvalue
  from apache_beam.io import iobase

  # This is purposely subclassed from the Read transform to take advantage of
  # the existing windowing, typing, and display data.
  class Read(iobase.Read):
    override = True

    def expand(self, pbegin):
      return pvalue.PCollection.from_(pbegin)

  # Use the source's coder type hint as this replacement's output. Otherwise,
  # the typing information is not properly forwarded to the DataflowRunner,
  # which will then choose the incorrect coder for this transform.
  return Read(ptransform.source).with_output_types(
      ptransform.source.coder.to_type_hint())
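# For context: get_replacement_transform is the hook that PTransformOverride
# subclasses implement; the runner calls it for every transform matched by
# matches(). A minimal sketch of how the method above might be wired up is
# shown below. The class name and the matches() condition are assumptions,
# not taken from the snippet above.
from apache_beam.io import iobase
from apache_beam.pipeline import PTransformOverride


class NativeReadOverride(PTransformOverride):  # hypothetical name
  def matches(self, applied_ptransform):
    # Assumed matcher: replace every application of the primitive Read.
    return isinstance(applied_ptransform.transform, iobase.Read)

  def get_replacement_transform(self, ptransform):
    # Body as in the snippet above.
    from apache_beam import pvalue

    class Read(iobase.Read):
      override = True

      def expand(self, pbegin):
        return pvalue.PCollection.from_(pbegin)

    return Read(ptransform.source).with_output_types(
        ptransform.source.coder.to_type_hint())


# A runner (or user code) would then apply the override before execution:
#   pipeline.replace_all([NativeReadOverride()])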
def test(self):
  def to_pubsub_message(element):
    import uuid
    from apache_beam.io import PubsubMessage
    return PubsubMessage(
        data=element[1],
        attributes={'id': str(uuid.uuid1()).encode('utf-8')},
    )

  _ = (
      self.pipeline
      | 'Create input' >> Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Format to pubsub message in bytes' >> beam.Map(to_pubsub_message)
      | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      | 'Write to Pubsub' >> beam.io.WriteToPubSub(
          self.topic_name,
          with_attributes=True,
          id_label='id',
      ))
def test(self):
  SCHEMA = parse_table_schema_from_json(
      '{"fields": [{"name": "data", "type": "BYTES"}]}')

  def format_record(record):
    # SyntheticSource returns each record as a pair, so only the value part
    # is kept here.
    return {'data': base64.b64encode(record[1])}

  (  # pylint: disable=expression-not-assigned
      self.pipeline
      | 'Produce rows' >> Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
      | 'Format' >> Map(format_record)
      | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
      | 'Write to BigQuery' >> WriteToBigQuery(
          dataset=self.output_dataset,
          table=self.output_table,
          schema=SCHEMA,
          create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
def test_root_transforms(self):
  class DummySource(iobase.BoundedSource):
    pass

  root_read = Read(DummySource())
  root_flatten = Flatten(pipeline=self.pipeline)

  pbegin = pvalue.PBegin(self.pipeline)
  pcoll_read = pbegin | 'read' >> root_read
  pcoll_read | FlatMap(lambda x: x)
  [] | 'flatten' >> root_flatten

  self.pipeline.visit(self.visitor)

  root_transforms = sorted(
      [t.transform for t in self.visitor.root_transforms])
  self.assertEqual(root_transforms, sorted([root_read, root_flatten]))

  pbegin_consumers = sorted(
      [c.transform for c in self.visitor.value_to_consumers[pbegin]])
  self.assertEqual(pbegin_consumers, sorted([root_read]))
  self.assertEqual(len(self.visitor.step_names), 3)
def _create_input_data(self):
  """Runs an additional pipeline which creates test data and waits for its
  completion."""
  SCHEMA = parse_table_schema_from_json(
      '{"fields": [{"name": "data", "type": "BYTES"}]}')

  def format_record(record):
    # SyntheticSource returns each record as a pair, so only the value part
    # is kept here.
    return {'data': base64.b64encode(record[1])}

  p = TestPipeline()
  # pylint: disable=expression-not-assigned
  (
      p
      | 'Produce rows' >> Read(SyntheticSource(self.parseTestPipelineOptions()))
      | 'Format' >> Map(format_record)
      | 'Write to BigQuery' >> WriteToBigQuery(
          dataset=self.input_dataset,
          table=self.input_table,
          schema=SCHEMA,
          create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=BigQueryDisposition.WRITE_EMPTY))
  p.run().wait_until_finish()
def test_read(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'read' >> Read(FakeSource([1, 2, 3]))
  assert_that(pcoll, equal_to([1, 2, 3]))
  pipeline.run()
def test(self):
  self.result = (
      self.pipeline
      | 'Read from BigQuery' >> Read(
          BigQuerySource(dataset=self.input_dataset, table=self.input_table))
      | 'Count' >> Count.Globally())
def run(argv=None, comments=None):
  """Run the beam pipeline.

  Args:
    argv: (optional) the command line flags to parse.
    comments: (optional) a list of comment JSON objects to process. Used in
      unit-tests to avoid requiring a BigQuery source.
  """
  args, pipeline_args = _parse_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  if comments is not None:
    comments = p | ("Read in-memory comments") >> beam.Create(comments)
  else:
    comments = p | ("Read " + args.reddit_table) >> Read(
        BigQuerySource(args.reddit_table))

  comments |= (
      "Normalise comments" >> beam.Map(
          partial(normalise_comment, max_length=args.max_length)))

  thread_id_to_comments = comments | (
      "Key by thread id" >> beam.Map(
          lambda comment: (comment.thread_id, comment)))
  threads = thread_id_to_comments | (
      "Group comments by thread ID" >> beam.GroupByKey())
  threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

  examples = threads | (
      "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
          partial(
              create_examples,
              parent_depth=args.parent_depth,
              min_length=args.min_length,
              format=args.dataset_format,
          )))
  examples = _shuffle(examples)

  examples |= "split train and test" >> beam.ParDo(
      _TrainTestSplitFn(train_split=args.train_split)).with_outputs(
          _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

  if args.dataset_format == _JSON_FORMAT:
    write_sink = WriteToText
    file_name_suffix = ".json"
    serialize_fn = json.dumps
  else:
    assert args.dataset_format == _TF_FORMAT
    write_sink = WriteToTFRecord
    file_name_suffix = ".tfrecord"
    serialize_fn = _features_to_serialized_tf_example

  for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                    ("test", _TrainTestSplitFn.TEST_TAG)]:
    serialized_examples = examples[tag] | (
        "serialize {} examples".format(name) >> beam.Map(serialize_fn))
    (
        serialized_examples
        | ("write " + name) >> write_sink(
            os.path.join(args.output_dir, name),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        ))

  result = p.run()
  result.wait_until_finish()
def run(argv=None, comments=None):
  """Run the beam pipeline.

  Args:
    argv: (optional) the command line flags to parse.
    comments: (optional) a list of comment JSON objects to process. Used in
      unit-tests to avoid requiring a BigQuery source.
  """
  args, pipeline_args = _parse_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  if comments is not None:
    comments = p | ("Read in-memory comments") >> beam.Create(comments)
  else:
    comments = p | ("Read " + args.reddit_table) >> Read(
        BigQuerySource(args.reddit_table))

  comments |= (
      "Normalise comments" >> beam.Map(
          partial(normalise_comment, max_length=args.max_length)))

  thread_id_to_comments = comments | (
      "Key by thread id" >> beam.Map(
          lambda comment: (comment.thread_id, comment)))
  threads = thread_id_to_comments | (
      "Group comments by thread ID" >> beam.GroupByKey())
  threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

  examples = threads | (
      "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
          partial(
              create_examples,
              parent_depth=args.parent_depth,
              min_length=args.min_length,
              format=args.dataset_format,
          )))
  examples = _shuffle(examples)

  # [START dataflow_molecules_split_to_train_and_eval_datasets]
  # Split the dataset into a training set and an evaluation set.
  assert 0 < (100 - args.train_split * 100) < 100, (
      'eval_percent must be in the range (0-100)')
  eval_percent = 100 - args.train_split * 100
  train_dataset, eval_dataset = (
      examples
      | 'Split dataset' >> beam.Partition(
          lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
  # [END dataflow_molecules_split_to_train_and_eval_datasets]

  if args.dataset_format == _JSON_FORMAT:
    write_sink = WriteToText
    file_name_suffix = ".json"
    serialize_fn = json.dumps

  serialized_train_examples = train_dataset | (
      "serialize {} examples".format('train') >> beam.Map(serialize_fn))
  (
      serialized_train_examples
      | ("write " + 'train') >> write_sink(
          os.path.join(args.output_dir, 'train'),
          file_name_suffix=file_name_suffix,
          num_shards=args.num_shards_train,
      ))

  serialized_test_examples = eval_dataset | (
      "serialize {} examples".format('valid') >> beam.Map(serialize_fn))
  (
      serialized_test_examples
      | ("write " + 'valid') >> write_sink(
          os.path.join(args.output_dir, 'valid'),
          file_name_suffix=file_name_suffix,
          num_shards=args.num_shards_train,
      ))

  result = p.run()
  result.wait_until_finish()
def test_fake_read(self):
  # FakeSource mock requires DirectRunner.
  pipeline = TestPipeline(runner='DirectRunner')
  pcoll = pipeline | 'read' >> Read(FakeSource([1, 2, 3]))
  assert_that(pcoll, equal_to([1, 2, 3]))
  pipeline.run()
def expand(self, pvalue):  # pylint: disable=arguments-differ
  return pvalue.pipeline | Read(self._source)
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  ###############################################
  # (1) Create the pipeline
  ###############################################
  # First build a PipelineOptions object. It holds options such as the
  # pipeline runner to execute the pipeline on and any runner-specific
  # settings.
  pipeline_options = PipelineOptions(pipeline_args)
  # Example of editing the PipelineOptions object directly: enable
  # save_main_session because the pipeline uses a DoFn transform.
  pipeline_options.view_as(SetupOptions).save_main_session = True
  # Create the pipeline (p5) from the options; it reads from BigQuery and
  # writes back to BigQuery.
  p5 = beam.Pipeline(options=pipeline_options)

  ###############################################
  # (2) Set up the transforms
  ###############################################
  # Transforms applied to p5.
  query = 'select * from babynames.names2012'
  query2 = 'select * from babynames.names2011'
  query_reslt1 = p5 | 'read1' >> Read(
      beam.io.BigQuerySource(
          project='gcp-project-210712', use_standard_sql=False, query=query))
  query_reslt2 = p5 | 'read2' >> Read(
      beam.io.BigQuerySource(
          project='gcp-project-210712', use_standard_sql=False, query=query2))
  branch1 = query_reslt1 | 'modifiy1' >> beam.Filter(modify_data1)
  branch2 = query_reslt2 | 'modifiy2' >> beam.Filter(modify_data1)
  # The full table schema has to be written out, otherwise the write fails.
  ((branch1, branch2)
   | beam.Flatten()
   | 'write' >> beam.io.Write(
       beam.io.BigQuerySink(
           'babynames.testtable3',
           schema='name:string, gender:string, count:integer',
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

  ###############################################
  # (3) Run the pipeline
  ###############################################
  result5 = p5.run()
  # Wait for the pipeline to finish; without this call the function returns
  # immediately and nothing after this point is processed. Note that with the
  # DataflowRunner, Ctrl-C does not stop the pipeline; it has to be cancelled
  # from the Google Cloud console.
  result5.wait_until_finish()
def expand(self, pvalue):
  return pvalue.pipeline | Read(self._source)
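# These one-line expand() methods assume an enclosing PTransform that wraps a
# source in self._source. A minimal sketch of such a wrapper, with an assumed
# class name and constructor, might look like this:
from apache_beam.io.iobase import Read
from apache_beam.transforms import PTransform


class ReadFromSource(PTransform):  # hypothetical wrapper name
  """Exposes a BoundedSource as a composite transform."""

  def __init__(self, source):
    super().__init__()
    self._source = source

  def expand(self, pvalue):
    # Delegate to the primitive Read transform, exactly as in the snippets
    # above and below.
    return pvalue.pipeline | Read(self._source)


# Usage (assumed): pcoll = p | ReadFromSource(my_bounded_source)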
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  ###############################################
  # (1) Create the pipelines
  ###############################################
  # First build a PipelineOptions object. It holds options such as the
  # pipeline runner to execute the pipeline on and any runner-specific
  # settings.
  pipeline_options = PipelineOptions(pipeline_args)
  # Example of editing the PipelineOptions object directly: enable
  # save_main_session because the pipeline uses a DoFn transform.
  pipeline_options.view_as(SetupOptions).save_main_session = True
  # Create the pipelines from the options.
  p = beam.Pipeline(options=pipeline_options)   # text in -> text out
  p2 = beam.Pipeline(options=pipeline_options)  # BigQuery in -> text out

  ###############################################
  # (2) Set up the transforms
  ###############################################
  # Transforms applied to p.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Sample function used before writing the result to a file.
  def add(line):
    num = int(line.strip())
    return num * 3

  counts = lines | 'add' >> beam.Map(add)
  counts | 'write' >> WriteToText(known_args.output)

  # Transforms applied to p2.
  query = 'select * from babynames.names2012 limit 1000'
  p2 | 'read' >> Read(
      beam.io.BigQuerySource(
          project='gcp-project-210712', use_standard_sql=False,
          query=query)) \
     | 'write' >> WriteToText(
         'gs://gcp_dataflowsample/query_result.txt', num_shards=1)

  ###############################################
  # (3) Run the pipelines
  ###############################################
  # result = p.run()
  result2 = p2.run()
  # Wait for the pipeline to finish; without this call the function returns
  # immediately and nothing after this point is processed. Note that with the
  # DataflowRunner, Ctrl-C does not stop the pipeline; it has to be cancelled
  # from the Google Cloud console.
  # result.wait_until_finish()
  result2.wait_until_finish()
def expand(self, pvalue):
  return pvalue.pipeline | Read(_TFRecordSource(*self._args))
def test_fake_read(self):
  with TestPipeline() as pipeline:
    pcoll = pipeline | 'read' >> Read(FakeSource([1, 2, 3]))
    assert_that(pcoll, equal_to([1, 2, 3]))
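# FakeSource is not defined in these snippets. A minimal sketch of what such a
# source might look like, assuming it simply replays an in-memory list through
# the BoundedSource API (the class itself is hypothetical, the API calls are
# real):
from apache_beam.io import iobase
from apache_beam.io.range_trackers import OffsetRangeTracker


class FakeSource(iobase.BoundedSource):
  """Hypothetical bounded source yielding a fixed list of values."""

  def __init__(self, values):
    self._values = list(values)

  def estimate_size(self):
    return len(self._values)

  def get_range_tracker(self, start_position, stop_position):
    if start_position is None:
      start_position = 0
    if stop_position is None:
      stop_position = len(self._values)
    return OffsetRangeTracker(start_position, stop_position)

  def split(self, desired_bundle_size, start_position=None, stop_position=None):
    # Keep everything in a single bundle; fine for a small in-memory list.
    if start_position is None:
      start_position = 0
    if stop_position is None:
      stop_position = len(self._values)
    yield iobase.SourceBundle(
        weight=stop_position - start_position,
        source=self,
        start_position=start_position,
        stop_position=stop_position)

  def read(self, range_tracker):
    for index in range(range_tracker.start_position(),
                       range_tracker.stop_position()):
      if not range_tracker.try_claim(index):
        return
      yield self._values[index]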
def build(self, p):
  file_read = (
      p | Read(
          FileSource(self.path_prefix + '*', self.parser, self.decoders[0])))
  return file_read