Example #1
    def get_replacement_transform_for_applied_ptransform(
            self, applied_ptransform):

        from apache_beam import pvalue
        from apache_beam.io import iobase

        transform = applied_ptransform.transform

        class Read(iobase.Read):
            override = True

            def expand(self, pbegin):
                return pvalue.PCollection(self.pipeline,
                                          is_bounded=self.source.is_bounded())

        return Read(transform.source).with_output_types(
            transform.get_type_hints().simple_output_type('Read'))
Example #2
 def test_read(self):
     schema = 'struct<a:int,b:struct<x:string,y:boolean>>'
     files = []
     with tempfile.NamedTemporaryFile() as f1, \
          tempfile.NamedTemporaryFile() as f2:
         files.append(f1.name)
         with pyorc.Writer(f1, schema) as writer:
             writer.write((1, ('x', True)))
         files.append(f2.name)
         with pyorc.Writer(f2, schema) as writer:
             writer.write((2, ('y', False)))
             writer.write((3, ('z', False)))
         with TestPipeline() as p:
             pc = (p | Read(
                 FileSource(
                     file_patterns=files,
                     reader=OrcReader(pyorc_options={
                         'struct_repr': pyorc.StructRepr.DICT,
                     }))))
             assert_that(
                 pc,
                 equal_to([
                     {
                         'a': 1,
                         'b': {
                             'x': 'x',
                             'y': True,
                         },
                     },
                     {
                         'a': 2,
                         'b': {
                             'x': 'y',
                             'y': False,
                         },
                     },
                     {
                         'a': 3,
                         'b': {
                             'x': 'z',
                             'y': False,
                         },
                     },
                 ]))
Example #3
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource returns records as (key, value) pairs; keep only
            # the value part.
            return {'data': base64.b64encode(record[1])}

        # pylint: disable=expression-not-assigned
        (self.pipeline
         | 'ProduceRows' >> Read(
             SyntheticSource(self.parseTestPipelineOptions()))
         | 'Format' >> Map(format_record)
         | 'WriteToBigQuery' >> WriteToBigQuery(
             self.output_dataset + '.' + self.output_table,
             schema=SCHEMA,
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_EMPTY))
Example #4
  def get_replacement_transform(self, ptransform):
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import pvalue
    from apache_beam.io import iobase

    # This is purposely subclassed from the Read transform to take advantage of
    # the existing windowing, typing, and display data.
    class Read(iobase.Read):
      override = True

      def expand(self, pbegin):
        return pvalue.PCollection.from_(pbegin)

    # Use the source's coder type hint as this replacement's output type.
    # Otherwise the typing information is not properly forwarded to the
    # DataflowRunner, which would then choose the incorrect coder for this
    # transform.
    return Read(ptransform.source).with_output_types(
        ptransform.source.coder.to_type_hint())
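
Examples #1 and #4 are the replacement hooks of a PTransformOverride; neither shows the class that surrounds them. As a hedged sketch of that surrounding structure (NativeReadOverride and NativeReadTransform are illustrative assumptions, not taken from the examples above):

# Illustrative sketch only: PTransformOverride is Beam's hook for swapping a
# matched transform at pipeline-construction time. NativeReadOverride and
# NativeReadTransform are hypothetical names used here for illustration.
from apache_beam.pipeline import PTransformOverride


class NativeReadOverride(PTransformOverride):
  def matches(self, applied_ptransform):
    # Replace only the transform type this override targets.
    return isinstance(applied_ptransform.transform, NativeReadTransform)

  def get_replacement_transform(self, ptransform):
    # Body as in Example #4: a Read subclass wrapping ptransform.source.
    ...

A runner applies a list of such overrides to the pipeline before translating it; in the Python SDK, Pipeline.replace_all is the entry point for that step.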
Example #5
  def test(self):
    def to_pubsub_message(element):
      import uuid
      from apache_beam.io import PubsubMessage
      return PubsubMessage(
          data=element[1],
          attributes={'id': str(uuid.uuid1()).encode('utf-8')},
      )

    _ = (
        self.pipeline
        | 'Create input' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Format to pubsub message in bytes' >> beam.Map(to_pubsub_message)
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
        | 'Write to Pubsub' >> beam.io.WriteToPubSub(
            self.topic_name,
            with_attributes=True,
            id_label='id',
        ))
Example #6
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # SyntheticSource returns records as (key, value) pairs; keep only
            # the value part.
            return {'data': base64.b64encode(record[1])}

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to BigQuery' >> WriteToBigQuery(
                dataset=self.output_dataset,
                table=self.output_table,
                schema=SCHEMA,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
Example #7
    def test_root_transforms(self):
        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read(DummySource())
        root_flatten = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_read = pbegin | 'read' >> root_read
        pcoll_read | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        root_transforms = sorted(
            [t.transform for t in self.visitor.root_transforms])

        self.assertEqual(root_transforms, sorted([root_read, root_flatten]))

        pbegin_consumers = sorted(
            [c.transform for c in self.visitor.value_to_consumers[pbegin]])
        self.assertEqual(pbegin_consumers, sorted([root_read]))
        self.assertEqual(len(self.visitor.step_names), 3)
Example #8
  def _create_input_data(self):
    """
    Runs an additional pipeline which creates test data and waits for its
    completion.
    """
    SCHEMA = parse_table_schema_from_json(
        '{"fields": [{"name": "data", "type": "BYTES"}]}')

    def format_record(record):
      # SyntheticSource returns records as (key, value) pairs; keep only
      # the value part.
      return {'data': base64.b64encode(record[1])}

    p = TestPipeline()
    # pylint: disable=expression-not-assigned
    (p
     | 'Produce rows' >> Read(SyntheticSource(self.parseTestPipelineOptions()))
     | 'Format' >> Map(format_record)
     | 'Write to BigQuery' >> WriteToBigQuery(
         dataset=self.input_dataset, table=self.input_table,
         schema=SCHEMA,
         create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=BigQueryDisposition.WRITE_EMPTY))
    p.run().wait_until_finish()
Example #9
 def test_read(self):
     pipeline = TestPipeline()
     pcoll = pipeline | 'read' >> Read(FakeSource([1, 2, 3]))
     assert_that(pcoll, equal_to([1, 2, 3]))
     pipeline.run()
Example #10
 def test(self):
     self.result = (self.pipeline
                    | 'Read from BigQuery' >> Read(
                        BigQuerySource(dataset=self.input_dataset,
                                       table=self.input_table))
                    | 'Count' >> Count.Globally())
Example #11
def run(argv=None, comments=None):
    """Run the beam pipeline.

    Args:
        argv: (optional) the command line flags to parse.
        comments: (optional) a list of comment JSON objects to
            process. Used in unit-tests to avoid requiring a BigQuery source.
    """
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    if comments is not None:
        comments = p | ("Read in-memory comments") >> beam.Create(comments)
    else:
        comments = p | ("Read " + args.reddit_table) >> Read(
            BigQuerySource(args.reddit_table))

    comments |= ("Normalise comments" >> beam.Map(
        partial(normalise_comment, max_length=args.max_length)))

    thread_id_to_comments = comments | (
        "Key by thread id" >> beam.Map(lambda comment:
                                       (comment.thread_id, comment)))
    threads = thread_id_to_comments | (
        "Group comments by thread ID" >> beam.GroupByKey())
    threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

    examples = threads | (
        "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
            partial(
                create_examples,
                parent_depth=args.parent_depth,
                min_length=args.min_length,
                format=args.dataset_format,
            )))
    examples = _shuffle(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(train_split=args.train_split)).with_outputs(
            _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:

        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (serialized_examples | ("write " + name) >> write_sink(
            os.path.join(args.output_dir, name),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        ))

    result = p.run()
    result.wait_until_finish()
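
Example #11 relies on a _TrainTestSplitFn whose definition is not shown. Purely as an illustrative stand-in (not the project's actual implementation), a DoFn compatible with the with_outputs(...) and examples[tag] usage above could look like this:

# Hedged sketch of a DoFn matching the tagged-output usage above; the real
# _TrainTestSplitFn is not shown in the example and may differ.
import random

import apache_beam as beam


class TrainTestSplitFn(beam.DoFn):  # hypothetical re-implementation
    TRAIN_TAG = 'train'
    TEST_TAG = 'test'

    def __init__(self, train_split=0.9):
        super().__init__()
        self._train_split = train_split

    def process(self, element):
        # Route each element to the train or test output at random.
        tag = (self.TRAIN_TAG if random.random() < self._train_split
               else self.TEST_TAG)
        yield beam.pvalue.TaggedOutput(tag, element)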
Example #12
def run(argv=None, comments=None):
    """Run the beam pipeline.

    Args:
        argv: (optional) the command line flags to parse.
        comments: (optional) a list of comment JSON objects to
            process. Used in unit-tests to avoid requiring a BigQuery source.
    """
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    if comments is not None:
        comments = p | ("Read in-memory comments") >> beam.Create(comments)
    else:
        comments = p | ("Read " + args.reddit_table) >> Read(
            BigQuerySource(args.reddit_table))

    comments |= (
        "Normalise comments" >> beam.Map(
            partial(normalise_comment, max_length=args.max_length)))

    thread_id_to_comments = comments | (
        "Key by thread id" >> beam.Map(
            lambda comment: (comment.thread_id, comment)))
    threads = thread_id_to_comments | (
        "Group comments by thread ID" >> beam.GroupByKey())
    threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

    examples = threads | (
        "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
            partial(create_examples,
                    parent_depth=args.parent_depth,
                    min_length=args.min_length,
                    format=args.dataset_format,
                    )))
    examples = _shuffle(examples)

    # [START dataflow_molecules_split_to_train_and_eval_datasets]
    # Split the dataset into a training set and an evaluation set
    assert 0 < (100 - args.train_split * 100) < 100, (
        'eval_percent must be in the range (0-100)')
    eval_percent = 100 - args.train_split * 100
    train_dataset, eval_dataset = (
        examples
        | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
    # [END dataflow_molecules_split_to_train_and_eval_datasets]

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        # Otherwise write serialized tf.Example protos to TFRecord files.
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    serialized_train_examples = train_dataset | (
        "serialize {} examples".format('train') >> beam.Map(serialize_fn))
    (
        serialized_train_examples | ("write " + 'train')
        >> write_sink(
            os.path.join(args.output_dir, 'train'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    serialized_test_examples = eval_dataset | (
        "serialize {} examples".format('valid') >> beam.Map(serialize_fn))
    (
        serialized_test_examples | ("write " + 'valid')
        >> write_sink(
            os.path.join(args.output_dir, 'valid'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    result = p.run()
    result.wait_until_finish()
Example #13
 def test_fake_read(self):
     # FakeSource mock requires DirectRunner.
     pipeline = TestPipeline(runner='DirectRunner')
     pcoll = pipeline | 'read' >> Read(FakeSource([1, 2, 3]))
     assert_that(pcoll, equal_to([1, 2, 3]))
     pipeline.run()
Example #14
 def expand(self, pvalue):  # pylint: disable=arguments-differ
     return pvalue.pipeline | Read(self._source)
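
Examples #14, #16, and #18 all show the same composite-transform idiom: expand does nothing but apply Read to a wrapped source. A minimal self-contained sketch of that pattern (ReadFromMySource is a hypothetical name):

# Sketch of the wrapper idiom above; ReadFromMySource is a hypothetical name.
# The pvalue argument of expand is the PBegin the transform is applied to,
# so its .pipeline attribute yields the enclosing Pipeline.
import apache_beam as beam
from apache_beam.io.iobase import Read


class ReadFromMySource(beam.PTransform):
  def __init__(self, source):
    super().__init__()
    self._source = source

  def expand(self, pvalue):  # pylint: disable=arguments-differ
    # Delegate to the built-in Read so windowing, type hints and display
    # data are handled by the existing transform.
    return pvalue.pipeline | Read(self._source)

Applied as p | ReadFromMySource(my_source), the wrapper behaves like any other root transform while keeping the source details encapsulated.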
Example #15
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    ###############################################
    # (1) Create the pipeline
    ###############################################

    # First create a PipelineOptions object.
    # It can carry various options, such as the pipeline runner that executes
    # the pipeline and any configuration specific to the chosen runner.
    pipeline_options = PipelineOptions(pipeline_args)

    # Example of editing the created PipelineOptions object directly.
    # Enable the save_main_session option because DoFn transforms are used.
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Create the pipeline (p5) from the options.
    p5 = beam.Pipeline(options=pipeline_options)  # pipeline: BigQuery in -> BigQuery out

    ##############################################
    # (2) Set up the transforms
    ###############################################

    # Add transforms to the pipeline (p5).
    query = 'select * from babynames.names2012'
    query2 = 'select * from babynames.names2011'
    query_reslt1 = p5 | 'read1' >> Read(
        beam.io.BigQuerySource(
            project='gcp-project-210712', use_standard_sql=False, query=query))
    query_reslt2 = p5 | 'read2' >> Read(
        beam.io.BigQuerySource(project='gcp-project-210712',
                               use_standard_sql=False,
                               query=query2))

    branch1 = query_reslt1 | 'modifiy1' >> beam.Filter(modify_data1)
    branch2 = query_reslt2 | 'modifiy2' >> beam.Filter(modify_data1)
    # The full table schema must be specified, otherwise the write step fails.
    ((branch1, branch2) | beam.Flatten()
     | 'write' >> beam.io.Write(
         beam.io.BigQuerySink(
             'babynames.testtable3',
             schema='name:string, gender:string, count:integer',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

    ###############################################
    # (3) Run the pipeline
    ###############################################
    result5 = p5.run()

    # Wait for the pipeline to finish.
    # Without this the program simply exits.
    # With the DataflowRunner, Ctrl-C does not stop the pipeline; it has to be
    # stopped from the Google Cloud console.
    # Nothing past this point runs until the result has completed.
    result5.wait_until_finish()
Example #16
 def expand(self, pvalue):
   return pvalue.pipeline | Read(self._source)
Example #17
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    ###############################################
    # (1) Create the pipelines
    ###############################################

    # First create a PipelineOptions object.
    # It can carry various options, such as the pipeline runner that executes
    # the pipeline and any configuration specific to the chosen runner.
    pipeline_options = PipelineOptions(pipeline_args)

    # Example of editing the created PipelineOptions object directly.
    # Enable the save_main_session option because DoFn transforms are used.
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Create the pipelines from the options.
    p = beam.Pipeline(options=pipeline_options)  # pipeline: text in -> text out
    p2 = beam.Pipeline(options=pipeline_options)  # pipeline: BigQuery in -> text out

    ##############################################
    # (2) Set up the transforms
    ###############################################

    # Add transforms to p.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Sample function whose results are written to the output file.
    def add(line):
        num = int(line.strip())
        return num * 3

    counts = lines | 'add' >> beam.Map(add)
    counts | 'write' >> WriteToText(known_args.output)

    # Add transforms to p2.
    query = 'select * from babynames.names2012 limit 1000'
    p2 | 'read' >> Read(beam.io.BigQuerySource(project='gcp-project-210712', use_standard_sql=False, query=query)) \
       | 'write' >> WriteToText('gs://gcp_dataflowsample/query_result.txt', num_shards=1)

    ###############################################
    # (3) Run the pipelines
    ###############################################

    #result = p.run()
    result2 = p2.run()

    # Wait for the pipeline to finish.
    # Without this the program simply exits.
    # With the DataflowRunner, Ctrl-C does not stop the pipeline; it has to be
    # stopped from the Google Cloud console.
    # Nothing past this point runs until the result has completed.
    #result.wait_until_finish()
    result2.wait_until_finish()
Example #18
 def expand(self, pvalue):
   return pvalue.pipeline | Read(_TFRecordSource(*self._args))
Example #19
 def test_fake_read(self):
     with TestPipeline() as pipeline:
         pcoll = pipeline | 'read' >> Read(FakeSource([1, 2, 3]))
         assert_that(pcoll, equal_to([1, 2, 3]))
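
Examples #9, #13, and #19 read from a FakeSource whose implementation is not shown. For reference, a hedged sketch of a minimal in-memory BoundedSource that would satisfy those tests (InMemorySource is a hypothetical name; the real FakeSource may differ):

# Hedged sketch: a bounded source that replays an in-memory list. Splitting
# is kept trivial (a single bundle) for brevity.
from apache_beam.io import iobase
from apache_beam.io.range_trackers import OffsetRangeTracker


class InMemorySource(iobase.BoundedSource):
    def __init__(self, values):
        self._values = list(values)

    def estimate_size(self):
        return len(self._values)

    def get_range_tracker(self, start_position, stop_position):
        start = 0 if start_position is None else start_position
        stop = len(self._values) if stop_position is None else stop_position
        return OffsetRangeTracker(start, stop)

    def read(self, range_tracker):
        for index in range(range_tracker.start_position(),
                           range_tracker.stop_position()):
            if not range_tracker.try_claim(index):
                return
            yield self._values[index]

    def split(self, desired_bundle_size, start_position=None,
              stop_position=None):
        # Emit a single bundle covering the whole list.
        yield iobase.SourceBundle(
            weight=len(self._values),
            source=self,
            start_position=0,
            stop_position=len(self._values))

With such a source, pipeline | 'read' >> Read(InMemorySource([1, 2, 3])) yields exactly the elements of the list, which is what the assertions above check.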
Example #20
 def build(self, p):
     file_read = (p
                  | Read(
                      FileSource(self.path_prefix + '*', self.parser,
                                 self.decoders[0])))
     return file_read