def expand(self, pcoll):
  """Serialize each element to a protobuf string and write it to the sink.

  The serializer depends on ``self.with_attributes``: messages carrying
  attributes use ``message_to_proto_str``; raw payloads use
  ``bytes_to_proto_str``.
  """
  # Choose the serializer once, then apply a single Map.
  serializer = (
      self.message_to_proto_str
      if self.with_attributes else self.bytes_to_proto_str)
  serialized = pcoll | 'ToProtobuf' >> Map(serializer)
  serialized.element_type = bytes
  return serialized | Write(self._sink)
def run(argv=None, save_main_session=True):
  """Build and run the wordcount pipeline.

  Args:
    argv: Command-line arguments to parse; ``None`` falls through to
      ``sys.argv`` via ``parse_known_args``.
    save_main_session: If True, pickle the main session so that names
      defined at module scope are available on remote workers.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(
      SetupOptions).save_main_session = save_main_session
  with beam.Pipeline(options=pipeline_options) as p:
    # Read the text file.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    output = (
        lines
        # Fix: `unicode` does not exist on Python 3; `str` is the
        # equivalent text type there.
        | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
        # Fix: label typo 'PairWIthOne' -> 'PairWithOne'.
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum)  # For Logging Purposes
        | 'Format' >> beam.MapTuple(format_output))

    # A custom text sink so it displays nicely in GCS :(
    output | 'Write' >> Write(Utf8TextSink(known_args.output))
def expand(self, pcoll):
  """Write the incoming messages to the configured Pub/Sub sink.

  Without attributes, message data is written as-is. With attributes,
  message data + attributes are passed as a serialized protobuf string (see
  ``PubsubMessage._to_proto_str`` for exact protobuf message type).
  """
  if self.with_attributes:
    pcoll = pcoll | 'ToProtobuf' >> Map(self.to_proto_str)
  # Either way the elements reaching the sink are raw bytes.
  pcoll.element_type = bytes
  return pcoll | Write(self._sink)
def expand(self, pcoll):
  """UTF-8-encode each string element, then write the bytes to the sink."""
  encoded = pcoll | 'EncodeString' >> Map(lambda text: text.encode('utf-8'))
  encoded.element_type = bytes
  return encoded | Write(self._sink)
def expand(self, pcoll):
  """Write the incoming PCollection to the configured sink unchanged."""
  write_transform = Write(self._sink)
  return pcoll | write_transform
def expand(self, pcoll):
  """Write the incoming PCollection through a freshly built TFRecord sink."""
  # The sink is constructed from the args captured at transform creation.
  tfrecord_sink = _TFRecordSink(*self._args)
  return pcoll | Write(tfrecord_sink)
def expand(self, pcoll):
  """Encode string elements to UTF-8 bytes and write them to the sink."""
  byte_coll = pcoll | 'encode string' >> ParDo(_encodeUtf8String)
  byte_coll.element_type = bytes
  return byte_coll | Write(self._sink)