def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
  outputs = (
      destination_data_kv_pc
      | beam.ParDo(
          WriteRecordsToFile(
              schema=self.schema,
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              file_format=self._temp_file_format),
          file_prefix_pcv,
          *self.schema_side_inputs).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(
              schema=self.schema, file_format=self._temp_file_format),
          file_prefix_pcv,
          *self.schema_side_inputs))

  # TODO(BEAM-9494): Remove the identity transform. We flatten both
  # PCollection paths and use an identity function to work around a
  # flatten optimization issue where the wrong coder is being used.
  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten()
      | "IdentityWorkaround" >> beam.Map(lambda x: x))

  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    all_destination_file_pairs_pc = (
        all_destination_file_pairs_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  return all_destination_file_pairs_pc
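# A minimal, self-contained sketch (an illustration, not the Beam source) of
# the multi-output ParDo pattern used above: a DoFn routes elements to named
# tags via pvalue.TaggedOutput, and .with_outputs() exposes each tag as its
# own PCollection, which is how WRITTEN_FILE_TAG and UNWRITTEN_RECORD_TAG are
# consumed in _write_files. The names here (_SplitBySize, the tags) are
# invented for the example.
import apache_beam as beam
from apache_beam import pvalue


class _SplitBySize(beam.DoFn):
  SMALL_TAG = 'small'
  LARGE_TAG = 'large'

  def process(self, element):
    # Route each element to exactly one tagged output.
    if len(element) < 3:
      yield pvalue.TaggedOutput(self.SMALL_TAG, element)
    else:
      yield pvalue.TaggedOutput(self.LARGE_TAG, element)


with beam.Pipeline() as p:
  outputs = (
      p
      | beam.Create(['a', 'bb', 'cccc'])
      | beam.ParDo(_SplitBySize()).with_outputs(
          _SplitBySize.SMALL_TAG, _SplitBySize.LARGE_TAG))
  small_pc = outputs[_SplitBySize.SMALL_TAG]  # elements shorter than 3 chars
  large_pc = outputs[_SplitBySize.LARGE_TAG]  # the rest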
def _maybe_apply_user_trigger(self, destination_file_kv_pc):
  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    return (
        destination_file_kv_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  else:
    return destination_file_kv_pc
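# Hypothetical usage (an assumption inferred from the two versions of
# _write_files in this section, not confirmed by the source): with this helper
# extracted, the streaming branch at the end of _write_files collapses to a
# single call instead of the inline WindowInto block:
#
#   all_destination_file_pairs_pc = self._maybe_apply_user_trigger(
#       all_destination_file_pairs_pc)
#   return all_destination_file_pairs_pc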
def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
  outputs = (
      destination_data_kv_pc
      | beam.ParDo(
          WriteRecordsToFile(
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              coder=self.coder),
          file_prefix=file_prefix_pcv).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(coder=self.coder),
          file_prefix=file_prefix_pcv))

  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten())

  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    all_destination_file_pairs_pc = (
        all_destination_file_pairs_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  return all_destination_file_pairs_pc
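# A minimal sketch of the destination-sharding idea behind _ShardDestinations
# above (the class name comes from the source, but this body and the shard
# count are assumptions): pair each destination key with a random shard number
# so that GroupByKey spreads a hot destination across many workers; the
# "DropShardNumber" step then strips the shard, leaving (destination, rows).
import random

import apache_beam as beam


class _ShardDestinationsSketch(beam.DoFn):
  DEFAULT_NUM_SHARDS = 10  # assumed value; the real constant may differ

  def process(self, element, num_shards=DEFAULT_NUM_SHARDS):
    destination, row = element
    shard = random.randint(0, num_shards - 1)
    # The composite key (destination, shard) becomes the GroupByKey key.
    yield ((destination, shard), row)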
import argparse
import json
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import trigger as tr
from apache_beam.transforms import window

# READ_TOPIC, TOPIC, Enrich, and PickupFn are defined elsewhere in the module.


def main(argv=None):

  def json_parser(x):
    return json.loads(x)

  def bye(x):
    logging.info('outputting: %s', x)
    return x

  parser = argparse.ArgumentParser()
  parser.add_argument("--input_topic")
  parser.add_argument("--output_topic")
  # parse_known_args returns a (namespace, remaining) tuple; the original
  # assigned the whole tuple to known_args.
  known_args, _ = parser.parse_known_args(argv)

  p = beam.Pipeline(options=PipelineOptions())

  data = (
      p
      | 'ReadData' >> beam.io.ReadFromPubSub(
          topic=READ_TOPIC).with_output_types(bytes)
      | "JSONParse" >> beam.Map(json_parser))

  (data
   | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
   | "Windowing" >> beam.WindowInto(
       window.Sessions(60),
       trigger=tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
       accumulation_mode=tr.AccumulationMode.DISCARDING,
       allowed_lateness=0)
   | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye' >> beam.Map(bye)
   | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

  (data
   | "SlidWindowing" >> beam.WindowInto(
       window.FixedWindows(60),
       trigger=(tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
           late=tr.Repeatedly(tr.AfterCount(1)))),
       allowed_lateness=300,
       accumulation_mode=tr.AccumulationMode.ACCUMULATING)
   | "Extract" >> beam.Map(lambda x: x["meter_increment"])
   | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
   | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
   | "Enrich with time data" >> beam.ParDo(Enrich())
   | "ToBytesCount" >> beam.Map(
       lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye2' >> beam.Map(bye)
   | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

  (data
   | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
   | "SessionWindowing" >> beam.WindowInto(
       window.Sessions(60),
       trigger=tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
       accumulation_mode=tr.AccumulationMode.ACCUMULATING,
       allowed_lateness=0)
   | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
   | "Discarding Key" >> beam.Map(lambda x: x[1])
   # The original mapped non-pickup records to None, which would publish
   # "null" messages downstream; Filter drops them instead.
   | "Filter not pickup" >> beam.Filter(
       lambda x: x is not None and str(x["ride_status"]) == "pickup")
   | "ToBytesPickup" >> beam.Map(
       lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye3' >> beam.Map(bye)
   | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

  result = p.run()
  result.wait_until_finish()
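# Hypothetical sketch of the PickupFn referenced above (Enrich and PickupFn
# are defined elsewhere in the source; this implementation is an assumption
# consistent with how "GroupInPickup" >> beam.CombinePerKey(PickupFn()) and
# the downstream pickup filter use it): a CombineFn that keeps the first
# record in the session whose ride_status is "pickup", or None if there is
# none, in which case the filter drops the session.
import apache_beam as beam


class PickupFn(beam.CombineFn):
  def create_accumulator(self):
    return None

  def add_input(self, accumulator, element):
    # Keep the first "pickup" record seen for this key.
    if accumulator is None and element.get("ride_status") == "pickup":
      return element
    return accumulator

  def merge_accumulators(self, accumulators):
    return next((a for a in accumulators if a is not None), None)

  def extract_output(self, accumulator):
    return accumulator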