Example 1
  def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
    outputs = (
        destination_data_kv_pc
        | beam.ParDo(
            WriteRecordsToFile(
                schema=self.schema,
                max_files_per_bundle=self.max_files_per_bundle,
                max_file_size=self.max_file_size,
                file_format=self._temp_file_format),
            file_prefix_pcv,
            *self.schema_side_inputs).with_outputs(
                WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                WriteRecordsToFile.WRITTEN_FILE_TAG))

    # A PCollection of (destination, file) tuples. It lists files with records,
    # and the destination each file is meant to be imported into.
    destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

    # A PCollection of (destination, record) tuples. These are later sharded,
    # grouped, and all records for each destination-shard are written to files.
    # This PCollection is necessary because not all records can be written into
    # files in ``WriteRecordsToFile``.
    unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

    more_destination_files_kv_pc = (
        unwritten_records_pc
        | beam.ParDo(_ShardDestinations())
        | "GroupShardedRows" >> beam.GroupByKey()
        | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
        | "WriteGroupedRecordsToFile" >> beam.ParDo(
            WriteGroupedRecordsToFile(
                schema=self.schema, file_format=self._temp_file_format),
            file_prefix_pcv,
            *self.schema_side_inputs))

    # TODO(BEAM-9494): Remove the identity transform. We flatten both
    # PCollection paths and use an identity function to work around a
    # flatten optimization issue where the wrong coder is being used.
    all_destination_file_pairs_pc = (
        (destination_files_kv_pc, more_destination_files_kv_pc)
        | "DestinationFilesUnion" >> beam.Flatten()
        | "IdentityWorkaround" >> beam.Map(lambda x: x))

    if self.is_streaming_pipeline:
      # Apply the user's trigger back before we start triggering load jobs
      all_destination_file_pairs_pc = (
          all_destination_file_pairs_pc
          | "ApplyUserTrigger" >> beam.WindowInto(
              beam.window.GlobalWindows(),
              trigger=trigger.Repeatedly(
                  trigger.AfterAll(
                      trigger.AfterProcessingTime(self.triggering_frequency),
                      trigger.AfterCount(1))),
              accumulation_mode=trigger.AccumulationMode.DISCARDING))
    return all_destination_file_pairs_pc
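
The written/unwritten split above relies on Beam's multi-output ParDo: the DoFn emits elements under named tags, and .with_outputs(...) exposes one PCollection per tag. A minimal sketch of the same pattern follows, using a hypothetical SplitBySize DoFn and tag names for illustration rather than the real WriteRecordsToFile.

import apache_beam as beam
from apache_beam import pvalue


class SplitBySize(beam.DoFn):
  # Hypothetical DoFn, for illustration only; it tags each element the way
  # WriteRecordsToFile tags written files vs. records it could not write.
  SMALL_TAG = 'small'
  LARGE_TAG = 'large'

  def process(self, element):
    tag = self.SMALL_TAG if len(element) < 10 else self.LARGE_TAG
    yield pvalue.TaggedOutput(tag, element)


with beam.Pipeline() as p:
  outputs = (
      p
      | beam.Create(['a', 'bb', 'c' * 20])
      | beam.ParDo(SplitBySize()).with_outputs(
          SplitBySize.SMALL_TAG, SplitBySize.LARGE_TAG))
  # Each tagged output is addressable by its tag, as in the example above.
  small_pc = outputs[SplitBySize.SMALL_TAG]
  large_pc = outputs[SplitBySize.LARGE_TAG]
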
Example 2
    def _maybe_apply_user_trigger(self, destination_file_kv_pc):
        if self.is_streaming_pipeline:
            # Apply the user's trigger back before we start triggering load jobs
            return (destination_file_kv_pc
                    | "ApplyUserTrigger" >> beam.WindowInto(
                        beam.window.GlobalWindows(),
                        trigger=trigger.Repeatedly(
                            trigger.AfterAll(
                                trigger.AfterProcessingTime(
                                    self.triggering_frequency),
                                trigger.AfterCount(1))),
                        accumulation_mode=trigger.AccumulationMode.DISCARDING))
        else:
            return destination_file_kv_pc
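
Both snippets re-apply the same processing-time trigger before load jobs are kicked off: in the global window, fire whenever at least one element has arrived and triggering_frequency seconds of processing time have elapsed, discarding each pane after it fires. The same configuration as a standalone transform, with a hypothetical frequency of 60 seconds:

import apache_beam as beam
from apache_beam.transforms import trigger
from apache_beam.transforms.window import GlobalWindows

triggering_frequency = 60  # hypothetical value, in seconds of processing time

apply_user_trigger = beam.WindowInto(
    GlobalWindows(),
    trigger=trigger.Repeatedly(
        trigger.AfterAll(
            trigger.AfterProcessingTime(triggering_frequency),
            trigger.AfterCount(1))),
    accumulation_mode=trigger.AccumulationMode.DISCARDING)

# Applied like any other transform, e.g.:
#   files_pc = destination_file_kv_pc | "ApplyUserTrigger" >> apply_user_trigger
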
Example 3
    def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
        outputs = (destination_data_kv_pc
                   | beam.ParDo(WriteRecordsToFile(
                       max_files_per_bundle=self.max_files_per_bundle,
                       max_file_size=self.max_file_size,
                       coder=self.coder),
                                file_prefix=file_prefix_pcv).with_outputs(
                                    WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                                    WriteRecordsToFile.WRITTEN_FILE_TAG))

        # A PCollection of (destination, file) tuples. It lists files with records,
        # and the destination each file is meant to be imported into.
        destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

        # A PCollection of (destination, record) tuples. These are later sharded,
        # grouped, and all records for each destination-shard are written to files.
        # This PCollection is necessary because not all records can be written into
        # files in ``WriteRecordsToFile``.
        unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

        more_destination_files_kv_pc = (
            unwritten_records_pc
            | beam.ParDo(_ShardDestinations())
            | "GroupShardedRows" >> beam.GroupByKey()
            | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
            | "WriteGroupedRecordsToFile" >> beam.ParDo(
                WriteGroupedRecordsToFile(coder=self.coder),
                file_prefix=file_prefix_pcv))

        all_destination_file_pairs_pc = (
            (destination_files_kv_pc, more_destination_files_kv_pc)
            | "DestinationFilesUnion" >> beam.Flatten())

        if self.is_streaming_pipeline:
            # Apply the user's trigger back before we start triggering load jobs
            all_destination_file_pairs_pc = (
                all_destination_file_pairs_pc
                | "ApplyUserTrigger" >> beam.WindowInto(
                    beam.window.GlobalWindows(),
                    trigger=trigger.Repeatedly(
                        trigger.AfterAll(
                            trigger.AfterProcessingTime(
                                self.triggering_frequency),
                            trigger.AfterCount(1))),
                    accumulation_mode=trigger.AccumulationMode.DISCARDING))
        return all_destination_file_pairs_pc
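
In both versions, the spillover path keys every unwritten record by (destination, shard) so that GroupByKey can spread a large destination across several groupings; the "DropShardNumber" step then strips the shard index off the key before the grouped rows are written. _ShardDestinations is internal to Beam and not shown here; the following is a hypothetical equivalent, for illustration only.

import random

import apache_beam as beam


class _ShardDestinationsSketch(beam.DoFn):
    """Hypothetical stand-in for Beam's internal _ShardDestinations DoFn."""

    def __init__(self, num_shards=10):  # the shard count is an assumption
        self._num_shards = num_shards

    def process(self, element):
        destination, row = element
        shard = random.randint(0, self._num_shards - 1)
        # Key by (destination, shard). After GroupByKey, the
        # Map(lambda x: (x[0][0], x[1])) step recovers (destination, rows).
        yield ((destination, shard), row)
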
Example 4
import argparse
import json
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import trigger as tr
from apache_beam.transforms import window


# READ_TOPIC, TOPIC, Enrich and PickupFn are defined elsewhere in the
# original example and are used as-is below.
def main(argv=None):
    def json_parser(x):
        parsed = json.loads(x)
        return parsed

    def bye(x):
        logging.info('outing: %s', x)
        return x

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output_topic")
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions())

    data = (p
            | 'ReadData' >>
            beam.io.ReadFromPubSub(topic=READ_TOPIC).with_output_types(bytes)
            | "JSONParse" >> beam.Map(json_parser))

    (data
     | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
     | "Windowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
         accumulation_mode=tr.AccumulationMode.DISCARDING,
         allowed_lateness=0)
     | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye' >> beam.Map(bye)
     | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "SlidWindowing" >> beam.WindowInto(
         window.FixedWindows(60),
         trigger=(tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
                                    late=tr.Repeatedly(tr.AfterCount(1)))),
         allowed_lateness=300,
         accumulation_mode=tr.AccumulationMode.ACCUMULATING)
     | "Extract" >> beam.Map(lambda x: x["meter_increment"])
     | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
     | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
     | "Enrich with time data" >> beam.ParDo(Enrich())
     | "ToBytesCount" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye2' >> beam.Map(bye)
     | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
     | "SessionWindowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
         accumulation_mode=tr.AccumulationMode.ACCUMULATING,
         allowed_lateness=0)
     | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
     | "Discarding Key" >> beam.Map(lambda x: x[1])
     | "Filter not pickup" >>
     beam.Map(lambda x: x if str(x["ride_status"]) == "pickup" else None)
     | "ToBytesPickup" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye3' >> beam.Map(bye)
     | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

    result = p.run()
    result.wait_until_finish()
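
READ_TOPIC, TOPIC, Enrich and PickupFn are defined elsewhere in the original example. The shapes of the two helpers follow from how they are used above: beam.CombinePerKey expects a beam.CombineFn and beam.ParDo expects a beam.DoFn. The skeletons below are hypothetical sketches of those interfaces, not the original implementations.

import apache_beam as beam


class PickupFn(beam.CombineFn):
    # Hypothetical skeleton; the real combiner is not shown in the example.
    # CombinePerKey reduces all events of a ride_id to a single element.
    def create_accumulator(self):
        return {}

    def add_input(self, accumulator, element):
        # Assumption: keep the most recent event seen for the ride.
        return element

    def merge_accumulators(self, accumulators):
        merged = {}
        for accumulator in accumulators:
            merged.update(accumulator)
        return merged

    def extract_output(self, accumulator):
        return accumulator


class Enrich(beam.DoFn):
    # Hypothetical skeleton; the real DoFn adds time data to each element.
    def process(self, element, timestamp=beam.DoFn.TimestampParam):
        element["timestamp"] = timestamp.to_utc_datetime().isoformat()
        yield element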