Example #1
def test_expand_kafka_write(self):
  # We just test the expansion but do not execute.
  # pylint: disable=expression-not-assigned
  (
      self.create_pipeline()
      | Impulse()
      | Map(lambda input: (1, input))
      | WriteToKafka(
          producer_config={
              'bootstrap.servers': 'localhost:9092, notvalid2:3531'
          },
          topic='topic1',
          key_serializer='org.apache.kafka.common.serialization.'
          'LongSerializer',
          value_serializer='org.apache.kafka.common.serialization.'
          'ByteArraySerializer',
          expansion_service=self.get_expansion_service()))
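The test relies on a get_expansion_service() helper that is not shown in the extract. A minimal sketch of what it might return, assuming the default Kafka expansion service that Beam ships for its cross-language Kafka transforms (the wiring is an assumption, not part of the original test):

# Hypothetical helper, for illustration only. Beam provides
# default_io_expansion_service() in apache_beam.io.kafka, which starts the
# bundled Java expansion service (a Java runtime must be available).
from apache_beam.io.kafka import default_io_expansion_service

def get_expansion_service():
  return default_io_expansion_service()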
Example #2
def run(bootstrap_servers, topic, pipeline_args):
    # bootstrap_servers = '123.45.67.89:9092'
    # topic = 'kafka_taxirides_realtime'
    # pipeline_args = ['--project', 'my-project',
    #                  '--runner', 'DataflowRunner',
    #                  '--temp_location', 'my-temp-location',
    #                  '--region', 'my-region',
    #                  '--num_workers', 'my-num-workers',
    #                  '--experiments', 'use_runner_v2']

    pipeline_options = PipelineOptions(pipeline_args,
                                       save_main_session=True,
                                       streaming=True)
    window_size = 15  # Size of the windows, in seconds.

    def log_ride(ride_bytes):
        # Convert the bytes record from Kafka to a dictionary.
        import ast
        ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
        logging.info(
            'Found ride at latitude %r and longitude %r with %r '
            'passengers', ride['latitude'], ride['longitude'],
            ride['passenger_count'])

    with beam.Pipeline(options=pipeline_options) as pipeline:
        _ = (
            pipeline
            | beam.io.ReadFromPubSub(
                topic='projects/pubsub-public-data/topics/taxirides-realtime'
            ).with_output_types(bytes)
            # The Kafka write transform expects key-value pairs.
            | beam.Map(lambda x: (b'', x)).with_output_types(
                typing.Tuple[bytes, bytes])
            | beam.WindowInto(beam.window.FixedWindows(window_size))
            | WriteToKafka(
                producer_config={'bootstrap.servers': bootstrap_servers},
                topic=topic))

        _ = (pipeline
             | ReadFromKafka(
                 consumer_config={'bootstrap.servers': bootstrap_servers},
                 topics=[topic])
             | beam.FlatMap(lambda kv: log_ride(kv[1])))
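A hedged sketch of how run() might be wired up from the command line, based on the commented-out sample values at the top of the function; the flag names are illustrative assumptions, not part of the extract:

if __name__ == '__main__':
    import argparse

    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    # Flag names are assumptions chosen to match run()'s parameters.
    parser.add_argument('--bootstrap_servers', required=True,
                        help='Kafka bootstrap servers, e.g. broker:9092')
    parser.add_argument('--topic', default='kafka_taxirides_realtime')
    known_args, pipeline_args = parser.parse_known_args()
    run(known_args.bootstrap_servers, known_args.topic, pipeline_args)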
Example #3
def toPositions(x, stamp=beam.DoFn.TimestampParam):
    return (x[0].decode("utf-8"),
            tuple(map(float, x[1].decode("utf-8").split(" "))) +
            (stamp.micros / 1000., ))


if __name__ == "__main__":
    if len(sys.argv) < 4:
        usage()

    bootstrapServer, inputTopic, outputTopic = sys.argv[1:4]

    with beam.Pipeline(options=PipelineOptions(["--streaming"] +
                                               sys.argv[4:])) as p:
        (p | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrapServer},
            topics=[inputTopic],
            timestamp_policy=ReadFromKafka.create_time_policy,
            expansion_service=get_expansion_service())
         | "ToPositions" >> beam.Map(toPositions)
         | "SportTracker" >> SportTrackerCalc()
         | "ToKv" >> beam.Map(toKv)
         | "StoreOutput" >>
         WriteToKafka(producer_config={'bootstrap.servers': bootstrapServer},
                      topic=outputTopic,
                      key_serializer=
                      "org.apache.kafka.common.serialization.StringSerializer",
                      value_serializer=
                      "org.apache.kafka.common.serialization.StringSerializer",
                      expansion_service=get_expansion_service()))
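This extract calls usage(), SportTrackerCalc() and toKv() without defining them. A hedged sketch of the two small helpers, with behavior inferred from how they are used above (the originals may differ; SportTrackerCalc, the composite transform that computes the per-user metrics, is omitted):

def usage():
    # Assumed behavior: explain the expected arguments, then exit.
    sys.exit(
        'Usage: <script> <bootstrapServer> <inputTopic> <outputTopic> '
        '[pipeline options]')

def toKv(element):
    # The write above uses StringSerializer for both key and value, so both
    # KV members are plain strings here. The (user, metric) shape of
    # SportTrackerCalc's output is an assumption.
    user, metric = element
    return user, str(metric)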
Example #4
# Hedged reconstruction: the extract begins mid-composite, so the enclosing
# PTransform and the word-splitting step are assumptions.
class ComputeLongestWord(beam.PTransform):
  def expand(self, lines):
    return (
        lines
        | "SplitWords" >> beam.FlatMap(lambda line: line.split())
        | beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
            allowed_lateness=window.Duration.of(0))
        | "MaxLength" >> beam.combiners.Top.Of(1, key=len).without_defaults()
        | "Flatten" >> beam.FlatMap(lambda x: x))

def toKv(s: str) -> beam.typehints.KV[bytes, bytes]:
  return ("".encode("utf-8"), s.encode("utf-8"))

if __name__ == "__main__":
  if len(sys.argv) < 4:
    usage()

  bootstrapServer, inputTopic, outputTopic = sys.argv[1:4]

  with beam.Pipeline(options=PipelineOptions(["--streaming"] + sys.argv[4:])) as p:
    (p | ReadFromKafka(
        consumer_config={'bootstrap.servers': bootstrapServer},
        topics=[inputTopic],
        expansion_service=get_expansion_service())
     | "ToLines" >> beam.Map(lambda x: "%s %s" % (x[0].decode("utf-8"), x[1].decode("utf-8")))
     | "ComputeLongestWord" >> ComputeLongestWord()
     | beam.Map(toKv)
     | "StoreOutput" >> WriteToKafka(
         producer_config={'bootstrap.servers': bootstrapServer},
         topic=outputTopic,
         expansion_service=get_expansion_service()))
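Given the sys.argv handling above, a possible invocation is sketched below; the script name, broker address, topics, and runner flag are placeholders, not part of the extract:

# Placeholder invocation. Any extra arguments after the three positional
# ones are forwarded as pipeline options; "--streaming" is added by the
# script itself.
#
#   python wordlen.py localhost:9092 input-topic output-topic \
#       --runner=FlinkRunner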

Example #5
def run(bootstrap_servers, topic, with_metadata, bq_dataset, bq_table_name,
        project, pipeline_options):
    # bootstrap_servers = '123.45.67.89:9092'
    # topic = 'kafka_taxirides_realtime'
    # pipeline_args = ['--project', 'my-project',
    #                  '--runner', 'DataflowRunner',
    #                  '--temp_location', 'my-temp-location',
    #                  '--region', 'my-region',
    #                  '--num_workers', 'my-num-workers',
    #                  '--experiments', 'use_runner_v2']

    window_size = 15  # Size of the windows, in seconds.

    def log_ride(ride):
        if 'timestamp' in ride:
            logging.info(
                'Found ride at latitude %r and longitude %r with %r '
                'passengers at timestamp %r', ride['latitude'],
                ride['longitude'], ride['passenger_count'], ride['timestamp'])
        else:
            logging.info(
                'Found ride at latitude %r and longitude %r with %r '
                'passengers', ride['latitude'], ride['longitude'],
                ride['passenger_count'])

    def convert_kafka_record_to_dictionary(record):
        # The records have a 'value' attribute when --with_metadata is given.
        if hasattr(record, 'value'):
            ride_bytes = record.value
        elif isinstance(record, tuple):
            ride_bytes = record[1]
        else:
            raise RuntimeError('unknown record type: %s' % type(record))
        # Convert the bytes record from Kafka to a dictionary.
        import ast
        ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
        output = {
            key: ride[key]
            for key in ['latitude', 'longitude', 'passenger_count']
        }
        if hasattr(record, 'timestamp'):
            # timestamp is read from Kafka metadata
            output['timestamp'] = record.timestamp
        return output

    with beam.Pipeline(options=pipeline_options) as pipeline:
        _ = (
            pipeline
            | beam.io.ReadFromPubSub(
                topic='projects/pubsub-public-data/topics/taxirides-realtime'
            ).with_output_types(bytes)
            # The Kafka write transform expects key-value pairs.
            | beam.Map(lambda x: (b'', x)).with_output_types(
                typing.Tuple[bytes, bytes])
            | beam.WindowInto(beam.window.FixedWindows(window_size))
            | WriteToKafka(
                producer_config={'bootstrap.servers': bootstrap_servers},
                topic=topic))

        ride_col = (
            pipeline
            | ReadFromKafka(
                consumer_config={'bootstrap.servers': bootstrap_servers},
                topics=[topic],
                with_metadata=with_metadata)
            | beam.Map(convert_kafka_record_to_dictionary))

        if bq_dataset:
            schema = 'latitude:STRING,longitude:STRING,passenger_count:INTEGER'
            if with_metadata:
                schema += ',timestamp:STRING'
            _ = (ride_col
                 | beam.io.WriteToBigQuery(bq_table_name, bq_dataset, project,
                                           schema))
        else:
            _ = ride_col | beam.FlatMap(lambda ride: log_ride(ride))
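As with Example #2, a hedged sketch of a possible entry point; the flag names mirror run()'s signature and are assumptions, not part of the extract:

if __name__ == '__main__':
    import argparse

    from apache_beam.options.pipeline_options import PipelineOptions

    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    # Flag names are assumptions chosen to mirror run()'s parameters.
    parser.add_argument('--bootstrap_servers', required=True)
    parser.add_argument('--topic', default='kafka_taxirides_realtime')
    parser.add_argument('--with_metadata', action='store_true')
    parser.add_argument('--bq_dataset', default=None)
    parser.add_argument('--bq_table_name', default='kafka_taxi')
    parser.add_argument('--project', default=None)
    known_args, pipeline_args = parser.parse_known_args()
    pipeline_options = PipelineOptions(
        pipeline_args, save_main_session=True, streaming=True)
    run(known_args.bootstrap_servers, known_args.topic,
        known_args.with_metadata, known_args.bq_dataset,
        known_args.bq_table_name, known_args.project, pipeline_options)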