Example #1
def start_stream():
    # Define the streaming pipeline.
    # Note: the cudf engine currently supports only a flattened JSON message format.
    source = Stream.from_kafka_batched(
        args.input_topic,
        consumer_conf,
        poll_interval=args.poll_interval,
        # The npartitions value depends on how many partitions the Kafka topic has.
        npartitions=1,
        asynchronous=True,
        dask=True,
        engine="cudf",
        max_batch_size=args.max_batch_size,
    )
    global output
    # If the benchmark arg is set, timestamp each result and collect it in a
    # list so throughput can be measured afterwards.
    if args.benchmark:
        print("Benchmark will be calculated")
        output = (
            source.map(inference)
            .map(lambda x: (x[0], x[1], int(round(time.time())), x[2]))
            .map(sink_to_kafka)
            .gather()
            .sink_to_list()
        )
    else:
        output = source.map(inference).map(sink_to_kafka).gather()

    source.start()
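
Example #1 assumes that args, consumer_conf, inference, and sink_to_kafka are defined earlier in the script. A minimal sketch of what those helpers might look like (the broker address, topic name, and function bodies here are placeholder assumptions, not part of the original example):

import confluent_kafka as ck

producer = ck.Producer({"bootstrap.servers": "localhost:9092"})  # placeholder broker

def inference(batch):
    # Placeholder: run the model over the incoming batch and return the
    # tuple shape that the downstream lambda expects.
    return batch

def sink_to_kafka(result):
    # Publish the processed result to a placeholder output topic.
    producer.produce("output", str(result))
    producer.poll(0)
    return result
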
Example #2
    def _start_stream(self):
        # Define the streaming pipeline.
        if self.config["cudf_engine"]:
            source = Stream.from_kafka_batched(
                self.kafka_conf["input_topic"],
                self.kafka_conf["consumer_conf"],
                poll_interval=self.args.poll_interval,
                # The npartitions value depends on how many partitions the Kafka topic has.
                npartitions=self.kafka_conf["n_partitions"],
                asynchronous=True,
                dask=True,
                engine="cudf",
                max_batch_size=self.args.max_batch_size,
            )
        else:
            source = Stream.from_kafka_batched(
                self.kafka_conf["input_topic"],
                self.kafka_conf["consumer_conf"],
                poll_interval=self.args.poll_interval,
                # The npartitions value depends on how many partitions the Kafka topic has.
                npartitions=self.kafka_conf["n_partitions"],
                asynchronous=True,
                dask=True,
                max_batch_size=self.args.max_batch_size,
            )

        sink = self.config["sink"]
        global output
        # If the benchmark arg is set, timestamp each result and collect it
        # in a list so throughput can be measured afterwards.
        if self.args.benchmark:
            print("Benchmark will be calculated")
            output = (
                source.map(self.inference)
                .map(lambda x: (x[0], x[1], int(round(time.time())), x[2]))
                .map(self.sink_dict[sink])
                .gather()
                .sink_to_list()
            )
        else:
            output = (
                source.map(self.inference).map(self.sink_dict[sink]).gather()
            )

        source.start()
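
Example #2 selects its sink through self.sink_dict[sink]. A minimal sketch of how such a dispatch table might be built in the class constructor (the class and method names are assumptions):

class PipelineRunner:
    def __init__(self, config):
        self.config = config
        # Map each configured sink name to the bound method that writes a
        # processed batch out. Names here are assumptions.
        self.sink_dict = {
            "kafka": self.sink_to_kafka,
            "filesystem": self.sink_to_fs,
        }

    def sink_to_kafka(self, batch):
        ...  # publish the batch to the output topic

    def sink_to_fs(self, batch):
        ...  # append the batch to a local file
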
Example #3
def start_stream():
    source = Stream.from_kafka_batched(
        args.input_topic,
        consumer_conf,
        poll_interval=args.poll_interval,
        # The npartitions value depends on how many partitions the Kafka topic has.
        npartitions=1,
        asynchronous=True,
        dask=True,
        max_batch_size=args.max_batch_size,
    )
    global output
    # If the benchmark arg is set, timestamp each result and collect it in a
    # list so throughput can be measured afterwards.
    if args.benchmark:
        print("Benchmark will be calculated")
        output = (
            source.map(inference)
            .map(lambda x: (x[0], x[1], int(round(time.time())), x[2]))
            .map(sink_to_kafka)
            .gather()
            .sink_to_list()
        )
    else:
        output = source.map(inference).map(sink_to_kafka).gather()

    source.start()
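
When the benchmark branch is taken, sink_to_list() keeps every gathered tuple in output, with the epoch timestamp added by the lambda in position 2. A sketch of how the collected list could be read after the stream has run for a while (the sleep interval is arbitrary):

import time

time.sleep(60)  # let the pipeline consume and process messages
timestamps = [rec[2] for rec in output]
if len(timestamps) > 1:
    elapsed = max(timestamps) - min(timestamps)
    print(f"processed {len(output)} batches in ~{elapsed}s")
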
Example #4
    client.run(worker_init)

    # Define the streaming pipeline.
    consumer_conf = {
        "bootstrap.servers": args.broker,
        "group.id": args.group_id,
        "session.timeout.ms": 60000,
        "enable.partition.eof": "true",
        "auto.offset.reset": "earliest",
    }
    print("Consumer conf:", consumer_conf)
    source = Stream.from_kafka_batched(
        args.input_topic,
        consumer_conf,
        poll_interval=args.poll_interval,
        npartitions=1,
        asynchronous=True,
        dask=True,
        max_batch_size=args.max_batch_size,
    )

    # If the benchmark arg is set, timestamp each result and collect it in a
    # list so throughput can be measured afterwards.
    if args.benchmark:
        print("Benchmark will be calculated")
        output = (
            source.map(inference)
            .map(lambda x: (x[0], x[1], x[2], int(round(time.time())), x[3]))
            .map(sink_to_kafka)
            .gather()
            .sink_to_list()
        )
    else:
        output = source.map(inference).map(sink_to_kafka).gather()

    source.start()
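
Example #4 runs worker_init once on every Dask worker before wiring up the stream, which is the usual place to load a model into each worker process. A minimal sketch (the loader and model path are assumptions):

import dask.distributed

def load_model(path):
    ...  # hypothetical: deserialize the trained model from disk

def worker_init():
    # Load the model once per worker and stash it on the worker object so
    # that inference can retrieve it via get_worker() later.
    worker = dask.distributed.get_worker()
    worker.data["model"] = load_model("model.pth")  # placeholder path
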
Example #5
for i in range(n_msg):
    producer.produce(topic, str(i))
    if i % 1000 == 0:
        producer.flush()

consumer = ck.Consumer(cconf)
tp = ck.TopicPartition(topic, 0, 0)
consumer.assign([tp])

# Time draining the topic with a plain consumer, message by message.
t0 = time.time()
msg = consumer.poll(0)
while msg and msg.value():
    keep = msg
    msg = consumer.poll(0)
print('direct', time.time() - t0)

# Time the same drain through a batched streamz source.
print('batched', time.time())
stream = Stream.from_kafka_batched(topic,
                                   cconf,
                                   npartitions=n_parts,
                                   poll_interval=0.1)
stream.map(lambda batch: (any(int(msg) >= n_msg - 1 for msg in batch),
                          time.time())).sink(print)
stream.start()

# import dask.distributed
# client = dask.distributed.Client(processes=False)
#
# print('dask start', time.time())
# stream = Stream.from_kafka_batched(topic, cconf, npartitions=n_parts, dask=True)
# stream.map(lambda batch: (any(
#     int(msg) >= n_msg-1 for msg in batch), time.time())).gather().sink(print)
# stream.start()
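
The benchmark above assumes topic, cconf, n_parts, n_msg, and producer were created earlier in the script. A plausible setup (every value here is a placeholder):

import confluent_kafka as ck

topic = "bench"      # placeholder topic name
n_parts = 1          # should match the topic's partition count
n_msg = 100000       # how many messages the producer loop emits
cconf = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "streamz-bench",
    "auto.offset.reset": "earliest",
}
producer = ck.Producer({"bootstrap.servers": "localhost:9092"})
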
Example #6
                     default="localhost:9092",
                     help="Kafka broker")
 parser.add_argument("--input_topic",
                     default="input",
                     help="Input kafka topic")
 parser.add_argument("--output_topic",
                     default="output",
                     help="Output kafka topic")
 parser.add_argument("--group_id", default="streamz", help="Kafka group ID")
 args = parser.parse_args()
 cluster = LocalCUDACluster()
 client = Client(cluster)
 print(client)
 client.run(worker_init)
 # Define the streaming pipeline.
 consumer_conf = {
     'bootstrap.servers': args.broker,
     'group.id': args.group_id,
     'session.timeout.ms': 60000
 }
 source = Stream.from_kafka_batched(args.input_topic,
                                    consumer_conf,
                                    poll_interval='1s',
                                    npartitions=1,
                                    asynchronous=True,
                                    dask=False)
 inference = source.map(predict_batch)
 wel_parsing = inference.map(wel_parsing)
 alerts = wel_parsing.map(threshold_alert).map(sink_to_kafka)
 # Start the stream.
 source.start()
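
Example #6 assumes predict_batch, wel_parsing, threshold_alert, and sink_to_kafka are defined elsewhere in the script. A minimal sketch of the thresholding step (the column name and cutoff are assumptions):

def threshold_alert(df, threshold=0.5):
    # Keep only the rows whose predicted score exceeds the cutoff;
    # "score" is an assumed column name.
    return df[df["score"] > threshold]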