def test_external_transforms(self):
    # TODO: Move the expansion service address into PipelineOptions.
    def get_expansion_service():
        return "localhost:" + str(self.expansion_port)

    with self.create_pipeline() as p:
        res = (
            p
            | GenerateSequence(
                start=1, stop=10, expansion_service=get_expansion_service()))

        assert_that(res, equal_to([i for i in range(1, 10)]))

    # We expect to fail here because we do not have a Kafka cluster handy.
    # Nevertheless, we check that the transform is expanded by the
    # ExpansionService and that the pipeline fails during execution.
    with self.assertRaises(Exception) as ctx:
        with self.create_pipeline() as p:
            # pylint: disable=expression-not-assigned
            (p
             | ReadFromKafka(
                 consumer_config={
                     'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
                 },
                 topics=['topic1', 'topic2'],
                 key_deserializer='org.apache.kafka.'
                                  'common.serialization.'
                                  'ByteArrayDeserializer',
                 value_deserializer='org.apache.kafka.'
                                    'common.serialization.'
                                    'LongDeserializer',
                 expansion_service=get_expansion_service()))
    self.assertTrue(
        'No resolvable bootstrap urls given in bootstrap.servers' in str(
            ctx.exception),
        'Expected to fail due to invalid bootstrap.servers, but '
        'failed due to:\n%s' % str(ctx.exception))

    # We just test the expansion but do not execute.
    # pylint: disable=expression-not-assigned
    (self.create_pipeline()
     | Impulse()
     | Map(lambda input: (1, input))
     | WriteToKafka(
         producer_config={
             'bootstrap.servers': 'localhost:9092, notvalid2:3531'
         },
         topic='topic1',
         key_serializer='org.apache.kafka.'
                        'common.serialization.'
                        'LongSerializer',
         value_serializer='org.apache.kafka.'
                          'common.serialization.'
                          'ByteArraySerializer',
         expansion_service=get_expansion_service()))
def build_read_pipeline(self, pipeline):
    _ = (
        pipeline
        | 'ReadFromKafka' >> ReadFromKafka(
            consumer_config={
                'bootstrap.servers': self.bootstrap_servers,
                'auto.offset.reset': 'earliest'
            },
            topics=[self.topic],
            expansion_service=self.expansion_service)
        | 'Windowing' >> beam.WindowInto(
            beam.window.FixedWindows(300),
            trigger=beam.transforms.trigger.AfterProcessingTime(60),
            accumulation_mode=beam.transforms.trigger.AccumulationMode.DISCARDING)
        | 'DecodingValue' >> beam.Map(lambda elem: int(elem[1].decode()))
        | 'CombineGlobally' >> beam.CombineGlobally(sum).without_defaults()
        | 'SetSumCounter' >> beam.Map(self.sum_counter.inc))
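# build_read_pipeline() above increments self.sum_counter, but this snippet
# does not show where that counter comes from. A minimal sketch, assuming it
# is a Beam metrics counter created by the surrounding test class; the class,
# namespace, and counter name below are assumptions, not from the original.
from apache_beam.metrics import Metrics

class CrossLanguageKafkaIOTestStub:
    def setUp(self):
        # Counter that the 'SetSumCounter' step increments by each window sum.
        self.sum_counter = Metrics.counter('CrossLanguageKafkaIO', 'element_sum')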
def test_external_transforms(self):
    options = self.create_options()
    options._all_options['parallelism'] = 1
    options._all_options['streaming'] = True

    expansion_address = "localhost:" + str(FlinkRunnerTest.expansion_port)

    with self.create_pipeline() as p:
        res = (
            p
            | GenerateSequence(
                start=1, stop=10, expansion_service=expansion_address))

        assert_that(res, equal_to([i for i in range(1, 10)]))

    # We expect to fail here because we do not have a Kafka cluster handy.
    # Nevertheless, we check that the transform is expanded by the
    # ExpansionService and that the pipeline fails during execution.
    with self.assertRaises(Exception) as ctx:
        with self.create_pipeline() as p:
            # pylint: disable=expression-not-assigned
            (p
             | ReadFromKafka(
                 consumer_config={
                     'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
                 },
                 topics=['topic1', 'topic2'],
                 key_deserializer='org.apache.kafka.'
                                  'common.serialization.'
                                  'ByteArrayDeserializer',
                 value_deserializer='org.apache.kafka.'
                                    'common.serialization.'
                                    'LongDeserializer',
                 expansion_service=expansion_address))
    self.assertTrue(
        'No resolvable bootstrap urls given in bootstrap.servers' in str(
            ctx.exception),
        'Expected to fail due to invalid bootstrap.servers, but '
        'failed due to:\n%s' % str(ctx.exception))
def run():
    options = PipelineOptions([
        "--runner=PortableRunner",
        "--job_endpoint=localhost:8099",
        "--environment_type=LOOPBACK"
    ])
    # options = PipelineOptions([
    #     "--runner=FlinkRunner",
    #     "--flink_master=localhost:8081",
    # ])
    with beam.Pipeline(options=options) as p:
        (p
         | 'ReadFromKafka' >> ReadFromKafka(
             consumer_config={"bootstrap.servers": "localhost:9092"},
             topics=["beam-input"])
         | 'ExtractWords' >> beam.FlatMap(
             lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
         | 'Window' >> beam.WindowInto(
             window.GlobalWindows(),
             trigger=trigger.Repeatedly(trigger.AfterCount(1)),
             accumulation_mode=AccumulationMode.ACCUMULATING)
         | 'Count' >> beam.combiners.Count.PerElement()
         | 'Format' >> beam.Map(
             lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
         | 'Log' >> beam.ParDo(LoggingDoFn()))
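# LoggingDoFn is used by the 'Log' step above (and in the run(argv=None)
# variant further below) but is not defined in these snippets. A minimal
# sketch, assuming it only logs each formatted count and re-emits it; the
# class body is an assumption, not taken from the original source.
import logging

import apache_beam as beam

class LoggingDoFn(beam.DoFn):
    def process(self, element):
        # Log the "word: count" string and pass the element through unchanged.
        logging.info('%s', element)
        yield element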
def run_pipeline():
    options = PipelineOptions(
        # runner="DirectRunner",
        runner="PortableRunner",
        job_endpoint="localhost:8099",
        environment_type="LOOPBACK")
    # print(options)

    # options = PipelineOptions([
    #     "--runner=PortableRunner",
    #     "--job_endpoint=localhost:8099",
    #     "--environment_type=LOOPBACK"
    # ])

    # beam_options = PipelineOptions(
    #     beam_args,
    #     runner='DataflowRunner',
    #     project='my-project-id',
    #     job_name='unique-job-name',
    #     temp_location='gs://my-bucket/temp',
    #     region='us-central1')

    # pipeline_options = PipelineOptions()

    # with beam.Pipeline(options=options) as p:
    with beam.Pipeline() as p:
        (p
         # | beam.Create(['alpha', 'beta', 'gamma'])
         | 'Read from Kafka' >> ReadFromKafka(
             consumer_config={
                 'bootstrap.servers': brokers,
                 'auto.offset.reset': 'latest',
                 'session.timeout.ms': '12000'
                 # ,'request.timeout.ms.config': 120000
             },
             topics=[kafka_topic])
         | 'Print' >> beam.Map(lambda x: print('*' * 100, '\n', x)))
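# run_pipeline() reads the module-level names `brokers` and `kafka_topic`,
# which are not shown in the snippet above. A minimal sketch of how they
# might be set; the concrete values here are assumptions.
brokers = 'localhost:9092'
kafka_topic = 'beam-input'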
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    (p
     | 'ReadFromKafka' >> ReadFromKafka(
         consumer_config={"bootstrap.servers": "localhost:9092"},
         topics=["beam-input"])
     | 'ExtractWords' >> beam.FlatMap(
         lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
     | 'Window' >> beam.WindowInto(
         window.GlobalWindows(),
         trigger=trigger.Repeatedly(trigger.AfterCount(1)),
         accumulation_mode=AccumulationMode.ACCUMULATING)
     | 'Count' >> beam.combiners.Count.PerElement()
     | 'Format' >> beam.Map(
         lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
     | 'Log' >> beam.ParDo(LoggingDoFn()))

    result = p.run()
    result.wait_until_finish()
def main(pipeline_options, args):
    farm_kw = (
        ('graphs', 1),
        ('jobs', 2),
        ('tasks', 3),
        ('outputs', 4),
    )

    pipe = beam.Pipeline(options=pipeline_options)

    feed = (
        pipe
        | 'KafkaInflow' >> ReadFromKafka(
            consumer_config={
                'bootstrap.servers': 'localhost:9092',
            },
            topics=[TOPIC],
            key_deserializer=
            'org.apache.kafka.common.serialization.ByteArrayDeserializer',
            value_deserializer=
            'org.apache.kafka.common.serialization.ByteArrayDeserializer',
            expansion_service='localhost:8097')
        | 'RawFeed' >> Log(color=('white', ['dark'])))

    # (
    #     feed
    #     | JobAggregateLevel.TASK >> JobOutput(JobAggregateLevel.TASK)
    #     | 'PerTask' >> Log(color=('yellow', ['bold']))
    # )
    #
    # (
    #     feed
    #     | JobAggregateLevel.JOB >> JobOutput(JobAggregateLevel.JOB)
    #     | 'PerJob' >> Log(color=('blue', ['bold']))
    # )
    #
    # (
    #     feed
    #     | JobAggregateLevel.GRAPH >> JobOutput(JobAggregateLevel.GRAPH)
    #     | 'PerGraph' >> Log(color=('green', ['bold']))
    # )

    result = pipe.run()  # type: PipelineResult
    time.sleep(10)
    while result.state != PipelineState.RUNNING:
        time.sleep(10)

    print()
    cprint('Starting streaming graph forever. Kill with ctrl+c',
           'red', attrs=['bold'])
    print()
    cprint('Generating farm jobs:', 'yellow')
    for k, v in farm_kw:
        print(' {}={}'.format(k, colored(repr(v), 'white', attrs=['bold'])))
    print()

    admin = kafka.admin.KafkaAdminClient(
        # bootstrap_servers=['localhost:9092'],
    )
    try:
        admin.create_topics([kafka.admin.NewTopic(TOPIC, 1, 1)])
    except kafka.errors.TopicAlreadyExistsError:
        pass

    # producer = kafka.KafkaProducer(
    #     # bootstrap_servers=['localhost:9092'],
    # )
    #
    # for i, payload in enumerate(
    #         rillbeam.data.farm.gen_farm_messages(**dict(farm_kw))):
    #     print(payload)
    #     producer.send('beam-kfarm', 'foo')

    try:
        result.wait_until_finish()
    except KeyboardInterrupt:
        print()
        cprint('Shutting down...', 'yellow')
        result.cancel()