def run_write():
    """Generate a short sequence of rows and write them into Snowflake.

    Elements from ``GenerateSequence(start=1, stop=3)`` are wrapped in ``Row``
    objects and handed to ``WriteToSnowflake``. All connection, staging and
    table settings are read from names defined outside this function. The
    call blocks on ``wait_until_finish()`` of the pipeline result.
    """

    def user_data_mapper(test_row):
        """Return the row's three columns as UTF-8 encoded byte strings."""
        columns = (
            test_row.text_column,
            test_row.number_column,
            test_row.boolean_column,
        )
        return [str(column).encode('utf-8') for column in columns]

    pipeline = beam.Pipeline(options=PipelineOptions(OPTIONS))

    numbers = pipeline | GenerateSequence(
        start=1, stop=3, expansion_service=EXPANSION_SERVICE)
    rows = numbers | beam.Map(lambda num: Row("test" + str(num), num, True))

    # pylint: disable=expression-not-assigned
    rows | "Writing into Snowflake" >> WriteToSnowflake(
        server_name=SERVER_NAME,
        username=USERNAME,
        password=PASSWORD,
        schema=SCHEMA,
        database=DATABASE,
        staging_bucket_name=STAGING_BUCKET_NAME,
        storage_integration=STORAGE_INTEGRATION,
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="TRUNCATE",
        table_schema=SCHEMA_STRING,
        user_data_mapper=user_data_mapper,
        table=TABLE,
        query=None,
        expansion_service=EXPANSION_SERVICE)

    pipeline.run().wait_until_finish()
def main(options):
    """Run a sequence-generating, logging pipeline until interrupted.

    The pipeline reads from ``GenerateSequence(start=1, stop=22)`` through the
    expansion service at ``localhost:8097`` and applies ``Log()``. After
    ``run()`` the function sleeps in a loop; on ``KeyboardInterrupt`` it
    cancels the pipeline result.

    Args:
        options: Pipeline options passed straight to ``beam.Pipeline``.
    """
    pipe = beam.Pipeline(options=options)
    (
        pipe
        # Alternative source kept for reference:
        # | 'Gen' >> beam.Create(range(10))
        | 'Gen' >> GenerateSequence(
            start=1,
            stop=22,
            expansion_service='localhost:8097',
        )
        | 'Log' >> Log()
    )

    # ``print('')`` emits a blank line on both Python 2 and Python 3. A bare
    # ``print`` only does so on Python 2; on Python 3 it is a no-op expression.
    print('')
    cprint(
        'Starting streaming graph forever. Kill with ctrl+c',
        'red',
        attrs=['bold'])
    print('')

    result = pipe.run()
    try:
        # Keep the process alive until the user interrupts it.
        while True:
            time.sleep(10)
    except KeyboardInterrupt:
        print('')
        cprint('Shutting down...', 'yellow')
        result.cancel()
def test_external_transform(self):
    """Check that the external ``GenerateSequence`` yields 1 through 9."""
    expected = list(range(1, 10))
    with self.create_pipeline() as pipeline:
        generated = pipeline | GenerateSequence(
            start=1,
            stop=10,
            expansion_service=self.get_expansion_service())
        assert_that(generated, equal_to(expected))
def test_external_transforms(self):
    """Exercise GenerateSequence, ReadFromKafka and WriteToKafka expansion.

    Three steps: run ``GenerateSequence`` and check its output, run
    ``ReadFromKafka`` against unresolvable brokers and check the error text,
    then construct (but do not run) a ``WriteToKafka`` pipeline.
    """
    kafka_serialization = 'org.apache.kafka.common.serialization.'

    # TODO Move expansion address resides into PipelineOptions
    def expansion_address():
        return "localhost:{}".format(self.expansion_port)

    with self.create_pipeline() as pipeline:
        generated = pipeline | GenerateSequence(
            start=1, stop=10, expansion_service=expansion_address())
        assert_that(generated, equal_to(list(range(1, 10))))

    # We expect to fail here because we do not have a Kafka cluster handy.
    # Nevertheless, we check that the transform is expanded by the
    # ExpansionService and that the pipeline fails during execution.
    with self.assertRaises(Exception) as ctx:
        with self.create_pipeline() as pipeline:
            # pylint: disable=expression-not-assigned
            pipeline | ReadFromKafka(
                consumer_config={
                    'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
                },
                topics=['topic1', 'topic2'],
                key_deserializer=kafka_serialization + 'ByteArrayDeserializer',
                value_deserializer=kafka_serialization + 'LongDeserializer',
                expansion_service=expansion_address())

    failure_text = str(ctx.exception)
    self.assertTrue(
        'No resolvable bootstrap urls given in bootstrap.servers'
        in failure_text,
        'Expected to fail due to invalid bootstrap.servers, but '
        'failed due to:\n%s' % failure_text)

    # We just test the expansion but do not execute.
    unexecuted = self.create_pipeline()
    # pylint: disable=expression-not-assigned
    (
        unexecuted
        | Impulse()
        | Map(lambda element: (1, element))
        | WriteToKafka(
            producer_config={
                'bootstrap.servers': 'localhost:9092, notvalid2:3531'
            },
            topic='topic1',
            key_serializer=kafka_serialization + 'LongSerializer',
            value_serializer=kafka_serialization + 'ByteArraySerializer',
            expansion_service=expansion_address())
    )
def main(argv=None):
    """Log every element produced by an external ``GenerateSequence``.

    The sequence starts at 0 with ``stop=100`` and is expanded through
    ``BEAM_IO_EXPANSION_SERVICE``. The pipeline is started with ``run()``;
    the returned result is not waited on here.

    Args:
        argv: Arguments handed to ``PipelineOptions``.
    """
    pipeline = Pipeline(options=PipelineOptions(argv))
    sequence = pipeline | GenerateSequence(
        0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
    # pylint: disable=expression-not-assigned
    sequence | Map(lambda element: logging.info(element))
    pipeline.run()
def test_java_expansion(self):
    """Run pipelines whose transforms are expanded by a Java service.

    Starts the jar named by ``self.expansion_service_jar`` on port 8091,
    waits for its gRPC channel to become ready, runs a count/filter pipeline
    and a ``GenerateSequence`` pipeline against it, and finally kills the
    server process.

    Raises:
        unittest.SkipTest: If ``self.expansion_service_jar`` is falsy.
    """
    if not self.expansion_service_jar:
        raise unittest.SkipTest('No expansion service jar provided.')

    # The actual definitions of these transforms is in
    # org.apache.beam.runners.core.construction.TestExpansionService.
    TEST_COUNT_URN = "pytest:beam:transforms:count"
    TEST_FILTER_URN = "pytest:beam:transforms:filter_less_than"

    # Run as cheaply as possible on the portable runner.
    # TODO(robertwb): Support this directly in the direct runner.
    options = beam.options.pipeline_options.PipelineOptions(
        runner='PortableRunner',
        experiments=['beam_fn_api'],
        environment_type=python_urns.EMBEDDED_PYTHON,
        job_endpoint='embed')

    port = '8091'
    address = 'localhost:%s' % port

    # Start the java server *before* entering ``try``: if Popen itself fails
    # there is no process to clean up, and ``finally`` must not touch an
    # unbound ``server`` (which would mask the real error).
    server = subprocess.Popen(
        ['java', '-jar', self.expansion_service_jar, port])
    try:
        # Wait for the server to be ready.
        with grpc.insecure_channel(address) as channel:
            grpc.channel_ready_future(channel).result()

        # Run a simple count-filtered-letters pipeline.
        with beam.Pipeline(options=options) as p:
            res = (
                p
                | beam.Create(list('aaabccxyyzzz'))
                | beam.Map(unicode)
                # TODO(BEAM-6587): Use strings directly rather than ints.
                | beam.Map(lambda x: int(ord(x)))
                | beam.ExternalTransform(TEST_FILTER_URN, b'middle', address)
                | beam.ExternalTransform(TEST_COUNT_URN, None, address)
                # TODO(BEAM-6587): Remove when above is removed.
                | beam.Map(lambda kv: (chr(kv[0]), kv[1]))
                | beam.Map(lambda kv: '%s: %s' % kv))

            assert_that(res, equal_to(['a: 3', 'b: 1', 'c: 2']))

        # Test GenerateSequence Java transform
        with beam.Pipeline(options=options) as p:
            res = (
                p
                | GenerateSequence(
                    start=1, stop=10, expansion_service=address))

            assert_that(res, equal_to(list(range(1, 10))))
    finally:
        server.kill()
        # Reap the killed child so it does not linger as a zombie process.
        server.wait()
def test_external_transform(self):
    """Check the external ``GenerateSequence`` against the Flink test service.

    The expansion service address is built from
    ``FlinkRunnerTest.expansion_port``; the expected output is 1 through 9.
    """
    # NOTE(review): ``options`` is configured here but is not passed to
    # ``create_pipeline()`` below -- confirm whether ``create_options()``
    # returns a shared object or whether this setup is ineffective.
    options = self.create_options()
    options._all_options['parallelism'] = 1
    options._all_options['streaming'] = True

    port = FlinkRunnerTest.expansion_port
    expansion_address = "localhost:" + str(port)

    expected = list(range(1, 10))
    with self.create_pipeline() as pipeline:
        generated = pipeline | GenerateSequence(
            start=1, stop=10, expansion_service=expansion_address)
        assert_that(generated, equal_to(expected))
def test_generate_sequence(self):
    """Check that the external ``GenerateSequence`` yields 1 through 9.

    The expansion service is addressed as ``localhost:<EXPANSION_PORT>``,
    with the port read from the environment. A ``RuntimeError`` whose message
    contains ``GenerateSequence.URN`` is reported with a printed note instead
    of failing the test; any other ``RuntimeError`` propagates.
    """
    port = os.environ.get('EXPANSION_PORT')
    address = 'localhost:%s' % port

    try:
        with TestPipeline() as p:
            res = (
                p
                | GenerateSequence(
                    start=1, stop=10, expansion_service=address))

            assert_that(res, equal_to(list(range(1, 10))))
    except RuntimeError as e:
        # Plain substring test: the URN is an identifier, so it must not be
        # interpreted as a regular-expression pattern.
        if GenerateSequence.URN not in str(e):
            raise
        print("looks like URN not implemented in expansion service, skipping.")
def test_external_transforms(self):
    """Exercise GenerateSequence and ReadFromKafka via the Flink test service.

    First runs ``GenerateSequence`` and checks its output, then runs
    ``ReadFromKafka`` against unresolvable brokers and checks the error text.
    """
    # NOTE(review): ``options`` is configured here but is not passed to
    # ``create_pipeline()`` below -- confirm whether ``create_options()``
    # returns a shared object or whether this setup is ineffective.
    options = self.create_options()
    options._all_options['parallelism'] = 1
    options._all_options['streaming'] = True

    port = FlinkRunnerTest.expansion_port
    expansion_address = "localhost:" + str(port)
    kafka_serialization = 'org.apache.kafka.common.serialization.'

    with self.create_pipeline() as pipeline:
        generated = pipeline | GenerateSequence(
            start=1, stop=10, expansion_service=expansion_address)
        assert_that(generated, equal_to(list(range(1, 10))))

    # We expect to fail here because we do not have a Kafka cluster handy.
    # Nevertheless, we check that the transform is expanded by the
    # ExpansionService and that the pipeline fails during execution.
    with self.assertRaises(Exception) as ctx:
        with self.create_pipeline() as pipeline:
            # pylint: disable=expression-not-assigned
            pipeline | ReadFromKafka(
                consumer_config={
                    'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
                },
                topics=['topic1', 'topic2'],
                key_deserializer=kafka_serialization + 'ByteArrayDeserializer',
                value_deserializer=kafka_serialization + 'LongDeserializer',
                expansion_service=expansion_address)

    failure_text = str(ctx.exception)
    self.assertTrue(
        'No resolvable bootstrap urls given in bootstrap.servers'
        in failure_text,
        'Expected to fail due to invalid bootstrap.servers, but '
        'failed due to:\n%s' % failure_text)