def run_write():
    """Run a pipeline that writes generated test rows into Snowflake.

    The output of ``GenerateSequence(start=1, stop=3)`` is mapped to
    ``Row("test<num>", num, True)`` records, which are handed to
    ``WriteToSnowflake`` together with the connection settings referenced
    from outside this function. The call blocks until the pipeline result
    reports that it has finished.
    """
    def user_data_mapper(test_row):
        # Each column is stringified and then UTF-8 encoded, in column order.
        columns = (
            test_row.text_column,
            test_row.number_column,
            test_row.boolean_column,
        )
        return [str(column).encode('utf-8') for column in columns]

    pipeline = beam.Pipeline(options=PipelineOptions(OPTIONS))

    numbers = pipeline | GenerateSequence(
        start=1, stop=3, expansion_service=EXPANSION_SERVICE)
    rows = numbers | beam.Map(lambda num: Row("test" + str(num), num, True))

    # pylint: disable=expression-not-assigned
    rows | "Writing into Snowflake" >> WriteToSnowflake(
        server_name=SERVER_NAME,
        username=USERNAME,
        password=PASSWORD,
        schema=SCHEMA,
        database=DATABASE,
        staging_bucket_name=STAGING_BUCKET_NAME,
        storage_integration=STORAGE_INTEGRATION,
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="TRUNCATE",
        table_schema=SCHEMA_STRING,
        user_data_mapper=user_data_mapper,
        table=TABLE,
        query=None,
        expansion_service=EXPANSION_SERVICE)

    pipeline_result = pipeline.run()
    pipeline_result.wait_until_finish()
Ejemplo n.º 2
0
def main(options):
    """Run a pipeline that logs a generated sequence until interrupted.

    The pipeline is started, then this function sleeps in a loop until a
    ``KeyboardInterrupt`` (ctrl+c) arrives, at which point the running
    pipeline is cancelled.

    Args:
        options: Pipeline options passed straight to ``beam.Pipeline``.
    """
    pipe = beam.Pipeline(options=options)

    (
        pipe
        # | 'Gen' >> beam.Create(range(10))
        | 'Gen' >> GenerateSequence(
              start=1,
              stop=22,
              expansion_service='localhost:8097',
          )
        | 'Log' >> Log()
    )

    # print('') emits a blank line on both Python 2 and Python 3, whereas a
    # bare `print` is just a name expression (no output) on Python 3.
    print('')
    cprint('Starting streaming graph forever. Kill with ctrl+c',
           'red', attrs=['bold'])
    print('')

    result = pipe.run()
    try:
        # Keep the process alive while the pipeline runs; only ctrl+c ends it.
        while True:
            time.sleep(10)
    except KeyboardInterrupt:
        print('')
        cprint('Shutting down...', 'yellow')
        result.cancel()
Ejemplo n.º 3
0
  def test_external_transform(self):
    """GenerateSequence(start=1, stop=10) must yield the integers 1..9."""
    expected = list(range(1, 10))

    with self.create_pipeline() as pipeline:
      generated = pipeline | GenerateSequence(
          start=1, stop=10, expansion_service=self.get_expansion_service())

      assert_that(generated, equal_to(expected))
Ejemplo n.º 4
0
        def test_external_transforms(self):
            """Exercise GenerateSequence, ReadFromKafka and WriteToKafka.

            GenerateSequence is run and its output is verified. ReadFromKafka
            is expected to fail during execution with a bootstrap.servers
            error. WriteToKafka is only applied to a pipeline; that pipeline
            is not run here.
            """
            # TODO: Move the expansion service address into PipelineOptions.
            def get_expansion_service():
                return "localhost:{}".format(self.expansion_port)

            serialization = 'org.apache.kafka.common.serialization.'
            byte_array_deserializer = serialization + 'ByteArrayDeserializer'
            long_deserializer = serialization + 'LongDeserializer'
            long_serializer = serialization + 'LongSerializer'
            byte_array_serializer = serialization + 'ByteArraySerializer'

            with self.create_pipeline() as pipeline:
                generated = pipeline | GenerateSequence(
                    start=1,
                    stop=10,
                    expansion_service=get_expansion_service())

                assert_that(generated, equal_to(list(range(1, 10))))

            # No Kafka cluster is available, so this pipeline is expected to
            # fail. The check below confirms that the transform was expanded
            # by the ExpansionService and that the failure happened during
            # execution.
            with self.assertRaises(Exception) as ctx:
                with self.create_pipeline() as pipeline:
                    # pylint: disable=expression-not-assigned
                    pipeline | ReadFromKafka(
                        consumer_config={
                            'bootstrap.servers':
                            'notvalid1:7777, notvalid2:3531'
                        },
                        topics=['topic1', 'topic2'],
                        key_deserializer=byte_array_deserializer,
                        value_deserializer=long_deserializer,
                        expansion_service=get_expansion_service())

            failure_text = str(ctx.exception)
            self.assertTrue(
                'No resolvable bootstrap urls given in bootstrap.servers'
                in failure_text,
                'Expected to fail due to invalid bootstrap.servers, but '
                'failed due to:\n%s' % failure_text)

            # Only the expansion is tested here; the pipeline is not executed.
            keyed = (
                self.create_pipeline()
                | Impulse()
                | Map(lambda element: (1, element)))
            # pylint: disable=expression-not-assigned
            keyed | WriteToKafka(
                producer_config={
                    'bootstrap.servers': 'localhost:9092, notvalid2:3531'
                },
                topic='topic1',
                key_serializer=long_serializer,
                value_serializer=byte_array_serializer,
                expansion_service=get_expansion_service())
Ejemplo n.º 5
0
def main(argv=None):
    """Start a pipeline that logs every element of a generated sequence.

    Args:
        argv: Arguments handed to ``PipelineOptions``; defaults to ``None``.
    """
    pipeline = Pipeline(options=PipelineOptions(argv))

    numbers = pipeline | GenerateSequence(
        0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
    # pylint: disable=expression-not-assigned
    numbers | Map(lambda x: logging.info(x))

    # NOTE(review): the result returned by run() is not waited on here.
    pipeline.run()
Ejemplo n.º 6
0
    def test_java_expansion(self):
        """Run pipelines whose transforms are expanded by a Java service.

        The test is skipped when ``self.expansion_service_jar`` is not set.
        Otherwise the jar is launched as a subprocess on port 8091, two
        pipelines are run against it (a count/filter pipeline built from
        ``ExternalTransform`` URNs and a ``GenerateSequence`` pipeline), and
        the subprocess is killed afterwards.

        Raises:
            unittest.SkipTest: If no expansion service jar was provided.
        """
        if not self.expansion_service_jar:
            raise unittest.SkipTest('No expansion service jar provided.')

        # The actual definitions of these transforms is in
        # org.apache.beam.runners.core.construction.TestExpansionService.
        TEST_COUNT_URN = "pytest:beam:transforms:count"
        TEST_FILTER_URN = "pytest:beam:transforms:filter_less_than"

        # Run as cheaply as possible on the portable runner.
        # TODO(robertwb): Support this directly in the direct runner.
        options = beam.options.pipeline_options.PipelineOptions(
            runner='PortableRunner',
            experiments=['beam_fn_api'],
            environment_type=python_urns.EMBEDDED_PYTHON,
            job_endpoint='embed')

        # Start the java server before entering the try block: if the launch
        # itself fails there is no process to clean up, and the real error
        # must not be masked by a NameError on `server` in the finally clause.
        port = '8091'
        address = 'localhost:%s' % port
        server = subprocess.Popen(
            ['java', '-jar', self.expansion_service_jar, port])

        try:
            # Wait for the server to be ready.
            with grpc.insecure_channel(address) as channel:
                grpc.channel_ready_future(channel).result()

            # Run a simple count-filtered-letters pipeline.
            with beam.Pipeline(options=options) as p:
                res = (
                    p
                    | beam.Create(list('aaabccxyyzzz'))
                    | beam.Map(unicode)
                    # TODO(BEAM-6587): Use strings directly rather than ints.
                    | beam.Map(lambda x: int(ord(x)))
                    | beam.ExternalTransform(TEST_FILTER_URN, b'middle',
                                             address)
                    | beam.ExternalTransform(TEST_COUNT_URN, None, address)
                    # TODO(BEAM-6587): Remove when above is removed.
                    | beam.Map(lambda kv: (chr(kv[0]), kv[1]))
                    | beam.Map(lambda kv: '%s: %s' % kv))

                assert_that(res, equal_to(['a: 3', 'b: 1', 'c: 2']))

            # Test GenerateSequence Java transform
            with beam.Pipeline(options=options) as p:
                res = (p
                       | GenerateSequence(
                           start=1, stop=10, expansion_service=address))

                assert_that(res, equal_to(list(range(1, 10))))

        finally:
            server.kill()
            # Reap the killed process so it does not linger as a zombie.
            server.wait()
Ejemplo n.º 7
0
        def test_external_transform(self):
            """GenerateSequence(start=1, stop=10) must yield the integers 1..9.

            The expansion service address is built from
            ``FlinkRunnerTest.expansion_port``.
            """
            # NOTE(review): `options` is configured but not passed on in this
            # method; presumably create_pipeline() picks it up -- confirm.
            options = self.create_options()
            all_options = options._all_options
            all_options['parallelism'] = 1
            all_options['streaming'] = True

            expansion_address = "localhost:{}".format(
                FlinkRunnerTest.expansion_port)

            with self.create_pipeline() as pipeline:
                generated = pipeline | GenerateSequence(
                    start=1, stop=10, expansion_service=expansion_address)

                assert_that(generated, equal_to(list(range(1, 10))))
Ejemplo n.º 8
0
  def test_generate_sequence(self):
    """GenerateSequence(start=1, stop=10) must yield the integers 1..9.

    The expansion service is reached at ``localhost:<EXPANSION_PORT>``, where
    the port is read from the environment. A RuntimeError whose message
    matches ``GenerateSequence.URN`` is tolerated and only reported with a
    printed note; any other RuntimeError propagates.

    Raises:
      unittest.SkipTest: If the EXPANSION_PORT environment variable is unset
        or empty.
    """
    # Imported locally so this method does not rely on a file-level import.
    import unittest

    port = os.environ.get('EXPANSION_PORT')
    if not port:
      # Without a port the address would be the unusable 'localhost:None'.
      raise unittest.SkipTest(
          'EXPANSION_PORT environment var is not provided.')
    address = 'localhost:%s' % port

    try:
      with TestPipeline() as p:
        res = (
            p
            | GenerateSequence(start=1, stop=10, expansion_service=address))

        assert_that(res, equal_to(list(range(1, 10))))
    except RuntimeError as e:
      if re.search(GenerateSequence.URN, str(e)):
        print("looks like URN not implemented in expansion service, skipping.")
      else:
        # Bare raise re-raises the active exception unchanged.
        raise
Ejemplo n.º 9
0
        def test_external_transforms(self):
            """Exercise GenerateSequence and ReadFromKafka expansion.

            GenerateSequence is run and its output is verified. ReadFromKafka
            is expected to fail during execution with a bootstrap.servers
            error.
            """
            # NOTE(review): `options` is configured but not passed on in this
            # method; presumably create_pipeline() picks it up -- confirm.
            options = self.create_options()
            all_options = options._all_options
            all_options['parallelism'] = 1
            all_options['streaming'] = True

            expansion_address = "localhost:{}".format(
                FlinkRunnerTest.expansion_port)

            serialization = 'org.apache.kafka.common.serialization.'
            byte_array_deserializer = serialization + 'ByteArrayDeserializer'
            long_deserializer = serialization + 'LongDeserializer'

            with self.create_pipeline() as pipeline:
                generated = pipeline | GenerateSequence(
                    start=1, stop=10, expansion_service=expansion_address)

                assert_that(generated, equal_to(list(range(1, 10))))

            # No Kafka cluster is available, so this pipeline is expected to
            # fail. The check below confirms that the transform was expanded
            # by the ExpansionService and that the failure happened during
            # execution.
            with self.assertRaises(Exception) as ctx:
                with self.create_pipeline() as pipeline:
                    # pylint: disable=expression-not-assigned
                    pipeline | ReadFromKafka(
                        consumer_config={
                            'bootstrap.servers':
                            'notvalid1:7777, notvalid2:3531'
                        },
                        topics=['topic1', 'topic2'],
                        key_deserializer=byte_array_deserializer,
                        value_deserializer=long_deserializer,
                        expansion_service=expansion_address)

            failure_text = str(ctx.exception)
            self.assertTrue(
                'No resolvable bootstrap urls given in bootstrap.servers'
                in failure_text,
                'Expected to fail due to invalid bootstrap.servers, but '
                'failed due to:\n%s' % failure_text)