Example #1
    def test_bad_types(self):
        p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)

        # [START type_hints_missing_define_numbers]
        numbers = p | df.Create(['1', '2', '3'])
        # [END type_hints_missing_define_numbers]

        # Consider the following code.
        # [START type_hints_missing_apply]
        evens = numbers | df.Filter(lambda x: x % 2 == 0)
        # [END type_hints_missing_apply]

        # Now suppose numbers was defined as [snippet above].
        # When running this pipeline, you'd get a runtime error,
        # possibly on a remote machine, possibly very late.

        with self.assertRaises(TypeError):
            p.run()

        # To catch this early, we can assert what types we expect.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_takes]
            p.options.view_as(TypeOptions).pipeline_type_check = True
            evens = numbers | df.Filter(lambda x: x % 2 == 0).with_input_types(
                int)
            # [END type_hints_takes]

        # Type hints can also be declared on DoFns and callables themselves,
        # rather than where they're used, to be more self-contained.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_do_fn]
            @df.typehints.with_input_types(int)
            class FilterEvensDoFn(df.DoFn):
                def process(self, context):
                    if context.element % 2 == 0:
                        yield context.element

            evens = numbers | df.ParDo(FilterEvensDoFn())
            # [END type_hints_do_fn]

        words = p | df.Create('words', ['a', 'bb', 'c'])
        # Output types can be asserted on transforms as well. This helps
        # document the contract and checks it at pipeline construction time.
        # [START type_hints_transform]
        T = df.typehints.TypeVariable('T')

        @df.typehints.with_input_types(T)
        @df.typehints.with_output_types(df.typehints.Tuple[int, T])
        class MyTransform(df.PTransform):
            def apply(self, pcoll):
                return pcoll | df.Map(lambda x: (len(x), x))

        words_with_lens = words | MyTransform()
        # [END type_hints_transform]

        with self.assertRaises(typehints.TypeCheckError):
            words_with_lens | df.Map(lambda x: x).with_input_types(
                df.typehints.Tuple[int, int])
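A minimal sketch of how the mismatch above could be resolved, assuming the string elements are meant to be parsed as integers before filtering:

    numbers = p | df.Create(['1', '2', '3'])
    evens = (numbers
             | df.Map(int)  # parse the strings so the int hint is satisfied
             | df.Filter(lambda x: x % 2 == 0).with_input_types(int))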
Example #2
    def test_deferred_side_input_iterable(self):
        @typehints.with_input_types(str, typehints.Iterable[str])
        def concat(glue, items):
            return glue.join(sorted(items))

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', ['x', 'y', 'z'])
        result = main_input | df.Map(concat, pvalue.AsIter(side_input))
        assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))
        p.run()

        bad_side_input = p | df.Create('bad_side', [1, 2, 3])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('fail', concat, pvalue.AsIter(bad_side_input))
Example #3
 def test_runtime_checks_on(self):
     p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
     with self.assertRaises(typehints.TypeCheckError):
         # [START type_hints_runtime_on]
         p.options.view_as(TypeOptions).runtime_type_check = True
         p | df.Create(['a']) | df.Map(lambda x: 3).with_output_types(str)
         p.run()
 def test_basics(self):
     p = df.Pipeline('DirectPipelineRunner')
     rows = (p | df.Create('create', [
         {'month': 1, 'day': 1, 'tornado': False},
         {'month': 1, 'day': 2, 'tornado': True},
         {'month': 1, 'day': 3, 'tornado': True},
         {'month': 2, 'day': 1, 'tornado': True}]))
     results = bigquery_tornadoes.count_tornadoes(rows)
     df.assert_that(
         results,
         df.equal_to([{'month': 1, 'tornado_count': 2},
                      {'month': 2, 'tornado_count': 1}]))
     p.run()
    def _run_write_test(self,
                        data,
                        return_init_result=True,
                        return_write_results=True):
        write_to_test_sink = WriteToTestSink(return_init_result,
                                             return_write_results)
        p = Pipeline(options=PipelineOptions([]))
        result = p | df.Create('start', data) | write_to_test_sink

        assert_that(result, is_empty())
        p.run()

        sink = write_to_test_sink.last_sink
        self.assertIsNotNone(sink)

        self.assertEqual(sink.state, _TestSink.STATE_FINALIZED)
        if data:
            self.assertIsNotNone(sink.last_writer)
            self.assertEqual(sink.last_writer.state, _TestWriter.STATE_CLOSED)
            self.assertEqual(sink.last_writer.write_output, data)
            if return_init_result:
                self.assertEqual(sink.last_writer.init_result,
                                 _TestSink.TEST_INIT_RESULT)
                self.assertEqual(sink.init_result_at_finalize,
                                 _TestSink.TEST_INIT_RESULT)
            self.assertIsNotNone(sink.last_writer.uid)
            if return_write_results:
                self.assertEqual(sink.write_results_at_finalize,
                                 [_TestWriter.TEST_WRITE_RESULT])
        else:
            self.assertIsNone(sink.last_writer)
Example #6
    def test_deferred_side_inputs(self):
        @typehints.with_input_types(str, int)
        def repeat(s, times):
            return s * times

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', [3])
        result = main_input | df.Map(repeat, pvalue.AsSingleton(side_input))
        assert_that(result, equal_to(['aaa', 'bbbbbb', 'ccc']))
        p.run()

        bad_side_input = p | df.Create('bad_side', ['z'])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('again', repeat,
                                pvalue.AsSingleton(bad_side_input))
    def test_compute_top_sessions(self):
        p = df.Pipeline('DirectPipelineRunner')
        edits = p | df.Create('create', self.EDITS)
        result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

        df.assert_that(result, df.equal_to(self.EXPECTED))
        p.run()
Example #8
    def test_aggregation(self):

        mean = combiners.MeanCombineFn()
        mean.__name__ = 'mean'
        counter_types = [
            (sum, int, 6),
            (min, int, 0),
            (max, int, 3),
            (mean, int, 1),
            (sum, float, 6.0),
            (min, float, 0.0),
            (max, float, 3.0),
            (mean, float, 1.5),
            (any, int, True),
            (all, float, False),
        ]
        aggregators = [
            Aggregator('%s_%s' % (f.__name__, t.__name__), f, t)
            for f, t, _ in counter_types
        ]

        class UpdateAggregators(df.DoFn):
            def process(self, context):
                for a in aggregators:
                    context.aggregate_to(a, context.element)

        p = df.Pipeline('DirectPipelineRunner')
        p | df.Create([0, 1, 2, 3]) | df.ParDo(UpdateAggregators())
        res = p.run()
        for (_, _, expected), a in zip(counter_types, aggregators):
            actual = res.aggregated_values(a).values()[0]
            self.assertEqual(expected, actual)
            self.assertEqual(type(expected), type(actual))
Example #9
 def test_tfidf_transform(self):
     p = df.Pipeline('DirectPipelineRunner')
     uri_to_line = p | df.Create('create sample', [('1.txt', 'abc def ghi'),
                                                   ('2.txt', 'abc def'),
                                                   ('3.txt', 'abc')])
     result = (uri_to_line
               | tfidf.TfIdf()
               | df.Map('flatten', lambda (word, (uri, tfidf)):
                        (word, uri, tfidf)))
Example #10
 def test_empty_write(self):
     temp_path = tempfile.NamedTemporaryFile().name
     sink = MyFileSink(temp_path,
                       file_name_suffix='.foo',
                       coder=coders.ToStringCoder())
     p = df.Pipeline('DirectPipelineRunner')
     p | df.Create([]) | df.io.Write(sink)  # pylint: disable=expression-not-assigned
     p.run()
     self.assertEqual(
         open(temp_path + '-00000-of-00001.foo').read(), '[start][end]')
Example #11
def assert_that(pcoll, matcher):
    """Asserts that the give PCollection satisfies the constraints of the matcher
  in a way that is runnable locally or on a remote service.
  """
    singleton = pcoll.pipeline | df.Create('create_singleton', [None])

    def check_matcher(_, side_value):
        assert matcher(side_value)
        return []

    singleton | df.FlatMap(check_matcher, AsIter(pcoll))
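A typical call site, assuming the same equal_to matcher used in the tests above (a sketch, not part of the original module):

    output = p | df.Create('data', [1, 2, 3]) | df.Map(lambda x: x * 10)
    assert_that(output, equal_to([10, 20, 30]))
    p.run()  # the matcher only executes when the pipeline runs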
Example #12
def generate_julia_set_colors(pipeline, c, n, max_iterations):
    """Compute julia set coordinates for each point in our set."""
    def point_set(n):
        for x in range(n):
            for y in range(n):
                yield (x, y)

    julia_set_colors = (pipeline
                        | df.Create('add points', point_set(n))
                        | df.Map(get_julia_set_point_color, c, n,
                                 max_iterations))

    return julia_set_colors
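A call site might look like the sketch below; the complex constant, grid size, and iteration cap are illustrative values rather than the workflow's actual defaults:

    p = df.Pipeline('DirectPipelineRunner')
    colors = generate_julia_set_colors(p, c=complex(-0.62772, 0.42193),
                                       n=100, max_iterations=100)
    p.run()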
Example #13
    def test_create_groups(self):
        p = df.Pipeline('DirectPipelineRunner')

        group_ids_pcoll = p | df.Create('create_group_ids', ['A', 'B', 'C'])
        corpus_pcoll = p | df.Create(
            'create_corpus',
            [{'f': 'corpus1'}, {'f': 'corpus2'}, {'f': 'corpus3'}])
        words_pcoll = p | df.Create(
            'create_words',
            [{'f': 'word1'}, {'f': 'word2'}, {'f': 'word3'}])
        ignore_corpus_pcoll = p | df.Create('create_ignore_corpus',
                                            ['corpus1'])
        ignore_word_pcoll = p | df.Create('create_ignore_word', ['word1'])

        groups = bigquery_side_input.create_groups(group_ids_pcoll,
                                                   corpus_pcoll, words_pcoll,
                                                   ignore_corpus_pcoll,
                                                   ignore_word_pcoll)

        def group_matcher(actual):
            self.assertEqual(len(actual), 3)
            for group in actual:
                self.assertEqual(len(group), 3)
                self.assertTrue(group[1].startswith('corpus'))
                self.assertNotEqual(group[1], 'corpus1')
                self.assertTrue(group[2].startswith('word'))
                self.assertNotEqual(group[2], 'word1')

        df.assert_that(groups, group_matcher)
        p.run()
Example #14
    def test_combine_per_key_with_callable(self):
        """CombinePerKey using a standard callable reducing iterables.

    A common case for Dataflow combiners is to sum (or max or min) over the
    values of each key. Such standard functions can be used directly as combiner
    functions. In fact, any function "reducing" an iterable to a single value
    can be used.
    """
        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(sum))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 30), ('c', 100)]))
        result.pipeline.run()
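Any other callable that reduces an iterable to a single value works the same way, for instance max per key. The expected output below assumes SAMPLE_DATA holds key/value pairs equivalent to [('a', 1), ('a', 2), ('a', 3), ('b', 10), ('b', 20), ('c', 100)], which is consistent with the sums and products asserted in these tests:

    result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
              | df.Create(CombinersTest.SAMPLE_DATA)
              | df.CombinePerKey(max))

    df.assert_that(result, df.equal_to([('a', 3), ('b', 20), ('c', 100)]))
    result.pipeline.run()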
Example #16
    def test_combine_per_key_with_custom_callable(self):
        """CombinePerKey using a custom function reducing iterables."""
        def multiply(values):
            result = 1
            for v in values:
                result *= v
            return result

        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(multiply))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 200), ('c', 100)]))
        result.pipeline.run()
Example #17
def run(argv=None):
    """Run the workflow."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--output')
    parser.add_argument('--ignore_corpus', default='')
    parser.add_argument('--ignore_word', default='')
    parser.add_argument('--num_groups')

    known_args, pipeline_args = parser.parse_known_args(argv)
    p = df.Pipeline(argv=pipeline_args)

    group_ids = []
    for i in xrange(0, int(known_args.num_groups)):
        group_ids.append('id' + str(i))

    query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare'
    query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare'
    ignore_corpus = known_args.ignore_corpus
    ignore_word = known_args.ignore_word

    pcoll_corpus = p | df.Read('read corpus',
                               df.io.BigQuerySource(query=query_corpus))
    pcoll_word = p | df.Read('read words',
                             df.io.BigQuerySource(query=query_word))
    pcoll_ignore_corpus = p | df.Create('create_ignore_corpus',
                                        [ignore_corpus])
    pcoll_ignore_word = p | df.Create('create_ignore_word', [ignore_word])
    pcoll_group_ids = p | df.Create('create groups', group_ids)

    pcoll_groups = create_groups(pcoll_group_ids, pcoll_corpus, pcoll_word,
                                 pcoll_ignore_corpus, pcoll_ignore_word)

    # pylint:disable=expression-not-assigned
    pcoll_groups | df.io.Write('WriteToText',
                               df.io.TextFileSink(known_args.output))
    p.run()
Example #18
    def test_fixed_shard_write(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.foo',
                          num_shards=3,
                          shard_name_template='_NN_SSS_',
                          coder=coders.ToStringCoder())
        p = df.Pipeline('DirectPipelineRunner')
        p | df.Create(['a', 'b']) | df.io.Write(sink)  # pylint: disable=expression-not-assigned

        p.run()

        concat = ''.join(
            open(temp_path + '_03_%03d_.foo' % shard_num).read()
            for shard_num in range(3))
        self.assertTrue('][a][' in concat, concat)
        self.assertTrue('][b][' in concat, concat)
Example #19
 def test_top_prefixes(self):
     p = df.Pipeline('DirectPipelineRunner')
     words = p | df.Create('create', self.WORDS)
     result = words | autocomplete.TopPerPrefix('test', 5)
     # values must be hashable for now
     result = result | df.Map(lambda (k, vs): (k, tuple(vs)))
     assert_that(
         result,
         contains_in_any_order([
             ('t', ((3, 'to'), (2, 'this'), (1, 'that'))),
             ('to', ((3, 'to'), )),
             ('th', ((2, 'this'), (1, 'that'))),
             ('thi', ((2, 'this'), )),
             ('this', ((2, 'this'), )),
             ('tha', ((1, 'that'), )),
             ('that', ((1, 'that'), )),
         ]))
     p.run()
Example #20
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)
    # A hundred work items of a hundred thousand tries each.
    (p  # pylint: disable=expression-not-assigned
     | df.Create('Initialize', [100000] * 100).with_output_types(int)
     | df.Map('Run trials', run_trials)
     | df.CombineGlobally('Sum', combine_results).without_defaults()
     | df.io.Write('Write',
                   df.io.TextFileSink(known_args.output, coder=JsonCoder())))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
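run_trials and combine_results are defined elsewhere in this workflow. A plausible sketch of their shape for a Monte Carlo pi estimate (an assumption for illustration, not the original code):

    import random

    def run_trials(num_trials):
        """Returns (trials, hits) for random points inside the unit quarter-circle."""
        hits = 0
        for _ in xrange(num_trials):
            x, y = random.random(), random.random()
            if x * x + y * y <= 1.0:
                hits += 1
        return num_trials, hits

    def combine_results(results):
        """Sums the per-bundle counts and derives the pi estimate."""
        total, hits = [sum(col) for col in zip(*results)]
        return total, hits, 4.0 * hits / total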
Example #21
 def test_after_count(self):
     p = Pipeline('DirectPipelineRunner')
     result = (p
               | df.Create([1, 2, 3, 4, 5, 10, 11])
               | df.FlatMap(lambda t: [('A', t), ('B', t + 5)])
               | df.Map(lambda (k, t): TimestampedValue((k, t), t))
               | df.WindowInto(
                   FixedWindows(10),
                   trigger=AfterCount(3),
                   accumulation_mode=AccumulationMode.DISCARDING)
               | df.GroupByKey()
               | df.Map(lambda (k, v): ('%s-%s' % (k, len(v)), set(v))))
     assert_that(
         result,
         equal_to({
             'A-5': {1, 2, 3, 4, 5},
             # A-10, A-11 never emitted due to AfterCount(3) never firing.
             'B-4': {6, 7, 8, 9},
             'B-3': {10, 15, 16},
         }.iteritems()))
Example #22
    def test_pardo_side_input(self):
        p = df.Pipeline('DirectPipelineRunner')
        words = p | df.Create('start', ['a', 'bb', 'ccc', 'dddd'])

        # [START model_pardo_side_input]
        # Callable takes additional arguments.
        def filter_using_length(word, lower_bound, upper_bound=float('inf')):
            if lower_bound <= len(word) <= upper_bound:
                yield word

        # Construct a deferred side input.
        avg_word_len = words | df.Map(len) | df.CombineGlobally(
            df.combiners.MeanCombineFn())

        # Call with explicit side inputs.
        small_words = words | df.FlatMap('small', filter_using_length, 0, 3)

        # A single deferred side input.
        larger_than_average = words | df.FlatMap(
            'large',
            filter_using_length,
            lower_bound=pvalue.AsSingleton(avg_word_len))

        # Mix and match.
        small_but_nontrivial = words | df.FlatMap(
            filter_using_length,
            lower_bound=2,
            upper_bound=pvalue.AsSingleton(avg_word_len))
        # [END model_pardo_side_input]

        df.assert_that(small_words, df.equal_to(['a', 'bb', 'ccc']))
        df.assert_that(larger_than_average,
                       df.equal_to(['ccc', 'dddd']),
                       label='larger_than_average')
        df.assert_that(small_but_nontrivial,
                       df.equal_to(['bb']),
                       label='small_but_not_trivial')
        p.run()
def run(argv=None):
    """Run the workflow."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--output',
        required=True,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    from google.cloud.dataflow.internal.clients import bigquery  # pylint: disable=g-import-not-at-top

    table_schema = bigquery.TableSchema()

    # Fields that use standard types.
    kind_schema = bigquery.TableFieldSchema()
    kind_schema.name = 'kind'
    kind_schema.type = 'string'
    kind_schema.mode = 'nullable'
    table_schema.fields.append(kind_schema)

    full_name_schema = bigquery.TableFieldSchema()
    full_name_schema.name = 'fullName'
    full_name_schema.type = 'string'
    full_name_schema.mode = 'required'
    table_schema.fields.append(full_name_schema)

    age_schema = bigquery.TableFieldSchema()
    age_schema.name = 'age'
    age_schema.type = 'integer'
    age_schema.mode = 'nullable'
    table_schema.fields.append(age_schema)

    gender_schema = bigquery.TableFieldSchema()
    gender_schema.name = 'gender'
    gender_schema.type = 'string'
    gender_schema.mode = 'nullable'
    table_schema.fields.append(gender_schema)

    # A nested field
    phone_number_schema = bigquery.TableFieldSchema()
    phone_number_schema.name = 'phoneNumber'
    phone_number_schema.type = 'record'
    phone_number_schema.mode = 'nullable'

    area_code = bigquery.TableFieldSchema()
    area_code.name = 'areaCode'
    area_code.type = 'integer'
    area_code.mode = 'nullable'
    phone_number_schema.fields.append(area_code)

    number = bigquery.TableFieldSchema()
    number.name = 'number'
    number.type = 'integer'
    number.mode = 'nullable'
    phone_number_schema.fields.append(number)
    table_schema.fields.append(phone_number_schema)

    # A repeated field.
    children_schema = bigquery.TableFieldSchema()
    children_schema.name = 'children'
    children_schema.type = 'string'
    children_schema.mode = 'repeated'
    table_schema.fields.append(children_schema)

    def create_random_record(record_id):
        return {
            'kind': 'kind' + record_id,
            'fullName': 'fullName' + record_id,
            'age': int(record_id) * 10,
            'gender': 'male',
            'phoneNumber': {
                'areaCode': int(record_id) * 100,
                'number': int(record_id) * 100000
            },
            'children': [
                'child' + record_id + '1', 'child' + record_id + '2',
                'child' + record_id + '3'
            ]
        }

    # pylint: disable=expression-not-assigned
    record_ids = p | df.Create('CreateIDs', ['1', '2', '3', '4', '5'])
    records = record_ids | df.Map('CreateRecords', create_random_record)
    records | df.io.Write(
        'write',
        df.io.BigQuerySink(
            known_args.output,
            schema=table_schema,
            create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE))

    # Run the pipeline (all operations are deferred until run() is called).
    p.run()
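For reference, the record the helper above builds for id '1' is fully determined by the code:

    assert create_random_record('1') == {
        'kind': 'kind1',
        'fullName': 'fullName1',
        'age': 10,
        'gender': 'male',
        'phoneNumber': {'areaCode': 100, 'number': 100000},
        'children': ['child11', 'child12', 'child13']
    }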
Example #24
  def _get_result_for_month(self, month):
    p = df.Pipeline('DirectPipelineRunner')
    rows = (p | df.Create('create', self.input_data))

    results = filters.filter_cold_days(rows, month)
    return results
Example #25
 def test_runtime_checks_off(self):
     p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
     # [START type_hints_runtime_off]
     p | df.Create(['a']) | df.Map(lambda x: 3).with_output_types(str)
     p.run()