Example #1
    def PerKey(pcoll, n, compare=None, *args, **kwargs):
        """Identifies the compare-most N elements associated with each key.

        This transform will produce a PCollection mapping unique keys in the input
        PCollection to the n greatest elements with which they are associated, where
        "greatest" is determined by the comparator function supplied as the compare
        argument.

        compare should be an implementation of "a < b" taking at least two arguments
        (a and b). Additional arguments and side inputs specified in the apply call
        become additional arguments to the comparator.  Defaults to the natural
        ordering of the elements.

        The arguments 'key' and 'reverse' may instead be passed as keyword
        arguments, and have the same meaning as for Python's sort functions.

        Args:
          pcoll: PCollection to process.
          n: number of elements to extract from pcoll.
          compare: as described above.
          *args: as described above.
          **kwargs: as described above.

        Raises:
          TypeCheckError: If the output type of the input PCollection is not
            compatible with KV[A, B].
        """
        key = kwargs.pop('key', None)
        reverse = kwargs.pop('reverse', False)
        return pcoll | core.CombinePerKey(
            TopCombineFn(n, compare, key, reverse), *args, **kwargs)
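
For context, a minimal usage sketch of the resulting Top.PerKey transform, assuming the standard apache_beam package; the sample data and the printed output are illustrative:

import apache_beam as beam
from apache_beam.transforms import combiners

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a', 1), ('a', 7), ('a', 3), ('b', 5)])
        # Keep the 2 largest values per key; 'key' and 'reverse' behave
        # like the corresponding arguments of Python's sorted().
        | combiners.Top.PerKey(2)
        | beam.Map(print))  # expected: ('a', [7, 3]), ('b', [5])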
Example #2
 def expand(self, pcoll):
     paired_with_void_type = KV[pcoll.element_type, Any]
     return (pcoll
             | (core.Map(
                 '%s:PairWithVoid' % self.label, lambda x:
                 (x, None)).with_output_types(paired_with_void_type))
             | core.CombinePerKey(CountCombineFn()))
Example #3
 def expand(self, pcoll):
     paired_with_void_type = typehints.Tuple[pcoll.element_type, Any]
     output_type = typehints.KV[pcoll.element_type, int]
     return (pcoll
             | ('%s:PairWithVoid' % self.label >> core.Map(lambda x: (
                 x, None)).with_output_types(paired_with_void_type))
             | core.CombinePerKey(
                 CountCombineFn()).with_output_types(output_type))
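
Examples #2 and #3 are two revisions of the same idea behind per-element counting: pair every element with a dummy value, then count per key. A standalone sketch of that pattern, assuming CountCombineFn from apache_beam.transforms.combiners; data and output are illustrative:

import apache_beam as beam
from apache_beam.transforms.combiners import CountCombineFn

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['a', 'b', 'a', 'a'])
        | 'PairWithVoid' >> beam.Map(lambda x: (x, None))  # element -> (element, None)
        | beam.CombinePerKey(CountCombineFn())  # count the values for each key
        | beam.Map(print))  # expected: ('a', 3), ('b', 1)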
Example #4
 def expand(self, pcoll):
     """Expands the transform.
     Raises TypeCheckError: If the output type of the input PCollection is not
     compatible with KV[A, B].
     Args:
       pcoll: PCollection to process
     Returns:
       the PCollection containing the result.
     """
     return pcoll | core.CombinePerKey(
         TopDistinctFn(self._n, self._compare, self._key,
                       self._reverse), *self._args, **self._kwargs)
Example #5
 def expand(self, pcoll):
     # These CombinePerKey stages will be packed if and only if
     # translations.pack_combiners is enabled in the TestPipeline runner.
     assert_that(pcoll | 'min-perkey' >> core.CombinePerKey(min),
                 equal_to([('a', -1)]),
                 label='assert-min-perkey')
     assert_that(pcoll | 'count-perkey' >> combiners.Count.PerKey(),
                 equal_to([('a', 10)]),
                 label='assert-count-perkey')
     assert_that(
         pcoll
         | 'largest-perkey' >> combiners.Top.LargestPerKey(2),
         equal_to([('a', [9, 6])]),
         label='assert-largest-perkey')
Example #6
def run(argv=None):
    """Construct the pipeline."""

    options = ImportAssetOptions(argv)

    p = beam.Pipeline(options=options)

    # Delete the BigQuery dataset's tables on pipeline start.
    deleted_tables = (
        p | beam.Create([None])  # dummy PCollection to trigger delete tables.
        | 'delete_tables' >> beam.ParDo(
            DeleteDataSetTables(options.dataset, options.write_disposition)))

    # Clean up JSON documents.
    sanitized_assets = (
        p | 'read' >> ReadFromText(options.input, coder=JsonCoder())
        | 'map_cai_properties' >> beam.ParDo(MapCAIProperties())
        | 'produce_resource_json' >> beam.ParDo(ProduceResourceJson(
            options.load_time, options.group_by))
        | 'bigquery_sanitize' >> beam.ParDo(BigQuerySanitize()))

    # Joining all iam_policy objects with resources of the same name.
    merged_iam_and_asset = (
        sanitized_assets | 'name_key' >> beam.ParDo(AssignGroupByKey('NAME'))
        | 'group_by_name' >> beam.GroupByKey()
        | 'combine_policy' >> beam.ParDo(CombinePolicyResource()))

    # split into BigQuery tables.
    keyed_assets = merged_iam_and_asset | 'group_by_key' >> beam.ParDo(
        AssignGroupByKey(options.group_by))

    # Generate BigQuery schema for each table.
    schemas = keyed_assets | 'to_schema' >> core.CombinePerKey(
        BigQuerySchemaCombineFn())

    # Write to GCS and load to BigQuery.
    # pylint: disable=expression-not-assigned
    (keyed_assets | 'group_assets_by_key' >> beam.GroupByKey()
     | 'write_to_gcs' >> beam.ParDo(
         WriteToGCS(options.stage, options.load_time))
     | 'group_written_objects_by_key' >> beam.GroupByKey()
     | 'load_to_bigquery' >> beam.ParDo(
         LoadToBigQuery(options.dataset, options.write_disposition,
                        options.load_time), beam.pvalue.AsDict(schemas),
         beam.pvalue.AsSingleton(deleted_tables)))

    return p.run()
Example #7
def run(argv=None):
    """Construct the pipeline."""

    options = ImportAssetOptions(argv)

    p = beam.Pipeline(options=options)

    # Clean up JSON documents.
    sanitized = (p | 'read' >> ReadFromText(options.input, coder=JsonCoder())
                 | 'map_cai_properties' >> beam.ParDo(MapCAIProperties())
                 | 'produce_resource_json' >> beam.ParDo(
                     ProduceResourceJson(options.group_by))
                 | 'bigquery_sanitize' >> beam.ParDo(BigQuerySanitize()))

    # Joining all iam_policy objects with resources of the same name.
    merged_iam = (sanitized | 'assign_name_key' >> beam.ParDo(
        AssignGroupByKey('NAME', options.num_shards))
                  | 'group_by_name' >> beam.GroupByKey()
                  | 'combine_policy' >> beam.ParDo(CombinePolicyResource()))

    # split into BigQuery tables.
    keyed_assets = merged_iam | 'assign_group_by_key' >> beam.ParDo(
        AssignGroupByKey(options.group_by, options.num_shards))

    # Generate BigQuery schema for each table.
    schemas = keyed_assets | 'to_schema' >> core.CombinePerKey(
        BigQuerySchemaCombineFn())

    pvalue_schemas = beam.pvalue.AsDict(schemas)
    # Write to GCS and load to BigQuery.
    # pylint: disable=expression-not-assigned
    (keyed_assets
     | 'add_load_time' >> beam.ParDo(AddLoadTime(options.load_time))
     | 'group_by_key_before_enforce' >> beam.GroupByKey()
     | 'enforce_schema' >> beam.ParDo(EnforceSchemaDataTypes(), pvalue_schemas)
     | 'group_by_key_before_write' >> beam.GroupByKey()
     | 'write_to_gcs' >> beam.ParDo(
         WriteToGCS(options.stage, options.load_time))
     | 'group_written_objects_by_key' >> beam.GroupByKey()
     | 'delete_tables' >> beam.ParDo(
         DeleteDataSetTables(options.dataset, options.write_disposition))
     | 'load_to_bigquery' >> beam.ParDo(
         LoadToBigQuery(options.dataset, options.load_time),
         beam.pvalue.AsDict(schemas)))

    return p.run()
Example #8
 def expand(self, pcoll):
     return pcoll | core.CombinePerKey(MeanCombineFn())
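
This matches the usual expansion of Mean.PerKey. A minimal usage sketch, assuming apache_beam.transforms.combiners; data and output are illustrative:

import apache_beam as beam
from apache_beam.transforms import combiners

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a', 1), ('a', 3), ('b', 10)])
        | combiners.Mean.PerKey()  # average of the values for each key
        | beam.Map(print))  # expected: ('a', 2.0), ('b', 10.0)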
Example #9
 def FixedSizePerKey(pcoll, n):
     return pcoll | core.CombinePerKey(SampleCombineFn(n))
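
A usage sketch of the per-key sampler, assuming combiners.Sample.FixedSizePerKey from apache_beam; the sampled values are nondeterministic:

import apache_beam as beam
from apache_beam.transforms import combiners

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a', i) for i in range(10)] + [('b', 0)])
        | combiners.Sample.FixedSizePerKey(3)  # at most 3 randomly chosen values per key
        | beam.Map(print))  # e.g. ('a', [4, 7, 1]), ('b', [0])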
Example #10
 def expand(self, pcoll):
     return pcoll | core.CombinePerKey(SampleCombineFn(self._n))
Example #11
 def expand(self, pcoll):
   _ = pcoll | 'mean-perkey' >> combiners.Mean.PerKey()
   _ = pcoll | 'count-perkey' >> combiners.Count.PerKey()
   _ = pcoll | 'largest-perkey' >> core.CombinePerKey(combiners.Largest(1))
Example #12
 def expand(self, pcoll):
     return (pcoll
             | core.ParDo(self.add_timestamp).with_output_types(
                 Tuple[K, Tuple[T, TimestampType]])
             | core.CombinePerKey(LatestCombineFn()))
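
This appears to be the expansion of Latest.PerKey, which picks the value with the greatest event timestamp for each key. A sketch with explicit timestamps, assuming TimestampedValue from apache_beam.transforms.window; data and output are illustrative:

import apache_beam as beam
from apache_beam.transforms import combiners
from apache_beam.transforms.window import TimestampedValue

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a', 'old', 10), ('a', 'new', 20)])
        # Re-emit each element with an explicit event timestamp so that
        # "latest" is well defined.
        | beam.Map(lambda e: TimestampedValue((e[0], e[1]), e[2]))
        | combiners.Latest.PerKey()
        | beam.Map(print))  # expected: ('a', 'new')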