Code Example #1
    def PerKey(pcoll, n, compare=None, *args, **kwargs):
        """Identifies the compare-most N elements associated with each key.

    This transform will produce a PCollection mapping unique keys in the input
    PCollection to the n greatest elements with which they are associated, where
    "greatest" is determined by the comparator function supplied as the compare
    argument.

    compare should be an implementation of "a < b" taking at least two arguments
    (a and b). Additional arguments and side inputs specified in the apply call
    become additional arguments to the comparator.  Defaults to the natural
    ordering of the elements.

    The arguments 'key' and 'reverse' may instead be passed as keyword
    arguments, and have the same meaning as for Python's sort functions.

    Args:
      pcoll: PCollection to process.
      n: number of elements to extract from pcoll.
      compare: as described above.
      *args: as described above.
      **kwargs: as described above.

    Raises:
      TypeCheckError: If the output type of the input PCollection is not
        compatible with KV[A, B].
    """
        key = kwargs.pop('key', None)
        reverse = kwargs.pop('reverse', False)
        return pcoll | core.CombinePerKey(
            TopCombineFn(n, compare, key, reverse), *args, **kwargs)
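
This appears to be combiners.Top.PerKey from apache_beam.transforms.combiners; assuming so, here is a minimal usage sketch (the player/score data is invented for illustration):

    import apache_beam as beam
    from apache_beam.transforms import combiners

    with beam.Pipeline() as p:
        scores = p | beam.Create([('alice', 3), ('alice', 8), ('bob', 5)])
        # Keep the two largest values per key; 'key' and 'reverse' behave as
        # in Python's sorted().
        top_two = scores | combiners.Top.PerKey(2)
        # Expected output: [('alice', [8, 3]), ('bob', [5])]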
Code Example #2
File: combiners.py  Project: wikier/beam
 def expand(self, pcoll):
     paired_with_void_type = KV[pcoll.element_type, Any]
     return (pcoll
             | (core.Map(
                 '%s:PairWithVoid' % self.label, lambda x:
                 (x, None)).with_output_types(paired_with_void_type))
             | core.CombinePerKey(CountCombineFn()))
Code Example #3
 def expand(self, pcoll):
     paired_with_void_type = typehints.Tuple[pcoll.element_type, Any]
     output_type = typehints.KV[pcoll.element_type, int]
     return (pcoll
             | ('%s:PairWithVoid' % self.label >> core.Map(lambda x: (
                 x, None)).with_output_types(paired_with_void_type))
             | core.CombinePerKey(
                 CountCombineFn()).with_output_types(output_type))
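
Examples #2 and #3 both pair every element with None and then count per key, which matches the expand of a per-element counter. Assuming they correspond to combiners.Count.PerElement, a minimal usage sketch (data invented for illustration):

    import apache_beam as beam
    from apache_beam.transforms import combiners

    with beam.Pipeline() as p:
        words = p | beam.Create(['cat', 'dog', 'cat'])
        # Each element becomes an (element, None) pair and is counted per key,
        # exactly as in the expand methods above.
        counts = words | combiners.Count.PerElement()
        # Expected output: [('cat', 2), ('dog', 1)]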
Code Example #4
File: top.py  Project: angulartist/meetuplytics
 def expand(self, pcoll):
     """Expands the transform.
     Raises TypeCheckError: If the output type of the input PCollection is not
     compatible with KV[A, B].
     Args:
       pcoll: PCollection to process
     Returns:
       the PCollection containing the result.
     """
     return pcoll | core.CombinePerKey(
         TopDistinctFn(self._n, self._compare, self._key,
                       self._reverse), *self._args, **self._kwargs)
Code Example #5
 def expand(self, pcoll):
     # These CombinePerKey stages will be packed if and only if
     # translations.pack_combiners is enabled in the TestPipeline runner.
     assert_that(pcoll | 'min-perkey' >> core.CombinePerKey(min),
                 equal_to([('a', -1)]),
                 label='assert-min-perkey')
     assert_that(pcoll | 'count-perkey' >> combiners.Count.PerKey(),
                 equal_to([('a', 10)]),
                 label='assert-count-perkey')
     assert_that(
         pcoll
         | 'largest-perkey' >> combiners.Top.LargestPerKey(2),
         equal_to([('a', [9, 6])]),
         label='assert-largest-perkey')
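
Example #5 is a fragment of a composite transform used in a test; the input PCollection is created elsewhere. A self-contained sketch of the same assertion pattern, with one invented input that satisfies all three expected results:

    import apache_beam as beam
    from apache_beam.transforms import combiners
    from apache_beam.testing.test_pipeline import TestPipeline
    from apache_beam.testing.util import assert_that, equal_to

    with TestPipeline() as p:
        # Ten values for key 'a': min is -1, count is 10, two largest are 9, 6.
        pcoll = p | beam.Create(
            [('a', v) for v in [-1, 9, 6, 5, 3, 2, 1, 0, 4, 6]])
        assert_that(pcoll | 'min-perkey' >> beam.CombinePerKey(min),
                    equal_to([('a', -1)]),
                    label='assert-min-perkey')
        assert_that(pcoll | 'count-perkey' >> combiners.Count.PerKey(),
                    equal_to([('a', 10)]),
                    label='assert-count-perkey')
        assert_that(pcoll | 'largest-perkey' >> combiners.Top.LargestPerKey(2),
                    equal_to([('a', [9, 6])]),
                    label='assert-largest-perkey')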
Code Example #6
def run(argv=None):
    """Construct the pipeline."""

    options = ImportAssetOptions(argv)

    p = beam.Pipeline(options=options)

    # Delete the BigQuery dataset's tables on pipeline start.
    deleted_tables = (
        p | beam.Create([None])  # Dummy PCollection used to trigger the table deletion.
        | 'delete_tables' >> beam.ParDo(
            DeleteDataSetTables(options.dataset, options.write_disposition)))

    # Clean up the JSON documents.
    sanitized_assets = (
        p | 'read' >> ReadFromText(options.input, coder=JsonCoder())
        | 'map_cai_properties' >> beam.ParDo(MapCAIProperties())
        | 'produce_resource_json' >> beam.ParDo(ProduceResourceJson(
            options.load_time, options.group_by))
        | 'bigquery_sanitize' >> beam.ParDo(BigQuerySanitize()))

    # Join all iam_policy objects with resources of the same name.
    merged_iam_and_asset = (
        sanitized_assets | 'name_key' >> beam.ParDo(AssignGroupByKey('NAME'))
        | 'group_by_name' >> beam.GroupByKey()
        | 'combine_policy' >> beam.ParDo(CombinePolicyResource()))

    # Split into BigQuery tables.
    keyed_assets = merged_iam_and_asset | 'group_by_key' >> beam.ParDo(
        AssignGroupByKey(options.group_by))

    # Generate BigQuery schema for each table.
    schemas = keyed_assets | 'to_schema' >> core.CombinePerKey(
        BigQuerySchemaCombineFn())

    # Write to GCS and load to BigQuery.
    # pylint: disable=expression-not-assigned
    (keyed_assets | 'group_assets_by_key' >> beam.GroupByKey()
     | 'write_to_gcs' >> beam.ParDo(
         WriteToGCS(options.stage, options.load_time))
     | 'group_written_objects_by_key' >> beam.GroupByKey()
     | 'load_to_bigquery' >> beam.ParDo(
         LoadToBigQuery(options.dataset, options.write_disposition,
                        options.load_time), beam.pvalue.AsDict(schemas),
         beam.pvalue.AsSingleton(deleted_tables)))

    return p.run()
Code Example #7
def run(argv=None):
    """Construct the pipeline."""

    options = ImportAssetOptions(argv)

    p = beam.Pipeline(options=options)

    # Clean up the JSON documents.
    sanitized = (p | 'read' >> ReadFromText(options.input, coder=JsonCoder())
                 | 'map_cai_properties' >> beam.ParDo(MapCAIProperties())
                 | 'produce_resource_json' >> beam.ParDo(
                     ProduceResourceJson(options.group_by))
                 | 'bigquery_sanitize' >> beam.ParDo(BigQuerySanitize()))

    # Join all iam_policy objects with resources of the same name.
    merged_iam = (sanitized | 'assign_name_key' >> beam.ParDo(
        AssignGroupByKey('NAME', options.num_shards))
                  | 'group_by_name' >> beam.GroupByKey()
                  | 'combine_policy' >> beam.ParDo(CombinePolicyResource()))

    # Split into BigQuery tables.
    keyed_assets = merged_iam | 'assign_group_by_key' >> beam.ParDo(
        AssignGroupByKey(options.group_by, options.num_shards))

    # Generate BigQuery schema for each table.
    schemas = keyed_assets | 'to_schema' >> core.CombinePerKey(
        BigQuerySchemaCombineFn())

    pvalue_schemas = beam.pvalue.AsDict(schemas)
    # Write to GCS and load to BigQuery.
    # pylint: disable=expression-not-assigned
    (keyed_assets
     | 'add_load_time' >> beam.ParDo(AddLoadTime(options.load_time))
     | 'group_by_key_before_enforce' >> beam.GroupByKey()
     | 'enforce_schema' >> beam.ParDo(EnforceSchemaDataTypes(), pvalue_schemas)
     | 'group_by_key_before_write' >> beam.GroupByKey()
     | 'write_to_gcs' >> beam.ParDo(
         WriteToGCS(options.stage, options.load_time))
     | 'group_written_objects_by_key' >> beam.GroupByKey()
     | 'delete_tables' >> beam.ParDo(
         DeleteDataSetTables(options.dataset, options.write_disposition))
     | 'load_to_bigquery' >> beam.ParDo(
         LoadToBigQuery(options.dataset, options.load_time),
         beam.pvalue.AsDict(schemas)))

    return p.run()
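
Examples #6 and #7 feed a project-specific CombineFn (BigQuerySchemaCombineFn) into core.CombinePerKey. For readers unfamiliar with that pattern, here is a minimal, self-contained sketch of a custom CombineFn used the same way; MaxLenFn is hypothetical and unrelated to the asset-inventory code:

    import apache_beam as beam

    class MaxLenFn(beam.CombineFn):
        """Tracks the maximum length of the string values seen for each key."""

        def create_accumulator(self):
            return 0

        def add_input(self, acc, value):
            return max(acc, len(value))

        def merge_accumulators(self, accs):
            return max(accs)

        def extract_output(self, acc):
            return acc

    with beam.Pipeline() as p:
        rows = p | beam.Create([('t1', 'abc'), ('t1', 'abcdef'), ('t2', 'x')])
        # CombinePerKey applies the CombineFn to the values of each key.
        max_lens = rows | beam.CombinePerKey(MaxLenFn())
        # Expected output: [('t1', 6), ('t2', 1)]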
Code Example #8
 def expand(self, pcoll):
     return pcoll | core.CombinePerKey(MeanCombineFn())
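
This is the expand used by a per-key mean combiner; assuming it is combiners.Mean.PerKey, a minimal usage sketch (data invented for illustration):

    import apache_beam as beam
    from apache_beam.transforms import combiners

    with beam.Pipeline() as p:
        scores = p | beam.Create([('a', 1), ('a', 3), ('b', 4)])
        # Average the values associated with each key.
        means = scores | combiners.Mean.PerKey()
        # Expected output: [('a', 2.0), ('b', 4.0)]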
Code Example #9
 def FixedSizePerKey(pcoll, n):
     return pcoll | core.CombinePerKey(SampleCombineFn(n))
Code Example #10
 def expand(self, pcoll):
     return pcoll | core.CombinePerKey(SampleCombineFn(self._n))
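
Examples #9 and #10 both build the per-key sampler from SampleCombineFn; assuming they correspond to combiners.Sample.FixedSizePerKey, a minimal usage sketch (data invented for illustration):

    import apache_beam as beam
    from apache_beam.transforms import combiners

    with beam.Pipeline() as p:
        kvs = p | beam.Create([('a', i) for i in range(10)])
        # Draw a uniform random sample of up to 3 values for each key.
        sampled = kvs | combiners.Sample.FixedSizePerKey(3)
        # Output: [('a', [<three randomly chosen values>])]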
Code Example #11
 def expand(self, pcoll):
   _ = pcoll | 'mean-perkey' >> combiners.Mean.PerKey()
   _ = pcoll | 'count-perkey' >> combiners.Count.PerKey()
   _ = pcoll | 'largest-perkey' >> core.CombinePerKey(combiners.Largest(1))
Code Example #12
 def expand(self, pcoll):
     return (pcoll
             | core.ParDo(self.add_timestamp).with_output_types(
                 Tuple[K, Tuple[T, TimestampType]])
             | core.CombinePerKey(LatestCombineFn()))
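
The expand above timestamps each (key, value) pair and keeps the latest one per key; assuming it is combiners.Latest.PerKey, a minimal usage sketch (sensor data invented for illustration):

    import apache_beam as beam
    from apache_beam.transforms import combiners
    from apache_beam.transforms.window import TimestampedValue

    with beam.Pipeline() as p:
        latest = (
            p
            | beam.Create([('sensor-1', 10, 1), ('sensor-1', 20, 2)])
            # Attach the third tuple field as the element's event timestamp.
            | beam.Map(lambda x: TimestampedValue((x[0], x[1]), x[2]))
            # Per key, keep the value carrying the largest event timestamp.
            | combiners.Latest.PerKey())
        # Expected output: [('sensor-1', 20)]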