Example #1
    def PerKey(pcoll, n, compare=None, *args, **kwargs):
        """Identifies the compare-most N elements associated with each key.

        This transform will produce a PCollection mapping unique keys in the input
        PCollection to the n greatest elements with which they are associated, where
        "greatest" is determined by the comparator function supplied as the compare
        argument.

        compare should be an implementation of "a < b" taking at least two arguments
        (a and b). Additional arguments and side inputs specified in the apply call
        become additional arguments to the comparator.  Defaults to the natural
        ordering of the elements.

        The arguments 'key' and 'reverse' may instead be passed as keyword
        arguments, and have the same meaning as for Python's sort functions.

        Args:
          pcoll: PCollection to process.
          n: number of elements to extract from pcoll.
          compare: as described above.
          *args: as described above.
          **kwargs: as described above.

        Raises:
          TypeCheckError: If the output type of the input PCollection is not
            compatible with KV[A, B].
        """
        key = kwargs.pop('key', None)
        reverse = kwargs.pop('reverse', False)
        return pcoll | core.CombinePerKey(
            TopCombineFn(n, compare, key, reverse), *args, **kwargs)
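
For context, a minimal usage sketch of the resulting Top.PerKey transform, assuming the standard apache_beam package; the sample data and the printed output are illustrative:

import apache_beam as beam
from apache_beam.transforms import combiners

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a', 1), ('a', 7), ('a', 3), ('b', 5)])
        # Keep the 2 largest values per key; 'key' and 'reverse' behave
        # like the corresponding arguments of Python's sorted().
        | combiners.Top.PerKey(2)
        | beam.Map(print))  # expected: ('a', [7, 3]), ('b', [5])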
Example #2
 def expand(self, pcoll):
     paired_with_void_type = KV[pcoll.element_type, Any]
     return (pcoll
             | (core.Map(
                 '%s:PairWithVoid' % self.label, lambda x:
                 (x, None)).with_output_types(paired_with_void_type))
             | core.CombinePerKey(CountCombineFn()))
Example #3
 def expand(self, pcoll):
     paired_with_void_type = typehints.Tuple[pcoll.element_type, Any]
     output_type = typehints.KV[pcoll.element_type, int]
     return (pcoll
             | ('%s:PairWithVoid' % self.label >> core.Map(lambda x: (
                 x, None)).with_output_types(paired_with_void_type))
             | core.CombinePerKey(
                 CountCombineFn()).with_output_types(output_type))
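
Examples #2 and #3 are two revisions of the same idea behind per-element counting: pair every element with a dummy value, then count per key. A standalone sketch of that pattern, assuming CountCombineFn from apache_beam.transforms.combiners; data and output are illustrative:

import apache_beam as beam
from apache_beam.transforms.combiners import CountCombineFn

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['a', 'b', 'a', 'a'])
        | 'PairWithVoid' >> beam.Map(lambda x: (x, None))  # element -> (element, None)
        | beam.CombinePerKey(CountCombineFn())  # count the values for each key
        | beam.Map(print))  # expected: ('a', 3), ('b', 1)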
Example #4
 def expand(self, pcoll):
     """Expands the transform.
     Raises TypeCheckError: If the output type of the input PCollection is not
     compatible with KV[A, B].
     Args:
       pcoll: PCollection to process
     Returns:
       the PCollection containing the result.
     """
     return pcoll | core.CombinePerKey(
         TopDistinctFn(self._n, self._compare, self._key,
                       self._reverse), *self._args, **self._kwargs)
Example #5
 def expand(self, pcoll):
     # These CombinePerKey stages will be packed if and only if
     # translations.pack_combiners is enabled in the TestPipeline runner.
     assert_that(pcoll | 'min-perkey' >> core.CombinePerKey(min),
                 equal_to([('a', -1)]),
                 label='assert-min-perkey')
     assert_that(pcoll | 'count-perkey' >> combiners.Count.PerKey(),
                 equal_to([('a', 10)]),
                 label='assert-count-perkey')
     assert_that(
         pcoll
         | 'largest-perkey' >> combiners.Top.LargestPerKey(2),
         equal_to([('a', [9, 6])]),
         label='assert-largest-perkey')
Example #6
def run(argv=None):
    """Construct the pipeline."""

    options = ImportAssetOptions(argv)

    p = beam.Pipeline(options=options)

    # Delete the BigQuery dataset's tables on pipeline start.
    deleted_tables = (
        p | beam.Create([None])  # dummy PCollection to trigger delete tables.
        | 'delete_tables' >> beam.ParDo(
            DeleteDataSetTables(options.dataset, options.write_disposition)))

    # Clean up JSON documents.
    sanitized_assets = (
        p | 'read' >> ReadFromText(options.input, coder=JsonCoder())
        | 'map_cai_properties' >> beam.ParDo(MapCAIProperties())
        | 'produce_resource_json' >> beam.ParDo(ProduceResourceJson(
            options.load_time, options.group_by))
        | 'bigquery_sanitize' >> beam.ParDo(BigQuerySanitize()))

    # Joining all iam_policy objects with resources of the same name.
    merged_iam_and_asset = (
        sanitized_assets | 'name_key' >> beam.ParDo(AssignGroupByKey('NAME'))
        | 'group_by_name' >> beam.GroupByKey()
        | 'combine_policy' >> beam.ParDo(CombinePolicyResource()))

    # split into BigQuery tables.
    keyed_assets = merged_iam_and_asset | 'group_by_key' >> beam.ParDo(
        AssignGroupByKey(options.group_by))

    # Generate BigQuery schema for each table.
    schemas = keyed_assets | 'to_schema' >> core.CombinePerKey(
        BigQuerySchemaCombineFn())

    # Write to GCS and load to BigQuery.
    # pylint: disable=expression-not-assigned
    (keyed_assets | 'group_assets_by_key' >> beam.GroupByKey()
     | 'write_to_gcs' >> beam.ParDo(
         WriteToGCS(options.stage, options.load_time))
     | 'group_written_objects_by_key' >> beam.GroupByKey()
     | 'load_to_bigquery' >> beam.ParDo(
         LoadToBigQuery(options.dataset, options.write_disposition,
                        options.load_time), beam.pvalue.AsDict(schemas),
         beam.pvalue.AsSingleton(deleted_tables)))

    return p.run()
Example #7
def run(argv=None):
    """Construct the pipeline."""

    options = ImportAssetOptions(argv)

    p = beam.Pipeline(options=options)

    # Clean up JSON documents.
    sanitized = (p | 'read' >> ReadFromText(options.input, coder=JsonCoder())
                 | 'map_cai_properties' >> beam.ParDo(MapCAIProperties())
                 | 'produce_resource_json' >> beam.ParDo(
                     ProduceResourceJson(options.group_by))
                 | 'bigquery_sanitize' >> beam.ParDo(BigQuerySanitize()))

    # Joining all iam_policy objects with resources of the same name.
    merged_iam = (sanitized | 'assign_name_key' >> beam.ParDo(
        AssignGroupByKey('NAME', options.num_shards))
                  | 'group_by_name' >> beam.GroupByKey()
                  | 'combine_policy' >> beam.ParDo(CombinePolicyResource()))

    # split into BigQuery tables.
    keyed_assets = merged_iam | 'assign_group_by_key' >> beam.ParDo(
        AssignGroupByKey(options.group_by, options.num_shards))

    # Generate BigQuery schema for each table.
    schemas = keyed_assets | 'to_schema' >> core.CombinePerKey(
        BigQuerySchemaCombineFn())

    pvalue_schemas = beam.pvalue.AsDict(schemas)
    # Write to GCS and load to BigQuery.
    # pylint: disable=expression-not-assigned
    (keyed_assets
     | 'add_load_time' >> beam.ParDo(AddLoadTime(options.load_time))
     | 'group_by_key_before_enforce' >> beam.GroupByKey()
     | 'enforce_schema' >> beam.ParDo(EnforceSchemaDataTypes(), pvalue_schemas)
     | 'group_by_key_before_write' >> beam.GroupByKey()
     | 'write_to_gcs' >> beam.ParDo(
         WriteToGCS(options.stage, options.load_time))
     | 'group_written_objects_by_key' >> beam.GroupByKey()
     | 'delete_tables' >> beam.ParDo(
         DeleteDataSetTables(options.dataset, options.write_disposition))
     | 'load_to_bigquery' >> beam.ParDo(
         LoadToBigQuery(options.dataset, options.load_time),
         beam.pvalue.AsDict(schemas)))

    return p.run()
Example #8
 def expand(self, pcoll):
     return pcoll | core.CombinePerKey(MeanCombineFn())
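
This matches the usual expansion of Mean.PerKey. A minimal usage sketch, assuming apache_beam.transforms.combiners; data and output are illustrative:

import apache_beam as beam
from apache_beam.transforms import combiners

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a', 1), ('a', 3), ('b', 10)])
        | combiners.Mean.PerKey()  # average of the values for each key
        | beam.Map(print))  # expected: ('a', 2.0), ('b', 10.0)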
Example #9
 def FixedSizePerKey(pcoll, n):
     return pcoll | core.CombinePerKey(SampleCombineFn(n))
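
A usage sketch of the per-key sampler, assuming combiners.Sample.FixedSizePerKey from apache_beam; the sampled values are nondeterministic:

import apache_beam as beam
from apache_beam.transforms import combiners

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a', i) for i in range(10)] + [('b', 0)])
        | combiners.Sample.FixedSizePerKey(3)  # at most 3 randomly chosen values per key
        | beam.Map(print))  # e.g. ('a', [4, 7, 1]), ('b', [0])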
Example #10
 def expand(self, pcoll):
     return pcoll | core.CombinePerKey(SampleCombineFn(self._n))
Example #11
 def expand(self, pcoll):
   _ = pcoll | 'mean-perkey' >> combiners.Mean.PerKey()
   _ = pcoll | 'count-perkey' >> combiners.Count.PerKey()
   _ = pcoll | 'largest-perkey' >> core.CombinePerKey(combiners.Largest(1))
Example #12
 def expand(self, pcoll):
     return (pcoll
             | core.ParDo(self.add_timestamp).with_output_types(
                 Tuple[K, Tuple[T, TimestampType]])
             | core.CombinePerKey(LatestCombineFn()))
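
This appears to be the expansion of Latest.PerKey, which picks the value with the greatest event timestamp for each key. A sketch with explicit timestamps, assuming TimestampedValue from apache_beam.transforms.window; data and output are illustrative:

import apache_beam as beam
from apache_beam.transforms import combiners
from apache_beam.transforms.window import TimestampedValue

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a', 'old', 10), ('a', 'new', 20)])
        # Re-emit each element with an explicit event timestamp so that
        # "latest" is well defined.
        | beam.Map(lambda e: TimestampedValue((e[0], e[1]), e[2]))
        | combiners.Latest.PerKey()
        | beam.Map(print))  # expected: ('a', 'new')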