Esempio n. 1
0
    def expand(self, pcolls):
        """Groups the values of several tagged inputs under their shared keys.

        Args:
          pcolls: a mapping from tag to input PCollection. Each input's elements
            are unpacked as (key, value) pairs by the tagging step.

        Returns:
          The result of piping the tagged inputs through Flatten, GroupByKey and
          a final MapTuple step that emits (key, {tag: [values, ...]}) pairs.
          Every input tag is present in the dict, with an empty list when that
          input contributed nothing for the key.
        """
        # Each input has to pass the PCollection check and, if this transform
        # already has a pipeline, must belong to that same pipeline.
        for candidate in pcolls.values():
            self._check_pcollection(candidate)
            if self.pipeline:
                assert candidate.pipeline == self.pipeline

        tags = list(pcolls.keys())

        def add_tag(tag):
            # A factory gives each lambda its own `tag` binding instead of
            # sharing the loop variable used below.
            return lambda k, v: (k, (tag, v))

        def collect_values(key, tagged_values):
            # Seed one list per known tag so absent tags still map to [].
            by_tag = dict((known_tag, []) for known_tag in tags)
            for value_tag, value in tagged_values:
                by_tag[value_tag].append(value)
            return key, by_tag

        # Rewrite every input's elements as (key, (tag, value)).
        tagged_inputs = []
        for tag, pcoll in pcolls.items():
            step_label = 'Tag[%s]' % tag
            tagged_inputs.append(pcoll | step_label >> MapTuple(add_tag(tag)))

        flattened = tagged_inputs | Flatten(pipeline=self.pipeline)
        grouped = flattened | GroupByKey()
        return grouped | MapTuple(collect_values)
Esempio n. 2
0
    def expand(self, pcolls):
        """Performs CoGroupByKey on argument pcolls; see class docstring.

        Args:
          pcolls: either a dict mapping tags to PCollections, or an iterable of
            PCollections (tagged by their position). Elements of each input are
            unpacked as (key, value) pairs.

        Returns:
          The result of tagging every input, then applying Flatten, GroupByKey
          and a final Map that emits (key, result) pairs. For dict input,
          result is a dict mapping each tag to a list of values; otherwise it
          is a tuple holding one list of values per input position.
        """

        # For associating values in K-V pairs with the PCollections they came from.
        def _pair_tag_with_value(key_value, tag):
            (key, value) = key_value
            return (key, (tag, value))

        # Creates the key, value pairs for the output PCollection. Values are either
        # lists or dicts (per the class docstring), initialized by the result of
        # result_ctor(result_ctor_arg).
        def _merge_tagged_vals_under_key(key_grouped, result_ctor,
                                         result_ctor_arg):
            (key, grouped) = key_grouped
            result_value = result_ctor(result_ctor_arg)
            for tag, value in grouped:
                result_value[tag].append(value)
            return (key, result_value)

        try:
            # Only the .items() lookup distinguishes a dict from a sequence, so
            # it is the only statement guarded here. Iterating pcolls before
            # this point would exhaust a one-shot iterable and leave nothing
            # for the fallback branch to enumerate.
            tagged_pcolls = pcolls.items()
        except AttributeError:
            # Otherwise, pcolls is a list/tuple, so we turn it into (index, pcoll)
            # pairs. The result value constructor makes tuples with len(pcolls) slots.
            pcolls = list(enumerate(pcolls))
            result_ctor_arg = len(pcolls)
            result_ctor = lambda size: tuple([] for _ in range(size))
        else:
            # pcolls is a dict: use its (tag, pcoll) pairs in the general-purpose
            # code below. The result value constructor creates dicts whose keys
            # are the tags.
            result_ctor_arg = list(pcolls)
            result_ctor = lambda tags: {tag: [] for tag in tags}
            pcolls = list(tagged_pcolls)

        # Check input PCollections for PCollection-ness, and that they all belong
        # to the same pipeline.
        for _, pcoll in pcolls:
            self._check_pcollection(pcoll)
            if self.pipeline:
                assert pcoll.pipeline == self.pipeline

        return ([
            pcoll | 'pair_with_%s' % tag >> Map(_pair_tag_with_value, tag)
            for tag, pcoll in pcolls
        ]
                | Flatten(pipeline=self.pipeline)
                | GroupByKey()
                | Map(_merge_tagged_vals_under_key, result_ctor,
                      result_ctor_arg))
Esempio n. 3
0
            pcolls = list(enumerate(pcolls))
            result_ctor_arg = len(pcolls)
            result_ctor = lambda size: tuple([] for _ in xrange(size))

        # Check input PCollections for PCollection-ness, and that they all belong
        # to the same pipeline.
        for _, pcoll in pcolls:
            self._check_pcollection(pcoll)
            if self.pipeline:
                assert pcoll.pipeline == self.pipeline

        return ([
            pcoll | Map('pair_with_%s' % tag, _pair_tag_with_value, tag)
            for tag, pcoll in pcolls
        ]
                | Flatten(pipeline=self.pipeline)
                | GroupByKey()
                | Map(_merge_tagged_vals_under_key, result_ctor,
                      result_ctor_arg))


def Keys(label='Keys'):  # pylint: disable=invalid-name
    """Produces a PCollection of first elements of 2-tuples in a PCollection.

    Args:
      label: passed to Map as its first argument (defaults to 'Keys').

    Returns:
      Whatever Map(label, fn) returns, where fn maps a (k, v) pair to k.
    """
    def _first_of_pair(pair):
        # Unpack in the body rather than in the signature: tuple parameters
        # such as ``lambda (k, v): k`` are a SyntaxError on Python 3 (PEP 3113).
        # Unpacking still rejects elements that are not 2-item sequences.
        key, _ = pair
        return key

    return Map(label, _first_of_pair)


def Values(label='Values'):  # pylint: disable=invalid-name
    """Produces a PCollection of second elements of 2-tuples in a PCollection.

    Args:
      label: passed to Map as its first argument (defaults to 'Values').

    Returns:
      Whatever Map(label, fn) returns, where fn maps a (k, v) pair to v.
    """
    def _second_of_pair(pair):
        # Unpack in the body rather than in the signature: tuple parameters
        # such as ``lambda (k, v): v`` are a SyntaxError on Python 3 (PEP 3113).
        # Unpacking still rejects elements that are not 2-item sequences.
        _, value = pair
        return value

    return Map(label, _second_of_pair)