Beispiel #1
0
    def expand(self, pcolls):
        """Tag, merge and regroup a mapping of keyed PCollections.

        Each value of ``pcolls`` is validated with ``_check_pcollection`` and,
        when this transform has a pipeline set, asserted to belong to it.
        Every ``(k, v)`` element is rewritten to ``(k, (tag, v))``; the tagged
        inputs are then passed through ``Flatten`` and ``GroupByKey``, and each
        group is converted to ``(k, {tag: [values, ...]})`` with one list per
        tag in ``pcolls`` (empty when that tag contributed nothing).

        Args:
            pcolls: mapping from tag to PCollection.

        Returns:
            The result of applying the final ``MapTuple(collect_values)`` step.
        """
        # Validate the inputs: each must be a PCollection and all must share
        # this transform's pipeline (when one is set).
        for candidate in pcolls.values():
            self._check_pcollection(candidate)
            if self.pipeline:
                assert candidate.pipeline == self.pipeline

        tags = list(pcolls.keys())

        def add_tag(tag):
            # Factory so that every lambda captures its own ``tag`` value.
            return lambda k, v: (k, (tag, v))

        def collect_values(key, tagged_values):
            # Start with an empty bucket per known tag so absent tags still
            # appear in the output dict.
            by_tag = {}
            for known_tag in tags:
                by_tag[known_tag] = []
            for tag, value in tagged_values:
                by_tag[tag].append(value)
            return key, by_tag

        tagged_inputs = []
        for tag, pcoll in pcolls.items():
            tagged_inputs.append(
                pcoll | 'Tag[%s]' % tag >> MapTuple(add_tag(tag)))

        merged = tagged_inputs | Flatten(pipeline=self.pipeline)
        grouped = merged | GroupByKey()
        return grouped | MapTuple(collect_values)
Beispiel #2
0
    def expand(self, pcolls):
        """Run ``_CoGBKImpl`` on the inputs, then map results back to the
        caller's tags.

        ``pcolls`` is either a dict (tag -> PCollection) or an indexable
        sequence of PCollections. Dicts whose keys are all strings shorter
        than 10 characters are passed through unchanged. Otherwise the inputs
        are re-keyed by the stringified position, and a 'RestoreTags' step
        rebuilds a dict keyed by the original tags (dict input) or a tuple
        ordered by position (sequence input).

        Args:
            pcolls: dict or sequence of PCollections.

        Returns:
            The grouped result, with the 'RestoreTags' step applied when the
            tags had to be substituted.
        """
        if not isinstance(pcolls, dict):
            # Tags are tuple indices.
            positions = [str(ix) for ix in range(len(pcolls))]
            pcolls_dict = {
                pos: pcolls[ix]
                for (ix, pos) in enumerate(positions)
            }

            def restore_tags(vs):
                return tuple(vs[pos] for pos in positions)
        else:
            tags = list(pcolls.keys())
            small_string_tags = all(
                isinstance(tag, str) and len(tag) < 10 for tag in tags)
            if small_string_tags:
                # Small, string tags. Pass them as data.
                pcolls_dict = pcolls
                restore_tags = None
            else:
                # Pass the tags in the restore_tags closure.
                pcolls_dict = {
                    str(ix): pcolls[tag]
                    for (ix, tag) in enumerate(tags)
                }

                def restore_tags(vs):
                    return {
                        tag: vs[str(ix)]
                        for (ix, tag) in enumerate(tags)
                    }

        result = pcolls_dict | _CoGBKImpl(pipeline=self.pipeline)
        if not restore_tags:
            return result
        return result | 'RestoreTags' >> MapTuple(
            lambda k, vs: (k, restore_tags(vs)))
Beispiel #3
0
  def expand(self, pcolls):
    """Run ``_CoGBKImpl`` with output type hints, then restore caller tags.

    ``pcolls`` is either a dict (tag -> PCollection) or a sequence of
    PCollections. Dicts whose keys are all strings shorter than 10 characters
    are passed through unchanged; otherwise inputs are re-keyed by the
    stringified position and a 'RestoreTags' step maps each result back to a
    dict keyed by the original tags (dict input) or a tuple (sequence input).

    Key and value types are derived from each input's ``element_type`` and
    attached to both steps via ``with_output_types``.

    Args:
      pcolls: dict or sequence of PCollections.

    Returns:
      The grouped result, with 'RestoreTags' applied when tags were replaced.
    """
    if not isinstance(pcolls, dict):
      # Tags are tuple indices.
      tags = [str(ix) for ix in range(len(pcolls))]
      pcolls_dict = dict(zip(tags, pcolls))

      def restore_tags(vs):
        return tuple(vs[tag] for tag in tags)
    else:
      tags = list(pcolls.keys())
      if all(isinstance(tag, str) and len(tag) < 10 for tag in tags):
        # Small, string tags. Pass them as data.
        pcolls_dict = pcolls
        restore_tags = None
      else:
        # Pass the tags in the restore_tags closure.
        pcolls_dict = {str(ix): pcolls[tag] for (ix, tag) in enumerate(tags)}

        def restore_tags(vs):
          return {tag: vs[str(ix)] for (ix, tag) in enumerate(tags)}

    # One (key_type, value_type) pair per input, in input order.
    kv_types = [
        typehints.trivial_inference.key_value_types(pcoll.element_type)
        for pcoll in pcolls_dict.values()
    ]
    output_key_type = typehints.Union[tuple(k for k, _ in kv_types)]
    iterable_input_value_types = tuple(
        # TODO: Change List[t] to Iterable[t]
        typehints.List[v] for _, v in kv_types)

    # Falls back to Any when there are no inputs to build the Union from.
    grouped_value_type = typehints.Dict[
        str, typehints.Union[iterable_input_value_types or [typehints.Any]]]
    cogbk = _CoGBKImpl(pipeline=self.pipeline).with_output_types(
        typehints.Tuple[output_key_type, grouped_value_type])
    result = pcolls_dict | 'CoGroupByKeyImpl' >> cogbk

    if not restore_tags:
      return result

    if isinstance(pcolls, dict):
      dict_key_type = typehints.Union[tuple(
          trivial_inference.instance_to_type(tag) for tag in tags)]
      restored_value_type = typehints.Dict[
          dict_key_type, typehints.Union[iterable_input_value_types]]
    else:
      restored_value_type = typehints.Tuple[iterable_input_value_types]

    restore = MapTuple(lambda k, vs: (k, restore_tags(vs))).with_output_types(
        typehints.Tuple[output_key_type, restored_value_type])
    result |= 'RestoreTags' >> restore
    return result
Beispiel #4
0
def KvSwap(pcoll, label='KvSwap'):  # pylint: disable=invalid-name
    """Apply a labelled MapTuple turning each ``(k, v)`` pair into ``(v, k)``.

    Args:
        pcoll: input whose elements are 2-tuples.
        label: label given to the applied step.
    """
    swap_pair = MapTuple(lambda key, value: (value, key))
    return pcoll | label >> swap_pair
Beispiel #5
0
def Values(pcoll, label='Values'):  # pylint: disable=invalid-name
    """Apply a labelled MapTuple keeping only the second item of each pair.

    Args:
        pcoll: input whose elements are 2-tuples.
        label: label given to the applied step.
    """
    take_second = MapTuple(lambda _, value: value)
    return pcoll | label >> take_second
Beispiel #6
0
def Keys(pcoll, label='Keys'):  # pylint: disable=invalid-name
    """Apply a labelled MapTuple keeping only the first item of each pair.

    Args:
        pcoll: input whose elements are 2-tuples.
        label: label given to the applied step.
    """
    take_first = MapTuple(lambda key, _: key)
    return pcoll | label >> take_first