def expand(self, pcolls):
    """Group the values of several tagged inputs by their shared key.

    Args:
      pcolls: mapping from tag to an input PCollection whose elements are
        unpacked as ``(key, value)`` pairs.

    Returns:
      The result of tagging every input element, flattening all inputs
      together, grouping by key, and mapping each group to
      ``(key, {tag: [values, ...]})``. Every known tag is present in the
      dict, with an empty list when no value carried that tag.
    """
    # Each input must pass the PCollection check and, when this transform is
    # bound to a pipeline, must belong to that same pipeline.
    for input_pcoll in pcolls.values():
        self._check_pcollection(input_pcoll)
        if self.pipeline:
            assert input_pcoll.pipeline == self.pipeline

    tags = list(pcolls.keys())

    def add_tag(tag):
        # Bind `tag` per input so each mapper labels values with its own tag.
        return lambda k, v: (k, (tag, v))

    def collect_values(key, tagged_values):
        # Pre-seed every tag so tags without values still map to a list.
        values_by_tag = {tag: [] for tag in tags}
        for tag, value in tagged_values:
            values_by_tag[tag].append(value)
        return key, values_by_tag

    tagged_inputs = []
    for tag, input_pcoll in pcolls.items():
        tagged_inputs.append(
            input_pcoll | 'Tag[%s]' % tag >> MapTuple(add_tag(tag)))

    merged = tagged_inputs | Flatten(pipeline=self.pipeline)
    grouped = merged | GroupByKey()
    return grouped | MapTuple(collect_values)
def expand(self, pcolls):
    """Normalise the inputs to string tags, co-group them, then restore tags.

    Args:
      pcolls: either a dict mapping tags to inputs, or an indexable sequence
        of inputs whose positions act as the tags.

    Returns:
      The output of ``_CoGBKImpl`` applied to a string-keyed dict of the
      inputs. When the tags had to be replaced by stringified indices, a
      further 'RestoreTags' step maps each grouped value back to the
      caller's tags (a dict for dict input, a tuple otherwise).
    """
    if not isinstance(pcolls, dict):
        # Tags are tuple indices.
        num_tags = len(pcolls)
        pcolls_dict = {str(ix): pcolls[ix] for ix in range(num_tags)}
        restore_tags = lambda vs: tuple(vs[str(ix)] for ix in range(num_tags))
    elif all(isinstance(tag, str) and len(tag) < 10 for tag in pcolls.keys()):
        # Small, string tags. Pass them as data.
        pcolls_dict = pcolls
        restore_tags = None
    else:
        # Pass the tags in the restore_tags closure.
        tags = list(pcolls.keys())
        pcolls_dict = {
            str(ix): pcolls[tag]
            for (ix, tag) in enumerate(tags)
        }
        restore_tags = lambda vs: {
            tag: vs[str(ix)]
            for (ix, tag) in enumerate(tags)
        }

    result = pcolls_dict | _CoGBKImpl(pipeline=self.pipeline)
    if not restore_tags:
        return result

    return result | 'RestoreTags' >> MapTuple(
        lambda k, vs: (k, restore_tags(vs)))
def expand(self, pcolls):
    """Co-group the inputs under string tags and attach output type hints.

    Args:
      pcolls: either a dict mapping tags to inputs, or a sequence of inputs
        whose positions act as the tags.

    Returns:
      The output of the 'CoGroupByKeyImpl' step, declared as a
      ``Tuple[key, Dict[str, ...]]``. When the tags had to be replaced by
      stringified indices, a further 'RestoreTags' step maps each grouped
      value back to the caller's tags and is declared with the matching
      dict or tuple value type.
    """
    if not isinstance(pcolls, dict):
        # Tags are tuple indices.
        tags = [str(ix) for ix in range(len(pcolls))]
        pcolls_dict = dict(zip(tags, pcolls))
        restore_tags = lambda vs: tuple(vs[tag] for tag in tags)
    else:
        tags = list(pcolls.keys())
        if all(isinstance(tag, str) and len(tag) < 10 for tag in tags):
            # Small, string tags. Pass them as data.
            pcolls_dict = pcolls
            restore_tags = None
        else:
            # Pass the tags in the restore_tags closure.
            pcolls_dict = {
                str(ix): pcolls[tag]
                for (ix, tag) in enumerate(tags)
            }
            restore_tags = lambda vs: {
                tag: vs[str(ix)]
                for (ix, tag) in enumerate(tags)
            }

    # Split every input's element type into its key and value component.
    kv_types = [
        typehints.trivial_inference.key_value_types(pcoll.element_type)
        for pcoll in pcolls_dict.values()
    ]
    input_key_types = [key_type for key_type, _ in kv_types]
    input_value_types = [value_type for _, value_type in kv_types]

    output_key_type = typehints.Union[tuple(input_key_types)]
    iterable_input_value_types = tuple(
        # TODO: Change List[t] to Iterable[t]
        typehints.List[t] for t in input_value_types)

    # With no inputs there are no value types, so fall back to Any.
    output_value_type = typehints.Dict[
        str, typehints.Union[iterable_input_value_types or [typehints.Any]]]

    cogbk = _CoGBKImpl(pipeline=self.pipeline).with_output_types(
        typehints.Tuple[output_key_type, output_value_type])
    result = pcolls_dict | 'CoGroupByKeyImpl' >> cogbk

    if not restore_tags:
        return result

    # The restored value is keyed by the caller's own tags (dict input) or
    # is positional (sequence input), so its declared type changes too.
    if isinstance(pcolls, dict):
        dict_key_type = typehints.Union[tuple(
            trivial_inference.instance_to_type(tag) for tag in tags)]
        output_value_type = typehints.Dict[
            dict_key_type, typehints.Union[iterable_input_value_types]]
    else:
        output_value_type = typehints.Tuple[iterable_input_value_types]

    result |= 'RestoreTags' >> MapTuple(
        lambda k, vs: (k, restore_tags(vs))).with_output_types(
            typehints.Tuple[output_key_type, output_value_type])
    return result
def KvSwap(pcoll, label='KvSwap'):  # pylint: disable=invalid-name
    """Swap the two members of every 2-tuple in a PCollection.

    Args:
      pcoll: input whose elements are unpacked as ``(k, v)`` pairs.
      label: name given to the applied step.

    Returns:
      The result of applying a step that emits ``(v, k)`` for each
      ``(k, v)``.
    """
    swap_pair = MapTuple(lambda k, v: (v, k))
    return pcoll | label >> swap_pair
def Values(pcoll, label='Values'):  # pylint: disable=invalid-name
    """Keep only the second member of every 2-tuple in a PCollection.

    Args:
      pcoll: input whose elements are unpacked as 2-tuples.
      label: name given to the applied step.

    Returns:
      The result of applying a step that emits the second member of each
      pair.
    """
    take_second = MapTuple(lambda _, v: v)
    return pcoll | label >> take_second
def Keys(pcoll, label='Keys'):  # pylint: disable=invalid-name
    """Keep only the first member of every 2-tuple in a PCollection.

    Args:
      pcoll: input whose elements are unpacked as 2-tuples.
      label: name given to the applied step.

    Returns:
      The result of applying a step that emits the first member of each
      pair.
    """
    take_first = MapTuple(lambda k, _: k)
    return pcoll | label >> take_first