def test_key_value_serde(self):
    """Verify that _key_serde / _value_serde select the element serdes
    of list- and tuple-based composite serdes (first element is the key,
    second is the value)."""
    list_int_str = serde.of([int, str])
    tuple_str_int = serde.of((str, int))
    self.serde_eq(int, serde._key_serde(list_int_str, None))
    self.serde_eq(str, serde._key_serde(tuple_str_int, None))
    self.serde_eq(int, serde._value_serde(tuple_str_int, None))
    self.serde_eq(int, serde._value_serde(serde.of([str, int]), None))
def cogroup(*pcollections, **kargs):
    """Cogroup several PCollections by key into one PTable.

    Args:
        *pcollections: input PCollections; all are assumed to belong to the
            same pipeline (only the first one's pipeline is consulted).
        **kargs: optional settings — 'key_serde', 'value_serdes' (one per
            input), and 'concurrency' for the shuffle.

    Returns:
        PTable: a table whose value is the tuple of grouped collections.

    Raises:
        ValueError: if no argument is given or any input is not a PCollection.
    """
    from bigflow import serde
    if len(pcollections) == 0:
        raise ValueError("No argument")
    # Validate inputs up front, before touching PCollection-specific
    # attributes on pcollections[0].
    if not all(isinstance(p, pcollection.PCollection) for p in pcollections):
        raise ValueError("cogroup only applied on PCollections")
    pipeline = pcollections[0].pipeline()
    key_serde = kargs.get('key_serde', None)
    if key_serde is None:
        key_serde = serde._key_serde(pcollections[0].serde(),
                                     pipeline.default_objector())
    value_serdes = kargs.get('value_serdes', None)
    if value_serdes is None:
        value_serdes = [serde._value_serde(p.serde(), pipeline.default_objector())
                        for p in pcollections]

    def _make_shuffle(node, value_serde):
        # Match records by key, then strip the key and keep only the value.
        return pcollection.PCollection(
            node.match_by(KeyReader(None, key_serde)), pipeline) \
            .map(entity.ExtractValueFn(), serde=value_serde, scale=0.8)

    plan = pcollections[0].node().plan()
    scope = pcollections[0].node().scope()
    # Materialize as a list: a bare `map` object is an iterator on Python 3 —
    # plan.shuffle() would exhaust it (making the size estimate sum to 0) and
    # len(nodes) below would raise TypeError.
    nodes = [p.node() for p in pcollections]
    shuffle = plan.shuffle(scope, nodes)
    if 'concurrency' in kargs:
        shuffle.with_concurrency(kargs['concurrency'])
    elif pipeline.estimate_concurrency:
        # Floor division preserves the original Python 2 integer semantics.
        concurrency = sum(node.size() for node in nodes) // pipeline.size_per_concurrency
        shuffle.with_concurrency(concurrency)
    results = [_make_shuffle(shuffle.node(i), value_serdes[i])
               for i in range(len(nodes))]
    return ptable.PTable(tuple(results), key_serde=key_serde)
def cogroup(*pcollections, **kargs):
    """Cogroup several PCollections by key into one PTable.

    Args:
        *pcollections: input PCollections; all are assumed to belong to the
            same pipeline (only the first one's pipeline is consulted).
        **kargs: optional settings — 'key_serde', 'value_serdes' (one per
            input), and 'concurrency' for the shuffle.

    Returns:
        PTable: a table whose value is the tuple of grouped collections.

    Raises:
        ValueError: if no argument is given or any input is not a PCollection.
    """
    from bigflow import serde
    if len(pcollections) == 0:
        raise ValueError("No argument")
    # Validate inputs up front, before touching PCollection-specific
    # attributes on pcollections[0].
    if not all(isinstance(p, pcollection.PCollection) for p in pcollections):
        raise ValueError("cogroup only applied on PCollections")
    pipeline = pcollections[0].pipeline()
    key_serde = kargs.get('key_serde', None)
    if key_serde is None:
        key_serde = serde._key_serde(pcollections[0].serde(),
                                     pipeline.default_objector())
    value_serdes = kargs.get('value_serdes', None)
    if value_serdes is None:
        value_serdes = [serde._value_serde(p.serde(), pipeline.default_objector())
                        for p in pcollections]

    def _make_shuffle(node, value_serde):
        # Match records by key, then strip the key and keep only the value.
        return pcollection.PCollection(
            node.match_by(KeyReader(None, key_serde)), pipeline) \
            .map(entity.ExtractValueFn(), serde=value_serde, scale=0.8)

    plan = pcollections[0].node().plan()
    scope = pcollections[0].node().scope()
    # Materialize as a list: a bare `map` object is an iterator on Python 3 —
    # plan.shuffle() would exhaust it (making the size estimate sum to 0) and
    # len(nodes) below would raise TypeError.
    nodes = [p.node() for p in pcollections]
    shuffle = plan.shuffle(scope, nodes)
    if 'concurrency' in kargs:
        shuffle.with_concurrency(kargs['concurrency'])
    elif pipeline.estimate_concurrency:
        # Floor division preserves the original Python 2 integer semantics.
        concurrency = sum(node.size() for node in nodes) // pipeline.size_per_concurrency
        shuffle.with_concurrency(concurrency)
    results = [_make_shuffle(shuffle.node(i), value_serdes[i])
               for i in range(len(nodes))]
    return ptable.PTable(tuple(results), key_serde=key_serde)
def group_by_key(pcollection, **options):
    """Group the input PCollection of (key, value) records with the default
    key/value extraction functions, returning a PTable that represents the
    grouping.

    Args:
        pcollection (PCollection): input PCollection
        **options: configurable options ('key_serde', 'value_serde', ...)

    Returns:
        PTable: the grouping result

    >>> _p = _pipeline.parallelize([("A", 4), ("A", 3), ("B", 2), ("A", 1)])
    >>> transforms.group_by_key(_p).get()
    {"A": [4, 3, 1], "B": [2]}
    """
    import bigflow.transform_impls.functions
    from bigflow import serde

    pipeline = pcollection.pipeline()
    objector = pipeline.default_objector()
    if options.get('key_serde') is None:
        options['key_serde'] = serde._key_serde(pcollection.serde(), objector)
    if options.get('value_serde') is None:
        options['value_serde'] = serde._value_serde(pcollection.serde(), objector)
    # key_extractor=None makes group_by fall back to its default key extractor.
    return group_by(pcollection,
                    key_extractor=None,
                    value_extractor=bigflow.core.entity.ExtractValueFn(),
                    **options)
def group_by_key(pcollection, **options):
    """Group the given PCollection using a default key/value extractor pair
    and return a PTable describing the grouping.

    Args:
        pcollection (PCollection): input PCollection
        **options: configurable options ('key_serde', 'value_serde', ...)

    Returns:
        PTable: the grouping result

    >>> _p = _pipeline.parallelize([("A", 4), ("A", 3), ("B", 2), ("A", 1)])
    >>> transforms.group_by_key(_p).get()
    {"A": [4, 3, 1], "B": [2]}
    """
    import bigflow.transform_impls.functions
    from bigflow import serde

    pipeline = pcollection.pipeline()
    key_serde = options.get('key_serde', None)
    value_serde = options.get('value_serde', None)
    if key_serde is None:
        key_serde = serde._key_serde(
            pcollection.serde(), pipeline.default_objector())
    if value_serde is None:
        value_serde = serde._value_serde(
            pcollection.serde(), pipeline.default_objector())
    options.update(key_serde=key_serde, value_serde=value_serde)
    # A None key_extractor selects group_by's built-in default extractor.
    return group_by(
        pcollection,
        key_extractor=None,
        value_extractor=bigflow.core.entity.ExtractValueFn(),
        **options)