def test_cogroup_options(self):
    # Checks that cogroup called with the concurrency=1 option yields, per
    # input, the same result as grouping that input by key on its own.
    input1 = self._pipeline.parallelize([("A", 1), ("B", 1), ("A", 2)])
    input2 = self._pipeline.parallelize([("A", 3), ("C", 4)])

    # Reference results: each input grouped independently.
    grouped_first = input1.group_by_key()
    grouped_second = input2.group_by_key()

    # Keep only the first input's side of every co-grouped value.
    cogrouped_for_first = transforms.cogroup(input1, input2, concurrency=1)
    picked_first = cogrouped_for_first.apply_values(
        lambda first, _second: first)

    # Keep only the second input's side of every co-grouped value.
    cogrouped_for_second = transforms.cogroup(input1, input2, concurrency=1)
    picked_second = cogrouped_for_second.apply_values(
        lambda _first, second: second)

    # Cache all four nodes (same order as before) ahead of the get() calls.
    for node in (grouped_first, grouped_second, picked_first, picked_second):
        node.cache()

    self.assertEqual(grouped_first.get(), picked_first.get())
    self.assertEqual(grouped_second.get(), picked_second.get())
def test_cogroup_ptable(self):
    """
    Case: test cogroup ptable

    The PTable returned by cogroup must refuse a direct flatten(), but it can
    be consumed through apply_key_values with extra arguments.
    """
    left = self._pipeline.parallelize([("A", 4), ("A", 3), ("B", 2), ("A", 1)])
    right = self._pipeline.parallelize([("A", 8), ("A", 6), ("B", 4), ("A", 2)])
    ptable = transforms.cogroup(left, right)

    # Flattening the co-grouped table directly is expected to be rejected.
    with self.assertRaises(error.BigflowPlanningException):
        ptable.flatten()

    def _test_value_fn(key, left_values, right_values, side, constant):
        # The trailing plain argument must arrive unchanged.
        self.assertEqual(200, constant)
        return transforms.union(key, left_values, right_values, side)

    side_input = self._pipeline.parallelize([100])
    applied = ptable.apply_key_values(_test_value_fn, side_input, 200)
    flattened = applied.flatten()

    expect = [
        ('A', 'A'), ('A', 4), ('A', 3), ('A', 1),
        ('A', 8), ('A', 6), ('A', 2), ('A', 100),
        ('B', 'B'), ('B', 2), ('B', 4), ('B', 100),
    ]

    actual = flattened.get()
    # Order-insensitive comparison of the flattened records.
    self.assertItemsEqual(expect, actual)
def cogroup(self, other, *others):
    """
    Equivalent to
    :func:`bigflow.transforms.cogroup(self, other, *others) <bigflow.transforms.cogroup>`.

    Args:
      other (PCollection):  the PCollection to co-group with
      *others:  further PCollections

    Returns:
      PTable:  the grouping result
    """
    # This object always comes first, followed by the caller-supplied inputs.
    inputs = (self, other) + others
    return transforms.cogroup(*inputs)
def cogroup(*pcollections, **options):
    """
    Co-group several input SchemaPCollections and return a PTable holding the
    grouping result.

    The value of the PTable is made of all the input PCollections; cogroup
    groups them together by the given fields.

    Args:
      *pcollection (SchemaPCollection): the input SchemaPCollections
      **options: configuration; ``fields`` must be set to a str/tuple/list
          (a list, a tuple, or a comma-separated string naming the fields
          to group on)

    Returns:
      PTable: the grouping result

    Raises:
      ValueError: if no input is passed at all

    Examples:
      >>> from bigflow import base, schema, transforms
      >>> p = base.Pipeline.create('local')
      >>> p1 = p.parallelize([('a', 1), ('c', 2), ('a', 3), ('b', 2), ('d', 1)])
      >>> sp1 = p1.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
      >>> p2 = p.parallelize([('a', 6), ('c', 8), ('a', 9), ('b', 0), ('d', 7)])
      >>> sp2 = p2.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
      >>> csp = sp1.apply(schema.cogroup, sp2, fields=['websites'])
      >>> csp.apply_values(lambda x, y: transforms.union(x, y)).apply(schema.flatten).get()
      The output is:
      [{'clicknum': 8, 'websites': 'b'}, {'clicknum': 2, 'websites': 'b'},
       {'clicknum': 9, 'websites': 'd'}, {'clicknum': 1, 'websites': 'd'},
       {'clicknum': 5, 'websites': 'a'}, {'clicknum': 7, 'websites': 'a'},
       {'clicknum': 1, 'websites': 'a'}, {'clicknum': 3, 'websites': 'a'},
       {'clicknum': 6, 'websites': 'c'}, {'clicknum': 2, 'websites': 'c'}]

      NOTE(review): the values listed above do not line up with the example
      inputs (e.g. a clicknum of 5 appears in neither input) -- the sample
      output looks stale and should be regenerated; TODO confirm.
    """
    if not pcollections:
        raise ValueError("No argument")

    fields = options.get('fields')

    def _trans_to_sp(*records):
        """ Internal helper: wrap every grouped record as a SchemaPCollection. """
        # Imported inside the helper, exactly as the original code did.
        from bigflow import schema_pcollection
        return tuple(map(schema_pcollection.SchemaPCollection, records))

    prepared_inputs = _check_set_args(fields, pcollections)
    cogrouped = transforms.cogroup(*prepared_inputs)
    return cogrouped.apply_values(_trans_to_sp)
def cogroup(*pcollections, **options):
    """
    Co-group several input SchemaPCollections and return a PTable holding the
    grouping result.

    The value of the PTable is made of all the input PCollections; cogroup
    groups them together by the given fields.

    Args:
      *pcollection (SchemaPCollection): the input SchemaPCollections
      **options: configuration; ``fields`` must be set to a str/tuple/list
          (a list, a tuple, or a comma-separated string naming the fields
          to group on)

    Returns:
      PTable: the grouping result

    Raises:
      ValueError: if no input is passed at all

    Examples:
      >>> from bigflow import base, schema, transforms
      >>> p = base.Pipeline.create('local')
      >>> p1 = p.parallelize([('a', 1), ('c', 2), ('a', 3), ('b', 2), ('d', 1)])
      >>> sp1 = p1.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
      >>> p2 = p.parallelize([('a', 6), ('c', 8), ('a', 9), ('b', 0), ('d', 7)])
      >>> sp2 = p2.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
      >>> csp = sp1.apply(schema.cogroup, sp2, fields=['websites'])
      >>> csp.apply_values(lambda x, y: transforms.union(x, y)).apply(schema.flatten).get()
      The output is:
      [{'clicknum': 8, 'websites': 'b'}, {'clicknum': 2, 'websites': 'b'},
       {'clicknum': 9, 'websites': 'd'}, {'clicknum': 1, 'websites': 'd'},
       {'clicknum': 5, 'websites': 'a'}, {'clicknum': 7, 'websites': 'a'},
       {'clicknum': 1, 'websites': 'a'}, {'clicknum': 3, 'websites': 'a'},
       {'clicknum': 6, 'websites': 'c'}, {'clicknum': 2, 'websites': 'c'}]

      NOTE(review): the values listed above do not line up with the example
      inputs (e.g. a clicknum of 5 appears in neither input) -- the sample
      output looks stale and should be regenerated; TODO confirm.
    """
    if not pcollections:
        raise ValueError("No argument")

    fields = options.get('fields')

    def _trans_to_sp(*records):
        """ Internal helper: wrap every grouped record as a SchemaPCollection. """
        # Imported inside the helper, exactly as the original code did.
        from bigflow import schema_pcollection
        wrapped = [schema_pcollection.SchemaPCollection(item) for item in records]
        return tuple(wrapped)

    prepared_inputs = _check_set_args(fields, pcollections)
    cogrouped = transforms.cogroup(*prepared_inputs)
    return cogrouped.apply_values(_trans_to_sp)