Esempio n. 1
0
    def test_cogroup_options(self):
        """
        Case: cogroup called with an explicit ``concurrency`` option.

        Keeping only the first (resp. second) cogrouped value is asserted to
        equal a plain ``group_by_key`` of the first (resp. second) input.
        """
        left = self._pipeline.parallelize([("A", 1), ("B", 1), ("A", 2)])
        right = self._pipeline.parallelize([("A", 3), ("C", 4)])

        grouped_left = left.group_by_key()
        grouped_right = right.group_by_key()

        # Two independent cogroups, each projecting one side of the values.
        only_left = (transforms.cogroup(left, right, concurrency=1)
                     .apply_values(lambda first, _second: first))
        only_right = (transforms.cogroup(left, right, concurrency=1)
                      .apply_values(lambda _first, second: second))

        # Cache every node before any of them is fetched.
        for node in (grouped_left, grouped_right, only_left, only_right):
            node.cache()

        self.assertEqual(grouped_left.get(), only_left.get())
        self.assertEqual(grouped_right.get(), only_right.get())
Esempio n. 2
0
    def test_cogroup_ptable(self):
        """
        Case: flatten a cogrouped PTable, directly and after
        ``apply_key_values``.
        """
        left = self._pipeline.parallelize(
            [("A", 4), ("A", 3), ("B", 2), ("A", 1)])
        right = self._pipeline.parallelize(
            [("A", 8), ("A", 6), ("B", 4), ("A", 2)])
        cogrouped = transforms.cogroup(left, right)

        # Flattening the raw cogroup result must raise a planning error.
        with self.assertRaises(error.BigflowPlanningException):
            cogrouped.flatten()

        def _merge_group(key, first, second, side, marker):
            # Parameter names are a guess based on the expected output below
            # (presumably: key, both value groups, side input) -- confirm.
            # The trailing plain argument must arrive as passed in.
            self.assertEqual(200, marker)
            return transforms.union(key, first, second, side)

        side_input = self._pipeline.parallelize([100])
        flattened = cogrouped.apply_key_values(
            _merge_group, side_input, 200).flatten()

        # Per key: the key itself, the values of both inputs, then 100.
        expected = [('A', item) for item in ('A', 4, 3, 1, 8, 6, 2, 100)]
        expected += [('B', item) for item in ('B', 2, 4, 100)]

        actual = flattened.get()
        self.assertItemsEqual(expected, actual)
Esempio n. 3
0
    def test_cogroup_options(self):
        """
        Case: cogroup with the ``concurrency`` option set.

        Each side of the cogrouped values is compared against the
        ``group_by_key`` result of the matching input.
        """
        first_input = self._pipeline.parallelize(
            [("A", 1), ("B", 1), ("A", 2)])
        second_input = self._pipeline.parallelize(
            [("A", 3), ("C", 4)])

        expected_first = first_input.group_by_key()
        expected_second = second_input.group_by_key()

        def _keep_first(head, _tail):
            return head

        def _keep_second(_head, tail):
            return tail

        cogrouped_a = transforms.cogroup(
            first_input, second_input, concurrency=1)
        actual_first = cogrouped_a.apply_values(_keep_first)

        cogrouped_b = transforms.cogroup(
            first_input, second_input, concurrency=1)
        actual_second = cogrouped_b.apply_values(_keep_second)

        # Cache all four nodes, in this order, before fetching any of them.
        expected_first.cache()
        expected_second.cache()
        actual_first.cache()
        actual_second.cache()

        self.assertEqual(expected_first.get(), actual_first.get())
        self.assertEqual(expected_second.get(), actual_second.get())
Esempio n. 4
0
    def cogroup(self, other, *others):
        """
        Equivalent to
        :func:`bigflow.transforms.cogroup(self, other, *others)
        <bigflow.transforms.cogroup>`.

        Args:
          other (PCollection):  the PCollection to cogroup with
          *others:  additional PCollections

        Returns:
          PTable:  the grouping result
        """
        return transforms.cogroup(self, other, *others)
Esempio n. 5
0
    def cogroup(self, other, *others):
        """
        Equivalent to
        :func:`bigflow.transforms.cogroup(self, other, *others)
        <bigflow.transforms.cogroup>`.

        Args:
          other (PCollection):  the PCollection to cogroup with
          *others:  additional PCollections

        Returns:
          PTable:  the grouping result
        """
        return transforms.cogroup(self, other, *others)
Esempio n. 6
0
def cogroup(*pcollections, **options):
    """
    Cogroup several input SchemaPCollections and return a PTable that
    represents the grouping result.

    The values of the PTable are all of the input PCollections; records are
    grouped together according to the given fields.

    Args:
        *pcollections (SchemaPCollection): the input SchemaPCollections
        **options: configuration; ``fields`` must be supplied as a
            str/tuple/list (a list, a tuple, or a comma-separated string
            naming the fields to group by)

    Returns:
        PTable: the grouping result

    Raises:
        ValueError: if no input pcollection is passed

    Examples:
        >>> from bigflow import base, schema, transforms
        >>> p = base.Pipeline.create('local')
        >>> p1 = p.parallelize([('a', 1), ('c', 2), ('a', 3), ('b', 2), ('d', 1)])
        >>> sp1 = p1.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>> p2 = p.parallelize([('a', 6), ('c', 8), ('a', 9), ('b', 0), ('d', 7)])
        >>> sp2 = p2.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>> csp = sp1.apply(schema.cogroup, sp2, fields=['websites'])
        >>> csp.apply_values(lambda x, y: transforms.union(x, y)).apply(schema.flatten).get()

        The call returns a list of dicts carrying the ``websites`` and
        ``clicknum`` keys.
        NOTE(review): the sample output listed in the original docstring did
        not match the sample inputs, so it is not reproduced here; presumably
        every input record comes back exactly once -- confirm.
    """
    if not pcollections:
        raise ValueError("No argument")

    fields = options.get('fields')

    def _wrap_values(*records):
        """ Internal helper: wrap every grouped value as a SchemaPCollection. """
        from bigflow import schema_pcollection
        return tuple(map(schema_pcollection.SchemaPCollection, records))

    checked_inputs = _check_set_args(fields, pcollections)
    grouped = transforms.cogroup(*checked_inputs)
    return grouped.apply_values(_wrap_values)
Esempio n. 7
0
def cogroup(*pcollections, **options):
    """
    Cogroup multiple input SchemaPCollections, returning a PTable that
    represents the grouping result.

    The PTable's values are all of the input PCollections, and the grouping
    is performed on the given fields.

    Args:
        *pcollections (SchemaPCollection): the input SchemaPCollections
        **options: configuration; ``fields`` must be supplied as a
            str/tuple/list (a list, a tuple, or a comma-separated string
            naming the fields to group by)

    Returns:
        PTable: the grouping result

    Raises:
        ValueError: if called without any input pcollection

    Examples:
        >>> from bigflow import base, schema, transforms
        >>> p = base.Pipeline.create('local')
        >>> p1 = p.parallelize([('a', 1), ('c', 2), ('a', 3), ('b', 2), ('d', 1)])
        >>> sp1 = p1.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>> p2 = p.parallelize([('a', 6), ('c', 8), ('a', 9), ('b', 0), ('d', 7)])
        >>> sp2 = p2.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>> csp = sp1.apply(schema.cogroup, sp2, fields=['websites'])
        >>> csp.apply_values(lambda x, y: transforms.union(x, y)).apply(schema.flatten).get()

        The call returns a list of dicts carrying the ``websites`` and
        ``clicknum`` keys.
        NOTE(review): the sample output listed in the original docstring did
        not match the sample inputs, so it is not reproduced here; presumably
        every input record comes back exactly once -- confirm.
    """
    if not pcollections:
        raise ValueError("No argument")

    group_fields = options.get('fields')

    def _to_schema_pcollections(*values):
        """ Internal helper: turn each grouped value into a SchemaPCollection. """
        from bigflow import schema_pcollection
        wrap = schema_pcollection.SchemaPCollection
        return tuple(wrap(value) for value in values)

    prepared = _check_set_args(group_fields, pcollections)
    return transforms.cogroup(*prepared).apply_values(_to_schema_pcollections)