Example #1
def _ret_dict_handler(ret_dict, record_val):
    """ 内部函数 """
    inter_type_keys = []
    inter_type_values = []
    inter_type_flag = False
    ptype_keys = []
    ptype_values = []
    ptype_flag = False

    for key, value in ret_dict.items():
        if isinstance(value, ptype.PType):
            ptype_keys.append((key, value.serde()))
            ptype_values.append(value)
            ptype_flag = True
        else:
            inter_type_keys.append((key, type(value)))
            inter_type_values.append(value)
            inter_type_flag = True

    if ptype_flag and inter_type_flag:
        ptype_keys.extend(inter_type_keys)
        return tuple_to_dict(transforms.cartesian(*ptype_values)\
            .apply(transforms.map, lambda record: record + tuple(inter_type_values)),
                ptype_keys)
    elif not ptype_flag and inter_type_flag:
        from bigflow import schema_pcollection
        return schema_pcollection.SchemaPCollection(
            record_val.apply(
                transforms.map,
                lambda record: dict(
                    zip(tuple(key_sd[0] for key_sd in inter_type_keys),
                        inter_type_values)),
                serde=of(dict(inter_type_keys))))
    else:
        return tuple_to_dict(transforms.cartesian(*ptype_values), ptype_keys)
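
For intuition, here is a minimal plain-Python sketch (no Bigflow) of the mixed branch above: the PType-backed fields are combined by cartesian product, the constant fields are appended to every record, and the keys are then re-attached. The sample values and the use of lists as stand-ins for PType records are assumptions made purely for illustration.

from itertools import product

# Stand-ins: lists play the role of PType records, everything else is a constant.
ret_dict = {'c': [2], 'd': [5.0], 'e': [0, 1], 'tag': 'daily'}

dist_keys = [k for k, v in ret_dict.items() if isinstance(v, list)]
dist_vals = [ret_dict[k] for k in dist_keys]
const_keys = [k for k, v in ret_dict.items() if not isinstance(v, list)]
const_vals = [ret_dict[k] for k in const_keys]

# Cartesian product of the "distributed" fields, constants appended to each tuple,
# keys re-attached in the same order used above.
records = [dict(zip(dist_keys + const_keys, tup + tuple(const_vals)))
           for tup in product(*dist_vals)]
print(records)
# [{'c': 2, 'd': 5.0, 'e': 0, 'tag': 'daily'}, {'c': 2, 'd': 5.0, 'e': 1, 'tag': 'daily'}]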
Example #2
def _ret_tuple_handler(ret_tuple, record_val):
    """ 内部函数 """
    inter_type_sds = []
    inter_type_values = []
    inter_type_flag = False
    ptype_sds = []
    ptype_values = []
    ptype_flag = False

    for item in ret_tuple:
        if isinstance(item, ptype.PType):
            ptype_sds.append(item.serde())
            ptype_values.append(item)
            ptype_flag = True
        else:
            inter_type_sds.append(type(item))
            inter_type_values.append(item)
            inter_type_flag = True

    if ptype_flag and inter_type_flag:
        ptype_sds.extend(inter_type_sds)
        return transforms.cartesian(*ptype_values)\
            .apply(transforms.map, lambda record: record + tuple(inter_type_values),
                serde=serde.of(tuple(ptype_sds)))
    elif not ptype_flag and inter_type_flag:
        return record_val.apply(transforms.map,
                                lambda record: tuple(inter_type_values),
                                serde=serde.of(tuple(inter_type_sds)))
    else:
        return transforms.cartesian(*ptype_values)
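
The tuple handler follows the same pattern, just without field names. A tiny plain-Python sketch of the mixed branch, with made-up values standing in for the PType records:

from itertools import product

counts, sums = [2], [5.0]      # stand-ins for two PTypes' records
label = 'ok'                   # a plain constant returned alongside them

print([tup + (label,) for tup in product(counts, sums)])
# [(2, 5.0, 'ok')]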
Example #3
def agg(p, io_description, fn, *args, **kargs):
    """
    选择一些字段去做一些聚合操作。

    Args:
        p (pcollection): 输入数据集,需要是一个每个元素都是dict的pcollection
        io_description (str): 格式为:  a,b=>c,d,e  即,输入字段=>输出字段
        fn (callable) : 函数原型为 (*input_pcollections) => (*output_pcollection_or_pobjects)
                    即,该函数的输入参数为多个pcollection,
                    每个pcollection表示数据的一个字段的全部行所拼成的一个pcollection。
                    该函数的返回值是一些pobject或
                    pcollection所组成的tuple(如果只有一个元素可以不必返回tuple)。

    Returns:
        返回一个每个元素是一个dict的pcollection。
        这个pcollection中所有元素输出的几个pcollection进行笛卡尔积并添加字段名后的结果。

    例如:::

        >>> x = _pipeline.parallelize([{'a' : 1, 'b': 2.0}, {'a': 2, 'b': 3.0}])
        >>> print x.apply(fields.agg,
        >>>      'a, b => c, d, e',
        >>>      lambda a, b: (
        >>>         a.count(),
        >>>         b.sum(),
        >>>         a.flat_map(lambda x: xrange(x))
        >>>     )
        >>> ).get()

        [{'c': 2, 'd': 5.0, 'e': 0}, {'c': 2, 'd': 5.0, 'e': 0}, {'c': 2, 'd': 5.0, 'e': 1}]

    """
    io_fields = ''.join(io_description.split()).split('=>')
    assert len(io_fields) >= 1
    assert len(io_fields) <= 2

    select_fields = io_fields[0].split(',')
    io_fields.append(io_fields[0])
    out_fields = io_fields[1].split(',')

    fields = list(p.apply(select_cols, select_fields))
    fields.extend(args)
    ret = fn(*fields, **kargs)
    if isinstance(ret, ptype.PType):
        ret = (ret, )
    tp = transforms.cartesian(*ret)
    return tp.apply(transforms.map,
                    lambda tp: dict(zip(out_fields, tp)),
                    serde=get_out_fields_serde(tp.serde(), out_fields))
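
As a sanity check, the documented example can be reproduced in plain Python (no Bigflow): compute the three per-field aggregates, form their cartesian product, and zip each tuple with the output field names, exactly as the final transforms.map above does.

from itertools import product

rows = [{'a': 1, 'b': 2.0}, {'a': 2, 'b': 3.0}]
a = [r['a'] for r in rows]
b = [r['b'] for r in rows]

c = [len(a)]                              # a.count()
d = [sum(b)]                              # b.sum()
e = [i for x in a for i in range(x)]      # a.flat_map(lambda x: xrange(x))

out_fields = ['c', 'd', 'e']
print([dict(zip(out_fields, tp)) for tp in product(c, d, e)])
# [{'c': 2, 'd': 5.0, 'e': 0}, {'c': 2, 'd': 5.0, 'e': 0}, {'c': 2, 'd': 5.0, 'e': 1}]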
Example #4
    def cartesian(self, other, *others, **options):
        """
        Compute the cartesian product with other PCollections.

        Args:
          other (PCollection):  another PCollection
          *others:  additional PCollections

        Returns:
          PCollection:  a PCollection representing the result

        >>> _p1 = _pipeline.parallelize([1, 2, 3])
        >>> _p2 = _pipeline.parallelize([4, 5])
        >>> _p1.cartesian(_p2).get()
        [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)]
        """
        return transforms.cartesian(self, other, *others, **options)
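
The doctest above behaves like itertools.product over the underlying records; a plain-Python check:

from itertools import product

p1, p2 = [1, 2, 3], [4, 5]
print(list(product(p1, p2)))
# [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)]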
Example #5
    def cartesian(self, *pvalues, **options):
        """
        Compute the cartesian product of this PObject and pvalues.
        Equivalent to :func:`bigflow.transforms.cartesian(self, *pvalues, **options)
        <bigflow.transforms.cartesian>`

        Args:
            *pvalues (PObject/PCollection)

        Returns:
            PCollection: the cartesian product of this PObject and all the arguments.
            Each record of the result PCollection is a tuple whose n-th element is a
            record of the n-th input ptype object.

        >>> _p1 = _pipeline.parallelize(1)
        >>> _p2 = _pipeline.parallelize(2)
        >>> _p1.cartesian(_p2).get()
        [(1, 2)]
        >>> _p3 = _pipeline.parallelize([3, 4])
        >>> _p1.cartesian(_p3).get()
        [(1, 3), (1, 4)]
        >>> _p1.cartesian(_p2, _p3).get()
        [(1, 2, 3), (1, 2, 4)]

        """
        from bigflow import transforms
        return transforms.cartesian(self, *pvalues, **options)
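
Here a PObject contributes a single record, so it acts like a one-element input to the product; a plain-Python check of the multi-argument case:

from itertools import product

p1, p2, p3 = [1], [2], [3, 4]      # PObjects modelled as one-record inputs
print(list(product(p1, p3)))       # [(1, 3), (1, 4)]
print(list(product(p1, p2, p3)))   # [(1, 2, 3), (1, 2, 4)]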
Example #6
    def check_cartesian(self, expect, *pvalues):
        self.passertEqual(expect, pvalues[0].cartesian(*pvalues[1:]))
        self.passertEqual(expect, transforms.cartesian(*pvalues))