def _ret_dict_handler(ret_dict, record_val):
    """
    Internal helper: turn a dict returned by a user function into a PType.

    Splits ``ret_dict`` into distributed values (PType instances) and plain
    local Python values, then combines them:

      * mix of both      -> cartesian product of the PTypes with the plain
                            values appended onto every record
      * only plain values -> map every record of ``record_val`` to the same
                             constant dict
      * only PTypes       -> cartesian product re-labeled with the dict keys
    """
    plain_keys = []
    plain_values = []
    dist_keys = []
    dist_values = []
    for key, value in ret_dict.items():
        if isinstance(value, ptype.PType):
            dist_keys.append((key, value.serde()))
            dist_values.append(value)
        else:
            plain_keys.append((key, type(value)))
            plain_values.append(value)

    if dist_values and plain_values:
        dist_keys.extend(plain_keys)
        combined = transforms.cartesian(*dist_values).apply(
            transforms.map,
            lambda record: record + tuple(plain_values))
        return tuple_to_dict(combined, dist_keys)
    if plain_values:
        # Purely local values: every input record maps to one constant dict.
        from bigflow import schema_pcollection
        field_names = tuple(key_sd[0] for key_sd in plain_keys)
        return schema_pcollection.SchemaPCollection(
            record_val.apply(
                transforms.map,
                lambda record: dict(zip(field_names, plain_values)),
                serde=of(dict(plain_keys))))
    return tuple_to_dict(transforms.cartesian(*dist_values), dist_keys)
def _ret_tuple_handler(ret_tuple, record_val):
    """
    Internal helper: turn a tuple returned by a user function into a PType.

    Partitions ``ret_tuple`` into distributed values (PType instances) and
    plain local Python values, then combines them:

      * mix of both       -> cartesian product of the PTypes with the plain
                             values appended onto every record
      * only plain values -> map every record of ``record_val`` to the same
                             constant tuple
      * only PTypes       -> plain cartesian product
    """
    local_serdes = []
    local_values = []
    dist_serdes = []
    dist_values = []
    for item in ret_tuple:
        if isinstance(item, ptype.PType):
            dist_serdes.append(item.serde())
            dist_values.append(item)
        else:
            local_serdes.append(type(item))
            local_values.append(item)

    if dist_values and local_values:
        dist_serdes.extend(local_serdes)
        return transforms.cartesian(*dist_values).apply(
            transforms.map,
            lambda record: record + tuple(local_values),
            serde=serde.of(tuple(dist_serdes)))
    if local_values:
        # Purely local values: each input record becomes the same tuple.
        return record_val.apply(
            transforms.map,
            lambda record: tuple(local_values),
            serde=serde.of(tuple(local_serdes)))
    return transforms.cartesian(*dist_values)
def _ret_dict_handler(ret_dict, record_val):
    """
    Internal helper: wrap a dict returned by a user function into a PType.

    Each value of ``ret_dict`` is either a PType (a distributed dataset) or
    an ordinary local value. The two groups are recombined into a single
    dict-shaped PCollection: PTypes contribute via a cartesian product and
    local values are attached as constants to every record.
    """
    const_keys = []
    const_values = []
    pcoll_keys = []
    pcoll_values = []
    for name, val in ret_dict.items():
        if isinstance(val, ptype.PType):
            pcoll_keys.append((name, val.serde()))
            pcoll_values.append(val)
        else:
            const_keys.append((name, type(val)))
            const_values.append(val)

    has_pcoll = bool(pcoll_values)
    has_const = bool(const_values)

    if has_pcoll and has_const:
        pcoll_keys.extend(const_keys)
        product = transforms.cartesian(*pcoll_values)
        appended = product.apply(
            transforms.map,
            lambda record: record + tuple(const_values))
        return tuple_to_dict(appended, pcoll_keys)
    elif has_const:
        # Only local values were returned: every record of record_val is
        # replaced by the same constant dict.
        from bigflow import schema_pcollection
        names = tuple(key_sd[0] for key_sd in const_keys)
        mapped = record_val.apply(
            transforms.map,
            lambda record: dict(zip(names, const_values)),
            serde=of(dict(const_keys)))
        return schema_pcollection.SchemaPCollection(mapped)
    else:
        return tuple_to_dict(transforms.cartesian(*pcoll_values), pcoll_keys)
def agg(p, io_description, fn, *args, **kargs):
    """
    Select some fields and apply aggregations on them.

    Args:
        p (PCollection): input dataset; every element must be a dict.
        io_description (str): of the form "a,b=>c,d,e", i.e.
            "input fields => output fields". If the "=>" part is omitted,
            the input field names are reused as the output field names.
        fn (callable): signature is
            (*input_pcollections) => (*output_pcollection_or_pobjects).
            Each input PCollection holds all rows of one selected field.
            The return value is a tuple of PObjects/PCollections (a single
            PType may be returned without wrapping it in a tuple).
        *args: extra positional arguments forwarded to ``fn``.
        **kargs: extra keyword arguments forwarded to ``fn``.

    Returns:
        PCollection: a PCollection whose elements are dicts — the cartesian
        product of the PCollections returned by ``fn``, with the output
        field names attached.

    Raises:
        ValueError: if ``io_description`` contains more than one "=>".

    Example:
        >>> x = _pipeline.parallelize([{'a' : 1, 'b': 2.0}, {'a': 2, 'b': 3.0}])
        >>> print x.apply(fields.agg,
        >>>     'a, b => c, d, e',
        >>>     lambda a, b: (
        >>>         a.count(),
        >>>         b.sum(),
        >>>         a.flat_map(lambda x: xrange(x))
        >>>     )
        >>> ).get()
        [{'c': 2, 'd': 5.0, 'e': 0}, {'c': 2, 'd': 5.0, 'e': 0}, {'c': 2, 'd': 5.0, 'e': 1}]
    """
    # Strip all whitespace, then split on the input/output separator.
    # str.split always yields at least one element, so only the upper
    # bound needs checking. A real exception (instead of assert) keeps
    # the validation active under `python -O`.
    spec = ''.join(io_description.split()).split('=>')
    if len(spec) > 2:
        raise ValueError(
            "io_description must contain at most one '=>': %r" % io_description)
    select_fields = spec[0].split(',')
    # Without an explicit output list, reuse the input field names.
    out_fields = (spec[1] if len(spec) == 2 else spec[0]).split(',')

    inputs = list(p.apply(select_cols, select_fields))
    inputs.extend(args)
    ret = fn(*inputs, **kargs)
    if isinstance(ret, ptype.PType):
        ret = (ret,)

    product = transforms.cartesian(*ret)
    # `record` (not `tp`) avoids shadowing the PCollection variable.
    return product.apply(
        transforms.map,
        lambda record: dict(zip(out_fields, record)),
        serde=get_out_fields_serde(product.serde(), out_fields))
def agg(p, io_description, fn, *args, **kargs):
    """
    Select some fields and apply aggregations on them.

    Args:
        p (PCollection): input dataset; every element must be a dict.
        io_description (str): of the form "a,b=>c,d,e", i.e.
            "input fields => output fields". If the "=>" part is omitted,
            the input field names are reused as the output field names.
        fn (callable): signature is
            (*input_pcollections) => (*output_pcollection_or_pobjects).
            Each input PCollection holds all rows of one selected field.
            The return value is a tuple of PObjects/PCollections (a single
            PType may be returned without wrapping it in a tuple).

    Returns:
        PCollection: a PCollection whose elements are dicts — the cartesian
        product of the PCollections returned by ``fn``, with the output
        field names attached.

    Example:
        >>> x = _pipeline.parallelize([{'a' : 1, 'b': 2.0}, {'a': 2, 'b': 3.0}])
        >>> print x.apply(fields.agg,
        >>>     'a, b => c, d, e',
        >>>     lambda a, b: (
        >>>         a.count(),
        >>>         b.sum(),
        >>>         a.flat_map(lambda x: xrange(x))
        >>>     )
        >>> ).get()
        [{'c': 2, 'd': 5.0, 'e': 0}, {'c': 2, 'd': 5.0, 'e': 0}, {'c': 2, 'd': 5.0, 'e': 1}]
    """
    spec = ''.join(io_description.split()).split('=>')
    assert len(spec) >= 1
    assert len(spec) <= 2
    input_fields = spec[0].split(',')
    # Default the output side to the input side when "=>" is absent.
    spec.append(spec[0])
    output_fields = spec[1].split(',')

    columns = list(p.apply(select_cols, input_fields))
    columns.extend(args)
    result = fn(*columns, **kargs)
    if isinstance(result, ptype.PType):
        result = (result,)

    product = transforms.cartesian(*result)
    out_serde = get_out_fields_serde(product.serde(), output_fields)
    return product.apply(
        transforms.map,
        lambda record: dict(zip(output_fields, record)),
        serde=out_serde)
def cartesian(self, other, *others, **options):
    """
    Compute the cartesian product of this PCollection with one or more
    other PCollections.

    Args:
        other (PCollection): another PCollection.
        *others: any additional PCollections.
        **options: forwarded to :func:`bigflow.transforms.cartesian`.

    Returns:
        PCollection: the PCollection representing the product.

    >>> _p1 = _pipeline.parallelize([1, 2, 3])
    >>> _p2 = _pipeline.parallelize([4, 5])
    >>> _p1.cartesian(_p2).get()
    [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)]
    """
    operands = (self, other) + others
    return transforms.cartesian(*operands, **options)
def cartesian(self, *pvalues, **options):
    """
    Compute the cartesian product of this PObject with ``pvalues``.

    Equivalent to
    :func:`bigflow.transforms.cartesian(self, *pvalues, **options)
    <bigflow.transforms.cartesian>`.

    Args:
        *pvalues (PObject/PCollection): the operands to combine with.
        **options: forwarded to :func:`bigflow.transforms.cartesian`.

    Returns:
        PCollection: the product of this PObject with every argument.
        Each record of the result is a tuple whose n-th element comes
        from the n-th input ptype object.

    >>> _p1 = _pipeline.parallelize(1)
    >>> _p2 = _pipeline.parallelize(2)
    >>> _p1.cartesian(_p2).get()
    [(1, 2)]
    >>> _p3 = _pipeline.parallelize([3, 4])
    >>> _p1.cartesian(_p3).get()
    [(1, 3), (1, 4)]
    >>> _p1.cartesian(_p2, _p3).get()
    [(1, 2, 3), (1, 2, 4)]
    """
    # Imported locally to avoid a circular import at module load time.
    from bigflow import transforms
    operands = (self,) + pvalues
    return transforms.cartesian(*operands, **options)
def check_cartesian(self, expect, *pvalues):
    """Assert that both spellings of cartesian — the method form on the
    first operand and the free-function form — produce ``expect``."""
    head, rest = pvalues[0], pvalues[1:]
    self.passertEqual(expect, head.cartesian(*rest))
    self.passertEqual(expect, transforms.cartesian(*pvalues))