def group_by(pcollection, fields, **options):
    """
    Group a SchemaPCollection by the given fields.

    Args:
        pcollection (SchemaPCollection): the input SchemaPCollection (a structured
            PCollection with named fields); it can be used as a PCollection whose
            elements are dicts.
        fields (Iterable): if fields is a str, it is split on "," and the resulting
            field names are used as the grouping key; if fields is a list/tuple,
            its elements are used directly as the grouping fields.

    Returns:
        SchemaPCollection: each key is a dict built from the grouping fields, and
        each value is a PCollection containing all the columns.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>>     .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>>     .apply(schema.select, lambda cols: {
        >>>         'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>         'clicknum': cols['clicknum']
        >>>     }).apply(schema.group_by, ['website'])
        >>>     .apply_values(transforms.first)
        >>>     .apply(schema.flatten)
        >>> print analytics.get()

        Output:

        [{'website': 'a', 'clicknum': 1}, {'website': 'b', 'clicknum': 1},
         {'website': 'c', 'clicknum': 1}, {'website': 'd', 'clicknum': 1}]
    """
    pcollection = _transform_schema_pcollection(pcollection)
    if _is_tuple_serde(pcollection.serde()):
        def _select_fields(tup, fields):
            """ Internal helper: pick the grouping fields out of a tuple record. """
            return tuple(tup[field] for field in fields)

        from bigflow import schema_pcollection
        return pcollection.group_by(
                lambda record: _select_fields(record, fields),
                key_serde=serde.of(tuple(
                    _origin_serde(pcollection.serde())[field] for field in fields)),
                **options) \
            .apply_values(lambda record: schema_pcollection.SchemaPCollection(record))

    def _select_fields(dct, fields):
        """ Internal helper: pick the grouping fields out of a dict record. """
        return dict((field, dct[field]) for field in fields)

    if isinstance(fields, str):
        fields = ''.join(fields.split()).split(',')
    from bigflow import schema_pcollection
    return pcollection.group_by(
            lambda record: _select_fields(record, fields),
            key_serde=_get_serde_of_fields(
                pcollection.serde(), fields,
                pcollection.pipeline().default_objector()),
            **options) \
        .apply_values(lambda record: schema_pcollection.SchemaPCollection(record))
def _ret_dict_handler(ret_dict, record_val):
    """ Internal helper: turn the dict returned by a user function into a SchemaPCollection. """
    inter_type_keys = []
    inter_type_values = []
    inter_type_flag = False
    ptype_keys = []
    ptype_values = []
    ptype_flag = False
    for key, value in ret_dict.items():
        if isinstance(value, ptype.PType):
            # Distributed values keep their own serde and join the cartesian product.
            ptype_keys.append((key, value.serde()))
            ptype_values.append(value)
            ptype_flag = True
        else:
            # Plain Python values are broadcast onto every output record.
            inter_type_keys.append((key, type(value)))
            inter_type_values.append(value)
            inter_type_flag = True
    if ptype_flag and inter_type_flag:
        ptype_keys.extend(inter_type_keys)
        return tuple_to_dict(
            transforms.cartesian(*ptype_values)
                .apply(transforms.map,
                       lambda record: record + tuple(inter_type_values)),
            ptype_keys)
    elif not ptype_flag and inter_type_flag:
        from bigflow import schema_pcollection
        return schema_pcollection.SchemaPCollection(
            record_val.apply(
                transforms.map,
                lambda record: dict(
                    zip(tuple(key_sd[0] for key_sd in inter_type_keys),
                        inter_type_values)),
                serde=of(dict(inter_type_keys))))
    else:
        return tuple_to_dict(transforms.cartesian(*ptype_values), ptype_keys)
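# Illustrative sketch of _ret_dict_handler's three branches (the field names
# here are hypothetical, not part of the API). Given a user function returning:
#
#     {'max_click': cols['clicknum'].max(),  # a PObject: joins the cartesian product
#      'source': 'daily_log'}                # a plain value: broadcast to every record
#
# the PType values are combined via transforms.cartesian and the plain values
# are appended to each resulting record, yielding elements like
# {'max_click': 3, 'source': 'daily_log'}. If the dict holds only plain values,
# they are mapped over record_val instead; if it holds only PTypes, the result
# is simply their cartesian product with field names attached.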
def tuple_to_dict(pcollection, fields):
    """
    Convert a PCollection whose elements are tuples into a SchemaPCollection.

    Args:
        pcollection (PCollection): the input PCollection; each element is a tuple.
        fields (list): each element of fields is either a (key, value) tuple, where
            key is the field name and value is the serde used for that field, or a
            plain str field name, in which case the field is assumed to hold any
            type serializable by marshal.

    Returns:
        SchemaPCollection: a structured PCollection with named fields (FieldsDict);
        it supports every operation an ordinary PCollection does.
    """
    fields = _str_to_list(fields)
    from bigflow import schema_pcollection
    order_fields = []
    serde_fields = {}
    for field in fields:
        if isinstance(field, tuple):
            order_fields.append(field[0])
            serde_fields[field[0]] = field[1]
        else:
            order_fields.append(field)
            serde_fields[field] = serde._
    return schema_pcollection.SchemaPCollection(
        pcollection.map(lambda record: dict(zip(order_fields, record)),
                        serde=of(serde_fields)))
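# A minimal usage sketch for tuple_to_dict, mirroring the docstring examples
# elsewhere in this module (the 'local' pipeline and field names are only
# illustrative, and dict key order in the printed output may differ):
#
#   >>> from bigflow import base, schema
#   >>> p = base.Pipeline.create('local')
#   >>> sp = p.parallelize([('a', 2), ('b', 4)])
#   >>>     .apply(schema.tuple_to_dict, ['website', 'clicknum'])
#   >>> print sp.get()
#   [{'website': 'a', 'clicknum': 2}, {'website': 'b', 'clicknum': 4}]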
def agg(pcollection, agg_fn, *args):
    """
    Aggregate the entire PCollection.

    Args:
        pcollection (SchemaPCollection): the input SchemaPCollection (a structured
            PCollection with named fields); it can be used as a PCollection whose
            elements are dicts.
        agg_fn (callable): a function describing how each field is aggregated. It
            is passed a dict whose keys are all the field names and whose values
            are PCollections holding all the data of that field. It must return a
            dict whose keys are the output field names and whose values are
            distributed datasets (PCollection or PObject) holding the data for
            those fields. The datasets of the output fields are combined by
            cartesian product into the returned dataset.
        *args (object): extra arguments passed to the transform.

    Returns:
        SchemaPCollection: a pcollection whose elements are dicts, obtained by
        taking the cartesian product of the datasets returned by agg_fn and
        attaching the field names.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>>     .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>>     .apply(schema.select, lambda cols: {
        >>>         'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>         'clicknum': cols['clicknum']
        >>>     }).apply(schema.group_by, ['website'])
        >>>     .apply_values(schema.agg, lambda cols: {
        >>>         'max_click_num': cols['clicknum'].max(),
        >>>         'sum_click_num': cols['clicknum'].sum(),
        >>>         'avg_click_num': cols['clicknum'].sum() / cols['clicknum'].count()
        >>>     }).apply(schema.flatten)
        >>> print analytics.get()

        Output:

        [{'sum_click_num': 7, 'website': 'a', 'avg_click_num': 1, 'max_click_num': 3},
         {'sum_click_num': 6, 'website': 'c', 'avg_click_num': 2, 'max_click_num': 3},
         {'sum_click_num': 5, 'website': 'b', 'avg_click_num': 1, 'max_click_num': 2},
         {'sum_click_num': 1, 'website': 'd', 'avg_click_num': 1, 'max_click_num': 1}]
    """
    pcollection = _transform_schema_pcollection(pcollection)
    if _is_tuple_serde(pcollection.serde()):
        def _pack_udf():
            """ Internal helper. """
            tp = list(pcollection.apply(_select_cols_tp))
            tp.extend(args)
            ret_tuple = agg_fn(*tp)
            return _ret_tuple_handler(ret_tuple, tp[0])

        from bigflow import schema_pcollection
        return schema_pcollection.SchemaPCollection(_pack_udf())

    def _pack_udf(val, *args):
        """ Internal helper. """
        record_val = val[-1]
        ret_dict = agg_fn(dict(zip(agg_fields, val[:-1])), *args)
        return _ret_dict_handler(ret_dict, record_val)

    agg_fields = pcollection._get_fields()
    return _pack_udf(pcollection.apply(_select_cols, agg_fields), *args)
def full_join(*pcollections, **options):
    """
    Full outer join of multiple SchemaPCollections on the given fields. Each
    joined result is (fields, (value1, value2, ..., value n)); if the m-th
    SchemaPCollection has no matching element, value m is None.

    Args:
        *pcollections (SchemaPCollection): the input SchemaPCollections.
        **options: configuration options. fields (str/tuple/list) is required: a
            list, a tuple, or a comma-separated string naming the join fields.
            merge (bool, default False): if True, merge each joined pair of dicts
            into a single dict.

    Returns:
        SchemaPCollection: the join result.

    Examples:
        >>> from bigflow import base, schema, transforms
        >>> p = base.Pipeline.create('local')
        >>> p1 = p.parallelize([('a', 2), ('e', 4), ('c', 6)])
        >>> sp1 = p1.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>> p2 = p.parallelize([('a', 9), ('b', 8), ('d', 7)])
        >>> sp2 = p2.apply(schema.tuple_to_dict, ['websites', 'click'])
        >>> csp = sp1.apply(schema.full_join, sp2, fields=['websites'])
        >>> csp.get()

        Output:

        [({'clicknum': 4, 'websites': 'e'}, {'click': None, 'websites': None}),
         ({'clicknum': None, 'websites': None}, {'click': 8, 'websites': 'b'}),
         ({'clicknum': None, 'websites': None}, {'click': 7, 'websites': 'd'}),
         ({'clicknum': 2, 'websites': 'a'}, {'click': 9, 'websites': 'a'}),
         ({'clicknum': 6, 'websites': 'c'}, {'click': None, 'websites': None})]
    """
    if len(pcollections) < 2:
        raise ValueError("require at least 2 pcollections")
    fields = options.get('fields', None)
    from bigflow import schema_pcollection
    pc = transforms.full_join(*_check_set_args(fields, pcollections))
    none_dict = _get_none_dict(pcollections)
    ret = schema_pcollection.SchemaPCollection(
        pc.apply(transforms.map,
                 lambda tp: _none_to_dict(tp, none_dict),
                 serde=_value_serde(pc.serde())))
    # By default merge is False: the per-side dicts are returned as a tuple.
    merge = options.get("merge", False)
    if not merge:
        return ret
    else:
        return _merge_result_after_join(ret, "full_join")
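# Hedged sketch of the merge option handled above: with merge=True the joined
# pair of per-side dicts is collapsed by _merge_result_after_join into a single
# dict per record. The exact merged layout shown is an assumption inferred from
# that helper's name, not verified output:
#
#   >>> csp = sp1.apply(schema.full_join, sp2, fields=['websites'], merge=True)
#   >>> csp.get()  # e.g. [{'websites': 'a', 'clicknum': 2, 'click': 9}, ...]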
def _trans_to_sp(*records):
    """ Internal helper: wrap each input dataset as a SchemaPCollection. """
    from bigflow import schema_pcollection
    return tuple(
        schema_pcollection.SchemaPCollection(record) for record in records)
def flatten(ptype):
    """
    Concatenate every value of a PTable with its key and flatten the result into
    a SchemaPCollection.

    Args:
        ptype (PTable): the input dataset; must be a PTable whose keys and values
            are both dicts.

    Returns:
        SchemaPCollection: a SchemaPCollection carrying all the fields of both
        keys and values; if a field appears in both, the value's field wins.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>>     .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>>     .apply(schema.select, lambda cols: {
        >>>         'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>         'clicknum': cols['clicknum']
        >>>     }).apply(schema.group_by, ['website'])
        >>>     .apply(schema.flatten)
        >>> print analytics.get()

        Output:

        [{'website': 'a', 'clicknum': 1}, {'website': 'b', 'clicknum': 1},
         {'website': 'c', 'clicknum': 1}, {'website': 'b', 'clicknum': 2},
         {'website': 'c', 'clicknum': 2}, {'website': 'a', 'clicknum': 3},
         {'website': 'c', 'clicknum': 3}, {'website': 'a', 'clicknum': 2},
         {'website': 'b', 'clicknum': 2}, {'website': 'a', 'clicknum': 1},
         {'website': 'd', 'clicknum': 1}]
    """
    def _new_dict(*dicts):
        """ Internal helper: merge dicts left to right. """
        ret = {}
        for dct in dicts:
            ret.update(dct)
        return ret

    def _merge_kv(tp, level):
        """ Internal helper: unnest a (key, (key, ..., value)) tuple into a flat list. """
        kvs = []
        for i in xrange(level):
            kvs.append(tp[0])
            tp = tp[1]
        kvs.append(tp)
        return kvs

    def _merge_serde(serdes, dft=None):
        """ Internal helper: merge FieldsDictSerdes; fall back to dft otherwise. """
        ret = {}
        for sd in serdes:
            if not isinstance(serde.origin(sd), FieldsDictSerde):
                return dft
            for field, field_serde in serde.origin(sd)._fields_to_types.iteritems():
                ret[field] = field_serde
        return of(ret)

    if not isinstance(ptype, ptable.PTable):
        raise ValueError("flatten should only be applied on PTable")
    level = ptype.nested_level() + 1
    from bigflow import schema_pcollection
    return schema_pcollection.SchemaPCollection(ptype.flatten().map(
        lambda kv: _new_dict(*_merge_kv(kv, level)),
        serde=_merge_serde(ptype.key_serdes() + [ptype.serde()],
                           ptype.pipeline().default_objector())))
def select(pcollection, select_fn, *args):
    """
    Apply a transform to selected fields of every record.

    Args:
        pcollection (SchemaPCollection): the input SchemaPCollection (a structured
            PCollection with named fields); it can be used as a PCollection whose
            elements are dicts.
        select_fn (callable): a function describing how each record is
            transformed. It is passed a dict whose keys are all the field names
            and whose values are PObjects holding that field's data for the
            current record. It must return a dict whose keys are the output field
            names and whose values are distributed datasets (PCollection or
            PObject) holding the data for those fields. The datasets of the
            output fields are combined by cartesian product into the final
            dataset. select_fn may also be a list/tuple/str of field names (a
            plain projection), or a dict mapping output field names to transforms
            as in the second example below.
        *args (object): extra arguments passed to the transform.

    Returns:
        SchemaPCollection: a pcollection whose elements are dicts. It is the
        result of applying select_fn to every input record, taking the cartesian
        product of the datasets in each returned dict, and concatenating the
        per-record results into one dataset.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>>     .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>>     .apply(schema.select, lambda cols: {
        >>>         'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>         'clicknum': cols['clicknum']
        >>>     })
        >>> print analytics.get()

        Output:

        [{'website': 'a', 'clicknum': 1}, {'website': 'b', 'clicknum': 1},
         {'website': 'c', 'clicknum': 1}, {'website': 'b', 'clicknum': 2},
         {'website': 'c', 'clicknum': 2}, {'website': 'a', 'clicknum': 3},
         {'website': 'c', 'clicknum': 3}, {'website': 'a', 'clicknum': 2},
         {'website': 'b', 'clicknum': 2}, {'website': 'a', 'clicknum': 1},
         {'website': 'd', 'clicknum': 1}]

        >>> from bigflow import schema
        >>> from bigflow import transforms
        >>> from bigflow import base
        >>> pl = base.Pipeline.create("local")
        >>> raw_data = [["xiaoming", "school_1", 12, 150, 90], ]
        >>> data = pl.parallelize(raw_data) \
        >>>     .apply(schema.tuple_to_dict,
        >>>            [("name", str),
        >>>             ("school", str),
        >>>             ("age", int),
        >>>             ("height", int),
        >>>             ("weight", int)])
        >>> fields = {
        >>>     # reuse a transform shipped with bigflow: pass a tuple of
        >>>     # (transform, user function)
        >>>     "name": (transforms.map, lambda name: "My name is " + name),
        >>>     # or pass a transform function directly
        >>>     "school": lambda school: "My school is " + school,
        >>> }
        >>> output = schema.select(data, fields)
        >>> print output.get()

        Output:

        [{'school': 'My school is school_1', 'name': 'My name is xiaoming'}]
    """
    if isinstance(select_fn, (list, tuple, str)):
        cols = _str_to_list(select_fn)
        return pcollection.map(
            lambda record: {col: record.get(col) for col in cols},
            serde=of({col: serde._ for col in cols}))

    pcollection = _transform_schema_pcollection(pcollection)
    if isinstance(select_fn, dict):
        def wrapper_of_udf(select_fields):
            fields_fn = select_fields

            def _apply_udf_in_cols(cols, *args):
                """ Only return the columns the user selected. """
                result = {}
                for field, tf in fields_fn.items():
                    if callable(tf):
                        result[field] = cols[field].apply(tf)
                    elif isinstance(tf, tuple):
                        result[field] = cols[field].apply(*tf)
                    else:
                        result[field] = tf
                return result

            return _apply_udf_in_cols

        return select(pcollection, wrapper_of_udf(select_fn))

    if _is_tuple_serde(pcollection.serde()):
        def _pack_udf(*val):
            """ Internal helper. """
            ret_tuple = select_fn(*val)
            return _ret_tuple_handler(ret_tuple, val[0])

        from bigflow import schema_pcollection
        return schema_pcollection.SchemaPCollection(
            pcollection.apply(group_by_every_record.group_by_every_record)
                .apply_values(transforms.first)
                .apply_values(_select_cols_tp)
                .apply_values(_pack_udf, *args)
                .flatten_values())

    def _pack_udf(*val):
        """ Internal helper. """
        sep_position = len(select_fields) + 1
        record_val = val[sep_position - 1]
        ret_dict = {}
        if len(val) > sep_position:
            shard_pts = val[sep_position:]
            ret_dict = select_fn(
                dict(zip(select_fields, val[0:sep_position - 1])), *shard_pts)
        else:
            ret_dict = select_fn(
                dict(zip(select_fields, val[0:sep_position - 1])))
        return _ret_dict_handler(ret_dict, record_val)

    select_fields = pcollection._get_fields()
    from bigflow import schema_pcollection
    return schema_pcollection.SchemaPCollection(
        pcollection.apply(group_by_every_record.group_by_every_record)
            .apply_values(transforms.first)
            .apply_values(_select_cols, select_fields)
            .apply_values(_pack_udf, *args)
            .flatten_values())
def _transform_schema_pcollection(pcollection):
    """ Internal helper: wrap the input as a SchemaPCollection. """
    from bigflow import schema_pcollection
    return schema_pcollection.SchemaPCollection(pcollection)
def transform_from_node(self, load_node, pipeline):
    """ Internal interface. """
    from bigflow import schema
    if self.fields is None:
        raise ValueError(
            "columns is required: (1) columns is a list of strings, and the "
            "SchemaPCollection's elements are dicts; (2) columns is an int, and "
            "the SchemaPCollection's elements are tuples; (3) columns is a list "
            "of base types drawn from [int, float, str]. "
            "e.g. columns=3, columns=[('xx', int), ('yy', str)] or columns=['xx', 'yy']")
    if isinstance(self.fields, tuple):
        self.fields = list(self.fields)
    fields_type = []
    ignore_overflow = self.ignore_overflow
    ignore_illegal_line = self.ignore_illegal_line
    if isinstance(self.fields, list):
        def get_fields_type(fields):
            """ Internal helper. """
            fields_type = []
            for field in fields:
                if isinstance(field, tuple):
                    if field[1] in [int, str, float]:
                        fields_type.append(field[1])
                    else:
                        raise ValueError(
                            "columns must be a list of field names or data "
                            "types; supported data types are int/str/float")
                elif field in [int, str, float]:
                    fields_type.append(field)
                elif isinstance(field, str):
                    fields_type.append(str)
                else:
                    raise ValueError(
                        "columns must be a list of field names or data "
                        "types; supported data types are int/str/float")
            return fields_type

        fields_type = get_fields_type(self.fields)
        ret = super(SchemaTextFile, self) \
            .transform_from_node(load_node, pipeline) \
            .flat_map(entity.SplitStringToTypes(self.sep, fields_type,
                                                ignore_overflow,
                                                ignore_illegal_line),
                      serde=serde.of(tuple(fields_type)))
        if self.fields[0] in [int, float, str]:
            return ret
        else:
            ret = ret.apply(schema.tuple_to_dict, self.fields)
        return ret
    elif isinstance(self.fields, int):
        from bigflow import schema_pcollection
        return schema_pcollection.SchemaPCollection(
            super(SchemaTextFile, self)
                .transform_from_node(load_node, pipeline)
                .flat_map(entity.SplitStringToTypes(
                              self.sep,
                              [str for _ in xrange(self.fields)],
                              True, ignore_illegal_line),
                          serde=serde.of(tuple(serde.StrSerde()
                                               for _ in xrange(self.fields)))))
    else:
        raise ValueError("columns must be a list (field names) or an int (column count)")
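# Hedged usage sketch for this loader. It assumes SchemaTextFile is exposed as
# bigflow.input.SchemaTextFile and accepts the 'columns' and 'separator'
# options consumed above (self.fields / self.sep); the path is a placeholder.
#
#   >>> from bigflow import base, input
#   >>> p = base.Pipeline.create('local')
#   >>> # columns as named, typed fields: elements are dicts (a SchemaPCollection)
#   >>> data = p.read(input.SchemaTextFile('/path/to/input',
#   >>>                                    columns=[('name', str), ('age', int)],
#   >>>                                    separator=','))
#   >>> # columns as an int N: elements are N-tuples of str
#   >>> raw = p.read(input.SchemaTextFile('/path/to/input', columns=3))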