def transform_from_node(self, load_node, pipeline): """ 内部接口 """ from bigflow import ptable if self.repeatedly: transformed = load_node.repeatedly() \ .process_by(_TextFromRecord()) \ .as_type(serde.StrSerde()) \ .set_effective_key_num(0) \ .input(0).allow_partial_processing() \ .done() else: transformed = load_node \ .process_by(_TextFromRecord()) \ .as_type(serde.StrSerde()) \ .set_effective_key_num(0) \ .input(0).allow_partial_processing() \ .done() transformed.set_size(load_node.size()) if self._options.get('partitioned', False): transformed_pcollection = pcollection.PCollection( transformed, pipeline) return ptable.PTable(transformed_pcollection, key_serde=serde.StrSerde()) return pcollection.PCollection(transformed.leave_scope(), pipeline)
def take(pvalue, n, **options):
    """ inner function """
    if utils.is_infinite(pvalue):
        raise ValueError("take is not supported on an infinite PType")
    objector = options.get('serde', pvalue.serde())
    scale = options.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = options.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = pvalue.node().size() * math.sqrt(size / pvalue.node().size())
    if isinstance(n, pobject.PObject):
        # treat the pobject param as side input
        partial_helper = side_input_util.SideInputsUtil(pvalue, (n, ))
        partial_node = partial_helper.process_with_side_inputs() \
            .by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("TakePartial: " + str(n.node())) \
            .set_effective_key_num(0) \
            .input(-1).allow_partial_processing() \
            .done() \
            .set_size(partial_size, partial_scale)
        partial = pcollection.PCollection(partial_node, pvalue.pipeline())
        result_helper = side_input_util.SideInputsUtil(partial, (n, ))
        result_node = result_helper.process_with_side_inputs() \
            .by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("Take: " + str(n.node())) \
            .set_effective_key_num(0) \
            .set_size(size, partial_scale)
    elif isinstance(n, (int, long)):
        result_node = pvalue.node() \
            .process_by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("TakePartial: %d" % n) \
            .input(0).allow_partial_processing().done() \
            .set_size(partial_size, partial_scale) \
            .process_by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("Take: %d" % n) \
            .set_effective_key_num(0) \
            .set_size(size, partial_scale)
    else:
        raise ValueError("Wrong argument, only integers or PObject are accepted")
    return pcollection.PCollection(result_node, pvalue.pipeline())
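# Usage sketch for take(). This is illustrative, not taken from the source:
# the local-mode pipeline setup and the literal input below are assumptions.
#
#     from bigflow import base, transforms
#
#     pipeline = base.Pipeline.create('local')
#     numbers = pipeline.parallelize([5, 1, 4, 2, 3])
#     some_two = transforms.take(numbers, 2)   # any 2 elements, order not guaranteed
#     print some_two.get()                     # e.g. [5, 1]
#
# Passing a PObject instead of a plain int defers the count to runtime, at the
# cost of an extra side-input pass (the PObject branch above).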
def filter(pvalue, fn, *side_inputs, **kargs):
    """
    Filter transform implementation

    :param pvalue: PType
    :param fn: UDF
    :param side_inputs: SideInputs
    :return: PType after filter
    """
    serde = kargs.get('serde', pvalue.serde())
    scale = kargs.get('scale', 0.5)
    size = kargs.get('output_size', None)
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)
    result_node = helper.process_with_side_inputs() \
        .by(entity.FilterProcessor(fn, *side_inputs)) \
        .as_type(serde) \
        .set_debug_info("Filter: " + repr(fn)) \
        .ignore_group() \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(size, scale) \
        .set_memory(memory) \
        .set_cpu(cpu)
    return pcollection.PCollection(result_node, pvalue.pipeline())
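# Usage sketch for filter(); hedged, reusing the hypothetical `numbers`
# PCollection from the take() example above:
#
#     evens = transforms.filter(numbers, lambda x: x % 2 == 0)
#     print evens.get()                        # [4, 2]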
def flatten(pvalue, **kargs):
    """
    Transform flatten implementation

    :param pvalue: PTable
    :return: flattened PCollection
    """
    def _flatten_once(node, key_serde, value_serde):
        return node.process_by(entity.FlattenProcessor(key_serde)) \
            .as_type(value_serde) \
            .set_debug_info("FlattenProcessor") \
            .input(0).allow_partial_processing() \
            .done() \
            .set_size(scale_factor=1.25) \
            .leave_scope()

    if isinstance(pvalue, ptable.PTable):
        key_serdes = pvalue.key_serdes()
        value_serde = pvalue.serde()
        assert len(key_serdes) == pvalue.nested_level() + 1
        it = reversed(key_serdes)
        node = pvalue.node()
        for i in range(0, pvalue.nested_level() + 1):
            key_serde = it.next()
            value_serde = serde.tuple_of(key_serde, value_serde)
            node = _flatten_once(node, key_serde, value_serde)
        pvalue = pcollection.PCollection(node, pvalue.pipeline())
    return pvalue
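# Usage sketch for flatten(); hedged. That `group_by_key` produces a PTable
# here is assumed from the surrounding API:
#
#     grouped = pipeline.parallelize([('a', 1), ('a', 2), ('b', 3)]).group_by_key()
#     pairs = grouped.apply(flatten)           # PTable -> PCollection of (key, value)
#     print pairs.get()                        # e.g. [('a', 1), ('a', 2), ('b', 3)]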
def flat_map(pvalue, fn, *side_inputs, **kargs):
    """ Implementation of transforms.flat_map() """
    objector = kargs.get('serde', pvalue.pipeline().default_objector())
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)
    scale = kargs.get('scale', 1.5)
    size = kargs.get('output_size', None)
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)
    assert isinstance(pvalue, (pcollection.PCollection, pobject.PObject))
    result_node = helper.process_with_side_inputs() \
        .ignore_group() \
        .by(entity.FlatMapProcessor(fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("FlatMap: " + repr(fn)) \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing() \
        .done() \
        .set_memory(memory) \
        .set_cpu(cpu) \
        .set_size(size, scale)
    return pcollection.PCollection(result_node, pvalue.pipeline())
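# Usage sketch for flat_map(); hedged. Each input element may emit zero or
# more output elements:
#
#     lines = pipeline.parallelize(['a b', 'c'])
#     words = transforms.flat_map(lines, lambda line: line.split())
#     print words.get()                        # ['a', 'b', 'c']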
def union(*pvalues, **options):
    """ inner function """
    if len(pvalues) == 0:
        raise ValueError("No argument")
    if not all(isinstance(p, (pcollection.PCollection, pobject.PObject))
               for p in pvalues):
        raise ValueError("union can only be applied to PCollections or PObjects")
    serdes = [p.serde() for p in pvalues]
    com_serde = options.get("serde", serde.common_serde(*serdes))
    if com_serde:
        def _inner_map(p):
            """ Convert p to the common serde if it differs. """
            if p.serde().__class__ != com_serde.__class__:
                p = p.map(lambda x: x, serde=com_serde)
            return p
        pvalues = map(_inner_map, pvalues)
    common_scope = pvalues[0].node().scope()
    all_nodes = map(lambda p: p.node(), pvalues)
    if not all(node.scope() is common_scope for node in all_nodes):
        raise ValueError("PCollections to union must be in the same scope")
    plan = pvalues[0].node().plan()
    return pcollection.PCollection(plan.union(nodes=all_nodes),
                                   pvalues[0].pipeline())
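# Usage sketch for union(); hedged. Inputs with compatible serdes are first
# mapped to a common serde (the _inner_map step above):
#
#     odds = pipeline.parallelize([1, 3])
#     evens = pipeline.parallelize([2, 4])
#     both = transforms.union(odds, evens)
#     print both.get()                         # e.g. [1, 3, 2, 4]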
def _transform_with_fns(pvalue, initializer, transformer, finalizer,
                        *side_inputs, **kargs):
    """ pcollection transform """
    objector = kargs.get('serde', pvalue.pipeline().default_objector())
    status_objector = kargs.get('status_serde',
                                pvalue.pipeline().default_objector())
    debug_info = "transform" + repr((initializer, transformer, finalizer))
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)
    pnode = helper.process_with_side_inputs()
    initializer = entity.Functor.of(initializer)
    transformer = entity.Functor.of(transformer)
    finalizer = entity.Functor.of(finalizer)
    result_node = pnode \
        .by(entity.TransformProcessor(status_objector,
                                      initializer,
                                      transformer,
                                      finalizer).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .is_stateful() \
        .set_effective_key_num(0) \
        .set_debug_info(debug_info) \
        .set_size(kargs.get('output_size', None), kargs.get('scale', 0.1)) \
        .set_memory(kargs.get('memory_limit', -1)) \
        .set_cpu(kargs.get('cpu_limit', -1))
    return pcollection.PCollection(result_node, pvalue.pipeline())
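# Sketch of the initializer/transformer/finalizer contract this helper wires
# up, as far as it can be inferred here (an assumption, not the documented
# API): the initializer produces the starting status, the transformer folds
# each record into the status, and the finalizer emits the final result.
#
#     def _init(emitter):                  # -> initial status
#         return 0
#
#     def _transform(status, emitter, record):
#         return status + record           # fold one record into the status
#
#     def _finalize(status, emitter):
#         emitter.emit(status)             # emit the aggregate once
#
#     total = _transform_with_fns(numbers, _init, _transform, _finalize)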
def transform_from_node(self, load_node, pipeline): """ inner func """ from bigflow import ptable transformed_pcollection = pcollection.PCollection( load_node, pipeline) before_post_process = \ ptable.PTable(transformed_pcollection, key_serde=serde.CPickleSerde()) return self._user_input_base.post_process(before_post_process)
def transform_to_node(self, ptype):
    """ Inner interface. """
    from bigflow.core import entity
    from bigflow import pcollection
    node = ptype.node()
    plan = node.plan()
    objector = self.options.get('serde', ptype.pipeline().default_objector())
    shuffle_scope = plan.shuffle(plan.global_scope(), [node])
    node = shuffle_scope.node(0)
    if self.partition_fn is None:
        node = node.distribute_by_default()
    else:
        node = node.distribute_by(entity.Partitioner(self.partition_fn))
    pvalue = pcollection.PCollection(node, ptype.pipeline())
    for k, action in self.transform_actions.items():
        pvalue = action(pvalue)
    node = pvalue.node()
    if self.partition_number is not None:
        shuffle_scope.with_concurrency(self.partition_number)
    if self.key_reader_obj is not None:
        node = node.sort_by(self.key_reader_obj)
    is_serialize = True
    serialize = entity.SerdeWrapper(objector, is_serialize)
    if self.kv_serializer is not None:
        serialized = pcollection.PCollection(node, ptype.pipeline()) \
            .map(self.kv_serializer).node()
    else:
        serialized = pcollection.PCollection(node, ptype.pipeline()) \
            .map(serialize).node()
    node = serialized.process_by(_ToRecord(self.kv_serializer)) \
        .as_type(record_objector.RecordObjector()) \
        .set_effective_key_num(0) \
        .input(0) \
        .done() \
        .ignore_group()
    return node
def as_pcollection(self):
    """
    Convert this PObject to a PCollection

    Returns:
        PCollection: the transformed result
    """
    from bigflow import pcollection
    return pcollection.PCollection(self.node(), self.pipeline())
def transform_from_node(self, load_node, pipeline): """ 内部接口 """ transformed = load_node.repeatedly() \ .process_by(_TextFromRecord()) \ .as_type(serde.StrSerde()) \ .set_effective_key_num(0) \ .input(0).allow_partial_processing() \ .done() transformed.set_size(load_node.size()) return pcollection.PCollection(transformed.leave_scope(), pipeline)
def select_elements(pvalue, n, key=None, isMaxed=True, **options):
    """implementation of select elements"""
    if not isinstance(pvalue, pcollection.PCollection):
        raise ValueError("Invalid arguments: pvalue must be of type PCollection")
    if isinstance(n, pobject.PObject):
        result_node = _select_elements_pobject(pvalue, n, key, isMaxed, **options)
    elif isinstance(n, (int, long)):
        result_node = _select_elements_int(pvalue, n, key, isMaxed, **options)
    else:
        # without this branch, result_node would be unbound for other types of n
        raise ValueError("Wrong argument: n must be an integer or a PObject")
    return pcollection.PCollection(result_node, pvalue.pipeline())
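# Usage sketch for select_elements(); hedged. `isMaxed=True` selects the n
# largest elements, `isMaxed=False` the n smallest:
#
#     top3 = select_elements(numbers, 3, key=None, isMaxed=True)
#     print top3.get()                         # [5, 4, 3] for the sample input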
def broadcast_to(pvalue, scope):
    """
    Broadcast given PType instance to given scope

    Args:
        pvalue (PType): PType instance
        scope (LogicalPlan.Scope): scope

    Returns:
        PType: new PType after broadcast
    """
    if not isinstance(pvalue, ptype.PType):
        return pvalue

    if isinstance(pvalue, ptable.PTable):
        flattened = pvalue.flatten()
        node = flattened.node()
        plan = node.plan()
        broadcasted = pcollection.PCollection(plan.broadcast_to(node, scope),
                                              pvalue.pipeline())
        broadcasted = utils.construct(pvalue.pipeline(),
                                      broadcasted.node(),
                                      ptable.PTable,
                                      pvalue.nested_level(),
                                      pvalue.inner_most_type())
        return broadcasted

    node = pvalue.node()
    plan = node.plan()
    broadcasted_node = plan.broadcast_to(node, scope)
    if isinstance(pvalue, pcollection.PCollection):
        return pcollection.PCollection(broadcasted_node, pvalue.pipeline())
    else:
        return pobject.PObject(broadcasted_node, pvalue.pipeline())
def transform_from_node(self, load_node, pipeline): """ 内部接口 """ from bigflow import ptable if self.repeatedly: transformed = load_node.repeatedly() \ .process_by(_KVFromBinaryRecord()) \ .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \ .set_effective_key_num(0) \ .input(0).allow_partial_processing() \ .done() else: transformed = load_node \ .process_by(_KVFromBinaryRecord()) \ .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \ .set_effective_key_num(0) \ .ignore_group() \ .input(0).allow_partial_processing() \ .done() transformed.set_size(load_node.size()) transformed = pcollection.PCollection(transformed, pipeline) tserde = self._options.get('serde', pipeline.default_objector()) if self.kv_deserializer is not None: transformed = transformed.map(self.kv_deserializer, serde=tserde) else: is_serialize = False deserialize = entity.SerdeWrapper(tserde, is_serialize, 1) transformed = transformed.map(deserialize, serde=tserde) if self._options.get('partitioned'): return ptable.PTable(transformed, key_serde=serde.StrSerde()) return pcollection.PCollection(transformed.node().leave_scope(), pipeline)
def flatten_values(pvalue):
    """
    Transform flatten_values implementation

    :param pvalue: pvalue
    :return: flattened PCollection
    """
    if isinstance(pvalue, ptable.PTable):
        node = pvalue.node().leave_scope()
        for i in range(0, pvalue.nested_level()):
            node = node.leave_scope()
        return pcollection.PCollection(node, pvalue.pipeline())
    return pvalue
def group_by_every_record(pvalue, **options):
    """ group by every record """
    pipeline = pvalue.pipeline()
    node = pvalue.node()
    plan = node.plan()
    scope = node.scope()
    shuffle = plan.shuffle(scope, [node])
    shuffle_node = shuffle.node(0).distribute_every()
    from bigflow import serde
    key_serde = serde.StrSerde()
    return ptable.PTable(pcollection.PCollection(shuffle_node, pipeline),
                         key_serde=key_serde)
def window_into(pvalue, win, **options):
    """ group by window """
    pipeline = pvalue.pipeline()
    key_serde = options.get('key_serde', win.key_serde())
    if not key_serde:
        key_serde = pvalue.pipeline().default_objector()
    node = node_window_by(pvalue.node(), win,
                          options.get('concurrency', None), pipeline)
    return ptable.PTable(pcollection.PCollection(node, pipeline),
                         key_serde=key_serde)
def group_by(pvalue, key_extractor, value_extractor, **options):
    """ Only tuple-pair elements of pvalue are accepted. """
    key_serde = options.get('key_serde', pvalue.pipeline().default_objector())
    if value_extractor is None:
        value_serde = options.get('value_serde', pvalue.serde())
    else:
        value_serde = options.get('value_serde',
                                  pvalue.pipeline().default_objector())
    pipeline = pvalue.pipeline()
    node = node_group_by(pvalue.node(), key_extractor, value_extractor,
                         key_serde, value_serde,
                         options.get('concurrency', None), pipeline)
    return ptable.PTable(pcollection.PCollection(node, pipeline),
                         key_serde=key_serde)
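# Usage sketch for group_by(); hedged, with hypothetical extractors:
#
#     pairs = pipeline.parallelize([('a', 1), ('b', 2), ('a', 3)])
#     table = group_by(pairs,
#                      key_extractor=lambda kv: kv[0],
#                      value_extractor=lambda kv: kv[1])
#     # table is a PTable: 'a' -> [1, 3], 'b' -> [2]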
def construct(pipeline, node, type,
              nested_level=None, inner_most_type=None, key_serdes=None):
    """
    Construct a PType from a LogicalPlan node

    Args:
        pipeline (Pipeline): the Pipeline the constructed PType belongs to
        node (LogicalPlan.Node): node
        type (class): class of PType to construct

    Kwargs:
        nested_level: specify the PTable's nested level if the PType is a PTable
        inner_most_type: specify the PTable's inner-most type if the PType is a PTable
        key_serdes: serdes for the PTable's keys, one per nesting level

    Returns:
        PType: the constructed PType
    """
    if inner_most_type is ptable.PTable:
        raise ValueError("Invalid value type for PTable")
    if type is pobject.PObject:
        pvalue = pobject.PObject(node, pipeline)
    elif type is pcollection.PCollection:
        pvalue = pcollection.PCollection(node, pipeline)
    else:
        if key_serdes is None:
            key_serdes = [pipeline.default_objector()] * (nested_level + 1)
        if nested_level > 0:
            pvalue = ptable.PTable(construct(pipeline, node, type,
                                             nested_level - 1,
                                             inner_most_type,
                                             key_serdes[1:]),
                                   key_serde=key_serdes[0])
        else:
            # pass the remaining key serde here as well; it was computed
            # above but previously went unused in this branch
            pvalue = ptable.PTable(inner_most_type(node, pipeline),
                                   key_serde=key_serdes[0])
    return pvalue
def _select_elements_pobject(pvalue, n, key=None, isMaxed=True, **options):
    """pobject as side input for select elements"""
    scale = options.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = options.get('output_size', None)
    select_type = "Max" if isMaxed else "Min"
    tserde = options.get('serde', pvalue.serde())
    if size is None:
        partial_size = None
    else:
        partial_size = pvalue.node().size() * \
            math.sqrt(size / pvalue.node().size())
    partial_helper = side_input_util.SideInputsUtil(pvalue, (n, ))
    partial_node = partial_helper.process_with_side_inputs() \
        .by(entity.SelectElementsProcessor(n, isMaxed, key)) \
        .as_type(tserde) \
        .set_debug_info("%sElementsPartial: PObject" % select_type) \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing() \
        .done() \
        .set_size(partial_size, partial_scale)
    partial = pcollection.PCollection(partial_node, pvalue.pipeline())
    partial = partial.sort_by(key, isMaxed)
    result_helper = side_input_util.SideInputsUtil(partial, (n, ))
    result_node = result_helper.process_with_side_inputs() \
        .by(entity.TakeProcessor(n)) \
        .as_type(tserde) \
        .set_effective_key_num(0) \
        .set_debug_info("%sElements: PObject" % select_type) \
        .set_size(size, partial_scale)
    return result_node
def transform_to_node(self, ptype): """ 内部接口 """ from bigflow.core import entity node = ptype.node() plan = node.plan() shuffle_scope = plan.shuffle(plan.global_scope(), [node]) node = shuffle_scope.node(0) if self.partition_fn is None: node = node.distribute_by_default() else: node = node.distribute_by(entity.Partitioner(self.partition_fn)) pvalue = pcollection.PCollection(node, ptype.pipeline()) for k, action in self.transform_actions.items(): pvalue = action(pvalue) node = pvalue.node() if self.partition_number is not None: shuffle_scope.with_concurrency(self.partition_number) if self.key_reader_obj is not None: node = node.sort_by(self.key_reader_obj) node = node.process_by(_ToRecord())\ .as_type(record_objector.RecordObjector()) \ .set_effective_key_num(0) \ .input(0) \ .done() \ .ignore_group() return node
def _make_shuffle(node, value_serde):
    # Closure helper: `key_serde` and `pipeline` are captured from the
    # enclosing scope, so this is not a standalone function.
    return pcollection.PCollection(node.match_by(KeyReader(None, key_serde)),
                                   pipeline) \
        .map(entity.ExtractValueFn(), serde=value_serde, scale=0.8)
def to_pcollection(pobject):
    """ inner fn """
    return pcollection.PCollection(pobject.node(), pobject.pipeline())
def pipe(pvalue, command, **options):
    """
    Transform pipe implementation

    :param pvalue: PType
    :param command: shell command
    :return: PCollection
    """
    if utils.is_infinite(pvalue):
        raise ValueError("pipe is not supported on an infinite PType")
    if isinstance(pvalue, ptable.PTable):
        def merge_value(pvalue):
            """ inner """
            if isinstance(pvalue, ptable.PTable):
                return pvalue.apply_values(merge_value)
            else:
                return pvalue.apply(transforms.to_list_pobject)

        def merge_kv(tp, level):
            """ inner """
            kvs = []
            for i in xrange(level):
                kvs.append(tp[0])
                tp = tp[1]
            kvs.append(tp)
            return kvs

        level = pvalue.nested_level() + 1
        transformed = pvalue.apply(merge_value).flatten() \
            .apply(transforms.map, lambda kv: merge_kv(kv, level),
                   serde=serde.of(pvalue.key_serdes() + [pvalue.serde()]))
        options['input_fields_num'] = level + 1
        options['is_nested_ptype'] = True
    else:
        transformed = pvalue
    output_fields_num = options.get('output_fields_num', 1)
    if output_fields_num == 1:
        options['serde'] = serde.StrSerde()
    else:
        serdes = [serde.StrSerde()] * output_fields_num
        options['serde'] = serde.TupleSerde(*serdes)
    scale = options.get('scale', 1.0)
    size = options.get('output_size', None)
    memory = options.get('memory_limit', -1)
    cpu = options.get('cpu_limit', -1)
    result_node = transformed.node() \
        .process_by(entity.PipeProcessor(command, **options)) \
        .as_type(options['serde']) \
        .set_debug_info("Pipe: " + repr(command)) \
        .ignore_group() \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(size, scale) \
        .set_memory(memory) \
        .set_cpu(cpu)
    return pcollection.PCollection(result_node, transformed.pipeline())
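# Usage sketch for pipe(); hedged. Elements are streamed to the command's
# stdin and each stdout line becomes an output element:
#
#     lines = pipeline.parallelize(['foo', 'bar'])
#     upper = transforms.pipe(lines, 'tr a-z A-Z')
#     print upper.get()                        # ['FOO', 'BAR']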