def _key(self, ensure_keep_group=False):
    '''
    Internal function.
    If ensure_keep_group is set, emit at least one record to the reduce
    side so that every group is preserved. Otherwise, rely on other
    nodes to produce the groups.
    '''
    value = self._value()
    value = value[0] if isinstance(value, tuple) else value
    take_num = 1 if ensure_keep_group else 0
    if self.__key is None:
        import bigflow.transforms
        from bigflow.core import entity
        key_serde = self.key_serdes()[0]
        deserialize = entity.SerdeWrapper(key_serde, is_serialize=False)
        # Take at most `take_num` records per group, then extract the last
        # key seen, deserialized with this nesting level's key serde.
        key_node = bigflow.transforms.flatten_values(value).node() \
            .process_by(entity.TakeProcessor(take_num)) \
            .as_type(value.serde()) \
            .set_debug_info("ExtractKeyPartial") \
            .input(0).allow_partial_processing().done() \
            .process_by(entity.GetLastKeyProcessor(deserialize)) \
            .as_type(key_serde) \
            .set_debug_info("ExtractKey")
        self.__key = pobject.PObject(key_node, self._pipeline)
    return self.__key
def to_list_pobject(pvalue, **options):
    """
    Implementation of to_list_pobject(): gather a PCollection/PObject
    into a single PObject holding a list of all records.

    :param pvalue: PCollection/PObject
    :return: PObject
    """
    def __initializer(emitter):
        return list()

    def __transformer(status, emitter, record):
        status.append(copy.deepcopy(record))
        return status

    def __finalizer(status, emitter):
        emitter.emit(status)

    if utils.is_infinite(pvalue):
        raise ValueError("to_list_pobject is not supported on infinite PTypes")
    elif isinstance(pvalue, pobject.PObject):
        result = pvalue.map(lambda x: [x])
    elif isinstance(pvalue, ptable.PTable):
        raise ValueError(
            "to_list_pobject can only be applied to a PCollection or PObject")
    else:
        result = pvalue.transform(__initializer,
                                  __transformer,
                                  __finalizer,
                                  serde=serde.list_of(pvalue.serde()))
    return pobject.PObject(result.node(), result.pipeline())
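# Usage sketch (hypothetical; assumes a Pipeline `p` created via bigflow's
# base.Pipeline.create('local')):
#
#     numbers = p.parallelize([3, 1, 2])          # PCollection
#     as_list = to_list_pobject(numbers)          # PObject wrapping [3, 1, 2]
#     print(as_list.get())                        # element order is not guaranteed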
def make_tuple(*pobjects, **options):
    """ Make a tuple of PObjects """
    for pobj in pobjects:
        assert isinstance(pobj, pobject.PObject)
    result = cartesian.cartesian(*pobjects, **options)
    return pobject.PObject(result.node(), result.pipeline())
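# Usage sketch (hypothetical; `a` and `b` are PObjects, e.g. produced by count()):
#
#     a = p.parallelize([1, 2, 3]).count()        # PObject -> 3
#     b = p.parallelize([4, 5]).count()           # PObject -> 2
#     pair = make_tuple(a, b)                     # PObject -> (3, 2)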
def reduce(ptype, fn, *side_inputs, **kargs):
    """ Implementation of transforms.reduce() """
    if utils.is_infinite(ptype):
        raise ValueError("reduce is not supported on infinite PTypes")
    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)
    objector = kargs.get('serde', ptype.serde())  # use the same serde as the input
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)

    partial_helper = side_input_util.SideInputsUtil(ptype, side_inputs)
    partial_node = partial_helper.process_with_side_inputs() \
        .by(entity.ReduceProcessor(fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("ReducePartial: " + repr(fn)) \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(partial_size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    non_partial_helper = side_input_util.SideInputsUtil(partial_node, side_inputs)
    non_partial_node = non_partial_helper.process_with_side_inputs() \
        .by(entity.ReduceProcessor(fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("Reduce: " + repr(fn)) \
        .set_effective_key_num(0) \
        .set_size(size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)
    return pobject.PObject(non_partial_node, ptype.pipeline())
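# Usage sketch (hypothetical; exercised through the public transforms API,
# which dispatches to this implementation):
#
#     from bigflow import transforms
#     numbers = p.parallelize([1, 2, 3, 4])
#     total = transforms.reduce(numbers, lambda x, y: x + y)
#     print(total.get())  # 10, computed as a partial reduce plus a final reduce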
def combine(ptype, fn, **kargs):
    """ Implementation of transforms.combine() """
    if utils.is_infinite(ptype):
        raise ValueError("combine is not supported on infinite PTypes")
    objector = kargs.get('serde', ptype.serde())  # by default, use the input serde
    pre_combine = kargs.get('pre_combine', True)
    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)

    def _build_combine_node(from_node, is_partial, size, scale):
        debug_info = ("Partial" if is_partial else "") + "Combine: " + repr(fn)
        result = from_node.process_by(entity.CombineProcessor(fn).set_side_inputs(ptype)) \
            .as_type(objector) \
            .set_debug_info(debug_info) \
            .set_effective_key_num(0) \
            .set_size(size, scale) \
            .set_memory(memory) \
            .set_cpu(cpu) \
            .input(0).prepare_before_processing().done()
        if is_partial:
            result = result.input(0).allow_partial_processing().done()
        return result

    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())

    combined_node = ptype.node()
    if pre_combine:
        combined_node = _build_combine_node(combined_node, True, partial_size, partial_scale)
        combined_node = _build_combine_node(combined_node, False, partial_size, partial_scale)
    else:
        combined_node = _build_combine_node(combined_node, False, size, scale)
    return pobject.PObject(combined_node, ptype.pipeline())
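# Usage sketch (hypothetical; fn consumes an iterable holding all records,
# and pre_combine=True inserts the partial combine stage built above):
#
#     numbers = p.parallelize([1, 2, 3, 4])
#     total = combine(numbers, sum)               # PObject -> 10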
def aggregate(ptype, zero, aggregate_fn, combine_fn, *side_inputs, **kargs):
    """ Implementation of transforms.aggregate() """
    if utils.is_infinite(ptype):
        raise ValueError("aggregate is not supported on infinite PTypes")
    objector = kargs.get('serde', ptype.pipeline().default_objector())
    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)

    partial_helper = side_input_util.SideInputsUtil(ptype, side_inputs)
    partial_node = partial_helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(zero, aggregate_fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("AggregatePartial") \
        .input(-1).allow_partial_processing().done() \
        .set_effective_key_num(0) \
        .set_size(partial_size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    non_partial_helper = side_input_util.SideInputsUtil(partial_node, side_inputs)
    non_partial_node = non_partial_helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(zero, combine_fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("Aggregate") \
        .set_effective_key_num(0) \
        .set_size(size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)
    return pobject.PObject(non_partial_node, ptype.pipeline())
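# Usage sketch (hypothetical; zero is the initial status, aggregate_fn folds
# a record into a partial status, combine_fn merges partial statuses):
#
#     numbers = p.parallelize([1, 2, 3, 4])
#     sum_of_squares = aggregate(numbers,
#                                0,                           # zero
#                                lambda acc, x: acc + x * x,  # aggregate_fn
#                                lambda a, b: a + b)          # combine_fn
#     print(sum_of_squares.get())  # 30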
def construct(pipeline, node, type,
              nested_level=None,
              inner_most_type=None,
              key_serdes=None):
    """
    Construct a PType from a LogicalPlan node

    Args:
        pipeline (Pipeline): the Pipeline the constructed PType belongs to
        node (LogicalPlan.Node): the node backing the PType
        type (class): class of the PType to construct

    Kwargs:
        nested_level: nested level of the PTable if the PType is a PTable
        inner_most_type: inner-most type of the PTable if the PType is a PTable
        key_serdes: key serdes of the PTable, one per nesting level

    Returns:
        PType: the constructed PType
    """
    if inner_most_type is ptable.PTable:
        raise ValueError("Invalid value type for PTable")
    if type is pobject.PObject:
        pvalue = pobject.PObject(node, pipeline)
    elif type is pcollection.PCollection:
        pvalue = pcollection.PCollection(node, pipeline)
    else:
        if key_serdes is None:
            key_serdes = [pipeline.default_objector()] * (nested_level + 1)
        if nested_level > 0:
            pvalue = ptable.PTable(construct(pipeline,
                                             node,
                                             type,
                                             nested_level - 1,
                                             inner_most_type,
                                             key_serdes[1:]),
                                   key_serde=key_serdes[0])
        else:
            pvalue = ptable.PTable(inner_most_type(node, pipeline))
    return pvalue
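# Usage sketch (hypothetical; internal helper, normally called when wrapping
# a LogicalPlan node back into a PType):
#
#     pc = construct(pipeline, node, pcollection.PCollection)
#     tbl = construct(pipeline, node, ptable.PTable,
#                     nested_level=0,
#                     inner_most_type=pcollection.PCollection)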
def broadcast_to(pvalue, scope):
    """
    Broadcast the given PType instance to the given scope

    Args:
        pvalue (PType): PType instance
        scope (LogicalPlan.Scope): scope

    Returns:
        PType: new PType after the broadcast
    """
    if not isinstance(pvalue, ptype.PType):
        return pvalue
    if isinstance(pvalue, ptable.PTable):
        # Flatten the PTable, broadcast the flattened node, then rebuild a
        # PTable of the same shape around the broadcasted node.
        flattened = pvalue.flatten()
        node = flattened.node()
        plan = node.plan()
        broadcasted = pcollection.PCollection(plan.broadcast_to(node, scope),
                                              pvalue.pipeline())
        broadcasted = utils.construct(pvalue.pipeline(),
                                      broadcasted.node(),
                                      ptable.PTable,
                                      pvalue.nested_level(),
                                      pvalue.inner_most_type())
        return broadcasted

    # PCollection / PObject case
    node = pvalue.node()
    plan = node.plan()
    broadcasted_node = plan.broadcast_to(node, scope)
    if isinstance(pvalue, pcollection.PCollection):
        return pcollection.PCollection(broadcasted_node, pvalue.pipeline())
    else:
        return pobject.PObject(broadcasted_node, pvalue.pipeline())
def accumulate(pvalue, zero, accumulator, *side_inputs, **kargs):
    """ Implementation of transforms.accumulate() """
    if utils.is_infinite(pvalue):
        raise ValueError("accumulate is not supported on infinite PTypes")
    objector = kargs.get('serde', pvalue.pipeline().default_objector())
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)
    result_node = helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(entity.Functor.of(zero),
                                       entity.Functor.of(accumulator))
            .set_side_inputs(*side_inputs)) \
        .set_debug_info("accumulate(" + repr(zero) + ", " + repr(accumulator) + ")") \
        .as_type(objector) \
        .set_effective_key_num(0) \
        .set_size(kargs.get('output_size', None), kargs.get('scale', 0.1)) \
        .set_memory(kargs.get('memory_limit', -1)) \
        .set_cpu(kargs.get('cpu_limit', -1))
    return pobject.PObject(result_node, pvalue.pipeline())
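# Usage sketch (hypothetical; zero may be a plain value or a callable, since
# it is wrapped with entity.Functor.of above):
#
#     numbers = p.parallelize([1, 2, 3])
#     total = accumulate(numbers, 0, lambda acc, x: acc + x)
#     print(total.get())  # 6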