Example #1
File: ptable.py  Project: zz198808/bigflow
    def _key(self, ensure_keep_group=False):
        '''
            Internal helper.

            If ensure_keep_group is set, send at least one record per group
            to the reduce side, so that every group is preserved.
            Otherwise, rely on other nodes to produce the groups.
        '''
        value = self._value()
        value = value[0] if isinstance(value, tuple) else value

        take_num = 1 if ensure_keep_group else 0

        if self.__key is None:

            import bigflow.transforms
            from bigflow.core import entity

            key_serde = self.key_serdes()[0]
            deserialize = entity.SerdeWrapper(key_serde, is_serialize=False)
            key_node = bigflow.transforms.flatten_values(value).node() \
                .process_by(entity.TakeProcessor(take_num)) \
                .as_type(value.serde()) \
                .set_debug_info("ExtractKeyPartial") \
                .input(0).allow_partial_processing().done() \
                .process_by(entity.GetLastKeyProcessor(deserialize)) \
                .as_type(key_serde) \
                .set_debug_info("ExtractKey")

            self.__key = pobject.PObject(key_node, self._pipeline)

        return self.__key
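The chain above runs two processors over the flattened values: TakeProcessor keeps at most take_num records per group, and GetLastKeyProcessor recovers each surviving group's key. A minimal plain-Python sketch of that idea, operating on a hypothetical {key: [values]} dict instead of LogicalPlan nodes:

def extract_keys(grouped, ensure_keep_group=True):
    take_num = 1 if ensure_keep_group else 0
    keys = []
    for key, values in grouped.items():
        taken = values[:take_num]      # TakeProcessor(take_num)
        if taken:
            keys.append(key)           # GetLastKeyProcessor
    return keys

assert sorted(extract_keys({"a": [1, 2], "b": [3]})) == ["a", "b"]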
Example #2
def to_list_pobject(pvalue, **options):
    """
    Transform listing implementation
    :param pvalue: PCollection/PObject
    :return: PObject
    """
    def __initializer(emitter):
        return list()

    def __transformer(status, emitter, record):
        status.append(copy.deepcopy(record))
        return status

    def __finalizer(status, emitter):
        emitter.emit(status)

    if utils.is_infinite(pvalue):
        raise ValueError("to_list_pobject is not supported on infinite PTypes")
    elif isinstance(pvalue, pobject.PObject):
        result = pvalue.map(lambda x: [x])
    elif isinstance(pvalue, ptable.PTable):
        raise ValueError(
            "to_list_pobject can only be applied to a PCollection/PObject")
    else:
        result = pvalue.transform(__initializer,
                                  __transformer,
                                  __finalizer,
                                  serde=serde.list_of(pvalue.serde()))

    return pobject.PObject(result.node(), result.pipeline())
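The initializer/transformer/finalizer triple follows Bigflow's transform protocol: build a state, fold every record into it, then emit once at the end. A hypothetical local driver (ListEmitter and run_transform are illustrative names, not Bigflow API) shows the effect:

import copy

class ListEmitter(object):
    """Collects whatever the finalizer emits."""
    def __init__(self):
        self.emitted = []
    def emit(self, value):
        self.emitted.append(value)

def run_transform(records, initializer, transformer, finalizer):
    emitter = ListEmitter()
    status = initializer(emitter)
    for record in records:
        status = transformer(status, emitter, record)
    finalizer(status, emitter)
    return emitter.emitted

# With callbacks equivalent to the ones in to_list_pobject:
init = lambda emitter: []
trans = lambda status, emitter, record: status + [copy.deepcopy(record)]
final = lambda status, emitter: emitter.emit(status)
assert run_transform([1, 2, 3], init, trans, final) == [[1, 2, 3]]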
Example #3
def make_tuple(*pobjects, **options):
    """
        make tuple of pobjects
    """
    for pobj in pobjects:
        assert isinstance(pobj, pobject.PObject)

    result = cartesian.cartesian(*pobjects, **options)
    return pobject.PObject(result.node(), result.pipeline())
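make_tuple leans on cartesian: since every PObject holds exactly one element, the cartesian product of the inputs yields exactly one tuple. The same effect in plain Python:

import itertools

singletons = [[1], ["a"], [3.0]]   # each list stands in for a PObject
assert list(itertools.product(*singletons)) == [(1, "a", 3.0)]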
Example #4
File: reduce.py  Project: zz198808/bigflow
def reduce(ptype, fn, *side_inputs, **kargs):
    """
    inner fun
    """

    if utils.is_infinite(ptype):
        raise ValueError("reduce is not supported on infinite PTypes")

    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(
            size / ptype.node().size())
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)

    objector = kargs.get('serde',
                         ptype.serde())  # use the same serde of the input
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(
        side_inputs)
    partial_helper = side_input_util.SideInputsUtil(ptype, side_inputs)

    partial_node = partial_helper.process_with_side_inputs()\
        .by(entity.ReduceProcessor(fn).set_side_inputs(*side_inputs))\
        .as_type(objector)\
        .set_debug_info("ReducePartial: " + repr(fn)) \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(partial_size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    non_partial_helper = side_input_util.SideInputsUtil(
        partial_node, side_inputs)

    non_partial_node = non_partial_helper.process_with_side_inputs()\
        .by(entity.ReduceProcessor(fn).set_side_inputs(*side_inputs))\
        .as_type(objector)\
        .set_debug_info("Reduce: " + repr(fn)) \
        .set_effective_key_num(0) \
        .set_size(size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    return pobject.PObject(non_partial_node, ptype.pipeline())
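The partial/non-partial split is the classic combiner pattern: ReducePartial folds records within each upstream task, and the final Reduce folds the partial results, which is only correct because fn is expected to be associative. Note also that the partial output is budgeted at input_size * sqrt(size / input_size) = sqrt(input_size * size), the geometric mean of the input and output sizes. A hypothetical local sketch of the two stages:

from functools import reduce as _reduce

def two_stage_reduce(chunks, fn):
    partials = [_reduce(fn, chunk) for chunk in chunks]   # ReducePartial
    return _reduce(fn, partials)                          # Reduce

assert two_stage_reduce([[1, 2], [3, 4]], lambda x, y: x + y) == 10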
Example #5
def combine(ptype, fn, **kargs):
    """ inner function"""

    if utils.is_infinite(ptype):
        raise ValueError("combine is not supported on infinite PTypes")

    objector = kargs.get('serde', ptype.serde()) # default, use the input serde

    pre_combine = kargs.get('pre_combine', True)

    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)

    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)

    def _build_combine_node(from_node, is_partial, size, scale):
        debug_info = ("Partial" if is_partial else "") + "Combine: " + repr(fn)

        result = from_node.process_by(entity.CombineProcessor(fn).set_side_inputs(ptype)) \
            .as_type(objector) \
            .set_debug_info(debug_info) \
            .set_effective_key_num(0) \
            .set_size(size, scale) \
            .set_memory(memory) \
            .set_cpu(cpu) \
            .input(0).prepare_before_processing().done()

        if is_partial:
            result = result.input(0).allow_partial_processing().done()
        return result

    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())

    combined_node = ptype.node()

    if pre_combine:
        combined_node = _build_combine_node(combined_node, True, partial_size, partial_scale)
        combined_node = _build_combine_node(combined_node, False, partial_size, partial_scale)
    else:
        combined_node = _build_combine_node(combined_node, False, size, scale)

    return pobject.PObject(combined_node, ptype.pipeline())
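A worked check of the sizing above: n * sqrt(s / n) equals sqrt(n * s), so the partial combine stage is budgeted at the geometric mean of the input and output sizes (the numbers below are hypothetical):

import math

input_size, output_size = 1000.0, 10.0
partial_size = input_size * math.sqrt(output_size / input_size)
assert abs(partial_size - math.sqrt(input_size * output_size)) < 1e-9  # == 100.0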
Example #6
def aggregate(ptype, zero, aggregate_fn, combine_fn, *side_inputs, **kargs):
    """
    Implementation of transforms.aggregate()
    """

    if utils.is_infinite(ptype):
        raise ValueError("aggregate is not supported on infinite PTypes")

    objector = kargs.get('serde', ptype.pipeline().default_objector())
    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)

    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    partial_helper = side_input_util.SideInputsUtil(ptype, side_inputs)

    partial_node = partial_helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(zero, aggregate_fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("AggregatePartial") \
        .input(-1).allow_partial_processing().done()\
        .set_effective_key_num(0) \
        .set_size(partial_size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    non_partial_helper = side_input_util.SideInputsUtil(partial_node, side_inputs)

    non_partial_node = non_partial_helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(zero, combine_fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("Aggregate") \
        .set_effective_key_num(0) \
        .set_size(size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    return pobject.PObject(non_partial_node, ptype.pipeline())
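aggregate generalizes the two-stage pattern with a distinct function per stage: aggregate_fn folds raw records within a partition starting from zero, and combine_fn then folds the per-partition results. A hypothetical local analogue:

def two_stage_aggregate(partitions, zero, aggregate_fn, combine_fn):
    partials = []
    for partition in partitions:              # AggregatePartial
        acc = zero
        for record in partition:
            acc = aggregate_fn(acc, record)
        partials.append(acc)
    result = zero                             # Aggregate
    for partial in partials:
        result = combine_fn(result, partial)
    return result

# Sum of squares: 1 + 4 + 9 + 16 == 30
assert two_stage_aggregate([[1, 2], [3, 4]], 0,
                           lambda acc, x: acc + x * x,
                           lambda a, b: a + b) == 30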
Example #7
def construct(pipeline,
              node,
              type,
              nested_level=None,
              inner_most_type=None,
              key_serdes=None):
    """
    Construct a PType from a LogicalPlan node

    Args:
      pipeline (Pipeline):  the Pipeline constructed PType belongs to
      node (LogicalPlan.Node):  node
      type (class):  class of PType to construct

    Kwargs:
      nested_leve: specify PTable's nested level if PType is a PTable
      inner_most_type:  specify PTable's inner-most type if PType is a PTable

    Returns:
      PType:  PType
    """
    if inner_most_type is ptable.PTable:
        raise ValueError("Invalid value type for PTable")

    if type is pobject.PObject:
        pvalue = pobject.PObject(node, pipeline)
    elif type is pcollection.PCollection:
        pvalue = pcollection.PCollection(node, pipeline)
    else:
        if key_serdes is None:
            key_serdes = [pipeline.default_objector()] * (nested_level + 1)
        if nested_level > 0:
            pvalue = ptable.PTable(construct(pipeline, node, type,
                                             nested_level - 1, inner_most_type,
                                             key_serdes[1:]),
                                   key_serde=key_serdes[0])
        else:
            pvalue = ptable.PTable(inner_most_type(node, pipeline))

    return pvalue
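For PTables the function recurses, peeling one key serde per nesting level until the inner-most type takes over. A hypothetical trace of that recursion for a table nested two levels deep:

def trace_construct(nested_level, key_serdes):
    if nested_level > 0:
        print("PTable(key_serde=%r) wrapping:" % key_serdes[0])
        trace_construct(nested_level - 1, key_serdes[1:])
    else:
        print("PTable(inner_most_type(node, pipeline))")

trace_construct(2, ["serde0", "serde1", "serde2"])
# PTable(key_serde='serde0') wrapping:
# PTable(key_serde='serde1') wrapping:
# PTable(inner_most_type(node, pipeline))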
Example #8
def broadcast_to(pvalue, scope):
    """
    Broadcast given PType instance to given scope

    Args:
      pvalue (PType):  PType instance
      scope (LogicalPlan.Scope):  scope

    Returns:
      PType:  new PType after broadcast
    """
    if not isinstance(pvalue, ptype.PType):
        return pvalue

    if isinstance(pvalue, ptable.PTable):
        flattened = pvalue.flatten()

        node = flattened.node()
        plan = node.plan()

        broadcasted = pcollection.PCollection(plan.broadcast_to(node, scope), pvalue.pipeline())

        broadcasted = utils.construct(pvalue.pipeline(),
                                      broadcasted.node(),
                                      ptable.PTable,
                                      pvalue.nested_level(),
                                      pvalue.inner_most_type())

        return broadcasted

    # PCollection / PObject: broadcast the node directly
    node = pvalue.node()
    plan = node.plan()
    broadcasted_node = plan.broadcast_to(node, scope)

    if isinstance(pvalue, pcollection.PCollection):
        return pcollection.PCollection(broadcasted_node, pvalue.pipeline())
    else:
        return pobject.PObject(broadcasted_node, pvalue.pipeline())
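Broadcasting operates on a single plan node, which is why a PTable is first flattened into keyed records, broadcast, and then reconstructed at the same nesting level. A hypothetical plain-Python analogue of that flatten/rebuild round trip, using nested dicts in place of PTables:

def flatten_table(table, prefix=()):
    # One (key, ..., value) record per leaf, mirroring PTable.flatten()
    for key, value in table.items():
        if isinstance(value, dict):
            for record in flatten_table(value, prefix + (key,)):
                yield record
        else:
            yield prefix + (key, value)

def rebuild_table(records, nested_level):
    table = {}
    for record in records:
        node = table
        for key in record[:nested_level]:
            node = node.setdefault(key, {})
        node[record[nested_level]] = record[nested_level + 1]
    return table

records = list(flatten_table({"a": {"x": 1}, "b": {"y": 2}}))
assert rebuild_table(records, 1) == {"a": {"x": 1}, "b": {"y": 2}}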
Example #9
def accumulate(pvalue, zero, accumulator, *side_inputs, **kargs):
    """
    Implementation of transforms.accumulate()
    """

    if utils.is_infinite(pvalue):
        raise ValueError("accumulate is not supported on infinite PTypes")

    objector = kargs.get('serde', pvalue.pipeline().default_objector())

    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)
    result_node = helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(entity.Functor.of(zero), entity.Functor.of(accumulator))
            .set_side_inputs(*side_inputs)) \
        .set_debug_info("accumulate(" + repr(zero) + ',' + repr(accumulator)) \
        .as_type(objector) \
        .set_effective_key_num(0) \
        .set_size(kargs.get('output_size', None), kargs.get('scale', 0.1)) \
        .set_memory(kargs.get('memory_limit', -1)) \
        .set_cpu(kargs.get('cpu_limit', -1))

    return pobject.PObject(result_node, pvalue.pipeline())
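Unlike aggregate, accumulate has no partial stage: a single AccumulateProcessor folds every record into a status that starts from zero. A hypothetical local equivalent (zero may be a constant or, via entity.Functor.of, a callable):

def accumulate_locally(records, zero, accumulator):
    status = zero() if callable(zero) else zero
    for record in records:
        status = accumulator(status, record)
    return status

assert accumulate_locally([1, 2, 3], 0, lambda acc, x: acc + x) == 6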