Ejemplo n.º 1
0
    def transform_from_node(self, load_node, pipeline):
        """
        Internal interface.

        Builds the text-extraction chain on top of ``load_node`` and wraps
        the result as a PCollection, or as a PTable when the 'partitioned'
        option is set.
        """
        from bigflow import ptable

        # The two branches used to duplicate the exact same chain; the only
        # difference is whether the node is read repeatedly.
        source = load_node.repeatedly() if self.repeatedly else load_node
        transformed = source \
            .process_by(_TextFromRecord()) \
            .as_type(serde.StrSerde()) \
            .set_effective_key_num(0) \
            .input(0).allow_partial_processing() \
            .done()

        # Propagate the load node's size estimate to the transformed node.
        transformed.set_size(load_node.size())

        if self._options.get('partitioned', False):
            transformed_pcollection = pcollection.PCollection(
                transformed, pipeline)
            return ptable.PTable(transformed_pcollection,
                                 key_serde=serde.StrSerde())

        return pcollection.PCollection(transformed.leave_scope(), pipeline)
Ejemplo n.º 2
0
def take(pvalue, n, **options):
    """
    Take up to *n* elements from *pvalue*.

    :param pvalue: input PType (must be finite)
    :param n: element count — an int/long, or a PObject used as side input
    :param options: serde / scale / output_size
    :return: PCollection containing at most n elements
    :raises ValueError: for infinite input or a non-integer ``n``
    """

    if utils.is_infinite(pvalue):
        raise ValueError("take not supported infinite PType")

    objector = options.get('serde', pvalue.serde())

    scale = options.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = options.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        # BUGFIX: this previously referenced the `ptype` module
        # (`ptype.node()`) instead of the `pvalue` argument, raising
        # AttributeError whenever 'output_size' was supplied.
        partial_size = pvalue.node().size() * math.sqrt(size / pvalue.node().size())

    if isinstance(n, pobject.PObject):
        # Treat the PObject parameter as a side input: first a partial
        # (per-partition) take, then a final take over the merged result.
        partial_helper = side_input_util.SideInputsUtil(pvalue, (n, ))
        partial_node = partial_helper.process_with_side_inputs() \
            .by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("TakePartial: " + str(n.node())) \
            .set_effective_key_num(0) \
            .input(-1).allow_partial_processing() \
            .done() \
            .set_size(partial_size, partial_scale)

        partial = pcollection.PCollection(partial_node, pvalue.pipeline())
        result_helper = side_input_util.SideInputsUtil(partial, (n, ))

        result_node = result_helper.process_with_side_inputs() \
            .by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("Take: " + str(n.node())) \
            .set_effective_key_num(0) \
            .set_size(size, partial_scale)

    elif isinstance(n, (int, long)):
        # Constant n: partial take followed by the final take in one chain.
        result_node = pvalue.node() \
            .process_by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("TakePartial: %d" % n) \
            .input(0).allow_partial_processing().done() \
            .set_size(partial_size, partial_scale) \
            .process_by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("Take: %d" % n) \
            .set_effective_key_num(0) \
            .set_size(size, partial_scale)

    else:
        raise ValueError("Wrong argument, only integers are accepted")

    return pcollection.PCollection(result_node, pvalue.pipeline())
Ejemplo n.º 3
0
def filter(pvalue, fn, *side_inputs, **kargs):
    """ Filter transform implementation

    :param pvalue: PType
    :param fn: UDF predicate deciding which elements to keep
    :param side_inputs: SideInputs
    :return: PType after filter
    """

    out_serde = kargs.get('serde', pvalue.serde())
    out_scale = kargs.get('scale', 0.5)
    out_size = kargs.get('output_size', None)
    memory_limit = kargs.get('memory_limit', -1)
    cpu_limit = kargs.get('cpu_limit', -1)

    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(
        side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)

    processed = helper.process_with_side_inputs()
    result_node = processed \
        .by(entity.FilterProcessor(fn, *side_inputs)) \
        .as_type(out_serde) \
        .set_debug_info("Filter: " + repr(fn)) \
        .ignore_group() \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(out_size, out_scale) \
        .set_memory(memory_limit) \
        .set_cpu(cpu_limit)

    return pcollection.PCollection(result_node, pvalue.pipeline())
Ejemplo n.º 4
0
def flatten(pvalue, **kargs):
    """ Transform flatten implementation

    :param pvalue: PTable to flatten (any other PType is returned unchanged)
    :return: flattened PCollection
    """
    def _flatten_once(node, key_serde, value_serde):
        """Merge one nesting level's key back into the value tuple."""
        return node.process_by(entity.FlattenProcessor(key_serde)) \
                   .as_type(value_serde) \
                   .set_debug_info("FlattenProcessor") \
                   .input(0).allow_partial_processing() \
                   .done() \
                   .set_size(scale_factor=1.25) \
                   .leave_scope()

    if isinstance(pvalue, ptable.PTable):
        key_serdes = pvalue.key_serdes()
        value_serde = pvalue.serde()
        assert len(key_serdes) == pvalue.nested_level() + 1
        # Walk key serdes from the innermost level outwards.
        it = reversed(key_serdes)
        node = pvalue.node()
        for _ in range(pvalue.nested_level() + 1):
            # FIX: use the builtin next() rather than the Python-2-only
            # iterator .next() method (portable to Python 3).
            key_serde = next(it)
            value_serde = serde.tuple_of(key_serde, value_serde)
            node = _flatten_once(node, key_serde, value_serde)

        pvalue = pcollection.PCollection(node, pvalue.pipeline())

    return pvalue
Ejemplo n.º 5
0
def flat_map(pvalue, fn, *side_inputs, **kargs):
    """
    Implementation of transforms.flat_map()
    """
    out_serde = kargs.get('serde', pvalue.pipeline().default_objector())
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(
        side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)

    out_scale = kargs.get('scale', 1.5)
    out_size = kargs.get('output_size', None)
    memory_limit = kargs.get('memory_limit', -1)
    cpu_limit = kargs.get('cpu_limit', -1)

    # Only PCollection / PObject inputs are supported.
    assert isinstance(pvalue, (pcollection.PCollection, pobject.PObject))

    processed = helper.process_with_side_inputs()
    result_node = processed \
        .ignore_group() \
        .by(entity.FlatMapProcessor(fn).set_side_inputs(*side_inputs)) \
        .as_type(out_serde) \
        .set_debug_info("FlatMap: " + repr(fn)) \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing() \
        .done() \
        .set_memory(memory_limit) \
        .set_cpu(cpu_limit) \
        .set_size(out_size, out_scale)

    return pcollection.PCollection(result_node, pvalue.pipeline())
Ejemplo n.º 6
0
def union(*pvalues, **options):
    """
    Union several PCollections/PObjects that live in the same scope.

    :param pvalues: input PTypes; all must be PCollection or PObject
    :param options: optional 'serde' overriding the inferred common serde
    :return: PCollection containing all input elements
    :raises ValueError: on empty input, unsupported types, or scope mismatch
    """
    if not pvalues:
        raise ValueError("No argument")

    if not all(
            isinstance(p, (pcollection.PCollection, pobject.PObject))
            for p in pvalues):
        raise ValueError("Union only applied on PCollections or PObjects")

    serdes = [p.serde() for p in pvalues]
    com_serde = options.get("serde", serde.common_serde(*serdes))
    if com_serde:

        def _inner_map(p):
            """use com_serde to convert"""
            if p.serde().__class__ != com_serde.__class__:
                p = p.map(lambda x: x, serde=com_serde)
            return p

        # List comprehension instead of map() so the indexing below also
        # works on Python 3, where map() returns an iterator.
        pvalues = [_inner_map(p) for p in pvalues]

    common_scope = pvalues[0].node().scope()
    all_nodes = [p.node() for p in pvalues]
    if not all(node.scope() is common_scope for node in all_nodes):
        raise ValueError(
            "PCollections to union should work on same scope only")

    plan = pvalues[0].node().plan()
    return pcollection.PCollection(plan.union(nodes=all_nodes),
                                   pvalues[0].pipeline())
Ejemplo n.º 7
0
def _transform_with_fns(pvalue, initializer, transformer, finalizer,
                        *side_inputs, **kargs):
    """
    Stateful pcollection transform built from three user callbacks.
    """

    out_serde = kargs.get('serde', pvalue.pipeline().default_objector())
    status_serde = kargs.get('status_serde',
                             pvalue.pipeline().default_objector())
    # Debug info captures the raw callables, before Functor wrapping.
    debug_info = "transform" + repr((initializer, transformer, finalizer))

    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(
        side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)
    pnode = helper.process_with_side_inputs()

    init_fn = entity.Functor.of(initializer)
    transform_fn = entity.Functor.of(transformer)
    finalize_fn = entity.Functor.of(finalizer)

    processor = entity.TransformProcessor(
        status_serde, init_fn, transform_fn, finalize_fn)
    processor = processor.set_side_inputs(*side_inputs)

    result_node = pnode \
        .by(processor) \
        .as_type(out_serde) \
        .is_stateful() \
        .set_effective_key_num(0) \
        .set_debug_info(debug_info) \
        .set_size(kargs.get('output_size', None), kargs.get('scale', 0.1)) \
        .set_memory(kargs.get('memory_limit', -1)) \
        .set_cpu(kargs.get('cpu_limit', -1))

    return pcollection.PCollection(result_node, pvalue.pipeline())
Ejemplo n.º 8
0
 def transform_from_node(self, load_node, pipeline):
     """Inner func: wrap the load node and hand it to post-processing."""
     from bigflow import ptable
     loaded = pcollection.PCollection(load_node, pipeline)
     wrapped = ptable.PTable(loaded, key_serde=serde.CPickleSerde())
     return self._user_input_base.post_process(wrapped)
Ejemplo n.º 9
0
    def transform_to_node(self, ptype):
        """
        Internal interface: build the serialized-record output chain.

        Shuffles the input into the global scope, applies the configured
        partitioner, transform actions and optional sorter, serializes each
        element (via kv_serializer or the serde wrapper) and wraps the
        result into output records.
        """
        from bigflow.core import entity
        from bigflow import pcollection
        node = ptype.node()
        plan = node.plan()

        objector = self.options.get('serde',
                                    ptype.pipeline().default_objector())
        shuffle_scope = plan.shuffle(plan.global_scope(), [node])
        node = shuffle_scope.node(0)
        if self.partition_fn is None:
            node = node.distribute_by_default()
        else:
            node = node.distribute_by(entity.Partitioner(self.partition_fn))

        pvalue = pcollection.PCollection(node, ptype.pipeline())

        # Only the actions are needed; iterating .values() avoids the
        # previously unused key variable.
        for action in self.transform_actions.values():
            pvalue = action(pvalue)

        node = pvalue.node()

        if self.partition_number is not None:
            shuffle_scope.with_concurrency(self.partition_number)

        if self.key_reader_obj is not None:
            node = node.sort_by(self.key_reader_obj)

        is_serialize = True
        serialize = entity.SerdeWrapper(objector, is_serialize)
        if self.kv_serializer is not None:
            serialized = pcollection.PCollection(node, ptype.pipeline()).map(
                self.kv_serializer).node()
        else:
            serialized = pcollection.PCollection(
                node, ptype.pipeline()).map(serialize).node()

        node = serialized.process_by(_ToRecord(self.kv_serializer)) \
            .as_type(record_objector.RecordObjector()) \
            .set_effective_key_num(0) \
            .input(0) \
            .done() \
            .ignore_group()
        return node
Ejemplo n.º 10
0
    def as_pcollection(self):
        """
        Convert this PObject into a PCollection.

        Returns:
          PCollection:  the converted result
        """
        from bigflow import pcollection
        node, pipeline = self.node(), self.pipeline()
        return pcollection.PCollection(node, pipeline)
Ejemplo n.º 11
0
    def transform_from_node(self, load_node, pipeline):
        """Internal interface: text extraction over a repeated load node."""
        chain = load_node.repeatedly()
        chain = chain.process_by(_TextFromRecord())
        chain = chain.as_type(serde.StrSerde())
        chain = chain.set_effective_key_num(0)
        transformed = chain.input(0).allow_partial_processing().done()

        # Carry over the size estimate from the load node.
        transformed.set_size(load_node.size())

        return pcollection.PCollection(transformed.leave_scope(), pipeline)
Ejemplo n.º 12
0
def select_elements(pvalue, n, key=None, isMaxed=True, **options):
    """implementation of select elements

    :param pvalue: input PCollection
    :param n: number of elements to select — an int/long or a PObject
    :param key: optional key function used for ordering
    :param isMaxed: True selects the largest elements, False the smallest
    :return: PCollection of the selected elements
    :raises ValueError: if pvalue is not a PCollection or n has a bad type
    """

    if not isinstance(pvalue, pcollection.PCollection):
        raise ValueError("Invalid arguments: pvalue must be of type PCollection")

    if isinstance(n, pobject.PObject):
        result_node = _select_elements_pobject(pvalue, n, key, isMaxed, **options)
    elif isinstance(n, (int, long)):
        result_node = _select_elements_int(pvalue, n, key, isMaxed, **options)
    else:
        # BUGFIX: previously fell through with result_node unbound, raising
        # an opaque UnboundLocalError instead of a clear error.
        raise ValueError("Wrong argument, n must be an integer or a PObject")

    return pcollection.PCollection(result_node, pvalue.pipeline())
Ejemplo n.º 13
0
def broadcast_to(pvalue, scope):
    """
    Broadcast given PType instance to given scope

    Args:
      pvalue (PType):  PType instance
      scope (LogicalPlan.Scope):  scope

    Returns:
      PType:  new PType after broadcast
    """
    # Non-PType values pass through untouched.
    if not isinstance(pvalue, ptype.PType):
        return pvalue

    if isinstance(pvalue, ptable.PTable):
        # Flatten first, broadcast the flat node, then rebuild the PTable
        # with the original nesting level and inner-most type.
        flattened = pvalue.flatten()
        flat_node = flattened.node()
        broadcast_node = flat_node.plan().broadcast_to(flat_node, scope)
        broadcasted = pcollection.PCollection(broadcast_node,
                                              pvalue.pipeline())
        return utils.construct(pvalue.pipeline(),
                               broadcasted.node(),
                               ptable.PTable,
                               pvalue.nested_level(),
                               pvalue.inner_most_type())

    source_node = pvalue.node()
    broadcast_node = source_node.plan().broadcast_to(source_node, scope)

    if isinstance(pvalue, pcollection.PCollection):
        return pcollection.PCollection(broadcast_node, pvalue.pipeline())
    return pobject.PObject(broadcast_node, pvalue.pipeline())
Ejemplo n.º 14
0
    def transform_from_node(self, load_node, pipeline):
        """
        Internal interface.

        Extracts (key, value) string pairs from binary records, restores
        the load node's size estimate, deserializes the values (via the
        configured kv_deserializer or a serde wrapper), and returns either
        a PTable (when 'partitioned') or a plain PCollection.
        """
        from bigflow import ptable
        if self.repeatedly:
            transformed = load_node.repeatedly() \
                .process_by(_KVFromBinaryRecord()) \
                .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
                .set_effective_key_num(0) \
                .input(0).allow_partial_processing() \
                .done()
        else:
            # NOTE(review): unlike the 'repeatedly' branch above, this chain
            # also calls ignore_group() — confirm the asymmetry is intended.
            transformed = load_node \
                .process_by(_KVFromBinaryRecord()) \
                .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
                .set_effective_key_num(0) \
                .ignore_group() \
                .input(0).allow_partial_processing() \
                .done()

        # Carry over the size estimate from the load node.
        transformed.set_size(load_node.size())

        transformed = pcollection.PCollection(transformed, pipeline)

        tserde = self._options.get('serde', pipeline.default_objector())

        if self.kv_deserializer is not None:
            transformed = transformed.map(self.kv_deserializer, serde=tserde)
        else:
            # Deserialize only the value part (field index 1) of each pair.
            is_serialize = False
            deserialize = entity.SerdeWrapper(tserde, is_serialize, 1)
            transformed = transformed.map(deserialize, serde=tserde)

        if self._options.get('partitioned'):
            return ptable.PTable(transformed, key_serde=serde.StrSerde())
        return pcollection.PCollection(transformed.node().leave_scope(),
                                       pipeline)
Ejemplo n.º 15
0
def flatten_values(pvalue):
    """Flatten a PTable's values into a plain PCollection.

    :param pvalue: pvalue
    :return: flattened PCollection (non-PTable input is returned unchanged)
    """
    if not isinstance(pvalue, ptable.PTable):
        return pvalue

    # Leave one scope per nesting level, plus the outermost one.
    node = pvalue.node().leave_scope()
    for _ in range(pvalue.nested_level()):
        node = node.leave_scope()

    return pcollection.PCollection(node, pvalue.pipeline())
Ejemplo n.º 16
0
def group_by_every_record(pvalue, **options):
    """
    Put every record of *pvalue* into its own group.
    """
    from bigflow import serde

    pipeline = pvalue.pipeline()
    source_node = pvalue.node()
    shuffle_scope = source_node.plan().shuffle(source_node.scope(),
                                               [source_node])
    distributed = shuffle_scope.node(0).distribute_every()

    values = pcollection.PCollection(distributed, pipeline)
    return ptable.PTable(values, key_serde=serde.StrSerde())
Ejemplo n.º 17
0
def window_into(pvalue, win, **options):
    """
    Group the elements of *pvalue* by window.
    """
    pipeline = pvalue.pipeline()
    # Fall back to the pipeline's default objector for a falsy key serde.
    key_serde = options.get('key_serde', win.key_serde()) \
        or pipeline.default_objector()

    windowed_node = node_window_by(pvalue.node(),
                                   win,
                                   options.get('concurrency', None),
                                   pipeline)

    windowed = pcollection.PCollection(windowed_node, pipeline)
    return ptable.PTable(windowed, key_serde=key_serde)
Ejemplo n.º 18
0
def group_by(pvalue, key_extractor, value_extractor, **options):
    """
    only the tuple pair elements of pvalue accepted
    """
    pipeline = pvalue.pipeline()

    key_serde = options.get('key_serde', pvalue.pipeline().default_objector())
    # With no value extractor the values keep the input's serde; otherwise
    # the pipeline default is used.
    if value_extractor is not None:
        value_serde = options.get('value_serde',
                                  pvalue.pipeline().default_objector())
    else:
        value_serde = options.get('value_serde', pvalue.serde())

    grouped_node = node_group_by(pvalue.node(), key_extractor,
                                 value_extractor, key_serde, value_serde,
                                 options.get('concurrency', None), pipeline)
    grouped = pcollection.PCollection(grouped_node, pipeline)
    return ptable.PTable(grouped, key_serde=key_serde)
Ejemplo n.º 19
0
def construct(pipeline,
              node,
              type,
              nested_level=None,
              inner_most_type=None,
              key_serdes=None):
    """
    Construct a PType from a LogicalPlan node

    Args:
      pipeline (Pipeline):  the Pipeline constructed PType belongs to
      node (LogicalPlan.Node):  node
      type (class):  class of PType to construct

    Kwargs:
      nested_level: specify PTable's nested level if PType is a PTable
      inner_most_type:  specify PTable's inner-most type if PType is a PTable
      key_serdes:  per-level key serdes for a PTable; defaults to the
        pipeline's default objector at every level

    Returns:
      PType:  PType
    """
    if inner_most_type is ptable.PTable:
        raise ValueError("Invalid value type for PTable")

    if type is pobject.PObject:
        pvalue = pobject.PObject(node, pipeline)
    elif type is pcollection.PCollection:
        pvalue = pcollection.PCollection(node, pipeline)
    else:
        # PTable case: recurse from the outermost nesting level inwards,
        # consuming one key serde per level.
        if key_serdes is None:
            key_serdes = [pipeline.default_objector()] * (nested_level + 1)
        if nested_level > 0:
            pvalue = ptable.PTable(construct(pipeline, node, type,
                                             nested_level - 1, inner_most_type,
                                             key_serdes[1:]),
                                   key_serde=key_serdes[0])
        else:
            # BUGFIX: the innermost level previously dropped its key serde
            # and fell back to PTable's default; pass key_serdes[0] for
            # consistency with the nested branch above.
            pvalue = ptable.PTable(inner_most_type(node, pipeline),
                                   key_serde=key_serdes[0])

    return pvalue
Ejemplo n.º 20
0
def _select_elements_pobject(pvalue, n, key=None, isMaxed=True, **options):
    """pobject as side input for select elements"""
    scale = options.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = options.get('output_size', None)
    select_type = "Max" if isMaxed else "Min"
    tserde = options.get('serde', pvalue.serde())

    # Estimate the per-stage output size when a final size is given.
    if size is None:
        partial_size = None
    else:
        partial_size = pvalue.node().size() * \
            math.sqrt(size / pvalue.node().size())

    # Stage 1: partial selection with n supplied as a side input.
    partial_helper = side_input_util.SideInputsUtil(pvalue, (n, ))
    partial_node = partial_helper.process_with_side_inputs() \
        .by(entity.SelectElementsProcessor(n, isMaxed, key)) \
        .as_type(tserde) \
        .set_debug_info("%sElementsPartial: PObject" % (select_type)) \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing() \
        .done() \
        .set_size(partial_size, partial_scale)

    partial = pcollection.PCollection(partial_node, pvalue.pipeline())
    partial = partial.sort_by(key, isMaxed)

    # Stage 2: take the first n of the sorted partial result.
    result_helper = side_input_util.SideInputsUtil(partial, (n, ))
    result_node = result_helper.process_with_side_inputs() \
        .by(entity.TakeProcessor(n)) \
        .as_type(tserde) \
        .set_effective_key_num(0) \
        .set_debug_info("%sElements: PObject" % (select_type)) \
        .set_size(size, partial_scale)

    return result_node
Ejemplo n.º 21
0
    def transform_to_node(self, ptype):
        """
        Internal interface: build the record-output chain for *ptype*.

        Shuffles into the global scope, applies the configured partitioner,
        transform actions and optional sorter, then wraps elements into
        output records.
        """
        from bigflow.core import entity
        node = ptype.node()
        plan = node.plan()

        shuffle_scope = plan.shuffle(plan.global_scope(), [node])
        node = shuffle_scope.node(0)
        if self.partition_fn is None:
            node = node.distribute_by_default()
        else:
            node = node.distribute_by(entity.Partitioner(self.partition_fn))

        pvalue = pcollection.PCollection(node, ptype.pipeline())

        # Only the actions are needed; iterating .values() avoids the
        # previously unused key variable.
        for action in self.transform_actions.values():
            pvalue = action(pvalue)

        node = pvalue.node()

        if self.partition_number is not None:
            shuffle_scope.with_concurrency(self.partition_number)

        if self.key_reader_obj is not None:
            node = node.sort_by(self.key_reader_obj)

        node = node.process_by(_ToRecord())\
            .as_type(record_objector.RecordObjector()) \
            .set_effective_key_num(0) \
            .input(0) \
            .done() \
            .ignore_group()

        return node
Ejemplo n.º 22
0
 def _make_shuffle(node, value_serde):
     """Match records by key, then keep only the values.

     Note: ``key_serde`` and ``pipeline`` come from the enclosing scope.
     """
     matched = node.match_by(KeyReader(None, key_serde))
     keyed = pcollection.PCollection(matched, pipeline)
     return keyed.map(entity.ExtractValueFn(), serde=value_serde, scale=0.8)
Ejemplo n.º 23
0
 def to_pcollection(pobject):
     """Inner fn: wrap a PObject's node into a PCollection."""
     node, pipeline = pobject.node(), pobject.pipeline()
     return pcollection.PCollection(node, pipeline)
Ejemplo n.º 24
0
def pipe(pvalue, command, **options):
    """
    Transform pipe implementation
    :param pvalue: PType
    :return: PCollection
    """
    if utils.is_infinite(pvalue):
        raise ValueError("pipe not supported infinite PType")

    if isinstance(pvalue, ptable.PTable):
        def _listify(p):
            """Recursively turn nested PTable values into list PObjects."""
            if isinstance(p, ptable.PTable):
                return p.apply_values(_listify)
            return p.apply(transforms.to_list_pobject)

        def _flatten_kv(pair, depth):
            """Unfold a nested (k, (k, ... v)) tuple into a flat list."""
            fields = []
            for _ in xrange(depth):
                fields.append(pair[0])
                pair = pair[1]
            fields.append(pair)
            return fields

        level = pvalue.nested_level() + 1

        transformed = pvalue.apply(_listify).flatten() \
            .apply(transforms.map, lambda kv: _flatten_kv(kv, level),
                   serde=serde.of(pvalue.key_serdes() + [pvalue.serde()]))

        options['input_fields_num'] = level + 1
        options['is_nested_ptype'] = True
    else:
        transformed = pvalue

    # One string field, or a tuple of string fields, per output record.
    fields_out = options.get('output_fields_num', 1)
    if fields_out == 1:
        options['serde'] = serde.StrSerde()
    else:
        options['serde'] = serde.TupleSerde(*([serde.StrSerde()] * fields_out))

    result_node = transformed.node() \
        .process_by(entity.PipeProcessor(command, **options)) \
        .as_type(options['serde']) \
        .set_debug_info("Pipe: " + repr(command)) \
        .ignore_group() \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(options.get('output_size', None),
                  options.get('scale', 1.0)) \
        .set_memory(options.get('memory_limit', -1)) \
        .set_cpu(options.get('cpu_limit', -1))

    return pcollection.PCollection(result_node, transformed.pipeline())