Example #1
def subtract(a, b):

    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("subtract not supported infinite PType")

    def filter_if_b_is_empty(a, b):
        return a.filter(lambda input, is_empty: is_empty, b.is_empty())
    a = a.map(lambda x: (x, None))
    b = b.map(lambda x: (x, None))
    return a.cogroup(b).apply_values(filter_if_b_is_empty).flatten().map(lambda kv: kv[0])
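
For reference, a minimal pure-Python sketch of the same cogroup-and-filter semantics (the subtract_local helper is hypothetical, not Bigflow API): every occurrence in a survives only if its value never appears in b.

# Hypothetical stand-in for the cogroup + is_empty filter above.
def subtract_local(a, b):
    b_values = set(b)                    # models b.is_empty() per cogroup key
    return [x for x in a if x not in b_values]

assert subtract_local([1, 2, 2, 3], [2]) == [1, 3]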
Example #2
def diff(a, b):
    """
    Implementation of transforms.diff()
    """

    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("diff not supported infinite PType")

    def filter_count_ne(a, b):
        return a.count() \
            .flat_map(lambda c1, c2: [(c1, c2)], b.count(), serde=serde.of((int, int))) \
            .filter(lambda tp: tp[0] != tp[1])
    a = a.map(lambda x: (x, None), serde=serde.tuple_of(a.serde(), serde.of(int)))
    b = b.map(lambda x: (x, None), serde=serde.tuple_of(b.serde(), serde.of(int)))

    return a.cogroup(b).apply_values(filter_count_ne).flatten()
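
A pure-Python model of these semantics (diff_local is a hypothetical name): every value whose occurrence counts differ between the two inputs is emitted together with both counts, which is what filter_count_ne computes per cogroup key.

from collections import Counter

# Hypothetical stand-in for the cogroup + filter_count_ne plan above.
def diff_local(a, b):
    ca, cb = Counter(a), Counter(b)
    return sorted((x, (ca[x], cb[x]))
                  for x in set(ca) | set(cb) if ca[x] != cb[x])

assert diff_local([1, 1, 2], [1, 2, 2]) == [(1, (2, 1)), (2, (1, 2))]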
Example #3
def first(ptype):
    """
    Implementation of first
    """
    if utils.is_infinite(ptype):
        raise ValueError("first not supported infinite PType")
    return bigflow.pobject.PObject(ptype.take(1).node(), ptype.pipeline())
Example #4
def to_list_pobject(pvalue, **options):
    """
    Transform listing implementation
    :param pvalue: PCollection/PObject
    :return: PObject
    """
    def __initializer(emitter):
        return list()

    def __transformer(status, emitter, record):
        status.append(copy.deepcopy(record))
        return status

    def __finalizer(status, emitter):
        emitter.emit(status)

    if utils.is_infinite(pvalue):
        raise ValueError("to_list_pobject not supported infinite PType")
    elif isinstance(pvalue, pobject.PObject):
        result = pvalue.map(lambda x: [x])
    elif isinstance(pvalue, ptable.PTable):
        raise ValueError(
            "to_list_pobject only applied on PCollections/PObject")
    else:
        result = pvalue.transform(__initializer,
                                  __transformer,
                                  __finalizer,
                                  serde=serde.list_of(pvalue.serde()))

    return pobject.PObject(result.node(), result.pipeline())
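
A hypothetical local driver (run_transform, not part of Bigflow) makes the initializer/transformer/finalizer call order concrete:

import copy

class _ListEmitter(object):
    """Collects emitted values, standing in for the engine's emitter."""
    def __init__(self):
        self.out = []

    def emit(self, value):
        self.out.append(value)

def run_transform(records, initializer, transformer, finalizer):
    # The engine calls the initializer once, the transformer per record,
    # and the finalizer after the last record.
    emitter = _ListEmitter()
    status = initializer(emitter)
    for record in records:
        status = transformer(status, emitter, record)
    finalizer(status, emitter)
    return emitter.out

# Same logic as the three inner functions: collect everything into one list.
assert run_transform([1, 2, 3],
                     lambda e: [],
                     lambda s, e, r: s + [copy.deepcopy(r)],
                     lambda s, e: e.emit(s)) == [[1, 2, 3]]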
Example #5
    def _transform_output_format(self, pcollection, output_format):
        from bigflow.util import path_util
        from bigflow.util import utils

        format_type = output_format.get_entity_name()
        # todo: extract ugi from output_format, support multiple clusters and ugis
        if format_type == "TextOutputFormat" or \
                format_type == "SequenceFileAsBinaryOutputFormat":
            uri = path_util.to_abs_local_path(output_format.path)

            if utils.is_infinite(pcollection):
                if not path_util.is_hdfs_path(uri):
                    raise ValueError("That write infinite PType to local file "
                            "is not supported in MRPipeline")
                else:
                    output_format.path = self._toft_path(uri)
            else:
                if not path_util.is_hdfs_path(uri):
                    # The user is writing a local file through MRPipeline: replace the
                    # original uri with a temp HDFS path and dump the output back to
                    # the local FS after the job is done.
                    hdfs_uri = self._tmp_hdfs_path(uri)
                    output_format.path = self._toft_path(hdfs_uri)
                    self._local_uri_infos.append({
                        'local_uri': uri,
                        'hdfs_uri': hdfs_uri,
                        'overwrite': output_format.overwrite
                    })
                    logger.debug(
                            "Write file to HDFS path: %s and dump it after job done" % hdfs_uri)
                    self._remote_temp_files.append(hdfs_uri)
                else:
                    output_format.path = self._toft_path(self._tmp_output_path(uri))
                    output_format.commit_path = self._toft_path(uri)

        return output_format
Example #6
def intersection(a, b, output_duplicated=False):

    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("intersectio not supported infinite PType")

    def filter_if_neither_empty(a, b):
        return a.is_empty() \
                .flat_map(lambda x: [x]) \
                .filter(lambda a_empty, b_empty: not a_empty and not b_empty, b.is_empty())

    a = a.map(lambda x: (x, None))
    b = b.map(lambda x: (x, None))

    if output_duplicated:
        filter_rule = lambda a, b: a.take(b.count())
    else:
        filter_rule = filter_if_neither_empty
    return a.cogroup(b).apply_values(filter_rule).flatten().map(
        lambda kv: kv[0])
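
A pure-Python sketch of both filter rules (intersection_local is a hypothetical helper): by default a value is emitted once when both sides contain it; with output_duplicated it is emitted min(count_a, count_b) times, matching a.take(b.count()).

from collections import Counter

def intersection_local(a, b, output_duplicated=False):
    ca, cb = Counter(a), Counter(b)
    if output_duplicated:
        # min(count_a, count_b) copies per value, like a.take(b.count())
        return [x for x in sorted(ca) for _ in range(min(ca[x], cb[x]))]
    # one copy per value present on both sides
    return [x for x in sorted(ca) if cb[x]]

assert intersection_local([1, 1, 2, 3], [1, 1, 1, 2]) == [1, 2]
assert intersection_local([1, 1, 2, 3], [1, 1, 1, 2], True) == [1, 1, 2]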
Example #7
def take(pvalue, n, **options):
    """ inner function """

    if utils.is_infinite(pvalue):
        raise ValueError("take not supported infinite PType")

    objector = options.get('serde', pvalue.serde())

    scale = options.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = options.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = pvalue.node().size() * math.sqrt(size / pvalue.node().size())

    if isinstance(n, pobject.PObject):
        # treat the pobject param as side input
        partial_helper = side_input_util.SideInputsUtil(pvalue, (n, ))
        partial_node = partial_helper.process_with_side_inputs() \
            .by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("TakePartial: " + str(n.node())) \
            .set_effective_key_num(0) \
            .input(-1).allow_partial_processing() \
            .done() \
            .set_size(partial_size, partial_scale)

        partial = pcollection.PCollection(partial_node, pvalue.pipeline())
        result_helper = side_input_util.SideInputsUtil(partial, (n, ))

        result_node = result_helper.process_with_side_inputs() \
            .by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("Take: " + str(n.node())) \
            .set_effective_key_num(0) \
            .set_size(size, partial_scale)

    elif isinstance(n, (int, long)):
        result_node = pvalue.node() \
            .process_by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("TakePartial: %d" % n) \
            .input(0).allow_partial_processing().done() \
            .set_size(partial_size, partial_scale) \
            .process_by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("Take: %d" % n) \
            .set_effective_key_num(0) \
            .set_size(size, partial_scale)

    else:
        raise ValueError("Wrong argument, only integers are accepted")

    return pcollection.PCollection(result_node, pvalue.pipeline())
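
A rough local model of the plan built above (take_local is hypothetical): a partial take runs per input partition (the allow_partial_processing stage), then a final take trims the merged partials down to n records.

def take_local(partitions, n):
    partial = [part[:n] for part in partitions]    # "TakePartial"
    merged = [x for part in partial for x in part]
    return merged[:n]                              # final "Take"

assert take_local([[1, 2, 3], [4, 5]], 2) == [1, 2]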
Example #8
def distinct(ptype, **kargs):
    """
    distinct() implementation
    """

    if utils.is_infinite(ptype):
        raise ValueError("distinct not supported infinite PType")

    distinct_gbk = ptype.group_by(lambda x: x, lambda x: None)
    distinct_gbk.node().set_debug_info("Distinct")
    return distinct_gbk.apply(transforms.extract_keys)
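
In plain Python, the group-by-value-then-extract-keys pattern amounts to deduplication (distinct_local is a hypothetical sketch; Bigflow does not guarantee input order):

def distinct_local(values):
    seen = set()
    out = []
    for v in values:
        if v not in seen:        # each distinct value becomes one group key
            seen.add(v)
            out.append(v)
    return out

assert distinct_local([3, 1, 3, 2, 1]) == [3, 1, 2]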
Example #9
def max(pvalue, key=None):
    """
    Implementation of transforms.max()
    """
    if not isinstance(pvalue, pcollection.PCollection):
        raise ValueError("Invalid argument: pvalue must be of type PCollection")
    if utils.is_infinite(pvalue):
        raise ValueError("max not supported infinite PType")

    if key is None:
        return transforms.reduce(pvalue, lambda x, y: x if x > y else y)
    else:
        return transforms.reduce(pvalue, lambda x, y: x if key(x) > key(y) else y)
Example #10
def min(pvalue, key=None):
    """
    Implementation of transforms.min()
    """
    if not isinstance(pvalue, pcollection.PCollection):
        raise ValueError(
            "Invalid argument: pvalue must be of type PCollection")
    if utils.is_infinite(pvalue):
        raise ValueError("min not supported infinite PType")

    if key is None:
        return reduce.reduce(pvalue, lambda x, y: x if x < y else y)
    else:
        return reduce.reduce(pvalue, lambda x, y: x if key(x) < key(y) else y)
Example #11
def reduce(ptype, fn, *side_inputs, **kargs):
    """
    inner function
    """

    if utils.is_infinite(ptype):
        raise ValueError("reduce not supported infinite PType")

    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(
            size / ptype.node().size())
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)

    objector = kargs.get('serde',
                         ptype.serde())  # by default, use the same serde as the input
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(
        side_inputs)
    partial_helper = side_input_util.SideInputsUtil(ptype, side_inputs)

    partial_node = partial_helper.process_with_side_inputs()\
        .by(entity.ReduceProcessor(fn).set_side_inputs(*side_inputs))\
        .as_type(objector)\
        .set_debug_info("ReducePartial: " + repr(fn)) \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(partial_size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    non_partial_helper = side_input_util.SideInputsUtil(
        partial_node, side_inputs)

    non_partial_node = non_partial_helper.process_with_side_inputs()\
        .by(entity.ReduceProcessor(fn).set_side_inputs(*side_inputs))\
        .as_type(objector)\
        .set_debug_info("Reduce: " + repr(fn)) \
        .set_effective_key_num(0) \
        .set_size(size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    return pobject.PObject(non_partial_node, ptype.pipeline())
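
A local model of the ReducePartial/Reduce pair (reduce_local is hypothetical): each partition is pre-reduced, then the partial results are reduced again, so fn must be associative for the answer to be independent of the partitioning.

from functools import reduce as fold

def reduce_local(partitions, fn):
    partials = [fold(fn, part) for part in partitions]   # "ReducePartial"
    return fold(fn, partials)                            # "Reduce"

assert reduce_local([[1, 2, 3], [4, 5]], lambda x, y: x + y) == 15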
Example #12
def combine(ptype, fn, **kargs):
    """ inner function"""

    if utils.is_infinite(ptype):
        raise ValueError("combine not supported infinite PType")

    objector = kargs.get('serde', ptype.serde()) # default, use the input serde

    pre_combine = kargs.get('pre_combine', True)

    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)

    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)

    def _build_combine_node(from_node, is_partial, size, scale):
        debug_info = ("Partial" if is_partial else "") + "Combine: " + repr(fn)

        result = from_node.process_by(entity.CombineProcessor(fn).set_side_inputs(ptype)) \
            .as_type(objector) \
            .set_debug_info(debug_info) \
            .set_effective_key_num(0) \
            .set_size(size, scale) \
            .set_memory(memory) \
            .set_cpu(cpu) \
            .input(0).prepare_before_processing().done()

        if is_partial:
            result = result.input(0).allow_partial_processing().done()
        return result

    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())

    combined_node = ptype.node()

    if pre_combine:
        combined_node = _build_combine_node(combined_node, True, partial_size, partial_scale)
        combined_node = _build_combine_node(combined_node, False, partial_size, partial_scale)
    else:
        combined_node = _build_combine_node(combined_node, False, size, scale)

    return pobject.PObject(combined_node, ptype.pipeline())
Example #13
    def _transform_output_format(self, pcollection, output_format):
        from bigflow.util import path_util
        from bigflow.util import utils

        format_type = output_format.get_entity_name()
        ugi = output_format.ugi if hasattr(output_format, "ugi") else None
        if format_type == "TextOutputFormat" or \
                format_type == "SequenceFileAsBinaryOutputFormat" or \
                format_type == "ParquetOutputFormat" or \
                format_type == "PartitionedParquetOutputFormat":
            uri = path_util.to_abs_local_path(output_format.path)
            if utils.is_infinite(pcollection):
                output_format.path = self._toft_path(uri, ugi)
            else:
                output_format.path = self._toft_path(self._tmp_output_path(uri), ugi)
                output_format.commit_path = self._toft_path(uri, ugi)

        return output_format
Example #14
def aggregate(ptype, zero, aggregate_fn, combine_fn, *side_inputs, **kargs):
    """
    Implementation of transforms.aggregate()
    """

    if utils.is_infinite(ptype):
        raise ValueError("aggregate not supported infinite PType")

    objector = kargs.get('serde', ptype.pipeline().default_objector())
    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)

    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    partial_helper = side_input_util.SideInputsUtil(ptype, side_inputs)

    partial_node = partial_helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(zero, aggregate_fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("AggregatePartial") \
        .input(-1).allow_partial_processing().done()\
        .set_effective_key_num(0) \
        .set_size(partial_size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    non_partial_helper = side_input_util.SideInputsUtil(partial_node, side_inputs)

    non_partial_node = non_partial_helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(zero, combine_fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("Aggregate") \
        .set_effective_key_num(0) \
        .set_size(size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    return pobject.PObject(non_partial_node, ptype.pipeline())
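
A local model of the two AccumulateProcessor stages (aggregate_local is hypothetical): aggregate_fn folds each partition's records starting from zero, and combine_fn merges the per-partition partials.

def aggregate_local(partitions, zero, aggregate_fn, combine_fn):
    partials = []
    for part in partitions:                  # "AggregatePartial"
        acc = zero
        for record in part:
            acc = aggregate_fn(acc, record)
        partials.append(acc)
    result = zero                            # final "Aggregate"
    for partial in partials:
        result = combine_fn(result, partial)
    return result

# Counting records: count within each partition, then sum the counts.
assert aggregate_local([[1, 2], [3]], 0,
                       lambda acc, _: acc + 1,
                       lambda a, b: a + b) == 3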
Example #15
def accumulate(pvalue, zero, accumulator, *side_inputs, **kargs):
    """
    Implementation of transforms.accumulate()
    """

    if utils.is_infinite(pvalue):
        raise ValueError("accumulate not supported infinite PType")

    objector = kargs.get('serde', pvalue.pipeline().default_objector())

    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)
    result_node = helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(entity.Functor.of(zero), entity.Functor.of(accumulator))
            .set_side_inputs(*side_inputs)) \
        .set_debug_info("accumulate(" + repr(zero) + ',' + repr(accumulator)) \
        .as_type(objector) \
        .set_effective_key_num(0) \
        .set_size(kargs.get('output_size', None), kargs.get('scale', 0.1)) \
        .set_memory(kargs.get('memory_limit', -1)) \
        .set_cpu(kargs.get('cpu_limit', -1))

    return pobject.PObject(result_node, pvalue.pipeline())
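
A local model of the single AccumulateProcessor (accumulate_local is hypothetical): unlike aggregate(), there is no partial stage, only one fold from zero.

def accumulate_local(records, zero, accumulator):
    acc = zero
    for record in records:
        acc = accumulator(acc, record)
    return acc

assert accumulate_local([1, 2, 3], 0, lambda acc, x: acc + x) == 6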
Example #16
def sort_by(pvalue, key_read_fn, reverse=False):
    if utils.is_infinite(pvalue):
        raise ValueError("sort_by not supported infinite PType")
    result_node = pvalue.node().sort_by(DefaultKeyReader(key_read_fn, reverse))
    return bigflow.pcollection.PCollection(result_node, pvalue.pipeline())
Example #17
def pipe(pvalue, command, **options):
    """
    Implementation of transforms.pipe()
    :param pvalue: PType
    :return: PCollection
    """
    if utils.is_infinite(pvalue):
        raise ValueError("pipe not supported infinite PType")

    if isinstance(pvalue, ptable.PTable):
        def merge_value(pvalue):
            """ inner """
            if isinstance(pvalue, ptable.PTable):
                return pvalue.apply_values(merge_value)
            else:
                return pvalue.apply(transforms.to_list_pobject)

        def merge_kv(tp, level):
            """ inner """
            kvs = []
            for i in xrange(level):
                kvs.append(tp[0])
                tp = tp[1]
            kvs.append(tp)
            return kvs

        level = pvalue.nested_level() + 1

        transformed = pvalue.apply(merge_value).flatten() \
            .apply(transforms.map, lambda kv: merge_kv(kv, level),
                serde=serde.of(pvalue.key_serdes() + [pvalue.serde()]))

        options['input_fields_num'] = level + 1
        options['is_nested_ptype'] = True
    else:
        transformed = pvalue

    output_fields_num = options.get('output_fields_num', 1)
    if output_fields_num == 1:
        options['serde'] = serde.StrSerde()
    else:
        serdes = [serde.StrSerde()] * output_fields_num
        options['serde'] = serde.TupleSerde(*serdes)

    scale = options.get('scale', 1.0)
    size = options.get('output_size', None)
    memory = options.get('memory_limit', -1)
    cpu = options.get('cpu_limit', -1)

    result_node = transformed.node() \
        .process_by(entity.PipeProcessor(command, **options)) \
        .as_type(options['serde']) \
        .set_debug_info("Pipe: " + repr(command)) \
        .ignore_group() \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(size, scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    return pcollection.PCollection(result_node, transformed.pipeline())
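
A rough local analogue of PipeProcessor using subprocess (pipe_local is hypothetical, written for Python 3 although the surrounding code targets Python 2): each record goes to the command's stdin as one line, and each stdout line comes back as a string record.

import subprocess

def pipe_local(records, command):
    data = "\n".join(str(r) for r in records) + "\n"
    proc = subprocess.run(command, shell=True, input=data,
                          capture_output=True, text=True)
    return proc.stdout.splitlines()

# On a POSIX system: pipe_local([3, 1, 2], "sort -n") -> ["1", "2", "3"]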
Example #18
def _is_flat_ptype(ptype):
    is_flat = isinstance(ptype, (pcollection.PCollection, pobject.PObject))
    return is_flat and not utils.is_infinite(ptype)