def subtract(a, b):
    """ Implementation of transforms.subtract() """
    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("subtract is not supported on infinite PType")

    def filter_if_b_is_empty(a, b):
        return a.filter(lambda input, is_empty: is_empty, b.is_empty())

    a = a.map(lambda x: (x, None))
    b = b.map(lambda x: (x, None))
    return a.cogroup(b).apply_values(filter_if_b_is_empty).flatten() \
        .map(lambda kv: kv[0])
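# Illustrative usage sketch, not part of the source module: it assumes this
# implementation backs a public PCollection.subtract() method and that
# Pipeline.parallelize() / PCollection.get() behave as in the Bigflow docs.
def _subtract_usage(pipeline):
    a = pipeline.parallelize([1, 2, 3, 3])
    b = pipeline.parallelize([3, 4])
    # cogroup buckets `a` and `b` by element; only elements whose `b`-side
    # bucket is empty survive, so every 3 is dropped, duplicates included
    return a.subtract(b).get()  # expected: [1, 2], in some order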
def diff(a, b):
    """ Implementation of transforms.diff() """
    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("diff is not supported on infinite PType")

    def filter_count_ne(a, b):
        return a.count() \
            .flat_map(lambda c1, c2: [(c1, c2)], b.count(),
                      serde=serde.of((int, int))) \
            .filter(lambda tp: tp[0] != tp[1])

    a = a.map(lambda x: (x, None),
              serde=serde.tuple_of(a.serde(), serde.of(int)))
    b = b.map(lambda x: (x, None),
              serde=serde.tuple_of(b.serde(), serde.of(int)))
    return a.cogroup(b).apply_values(filter_count_ne).flatten()
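# Illustrative usage sketch (same assumptions as the subtract sketch above).
def _diff_usage(pipeline):
    a = pipeline.parallelize([1, 1, 2])
    b = pipeline.parallelize([1, 2, 2])
    # emits (element, (count_in_a, count_in_b)) for every element whose
    # per-side counts differ; elements with equal counts are filtered out
    return a.diff(b).get()  # expected: [(1, (2, 1)), (2, (1, 2))]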
def first(ptype):
    """ Implementation of transforms.first() """
    if utils.is_infinite(ptype):
        raise ValueError("first is not supported on infinite PType")
    return bigflow.pobject.PObject(ptype.take(1).node(), ptype.pipeline())
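# Illustrative usage sketch (same assumptions as the subtract sketch above).
def _first_usage(pipeline):
    p = pipeline.parallelize([9, 8, 7])
    # take(1) narrows the PCollection to one record; re-wrapping its node in
    # a PObject yields a single value rather than a one-element collection
    return p.first().get()  # expected: some element of p, e.g. 9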
def to_list_pobject(pvalue, **options):
    """
    Transform to_list_pobject implementation

    :param pvalue: PCollection/PObject
    :return: PObject
    """
    def __initializer(emitter):
        return list()

    def __transformer(status, emitter, record):
        status.append(copy.deepcopy(record))
        return status

    def __finalizer(status, emitter):
        emitter.emit(status)

    if utils.is_infinite(pvalue):
        raise ValueError("to_list_pobject is not supported on infinite PType")
    elif isinstance(pvalue, pobject.PObject):
        result = pvalue.map(lambda x: [x])
    elif isinstance(pvalue, ptable.PTable):
        raise ValueError("to_list_pobject can only be applied to a PCollection/PObject")
    else:
        result = pvalue.transform(__initializer, __transformer, __finalizer,
                                  serde=serde.list_of(pvalue.serde()))
    return pobject.PObject(result.node(), result.pipeline())
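# Illustrative usage sketch (same assumptions as above; the exact public
# method name `to_list_pobject` is assumed, not confirmed by this module).
def _to_list_usage(pipeline):
    p = pipeline.parallelize([1, 2, 3])
    # the transform accumulates deep copies of every record into one Python
    # list and emits it as a single PObject
    return p.to_list_pobject().get()  # expected: [1, 2, 3], in some order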
def _transform_output_format(self, pcollection, output_format):
    from bigflow.util import path_util
    from bigflow.util import utils
    format_type = output_format.get_entity_name()
    # todo: extract ugi from output_format, support multiple clusters and ugis
    if format_type == "TextOutputFormat" or \
            format_type == "SequenceFileAsBinaryOutputFormat":
        uri = path_util.to_abs_local_path(output_format.path)
        if utils.is_infinite(pcollection):
            if not path_util.is_hdfs_path(uri):
                raise ValueError("Writing an infinite PType to a local file "
                                 "is not supported in MRPipeline")
            else:
                output_format.path = self._toft_path(uri)
        else:
            if not path_util.is_hdfs_path(uri):
                # The user is writing a local file through MRPipeline; replace the
                # original uri with a temp path on HDFS and dump the output to the
                # local FS after the job is done.
                hdfs_uri = self._tmp_hdfs_path(uri)
                output_format.path = self._toft_path(hdfs_uri)
                self._local_uri_infos.append({
                    'local_uri': uri,
                    'hdfs_uri': hdfs_uri,
                    'overwrite': output_format.overwrite,
                })
                logger.debug("Write file to HDFS path: %s and dump it after job done" % hdfs_uri)
                self._remote_temp_files.append(hdfs_uri)
            else:
                output_format.path = self._toft_path(self._tmp_output_path(uri))
                output_format.commit_path = self._toft_path(uri)
    return output_format
def intersection(a, b, output_duplicated=False):
    """ Implementation of transforms.intersection() """
    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("intersection is not supported on infinite PType")

    def filter_if_neither_empty(a, b):
        return a.is_empty() \
            .flat_map(lambda x: [x]) \
            .filter(lambda a_empty, b_empty: not a_empty and not b_empty,
                    b.is_empty())

    a = a.map(lambda x: (x, None))
    b = b.map(lambda x: (x, None))
    if output_duplicated:
        filter_rule = lambda a, b: a.take(b.count())
    else:
        filter_rule = filter_if_neither_empty
    return a.cogroup(b).apply_values(filter_rule).flatten() \
        .map(lambda kv: kv[0])
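# Illustrative usage sketch (same assumptions as the subtract sketch above).
def _intersection_usage(pipeline):
    a = pipeline.parallelize([1, 2, 2, 3])
    b = pipeline.parallelize([2, 2, 2, 3])
    # by default each common element is emitted once; with
    # output_duplicated=True it appears min(count_in_a, count_in_b) times
    return a.intersection(b).get()  # expected: [2, 3], in some order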
def take(pvalue, n, **options):
    """ Implementation of transforms.take() """
    if utils.is_infinite(pvalue):
        raise ValueError("take is not supported on infinite PType")
    objector = options.get('serde', pvalue.serde())
    scale = options.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = options.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = pvalue.node().size() * math.sqrt(size / pvalue.node().size())

    if isinstance(n, pobject.PObject):
        # treat the PObject param as a side input
        partial_helper = side_input_util.SideInputsUtil(pvalue, (n, ))
        partial_node = partial_helper.process_with_side_inputs() \
            .by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("TakePartial: " + str(n.node())) \
            .set_effective_key_num(0) \
            .input(-1).allow_partial_processing() \
            .done() \
            .set_size(partial_size, partial_scale)

        partial = pcollection.PCollection(partial_node, pvalue.pipeline())
        result_helper = side_input_util.SideInputsUtil(partial, (n, ))
        result_node = result_helper.process_with_side_inputs() \
            .by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("Take: " + str(n.node())) \
            .set_effective_key_num(0) \
            .set_size(size, partial_scale)
    elif isinstance(n, (int, long)):
        result_node = pvalue.node() \
            .process_by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("TakePartial: %d" % n) \
            .input(0).allow_partial_processing().done() \
            .set_size(partial_size, partial_scale) \
            .process_by(entity.TakeProcessor(n)) \
            .as_type(objector) \
            .set_debug_info("Take: %d" % n) \
            .set_effective_key_num(0) \
            .set_size(size, partial_scale)
    else:
        raise ValueError("Wrong argument: n must be an int/long or a PObject")

    return pcollection.PCollection(result_node, pvalue.pipeline())
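# Illustrative usage sketch (same assumptions as the subtract sketch above).
def _take_usage(pipeline):
    p = pipeline.parallelize([5, 6, 7, 8])
    # a partial Take runs per partition, then a final Take trims the merged
    # result; `n` may also be a PObject, passed through as a side input
    return p.take(2).get()  # expected: some 2 elements of p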
def distinct(ptype, **kargs):
    """ distinct() implementation """
    if utils.is_infinite(ptype):
        raise ValueError("distinct is not supported on infinite PType")
    distinct_gbk = ptype.group_by(lambda x: x, lambda x: None)
    distinct_gbk.node().set_debug_info("Distinct")
    return distinct_gbk.apply(transforms.extract_keys)
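# Illustrative usage sketch (same assumptions as the subtract sketch above).
def _distinct_usage(pipeline):
    p = pipeline.parallelize([1, 1, 2, 2, 3])
    # group_by(identity) buckets records by the element itself;
    # extract_keys then emits each key exactly once
    return p.distinct().get()  # expected: [1, 2, 3], in some order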
def max(pvalue, key=None):
    """ Implementation of transforms.max() """
    if not isinstance(pvalue, pcollection.PCollection):
        raise ValueError("Invalid argument: pvalue must be of type PCollection")
    if utils.is_infinite(pvalue):
        raise ValueError("max is not supported on infinite PType")
    if key is None:
        return transforms.reduce(pvalue, lambda x, y: x if x > y else y)
    else:
        return transforms.reduce(pvalue, lambda x, y: x if key(x) > key(y) else y)
def min(pvalue, key=None):
    """ Implementation of transforms.min() """
    if not isinstance(pvalue, pcollection.PCollection):
        raise ValueError("Invalid argument: pvalue must be of type PCollection")
    if utils.is_infinite(pvalue):
        raise ValueError("min is not supported on infinite PType")
    if key is None:
        return reduce.reduce(pvalue, lambda x, y: x if x < y else y)
    else:
        return reduce.reduce(pvalue, lambda x, y: x if key(x) < key(y) else y)
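# Illustrative usage sketch covering both max() and min() (same assumptions
# as the subtract sketch above; the `key` keyword mirrors the signatures here).
def _max_min_usage(pipeline):
    p = pipeline.parallelize(['a', 'bb', 'ccc'])
    longest = p.max(key=len)    # expected: 'ccc'
    shortest = p.min(key=len)   # expected: 'a'
    return longest.get(), shortest.get()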
def reduce(ptype, fn, *side_inputs, **kargs):
    """ Implementation of transforms.reduce() """
    if utils.is_infinite(ptype):
        raise ValueError("reduce is not supported on infinite PType")
    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)
    objector = kargs.get('serde', ptype.serde())  # use the same serde as the input

    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    partial_helper = side_input_util.SideInputsUtil(ptype, side_inputs)
    partial_node = partial_helper.process_with_side_inputs() \
        .by(entity.ReduceProcessor(fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("ReducePartial: " + repr(fn)) \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(partial_size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    non_partial_helper = side_input_util.SideInputsUtil(partial_node, side_inputs)
    non_partial_node = non_partial_helper.process_with_side_inputs() \
        .by(entity.ReduceProcessor(fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("Reduce: " + repr(fn)) \
        .set_effective_key_num(0) \
        .set_size(size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    return pobject.PObject(non_partial_node, ptype.pipeline())
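# Illustrative usage sketch (same assumptions as the subtract sketch above).
def _reduce_usage(pipeline):
    p = pipeline.parallelize([1, 2, 3, 4])
    # the partial node pre-reduces within each partition; the non-partial
    # node folds the partial results into a single PObject
    return p.reduce(lambda x, y: x + y).get()  # expected: 10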
def combine(ptype, fn, **kargs):
    """ Implementation of transforms.combine() """
    if utils.is_infinite(ptype):
        raise ValueError("combine is not supported on infinite PType")
    objector = kargs.get('serde', ptype.serde())  # by default, use the input serde
    pre_combine = kargs.get('pre_combine', True)
    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)

    def _build_combine_node(from_node, is_partial, size, scale):
        debug_info = ("Partial" if is_partial else "") + "Combine: " + repr(fn)
        result = from_node.process_by(entity.CombineProcessor(fn).set_side_inputs(ptype)) \
            .as_type(objector) \
            .set_debug_info(debug_info) \
            .set_effective_key_num(0) \
            .set_size(size, scale) \
            .set_memory(memory) \
            .set_cpu(cpu) \
            .input(0).prepare_before_processing().done()
        if is_partial:
            result = result.input(0).allow_partial_processing().done()
        return result

    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())

    combined_node = ptype.node()
    if pre_combine:
        combined_node = _build_combine_node(combined_node, True, partial_size, partial_scale)
        combined_node = _build_combine_node(combined_node, False, partial_size, partial_scale)
    else:
        combined_node = _build_combine_node(combined_node, False, size, scale)

    return pobject.PObject(combined_node, ptype.pipeline())
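# Illustrative usage sketch (same assumptions as the subtract sketch above).
def _combine_usage(pipeline):
    p = pipeline.parallelize([1, 2, 3, 4])
    # unlike reduce's binary fn, combine's fn receives an iterable of records;
    # with pre_combine=True it runs per partition and again over the partials
    return p.combine(sum).get()  # expected: 10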
def _transform_output_format(self, pcollection, output_format):
    from bigflow.util import path_util
    from bigflow.util import utils
    format_type = output_format.get_entity_name()
    ugi = output_format.ugi if hasattr(output_format, "ugi") else None
    if format_type == "TextOutputFormat" or \
            format_type == "SequenceFileAsBinaryOutputFormat" or \
            format_type == "ParquetOutputFormat" or \
            format_type == "PartitionedParquetOutputFormat":
        uri = path_util.to_abs_local_path(output_format.path)
        if utils.is_infinite(pcollection):
            # infinite output is written in place; finite output goes to a
            # temp path and is committed to the final uri after the job succeeds
            output_format.path = self._toft_path(uri, ugi)
        else:
            output_format.path = self._toft_path(self._tmp_output_path(uri), ugi)
            output_format.commit_path = self._toft_path(uri, ugi)
    return output_format
def aggregate(ptype, zero, aggregate_fn, combine_fn, *side_inputs, **kargs):
    """ Implementation of transforms.aggregate() """
    if utils.is_infinite(ptype):
        raise ValueError("aggregate is not supported on infinite PType")
    objector = kargs.get('serde', ptype.pipeline().default_objector())
    scale = kargs.get('scale', 0.1)
    partial_scale = math.sqrt(scale)
    size = kargs.get('output_size', None)
    if size is None:
        partial_size = None
    else:
        partial_size = ptype.node().size() * math.sqrt(size / ptype.node().size())
    memory = kargs.get('memory_limit', -1)
    cpu = kargs.get('cpu_limit', -1)

    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    partial_helper = side_input_util.SideInputsUtil(ptype, side_inputs)
    partial_node = partial_helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(zero, aggregate_fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("AggregatePartial") \
        .input(-1).allow_partial_processing().done() \
        .set_effective_key_num(0) \
        .set_size(partial_size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    non_partial_helper = side_input_util.SideInputsUtil(partial_node, side_inputs)
    non_partial_node = non_partial_helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(zero, combine_fn).set_side_inputs(*side_inputs)) \
        .as_type(objector) \
        .set_debug_info("Aggregate") \
        .set_effective_key_num(0) \
        .set_size(size, partial_scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    return pobject.PObject(non_partial_node, ptype.pipeline())
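# Illustrative usage sketch (same assumptions as the subtract sketch above).
def _aggregate_usage(pipeline):
    p = pipeline.parallelize([1, 2, 3, 4])
    return p.aggregate(0,
                       lambda acc, x: acc + x,    # aggregate_fn, per partition
                       lambda a, b: a + b).get()  # combine_fn, across partials
    # expected: 10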
def accumulate(pvalue, zero, accumulator, *side_inputs, **kargs):
    """ Implementation of transforms.accumulate() """
    if utils.is_infinite(pvalue):
        raise ValueError("accumulate is not supported on infinite PType")
    objector = kargs.get('serde', pvalue.pipeline().default_objector())
    side_inputs = side_input_util.SideInputsUtil.get_dealt_side_inputs_tuple(side_inputs)
    helper = side_input_util.SideInputsUtil(pvalue, side_inputs)

    result_node = helper.process_with_side_inputs() \
        .by(entity.AccumulateProcessor(entity.Functor.of(zero),
                                       entity.Functor.of(accumulator))
            .set_side_inputs(*side_inputs)) \
        .set_debug_info("accumulate(" + repr(zero) + ', ' + repr(accumulator) + ')') \
        .as_type(objector) \
        .set_effective_key_num(0) \
        .set_size(kargs.get('output_size', None), kargs.get('scale', 0.1)) \
        .set_memory(kargs.get('memory_limit', -1)) \
        .set_cpu(kargs.get('cpu_limit', -1))

    return pobject.PObject(result_node, pvalue.pipeline())
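# Illustrative usage sketch (same assumptions as the subtract sketch above).
def _accumulate_usage(pipeline):
    p = pipeline.parallelize([1, 2, 3])
    # unlike aggregate there is no partial stage here: one accumulator
    # folds every record into the zero value
    return p.accumulate(0, lambda acc, x: acc + x).get()  # expected: 6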
def sort_by(pvalue, key_read_fn, reverse=False):
    """ Implementation of transforms.sort_by() """
    if utils.is_infinite(pvalue):
        raise ValueError("sort_by is not supported on infinite PType")
    result_node = pvalue.node().sort_by(DefaultKeyReader(key_read_fn, reverse))
    return bigflow.pcollection.PCollection(result_node, pvalue.pipeline())
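# Illustrative usage sketch (same assumptions as the subtract sketch above;
# the `reverse` keyword mirrors the signature here).
def _sort_by_usage(pipeline):
    p = pipeline.parallelize([(1, 'b'), (0, 'a'), (2, 'c')])
    return p.sort_by(lambda t: t[0], reverse=True).get()
    # expected: [(2, 'c'), (1, 'b'), (0, 'a')]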
def pipe(pvalue, command, **options):
    """
    Transform pipe implementation

    :param pvalue: PType
    :return: PCollection
    """
    if utils.is_infinite(pvalue):
        raise ValueError("pipe is not supported on infinite PType")
    if isinstance(pvalue, ptable.PTable):
        def merge_value(pvalue):
            """ inner """
            if isinstance(pvalue, ptable.PTable):
                return pvalue.apply_values(merge_value)
            else:
                return pvalue.apply(transforms.to_list_pobject)

        def merge_kv(tp, level):
            """ inner """
            kvs = []
            for i in xrange(level):
                kvs.append(tp[0])
                tp = tp[1]
            kvs.append(tp)
            return kvs

        level = pvalue.nested_level() + 1
        transformed = pvalue.apply(merge_value).flatten() \
            .apply(transforms.map, lambda kv: merge_kv(kv, level),
                   serde=serde.of(pvalue.key_serdes() + [pvalue.serde()]))
        options['input_fields_num'] = level + 1
        options['is_nested_ptype'] = True
    else:
        transformed = pvalue

    output_fields_num = options.get('output_fields_num', 1)
    if output_fields_num == 1:
        options['serde'] = serde.StrSerde()
    else:
        serdes = [serde.StrSerde()] * output_fields_num
        options['serde'] = serde.TupleSerde(*serdes)

    scale = options.get('scale', 1.0)
    size = options.get('output_size', None)
    memory = options.get('memory_limit', -1)
    cpu = options.get('cpu_limit', -1)
    result_node = transformed.node() \
        .process_by(entity.PipeProcessor(command, **options)) \
        .as_type(options['serde']) \
        .set_debug_info("Pipe: " + repr(command)) \
        .ignore_group() \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(size, scale) \
        .set_memory(memory) \
        .set_cpu(cpu)
    return pcollection.PCollection(result_node, transformed.pipeline())
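# Illustrative usage sketch (same assumptions as the subtract sketch above;
# exact stdin/stdout framing of PipeProcessor is assumed, not shown here).
def _pipe_usage(pipeline):
    p = pipeline.parallelize(['banana', 'apple'])
    # each record is fed to the subprocess's stdin; each line the command
    # prints becomes an output record, serialized as str
    return p.pipe('sort').get()
    # expected (single partition): ['apple', 'banana']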
def _is_flat_ptype(ptype):
    """ Return True if `ptype` is a finite PCollection or PObject. """
    is_flat = isinstance(ptype, (pcollection.PCollection, pobject.PObject))
    return is_flat and not utils.is_infinite(ptype)