def transform_from_node(self, load_node, pipeline):
    """Internal interface.

    Builds a PType of text lines from a loader node.
    """
    from bigflow import ptable

    # Both branches apply exactly the same processing chain; only the
    # source node differs (repeated reading vs. a single pass).
    source = load_node.repeatedly() if self.repeatedly else load_node
    transformed = source \
        .process_by(_TextFromRecord()) \
        .as_type(serde.StrSerde()) \
        .set_effective_key_num(0) \
        .input(0).allow_partial_processing() \
        .done()
    transformed.set_size(load_node.size())

    if self._options.get('partitioned', False):
        lines = pcollection.PCollection(transformed, pipeline)
        return ptable.PTable(lines, key_serde=serde.StrSerde())
    return pcollection.PCollection(transformed.leave_scope(), pipeline)
def transform_from_node(self, load_node, pipeline):
    """Internal interface.

    Wraps the loader node as a PTable and delegates to the user input
    base's post-processing hook.
    """
    from bigflow import ptable

    raw = pcollection.PCollection(load_node, pipeline)
    grouped = ptable.PTable(raw, key_serde=serde.CPickleSerde())
    return self._user_input_base.post_process(grouped)
def construct(pipeline, node, type, nested_level=None, inner_most_type=None, key_serdes=None):
    """ Construct a PType from a LogicalPlan node

    Args:
        pipeline (Pipeline): the Pipeline constructed PType belongs to
        node (LogicalPlan.Node): node
        type (class): class of PType to construct

    Kwargs:
        nested_level: specify PTable's nested level if PType is a PTable
        inner_most_type: specify PTable's inner-most type if PType is a PTable
        key_serdes: key serdes for each nesting level, outer-most first;
            defaults to the pipeline's default objector on every level

    Returns:
        PType: PType

    Raises:
        ValueError: if inner_most_type is PTable (a PTable cannot have
            another PTable as its inner-most value type)
    """
    if inner_most_type is ptable.PTable:
        raise ValueError("Invalid value type for PTable")
    if type is pobject.PObject:
        pvalue = pobject.PObject(node, pipeline)
    elif type is pcollection.PCollection:
        pvalue = pcollection.PCollection(node, pipeline)
    else:
        if key_serdes is None:
            key_serdes = [pipeline.default_objector()] * (nested_level + 1)
        if nested_level > 0:
            # Peel one nesting level: the outer PTable takes the first key
            # serde, the recursion consumes the rest.
            pvalue = ptable.PTable(
                construct(pipeline, node, type, nested_level - 1,
                          inner_most_type, key_serdes[1:]),
                key_serde=key_serdes[0])
        else:
            # Fix: the inner-most level previously ignored the caller's
            # key serde and fell back to PTable's default; pass
            # key_serdes[0] for consistency with the recursive branch.
            pvalue = ptable.PTable(inner_most_type(node, pipeline),
                                   key_serde=key_serdes[0])
    return pvalue
def group_by_every_record(pvalue, **options):
    """Shuffle the input so that every single record forms its own group."""
    from bigflow import serde

    node = pvalue.node()
    pipeline = pvalue.pipeline()
    # Shuffle within the node's scope, then distribute record-by-record.
    shuffled = node.plan().shuffle(node.scope(), [node]) \
        .node(0).distribute_every()
    return ptable.PTable(
        pcollection.PCollection(shuffled, pipeline),
        key_serde=serde.StrSerde())
def window_into(pvalue, win, **options):
    """Group the records of a PCollection by the given window."""
    pipeline = pvalue.pipeline()
    # Precedence for the key serde: explicit option, then the window's
    # own serde, then the pipeline default.
    key_serde = options.get('key_serde', win.key_serde())
    key_serde = key_serde or pvalue.pipeline().default_objector()
    windowed = node_window_by(
        pvalue.node(), win, options.get('concurrency', None), pipeline)
    return ptable.PTable(
        pcollection.PCollection(windowed, pipeline),
        key_serde=key_serde)
def cogroup(*pcollections, **kargs):
    """ inner function

    Cogroup several PCollections by key: shuffle all inputs into one
    scope and return a PTable whose value is a tuple of PCollections,
    one per input, holding the values that share each key.

    Args:
        *pcollections: the PCollections to cogroup
    Kwargs:
        key_serde: serde for the keys; derived from the first input's
            serde when absent
        value_serdes: list of serdes for the values, one per input
        concurrency: explicit shuffle concurrency

    Raises:
        ValueError: if no argument is given or any argument is not a
            PCollection
    """
    from bigflow import serde
    if len(pcollections) == 0:
        raise ValueError("No argument")
    # Validate before touching .serde()/.pipeline() on the inputs.
    if not all(isinstance(p, pcollection.PCollection) for p in pcollections):
        raise ValueError("cogroup only applied on PCollections")
    pipeline = pcollections[0].pipeline()
    key_serde = kargs.get('key_serde', None)
    if key_serde is None:
        key_serde = serde._key_serde(pcollections[0].serde(),
                                     pipeline.default_objector())
    value_serdes = kargs.get('value_serdes', None)
    if value_serdes is None:
        value_serdes = [serde._value_serde(p.serde(), pipeline.default_objector())
                        for p in pcollections]

    def _make_shuffle(node, value_serde):
        # Match each record's key, then drop the key to keep only values.
        return pcollection.PCollection(
            node.match_by(KeyReader(None, key_serde)), pipeline) \
            .map(entity.ExtractValueFn(), serde=value_serde, scale=0.8)

    plan = pcollections[0].node().plan()
    scope = pcollections[0].node().scope()
    # Materialize a list (the original used map()): the result is passed
    # to shuffle(), sized with len(), and iterated again below — a lazy
    # map object would break on Python 3.
    nodes = [p.node() for p in pcollections]
    shuffle = plan.shuffle(scope, nodes)
    if 'concurrency' in kargs:
        shuffle.with_concurrency(kargs['concurrency'])
    elif pipeline.estimate_concurrency:
        concurrency = sum(node.size() for node in nodes) \
            / pipeline.size_per_concurrency
        shuffle.with_concurrency(concurrency)
    results = [_make_shuffle(shuffle.node(i), value_serdes[i])
               for i in range(len(nodes))]
    return ptable.PTable(tuple(results), key_serde=key_serde)
def group_by(pvalue, key_extractor, value_extractor, **options):
    """Group a PCollection by key.

    Only tuple-pair elements of pvalue are accepted.
    """
    pipeline = pvalue.pipeline()
    key_serde = options.get('key_serde', pipeline.default_objector())
    # Without a value extractor the values keep the element serde;
    # otherwise fall back to the pipeline default.
    if value_extractor is None:
        value_serde = options.get('value_serde', pvalue.serde())
    else:
        value_serde = options.get('value_serde', pipeline.default_objector())
    grouped_node = node_group_by(
        pvalue.node(), key_extractor, value_extractor,
        key_serde, value_serde,
        options.get('concurrency', None), pipeline)
    return ptable.PTable(
        pcollection.PCollection(grouped_node, pipeline),
        key_serde=key_serde)
def transform_from_node(self, load_node, pipeline):
    """Internal interface.

    Builds a PType of deserialized (key, value) records from a loader
    node, applying the configured kv_deserializer or a default value
    deserializer.
    """
    from bigflow import ptable
    if self.repeatedly:
        # Repeated reading of the input source.
        transformed = load_node.repeatedly() \
            .process_by(_KVFromBinaryRecord()) \
            .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
            .set_effective_key_num(0) \
            .input(0).allow_partial_processing() \
            .done()
    else:
        # NOTE(review): this branch additionally calls .ignore_group(),
        # unlike the repeatedly branch above — confirm the asymmetry is
        # intentional.
        transformed = load_node \
            .process_by(_KVFromBinaryRecord()) \
            .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
            .set_effective_key_num(0) \
            .ignore_group() \
            .input(0).allow_partial_processing() \
            .done()
    transformed.set_size(load_node.size())
    transformed = pcollection.PCollection(transformed, pipeline)
    tserde = self._options.get('serde', pipeline.default_objector())
    if self.kv_deserializer is not None:
        # A user-supplied deserializer maps each (key, value) pair to
        # the final record.
        transformed = transformed.map(self.kv_deserializer, serde=tserde)
    else:
        # Default path: wrap the target serde to deserialize only the
        # value part (field index 1) of each pair.
        is_serialize = False
        deserialize = entity.SerdeWrapper(tserde, is_serialize, 1)
        transformed = transformed.map(deserialize, serde=tserde)
    if self._options.get('partitioned'):
        return ptable.PTable(transformed, key_serde=serde.StrSerde())
    return pcollection.PCollection(transformed.node().leave_scope(), pipeline)