def __left_join_in_every_group(*pcollections, **options):
    """
    Left-join the grouped pcollections pairwise: join the first two with
    OneSideJoinFn, then fold each remaining pcollection into the growing
    result tuple, widening the tuple serde at every step.
    """
    serde_list = [pcollections[0].serde(), pcollections[1].serde()]
    joined = pcollections[0].flat_map(
        entity.OneSideJoinFn(),
        pcollections[1],
        serde=serde.tuple_of(*serde_list),
        **options)
    # Append every remaining pcollection to the accumulated tuple.
    for extra in pcollections[2:]:
        serde_list.append(extra.serde())
        joined = joined.flat_map(
            _one_side_join_append_tuple,
            extra,
            serde=serde.tuple_of(*serde_list),
            **options)
    return joined
def __left_join_in_every_group(*pcollections, **options):
    """
    Left-join the grouped pcollections: the first two are joined with
    OneSideJoinFn, then each later pcollection is appended to the result
    tuple in turn, with the tuple serde extended accordingly.
    """
    accumulated_serdes = (pcollections[0].serde(), pcollections[1].serde())
    result = pcollections[0].flat_map(
        entity.OneSideJoinFn(), pcollections[1],
        serde=serde.tuple_of(*accumulated_serdes), **options)
    idx = 2
    while idx < len(pcollections):
        accumulated_serdes += (pcollections[idx].serde(),)
        result = result.flat_map(
            _one_side_join_append_tuple, pcollections[idx],
            serde=serde.tuple_of(*accumulated_serdes), **options)
        idx += 1
    return result
def flatten(pvalue, **kargs):
    """
    Transform flatten implementation: collapse a nested PTable back into a
    flat PCollection of ((key..., value)) tuples. Non-PTable inputs are
    returned unchanged.

    :param pvalue: PTable to flatten (any other PType passes through as-is)
    :return: flattened PCollection
    """
    def _flatten_once(node, key_serde, value_serde):
        # Unwrap one nesting level: re-attach the group key to each value.
        return node.process_by(entity.FlattenProcessor(key_serde)) \
            .as_type(value_serde) \
            .set_debug_info("FlattenProcessor") \
            .input(0).allow_partial_processing() \
            .done() \
            .set_size(scale_factor=1.25) \
            .leave_scope()

    if isinstance(pvalue, ptable.PTable):
        key_serdes = pvalue.key_serdes()
        value_serde = pvalue.serde()
        assert len(key_serdes) == pvalue.nested_level() + 1
        # Innermost key first: each pass wraps the value as (key, value).
        it = reversed(key_serdes)
        node = pvalue.node()
        for _ in range(pvalue.nested_level() + 1):
            # next(it) instead of it.next(): the built-in works on both
            # Python 2 and Python 3; .next() is Python-2 only.
            key_serde = next(it)
            value_serde = serde.tuple_of(key_serde, value_serde)
            node = _flatten_once(node, key_serde, value_serde)
        pvalue = pcollection.PCollection(node, pvalue.pipeline())
    return pvalue
def diff(a, b):
    """
    Implementation of transforms.diff()
    """
    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("diff not supported infinite PType")

    def _keep_unequal_counts(left, right):
        # Pair the per-group counts of both sides and keep only groups
        # where the counts differ.
        paired = left.count().flat_map(
            lambda c1, c2: [(c1, c2)],
            right.count(),
            serde=serde.of((int, int)))
        return paired.filter(lambda tp: tp[0] != tp[1])

    # Tag every element with None so both sides share a (value, None) shape.
    a = a.map(lambda x: (x, None),
              serde=serde.tuple_of(a.serde(), serde.of(int)))
    b = b.map(lambda x: (x, None),
              serde=serde.tuple_of(b.serde(), serde.of(int)))
    return a.cogroup(b).apply_values(_keep_unequal_counts).flatten()
def test_serde(self):
    """Check that cartesian() derives a tuple serde from its inputs."""
    base = self._pipeline.parallelize([1, 2])
    base = base.map(lambda x: x, serde=serde.of(int))
    plus_one = base.map(lambda x: x + 1, serde=serde.of(int))
    as_str = plus_one.map(lambda x: str(x + 1), serde=serde.of(str))

    pair = base.cartesian(plus_one)
    self.assertEqual(str(serde.tuple_of(int, int)), str(pair.serde()))

    triple = base.cartesian(plus_one, as_str)
    self.assertEqual(str(serde.sample((1, 2, '3'))), str(triple.serde()))
def transform_from_node(self, load_node, pipeline):
    """
    Internal interface.
    """
    from bigflow import ptable

    # Build the raw (key, value) byte-string stream from the load node.
    # The non-repeating path additionally calls ignore_group().
    source = load_node.repeatedly() if self.repeatedly else load_node
    builder = source.process_by(_KVFromBinaryRecord()) \
        .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
        .set_effective_key_num(0)
    if not self.repeatedly:
        builder = builder.ignore_group()
    raw_node = builder.input(0).allow_partial_processing().done()
    raw_node.set_size(load_node.size())

    records = pcollection.PCollection(raw_node, pipeline)
    tserde = self._options.get('serde', pipeline.default_objector())
    if self.kv_deserializer is not None:
        # User-supplied deserializer maps (key, value) to the target type.
        records = records.map(self.kv_deserializer, serde=tserde)
    else:
        # Default: deserialize only the value field (index 1).
        is_serialize = False
        deserializer = entity.SerdeWrapper(tserde, is_serialize, 1)
        records = records.map(deserializer, serde=tserde)
    if self._options.get('partitioned'):
        return ptable.PTable(records, key_serde=serde.StrSerde())
    return pcollection.PCollection(records.node().leave_scope(), pipeline)
def __full_join_in_every_group(*pcollections, **options):
    # Full (outer) join of two or more grouped pcollections: records are
    # combined into growing tuples, with None padding for a missing side.
    def transform_append_tuple(left_table_empty, emitter, record, side_input):
        # Pair each already-joined `record` tuple with every element of
        # `side_input`; emit (record + (None,)) when side_input is empty.
        # Returns False: the left side produced at least one record.
        left_table_empty = False
        right_table_empty = True
        for e in side_input:
            right_table_empty = False
            emitter.emit(record + (e, ))
        if right_table_empty:
            emitter.emit(record + (None, ))
        return left_table_empty
    def finalize_append_tuple(left_table_empty, emitter, side_input):
        # If the left side emitted nothing, emit each right-side element
        # left-padded with None placeholders.
        if left_table_empty:
            # NOTE(review): `i` is the loop variable of the `for` below,
            # captured late-bound by this closure; presumably the framework
            # serializes the closure at each transform() call so the then-
            # current `i` (= width of the left tuple) is used — TODO confirm.
            len_tuple = i
            nones_tuple = tuple([None for x in range(len_tuple)])
            for e in side_input:
                emitter.emit(nones_tuple + (e, ))
    serdes = (pcollections[0].serde(), pcollections[1].serde())
    # Join the leading pair using the framework-provided full-join fns.
    current = pcollections[0].transform(entity.FullJoinInitializeFn(),
                                        entity.FullJoinTransformFn(),
                                        entity.FullJoinFinalizeFn(),
                                        pcollections[1],
                                        serde=serde.tuple_of(*serdes),
                                        **options)
    # Fold each remaining pcollection into the accumulated tuple.
    for i in range(2, len(pcollections)):
        serdes = serdes + (pcollections[i].serde(), )
        current = current.transform(entity.FullJoinInitializeFn(),
                                    transform_append_tuple,
                                    finalize_append_tuple,
                                    pcollections[i],
                                    serde=serde.tuple_of(*serdes),
                                    **options)
    return current
def __full_join_in_every_group(*pcollections, **options):
    # Full (outer) join of two or more grouped pcollections: records are
    # combined into growing tuples, with None padding for a missing side.
    def transform_append_tuple(left_table_empty, emitter, record, side_input):
        # Pair each already-joined `record` tuple with every element of
        # `side_input`; emit (record + (None,)) when side_input is empty.
        # Returns False: the left side produced at least one record.
        left_table_empty = False
        right_table_empty = True
        for e in side_input:
            right_table_empty = False
            emitter.emit(record + (e,))
        if right_table_empty:
            emitter.emit(record + (None,))
        return left_table_empty
    def finalize_append_tuple(left_table_empty, emitter, side_input):
        # If the left side emitted nothing, emit each right-side element
        # left-padded with None placeholders.
        if left_table_empty:
            # NOTE(review): `i` is the loop variable of the `for` below,
            # captured late-bound by this closure; presumably the framework
            # serializes the closure at each transform() call so the then-
            # current `i` (= width of the left tuple) is used — TODO confirm.
            len_tuple = i
            nones_tuple = tuple([None for x in range(len_tuple)])
            for e in side_input:
                emitter.emit(nones_tuple + (e,))
    serdes = (pcollections[0].serde(), pcollections[1].serde())
    # Join the leading pair using the framework-provided full-join fns.
    current = pcollections[0].transform(entity.FullJoinInitializeFn(),
                                        entity.FullJoinTransformFn(),
                                        entity.FullJoinFinalizeFn(),
                                        pcollections[1],
                                        serde = serde.tuple_of(*serdes),
                                        **options)
    # Fold each remaining pcollection into the accumulated tuple.
    for i in range(2, len(pcollections)):
        serdes = serdes + (pcollections[i].serde(), )
        current = current.transform(
            entity.FullJoinInitializeFn(),
            transform_append_tuple,
            finalize_append_tuple,
            pcollections[i],
            serde = serde.tuple_of(*serdes),
            **options)
    return current