Example #1
File: join.py Project: yangwei024/bigflow
def __left_join_in_every_group(*pcollections, **options):
    serdes = (pcollections[0].serde(), pcollections[1].serde())
    current = pcollections[0].flat_map(entity.OneSideJoinFn(),
                                       pcollections[1],
                                       serde=serde.tuple_of(*serdes),
                                       **options)

    # Fold in each remaining PCollection, widening the tuple serde per input.
    for i in range(2, len(pcollections)):
        serdes = serdes + (pcollections[i].serde(),)
        current = current.flat_map(_one_side_join_append_tuple,
                                   pcollections[i],
                                   serde=serde.tuple_of(*serdes),
                                   **options)
    return current
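The loop above folds in one PCollection at a time, widening the tuple serde with each additional input. As a rough illustration of the per-group behavior (plain Python, not bigflow; the exact semantics of entity.OneSideJoinFn are assumed here), a left join over one already-grouped pair of inputs pairs every left element with each right element, or with None when the right side is empty:

def left_join_group(left_values, right_values):
    """Illustrative left join of two already-grouped value lists."""
    joined = []
    for l in left_values:
        # An empty right side still yields one output row, padded with None.
        matches = right_values if right_values else [None]
        for r in matches:
            joined.append((l, r))
    return joined

# left_join_group([1, 2], ['a'])  -> [(1, 'a'), (2, 'a')]
# left_join_group([1], [])        -> [(1, None)]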
Example #2
def flatten(pvalue, **kargs):
    """ Transform flatten implementation

    :param pvalue: PTable
    :return: flattened PCollection
    """

    def _flatten_once(node, key_serde, value_serde):
        return node.process_by(entity.FlattenProcessor(key_serde)) \
                   .as_type(value_serde) \
                   .set_debug_info("FlattenProcessor") \
                   .input(0).allow_partial_processing() \
                   .done() \
                   .set_size(scale_factor=1.25) \
                   .leave_scope()

    if isinstance(pvalue, ptable.PTable):
        key_serdes = pvalue.key_serdes()
        value_serde = pvalue.serde()
        assert len(key_serdes) == pvalue.nested_level() + 1
        it = reversed(key_serdes)
        node = pvalue.node()
        for i in range(0, pvalue.nested_level() + 1):
            key_serde = next(it)
            value_serde = serde.tuple_of(key_serde, value_serde)
            node = _flatten_once(node, key_serde, value_serde)

        pvalue = pcollection.PCollection(node, pvalue.pipeline())

    return pvalue
Example #3
File: diff.py Project: zz198808/bigflow
def diff(a, b):
    """
    Implementation of transforms.diff()
    """

    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("diff is not supported on infinite PTypes")

    def filter_count_ne(a, b):
        return a.count() \
            .flat_map(lambda c1, c2: [(c1, c2)], b.count(),
                      serde=serde.of((int, int))) \
            .filter(lambda tp: tp[0] != tp[1])

    a = a.map(lambda x: (x, None), serde=serde.tuple_of(a.serde(), serde.of(int)))
    b = b.map(lambda x: (x, None), serde=serde.tuple_of(b.serde(), serde.of(int)))

    return a.cogroup(b).apply_values(filter_count_ne).flatten()
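The cogroup/apply_values chain groups both inputs by element, counts each side within a group, and keeps only the elements whose counts differ, so diff ends up producing (element, (count_in_a, count_in_b)) pairs. A standard-library sketch of the same idea (illustrative only, not bigflow code):

from collections import Counter

def diff_counts(a, b):
    """Return [(element, (count_in_a, count_in_b))] for elements whose
    multiplicities differ between the two sequences."""
    ca, cb = Counter(a), Counter(b)
    return [(x, (ca[x], cb[x]))
            for x in set(ca) | set(cb)
            if ca[x] != cb[x]]

# diff_counts([1, 1, 2], [1, 2, 3]) -> [(1, (2, 1)), (3, (0, 1))]  (order not guaranteed)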
Example #4
    def test_serde(self):
        p = self._pipeline.parallelize([1, 2])

        p = p.map(lambda x: x, serde=serde.of(int))
        q = p.map(lambda x: x + 1, serde=serde.of(int))
        o = q.map(lambda x: str(x + 1), serde=serde.of(str))

        result = p.cartesian(q)
        self.assertEqual(str(serde.tuple_of(int, int)), str(result.serde()))

        result = p.cartesian(q, o)
        self.assertEqual(str(serde.sample((1, 2, '3'))), str(result.serde()))
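The serde assertions follow from what cartesian produces: every combination of one element from each input, so the result records are tuples and their serde is a tuple serde over the input serdes (inferred via serde.sample in the three-input case). In plain Python terms (illustrative, not the bigflow implementation):

from itertools import product

p = [1, 2]
q = [x + 1 for x in p]            # [2, 3]
o = [str(x + 1) for x in q]       # ['3', '4']

pairs = list(product(p, q))       # [(1, 2), (1, 3), (2, 2), (2, 3)] -- (int, int) tuples
triples = list(product(p, q, o))  # eight (int, int, str) tuples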
Example #5
File: input.py Project: zz198808/bigflow
    def transform_from_node(self, load_node, pipeline):
        """
        内部接口
        """
        from bigflow import ptable
        if self.repeatedly:
            transformed = load_node.repeatedly() \
                .process_by(_KVFromBinaryRecord()) \
                .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
                .set_effective_key_num(0) \
                .input(0).allow_partial_processing() \
                .done()
        else:
            transformed = load_node \
                .process_by(_KVFromBinaryRecord()) \
                .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
                .set_effective_key_num(0) \
                .ignore_group() \
                .input(0).allow_partial_processing() \
                .done()

        transformed.set_size(load_node.size())

        transformed = pcollection.PCollection(transformed, pipeline)

        tserde = self._options.get('serde', pipeline.default_objector())

        if self.kv_deserializer is not None:
            transformed = transformed.map(self.kv_deserializer, serde=tserde)
        else:
            is_serialize = False
            deserialize = entity.SerdeWrapper(tserde, is_serialize, 1)
            transformed = transformed.map(deserialize, serde=tserde)

        if self._options.get('partitioned'):
            return ptable.PTable(transformed, key_serde=serde.StrSerde())
        return pcollection.PCollection(transformed.node().leave_scope(),
                                       pipeline)
Example #6
File: join.py Project: yangwei024/bigflow
def __full_join_in_every_group(*pcollections, **options):
    def transform_append_tuple(left_table_empty, emitter, record, side_input):
        left_table_empty = False
        right_table_empty = True
        for e in side_input:
            right_table_empty = False
            emitter.emit(record + (e, ))

        if right_table_empty:
            emitter.emit(record + (None, ))
        return left_table_empty

    def finalize_append_tuple(left_table_empty, emitter, side_input):
        if left_table_empty:
            # relies on 'i' from the enclosing loop: the width of the left-side tuple
            len_tuple = i
            nones_tuple = tuple([None for x in range(len_tuple)])
            for e in side_input:
                emitter.emit(nones_tuple + (e, ))

    serdes = (pcollections[0].serde(), pcollections[1].serde())
    current = pcollections[0].transform(entity.FullJoinInitializeFn(),
                                        entity.FullJoinTransformFn(),
                                        entity.FullJoinFinalizeFn(),
                                        pcollections[1],
                                        serde=serde.tuple_of(*serdes),
                                        **options)

    for i in range(2, len(pcollections)):
        serdes = serdes + (pcollections[i].serde(), )
        current = current.transform(entity.FullJoinInitializeFn(),
                                    transform_append_tuple,
                                    finalize_append_tuple,
                                    pcollections[i],
                                    serde=serde.tuple_of(*serdes),
                                    **options)

    return current
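transform_append_tuple and finalize_append_tuple implement the full-join step for one additional input: each already-joined left tuple is extended with every matching right element (or with None when there is no match), and if the left side of the group was empty, every right element is emitted against an all-None left side. A plain-Python sketch of that per-group behavior (illustrative, not bigflow):

def full_join_group(left_tuples, right_values, left_width):
    """Illustrative full outer join of joined left tuples with one more input."""
    joined = []
    for l in left_tuples:
        matches = right_values if right_values else [None]
        for r in matches:
            joined.append(l + (r,))
    if not left_tuples:
        # Unmatched right side: pad the missing left columns with None.
        nones = (None,) * left_width
        joined.extend(nones + (r,) for r in right_values)
    return joined

# full_join_group([(1, 'a')], ['x'], 2)  -> [(1, 'a', 'x')]
# full_join_group([(1, 'a')], [], 2)     -> [(1, 'a', None)]
# full_join_group([], ['x'], 2)          -> [(None, None, 'x')]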