Esempio n. 1
0
    def _key(self, ensure_keep_group=False):
        """
        Internal helper returning a PObject that holds the key of the group.

        The key node is built only while ``self.__key`` is None; the result is
        stored in ``self.__key`` and returned as-is by every later call.

        Args:
          ensure_keep_group (bool): when true, ``TakeProcessor`` is created
            with 1 so that at least one record is sent on to the reduce side
            and every group is kept. When false it is created with 0 and the
            group has to be produced by other nodes.

        Returns:
          PObject: the (possibly cached) key object.

        NOTE(review): once a key has been cached, the ``ensure_keep_group``
        value of later calls has no effect -- confirm this is intended.
        """
        inner = self._value()
        if isinstance(inner, tuple):
            # Only the first element of a tuple value is used.
            inner = inner[0]

        if ensure_keep_group:
            limit = 1
        else:
            limit = 0

        if self.__key is not None:
            return self.__key

        import bigflow.transforms
        from bigflow.core import entity

        key_serde = self.key_serdes()[0]
        key_decoder = entity.SerdeWrapper(key_serde, is_serialize=False)

        # Stage 1: flatten the values and run them through TakeProcessor.
        flattened = bigflow.transforms.flatten_values(inner).node()
        partial = flattened.process_by(entity.TakeProcessor(limit)) \
            .as_type(inner.serde()) \
            .set_debug_info("ExtractKeyPartial") \
            .input(0).allow_partial_processing().done()

        # Stage 2: extract the key, deserialized with the first key serde.
        key_node = partial.process_by(entity.GetLastKeyProcessor(key_decoder)) \
            .as_type(key_serde) \
            .set_debug_info("ExtractKey")

        self.__key = pobject.PObject(key_node, self._pipeline)
        return self.__key
Esempio n. 2
0
    def transform_to_node(self, ptype):
        """
        Build the plan node that turns ``ptype`` into serialized records.

        Steps, in order: shuffle the input node in the global scope, distribute
        it (by ``self.partition_fn`` when set, otherwise by default), apply
        every callable in ``self.transform_actions``, optionally set the
        shuffle concurrency and a sort key, serialize each element, and
        finally convert the result with ``_ToRecord``.

        Args:
          ptype: the input whose ``node()`` and ``pipeline()`` are used.

        Returns:
          The resulting node, typed with ``record_objector.RecordObjector()``
          and with ``ignore_group()`` applied.
        """
        from bigflow.core import entity
        from bigflow import pcollection

        source_node = ptype.node()
        plan = source_node.plan()

        # The 'serde' option wins; otherwise use the pipeline's default.
        objector = self.options.get('serde',
                                    ptype.pipeline().default_objector())

        shuffle_scope = plan.shuffle(plan.global_scope(), [source_node])
        shuffled = shuffle_scope.node(0)
        if self.partition_fn is not None:
            distributed = shuffled.distribute_by(
                entity.Partitioner(self.partition_fn))
        else:
            distributed = shuffled.distribute_by_default()

        pvalue = pcollection.PCollection(distributed, ptype.pipeline())
        for _, action in self.transform_actions.items():
            pvalue = action(pvalue)
        current = pvalue.node()

        if self.partition_number is not None:
            shuffle_scope.with_concurrency(self.partition_number)

        if self.key_reader_obj is not None:
            current = current.sort_by(self.key_reader_obj)

        default_serializer = entity.SerdeWrapper(objector, True)

        # A user-supplied kv_serializer replaces the serde-based serializer.
        if self.kv_serializer is None:
            mapper = default_serializer
        else:
            mapper = self.kv_serializer
        serialized = pcollection.PCollection(
            current, ptype.pipeline()).map(mapper).node()

        record_node = serialized.process_by(_ToRecord(self.kv_serializer)) \
            .as_type(record_objector.RecordObjector()) \
            .set_effective_key_num(0) \
            .input(0) \
            .done() \
            .ignore_group()
        return record_node
Esempio n. 3
0
def str_to_idl(pcollection, **options):
    """
    Pack every record of the given PCollection into an IDL packet.

    The input records are required to be of type str.

    Args:
      pcollection (PCollection): the input
      **options:  configurable options

        log_type: IDL data type; log_text and log_bin are currently supported,
          the default is log_text

    Returns:
      PCollection: the processed PCollection
    """
    from bigflow import serde
    from bigflow.core import entity

    packet_serde = serde.IdlPacketSerde(
        log_type=options.get("log_type", "log_text"))
    # is_serialize=True: the wrapper is built for the serializing direction.
    packer = entity.SerdeWrapper(packet_serde, is_serialize=True)
    packed = pcollection.map(packer, serde=serde.StrSerde())
    return packed
Esempio n. 4
0
def idl_to_str(pcollection, **options):
    """
    Unpack every IDL-packed record of the given PCollection.

    Records for which the deserializer yields None are dropped; according to
    the original documentation these are the IDL packets of type Heartbeat
    and EOF.

    Args:
      pcollection (PCollection): the input
      **options:  configurable options

        log_type: IDL data type; log_text and log_bin are currently supported,
          the default is log_text

    Returns:
      PCollection: the processed PCollection
    """
    from bigflow import serde
    from bigflow.core import entity

    packet_serde = serde.IdlPacketSerde(
        log_type=options.get("log_type", "log_text"))
    # is_serialize=False: the wrapper is built for the deserializing direction.
    unpacker = entity.SerdeWrapper(packet_serde, is_serialize=False)
    unpacked = pcollection.map(unpacker, serde=serde.StrSerde())
    return unpacked.filter(lambda record: record is not None)
Esempio n. 5
0
    def transform_from_node(self, load_node, pipeline):
        """
        Internal interface: turn a loaded node into a PCollection or PTable.

        The load node is processed by ``_KVFromBinaryRecord`` into
        (str, str) tuples, wrapped in a PCollection, and then mapped with
        either ``self.kv_deserializer`` or a serde-based deserializer.

        Args:
          load_node: the node produced by the load step.
          pipeline: the pipeline the resulting objects belong to.

        Returns:
          A ``PTable`` keyed with ``StrSerde`` when the 'partitioned' option
          is truthy; otherwise a ``PCollection`` built on the node after
          ``leave_scope()``.
        """
        from bigflow import ptable

        kv_serde_args = (serde.StrSerde(), serde.StrSerde())
        if self.repeatedly:
            # Repeated input: start from load_node.repeatedly() and keep
            # the group (no ignore_group()).
            staged = load_node.repeatedly() \
                .process_by(_KVFromBinaryRecord()) \
                .as_type(serde.tuple_of(*kv_serde_args)) \
                .set_effective_key_num(0)
        else:
            staged = load_node \
                .process_by(_KVFromBinaryRecord()) \
                .as_type(serde.tuple_of(*kv_serde_args)) \
                .set_effective_key_num(0) \
                .ignore_group()
        kv_node = staged.input(0).allow_partial_processing().done()

        # Carry the size of the load node over to the transformed node.
        kv_node.set_size(load_node.size())

        records = pcollection.PCollection(kv_node, pipeline)

        # The 'serde' option wins; otherwise use the pipeline's default.
        tserde = self._options.get('serde', pipeline.default_objector())

        if self.kv_deserializer is None:
            decoder = entity.SerdeWrapper(tserde, False, 1)
            records = records.map(decoder, serde=tserde)
        else:
            records = records.map(self.kv_deserializer, serde=tserde)

        if not self._options.get('partitioned'):
            return pcollection.PCollection(records.node().leave_scope(),
                                           pipeline)
        return ptable.PTable(records, key_serde=serde.StrSerde())