Example 1
    def test_key_value_serde(self):
        """ inner """
        self.serde_eq(int, serde._key_serde(serde.of([int, str]), None))
        self.serde_eq(str, serde._key_serde(serde.of((str, int)), None))

        self.serde_eq(int, serde._value_serde(serde.of((str, int)), None))
        self.serde_eq(int, serde._value_serde(serde.of([str, int]), None))
Example 2
def _ret_tuple_handler(ret_tuple, record_val):
    """ 内部函数 """
    inter_type_sds = []
    inter_type_values = []
    inter_type_flag = False
    ptype_sds = []
    ptype_values = []
    ptype_flag = False

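    # split ret_tuple into PType elements and plain Python values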
    for item in ret_tuple:
        if isinstance(item, ptype.PType):
            ptype_sds.append(item.serde())
            ptype_values.append(item)
            ptype_flag = True
        else:
            inter_type_sds.append(type(item))
            inter_type_values.append(item)
            inter_type_flag = True

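    # three cases: PTypes mixed with plain values, plain values only, or PTypes only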
    if ptype_flag and inter_type_flag:
        ptype_sds.extend(inter_type_sds)
        return transforms.cartesian(*ptype_values)\
            .apply(transforms.map, lambda record: record + tuple(inter_type_values),
                serde=serde.of(tuple(ptype_sds)))
    elif not ptype_flag and inter_type_flag:
        return record_val.apply(transforms.map,
            lambda record: tuple(inter_type_values), serde=serde.of(tuple(inter_type_sds)))
    else:
        return transforms.cartesian(*ptype_values)
Example 3
 def test_intersection(self):
     a = self._pipeline.parallelize([1, 2, 3, 1, 4]).map(lambda x: x, serde=serde.of(int))
     b = self._pipeline.parallelize([1, 2, 1, 2, 3]).map(lambda x: x, serde=serde.of(int))
     diff = a.diff(b)
     diff_serde_str = str(diff.serde())
     expect_serde_str = str(serde.of((int, (int, int))))
     self.assertEqual(expect_serde_str, diff_serde_str)
     self.assertItemsEqual([(2, (1, 2)), (4, (1, 0))], diff.get())
Example 4
def _sort_str(pvalue, reverse=False):
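    """ sort a PCollection of str; for reverse=True, each character is complemented
        (chr(255 - ord(ch))) so that ascending byte order of the serialized keys
        yields a descending string sort """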

    class ReverseStrSerde(serde.Serde):
        """ test """

        def serialize(self, obj):
            """ inner """
            return ''.join(chr(255 - ord(ch)) for ch in obj)

        def deserialize(self, buf):
            """ inner """
            return ''.join(chr(255 - ord(ch)) for ch in buf)

    def _serde_to_string(serde):
        return entity.Entity.of(entity.Entity.objector, serde) \
                .to_proto_message().SerializeToString()

    if not reverse:
        str_serde = serde.of(str)
    else:
        str_serde = serde.Optional(ReverseStrSerde())

    class _StrSortKeyReader(object):

        def __init__(self, serde):
            self.objector = _serde_to_string(serde)
            self.read_key = lambda x: x

    class SetValueNoneProcessor(entity.Processor):
        """ inner """
        def __init__(self):
            super(SetValueNoneProcessor, self).__init__()

    class SetKeyToValueProcessor(entity.Processor):
        """ inner """
        def __init__(self, serde):
            super(SetKeyToValueProcessor, self).__init__()
            self.set_config(_serde_to_string(serde))

    key_reader_obj = _StrSortKeyReader(str_serde)

    result_node = pvalue.node()._plan.shuffle(pvalue.node()._scope, from_nodes=[pvalue.node()]) \
        .sort() \
        .node(0).match_by(key_reader_obj, entity.Entity.key_reader) \
        .set_debug_info("Sort: " + repr(key_reader_obj)) \
        .process_by(SetValueNoneProcessor())\
        .as_type(serde.of(str)) \
        .set_effective_key_num(0) \
        .ignore_group() \
        .input(0).allow_partial_processing().done() \
        .process_by(SetKeyToValueProcessor(str_serde)) \
        .as_type(serde.of(str)) \
        .leave_scope()

    return bigflow.pcollection.PCollection(result_node, pvalue.pipeline())
Example 5
 def _optional_sd(sd_val):
     """ 内部函数 """
     new_serde = []
     for fd in sd_val.get_args():
         new_fd = {}
         if isinstance(fd, serde.Optional):
             fd = fd.origin_serde()
         for field, tp in fd.get_fields_to_types().items():
             new_fd[field] = serde.Optional(serde.of(tp))
         new_serde.append(serde.of(new_fd))
     return serde.of(tuple(new_serde))
Example 6
def diff(a, b):
    """
    Implementation of transforms.diff()
    """

    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("diff not supported infinite PType")

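    # keep only the keys whose occurrence counts differ between the two sides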
    def filter_count_ne(a, b):
        return a.count() \
            .flat_map(lambda c1, c2: [(c1, c2)], b.count(), serde=serde.of((int, int))) \
            .filter(lambda tp: tp[0] != tp[1])
    a = a.map(lambda x: (x, None), serde=serde.tuple_of(a.serde(), serde.of(int)))
    b = b.map(lambda x: (x, None), serde=serde.tuple_of(b.serde(), serde.of(int)))

    return a.cogroup(b).apply_values(filter_count_ne).flatten()
Example 7
def group_by(pcollection, fields, **options):
    """
    Group the pcollection by the given fields.

    Args:
        pcollection (SchemaPCollection): the input SchemaPCollection, a structured
                           PCollection with named fields; it can be used as a
                           PCollection whose elements are dicts
        fields (Iterable): if fields is a str, it is split on "," and the resulting
                           field names are used as the grouping key;
                           if fields is a list/tuple, the listed fields are used directly

    Returns:
        SchemaPCollection: each key is a dict of the grouping fields; each value is a PCollection containing all the columns.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>>     .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>>     .apply(schema.select, lambda cols: {
        >>>         'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>         'clicknum': cols['clicknum']
        >>>     }).apply(schema.group_by, ['website'])
        >>>       .apply_values(transforms.first)
        >>>     .apply(schema.flatten)
        >>> print analytics.get()
        Output:
        [{'website': 'a', 'clicknum': 1}, {'website': 'b', 'clicknum': 1}, {'website': 'c', 'clicknum': 1},
         {'website': 'd', 'clicknum': 1}]

    """
    pcollection = _transform_schema_pcollection(pcollection)

    if _is_tuple_serde(pcollection.serde()):
        def _select_fields(tup, fields):
            """ 内部函数 """
            return tuple(tup[field] for field in fields)

        from bigflow import schema_pcollection
        return pcollection.group_by(
                lambda record: _select_fields(record, fields),
                key_serde=serde.of(tuple(
                    _origin_serde(pcollection.serde())[field] for field in fields)),
                **options) \
            .apply_values(lambda record: schema_pcollection.SchemaPCollection(record))

    def _select_fields(dct, fields):
        """ 内部函数 """
        return dict((field, dct[field]) for field in fields)

    if isinstance(fields, str):
        fields = ''.join(fields.split()).split(',')

    from bigflow import schema_pcollection
    return pcollection.group_by(
            lambda record: _select_fields(record, fields),
            key_serde=_get_serde_of_fields(pcollection.serde(), fields,
                                           pcollection.pipeline().default_objector()),
            **options) \
        .apply_values(lambda record: schema_pcollection.SchemaPCollection(record))
Example 8
 def check(self, sd, value):
     """ inner """
     sd = serde.of(sd)
     self.assertEqual(value, sd.deserialize(str(sd.serialize(value))))
     import marshal
     import sys
     try:
         assert value == marshal.loads(marshal.dumps(value))
     except Exception:
         print >>sys.stderr, 'skip an unsupported serde', str(sd)
     else:
         self._checking_condition.append((sd, value))
Example 9
    def as_schema(self, fields):
        """
        Return a SchemaPCollection described by the given fields.

        Args:
            fields: may be a tuple, list, or dict;
                when fields is a tuple or list, the element types decide the behavior:
                    if each element is a python basic type or a serde, a TupleSerde
                    is constructed and set on every element of the PCollection;

                    if each element is a python string, the strings are treated as
                    field names and each field uses the pipeline's default serde

                when fields is a dict:
                    the keys of fields are the field names and the values their
                    types, e.g. {"name": str, "age": int}; every element of the
                    current PCollection must be a dict with exactly those keys
        Returns:
            PCollection: 表示转化后的PCollection

        Examples:
            >>> data = self._pipeline.parallelize([("xiaoming", "PKU", 20)])
            >>> d1 = data.as_schema((str, str, int))
            >>> d2 = data.as_schema([str, str, int])
            >>> print d1.get()
            [('xiaoming', 'PKU', 20)]
            >>>
            >>> print d2.get()
            [('xiaoming', 'PKU', 20)]
            >>>
            >>> data = self._pipeline.parallelize([{"name": "xiaoming", "school": "PKU", "age": 20}])
            >>> d5 = data.as_schema({"name": str, "school": str, "age": int})
            >>> print d5.get()
            [{'age': 20, 'name': 'xiaoming', 'school': 'PKU'}]
            >>>
        """
        from bigflow import schema
        from bigflow import serde
        if isinstance(fields, (tuple, list)):
            if len(fields) == 0:
                raise ValueError("fields is empty.")
            if isinstance(fields[0], str):
                _fields = {field: self._pipeline.default_objector() for field in fields}
                return self.map(lambda x: x, serde=schema.of(_fields))
            else:
                _fields = tuple(fields)
                return self.map(lambda x: x, serde=serde.of(_fields))
        elif isinstance(fields, dict):
            return self.map(lambda x: x, serde=schema.of(fields))
        else:
            raise ValueError("fields must be a `tuple`, `list`, or `dict`.")
Example 10
    def test_output_sort(self):
        """ test """
        self.setConfig(spark_conf={
            "spark.default.parallelism": "1",
        })

        lines = self._pipeline.parallelize([5, 1, 2, 0, 3, 4])\
                .map(lambda x: str(x), serde=serde.of(str))

        out1_path = self.generate_tmp_path() + '/output-1/'
        out2_path = self.generate_tmp_path() + '/output-2/'
        self._pipeline.write(lines,
            output.TextFile(out1_path)
                .sort()
                .partition(n=2, partition_fn=lambda x, n: int(x) % n)
        )
        self._pipeline.write(lines,
            output.TextFile(out2_path)
                .sort(reverse=True)
                .partition(n=2, partition_fn=lambda x, n: int(x) % n)
        )
        self._pipeline.run()
        l11 = self._pipeline.read(input.TextFile(out1_path + '/part-00000'))\
                 .accumulate('', lambda x, y: x + y)
        l12 = self._pipeline.read(input.TextFile(out1_path + '/part-00001'))\
                 .accumulate('', lambda x, y: x + y)

        l21 = self._pipeline.read(input.TextFile(out2_path + '/part-00000'))\
                 .accumulate('', lambda x, y: x + y)
        l22 = self._pipeline.read(input.TextFile(out2_path + '/part-00001'))\
                 .accumulate('', lambda x, y: x + y)
        l11.cache()
        l12.cache()
        l21.cache()
        l22.cache()
        self.assertEqual('024', l11.get())
        self.assertEqual('135', l12.get())
        self.assertEqual('420', l21.get())
        self.assertEqual('531', l22.get())
Example 11
    def end_serde_test(self):
        """ test """
        import sys
        from bigflow.core import entity
        logger.info(str(self._checking_condition))
        values = map(lambda condition: condition[1], self._checking_condition)
        p_values = self._pipeline.parallelize([values])  # avoid exceeding 32 map nodes (a Hadoop limit)

        out = []
        for (i, (sd, value)) in enumerate(self._checking_condition):
            sd1 = serde.of(int)
            sd2 = sd

            cpp_deserialize_fn = entity.KVDeserializeFn(sd1, sd2)
            cpp_serialize_fn = entity.KVSerializeFn(sd1, sd2)

            # bind loop variables as default arguments so every closure keeps this
            # iteration's sd1/sd2/i/kv_val instead of the last iteration's values
            python_deserialize_fn = lambda kv, sd1=sd1, sd2=sd2: \
                    (sd1.deserialize(kv[0]), sd2.deserialize(kv[1]))
            python_serialize_fn = lambda kv, sd1=sd1, sd2=sd2: \
                    (sd1.serialize(kv[0]), sd2.serialize(kv[1]))

            serialize_fns = [cpp_serialize_fn, python_serialize_fn]
            deserialize_fns = [cpp_deserialize_fn, python_deserialize_fn]

            kv_val = (1, value)
            def _assert_eq_val(v, kv_val=kv_val):
                assert v == kv_val
            for serialize_fn in serialize_fns:
                for deserialize_fn in deserialize_fns:
                    out.append(p_values.map(lambda x, i=i: (1, x[i]))
                            .map(serialize_fn)
                            .map(deserialize_fn)
                            .map(_assert_eq_val))
        if out:
            transforms.union(*out).cache()
        else:
            print >> sys.stderr, "SKIP a test!!!"
        self._pipeline.run()
Example 12
 def serde_eq(self, expect, real):
     """ inner """
     self.assertEqual(str(serde.of(expect)), str(serde.of(real)))
Example 13
    def transform_from_node(self, load_node, pipeline):
        """
        Internal API.
        """
        from bigflow import schema
        if self.fields is None:
            raise ValueError('''columns is required: (1) columns(list) where each item
                is a string, so each SchemaPCollection element is a dict;
                (2) columns(int), so each SchemaPCollection element is a tuple;
                (3) columns(list) where each item is a basic type among [int, float, str];
                e.g. columns=3, columns=[(xx, int), (yy, str)] or columns=[xx, yy]''')

        if isinstance(self.fields, tuple):
            self.fields = list(self.fields)

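        # fields may be a list of names, types, or (name, type) pairs, or an int column count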
        fields_type = []
        ignore_overflow = self.ignore_overflow
        ignore_illegal_line = self.ignore_illegal_line
        if isinstance(self.fields, list):

            def get_fields_type(fields):
                """内部函数"""
                fields_type = []
                for field in fields:
                    if isinstance(field, tuple):
                        if field[1] in [int, str, float]:
                            fields_type.append(field[1])
                        else:
                            raise ValueError(
                                '''each item in columns must be a field name or a
                                data type among int/str/float''')
                    elif field in [int, str, float]:
                        fields_type.append(field)
                    elif isinstance(field, str):
                        fields_type.append(str)
                    else:
                        raise ValueError(
                            '''each item in columns must be a field name or a
                            data type among int/str/float''')
                return fields_type

            fields_type = get_fields_type(self.fields)
            ret = super(SchemaTextFile, self)\
                    .transform_from_node(load_node, pipeline)\
                    .flat_map(entity.SplitStringToTypes(self.sep,
                                                        fields_type,
                                                        ignore_overflow,
                                                        ignore_illegal_line),
                                                        serde=serde.of(tuple(fields_type)))
            if self.fields[0] in [int, float, str]:
                return ret
            else:
                ret = ret.apply(schema.tuple_to_dict, self.fields)
                return ret
        elif isinstance(self.fields, int):
            from bigflow import schema_pcollection
            return schema_pcollection.SchemaPCollection(super(SchemaTextFile, self)
                .transform_from_node(load_node, pipeline)\
                .flat_map(entity.SplitStringToTypes(self.sep,
                                                    [str for _ in xrange(self.fields)],
                                                    True,
                                                    ignore_illegal_line),
                          serde=serde.of(tuple(serde.StrSerde() for index in xrange(self.fields)))))
        else:
            raise ValueError("columns is list(field name),or int(row number)")
Example 14
 def test_get_tuple_serde_of_fields(self):
     """ test """
     sd = schema.FieldsDictSerde({'id': int, 'name': str, 'age': int})
     self.assertEqual(
         str(serde.of((int, str))),
         str(schema._get_tuple_serde_of_fields(sd, ['id', 'name'])))
Example 15
 def _get_tuple_serde(self, fields):
     """ 内部函数 """
     return serde.of(tuple(self._fields_to_types[key] for key in fields))
Example 16
def get_serde_of_field(sd, field, default=None):
    """ get serde of field """
    if isinstance(sd, FieldsDictSerde):
        return serde.of(serde.origin(sd._tuple_serde)[sd._fields.index(field)])
    return default
Example 17
 def test_sort_str(self):
     """ test """
     data = self._pipeline.parallelize([4, 5, 1, 2, 3, 0])
     lines = data.map(lambda x: str(x), serde=serde.of(str))
     self.assertEqual('012345', lines.sort().accumulate('', lambda x, y: x + y).get())
Example 18
def __right_join_in_every_group(*pcollections, **options):
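    """ right join: reverse the inputs, left join, then reverse each joined tuple """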
    serdes = serde.of(tuple(map(lambda p: p.serde(), pcollections)))
    return __left_join_in_every_group(*pcollections[::-1],
                                      **options).map(lambda x: x[::-1],
                                                     serde=serdes)
Example 19
 def __init__(self, fields_to_types):
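     """ fields_to_types: a {field: type} dict, or an iterable of field names
         (each then mapped to the placeholder serde serde._); fields are sorted
         so the backing tuple serde has a deterministic layout """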
     if not isinstance(fields_to_types, dict):
         fields_to_types = dict(zip(fields_to_types, [serde._] * len(fields_to_types)))
     self._fields_to_types = fields_to_types
     self._fields = sorted(fields_to_types.keys())
     self._tuple_serde = serde.of(tuple(fields_to_types[key] for key in self._fields))
Example 20
def _get_serde_of_field(sd, field, dft=None):
    """ 内部函数 """
    if isinstance(sd, FieldsDictSerde):
        return serde.of(serde.origin(sd._tuple_serde)[sd._fields.index(field)])
    return dft
Example 21
def _get_tuple_serde_of_fields(sd, fields, dft=None):
    """ 内部函数 """
    return serde.of(
        tuple(_get_serde_of_field(sd, field, dft) for field in fields))
Example 22
 def _get_col(index):
     return val.map(lambda v: v[index],
                    serde=serde.of(_origin_serde(val.serde())[index]))
Example 23
def pipe(pvalue, command, **options):
    """
    Transform pipe implementation
    :param pvalue: PType
    :return: PCollection
    """
    if utils.is_infinite(pvalue):
        raise ValueError("pipe not supported infinite PType")

    if isinstance(pvalue, ptable.PTable):
        def merge_value(pvalue):
            """ inner """
            if isinstance(pvalue, ptable.PTable):
                return pvalue.apply_values(merge_value)
            else:
                return pvalue.apply(transforms.to_list_pobject)

        def merge_kv(tp, level):
            """ inner """
            kvs = []
            for i in xrange(level):
                kvs.append(tp[0])
                tp = tp[1]
            kvs.append(tp)
            return kvs

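        # each flattened record carries `level` keys followed by the grouped value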
        level = pvalue.nested_level() + 1

        transformed = pvalue.apply(merge_value).flatten() \
            .apply(transforms.map, lambda kv: merge_kv(kv, level),
                serde=serde.of(pvalue.key_serdes() + [pvalue.serde()]))

        options['input_fields_num'] = level + 1
        options['is_nested_ptype'] = True
    else:
        transformed = pvalue

    output_fields_num = options.get('output_fields_num', 1)
    if output_fields_num == 1:
        options['serde'] = serde.StrSerde()
    else:
        serdes = [serde.StrSerde()] * output_fields_num
        options['serde'] = serde.TupleSerde(*serdes)

    scale = options.get('scale', 1.0)
    size = options.get('output_size', None)
    memory = options.get('memory_limit', -1)
    cpu = options.get('cpu_limit', -1)

    result_node = transformed.node() \
        .process_by(entity.PipeProcessor(command, **options)) \
        .as_type(options['serde']) \
        .set_debug_info("Pipe: " + repr(command)) \
        .ignore_group() \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(size, scale) \
        .set_memory(memory) \
        .set_cpu(cpu)

    return pcollection.PCollection(result_node, transformed.pipeline())
Example 24
def is_empty(pcollection):
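    """ take one element and count it: the count is 0 iff the pcollection is empty """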
    return pcollection.take(1).count().map(lambda n: n == 0, serde=serde.of(bool))
Example 25
 def serde_equal(self, expect, real):
     self.assertEqual(str(serde.of(expect)), str(serde.of(real)))
Example 26
 def test_get_serde_of_fields(self):
     """ test """
     sd = fields.FieldsDictSerde({'id': int, 'name': str, 'age': int})
     self.assertEqual(str(serde.of(str)), str(fields.get_serde_of_field(sd, 'name')))
     self.assertEqual(str(fields.of({'id':int, 'name': str})),
             str(fields.get_serde_of_fields(sd, ['id', 'name'])))