コード例 #1
0
ファイル: pcollection.py プロジェクト: zibo1996/bigflow
    def as_schema(self, fields):
        """
        Re-emit this PCollection with a serde derived from ``fields``.

        The elements are passed through an identity ``map``; only the serde
        attached to the result changes.

        Args:
            fields: a tuple, list or dict describing the element layout.

                tuple / list whose first element is a ``str``:
                    every element is taken as a field name and paired with
                    ``self._pipeline.default_objector()``; the resulting
                    mapping is handed to ``schema.of``. Only the first
                    element is inspected to pick this branch.

                any other tuple / list (e.g. Python basic types or serdes):
                    converted to a tuple and handed to ``serde.of``.

                dict:
                    handed to ``schema.of`` unchanged; keys are field names
                    and values their types, e.g. {"name": str, "age": int}.
                    Elements are expected to be dicts sharing the keys of
                    ``fields`` (this method does not check that).

        Returns:
            PCollection: the result of ``self.map`` with the new serde.

        Raises:
            ValueError: if ``fields`` is an empty tuple/list, or is neither a
                tuple, a list nor a dict.

        Examples:
            >>> data = self._pipeline.parallelize([("xiaoming", "PKU", 20)])
            >>> d1 = data.as_schema((str, str, int))
            >>> d2 = data.as_schema([str, str, int])
            >>> print d1.get()
            [('xiaoming', 'PKU', 20)]
            >>>
            >>> print d2.get()
            [('xiaoming', 'PKU', 20)]
            >>>
            >>> data = self._pipeline.parallelize([{"name": "xiaoming", "school": "PKU", "age": 20}])
            >>> d5 = data.as_schema({"name": str, "school": str, "age": int})
            >>> print d5.get()
            [{'age': 20, 'name': 'xiaoming', 'school': 'PKU'}]
            >>>
        """
        from bigflow import schema
        from bigflow import serde

        if isinstance(fields, (tuple, list)):
            if not fields:
                raise ValueError("the number of elems in fields is zero.")

            # Positional layout: the sequence holds types/serdes, not names.
            if not isinstance(fields[0], str):
                return self.map(lambda x: x, serde=serde.of(tuple(fields)))

            # Named layout: each name gets the pipeline's default objector.
            by_name = dict(
                (name, self._pipeline.default_objector()) for name in fields)
            return self.map(lambda x: x, serde=schema.of(by_name))

        if isinstance(fields, dict):
            return self.map(lambda x: x, serde=schema.of(fields))

        raise ValueError("fields type only accept {`tuple`, `list`, `dict`}.")
コード例 #2
0
ファイル: pcollection.py プロジェクト: yangwei024/bigflow
    def as_schema(self, fields):
        """
        Return this PCollection's elements unchanged, tagged with a new serde.

        The serde is chosen from ``fields`` and applied through an identity
        ``map`` over this PCollection.

        Args:
            fields: tuple, list or dict.

                * tuple / list starting with a ``str``: the elements are used
                  as field names, each mapped to
                  ``self._pipeline.default_objector()``, and the mapping goes
                  to ``schema.of``. Only ``fields[0]`` is checked.
                * other tuple / list (Python basic types or serdes): turned
                  into a tuple and given to ``serde.of``.
                * dict of field name -> type, such as
                  {"name": str, "age": int}: given to ``schema.of`` as is.
                  Elements are expected to be dicts with the same keys as
                  ``fields``; nothing here verifies it.

        Returns:
            PCollection: the mapped PCollection carrying the chosen serde.

        Raises:
            ValueError: for an empty tuple/list, or for any ``fields`` that
                is not a tuple, list or dict.

        Examples:
            >>> data = self._pipeline.parallelize([("xiaoming", "PKU", 20)])
            >>> d1 = data.as_schema((str, str, int))
            >>> d2 = data.as_schema([str, str, int])
            >>> print d1.get()
            [('xiaoming', 'PKU', 20)]
            >>>
            >>> print d2.get()
            [('xiaoming', 'PKU', 20)]
            >>>
            >>> data = self._pipeline.parallelize([{"name": "xiaoming", "school": "PKU", "age": 20}])
            >>> d5 = data.as_schema({"name": str, "school": str, "age": int})
            >>> print d5.get()
            [{'age': 20, 'name': 'xiaoming', 'school': 'PKU'}]
            >>>
        """
        from bigflow import schema
        from bigflow import serde

        if isinstance(fields, (tuple, list)):
            if not fields:
                raise ValueError("the number of elems in fields is zero.")
            if isinstance(fields[0], str):
                # Field names only: every field uses the default objector.
                name_to_objector = {}
                for field_name in fields:
                    name_to_objector[field_name] = \
                        self._pipeline.default_objector()
                chosen_serde = schema.of(name_to_objector)
            else:
                chosen_serde = serde.of(tuple(fields))
        elif isinstance(fields, dict):
            chosen_serde = schema.of(fields)
        else:
            raise ValueError("fields type only accept {`tuple`, `list`, `dict`}.")

        return self.map(lambda x: x, serde=chosen_serde)
コード例 #3
0
ファイル: schema_test.py プロジェクト: zz198808/bigflow
    def test_join(self):
        """
        Exercise schema.join, left_join, right_join and full_join on 'websites'.

        Three inputs are built: two via schema.tuple_to_dict and one list of
        dicts mapped with an explicit schema serde. Each join is applied and
        checked with self.passertEqual in turn; in the expected values the
        unmatched side of an outer join is a dict whose values are all None.
        """
        clicknum_rows = self._pipeline.parallelize(
            [('a', 2), ('e', 4), ('c', 6)]
        ).apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        click_rows = self._pipeline.parallelize(
            [('a', 9), ('b', 8), ('d', 7)]
        ).apply(schema.tuple_to_dict, ['websites', 'click'])
        typed_rows = self._pipeline.parallelize([
            {'websites': 'a', 'clicknum': 2},
            {'websites': 'b', 'clicknum': 3},
        ]).map(lambda x: x, serde=schema.of({'websites': str, 'clicknum': int}))

        # (join function, left input, right input, expected joined pairs)
        cases = [
            (schema.join, clicknum_rows, click_rows, [
                ({'clicknum': 2, 'websites': 'a'},
                 {'click': 9, 'websites': 'a'}),
            ]),
            (schema.left_join, clicknum_rows, click_rows, [
                ({'clicknum': 4, 'websites': 'e'},
                 {'click': None, 'websites': None}),
                ({'clicknum': 2, 'websites': 'a'},
                 {'click': 9, 'websites': 'a'}),
                ({'clicknum': 6, 'websites': 'c'},
                 {'click': None, 'websites': None}),
            ]),
            (schema.left_join, typed_rows, clicknum_rows, [
                ({'clicknum': 2, 'websites': 'a'},
                 {'clicknum': 2, 'websites': 'a'}),
                ({'clicknum': 3, 'websites': 'b'},
                 {'clicknum': None, 'websites': None}),
            ]),
            (schema.right_join, clicknum_rows, click_rows, [
                ({'clicknum': None, 'websites': None},
                 {'click': 8, 'websites': 'b'}),
                ({'clicknum': None, 'websites': None},
                 {'click': 7, 'websites': 'd'}),
                ({'clicknum': 2, 'websites': 'a'},
                 {'click': 9, 'websites': 'a'}),
            ]),
            (schema.full_join, clicknum_rows, click_rows, [
                ({'clicknum': 4, 'websites': 'e'},
                 {'click': None, 'websites': None}),
                ({'clicknum': None, 'websites': None},
                 {'click': 8, 'websites': 'b'}),
                ({'clicknum': None, 'websites': None},
                 {'click': 7, 'websites': 'd'}),
                ({'clicknum': 2, 'websites': 'a'},
                 {'click': 9, 'websites': 'a'}),
                ({'clicknum': 6, 'websites': 'c'},
                 {'click': None, 'websites': None}),
            ]),
        ]

        # Build and verify one join at a time, in the listed order.
        for join_fn, left, right, expected in cases:
            joined = left.apply(join_fn, right, fields=['websites'])
            self.passertEqual(expected, joined)
コード例 #4
0
ファイル: schema_test.py プロジェクト: zz198808/bigflow
 def test_get_serde_of_fields(self):
     """
     Check the serdes extracted from a FieldsDictSerde by field name.

     Serdes are compared through their ``str`` form: the single field
     'name' against ``serde.of(str)``, and the subset ['id', 'name']
     against ``schema.of`` built from the same two fields.
     """
     fields_serde = schema.FieldsDictSerde({'id': int, 'name': str, 'age': int})

     # Single-field lookup.
     expected_single = str(serde.of(str))
     actual_single = str(schema._get_serde_of_field(fields_serde, 'name'))
     self.assertEqual(expected_single, actual_single)

     # Multi-field lookup.
     expected_subset = str(schema.of({'id': int, 'name': str}))
     actual_subset = str(
         schema._get_serde_of_fields(fields_serde, ['id', 'name']))
     self.assertEqual(expected_subset, actual_subset)
コード例 #5
0
ファイル: schema_test.py プロジェクト: zz198808/bigflow
    def test_select(self):
        """ test """
        pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2),
            ('a,c', 3), ('a,b', 2), ('a,d', 1)])\
   .apply(transforms.map, lambda x:x, serde=serde.TupleSerde(str, int))\
    .apply(schema.select, lambda websties, clicknum: (
          websties.flat_map(lambda line: line.split(',')),
          clicknum))

        expect = \
  [('b', 2L), ('c', 2L), ('a', 3L), ('c', 3L), ('a', 1L), ('b', 1L), ('c', 1L),
        ('a', 2L), ('b', 2L), ('a', 1L), ('d', 1L)]

        self.passertEqual(expect, pc)

        pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2),
            ('a,c', 3), ('a,b', 2), ('a,d', 1)])\
   .apply(transforms.map, lambda x:x, serde=serde.TupleSerde(str, int))\
    .apply(schema.select, lambda websties, clicknum: (
          websties.flat_map(lambda line: line.split(',')),
          1))
        expect = \
        [('a', 1), ('b', 1), ('c', 1), ('b', 1), ('c', 1), ('a', 1),
         ('c', 1), ('a', 1), ('b', 1), ('a', 1), ('d', 1)]

        self.passertEqual(expect, pc)

        pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2),
            ('a,c', 3), ('a,b', 2), ('a,d', 1)])\
   .apply(transforms.map, lambda x:x, serde=serde.TupleSerde(str, int))\
    .apply(schema.select, lambda websties, clicknum: (
          'bigflow',
          1))
        expect = \
        [('bigflow', 1), ('bigflow', 1), ('bigflow', 1), ('bigflow', 1), ('bigflow', 1)]

        self.passertEqual(expect, pc)

        pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3),
                                         ('a,b', 2), ('a,d', 1)])
        sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum']) \
      .apply(schema.select, lambda cols: {
          'website': cols['websites'].flat_map(lambda line: line.split(',')),
          'clicknum': cols['clicknum']})

        expect = \
  [{'clicknum': 1, 'website': 'a'}, {'clicknum': 1, 'website': 'd'},
         {'clicknum': 3, 'website': 'a'}, {'clicknum': 3, 'website': 'c'},
         {'clicknum': 1, 'website': 'a'}, {'clicknum': 1, 'website': 'b'},
        {'clicknum': 1, 'website': 'c'}, {'clicknum': 2, 'website': 'b'},
         {'clicknum': 2, 'website': 'c'}, {'clicknum': 2, 'website': 'a'},
         {'clicknum': 2, 'website': 'b'}]

        self.passertEqual(expect, sp)

        pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3),
                                         ('a,b', 2), ('a,d', 1)])
        sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum']) \
      .apply(schema.select, ['websites'])

        expect = \
        [{'websites': 'a,b,c'}, {'websites': 'b,c'}, {'websites': 'a,c'},
         {'websites': 'a,b'}, {'websites': 'a,d'}]

        self.passertEqual(expect, sp)

        pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3),
                                         ('a,b', 2), ('a,d', 1)])
        sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum']) \
            .map(lambda x: x, serde = schema.of({'websites': str, 'clicknum': int}))\
      .apply(schema.select, ['websites'])

        expect = \
        [{'websites': 'a,b,c'}, {'websites': 'b,c'}, {'websites': 'a,c'},
         {'websites': 'a,b'}, {'websites': 'a,d'}]

        self.passertEqual(expect, sp)

        pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3),
                                         ('a,b', 2), ('a,d', 1)])
        sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum'])\
            .apply(schema.select, lambda cols: {
                'website': cols['websites'].flat_map(lambda line: line.split(',')),
                'clicknum': 100})
        expect = \
        [{'clicknum': 100, 'website': 'a'}, {'clicknum': 100, 'website': 'b'},
         {'clicknum': 100, 'website': 'c'}, {'clicknum': 100, 'website': 'b'},
         {'clicknum': 100, 'website': 'c'}, {'clicknum': 100, 'website': 'a'},
         {'clicknum': 100, 'website': 'c'}, {'clicknum': 100, 'website': 'a'},
         {'clicknum': 100, 'website': 'b'}, {'clicknum': 100, 'website': 'a'},
         {'clicknum': 100, 'website': 'd'}]

        self.passertEqual(expect, sp)

        pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3),
                                         ('a,b', 2), ('a,d', 1)])
        sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum'])\
            .apply(schema.select, lambda cols: {
                'website': 'bigflow',
                'clicknum': 100})
        expect = \
        [{'clicknum': 100, 'website': 'bigflow'}, {'clicknum': 100, 'website': 'bigflow'},
         {'clicknum': 100, 'website': 'bigflow'}, {'clicknum': 100, 'website': 'bigflow'},
         {'clicknum': 100, 'website': 'bigflow'}]

        self.passertEqual(expect, sp)

        sideinput = self._pipeline.parallelize(2)
        pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3),
                                         ('a,b', 2), ('a,d', 1)])
        sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum'])\
            .apply(schema.select, lambda cols, sd: {
                'website': cols['websites'].flat_map(lambda line: line.split(',')),
                'clicknum':cols['clicknum'] > sd}, sideinput)
        expect = \
        [{'clicknum': False, 'website': 'a'}, {'clicknum': False, 'website': 'b'},
         {'clicknum': False, 'website': 'c'}, {'clicknum': False, 'website': 'b'},
         {'clicknum': False, 'website': 'c'}, {'clicknum': True, 'website': 'a'},
         {'clicknum': True, 'website': 'c'}, {'clicknum': False, 'website': 'a'},
         {'clicknum': False, 'website': 'b'}, {'clicknum': False, 'website': 'a'},
         {'clicknum': False, 'website': 'd'}]

        self.passertEqual(expect, sp)

        sideinput = self._pipeline.parallelize(2)
        sp = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2),
            ('a,c', 3), ('a,b', 2), ('a,d', 1)])\
            .apply(transforms.map, lambda x:x, serde=serde.TupleSerde(str, int))\
            .apply(schema.select, lambda websties, clicknum, tp: (
                websties.flat_map(lambda line: line.split(',')),
                clicknum > tp), sideinput)
        expect = \
        [('a', False), ('b', False), ('c', False), ('b', False), ('c', False),
         ('a', True), ('c', True), ('a', False), ('b', False), ('a', False), ('d', False)]

        self.passertEqual(expect, sp)