def as_schema(self, fields):
    """
    Return this PCollection with a serde derived from ``fields`` attached.

    The elements are passed through an identity ``map``; only the serde given
    to that ``map`` call depends on ``fields``.

    Args:
      fields: a ``tuple``, ``list`` or ``dict`` describing the schema.

        * ``tuple`` / ``list`` whose first element is a ``str``: every element
          is used as a field name, each mapped to
          ``self._pipeline.default_objector()``, and the resulting dict is
          handed to ``schema.of``. Only the first element is inspected to
          choose this branch.
        * any other ``tuple`` / ``list`` (e.g. Python types or serdes): the
          elements are converted to a tuple and handed to ``serde.of``.
        * ``dict``: maps field name to field type, e.g.
          ``{"name": str, "age": int}``, and is handed to ``schema.of``.
          Per the original API notes, each element of this PCollection is
          expected to be a dict with the same keys as ``fields``; that is
          not checked here.

    Returns:
      PCollection: the result of the identity ``map`` carrying the new serde.

    Raises:
      ValueError: if ``fields`` is an empty tuple/list, or is not a
        ``tuple``, ``list`` or ``dict``.

    Examples:
      >>> data = self._pipeline.parallelize([("xiaoming", "PKU", 20)])
      >>> d1 = data.as_schema((str, str, int))
      >>> d2 = data.as_schema([str, str, int])
      >>> print d1.get()
      [('xiaoming', 'PKU', 20)]
      >>>
      >>> print d2.get()
      [('xiaoming', 'PKU', 20)]
      >>>
      >>> data = self._pipeline.parallelize([{"name": "xiaoming", "school": "PKU", "age": 20}])
      >>> d5 = data.as_schema({"name": str, "school": str, "age": int})
      >>> print d5.get()
      [{'age': 20, 'name': 'xiaoming', 'school': 'PKU'}]
      >>>
    """
    if isinstance(fields, dict):
        from bigflow import schema
        return self.map(lambda x: x, serde=schema.of(fields))

    # Reject unusable arguments before importing any bigflow module, so bad
    # input fails fast with the documented ValueError.
    if not isinstance(fields, (tuple, list)):
        raise ValueError("fields type only accept {`tuple`, `list`, `dict`}.")
    if not fields:
        raise ValueError("the number of elems in fields is zero.")

    if isinstance(fields[0], str):
        # Field names only: no per-field type is given, so each field gets the
        # pipeline's default objector.
        from bigflow import schema
        named_fields = {
            field: self._pipeline.default_objector() for field in fields
        }
        return self.map(lambda x: x, serde=schema.of(named_fields))

    from bigflow import serde
    return self.map(lambda x: x, serde=serde.of(tuple(fields)))
def as_schema(self, fields):
    """
    Return this PCollection with a serde derived from ``fields`` attached.

    The elements are passed through an identity ``map``; only the serde given
    to that ``map`` call depends on ``fields``.

    Args:
      fields: a ``tuple``, ``list`` or ``dict`` describing the schema.

        * ``tuple`` / ``list`` whose first element is a ``str``: every element
          is used as a field name, each mapped to
          ``self._pipeline.default_objector()``, and the resulting dict is
          handed to ``schema.of``. Only the first element is inspected to
          choose this branch.
        * any other ``tuple`` / ``list`` (e.g. Python types or serdes): the
          elements are converted to a tuple and handed to ``serde.of``.
        * ``dict``: maps field name to field type, e.g.
          ``{"name": str, "age": int}``, and is handed to ``schema.of``.
          Per the original API notes, each element of this PCollection is
          expected to be a dict with the same keys as ``fields``; that is
          not checked here.

    Returns:
      PCollection: the result of the identity ``map`` carrying the new serde.

    Raises:
      ValueError: if ``fields`` is an empty tuple/list, or is not a
        ``tuple``, ``list`` or ``dict``.

    Examples:
      >>> data = self._pipeline.parallelize([("xiaoming", "PKU", 20)])
      >>> d1 = data.as_schema((str, str, int))
      >>> d2 = data.as_schema([str, str, int])
      >>> print d1.get()
      [('xiaoming', 'PKU', 20)]
      >>>
      >>> print d2.get()
      [('xiaoming', 'PKU', 20)]
      >>>
      >>> data = self._pipeline.parallelize([{"name": "xiaoming", "school": "PKU", "age": 20}])
      >>> d5 = data.as_schema({"name": str, "school": str, "age": int})
      >>> print d5.get()
      [{'age': 20, 'name': 'xiaoming', 'school': 'PKU'}]
      >>>
    """
    if isinstance(fields, dict):
        from bigflow import schema
        return self.map(lambda x: x, serde=schema.of(fields))

    # Reject unusable arguments before importing any bigflow module, so bad
    # input fails fast with the documented ValueError.
    if not isinstance(fields, (tuple, list)):
        raise ValueError("fields type only accept {`tuple`, `list`, `dict`}.")
    if not fields:
        raise ValueError("the number of elems in fields is zero.")

    if isinstance(fields[0], str):
        # Field names only: no per-field type is given, so each field gets the
        # pipeline's default objector.
        from bigflow import schema
        named_fields = {
            field: self._pipeline.default_objector() for field in fields
        }
        return self.map(lambda x: x, serde=schema.of(named_fields))

    from bigflow import serde
    return self.map(lambda x: x, serde=serde.of(tuple(fields)))
def test_join(self):
    """
    Check ``schema.join``, ``left_join``, ``right_join`` and ``full_join``
    when joining on the 'websites' field.

    Each result element is a (left record, right record) pair; in the
    expected data the side without a match is a record whose values are
    all ``None``.
    """
    clicknum_rows = self._pipeline.parallelize([('a', 2), ('e', 4), ('c', 6)])
    clicknum_pc = clicknum_rows.apply(
        schema.tuple_to_dict, ['websites', 'clicknum'])

    click_rows = self._pipeline.parallelize([('a', 9), ('b', 8), ('d', 7)])
    click_pc = click_rows.apply(
        schema.tuple_to_dict, ['websites', 'click'])

    # Same field names as clicknum_pc, but carrying an explicit schema serde.
    typed_rows = self._pipeline.parallelize([
        {'websites': 'a', 'clicknum': 2},
        {'websites': 'b', 'clicknum': 3},
    ])
    typed_pc = typed_rows.map(
        lambda x: x,
        serde=schema.of({'websites': str, 'clicknum': int}))

    # (left input, join transform, right input, expected pairs)
    cases = (
        (clicknum_pc, schema.join, click_pc, [
            ({'clicknum': 2, 'websites': 'a'}, {'click': 9, 'websites': 'a'}),
        ]),
        (clicknum_pc, schema.left_join, click_pc, [
            ({'clicknum': 4, 'websites': 'e'}, {'click': None, 'websites': None}),
            ({'clicknum': 2, 'websites': 'a'}, {'click': 9, 'websites': 'a'}),
            ({'clicknum': 6, 'websites': 'c'}, {'click': None, 'websites': None}),
        ]),
        (typed_pc, schema.left_join, clicknum_pc, [
            ({'clicknum': 2, 'websites': 'a'}, {'clicknum': 2, 'websites': 'a'}),
            ({'clicknum': 3, 'websites': 'b'}, {'clicknum': None, 'websites': None}),
        ]),
        (clicknum_pc, schema.right_join, click_pc, [
            ({'clicknum': None, 'websites': None}, {'click': 8, 'websites': 'b'}),
            ({'clicknum': None, 'websites': None}, {'click': 7, 'websites': 'd'}),
            ({'clicknum': 2, 'websites': 'a'}, {'click': 9, 'websites': 'a'}),
        ]),
        (clicknum_pc, schema.full_join, click_pc, [
            ({'clicknum': 4, 'websites': 'e'}, {'click': None, 'websites': None}),
            ({'clicknum': None, 'websites': None}, {'click': 8, 'websites': 'b'}),
            ({'clicknum': None, 'websites': None}, {'click': 7, 'websites': 'd'}),
            ({'clicknum': 2, 'websites': 'a'}, {'click': 9, 'websites': 'a'}),
            ({'clicknum': 6, 'websites': 'c'}, {'click': None, 'websites': None}),
        ]),
    )

    for left_pc, join_transform, right_pc, expected_pairs in cases:
        joined = left_pc.apply(join_transform, right_pc, fields=['websites'])
        self.passertEqual(expected_pairs, joined)
def test_get_serde_of_fields(self):
    """
    Check that ``schema._get_serde_of_field`` and
    ``schema._get_serde_of_fields`` extract the expected serde(s) from a
    ``FieldsDictSerde``. Serdes are compared by their ``str()`` form.
    """
    record_serde = schema.FieldsDictSerde({'id': int, 'name': str, 'age': int})

    # Single field: expect the serde that serde.of(str) produces.
    expected_single = str(serde.of(str))
    actual_single = str(schema._get_serde_of_field(record_serde, 'name'))
    self.assertEqual(expected_single, actual_single)

    # Several fields: expect a schema serde restricted to the chosen keys.
    expected_subset = str(schema.of({'id': int, 'name': str}))
    actual_subset = str(
        schema._get_serde_of_fields(record_serde, ['id', 'name']))
    self.assertEqual(expected_subset, actual_subset)
def test_select(self):
    """
    Exercise ``schema.select`` on tuple-typed and dict-typed PCollections.

    Covered forms of the select argument: a lambda returning a tuple, a
    lambda returning a dict, a plain list of field names, lambdas that
    return constants instead of columns, and lambdas that take an extra
    side-input argument. Every case uses the same five (websites, clicknum)
    input rows.
    """
    # Tuple input with TupleSerde(str, int): the lambda takes one argument per
    # tuple column and returns a tuple; the comma-separated sites are split
    # with flat_map and paired with the row's clicknum.
    # The expected counts are written as Python 2 long literals.
    pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])\
        .apply(transforms.map, lambda x:x, serde=serde.TupleSerde(str, int))\
        .apply(schema.select, lambda websties, clicknum: (
            websties.flat_map(lambda line: line.split(',')),
            clicknum))

    expect = \
        [('b', 2L), ('c', 2L), ('a', 3L), ('c', 3L), ('a', 1L), ('b', 1L), ('c', 1L),
         ('a', 2L), ('b', 2L), ('a', 1L), ('d', 1L)]
    self.passertEqual(expect, pc)

    # Tuple input: second output column is the constant 1.
    pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])\
        .apply(transforms.map, lambda x:x, serde=serde.TupleSerde(str, int))\
        .apply(schema.select, lambda websties, clicknum: (
            websties.flat_map(lambda line: line.split(',')),
            1))

    expect = \
        [('a', 1), ('b', 1), ('c', 1), ('b', 1), ('c', 1), ('a', 1), ('c', 1),
         ('a', 1), ('b', 1), ('a', 1), ('d', 1)]
    self.passertEqual(expect, pc)

    # Tuple input: both output columns are constants; one output row is
    # expected per input row.
    pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])\
        .apply(transforms.map, lambda x:x, serde=serde.TupleSerde(str, int))\
        .apply(schema.select, lambda websties, clicknum: (
            'bigflow', 1))

    expect = \
        [('bigflow', 1), ('bigflow', 1), ('bigflow', 1), ('bigflow', 1), ('bigflow', 1)]
    self.passertEqual(expect, pc)

    # Dict input (via tuple_to_dict): the lambda takes a single `cols`
    # mapping and returns a dict of output fields.
    pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
    sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum']) \
        .apply(schema.select, lambda cols: {
            'website': cols['websites'].flat_map(lambda line: line.split(',')),
            'clicknum': cols['clicknum']})

    expect = \
        [{'clicknum': 1, 'website': 'a'}, {'clicknum': 1, 'website': 'd'},
         {'clicknum': 3, 'website': 'a'}, {'clicknum': 3, 'website': 'c'},
         {'clicknum': 1, 'website': 'a'}, {'clicknum': 1, 'website': 'b'},
         {'clicknum': 1, 'website': 'c'}, {'clicknum': 2, 'website': 'b'},
         {'clicknum': 2, 'website': 'c'}, {'clicknum': 2, 'website': 'a'},
         {'clicknum': 2, 'website': 'b'}]
    self.passertEqual(expect, sp)

    # Dict input: select given a list of field names keeps only those fields.
    pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
    sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum']) \
        .apply(schema.select, ['websites'])

    expect = \
        [{'websites': 'a,b,c'}, {'websites': 'b,c'}, {'websites': 'a,c'},
         {'websites': 'a,b'}, {'websites': 'a,d'}]
    self.passertEqual(expect, sp)

    # Same field-name selection, but with an explicit schema serde attached
    # through an identity map first.
    pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
    sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum']) \
        .map(lambda x: x, serde = schema.of({'websites': str, 'clicknum': int}))\
        .apply(schema.select, ['websites'])

    expect = \
        [{'websites': 'a,b,c'}, {'websites': 'b,c'}, {'websites': 'a,c'},
         {'websites': 'a,b'}, {'websites': 'a,d'}]
    self.passertEqual(expect, sp)

    # Dict input: one output field is split with flat_map, the other is the
    # constant 100.
    pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
    sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum'])\
        .apply(schema.select, lambda cols: {
            'website': cols['websites'].flat_map(lambda line: line.split(',')),
            'clicknum': 100})

    expect = \
        [{'clicknum': 100, 'website': 'a'}, {'clicknum': 100, 'website': 'b'},
         {'clicknum': 100, 'website': 'c'}, {'clicknum': 100, 'website': 'b'},
         {'clicknum': 100, 'website': 'c'}, {'clicknum': 100, 'website': 'a'},
         {'clicknum': 100, 'website': 'c'}, {'clicknum': 100, 'website': 'a'},
         {'clicknum': 100, 'website': 'b'}, {'clicknum': 100, 'website': 'a'},
         {'clicknum': 100, 'website': 'd'}]
    self.passertEqual(expect, sp)

    # Dict input: both output fields are constants.
    pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
    sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum'])\
        .apply(schema.select, lambda cols: {
            'website': 'bigflow',
            'clicknum': 100})

    expect = \
        [{'clicknum': 100, 'website': 'bigflow'}, {'clicknum': 100, 'website': 'bigflow'},
         {'clicknum': 100, 'website': 'bigflow'}, {'clicknum': 100, 'website': 'bigflow'},
         {'clicknum': 100, 'website': 'bigflow'}]
    self.passertEqual(expect, sp)

    # Dict input with a side input: the value 2 is passed after the lambda and
    # arrives as its extra `sd` argument; the 'clicknum' output field becomes
    # the result of `clicknum > sd`.
    sideinput = self._pipeline.parallelize(2)
    pc = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
    sp = pc.apply(schema.tuple_to_dict, ['websites', 'clicknum'])\
        .apply(schema.select, lambda cols, sd: {
            'website': cols['websites'].flat_map(lambda line: line.split(',')),
            'clicknum':cols['clicknum'] > sd}, sideinput)

    expect = \
        [{'clicknum': False, 'website': 'a'}, {'clicknum': False, 'website': 'b'},
         {'clicknum': False, 'website': 'c'}, {'clicknum': False, 'website': 'b'},
         {'clicknum': False, 'website': 'c'}, {'clicknum': True, 'website': 'a'},
         {'clicknum': True, 'website': 'c'}, {'clicknum': False, 'website': 'a'},
         {'clicknum': False, 'website': 'b'}, {'clicknum': False, 'website': 'a'},
         {'clicknum': False, 'website': 'd'}]
    self.passertEqual(expect, sp)

    # Tuple input with a side input: the side input arrives as the extra `tp`
    # argument after the per-column arguments.
    sideinput = self._pipeline.parallelize(2)
    sp = self._pipeline.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])\
        .apply(transforms.map, lambda x:x, serde=serde.TupleSerde(str, int))\
        .apply(schema.select, lambda websties, clicknum, tp: (
            websties.flat_map(lambda line: line.split(',')),
            clicknum > tp), sideinput)

    expect = \
        [('a', False), ('b', False), ('c', False), ('b', False), ('c', False),
         ('a', True), ('c', True), ('a', False), ('b', False), ('a', False),
         ('d', False)]
    self.passertEqual(expect, sp)