def test_common_serde(self):
    """Check what ``serde.common_serde`` returns for mixed serde inputs.

    Two scenarios are exercised:

    * a set of built-in serdes is expected to resolve to a
      ``serde.DefaultSerde`` instance, and to ``None`` once a
      ``serde.ProtobufSerde`` is added to the set;
    * a single ad-hoc class is expected to resolve to an instance of that
      class, and to ``None`` once a second, different class is added.
    """
    # NOTE(review): IntSerde and FloatSerde are passed as classes while the
    # rest are instances, exactly as in the original test -- confirm that
    # this mix is intentional.
    serdes = [
        serde.IntSerde,
        serde.FloatSerde,
        serde.StrSerde(),
        serde.ListSerde(),
        serde.TupleSerde(),
        serde.DictSerde(str, str),
        serde.BoolSerde(),
        serde.DefaultSerde(),
        serde.CPickleSerde(),
    ]
    self.assertIsInstance(serde.common_serde(*serdes), serde.DefaultSerde)

    serdes.append(serde.ProtobufSerde(lambda x: x))
    self.assertIsNone(serde.common_serde(*serdes))

    class TestSerde1(object):
        """for test"""
        pass

    class TestSerde2(object):
        """for test"""
        pass

    serdes = [TestSerde1]
    self.assertIsInstance(serde.common_serde(*serdes), TestSerde1)

    serdes.append(TestSerde2)
    # The original used the deprecated ``assertEquals`` alias here.
    self.assertIsNone(serde.common_serde(*serdes))
def _transform_with_transformer(pvalue, transformer, *side_inputs, **options):
    """Run a ``base.Transformer`` over ``pvalue`` via ``_transform_with_fns``.

    The transformer's ``begin_process`` / ``process`` / ``end_process``
    methods are adapted into three callbacks. The transformer object itself
    is returned from the begin and process callbacks, so it is what gets
    passed around as the ``status`` argument.

    Args:
        pvalue: the value forwarded to ``_transform_with_fns``.
        transformer: must be an instance of ``base.Transformer``.
        *side_inputs: forwarded unchanged to ``_transform_with_fns``.
        **options: forwarded to ``_transform_with_fns`` with
            ``status_serde`` set to ``options['transformer_serde']`` when
            given, otherwise to a ``serde.CPickleSerde()``.

    Returns:
        Whatever ``_transform_with_fns`` returns.
    """
    from bigflow import base
    assert (isinstance(transformer, base.Transformer))

    def _begin_process(emitter, *side_input_data):
        # The transformer captured from the enclosing scope becomes the
        # initial status.
        for produced in transformer.begin_process(*side_input_data):
            emitter.emit(produced)
        return transformer

    def _process(status, emitter, record, *side_input_data):
        assert (isinstance(status, base.Transformer))
        for produced in status.process(record, *side_input_data):
            emitter.emit(produced)
        return status

    def _end_process(status, emitter, *side_input_data):
        for produced in status.end_process(*side_input_data):
            emitter.emit(produced)

    from bigflow import serde
    # DefaultSerde can not serialize transformer, so we use CPickleSerde by default.
    status_serde = options.get('transformer_serde', serde.CPickleSerde())

    forwarded_options = dict(options)
    forwarded_options["status_serde"] = status_serde
    return _transform_with_fns(
        pvalue,
        _begin_process,
        _process,
        _end_process,
        *side_inputs,
        **forwarded_options
    )
def transform_from_node(self, load_node, pipeline):
    """Build the user-facing value for a load node.

    Wraps ``load_node`` in a ``PCollection``, wraps that in a ``PTable``
    whose ``key_serde`` is a ``CPickleSerde``, and returns the result of
    passing the table to ``self._user_input_base.post_process``.
    """
    from bigflow import ptable

    loaded = pcollection.PCollection(load_node, pipeline)
    keyed_table = ptable.PTable(loaded, key_serde=serde.CPickleSerde())
    return self._user_input_base.post_process(keyed_table)
def test_parallelize_with_pipeline_default_serde(self):
    """Run the parallelize checks with CPickleSerde as the configured default.

    Sets ``serde.USE_DEFAULT_SERDE`` to ``False``, configures the test with
    ``default_serde=serde.CPickleSerde()`` and then runs
    ``run_check_parallelize_case``.
    """
    from bigflow import serde

    # The original also did ``import cPickle`` here; it was never used and
    # the module does not exist on Python 3, so it has been dropped.

    # NOTE(review): this flips a module-level flag and does not restore it
    # in this method -- confirm that tearDown/setUp resets it if other tests
    # rely on the previous value.
    serde.USE_DEFAULT_SERDE = False
    self.setConfig(default_serde=serde.CPickleSerde())
    self.run_check_parallelize_case()
def calculate_connected_component(edges):
    """Group the endpoints of ``edges`` into connected components.

    edges : [(p1, p2), (p3, p4), (p2, p5),...]
    return: [ [p1, p2, p5], [p3, p4] ]

    All edges are folded into ``disjointset.DisjointSet`` objects (the
    aggregate uses ``CPickleSerde``), then the sets' ``items()`` are
    flattened, grouped by key, converted per group with
    ``to_pobject_list`` and finally flattened to the values.
    """
    def _new_disjoint_set():
        # Imported lazily, inside the callback that needs it.
        import disjointset
        return disjointset.DisjointSet()

    def _add_edge(djset, edge):
        return djset.union(edge[0], edge[1])

    def _merge_sets(set1, set2):
        return set1.merge(set2)

    merged = edges.aggregate(
        _new_disjoint_set,
        _add_edge,
        _merge_sets,
        serde=serde.CPickleSerde(),
    )
    entries = merged.flat_map(lambda djset: djset.items())
    grouped = entries.group_by_key()
    components = grouped.apply_values(to_pobject_list)
    return components.flatten_values()
def run_check_parallelize_case_with_serde(self):
    """Run ``check_parallelize`` over cases that each name an explicit serde.

    Every case is a dict with a ``"dataset"`` to parallelize and the
    ``"serde"`` to use for it. Cases without a specific serde use a fresh
    result of ``self._pipeline.default_objector()``.
    """
    default_objector = self._pipeline.default_objector

    cases = [
        dict(dataset=None, serde=default_objector()),
        dict(dataset=1, serde=serde.IntSerde()),
        dict(dataset=[], serde=serde.StrSerde()),
        dict(dataset=[1], serde=serde.IntSerde()),
        dict(dataset=[1, 2, 'str'], serde=default_objector()),
        dict(dataset=[1, 2, None], serde=serde.Optional(serde.IntSerde())),
        dict(dataset={1: 2, 3: 4}, serde=default_objector()),
        dict(dataset=[{1: 2, 3: 4}], serde=serde.DictSerde(int, int)),
        # Same dataset as two cases above; kept because the original list
        # contains it twice.
        dict(dataset={1: 2, 3: 4}, serde=default_objector()),
        dict(
            dataset=[{"name": "bob", "age": 14}],
            serde=FieldsDictSerde({"name": str, "age": int}),
        ),
        dict(dataset={1: 2}, serde=default_objector()),
        dict(dataset={None: None}, serde=default_objector()),
        dict(dataset={'1': 1, '2': 2}, serde=default_objector()),
        dict(
            dataset={'1': [1, 2, 3], '2': [1, 2], '3': [None]},
            serde=default_objector(),
        ),
        dict(
            dataset={
                1: {
                    2: {'3': [1, 2, '3'], '4': [None]},
                    3: {'3': [1, 2]},
                },
                2: {
                    1: {'3': [1, 2]},
                },
            },
            serde=default_objector(),
        ),
        dict(
            dataset=[{
                "name": "bob",
                "age": 14,
                "date": datetime.date(2017, 1, 1),
            }],
            serde=FieldsDictSerde({
                "name": str,
                "age": int,
                "date": serde.CPickleSerde(),
            }),
        ),
    ]
    self.check_parallelize(cases)