Esempio n. 1
0
    def test_common_serde(self):
        """Check which serde ``serde.common_serde`` picks for mixed inputs.

        Scenarios covered:

        * a mix of built-in serdes is expected to resolve to an instance of
          ``serde.DefaultSerde``;
        * the same mix plus a ``serde.ProtobufSerde`` is expected to give
          ``None``;
        * a single custom class is expected to give an instance of that
          class, while two unrelated custom classes give ``None``.
        """
        # NOTE: IntSerde and FloatSerde are passed as classes while the other
        # entries are instances, exactly as this test has always done.
        builtin_serdes = [
            serde.IntSerde,
            serde.FloatSerde,
            serde.StrSerde(),
            serde.ListSerde(),
            serde.TupleSerde(),
            serde.DictSerde(str, str),
            serde.BoolSerde(),
            serde.DefaultSerde(),
            serde.CPickleSerde(),
        ]
        self.assertIsInstance(serde.common_serde(*builtin_serdes),
                              serde.DefaultSerde)

        # Adding a protobuf serde to the mix leaves no common serde.
        with_protobuf = builtin_serdes + [serde.ProtobufSerde(lambda x: x)]
        self.assertIsNone(serde.common_serde(*with_protobuf))

        class TestSerde1(object):
            """for test"""
            pass

        class TestSerde2(object):
            """for test"""
            pass

        # A lone custom class comes back as an instance of that class.
        self.assertIsInstance(serde.common_serde(TestSerde1), TestSerde1)

        # Two unrelated custom classes have no common serde.
        # assertIsNone replaces the deprecated assertEquals alias.
        self.assertIsNone(serde.common_serde(TestSerde1, TestSerde2))
Esempio n. 2
0
def _transform_with_transformer(pvalue, transformer, *side_inputs, **options):
    """Run a ``base.Transformer`` over ``pvalue`` via ``_transform_with_fns``.

    The transformer object itself is the status value threaded through the
    begin / process / end callbacks; every record yielded by the
    transformer's ``begin_process``, ``process`` and ``end_process`` methods
    is passed to ``emitter.emit``.

    Args:
        pvalue: forwarded unchanged to ``_transform_with_fns``.
        transformer: must be an instance of ``bigflow.base.Transformer``.
        *side_inputs: forwarded unchanged to ``_transform_with_fns``.
        **options: forwarded to ``_transform_with_fns`` with ``status_serde``
            added; its value is ``options['transformer_serde']`` when that
            key is present, otherwise a ``serde.CPickleSerde()``.

    Returns:
        Whatever ``_transform_with_fns`` returns.
    """
    from bigflow import base
    assert isinstance(transformer, base.Transformer)

    def _emit_all(emitter, records):
        # Push every produced record to the emitter, in order.
        for item in records:
            emitter.emit(item)

    def _begin_process(emitter, *side_input_data):
        _emit_all(emitter, transformer.begin_process(*side_input_data))
        # The transformer becomes the status handed to later callbacks.
        return transformer

    def _process(status, emitter, record, *side_input_data):
        assert isinstance(status, base.Transformer)
        _emit_all(emitter, status.process(record, *side_input_data))
        return status

    def _end_process(status, emitter, *side_input_data):
        _emit_all(emitter, status.end_process(*side_input_data))

    from bigflow import serde
    # DefaultSerde can not serialize transformer, so we use CPickleSerde by default.
    fallback_serde = serde.CPickleSerde()
    forwarded_options = dict(options)
    forwarded_options["status_serde"] = options.get('transformer_serde',
                                                    fallback_serde)
    return _transform_with_fns(pvalue, _begin_process, _process, _end_process,
                               *side_inputs, **forwarded_options)
Esempio n. 3
0
 def transform_from_node(self, load_node, pipeline):
     """
     Wrap ``load_node`` as a PCollection, view it as a PTable whose key serde
     is ``CPickleSerde``, and return the result of the user input's
     ``post_process`` on that table.
     """
     from bigflow import ptable
     loaded = pcollection.PCollection(load_node, pipeline)
     keyed_table = ptable.PTable(loaded, key_serde=serde.CPickleSerde())
     return self._user_input_base.post_process(keyed_table)
Esempio n. 4
0
    def test_parallelize_with_pipeline_default_serde(self):
        """
        Run the parallelize checks with ``serde.USE_DEFAULT_SERDE`` switched
        off and the pipeline configured with ``CPickleSerde`` as its
        ``default_serde``.
        """

        from bigflow import serde

        # USE_DEFAULT_SERDE is module-level state: remember the current value
        # and put it back afterwards so this test does not affect the tests
        # that run after it, even when the checks below fail.
        saved_flag = serde.USE_DEFAULT_SERDE
        serde.USE_DEFAULT_SERDE = False
        try:
            self.setConfig(default_serde=serde.CPickleSerde())
            self.run_check_parallelize_case()
        finally:
            serde.USE_DEFAULT_SERDE = saved_flag
Esempio n. 5
0
def calculate_connected_component(edges):
    """
        Aggregate all edges into one disjoint set, then regroup its items
        by key into lists.

        edges : [(p1, p2), (p3, p4), (p2, p5),...]
        return: [ [p1, p2, p5], [p3, p4] ]
    """
    def _new_disjoint_set():
        # Imported here, inside the zero-value factory, as in the original.
        import disjointset
        return disjointset.DisjointSet()

    def _add_edge(djset, edge):
        # Join the two endpoints of the edge inside the accumulator.
        return djset.union(edge[0], edge[1])

    def _combine(set1, set2):
        return set1.merge(set2)

    def _members(djset):
        return djset.items()

    merged = edges.aggregate(_new_disjoint_set,
                             _add_edge,
                             _combine,
                             serde=serde.CPickleSerde())
    grouped = merged.flat_map(_members).group_by_key()
    return grouped.apply_values(to_pobject_list).flatten_values()
Esempio n. 6
0
 def run_check_parallelize_case_with_serde(self):
     """
     Call check_parallelize with a list of cases that each pair a dataset
     with an explicit serde.

     Every case is a dict with a "dataset" entry and a "serde" entry.
     """
     def _default():
         # Ask the pipeline for its default objector anew for every case.
         return self._pipeline.default_objector()

     def _case(dataset, objector):
         return {"dataset": dataset, "serde": objector}

     cases = [
         # Scalars and flat lists.
         _case(None, _default()),
         _case(1, serde.IntSerde()),
         _case([], serde.StrSerde()),
         _case([1], serde.IntSerde()),
         _case([1, 2, 'str'], _default()),
         _case([1, 2, None], serde.Optional(serde.IntSerde())),
         # Dicts, bare and wrapped in a list.
         _case({1: 2, 3: 4}, _default()),
         _case([{1: 2, 3: 4}], serde.DictSerde(int, int)),
         _case({1: 2, 3: 4}, _default()),
         _case([{"name": "bob", "age": 14}],
               FieldsDictSerde({"name": str, "age": int})),
         _case({1: 2}, _default()),
         _case({None: None}, _default()),
         _case({'1': 1, '2': 2}, _default()),
         _case({'1': [1, 2, 3], '2': [1, 2], '3': [None]}, _default()),
         # Nested dicts.
         _case({
             1: {
                 2: {'3': [1, 2, '3'], '4': [None]},
                 3: {'3': [1, 2]},
             },
             2: {
                 1: {'3': [1, 2]},
             },
         }, _default()),
         # Field dict with one field that uses CPickleSerde.
         _case([{"name": "bob", "age": 14, "date": datetime.date(2017, 1, 1)}],
               FieldsDictSerde({
                   "name": str,
                   "age": int,
                   "date": serde.CPickleSerde(),
               })),
     ]
     self.check_parallelize(cases)