Example #1
    def test_proto_serde_ignore_exception(self):
        """ test """

        self.assertEqual(
            None,
            serde.ProtobufSerde(processor_pb2.PbPythonProcessorConfig,
                                False).deserialize('1111'))

        self.assertRaises(
            message.DecodeError,
            serde.ProtobufSerde(
                processor_pb2.PbPythonProcessorConfig).deserialize, '1111')
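The flag exercised here is ProtobufSerde's optional second constructor argument: with False, deserialize returns None on malformed bytes instead of raising message.DecodeError. Below is a minimal standalone sketch of that fallback pattern using plain protobuf parsing; the helper name safe_parse and its signature are made up for illustration, and only the None-versus-raise behavior is taken from the test above.

from google.protobuf import message

def safe_parse(msg_class, data, raise_on_error=True):
    """Parse data into a new msg_class instance, optionally swallowing decode errors."""
    msg = msg_class()
    try:
        msg.ParseFromString(data)
        return msg
    except message.DecodeError:
        if raise_on_error:
            raise
        return None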
Example #2
    def test_common_serde(self):
        """test common serde"""
        serdes = []
        serdes.append(serde.IntSerde)
        serdes.append(serde.FloatSerde)
        serdes.append(serde.StrSerde())
        serdes.append(serde.ListSerde())
        serdes.append(serde.TupleSerde())
        serdes.append(serde.DictSerde(str, str))
        serdes.append(serde.BoolSerde())
        serdes.append(serde.DefaultSerde())
        serdes.append(serde.CPickleSerde())
        self.assertIsInstance(serde.common_serde(*serdes), serde.DefaultSerde)

        serdes.append(serde.ProtobufSerde(lambda x: x))
        self.assertEqual(None, serde.common_serde(*serdes))

        serdes = []

        class TestSerde1(object):
            """for test"""
            pass

        class TestSerde2(object):
            """for test"""
            pass

        serdes.append(TestSerde1)
        self.assertIsInstance(serde.common_serde(*serdes), TestSerde1)
        serdes.append(TestSerde2)
        self.assertEqual(None, serde.common_serde(*serdes))
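The behavior checked above suggests that common_serde picks one serde able to represent all of the given ones: a DefaultSerde when every argument is one of the built-in basic serdes, an instance of the single custom class when all arguments share it, and None once the set is mixed (for example after adding a ProtobufSerde or a second custom class). The following is a rough sketch of that contract for illustration only, assuming the built-in serdes are classes as the test's usage suggests; it is not Bigflow's actual implementation.

# Illustrative only: mimics the behavior the test observes, not Bigflow's real common_serde.
_BASIC_SERDES = (serde.IntSerde, serde.FloatSerde, serde.StrSerde, serde.ListSerde,
                 serde.TupleSerde, serde.DictSerde, serde.BoolSerde,
                 serde.DefaultSerde, serde.CPickleSerde)

def pick_common_serde(*candidates):
    """Return one serde covering all inputs, or None if no single serde fits."""
    classes = set(c if isinstance(c, type) else type(c) for c in candidates)
    if classes and all(issubclass(c, _BASIC_SERDES) for c in classes):
        return serde.DefaultSerde()      # any mix of basic serdes degrades to DefaultSerde
    if len(classes) == 1:
        return classes.pop()()           # a single custom serde class stands for itself
    return None                          # no serde is common to all inputs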
Example #3
                                    task_name, DATE)
    #job_name = 'feed_production_day_relerec_state' + "_" + DATE
    pipeline = base.Pipeline.create("local" if ISTEST else "DAGMR",
                                    job_name=job_name,
                                    tmp_data_path=afs_tmp,
                                    hadoop_job_conf=job_conf)
    # core job logic
    pipeline.add_file("./bigflow_python/proto/sample_pb2.py",
                      "./sample_pb2.py")

    # To run in local mode: first run the commented-out DAGMR job below to export the data, then read it back from a local file.
    #pipeline = base.Pipeline.create("DAGMR",
    #        job_name=job_name,
    #        tmp_data_path=afs_tmp,
    #        hadoop_job_conf=job_conf)
    #pbs = pipeline.read(input.SequenceFile(*input_path, serde=serde.StrSerde()))
    #pipeline.write(pbs, output.SequenceFile(output_path, serde=serde.StrSerde()))
    #pipeline.run()
    pbs = pipeline.read(
        input.SequenceFile(*input_path,
                           serde=serde.ProtobufSerde(sample_pb2.Sample)))
    p = (pbs.flat_map(emit_features)
         # group records by key and sum the count vectors element-wise
         .group_by(key_extractor=lambda x: x[0], value_extractor=lambda x: x[1])
         .apply_values(transforms.reduce, lambda a, b: map(operator.add, a, b)).flatten()
         # append the ratio of value field 2 to field 1 (0 when the denominator is not positive)
         .map(lambda x: [x[0], x[1] + [float(x[1][2]) / x[1][1] if x[1][1] > 0 else 0]])
         # re-group on the first tab-separated part of the key and average per group
         .group_by(key_extractor=lambda x: x[0].split('\t')[0], value_extractor=lambda x: x[1])
         .apply_values(average).flatten()
         # output line: key fields (split on '#') followed by the values, tab-joined
         .map(lambda x: '\t'.join(x[0].split('#') + map(str, x[1]))))

    # output
    pipeline.write(p, output.TextFile(output_path).partition(n=1))
    pipeline.run()
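For a quick check of the aggregation step, the same group-and-reduce chain can be driven on in-memory data through Bigflow's local mode. The sketch below is an assumption-laden illustration: the pipeline object, sample records, and expected output are made up, and only the group_by / apply_values / transforms.reduce usage mirrors the job above.

# Hypothetical local smoke test; not part of the original job.
test_pipeline = base.Pipeline.create("local")
records = test_pipeline.parallelize([("a#x\tk", [1, 2, 4]), ("a#x\tk", [1, 3, 9])])
summed = (records.group_by(lambda kv: kv[0], lambda kv: kv[1])
          .apply_values(transforms.reduce, lambda a, b: map(operator.add, a, b))
          .flatten())
print(summed.get())  # e.g. [("a#x\tk", [2, 5, 13])]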