def test_proto_serde_ignore_exception(self):
    """test that ProtobufSerde swallows decode errors when asked to"""
    self.assertEqual(
        None,
        serde.ProtobufSerde(processor_pb2.PbPythonProcessorConfig,
                            False).deserialize('1111'))
    self.assertRaises(
        message.DecodeError,
        serde.ProtobufSerde(processor_pb2.PbPythonProcessorConfig).deserialize,
        '1111')
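# A hedged reading of the case above, not an assertion this suite makes:
# the second ProtobufSerde constructor argument acts as an "ignore decode
# errors" flag, so the lenient serde maps malformed bytes to None while the
# strict default raises. serialize() below is an assumed mirror of the
# deserialize() used above and is not exercised in this file:
#
#   lenient = serde.ProtobufSerde(processor_pb2.PbPythonProcessorConfig, False)
#   cfg = processor_pb2.PbPythonProcessorConfig()
#   assert lenient.deserialize(lenient.serialize(cfg)) == cfg  # well-formed bytes
#   assert lenient.deserialize('1111') is None                 # malformed bytes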
def test_common_serde(self):
    """test common serde"""
    serdes = []
    serdes.append(serde.IntSerde)
    serdes.append(serde.FloatSerde)
    serdes.append(serde.StrSerde())
    serdes.append(serde.ListSerde())
    serdes.append(serde.TupleSerde())
    serdes.append(serde.DictSerde(str, str))
    serdes.append(serde.BoolSerde())
    serdes.append(serde.DefaultSerde())
    serdes.append(serde.CPickleSerde())
    self.assertIsInstance(serde.common_serde(*serdes), serde.DefaultSerde)

    serdes.append(serde.ProtobufSerde(lambda x: x))
    self.assertEqual(None, serde.common_serde(*serdes))

    serdes = []

    class TestSerde1(object):
        """for test"""
        pass

    class TestSerde2(object):
        """for test"""
        pass

    serdes.append(TestSerde1)
    self.assertIsInstance(serde.common_serde(*serdes), TestSerde1)
    serdes.append(TestSerde2)
    self.assertEqual(None, serde.common_serde(*serdes))
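# Summary of the contract pinned down above (commentary only, no extra
# coverage): serde.common_serde(*serdes) returns an instance of the single
# serde type its arguments reduce to, and None once the arguments are
# incompatible -- e.g. when ProtobufSerde or a second unrelated class joins
# the list.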
                                           task_name, DATE)
# job_name = 'feed_production_day_relerec_state' + "_" + DATE
pipeline = base.Pipeline.create("local" if ISTEST else "DAGMR",
                                job_name=job_name,
                                tmp_data_path=afs_tmp,
                                hadoop_job_conf=job_conf)

# core job logic
pipeline.add_file("./bigflow_python/proto/sample_pb2.py", "./sample_pb2.py")

# To run in local mode, run the DAGMR job below first to dump the input to a
# local file, then read from that file:
# pipeline = base.Pipeline.create("DAGMR",
#                                 job_name=job_name,
#                                 tmp_data_path=afs_tmp,
#                                 hadoop_job_conf=job_conf)
# pbs = pipeline.read(input.SequenceFile(*input_path, serde=serde.StrSerde()))
# pipeline.write(pbs, output.SequenceFile(output_path, serde=serde.StrSerde()))
# pipeline.run()

pbs = pipeline.read(
    input.SequenceFile(*input_path, serde=serde.ProtobufSerde(sample_pb2.Sample)))

# Emit (key, counter-list) pairs, sum the counters per key, append the ratio
# of the third to the second counter (0 when the denominator is 0), regroup
# by the first tab-separated field of the key, average, and format each
# record as a tab-separated line.
p = pbs.flat_map(emit_features)\
    .group_by(key_extractor=lambda x: x[0], value_extractor=lambda x: x[1])\
    .apply_values(transforms.reduce, lambda a, b: map(operator.add, a, b)).flatten()\
    .map(lambda x: [x[0], x[1] + [float(x[1][2]) / x[1][1] if x[1][1] > 0 else 0]])\
    .group_by(key_extractor=lambda x: x[0].split('\t')[0],
              value_extractor=lambda x: x[1])\
    .apply_values(average).flatten()\
    .map(lambda x: '\t'.join(x[0].split('#') + map(str, x[1])))

# write a single output partition
pipeline.write(p, output.TextFile(output_path).partition(n=1))
pipeline.run()
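# Usage sketch (hypothetical paths; ISTEST, input_path and output_path come
# from elsewhere in this job): flipping ISTEST to True makes the
# Pipeline.create() call above pick the "local" backend, so the same dataflow
# can be smoke-tested against a small local dump before submitting to DAGMR:
#
#   ISTEST = True
#   input_path = ['./sample_dump.seq']   # hypothetical local SequenceFile dump
#   output_path = './local_output'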