def test_partition(self):
    """ Test partition output """
    import os
    try:
        p = self._pipeline.parallelize(["1", "2", "3"])
        self._pipeline.write(p, output.TextFile('./output-1').partition(5))
        self._pipeline.write(
            p,
            output.SequenceFile('./output-2').partition(2, lambda x, n: int(x) % n))
        self._pipeline.run()

        o1 = self._pipeline.read(input.SequenceFile('./output-2/part-00000'))
        o1.cache()
        o2 = self._pipeline.read(input.SequenceFile('./output-2/part-00001'))
        o2.cache()
        self.assertEqual(["2"], o1.get())
        self.assertItemsEqual(["1", "3"], o2.get())

        n = os.popen('ls output-1/[^_]* | wc -l').read()
        self.assertEqual(5, int(n))
        o = self._pipeline.read(input.TextFile('output-1')).get()
        self.assertItemsEqual(["1", "2", "3"], o)
    finally:
        os.system("rm output-1 output-2 -r")
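# A minimal standalone sketch (plain Python, no Bigflow required) of the
# partitioner passed to partition() above: records are routed to buckets by
# int(x) % n, which is why "2" lands in part-00000 and "1"/"3" land in
# part-00001 when n == 2. The helper name is hypothetical, for illustration only.
def _partition_sketch():
    n = 2
    partitioner = lambda x, n: int(x) % n
    buckets = {i: [] for i in range(n)}
    for record in ["1", "2", "3"]:
        buckets[partitioner(record, n)].append(record)
    assert buckets == {0: ["2"], 1: ["1", "3"]}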
def test_sequence_file_serde(self):
    """ Case: test sequence file serde """
    data = self._pipeline.parallelize([1, 2, 3, 400, 5])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.SequenceFile(local_file, serde=serde.IntSerde()))
    self._pipeline.run()

    result = self._pipeline.read(input.SequenceFile(local_file, serde=serde.IntSerde()))
    self.assertItemsEqual([1, 2, 3, 400, 5], result.get())

    result_invalid = self._pipeline.read(
        input.SequenceFile(local_file, serde=serde.TupleSerde()))
    with self.assertRaises(error.BigflowRuntimeException):
        result_invalid.get()
def test_seqfile(self):
    """ Test sequencefile io """
    tmp_file = "./.tmp/test_tmp"

    key1 = 123
    value1 = ["A", "B", "C"]
    key2 = 456
    value2 = ["D", "E", "F"]
    input_data = [(key1, value1), (key2, value2)]

    def kv_serializer(record):
        return str(record[0]), ":".join(record[1])

    pcollection_kv = self._pipeline.parallelize(input_data)
    self._pipeline.write(
        pcollection_kv,
        output.SequenceFile(tmp_file).as_type(kv_serializer))
    self._pipeline.run()

    def kv_deserializer(tp):
        return int(tp[0]), tp[1].split(":")

    result = self._pipeline.read(
        input.SequenceFile(tmp_file).as_type(kv_deserializer))
    result_data = result.get()
    self.assertItemsEqual(input_data, result_data)
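# Standalone sketch of the key/value round trip performed by as_type() above:
# the serializer turns each (int, list) record into a (str key, ":"-joined value)
# pair, and the deserializer inverts it. Plain Python, no Bigflow required; the
# helper name is hypothetical.
def _kv_roundtrip_sketch():
    def kv_serializer(record):
        return str(record[0]), ":".join(record[1])

    def kv_deserializer(tp):
        return int(tp[0]), tp[1].split(":")

    record = (123, ["A", "B", "C"])
    assert kv_serializer(record) == ("123", "A:B:C")
    assert kv_deserializer(kv_serializer(record)) == record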
def getResultWithSequence(self, pipeline_status, path):
    """
    Wait until the pipeline reaches APP_RUN, then read the SequenceFile
    at `path` with a local pipeline and return its records.
    """
    pipeline_status.wait_status("APP_RUN")
    import time
    time.sleep(300)

    local_pipeline = base.Pipeline.create('local')
    result = local_pipeline.read(input.SequenceFile(path))
    return result.get()
def test_sequence_file(self):
    """ Case: test sequence file """
    data = self._pipeline.parallelize([1, 2, 3, 400, 5])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.SequenceFile(local_file))
    self._pipeline.run()

    result = self._pipeline.read(input.SequenceFile(local_file))
    self.assertItemsEqual([1, 2, 3, 400, 5], result.get())
def test_sequence_file_invalid(self):
    """ Case: test sequence file invalid """
    data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.TextFile(local_file))
    self._pipeline.run()

    result = self._pipeline.read(input.SequenceFile(local_file))
    with self.assertRaises(error.BigflowRuntimeException):
        result.get()
def test_overwrite(self):
    """ Test pipeline overwrite target path """
    p = self._pipeline.parallelize([1])
    self._pipeline.write(p, output.SequenceFile('test_output'))
    self._pipeline.run()
    self._pipeline.write(p, output.SequenceFile('test_output'))
    self._pipeline.run()

    p1 = self._pipeline.read(input.SequenceFile('test_output'))
    p.cache()
    p1.cache()
    self.assertEqual(p.get(), p1.get())

    shutil.rmtree('test_output')
    self.assertEqual(p.get(), p1.map(lambda x: x).get())
    self.assertFalse(os.path.exists('test_output'))
def parallelize(self, dataset, **options):
    """
    Map an in-memory variable to a PType instance.

    Args:
        dataset (object): an in-memory variable of any type
        options:
            serde: the serde object used to serialize ``dataset``

    Returns:
        PType: a PType representing the in-memory variable
    """
    objector = options.get("serde", self.default_objector())

    local_input_path = "./.local_input"
    if os.path.isfile(local_input_path):
        raise error.BigflowPlanningException(
            "file ./.local_input exists, cannot use it as temp directory")
    if not os.path.exists(local_input_path):
        os.makedirs(local_input_path)

    file_name = os.path.abspath(local_input_path + "/" + str(uuid.uuid4()))
    requests.write_record(file_name, utils.flatten_runtime_value(dataset), objector)
    self._local_temp_files.append(file_name)

    node = self.read(input.SequenceFile(file_name, **options)).node()

    nested_level, ptype = utils.detect_ptype(dataset)
    if nested_level < 0:
        return utils.construct(self, node, ptype)
    else:
        from bigflow.transform_impls import group_by
        for i in range(0, nested_level + 1):
            node = group_by.node_group_by(
                node,
                lambda x: x[0],
                lambda x: x[1] if len(x) == 2 else x[1:len(x)],
                key_serde=self.default_objector(),
                value_serde=self.default_objector())
        return utils.construct(self, node, ptable.PTable, nested_level, ptype)
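# Conceptual sketch (plain Python, not the actual Bigflow implementation) of
# what the group_by loop above does for a one-level nested dataset: flattened
# (key, value) records are grouped back by key so the result can be constructed
# as a PTable. The helper name and sample data are hypothetical.
def _group_by_sketch():
    flattened = [('a', 1), ('a', 2), ('b', 3)]
    grouped = {}
    for record in flattened:
        key = record[0]
        value = record[1] if len(record) == 2 else record[1:]
        grouped.setdefault(key, []).append(value)
    assert grouped == {'a': [1, 2], 'b': [3]}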
def test_commit(self):
    """ Case: test commit """
    self.setConfig(immediately_commit=True)

    vertex0 = self._pipeline.parallelize([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
    vertex1 = vertex0.group_by(lambda x: x) \
        .apply_values(lambda x: x.reduce(lambda x, y: x + y)) \
        .flatten()
    vertex1_output = self.generate_tmp_path()
    self._pipeline.write(vertex1, output.SequenceFile(vertex1_output))

    def _initializer(emitter):
        return []

    def _transformer(status, emitter, inp):
        import copy
        status.append(copy.deepcopy(inp))
        return status

    def _finalizer(status, emitter):
        # Deliberately raise ZeroDivisionError so the second vertex fails
        # after the first vertex's output has already been committed.
        emitter.emit(len(status) / 0)

    vertex2 = vertex1.group_by(lambda x: x[0] % 2, lambda x: x[1]) \
        .apply_values(lambda x: x.transform(_initializer, _transformer, _finalizer)) \
        .flatten()
    vertex2_output = self.generate_tmp_path()
    self._pipeline.write(vertex2, output.SequenceFile(vertex2_output))

    with self.assertRaises(ZeroDivisionError):
        self._pipeline.run()

    from bigflow import base
    local_pipeline = base.Pipeline.create(
        'local', hadoop_config_path=self._hadoop_config_path)
    result = local_pipeline.read(input.SequenceFile(vertex1_output))
    self.assertItemsEqual([(1, 2), (2, 4), (3, 6), (4, 8), (5, 10)], result.get())
def test_seq_file_new_api(self):
    """ Test sequence file new api """
    import os

    class KeySerde(serde.Serde):
        """key serde"""

        def serialize(self, obj):
            """serialize"""
            return str(obj + 1)

        def deserialize(self, buf):
            """deserialize"""
            return int(buf) - 1

    class ValueSerde(serde.Serde):
        """value serde"""

        def serialize(self, obj):
            """serialize"""
            return str(obj * 2)

        def deserialize(self, buf):
            """deserialize"""
            return int(buf) / 2

    tmp_file = "./.tmp/test_file_1"
    os.system("rm " + tmp_file + " -rf")

    input_data = [(2, 2), (1, 6)]
    d = self._pipeline.parallelize(input_data)
    self._pipeline.write(d, output.SequenceFile(
        tmp_file, key_serde=KeySerde(), value_serde=ValueSerde()))
    self._pipeline.run()

    read_data = self._pipeline.read(input.SequenceFile(
        tmp_file, key_serde=KeySerde(), value_serde=ValueSerde()))
    result_data = read_data.get()
    self.assertItemsEqual(input_data, result_data)
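# Standalone sketch of the contract the custom serdes above rely on:
# deserialize(serialize(x)) == x for every key and value written. Plain Python,
# no Bigflow required; the helper name is hypothetical.
def _serde_roundtrip_sketch():
    serialize = lambda obj: str(obj + 1)      # mirrors KeySerde.serialize
    deserialize = lambda buf: int(buf) - 1    # mirrors KeySerde.deserialize
    for key in (1, 2, 100):
        assert deserialize(serialize(key)) == key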
task_name, DATE)
# job_name = 'feed_production_day_relerec_state' + "_" + DATE

pipeline = base.Pipeline.create("local" if ISTEST else "DAGMR",
                                job_name=job_name,
                                tmp_data_path=afs_tmp,
                                hadoop_job_conf=job_conf)

# Core job logic
pipeline.add_file("./bigflow_python/proto/sample_pb2.py", "./sample_pb2.py")

# To run in local mode, run the code below first, then read from the local file:
# pipeline = base.Pipeline.create("DAGMR",
#                                 job_name=job_name,
#                                 tmp_data_path=afs_tmp,
#                                 hadoop_job_conf=job_conf)
# pbs = pipeline.read(input.SequenceFile(*input_path, serde=serde.StrSerde()))
# pipeline.write(pbs, output.SequenceFile(output_path, serde=serde.StrSerde()))
# pipeline.run()

pbs = pipeline.read(
    input.SequenceFile(*input_path, serde=serde.ProtobufSerde(sample_pb2.Sample)))

p = pbs.flat_map(emit_features) \
    .group_by(key_extractor=lambda x: x[0], value_extractor=lambda x: x[1]) \
    .apply_values(transforms.reduce, lambda a, b: map(operator.add, a, b)) \
    .flatten() \
    .map(lambda x: [x[0], x[1] + [float(x[1][2]) / x[1][1] if x[1][1] > 0 else 0]]) \
    .group_by(key_extractor=lambda x: x[0].split('\t')[0], value_extractor=lambda x: x[1]) \
    .apply_values(average) \
    .flatten() \
    .map(lambda x: '\t'.join(x[0].split('#') + map(str, x[1])))

# Output
pipeline.write(p, output.TextFile(output_path).partition(n=1))
pipeline.run()
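# Standalone sketch (plain Python) of the reduce step above: feature vectors for
# the same key are summed element-wise with map(operator.add, a, b), and the map
# that follows appends a ratio column computed from the summed vector. The helper
# name and sample values are hypothetical.
def _feature_sum_sketch():
    import operator
    a = [1.0, 10.0, 4.0]
    b = [2.0, 20.0, 6.0]
    summed = list(map(operator.add, a, b))  # element-wise sum: [3.0, 30.0, 10.0]
    ratio = float(summed[2]) / summed[1] if summed[1] > 0 else 0
    assert summed == [3.0, 30.0, 10.0]
    assert ratio == 10.0 / 30.0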