def test_seqfile(self):
    """ Test SequenceFile I/O """
    tmp_file = "./.tmp/test_tmp"

    key1 = 123
    value1 = ["A", "B", "C"]
    key2 = 456
    value2 = ["D", "E", "F"]
    input_data = [(key1, value1), (key2, value2)]

    def kv_serializer(record):
        # Serialize a (key, value_list) record into a (key_str, value_str) pair.
        return str(record[0]), ":".join(record[1])

    pcollection_kv = self._pipeline.parallelize(input_data)
    self._pipeline.write(
        pcollection_kv,
        output.SequenceFile(tmp_file).as_type(kv_serializer))
    self._pipeline.run()

    def kv_deserializer(tp):
        # Invert kv_serializer: parse the key and split the joined values.
        return int(tp[0]), tp[1].split(":")

    result = self._pipeline.read(
        input.SequenceFile(tmp_file).as_type(kv_deserializer))
    result_data = result.get()
    self.assertItemsEqual(input_data, result_data)
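# A minimal sketch (not part of the original suite) of the contract that
# as_type() relies on above: the serializer maps each record to a
# (key_str, value_str) pair, and the deserializer must invert it exactly.
#
#     def kv_serializer(record):
#         return str(record[0]), ":".join(record[1])
#
#     def kv_deserializer(tp):
#         return int(tp[0]), tp[1].split(":")
#
#     assert kv_deserializer(kv_serializer((123, ["A", "B"]))) == (123, ["A", "B"])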
def test_partition(self):
    """ Test partitioned output """
    import os
    try:
        p = self._pipeline.parallelize(["1", "2", "3"])
        # Split into 5 partitions with the default partitioner.
        self._pipeline.write(p, output.TextFile('./output-1').partition(5))
        # Split into 2 partitions keyed by the record value modulo the
        # partition count.
        self._pipeline.write(
            p,
            output.SequenceFile('./output-2').partition(
                2, lambda x, n: int(x) % n))
        self._pipeline.run()

        o1 = self._pipeline.read(
            input.SequenceFile('./output-2/part-00000'))
        o1.cache()
        o2 = self._pipeline.read(
            input.SequenceFile('./output-2/part-00001'))
        o2.cache()
        self.assertEqual(["2"], o1.get())
        self.assertItemsEqual(["1", "3"], o2.get())

        # Count the non-meta part files to confirm 5 partitions were written.
        n = os.popen('ls output-1/[^_]* | wc -l').read()
        self.assertEqual(5, int(n))
        o = self._pipeline.read(input.TextFile('output-1')).get()
        self.assertItemsEqual(["1", "2", "3"], o)
    finally:
        os.system("rm -r output-1 output-2")
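# How the custom partitioner above routes records (a sketch; it assumes the
# callback receives (record, num_partitions) and returns the target partition
# index, matching the assertions in this test):
#
#     partitioner = lambda x, n: int(x) % n
#     partitioner("2", 2)  # 0 -> output-2/part-00000
#     partitioner("1", 2)  # 1 -> output-2/part-00001
#     partitioner("3", 2)  # 1 -> output-2/part-00001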
def test_overwrite(self):
    """ Test that writing to an existing target path overwrites it """
    p = self._pipeline.parallelize([1])
    self._pipeline.write(p, output.SequenceFile('test_output'))
    self._pipeline.run()
    # Writing to the same path again should overwrite, not fail or append.
    self._pipeline.write(p, output.SequenceFile('test_output'))
    self._pipeline.run()

    p1 = self._pipeline.read(input.SequenceFile('test_output'))
    p.cache()
    p1.cache()
    self.assertEqual(p.get(), p1.get())

    # The cached data must stay readable after the files are removed.
    shutil.rmtree('test_output')
    self.assertEqual(p.get(), p1.map(lambda x: x).get())
    self.assertFalse(os.path.exists('test_output'))
def test_sequence_file(self):
    """ Case: test sequence file """
    data = self._pipeline.parallelize([1, 2, 3, 400, 5])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.SequenceFile(local_file))
    self._pipeline.run()

    result = self._pipeline.read(input.SequenceFile(local_file))
    self.assertItemsEqual([1, 2, 3, 400, 5], result.get())
def test_commit(self):
    """ Case: test immediate commit """
    self.setConfig(immediately_commit=True)
    vertex0 = self._pipeline.parallelize([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
    vertex1 = vertex0.group_by(lambda x: x) \
        .apply_values(lambda x: x.reduce(lambda x, y: x + y)) \
        .flatten()
    vertex1_output = self.generate_tmp_path()
    self._pipeline.write(vertex1, output.SequenceFile(vertex1_output))

    def _initializer(emitter):
        return []

    def _transformer(status, emitter, inp):
        import copy
        status.append(copy.deepcopy(inp))
        return status

    def _finalizer(status, emitter):
        # Deliberately divide by zero so that vertex2 fails at runtime.
        emitter.emit(len(status) / 0)

    vertex2 = vertex1.group_by(lambda x: x[0] % 2, lambda x: x[1]) \
        .apply_values(lambda x: x.transform(_initializer, _transformer, _finalizer)) \
        .flatten()
    vertex2_output = self.generate_tmp_path()
    self._pipeline.write(vertex2, output.SequenceFile(vertex2_output))

    with self.assertRaises(ZeroDivisionError):
        self._pipeline.run()

    # vertex1 finished before vertex2 failed, so its output should already be
    # committed and readable by a fresh local pipeline.
    from bigflow import base
    local_pipeline = base.Pipeline.create(
        'local', hadoop_config_path=self._hadoop_config_path)
    result = local_pipeline.read(input.SequenceFile(vertex1_output))
    self.assertItemsEqual([(1, 2), (2, 4), (3, 6), (4, 8), (5, 10)], result.get())
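# Why vertex1's output survives the failed run (the behavior this test
# encodes): with immediately_commit=True, a node's output is committed as soon
# as that node finishes, so vertex1's SequenceFile is on disk even though
# vertex2's _finalizer raises ZeroDivisionError and aborts the run.
#
#     # Expected vertex1 content: every key k appears twice, so sum == 2 * k.
#     expected = [(k, 2 * k) for k in [1, 2, 3, 4, 5]]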
def test_sequence_file_serde(self):
    """ Case: test sequence file with an explicit serde """
    data = self._pipeline.parallelize([1, 2, 3, 400, 5])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.SequenceFile(local_file, serde=serde.IntSerde()))
    self._pipeline.run()

    result = self._pipeline.read(input.SequenceFile(local_file, serde=serde.IntSerde()))
    self.assertItemsEqual([1, 2, 3, 400, 5], result.get())

    # Reading back with a mismatched serde must fail at evaluation time.
    result_invalid = self._pipeline.read(
        input.SequenceFile(local_file, serde=serde.TupleSerde()))
    with self.assertRaises(error.BigflowRuntimeException):
        result_invalid.get()
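# Note (inferred from this test's expectations): a serde mismatch is not
# detected by read() itself; the BigflowRuntimeException only surfaces when
# get() forces evaluation, which is why assertRaises wraps get(), not read().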
def test_seq_file_new_api(self):
    """ Test the sequence file key/value serde API """
    import os

    class KeySerde(serde.Serde):
        """key serde"""

        def serialize(self, obj):
            """serialize"""
            return str(obj + 1)

        def deserialize(self, buf):
            """deserialize"""
            return int(buf) - 1

    class ValueSerde(serde.Serde):
        """value serde"""

        def serialize(self, obj):
            """serialize"""
            return str(obj * 2)

        def deserialize(self, buf):
            """deserialize"""
            return int(buf) / 2

    tmp_file = "./.tmp/test_file_1"
    os.system("rm -rf " + tmp_file)

    input_data = [(2, 2), (1, 6)]
    d = self._pipeline.parallelize(input_data)
    self._pipeline.write(d, output.SequenceFile(
        tmp_file, key_serde=KeySerde(), value_serde=ValueSerde()))
    self._pipeline.run()

    read_data = self._pipeline.read(input.SequenceFile(
        tmp_file, key_serde=KeySerde(), value_serde=ValueSerde()))
    result_data = read_data.get()
    self.assertItemsEqual(input_data, result_data)
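# Round-trip property the serde pair above must satisfy (a sketch; serialize
# and deserialize are the Serde interface methods this test exercises):
#
#     ks, vs = KeySerde(), ValueSerde()
#     ks.deserialize(ks.serialize(2))  # str(2 + 1) = "3";  int("3") - 1 = 2
#     vs.deserialize(vs.serialize(6))  # str(6 * 2) = "12"; int("12") / 2 = 6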