Example #1
    def test_partition(self):
        """
        Test partition output
        """

        import os
        try:
            p = self._pipeline.parallelize(["1", "2", "3"])
            self._pipeline.write(p, output.TextFile('./output-1').partition(5))
            self._pipeline.write(
                p,
                output.SequenceFile('./output-2').partition(
                    2, lambda x, n: int(x) % n))
            self._pipeline.run()

            o1 = self._pipeline.read(
                input.SequenceFile('./output-2/part-00000'))
            o1.cache()
            o2 = self._pipeline.read(
                input.SequenceFile('./output-2/part-00001'))
            o2.cache()
            self.assertEqual(["2"], o1.get())
            self.assertItemsEqual(["1", "3"], o2.get())

            n = os.popen('ls output-1/[^_]* | wc -l').read()
            self.assertEqual(5, int(n))
            o = self._pipeline.read(input.TextFile('output-1')).get()
            self.assertItemsEqual(["1", "2", "3"], o)
        finally:
            os.system("rm output-1 output-2 -r")
Example #2
    def test_sequence_file_serde(self):
        """
        Case: test sequence file serde
        """
        data = self._pipeline.parallelize([1, 2, 3, 400, 5])
        local_file = self.generate_tmp_path()
        self._pipeline.write(
            data, output.SequenceFile(local_file, serde=serde.IntSerde()))
        self._pipeline.run()
        result = self._pipeline.read(
            input.SequenceFile(local_file, serde=serde.IntSerde()))
        self.assertItemsEqual([1, 2, 3, 400, 5], result.get())
        result_invalid = self._pipeline.read(
            input.SequenceFile(local_file, serde=serde.TupleSerde()))
        with self.assertRaises(error.BigflowRuntimeException):
            result_invalid.get()
Example #3
    def test_seqfile(self):
        """
        Test sequencefile io
        """
        tmp_file = "./.tmp/test_tmp"

        key1 = 123
        value1 = ["A", "B", "C"]

        key2 = 456
        value2 = ["D", "E", "F"]

        input_data = [(key1, value1), (key2, value2)]

        def kv_serializer(record):
            return str(record[0]), ":".join(record[1])

        pcollection_kv = self._pipeline.parallelize(input_data)
        self._pipeline.write(
            pcollection_kv,
            output.SequenceFile(tmp_file).as_type(kv_serializer))

        self._pipeline.run()

        def kv_deserializer(tp):
            return int(tp[0]), tp[1].split(":")

        result = self._pipeline.read(
            input.SequenceFile(tmp_file).as_type(kv_deserializer))
        result_data = result.get()

        self.assertItemsEqual(input_data, result_data)
Example #4
    def getResultWithSequence(self, pipeline_status, path):
        """ no comments """
        pipeline_status.wait_status("APP_RUN")
        import time
        time.sleep(300)

        local_pipeline = base.Pipeline.create('local')
        result = local_pipeline.read(input.SequenceFile(path))
        return result.get()
Example #5
    def test_sequence_file(self):
        """
        Case: test sequence file
        """
        data = self._pipeline.parallelize([1, 2, 3, 400, 5])
        local_file = self.generate_tmp_path()
        self._pipeline.write(data, output.SequenceFile(local_file))
        self._pipeline.run()
        result = self._pipeline.read(input.SequenceFile(local_file))
        self.assertItemsEqual([1, 2, 3, 400, 5], result.get())
Example #6
    def test_sequence_file_invalid(self):
        """
        Case: test sequence file invalid
        """
        data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
        local_file = self.generate_tmp_path()
        self._pipeline.write(data, output.TextFile(local_file))
        self._pipeline.run()
        result = self._pipeline.read(input.SequenceFile(local_file))
        with self.assertRaises(error.BigflowRuntimeException):
            result.get()
Example #7
    def test_overwrite(self):
        """
        Test pipeline overwrite target path
        """
        import os
        import shutil

        p = self._pipeline.parallelize([1])
        self._pipeline.write(p, output.SequenceFile('test_output'))
        self._pipeline.run()
        self._pipeline.write(p, output.SequenceFile('test_output'))
        self._pipeline.run()
        p1 = self._pipeline.read(input.SequenceFile('test_output'))
        p.cache()
        p1.cache()
        self.assertEqual(p.get(), p1.get())
        shutil.rmtree('test_output')
        self.assertEqual(p.get(), p1.map(lambda x: x).get())
        self.assertFalse(os.path.exists('test_output'))
Example #8
    def parallelize(self, dataset, **options):
        """
        将一段内存变量映射为一个P类型实例

        Args:
          dataset (object):  任意类型的内存变量
          options:
                serde: 设置dataset的serde对象

        Returns:
          PType:  表示该内存变量的P类型
        """
        objector = options.get("serde", self.default_objector())

        local_input_path = "./.local_input"
        if os.path.isfile(local_input_path):
            raise error.BigflowPlanningException(
                "file ./.local_input exist, "
                "cannot use it as temp directory")
        if not os.path.exists(local_input_path):
            os.makedirs(local_input_path)

        file_name = os.path.abspath(local_input_path + "/" + str(uuid.uuid4()))
        requests.write_record(file_name, utils.flatten_runtime_value(dataset),
                              objector)

        self._local_temp_files.append(file_name)

        node = self.read(input.SequenceFile(file_name, **options)).node()

        nested_level, ptype = utils.detect_ptype(dataset)

        if nested_level < 0:
            return utils.construct(self, node, ptype)
        else:
            from bigflow.transform_impls import group_by

            for i in range(0, nested_level + 1):
                node = group_by.node_group_by(
                    node,
                    lambda x: x[0],
                    lambda x: x[1] if len(x) == 2 else x[1:len(x)],
                    key_serde=self.default_objector(),
                    value_serde=self.default_objector())

            return utils.construct(self, node, ptable.PTable, nested_level,
                                   ptype)
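
A minimal usage sketch of parallelize, assuming a pipeline created with
base.Pipeline.create('local'); the variable names are illustrative and the
expected outputs follow the PType detection implemented above:

    # flat data is mapped to a PCollection
    numbers = pipeline.parallelize([1, 2, 3])
    print numbers.get()   # expected: [1, 2, 3] (order not guaranteed)

    # nested data (a dict of lists) is mapped to a PTable with one nesting level
    table = pipeline.parallelize({"a": [1, 2], "b": [3]})
    print table.get()     # expected: {"a": [1, 2], "b": [3]}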
Example #9
    def test_commit(self):
        """
        Case: test commit
        """
        self.setConfig(immediately_commit=True)

        vertex0 = self._pipeline.parallelize([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])

        vertex1 = vertex0.group_by(lambda x: x) \
                .apply_values(lambda x: x.reduce(lambda x, y: x + y)) \
                .flatten()

        vertex1_output = self.generate_tmp_path()
        self._pipeline.write(vertex1, output.SequenceFile(vertex1_output))

        def _initializer(emitter):
            return []

        def _transformer(status, emitter, inp):
            import copy
            status.append(copy.deepcopy(inp))
            return status

        def _finalizer(status, emitter):
            emitter.emit(len(status) / 0)

        vertex2 = vertex1.group_by(lambda x: x[0] % 2, lambda x: x[1]) \
                .apply_values(lambda x: x.transform(_initializer, _transformer, _finalizer)) \
                .flatten()

        vertex2_output = self.generate_tmp_path()
        self._pipeline.write(vertex2, output.SequenceFile(vertex2_output))
        with self.assertRaises(ZeroDivisionError):
            self._pipeline.run()

        from bigflow import base
        local_pipeline = base.Pipeline.create(
            'local', hadoop_config_path=self._hadoop_config_path)
        result = local_pipeline.read(input.SequenceFile(vertex1_output))
        self.assertItemsEqual([(1, 2), (2, 4), (3, 6), (4, 8), (5, 10)],
                              result.get())
Example #10
    def test_seq_file_new_api(self):
        """
        test sequence file new api
        """
        import os

        class KeySerde(serde.Serde):
            """value serde"""
            def serialize(self, obj):
                """serialize"""
                return str(obj + 1)

            def deserialize(self, buf):
                """deserialize"""
                return int(buf) - 1

        class ValueSerde(serde.Serde):
            """value serde"""
            def serialize(self, obj):
                """serialize"""
                return str(obj * 2)

            def deserialize(self, buf):
                """deserialize"""
                return int(buf) / 2

        tmp_file = "./.tmp/test_file_1"
        os.system("rm " + tmp_file + " -rf")
        input_data = [(2, 2), (1, 6)]
        d = self._pipeline.parallelize(input_data)
        self._pipeline.write(d, output.SequenceFile(
            tmp_file, key_serde=KeySerde(), value_serde=ValueSerde()))
        self._pipeline.run()

        read_data = self._pipeline.read(input.SequenceFile(
            tmp_file, key_serde=KeySerde(), value_serde=ValueSerde()))
        result_data = read_data.get()
        self.assertItemsEqual(input_data, result_data)
Example #11
                                    task_name, DATE)
    #job_name = 'feed_production_day_relerec_state' + "_" + DATE
    pipeline = base.Pipeline.create("local" if ISTEST else "DAGMR",
                                    job_name=job_name,
                                    tmp_data_path=afs_tmp,
                                    hadoop_job_conf=job_conf)
    # core job logic
    pipeline.add_file("./bigflow_python/proto/sample_pb2.py",
                      "./sample_pb2.py")

    # to run in local mode, run code below first, then read from local file
    #pipeline = base.Pipeline.create("DAGMR",
    #        job_name=job_name,
    #        tmp_data_path=afs_tmp,
    #        hadoop_job_conf=job_conf)
    #pbs = pipeline.read(input.SequenceFile(*input_path, serde=serde.StrSerde()))
    #pipeline.write(pbs, output.SequenceFile(output_path, serde=serde.StrSerde()))
    #pipeline.run()
    pbs = pipeline.read(
        input.SequenceFile(*input_path,
                           serde=serde.ProtobufSerde(sample_pb2.Sample)))
    p = pbs.flat_map(emit_features)\
        .group_by(key_extractor=lambda x: x[0], value_extractor=lambda x: x[1])\
        .apply_values(transforms.reduce, lambda a, b: map(operator.add, a, b)).flatten()\
        .map(lambda x: [x[0], x[1] + [float(x[1][2]) / (x[1][1]) if x[1][1] > 0 else 0]])\
        .group_by(key_extractor=lambda x: x[0].split('\t')[0], value_extractor=lambda x: x[1])\
        .apply_values(average).flatten()\
        .map(lambda x: '\t'.join(x[0].split('#') + map(str, x[1])))

    # output
    pipeline.write(p, output.TextFile(output_path).partition(n=1))
    pipeline.run()