Example #1
    def test_add_remote_file_for_local(self):
        """test add remote file"""
        import subprocess
        prog_dir = self.generate_tmp_path()
        data_dir = self.generate_tmp_path()
        prog_list = ["#!/bin/env python", "#-*- coding:utf-8 -*-"]
        prog_list += ["def map(x):", "    return (x, 1)"]
        words = ["zhu", "xi", "da", "fa", "hao"]
        prog = self._pipeline.parallelize(prog_list)
        data = self._pipeline.parallelize(words)
        # set partition number to 1 so that "mv dir/*" has exactly one
        # source file when the target path is a regular file
        self._pipeline.write(prog, output.TextFile(prog_dir).partition(n=1))
        self._pipeline.write(data, output.TextFile(data_dir).partition(n=1))
        self._pipeline.run()

        # rename
        file_prog = os.path.join(prog_dir, "*")
        file_data = os.path.join(data_dir, "*")
        target_prog = os.path.join(prog_dir, "remote_map.py")
        target_data = os.path.join(data_dir, "words.txt")

        if self.running_on_filesystem == "local":
            # only file in the folder
            subprocess.Popen("mv %s %s" % (file_prog, target_prog),
                             shell=True).wait()
            subprocess.Popen("mv %s %s" % (file_data, target_data),
                             shell=True).wait()
        else:
            hadoop = "{bin} fs -conf {conf_path} -mv".format(
                bin=self._pipeline._config['hadoop_client_path'],
                conf_path=self._pipeline._config['hadoop_config_path'])
            subprocess.Popen("{hadoop} {source} {target}".format(
                hadoop=hadoop, source=file_prog, target=target_prog),
                             shell=True).wait()
            subprocess.Popen("{hadoop} {source} {target}".format(
                hadoop=hadoop, source=file_data, target=target_data),
                             shell=True).wait()

        self._pipeline.add_file(target_prog, "remote_map.py")
        self._pipeline.add_file(target_data, "words.txt")

        def _remote_map(x):
            """inner map"""
            local_words = []
            with open("words.txt") as fd:
                for line in fd:
                    local_words.append(line.strip())

            assert local_words == words, "local words and remote words not equal"
            import remote_map
            return remote_map.map(x)

        p_words = self._pipeline.parallelize(words)
        p_words_map = p_words.map(_remote_map)
        p_res = p_words_map.get()
        p_ori = map(lambda w: (w, 1), words)
        self.assertItemsEqual(p_ori, p_res)
Example #2
    def test_write_binary_set_by_user(self):
        # Set record delimiter by user
        raw_data = ["aaa", "bbb", "ccc"]
        special_record_delimiter = chr(2) + chr(3) + chr(4)
        record_delimiters = [
            "\t",
            "\r\n",
            special_record_delimiter,
        ]
        expect_data = [
            ["aaa\tbbb\tccc\t"],
            ["aaa", "bbb", "ccc"],
            [
                special_record_delimiter.join(raw_data) +
                special_record_delimiter
            ],
        ]

        self.tmp_output_dirs = []
        for record_delimiter in record_delimiters:
            data = self._pipeline.parallelize(raw_data)

            output_dir = self.generate_tmp_path()
            self.tmp_output_dirs.append(output_dir)

            self._pipeline.write(
                data,
                output.TextFile(output_dir, record_delimiter=record_delimiter))

        self._pipeline.run()

        for idx, output_dir in enumerate(self.tmp_output_dirs):
            self._compare_expect_data_and_output(expect_data[idx], output_dir)
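
These write-binary tests (together with test_write_binary_use_default and test_write_binary_none below) rely on a _compare_expect_data_and_output helper that is not included in the snippets. A minimal sketch of what such a helper might look like for local output; the part-* glob and the delimiter stripping are assumptions:

    def _compare_expect_data_and_output(self, expect_data, output_dir):
        """Hypothetical helper: collect the records written under output_dir
        and compare them, order-insensitively, with expect_data."""
        import glob
        import os
        records = []
        for path in glob.glob(os.path.join(output_dir, "part-*")):
            with open(path) as fd:
                # strip only trailing record delimiters made of \r/\n
                records.extend(line.rstrip("\r\n") for line in fd)
        self.assertItemsEqual(expect_data, records)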
Example #3
    def test_partition(self):
        """
        Test partition output
        """

        import os
        try:
            p = self._pipeline.parallelize(["1", "2", "3"])
            self._pipeline.write(p, output.TextFile('./output-1').partition(5))
            self._pipeline.write(
                p,
                output.SequenceFile('./output-2').partition(
                    2, lambda x, n: int(x) % n))
            self._pipeline.run()

            o1 = self._pipeline.read(
                input.SequenceFile('./output-2/part-00000'))
            o1.cache()
            o2 = self._pipeline.read(
                input.SequenceFile('./output-2/part-00001'))
            o2.cache()
            self.assertEqual(["2"], o1.get())
            self.assertItemsEqual(["1", "3"], o2.get())

            n = os.popen('ls output-1/[^_]* | wc -l').read()
            self.assertEqual(5, int(n))
            o = self._pipeline.read(input.TextFile('output-1')).get()
            self.assertItemsEqual(["1", "2", "3"], o)
        finally:
            os.system("rm output-1 output-2 -r")
Example #4
    def test_add_remote_file(self):
        """test add remote file"""
        prog_dir = self.generate_tmp_path()
        data_dir = self.generate_tmp_path()
        prog_list = ["#!/bin/env python", "#-*- coding:utf-8 -*-"]
        prog_list += ["def map(x):", "    return (x, 1)"]
        words = ["zhu", "xi", "da", "fa", "hao"]
        prog = self._pipeline.parallelize(prog_list)
        data = self._pipeline.parallelize(words)
        # set partition number to 1 so that all the data ends up in part-00000
        self._pipeline.write(prog, output.TextFile(prog_dir).partition(n=1))
        self._pipeline.write(data, output.TextFile(data_dir).partition(n=1))
        self._pipeline.run()

        target_prog = os.path.join(prog_dir, "part-00000")
        # Spark currently cannot handle cache files that share the same file
        # name. As a workaround, rename part-00000 to part-data here; revert
        # this once Spark supports cache files with identical names.
        origin_target_data = os.path.join(data_dir, "part-00000")
        if self.running_on_filesystem == "local":
            target_data = origin_target_data
        else:
            target_data = os.path.join(data_dir, "part-data")
            self._pipeline._client.fs_mv(origin_target_data, target_data)

        # Reset the pipeline so this also works on the Spark pipeline.
        # TODO: once spark_pipeline provides addFile, use it to handle
        # add_file requests after the first run.
        self.setConfig()
        self._pipeline.add_file(target_prog, "remote_map.py")
        self._pipeline.add_file(target_data, "words.txt")

        def _remote_map(x):
            """inner map"""
            local_words = []
            with open("words.txt") as fd:
                for line in fd:
                    local_words.append(line.strip())

            assert local_words == words, "local words and remote words not equal"
            import remote_map
            return remote_map.map(x)

        p_words = self._pipeline.parallelize(words)
        p_words_map = p_words.map(_remote_map)
        p_ori = p_words.map(lambda x: (x, 1))
        self.passertEqual(0, p_words_map.diff(p_ori).count())
Example #5
 def test_text_file(self):
     """
     Case: test text file
     """
     data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
     local_file = self.generate_tmp_path()
     self._pipeline.write(data, output.TextFile(local_file))
     self._pipeline.run()
     result = self._pipeline.read(input.TextFile(local_file))
     self.assertItemsEqual(['1', '2', '3', '400', '5'], result.get())
     # test read with \0 in the file
     null_data = data.map(lambda x: x + "\0")
     null_file = self.generate_tmp_path()
     self._pipeline.write(null_data, output.TextFile(null_file))
     self._pipeline.run()
     null_read = self._pipeline.read(input.TextFile(null_file))
     self.passertEqual(0, null_read.diff(null_data).count())
Example #6
 def test_write_binary_use_default(self):
     # Use default record delimiter
     raw_data = ["aaa", "bbb", "ccc"]
     data = self._pipeline.parallelize(raw_data)
     output_dir = self.generate_tmp_path()
     self._pipeline.write(data, output.TextFile(output_dir))
     self._pipeline.run()
     self._compare_expect_data_and_output(raw_data, output_dir)
Example #7
 def test_text_file_sync(self):
     """
     Case: test text file in synchronous mode
     """
     data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
     local_file = self.generate_tmp_path()
     self._pipeline.write(data, output.TextFile(local_file, async_mode=False))
     self._pipeline.run()
     result = self._pipeline.read(input.TextFile(local_file))
     self.assertItemsEqual(['1', '2', '3', '400', '5'], result.get())
Example #8
 def test_sequence_file_invalid(self):
     """
     Case: test sequence file invalid
     """
     data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
     local_file = self.generate_tmp_path()
     self._pipeline.write(data, output.TextFile(local_file))
     self._pipeline.run()
     result = self._pipeline.read(input.SequenceFile(local_file))
     with self.assertRaises(error.BigflowRuntimeException):
         result.get()
Example #9
 def test_text_file_sort(self):
     """
     Case: test text file sort
     """
     data = self._pipeline.parallelize([500, 2, 100, 600, 7])
     local_file = self.generate_tmp_path()
     self._pipeline.write(data, output.TextFile(local_file).sort())
     self._pipeline.run()
     data_file = local_file + '/part-00000'
     with open(data_file) as fd:
         self.assertItemsEqual(['2\n', '7\n', '100\n', '500\n', '600\n'],
                               fd.readlines())
Example #10
    def test_output_sort(self):
        """ test sorted output with partitioning """
        self.setConfig(spark_conf={
            "spark.default.parallelism": "1",
        })

        lines = self._pipeline.parallelize([5, 1, 2, 0, 3, 4])\
                .map(lambda x: str(x), serde=serde.of(str))

        out1_path = self.generate_tmp_path() + '/output-1/'
        out2_path = self.generate_tmp_path() + '/output-2/'
        self._pipeline.write(
            lines,
            output.TextFile(out1_path).sort().partition(
                n=2, partition_fn=lambda x, n: int(x) % n))
        self._pipeline.write(
            lines,
            output.TextFile(out2_path).sort(reverse=True).partition(
                n=2, partition_fn=lambda x, n: int(x) % n))
        self._pipeline.run()
        l11 = self._pipeline.read(input.TextFile(out1_path + '/part-00000'))\
                 .accumulate('', lambda x, y: x + y)
        l12 = self._pipeline.read(input.TextFile(out1_path + '/part-00001'))\
                 .accumulate('', lambda x, y: x + y)

        l21 = self._pipeline.read(input.TextFile(out2_path + '/part-00000'))\
                 .accumulate('', lambda x, y: x + y)
        l22 = self._pipeline.read(input.TextFile(out2_path + '/part-00001'))\
                 .accumulate('', lambda x, y: x + y)
        l11.cache()
        l12.cache()
        l21.cache()
        l22.cache()
        self.assertEqual('024', l11.get())
        self.assertEqual('135', l12.get())
        self.assertEqual('420', l21.get())
        self.assertEqual('531', l22.get())
Example #11
 def wildcard_case(self):
     """
     Case: test wildcard
     """
     input_data = [['1', '2', '3'], ['400', '5'], ['401', '501']]
     index = 0
     root_path = self.generate_tmp_path()
     for tmp_data in input_data:
         data = self._pipeline.parallelize(tmp_data)
         path = root_path + '/X' + str(index)
         self._pipeline.write(data, output.TextFile(path))
         index = index + 1
     self._pipeline.run()
     match_path = root_path + '/*'
     result = self._pipeline.read(input.TextFile(match_path))
     self.assertItemsEqual(['401', '501', '1', '2', '3', '400', '5'], result.get())
Example #12
    def test_broadcast(self):
        """
        Case: test broadcast (side inputs)
        """
        output_path = self.generate_tmp_path()
        pc  = self._pipeline.parallelize([1,2,3])
        pc1 = self._pipeline.parallelize([1,2,3])
        pc2 = self._pipeline.parallelize([4,2,6])
        pc3 = pc.map(lambda x,y,z:(x, (x in y) and (x in z)), pc1, pc2)
        pc4 = pc3.map(lambda x:"\t".join(map(str, x)))
        self._pipeline.write(pc4, output.TextFile(output_path).partition(n=2))
        self._pipeline.run()

        parts = ['part-00000', 'part-00001']
        input_path = map(lambda path:os.path.join(output_path, path), parts)
        result = self._pipeline.read(input.TextFile(*input_path))
        target = ['1\tFalse', '2\tTrue', '3\tFalse']
        self.assertItemsEqual(result.get(), target)
Example #13
    def test_gzip_file(self):
        """
        Test read/write gzip files
        """
        import os
        try:
            mem_testdata = ['1', '2', '3']
            p = self._pipeline.parallelize(mem_testdata)
            target = output.TextFile('output-gzip').with_compression(
                "gzip").partition(2)
            self._pipeline.write(p, target)
            self._pipeline.run()
            self.assertTrue(os.path.isdir('output-gzip'))

            read = os.popen('gzip -cd output-gzip/*').read()
            self.assertItemsEqual(mem_testdata, read.rstrip('\n').split('\n'))

        finally:
            os.system("rm output-gzip -r")
Example #14
    def test_write_binary_none(self):
        # Don't set record_delimiter, write binary.
        chars = [chr(i) for i in xrange(1, 10)]
        from random import shuffle
        from random import randint
        shuffle(chars)

        raw_data = []
        for cnt in xrange(100):
            index1 = randint(0, len(chars) - 1)
            index2 = randint(0, len(chars) - 1)
            raw_data.append("".join(
                chars[min(index1, index2):max(index1, index2)]))

        data = self._pipeline.parallelize(raw_data)
        output_dir = self.generate_tmp_path()
        self._pipeline.write(
            data, output.TextFile(output_dir, record_delimiter=None))
        self._pipeline.run()
        expect_data = ["".join(raw_data)]
        self._compare_expect_data_and_output(expect_data, output_dir)
Example #15
Compute the UV of each website (the number of distinct visitors):
g.cn    1
qq.com  2
baidu.com   3
163.com 1
"""

import os
from bigflow import base, input, output, transforms


# the input is a PCollection; apply distinct and count to get each website's UV
def count_distinct(p):
    return p.distinct().count()


# create the pipeline
_pipeline = base.Pipeline.create("LOCAL")

dir = os.path.dirname(os.path.abspath(__file__)) + "/data"
input_path = dir + "/" + "uv.text"
# read the input and format it
col = _pipeline.read(input.TextFile(input_path))
col = col.map(lambda x: x.split())
# group by website and compute the UV for each website
col = col.group_by_key().apply_values(count_distinct).flatten()
col = col.map(lambda x: x[0] + "\t" + str(x[1]))
# write the output
_pipeline.write(col, output.TextFile("/tmp/website_uv"))
_pipeline.run()
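
For reference, one possible content of data/uv.text that is consistent with the counts listed above; the visitor ids are hypothetical, and each line is a website followed by a visitor, separated by whitespace:

g.cn    user1
qq.com  user1
qq.com  user2
baidu.com   user1
baidu.com   user2
baidu.com   user3
163.com user1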
Example #16
    def end(self):
        record = (self._word, self._sum)
        self._emitter.emit(record)


pipeline = base.Pipeline.create('local')
plan = pipeline.plan()
plan.set_environment(entity.PythonEnvironment())

input_path = sys.path[0] + "/" + __file__
input_urls = [input_path]
output_path = sys.path[0] + "/" + "output"

single_word = plan.load(input_urls)\
        .by(input.TextFile(input_urls[0]).input_format).as_type(record_objector.RecordObjector())\
        .process_by(PythonFromRecordProcessor()).as_type(serde.any())\
        .process_by(WordSpliter()).as_type(serde.any())

result = plan.shuffle(single_word.scope(), [single_word])\
        .with_concurrency(10)\
        .node(0).match_by(WordIdentity(lambda x: x[0], serde.any()))\
        .process_by(WordCount()).as_type(serde.any())\
        .input(0).allow_partial_processing().done()\
        .process_by(WordCount()).as_type(serde.any())

plan.shuffle(plan.global_scope(), [result]).node(0).distribute_by_default()\
        .process_by(PythonToRecordProcessor()).as_type(record_objector.RecordObjector())\
        .sink_by(output.TextFile(output_path).output_format)

pipeline.run()
Example #17
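# `spark_conf` is used below but never defined in this snippet; a plausible
# definition is sketched here, and the property values are assumptions for
# illustration only:
spark_conf = {
    "spark.app.name": "bigflow_spark_example",
    "spark.executor.memory": "2g",
}
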
pipeline = base.Pipeline.create(
    # specify the compute engine as "spark" or "SPARK"
    "spark",

    # specify tmp_data_path
    tmp_data_path="hdfs:///app/dc/bigflow/tmp",

    # specify the Spark configuration
    spark_conf=spark_conf,

    # default_concurrency is not required; this example's data volume is small,
    # so a smaller value is fine
    default_concurrency=250,
)

#case_str = "case4_2"
input_path = sys.argv[1]
output_path = sys.argv[2]

# P types can also be constructed via parallelize

data = pipeline.read(input.TextFile(input_path))
# apply transforms on the P type
result = data.map(lambda x: x.split()).group_by_key()\
        .apply_values(lambda x: x.max_elements(5, lambda x: x)).flatten()\
        .map(lambda t: "%s %s" % (t[0], t[1]))

# the current preview version does not support get(); P types can only be
# written to the filesystem via the pipeline's write method
pipeline.write(result, output.TextFile(output_path))
pipeline.run()
Example #18
    def test_schema_text_file(self):
        """
        Case: test schema text file
        """
        data = self._pipeline.parallelize(
            ['www.baidu.com,3', 'www.sina.com,6'])
        local_file = self.generate_tmp_path()
        self._pipeline.write(data, output.TextFile(local_file))
        self._pipeline.run()
        result_dict_sd = self._pipeline.read(
            input.SchemaTextFile(local_file,
                                 columns=[("website", str), ("clicknum", int)],
                                 separator=","))
        result_dict = self._pipeline.read(
            input.SchemaTextFile(local_file,
                                 columns=["website", "clicknum"],
                                 separator=","))
        result_tuple = self._pipeline.read(
            input.SchemaTextFile(local_file, columns=2, separator=","))
        result_tuple_type = self._pipeline.read(
            input.SchemaTextFile(local_file, columns=[str, int],
                                 separator=","))

        expect_dict_sd = \
        [{'clicknum': 3, 'website': 'www.baidu.com'},
         {'clicknum': 6, 'website': 'www.sina.com'}]
        self.assertItemsEqual(expect_dict_sd, result_dict_sd.get())

        expect_dict = \
        [{'clicknum': '3', 'website': 'www.baidu.com'},
         {'clicknum': '6', 'website': 'www.sina.com'}]
        self.assertItemsEqual(expect_dict, result_dict.get())

        expect_tuple = \
        [('www.baidu.com', '3'), ('www.sina.com', '6')]
        self.assertItemsEqual(expect_tuple, result_tuple.get())

        expect_tuple_type = \
        [('www.baidu.com', 3), ('www.sina.com', 6)]
        self.assertItemsEqual(expect_tuple_type, result_tuple_type.get())

        self._pipeline.write(
            result_dict,
            output.SchemaTextFile(local_file,
                                  columns=["website", "clicknum"],
                                  separator=","))
        self._pipeline.run()
        result_dict = self._pipeline.read(
            input.SchemaTextFile(local_file,
                                 columns=["website", "clicknum"],
                                 separator=","))
        self.assertItemsEqual(expect_dict, result_dict.get())

        self._pipeline.write(
            result_dict_sd,
            output.SchemaTextFile(local_file,
                                  columns=["website", "clicknum"],
                                  separator=","))
        self._pipeline.run()
        result_dict_sd = self._pipeline.read(
            input.SchemaTextFile(local_file,
                                 columns=[("website", str), ("clicknum", int)],
                                 separator=","))
        self.assertItemsEqual(expect_dict_sd, result_dict_sd.get())

        self._pipeline.write(result_tuple, output.SchemaTextFile(local_file))
        self._pipeline.run()
        result_tuple = self._pipeline.read(
            input.SchemaTextFile(local_file, columns=2))
        self.assertItemsEqual(expect_tuple, result_tuple.get())

        result_tuple_type = result_tuple_type.map(lambda (w, c): (w, c))
        self._pipeline.write(result_tuple_type,
                             output.SchemaTextFile(local_file, columns=2))
        self._pipeline.run()
        result_tuple_type = self._pipeline.read(
            input.SchemaTextFile(local_file, columns=[str, int]))
        self.assertItemsEqual(expect_tuple_type, result_tuple_type.get())
Example #19
                                    task_name, DATE)
    #job_name = 'feed_production_day_relerec_state' + "_" + DATE
    pipeline = base.Pipeline.create("local" if ISTEST else "DAGMR",
                                    job_name=job_name,
                                    tmp_data_path=afs_tmp,
                                    hadoop_job_conf=job_conf)
    # core task logic
    pipeline.add_file("./bigflow_python/proto/sample_pb2.py",
                      "./sample_pb2.py")

    # to run in local mode, run code below first, then read from local file
    #pipeline = base.Pipeline.create("DAGMR",
    #        job_name=job_name,
    #        tmp_data_path=afs_tmp,
    #        hadoop_job_conf=job_conf)
    #pbs = pipeline.read(input.SequenceFile(*input_path, serde=serde.StrSerde()))
    #pipeline.write(pbs, output.SequenceFile(output_path, serde=serde.StrSerde()))
    #pipeline.run()
    pbs = pipeline.read(
        input.SequenceFile(*input_path,
                           serde=serde.ProtobufSerde(sample_pb2.Sample)))
    p = pbs.flat_map(emit_features)\
        .group_by(key_extractor=lambda x:x[0], value_extractor=lambda x:x[1])\
        .apply_values(transforms.reduce, lambda a,b: map(operator.add, a, b)).flatten()\
        .map(lambda x: [x[0], x[1] + [float(x[1][2]) / (x[1][1]) if x[1][1] > 0 else 0]])\
        .group_by(key_extractor=lambda x:x[0].split('\t')[0], value_extractor=lambda x:x[1])\
        .apply_values(average).flatten().map(lambda x: '\t'.join(x[0].split('#') + map(str, x[1])))

    # output
    pipeline.write(p, output.TextFile(output_path).partition(n=1))
    pipeline.run()