Example #1
    def test_gzip_file_case_2(self):
        expect = [
            'wan', 'cheng', 'Hello, world!', 'Hello, toft!', 'Hello, flume!'
        ]

        p = self._pipeline.read(input.TextFile('testdata/gzip'))
        self.passertEqual(expect, p)

        p = self._pipeline.read(
            input.TextFile('testdata/gzip/part-00000.gz',
                           'testdata/gzip/part-00001.gz'))
        self.passertEqual(expect, p)

        p = self._pipeline.read(input.TextFile('testdata/gzip/*'))
        self.passertEqual(expect, p)
Example #2
    def test_partition(self):
        """
        Test partition output
        """

        import os
        try:
            p = self._pipeline.parallelize(["1", "2", "3"])
            self._pipeline.write(p, output.TextFile('./output-1').partition(5))
            self._pipeline.write(
                p,
                output.SequenceFile('./output-2').partition(
                    2, lambda x, n: int(x) % n))
            self._pipeline.run()

            o1 = self._pipeline.read(
                input.SequenceFile('./output-2/part-00000'))
            o1.cache()
            o2 = self._pipeline.read(
                input.SequenceFile('./output-2/part-00001'))
            o2.cache()
            self.assertEqual(["2"], o1.get())
            self.assertItemsEqual(["1", "3"], o2.get())

            n = os.popen('ls output-1/[^_]* | wc -l').read()
            self.assertEqual(5, int(n))
            o = self._pipeline.read(input.TextFile('output-1')).get()
            self.assertItemsEqual(["1", "2", "3"], o)
        finally:
            os.system("rm output-1 output-2 -r")
Example #3
 def test_text_file(self):
     """
     Case: test text file
     """
     data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
     local_file = self.generate_tmp_path()
     self._pipeline.write(data, output.TextFile(local_file))
     self._pipeline.run()
     result = self._pipeline.read(input.TextFile(local_file))
     self.assertItemsEqual(['1', '2', '3', '400', '5'], result.get())
     # test read with \0 in the file
     null_data = data.map(lambda x: x + "\0")
     null_file = self.generate_tmp_path()
     self._pipeline.write(null_data, output.TextFile(null_file))
     self._pipeline.run()
     null_read = self._pipeline.read(input.TextFile(null_file))
     self.passertEqual(0, null_read.diff(null_data).count())
Example #4
    def getResultWithText(self, pipeline_status, path):
        """ no comments """
        pipeline_status.wait_status("APP_RUN")
        import time
        time.sleep(300)

        local_pipeline = base.Pipeline.create('local')
        result = local_pipeline.read(input.TextFile(path))
        return result.get()
Example #5
 def test_text_file_sync(self):
     """
     Case: test text file
     """
     data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
     local_file = self.generate_tmp_path()
     self._pipeline.write(data, output.TextFile(local_file, async_mode=False))
     self._pipeline.run()
     result = self._pipeline.read(input.TextFile(local_file))
     self.assertItemsEqual(['1', '2', '3', '400', '5'], result.get())
Example #6
 def test_partitioned(self):
     """
     Case: test reading text files as a partitioned input
     """
     testdata = self._pipeline.read(input.TextFile(
             'testdata/part-00000',
             'testdata/part-00001',
             partitioned=True))
     expect = {'x': ['1', '2', '3'], 'y': ['4', '5', '6', '7']}
     table = self._pipeline.parallelize(expect)
     self.assertEqual(repr(table), repr(testdata))
     self.assertItemsEqual(table.flatten_values().get(), testdata.flatten_values().get())
Example #7
    def test_output_sort(self):
        """ test sorted and partitioned output """
        self.setConfig(spark_conf={
            "spark.default.parallelism": "1",
        })

        lines = self._pipeline.parallelize([5, 1, 2, 0, 3, 4])\
                .map(lambda x: str(x), serde=serde.of(str))

        out1_path = self.generate_tmp_path() + '/output-1/'
        out2_path = self.generate_tmp_path() + '/output-2/'
        self._pipeline.write(
            lines,
            output.TextFile(out1_path).sort().partition(
                n=2, partition_fn=lambda x, n: int(x) % n))
        self._pipeline.write(
            lines,
            output.TextFile(out2_path).sort(reverse=True).partition(
                n=2, partition_fn=lambda x, n: int(x) % n))
        self._pipeline.run()
        l11 = self._pipeline.read(input.TextFile(out1_path + '/part-00000'))\
                 .accumulate('', lambda x, y: x + y)
        l12 = self._pipeline.read(input.TextFile(out1_path + '/part-00001'))\
                 .accumulate('', lambda x, y: x + y)

        l21 = self._pipeline.read(input.TextFile(out2_path + '/part-00000'))\
                 .accumulate('', lambda x, y: x + y)
        l22 = self._pipeline.read(input.TextFile(out2_path + '/part-00001'))\
                 .accumulate('', lambda x, y: x + y)
        l11.cache()
        l12.cache()
        l21.cache()
        l22.cache()
        self.assertEqual('024', l11.get())
        self.assertEqual('135', l12.get())
        self.assertEqual('420', l21.get())
        self.assertEqual('531', l22.get())
Example #8
 def wildcard_case(self):
     """
     Case: test wildcard
     """
     input_data = [['1', '2', '3'], ['400', '5'], ['401', '501']]
     index = 0
     root_path = self.generate_tmp_path()
     for tmp_data in input_data:
         data = self._pipeline.parallelize(tmp_data)
         path = root_path + '/X' + str(index)
         self._pipeline.write(data, output.TextFile(path))
         index = index + 1
     self._pipeline.run()
     match_path = root_path + '/*'
     result = self._pipeline.read(input.TextFile(match_path))
     self.assertItemsEqual(['401', '501', '1', '2', '3', '400', '5'], result.get())
Example #9
    def test_cache(self):
        """ inner """
        f = open('lines.txt', 'w')
        f.writelines(['1 2 3 1 2 3', ' 1 2 3 4 5 6'])
        f.write('\n')
        f.close()
        lines = self._pipeline.read(input.TextFile('lines.txt'))

        def wordcount(plist):
            """ inner """
            return plist.group_by(lambda whole: whole) \
                .apply_values(transforms.count)

        wordcnt = lines.flat_map(lambda line: line.split()) \
            .group_by(lambda whole: whole) \
            .apply_values(wordcount)

        wordcnt.cache()
        expected = {
            "1": {
                "1": 3
            },
            "2": {
                "2": 3
            },
            "3": {
                "3": 3
            },
            "4": {
                "4": 1
            },
            "5": {
                "5": 1
            },
            "6": {
                "6": 1
            }
        }
        self.assertEqual(expected, wordcnt.get())

        os.system("rm -rf lines.txt")

        flattened = wordcnt.flatten()
        flattened_values = wordcnt.flatten_values()

        flattened_values.cache()
        self.assertItemsEqual([1, 1, 1, 3, 3, 3], flattened_values.get())
Example #10
    def test_broadcast(self):
        """
        Case: test passing PCollections to map as side inputs
        """
        output_path = self.generate_tmp_path()
        pc = self._pipeline.parallelize([1, 2, 3])
        pc1 = self._pipeline.parallelize([1, 2, 3])
        pc2 = self._pipeline.parallelize([4, 2, 6])
        pc3 = pc.map(lambda x, y, z: (x, (x in y) and (x in z)), pc1, pc2)
        pc4 = pc3.map(lambda x: "\t".join(map(str, x)))
        self._pipeline.write(pc4, output.TextFile(output_path).partition(n=2))
        self._pipeline.run()

        parts = ['part-00000', 'part-00001']
        input_path = map(lambda path: os.path.join(output_path, path), parts)
        result = self._pipeline.read(input.TextFile(*input_path))
        target = ['1\tFalse', '2\tTrue', '3\tFalse']
        self.assertItemsEqual(result.get(), target)
Example #11
    def test_SchemeTextFile_on_nonschema_pcollection(self):
        """
        Output a non-schema pcollection with SchemaTextFile
        """
        data = self._load_data_by_parallelize()
        data = data.map(lambda t: \
                dict(zip(("name", "school", "age", "height", "weight"), t)))

        tmp_output_path = self.generate_tmp_path()
        self._pipeline.write(
            data,
            output.SchemaTextFile(tmp_output_path, columns=["name", "age"]))
        self._pipeline.run()

        def _func(p):
            item = p.split("\t")
            return [item[0], int(item[1])]

        pc = self._pipeline.read(input.TextFile(tmp_output_path)).map(_func)
        expect_result = [["xiaoming", 12], ["xiaogang", 15], ["xiaohong", 18]]
        self.passertEqual(expect_result, pc)
Example #12
 def test_gz_file(self):
     """
     Case: test text file
     """
     data = self._pipeline.read(input.TextFile('./testdata/part-00001.gz'))
     self.assertEqual(5626, data.count().get())
Example #13
Compute the UV of each website (the number of distinct visitors):
g.cn    1
qq.com  2
baidu.com   3
163.com 1
"""

import os
from bigflow import base, input, output, transforms


# The input is a pcollection; apply distinct and count to it, i.e. compute each website's UV
def count_distinct(p):
    return p.distinct().count()


# Create the pipeline
_pipeline = base.Pipeline.create("LOCAL")

dir = os.path.dirname(os.path.abspath(__file__)) + "/data"
input_path = dir + "/" + "uv.text"
# Read the input and format it
col = _pipeline.read(input.TextFile(input_path))
col = col.map(lambda x: x.split())
# Group by website and compute the UV for each one
col = col.group_by_key().apply_values(count_distinct).flatten()
col = col.map(lambda x: x[0] + "\t" + str(x[1]))
# Write the output
_pipeline.write(col, output.TextFile("/tmp/website_uv"))
_pipeline.run()
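The uv.text input is not shown in this fragment; given the split() followed by group_by_key(), each line presumably holds a website and a visitor id. A hypothetical sample consistent with the output listed above (the real test data may differ):

g.cn        userA
qq.com      userA
qq.com      userA
qq.com      userB
baidu.com   userA
baidu.com   userB
baidu.com   userC
163.com     userB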
Example #14
 def _compare_expect_data_and_output(self, expect_data, output_path):
     data = self._pipeline.read(input.TextFile(output_path)).get()
     self.assertItemsEqual(expect_data, data)
Example #15
File: avg.py Project: zz198808/bigflow
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#!filecoding:utf-8
"""
There is a file containing a bunch of numbers, separated by spaces and newlines.
Compute the average of all the numbers in the file (rounded down).
"""

import os
from bigflow import base, input, output
# Create a local pipeline
pipeline = base.Pipeline.create('LOCAL')

dir = os.path.dirname(os.path.abspath(__file__)) + "/data"

numbers = pipeline.read(input.TextFile(dir + "/" + "number.text"))\
         .flat_map(lambda line: line.split()) \
         .map(lambda n: int(n))


def avg(p):
    return p.sum() / p.count()


print numbers.apply(avg).get()
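The "rounded down" requirement is satisfied here because the values produced by sum() and count() are Python 2 ints, so the / in avg floors the result. A quick plain-Python illustration of that behavior (not part of the pipeline; the numbers are only an example):

# Python 2: int / int floors the result; // is the explicit floor-division form
total, cnt = 7, 2
assert total / cnt == 3
assert total // cnt == 3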
Example #16
    def end(self):
        record = (self._word, self._sum)
        self._emitter.emit(record)


pipeline = base.Pipeline.create('local')
plan = pipeline.plan()
plan.set_environment(entity.PythonEnvironment())

input_path = sys.path[0] + "/" + __file__
input_urls = [input_path]
output_path = sys.path[0] + "/" + "output"

single_word = plan.load(input_urls)\
        .by(input.TextFile(input_urls[0]).input_format).as_type(record_objector.RecordObjector())\
        .process_by(PythonFromRecordProcessor()).as_type(serde.any())\
        .process_by(WordSpliter()).as_type(serde.any())

result = plan.shuffle(single_word.scope(), [single_word])\
        .with_concurrency(10)\
        .node(0).match_by(WordIdentity(lambda x: x[0], serde.any()))\
        .process_by(WordCount()).as_type(serde.any())\
        .input(0).allow_partial_processing().done()\
        .process_by(WordCount()).as_type(serde.any())

plan.shuffle(plan.global_scope(), [result]).node(0).distribute_by_default()\
        .process_by(PythonToRecordProcessor()).as_type(record_objector.RecordObjector())\
        .sink_by(output.TextFile(output_path).output_format)

pipeline.run()
Example #17
pipeline = base.Pipeline.create(
    # Specify the compute engine as "spark" or "SPARK"
    "spark",

    # Specify tmp_data_path
    tmp_data_path="hdfs:///app/dc/bigflow/tmp",

    # Specify the Spark configuration
    spark_conf=spark_conf,

    # default_concurrency is optional; this example's data volume is small, so it can be set lower
    default_concurrency=250,
)

#case_str = "case4_2"
input_path = sys.argv[1]
output_path = sys.argv[2]

# A P type can also be constructed via parallelize

data = pipeline.read(input.TextFile(input_path))
# Apply transforms on the P type
result = data.map(lambda x: x.split()).group_by_key()\
        .apply_values(lambda x: x.max_elements(5, lambda x: x)).flatten()\
        .map(lambda t: "%s %s" % (t[0], t[1]))

# The current preview version does not support get; a P type can only be written to the file system via the pipeline's write method
pipeline.write(result, output.TextFile(output_path))
pipeline.run()
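The spark_conf passed to Pipeline.create above is not defined in this fragment. A minimal sketch of such a dict, using plain Spark property names (only spark.default.parallelism appears elsewhere in these examples, in Example #7; the other entries are illustrative assumptions):

# Hypothetical spark_conf for the Pipeline.create call above
spark_conf = {
    "spark.app.name": "bigflow_example",      # assumed value
    "spark.default.parallelism": "1000",      # same key as in Example #7
    "spark.executor.memory": "4g",            # assumed value
}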
Example #18
File C:
user1
user3
user4

Output
Print the total consumption to standard output.
3021
"""

import os
from bigflow import base, input, output
p = base.Pipeline.create('LOCAL')

dir = os.path.dirname(os.path.abspath(__file__)) + "/data/"
(A, B, C) = (dir + "A.text", dir + "B.text", dir + "C.text")
records = p.read(input.TextFile(A)).map(lambda _: _.split())  # user, ip, cost
ip_blacklist = p.read(input.TextFile(B)).map(lambda _: (_, None))
user_whitelist = p.read(input.TextFile(C)).map(lambda _: (_, None))

print records.map(lambda _: (_[1], (_[0], int(_[2])))) \
        .cogroup(ip_blacklist) \
        .apply_values(lambda records, ips: records.filter(lambda _, cnt: cnt == 0, ips.count())) \
        .flatten() \
        .map(lambda _: _[1]) \
        .cogroup(user_whitelist) \
        .apply_values(lambda records, users: records.filter(lambda _, cnt: cnt != 0, users.count())) \
        .flatten_values() \
        .sum() \
        .get()