def test_gzip_file_case_2(self):
    """ Case: read gzip input as a directory, as explicit files, and via a glob """
    expect = [
        'wan',
        'cheng',
        'Hello, world!',
        'Hello, toft!',
        'Hello, flume!'
    ]
    p = self._pipeline.read(input.TextFile('testdata/gzip'))
    self.passertEqual(expect, p)

    p = self._pipeline.read(
        input.TextFile('testdata/gzip/part-00000.gz',
                       'testdata/gzip/part-00001.gz'))
    self.passertEqual(expect, p)

    p = self._pipeline.read(input.TextFile('testdata/gzip/*'))
    self.passertEqual(expect, p)
def test_partition(self):
    """ Test partition output """
    import os
    try:
        p = self._pipeline.parallelize(["1", "2", "3"])
        self._pipeline.write(p, output.TextFile('./output-1').partition(5))
        self._pipeline.write(
            p,
            output.SequenceFile('./output-2').partition(
                2, lambda x, n: int(x) % n))
        self._pipeline.run()

        o1 = self._pipeline.read(input.SequenceFile('./output-2/part-00000'))
        o1.cache()
        o2 = self._pipeline.read(input.SequenceFile('./output-2/part-00001'))
        o2.cache()
        self.assertEqual(["2"], o1.get())
        self.assertItemsEqual(["1", "3"], o2.get())

        # partition(5) should have produced five (non-meta) output files.
        n = os.popen('ls output-1/[^_]* | wc -l').read()
        self.assertEqual(5, int(n))
        o = self._pipeline.read(input.TextFile('output-1')).get()
        self.assertItemsEqual(["1", "2", "3"], o)
    finally:
        os.system("rm -r output-1 output-2")
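# A short hedged sketch of the partition API exercised above, outside the
# test harness: partition(n) fixes the number of output files
# (part-00000 .. part-0000(n-1)), and an optional partition_fn(record, n)
# returns the index of the file each record is routed to.
def _partition_sketch():
    pipeline = base.Pipeline.create('local')
    p = pipeline.parallelize(["1", "2", "3"])
    # With int(x) % 2: "2" -> part-00000, "1" and "3" -> part-00001.
    pipeline.write(p, output.TextFile('./sketch-out').partition(
        2, lambda x, n: int(x) % n))
    pipeline.run()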
def test_text_file(self):
    """ Case: test text file """
    data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data, output.TextFile(local_file))
    self._pipeline.run()

    result = self._pipeline.read(input.TextFile(local_file))
    self.assertItemsEqual(['1', '2', '3', '400', '5'], result.get())

    # test read with \0 in the file
    null_data = data.map(lambda x: x + "\0")
    null_file = self.generate_tmp_path()
    self._pipeline.write(null_data, output.TextFile(null_file))
    self._pipeline.run()
    null_read = self._pipeline.read(input.TextFile(null_file))
    self.passertEqual(0, null_read.diff(null_data).count())
def getResultWithText(self, pipeline_status, path):
    """ Wait until the job is running, allow it time to produce output,
    then read the text result back with a local pipeline. """
    import time
    pipeline_status.wait_status("APP_RUN")
    time.sleep(300)
    local_pipeline = base.Pipeline.create('local')
    result = local_pipeline.read(input.TextFile(path))
    return result.get()
def test_text_file_sync(self):
    """ Case: test text file written in synchronous mode """
    data = self._pipeline.parallelize(['1', '2', '3', '400', '5'])
    local_file = self.generate_tmp_path()
    self._pipeline.write(data,
                         output.TextFile(local_file, async_mode=False))
    self._pipeline.run()
    result = self._pipeline.read(input.TextFile(local_file))
    self.assertItemsEqual(['1', '2', '3', '400', '5'], result.get())
def test_partitioned(self):
    """ Case: read multiple files as a partitioned (PTable) input """
    testdata = self._pipeline.read(input.TextFile(
        'testdata/part-00000', 'testdata/part-00001',
        partitioned=True))
    expect = {'x': ['1', '2', '3'], 'y': ['4', '5', '6', '7']}
    table = self._pipeline.parallelize(expect)
    self.assertEqual(repr(table), repr(testdata))
    self.assertItemsEqual(table.flatten_values().get(),
                          testdata.flatten_values().get())
def test_output_sort(self):
    """ Case: test sorted output within each output partition """
    self.setConfig(spark_conf={
        "spark.default.parallelism": "1",
    })
    lines = self._pipeline.parallelize([5, 1, 2, 0, 3, 4]) \
        .map(lambda x: str(x), serde=serde.of(str))
    out1_path = self.generate_tmp_path() + '/output-1/'
    out2_path = self.generate_tmp_path() + '/output-2/'
    self._pipeline.write(
        lines,
        output.TextFile(out1_path).sort().partition(
            n=2, partition_fn=lambda x, n: int(x) % n))
    self._pipeline.write(
        lines,
        output.TextFile(out2_path).sort(reverse=True).partition(
            n=2, partition_fn=lambda x, n: int(x) % n))
    self._pipeline.run()

    # Concatenating each file's lines in read order exposes their ordering.
    l11 = self._pipeline.read(input.TextFile(out1_path + '/part-00000')) \
        .accumulate('', lambda x, y: x + y)
    l12 = self._pipeline.read(input.TextFile(out1_path + '/part-00001')) \
        .accumulate('', lambda x, y: x + y)
    l21 = self._pipeline.read(input.TextFile(out2_path + '/part-00000')) \
        .accumulate('', lambda x, y: x + y)
    l22 = self._pipeline.read(input.TextFile(out2_path + '/part-00001')) \
        .accumulate('', lambda x, y: x + y)
    l11.cache()
    l12.cache()
    l21.cache()
    l22.cache()
    self.assertEqual('024', l11.get())
    self.assertEqual('135', l12.get())
    self.assertEqual('420', l21.get())
    self.assertEqual('531', l22.get())
def wildcard_case(self):
    """ Case: test wildcard """
    input_data = [['1', '2', '3'], ['400', '5'], ['401', '501']]
    root_path = self.generate_tmp_path()
    for index, tmp_data in enumerate(input_data):
        data = self._pipeline.parallelize(tmp_data)
        path = root_path + '/X' + str(index)
        self._pipeline.write(data, output.TextFile(path))
    self._pipeline.run()

    match_path = root_path + '/*'
    result = self._pipeline.read(input.TextFile(match_path))
    self.assertItemsEqual(['401', '501', '1', '2', '3', '400', '5'],
                          result.get())
def test_cache(self):
    """ Case: cache intermediate results of a nested word count """
    # writelines() adds no separators, so this produces a single line.
    with open('lines.txt', 'w') as f:
        f.writelines(['1 2 3 1 2 3', ' 1 2 3 4 5 6'])
        f.write('\n')
    lines = self._pipeline.read(input.TextFile('lines.txt'))

    def wordcount(plist):
        """ Count occurrences of each word in a pcollection. """
        return plist.group_by(lambda whole: whole) \
            .apply_values(transforms.count)

    wordcnt = lines.flat_map(lambda line: line.split()) \
        .group_by(lambda whole: whole) \
        .apply_values(wordcount)
    wordcnt.cache()
    expected = {
        "1": {"1": 3},
        "2": {"2": 3},
        "3": {"3": 3},
        "4": {"4": 1},
        "5": {"5": 1},
        "6": {"6": 1},
    }
    self.assertEqual(expected, wordcnt.get())
    os.system("rm -rf lines.txt")

    flattened = wordcnt.flatten()
    flattened_values = wordcnt.flatten_values()
    flattened_values.cache()
    self.assertItemsEqual([1, 1, 1, 3, 3, 3], flattened_values.get())
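# A minimal hedged sketch of flatten() vs flatten_values() as the assertions
# above read: flatten() turns a PTable back into key/value tuples, while
# flatten_values() drops the keys and keeps only the innermost values
# (hence the expected [1, 1, 1, 3, 3, 3]).
def _flatten_sketch():
    pipeline = base.Pipeline.create('local')
    words = pipeline.parallelize(['a', 'a', 'b'])
    counts = words.group_by(lambda w: w).apply_values(transforms.count)
    values = counts.flatten_values()
    values.cache()
    assert sorted(values.get()) == [1, 2]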
def test_broadcast(self):
    """ Case: pass pcollections as side inputs (broadcast) to map """
    output_path = self.generate_tmp_path()
    pc = self._pipeline.parallelize([1, 2, 3])
    pc1 = self._pipeline.parallelize([1, 2, 3])
    pc2 = self._pipeline.parallelize([4, 2, 6])
    pc3 = pc.map(lambda x, y, z: (x, (x in y) and (x in z)), pc1, pc2)
    pc4 = pc3.map(lambda x: "\t".join(map(str, x)))
    self._pipeline.write(pc4, output.TextFile(output_path).partition(n=2))
    self._pipeline.run()

    parts = ['part-00000', 'part-00001']
    input_path = map(lambda path: os.path.join(output_path, path), parts)
    result = self._pipeline.read(input.TextFile(*input_path))
    target = ['1\tFalse', '2\tTrue', '3\tFalse']
    self.assertItemsEqual(result.get(), target)
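# A minimal hedged sketch of the side-input pattern used above (based on the
# call shape in test_broadcast): pcollections passed after the lambda are
# broadcast whole and arrive as extra, iterable arguments.
def _side_input_sketch():
    pipeline = base.Pipeline.create('local')
    main = pipeline.parallelize([1, 2, 3])
    side = pipeline.parallelize([2, 3])
    flagged = main.map(lambda x, s: (x, x in s), side)  # s is the full side input
    flagged.cache()
    # expected, in some order: (1, False), (2, True), (3, True)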
def test_SchemeTextFile_on_nonschema_pcollection(self):
    """ Write a non-schema pcollection with SchemaTextFile """
    data = self._load_data_by_parallelize()
    data = data.map(lambda t: dict(
        zip(("name", "school", "age", "height", "weight"), t)))
    tmp_output_path = self.generate_tmp_path()
    self._pipeline.write(
        data,
        output.SchemaTextFile(tmp_output_path, columns=["name", "age"]))
    self._pipeline.run()

    def _func(p):
        item = p.split("\t")
        return [item[0], int(item[1])]

    pc = self._pipeline.read(input.TextFile(tmp_output_path)).map(_func)
    expect_result = [["xiaoming", 12], ["xiaogang", 15], ["xiaohong", 18]]
    self.passertEqual(expect_result, pc)
def test_gz_file(self):
    """ Case: count lines of a gzip-compressed text file """
    data = self._pipeline.read(input.TextFile('./testdata/part-00001.gz'))
    self.assertEqual(5626, data.count().get())
Compute the UV of each URL, i.e. the number of distinct users that
visited it:

    g.cn        1
    qq.com      2
    baidu.com   3
    163.com     1
"""
import os

from bigflow import base, input, output, transforms


# The input is a pcollection; distinct() then count() gives the UV.
def count_distinct(p):
    return p.distinct().count()

# Create the pipeline.
_pipeline = base.Pipeline.create("LOCAL")

dir = os.path.dirname(os.path.abspath(__file__)) + "/data"
input_path = dir + "/" + "uv.text"

# Read and tokenize the input.
col = _pipeline.read(input.TextFile(input_path))
col = col.map(lambda x: x.split())

# Group by URL and compute each URL's UV.
col = col.group_by_key().apply_values(count_distinct).flatten()
col = col.map(lambda x: x[0] + "\t" + str(x[1]))

# Write the output.
_pipeline.write(col, output.TextFile("/tmp/website_uv"))
_pipeline.run()
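# A quick hedged sanity check of the same computation on in-memory
# (url, user) pairs; the UV of "qq.com" should come out as 2.
pairs = _pipeline.parallelize([("qq.com", "u1"), ("qq.com", "u2"),
                               ("qq.com", "u1"), ("g.cn", "u1")])
uv = pairs.group_by_key().apply_values(count_distinct).flatten()
print uv.get()  # expected, in some order: ("qq.com", 2), ("g.cn", 1)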
def _compare_expect_data_and_output(self, expect_data, output_path):
    data = self._pipeline.read(input.TextFile(output_path)).get()
    self.assertItemsEqual(expect_data, data)
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!filecoding:utf-8
"""
Given a file containing numbers separated by spaces and newlines,
compute the (floored) average of all the numbers in the file.
"""
import os

from bigflow import base, input, output

# Read the number file from the data directory next to this script.
pipeline = base.Pipeline.create('LOCAL')
dir = os.path.dirname(os.path.abspath(__file__)) + "/data"
numbers = pipeline.read(input.TextFile(dir + "/" + "number.text")) \
    .flat_map(lambda line: line.split()) \
    .map(lambda n: int(n))


def avg(p):
    return p.sum() / p.count()

print numbers.apply(avg).get()
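# Note (hedged): under Python 2, `/` on two ints already floors, which is
# why avg() matches the "floored average" asked for above. A tiny
# in-memory check with the same pipeline:
sample = pipeline.parallelize([1, 2, 3, 4])
print sample.apply(avg).get()  # 10 / 4 == 2 under integer division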
def end(self):
    record = (self._word, self._sum)
    self._emitter.emit(record)


pipeline = base.Pipeline.create('local')
plan = pipeline.plan()
plan.set_environment(entity.PythonEnvironment())

input_path = sys.path[0] + "/" + __file__
input_urls = [input_path]
output_path = sys.path[0] + "/" + "output"

single_word = plan.load(input_urls)\
    .by(input.TextFile(input_urls[0]).input_format).as_type(record_objector.RecordObjector())\
    .process_by(PythonFromRecordProcessor()).as_type(serde.any())\
    .process_by(WordSpliter()).as_type(serde.any())

result = plan.shuffle(single_word.scope(), [single_word])\
    .with_concurrency(10)\
    .node(0).match_by(WordIdentity(lambda x: x[0], serde.any()))\
    .process_by(WordCount()).as_type(serde.any())\
    .input(0).allow_partial_processing().done()\
    .process_by(WordCount()).as_type(serde.any())

plan.shuffle(plan.global_scope(), [result]).node(0).distribute_by_default()\
    .process_by(PythonToRecordProcessor()).as_type(record_objector.RecordObjector())\
    .sink_by(output.TextFile(output_path).output_format)

pipeline.run()
pipeline = base.Pipeline.create(
    # Use "spark" (or "SPARK") as the compute engine.
    "spark",
    # Specify tmp_data_path.
    tmp_data_path="hdfs:///app/dc/bigflow/tmp",
    # Pass the Spark configuration.
    spark_conf=spark_conf,
    # default_concurrency is optional; this example's data is small,
    # so a lower value would also do.
    default_concurrency=250,
)

input_path = sys.argv[1]
output_path = sys.argv[2]

# P-types can also be built with parallelize; here one is read from a file.
data = pipeline.read(input.TextFile(input_path))

# Apply transforms on the P-type: keep the top five values of each key.
result = data.map(lambda x: x.split()).group_by_key()\
    .apply_values(lambda x: x.max_elements(5, lambda x: x)).flatten()\
    .map(lambda t: "%s %s" % (t[0], t[1]))

# The current preview release does not support get() on Spark, so the
# result is written to the file system via the pipeline's write method.
pipeline.write(result, output.TextFile(output_path))
pipeline.run()
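# A hedged local sketch of the same top-N-per-key transform; LOCAL
# pipelines do support get(), so the result can be inspected directly.
local = base.Pipeline.create("LOCAL")
kv = local.parallelize([("a", "3"), ("a", "1"), ("a", "2"), ("b", "9")])
top = kv.group_by_key() \
    .apply_values(lambda x: x.max_elements(2, lambda v: v)).flatten()
print top.get()  # expected, in some order: ("a", "3"), ("a", "2"), ("b", "9")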
File C:

    user1
    user3
    user4

Output:

    The total cost, written to standard output:

    3021
"""
import os

from bigflow import base, input, output

p = base.Pipeline.create('LOCAL')
dir = os.path.dirname(os.path.abspath(__file__)) + "/data/"
(A, B, C) = (dir + "A.text", dir + "B.text", dir + "C.text")

records = p.read(input.TextFile(A)).map(lambda _: _.split())  # user, ip, cost
ip_blacklist = p.read(input.TextFile(B)).map(lambda _: (_, None))
user_whitelist = p.read(input.TextFile(C)).map(lambda _: (_, None))

print records.map(lambda _: (_[1], (_[0], int(_[2])))) \
    .cogroup(ip_blacklist) \
    .apply_values(lambda records, ips:
                  records.filter(lambda _, cnt: cnt == 0, ips.count())) \
    .flatten() \
    .map(lambda _: _[1]) \
    .cogroup(user_whitelist) \
    .apply_values(lambda records, users:
                  records.filter(lambda _, cnt: cnt != 0, users.count())) \
    .flatten_values() \
    .sum() \
    .get()
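# A minimal hedged sketch of the filter-with-side-input pattern used twice
# above: the side pobject (a count) is passed after the predicate and
# arrives as the predicate's extra argument.
nums = p.parallelize([1, 2, 3])
allow = p.parallelize(["x"])  # non-empty, so count() != 0 keeps everything
kept = nums.filter(lambda n, cnt: cnt != 0, allow.count())
print kept.get()  # expected: [1, 2, 3]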