def test_scope():
    Scope.reset()
    dc = DparkContext()
    rdd = dc.makeRDD([1, 2, 3]).map(int).map(int).map(int)
    dc.scheduler.current_scope = Scope.get("")
    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:{}".format(i))

    Scope.reset()
    rdd = dc.makeRDD([1, 2, 3]) \
        .map(int) \
        .map(int) \
        .map(int)
    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:0")

    def get_rdd(n):
        return dc.makeRDD([n, n]).map(int).map(int).map(int)

    rdds = [get_rdd(1), get_rdd(2)]
    assert rdds[0].scope.id + 4 == rdds[1].scope.id

    rdds = [get_rdd(i) for i in range(2)]
    assert rdds[0].scope.id == rdds[1].scope.id
def get_rdd(self):
    dpark = DparkContext()
    return dpark.union(
        [dpark.textFile(path, splitSize=64 << 20) for path in self.paths]
    ).map(Weblog.from_line)
def test_call_graph_join():
    dc = DparkContext()
    Scope.reset()
    rdd = dc.makeRDD([(1, 1), (1, 2)]).map(lambda x: x)
    rdd = rdd.join(rdd)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    pprint(g)
    assert g == ([0, 1, 2, 3], {(0, 1): 1, (1, 2): 2, (2, 3): 1})
    fg = dc.scheduler.fmt_call_graph(g)
def test_call_graph_union():
    dc = DparkContext()
    Scope.reset()
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = r1.union(r2)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    # pprint(g)
    fg = dc.scheduler.fmt_call_graph(g)
    # pprint(fg)
    assert g == ([0, 1, 2, 3, 4, 5],
                 {(0, 1): 2, (1, 4): 1, (2, 3): 2, (3, 4): 1, (4, 5): 1})
def word_count(file_path, word):
    # Talk to the configured Mesos master
    dpark = DparkContext()
    # Build an RDD from the distributed file, with 16 MB splits
    f = dpark.textFile(file_path, splitSize=16 << 20)
    # Transform with map(), keep matching lines with filter(), then count() the result
    print(
        word, 'count:',
        f.map(lambda line: line.strip()).filter(
            lambda line: word in line).count())
def main(infile, outfile):
    ctx = DparkContext()
    rdd = ctx.textFile(infile)
    rdd = rdd.map(map_unit_type_priority)
    rdd = rdd.reduceByKey(reduce_by_key)
    rdd = rdd.map(map_to_string)
    rdd.saveAsTextFile(outfile)
def test_lineage():
    Scope.reset()
    dc = DparkContext()
    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union([dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
                     for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1
    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
    assert stage.pipeline_edges == {((-1, rdd1.id), (-1, rdd3.id)): 1,
                                    ((-1, rdd2.id), (-1, rdd3.id)): 1}

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd5.id]
    assert sorted(stage.pipeline_edges) == sorted(
        [((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents])

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(list(s.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
            assert s.pipeline_edges == {((-1, rdd1.id), (-1, rdd3.id)): 1,
                                        ((-1, rdd2.id), (-1, rdd3.id)): 1}
        else:
            assert False
        pprint(s.get_pipeline_graph())
def main():
    current_path = os.path.dirname(os.path.abspath(__file__))
    for i in cmp_list:
        assert os.path.isdir('cmp' + str(i + 1))

    dpark_ctx = DparkContext('process')

    # Dpark thread
    def map_iter(i):
        dir_name = 'cmp' + str(i + 1)
        logger = os.path.join(dir_name, 'log')
        if os.path.isdir(logger) and os.listdir(logger):
            return
        print "Start running: ", i + 1
        os.chdir(os.path.join(current_path, 'cmp') + str(i + 1))
        os.system('python ./cmp.py')

    dpark_ctx.makeRDD(cmp_list).foreach(map_iter)
    print 'Done.'
def dt_model():
    tr_x, tr_y, va_x, va_y, te_x, te_y = load_data()
    param_grid = {
        "min_samples_split": range(1, 10000, 1000),
        "min_samples_leaf": range(1, 10000, 1000),
        # 'max_leaf_nodes': [0, 100, 1000, 10000],
        "max_depth": [None, 100, 1000, 10000],
    }
    param_grid = grid_generator(param_grid)

    # Dpark
    dpark_ctx = DparkContext()

    def map_iter(param):
        idx = param[0][0] * 2 + param[0][1]
        param = param[1]
        m = tree.DecisionTreeClassifier(criterion="entropy", **param)
        print "%d, Start training Decision Tree model." % idx
        m = m.fit(tr_x, tr_y)
        print "%d, Training done." % idx
        proba = m.predict_proba(va_x)
        fpr, tpr, thresh = roc_curve(va_y, proba[:, 1])
        auc_ = auc(fpr, tpr)
        print "%d, AUC is %f" % (idx, auc_)
        return idx, param, auc_

    print "It will train %d models" % len(param_grid)
    result_record = dpark_ctx.makeRDD(param_grid, 50).enumerate().map(map_iter).collect()

    file_record = open("dt_result.pkl", "w")
    pickle.dump(result_record, file_record)
    file_record.close()

    # testing
    opt = reduce(lambda x, y: x if x[2] > y[2] else y, result_record)
    m = tree.DecisionTreeClassifier(criterion="entropy", **opt[1])
    m = m.fit(tr_x, tr_y)
    proba = m.predict_proba(te_x)
    fpr, tpr, thresh = roc_curve(te_y, proba[:, 1])
    auc_ = auc(fpr, tpr)
    print "Testing AUC is %f" % auc_
def main(txt, infile, outfile):
    ctx = DparkContext()
    csvfilename = infile
    txtfilename = txt

    txt_rdd = ctx.textFile(txtfilename)
    txt_rdd = txt_rdd.map(divide_txt)
    # ('5988', ['2', 'CPM'])
    csv_rdd = ctx.textFile(csvfilename, splitSize=64 << 20)
    # print csv_rdd.take(100)
    csv_rdd = csv_rdd.filter(remove_some_bid_unitid)
    csv_rdd = csv_rdd.map(divide_csv)
    # ('6379', ['-1', '1236054964187470000', '6379', '77', '1', '1', '0'])
    record_rdd = txt_rdd.join(csv_rdd)
    # ('6370', (['2', 'COMPLEMENT'], ['-1', '8183016859528920000', '6370', '86', '3', '1', '0']))
    record_rdd = record_rdd.mapValue(join_element)
    # ('6370', ['2', 'COMPLEMENT', '-1', '8183016859528920000', '6370', '86', '3', '1', '0'])
    record_rdd = record_rdd.groupBy(
        lambda line: str(line[1]).split()[5] + str(line[1]).split()[1])
    # print record_rdd.take(1)
    record_rdd = record_rdd.map(map_unit_type)
    # print record_rdd.take(1)
    record_rdd = record_rdd.flatMap(flat_map_unit_type_priority)
    # print record_rdd.take(5)
    record_rdd = record_rdd.groupByKey()
    # print "*" * 50
    # print record_rdd.take(5)
    record_rdd = record_rdd.mapValue(map_value_unit_type_priority)
    # print "#" * 50
    # print record_rdd.take(5)
    record_rdd = record_rdd.map(map_unit_type_priority)
    # print "$" * 50
    # print record_rdd.take(5)
    # unit type priority cluster n_ad n_imp n_click ctr
    record_rdd.saveAsTextFile(outfile)
import time

from dpark import DparkContext, optParser
from dpark.file_manager import file_manager

dc = DparkContext()
optParser.set_usage("%prog [options] path")
options, args = optParser.parse_args()
path = args[0]


def run(split_size=1):
    t = time.time()
    dc.textFile(path).mergeSplit(
        splitSize=split_size).filter(lambda x: "yangxiufeng" in x).count()
    return time.time() - t


run()  # file cache
print("{}s with locality".format(run()))
file_manager.fs_list = file_manager.fs_list[1:]
print("{}s merge & without locality".format(run(10)))
print("{}s without locality".format(run()))
# coding: utf-8
from random import shuffle, random, sample
import traceback

from dpark import DparkContext

dp = DparkContext('mesos')

'''shuffle() shuffles in place and returns None, not the data'''
rdd1 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]]).map(lambda x: shuffle(x))
print 'rdd1:'
print rdd1.take(1)

rdd2 = dp.parallelize([((1, 2), (3, 4), (5, 6))]).map(lambda x: shuffle(x))
print 'rdd2:'
try:
    print rdd2.take(1)
except Exception, e:
    print traceback.print_exc()

'''O(NlogN)'''
rdd3 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]
                       ]).map(lambda x: sorted(x, key=lambda k: random()))
print 'rdd3:'
print rdd3.take(1)

'''O(N)'''
rdd4 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]]).map(lambda x: sample(x, len(x)))
print 'rdd4:'
print rdd4.take(1)

rdd5 = dp.parallelize([((1, 2), (3, 4), (5, 6))
                       ]).map(lambda x: sample(x, len(x)))
from dpark import DparkContext


def m(x):
    return x


def r(x, y):
    return x + y


def src():
    return dc.makeRDD([(1, 1)], 2)


dc = DparkContext("mesos")
rdd1 = src()
rdd2 = src().reduceByKey(r)

to_union_1_a = [src() for _ in range(2)]
to_union_1_b = [src()]
to_union_2_a = [dc.union(to_union_1_a + to_union_1_b) for _ in range(2)]
to_union_2_b = [rdd2, rdd1]
to_union_3_a = [dc.union(to_union_2_a + to_union_2_b).map(m).reduceByKey(r)]
to_union_3_b = [rdd2]
rdd3 = dc.union(to_union_3_a + to_union_3_b)
rdd4 = rdd2.join(rdd2)

rdd1.collect()
rdd2.collect()
# generate new training data
# uid,iid,r - mu - bi
# iid,1.|pi...

import glob

from dpark import DparkContext

MU_PATH = '/nfs/wuhong/offline_use/global_params_0'
IBIAS_PATH = '/nfs/wuhong/offline_use/ibias_0/'
RATING_PATH = '/nfs/wuhong/fm_data/user_music_factor_model/user_track_rating_for_training/'
ITEM_FACTOR_PATH = '/nfs/wuhong/offline_use/H_0/'
NEW_RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
NEW_ITEM_FACTOR_PATH = '/nfs/wuhong/offline_use/H_new/'

dpark = DparkContext()

f_global = file(MU_PATH)
line = ''
for l in f_global:
    line = l
mu = float(line.strip().split('\t')[1])
f_global.close()
mu = dpark.broadcast(mu)


def local_mapper(line):
    iid, v, _ = line.strip().split('\t')
    return (iid, float(v))


ibias = {}
ibias = dpark.textFile(glob.glob(IBIAS_PATH)).map(
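# NOTE: the line above is truncated in this snippet. Given that local_mapper
# returns (iid, float(v)) pairs and ibias is used as a dict, a plausible
# completion (an assumption, not the original code) builds the item-bias
# lookup with collectAsMap():
#
#     ibias = dpark.textFile(glob.glob(IBIAS_PATH)).map(
#         local_mapper).collectAsMap()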
from dpark import DparkContext, optParser

dc = DparkContext()
options, args = optParser.parse_args()
infile = args[0]
outfile = args[1]
print("from {} to {}".format(infile, outfile))


def fm(x):
    for w in x.strip().split():
        yield (w, 1)


(dc.textFile(infile)
   .flatMap(fm)
   .reduceByKey(lambda x, y: x + y, numSplits=6)
   .map(lambda x: " ".join(list(map(str, x))))
   .saveAsTextFile(outfile, overwrite=False))
def to_vertex((id, lines)):
    outEdges = [SPEdge(tid, int(v)) for _, tid, v in lines]
    return (id, SPVertex(id, sys.maxint, outEdges, True))


def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [SPMessage(edge.target_id, newValue + edge.value)
                  for edge in self.outEdges]
    else:
        outbox = []
    return SPVertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    lines = ctx.textFile('graph.txt').map(lambda line: line.split(' '))
    vertices = lines.filter(lambda x: len(x) == 3).groupBy(
        lambda line: line[0]).map(to_vertex)
    messages = lines.filter(lambda x: len(x) == 2).map(
        lambda (vid, v): (vid, SPMessage(vid, int(v))))
    print 'read', vertices.count(), 'vertices and ', messages.count(), 'messages.'

    result = Bagel.run(ctx, vertices, messages, compute, MinCombiner())

    startVertex = 0
    print 'Shortest path from %s to all vertices:' % startVertex
    for v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print v.id, v.value
import time

from dpark import DparkContext


def m(x):
    if x[0] == 0:
        time.sleep(100)
    return x


def r(x, y):
    return x + y


dc = DparkContext("mesos")
rdd = dc.makeRDD([(i, i) for i in range(2)], 2)
rdd.collect()
rdd.reduceByKey(r).map(m).reduceByKey(r).collect()
import math
import random
import os, sys
from pprint import pprint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext="py").map(lambda x: x.strip())
log = f.filter(lambda line: "logging" in line).cache()
print "logging", log.count()
print "error", log.filter(lambda line: "error" in line).count()
for line in log.filter(lambda line: "error" in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(sorted(counts.filter(lambda (_, v): v > 20).map(lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi
import random
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function
from dpark import DparkContext
from dpark.mutable_dict import MutableDict
from random import shuffle
import six.moves.cPickle
import numpy
from six.moves import range
from six.moves import zip

dpark = DparkContext()

with open('ab.mat') as f:
    ori = six.moves.cPickle.loads(f.read())

k = 50
d = 20
M = len(ori)
V = len(ori[0])
assert M % d == 0
assert V % d == 0
m = M / d
v = V / d
GAMMA = 0.02
LAMBDA = 0.1
STEP = 0.9

W = MutableDict(d)
def DownLoad(file_path):
    dpark = DparkContext()
    file_block = dpark.textFile(file_path, splitSize=16 << 20)
    file_block.foreach(write_to_wav)
import glob

from dpark import DparkContext

RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
TRAINING_PATH = '/nfs/wuhong/paracel/data/als_fm/train'
TEST_PATH = '/nfs/wuhong/paracel/data/als_fm/test'

dpark = DparkContext()


def local_filter1(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return False
    return True


def local_filter2(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return True
    return False


dpark.textFile(
    glob.glob(RATING_PATH)).filter(local_filter1).saveAsTextFile(TRAINING_PATH)
dpark.textFile(
    glob.glob(RATING_PATH)).filter(local_filter2).saveAsTextFile(TEST_PATH)
    outEdges = [Edge(ref) for ref in refs]
    return (title, Vertex(title, 1.0 / numV, outEdges, True))


def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    input = dpark.textFile(inputFile)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))
    for v in result.filter(lambda x: x.value > threshold).collect():
        print v.id, v.value
#!/usr/bin/env python
import sys, os, os.path
import random
from dpark import DparkContext

dpark = DparkContext()

from vector import Vector


def parseVector(line):
    num = map(int, line.strip().split('\t')[2:])
    num = [n - num[0] for n in num]
    return Vector(num[20:])
    # NOTE: the return below is unreachable; it is an alternative parsing kept in the source
    return Vector(map(int, line.strip().split('\t')[2:]))


def closestCenter(p, centers):
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


def minDist(p, centers):
    bestDist = p.squaredDist(centers[0])
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
    return bestDist
def main():
    # Loading data
    print "Loading..."
    data_train = sample_image()

    # Initialize networks
    visible_size = 64          # number of input units
    hidden_size = [25, 16, 9]  # number of hidden units of each layer
    lamb = 0.0001              # weight decay parameter
    beta = 3                   # weight of sparsity penalty dataset

    # dpark initialize
    dpark_ctx = DparkContext()

    # Start training, and L-BFGS is adopted
    # We apply a stack-wise greedy training process
    layer_ind = range(len(hidden_size) + 1)
    layer_ind.remove(0)
    layer_size = [visible_size] + hidden_size

    # desired average activation
    sparsity_param = dict()
    for ind in layer_ind:
        # standard: 64 units -> sparsity parameter 0.01
        sparsity_param[ind] = layer_size[ind - 1] * 0.01 / 64

    data = data_train
    opttheta = dict()  # parameter vector of stack AE
    img = dict()       # visualization mode
    for ind in layer_ind:
        print "start training layer No.%d" % ind
        # Obtain random parameters of considered layer
        theta = initial_parameter(layer_size[ind], layer_size[ind - 1])
        # Training begins
        options = (data, layer_size[ind - 1], layer_size[ind], lamb,
                   sparsity_param[ind], beta, dpark_ctx)
        opt = optimize.fmin_l_bfgs_b(compute_cost, theta, compute_grad, options)
        opttheta[ind] = opt[0]
        W = opttheta.get(ind)[:layer_size[ind]*layer_size[ind-1]].\
            reshape(layer_size[ind], layer_size[ind-1])
        data = np.dot(W, data)
        # visualization shows
        img[ind] = display_effect(W)
        plt.axis('off')
        plt.savefig(str(ind) + '.jpg')

    # Trained parameters of stack AE
    para_stack = vecstack2stack(opttheta, hidden_size, visible_size)

    # Save trained weights and bias
    out = open("weights_bias.pkl", "wb")
    pickle.dump(para_stack, out)
    out.close()

    print "Mission complete!"
#!/usr/bin/env python
import sys, os, os.path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import random
from dpark import DparkContext

dpark = DparkContext()

from vector import Vector


def parseVector(line):
    return Vector(map(float, line.strip().split(' ')))


def closestCenter(p, centers):
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
    D = 4
    K = 3
    IT = 10
    MIN_DIST = 0.01
    centers = [Vector([random.random() for j in range(D)]) for i in range(K)]
    points = dpark.textFile('kmeans_data.txt').map(parseVector).cache()

    for it in range(IT):
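        # --- The snippet is truncated at the loop header above. What follows is a
        # minimal sketch of one k-means iteration, an assumed reconstruction rather
        # than the original file's code. It assumes Vector supports elementwise +
        # and division by an int, as its usage above suggests.
        mapped = points.map(lambda p: (closestCenter(p, centers), (p, 1)))
        sums = mapped.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
        newCenters = sums.map(lambda kv: (kv[0], kv[1][0] / kv[1][1])).collectAsMap()
        moved = sum(centers[i].squaredDist(c) for i, c in newCenters.items())
        for i, c in newCenters.items():
            centers[i] = c
        if moved < MIN_DIST:
            break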
def main():
    # Loading data
    print "Loading..."
    data = sample_image()

    # Initialize networks
    visible_size = 64          # number of input units
    hidden_size = [25, 16, 9]  # number of hidden units of each layer
    # lamb = 0.0001  # weight decay parameter
    '''
    lamb = 0  # No weight decay!
    beta = 0.01
    '''
    # sigmoid DEBUG
    lamb = 0.0001
    beta = 3

    # dpark initialize
    dpark_ctx = DparkContext()

    # Start training, and L-BFGS is adopted
    # We apply a stack-wise greedy training process
    layer_ind = range(len(hidden_size) + 1)
    layer_ind.remove(0)
    layer_size = [visible_size] + hidden_size

    opttheta = dict()  # parameter vector of stack AE
    img = dict()       # visualization mode
    sparsity_param = dict()
    for ind in layer_ind:
        # standard: 64 units -> sparsity parameter 0.01
        sparsity_param[ind] = layer_size[ind - 1] * 0.01 / 64

    for ind in layer_ind:
        print "start training layer No.%d" % ind
        # Obtain random parameters of considered layer
        theta = initial_parameter(layer_size[ind], layer_size[ind - 1])
        # SGD with mini-batch
        options = (data, layer_size[ind - 1], layer_size[ind], lamb,
                   sparsity_param[ind], beta, dpark_ctx)
        opttheta[ind] = stocha_grad_desc_agagrad(compute_cost, compute_grad,
                                                 theta, options,
                                                 step_size_init=0.2,
                                                 max_iter=25, tol=1e-7)
        # Preparing next layer!
        W = opttheta.get(ind)[:layer_size[ind]*layer_size[ind-1]].\
            reshape(layer_size[ind], layer_size[ind-1])
        b = opttheta.get(ind)[2*layer_size[ind]*layer_size[ind-1]:
                              2*layer_size[ind]*layer_size[ind-1]+layer_size[ind]].\
            reshape(layer_size[ind], 1)
        data = ReLU(np.dot(W, data) + b)
        # visualization shows
        img[ind] = display_effect(W)
        plt.axis('off')
        plt.savefig(str(ind) + '.jpg')

        # DEBUG
        fin = open("theta_DEBUG.pkl", "wb")
        pickle.dump((W, b), fin)
        fin.close()
        sys.exit()

    # Trained parameters of stack AE
    para_stack = vecstack2stack(opttheta, hidden_size, visible_size)

    # Save trained weights and bias
    out = open("weights_bias.pkl", "wb")
    pickle.dump(para_stack, out)
    out.close()

    print "Mission complete!"
import sys
sys.path.append('../')
from dpark import DparkContext

dpark = DparkContext()

name = '/mfs/tmp/weblog-pre-20111019.csv'
name = '/mfs/tmp/weblog-20111019.csv'
name = '/tmp/weblog-20111019.csv.small'
# name = '/tmp/weblog-20111019.csv.medium'
name = 'resume_text_seg_data-2014-06-01.txt'

pv = dpark.textFile(name)
pv = pv.map(lambda x: x.split(',')).map(lambda l: (l[3], l[7]))
pv = pv.flatMap(lambda (i, u): (u.startswith('/movie') and [(i, 2)]
                                or u.startswith('/group') and [(i, 3)]
                                or []))
# print pv.take(50)
pv = pv.reduceByKey(lambda x, y: x * y)
# print pv.take(50)
print pv.filter(lambda (_, y): y % 2 == 0 and y % 3 == 0).count()

# movie = pv.filter(lambda (bid,url): url.startswith('/movie')).reduceByKey(lambda x,y:None)
# group = pv.filter(lambda (bid,url): url.startswith('/group')).reduceByKey(lambda x,y:None)
# print movie.join(group).count()

# print pv.map(lambda x:x.split(',')[2]).uniq().count()
# print pv.map(lambda x:(x.split(',')[2],None)).reduceByKey(lambda x,y:None).count()
# .filter(lambda uid:uid)
# print upv.count()
# print upv.reduceByKey(lambda x,y:x+y).count()
import sys
sys.path.append('../')
import logging
from dpark import DparkContext

dpark = DparkContext()

name = 'rating.txt'


def parse(line):
    sid, uid, r, f = line.split('\t')
    defaults = {'F': 4.5, 'P': 3.7, 'N': 4.0}
    if r == 'None':
        r = defaults[f]
    return (sid, (uid, float(r)))


rating = dpark.textFile(name, numSplits=2).map(parse).groupByKey(2)  # .cache()
# print 'us', rating.first()
print rating.count()


def reverse(it):
    s = {}
    for k, us in it:
        for u, r in us:
            s.setdefault(u, {})[k] = r
    return s
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import print_function
import sys, os, os.path
from six.moves import map
from six.moves import range
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import random
from dpark import DparkContext
from vector import Vector

dpark = DparkContext()


def parseVector(line):
    return Vector(list(map(float, line.strip().split(' '))))


def closestCenter(p, centers):
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
#!/usr/bin/env python
import os, sys
import re
from glob import glob
from dpark import DparkContext
from dpark.table import TableRDD, Globals, CachedTables
from prettytable import PrettyTable

ctx = DparkContext()
_locals = {}


def gen_table(expr, fields=None):
    if '(' in expr and getattr(ctx, expr.split('(')[0], None):
        rdd = eval('ctx.' + expr[:expr.rindex(')') + 1], globals(), _locals)
    else:
        rdd = eval(expr, globals(), _locals)
    head = rdd.first()
    if isinstance(head, str):
        if '\t' in head:
            rdd = rdd.fromCsv('excel-tab')
        elif ',' in head:
            rdd = rdd.fromCsv('excel')
        else:
            rdd = rdd.map(lambda l: l.split(' '))
        row = rdd.first()
    else:
        row = head
    if not isinstance(rdd, TableRDD):
    return _


class DemoOutputStream(ForEachDStream):
    def __init__(self, parent, output):
        ForEachDStream.__init__(self, parent, collect(output))
        self.output = output

    def __setstate__(self, state):
        ForEachDStream.__setstate__(self, state)
        self.output = []
        self.func = collect(self.output)


sc = DparkContext(dpark_master)


class TestDStream(unittest.TestCase):
    def _setupStreams(self, intput1, input2, operation):
        ssc = StreamingContext(2, sc)
        is1 = DemoInputStream(ssc, intput1)
        ssc.registerInputStream(is1)
        if input2:
            is2 = DemoInputStream(ssc, input2)
            ssc.registerInputStream(is2)
            os = operation(is1, is2)
        else:
            os = operation(is1)
        output = DemoOutputStream(os, [])
        ssc.registerOutputStream(output)
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))
    for id, v in result.filter(
            lambda id_v: id_v[1].value > threshold).collect():
        print(id, v)
# -*- coding: utf-8 -*-
from dpark import DparkContext


def m(x):
    return x


rdd = DparkContext().makeRDD([(1, 1)]).map(m).groupByKey()
rdd.map(m).collect()
rdd.map(m).collect()
                for _, tid, v in lines]
    return (id, Vertex(id, sys.maxint, outEdges, True))


def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [(edge.target_id, newValue + edge.value)
                  for edge in self.outEdges]
    else:
        outbox = []
    return Vertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'graph.txt')
    lines = ctx.textFile(path).map(lambda line: line.split(' '))
    vertices = lines.groupBy(lambda line: line[0]).map(to_vertex)
    startVertex = str(0)
    messages = ctx.makeRDD([(startVertex, 0)])

    print('read', vertices.count(), 'vertices and ', messages.count(), 'messages.')

    result = Bagel.run(ctx, vertices, messages, compute,
                       BasicCombiner(min), numSplits=2)

    print('Shortest path from %s to all vertices:' % startVertex)
    for id, v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print(v.id, v.value)
#!/usr/bin/env python
# encoding: utf-8
"""
@version: v1.0
@author: W_H_J
@license: Apache Licence
@contact: [email protected]
@site:
@software: PyCharm
@file: wordcount.py
@time: 2018/6/5 18:10
@describe: word count
"""
from dpark import DparkContext

ctx = DparkContext()
file = ctx.textFile("./words.txt")
words = file.flatMap(lambda x: x.split()).map(lambda x: (x, 1))
wc = words.reduceByKey(lambda x, y: x + y).collectAsMap()
print(wc)


# Count how often a given word occurs
def word_count(file_path, word):
    # Talk to the configured Mesos master
    dpark = DparkContext()
    # Build an RDD from the distributed file, with 16 MB splits
    f = dpark.textFile(file_path, splitSize=16 << 20)
    # Transform with map(), keep matching lines with filter(), then count() the result
    print(
        word, 'count:',
        f.map(lambda line: line.strip()).filter(
            lambda line: word in line).count())
    data = fs.get('/song/small/%s.mp3' % id)
    binfile = open("output/%s.mp3" % (id), "wb")
    binfile.write(data)
    binfile.close()

    m = RetrievalMusic(dptable, mode)
    m.retrieving('output/%s.mp3' % id)

    if mode != 2:
        call("rm output/%s.mp3" % (id), shell=True)


def batchprocess(song_id, loaded, mode):
    # dpark = DparkContext()
    # dptable = dpark.broadcast(loaded)
    # dpark.parallelize(song_id, 80).foreach(lambda(id): calculate_single(id, dptable, mode))
    for id in song_id:
        calculate_single(id, loaded, mode)
    if mode == 2:
        rearrange()


if __name__ == '__main__':
    song_id = np.load("track_temp.npy")
    mode = 1  # 1 for save, 2 for filter, and 0 for regular work
    dpark = DparkContext()
    dpark.parallelize(song_id, 50).foreach(lambda(id): calculate_single(id, 0, mode))
def main(argv):
    # Dpark initialize
    dpark = DparkContext()

    # number of the training and testing set
    num_train = 6000
    num_test = 6000

    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data

    # Preparing training and testing data
    if len(x) != len(y):
        print("The labels and features are not accorded!")
        sys.exit()
    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu = len(x_stu)
    ind_live = range(n_live)
    ind_stu = range(n_stu)
    random.shuffle(ind_live)
    random.shuffle(ind_stu)
    x_te = [x_live[i] for i in ind_live[num_train: num_test + num_train]] + \
           [x_stu[i] for i in ind_stu[num_train: num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train: num_test + num_train]) + \
           [-1.0] * len(ind_stu[num_train: num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
           [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0] * num_train + [-1.0] * num_train

    # dpark version
    def map_iter(i):
        y_tr_examplar = [-1.0] * len(y_tr)
        y_tr_examplar[i] = 1.0
        # opt = '-t 0 -w1 ' + str(len(y_tr)) + ' -w-1 1 -b 1 -q'
        # It is suggested in Efros' paper that:
        # C1 0.5, C2 0.01
        opt = '-t 0 -w1 0.5 -w-1 0.01 -b 1 -q'
        m = svm_train(y_tr_examplar, list(x_tr), opt)
        p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-b 1 -q')
        p_val = np.array(p_val)
        # p_val = np.delete(p_val,1,1)  # shape = (N, 1)
        p_val = p_val[:, 0]  # shape = (N, )
        return p_val

    p_vals = dpark.makeRDD(
        range(len(y_tr))
    ).map(
        map_iter
    ).collect()
    val = np.array(p_vals).T

    # for-loop version
    '''
    # Examplar SVM Training
    ensemble_model = []
    # DPark
    for i in range(len(y_tr)):
        y_tr_examplar = [-1.0] * len(y_tr)
        y_tr_examplar[i] = 1.0;
        #opt = '-t 0 -w1 ' + str(len(y_tr)) + ' -w-1 1 -b 1 -q'
        # It is suggested in Efros' paper that:
        # C1 0.5, C2 0.01
        opt = '-t 0 -w1 0.5 -w-1 0.01 -b 1 -q'
        m = svm_train(y_tr_examplar, x_tr, opt)
        ensemble_model.append(m)
        print("The %s-th examplar SVM has been trained" % i)

    # Calibaration, to be updated
    # Since we adopt the probability estimation model of LIB_SVM,
    # Calibrating seems unnecessary

    # Ensembly Classify
    val = np.zeros((len(y_te), 1))
    for m in ensemble_model:
        p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-b 1 -q')
        p_val = np.array(p_val)
        p_val = np.delete(p_val, 1, 1)
        val = np.hstack((val, p_val))
    if val.shape[1] != len(y_tr) + 1:
        print "Chaos!"
    val = np.delete(val, 0, 1)
    print 'val.shape =', val.shape
    '''

    # KNN
    k = num_train / 8
    sorted_index = val.argsort(axis=1)
    sorted_index = sorted_index.T[::-1].T
    p_label = []
    for index in sorted_index:
        nearest_samples = []
        for sample_index in index[:k]:
            nearest_samples.append(y_tr[sample_index])
        n, bins, dummy = plt.hist(nearest_samples, 2, normed=1,
                                  facecolor='r', alpha=0.75)
        if n[0] > n[1]:
            p_label.append(-1.0)
        else:
            p_label.append(1.0)

    # evaluation
    rate, pos_rate, neg_rate = evaluation(y_te, p_label)
    print("The Examplar SVM framework achieves a precision of %f" % rate)
import os
import sys
import time
import unittest
import logging
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext
from dpark.utils.nested_groupby import GroupByNestedIter
from dpark.shuffle import GroupByNestedIter, AutoBatchedSerializer
from dpark.utils.profile import profile
import dpark.conf

GroupByNestedIter.NO_CACHE = True

print_mem_incr = True
print_mem_incr = False

dc = DparkContext('mesos')
RC = dpark.conf.rddconf
M = 1024 * 1024


def rss_func(p):
    if hasattr(p, "memory_info"):
        mem_info = getattr(p, "memory_info")
    else:
        mem_info = getattr(p, 'get_memory_info')

    def _():
        return mem_info().rss / M

    return _
def main(argv):
    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data
    del data
    num_train = 6000
    num_test = 6000

    # Preparing training and testing data
    if len(x) != len(y):
        print("Please examine the data set, for the labels and features are not accorded!")
        sys.exit()
    # generating random training and testing set,
    # to yield the ability of classifier more accurately.
    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu = len(x_stu)
    ind_live = range(n_live)
    ind_stu = range(n_stu)
    random.shuffle(ind_live)
    random.shuffle(ind_stu)
    x_te = [x_live[i] for i in ind_live[num_train: num_test + num_train]] + \
           [x_stu[i] for i in ind_stu[num_train: num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train: num_test + num_train]) + \
           [-1.0] * len(ind_stu[num_train: num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
           [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0] * num_train + [-1.0] * num_train

    # SVM and a 10-fold Cross Validation choosing the best parameters.
    # gamma and c_reg are constructed in a parameter grid

    # for-loop version
    '''
    gamma = np.arange(.01,20,.04)
    c_reg = np.arange(.01,20,.04)
    opt = []
    best_para = {'gamma': 0, 'c': 0, 'precision': 0}
    for g in gamma:
        for c in c_reg:
            opt = '-g '+ str(g) +' -c ' + str(c) + ' -v 10 -q'
            pre = svm_train(y_tr,x_tr,opt)
            if pre > best_para.get('precision'):
                best_para['gamma'] = g
                best_para['c'] = c
                best_para['precision'] = pre
    best_opt = '-g '+ str(best_para.get('gamma')) +' -c ' + str(best_para.get('c')) + ' -q'
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')
    '''

    # dpark version
    dpark = DparkContext()
    gamma = np.arange(.01, 5, .08)
    c_reg = np.arange(.01, 5, .08)
    opt = []
    for g in gamma:
        for c in c_reg:
            opt.append('-g ' + str(g) + ' -c ' + str(c) + ' -v 10 -q')

    def map_iter(i):
        pre = svm_train(y_tr, list(x_tr), opt[i])
        return pre

    # pres = dpark.makeRDD(range(len(opt)), 100).map(map_iter).collect()
    pres = dpark.makeRDD(range(len(opt))).map(map_iter).collect()

    pres = np.array(pres)
    best_opt_ind = pres.argsort()
    best_opt = opt[best_opt_ind[-1]]
    best_opt = best_opt[:best_opt.find('-v') - 1]
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')
    print 'This SVM framework precision: %f' % p_acc[0]
    return (title, Vertex(title, 1.0 / numV, outEdges, True))


def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))
    for v in result.filter(lambda x: x.value > threshold).collect():
        print v.id, v.value
# -*- coding: utf-8 -*-
from dpark import DparkContext

dc = DparkContext()


def get_rdd():
    return dc.makeRDD([(1, 1)])


rdd1 = get_rdd()
rdd2 = dc.union([get_rdd() for i in range(2)])
rdd3 = get_rdd().groupByKey()
dc.union([rdd1, rdd2, rdd3]).collect()
from __future__ import absolute_import
from __future__ import print_function
import sys
sys.path.append('../')
from dpark import DparkContext

dpark = DparkContext()

name = '/mfs/tmp/weblog-pre-20111019.csv'
name = '/mfs/tmp/weblog-20111019.csv'
name = '/tmp/weblog-20111019.csv.small'
# name = '/tmp/weblog-20111019.csv.medium'

pv = dpark.textFile(name)
pv = pv.map(lambda x: x.split(',')).map(lambda l: (l[3], l[7]))
pv = pv.flatMap(lambda i_u: (i_u[1].startswith('/movie') and [(i_u[0], 2)]
                             or i_u[1].startswith('/group') and [(i_u[0], 3)]
                             or []))
# print pv.take(50)
pv = pv.reduceByKey(lambda x, y: x * y)
# print pv.take(50)
print(pv.filter(lambda __y: __y[1] % 2 == 0 and __y[1] % 3 == 0).count())

# movie = pv.filter(lambda (bid,url): url.startswith('/movie')).reduceByKey(lambda x,y:None)
# group = pv.filter(lambda (bid,url): url.startswith('/group')).reduceByKey(lambda x,y:None)
# print movie.join(group).count()

# print pv.map(lambda x:x.split(',')[2]).uniq().count()
# print pv.map(lambda x:(x.split(',')[2],None)).reduceByKey(lambda x,y:None).count()
# .filter(lambda uid:uid)
# print upv.count()
# print upv.reduceByKey(lambda x,y:x+y).count()
from random import random

from dpark import DparkContext

dpark = DparkContext()

# N is not defined in the original snippet; an assumed sample count is used here
N = 100000
count = dpark.accumulator(0)


def random_once(*args, **kwargs):
    x = random() * 2 - 1
    y = random() * 2 - 1
    if x * x + y * y < 1:
        count.add(1)


dpark.parallelize(range(0, N), 10).foreach(random_once)
print 'PI is roughly ', 4.0 * count.value / N
from __future__ import absolute_import
from __future__ import print_function
import math
import random
import os, sys
from pprint import pprint
from six.moves import map
from six.moves import range
from six.moves import zip
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

dpark = DparkContext()

# range
nums = dpark.parallelize(list(range(100)), 4)
print(nums.count())
print(nums.reduce(lambda x, y: x + y))

# text search
f = dpark.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print('logging', log.count())
print('error', log.filter(lambda line: 'error' in line).count())
for line in log.filter(lambda line: 'error' in line).collect():
    print(line)

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda __v1: __v1[1] > 50).collectAsMap())