def test_scope():
    Scope.reset()
    dc = DparkContext()
    rdd = dc.makeRDD([1, 2, 3]).map(int).map(int).map(int)
    dc.scheduler.current_scope = Scope.get("")
    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:{}".format(i))

    Scope.reset()
    rdd = dc.makeRDD([1, 2, 3]) \
        .map(int) \
        .map(int) \
        .map(int)
    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:0")

    def get_rdd(n):
        return dc.makeRDD([n, n]).map(int).map(int).map(int)

    rdds = [get_rdd(1), get_rdd(2)]
    assert rdds[0].scope.id + 4 == rdds[1].scope.id

    rdds = [get_rdd(i) for i in range(2)]
    assert rdds[0].scope.id == rdds[1].scope.id
def test_call_graph_join():
    dc = DparkContext()
    Scope.reset()
    rdd = dc.makeRDD([(1, 1), (1, 2)]).map(lambda x: x)
    rdd = rdd.join(rdd)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    pprint(g)
    assert g == ([0, 1, 2, 3], {(0, 1): 1, (1, 2): 2, (2, 3): 1})
    fg = dc.scheduler.fmt_call_graph(g)
def word_count(file_path, word):
    # Connect to the Mesos master for scheduling
    dpark = DparkContext()
    # Build an RDD from the distributed file, with 16 MB per split
    f = dpark.textFile(file_path, splitSize=16 << 20)
    # map() produces a new RDD, filter() keeps the matching lines,
    # and count() returns the final result
    print(
        word, 'count:',
        f.map(lambda line: line.strip()).filter(
            lambda line: word in line).count())
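# Illustrative usage sketch (not from the original source): the file path and
# the search word below are placeholder values.
if __name__ == '__main__':
    word_count('/path/to/textfile.txt', 'dpark')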
def main():
    current_path = os.path.dirname(os.path.abspath(__file__))
    for i in cmp_list:
        assert os.path.isdir('cmp' + str(i + 1))

    dpark_ctx = DparkContext('process')

    # Dpark task: run one comparison per element of cmp_list
    def map_iter(i):
        dir_name = 'cmp' + str(i + 1)
        logger = os.path.join(dir_name, 'log')
        if os.path.isdir(logger) and os.listdir(logger):
            return
        print "Start running: ", i + 1
        os.chdir(os.path.join(current_path, 'cmp') + str(i + 1))
        os.system('python ./cmp.py')

    dpark_ctx.makeRDD(cmp_list).foreach(map_iter)
    print 'Done.'
def test_call_graph_union():
    dc = DparkContext()
    Scope.reset()
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = r1.union(r2)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    # pprint(g)
    fg = dc.scheduler.fmt_call_graph(g)
    # pprint(fg)
    assert g == ([0, 1, 2, 3, 4, 5], {
        (0, 1): 2,
        (1, 4): 1,
        (2, 3): 2,
        (3, 4): 1,
        (4, 5): 1
    })
def main():
    # Loading data
    print "Loading..."
    data = sample_image()

    # Initialize networks
    visible_size = 64            # number of input units
    hidden_size = [25, 16, 9]    # number of hidden units of each layer
    # lamb = 0.0001              # weight decay parameter
    '''
    lamb = 0  # No weight decay!
    beta = 0.01
    '''
    # sigmoid DEBUG
    lamb = 0.0001
    beta = 3

    # dpark initialize
    dpark_ctx = DparkContext()

    # Start training, and L-BFGS is adopted
    # We apply a stack-wise greedy training process
    layer_ind = range(len(hidden_size) + 1)
    layer_ind.remove(0)
    layer_size = [visible_size] + hidden_size

    opttheta = dict()  # parameter vector of stack AE
    img = dict()       # visualization mode
    sparsity_param = dict()
    for ind in layer_ind:
        # standard: 64 units -> sparsity parameter 0.01
        sparsity_param[ind] = layer_size[ind - 1] * 0.01 / 64

    for ind in layer_ind:
        print "start training layer No.%d" % ind
        # Obtain random parameters of considered layer
        theta = initial_parameter(layer_size[ind], layer_size[ind - 1])
        # SGD with mini-batch
        options = (data, layer_size[ind - 1], layer_size[ind], lamb,
                   sparsity_param[ind], beta, dpark_ctx)
        opttheta[ind] = stocha_grad_desc_agagrad(compute_cost, compute_grad,
                                                 theta, options,
                                                 step_size_init=0.2,
                                                 max_iter=25, tol=1e-7)
        # Preparing next layer!
        W = opttheta.get(ind)[:layer_size[ind] * layer_size[ind - 1]].\
            reshape(layer_size[ind], layer_size[ind - 1])
        b = opttheta.get(ind)[2 * layer_size[ind] * layer_size[ind - 1]:
                              2 * layer_size[ind] * layer_size[ind - 1] +
                              layer_size[ind]].\
            reshape(layer_size[ind], 1)
        data = ReLU(np.dot(W, data) + b)
        # visualization shows
        img[ind] = display_effect(W)
        plt.axis('off')
        plt.savefig(str(ind) + '.jpg')

        # DEBUG
        fout = open("theta_DEBUG.pkl", "wb")
        pickle.dump((W, b), fout)
        fout.close()
        sys.exit()

    # Trained parameters of stack AE
    para_stack = vecstack2stack(opttheta, hidden_size, visible_size)

    # Save trained weights and bias
    out = open("weights_bias.pkl", "wb")
    pickle.dump(para_stack, out)
    out.close()

    print "Mission complete!"
def DownLoad(file_path):
    dpark = DparkContext()
    file_block = dpark.textFile(file_path, splitSize=16 << 20)
    file_block.foreach(write_to_wav)
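# Illustrative usage sketch (not from the original source): the path is a
# placeholder, and write_to_wav is assumed to be defined elsewhere in this module.
if __name__ == '__main__':
    DownLoad('/path/to/file_list.txt')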
    return _


class DemoOutputStream(ForEachDStream):
    def __init__(self, parent, output):
        ForEachDStream.__init__(self, parent, collect(output))
        self.output = output

    def __setstate__(self, state):
        ForEachDStream.__setstate__(self, state)
        self.output = []
        self.func = collect(self.output)


sc = DparkContext(dpark_master)


class TestDStream(unittest.TestCase):
    def _setupStreams(self, input1, input2, operation):
        ssc = StreamingContext(2, sc)
        is1 = DemoInputStream(ssc, input1)
        ssc.registerInputStream(is1)
        if input2:
            is2 = DemoInputStream(ssc, input2)
            ssc.registerInputStream(is2)
            os = operation(is1, is2)
        else:
            os = operation(is1)
        output = DemoOutputStream(os, [])
        ssc.registerOutputStream(output)
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))
    for id, v in result.filter(
            lambda id_v: id_v[1].value > threshold).collect():
        print(id, v)
import time

from dpark import DparkContext, optParser
from dpark.file_manager import file_manager

dc = DparkContext()
optParser.set_usage("%prog [options] path")
options, args = optParser.parse_args()
path = args[0]


def run(split_size=1):
    t = time.time()
    dc.textFile(path).mergeSplit(
        splitSize=split_size).filter(lambda x: "yangxiufeng" in x).count()
    return time.time() - t


run()  # warm the file cache
print("{}s with locality".format(run()))
file_manager.fs_list = file_manager.fs_list[1:]
print("{}s merge & without locality".format(run(10)))
print("{}s without locality".format(run()))
def main(argv):
    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data
    del data
    num_train = 6000
    num_test = 6000

    # Preparing training and testing data
    if len(x) != len(y):
        print("Please examine the data set: labels and features do not match!")
        sys.exit()

    # Generate random training and testing sets, to evaluate the classifier
    # more reliably.
    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu = len(x_stu)
    ind_live = range(n_live)
    ind_stu = range(n_stu)
    random.shuffle(ind_live)
    random.shuffle(ind_stu)
    x_te = [x_live[i] for i in ind_live[num_train: num_test + num_train]] + \
           [x_stu[i] for i in ind_stu[num_train: num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train: num_test + num_train]) + \
           [-1.0] * len(ind_stu[num_train: num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
           [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0] * num_train + [-1.0] * num_train

    # SVM with a 10-fold cross validation to choose the best parameters.
    # gamma and c_reg are laid out in a parameter grid.
    # for-loop version
    '''
    gamma = np.arange(.01, 20, .04)
    c_reg = np.arange(.01, 20, .04)
    opt = []
    best_para = {'gamma': 0, 'c': 0, 'precision': 0}
    for g in gamma:
        for c in c_reg:
            opt = '-g ' + str(g) + ' -c ' + str(c) + ' -v 10 -q'
            pre = svm_train(y_tr, x_tr, opt)
            if pre > best_para.get('precision'):
                best_para['gamma'] = g
                best_para['c'] = c
                best_para['precision'] = pre
    best_opt = '-g ' + str(best_para.get('gamma')) + \
               ' -c ' + str(best_para.get('c')) + ' -q'
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')
    '''

    # dpark version
    dpark = DparkContext()
    gamma = np.arange(.01, 5, .08)
    c_reg = np.arange(.01, 5, .08)
    opt = []
    for g in gamma:
        for c in c_reg:
            opt.append('-g ' + str(g) + ' -c ' + str(c) + ' -v 10 -q')

    def map_iter(i):
        pre = svm_train(y_tr, list(x_tr), opt[i])
        return pre

    # pres = dpark.makeRDD(range(len(opt)), 100).map(map_iter).collect()
    pres = dpark.makeRDD(range(len(opt))).map(map_iter).collect()
    pres = np.array(pres)
    best_opt_ind = pres.argsort()
    best_opt = opt[best_opt_ind[-1]]
    best_opt = best_opt[:best_opt.find('-v') - 1]
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')
    print 'This SVM framework precision: %f' % p_acc[0]
import os
import sys
import time
import unittest
import logging

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dpark import DparkContext
from dpark.utils.nested_groupby import GroupByNestedIter
from dpark.shuffle import GroupByNestedIter, AutoBatchedSerializer
from dpark.utils.profile import profile
import dpark.conf

GroupByNestedIter.NO_CACHE = True

print_mem_incr = True
print_mem_incr = False

dc = DparkContext('mesos')
RC = dpark.conf.rddconf
M = 1024 * 1024


def rss_func(p):
    if hasattr(p, "memory_info"):
        mem_info = getattr(p, "memory_info")
    else:
        mem_info = getattr(p, 'get_memory_info')

    def _():
        return mem_info().rss / M

    return _
def test_lineage():
    Scope.reset()
    dc = DparkContext()

    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union(
        [dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)]) for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1
    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
    assert stage.pipeline_edges == {
        ((-1, rdd1.id), (-1, rdd3.id)): 1,
        ((-1, rdd2.id), (-1, rdd3.id)): 1
    }

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd5.id]
    assert sorted(stage.pipeline_edges) == sorted([
        ((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents
    ])

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(list(
                s.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
            assert s.pipeline_edges == {
                ((-1, rdd1.id), (-1, rdd3.id)): 1,
                ((-1, rdd2.id), (-1, rdd3.id)): 1
            }
        else:
            assert False
        pprint(s.get_pipeline_graph())
    def get_rdd(self):
        dpark = DparkContext()
        return dpark.union([
            dpark.textFile(path, splitSize=64 << 20)
            for path in self.paths
        ]).map(Weblog.from_line)
import time

from dpark import DparkContext


def m(x):
    if x[0] == 0:
        time.sleep(100)
    return x


def r(x, y):
    return x + y


dc = DparkContext("mesos")
rdd = dc.makeRDD([(i, i) for i in range(2)], 2)
rdd.collect()
rdd.reduceByKey(r).map(m).reduceByKey(r).collect()
def main():
    # Loading data
    print "Loading..."
    data_train = sample_image()

    # Initialize networks
    visible_size = 64            # number of input units
    hidden_size = [25, 16, 9]    # number of hidden units of each layer
    lamb = 0.0001                # weight decay parameter
    beta = 3                     # weight of the sparsity penalty term

    # dpark initialize
    dpark_ctx = DparkContext()

    # Start training, and L-BFGS is adopted
    # We apply a stack-wise greedy training process
    layer_ind = range(len(hidden_size) + 1)
    layer_ind.remove(0)
    layer_size = [visible_size] + hidden_size

    # desired average activation
    sparsity_param = dict()
    for ind in layer_ind:
        # standard: 64 units -> sparsity parameter 0.01
        sparsity_param[ind] = layer_size[ind - 1] * 0.01 / 64

    data = data_train
    opttheta = dict()  # parameter vector of stack AE
    img = dict()       # visualization mode
    for ind in layer_ind:
        print "start training layer No.%d" % ind
        # Obtain random parameters of considered layer
        theta = initial_parameter(layer_size[ind], layer_size[ind - 1])
        # Training begins
        options = (data, layer_size[ind - 1], layer_size[ind], lamb,
                   sparsity_param[ind], beta, dpark_ctx)
        opt = optimize.fmin_l_bfgs_b(compute_cost, theta, compute_grad,
                                     options)
        opttheta[ind] = opt[0]
        W = opttheta.get(ind)[:layer_size[ind] * layer_size[ind - 1]].\
            reshape(layer_size[ind], layer_size[ind - 1])
        data = np.dot(W, data)
        # visualization shows
        img[ind] = display_effect(W)
        plt.axis('off')
        plt.savefig(str(ind) + '.jpg')

    # Trained parameters of stack AE
    para_stack = vecstack2stack(opttheta, hidden_size, visible_size)

    # Save trained weights and bias
    out = open("weights_bias.pkl", "wb")
    pickle.dump(para_stack, out)
    out.close()

    print "Mission complete!"
# -*- coding: utf-8 -*-
from dpark import DparkContext


def m(x):
    return x


rdd = DparkContext().makeRDD([(1, 1)]).map(m).groupByKey()
rdd.map(m).collect()
rdd.map(m).collect()
import math
import random
import os, sys
from pprint import pprint

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print 'logging', log.count()
print 'error', log.filter(lambda line: 'error' in line).count()
for line in log.filter(lambda line: 'error' in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(
    sorted(
        counts.filter(lambda (_, v): v > 20).map(
            lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))
def main(): """Tuning for SparseAE""" # First layer T = SparseAE(64, 49, optimize_method='cg', max_iter=400, debug=0, verbose=True, tol=1e-8, mini_batch=32) X = vs.load_sample('IMAGES.mat', patch_size=8, n_patches=10000) T.train(X) T.devec_theta() vs.disp_effect(T.w1, fname='Fst_lyr.jpg') # Second layer rho = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100] beta = [3e-3, 3e-2, 9e-2, 3e-1, 9e-1, 3, 9, 30, 90, 300] lamb = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10] param = product(rho, beta, lamb) param_str = ['rho', 'sparse_beta', 'lamb'] param = map(lambda x: dict(zip(param_str, x)), param) X = activate(np.dot(T.w1, X) + T.b1) if not os.path.isdir('./imgs'): os.system('mkdir imgs') ''' for idx, param_elem in enumerate(param): import warnings warnings.filterwarnings('error') try: S = SparseAE(49, 36, optimize_method='cg', max_iter=400, debug=0, verbose=True, tol=1e-8, mini_batch=32, **param_elem) S.train(X) S.devec_theta() fname = 'imgs/' + str(idx) + '.jpg' vs.disp_effect(S.w1, fname=fname) except: fname = 'imgs/' + 'log' fid = open(fname, 'w') fid.write('Exception: ' + str(idx) + '\n') fid.close() ''' # Break point re-computing fls = os.listdir('./imgs/weight') fls = map(lambda x: int(x[:x.find('.')]), fls) for fl in fls: param = param[:fl] + param[fl + 1:] # dpark parallel computing dpark_ctx = DparkContext('process') dpark_n_length = len(param) dpark_n_block = 50 if not os.path.isdir('./imgs/weight'): os.system('mkdir imgs/weight') print '%d models await training.' % dpark_n_length def map_iter(param_enum): idx = param_enum[0][0] * int(ceil(dpark_n_length / dpark_n_block)) +\ param_enum[0][1] import warnings warnings.filterwarnings('error') try: S = SparseAE(49, 36, optimize_method='cg', max_iter=400, debug=0, verbose=True, tol=1e-8, mini_batch=32, **param_enum[1]) S.train( np.array(X)) # dpark converts X, 'np.ndarray' to 'instance' S.devec_theta() fname = 'imgs/weight/' + str(idx) + '.csv' # vs.disp_effect(S.w1, fname=fname) # dpark doesn't support plt.savefig() np.savetxt(fname, S.w1, delimiter=',') except: import traceback traceback.print_exc() fname = 'imgs/' + 'log' fid = open(fname, 'w') fid.write('Training exception: ' + str(idx) + '\n') fid.close() dpark_ctx.makeRDD(param, dpark_n_block).enumerate().foreach(map_iter) print 'Done.' # Visualizing for i in range(len(param)): fname = 'imgs/weight/' + str(i) + '.csv' if not os.path.isfile(fname): continue w = np.loadtxt(fname, delimiter=',') fname_img = 'imgs/' + str(i) + '.jpg' vs.disp_effect(w, fname=fname_img) print i, 'visualization done.'