def test_scope():
    Scope.reset()
    dc = DparkContext()
    rdd = dc.makeRDD([1, 2, 3]).map(int).map(int).map(int)
    dc.scheduler.current_scope = Scope.get("")
    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:{}".format(i))

    Scope.reset()
    rdd = dc.makeRDD([1, 2, 3]) \
        .map(int) \
        .map(int) \
        .map(int)
    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:0")

    def get_rdd(n):
        return dc.makeRDD([n, n]).map(int).map(int).map(int)

    rdds = [get_rdd(1), get_rdd(2)]
    assert rdds[0].scope.id + 4 == rdds[1].scope.id

    rdds = [get_rdd(i) for i in range(2)]
    assert rdds[0].scope.id == rdds[1].scope.id
def main():
    current_path = os.path.dirname(os.path.abspath(__file__))
    for i in cmp_list:
        assert os.path.isdir('cmp' + str(i + 1))

    dpark_ctx = DparkContext('process')

    # Dpark task: run one comparison job in its own directory,
    # skipping it if a non-empty log directory already exists.
    def map_iter(i):
        dir_name = 'cmp' + str(i + 1)
        logger = os.path.join(dir_name, 'log')
        if os.path.isdir(logger) and os.listdir(logger):
            return
        print("Start running:", i + 1)
        os.chdir(os.path.join(current_path, 'cmp') + str(i + 1))
        os.system('python ./cmp.py')

    dpark_ctx.makeRDD(cmp_list).foreach(map_iter)

    print('Done.')
def test_call_graph_join():
    dc = DparkContext()
    Scope.reset()
    rdd = dc.makeRDD([(1, 1), (1, 2)]).map(lambda x: x)
    rdd = rdd.join(rdd)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    pprint(g)
    assert g == ([0, 1, 2, 3], {(0, 1): 1, (1, 2): 2, (2, 3): 1})
    fg = dc.scheduler.fmt_call_graph(g)
def test_call_graph_union():
    dc = DparkContext()
    Scope.reset()
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = r1.union(r2)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    # pprint(g)
    fg = dc.scheduler.fmt_call_graph(g)
    # pprint(fg)
    assert g == ([0, 1, 2, 3, 4, 5],
                 {(0, 1): 2, (1, 4): 1, (2, 3): 2, (3, 4): 1, (4, 5): 1})
def dt_model():
    tr_x, tr_y, va_x, va_y, te_x, te_y = load_data()
    param_grid = {
        "min_samples_split": range(1, 10000, 1000),
        "min_samples_leaf": range(1, 10000, 1000),
        # 'max_leaf_nodes': [0, 100, 1000, 10000],
        "max_depth": [None, 100, 1000, 10000],
    }
    param_grid = grid_generator(param_grid)

    # Dpark
    dpark_ctx = DparkContext()

    def map_iter(param):
        idx = param[0][0] * 2 + param[0][1]
        param = param[1]
        m = tree.DecisionTreeClassifier(criterion="entropy", **param)
        print("%d, Start training Decision Tree model." % idx)
        m = m.fit(tr_x, tr_y)
        print("%d, Training done." % idx)
        proba = m.predict_proba(va_x)
        fpr, tpr, thresh = roc_curve(va_y, proba[:, 1])
        auc_ = auc(fpr, tpr)
        print("%d, AUC is %f" % (idx, auc_))
        return idx, param, auc_

    print("It will train %d models" % len(param_grid))
    result_record = dpark_ctx.makeRDD(param_grid, 50) \
        .enumerate() \
        .map(map_iter) \
        .collect()

    file_record = open("dt_result.pkl", "wb")
    pickle.dump(result_record, file_record)
    file_record.close()

    # testing with the best model found on the validation set
    opt = reduce(lambda x, y: x if x[2] > y[2] else y, result_record)
    m = tree.DecisionTreeClassifier(criterion="entropy", **opt[1])
    m = m.fit(tr_x, tr_y)
    proba = m.predict_proba(te_x)
    fpr, tpr, thresh = roc_curve(te_y, proba[:, 1])
    auc_ = auc(fpr, tpr)
    print("Testing AUC is %f" % auc_)
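# grid_generator() is not shown in this excerpt. Judging from how its result is
# used above (a sequence of keyword dicts fed to DecisionTreeClassifier), a
# minimal sketch could look like the following; the name and exact behaviour are
# assumptions, not the original implementation.
from itertools import product


def grid_generator(grid):
    # Expand {param: iterable of values} into a list of concrete
    # keyword-argument dicts, one per point of the Cartesian grid.
    keys = sorted(grid)
    return [dict(zip(keys, values))
            for values in product(*(grid[k] for k in keys))]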
    .filter(lambda x: x)\
    .filter(lambda line: (not is_spider(line) and
                          (line['uid'] or line['bid'])))\
    .filter(lambda l: l['bid'] not in fraud.value and
            l['uid'] not in fraud.value)

spec = set(['url', 'uid', 'bid', 'unit_id', 'ad_id', 'status_code',
            'user_agent', 'region', 'page_tags', 'hour', 'group'])
features = common_gen(spec)
features = features.map(feature_extract)\
                   .filter(lambda x: x)\
                   .cache()

user_list = set(features.map(lambda x: x[0]).filter(lambda x: x != 'None').collect())
user_list_b = dp.broadcast(user_list)

user_feature = dp.makeRDD([])

def _parse_list(line):
    uid, features = line.split('\t')
    features = [x.split(':') for x in features.split('|')]
    features = [(x[0], float(x[1])) for x in features]
    features = sorted(features, key=lambda x: x[1], reverse=True)
    return (uid, features)

for name in ['book_cluster', 'movie_cluster', 'group_cluster', 'text_cluster']:
    fn = '/home2/alg/user_profile/%s/%s' % (current_date, name)
    if not os.path.exists(fn):
        continue
    rdd = dp.textFile(fn, splitSize=16 << 20)\
        .filter(lambda x: x.split('\t', 1)[0] in user_list_b.value)\
        .map(_parse_list)\
    # SGD pass over one (i, j) block: update each row of Wi and Hj
    # from the corresponding observed entries in Oij.
    for x in range(m):
        for y in range(v):
            pred = Wi[x].dot(Hj[y])
            err = int(Oij[x][y]) - int(pred)
            w = Wi[x] + GAMMA * (Hj[y] * err - LAMBDA * Wi[x])
            h = Hj[y] + GAMMA * (Wi[x] * err - LAMBDA * Hj[y])
            Wi[x] = w
            Hj[y] = h
    W.put(i, Wi)
    H.put(j, Hj)

rdd = dpark.makeRDD(list(range(d)))
rdd = rdd.cartesian(rdd).cache()

def calc_err(i_j):
    (i, j) = i_j
    Wi = W.get(i)
    Hj = H.get(j)
    ori = ori_b.value
    Rij = Wi.dot(Hj.T)
    Oij = ori[i * m:(i + 1) * m, j * v:(j + 1) * v]
    return ((Rij - Oij) ** 2).sum()

J = list(range(d))
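# For reference: the inner double loop above is the standard L2-regularized SGD
# update for matrix factorization. With err = O - w.dot(h), both factors move
# along the gradient using the old values of w and h:
#   w <- w + GAMMA * (err * h - LAMBDA * w)
#   h <- h + GAMMA * (err * w - LAMBDA * h)
# A self-contained numpy sketch of one step; the toy values below are
# illustrative only, not taken from the original script.
import numpy as np

GAMMA, LAMBDA = 0.01, 0.1       # learning rate and L2 penalty (example values)
w = np.array([0.1, 0.2])        # one row of the factor block Wi
h = np.array([0.3, 0.4])        # one row of the factor block Hj
o = 5.0                         # the observed entry Oij[x][y]

err = o - w.dot(h)              # prediction error
w_new = w + GAMMA * (h * err - LAMBDA * w)
h_new = h + GAMMA * (w * err - LAMBDA * h)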
import time

from dpark import DparkContext


def m(x):
    if x[0] == 0:
        time.sleep(100)
    return x


def r(x, y):
    return x + y


dc = DparkContext("mesos")
rdd = dc.makeRDD([(i, i) for i in range(2)], 2)
rdd.collect()
rdd.reduceByKey(r).map(m).reduceByKey(r).collect()
def main(argv):
    # Dpark initialize
    dpark = DparkContext()

    # size of the training and testing sets
    num_train = 6000
    num_test = 6000

    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data

    # Preparing training and testing data
    if len(x) != len(y):
        print("The labels and features do not match!")
        sys.exit()

    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu = len(x_stu)
    ind_live = list(range(n_live))
    ind_stu = list(range(n_stu))
    random.shuffle(ind_live)
    random.shuffle(ind_stu)
    x_te = [x_live[i] for i in ind_live[num_train: num_test + num_train]] + \
           [x_stu[i] for i in ind_stu[num_train: num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train: num_test + num_train]) + \
           [-1.0] * len(ind_stu[num_train: num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
           [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0] * num_train + [-1.0] * num_train

    # dpark version
    def map_iter(i):
        y_tr_examplar = [-1.0] * len(y_tr)
        y_tr_examplar[i] = 1.0
        # opt = '-t 0 -w1 ' + str(len(y_tr)) + ' -w-1 1 -b 1 -q'
        # It is suggested in Efros' paper that:
        # C1 0.5, C2 0.01
        opt = '-t 0 -w1 0.5 -w-1 0.01 -b 1 -q'
        m = svm_train(y_tr_examplar, list(x_tr), opt)
        p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-b 1 -q')
        p_val = np.array(p_val)
        # p_val = np.delete(p_val, 1, 1)  # shape = (N, 1)
        p_val = p_val[:, 0]  # shape = (N, )
        return p_val

    p_vals = dpark.makeRDD(range(len(y_tr))).map(map_iter).collect()
    val = np.array(p_vals).T

    # for-loop version
    '''
    # Examplar SVM Training
    ensemble_model = []
    # DPark
    for i in range(len(y_tr)):
        y_tr_examplar = [-1.0] * len(y_tr)
        y_tr_examplar[i] = 1.0
        # opt = '-t 0 -w1 ' + str(len(y_tr)) + ' -w-1 1 -b 1 -q'
        # It is suggested in Efros' paper that:
        # C1 0.5, C2 0.01
        opt = '-t 0 -w1 0.5 -w-1 0.01 -b 1 -q'
        m = svm_train(y_tr_examplar, x_tr, opt)
        ensemble_model.append(m)
        print("The %s-th examplar SVM has been trained" % i)

    # Calibration, to be updated
    # Since we adopt the probability estimation model of LIB_SVM,
    # calibrating seems unnecessary

    # Ensemble classify
    val = np.zeros((len(y_te), 1))
    for m in ensemble_model:
        p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-b 1 -q')
        p_val = np.array(p_val)
        p_val = np.delete(p_val, 1, 1)
        val = np.hstack((val, p_val))
    if val.shape[1] != len(y_tr) + 1:
        print("Chaos!")
    val = np.delete(val, 0, 1)
    print('val.shape =', val.shape)
    '''

    # KNN
    k = num_train // 8
    sorted_index = val.argsort(axis=1)
    sorted_index = sorted_index.T[::-1].T
    p_label = []
    for index in sorted_index:
        nearest_samples = []
        for sample_index in index[:k]:
            nearest_samples.append(y_tr[sample_index])
        n, bins, dummy = plt.hist(nearest_samples, 2, normed=1,
                                  facecolor='r', alpha=0.75)
        if n[0] > n[1]:
            p_label.append(-1.0)
        else:
            p_label.append(1.0)

    # evaluation
    rate, pos_rate, neg_rate = evaluation(y_te, p_label)
    print("The Examplar SVM framework achieves a precision of %f" % rate)
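# The helpers find() and evaluation() are not included in this excerpt. From the
# call sites above -- find(y, 1.0) yields the indices whose label equals the given
# value, and evaluation(y_te, p_label) returns an overall rate plus per-class
# rates -- a minimal sketch could be the following. Signatures and return order
# are assumptions, not the original implementation.
def find(labels, value):
    # Indices of all samples whose label equals `value`.
    return [i for i, label in enumerate(labels) if label == value]


def evaluation(y_true, y_pred):
    # Overall accuracy plus the accuracy on the positive (+1.0) and
    # negative (-1.0) classes.
    correct = sum(1 for t, p in zip(y_true, y_pred) if t == p)
    rate = float(correct) / len(y_true)
    pos = [(t, p) for t, p in zip(y_true, y_pred) if t == 1.0]
    neg = [(t, p) for t, p in zip(y_true, y_pred) if t == -1.0]
    pos_rate = float(sum(1 for t, p in pos if t == p)) / len(pos) if pos else 0.0
    neg_rate = float(sum(1 for t, p in neg if t == p)) / len(neg) if neg else 0.0
    return rate, pos_rate, neg_rate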
def main(argv):
    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data
    del data

    num_train = 6000
    num_test = 6000

    # Preparing training and testing data
    if len(x) != len(y):
        print("Please examine the data set: the labels and features do not match!")
        sys.exit()

    # Generate random training and testing sets so the classifier's
    # performance is estimated more reliably.
    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu = len(x_stu)
    ind_live = list(range(n_live))
    ind_stu = list(range(n_stu))
    random.shuffle(ind_live)
    random.shuffle(ind_stu)
    x_te = [x_live[i] for i in ind_live[num_train: num_test + num_train]] + \
           [x_stu[i] for i in ind_stu[num_train: num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train: num_test + num_train]) + \
           [-1.0] * len(ind_stu[num_train: num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
           [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0] * num_train + [-1.0] * num_train

    # SVM with 10-fold cross validation to choose the best parameters;
    # gamma and c_reg are laid out on a parameter grid.

    # for-loop version
    '''
    gamma = np.arange(.01, 20, .04)
    c_reg = np.arange(.01, 20, .04)
    opt = []
    best_para = {'gamma': 0, 'c': 0, 'precision': 0}
    for g in gamma:
        for c in c_reg:
            opt = '-g ' + str(g) + ' -c ' + str(c) + ' -v 10 -q'
            pre = svm_train(y_tr, x_tr, opt)
            if pre > best_para.get('precision'):
                best_para['gamma'] = g
                best_para['c'] = c
                best_para['precision'] = pre
    best_opt = '-g ' + str(best_para.get('gamma')) + ' -c ' + str(best_para.get('c')) + ' -q'
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')
    '''

    # dpark version
    dpark = DparkContext()
    gamma = np.arange(.01, 5, .08)
    c_reg = np.arange(.01, 5, .08)
    opt = []
    for g in gamma:
        for c in c_reg:
            opt.append('-g ' + str(g) + ' -c ' + str(c) + ' -v 10 -q')

    def map_iter(i):
        pre = svm_train(y_tr, list(x_tr), opt[i])
        return pre

    # pres = dpark.makeRDD(range(len(opt)), 100).map(map_iter).collect()
    pres = dpark.makeRDD(range(len(opt))).map(map_iter).collect()

    pres = np.array(pres)
    best_opt_ind = pres.argsort()
    best_opt = opt[best_opt_ind[-1]]
    best_opt = best_opt[:best_opt.find('-v') - 1]
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')

    print('This SVM framework precision: %f' % p_acc[0])
def test_lineage():
    Scope.reset()
    dc = DparkContext()
    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union([dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
                     for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1

    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
    assert stage.pipeline_edges == {
        ((-1, rdd1.id), (-1, rdd3.id)): 1,
        ((-1, rdd2.id), (-1, rdd3.id)): 1
    }

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd5.id]
    assert sorted(stage.pipeline_edges) == sorted([
        ((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents
    ])
    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(list(s.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
            assert s.pipeline_edges == {
                ((-1, rdd1.id), (-1, rdd3.id)): 1,
                ((-1, rdd2.id), (-1, rdd3.id)): 1
            }
        else:
            assert False

        pprint(s.get_pipeline_graph())
    return (id, Vertex(id, sys.maxsize, outEdges, True))


def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [(edge.target_id, newValue + edge.value)
                  for edge in self.outEdges]
    else:
        outbox = []
    return Vertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'graph.txt')
    lines = ctx.textFile(path).map(lambda line: line.split(' '))

    vertices = lines.groupBy(lambda line: line[0]).map(to_vertex)
    startVertex = str(0)
    messages = ctx.makeRDD([(startVertex, 0)])
    print('read', vertices.count(), 'vertices and', messages.count(), 'messages.')

    result = Bagel.run(ctx, vertices, messages, compute,
                       BasicCombiner(min), numSplits=2)

    print('Shortest path from %s to all vertices:' % startVertex)
    for id, v in result.collect():
        if v.value == sys.maxsize:
            v.value = 'inf'
        print(v.id, v.value)
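# Only the tail of to_vertex() survives in this excerpt. Assuming graph.txt holds
# one edge per line as "source target weight" (which matches the split(' ') and
# groupBy above), the full helper might look like this sketch. The Edge
# constructor arguments are inferred from how compute() reads edge.target_id and
# edge.value; this is an assumption, not the original code.
def to_vertex(id_and_lines):
    # `id_and_lines` comes from lines.groupBy(...): a vertex id paired with
    # all whitespace-split lines whose first field is that id.
    (id, lines) = id_and_lines
    outEdges = [Edge(fields[1], int(fields[2])) for fields in lines]
    return (id, Vertex(id, sys.maxsize, outEdges, True))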
def main():
    """Tuning for SparseAE"""
    # First layer
    T = SparseAE(64, 49, optimize_method='cg', max_iter=400, debug=0,
                 verbose=True, tol=1e-8, mini_batch=32)
    X = vs.load_sample('IMAGES.mat', patch_size=8, n_patches=10000)
    T.train(X)
    T.devec_theta()
    vs.disp_effect(T.w1, fname='Fst_lyr.jpg')

    # Second layer
    rho = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
    beta = [3e-3, 3e-2, 9e-2, 3e-1, 9e-1, 3, 9, 30, 90, 300]
    lamb = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
    param = product(rho, beta, lamb)
    param_str = ['rho', 'sparse_beta', 'lamb']
    param = list(map(lambda x: dict(zip(param_str, x)), param))
    X = activate(np.dot(T.w1, X) + T.b1)

    if not os.path.isdir('./imgs'):
        os.system('mkdir imgs')

    '''
    for idx, param_elem in enumerate(param):
        import warnings
        warnings.filterwarnings('error')
        try:
            S = SparseAE(49, 36, optimize_method='cg', max_iter=400, debug=0,
                         verbose=True, tol=1e-8, mini_batch=32, **param_elem)
            S.train(X)
            S.devec_theta()
            fname = 'imgs/' + str(idx) + '.jpg'
            vs.disp_effect(S.w1, fname=fname)
        except:
            fname = 'imgs/' + 'log'
            fid = open(fname, 'w')
            fid.write('Exception: ' + str(idx) + '\n')
            fid.close()
    '''

    # Break-point re-computing: drop parameter sets whose weights have
    # already been written to ./imgs/weight (if that directory exists yet).
    fls = os.listdir('./imgs/weight') if os.path.isdir('./imgs/weight') else []
    fls = list(map(lambda x: int(x[:x.find('.')]), fls))
    for fl in fls:
        param = param[:fl] + param[fl + 1:]

    # dpark parallel computing
    dpark_ctx = DparkContext('process')
    dpark_n_length = len(param)
    dpark_n_block = 50
    if not os.path.isdir('./imgs/weight'):
        os.system('mkdir imgs/weight')
    print('%d models await training.' % dpark_n_length)

    def map_iter(param_enum):
        idx = param_enum[0][0] * int(ceil(dpark_n_length / dpark_n_block)) + \
            param_enum[0][1]
        import warnings
        warnings.filterwarnings('error')
        try:
            S = SparseAE(49, 36, optimize_method='cg', max_iter=400, debug=0,
                         verbose=True, tol=1e-8, mini_batch=32, **param_enum[1])
            S.train(np.array(X))  # dpark converts X from 'np.ndarray' to 'instance'
            S.devec_theta()
            fname = 'imgs/weight/' + str(idx) + '.csv'
            # vs.disp_effect(S.w1, fname=fname)  # dpark doesn't support plt.savefig()
            np.savetxt(fname, S.w1, delimiter=',')
        except:
            import traceback
            traceback.print_exc()
            fname = 'imgs/' + 'log'
            fid = open(fname, 'w')
            fid.write('Training exception: ' + str(idx) + '\n')
            fid.close()

    dpark_ctx.makeRDD(param, dpark_n_block).enumerate().foreach(map_iter)
    print('Done.')

    # Visualizing
    for i in range(len(param)):
        fname = 'imgs/weight/' + str(i) + '.csv'
        if not os.path.isfile(fname):
            continue
        w = np.loadtxt(fname, delimiter=',')
        fname_img = 'imgs/' + str(i) + '.jpg'
        vs.disp_effect(w, fname=fname_img)
        print(i, 'visualization done.')
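# activate() is not defined in this excerpt. For a sparse autoencoder whose
# hidden-layer output feeds the next layer as above, it is typically the
# elementwise logistic sigmoid; the stand-in below is an assumption, not the
# original helper.
import numpy as np


def activate(z):
    # Elementwise logistic sigmoid.
    return 1.0 / (1.0 + np.exp(-z))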