Example No. 1
def test_scope():

    Scope.reset()
    dc = DparkContext()

    rdd = dc.makeRDD([1, 2, 3]).map(int).map(int).map(int)
    dc.scheduler.current_scope = Scope.get("")

    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:{}".format(i))

    Scope.reset()
    rdd = dc.makeRDD([1, 2, 3]) \
        .map(int) \
        .map(int) \
        .map(int)

    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:0")

    def get_rdd(n):
        return dc.makeRDD([n, n]).map(int).map(int).map(int)

    rdds = [get_rdd(1), get_rdd(2)]
    assert rdds[0].scope.id + 4 == rdds[1].scope.id

    rdds = [get_rdd(i) for i in range(2)]
    assert rdds[0].scope.id == rdds[1].scope.id
Example No. 2
def test_call_graph_join():
    dc = DparkContext()
    Scope.reset()
    rdd = dc.makeRDD([(1, 1), (1, 2)]).map(lambda x: x)
    rdd = rdd.join(rdd)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    pprint(g)
    assert g == ([0, 1, 2, 3], {(0, 1): 1, (1, 2): 2, (2, 3): 1})

    fg = dc.scheduler.fmt_call_graph(g)
Example No. 3
def word_count(file_path, word):
    # Specify which Mesos master to communicate with
    dpark = DparkContext()

    # Build a file RDD from the distributed file, with 16 MB per split
    f = dpark.textFile(file_path, splitSize=16 << 20)

    # map() into a new RDD, filter() the matching lines, then count() the result
    print(
        word, 'count:',
        f.map(lambda line: line.strip()).filter(
            lambda line: word in line).count())
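
A minimal driver for word_count might look like the sketch below, assuming the function is saved in a script and the path and word come from the command line; the argument handling mirrors the optParser pattern used in Example No. 10 and is illustrative, not part of the original snippet:

from dpark import optParser

if __name__ == '__main__':
    # usage sketch: python word_count.py [dpark options] <path> <word>
    optParser.set_usage("%prog [options] path word")
    options, args = optParser.parse_args()
    word_count(args[0], args[1])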
Example No. 4
def main():
    current_path = os.path.dirname(os.path.abspath(__file__))
    for i in cmp_list:
        assert os.path.isdir('cmp'+str(i+1))
    dpark_ctx = DparkContext('process')

    # worker function executed by each dpark task
    def map_iter(i):
        dir_name = 'cmp' + str(i+1)
        logger = os.path.join(dir_name, 'log')
        if os.path.isdir(logger) and os.listdir(logger):
            return
        print "Start running: ", i+1
        os.chdir(os.path.join(current_path, 'cmp') + str(i+1))
        os.system('python ./cmp.py')

    dpark_ctx.makeRDD(cmp_list).foreach(map_iter)
    print 'Done.'
Example No. 5
def test_call_graph_union():
    dc = DparkContext()
    Scope.reset()
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = r1.union(r2)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    # pprint(g)
    fg = dc.scheduler.fmt_call_graph(g)
    # pprint(fg)
    assert g == ([0, 1, 2, 3, 4, 5], {
        (0, 1): 2,
        (1, 4): 1,
        (2, 3): 2,
        (3, 4): 1,
        (4, 5): 1
    })
Example No. 6
def main():
    # Loading data
    print "Loading..."
    data = sample_image()
    # Initialize networks
    visible_size = 64  # number of input units
    hidden_size = [25, 16, 9]  # number of hidden units of each layer
    #lamb = 0.0001     # weight decay parameter
    '''
    lamb = 0 # No weight decay!
    beta = 0.01
    '''
    # sigmoid DEBUG
    lamb = 0.0001
    beta = 3

    # dpark initialize
    dpark_ctx = DparkContext()
    # Start training, and L-BFGS is adopted
    # We apply a stack-wise greedy training process
    layer_ind = range(len(hidden_size) + 1)
    layer_ind.remove(0)
    layer_size = [visible_size] + hidden_size
    opttheta = dict()  # parameter vector of stack AE
    img = dict()  # visualization mode

    sparsity_param = dict()
    for ind in layer_ind:
        # standard: 64 units -> sparsity parameter 0.01
        sparsity_param[ind] = layer_size[ind - 1] * 0.01 / 64

    for ind in layer_ind:
        print "start training layer No.%d" % ind
        # Obtain random parameters of considered layer
        theta = initial_parameter(layer_size[ind], layer_size[ind - 1])
        # SGD with mini-batch
        options = (data, layer_size[ind - 1], layer_size[ind], lamb,
                   sparsity_param[ind], beta, dpark_ctx)
        opttheta[ind] = stocha_grad_desc_agagrad(compute_cost,
                                                 compute_grad,
                                                 theta,
                                                 options,
                                                 step_size_init=0.2,
                                                 max_iter=25,
                                                 tol=1e-7)
        # Preparing next layer!
        W = opttheta.get(ind)[:layer_size[ind]*layer_size[ind-1]].\
            reshape(layer_size[ind], layer_size[ind-1])
        b = opttheta.get(ind)[2*layer_size[ind]*layer_size[ind-1]:\
            2*layer_size[ind]*layer_size[ind-1]+layer_size[ind]].\
            reshape(layer_size[ind], 1)
        data = ReLU(np.dot(W, data) + b)
        # visualization of the learned weights
        img[ind] = display_effect(W)
        plt.axis('off')
        plt.savefig(str(ind) + '.jpg')

        # DEBUG
        fout = open("theta_DEBUG.pkl", "wb")
        pickle.dump((W, b), fout)
        fout.close()
        sys.exit()

    # Trained parameters of stack AE
    para_stack = vecstack2stack(opttheta, hidden_size, visible_size)

    # Save trained weights and bias
    out = open("weights_bias.pkl", "wb")
    pickle.dump(para_stack, out)
    out.close()

    print "Mission complete!"
Example No. 7
def DownLoad(file_path):
    dpark = DparkContext()
    file_block = dpark.textFile(file_path, splitSize=16 << 20)
    file_block.foreach(write_to_wav)
Example No. 8
    return _


class DemoOutputStream(ForEachDStream):
    def __init__(self, parent, output):
        ForEachDStream.__init__(self, parent, collect(output))
        self.output = output

    def __setstate__(self, state):
        ForEachDStream.__setstate__(self, state)
        self.output = []
        self.func = collect(self.output)


sc = DparkContext(dpark_master)


class TestDStream(unittest.TestCase):
    def _setupStreams(self, input1, input2, operation):
        ssc = StreamingContext(2, sc)
        is1 = DemoInputStream(ssc, input1)
        ssc.registerInputStream(is1)
        if input2:
            is2 = DemoInputStream(ssc, input2)
            ssc.registerInputStream(is2)
            os = operation(is1, is2)
        else:
            os = operation(is1)
        output = DemoOutputStream(os, [])
        ssc.registerOutputStream(output)
Example No. 9
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox

    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01

    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))

    for id, v in result.filter(
            lambda id_v: id_v[1].value > threshold).collect():
        print(id, v)
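
The compute closure above applies the usual damped PageRank update, newValue = 0.15 / num + 0.85 * messageSum[0], and then spreads newValue evenly over the vertex's out-edges. A stand-alone sketch of that arithmetic with made-up numbers, independent of Bagel and dpark:

num = 4                   # total number of vertices
message_sum = 0.15        # hypothetical PageRank mass received this superstep
new_value = 0.15 / num + 0.85 * message_sum
out_degree = 2
outbox = [new_value / out_degree] * out_degree
print(new_value, outbox)  # 0.165 [0.0825, 0.0825]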
Example No. 10
import time
from dpark import DparkContext, optParser
from dpark.file_manager import file_manager
dc = DparkContext()

optParser.set_usage("%prog [options] path")
options, args = optParser.parse_args()

path = args[0]


def run(split_size=1):
    t = time.time()
    dc.textFile(path).mergeSplit(
        splitSize=split_size).filter(lambda x: "yangxiufeng" in x).count()
    return time.time() - t


run()  # warm-up run so the file cache is populated
print("{}s with locality".format(run()))
file_manager.fs_list = file_manager.fs_list[1:]  # drop the first filesystem backend; later runs lose data locality
print("{}s merge & without locality".format(run(10)))
print("{}s without locality".format(run()))
Example No. 11
def main(argv):
    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data
    del data

    num_train = 6000
    num_test = 6000

    # Preparing training and testing data
    if len(x) != len(y):
        print("Please examine the data set, for the labels and features are not accorded!")
        sys.exit()
    # Build random training and testing sets so the classifier's performance is measured more reliably.
    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu = len(x_stu)
    ind_live = range(n_live)
    ind_stu = range(n_stu)
    random.shuffle(ind_live)
    random.shuffle(ind_stu)

    x_te = [x_live[i] for i in ind_live[num_train : num_test + num_train]] + \
        [x_stu[i] for i in ind_stu[num_train : num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train : num_test + num_train]) + \
        [-1.0]*len(ind_stu[num_train : num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
        [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0]*num_train + [-1.0]*num_train

    # SVM and a 10-fold Cross Validation choosing the best parameters.
    # gamma and c_reg are constructed in a parameter grid
    
    # for-loop version
    '''
    gamma = np.arange(.01,20,.04)
    c_reg = np.arange(.01,20,.04)
    opt = []
    best_para = {'gamma': 0, 'c': 0, 'precision': 0}
    for g in gamma:
        for c in c_reg:
            opt = '-g '+ str(g) +' -c ' + str(c) + ' -v 10 -q'
            pre = svm_train(y_tr,x_tr,opt)
            if pre > best_para.get('precision'):
                best_para['gamma'] = g
                best_para['c'] = c
                best_para['precision'] = pre 
    best_opt = '-g '+ str(best_para.get('gamma')) +' -c ' + str(best_para.get('c')) + ' -q'
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')
    '''

    # dpark version
    dpark = DparkContext()
    gamma = np.arange(.01, 5, .08)
    c_reg = np.arange(.01, 5, .08)
    opt = []
    for g in gamma:
        for c in c_reg:
            opt.append('-g '+ str(g) +' -c ' + str(c) + ' -v 10 -q')

    def map_iter(i):
        pre = svm_train(y_tr, list(x_tr), opt[i])
        return pre

    #pres = dpark.makeRDD(range(len(opt)),100).map(map_iter).collect()
    pres = dpark.makeRDD(range(len(opt))).map(map_iter).collect()
    pres = np.array(pres)
    best_opt_ind = pres.argsort()
    best_opt = opt[best_opt_ind[-1]]

    best_opt = best_opt[:best_opt.find('-v') - 1]
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')

    print 'Precision of this SVM framework: %f' % p_acc[0]
Example No. 12
import os
import sys
import time
import unittest
import logging

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dpark import DparkContext
from dpark.utils.nested_groupby import GroupByNestedIter
from dpark.shuffle import AutoBatchedSerializer
from dpark.utils.profile import profile
import dpark.conf

GroupByNestedIter.NO_CACHE = True
print_mem_incr = False  # flip to True to print memory growth while running
dc = DparkContext('mesos')
RC = dpark.conf.rddconf
M = 1024 * 1024


def rss_func(p):
    if hasattr(p, "memory_info"):
        mem_info = getattr(p, "memory_info")
    else:
        mem_info = getattr(p, 'get_memory_info')

    def _():
        return mem_info().rss / M

    return _
Example No. 13
def test_lineage():
    Scope.reset()

    dc = DparkContext()
    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union(
        [dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)]) for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1
    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
    assert stage.pipeline_edges == {
        ((-1, rdd1.id), (-1, rdd3.id)): 1,
        ((-1, rdd2.id), (-1, rdd3.id)): 1
    }

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd5.id]
    assert sorted(stage.pipeline_edges) == sorted([
        ((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents
    ])

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(list(
                s.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
            assert s.pipeline_edges == {
                ((-1, rdd1.id), (-1, rdd3.id)): 1,
                ((-1, rdd2.id), (-1, rdd3.id)): 1
            }
        else:
            assert False

        pprint(s.get_pipeline_graph())
Example No. 14
    def get_rdd(self):
        dpark = DparkContext()

        return dpark.union([
            dpark.textFile(path, splitSize=64 << 20) for path in self.paths
        ]).map(Weblog.from_line)
Example No. 15
import time
from dpark import DparkContext


def m(x):
    if x[0] == 0:
        time.sleep(100)
    return x


def r(x, y):
    return x + y


dc = DparkContext("mesos")

rdd = dc.makeRDD([(i, i) for i in range(2)], 2)
rdd.collect()
rdd.reduceByKey(r).map(m).reduceByKey(r).collect()
Example No. 16
def main():

    # Loading data
    print "Loading..."
    data_train = sample_image()

    # Initialize networks
    visible_size = 64  # number of input units
    hidden_size = [25, 16, 9]  # number of hidden units of each layer

    lamb = 0.0001  # weight decay parameter
    beta = 3  # weight of sparsity penalty dataset

    # dpark initialize
    dpark_ctx = DparkContext()

    # Start training, and L-BFGS is adopted
    # We apply a stack-wise greedy training process
    layer_ind = range(len(hidden_size) + 1)
    layer_ind.remove(0)
    layer_size = [visible_size] + hidden_size

    # desired average activation
    sparsity_param = dict()
    for ind in layer_ind:
        # standard: 64 units -> sparsity parameter 0.01
        sparsity_param[ind] = layer_size[ind - 1] * 0.01 / 64

    data = data_train
    opttheta = dict()  # parameter vector of stack AE
    img = dict()  # visualization mode

    for ind in layer_ind:

        print "start training layer No.%d" % ind

        # Obtain random parameters of considered layer
        theta = initial_parameter(layer_size[ind], layer_size[ind - 1])

        # Training begins
        options = (data, layer_size[ind - 1], layer_size[ind], lamb,
                   sparsity_param[ind], beta, dpark_ctx)

        opt = optimize.fmin_l_bfgs_b(compute_cost, theta, compute_grad,
                                     options)

        opttheta[ind] = opt[0]

        W = opttheta.get(ind)[:layer_size[ind]*layer_size[ind-1]].\
            reshape(layer_size[ind], layer_size[ind-1])

        data = np.dot(W, data)

        # visualization of the learned weights
        img[ind] = display_effect(W)
        plt.axis('off')
        plt.savefig(str(ind) + '.jpg')

    # Trained parameters of stack AE
    para_stack = vecstack2stack(opttheta, hidden_size, visible_size)

    # Save trained weights and bias
    out = open("weights_bias.pkl", "wb")
    pickle.dump(para_stack, out)
    out.close()

    print "Mission complete!"
Example No. 17
# -*- coding: utf-8 -*-

from dpark import DparkContext


def m(x):
    return x


rdd = DparkContext().makeRDD([(1, 1)]).map(m).groupByKey()
rdd.map(m).collect()
rdd.map(m).collect()
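
This snippet exercises reusing the same groupByKey() RDD across two collect() calls. When the reused RDD comes from narrow transformations instead of a shuffle, it can be marked explicitly with cache(), as in this minimal sketch that only uses calls already shown on this page:

from dpark import DparkContext

dc = DparkContext()
cached = dc.makeRDD([(1, 1), (2, 2)]).map(lambda x: x).cache()
print(cached.collect())  # first action computes the partitions and caches them
print(cached.collect())  # second action is served from the cache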
Example No. 18
import math
import random
import os, sys
from pprint import pprint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print 'logging', log.count()
print 'error', log.filter(lambda line: 'error' in line).count()
for line in log.filter(lambda line: 'error' in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(
    sorted(
        counts.filter(lambda (_, v): v > 20).map(
            lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))
Example No. 19
def main():
    """Tuning for SparseAE"""
    # First layer
    T = SparseAE(64,
                 49,
                 optimize_method='cg',
                 max_iter=400,
                 debug=0,
                 verbose=True,
                 tol=1e-8,
                 mini_batch=32)
    X = vs.load_sample('IMAGES.mat', patch_size=8, n_patches=10000)
    T.train(X)
    T.devec_theta()
    vs.disp_effect(T.w1, fname='Fst_lyr.jpg')

    # Second layer
    rho = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
    beta = [3e-3, 3e-2, 9e-2, 3e-1, 9e-1, 3, 9, 30, 90, 300]
    lamb = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
    param = product(rho, beta, lamb)
    param_str = ['rho', 'sparse_beta', 'lamb']
    param = map(lambda x: dict(zip(param_str, x)), param)
    X = activate(np.dot(T.w1, X) + T.b1)
    if not os.path.isdir('./imgs'):
        os.system('mkdir imgs')
    '''
    for idx, param_elem in enumerate(param):
        import warnings
        warnings.filterwarnings('error')
        try:
            S = SparseAE(49, 36, optimize_method='cg', max_iter=400,
                         debug=0, verbose=True, tol=1e-8, mini_batch=32,
                         **param_elem)
            S.train(X)
            S.devec_theta()
            fname = 'imgs/' + str(idx) + '.jpg'
            vs.disp_effect(S.w1, fname=fname)
        except:
            fname = 'imgs/' + 'log'
            fid = open(fname, 'w')
            fid.write('Exception: ' + str(idx) + '\n')
            fid.close()
    '''
    # Break point re-computing
    fls = os.listdir('./imgs/weight')
    fls = map(lambda x: int(x[:x.find('.')]), fls)
    # remove finished configurations in descending index order so earlier
    # removals do not shift the indices of the ones still to be removed
    for fl in sorted(fls, reverse=True):
        param = param[:fl] + param[fl + 1:]
    # dpark parallel computing
    dpark_ctx = DparkContext('process')
    dpark_n_length = len(param)
    dpark_n_block = 50
    if not os.path.isdir('./imgs/weight'):
        os.system('mkdir imgs/weight')
    print '%d models await training.' % dpark_n_length

    def map_iter(param_enum):
        # float division so ceil() actually rounds up (it is a no-op after
        # Python 2 integer division)
        idx = param_enum[0][0] * int(ceil(float(dpark_n_length) / dpark_n_block)) +\
                    param_enum[0][1]
        import warnings
        warnings.filterwarnings('error')
        try:
            S = SparseAE(49,
                         36,
                         optimize_method='cg',
                         max_iter=400,
                         debug=0,
                         verbose=True,
                         tol=1e-8,
                         mini_batch=32,
                         **param_enum[1])
            S.train(
                np.array(X))  # dpark converts X, 'np.ndarray' to 'instance'
            S.devec_theta()
            fname = 'imgs/weight/' + str(idx) + '.csv'
            # vs.disp_effect(S.w1, fname=fname)  # dpark doesn't support plt.savefig()
            np.savetxt(fname, S.w1, delimiter=',')
        except:
            import traceback
            traceback.print_exc()
            fname = 'imgs/' + 'log'
            fid = open(fname, 'w')
            fid.write('Training exception: ' + str(idx) + '\n')
            fid.close()

    dpark_ctx.makeRDD(param, dpark_n_block).enumerate().foreach(map_iter)
    print 'Done.'

    # Visualizing
    for i in range(len(param)):
        fname = 'imgs/weight/' + str(i) + '.csv'
        if not os.path.isfile(fname):
            continue
        w = np.loadtxt(fname, delimiter=',')
        fname_img = 'imgs/' + str(i) + '.jpg'
        vs.disp_effect(w, fname=fname_img)
        print i, 'visualization done.'