Example #1
def test_scope():

    Scope.reset()
    dc = DparkContext()

    rdd = dc.makeRDD([1, 2, 3]).map(int).map(int).map(int)
    dc.scheduler.current_scope = Scope.get("")
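    # Chained on a single source line, the three maps get consecutive scope ids
    # (1, 2, 3) and callsites "map:0", "map:1", "map:2".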

    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:{}".format(i))

    Scope.reset()
    rdd = dc.makeRDD([1, 2, 3]) \
        .map(int) \
        .map(int) \
        .map(int)
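    # The same chain split across lines: every map's callsite starts with "map:0".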

    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:0")

    def get_rdd(n):
        return dc.makeRDD([n, n]).map(int).map(int).map(int)

    rdds = [get_rdd(1), get_rdd(2)]
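    # Two textually distinct call sites: each get_rdd call creates 4 scopes
    # (one makeRDD plus three maps), so the ids differ by 4.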
    assert rdds[0].scope.id + 4 == rdds[1].scope.id

    rdds = [get_rdd(i) for i in range(2)]
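    # Called from the same source location (a comprehension), both RDDs share one scope id.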
    assert rdds[0].scope.id == rdds[1].scope.id
Example #2
def test_lineage():
    Scope.reset()

    dc = DparkContext()
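    # The five identical single-partition RDDs unioned into rdd1 appear as a
    # single entry in dep_lineage_counts.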
    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union([dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)]) for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1
    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
    assert stage.pipeline_edges == {((-1, rdd1.id), (-1, rdd3.id)):1,
                                    ((-1, rdd2.id), (-1, rdd3.id)):1}

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd5.id]
    assert sorted(stage.pipeline_edges) == sorted([((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents])

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(list(s.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
            assert s.pipeline_edges == {((-1, rdd1.id), (-1, rdd3.id)): 1,
                                        ((-1, rdd2.id), (-1, rdd3.id)): 1}
        else:
            assert False

        pprint(s.get_pipeline_graph())
Example #3
def main():
    current_path = os.path.dirname(os.path.abspath(__file__))
    for i in cmp_list:
        assert os.path.isdir('cmp'+str(i+1))
    dpark_ctx = DparkContext('process')

    # Dpark thread
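    # Each task changes into its cmp<N> directory and runs cmp.py, skipping runs
    # that already have a non-empty log directory.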
    def map_iter(i):
        dir_name = 'cmp' + str(i+1)
        logger = os.path.join(dir_name, 'log')
        if os.path.isdir(logger) and os.listdir(logger):
            return
        print "Start running: ", i+1
        os.chdir(os.path.join(current_path, 'cmp') + str(i+1))
        os.system('python ./cmp.py')

    dpark_ctx.makeRDD(cmp_list).foreach(map_iter)
    print 'Done.'
Example #4
def test_call_graph_join():
    dc = DparkContext()
    Scope.reset()
    rdd = dc.makeRDD([(1, 1), (1, 2)]).map(lambda x: x)
    rdd = rdd.join(rdd)
    dc.scheduler.current_scope = Scope.get("")
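    # Expected call graph: four nodes in a chain; the middle edge is counted
    # twice because the mapped RDD is joined with itself.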
    g = dc.scheduler.get_call_graph(rdd)
    pprint(g)
    assert g == ([0, 1, 2, 3], {(0, 1): 1, (1, 2): 2, (2, 3): 1})

    fg = dc.scheduler.fmt_call_graph(g)
Example #5
def test_call_graph_union():
    dc = DparkContext()
    Scope.reset()
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = r1.union(r2)
    dc.scheduler.current_scope = Scope.get("")
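    # Six nodes are expected; each union over two identical makeRDDs contributes
    # an edge of weight 2, and both branches feed the final union.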
    g = dc.scheduler.get_call_graph(rdd)
    # pprint(g)
    fg = dc.scheduler.fmt_call_graph(g)
    # pprint(fg)
    assert g == ([0, 1, 2, 3, 4, 5], {(0, 1): 2, (1, 4): 1, (2, 3): 2, (3, 4): 1, (4, 5): 1})
Example #6
def test_call_graph_union():
    dc = DparkContext()
    Scope.reset()
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = r1.union(r2)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    # pprint(g)
    fg = dc.scheduler.fmt_call_graph(g)
    # pprint(fg)
    assert g == ([0, 1, 2, 3, 4, 5], {
        (0, 1): 2,
        (1, 4): 1,
        (2, 3): 2,
        (3, 4): 1,
        (4, 5): 1
    })
Example #7
def dt_model():
    tr_x, tr_y, va_x, va_y, te_x, te_y = load_data()
    param_grid = {
        "min_samples_split": range(1, 10000, 1000),
        "min_samples_leaf": range(1, 10000, 1000),
        # 'max_leaf_nodes': [0, 100, 1000, 10000],
        "max_depth": [None, 100, 1000, 10000],
    }
    param_grid = grid_generator(param_grid)

    # Dpark
    dpark_ctx = DparkContext()
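    # Each dpark task trains one decision tree for one parameter combination and
    # reports its validation-set AUC.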

    def map_iter(param):
        idx = param[0][0] * 2 + param[0][1]
        param = param[1]
        m = tree.DecisionTreeClassifier(criterion="entropy", **param)
        print "%d, Start traininig Decision Tree model." % idx
        m = m.fit(tr_x, tr_y)
        print "%d, Training done." % idx
        proba = m.predict_proba(va_x)
        fpr, tpr, thresh = roc_curve(va_y, proba[:, 1])
        auc_ = auc(fpr, tpr)
        print "%d, AUC is %f" % (idx, auc_)
        return idx, param, auc_

    print "It will train %d models" % len(param_grid)
    result_record = dpark_ctx.makeRDD(param_grid, 50).enumerate().map(map_iter).collect()

    file_record = open("dt_result.pkl", "w")
    pickle.dump(result_record, file_record)
    file_record.close()

    # testing
    opt = reduce(lambda x, y: x if x[2] > y[2] else y, result_record)
    m = tree.DecisionTreeClassifier(criterion="entropy", **opt[1])
    m = m.fit(tr_x, tr_y)
    proba = m.predict_proba(te_x)
    fpr, tpr, thresh = roc_curve(te_y, proba[:, 1])
    auc_ = auc(fpr, tpr)
    print "Testing AUC is %f" % auc_
Example #8
            .filter(lambda x:x)\
            .filter(lambda line: (not is_spider(line) and (line['uid'] or line['bid'])))\
            .filter(lambda l: l['bid'] not in fraud.value and l['uid'] not in fraud.value)


    spec = set(['url', 'uid', 'bid', 'unit_id', 'ad_id', 'status_code', 'user_agent', 'region', 'page_tags', 'hour', 'group'])
    features = common_gen(spec)

    features = features.map(feature_extract)\
        .filter(lambda x:x)\
        .cache()

    user_list = set(features.map(lambda x: x[0]).filter(lambda x: x != 'None').collect())
    user_list_b = dp.broadcast(user_list)

    user_feature = dp.makeRDD([])

    def _parse_list(line):
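        # Each profile line is "uid<TAB>feat:weight|feat:weight|..."; parse it
        # and sort the features by descending weight.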
        uid, features = line.split('\t')
        features = [x.split(':') for x in features.split('|')]
        features = [(x[0], float(x[1])) for x in features]
        features = sorted(features, key=lambda x: x[1], reverse=True)
        return (uid, features)

    for name in ['book_cluster', 'movie_cluster', 'group_cluster', 'text_cluster']:
        fn = '/home2/alg/user_profile/%s/%s' % (current_date, name)
        if not os.path.exists(fn):
            continue
        rdd = dp.textFile(fn, splitSize=16<<20)\
            .filter(lambda x: x.split('\t', 1)[0] in user_list_b.value)\
            .map(_parse_list)\
Example #9
    for x in range(m):
        for y in range(v):
            pred = Wi[x].dot(Hj[y])
            err = int(Oij[x][y]) - int(pred)
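            # SGD update for both factor rows: step along the error gradient with
            # learning rate GAMMA and L2 regularization LAMBDA.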
            w = Wi[x] + GAMMA * (Hj[y] * err - LAMBDA * Wi[x])
            h = Hj[y] + GAMMA * (Wi[x] * err - LAMBDA * Hj[y])

            Wi[x] = w
            Hj[y] = h

    W.put(i, Wi)
    H.put(j, Hj)


rdd = dpark.makeRDD(list(range(d)))
rdd = rdd.cartesian(rdd).cache()


def calc_err(i_j):
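    # Squared reconstruction error of one (i, j) block: compare Wi.dot(Hj.T)
    # against the matching block of the original matrix.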
    (i, j) = i_j
    Wi = W.get(i)
    Hj = H.get(j)

    ori = ori_b.value
    Rij = Wi.dot(Hj.T)
    Oij = ori[i * m:(i + 1) * m, j * v:(j + 1) * v]
    return ((Rij - Oij) ** 2).sum()


J = list(range(d))
Example #10
import time
from dpark import DparkContext


def m(x):
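    # Sleep on one key only, presumably to exercise handling of a slow task.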
    if x[0] == 0:
        time.sleep(100)
    return x


def r(x, y):
    return x + y


dc = DparkContext("mesos")

rdd = dc.makeRDD([(i, i) for i in range(2)], 2)
rdd.collect()
rdd.reduceByKey(r).map(m).reduceByKey(r).collect()
Example #11
def main(argv):
    # Dpark initialize
    dpark = DparkContext()

    # number of the training and testing set
    num_train = 6000
    num_test = 6000

    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data

    # Preparing training and testing data
    if len(x) != len(y):
        print("The labels and features are not accorded!")
        sys.exit()
    
    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu = len(x_stu)
    ind_live = range(n_live)
    ind_stu = range(n_stu)
    random.shuffle(ind_live)
    random.shuffle(ind_stu)

    x_te = [x_live[i] for i in ind_live[num_train : num_test + num_train]] + \
        [x_stu[i] for i in ind_stu[num_train : num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train : num_test + num_train]) + \
        [-1.0]*len(ind_stu[num_train : num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
        [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0]*num_train + [-1.0]*num_train

    # dpark version
    def map_iter(i):
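        # Train one exemplar SVM: sample i is the only positive, everything else
        # is negative; return its decision values on the test set.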
        y_tr_examplar = [-1.0] * len(y_tr)
        y_tr_examplar[i] = 1.0
        # opt = '-t 0 -w1 ' + str(len(y_tr)) + ' -w-1 1 -b 1 -q'
        # It is suggested in Efros' paper that:
        # C1 0.5, C2 0.01
        opt = '-t 0 -w1 0.5 -w-1 0.01 -b 1 -q'
        m = svm_train(y_tr_examplar, list(x_tr), opt)
        p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-b 1 -q')
        p_val = np.array(p_val)
        # p_val = np.delete(p_val,1,1)  # shape = (N, 1)
        p_val = p_val[:, 0]  # shape = (N, )
        return p_val

    p_vals = dpark.makeRDD(
        range(len(y_tr))
    ).map(
        map_iter
    ).collect()

    val = np.array(p_vals).T

    # for-loop version
    '''
    # Exemplar SVM Training
    ensemble_model = []
    # DPark

    for i in range(len(y_tr)):
        y_tr_examplar = [-1.0] * len(y_tr)
        y_tr_examplar[i] = 1.0;
        #opt = '-t 0 -w1 ' + str(len(y_tr)) + ' -w-1 1 -b 1 -q'
        # It is suggested in Efros' paper that:
        # C1 0.5, C2 0.01
        opt = '-t 0 -w1 0.5 -w-1 0.01 -b 1 -q'
        m = svm_train(y_tr_examplar, x_tr, opt)
        ensemble_model.append(m)
        print("The %s-th examplar SVM has been trained" %i)

    # Calibaration, to be updated
    # Since we adopt the probability estimation model of LIB_SVM, Calibrating seems unnecessary

    # Ensembly Classify
    val = np.zeros((len(y_te),1))
    for m in ensemble_model:
        p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-b 1 -q')
        p_val = np.array(p_val)
        p_val = np.delete(p_val,1, 1)
        val = np.hstack((val, p_val))
    if val.shape[1] != len(y_tr) + 1:
        print "Chaos!"
    val = np.delete(val,0,1)
    print 'val.shape =', val.shape
    '''
    
    # KNN
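    # For each test sample, take the k exemplars that score it highest and let
    # their training labels vote via a two-bin histogram.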
    k = num_train / 8
    sorted_index = val.argsort(axis=1)
    sorted_index = sorted_index.T[::-1].T
    p_label = []
    for index in sorted_index:
        nearest_samples = []
        for sample_index in index[:k]:
            nearest_samples.append(y_tr[sample_index])
        n,bins,dummy = plt.hist(nearest_samples, 2, normed=1, 
                                facecolor='r', alpha=0.75)
        if n[0] > n[1]:
            p_label.append(-1.0)
        else:
            p_label.append(1.0)

    # evaluation
    rate, pos_rate, neg_rate = evaluation(y_te, p_label)

    print("The Examplar SVM framework achieves a precision of %f" % rate)
Example #12
def main(argv):
    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data
    del data

    num_train = 6000
    num_test = 6000

    # Preparing training and testing data
    if len(x) != len(y):
        print("Please examine the data set, for the labels and features are not accorded!")
        sys.exit()
    # generating random training and testing set, to yield the ability of classifier more accurately.
    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu =  len(x_stu)
    ind_live = range(n_live)
    ind_stu = range(n_stu)
    random.shuffle(ind_live)
    random.shuffle(ind_stu)

    x_te = [x_live[i] for i in ind_live[num_train : num_test + num_train]] + \
        [x_stu[i] for i in ind_stu[num_train : num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train : num_test + num_train]) + \
        [-1.0]*len(ind_stu[num_train : num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
        [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0]*num_train + [-1.0]*num_train

    # SVM and a 10-fold Cross Validation choosing the best parameters.
    # gamma and c_reg are constructed in a parameter grid
    
    # for-loop version
    '''
    gamma = np.arange(.01,20,.04)
    c_reg = np.arange(.01,20,.04)
    opt = []
    best_para = {'gamma': 0, 'c': 0, 'precision': 0}
    for g in gamma:
        for c in c_reg:
            opt = '-g '+ str(g) +' -c ' + str(c) + ' -v 10 -q'
            pre = svm_train(y_tr,x_tr,opt)
            if pre > best_para.get('precision'):
                best_para['gamma'] = g
                best_para['c'] = c
                best_para['precision'] = pre 
    best_opt = '-g '+ str(best_para.get('gamma')) +' -c ' + str(best_para.get('c')) + ' -q'
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')
    '''

    # dpark version
    dpark = DparkContext()
    gamma = np.arange(.01, 5, .08)
    c_reg = np.arange(.01, 5, .08)
    opt = []
    for g in gamma:
        for c in c_reg:
            opt.append('-g '+ str(g) +' -c ' + str(c) + ' -v 10 -q')

    def map_iter(i):
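        # Run 10-fold cross-validation for one (gamma, C) setting and return the
        # cross-validation accuracy.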
        pre = svm_train(y_tr, list(x_tr), opt[i])
        return pre

    #pres = dpark.makeRDD(range(len(opt)),100).map(map_iter).collect()
    pres = dpark.makeRDD(range(len(opt))).map(map_iter).collect()
    pres = np.array(pres)
    best_opt_ind = pres.argsort()
    best_opt = opt[best_opt_ind[-1]]

    best_opt = best_opt[:best_opt.find('-v') - 1]
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')

    print 'This SVM framework precision: %f' % p_acc[0]
Example #13
def test_lineage():
    Scope.reset()

    dc = DparkContext()
    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union(
        [dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)]) for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1
    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
    assert stage.pipeline_edges == {
        ((-1, rdd1.id), (-1, rdd3.id)): 1,
        ((-1, rdd2.id), (-1, rdd3.id)): 1
    }

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd5.id]
    assert sorted(stage.pipeline_edges) == sorted([
        ((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents
    ])

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(list(
                s.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
            assert s.pipeline_edges == {
                ((-1, rdd1.id), (-1, rdd3.id)): 1,
                ((-1, rdd2.id), (-1, rdd3.id)): 1
            }
        else:
            assert False

        pprint(s.get_pipeline_graph())
Example #14
    return (id, Vertex(id, sys.maxint, outEdges, True))


def compute(self, vs, agg, superstep):
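    # Pregel-style update: keep the smaller of the current value and the combined
    # incoming distance; if it improves, send updated distances along all out-edges.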
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [(edge.target_id, newValue + edge.value)
                  for edge in self.outEdges]
    else:
        outbox = []
    return Vertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'graph.txt')
    lines = ctx.textFile(path).map(lambda line: line.split(' '))
    vertices = lines.groupBy(lambda line: line[0]).map(to_vertex)
    startVertex = str(0)
    messages = ctx.makeRDD([(startVertex, 0)])

    print('read', vertices.count(), 'vertices and ', messages.count(), 'messages.')

    result = Bagel.run(ctx, vertices, messages, compute, BasicCombiner(min), numSplits=2)

    print('Shortest path from %s to all vertices:' % startVertex)
    for id, v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print(v.id, v.value)
Example #15
def main():
    """Tuning for SparseAE"""
    # First layer
    T = SparseAE(64,
                 49,
                 optimize_method='cg',
                 max_iter=400,
                 debug=0,
                 verbose=True,
                 tol=1e-8,
                 mini_batch=32)
    X = vs.load_sample('IMAGES.mat', patch_size=8, n_patches=10000)
    T.train(X)
    T.devec_theta()
    vs.disp_effect(T.w1, fname='Fst_lyr.jpg')

    # Second layer
    rho = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
    beta = [3e-3, 3e-2, 9e-2, 3e-1, 9e-1, 3, 9, 30, 90, 300]
    lamb = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
    param = product(rho, beta, lamb)
    param_str = ['rho', 'sparse_beta', 'lamb']
    param = map(lambda x: dict(zip(param_str, x)), param)
    X = activate(np.dot(T.w1, X) + T.b1)
    if not os.path.isdir('./imgs'):
        os.system('mkdir imgs')
    '''
    for idx, param_elem in enumerate(param):
        import warnings
        warnings.filterwarnings('error')
        try:
            S = SparseAE(49, 36, optimize_method='cg', max_iter=400,
                         debug=0, verbose=True, tol=1e-8, mini_batch=32,
                         **param_elem)
            S.train(X)
            S.devec_theta()
            fname = 'imgs/' + str(idx) + '.jpg'
            vs.disp_effect(S.w1, fname=fname)
        except:
            fname = 'imgs/' + 'log'
            fid = open(fname, 'w')
            fid.write('Exception: ' + str(idx) + '\n')
            fid.close()
    '''
    # Resume from a previous run: drop parameter settings whose weights were already saved
    fls = os.listdir('./imgs/weight')
    fls = map(lambda x: int(x[:x.find('.')]), fls)
    for fl in fls:
        param = param[:fl] + param[fl + 1:]
    # dpark parallel computing
    dpark_ctx = DparkContext('process')
    dpark_n_length = len(param)
    dpark_n_block = 50
    if not os.path.isdir('./imgs/weight'):
        os.system('mkdir imgs/weight')
    print '%d models await training.' % dpark_n_length

    def map_iter(param_enum):
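        # param_enum is ((block, offset), params): rebuild a flat index, train one
        # sparse autoencoder on the first-layer activations, and save its weights.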
        idx = param_enum[0][0] * int(ceil(dpark_n_length / dpark_n_block)) +\
                    param_enum[0][1]
        import warnings
        warnings.filterwarnings('error')
        try:
            S = SparseAE(49,
                         36,
                         optimize_method='cg',
                         max_iter=400,
                         debug=0,
                         verbose=True,
                         tol=1e-8,
                         mini_batch=32,
                         **param_enum[1])
            S.train(
                np.array(X))  # dpark converts X, 'np.ndarray' to 'instance'
            S.devec_theta()
            fname = 'imgs/weight/' + str(idx) + '.csv'
            # vs.disp_effect(S.w1, fname=fname)  # dpark doesn't support plt.savefig()
            np.savetxt(fname, S.w1, delimiter=',')
        except:
            import traceback
            traceback.print_exc()
            fname = 'imgs/' + 'log'
            fid = open(fname, 'w')
            fid.write('Training exception: ' + str(idx) + '\n')
            fid.close()

    dpark_ctx.makeRDD(param, dpark_n_block).enumerate().foreach(map_iter)
    print 'Done.'

    # Visualizing
    for i in range(len(param)):
        fname = 'imgs/weight/' + str(i) + '.csv'
        if not os.path.isfile(fname):
            continue
        w = np.loadtxt(fname, delimiter=',')
        fname_img = 'imgs/' + str(i) + '.jpg'
        vs.disp_effect(w, fname=fname_img)
        print i, 'visualization done.'