Example #1
def test_scope():

    Scope.reset()
    dc = DparkContext()

    rdd = dc.makeRDD([1, 2, 3]).map(int).map(int).map(int)
    dc.scheduler.current_scope = Scope.get("")

    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:{}".format(i))

    Scope.reset()
    rdd = dc.makeRDD([1, 2, 3]) \
        .map(int) \
        .map(int) \
        .map(int)

    for i, r in enumerate([rdd.prev.prev, rdd.prev, rdd]):
        assert r.scope.id == i + 1
        assert r.scope.api_callsite.startswith("map:0")

    def get_rdd(n):
        return dc.makeRDD([n, n]).map(int).map(int).map(int)

    rdds = [get_rdd(1), get_rdd(2)]
    assert rdds[0].scope.id + 4 == rdds[1].scope.id

    rdds = [get_rdd(i) for i in range(2)]
    assert rdds[0].scope.id == rdds[1].scope.id
Example #3
    def get_rdd(self):
        dpark = DparkContext()

        return dpark.union(
            [dpark.textFile(path, splitSize=64 << 20)
             for path in self.paths]
        ).map(Weblog.from_line)
Example #4
def test_call_graph_join():
    dc = DparkContext()
    Scope.reset()
    rdd = dc.makeRDD([(1, 1), (1, 2)]).map(lambda x: x)
    rdd = rdd.join(rdd)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    pprint(g)
    assert g == ([0, 1, 2, 3], {(0, 1): 1, (1, 2): 2, (2, 3): 1})

    fg = dc.scheduler.fmt_call_graph(g)
Example #6
def test_call_graph_union():
    dc = DparkContext()
    Scope.reset()
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = r1.union(r2)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    # pprint(g)
    fg = dc.scheduler.fmt_call_graph(g)
    # pprint(fg)
    assert g == ([0, 1, 2, 3, 4, 5], {(0, 1): 2, (1, 4): 1, (2, 3): 2, (3, 4): 1, (4, 5): 1})
Example #7
def word_count(file_path, word):
    # talk to a specific Mesos master
    dpark = DparkContext()

    # build a file RDD from the distributed file, with a split size of 16 MB
    f = dpark.textFile(file_path, splitSize=16 << 20)

    # map() builds a new RDD, filter() keeps the lines containing the word,
    # and count() returns the result
    print(
        word, 'count:',
        f.map(lambda line: line.strip()).filter(
            lambda line: word in line).count())
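
A minimal usage sketch for the helper above (the path and search word are hypothetical placeholders; a reachable Mesos master is assumed):

word_count('./words.txt', 'dpark')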
Example #8
def main(infile, outfile):

	ctx = DparkContext()

	rdd = ctx.textFile(infile)

	rdd = rdd.map(map_unit_type_priority)

	rdd = rdd.reduceByKey(reduce_by_key)

	rdd = rdd.map(map_to_string)

	rdd.saveAsTextFile(outfile)	
Example #9
def test_lineage():
    Scope.reset()

    dc = DparkContext()
    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union([dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)]) for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1
    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
    assert stage.pipeline_edges == {((-1, rdd1.id), (-1, rdd3.id)):1,
                                    ((-1, rdd2.id), (-1, rdd3.id)):1}

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd5.id]
    assert sorted(stage.pipeline_edges) == sorted([((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents])

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(list(s.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
            assert s.pipeline_edges == {((-1, rdd1.id), (-1, rdd3.id)): 1,
                                        ((-1, rdd2.id), (-1, rdd3.id)): 1}
        else:
            assert False

        pprint(s.get_pipeline_graph())
Example #10
def test_call_graph_union():
    dc = DparkContext()
    Scope.reset()
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = r1.union(r2)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)
    # pprint(g)
    fg = dc.scheduler.fmt_call_graph(g)
    # pprint(fg)
    assert g == ([0, 1, 2, 3, 4, 5], {
        (0, 1): 2,
        (1, 4): 1,
        (2, 3): 2,
        (3, 4): 1,
        (4, 5): 1
    })
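
A small hedged sketch of how the asserted (nodes, edges) pair can be printed by hand, purely to illustrate the structure (fmt_call_graph above is the library's own formatter; this loop is only an illustration):

nodes, edges = g
for (src, dst), count in sorted(edges.items()):
    print("{} -> {} (x{})".format(src, dst, count))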
Example #11
def main():
    current_path = os.path.dirname(os.path.abspath(__file__))
    for i in cmp_list:
        assert os.path.isdir('cmp'+str(i+1))
    dpark_ctx = DparkContext('process')

    # Dpark thread
    def map_iter(i):
        dir_name = 'cmp' + str(i+1)
        logger = os.path.join(dir_name, 'log')
        if os.path.isdir(logger) and os.listdir(logger):
            return
        print "Start running: ", i+1
        os.chdir(os.path.join(current_path, 'cmp') + str(i+1))
        os.system('python ./cmp.py')

    dpark_ctx.makeRDD(cmp_list).foreach(map_iter)
    print 'Done.'
Example #12
def dt_model():
    tr_x, tr_y, va_x, va_y, te_x, te_y = load_data()
    param_grid = {
        "min_samples_split": range(1, 10000, 1000),
        "min_samples_leaf": range(1, 10000, 1000),
        # 'max_leaf_nodes': [0, 100, 1000, 10000],
        "max_depth": [None, 100, 1000, 10000],
    }
    param_grid = grid_generator(param_grid)

    # Dpark
    dpark_ctx = DparkContext()

    def map_iter(param):
        idx = param[0][0] * 2 + param[0][1]
        param = param[1]
        m = tree.DecisionTreeClassifier(criterion="entropy", **param)
        print "%d, Start traininig Decision Tree model." % idx
        m = m.fit(tr_x, tr_y)
        print "%d, Training done." % idx
        proba = m.predict_proba(va_x)
        fpr, tpr, thresh = roc_curve(va_y, proba[:, 1])
        auc_ = auc(fpr, tpr)
        print "%d, AUC is %f" % (idx, auc_)
        return idx, param, auc_

    print "It will train %d models" % len(param_grid)
    result_record = dpark_ctx.makeRDD(param_grid, 50).enumerate().map(map_iter).collect()

    file_record = open("dt_result.pkl", "w")
    pickle.dump(result_record, file_record)
    file_record.close()

    # testing
    opt = reduce(lambda x, y: x if x[2] > y[2] else y, result_record)
    m = tree.DecisionTreeClassifier(criterion="entropy", **opt[1])
    m = m.fit(tr_x, tr_y)
    proba = m.predict_proba(te_x)
    fpr, tpr, thresh = roc_curve(te_y, proba[:, 1])
    auc_ = auc(fpr, tpr)
    print "Testing AUC is %f" % auc_
Example #13
def main(txt, infile, outfile):
	
	ctx = DparkContext()
	csvfilename = infile
	txtfilename = txt 

	txt_rdd = ctx.textFile(txtfilename)

	txt_rdd = txt_rdd.map(divide_txt)
	#('5988', ['2', 'CPM']) 
	csv_rdd = ctx.textFile(csvfilename, splitSize=64<<20) 
	#print csv_rdd.take(100)
	csv_rdd = csv_rdd.filter(remove_some_bid_unitid)

	csv_rdd = csv_rdd.map(divide_csv)	
	#('6379', ['-1', '1236054964187470000', '6379', '77', '1', '1', '0'])
	record_rdd = txt_rdd.join(csv_rdd)
	#('6370', (['2', 'COMPLEMENT'], ['-1', '8183016859528920000', '6370', '86', '3', '1', '0']))
	record_rdd = record_rdd.mapValue(join_element)
	#('6370', ['2', 'COMPLEMENT', '-1', '8183016859528920000', '6370', '86', '3', '1', '0']) 
	record_rdd = record_rdd.groupBy(lambda line : str(line[1]).split()[5] + str(line[1]).split()[1])

	#print record_rdd.take(1)
	record_rdd = record_rdd.map(map_unit_type)
	#print record_rdd.take(1)
	record_rdd = record_rdd.flatMap(flat_map_unit_type_priority)
	#print record_rdd.take(5)
	record_rdd = record_rdd.groupByKey()
	#print "*" * 50
	#print record_rdd.take(5)

	record_rdd = record_rdd.mapValue(map_value_unit_type_priority)
	#print "#" * 50
	#print record_rdd.take(5)
	record_rdd = record_rdd.map(map_unit_type_priority)
	#print "$" * 50
	#print record_rdd.take(5)

	# unit	type	priority	cluster	n_ad	n_imp	n_click	ctr
	record_rdd.saveAsTextFile(outfile)
Example #14
import time
from dpark import DparkContext, optParser
from dpark.file_manager import file_manager
dc = DparkContext()

optParser.set_usage("%prog [options] path")
options, args = optParser.parse_args()

path = args[0]


def run(split_size=1):
    t = time.time()
    dc.textFile(path).mergeSplit(
        splitSize=split_size).filter(lambda x: "yangxiufeng" in x).count()
    return time.time() - t


run()  # file cache
print("{}s with locality".format(run()))
file_manager.fs_list = file_manager.fs_list[1:]
print("{}s merge & without locality".format(run(10)))
print("{}s without locality, ".format(run()))
Example #15
#coding:utf-8
from random import shuffle, random, sample
import traceback

from dpark import DparkContext

dp = DparkContext('mesos')
'''shuffle() returns None (it shuffles the list in place)'''
rdd1 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]]).map(lambda x: shuffle(x))
print 'rdd1:'
print rdd1.take(1)

rdd2 = dp.parallelize([((1, 2), (3, 4), (5, 6))]).map(lambda x: shuffle(x))
print 'rdd2:'
try:
    print rdd2.take(1)
except Exception, e:
    print traceback.print_exc()
'''O(NlogN)'''
rdd3 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]
                       ]).map(lambda x: sorted(x, key=lambda k: random()))
print 'rdd3:'
print rdd3.take(1)
'''O(N)'''
rdd4 = dp.parallelize([[(1, 2), (3, 4),
                        (5, 6)]]).map(lambda x: sample(x, len(x)))
print 'rdd4:'
print rdd4.take(1)

rdd5 = dp.parallelize([((1, 2), (3, 4), (5, 6))
                       ]).map(lambda x: sample(x, len(x)))
Example #16
from dpark import DparkContext


def m(x):
    return x


def r(x, y):
    return x + y


def src():
    return dc.makeRDD([(1,1)],2)


dc = DparkContext("mesos")

rdd1 = src()
rdd2 = src().reduceByKey(r)

to_union_1_a = [src() for _ in range(2)]
to_union_1_b = [src()]
to_union_2_a = [dc.union(to_union_1_a + to_union_1_b) for _ in range(2)]
to_union_2_b = [rdd2, rdd1]
to_union_3_a = [dc.union(to_union_2_a + to_union_2_b).map(m).reduceByKey(r)]
to_union_3_b = [rdd2]
rdd3 = dc.union(to_union_3_a + to_union_3_b)
rdd4 = rdd2.join(rdd2)

rdd1.collect()
rdd2.collect()
Example #17
# generate new training data
#   uid,iid,r - mu - bi
#   iid,1.|pi...

import glob
from dpark import DparkContext

MU_PATH='/nfs/wuhong/offline_use/global_params_0'
IBIAS_PATH='/nfs/wuhong/offline_use/ibias_0/'
RATING_PATH='/nfs/wuhong/fm_data/user_music_factor_model/user_track_rating_for_training/'
ITEM_FACTOR_PATH='/nfs/wuhong/offline_use/H_0/'

NEW_RATING_PATH='/nfs/wuhong/offline_use/rating_new/'
NEW_ITEM_FACTOR_PATH='/nfs/wuhong/offline_use/H_new/'

dpark = DparkContext()

f_global = file(MU_PATH)
line = ''
for l in f_global:
    line = l
mu = float(line.strip().split('\t')[1])
f_global.close()
mu = dpark.broadcast(mu)

def local_mapper(line):
    iid, v, _ = line.strip().split('\t')
    return (iid, float(v))

ibias = {}
ibias = dpark.textFile(glob.glob(IBIAS_PATH)).map(
Example #18
from dpark import DparkContext, optParser

dc = DparkContext()
options, args = optParser.parse_args()
infile = args[0]
outfile = args[1]
print("from {} to {}".format(infile, outfile))


def fm(x):
    for w in x.strip().split():
        yield (w, 1)


(dc.textFile(infile)
    .flatMap(fm)
    .reduceByKey(lambda x, y: x + y, numSplits=6)
    .map(lambda x: " ".join(list(map(str, x))))
    .saveAsTextFile(outfile, overwrite=False))
Example #19
def to_vertex((id, lines)):
    outEdges = [SPEdge(tid, int(v)) 
        for _, tid, v in lines]
    return (id, SPVertex(id, sys.maxint, outEdges, True))

def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [SPMessage(edge.target_id, newValue + edge.value)
                for edge in self.outEdges]
    else:
        outbox = []
    return SPVertex(self.id, newValue, self.outEdges, False), outbox

if __name__ == '__main__':
    ctx = DparkContext()
    lines = ctx.textFile('graph.txt').map(lambda line:line.split(' '))
    vertices = lines.filter(lambda x:len(x)==3).groupBy(
        lambda line:line[0]).map(to_vertex)
    messages = lines.filter(lambda x:len(x)==2).map(
        lambda (vid, v): (vid, SPMessage(vid, int(v)))
    )
    print 'read', vertices.count(), 'vertices and ', messages.count(), 'messages.'

    result = Bagel.run(ctx, vertices, messages, compute, MinCombiner())
    startVertex = 0
    print 'Shortest path from %s to all vertices:' % startVertex
    for v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print v.id, v.value
Example #20

def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [
            SPMessage(edge.target_id, newValue + edge.value)
            for edge in self.outEdges
        ]
    else:
        outbox = []
    return SPVertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    lines = ctx.textFile('graph.txt').map(lambda line: line.split(' '))
    vertices = lines.filter(lambda x: len(x) == 3).groupBy(
        lambda line: line[0]).map(to_vertex)
    messages = lines.filter(lambda x: len(x) == 2).map(
        lambda (vid, v): (vid, SPMessage(vid, int(v))))
    print 'read', vertices.count(), 'vertices and ', messages.count(
    ), 'messages.'

    result = Bagel.run(ctx, vertices, messages, compute, MinCombiner())
    startVertex = 0
    print 'Shortest path from %s to all vertices:' % startVertex
    for v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print v.id, v.value
Example #21
import time
from dpark import DparkContext


def m(x):
    if x[0] == 0:
        time.sleep(100)
    return x


def r(x, y):
    return x + y


dc = DparkContext("mesos")

rdd = dc.makeRDD([(i, i) for i in range(2)], 2)
rdd.collect()
rdd.reduceByKey(r).map(m).reduceByKey(r).collect()
Example #22
import math
import random
import os, sys
from pprint import pprint

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext="py").map(lambda x: x.strip())
log = f.filter(lambda line: "logging" in line).cache()
print "logging", log.count()
print "error", log.filter(lambda line: "error" in line).count()
for line in log.filter(lambda line: "error" in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(sorted(counts.filter(lambda (_, v): v > 20).map(lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi
import random
Example #23
    def get_rdd(self):
        dpark = DparkContext()

        return dpark.union([
            dpark.textFile(path, splitSize=64 << 20) for path in self.paths
        ]).map(Weblog.from_line)
Example #24
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function
from dpark import DparkContext
from dpark.mutable_dict import MutableDict
from random import shuffle
import six.moves.cPickle
import numpy
from six.moves import range
from six.moves import zip

dpark = DparkContext()

with open('ab.mat') as f:
    ori = six.moves.cPickle.loads(f.read())

k = 50
d = 20
M = len(ori)
V = len(ori[0])
assert M % d == 0
assert V % d == 0

m = M / d
v = V / d

GAMMA = 0.02
LAMBDA = 0.1
STEP = 0.9

W = MutableDict(d)
Example #25
def DownLoad(file_path):
        dpark = DparkContext()
        file_block = dpark.textFile(file_path,splitSize=16<<20)
        file_block.foreach(write_to_wav)
Example #26
import glob
from dpark import DparkContext

RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
TRAINING_PATH = '/nfs/wuhong/paracel/data/als_fm/train'
TEST_PATH = '/nfs/wuhong/paracel/data/als_fm/test'

dpark = DparkContext()


def local_filter1(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return False
    return True


def local_filter2(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return True
    return False


dpark.textFile(
    glob.glob(RATING_PATH)).filter(local_filter1).saveAsTextFile(TRAINING_PATH)

dpark.textFile(
    glob.glob(RATING_PATH)).filter(local_filter2).saveAsTextFile(TEST_PATH)
Example #27
    outEdges = [Edge(ref) for ref in refs]
    return (title, Vertex(title, 1.0/numV, outEdges, True))

def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and abs(newValue-self.value) < epsilon) or superstep > 30
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute

if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    
    dpark = DparkContext()
    input = dpark.textFile(inputFile)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
        gen_compute(numVertex, epsilon))

    for v in result.filter(lambda x:x.value > threshold).collect():
        print v.id, v.value
Example #28
#!/usr/bin/env python
import sys, os, os.path
import random
from dpark import DparkContext
dpark = DparkContext()
from vector import Vector

def parseVector(line):
    num = map(int, line.strip().split('\t')[2:])
    num = [n-num[0] for n in num]
    return Vector(num[20:])
    return Vector(map(int, line.strip().split('\t')[2:]))

def closestCenter(p, centers):
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex

def minDist(p, centers):
    bestDist = p.squaredDist(centers[0])
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
    return bestDist
Example #29
def test_lineage():
    Scope.reset()

    dc = DparkContext()
    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union(
        [dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)]) for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1
    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
    assert stage.pipeline_edges == {
        ((-1, rdd1.id), (-1, rdd3.id)): 1,
        ((-1, rdd2.id), (-1, rdd3.id)): 1
    }

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(list(stage.pipelines.keys())) == [rdd5.id]
    assert sorted(stage.pipeline_edges) == sorted([
        ((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents
    ])

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(list(
                s.pipelines.keys())) == [rdd1.id, rdd2.id, rdd3.id]
            assert s.pipeline_edges == {
                ((-1, rdd1.id), (-1, rdd3.id)): 1,
                ((-1, rdd2.id), (-1, rdd3.id)): 1
            }
        else:
            assert False

        pprint(s.get_pipeline_graph())
Example #30
def main():

    # Loading data
    print "Loading..."
    data_train = sample_image()

    # Initialize networks
    visible_size = 64  # number of input units
    hidden_size = [25, 16, 9]  # number of hidden units of each layer

    lamb = 0.0001  # weight decay parameter
    beta = 3  # weight of sparsity penalty dataset

    # dpark initialize
    dpark_ctx = DparkContext()

    # Start training, and L-BFGS is adopted
    # We apply a stack-wise greedy training process
    layer_ind = range(len(hidden_size) + 1)
    layer_ind.remove(0)
    layer_size = [visible_size] + hidden_size

    # desired average activation
    sparsity_param = dict()
    for ind in layer_ind:
        # standard: 64 units -> sparsity parameter 0.01
        sparsity_param[ind] = layer_size[ind - 1] * 0.01 / 64

    data = data_train
    opttheta = dict()  # parameter vector of stack AE
    img = dict()  # visualization mode

    for ind in layer_ind:

        print "start training layer No.%d" % ind

        # Obtain random parameters of considered layer
        theta = initial_parameter(layer_size[ind], layer_size[ind - 1])

        # Training begins
        options = (data, layer_size[ind - 1], layer_size[ind], lamb,
                   sparsity_param[ind], beta, dpark_ctx)

        opt = optimize.fmin_l_bfgs_b(compute_cost, theta, compute_grad,
                                     options)

        opttheta[ind] = opt[0]

        W = opttheta.get(ind)[:layer_size[ind]*layer_size[ind-1]].\
            reshape(layer_size[ind], layer_size[ind-1])

        data = np.dot(W, data)

        # visualization of the learned weights
        img[ind] = display_effect(W)
        plt.axis('off')
        plt.savefig(str(ind) + '.jpg')

    # Trained parameters of stack AE
    para_stack = vecstack2stack(opttheta, hidden_size, visible_size)

    # Save trained weights and bias
    out = open("weights_bias.pkl", "wb")
    pickle.dump(para_stack, out)
    out.close()

    print "Mission complete!"
Example #31
#!/usr/bin/env python
import sys, os, os.path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import random
from dpark import DparkContext
dpark = DparkContext()
from vector import Vector

def parseVector(line):
    return Vector(map(float, line.strip().split(' ')))

def closestCenter(p, centers):
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
    D = 4
    K = 3
    IT = 10
    MIN_DIST = 0.01
    centers = [Vector([random.random() for j in range(D)]) for i in range(K)]
    points = dpark.textFile('kmeans_data.txt').map(parseVector).cache()

    for it in range(IT):
Example #32
def main():
    # Loading data
    print "Loading..."
    data = sample_image()
    # Initialize networks
    visible_size = 64  # number of input units
    hidden_size = [25, 16, 9]  # number of hidden units of each layer
    #lamb = 0.0001     # weight decay parameter
    '''
    lamb = 0 # No weight decay!
    beta = 0.01
    '''
    # sigmoid DEBUG
    lamb = 0.0001
    beta = 3

    # dpark initialize
    dpark_ctx = DparkContext()
    # Start training, and L-BFGS is adopted
    # We apply a stack-wise greedy training process
    layer_ind = range(len(hidden_size) + 1)
    layer_ind.remove(0)
    layer_size = [visible_size] + hidden_size
    opttheta = dict()  # parameter vector of stack AE
    img = dict()  # visualization mode

    sparsity_param = dict()
    for ind in layer_ind:
        # standard: 64 units -> sparsity parameter 0.01
        sparsity_param[ind] = layer_size[ind - 1] * 0.01 / 64

    for ind in layer_ind:
        print "start training layer No.%d" % ind
        # Obtain random parameters of considered layer
        theta = initial_parameter(layer_size[ind], layer_size[ind - 1])
        # SGD with mini-batch
        options = (data, layer_size[ind - 1], layer_size[ind], lamb,
                   sparsity_param[ind], beta, dpark_ctx)
        opttheta[ind] = stocha_grad_desc_agagrad(compute_cost,
                                                 compute_grad,
                                                 theta,
                                                 options,
                                                 step_size_init=0.2,
                                                 max_iter=25,
                                                 tol=1e-7)
        # Preparing next layer!
        W = opttheta.get(ind)[:layer_size[ind]*layer_size[ind-1]].\
            reshape(layer_size[ind], layer_size[ind-1])
        b = opttheta.get(ind)[2*layer_size[ind]*layer_size[ind-1]:\
            2*layer_size[ind]*layer_size[ind-1]+layer_size[ind]].\
            reshape(layer_size[ind], 1)
        data = ReLU(np.dot(W, data) + b)
        # visualization of the learned weights
        img[ind] = display_effect(W)
        plt.axis('off')
        plt.savefig(str(ind) + '.jpg')

        # DEBUG
        fin = open("theta_DEBUG.pkl", "wb")
        pickle.dump((W, b), fin)
        fin.close()
        sys.exit()

    # Trained parameters of stack AE
    para_stack = vecstack2stack(opttheta, hidden_size, visible_size)

    # Save trained weights and bias
    out = open("weights_bias.pkl", "wb")
    pickle.dump(para_stack, out)
    out.close()

    print "Mission complete!"
Example #33
import sys
sys.path.append('../')
from dpark import DparkContext

dpark = DparkContext()

name = '/mfs/tmp/weblog-pre-20111019.csv'
name = '/mfs/tmp/weblog-20111019.csv'
name = '/tmp/weblog-20111019.csv.small'
# name = '/tmp/weblog-20111019.csv.medium'
name = 'resume_text_seg_data-2014-06-01.txt'
pv = dpark.textFile(name)
pv = pv.map(lambda x:x.split(',')).map(lambda l:(l[3], l[7]))
pv = pv.flatMap(lambda (i, u):(u.startswith('/movie') and [(i, 2)]
        or u.startswith('/group') and [(i, 3)]
        or []))
# print pv.take(50)
pv = pv.reduceByKey(lambda x, y:x * y)
# print pv.take(50)
print pv.filter(lambda (_, y):y % 2 == 0 and y % 3 == 0).count()

# movie = pv.filter(lambda (bid,url): url.startswith('/movie')).reduceByKey(lambda x,y:None)
# group = pv.filter(lambda (bid,url): url.startswith('/group')).reduceByKey(lambda x,y:None)
# print movie.join(group).count()

# print pv.map(lambda x:x.split(',')[2]).uniq().count()
# print pv.map(lambda x:(x.split(',')[2],None)).reduceByKey(lambda x,y:None).count()
# .filter(lambda uid:uid)
# print upv.count()
# print upv.reduceByKey(lambda x,y:x+y).count()
Example #34
import sys
sys.path.append('../')
import logging
from dpark import DparkContext

dpark = DparkContext()

name = 'rating.txt'


def parse(line):
    sid, uid, r, f = line.split('\t')
    defaults = {'F': 4.5, 'P': 3.7, 'N': 4.0}
    if r == 'None':
        r = defaults[f]
    return (sid, (uid, float(r)))


rating = dpark.textFile(name, numSplits=2).map(parse).groupByKey(2)  #.cache()
#print 'us', rating.first()
print rating.count()


def reverse(it):
    s = {}
    for k, us in it:
        for u, r in us:
            s.setdefault(u, {})[k] = r
    return s

Example #35
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import print_function
import sys, os, os.path
from six.moves import map
from six.moves import range

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import random
from dpark import DparkContext
from vector import Vector

dpark = DparkContext()


def parseVector(line):
    return Vector(list(map(float, line.strip().split(' '))))


def closestCenter(p, centers):
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
Example #36
#!/usr/bin/env python
import sys, os, os.path
import random
from dpark import DparkContext
dpark = DparkContext()
from vector import Vector


def parseVector(line):
    num = map(int, line.strip().split('\t')[2:])
    num = [n - num[0] for n in num]
    return Vector(num[20:])
    return Vector(map(int, line.strip().split('\t')[2:]))


def closestCenter(p, centers):
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


def minDist(p, centers):
    bestDist = p.squaredDist(centers[0])
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
Example #37
#!/usr/bin/env python

import os, sys
import re
from glob import glob
from dpark import DparkContext
from dpark.table import TableRDD, Globals, CachedTables
from prettytable import PrettyTable

ctx = DparkContext()

_locals = {}

def gen_table(expr, fields=None):
    if '(' in expr and getattr(ctx, expr.split('(')[0], None):
        rdd = eval('ctx.' + expr[:expr.rindex(')') + 1], globals(), _locals)
    else:
        rdd = eval(expr, globals(), _locals)

    head = rdd.first()
    if isinstance(head, str):
        if '\t' in head:
            rdd = rdd.fromCsv('excel-tab')
        elif ',' in head:
            rdd = rdd.fromCsv('excel')
        else:
            rdd = rdd.map(lambda l:l.split(' '))
        row = rdd.first()
    else:
        row = head
    if not isinstance(rdd, TableRDD):
Example #38
    return _


class DemoOutputStream(ForEachDStream):
    def __init__(self, parent, output):
        ForEachDStream.__init__(self, parent, collect(output))
        self.output = output

    def __setstate__(self, state):
        ForEachDStream.__setstate__(self, state)
        self.output = []
        self.func = collect(self.output)


sc = DparkContext(dpark_master)


class TestDStream(unittest.TestCase):
    def _setupStreams(self, intput1, input2, operation):
        ssc = StreamingContext(2, sc)
        is1 = DemoInputStream(ssc, intput1)
        ssc.registerInputStream(is1)
        if input2:
            is2 = DemoInputStream(ssc, input2)
            ssc.registerInputStream(is2)
            os = operation(is1, is2)
        else:
            os = operation(is1)
        output = DemoOutputStream(os, [])
        ssc.registerOutputStream(output)
Example #39
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox

    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01

    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))

    for id, v in result.filter(
            lambda id_v: id_v[1].value > threshold).collect():
        print(id, v)
Example #40
def DownLoad(file_path):
    dpark = DparkContext()
    file_block = dpark.textFile(file_path, splitSize=16 << 20)
    file_block.foreach(write_to_wav)
Example #41
from dpark import DparkContext, optParser

dc = DparkContext()
options, args = optParser.parse_args()
infile = args[0]
outfile = args[1]
print("from {} to {}".format(infile, outfile))


def fm(x):
    for w in x.strip().split():
        yield (w, 1)


(dc.textFile(infile).flatMap(fm).reduceByKey(
    lambda x, y: x + y,
    numSplits=6).map(lambda x: " ".join(list(map(str, x)))).saveAsTextFile(
        outfile, overwrite=False))
Example #42
# -*- coding: utf-8 -*-

from dpark import DparkContext


def m(x):
    return x


rdd = DparkContext().makeRDD([(1, 1)]).map(m).groupByKey()
rdd.map(m).collect()
rdd.map(m).collect()
Example #43
                for _, tid, v in lines]
    return (id, Vertex(id, sys.maxint, outEdges, True))


def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [(edge.target_id, newValue + edge.value)
                  for edge in self.outEdges]
    else:
        outbox = []
    return Vertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'graph.txt')
    lines = ctx.textFile(path).map(lambda line: line.split(' '))
    vertices = lines.groupBy(lambda line: line[0]).map(to_vertex)
    startVertex = str(0)
    messages = ctx.makeRDD([(startVertex, 0)])

    print('read', vertices.count(), 'vertices and ', messages.count(), 'messages.')

    result = Bagel.run(ctx, vertices, messages, compute, BasicCombiner(min), numSplits=2)

    print('Shortest path from %s to all vertices:' % startVertex)
    for id, v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print(v.id, v.value)
Example #44
#!/usr/bin/env python
# encoding: utf-8
""" 
@version: v1.0 
@author: W_H_J 
@license: Apache Licence  
@contact: [email protected] 
@site:  
@software: PyCharm 
@file: wordcount.py 
@time: 2018/6/5 18:10 
@describe:  word count
"""
from dpark import DparkContext
ctx = DparkContext()
file = ctx.textFile("./words.txt")
words = file.flatMap(lambda x: x.split()).map(lambda x: (x, 1))
wc = words.reduceByKey(lambda x, y: x + y).collectAsMap()
print(wc)


# count the number of times a word appears in a file
def word_count(file_path, word):
    # talk to a specific Mesos master
    dpark = DparkContext()

    # build a file RDD from the distributed file, with a split size of 16 MB
    f = dpark.textFile(file_path, splitSize=16 << 20)

    # map() builds a new RDD, filter() keeps the lines containing the word, and count() returns the result
    print(
Example #45
        data = fs.get('/song/small/%s.mp3'%id)
        binfile = open("output/%s.mp3" % (id),"wb")
        binfile.write(data)
        binfile.close()
    
    m = RetrievalMusic(dptable, mode)
    m.retrieving('output/%s.mp3'%id)
    
    if mode != 2:
        call("rm output/%s.mp3" % (id), shell=True)


def batchprocess(song_id, loaded, mode):

#    dpark = DparkContext()
#    dptable = dpark.broadcast(loaded)
#    dpark.parallelize(song_id, 80).foreach(lambda(id):calculate_single(id, dptable, mode))
    for id in song_id:
        calculate_single(id, loaded, mode)
    
    if mode == 2:
        rearrange()

if __name__ == '__main__':
    song_id = np.load("track_temp.npy")
    mode = 1    # 1 for save, 2 for filter, and 0 for regular work
    dpark = DparkContext()
    dpark.parallelize(song_id, 50).foreach(lambda(id):calculate_single(id,0,mode))


Example #46
import math
import random
import os, sys
from pprint import pprint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print 'logging', log.count()
print 'error', log.filter(lambda line: 'error' in line).count()
for line in log.filter(lambda line: 'error' in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(
    sorted(
        counts.filter(lambda (_, v): v > 20).map(
            lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))
Example #47
def main(argv):
    # Dpark initialize
    dpark = DparkContext()

    # number of the training and testing set
    num_train = 6000
    num_test = 6000

    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data

    # Preparing training and testing data
    if len(x) != len(y):
        print("The labels and features do not match!")
        sys.exit()
    
    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu = len(x_stu)
    ind_live = range(n_live)
    ind_stu = range(n_stu)
    random.shuffle(ind_live)
    random.shuffle(ind_stu)

    x_te = [x_live[i] for i in ind_live[num_train : num_test + num_train]] + \
        [x_stu[i] for i in ind_stu[num_train : num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train : num_test + num_train]) + \
        [-1.0]*len(ind_stu[num_train : num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
        [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0]*num_train + [-1.0]*num_train

    # dpark version
    def map_iter(i):
        y_tr_examplar = [-1.0] * len(y_tr)
        y_tr_examplar[i] = 1.0
        # opt = '-t 0 -w1 ' + str(len(y_tr)) + ' -w-1 1 -b 1 -q'
        # It is suggested in Efros' paper that:
        # C1 0.5, C2 0.01
        opt = '-t 0 -w1 0.5 -w-1 0.01 -b 1 -q'
        m = svm_train(y_tr_examplar, list(x_tr), opt)
        p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-b 1 -q')
        p_val = np.array(p_val)
        # p_val = np.delete(p_val,1,1)  # shape = (N, 1)
        p_val = p_val[:, 0]  # shape = (N, )
        return p_val

    p_vals = dpark.makeRDD(
        range(len(y_tr))
    ).map(
        map_iter
    ).collect()

    val = np.array(p_vals).T

    # for-loop version
    '''
    # Examplar SVM Training
    ensemble_model = []
    # DPark

    for i in range(len(y_tr)):
        y_tr_examplar = [-1.0] * len(y_tr)
        y_tr_examplar[i] = 1.0;
        #opt = '-t 0 -w1 ' + str(len(y_tr)) + ' -w-1 1 -b 1 -q'
        # It is suggested in Efros' paper that:
        # C1 0.5, C2 0.01
        opt = '-t 0 -w1 0.5 -w-1 0.01 -b 1 -q'
        m = svm_train(y_tr_examplar, x_tr, opt)
        ensemble_model.append(m)
        print("The %s-th examplar SVM has been trained" %i)

    # Calibration, to be updated
    # Since we adopt the probability estimation model of LIBSVM, calibrating seems unnecessary

    # Ensemble classification
    val = np.zeros((len(y_te),1))
    for m in ensemble_model:
        p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-b 1 -q')
        p_val = np.array(p_val)
        p_val = np.delete(p_val,1, 1)
        val = np.hstack((val, p_val))
    if val.shape[1] != len(y_tr) + 1:
        print "Chaos!"
    val = np.delete(val,0,1)
    print 'val.shape =', val.shape
    '''
    
    # KNN
    k = num_train / 8
    sorted_index = val.argsort(axis=1)
    sorted_index = sorted_index.T[::-1].T
    p_label = []
    for index in sorted_index:
        nearest_samples = []
        for sample_index in index[:k]:
            nearest_samples.append(y_tr[sample_index])
        n,bins,dummy = plt.hist(nearest_samples, 2, normed=1, 
                                facecolor='r', alpha=0.75)
        if n[0] > n[1]:
            p_label.append(-1.0)
        else:
            p_label.append(1.0)

    # evaluation
    rate, pos_rate, neg_rate = evaluation(y_te, p_label)

    print("The Examplar SVM framework achieves a precision of %f" % rate)
Example #48
import sys
import os
import time
import unittest
import logging

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dpark import DparkContext
from dpark.utils.nested_groupby import GroupByNestedIter
from dpark.shuffle import GroupByNestedIter, AutoBatchedSerializer
from dpark.utils.profile import profile
import dpark.conf

GroupByNestedIter.NO_CACHE = True
print_mem_incr = True
print_mem_incr = False
dc = DparkContext('mesos')
RC = dpark.conf.rddconf
M = 1024 * 1024


def rss_func(p):
    if hasattr(p, "memory_info"):
        mem_info = getattr(p, "memory_info")
    else:
        mem_info = getattr(p, 'get_memory_info')

    def _():
        return mem_info().rss / M

    return _
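
A brief usage sketch for rss_func (assuming psutil supplies the process object; the snippet itself only requires an object exposing memory_info or get_memory_info):

import psutil

proc = psutil.Process()        # current process
get_rss_mb = rss_func(proc)    # callable returning resident set size in MB
print(get_rss_mb())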
Example #49
def main(argv):
    # Loading the dataset
    data = svm_read_problem('echo_liveness.01.libsvm')
    y, x = data
    del data

    num_train = 6000
    num_test = 6000

    # Preparing training and testing data
    if len(x) != len(y):
        print("Please examine the data set: the labels and features do not match!")
        sys.exit()
    # generate random training and testing sets so the classifier's ability is estimated more accurately
    x_live = [x[i] for i in find(y, 1.0)]
    x_stu = [x[i] for i in find(y, 0.0)]
    n_live = len(x_live)
    n_stu =  len(x_stu)
    ind_live = range(n_live)
    ind_stu = range(n_stu)
    random.shuffle(ind_live)
    random.shuffle(ind_stu)

    x_te = [x_live[i] for i in ind_live[num_train : num_test + num_train]] + \
        [x_stu[i] for i in ind_stu[num_train : num_test + num_train]]
    y_te = [1.0] * len(ind_live[num_train : num_test + num_train]) + \
        [-1.0]*len(ind_stu[num_train : num_test + num_train])
    x_tr = [x_live[i] for i in ind_live[:num_train]] + \
        [x_stu[i] for i in ind_stu[:num_train]]
    y_tr = [1.0]*num_train + [-1.0]*num_train

    # SVM and a 10-fold Cross Validation choosing the best parameters.
    # gamma and c_reg are constructed in a parameter grid
    
    # for-loop version
    '''
    gamma = np.arange(.01,20,.04)
    c_reg = np.arange(.01,20,.04)
    opt = []
    best_para = {'gamma': 0, 'c': 0, 'precision': 0}
    for g in gamma:
        for c in c_reg:
            opt = '-g '+ str(g) +' -c ' + str(c) + ' -v 10 -q'
            pre = svm_train(y_tr,x_tr,opt)
            if pre > best_para.get('precision'):
                best_para['gamma'] = g
                best_para['c'] = c
                best_para['precision'] = pre 
    best_opt = '-g '+ str(best_para.get('gamma')) +' -c ' + str(best_para.get('c')) + ' -q'
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')
    '''

    # dpark version
    dpark = DparkContext()
    gamma = np.arange(.01, 5, .08)
    c_reg = np.arange(.01, 5, .08)
    opt = []
    for g in gamma:
        for c in c_reg:
            opt.append('-g '+ str(g) +' -c ' + str(c) + ' -v 10 -q')

    def map_iter(i):
        pre = svm_train(y_tr, list(x_tr), opt[i])
        return pre

    #pres = dpark.makeRDD(range(len(opt)),100).map(map_iter).collect()
    pres = dpark.makeRDD(range(len(opt))).map(map_iter).collect()
    pres = np.array(pres)
    best_opt_ind = pres.argsort()
    best_opt = opt[best_opt_ind[-1]]

    best_opt = best_opt[:best_opt.find('-v') - 1]
    m = svm_train(y_tr, x_tr, best_opt)
    p_label, p_acc, p_val = svm_predict(y_te, x_te, m, '-q')

    print 'This SVM framework precision: %f' % p_acc[0]
Example #50
# -*- coding: utf-8 -*-

from dpark import DparkContext


def m(x):
    return x


rdd = DparkContext().makeRDD([(1,1)]).map(m).groupByKey()
rdd.map(m).collect()
rdd.map(m).collect()
Example #51
    return (title, Vertex(title, 1.0/numV, outEdges, True))

def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and abs(newValue-self.value) < epsilon) or superstep > 30
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute

if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01

    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
        gen_compute(numVertex, epsilon))

    for v in result.filter(lambda x:x.value > threshold).collect():
        print v.id, v.value
Example #52
# -*- coding: utf-8 -*-

from dpark import DparkContext


dc = DparkContext()


def get_rdd():
    return dc.makeRDD([(1, 1)])


rdd1 = get_rdd()
rdd2 = dc.union([get_rdd() for i in range(2)])
rdd3 = get_rdd().groupByKey()
dc.union([rdd1, rdd2, rdd3]).collect()
Example #53
import glob
from dpark import DparkContext

RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
TRAINING_PATH = '/nfs/wuhong/paracel/data/als_fm/train'
TEST_PATH = '/nfs/wuhong/paracel/data/als_fm/test'

dpark = DparkContext()

def local_filter1(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return False
    return True

def local_filter2(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return True
    return False

dpark.textFile(glob.glob(RATING_PATH)).filter(
    local_filter1
    ).saveAsTextFile(TRAINING_PATH)

dpark.textFile(glob.glob(RATING_PATH)).filter(
    local_filter2
    ).saveAsTextFile(TEST_PATH)
Example #54
from __future__ import absolute_import
from __future__ import print_function
import sys
sys.path.append('../')
from dpark import DparkContext

dpark = DparkContext()

name = '/mfs/tmp/weblog-pre-20111019.csv'
name = '/mfs/tmp/weblog-20111019.csv'
name = '/tmp/weblog-20111019.csv.small'
#name = '/tmp/weblog-20111019.csv.medium'
pv = dpark.textFile(name)
pv = pv.map(lambda x:x.split(',')).map(lambda l:(l[3],l[7]))
pv = pv.flatMap(lambda i_u:(i_u[1].startswith('/movie') and [(i_u[0],2)]
        or i_u[1].startswith('/group') and [(i_u[0],3)]
        or []))
#print pv.take(50)
pv = pv.reduceByKey(lambda x,y:x*y)
#print pv.take(50)
print(pv.filter(lambda __y:__y[1]%2==0 and __y[1]%3==0).count())

#movie = pv.filter(lambda (bid,url): url.startswith('/movie')).reduceByKey(lambda x,y:None)
#group = pv.filter(lambda (bid,url): url.startswith('/group')).reduceByKey(lambda x,y:None)
#print movie.join(group).count()

#print pv.map(lambda x:x.split(',')[2]).uniq().count()
#print pv.map(lambda x:(x.split(',')[2],None)).reduceByKey(lambda x,y:None).count()
#.filter(lambda uid:uid)
#print upv.count()
#print upv.reduceByKey(lambda x,y:x+y).count()
Example #55
from random import random

from dpark import DparkContext

dpark = DparkContext()
count = dpark.accumulator(0)

N = 100000  # assumed sample count; not shown in the original snippet

def random_once(*args, **kwargs):
    x = random() * 2 - 1
    y = random() * 2 - 1
    if x * x + y * y < 1:
        count.add(1)

# points are uniform in the 2x2 square, so P(x*x + y*y < 1) = pi/4,
# and 4 * count / N estimates pi
dpark.parallelize(range(0, N), 10).foreach(random_once)
print 'PI is roughly ', 4.0 * count.value / N
Example #56
from __future__ import absolute_import
from __future__ import print_function
import math
import random
import os, sys
from pprint import pprint
from six.moves import map
from six.moves import range
from six.moves import zip

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

dpark = DparkContext()

# range
nums = dpark.parallelize(list(range(100)), 4)
print(nums.count())
print(nums.reduce(lambda x, y: x + y))

# text search
f = dpark.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print('logging', log.count())
print('error', log.filter(lambda line: 'error' in line).count())
for line in log.filter(lambda line: 'error' in line).collect():
    print(line)

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda __v1: __v1[1] > 50).collectAsMap())