Example #1
import os
from dpark import DparkContext
from dpark.bagel import Bagel, Vertex, Edge, Message


def parse_vertex(line, numV):
    # assumed input format: "<title> <outlink_1> <outlink_2> ...", one page per
    # line; Edge (with a target_id field) is assumed to come from dpark.bagel
    fields = line.split()
    title, refs = fields[0], fields[1:]
    outEdges = [Edge(ref) for ref in refs]
    return (title, Vertex(title, 1.0/numV, outEdges, True))

def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and abs(newValue-self.value) < epsilon) or superstep > 30
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute

if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01

    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
        gen_compute(numVertex, epsilon))

    for v in result.filter(lambda x: x.value > threshold).collect():
        print v.id, v.value
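
# Input-format note (an assumption, matching the parse_vertex sketch above):
# each line of wikipedia.txt is expected to be an adjacency list of the form
#
#   Article_A Article_B Article_C
#
# i.e. a page title followed by the titles it links to. Every page then starts
# with rank 1/numVertex, and on each superstep compute() keeps the 0.15/num
# damping term and spreads 85% of its current rank evenly over its out-links:
#
#   parse_vertex("A B C", 4)  # -> ("A", Vertex("A", 0.25, [Edge("B"), Edge("C")], True))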
Example #2
import math
import random
import os, sys
from pprint import pprint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print 'logging', log.count()
print 'error', log.filter(lambda line: 'error' in line).count()
for line in log.filter(lambda line: 'error' in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(
    sorted(
        counts.filter(lambda (_, v): v > 20).map(
            lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))
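
# Read-back sketch (an assumption, not part of the original example): the
# output written by saveAsTextFile("wc/") above is one "word:count" line per
# record, so it can be loaded again with textFile and split on the last colon.
wc = ctx.textFile("wc/").map(lambda line: line.rsplit(":", 1))
pprint(wc.take(5))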
Example #3
import numpy as np
from subprocess import call
from dpark import DparkContext
# fs, RetrievalMusic and rearrange are assumed to be project-specific helpers
# defined elsewhere in the original source.


def calculate_single(id, dptable, mode):
    # assumed signature, matching the calls in batchprocess and __main__ below
    # download the track and write it to a local temporary file
    data = fs.get('/song/small/%s.mp3' % id)
    binfile = open("output/%s.mp3" % id, "wb")
    binfile.write(data)
    binfile.close()

    # run the retrieval step against the loaded/broadcast table
    m = RetrievalMusic(dptable, mode)
    m.retrieving('output/%s.mp3' % id)

    # remove the temporary file unless running in filter mode (mode == 2)
    if mode != 2:
        call("rm output/%s.mp3" % id, shell=True)


def batchprocess(song_id, loaded, mode):

#    dpark = DparkContext()
#    dptable = dpark.broadcast(loaded)
#    dpark.parallelize(song_id, 80).foreach(lambda(id):calculate_single(id, dptable, mode))
    for id in song_id:
        calculate_single(id, loaded, mode)
    
    if mode == 2:
        rearrange()

if __name__ == '__main__':
    song_id = np.load("track_temp.npy")
    mode = 1    # 1 for save, 2 for filter, and 0 for regular work
    dpark = DparkContext()
    dpark.parallelize(song_id, 50).foreach(lambda id: calculate_single(id, 0, mode))
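
# Distributed variant (a sketch based on the commented-out lines inside
# batchprocess above, not part of the original script): broadcast the loaded
# table once and fan the per-song work out over the cluster instead of
# looping locally.
def batchprocess_distributed(song_id, loaded, mode):
    dpark = DparkContext()
    dptable = dpark.broadcast(loaded)
    dpark.parallelize(song_id, 80).foreach(
        lambda id: calculate_single(id, dptable, mode))
    if mode == 2:
        rearrange()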


Example #4
from __future__ import print_function
import os
from dpark import DparkContext
from dpark.bagel import Bagel, Vertex
# parse_vertex is the same line-parsing helper shown in Example #1 and is
# assumed to be defined above this point in the original file.


def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox

    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01

    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))

    for id, v in result.filter(
            lambda id_v: id_v[1].value > threshold).collect():
        print(id, v)
Example #5
from __future__ import print_function
import math
import random
import os, sys
from pprint import pprint
from six.moves import map
from six.moves import range
from six.moves import zip

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

dpark = DparkContext()

# range
nums = dpark.parallelize(list(range(100)), 4)
print(nums.count())
print(nums.reduce(lambda x, y: x + y))

# text search
f = dpark.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print('logging', log.count())
print('error', log.filter(lambda line: 'error' in line).count())
for line in log.filter(lambda line: 'error' in line).collect():
    print(line)

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda __v1: __v1[1] > 50).collectAsMap())
pprint(sorted(counts.filter(lambda __v: __v[1] > 20).map(lambda x_y: (x_y[1], x_y[0])).groupByKey().collect()))
Example #6
import math
import random
import os, sys
from pprint import pprint

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext="py").map(lambda x: x.strip())
log = f.filter(lambda line: "logging" in line).cache()
print "logging", log.count()
print "error", log.filter(lambda line: "error" in line).count()
for line in log.filter(lambda line: "error" in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(sorted(counts.filter(lambda (_, v): v > 20).map(lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi
import random
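
# The "# Pi" section above is truncated in this snippet; a minimal Monte Carlo
# sketch (an assumption, not the original code) reusing the same ctx could be:
def sample_point(_):
    x, y = random.random(), random.random()
    return 1 if x * x + y * y < 1.0 else 0

samples = 100000
hits = ctx.parallelize(range(samples), 4).map(sample_point).reduce(lambda a, b: a + b)
print 'Pi is roughly', 4.0 * hits / samples
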
#coding:utf-8
from random import shuffle, random, sample
import traceback

from dpark import DparkContext

dp = DparkContext('mesos')
'''random.shuffle does not return the shuffled data: it shuffles in place and returns None'''
rdd1 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]]).map(lambda x: shuffle(x))
print 'rdd1:'
print rdd1.take(1)

rdd2 = dp.parallelize([((1, 2), (3, 4), (5, 6))]).map(lambda x: shuffle(x))
print 'rdd2:'
try:
    print rdd2.take(1)
except Exception:
    traceback.print_exc()
'''O(N log N): shuffle by sorting on a random key'''
rdd3 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]
                       ]).map(lambda x: sorted(x, key=lambda k: random()))
print 'rdd3:'
print rdd3.take(1)
'''O(N): shuffle via random.sample'''
rdd4 = dp.parallelize([[(1, 2), (3, 4),
                        (5, 6)]]).map(lambda x: sample(x, len(x)))
print 'rdd4:'
print rdd4.take(1)

rdd5 = dp.parallelize([((1, 2), (3, 4), (5, 6))
                       ]).map(lambda x: sample(x, len(x)))
print 'rdd5:'
print rdd5.take(1)