Example #1
    def get_rdd(self):
        dpark = DparkContext()

        return dpark.union(
            [dpark.textFile(path, splitSize=64 << 20)
             for path in self.paths]
        ).map(Weblog.from_line)
Example #2
def word_count(file_path, word):
    # Talk to a specific Mesos master
    dpark = DparkContext()

    # Build a file RDD from the distributed file, with 16 MB splits
    f = dpark.textFile(file_path, splitSize=16 << 20)

    # Transform with map() into a new RDD, filter it down to matching lines, then return the result with count()
    print(
        word, 'count:',
        f.map(lambda line: line.strip()).filter(
            lambda line: word in line).count())
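A minimal invocation sketch for the function above; the file path and the search word are placeholder values, not part of the original example:

if __name__ == '__main__':
    # Hypothetical arguments for illustration only.
    word_count('./words.txt', 'dpark')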
Example #3
def main(txt, infile, outfile):
	
	ctx = DparkContext()
	csvfilename = infile
	txtfilename = txt 

	txt_rdd = ctx.textFile(txtfilename)

	txt_rdd = txt_rdd.map(divide_txt)
	#('5988', ['2', 'CPM']) 
	csv_rdd = ctx.textFile(csvfilename, splitSize=64<<20) 
	#print csv_rdd.take(100)
	csv_rdd = csv_rdd.filter(remove_some_bid_unitid)

	csv_rdd = csv_rdd.map(divide_csv)	
	#('6379', ['-1', '1236054964187470000', '6379', '77', '1', '1', '0'])
	record_rdd = txt_rdd.join(csv_rdd)
	#('6370', (['2', 'COMPLEMENT'], ['-1', '8183016859528920000', '6370', '86', '3', '1', '0']))
	record_rdd = record_rdd.mapValue(join_element)
	#('6370', ['2', 'COMPLEMENT', '-1', '8183016859528920000', '6370', '86', '3', '1', '0']) 
	record_rdd = record_rdd.groupBy(lambda line : str(line[1]).split()[5] + str(line[1]).split()[1])

	#print record_rdd.take(1)
	record_rdd = record_rdd.map(map_unit_type)
	#print record_rdd.take(1)
	record_rdd = record_rdd.flatMap(flat_map_unit_type_priority)
	#print record_rdd.take(5)
	record_rdd = record_rdd.groupByKey()
	#print "*" * 50
	#print record_rdd.take(5)

	record_rdd = record_rdd.mapValue(map_value_unit_type_priority)
	#print "#" * 50
	#print record_rdd.take(5)
	record_rdd = record_rdd.map(map_unit_type_priority)
	#print "$" * 50
	#print record_rdd.take(5)

	# unit	type	priority	cluster	n_ad	n_imp	n_click	ctr
	record_rdd.saveAsTextFile(outfile)
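The helper functions used above (divide_txt, divide_csv, join_element, and the map_*/flat_map_* callbacks) are not part of this snippet. A minimal sketch of the two parsers, inferred only from the sample records shown in the inline comments; the exact field layout is an assumption:

def divide_txt(line):
    # e.g. "5988 2 CPM" -> ('5988', ['2', 'CPM']); whitespace-separated layout assumed
    fields = line.strip().split()
    return (fields[0], fields[1:])

def divide_csv(line):
    # Keyed by the third column, matching the sample ('6379', ['-1', ..., '6379', ...]); column order assumed
    fields = line.strip().split(',')
    return (fields[2], fields)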
Example #4
def main(infile, outfile):

	ctx = DparkContext()

	rdd = ctx.textFile(infile)

	rdd = rdd.map(map_unit_type_priority)

	rdd = rdd.reduceByKey(reduce_by_key)

	rdd = rdd.map(map_to_string)

	rdd.saveAsTextFile(outfile)	
Example #5
def DownLoad(file_path):
        dpark = DparkContext()
        file_block = dpark.textFile(file_path,splitSize=16<<20)
        file_block.foreach(write_to_wav)
Example #6
import glob
from dpark import DparkContext

RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
TRAINING_PATH = '/nfs/wuhong/paracel/data/als_fm/train'
TEST_PATH = '/nfs/wuhong/paracel/data/als_fm/test'

dpark = DparkContext()

def local_filter1(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return False
    return True

def local_filter2(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return True
    return False

dpark.textFile(glob.glob(RATING_PATH)).filter(
    local_filter1
    ).saveAsTextFile(TRAINING_PATH)

dpark.textFile(glob.glob(RATING_PATH)).filter(
    local_filter2
    ).saveAsTextFile(TEST_PATH)
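As a quick sanity check of the split logic (the sample lines below are made up for illustration): local_filter1 keeps a line for the training set only when the second comma-separated field does not end in 0, 1, or 2, and local_filter2 keeps the complement for the test set.

samples = ['u1,1000120', 'u2,1000123']
print([local_filter1(s) for s in samples])  # [False, True] -> only the second line goes to training
print([local_filter2(s) for s in samples])  # [True, False] -> only the first line goes to the test set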
Example #7
    user_feature = dp.makeRDD([])

    def _parse_list(line):
        uid, features = line.split('\t')
        features = [x.split(':') for x in features.split('|')]
        features = [(x[0], float(x[1])) for x in features]
        features = sorted(features, key=lambda x: x[1], reverse=True)
        return (uid, features)

    for name in ['book_cluster', 'movie_cluster', 'group_cluster', 'text_cluster']:
        fn = '/home2/alg/user_profile/%s/%s' % (current_date, name)
        if not os.path.exists(fn):
            continue
        rdd = dp.textFile(fn, splitSize=16<<20)\
            .filter(lambda x: x.split('\t', 1)[0] in user_list_b.value)\
            .map(_parse_list)\
            .mapValue(lambda x: [('cnt', len(x)), ('hot', sum([y[1] for y in x]))] + x[:2])\
            .mapValue((lambda name: lambda x: [('%s_concise/%s' % (name, k), v) for (k, v) in x])(name))
        user_feature = user_feature.union(rdd)

    for name in ['gender', 'region']:
        fn = '/home2/alg/user_profile/%s/%s' % (current_date, name)
        if not os.path.exists(fn):
            continue
        rdd = dp.textFile('/home2/alg/user_profile/%s/%s' % (current_date, name), splitSize=16<<20)\
            .filter(lambda x: x.split('\t', 1)[0] in user_list_b.value)\
            .map(_parse_list)\
            .mapValue(lambda x: x[:2])\
            .mapValue((lambda name: lambda x: [('%s/%s' % (name, k), v) for (k, v) in x])(name))
        user_feature = user_feature.union(rdd)
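The (lambda name: lambda x: ...)(name) wrapper in both loops exists to bind the current value of name at definition time; without it, every mapValue callback would see the last value of the loop variable. A small standalone illustration of the difference:

fns = [lambda x: (name, x) for name in ['a', 'b']]
print([f(1) for f in fns])                                   # [('b', 1), ('b', 1)]

fns = [(lambda name: lambda x: (name, x))(name) for name in ['a', 'b']]
print([f(1) for f in fns])                                   # [('a', 1), ('b', 1)]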
Example #8
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox

    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01

    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))

    for id, v in result.filter(
            lambda id_v: id_v[1].value > threshold).collect():
        print(id, v)
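parse_vertex is not included in this snippet. A minimal sketch consistent with the fragments shown in Examples 9 and 17, assuming each input line is a page title followed by whitespace-separated link targets:

def parse_vertex(line, numV):
    # Assumed format: "<title> <ref> <ref> ...".
    fields = line.strip().split()
    title, refs = fields[0], fields[1:]
    outEdges = [Edge(ref) for ref in refs]
    # Every page starts with rank 1/numV and is marked active.
    return (title, Vertex(title, 1.0 / numV, outEdges, True))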
Example #9
    return (title, Vertex(title, 1.0/numV, outEdges, True))

def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and abs(newValue-self.value) < epsilon) or superstep > 30
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute

if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01

    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
        gen_compute(numVertex, epsilon))

    for v in result.filter(lambda x:x.value > threshold).collect():
        print v.id, v.value
Example #10
File: wc.py Project: likaiguo/dpark
import sys
sys.path.append('../')
from dpark import DparkContext

dpark = DparkContext()

name = '/mfs/tmp/weblog-pre-20111019.csv'
name = '/mfs/tmp/weblog-20111019.csv'
name = '/tmp/weblog-20111019.csv.small'
# name = '/tmp/weblog-20111019.csv.medium'
name = 'resume_text_seg_data-2014-06-01.txt'
pv = dpark.textFile(name)
pv = pv.map(lambda x:x.split(',')).map(lambda l:(l[3], l[7]))
pv = pv.flatMap(lambda (i, u):(u.startswith('/movie') and [(i, 2)]
        or u.startswith('/group') and [(i, 3)]
        or []))
# print pv.take(50)
pv = pv.reduceByKey(lambda x, y:x * y)
# print pv.take(50)
print pv.filter(lambda (_, y):y % 2 == 0 and y % 3 == 0).count()

# movie = pv.filter(lambda (bid,url): url.startswith('/movie')).reduceByKey(lambda x,y:None)
# group = pv.filter(lambda (bid,url): url.startswith('/group')).reduceByKey(lambda x,y:None)
# print movie.join(group).count()

# print pv.map(lambda x:x.split(',')[2]).uniq().count()
# print pv.map(lambda x:(x.split(',')[2],None)).reduceByKey(lambda x,y:None).count()
# .filter(lambda uid:uid)
# print upv.count()
# print upv.reduceByKey(lambda x,y:x+y).count()
Example #11
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
    D = 4
    K = 3
    IT = 10
    MIN_DIST = 0.01
    centers = [Vector([random.random() for j in range(D)]) for i in range(K)]
    points = dpark.textFile('kmeans_data.txt').map(parseVector).cache()

    for it in range(IT):
        print('iteration', it)
        mappedPoints = points.map(lambda p: (closestCenter(p, centers), (p, 1)))
        ncenters = mappedPoints.reduceByKey(
            lambda sc1, sc2: (sc1[0] + sc2[0], sc1[1] + sc2[1])
        ).map(
            lambda id_sum_count: (id_sum_count[0], id_sum_count[1][0] / id_sum_count[1][1])
        ).collectAsMap()

        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
                centers[i] = ncenters[i]
                updated = True
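This snippet opens in the middle of closestCenter and omits parseVector. A minimal sketch of the missing pieces, modeled on the minDist helper shown in Example 26; the space-separated input format is an assumption:

def parseVector(line):
    # Assumed: one point per line, space-separated floats.
    return Vector([float(x) for x in line.strip().split()])

def closestCenter(p, centers):
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex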
Example #12
from __future__ import absolute_import
from __future__ import print_function
import sys
sys.path.append('../')
from dpark import DparkContext

dpark = DparkContext()

name = '/mfs/tmp/weblog-pre-20111019.csv'
name = '/mfs/tmp/weblog-20111019.csv'
name = '/tmp/weblog-20111019.csv.small'
#name = '/tmp/weblog-20111019.csv.medium'
pv = dpark.textFile(name)
pv = pv.map(lambda x:x.split(',')).map(lambda l:(l[3],l[7]))
pv = pv.flatMap(lambda i_u:(i_u[1].startswith('/movie') and [(i_u[0],2)]
        or i_u[1].startswith('/group') and [(i_u[0],3)]
        or []))
#print pv.take(50)
pv = pv.reduceByKey(lambda x,y:x*y)
#print pv.take(50)
print(pv.filter(lambda __y:__y[1]%2==0 and __y[1]%3==0).count())

#movie = pv.filter(lambda (bid,url): url.startswith('/movie')).reduceByKey(lambda x,y:None)
#group = pv.filter(lambda (bid,url): url.startswith('/group')).reduceByKey(lambda x,y:None)
#print movie.join(group).count()

#print pv.map(lambda x:x.split(',')[2]).uniq().count()
#print pv.map(lambda x:(x.split(',')[2],None)).reduceByKey(lambda x,y:None).count()
#.filter(lambda uid:uid)
#print upv.count()
#print upv.reduceByKey(lambda x,y:x+y).count()
Example #13
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
    D = 4
    K = 3
    IT = 10
    MIN_DIST = 0.01
    centers = [Vector([random.random() for j in range(D)]) for i in range(K)]
    points = dpark.textFile('kmeans_data.txt').map(parseVector).cache()

    for it in range(IT):
        print 'iteration', it
        mappedPoints = points.map(lambda p:(closestCenter(p, centers), (p, 1)))
        ncenters = mappedPoints.reduceByKey(
                lambda (s1,c1),(s2,c2): (s1+s2,c1+c2)
            ).map(
                lambda (id, (sum, count)): (id, sum/count)
            ).collectAsMap()
        
        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
                centers[i] = ncenters[i]
                updated = True
Example #14
import glob
from dpark import DparkContext

RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
TRAINING_PATH = '/nfs/wuhong/paracel/data/als_fm/train'
TEST_PATH = '/nfs/wuhong/paracel/data/als_fm/test'

dpark = DparkContext()


def local_filter1(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return False
    return True


def local_filter2(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return True
    return False


dpark.textFile(
    glob.glob(RATING_PATH)).filter(local_filter1).saveAsTextFile(TRAINING_PATH)

dpark.textFile(
    glob.glob(RATING_PATH)).filter(local_filter2).saveAsTextFile(TEST_PATH)
Example #15
File: wc.py Project: qygaojs/dpark
from dpark import DparkContext, optParser

dc = DparkContext()
options, args = optParser.parse_args()
infile = args[0]
outfile = args[1]
print("from {} to {}".format(infile, outfile))


def fm(x):
    for w in x.strip().split():
        yield (w, 1)


(dc.textFile(infile).flatMap(fm).reduceByKey(
    lambda x, y: x + y,
    numSplits=6).map(lambda x: " ".join(list(map(str, x)))).saveAsTextFile(
        outfile, overwrite=False))
Example #16
#!/usr/bin/env python
# encoding: utf-8
""" 
@version: v1.0 
@author: W_H_J 
@license: Apache Licence  
@contact: [email protected] 
@site:  
@software: PyCharm 
@file: wordcount.py 
@time: 2018/6/5 18:10 
@describe: word count
"""
from dpark import DparkContext
ctx = DparkContext()
file = ctx.textFile("./words.txt")
words = file.flatMap(lambda x: x.split()).map(lambda x: (x, 1))
wc = words.reduceByKey(lambda x, y: x + y).collectAsMap()
print(wc)


# Count how many lines contain a given word
def word_count(file_path, word):
    # Talk to a specific Mesos master
    dpark = DparkContext()

    # Build a file RDD from the distributed file, with 16 MB splits
    f = dpark.textFile(file_path, splitSize=16 << 20)

    # Transform with map() into a new RDD, filter it down to matching lines, then return the result with count()
    print(
        word, 'count:',
        f.map(lambda line: line.strip()).filter(
            lambda line: word in line).count())
Example #17
    outEdges = [Edge(ref) for ref in refs]
    return (title, Vertex(title, 1.0/numV, outEdges, True))

def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and abs(newValue-self.value) < epsilon) or superstep > 30
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute

if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    
    dpark = DparkContext()
    input = dpark.textFile(inputFile)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
        gen_compute(numVertex, epsilon))

    for v in result.filter(lambda x:x.value > threshold).collect():
        print v.id, v.value
Example #18
import sys
sys.path.append('../')
import logging
from dpark import DparkContext

dpark = DparkContext()

name = 'rating.txt'

def parse(line):
    sid, uid, r, f = line.split('\t')
    defaults = {'F':4.5, 'P':3.7, 'N':4.0}
    if r == 'None':
        r = defaults[f]
    return (sid, (uid, float(r)))
rating = dpark.textFile(name, numSplits=2).map(parse).groupByKey(2)#.cache()
#print 'us', rating.first()
print rating.count()

def reverse(it):
    s = {}
    for k, us in it:
        for u,r in us:
            s.setdefault(u, {})[k] = r
    return s

def vsum(a, b):
#    return 1
    if len(a) < len(b):
        a, b = b, a
    d = dict(a)
Example #19
File: demo.py Project: douban/dpark
import os
import sys
from pprint import pprint

from six.moves import map
from six.moves import range
from six.moves import zip

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

dpark = DparkContext()

# range
nums = dpark.parallelize(list(range(100)), 4)
print(nums.count())
print(nums.reduce(lambda x, y: x + y))

# text search
f = dpark.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print('logging', log.count())
print('error', log.filter(lambda line: 'error' in line).count())
for line in log.filter(lambda line: 'error' in line).collect():
    print(line)

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda __v1: __v1[1] > 50).collectAsMap())
pprint(sorted(counts.filter(lambda __v: __v[1] > 20).map(lambda x_y: (x_y[1], x_y[0])).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi
import random
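The example is cut off at the Pi section. A minimal Monte Carlo sketch in the same style, offered as an illustration rather than the original file's code:

N = 100000
count = (dpark.parallelize(list(range(N)), 4)
         .map(lambda _: (random.random(), random.random()))
         .filter(lambda xy: xy[0] * xy[0] + xy[1] * xy[1] < 1.0)
         .count())
print('Pi is roughly', 4.0 * count / N)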
Example #20
import random
import os, sys
from pprint import pprint

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext="py").map(lambda x: x.strip())
log = f.filter(lambda line: "logging" in line).cache()
print "logging", log.count()
print "error", log.filter(lambda line: "error" in line).count()
for line in log.filter(lambda line: "error" in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(sorted(counts.filter(lambda (_, v): v > 20).map(lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi
import random
Example #21
File: wc.py Project: douban/dpark
from dpark import DparkContext, optParser

dc = DparkContext()
options, args = optParser.parse_args()
infile = args[0]
outfile = args[1]
print("from {} to {}".format(infile, outfile))


def fm(x):
    for w in x.strip().split():
        yield (w, 1)


(dc.textFile(infile)
    .flatMap(fm)
    .reduceByKey(lambda x, y: x + y, numSplits=6)
    .map(lambda x: " ".join(list(map(str, x))))
    .saveAsTextFile(outfile, overwrite=False))
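As a quick illustration with hypothetical data: for an input file containing the single line "to be or not to be", the job above would write records such as the following (ordering across splits is not guaranteed):

to 2
be 2
or 1
not 1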
Example #22
    def get_rdd(self):
        dpark = DparkContext()

        return dpark.union([
            dpark.textFile(path, splitSize=64 << 20) for path in self.paths
        ]).map(Weblog.from_line)
Example #23
    outEdges = [SPEdge(tid, int(v)) 
        for _, tid, v in lines]
    return (id, SPVertex(id, sys.maxint, outEdges, True))

def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [SPMessage(edge.target_id, newValue + edge.value)
                for edge in self.outEdges]
    else:
        outbox = []
    return SPVertex(self.id, newValue, self.outEdges, False), outbox

if __name__ == '__main__':
    ctx = DparkContext()
    lines = ctx.textFile('graph.txt').map(lambda line:line.split(' '))
    vertices = lines.filter(lambda x:len(x)==3).groupBy(
        lambda line:line[0]).map(to_vertex)
    messages = lines.filter(lambda x:len(x)==2).map(
        lambda (vid, v): (vid, SPMessage(vid, int(v)))
    )
    print 'read', vertices.count(), 'vertices and ', messages.count(), 'messages.'

    result = Bagel.run(ctx, vertices, messages, compute, MinCombiner())
    startVertex = 0
    print 'Shortest path from %s to all vertices:' % startVertex
    for v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print v.id, v.value
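The layout of graph.txt is implied by the two filters above: three-field lines are weighted edges ("source target weight"), and two-field lines seed the starting distances. A hypothetical four-line input for illustration, seeding vertex 0 with distance 0:

0 1 4
0 2 1
2 1 2
0 0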
Example #24
def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [
            SPMessage(edge.target_id, newValue + edge.value)
            for edge in self.outEdges
        ]
    else:
        outbox = []
    return SPVertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    lines = ctx.textFile('graph.txt').map(lambda line: line.split(' '))
    vertices = lines.filter(lambda x: len(x) == 3).groupBy(
        lambda line: line[0]).map(to_vertex)
    messages = lines.filter(lambda x: len(x) == 2).map(
        lambda (vid, v): (vid, SPMessage(vid, int(v))))
    print 'read', vertices.count(), 'vertices and ', messages.count(), 'messages.'

    result = Bagel.run(ctx, vertices, messages, compute, MinCombiner())
    startVertex = 0
    print 'Shortest path from %s to all vertices:' % startVertex
    for v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print v.id, v.value
Example #25
f_global = file(MU_PATH)
line = ''
for l in f_global:
    line = l
mu = float(line.strip().split('\t')[1])
f_global.close()
mu = dpark.broadcast(mu)

def local_mapper(line):
    iid, v, _ = line.strip().split('\t')
    return (iid, float(v))

ibias = {}
ibias = dpark.textFile(glob.glob(IBIAS_PATH)).map(
        local_mapper
    ).collectAsMap()
ibias = dpark.broadcast(ibias)

def local_mapper2(line):
    uid, iid, aid, v = line.strip().split('\t')
    return '%s,%s,%s\n' % (uid , iid, float(v) - mu - ibias[iid])

# generate new rating data
dpark.textFile(glob.glob(RATING_PATH)).filter(
        lambda line: ibias.get(line.strip().split('\t')[1])
    ).map(
        local_mapper2
    ).saveAsTextFile(NEW_RATING_PATH)

def local_mapper3(line):
Example #26
    return bestIndex

def minDist(p, centers):
    bestDist = p.squaredDist(centers[0])
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
    return bestDist

if __name__ == '__main__':
    K = 100
    IT = 50
    MIN_DIST = 0.01
    PATH = 'tab/1558dee2ecfb7a0f9f63e27376675b6c.tab'
    points = dpark.textFile(PATH, numSplits=100)[:1].map(parseVector).cache()
    print points.count()
    centers = points.take(K)

    for it in range(IT):
        print 'iteration', it
        mappedPoints = points.map(lambda p:(closestCenter(p, centers), (p, 1)))
        ncenters = mappedPoints.reduceByKey(
                lambda (s1,c1),(s2,c2): (s1+s2,c1+c2)
            ).map(
                lambda (id, (sum, count)): (id, sum/count)
            ).collectAsMap()
        
        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
Example #27
def DownLoad(file_path):
    dpark = DparkContext()
    file_block = dpark.textFile(file_path, splitSize=16 << 20)
    file_block.foreach(write_to_wav)
Example #28
from dpark import DparkContext

dpark = DparkContext()

name = 'rating.txt'


def parse(line):
    sid, uid, r, f = line.split('\t')
    defaults = {'F': 4.5, 'P': 3.7, 'N': 4.0}
    if r == 'None':
        r = defaults[f]
    return (sid, (uid, float(r)))


rating = dpark.textFile(name, numSplits=2).map(parse).groupByKey(2)  #.cache()
#print 'us', rating.first()
print rating.count()


def reverse(it):
    s = {}
    for k, us in it:
        for u, r in us:
            s.setdefault(u, {})[k] = r
    return s


def vsum(a, b):
    #    return 1
    if len(a) < len(b):
Example #29
    return (id, Vertex(id, sys.maxint, outEdges, True))


def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [(edge.target_id, newValue + edge.value)
                  for edge in self.outEdges]
    else:
        outbox = []
    return Vertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'graph.txt')
    lines = ctx.textFile(path).map(lambda line: line.split(' '))
    vertices = lines.groupBy(lambda line: line[0]).map(to_vertex)
    startVertex = str(0)
    messages = ctx.makeRDD([(startVertex, 0)])

    print('read', vertices.count(), 'vertices and ', messages.count(), 'messages.')

    result = Bagel.run(ctx, vertices, messages, compute, BasicCombiner(min), numSplits=2)

    print('Shortest path from %s to all vertices:' % startVertex)
    for id, v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print(v.id, v.value)
Example #30
def minDist(p, centers):
    bestDist = p.squaredDist(centers[0])
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
    return bestDist


if __name__ == '__main__':
    K = 100
    IT = 50
    MIN_DIST = 0.01
    PATH = 'tab/1558dee2ecfb7a0f9f63e27376675b6c.tab'
    points = dpark.textFile(PATH, numSplits=100)[:1].map(parseVector).cache()
    print points.count()
    centers = points.take(K)

    for it in range(IT):
        print 'iteration', it
        mappedPoints = points.map(lambda p: (closestCenter(p, centers),
                                             (p, 1)))
        ncenters = mappedPoints.reduceByKey(
            lambda (s1, c1), (s2, c2): (s1 + s2, c1 + c2)).map(
                lambda (id, (sum, count)): (id, sum / count)).collectAsMap()

        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
                centers[i] = ncenters[i]
Example #31
import math
import random
import os, sys
from pprint import pprint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print 'logging', log.count()
print 'error', log.filter(lambda line: 'error' in line).count()
for line in log.filter(lambda line: 'error' in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(
    sorted(
        counts.filter(lambda (_, v): v > 20).map(
            lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))
Example #32
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [
            Message(edge.target_id, newValue / len(self.outEdges))
            for edge in self.outEdges
        ] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox

    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01

    dpark = DparkContext()
    input = dpark.textFile(inputFile)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))

    for v in result.filter(lambda x: x.value > threshold).collect():
        print v.id, v.value