# Flask view: save the uploaded file, then use DPark to count how often
# each ASCII letter occurs in it and store the counts.
def upload():
    if request.method == 'POST':
        file = request.files['file']
        if file:
            filename = secure_filename(file.filename)
            file.save(os.path.join(UPLOAD_FOLDER, filename))
            f = dpark.textFile(os.path.join(UPLOAD_FOLDER, filename))
            # each line is a string, so flatMap over it yields characters
            chs = f.flatMap(lambda x: x).filter(
                lambda x: x in string.ascii_letters).map(lambda x: (x, 1))
            wc = chs.reduceByKey(lambda x, y: x + y).collectAsMap()
            db.LogProc.save(wc)
    return redirect('/')
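# A minimal sketch of the surrounding wiring this view assumes; `app`, the
# value of UPLOAD_FOLDER, and the `db` handle (a MongoDB-style collection,
# judging by db.LogProc.save) do not appear in the snippet, so treat every
# value below as a placeholder.
import os
import string

import dpark
import pymongo
from flask import Flask, request, redirect
from werkzeug.utils import secure_filename

app = Flask(__name__)
UPLOAD_FOLDER = '/tmp/uploads'
db = pymongo.MongoClient().test  # placeholder database for db.LogProc.save

# register the view; POSTs to /upload are processed, then redirected to /
app.add_url_rule('/upload', 'upload', upload, methods=['GET', 'POST'])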
import random

import dpark

# k-means on DPark: repeatedly assign each point to its nearest center,
# then move every center to the mean of its points, until no center moves
# farther than MIN_DIST.
def closestCenter(p, centers):
    # start with the first center as the best guess
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex

if __name__ == '__main__':
    D = 4            # dimensions per point
    K = 3            # number of clusters
    IT = 10          # maximum iterations
    MIN_DIST = 0.01  # convergence threshold per center
    centers = [Vector([random.random() for j in range(D)]) for i in range(K)]
    points = dpark.textFile('kmeans_data.txt').map(parseVector).cache()
    for it in range(IT):
        print('iteration', it)
        mappedPoints = points.map(lambda p: (closestCenter(p, centers), (p, 1)))
        # sum the points and counts per center, then divide to get each mean
        ncenters = mappedPoints.reduceByKey(
            lambda a, b: (a[0] + b[0], a[1] + b[1])
        ).map(
            lambda kv: (kv[0], kv[1][0] / kv[1][1])
        ).collectAsMap()
        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
                centers[i] = ncenters[i]
                updated = True
        if not updated:
            break
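# The k-means snippet calls Vector and parseVector without defining them
# (they live elsewhere in the original file); a hedged sketch of minimal
# stand-ins covering exactly the operations the loop uses:
import math

class Vector(object):
    """Dense float vector with the few operations the k-means loop needs."""
    def __init__(self, values):
        self.values = list(values)
    def __add__(self, other):
        return Vector(a + b for a, b in zip(self.values, other.values))
    def __truediv__(self, n):
        return Vector(a / n for a in self.values)
    def squaredDist(self, other):
        return sum((a - b) ** 2 for a, b in zip(self.values, other.values))
    def dist(self, other):
        return math.sqrt(self.squaredDist(other))

def parseVector(line):
    # one whitespace-separated float per dimension
    return Vector(float(x) for x in line.strip().split())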
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dpark

lines = dpark.textFile("/usr/share/dict/words", 128)
#lines = dpark.textFile("/usr/share/doc/gcc-4.4.7/README.Portability", 128)
words = lines.flatMap(lambda x: x.split()).map(lambda x: (x, 1))
wc = words.reduceByKey(lambda x, y: x + y).collectAsMap()
# wc.keys is a method, not a list, so wc[wc.keys[0]] raises a TypeError;
# take an arbitrary key instead
first = next(iter(wc))
print(first, wc[first])
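# collectAsMap returns an ordinary dict, so the result can be explored with
# plain Python; e.g. the ten most frequent words (an illustrative follow-up,
# not part of the original example):
for word, count in sorted(wc.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(word, count)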
import dpark
import math

PRIOR_COUNT = 10
PRIOR_CORRELATION = 0

FILE_SUFFIX = 'ml-100k/'
TRAIN_FILENAME = 'ua.base'
TEST_FILENAME = 'ua.test'
MOVIES_FILENAME = 'u.item'

def _split_movie(line):
    fields = line.split('|')
    return int(fields[0]), fields[1]

movies = dpark.textFile(FILE_SUFFIX + MOVIES_FILENAME).map(_split_movie)
movie_names = movies.collectAsMap()

ratings = dpark.textFile(FILE_SUFFIX + TRAIN_FILENAME)

def _split_rating(line):
    fields = line.split('\t')
    return int(fields[0]), int(fields[1]), int(fields[2])

# group ratings by movie id and count how many users rated each movie
num_raters_perMovie = ratings.map(_split_rating).groupBy(lambda r: r[1])\
    .map(lambda kv: (kv[0], len(kv[1])))
rating_with_size = ratings.map(_split_rating).groupBy(lambda r: r[1])\
    .join(num_raters_perMovie)

def _map_fields(kv):
    # kv is (movie_id, ([(user, movie, rating), ...], num_raters));
    # attach the rater count to every rating triple
    return [(f[0], f[1], f[2], kv[1][1]) for f in kv[1][0]]

# flatten back to (user, movie, rating, num_raters) tuples
rating_with_size = rating_with_size.flatMap(_map_fields)
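# PRIOR_COUNT and PRIOR_CORRELATION are unused in the fragment above; they
# match the shrinkage step of the well-known MovieLens item-similarity
# example, so presumably the file continues along those lines. A hedged
# sketch of that step (regularized_correlation is an assumed name, not
# from the snippet):
def regularized_correlation(corr, num_common_raters,
                            virtual_count=PRIOR_COUNT,
                            prior=PRIOR_CORRELATION):
    # blend the raw correlation with a prior, weighted by how many users
    # actually rated both movies; few common raters pull it toward the prior
    w = float(num_common_raters) / (num_common_raters + virtual_count)
    return w * corr + (1 - w) * prior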
# -*- coding: utf-8 -*-
import dpark

# count word frequencies in a local text file
lines = dpark.textFile('./words.txt')
words = lines.flatMap(lambda x: x.split()).map(lambda x: (x, 1))
wc = words.reduceByKey(lambda x, y: x + y).collectAsMap()
print(wc)
import sys
sys.path.append('../')
import logging
import dpark

name = 'rating.txt'

def parse(line):
    # each line is: song id, user id, rating, flag; a missing rating
    # ('None') is replaced with a per-flag default
    sid, uid, r, f = line.strip().split('\t')
    defaults = {'F': 4.5, 'P': 3.7, 'N': 4.0}
    if r == 'None':
        r = defaults[f]
    return (sid, (uid, float(r)))

rating = dpark.textFile(name, numSplits=2).map(parse).groupByKey(2)  # .cache()
# print('us', rating.first())
print(rating.count())

def reverse(it):
    # invert (song -> [(user, rating)]) into user -> {song: rating}
    s = {}
    for k, us in it:
        for u, r in us:
            s.setdefault(u, {})[k] = r
    return s

def vsum(a, b):
    # return 1
    if len(a) < len(b):
        a, b = b, a
    d = dict(a)
    s = 0
    for u, r in b:
        # the original is cut off here; iterating the shorter vector and
        # building a dict of the longer strongly suggests a sparse dot
        # product, so that is what this reconstruction computes
        s += r * d.get(u, 0)
    return s
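# With the reconstructed tail, vsum is a sparse dot product over
# (user, rating) pairs; a quick sanity check on hand-made vectors:
a = [('u1', 4.0), ('u2', 3.5), ('u3', 5.0)]
b = [('u1', 5.0), ('u3', 2.0)]
print(vsum(a, b))  # 4.0*5.0 + 5.0*2.0 = 30.0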
import math
import random
import os, sys
from pprint import pprint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import dpark

# range
nums = dpark.parallelize(range(100), 4)
print(nums.count())
print(nums.reduce(lambda x, y: x + y))

# text search
f = dpark.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print('logging', log.count())
print('error', log.filter(lambda line: 'error' in line).count())
for line in log.filter(lambda line: 'error' in line).collect():
    print(line)

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda kv: kv[1] > 50).collectAsMap())
pprint(sorted(counts.filter(lambda kv: kv[1] > 20).map(lambda kv: (kv[1], kv[0])).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi
def rand(i):
    x = random.random()
    y = random.random()
    # the original is cut off here; presumably it tests whether the sampled
    # point falls inside the unit quarter circle
    return 1 if x * x + y * y < 1 else 0
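# A hedged guess at how the truncated Pi example finishes: sample N points,
# count how many land inside the unit quarter circle, and scale by 4.
N = 100000
hits = dpark.parallelize(range(N), 4).map(rand).reduce(lambda x, y: x + y)
print('Pi ~=', 4.0 * hits / N)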