Example #1
def upload():
    if request.method == 'POST':
        file = request.files['file']
        if file:
            # sanitize the filename, save the upload, then letter-count it with dpark
            filename = secure_filename(file.filename)
            file.save(os.path.join(UPLOAD_FOLDER, filename))
            f = dpark.textFile(os.path.join(UPLOAD_FOLDER, filename))
            chs = f.flatMap(lambda x: x).filter(lambda x: x in string.ascii_letters).map(lambda x: (x, 1))
            wc = chs.reduceByKey(lambda x, y: x + y).collectAsMap()
            db.LogProc.save(wc)
    return redirect('/')
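
# Note (inferred, not part of the original snippet): upload() assumes Flask
# scaffolding around it. A minimal sketch of the assumed setup; UPLOAD_FOLDER
# and the route are guesses, and `db` must be something pymongo-like that
# exposes db.LogProc.save:
#
#     import os
#     import string
#     import dpark
#     from flask import Flask, request, redirect
#     from werkzeug.utils import secure_filename
#
#     app = Flask(__name__)
#     UPLOAD_FOLDER = '/tmp/uploads'
#
#     @app.route('/', methods=['GET', 'POST'])
#     def upload():
#         ...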
Example #2
def upload():
    if request.method == 'POST':
        file = request.files['file']
        if file:
            filename = secure_filename(file.filename)
            file.save(os.path.join(UPLOAD_FOLDER, filename))
            f = dpark.textFile(os.path.join(UPLOAD_FOLDER, filename))
            chs = f.flatMap(lambda x: x).filter(
                lambda x: x in string.ascii_letters).map(lambda x: (x, 1))
            wc = chs.reduceByKey(lambda x, y: x + y).collectAsMap()
            db.LogProc.save(wc)
    return redirect('/')
Example #3
def closestCenter(p, centers):
    # find the index of the center nearest to point p
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
    D = 4
    K = 3
    IT = 10
    MIN_DIST = 0.01
    centers = [Vector([random.random() for j in range(D)]) for i in range(K)]
    points = dpark.textFile('kmeans_data.txt').map(parseVector).cache()

    for it in range(IT):
        print('iteration', it)
        mappedPoints = points.map(lambda p: (closestCenter(p, centers), (p, 1)))
        ncenters = mappedPoints.reduceByKey(
                lambda a, b: (a[0] + b[0], a[1] + b[1])
            ).map(
                lambda kv: (kv[0], kv[1][0] / kv[1][1])
            ).collectAsMap()

        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
                centers[i] = ncenters[i]
                updated = True
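
# Inferred helpers (not shown in this snippet): the k-means driver above calls
# parseVector and a Vector type with squaredDist, dist, +, and / by a count.
# A minimal sketch consistent with those calls:
import math


class Vector(object):
    def __init__(self, values):
        self.values = list(values)

    def __add__(self, other):
        return Vector(v + w for v, w in zip(self.values, other.values))

    def __truediv__(self, n):
        return Vector(v / n for v in self.values)

    def squaredDist(self, other):
        return sum((v - w) ** 2 for v, w in zip(self.values, other.values))

    def dist(self, other):
        return math.sqrt(self.squaredDist(other))


def parseVector(line):
    # one whitespace-separated vector of floats per input line
    return Vector(float(x) for x in line.split())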
Example #4
def closestCenter(p, centers):
    # find the index of the center nearest to point p
    bestDist = p.squaredDist(centers[0])
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
    D = 4
    K = 3
    IT = 10
    MIN_DIST = 0.01
    centers = [Vector([random.random() for j in range(D)]) for i in range(K)]
    points = dpark.textFile('kmeans_data.txt').map(parseVector).cache()

    for it in range(IT):
        print('iteration', it)
        mappedPoints = points.map(lambda p: (closestCenter(p, centers),
                                             (p, 1)))
        ncenters = mappedPoints.reduceByKey(
            lambda a, b: (a[0] + b[0], a[1] + b[1])
        ).map(
            lambda kv: (kv[0], kv[1][0] / kv[1][1])
        ).collectAsMap()

        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
                centers[i] = ncenters[i]
                updated = True
        if not updated:
            break
Example #5
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dpark

lines = dpark.textFile("/usr/share/dict/words", 128)
#lines = dpark.textFile("/usr/share/doc/gcc-4.4.7/README.Portability", 128)
words = lines.flatMap(lambda x: x.split()).map(lambda x: (x, 1))
wc = words.reduceByKey(lambda x, y: x + y).collectAsMap()
print(wc[list(wc.keys())[0]])
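
# Usage note (not in the original): collectAsMap() returns a plain dict, so
# the most frequent words can be listed with ordinary Python:
for word, n in sorted(wc.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(word, n)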
Example #6
import sys
sys.path.append('../')
import logging
import dpark

name = 'rating.txt'


def parse(line):
    sid, uid, r, f = line.strip().split('\t')
    defaults = {'F': 4.5, 'P': 3.7, 'N': 4.0}
    if r == 'None':
        r = defaults[f]
    return (sid, (uid, float(r)))
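# e.g. (illustration, not from the original file):
#   parse('s1\tu1\tNone\tF') -> ('s1', ('u1', 4.5))
#   parse('s1\tu2\t3.0\tP')  -> ('s1', ('u2', 3.0))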


rating = dpark.textFile(name, numSplits=2).map(parse).groupByKey(2)  #.cache()
# print('us', rating.first())
print(rating.count())


def reverse(it):
    s = {}
    for k, us in it:
        for u, r in us:
            s.setdefault(u, {})[k] = r
    return s


def vsum(a, b):
    #    return 1
    if len(a) < len(b):
Example #7
import dpark
import math
PRIOR_COUNT = 10
PRIOR_CORRELATION = 0
FILE_SUFFIX = 'ml-100k/'
TRAIN_FILENAME = 'ua.base'
TEST_FILENAME = 'ua.test'
MOVIES_FILENAME = 'u.item'


def _split_movie(line):
    fields = line.split('|')
    return int(fields[0]), fields[1]


movies = dpark.textFile(FILE_SUFFIX + MOVIES_FILENAME).map(_split_movie)
movie_names = movies.collectAsMap()
ratings = dpark.textFile(FILE_SUFFIX + TRAIN_FILENAME)


def _split_rating(line):
    fields = line.split('\t')
    return int(fields[0]), int(fields[1]), int(fields[2])


num_raters_perMovie = ratings.map(_split_rating).groupBy(lambda r: r[1])\
    .map(lambda g: (g[0], len(g[1])))
rating_with_size = ratings.map(_split_rating).groupBy(lambda r: r[1])\
    .join(num_raters_perMovie)


def _map_fields(line):
    # flatten one (movie, (ratings, num_raters)) record back into
    # (user, movie, rating, num_raters) tuples
    return [(f[0], f[1], f[2], line[1][1]) for f in line[1][0]]

rating_with_size = rating_with_size.flatMap(_map_fields)
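
# For reference (inferred shapes, not in the original): after groupBy and
# join, each record is (movie_id, ([(user, movie, rating), ...], num_raters));
# _map_fields flattens it into per-rating tuples:
joined = (50, ([(1, 50, 5), (2, 50, 4)], 2))
assert _map_fields(joined) == [(1, 50, 5, 2), (2, 50, 4, 2)]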
Example #8
# -*- coding: utf-8 -*-
import dpark

f = dpark.textFile('./words.txt')
words = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1))
wc = words.reduceByKey(lambda x, y: x + y).collectAsMap()
print(wc)
Example #9
import sys
sys.path.append('../')
import logging
import dpark

name = 'rating.txt'

def parse(line):
    sid, uid, r, f = line.split('\t')
    defaults = {'F': 4.5, 'P': 3.7, 'N': 4.0}
    if r == 'None':
        r = defaults[f]
    return (sid, (uid, float(r)))
rating = dpark.textFile(name, numSplits=2).map(parse).groupByKey(2)  # .cache()
# print('us', rating.first())
print(rating.count())

def reverse(it):
    s = {}
    for k, us in it:
        for u, r in us:
            s.setdefault(u, {})[k] = r
    return s

def vsum(a, b):
#    return 1
    if len(a) < len(b):
        a, b = b, a
    d = dict(a)
    s = 0
    for u, r in b:
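# The snippet is cut off here. From the setup (d = dict(a), accumulator s),
# vsum evidently computes a sparse dot product of two users' rating vectors;
# a hedged guess at the missing remainder, for readability only:
#         if u in d:
#             s += d[u] * r
#     return s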
Example #10
import math
import random
import os
import sys
from pprint import pprint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import dpark

# range
nums = dpark.parallelize(range(100), 4)
print(nums.count())
print(nums.reduce(lambda x, y: x + y))

# text search
f = dpark.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print('logging', log.count())
print('error', log.filter(lambda line: 'error' in line).count())
for line in log.filter(lambda line: 'error' in line).collect():
    print(line)

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda kv: kv[1] > 50).collectAsMap())
pprint(sorted(counts.filter(lambda kv: kv[1] > 20).map(lambda kv: (kv[1], kv[0])).groupByKey().collect()))
counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/")

# Pi
def rand(i):
    # sample one point in the unit square; count a hit when it falls inside
    # the quarter circle (Monte Carlo estimate of pi)
    x = random.random()
    y = random.random()
    return 1 if x * x + y * y < 1 else 0
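
# A hedged sketch of how this Monte Carlo estimate is typically driven with
# dpark (N and the split count here are assumed, not from the original):
N = 100000
count = dpark.parallelize(range(N), 4).map(rand).reduce(lambda x, y: x + y)
print('Pi is roughly', 4.0 * count / N)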