Example #1
0
def bf_string(x, y):
    """Look up the NYC borough for string-typed coordinates.

    Coerces both arguments to float before delegating to
    borough_finder.find_borough.
    """
    lat, lon = float(x), float(y)
    return borough_finder.find_borough(lat, lon)
Example #2
0
import borough_finder
import time

# Micro-benchmark: verify find_borough returns the expected borough name
# for one known coordinate pair per borough, looping many times.
print "one million borough lookups"

start_time = time.time()

# NOTE(review): each iteration performs 5 lookups, so 1000000 / 6
# iterations yields ~833k lookups rather than the one million the banner
# claims — confirm whether the divisor was meant to be 5.  (Python 2
# integer division, so the bound is exactly 166666.)
# NOTE(review): start_time is never reported in the visible portion of
# this snippet; the timing printout appears to have been truncated.
for i in range(0, 1000000 / 6):
    #should be Bronx
    if borough_finder.find_borough(40.854453, -73.854218) != 'The Bronx':
        print "Error - didn't find Bronx on loop %d" % i
        exit()

    #should be Staten island
    if borough_finder.find_borough(40.591136, -74.132996) != 'Staten Island':
        print "Error - didn't find Staten Island on loop %d" % i
        exit()

    #should be Brooklyn
    if borough_finder.find_borough(40.629709, -73.936615) != "Brooklyn":
        print "Error - didn't find Brooklyn on loop %d" % i
        exit()

    #should be Manhattan
    if borough_finder.find_borough(40.791061, -73.950348) != "Manhattan":
        print "Error - didn't find Manhattan on loop %d" % i
        exit()

    #should be Queens
    # NOTE(review): unlike the checks above, this one has no exit() and
    # the snippet ends here — the tail of this example looks truncated.
    if borough_finder.find_borough(40.713036, -73.874817) != "Queens":
        print "Error - didn't find Queens on loop %d" % i
from pyspark import SparkContext, SparkConf
import borough_finder  #Needs numpy and matplotlib installed via pip
import time

# Spark context for a local run.  (An explicit SparkConf() with
# setMaster/setAppName/executor-memory settings would work equally well.)
# beware - the shipped module path below may need to be made absolute on
# other machines.
sc = SparkContext("local", "myApp", pyFiles=['borough_finder.py'])

# One representative (lat, lon) point per borough, tiled out to ten
# million rows.  Normally these rows would be read from Hive tables.
borough_points = [
    (40.854453, -73.854218),  # The Bronx
    (40.591136, -74.132996),  # Staten Island
    (40.629709, -73.936615),  # Brooklyn
    (40.791061, -73.950348),  # Manhattan
    (40.713036, -73.874817),  # Queens
]
data = borough_points * 2000000
distData = sc.parallelize(data)
print("About to do 10 million conversions")
the_time = time.time()

# Map every coordinate pair to its borough name and pull the results
# back to the driver.
j = distData.map(lambda pt: borough_finder.find_borough(pt[0], pt[1])).collect()

print("Took %d seconds " % (time.time() - the_time))
Example #4
0
print("loading textfile")

# Raw CSV of coordinate pairs, one record per line.
distData = sc.textFile('file:///home/w205/longlat.csv')

print("splitting")
split_records = distData.map(lambda line: line.split(','))

the_time = time.time()

#And putting the updated output into new tables
print("starting mapping")
# Argument order is swapped below: the CSV apparently stores the
# coordinates as (longitude, latitude) while find_borough takes
# (lat, lon) — confirm against the input file.
j = split_records.map(
    lambda rec: (rec[0],
                 rec[1],
                 borough_finder.find_borough(forceAFloat(rec[1]),
                                             forceAFloat(rec[0]))))

# One Spark accumulator per possible find_borough result, used to tally
# lookups across the workers.
accumManhattan = sc.accumulator(0)
accumTheBronx = sc.accumulator(0)
accumQueens = sc.accumulator(0)
accumBrooklyn = sc.accumulator(0)
accumStatenIsland = sc.accumulator(0)
accumNone = sc.accumulator(0)
accumMystery = sc.accumulator(0)

accumHash = {
    "None": accumNone,
    "Manhattan": accumManhattan,
    "The Bronx": accumTheBronx,
    "Staten Island": accumStatenIsland,