#wrapper so find_borough can be called on string coordinates straight
#from a split text record
def bf_string(x, y):
    return borough_finder.find_borough(float(x), float(y))
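A quick sanity check of the wrapper, reusing the Manhattan point from the test script below. The (lon, lat) string tuple mimicking a split CSV record is an assumption, not part of the original listing:

import borough_finder

def bf_string(x, y):
    return borough_finder.find_borough(float(x), float(y))

#a record as it might arrive from a split CSV line: (lon, lat) as strings
record = ("-73.950348", "40.791061")
#swap to (lat, lon) for the lookup, as the textfile script below does
print bf_string(record[1], record[0]) #expected: Manhattan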
import borough_finder
import time

print "one million borough lookups"
start_time = time.time()
for i in range(0, 1000000 / 6):
    #should be Bronx
    if borough_finder.find_borough(40.854453, -73.854218) != 'The Bronx':
        print "Error - didn't find Bronx on loop %d" % i
        exit()
    #should be Staten Island
    if borough_finder.find_borough(40.591136, -74.132996) != 'Staten Island':
        print "Error - didn't find Staten Island on loop %d" % i
        exit()
    #should be Brooklyn
    if borough_finder.find_borough(40.629709, -73.936615) != "Brooklyn":
        print "Error - didn't find Brooklyn on loop %d" % i
        exit()
    #should be Manhattan
    if borough_finder.find_borough(40.791061, -73.950348) != "Manhattan":
        print "Error - didn't find Manhattan on loop %d" % i
        exit()
    #should be Queens
    if borough_finder.find_borough(40.713036, -73.874817) != "Queens":
        print "Error - didn't find Queens on loop %d" % i
        exit()
#report the total elapsed time
print "Took %d seconds" % (time.time() - start_time)
from pyspark import SparkContext, SparkConf
import borough_finder #needs numpy and matplotlib installed via pip
import time

#conf = (SparkConf()
#        .setMaster("local")
#        .setAppName("My app")
#        .set("spark.executor.memory", "1g"))
#sc = SparkContext(conf = conf)

#beware - need to change the absolute file path here
sc = SparkContext("local", "myApp", pyFiles=['borough_finder.py'])

data = [
    (40.854453, -73.854218), #bronx
    (40.591136, -74.132996), #staten island
    (40.629709, -73.936615), #brooklyn
    (40.791061, -73.950348), #manhattan
    (40.713036, -73.874817)  #queens
] * 2000000
distData = sc.parallelize(data) #normally we'd be reading hive tables

print "About to do 10 million conversions"
the_time = time.time()
#And putting the updated output into new tables
j = distData.map(lambda s: borough_finder.find_borough(s[0], s[1])).collect()
print "Took %d seconds " % (time.time() - the_time)
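The comments above note that in a real pipeline the coordinates would come from Hive tables rather than sc.parallelize. A minimal sketch of what that might look like on a Spark 1.x HiveContext; the trip_pickups table and its lat/lon column names are assumptions for illustration:

from pyspark import SparkContext
from pyspark.sql import HiveContext
import borough_finder

sc = SparkContext("local", "myApp", pyFiles=['borough_finder.py'])
hc = HiveContext(sc)

#hypothetical source table with lat/lon columns
rows = hc.sql("SELECT lat, lon FROM trip_pickups")
#map the lookup over the resulting Rows, just like the parallelize version
boroughs = rows.map(lambda r: borough_finder.find_borough(r.lat, r.lon))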
print "loading textfile" distData = sc.textFile('file:///home/w205/longlat.csv') print "splitting" split_records = distData.map(lambda l: l.split(',')) the_time = time.time() #And putting the updated output into new tables print "starting mapping" j = split_records.map(lambda s: ( s[0], s[1], borough_finder.find_borough( forceAFloat(s[1]), #note swapping order here forceAFloat(s[0])))) accumManhattan = sc.accumulator(0) accumTheBronx = sc.accumulator(0) accumQueens = sc.accumulator(0) accumBrooklyn = sc.accumulator(0) accumStatenIsland = sc.accumulator(0) accumNone = sc.accumulator(0) accumMystery = sc.accumulator(0) accumHash = { "None": accumNone, "Manhattan": accumManhattan, "The Bronx": accumTheBronx, "Staten Island": accumStatenIsland,