def test_prequest_get_success_calls_aws_with_correct_url():
    """A plain successful GET must finish with a request to the parent API URL.

    Both the target URL and the parent API URL (formatted with ``False``)
    are mocked; the last request seen by the mocker has to be the latter.
    """
    parent_url = prequest.Prequest.PARENT_API_URL.format(url, False)
    with requests_mock.Mocker() as mocker:
        mocker.get(url)
        mocker.get(parent_url)

        prequest.get(url)

        assert mocker.last_request.url == parent_url
def execute(trial=False):
    """Download the MBTAPerformance JSON and write it to HDFS through Spark.

    The top-level keys of the downloaded object are dealt round-robin into
    three dicts, which become the three rows of the DataFrame that is
    written out as JSON.

    Args:
        trial: Accepted for interface compatibility; not used by this step.

    Returns:
        A dict with the ``'start'`` and ``'end'`` datetimes of the run.
    """
    start_time = datetime.datetime.now()

    print('Fetching MBTAPerformance data...')
    data_url = 'http://datamechanics.io/data/nathansw_rooday_sbajwa_shreyap/MBTAPerformance.json'
    response = requests.get(data_url).json()
    print('MBTAPerformance fetched!')

    # Key i goes to row i % 3, in the iteration order of the response.
    rows = [{}, {}, {}]
    for index, (key, value) in enumerate(response.items()):
        rows[index % 3][key] = value

    print('Saving MBTAPerformance data...')
    spark = SparkSession.builder.appName('save-mbta-performance').getOrCreate()
    try:
        df = spark.createDataFrame(rows)
        df.write.json('hdfs://project/hariri/cs591/mbta-performance.json')
    finally:
        # Always release the Spark session, even when the write fails.
        spark.stop()
    print('Done!')

    end_time = datetime.datetime.now()
    return {'start': start_time, 'end': end_time}
def execute(trial=False):
    """Fetch records from the NYC open-data resource and store them in Mongo.

    The request is limited to 1000 rows when ``trial`` is true and to 10000
    rows otherwise. The ``nyc311`` collection is dropped and recreated
    before the fetched documents are inserted.

    Returns:
        A dict with the ``"start"`` and ``"end"`` datetimes of the run.
    """
    started = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('fjansen', 'fjansen')

    token = dml.auth['services']['nycportal']['token']
    endpoint = 'https://data.cityofnewyork.us/resource/fhrw-4uyv.json'
    query = {'$limit': 1000 if trial else 10000, '$$app_token': token}

    reply = prequest.get(endpoint, params=query)
    documents = json.loads(reply.text)

    repo.dropCollection('nyc311')
    repo.createCollection('nyc311')
    repo['fjansen.nyc311'].insert_many(documents)

    repo.logout()

    finished = datetime.datetime.now()
    return {"start": started, "end": finished}
def test_get_success_returns_200():
    """A successful GET returns the mocked body with status code 200."""
    parent_url = prequest.Prequest.PARENT_API_URL.format(url, False)
    with requests_mock.Mocker() as mocker:
        mocker.get(url, text='resp')
        mocker.get(parent_url)

        response = prequest.get(url)

        assert response.text == 'resp'
        assert response.status_code == 200
def test_get_fail_and_cache_fail_returns_original_resp():
    """When the target answers 500 and the parent API answers 404,
    the caller gets the original failing response back."""
    cache_lookup_url = prequest.Prequest.PARENT_API_URL.format(url, True)
    with requests_mock.Mocker() as mocker:
        mocker.get(url, status_code=500)
        mocker.get(cache_lookup_url, json={'url': url2}, status_code=404)
        mocker.get(url2)

        response = prequest.get(url)

        assert response.url == url
        assert response.status_code == 500
def test_get_500_calls_cache():
    """When the target answers 500 and the parent API returns a URL,
    the response handed back comes from that URL with status 200."""
    cache_lookup_url = prequest.Prequest.PARENT_API_URL.format(url, True)
    with requests_mock.Mocker() as mocker:
        mocker.get(url, status_code=500)
        mocker.get(cache_lookup_url, json={'url': url2})
        mocker.get(url2)

        response = prequest.get(url)

        assert response.url == url2
        assert response.status_code == 200
def execute(trial=False):
    '''Retrieve three monthly fire-incident data sets, geocode every record
    and store the result in the "fires" collection.

    (Not using the API here for the sake of simplicity.)

    Each record's street address is geocoded, and a single document mapping
    incident number -> (month, latlng) is inserted into the collection.

    Args:
        trial: Accepted for interface compatibility; not used by this step.

    Returns:
        A dict with the "start" and "end" datetimes of the run.
    '''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('bemullen_crussack_dharmesh_vinwah', 'bemullen_crussack_dharmesh_vinwah')

    key = "fires"

    # Each source carries its month explicitly, instead of guessing the month
    # from individual characters of the URL after it has been parsed.
    monthly_sources = (
        ('september',
         'https://data.boston.gov/api/3/action/datastore_search?'
         'resource_id=14683ec2-c53a-46e0-b6de-67ec123629f0'),
        ('december',
         'https://data.boston.gov/api/3/action/datastore_search?'
         'resource_id=ce5cb864-bd01-4707-b381-9e204b4db73f'),
        ('may',
         'https://data.boston.gov/api/3/action/datastore_search?'
         'resource_id=9d91dbc7-9875-4cd9-a772-3b363a4b193f'),
    )
    # Parse every URL up front, as before, then fetch them in the same order.
    urls = [(month, RetrieveFire.parseURL(raw_url))
            for month, raw_url in monthly_sources]

    address = {}
    for month, url in urls:
        r = json.loads(prequest.get(url).text)

        # Tag each record with the month of its data set.
        for record in r['result']['records']:
            # Space-separated address; "MA" is now separated from the
            # neighborhood (it used to be glued on, e.g. "RoxburyMA").
            streetAddress = " ".join((
                record['Street Number'].strip(),
                record['Street Name'].strip(),
                record['Street Type'].strip(),
                record['Neighborhood'].strip(),
                "MA",
                record['Zip'].strip(),
            ))
            g = geocoder.google(streetAddress)
            address[record['Incident Number']] = (month, g.latlng)

    repo.dropCollection(key)
    repo.createCollection(key)
    # The whole mapping is stored as one document.
    repo['bemullen_crussack_dharmesh_vinwah.' + key].insert_many([address])

    repo.logout()

    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def execute(trial=False):
    """Download the stops JSON and write it to HDFS through Spark.

    Args:
        trial: Accepted for interface compatibility; not used by this step.

    Returns:
        A dict with the ``'start'`` and ``'end'`` datetimes of the run.
    """
    start_time = datetime.datetime.now()

    print('Fetching stops data...')
    data_url = 'http://datamechanics.io/data/nathansw_rooday_sbajwa_shreyap/stops.json'
    response = requests.get(data_url).json()
    print('stops data fetched!')

    print('Saving stops data...')
    spark = SparkSession.builder.appName('save-stops').getOrCreate()
    try:
        df = spark.createDataFrame(response)
        df.write.json('hdfs://project/hariri/cs591/stops.json')
    finally:
        # Always release the Spark session, even when the write fails.
        spark.stop()
    print('Done!')

    end_time = datetime.datetime.now()
    return {'start': start_time, 'end': end_time}
def execute(trial=False):
    """Fetch fire-incident records from data.boston.gov into ``fjansen.fires``.

    In trial mode a single resource is queried with a limit of 1000; otherwise
    three resources are queried with a limit of 10000 each. The ``fires``
    collection is dropped and recreated before the records are inserted.

    Args:
        trial: When true, fetch the reduced data set.

    Returns:
        A dict with the ``"start"`` and ``"end"`` datetimes of the run.
    """
    start_time = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('fjansen', 'fjansen')

    if trial:
        urls = [
            'https://data.boston.gov/api/3/action/datastore_search?resource_id=8f4f497e-d93c-4f2f-b754-bfc69e2700a0&limit=1000'
        ]
    else:
        urls = [
            'https://data.boston.gov/api/3/action/datastore_search?resource_id=8608b9db-71e2-4acb-9691-75b3c66fdd17&limit=10000',
            'https://data.boston.gov/api/3/action/datastore_search?resource_id=d969a70d-2734-4e75-b2ae-e64aec289892&limit=10000',
            'https://data.boston.gov/api/3/action/datastore_search?resource_id=8f4f497e-d93c-4f2f-b754-bfc69e2700a0&limit=10000'
        ]

    batches = []
    for url in urls:
        payload = json.loads(prequest.get(url).text)
        records = payload['result']['records']
        for record in records:
            # Delete pre-assigned _id, which clashes with mongo. A record
            # that has no _id is already fine, so do not fail on it.
            record.pop('_id', None)
        batches.append(records)

    repo.dropCollection("fires")
    repo.createCollection("fires")

    for batch in batches:
        # insert_many rejects an empty list; skip resources with no records.
        if batch:
            repo['fjansen.fires'].insert_many(batch)

    repo.logout()

    end_time = datetime.datetime.now()
    return {"start": start_time, "end": end_time}
def execute(trial=False):
    """Download four demographic JSON files and write each to HDFS via Spark.

    Race, MeansOfCommuting and PovertyRates are written as downloaded.
    HouseholdIncome first has every ``$`` removed from the keys nested under
    each top-level (town) key.

    Args:
        trial: Accepted for interface compatibility; not used by this step.

    Returns:
        A dict with the ``"start"`` and ``"end"`` datetimes of the run.
    """
    start_time = datetime.datetime.now()

    spark = SparkSession.builder.appName('save-demographics').getOrCreate()
    try:
        # (source file on datamechanics.io, destination on HDFS)
        passthrough = (
            ('http://datamechanics.io/data/nathansw_sbajwa/Race.json',
             'hdfs://project/hariri/cs591/race.json'),
            ('http://datamechanics.io/data/nathansw_sbajwa/MeansOfCommuting.json',
             'hdfs://project/hariri/cs591/commuting.json'),
            ('http://datamechanics.io/data/nathansw_sbajwa/PovertyRates.json',
             'hdfs://project/hariri/cs591/poverty-rates.json'),
        )
        for url, destination in passthrough:
            response = requests.get(url).json()
            df = spark.createDataFrame(response)
            df.write.json(destination)

        # opens 'HouseholdIncome.json' file from datamechanics.io
        url = 'http://datamechanics.io/data/nathansw_sbajwa/HouseholdIncome.json'
        response = requests.get(url).json()

        # removes $ from all of the nested keys within the JSON file
        # (char forbidden by mongodb)
        # TODO Is this necessary for Spark?
        for town in response:
            brackets = response[town]
            # keys without a $ are kept untouched
            untouched = {k: v for k, v in brackets.items() if '$' not in k}
            # ex: '$25,000-34,999' -> '25,000-34,999'
            stripped = {k.replace('$', ''): v
                        for k, v in brackets.items() if '$' in k}
            # a stripped key wins over an identical pre-existing key
            response[town] = {**untouched, **stripped}

        # NOTE(review): response is a dict here (it is indexed by town
        # above) and is passed to createDataFrame as-is — confirm that this
        # is the intended input shape.
        df = spark.createDataFrame(response)
        df.write.json('hdfs://project/hariri/cs591/household-income.json')
    finally:
        # Always release the Spark session, even when a step fails.
        spark.stop()

    end_time = datetime.datetime.now()
    return {"start": start_time, "end": end_time}