def execute(trial=False):
    '''Retrieve some data sets and store in mongodb collections.'''
    startTime = datetime.datetime.now()

    # Open the database connection and authenticate.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('ajr10_williami', 'ajr10_williami')

    # (label used in log output, Socrata dataset id, target collection name)
    datasets = [
        ("open spaces", "5ctr-ccas", "open_spaces_cambridge"),
        ("tree", "q83f-7quz", "trees_cambridge"),
        ("energy", "es2i-g3p6", "energy_cambridge"),
    ]

    for label, dataset_id, name in datasets:
        print("retrieving %s data from data.cambridgema.gov" % label)
        socrata = sodapy.Socrata("data.cambridgema.gov", None)
        records = socrata.get(dataset_id, limit=50)
        # Round-trip through JSON text, matching the original normalization.
        docs = json.loads(json.dumps(records, sort_keys=True, indent=2))
        collection = "ajr10_williami." + name
        repo.dropCollection(collection)
        repo.createCollection(collection)
        print("inserting data into target: ", name)
        repo[collection].insert_many(docs)

    # logout and return start and end times
    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def execute(trial=False):
    '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('pgr_syquiac', 'pgr_syquiac')

    def store(name, records):
        # Recreate the collection from scratch, then load the records.
        repo.dropCollection(name)
        repo.createCollection(name)
        repo['pgr_syquiac.' + name].insert_many(records)

    # Get data for hospitals
    boston = sodapy.Socrata("data.cityofboston.gov", None)
    store("hospitals", boston.get("u6fv-m8v4", limit=30))

    # Get data for CDC 500 cities
    cdc = sodapy.Socrata("chronicdata.cdc.gov", None)
    store("cdc", cdc.get("csmm-fdhi", CityName="Boston",
                         GeographicLevel="Census Tract", limit=5000))

    # Get data for all universities in the US
    url = 'http://datamechanics.io/data/pgr_syquiac/universities.json'
    raw = urllib.request.urlopen(url).read().decode("utf-8")
    store("schools", json.loads(raw))

    # Get data for Open Swimming Pools in Boston
    pools_client = sodapy.Socrata("data.cityofboston.gov", None)
    store("pools", pools_client.get("5jxx-wfpr", limit=150))

    # Get data for healthy corner stores
    stores_client = sodapy.Socrata("data.cityofboston.gov", None)
    store("stores", stores_client.get("ybm6-m5qd", limit=20))

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def execute(trial=False):
    '''Fetch public-school and farmers-market records and store them in Mongo.'''
    startTime = datetime.datetime.now()
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('rengx_ztwu_lwj', 'rengx_ztwu_lwj')

    # public school
    repo.publicschool.drop()
    publicschool = repo.publicschool
    ss = sodapy.Socrata("data.cityofboston.gov", "x92LG4iaFto5qWQGFk3lDdv6p",
                        username="******", password="******")
    response = ss.get("492y-i77g")
    records = json.loads(json.dumps(response, sort_keys=True, indent=2))
    school_docs = []
    for rec in records:
        name = rec["sch_name"]
        addr = rec["location_location"]
        zipp = rec["location_zip"]
        coords = rec['location']['coordinates']
        # Scale lat/lon to fixed-point integer strings (lon sign flipped).
        school_docs.append({
            "x": str(int(coords[1] * 1000000)),
            "y": str(int(coords[0] * (-1000000))),
            "name": name,
            "addr": addr,
            "zipp": zipp,
            "type": "school",
        })
    publicschool.insert_many(school_docs)

    # market
    repo.market.drop()
    market = repo.market
    ss = sodapy.Socrata("data.mass.gov", "x92LG4iaFto5qWQGFk3lDdv6p",
                        username="******", password="******")
    response = ss.get("66t5-f563")
    normalized = json.loads(json.dumps(response, sort_keys=True, indent=2))
    market.insert_many(retrieve.selection(normalized))

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def execute(trial=False):
    '''Retrieve the Boston crime data set page by page and store it in MongoDB.

    The Socrata API caps the rows returned per request, so the data set is
    fetched in pages of 50,000 records.
    '''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('pt0713_silnuext', 'pt0713_silnuext')

    client = sodapy.Socrata("data.cityofboston.gov", None)
    response = []
    # Offsets must be exact multiples of the page size: the previous
    # offsets (0, 50001, 100001, ...) skipped one record at every page
    # boundary.
    page_size = 50000
    for offset in range(0, 6 * page_size, page_size):
        response += client.get("crime", limit=page_size, offset=offset)

    repo.dropCollection("crime")
    repo.createCollection("crime")
    repo['pt0713_silnuext.crime'].insert_many(response)
    repo['pt0713_silnuext.crime'].metadata({'complete': True})
    print(repo['pt0713_silnuext.crime'].metadata())
    repo.logout()

    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def __init__(self, data_id=None):
    """Remember the target dataset id and open a Socrata client for the
    LA city portal using the module-level credentials."""
    self.data_id = data_id
    self.data = None
    self.client = sodapy.Socrata(
        domain=lacity_url,
        app_token=lacity_app_token,
        username=lacity_user_name,
        password=lacity_password,
    )
def get_socrata_client():
    """Build and return a Socrata client from the module-level credentials."""
    socrata_client = sodapy.Socrata(
        SO_WEB,
        SO_TOKEN,
        username=SO_USER,
        password=SO_PASS,
        timeout=60,
    )
    return socrata_client
def process_domain(self, domain):
    """Discover every dataset on one Socrata domain and prune stale entries.

    ``domain`` is a config dict with at least ``url`` and optionally
    ``auth`` (extra kwargs for the Socrata client constructor).
    After discovery, any previously indexed dataset from this domain that
    was not seen in this pass is deleted from Elasticsearch.
    """
    logger.info("Processing %s...", domain['url'])
    # Fall back to an anonymous client when no credentials are configured.
    socrata = sodapy.Socrata(domain['url'],
                             **domain.get('auth', {'app_token': None}))
    datasets = socrata.datasets()
    logger.info("Found %d datasets", len(datasets))
    if not datasets:
        return
    # Ids of datasets successfully processed in this pass.
    seen = set()
    for dataset in datasets:
        try:
            valid = self.process_dataset(domain, dataset)
        except Exception as e:
            # Report to Sentry but keep going: one bad dataset must not
            # abort the whole domain crawl.
            sentry_sdk.capture_exception(e)
            logger.exception("Error processing dataset %s",
                             dataset['resource']['id'])
        else:
            assert isinstance(valid, bool)
            if valid:
                seen.add(dataset['resource']['id'])
    logger.info("Discovered %d/%d datasets", len(seen), len(datasets))
    # Clean up the datasets we didn't see
    deleted = 0
    size = 10000
    # Match documents produced by this materializer for this domain only.
    query = {
        'query': {
            'bool': {
                'must': [
                    {
                        'term': {
                            'materialize.identifier': self.identifier,
                        },
                    },
                    {
                        'term': {
                            'materialize.socrata_domain.keyword': domain['url'],
                        },
                    },
                ],
            },
        }
    }
    hits = self.elasticsearch.scan(
        index='datasets,pending',
        query=query,
        size=size,
        _source=['materialize.socrata_id'],
    )
    for h in hits:
        if h['_source']['materialize']['socrata_id'] not in seen:
            self.delete_dataset(full_id=h['_id'])
            deleted += 1
    if deleted:
        logger.info("Deleted %d missing datasets", deleted)
def get_client(host="data.austintexas.gov", timeout=30):
    """Return a Socrata client authenticated from environment variables.

    host: Socrata domain to connect to.
    timeout: request timeout in seconds.
    """
    app_token = os.getenv("SOCRATA_APP_TOKEN")
    key_id = os.getenv("SOCRATA_API_KEY_ID")
    key_secret = os.getenv("SOCRATA_API_KEY_SECRET")
    return sodapy.Socrata(
        host,
        app_token,
        username=key_id,
        password=key_secret,
        timeout=timeout,
    )
def main():
    """Fetch all permits from Socrata and bulk-insert them into MongoDB.

    Configuration comes entirely from the environment (DOMAIN, APP_TOKEN,
    DATASET_ID, DATABASE_URI).  Returns 0 on success, 1 on failure (count
    query failed, or repeated fetch errors).
    """
    log = structlog.get_logger()
    domain = os.environ['DOMAIN']
    app_token = os.environ['APP_TOKEN']
    dataset_id = os.environ['DATASET_ID']
    database_url = os.environ['DATABASE_URI']
    mongo_client = pymongo.MongoClient(database_url)
    client = sodapy.Socrata(domain, app_token)

    log.debug('Fetching count of dataset records')
    count = count_dataset(client, dataset_id)
    if count is None:  # was `== None`; identity test is the correct idiom
        log.error('Unable to query count of dataset')
        return 1
    log.info('Record count', count=count)

    db = mongo_client.permits
    index = db.all_permits

    log.debug('Fetching permits')
    for permit_set in fetch_permits(client, dataset_id, count):
        if permit_set is None:
            event = ('Excessive failures while fetching Socrata data, '
                     'exiting early')
            log.error(event)
            return 1
        log.debug('Cleaning permit set')
        permit_set = [clean_permit(permit) for permit in permit_set]
        # Pre-assign so the `finally` log cannot hit an unbound name if an
        # unexpected exception escapes insert_many.
        insert_count = None
        try:
            result = index.insert_many(permit_set, ordered=False)
            insert_count = len(result.inserted_ids)
        except pymongo.errors.BulkWriteError as e:
            event = ('Error while bulk inserting data, '
                     'remaining documents will still be inserted')
            log.error(event, exc_info=True)
            insert_count = e.details.get('nInserted')
        finally:
            log.debug('bulk inserted permit data', count=insert_count)

    log.info('Finished fetching permits')
    log.debug('Total records in index table', count=index.count())
    return 0
def execute(trial = False):
    '''Retrieve some data sets for the MongoDB collection.'''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('chamathd', 'chamathd')

    # Neighborhood population data for the Boston area from collected data sources
    print("Fetching Boston population data from Data Mechanics resource")
    target = "chamathd.neighborhood_pop_boston"
    url = "http://datamechanics.io/data/chamathd/boston_neighborhood_census.json"
    raw = urllib.request.urlopen(url).read().decode("utf-8")
    boston = json.loads(raw)
    repo.dropCollection(target)
    repo.createCollection(target)
    print("Inserting JSON data into collection", target)
    # Only the per-neighborhood records are stored, not the wrapper object.
    repo[target].insert_many(boston["neighborhoods"])
    print("Finished writing data to", target)
    print()

    # Neighborhood population data for the Cambridge area from Cambridge Open Data
    print("Fetching Cambridge population data from Cambridge Open Data")
    target = "chamathd.neighborhood_pop_cambridge"
    socrataClient = sodapy.Socrata("data.cambridgema.gov", None)
    raw_records = socrataClient.get("vacj-bzri", limit=50)
    # Round-trip through JSON text to normalize the response.
    cambridge = json.loads(json.dumps(raw_records, sort_keys=True, indent=2))
    repo.dropCollection(target)
    repo.createCollection(target)
    print("Inserting JSON data into collection", target)
    repo[target].insert_many(cambridge)
    print("Finished writing data to", target)
    print()

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start":startTime, "end":endTime}
def func(page_size, pages, token, database_id):
    """Download a NYC Open Data dataset, either whole or page by page.

    page_size: rows per page when paginating.
    pages: number of pages to fetch; a negative value means "fetch the
        entire dataset in a single request".
    token: Socrata app token.
    database_id: Socrata dataset identifier.

    Returns a flat list of records when pages < 0, otherwise a list of
    pages (each itself a list of records).
    """
    print("Function working, boss!")  # typo fixed ("Functiong")
    client = sodapy.Socrata('data.cityofnewyork.us', token)
    total = client.get(database_id, select='COUNT(*)')
    if pages < 0:
        # Grab everything at once.  The previous code also computed a page
        # count here, but that assignment was dead because of this return.
        return client.get(database_id, limit=total[0]['COUNT'])
    return [
        client.get(database_id, limit=page_size, offset=i * page_size)
        for i in range(pages)
    ]
def get_metadatas(socrata_table, identifier_column_name, domain_column_name):
    """Fetch Socrata metadata for every (id, domain) pair in a table.

    socrata_table: mapping/DataFrame with a column of dataset ids and a
        column of their hosting domains.
    Returns a list of metadata dicts, each annotated with its 'domain'.
    Datasets whose metadata cannot be retrieved are skipped silently.
    """
    ids = socrata_table[identifier_column_name]
    domains = socrata_table[domain_column_name]
    metadatas = []
    for dataset_id, domain in zip(ids, domains):
        # NOTE(review): app token is hard-coded here; consider moving it
        # to configuration.
        client = sodapy.Socrata(domain, '44TCQSoL2igJ2376fmSMcGkkh')
        try:
            metadata = client.get_metadata(dataset_id)
        except Exception:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; only the metadata call can fail here.
            continue
        metadata['domain'] = domain
        metadatas.append(metadata)
        print("Retrieved metadata for dataset: {}".format(str(dataset_id)))
    return metadatas
def get_data(page_size, num_pages=None, output=None):
    """Fetch one page of NYC dataset nc67-uf89.

    page_size: number of rows to request.
    num_pages: 1-based page number to fetch; None fetches the first page.
    output: basename of a '.txt' file to dump JSON into; None prints to
        stdout instead.
    """
    client = sodapy.Socrata('data.cityofnewyork.us', os.environ['APP_KEY'])
    if num_pages is None:  # was `== None`; identity test is the correct idiom
        r = client.get('nc67-uf89', limit=page_size)
    else:
        r = client.get('nc67-uf89', limit=page_size,
                       offset=(num_pages - 1) * page_size)
    if output is None:
        print(r)
    else:
        with open(output + '.txt', 'w') as outfile:
            json.dump(r, outfile)
def main(data_config, date_log, output_directory):
    """Download the full inmates dataset and save it as a feather file.

    data_config: dict with 'service_url', 'apptoken', 'username',
        'password' and 'endpoint'.
    date_log: path of a text file that gets the dataset's update date
        appended, one per line.
    output_directory: directory receiving '<date>_inmates.feather'.
    """
    inmates_client = sodapy.Socrata(data_config['service_url'],
                                    data_config['apptoken'],
                                    username=data_config['username'],
                                    password=data_config['password'])
    metadata = inmates_client.get_metadata(data_config['endpoint'])
    # Row count is used as the limit so the whole dataset comes back in
    # one request.
    count = inmates_client.get(data_config['endpoint'],
                               query="select COUNT(*)")
    date = metadata["rowsUpdatedAt"]
    data = inmates_client.get(data_config['endpoint'],
                              content_type='json',
                              limit=count[0]['COUNT'])
    # pd.io.json.json_normalize was removed in pandas 2.0; the public
    # top-level pd.json_normalize is the supported spelling.
    inmates_df = pd.json_normalize(data)
    inmates_df.to_feather(f'{output_directory}/{date}_inmates.feather')
    with open(date_log, 'a') as log:
        log.write(f'{date}\n')
    return ()
def get_two_most_recent(id):
    """Query the Divvy API for the two most recent tuples of the specified
    station for prediction

    Args:
        id (int): Station ID in Divvy network

    Returns:
        pd.DataFrame: DF containing data pulled from API
    """
    client = sodapy.Socrata("data.cityofchicago.org", None)
    raw = client.get("eq45-8inv", id=id, limit=2)
    frame = pd.DataFrame.from_records(raw)
    # Coerce the numeric columns; unparseable values become NaN.
    numeric_cols = ['id', 'percent_full', 'available_bikes']
    frame[numeric_cols] = frame[numeric_cols].apply(pd.to_numeric,
                                                    errors='coerce')
    return frame
def execute(trial=False):
    '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('jgrishey', 'jgrishey')

    client = sodapy.Socrata(
        "data.cambridgema.gov",
        dml.auth['services']['cityofcambridgedataportal']['token'])
    response = client.get("vnxa-cuyr", limit=500)

    tickets = []
    ID = 0
    for ticket in response:
        if 'location' in ticket:
            lat = ticket['location']['latitude']
            lon = ticket['location']['longitude']
            # Keep only geocoded rows; the portal encodes missing
            # coordinates as the string '0'.
            if lat != '0' and lon != '0':
                tickets.append({'_id': ID, 'lat': lat, 'long': lon})
            ID += 1

    repo.dropCollection("tickets")
    repo.createCollection("tickets")
    for ticket in tickets:
        repo['jgrishey.tickets'].insert(ticket)

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def execute(trial=False):
    '''Retrieve Boston crime locations and store them in MongoDB.

    Pulls up to 10,000 crime records, keeps only rows with non-zero
    coordinates, and writes them to the jgrishey.crime collection.
    '''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('jgrishey', 'jgrishey')

    client = sodapy.Socrata(
        "data.cityofboston.gov",
        dml.auth["services"]["cityofbostondataportal"]["token"])
    response = client.get("29yf-ye7n", limit=10000)

    crimes = []
    ID = 0
    for crime in response:
        # GeoJSON order is [longitude, latitude].
        lat = crime['location']['coordinates'][1]
        lon = crime['location']['coordinates'][0]
        # BUG FIX: the old filter mixed types (`lat != '0'` but `lon != 0`),
        # so it never rejected anything.  Reject a zero coordinate whether
        # it arrives as a number or as the string '0'.
        if lat not in (0, '0') and lon not in (0, '0'):
            crimes.append({'_id': ID, 'lat': lat, 'long': lon})
        ID += 1

    repo.dropCollection("crime")
    repo.createCollection("crime")
    for crime in crimes:
        repo['jgrishey.crime'].insert(crime)

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def main():
    """Mirror GitHub/ZenHub issue data into a Socrata dataset.

    Pulls all issues from the atd-data-tech repo, merges in ZenHub pipeline
    metadata, then upserts the rows to Socrata in chunks of 1000.
    Credentials come from the environment.
    """
    logging.info("Starting...")
    REPO = {"id": 140626918, "name": "cityofaustin/atd-data-tech"}
    WORKSPACE_ID = "5caf7dc6ecad11531cc418ef"
    SOCRATA_RESOURCE_ID = "rzwg-fyv8"
    ZENHUB_ACCESS_TOKEN = os.environ["ZENHUB_ACCESS_TOKEN"]
    GITHUB_ACCESS_TOKEN = os.environ["GITHUB_ACCESS_TOKEN"]
    SOCRATA_API_KEY_ID = os.environ["SOCRATA_API_KEY_ID"]
    SOCRATA_API_KEY_SECRET = os.environ["SOCRATA_API_KEY_SECRET"]
    SOCRATA_APP_TOKEN = os.environ["SOCRATA_APP_TOKEN"]

    issues_gh = get_github_issues(REPO["name"], GITHUB_ACCESS_TOKEN)
    issues = [issue_to_dict(issue) for issue in issues_gh]
    convert_timestamps(issues)

    zenhub_metadata = get_zenhub_metadata(WORKSPACE_ID, ZENHUB_ACCESS_TOKEN,
                                          REPO["id"])
    zenhub_metadata_index = create_zenhub_metadata_index(zenhub_metadata)

    for issue in issues:
        zenhub_meta = zenhub_metadata_index.get(issue["number"])
        if zenhub_meta:
            issue.update(zenhub_meta)
        # set pipeline for closed issues, which have no pipeline metadata
        issue["pipeline"] = ("Closed" if issue["state"] == "closed" else
                             issue.get("pipeline"))

    client = sodapy.Socrata(
        "data.austintexas.gov",
        SOCRATA_APP_TOKEN,
        username=SOCRATA_API_KEY_ID,
        password=SOCRATA_API_KEY_SECRET,
        timeout=60,
    )
    for chunk in chunks(issues, 1000):
        # BUG FIX: upsert the current chunk; previously the full `issues`
        # list was re-sent on every iteration, defeating the chunking and
        # re-uploading every row N times.
        client.upsert(SOCRATA_RESOURCE_ID, chunk)
        logging.info(f"{len(chunk)} processed")
def execute(trial=False):
    '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('pt0713_silnuext', 'pt0713_silnuext')

    # Pull the full fld data set (9787 rows) from the Massachusetts portal.
    socrata = sodapy.Socrata("data.mass.gov", None)
    records = socrata.get("x99p-b88k", limit=9787, offset=0)
    serialized = json.dumps(records, sort_keys=True, indent=2)

    repo.dropCollection("fld")
    repo.createCollection("fld")
    target = repo['pt0713_silnuext.fld']
    target.insert_many(records)
    target.metadata({'complete': True})
    print(target.metadata())

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def execute(trial=False):
    '''Union hospital, garden, market and police data into the access collection.'''
    startTime = datetime.datetime.now()
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate("rengx_ztwu_lwj", "rengx_ztwu_lwj")

    # Pull the previously stored market documents out of Mongo.
    market_data = [doc for doc in repo.market.find()]

    ss = sodapy.Socrata("data.cityofboston.gov", "x92LG4iaFto5qWQGFk3lDdv6p",
                        username="******", password="******")

    def fetch(dataset_id):
        # Round-trip through JSON text to normalize the response.
        return json.loads(json.dumps(ss.get(dataset_id),
                                     sort_keys=True, indent=2))

    hospital_data = fetch("u6fv-m8v4")  # 26 records
    garden_data = fetch("rdqf-ter7")    # 184 records
    # get Police Station data in Boston
    police_data = fetch("pyxn-r3i2")

    accessdb = union.unionF(hospital_data, garden_data, market_data,
                            police_data)
    repo.access.drop()
    repo.access.insert_many(accessdb)

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def __init__(self, source, dataset_id, n_types, tz, hrs, max_hrs):
    """Provider that streams dispatch records from a Socrata dataset.

    Args:
        source: Socrata domain to connect to.
        dataset_id: identifier of the dataset on that domain.
        n_types: number of dispatch types to track (stored as-is).
        tz: IANA timezone name used to localize timestamps.
        hrs: hour window setting (stored as-is; used elsewhere).
        max_hrs: how far back, in hours, the initial fetch reaches.
    """
    self.source = source
    self.dataset_id = dataset_id
    self.n_types = n_types
    self.tz = pytz.timezone(tz)
    self.hrs = hrs
    self.max_hrs = max_hrs
    # Preparing Socrata client (anonymous: no app token)
    self.client = sodapy.Socrata(source, None)
    # Preparing containers: a pandas frame plus Bokeh data sources that
    # mirror it for plotting.
    self.data = pd.DataFrame(columns=SocrataProvider.COLS)
    self.data_ds = ColumnDataSource(
        data={cl: [] for cl in SocrataProvider.COLS})
    self.data_view = CDSView(filters=[], source=self.data_ds)
    self.type_stats_ds = ColumnDataSource(data={"type": [], "counts": []})
    self.dispatch_types = []
    # Calculating start time for initial data fetch: now minus max_hrs,
    # localized to the provider's timezone.
    self.start_time = dt.datetime.now(
        self.tz) - pd.Timedelta(hours=max_hrs)
    self.fetch_data()
def execute(trial=False):
    '''Retrieve some data sets (not using the API here for the sake of simplicity).

    Pulls several Boston open-data sets through Socrata plus two static
    JSON files from datamechanics.io, storing each in its own collection.
    Several earlier data sources are kept below as disabled (triple-quoted
    or commented-out) blocks.
    '''
    startTime = datetime.datetime.now()
    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('bohan_nyx_xh1994_yiran123', 'bohan_nyx_xh1994_yiran123')

    # city of boston crime incident July 2012 - August 2015
    #url = 'https://data.cityofboston.gov/resource/ufcx-3fdn.json'
    #response_crime = urllib.request.urlopen(url).read().decode("utf-8")
    client = sodapy.Socrata(
        "data.cityofboston.gov",
        dml.auth['services']['cityofbostondataportal']['token'])
    r = client.get("29yf-ye7n", limit=3000)
    #r = json.loads(response_crime)
    #s = json.dumps(r, sort_keys=True, indent=2)
    repo.dropCollection("crime_boston")
    repo.createCollection("crime_boston")
    repo['bohan_nyx_xh1994_yiran123.crime_boston'].insert_many(r)
    repo['bohan_nyx_xh1994_yiran123.crime_boston'].metadata(
        {'complete': True})
    #print(repo['bohan_nyx_xh1994_yiran123.crime_boston'].metadata())

    #city of boston property assessment 2014 (disabled)
    '''url_property = 'https://data.cityofboston.gov/resource/jsri-cpsq.json'
    response_property = urllib.request.urlopen(url_property).read().decode("utf-8")
    r = json.loads(response_property)
    s = json.dumps(r, sort_keys=True, indent=2)
    repo.dropCollection("property_boston_assessment_2014")
    repo.createCollection("property_boston_assessment_2014")
    repo['bohan_nyx_xh1994_yiran123.property_boston_assessment_2014'].insert_many(r)'''

    #Food Establishment Inspections
    #url_foodIE = 'https://data.cityofboston.gov/resource/427a-3cn5.json'
    #response_foodIE = urllib.request.urlopen(url_foodIE).read().decode("utf-8")
    client = sodapy.Socrata(
        "data.cityofboston.gov",
        dml.auth['services']['cityofbostondataportal']['token'])
    # limit matches the full size of the data set at the time of writing
    r = client.get("427a-3cn5", limit=467558)  #467558
    s = json.dumps(r, sort_keys=True, indent=2)
    repo.dropCollection("Food_Establishment_Inspections")
    repo.createCollection("Food_Establishment_Inspections")
    repo[
        'bohan_nyx_xh1994_yiran123.Food_Establishment_Inspections'].insert_many(
            r)

    # Active Food Establishment Licenses
    #https://data.cityofboston.gov/resource/fdxy-gydq.json
    client = sodapy.Socrata(
        "data.cityofboston.gov",
        dml.auth['services']['cityofbostondataportal']['token'])
    r = client.get("fdxy-gydq", limit=3000)
    s = json.dumps(r, sort_keys=True, indent=2)
    repo.dropCollection("Active_Food_Establishment_Licenses")
    repo.createCollection("Active_Food_Establishment_Licenses")
    repo[
        'bohan_nyx_xh1994_yiran123.Active_Food_Establishment_Licenses'].insert_many(
            r)

    #entertainment Licenses
    #url_entertainmentL = 'https://data.cityofboston.gov/resource/cz6t-w69j.json'
    #response_entertainmentL = urllib.request.urlopen(url_entertainmentL).read().decode("utf-8")
    client = sodapy.Socrata(
        "data.cityofboston.gov",
        dml.auth['services']['cityofbostondataportal']['token'])
    r = client.get("cz6t-w69j", limit=5223)  #5223
    s = json.dumps(r, sort_keys=True, indent=2)
    repo.dropCollection("Entertainment_Licenses")
    repo.createCollection("Entertainment_Licenses")
    repo['bohan_nyx_xh1994_yiran123.Entertainment_Licenses'].insert_many(r)

    #mbta stops location (disabled)
    '''url_stopbylocation = 'http://realtime.mbta.com/developer/api/v2/stopsbylocation'#?api_key=wX9NwuHnZU2ToO7GmGR9uw&lat=42.346961&lon=-71.076640&format=json'
    api_key_mbta = 'wX9NwuHnZU2ToO7GmGR9uw'
    mbta_api_key = dml.auth['services']['mbtadeveloperportal']['key']
    response_stopbylocation = requests.get(url_stopbylocation + '?api_key=' + mbta_api_key + '&route=' + route)
    r = json.loads(response_stopbylocation)
    s = json.dumps(r, sort_keys=True, indent=2)
    repo.dropCollection("mbta_stop_by_location")
    repo.createCollection("mbta_stop_by_location")
    repo['bohan_nyx_xh1994_yiran123.mbta_stop_by_location'].insert_many(r)'''
    #the lat and long is [4]and[5],.....stopsbylocation return a list of the stops nearest a particular location

    #TRAFFIC SIGNALS (disabled)
    '''url_traffic_signal = 'http://bostonopendata-boston.opendata.arcgis.com/datasets/de08c6fe69c942509089e6db98c716a3_0.geojson'
    response_traffic_signal = urllib.request.urlopen(url_traffic_signal).read().decode("utf-8")
    r = json.loads(response_traffic_signal)
    s = json.dumps(r, sort_keys=True, indent=2)
    repo.dropCollection("TRAFFIC_SIGNALS")
    repo.createCollection("TRAFFIC_SIGNALS")
    repo['bohan_nyx_xh1994_yiran123.TRAFFIC_SIGNALS'].insert(r)'''

    # Airbnb ratings, a static JSON file on datamechanics.io.
    url_airbnb = 'http://datamechanics.io/data/bohan_xh1994/airbnb.json'
    response_airbnb_rating = urllib.request.urlopen(
        url_airbnb).read().decode("utf-8")
    r = json.loads(response_airbnb_rating)
    s = json.dumps(r, sort_keys=True, indent=2)
    repo.dropCollection("airbnb_rating")
    repo.createCollection("airbnb_rating")
    # NOTE(review): insert() is deprecated in modern pymongo — confirm the
    # pinned pymongo version before upgrading.
    repo['bohan_nyx_xh1994_yiran123.airbnb_rating'].insert(r)

    # MBTA bus stops, a static GeoJSON file on datamechanics.io.
    url_MBTA_Bus_stops = 'http://datamechanics.io/data/wuhaoyu_yiran123/MBTA_Bus_Stops.geojson'
    response_MBTA_Bus_stops = urllib.request.urlopen(
        url_MBTA_Bus_stops).read().decode("utf-8")
    r = json.loads(response_MBTA_Bus_stops)
    s = json.dumps(r, sort_keys=True, indent=2)
    repo.dropCollection("MBTA_Bus_stops")
    repo.createCollection("MBTA_Bus_stops")
    repo['bohan_nyx_xh1994_yiran123.MBTA_Bus_stops'].insert(r)

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
#!/usr/bin/env python
'Create table of number of trips from station I to station J'

import sodapy
import pandas as pd

data_url = 'data.cityofchicago.org'
trip_data = 'fg6s-gzvg'
station_data = 'aavc-b2wj'

# Open Socrata resource (anonymous access, no app token)
data = sodapy.Socrata(data_url, None)

# Get station information
select = 'id, station_name, total_docks, latitude, longitude'
stations_json = data.get(station_data, select=select, limit=1000,
                         order='id ASC')

# Convert to DataFrame and convert columns to numeric types
stations = pd.DataFrame(stations_json)
cols_convert = ['id', 'total_docks', 'latitude', 'longitude']
stations[cols_convert] = stations[cols_convert].apply(pd.to_numeric)
stations.set_index('id', inplace=True)

# BUG FIX: this was a Python 2 print *statement*, a syntax error on
# Python 3 (which the rest of this codebase targets).
print('Retrieved {0:d} station records'.format(len(stations)))

stations.to_csv('stations.csv')
def execute(trial=False):
    '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('jw0208', "jw0208")  # username, password

    # Four static JSON data sets hosted on datamechanics.io, loaded into
    # one collection each with identical handling.
    for name in ('medicare', 'poverty', 'education', 'income'):
        url = 'http://datamechanics.io/data/jw0208/' + name + '.json'
        raw = urllib.request.urlopen(url).read().decode("utf-8")
        docs = json.loads(raw)
        repo.dropCollection(name)
        repo.createCollection(name)
        repo['jw0208.' + name].insert_many(docs)
        repo['jw0208.' + name].metadata({'complete': True})

    # CDC chronic-disease data (one row per state/territory), via Socrata.
    client = sodapy.Socrata("chronicdata.cdc.gov", None)
    response = client.get("fq5d-abxc", limit=52)
    # Round-trip through JSON text to normalize the response.
    docs = json.loads(json.dumps(response))
    repo.dropCollection("health")
    repo.createCollection("health")
    repo['jw0208.health'].insert_many(docs)
    repo['jw0208.health'].metadata({'complete': True})

    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def execute(trial = False):
    '''Correlate crimes with police districts and store both data sets.

    Loads the police-district GeoJSON and the full crime data set, prints
    the fraction of crimes whose reporting district is a police district,
    and stores both in the police_crime collection.
    '''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('pt0713_silnuext', 'pt0713_silnuext')

    repo.dropCollection("police_crime")
    repo.createCollection("police_crime")

    # import police districts data
    url = "http://bostonopendata-boston.opendata.arcgis.com/datasets/9a3a8c427add450eaf45a470245680fc_5.geojson"
    response = urllib.request.urlopen(url).read().decode("utf-8")
    r = json.loads(response)
    r = [r['features'][i]['properties'] for i in range(11)]

    # getting DISTRICT column from police_district dataset
    districts = project(r, lambda t: (t['DISTRICT']))
    print(districts)
    repo['pt0713_silnuext.police_crime'].insert_many(r)
    repo['pt0713_silnuext.police_crime'].metadata({'complete':True})
    print(repo['pt0713_silnuext.police_crime'].metadata())

    # import crime data, paged because Socrata caps rows per request.
    # BUG FIX: offsets must be exact multiples of the page size; the
    # previous offsets (0, 50001, 100001, ...) skipped one record at
    # every page boundary.
    client1 = sodapy.Socrata("data.cityofboston.gov", None)
    response1 = []
    page_size = 50000
    for offset in range(0, 6 * page_size, page_size):
        response1 += client1.get("crime", limit=page_size, offset=offset)

    # getting DISTRICT column from crime dataset
    crime_district = project(response1, lambda t: ([t.get("reptdistrict")]))

    # if a crime happens in police_district: count+1
    crime_in_police_district = 0
    for district in crime_district:
        if district[0] in districts:
            crime_in_police_district += 1
    print(crime_in_police_district)

    # count the final percentage of crime happens in police district /
    # all of the crimes happen
    percentage_crime_in_police_district = (crime_in_police_district /
                                           len(crime_district))
    print("The percentage of crime happens in police district is: ",
          percentage_crime_in_police_district)
    print("Thus, our assumption about crime happens less in police districts is wrong.")

    repo['pt0713_silnuext.police_crime'].insert_many(response1)
    repo['pt0713_silnuext.police_crime'].metadata({'complete':True})
    print(repo['pt0713_silnuext.police_crime'].metadata())

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start":startTime, "end":endTime}
# minimum value for this clause is 1894 clause = "year > '{}'".format(minyear) + monthstring print(clause) # ## Step 9. # **Using sodapy library (recommended by CDC) get the drought data from CDC API.** set limit to something large so we get all records (if excluded will only get 1000). # In[20]: # this gets the pdsi data from cdc website using the query speciried in the get() function client = sodapy.Socrata("data.cdc.gov", None) results = client.get("en5r-5ds4", where=clause, limit = 10000000) pdsi = pd.DataFrame.from_records(results) client.close() # In[21]: print(pdsi.head(5)) print(len(pdsi)) # ## Step 10. # **change datatypes of all columns.**
group="hour") df_r = pd.DataFrame.from_records(r).astype('int').set_index('hour') df_T = pd.DataFrame(df_r.transpose().to_dict()) df_T.index = [stat_id] return df_T data_url = 'data.cityofchicago.org' app_token = '6xUrAQkGuIctuyUrTEdLZZFRY' d_trips = 'fg6s-gzvg' d_stats = 'aavc-b2wj' api = sodapy.Socrata(data_url, app_token) if not os.path.exists('stations_trip_counts.csv'): r = api.get(d_stats) cols = [ 'id', 'address', 'docks_in_service', 'latitude', 'longitude', 'station_name', 'total_docks' ] df_stats = pd.DataFrame.from_records(r, columns=cols) # Convert data types df_stats[['id', 'docks_in_service', 'total_docks']] =\ df_stats[['id', 'docks_in_service', 'total_docks']].astype(int)
def get_dataset_descriptors(domain, token):
    """Return the dataset descriptors advertised by a Socrata domain."""
    socrata_client = sodapy.Socrata(domain, token)
    descriptors = socrata_client.datasets()
    return descriptors
def execute(trial=False):
    '''Retrieve six datasets (landmarks, farmer markets, school gardens,
    crime, parking zones, police stations) from Socrata and store each in
    its own MongoDB collection under the `nyx` namespace.

    Returns a dict with the 'start' and 'end' datetimes of the run.
    '''
    startTime = datetime.datetime.now()

    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('nyx', 'nyx')

    # Landmarks dataset
    client = sodapy.Socrata("data.cityofboston.gov", None)
    r1 = client.get("u6fv-m8v4", limit=10)
    repo.dropCollection("landmarks")
    repo.createCollection("landmarks")
    repo['nyx.landmarks'].insert_many(r1)
    repo['nyx.landmarks'].metadata({'complete': True})
    print(repo['nyx.landmarks'].metadata())

    # Farmer markets
    # BUG FIX: originally inserted r1 (the landmarks data) here.
    client = sodapy.Socrata("data.mass.gov", None)
    r2 = client.get("66t5-f563", limit=10)
    repo.dropCollection("fmarket")
    repo.createCollection("fmarket")
    repo['nyx.fmarket'].insert_many(r2)

    # School gardens
    # BUG FIX: originally dropped/recreated "fmarket" a second time and
    # inserted r2 (the farmer-market data); the gardens data now gets its
    # own collection with its own response.
    client = sodapy.Socrata("data.cityofboston.gov", None)
    r3 = client.get("pzcy-jpz4", limit=10)
    repo.dropCollection("gardens")
    repo.createCollection("gardens")
    repo['nyx.gardens'].insert_many(r3)

    # Crime dataset
    # BUG FIX: originally inserted r3 (the school-gardens data) here.
    client = sodapy.Socrata("data.cityofboston.gov", None)
    r4 = client.get("29yf-ye7n", limit=10)
    repo.dropCollection("crime")
    repo.createCollection("crime")
    repo['nyx.crime'].insert_many(r4)

    # Parking zones
    # BUG FIX: originally inserted r4 (the crime data) here.
    client = sodapy.Socrata("data.cityofboston.gov", None)
    r5 = client.get("gdnf-7hki", limit=10)
    repo.dropCollection("parking")
    repo.createCollection("parking")
    repo['nyx.parking'].insert_many(r5)

    # Police stations
    # BUG FIX: originally inserted r5 (the parking data) here.
    client = sodapy.Socrata("data.cityofboston.gov", None)
    r6 = client.get("pyxn-r3i2", limit=10)
    repo.dropCollection("police")
    repo.createCollection("police")
    repo['nyx.police'].insert_many(r6)

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def execute(trial=False):
    '''Retrieve recreation datasets for Boston and Cambridge (pools,
    gardens, open spaces, waterplay parks) and store each in its own
    MongoDB collection under the `billy108_zhou13` namespace.

    Returns a dict with the 'start' and 'end' datetimes of the run.
    '''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('billy108_zhou13', 'billy108_zhou13')

    def _store(name, records):
        # Replace collection `name` with `records`, mark it complete, and
        # echo the metadata, matching the repo's collection conventions.
        repo.dropCollection(name)
        repo.createCollection(name)
        key = 'billy108_zhou13.' + name
        repo[key].insert_many(records)
        repo[key].metadata({'complete': True})
        print(repo[key].metadata())

    def _socrata(domain, dataset_id):
        # Fetch a dataset from a Socrata domain as a list of record dicts.
        # FIX: the original ran each response through
        # json.loads(json.dumps(...)) — a no-op round-trip, removed.
        return sodapy.Socrata(domain, None).get(dataset_id)

    def _geojson_features(url):
        # Download a GeoJSON document and return its feature list.
        payload = urllib.request.urlopen(url).read().decode("utf-8")
        return json.loads(payload)['features']

    # Seasonal swimming pools in Boston.
    _store("seasonalSwimPools", _socrata("data.cityofboston.gov", "xw3e-c7pz"))

    # Community gardens in Boston.
    _store("communityGardens", _socrata("data.cityofboston.gov", "rdqf-ter7"))

    # Recreational open space in Cambridge.
    _store("openSpaceCambridge", _socrata("data.cambridgema.gov", "5ctr-ccas"))

    # Recreational waterplay parks in Cambridge.
    _store("waterplayCambridge", _socrata("data.cambridgema.gov", "hv2t-vv6d"))

    # Open spaces of conservation and recreation interest in Boston.
    _store("openSpaceBoston", _geojson_features(
        'http://bostonopendata-boston.opendata.arcgis.com/datasets/2868d370c55d4d458d4ae2224ef8cddd_7.geojson'))

    # Community center pools in Boston.
    _store("commCenterPools", _geojson_features(
        'http://bostonopendata-boston.opendata.arcgis.com/datasets/5575f763dbb64effa36acd67085ef3a8_0.geojson'))

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}