def upload_to_socrata(**kwargs): from sodapy import Socrata # make a sodapy Socrata client s = Socrata("data.detroitmi.gov", Variable.get("SOCRATA_TOKEN"), Variable.get("SOCRATA_USER"), Variable.get("SOCRATA_PASS")) # get the SQLAlchemy engine hook = PostgresHook('etl_postgres') eng = hook.get_sqlalchemy_engine() # get the payload result = eng.execute(f"select * from {kwargs['table']}") payload = [dict(row) for row in result] if kwargs['method'] == 'replace': job = s.replace(kwargs['id'], payload) else: chunk_size = 10000 for i in range(0, len(payload), chunk_size): try: r = s.upsert(kwargs['id'], payload[i:i + chunk_size]) except: print(f"Error on record {i}") r = s.upsert(kwargs['id'], payload[i:i + chunk_size])
def send_crab_data(self): CONVERSION_RATE = .00000701549 oocytes = Oocyte.objects.filter(crab=self).filter(chosen_count=10) client = Socrata("noaa-fisheries-afsc.data.socrata.com", "q3DhSQxvyWbtq1kLPs5q7jwQp", username="******", password = "******") data = {'area_2': '', 'area_5': '', 'calibration_5x': 0.00028, 'area_4': '', 'area_7': '', 'area_10': '', 'calibration_10x': 0.00056, 'area_9': '', 'year': '', 'sample': '', 'area_3': '', 'area_8': '', 'area_1': '', 'area_6': ''} data['area_1'] = oocytes[0].area * CONVERSION_RATE data['area_2'] = oocytes[1].area * CONVERSION_RATE data['area_3'] = oocytes[2].area * CONVERSION_RATE data['area_4'] = oocytes[3].area * CONVERSION_RATE data['area_5'] = oocytes[4].area * CONVERSION_RATE data['area_6'] = oocytes[5].area * CONVERSION_RATE data['area_7'] = oocytes[6].area * CONVERSION_RATE data['area_8'] = oocytes[7].area * CONVERSION_RATE data['area_9'] = oocytes[8].area * CONVERSION_RATE data['area_10'] = oocytes[9].area * CONVERSION_RATE data['year'] = datetime.datetime.now().year data['sample'] = self.sample_num payload = [data] client.upsert("km2u-hwjw", payload)
def test_upsert_exception(): mock_adapter = {} mock_adapter["prefix"] = PREFIX adapter = requests_mock.Adapter() mock_adapter["adapter"] = adapter client = Socrata(DOMAIN, APPTOKEN, session_adapter=mock_adapter) response_data = "403_response_json.txt" setup_mock(adapter, "POST", response_data, 403, reason="Forbidden") data = [{"theme": "Surfing", "artist": "Wavves", "title": "King of the Beach", "year": "2010"}] try: client.upsert(DATASET_IDENTIFIER, data) except Exception as e: assert isinstance(e, requests.exceptions.HTTPError) else: raise AssertionError("No exception raised for bad request.")
def upsert_to_socrata(client: Socrata, dataset_identifier: str, zipper: dict) -> None: """ Upsert data to Socrata dataset. :param client: Socrata connection client :param dataset_identifier: Unique Socrata dataset identifier. Not the data page identifier but primary page id. :param zipper: dictionary of zipped results (headers and data values) :return: None """ try: client.upsert(dataset_identifier=dataset_identifier, payload=zipper, content_type='json') except Exception as e: print("Error upserting to Socrata: {}. {}".format( dataset_identifier, e)) return
def lambda_handler(event, context): ''' Method called by Amazon Web Services when the lambda trigger fires. This lambda is configured to be triggered by file creation in the ITS DataHub Sandbox s3 bucket ("usdot-its-cvpilot-public-data" or "test-usdot-its-cvpilot-public-data"). When a new file is added to the Sandbox s3 bucket, this lambda function will read the new JSON newline file, perform data transformation, upsert the new data records to the corresponding Socrata data set on data.transportation.gov, and remove the oldest records from the Socrata dataset to keep the data set at a manageable size. Parameters: event, context: Amazon Web Services required parameters. Describes triggering event. ''' # Read data from the newly deposited file and # perform data transformation on the records out_recs = [] for bucket, key in lambda_to_socrata_util.get_fps_from_event(event): raw_recs = lambda_to_socrata_util.process_s3_file(bucket, key) for raw_rec in raw_recs: split_raw_recs = split_tim_tdf(raw_rec) processed_raw_recs = [process_tim(i) for i in split_raw_recs] out_recs += processed_raw_recs if len(out_recs) == 0: logger.info("No new data found. Exit script") return # Upsert the new records to the corresponding Socrata data set logger.info("Connecting to Socrata") client = Socrata("data.transportation.gov", SOCRATA_API_KEY, SOCRATA_USERNAME, SOCRATA_PASSWORD, timeout=400) logger.info("Transform record dtypes according to Socrata dataset") col_dtype_dict = lambda_to_socrata_util.get_col_dtype_dict( client, SOCRATA_DATASET_ID) float_fields = ['randomNum', 'metadata_generatedAt_timeOfDay'] out_recs = [ lambda_to_socrata_util.mod_dtype(r, col_dtype_dict, float_fields) for r in out_recs ] logger.info("Uploading {} new records".format(len(out_recs))) uploadResponse = client.upsert(SOCRATA_DATASET_ID, out_recs) logger.info(uploadResponse)
def test_upsert(): mock_adapter = {} mock_adapter["prefix"] = PREFIX adapter = requests_mock.Adapter() mock_adapter["adapter"] = adapter client = Socrata(DOMAIN, APPTOKEN, username=USERNAME, password=PASSWORD, session_adapter=mock_adapter) response_data = "upsert_songs.txt" data = [{"theme": "Surfing", "artist": "Wavves", "title": "King of the Beach", "year": "2010"}] setup_mock(adapter, "POST", response_data, 200) response = client.upsert(DATASET_IDENTIFIER, data) assert isinstance(response, dict) assert response.get("Rows Created") == 1 client.close()
def test_upsert_exception(): mock_adapter = {} mock_adapter["prefix"] = PREFIX adapter = requests_mock.Adapter() mock_adapter["adapter"] = adapter client = Socrata(DOMAIN, APPTOKEN, session_adapter=mock_adapter) path = "/songs.json" response_data = "403_response_json.txt" set_up_mock(adapter, "POST", response_data, 403, reason="Forbidden") data = [{"theme": "Surfing", "artist": "Wavves", "title": "King of the Beach", "year": "2010"}] try: response = client.upsert(PATH, data) except Exception, e: assert isinstance(e, requests.exceptions.HTTPError)
def log_request(): print 'log_request', request.url print (socrata_app_token and socrata_username and socrata_password and socrata_access_log_domain and socrata_access_log_datasetid) if socrata_app_token and socrata_username and socrata_password and socrata_access_log_domain and socrata_access_log_datasetid: client = Socrata(socrata_access_log_domain, socrata_app_token, username=socrata_username, password=socrata_password) # fix this, see http://esd.io/blog/flask-apps-heroku-real-ip-spoofing.html if not request.headers.getlist("X-Forwarded-For"): ip = request.remote_addr else: ip = request.headers.getlist("X-Forwarded-For")[0] # for some reason a space and a * is causing an upsert error so am replacing space with %20 url = str(request.url).replace(" ", "%20") # See Socrata's time format https://support.socrata.com/hc/en-us/articles/202949918-Importing-Data-Types-and-You- dtnow = datetime.utcnow().isoformat() dtnow = dtnow[:dtnow.index('.')]+'Z' data = [{'datetime': dtnow, 'ip_address': str(ip), 'url': url}] print data print 'upsert', client.upsert(socrata_access_log_datasetid, data)
def test_upsert_exception(): mock_adapter = {} mock_adapter["prefix"] = PREFIX adapter = requests_mock.Adapter() mock_adapter["adapter"] = adapter client = Socrata(DOMAIN, APPTOKEN, session_adapter=mock_adapter) path = "/songs.json" response_data = "403_response_json.txt" set_up_mock(adapter, "POST", response_data, 403, reason="Forbidden") data = [{ "theme": "Surfing", "artist": "Wavves", "title": "King of the Beach", "year": "2010" }] try: response = client.upsert(PATH, data) except Exception, e: assert isinstance(e, requests.exceptions.HTTPError)
texas_df['UID'] = texas_df.apply(make_uid, axis=1) texas_df['state'] = "Texas" ########## # Combine the Data Frames ########## # Concatenate the dataframes into one unified DF df_combined = pd.concat([tracts_df, top_30_df, counties_df, texas_df], sort=True) # Replace NaN values that Socrata doesn't like df_combined['place'] = df_combined['place'].fillna("N/A") df_combined['state'] = df_combined['state'].fillna("N/A") df_combined['county'] = df_combined['county'].fillna("N/A") df_combined['tract'] = df_combined['tract'].fillna(0) json_data = df_combined.to_dict(orient="records") ########## # Send Data to Socrata ########## try: socrata_upsert = socrata_client.upsert('evvr-qwa7', json_data) print(socrata_upsert) except Exception as e: print("Exception, could not insert: " + str(e)) print("Query: '%s'" % json_data)
class MDSSocrata: __slots__ = [ "mds_config", "mds_http_graphql", "mds_socrata_dataset", "provider_name", "query", "client", ] def __del__(self): """ Make sure the client is closed whenever the class is destructed. :return: """ try: self.client.close() finally: self.client = None def __str__(self) -> str: """ Returns the configuration as string. :return: """ return str({ "SOCRATA_DATA_ENDPOINT": self.mds_config.get_setting("SOCRATA_DATA_ENDPOINT", None), "SOCRATA_DATASET": self.mds_config.get_setting("SOCRATA_DATASET", None), "SOCRATA_APP_TOKEN": self.mds_config.get_setting("SOCRATA_APP_TOKEN", None), "SOCRATA_KEY_ID": self.mds_config.get_setting("SOCRATA_KEY_ID", None), "SOCRATA_KEY_SECRET": self.mds_config.get_setting("SOCRATA_KEY_SECRET", None), }) def __init__(self, provider_name, mds_config, mds_gql): """ Constructor for the init class. :param str provider_name: The name of the provider :param MDSConfig mds_config: The configuration class where we can gather our endpoint :param MDSGraphQLRequest mds_gql: The http graphql class we need to make requests :return: """ self.provider_name = provider_name self.mds_config = mds_config self.mds_http_graphql = mds_gql self.mds_socrata_dataset = self.mds_config.get_setting( "SOCRATA_DATASET", None) self.client = Socrata( self.mds_config.get_setting("SOCRATA_DATA_ENDPOINT", None), self.mds_config.get_setting("SOCRATA_APP_TOKEN", None), username=self.mds_config.get_setting("SOCRATA_KEY_ID", None), password=self.mds_config.get_setting("SOCRATA_KEY_SECRET", None), timeout=20, ) self.query = Template(""" query getTrips { api_trips( where: { provider: { provider_name: { _eq: "$provider_name" }} end_time: { _gte: "$time_min" }, _and: { end_time: { _lt: "$time_max" }} } ) { trip_id: id device_id: device { id } vehicle_type trip_duration trip_distance start_time end_time modified_date council_district_start council_district_end census_geoid_start census_geoid_end } } """) def get_query(self, time_min, time_max) -> str: """ Returns a string with the new query based on limit and offset. :param str time_min: The minimum time the trip ended :param str time_max: The maximum time the trip ended :return str: """ if isinstance(self.provider_name, str) is False: raise Exception("provider_name must be a string") if isinstance(time_min, str) is False: raise Exception("time_min must be a sql datetime string") if isinstance(time_max, str) is False: raise Exception("time_max must be a sql datetime string") return self.query.substitute(provider_name=self.provider_name, time_min=time_min, time_max=time_max) def get_data(self, time_min, time_max) -> dict: """ Gathers data from the API endpoint :param str time_min: :param str time_max: :return dict: """ query = self.get_query(time_min=time_min, time_max=time_max) return self.mds_http_graphql.request(query) def get_config(self) -> dict: """ Returns the configuration dictionary for testing. :return dict: """ return self.mds_config.get_config() def save(self, data) -> dict: """ Upserts the data into Socrata :param dict data: The data to be saved unto socrata. :return dict: """ data = list(map(self.clean_trip_device_id, data)) data = list(map(self.parse_datetimes, data)) data = list(map(self.check_geos_data, data)) if self.client is not None: return self.client.upsert(self.mds_socrata_dataset, data) else: raise Exception( "The socrata client is not initialized correctly, check your API credentials." ) def parse_datetimes(self, data) -> dict: """ Parses the PostgreSQL datetime with timezone into an insertable socrata timestamp in CST time. It also adds necessary fields, such as year, month, hour and day of the week. :param data: :return: """ fmt = "%Y-%m-%dT%H:%M:%S" end_time = self.datetime_to_cst(data["end_time"]) data["start_time"] = self.datetime_to_cst( data["start_time"]).strftime(fmt) data["end_time"] = end_time.strftime(fmt) data["modified_date"] = self.datetime_to_cst( data["modified_date"]).strftime(fmt) data["year"] = end_time.year data["month"] = end_time.month data["hour"] = end_time.hour data["day_of_week"] = end_time.weekday() return data @staticmethod def check_geos_data(data) -> dict: """ Parses the PostgreSQL datetime with timezone into an insertable socrata timestamp in CST time. It also adds necessary fields, such as year, month, hour and day of the week. :param data: :return: """ geos_fields = [ "council_district_start", "council_district_end", "census_geoid_start", "census_geoid_end", ] for field in geos_fields: if data[field] is None or data[field] == "None": data[field] = 0 else: data[field] = data.get(field, 0) return data @staticmethod def datetime_to_cst(timestamp): return parser.parse(timestamp).astimezone(tz.gettz("CST")) @staticmethod def clean_trip_device_id(trip) -> dict: """ Transforms the device_id field from a dictionary into a string value :param dict trip: The trip dictionary being transformed :return dict: """ trip["device_id"] = trip["device_id"]["id"] return trip
class SocrataDataset(object): def __init__(self, dataset_id, socrata_client=None, socrata_params={}, float_fields=[]): self.dataset_id = dataset_id self.client = socrata_client if not socrata_client and socrata_params: self.client = Socrata(**socrata_params) self.socrata_params = socrata_params self.col_dtype_dict = self.get_col_dtype_dict() self.float_fields = float_fields def get_col_dtype_dict(self): ''' Retrieve data dictionary of a Socrata data set in the form of a dictionary, with the key being the column name and the value being the column data type Returns: data dictionary of a Socrata data set in the form of a dictionary, with the key being the column name and the value being the column data type ''' dataset_col_meta = self.client.get_metadata(self.dataset_id)['columns'] col_dtype_dict = { col['name']: col['dataTypeName'] for col in dataset_col_meta } return col_dtype_dict def mod_dtype(self, rec, col_dtype_dict=None, float_fields=None): ''' Make sure the data type of each field in the data record matches the data type of the field in the Socrata data set. Parameters: rec: dictionary object of the data record col_dtype_dict: data dictionary of a Socrata data set in the form of a dictionary, with the key being the column name and the value being the column data type float_fields: list of fields that should be a float Returns: dictionary object of the data record, with number, string, and boolean fields modified to align with the data type of the corresponding Socrata data set ''' col_dtype_dict = col_dtype_dict or self.col_dtype_dict float_fields = float_fields or self.float_fields identity = lambda x: x dtype_func = {'number': float, 'text': str, 'checkbox': bool} out = {} for k, v in rec.items(): if k in float_fields and k in col_dtype_dict: out[k] = float(v) elif k in col_dtype_dict: if v is not None and v is not '': out[k] = dtype_func.get( col_dtype_dict.get(k, 'nonexistentKey'), identity)(v) out = {k: v for k, v in out.items() if k in col_dtype_dict} return out def create_new_draft(self): draftDataset = requests.post( 'https://{}/api/views/{}/publication.json'.format( self.client.domain, self.dataset_id), auth=(self.socrata_params['username'], self.socrata_params['password']), params={'method': 'copySchema'}) logger.info(draftDataset.json()) draftId = draftDataset.json()['id'] return draftId def publish_draft(self, draftId): time.sleep(5) publishResponse = requests.post( 'https://{}/api/views/{}/publication.json'.format( self.client.domain, draftId), auth=(self.socrata_params['username'], self.socrata_params['password'])) logger.info(publishResponse.json()) return publishResponse def delete_draft(self, draftId): time.sleep(5) deleteResponse = self.client.delete(draftId) if deleteResponse.status_code == 200: logger.info('Empty draft {} has been discarded.'.format(draftId)) return deleteResponse def clean_and_upsert(self, recs, dataset_id=None): dataset_id = dataset_id or self.dataset_id out_recs = [self.mod_dtype(r) for r in recs] uploadResponse = self.client.upsert(dataset_id, out_recs) return uploadResponse
offset=offset, date_limit=get_date_limit(), initial_date_limit=get_initial_date_limit()) offset += limit data = run_hasura_query(query) if "data" not in data: print("query: ", query) print("data: ", data) raise RuntimeError("There was a problem gathering data.") # Format records records = config["formatter"](data, config["formatter_config"]) # Upsert records to Socrata client.upsert(config["dataset_uid"], records) total_records += len(records) if len(records) == 0: print(f'{total_records} {config["table"]} records upserted.') print(f'Completed {config["table"]} table.') elif total_records != 0: print(f"{total_records} records upserted") # Terminate Socrata connection client.close() # Stop timer and print duration end = time.time() hours, rem = divmod(end - start, 3600) minutes, seconds = divmod(rem, 60)
'bounce_rate', 'entrance_rate', 'exit_rate' ] with open(os.path.join(report_folder, 'all-pages.json'), encoding='utf8') as json_file: data = json.load(json_file) taken_at = datetime.strptime(data['taken_at'], "%Y-%m-%dT%H:%M:%S.%fZ") datestamp = (taken_at - timedelta(days=1)).strftime('%Y-%m-%d') for page in data['data']: page = clean_ga_keys(page) visits = int(page.pop('visits')) page['date'] = datestamp page['id'] = datestamp + page['domain'] + page['page'] page['pageviews'] = visits if visits > 0: page['bounce_rate'] = 100 * float(page['bounces']) / visits page['entrance_rate'] = 100 * float(page['entrances']) / visits page['exit_rate'] = 100 * float(page['exits']) / visits for key in percentages: page[key] = round(float(page.get(key, 0)), 2) upsert_chunks = [ data['data'][x:x+soda_batch_size] for x in range(0, len(data['data']), soda_batch_size) ] for chunk in upsert_chunks: soda_client.upsert(os.environ["SOCRATA_RESOURCEID"], chunk)
class SocrataClient(Config): def __init__(self, **kwargs): self.client = None self.service_data = kwargs.get('service_data', None) self.dataset_id = kwargs.get('dataset_id', None) super(SocrataClient, self).__init__() def _connect(self): self.client = Socrata(self.credentials['socrata_url'], self.credentials['socrata_token'], self.credentials['socrata_username'], self.credentials['socrata_password']) # See: # https://stackoverflow.com/questions/47514331/readtimeout-error-for-api-data-with-sodapy-client self.client.timeout = SOCRATA_MASTER_TIMEOUT def upsert(self): """ NOTE: Unique ID in Socrata is set via UI https://support.socrata.com/hc/en-us/articles/ \ 360008065493-Setting-a-Row-Identifier-in-the-Socrata-Data-Management-Experience """ if self.dataset_id is None: logging.error('[SOCRATA_CLIENT] No Socrata dataset ID provided') return if self.client is None: self._connect() groomed_data = self.dict_transform() data = self.send_data(groomed_data=groomed_data) logging.info('[SOCRATA_CLIENT] Upserting data') logging.info(self.client.upsert(self.dataset_id, data)) def send_data(self, groomed_data): data = [] for row in groomed_data: for key, entry in row.items(): # Deformat and replace dates if isinstance(entry, datetime.datetime): replacement = self.__deformat_date(entry) row[key] = replacement data.append(row) return data def dict_transform(self): final_report = [] for key, entry in self.service_data.items(): # Add ID, go into dict row = {'id': key} for subkey, subent in entry.items(): row[subkey] = subent final_report.append(row) return final_report def json_to_csv(self, filename='soctemp.csv'): """ Convert JSON to temporary CSV file for initial upload """ tempfile = os.path.join(ROOT_DIR, 'tmp', filename) fieldnames = set() final_report = [] for key, entry in self.service_data.items(): fieldnames.add('id') # Add ID, go into dict row = {'id': key} for subkey, subent in entry.items(): row[subkey] = subent fieldnames.add(subkey) final_report.append(row) with open(tempfile, 'w', newline='') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=list(fieldnames)) writer.writeheader() for row in final_report: writer.writerow(row) @staticmethod def __deformat_date(date): return date.isoformat()
def lambda_handler(event, context): ''' Method called by Amazon Web Services when the lambda trigger fires. This lambda is configured to be triggered at regular intervals. It will remove old records from Socrata data sets to keep it at a certain size. Parameters: event, context: Amazon Web Services required parameters. Describes triggering event. ''' logger.info("Connecting to Socrata") client = Socrata(SOCRATA_DOMAIN, SOCRATA_API_KEY, SOCRATA_USERNAME, SOCRATA_PASSWORD, timeout=400) # check number of records in each socrata data set # keep track of data sets that have more than x records id_toDelete_dict = {} for datasetId in SOCRATA_DATASET_IDS: r = requests.get("https://{}/resource/{}.json?$select=count(*)".format( SOCRATA_DOMAIN, datasetId), auth=HTTPBasicAuth(SOCRATA_USERNAME, SOCRATA_PASSWORD)) r = r.json() count = int(r[0]['count']) if count > DATASET_LIMIT: toDelete = count - DATASET_LIMIT id_toDelete_dict[datasetId] = toDelete logger.info( 'data set {} has {} rows - removing oldest {} rows'.format( datasetId, count, toDelete)) else: logger.info('data set {} has {} rows - no action needed'.format( datasetId, count)) time.sleep(5) # loop through data set to remove excess records # remove the oldest records to keep the data set at a manageable size # wait at end of each loop if hitting error wait = False while bool(id_toDelete_dict): for datasetId, toDelete in id_toDelete_dict.items(): try: # remove at most 10000 records at a time to avoid timeouts N = min(toDelete, 10000) retrievedRows = client.get(datasetId, limit=N, exclude_system_fields=False, select=':id') deleteList = [{ ':id': row[':id'], ':deleted': True } for row in retrievedRows] result = client.upsert(datasetId, deleteList) logger.info(id_toDelete_dict) toDelete = toDelete - N id_toDelete_dict[datasetId] = toDelete if not toDelete > 0: del id_toDelete_dict[datasetId] if len(id_toDelete_dict) == 1: time.sleep(1) except: logger.info( 'Error while deleting {} records in {}. Skipping for now'. format(N, datasetId)) wait = True if wait: logger.info('Slow down a bit. Wait 2 seconds.') time.sleep(2) wait = False
def run(**kwargs): """ Runs the program based on the above flags, the values will be passed to kwargs as a dictionary :param dict kwargs: The values specified by click decorators. :return: """ # Start timer start = time.time() SOCRATA_DATASET = mds_config.get_setting("SOCRATA_DATASET", None) SOCRATA_ENDPOINT = mds_config.get_setting("SOCRATA_DATA_ENDPOINT", None) SOCRATA_APP_TOKEN = mds_config.get_setting("SOCRATA_APP_TOKEN", None) SOCRATA_KEY_ID = mds_config.get_setting("SOCRATA_KEY_ID", None) SOCRATA_KEY_SECRET = mds_config.get_setting("SOCRATA_KEY_SECRET", None) HASURA_ENDPOINT = mds_config.get_setting("HASURA_ENDPOINT", None) # Prep Hasura query HASURA_HEADERS = { "Content-Type": "application/json", "X-Hasura-Admin-Secret": mds_config.get_setting("HASURA_ADMIN_KEY", None) } time_min = kwargs.get("time_min", None) time_max = kwargs.get("time_max", None) query = """ query getTrips($timeMin: timestamptz!, $timeMax: timestamptz!) { api_trips( where: { end_time: { _gte: $timeMin } _and: { end_time: { _lt: $timeMax } }, } ) { trip_id: id device_id: device { id } vehicle_type trip_duration trip_distance start_time end_time modified_date council_district_start council_district_end census_geoid_start census_geoid_end } } """ response = requests.post(HASURA_ENDPOINT, data=json.dumps({ "query": query, "variables": { "timeMin": time_min, "timeMax": time_max, } }), headers=HASURA_HEADERS) def datetime_to_cst(timestamp) -> str: """ Returns a datetime in CST timezone :param timestamp: :return: """ try: return parser.parse(timestamp).astimezone(tz.gettz("CST")) except: return timestamp def parse_datetimes(data) -> dict: """ Parses the PostgreSQL datetime with timezone into an insertable socrata timestamp in CST time. It also adds necessary fields, such as year, month, hour and day of the week. :param data: :return: """ fmt = "%Y-%m-%dT%H:%M:%S" end_time = datetime_to_cst(data["end_time"]) data["start_time"] = datetime_to_cst(data["start_time"]).strftime(fmt) data["end_time"] = end_time.strftime(fmt) data["modified_date"] = datetime_to_cst( data["modified_date"]).strftime(fmt) data["year"] = end_time.year data["month"] = end_time.month data["hour"] = end_time.hour data["day_of_week"] = end_time.weekday() return data def clean_trip(trip) -> dict: """ Transforms the device_id field from a dictionary into a string value :param dict trip: The trip dictionary being transformed :return dict: """ trip["device_id"] = trip["device_id"]["id"] trip = parse_datetimes(trip) return trip # # Generate the # try: trips = list(map(clean_trip, response.json()["data"]["api_trips"])) except: trips = [] if len(trips) == 0: print("Nothing to do here, exiting.") exit(0) else: print(f"Total trips to process: {len(trips)}") print("Connecting to Socrata") # Setup connection to Socrata client = Socrata( SOCRATA_ENDPOINT, SOCRATA_APP_TOKEN, username=SOCRATA_KEY_ID, password=SOCRATA_KEY_SECRET, timeout=7200, ) print( "Making upsert... this can take a while but not more than 2 hours. Please wait." ) client.upsert(SOCRATA_DATASET, trips) # Stop timer and print duration end = time.time() hours, rem = divmod(end - start, 3600) minutes, seconds = divmod(rem, 60) print("Finished in: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
import requests from sodapy import Socrata import json # store api token in creds.txt with open('creds.json', 'r') as f: creds = json.loads(f.read()) client = Socrata("odn.data.socrata.com", creds["token"], username=creds['username'], password=creds['password']) data = requests.get('https://odn.data.socrata.com/resource/jwbj-xtgt.json?$where=region_name%20IS%20NOT%20NULL%20AND%20location%20IS%20NULL').json() for row in data: if len(row['region_name'].split(',')) > 2: location = requests.get('http://nominatim.openstreetmap.org/search/?q=%s&format=json' % (row['region_name'])).json() else: location = requests.get('http://nominatim.openstreetmap.org/search/?q=%s,usa&format=json' % (row['region_name'])).json() if location: location = location[0] row['location'] = "(%s, %s)" % (location['lat'], location['lon']) client.upsert("k53q-ytmx", data)
class SocrataDataset(object): """ Helper class for interacting with datasets in Socrata. """ logger = None def __init__(self, dataset_id, socrata_client=None, socrata_params=None, float_fields=None, logger=None): """ Initialization function of the SocrataDataset class. Parameters: dataset_id: 4x4 ID of the Socrata draft (e.g. x123-bc12) client: Optional parameter if the user chooses to pass in the socrata_params parameter. If user chooses not to pass in socrata_params, they can also pass in an sodapy.Socrata object that has been initialized with the proper socrata credentials. socrata_params: Optional parameter if the user choose to pass in the socrata_client parameter. Dictionary object containing Socrata credentials. Must include the following fields: 'username', 'password', 'app_token', 'domain'. float_fields: An array of Socrata field names that should be of float types (numbers with decimals). logger: Optional parameter. Could pass in a logger object or not pass in anything. If a logger object is passed in, information will be logged instead of printed. If not, information will be printed. """ self.socrata_params = {} self.float_fields = [] self.dataset_id = dataset_id self.client = socrata_client if not socrata_client and socrata_params: self.client = Socrata(**socrata_params) self.socrata_params = socrata_params self.col_dtype_dict = self.get_col_dtype_dict() self.float_fields = float_fields self.print_func = print if logger: self.print_func = logger.info def get_col_dtype_dict(self): """ Retrieve data dictionary of a Socrata data set in the form of a dictionary, with the key being the column name and the value being the column data type Returns: Data dictionary of a Socrata data set in the form of a dictionary, with the key being the column name and the value being the column data type. """ dataset_col_meta = self.client.get_metadata(self.dataset_id)['columns'] col_dtype_dict = { col['name']: col['dataTypeName'] for col in dataset_col_meta } return col_dtype_dict def mod_dtype(self, rec, col_dtype_dict=None, float_fields=None): """ Make sure the data type of each field in the data record matches the data type of the field in the Socrata data set. Parameters: rec: dictionary object of the data record col_dtype_dict: data dictionary of a Socrata data set in the form of a dictionary, with the key being the column name and the value being the column data type float_fields: list of fields that should be a float Returns: Dictionary object of the data record, with number, string, and boolean fields modified to align with the data type of the corresponding Socrata data set. """ col_dtype_dict = col_dtype_dict or self.col_dtype_dict float_fields = float_fields or self.float_fields identity = lambda x: x dtype_func = {'number': float, 'text': str, 'checkbox': bool} out = {} for k, v in rec.items(): if k in float_fields and k in col_dtype_dict: out[k] = float(v) elif (k in col_dtype_dict and v not in [None, '']): out[k] = dtype_func.get( col_dtype_dict.get(k, 'nonexistentKey'), identity)(v) out = {k: v for k, v in out.items() if k in col_dtype_dict} return out def create_new_draft(self): """ Create a new draft of the current dataset. Returns: Draft ID of the new draft. """ draft_dataset = requests.post( 'https://{}/api/views/{}/publication.json'.format( self.client.domain, self.dataset_id), auth=(self.socrata_params['username'], self.socrata_params['password']), params={'method': 'copySchema'}) logger.info(draft_dataset.json()) draft_id = draft_dataset.json()['id'] return draft_id def publish_draft(self, draft_id): """ Publish the Socrata draft specified. Parameters: draft_id: 4x4 ID of the Socrata draft (e.g. x123-bc12) Returns: Response of the publish draft request. """ time.sleep(5) publish_response = requests.post( 'https://{}/api/views/{}/publication.json'.format( self.client.domain, draft_id), auth=(self.socrata_params['username'], self.socrata_params['password'])) logger.info(publish_response.json()) return publish_response def delete_draft(self, draft_id): """ Delete the Socrata draft specified. Parameters: draft_id: 4x4 ID of the Socrata draft (e.g. x123-bc12) Returns: Response of the delete draft request. """ time.sleep(5) delete_response = self.client.delete(draft_id) if delete_response.status_code == 200: logger.info('Empty draft {} has been discarded.'.format(draft_id)) return delete_response def clean_and_upsert(self, recs, dataset_id=None): """ Publish the Socrata draft specified. Parameters: recs: an array of dictionary objects of the data to upsert. dataset_id: 4x4 ID of the Socrata dataset (e.g. x123-bc12) to perform upserts to. This parameter is not required if you are performing upserts to the dataset you've initialized this class with. Returns: A dictionary object with the following fields: 'Rows Deleted' - number of rows deleted due to the upsert request 'Rows Updated' - number of rows updated due to the upsert request 'Rows Created' - number of rows created due to the upsert request """ dataset_id = dataset_id or self.dataset_id out_recs = [self.mod_dtype(r) for r in recs] upload_response = self.client.upsert(dataset_id, out_recs) return upload_response