def get_data(start):
    pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

    params = {
        "select": "trip_id,provider_id,start_latitude,start_longitude,end_latitude,end_longitude",
        "council_district_start": "is.null",
        "limit": interval,
        "offset": start,
    }

    print('get data')
    trips = pgrest.select(params)

    print('split_trips')
    trips = split_trips(trips, field_map)

    print('create_points')
    trips = create_points(trips)

    print('point in grid')
    trips = point_in_poly(points=trips, polys=grid, null_val="OUT_OF_BOUNDS")

    print('point in council district')
    trips = point_in_poly(
        points=trips,
        polys=districts,
        row_property_key="council_district",
        feature_property_key="district_n",
        null_val=0,
    )

    print('merge trips')
    trips = merge_trips(trips)

    print('reduce fields')
    trips = reduce_fields(
        trips,
        [field['name'] for field in config.FIELDS if field.get('upload_postgrest')],
    )

    print('post trips')
    post_trips(pgrest, trips)

    return trips
def main():
    args = cli_args()

    auth = KNACK_CREDENTIALS[args.app_name]

    cfg_dataset = cfg[args.dataset]

    filters = knackutil.date_filter_on_or_after(
        args.last_run_date, cfg_dataset["modified_date_field_id"]
    )

    kn = knackpy_wrapper(cfg_dataset, auth, filters=filters)

    if not kn.data:
        return 0

    # Filter data for records that have been modified after the last
    # job run
    last_run_timestamp = arrow.get(args.last_run_date).timestamp * 1000

    kn.data = filter_by_date(
        kn.data, cfg_dataset["modified_date_field"], last_run_timestamp
    )

    if not kn.data:
        return 0

    pgrest = Postgrest(cfg_dataset["pgrest_base_url"], auth=JOB_DB_API_TOKEN)

    for record in kn.data:
        # convert millisecond timestamp to ISO 8601
        record[cfg_dataset["modified_date_field"]] = arrow.get(
            record[cfg_dataset["modified_date_field"]] / 1000
        ).format()

    kn.data = datautil.lower_case_keys(kn.data)

    pgrest.upsert(kn.data)

    return len(kn.data)
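# --- Illustrative sketch (not part of the script above) ---
# A minimal, hedged example of the millisecond-to-ISO conversion performed in
# the loop above, assuming only the `arrow` library; the timestamp is made up.
import arrow

millis = 1564000000000  # hypothetical Knack "modified date" in milliseconds

# arrow.get() accepts a unix timestamp in seconds, and .format() with no
# arguments renders an ISO-8601-style string, e.g. "2019-07-24 20:26:40+00:00".
iso_string = arrow.get(millis / 1000).format()
print(iso_string)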
def main():
    def async_wrapper(end_time):
        try:
            t0 = time.time()
            data = get_data(client, end_time, interval, paging)
            elapsed = time.time() - t0
            logging.info('Request Time: {} seconds for {} records'.format(
                elapsed, len(data)))
            min_max(data)
            data = parse_routes(data)
            data = drop_dupes(data)
            data = [{
                field: row[field]
                for field in config.FIELDS if field.get('upsert_mds')
            } for row in data]
            data = floats_to_iso(
                data, [field for field in config.FIELDS if field.get('datetime')])
            if data:
                results = post_data(pgrest, data)
            return data
        except Exception as e:
            logging.error(e)
            logging.error("{} to {} FAILED".format(end_time - interval, end_time))

    args = cli_args()
    provider_name = args.provider_name
    cfg = secrets.PROVIDERS[provider_name]
    pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

    start = args.start
    end = args.end

    if not start:
        # use the most recent record as the start date
        start = most_recent(pgrest, cfg["provider_id"])
        start = to_unix(start)

    if not end:
        # use current time as the end date
        end = int(datetime.today().timestamp())

    interval = cfg["interval"]
    paging = cfg["paging"]

    if cfg.pop("time_format") == "mills":
        # convert unix seconds to milliseconds
        start, end, interval = int(start * 1000), int(end * 1000), int(interval * 1000)

    if not cfg["client_params"].get("token"):
        auth_info = cfg.get("oauth")
        url = auth_info.pop("url")
        cfg["client_params"]["token"] = get_token(url, auth_info)

    client = ProviderClient(**cfg["client_params"])

    total = 0
    workers = 6
    pool = ThreadPool(workers)
    results = pool.map(async_wrapper, range(start, end, interval))
    pool.close()
    pool.join()

    for result in results:
        if result:
            total += len(result)

    logging.info(total)
    return total
def main():
    args = cli_args()
    cfg_dataset = cfg[args.dataset]

    limit = cfg_dataset.get("limit")

    if not args.last_run_date or args.replace:
        last_run_date = "1970-01-01T00:00:00"
    else:
        last_run_date = datetime.utcfromtimestamp(int(args.last_run_date))
        last_run_date = last_run_date.isoformat()

    pgrest = Postgrest(cfg_dataset["pgrest_base_url"], auth=JOB_DB_API_TOKEN)

    '''
    The `interval` is the number of records which will be processed on each
    loop. It serves as the `offset` increment, so it's the means by which we
    chunk records. 1000 matches the max number of records returned by our
    postgres instance, so each loop = 1 request to the source db (PostgREST).
    We have disabled pagination in our requests (see below), so it's very
    important that our `interval` does not exceed the number of records the
    API will return in one request.
    '''
    interval = 1000
    offset = 0
    records_processed = 0

    while True:
        '''
        Download records in chunks, posting each chunk to Socrata. Note that
        the `order` param ensures that each offset request returns the
        expected chunk of records, by preserving the order in which records
        are returned. Ordering slows down the response from PostgREST, but
        it's necessary to ensure consistent results.
        '''
        params = {
            cfg_dataset["modified_date_field"]: f"gte.{last_run_date}",
            "limit": limit,
            "order": "{}.asc".format(cfg_dataset["modified_date_field"]),
            "offset": offset,
        }

        '''
        We disable `pagination` in this request because we are simulating
        pagination by manually passing an `offset` to each request.
        '''
        records = pgrest.select(params=params, pagination=False)

        print("got {} records".format(len(records)))

        if not records:
            break

        if args.destination[0] == "socrata":
            date_fields = cfg_dataset.get("date_fields")
            pub = socrata_pub(records, cfg_dataset, args.replace, date_fields=date_fields)
            print("Published {} records.".format(len(records)))

        offset += interval
        records_processed += len(records)

    return records_processed
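# --- Illustrative sketch (not part of the script above) ---
# The chunking pattern above, isolated: page through a PostgREST table by
# manually advancing `offset` while keeping a stable `order`, with the
# client's own pagination disabled. The URL, token, and column name below
# are placeholders.
from pypgrest import Postgrest

client = Postgrest("https://example.com/my_table", auth="my-api-token")

chunk_size = 1000  # should not exceed the server's max rows per response
offset = 0

while True:
    chunk = client.select(
        params={
            "limit": chunk_size,
            "offset": offset,
            "order": "modified_date.asc",  # stable ordering keeps offsets consistent
        },
        pagination=False,  # we page manually via `offset`
    )
    if not chunk:
        break
    # ...process/publish the chunk here...
    offset += chunk_size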
import requests
import pdb
import json
from datetime import datetime, timedelta

from pypgrest import Postgrest
from tdutils import argutil

from config.secrets import *

form_id = "44359e32-1a7f-41bd-b53e-3ebc039bd21a"

key = FULCRUM_CRED.get("api_key")

# create postgrest instance
pgrest = Postgrest(
    "http://transportation-data.austintexas.io/signal_pms",
    auth=JOB_DB_API_TOKEN
)


def get_pgrest_records():
    """Summary

    Returns:
        TYPE: Description
    """
    # the datetime conversion for modified_date is not right; the time part is missing
    params = {}
    results = pgrest.select(params=params)

    if len(results) != 0:
import time
import copy

import _setpath
from config.secrets import *
from config.knack.config import SIGNAL_PMS_POSTGRE_KNACK as cfg

# from tdutils.pgrestutil import Postgrest
from tdutils import argutil, datautil
from pypgrest import Postgrest
import pandas as pd

key = FULCRUM.get("api_key")

# create postgrest instance
pgrest = Postgrest(cfg.get("postgre_url"), auth=JOB_DB_API_TOKEN)


def get_postgre_records():
    """Summary

    Get postgreSQL records for all signal PMs currently in postgreSQL.

    Returns:
        list: list of dictionaries of all pgrest records
    """
    params = {}
    postgre_records = pgrest.select(params=params)
    postgre_records_df = pd.DataFrame.from_dict(postgre_records)

    # temporary fix to remove duplicate pm records
def main():
    args = cli_args()
    provider_name = args.provider_name
    cfg = secrets.PROVIDERS[provider_name]
    pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

    start = args.start
    end = args.end
    offset = cfg["time_offset_seconds"]

    if not start:
        # use the most recent record as the start date (minus the offset)
        start = most_recent(pgrest, cfg["provider_id"])
        start = to_unix(start)
        start = start - offset

    if not end:
        # use current time as the end date
        end = int(datetime.today().timestamp())

    interval = cfg["interval"]

    if cfg.get("time_format") == "mills":
        # convert unix seconds to milliseconds
        start, end, interval = int(start * 1000), int(end * 1000), int(interval * 1000)

    auth_type = cfg.get("auth_type")

    if not cfg.get("token") and auth_type.lower() != "httpbasicauth":
        token_res = get_token(cfg["auth_url"], cfg["auth_data"])
        cfg["token"] = token_res[cfg["auth_token_res_key"]]

    client_params = build_client_params(cfg)
    client = ProviderClient(**client_params)

    total = 0

    for i in range(start, end, interval):
        data = get_data(client, i, interval, cfg["paging"])
        print(start)

        if data:
            data = parse_routes(data)
            data = drop_dupes(data)
            data = [{
                field["name"]: row[field["name"]]
                for field in config.FIELDS
                if field.get("upload_mds")
            } for row in data]
            data = floats_to_iso(
                data,
                [field["name"] for field in config.FIELDS if field.get("datetime")],
            )
            post_data(pgrest, data)
            total += len(data)
        else:
            continue

    return total
class PerfMetDB:
    """
    Represents a connection to the PostgREST instance of the performance metrics tables.
    """

    def __init__(self, accessPointJob, accessPointObs, apiKey, needsObs=False):
        """
        Initializes the connection to the PostgREST instance.

        @param accessPointJob: the PostgREST "etl_perfmet_job" table endpoint
        @param accessPointObs: the PostgREST "etl_perfmet_obs" table endpoint
        @param apiKey: the PostgREST API key needed to write to the endpoints
        @param needsObs: set this to True to enable the writing of observations.
        """
        self.jobDB = Postgrest(accessPointJob, auth=apiKey)
        self.obsDB = None
        if needsObs:
            self.obsDB = Postgrest(accessPointObs, auth=apiKey)

    def writeJob(self, perfMet):
        """
        Writes the job information to the job log.
        """
        metadata = {
            "data_source": perfMet.dataSource,
            "stage": perfMet.stage,
            "seconds": perfMet.processingTotal,
            "records": perfMet.records,
            "processing_date": str(perfMet.processingTime),
            "collection_start": str(date_util.localize(perfMet.collectTimeStart))
                if perfMet.collectTimeStart else None,
            "collection_end": str(date_util.localize(perfMet.collectTimeEnd))
                if perfMet.collectTimeEnd else None
        }
        self.jobDB.upsert(metadata)

    def readAllJobs(self, timestampIn):
        """
        Reads all jobs activity for the given processing day of the timestamp.
        """
        day = date_util.roundDay(date_util.localize(timestampIn))
        command = {
            "select": "data_source,stage,seconds,records,processing_date,collection_start,collection_end",
            "processing_date": [
                "gte.%s" % str(day),
                "lt.%s" % str(date_util.localize(day.replace(tzinfo=None) + datetime.timedelta(days=1)))
            ],
            "order": "data_source,stage"
        }
        return self.jobDB.select(params=command)

    def getRecentJobsDate(self):
        """
        Returns the most recent processing date for jobs.
        """
        command = {
            "select": "processing_date",
            "order": "processing_date.desc",
            "limit": 1
        }
        ret = self.jobDB.select(params=command)
        if ret and ret[0] and "processing_date" in ret[0]:
            ret = ret[0]["processing_date"]
        else:
            ret = None
        return ret

    def writeObs(self, perfMet):
        """
        Writes observations to the observations log.
        """
        metadata = []
        if not perfMet.observations:
            return
        for identifier, obs in perfMet.observations.items():
            minTimestamp = obs.minTimestamp
            if minTimestamp:
                if isinstance(minTimestamp, datetime.datetime):
                    minTimestamp = str(date_util.localize(minTimestamp))
            maxTimestamp = obs.maxTimestamp
            if maxTimestamp:
                if isinstance(maxTimestamp, datetime.datetime):
                    maxTimestamp = str(date_util.localize(maxTimestamp))
            metadata.append({
                "data_source": perfMet.dataSource,
                "sensor_name": identifier[0],
                "data_type": identifier[1],
                "data": obs.observation,
                "expected": obs.expected,
                "collection_date": str(obs.collectionDate),
                "timestamp_min": minTimestamp,
                "timestamp_max": maxTimestamp
            })
        self.obsDB.upsert(metadata)

    def readAllObs(self, timestampIn, earlyDate=None, dataSource=None, obsType=None):
        """
        Reads all observations activity for the given collection day of the timestamp.
        """
        if not earlyDate:
            timestampIn = date_util.roundDay(date_util.localize(timestampIn))
            earlyDate = date_util.localize(timestampIn.replace(tzinfo=None) - datetime.timedelta(days=1))
            collDateClause = [
                "gte.%s" % str(timestampIn),
                "lt.%s" % str(date_util.localize(timestampIn.replace(tzinfo=None) + datetime.timedelta(days=1)))
            ]
        else:
            collDateClause = [
                "gt.%s" % str(earlyDate),
                "lte.%s" % str(timestampIn)
            ]
        command = {
            "select": "data_source,sensor_name,data_type,data,expected,collection_date,timestamp_min,timestamp_max",
            "collection_date": collDateClause,
            "order": "data_type,sensor_name,collection_date"
        }
        if dataSource:
            command["data_source"] = "eq.%s" % dataSource
        if obsType:
            command["data_type"] = "eq.%s" % obsType
        return self.obsDB.select(params=command)
def main():
    def async_wrapper(interval):
        start = interval[0]
        end = interval[1]
        entry = "bird [{}, {}]".format(start, end)
        print(entry)
        try:
            t0 = time.time()
            print('get data')
            data = get_data(client, start, end)
            elapsed = time.time() - t0
            logging.info('Request Time: {} seconds for {} records'.format(
                elapsed, len(data)))
            data = parse_routes(data)
            data = drop_dupes(data)
            data = [{
                field['name']: row[field['name']]
                for field in config.FIELDS if field.get('upload_mds')
            } for row in data]
            data = floats_to_iso(data, [
                field['name'] for field in config.FIELDS if field.get('datetime')
            ])
            if data:
                results = post_data(pgrest, data)
            return data
        except Exception:
            logging.error("#FAILED {}".format(entry))

    args = cli_args()
    provider_name = args.provider_name
    cfg = secrets.PROVIDERS[provider_name]
    pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

    if not cfg["client_params"].get("token"):
        auth_info = cfg.get("oauth")
        url = auth_info.pop("url")
        cfg["client_params"]["token"] = get_token(url, auth_info)

    client = ProviderClient(**cfg["client_params"])

    total = 0
    workers = 4
    pool = ThreadPool(workers)
    results = pool.map(async_wrapper, INTERVALS)
    pool.close()
    pool.join()

    for result in results:
        if result:
            total += len(result)

    logging.info(total)
    return total
def main():
    def split_trips(data, fields):
        """
        Split trip into separate rows (one for trip start and one for trip end)
        """
        split = []

        for row in data:
            new_row = {
                "type": "start",
                "x": row[fields["trip_start_x_field"]],
                "y": row[fields["trip_start_y_field"]],
            }
            new_row.update(**row)
            split.append(new_row)

            new_row = {
                "type": "end",
                fields["trip_id_field"]: row[fields["trip_id_field"]],
                "x": row[fields["trip_end_x_field"]],
                "y": row[fields["trip_end_y_field"]],
            }
            new_row.update(**row)
            split.append(new_row)

        return split

    def create_points(data):
        """
        Create shapely geometry from list of dicts
        """
        for row in data:
            if row["x"] and row["y"]:
                try:
                    row["geometry"] = point.Point(float(row["x"]), float(row["y"]))
                except:
                    row["geometry"] = None
            else:
                row["geometry"] = None
        return data

    def read_json(f):
        """
        Load (geo)JSON into memory
        """
        with open(f, "r") as fin:
            return json.loads(fin.read())

    def point_in_poly(
        points=None,
        polys=None,
        row_property_key="cell_id",
        feature_property_key="id",
        geom_key="geometry",
        null_val=None,
    ):
        """
        Get property of polygon that intersects input point. Assumes input
        polygons do not overlap.

        points: list of dicts with shapely point geometries

        polys: geojson polygon feature collection

        row_property_key: the property name of the input point that will be
            assigned a value from the intersecting polygon

        feature_property_key: the property name of the intersecting polygon
            that will be assigned to intersecting point(s)

        geom_key: the name of the feature property which contains the geometry
            of the feature. Applies to both input points and polygons

        null_val: the value which will be assigned to `<row_property_key>` if
            no intersecting polygon is found.

        Returns points with updated <row_property_key>
        """
        start_time = time.time()
        count = 0

        # create grid cell index of polygon *bounding boxes*
        idx = index.Index()

        for pos, feature in enumerate(polys["features"]):
            idx.insert(pos, shape(feature[geom_key]).bounds)

        # find intersecting polygon
        for i, pt in enumerate(points):
            if pt[geom_key]:
                matched = False

                # iterate through polygon *bounding boxes* that intersect with point
                for intersect_pos in idx.intersection(pt[geom_key].coords[0]):
                    poly = shape(polys["features"][intersect_pos][geom_key])

                    # check if point intersects actual polygon
                    if pt[geom_key].intersects(poly):
                        pt.update({
                            row_property_key: polys["features"][intersect_pos]["properties"][feature_property_key]
                        })
                        matched = True
                        # break because we assume there are no overlapping polygons
                        break

                if not matched:
                    # assign the null value to rows not contained by any poly
                    count += 1
                    pt.update({row_property_key: null_val})
            else:
                count += 1
                pt.update({row_property_key: null_val})

        elapsed_time = time.time() - start_time
        print(f"{count} points outside the input poly(s)")
        print(f"{elapsed_time} seconds for point in poly")
        return points

    def merge_trips(trips, id_field="trip_id"):
        # merge start and end trips to a single trip with start and end cell
        new_trips = {}

        for trip in trips:
            trip_id = trip[id_field]
            trip_type = trip.get("type")
            geoid = trip.get("census_geoid")
            district = trip.get("council_district")

            if trip_type == "start":
                current_data = {
                    "census_geoid_start": geoid,
                    "council_district_start": district,
                }
            elif trip_type == "end":
                current_data = {
                    "census_geoid_end": geoid,
                    "council_district_end": district,
                }

            if trip_id in new_trips:
                new_trips[trip_id].update(current_data)
            else:
                new_trips[trip_id] = current_data
                new_trips[trip_id].update(**trip)

        return [new_trips[trip] for trip in new_trips.keys()]

    def reduce_fields(data, fieldnames):
        return [{key: row[key] for key in fieldnames} for row in data]

    def post_trips(client, trips):
        print("upsert!")
        print(len(trips))
        return client.upsert(trips)

    field_map = {
        "trip_id_field": "trip_id",
        "trip_start_y_field": "start_latitude",
        "trip_start_x_field": "start_longitude",
        "trip_end_y_field": "end_latitude",
        "trip_end_x_field": "end_longitude",
    }

    # move working directory to script location to
    # ensure relative paths work (in case script
    # is run by external launcher)
    if os.path.dirname(__file__):
        os.chdir(os.path.dirname(__file__))

    districts = read_json(config.DISTRICTS_GEOJSON)
    census_tracts = read_json(config.CENSUS_TRACTS_GEOJSON)

    # num of records that will be processed per request;
    # not to exceed the postgrest db request limit, which you need to know
    interval = 5000

    total = 0

    pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

    while True:  # loop until request no longer yields trips
        params = {
            "select": "trip_id,provider_id,start_latitude,start_longitude,end_latitude,end_longitude",
            "census_geoid_start": "is.null",  # assume if census geoid is null the record has not been processed
            "limit": interval,
        }

        print("get data")
        trips = pgrest.select(params)

        if not trips:
            logging.info("All records processed.")
            break

        trips = split_trips(trips, field_map)

        trips = create_points(trips)

        trips = point_in_poly(
            points=trips,
            polys=census_tracts,
            row_property_key="census_geoid",
            feature_property_key="GEOID10",
            null_val="OUT_OF_BOUNDS",
        )

        trips = point_in_poly(
            points=trips,
            polys=districts,
            row_property_key="council_district",
            feature_property_key="district_n",
            null_val=0,
        )

        trips = merge_trips(trips)

        trips = reduce_fields(
            trips,
            [field["name"] for field in config.FIELDS if field.get("upload_postgrest")],
        )

        post_trips(pgrest, trips)

        total += len(trips)

    return total
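# --- Illustrative sketch (not part of the script above) ---
# The two-step strategy used by point_in_poly() above -- index polygon
# *bounding boxes* with rtree, then test exact intersection only for candidate
# polygons -- shown standalone with a tiny hand-made feature collection.
# Assumes the `rtree` and `shapely` packages; all values are made up.
from rtree import index
from shapely.geometry import shape, Point

polys = {
    "features": [
        {
            "properties": {"id": 42},
            "geometry": {
                "type": "Polygon",
                "coordinates": [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]],
            },
        }
    ]
}

points = [{"geometry": Point(0.5, 0.5)}, {"geometry": Point(5, 5)}]

idx = index.Index()
for pos, feature in enumerate(polys["features"]):
    idx.insert(pos, shape(feature["geometry"]).bounds)

for pt in points:
    pt["cell_id"] = None
    for pos in idx.intersection(pt["geometry"].coords[0]):
        if pt["geometry"].intersects(shape(polys["features"][pos]["geometry"])):
            pt["cell_id"] = polys["features"][pos]["properties"]["id"]
            break

print(points)  # the first point falls in polygon 42; the second matches nothing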
class CatalogPostgREST:
    """
    Implements catalog access functions using PostgREST.
    """

    def __init__(self, accessPoint, apiKey):
        """
        Initializes the PostgREST access with a given access point URL and the API key.
        """
        self.catalogDB = Postgrest(accessPoint, auth=apiKey)

    def query(self, dataSource, stage, base, ext, earlyDate=None, lateDate=None,
              exactEarlyDate=False, limit=None, start=None, reverse=False):
        """
        Performs a query on the given data source, data stage, base, ext, and optional
        early and late dates. Returns a list of dictionary objects, each a result.

        @param exactEarlyDate: Set this to True to query only on the exact date defined
            by the earlyDate parameter.
        @param limit: Limits the output to a specific number of records. If None, then
            the driver default is used.
        @param start: Sets the start offset when doing a multi-chunk query.
        @param reverse: Sorts the results in descending order.
        """
        # TODO: Do we need a query that will return a catalog entry that contains a given
        # collection date (between collection_date and collection_end)?

        # Specify query plus required parameters and sorting/pagination parameters:
        command = {
            "select": "collection_date,collection_end,processing_date,pointer,id_base,id_ext,metadata",
            "repository": "eq.%s" % stage,
            "data_source": "eq.%s" % dataSource,
            "order": ("collection_date.asc" if not reverse else "collection_date.desc")
                     + ",id_base.asc,id_ext.asc",
            "limit": 1 if limit is None else limit,
            "offset": 0 if start is None else start
        }

        # Allow base and ext identifiers to be omitted, or to be a "match first part of string" query:
        if base is not None:
            if "%%" in base:
                command["id_base"] = "like.%s" % base.replace("%%", "*")
            else:
                command["id_base"] = "eq.%s" % base
        if ext is not None:
            if "%%" in ext:
                command["id_ext"] = "like.%s" % ext.replace("%%", "*")
            else:
                command["id_ext"] = "eq.%s" % ext

        # Collection date range: May need to use an array because there could be two constraints:
        collDateRange = []
        if earlyDate is not None:
            if exactEarlyDate:
                collDateRange.append("eq.%s" % str(earlyDate))
            else:
                collDateRange.append("gte.%s" % str(earlyDate))
        if lateDate is not None:
            collDateRange.append("lt.%s" % str(lateDate))
        if collDateRange:
            if len(collDateRange) == 1:
                command["collection_date"] = collDateRange[0]
            else:
                command["collection_date"] = collDateRange

        # Run the query:
        return self.catalogDB.select(params=command)

    def upsert(self, upsertDataList):
        """
        Performs an upsert operation on the given list of dictionary objects. Each
        dictionary object shall contain "repository", "data_source", "id_base", "id_ext",
        "pointer", "collection_date", "collection_end" (optional), "processing_date",
        and optionally "metadata".
        """
        try:
            self.catalogDB.upsert(upsertDataList)
        except:
            print("ERROR: Exception encountered in CatalogPostgREST.upsert(). Input:")
            print(upsertDataList)
            raise

    @staticmethod
    def getPreferredChunk():
        """
        Returns the preferred chunk size that catalog.Catalog.query() should use in requests.
        """
        return PREFERRED_CHUNK_SIZE
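# --- Illustrative sketch (not part of the class above) ---
# A minimal, hedged example of querying the catalog. The access point URL, API
# key, data source, and stage names below are placeholders.
catalog = CatalogPostgREST("https://example.com/catalog", apiKey="my-api-token")

# Up to 100 entries for a hypothetical "bt" data source in its "rawjson" stage,
# collected during January 2020, newest first.
entries = catalog.query(
    dataSource="bt",
    stage="rawjson",
    base=None,
    ext=None,
    earlyDate="2020-01-01",
    lateDate="2020-02-01",
    limit=100,
    reverse=True,
)

for entry in entries:
    print(entry["collection_date"], entry["pointer"])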