def main():
    args = cli_args()
    auth = KNACK_CREDENTIALS[args.app_name]

    cfg_dataset = cfg[args.dataset]

    filters = knackutil.date_filter_on_or_after(
        args.last_run_date, cfg_dataset["modified_date_field_id"]
    )

    kn = knackpy_wrapper(cfg_dataset, auth, filters=filters)

    if not kn.data:
        return 0

    # Filter data for records that have been modified after the last job run
    last_run_timestamp = arrow.get(args.last_run_date).timestamp * 1000

    kn.data = filter_by_date(
        kn.data, cfg_dataset["modified_date_field"], last_run_timestamp
    )

    if not kn.data:
        return 0

    pgrest = Postgrest(cfg_dataset["pgrest_base_url"], auth=JOB_DB_API_TOKEN)

    for record in kn.data:
        # Convert milliseconds timestamp to an ISO 8601 string
        record[cfg_dataset["modified_date_field"]] = arrow.get(
            record[cfg_dataset["modified_date_field"]] / 1000
        ).format()

    kn.data = datautil.lower_case_keys(kn.data)

    pgrest.upsert(kn.data)

    return len(kn.data)
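# A minimal sketch of a script entry point for main(); the logging setup and
# failure handling below are assumptions, not part of the original module.
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.INFO)
    try:
        records_processed = main()
        logging.info("Upserted %s record(s) to PostgREST.", records_processed)
    except Exception:
        logging.exception("Knack-to-PostgREST job failed.")
        raise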
class PerfMetDB:
    """
    Represents a connection to the PostgREST instance of the performance metrics tables.
    """
    def __init__(self, accessPointJob, accessPointObs, apiKey, needsObs=False):
        """
        Initializes the connection to the PostgREST instance.

        @param accessPointJob: the PostgREST "etl_perfmet_job" table endpoint
        @param accessPointObs: the PostgREST "etl_perfmet_obs" table endpoint
        @param apiKey: the PostgREST API key needed to write to the endpoints
        @param needsObs: set this to True to enable the writing of observations.
        """
        self.jobDB = Postgrest(accessPointJob, auth=apiKey)
        self.obsDB = None
        if needsObs:
            self.obsDB = Postgrest(accessPointObs, auth=apiKey)

    def writeJob(self, perfMet):
        """
        Writes the job information to the job log.
        """
        metadata = {
            "data_source": perfMet.dataSource,
            "stage": perfMet.stage,
            "seconds": perfMet.processingTotal,
            "records": perfMet.records,
            "processing_date": str(perfMet.processingTime),
            "collection_start": str(date_util.localize(perfMet.collectTimeStart))
                if perfMet.collectTimeStart else None,
            "collection_end": str(date_util.localize(perfMet.collectTimeEnd))
                if perfMet.collectTimeEnd else None
        }
        self.jobDB.upsert(metadata)

    def readAllJobs(self, timestampIn):
        """
        Reads all jobs activity for the given processing day of the timestamp.
        """
        day = date_util.roundDay(date_util.localize(timestampIn))
        command = {
            "select": "data_source,stage,seconds,records,processing_date,collection_start,collection_end",
            "processing_date": [
                "gte.%s" % str(day),
                "lt.%s" % str(date_util.localize(day.replace(tzinfo=None) + datetime.timedelta(days=1)))
            ],
            "order": "data_source,stage"
        }
        return self.jobDB.select(params=command)

    def getRecentJobsDate(self):
        """
        Returns the most recent processing date for jobs.
        """
        command = {
            "select": "processing_date",
            "order": "processing_date.desc",
            "limit": 1
        }
        ret = self.jobDB.select(params=command)
        if ret and ret[0] and "processing_date" in ret[0]:
            ret = ret[0]["processing_date"]
        else:
            ret = None
        return ret

    def writeObs(self, perfMet):
        """
        Writes observations to the observations log.
        """
        metadata = []
        if not perfMet.observations:
            return
        for identifier, obs in perfMet.observations.items():
            minTimestamp = obs.minTimestamp
            if minTimestamp and isinstance(minTimestamp, datetime.datetime):
                minTimestamp = str(date_util.localize(minTimestamp))
            maxTimestamp = obs.maxTimestamp
            if maxTimestamp and isinstance(maxTimestamp, datetime.datetime):
                maxTimestamp = str(date_util.localize(maxTimestamp))
            metadata.append({
                "data_source": perfMet.dataSource,
                "sensor_name": identifier[0],
                "data_type": identifier[1],
                "data": obs.observation,
                "expected": obs.expected,
                "collection_date": str(obs.collectionDate),
                "timestamp_min": minTimestamp,
                "timestamp_max": maxTimestamp
            })
        self.obsDB.upsert(metadata)

    def readAllObs(self, timestampIn, earlyDate=None, dataSource=None, obsType=None):
        """
        Reads all observations activity for the given collection day of the timestamp.
        """
        if not earlyDate:
            timestampIn = date_util.roundDay(date_util.localize(timestampIn))
            earlyDate = date_util.localize(timestampIn.replace(tzinfo=None) - datetime.timedelta(days=1))
            collDateClause = [
                "gte.%s" % str(timestampIn),
                "lt.%s" % str(date_util.localize(timestampIn.replace(tzinfo=None) + datetime.timedelta(days=1)))
            ]
        else:
            collDateClause = [
                "gt.%s" % str(earlyDate),
                "lte.%s" % str(timestampIn)
            ]
        command = {
            "select": "data_source,sensor_name,data_type,data,expected,collection_date,timestamp_min,timestamp_max",
            "collection_date": collDateClause,
            "order": "data_type,sensor_name,collection_date"
        }
        if dataSource:
            command["data_source"] = "eq.%s" % dataSource
        if obsType:
            command["data_type"] = "eq.%s" % obsType
        return self.obsDB.select(params=command)
class CatalogPostgREST:
    """
    Implements catalog access functions using PostgREST.
    """
    def __init__(self, accessPoint, apiKey):
        """
        Initializes the PostgREST access with a given access point URL and the API key.
        """
        self.catalogDB = Postgrest(accessPoint, auth=apiKey)

    def query(self, dataSource, stage, base, ext, earlyDate=None, lateDate=None,
              exactEarlyDate=False, limit=None, start=None, reverse=False):
        """
        Performs a query on the given data source, data stage, base, ext, and optional early and late dates.
        Returns a list of dictionary objects, each a result.

        @param exactEarlyDate: Set this to True to query only on the exact date defined by the earlyDate parameter.
        @param limit: Limits the output to a specific number of records. If None, then the driver default is used.
        @param start: Sets the starting offset when doing a multi-chunk query.
        @param reverse: Sorts the results in descending order.
        """
        # TODO: Do we need a query that will return a catalog entry that contains a given collection date
        # (between collection_date and collection_end)?

        # Specify query plus required parameters and sorting/pagination parameters:
        command = {
            "select": "collection_date,collection_end,processing_date,pointer,id_base,id_ext,metadata",
            "repository": "eq.%s" % stage,
            "data_source": "eq.%s" % dataSource,
            "order": ("collection_date.asc" if not reverse else "collection_date.desc") + ",id_base.asc,id_ext.asc",
            "limit": 1 if limit is None else limit,
            "offset": 0 if start is None else start
        }

        # Allow base and ext identifiers to be omitted, or to be a "match first part of string" query:
        if base is not None:
            if "%%" in base:
                command["id_base"] = "like.%s" % base.replace("%%", "*")
            else:
                command["id_base"] = "eq.%s" % base
        if ext is not None:
            if "%%" in ext:
                command["id_ext"] = "like.%s" % ext.replace("%%", "*")
            else:
                command["id_ext"] = "eq.%s" % ext

        # Collection date range: may need to use an array because there could be two constraints:
        collDateRange = []
        if earlyDate is not None:
            if exactEarlyDate:
                collDateRange.append("eq.%s" % str(earlyDate))
            else:
                collDateRange.append("gte.%s" % str(earlyDate))
        if lateDate is not None:
            collDateRange.append("lt.%s" % str(lateDate))
        if collDateRange:
            if len(collDateRange) == 1:
                command["collection_date"] = collDateRange[0]
            else:
                command["collection_date"] = collDateRange

        # Run the query:
        return self.catalogDB.select(params=command)

    def upsert(self, upsertDataList):
        """
        Performs an upsert operation on the given list of dictionary objects. Each dictionary object shall
        contain "repository", "data_source", "id_base", "id_ext", "pointer", "collection_date",
        "collection_end" (optional), "processing_date", and optionally "metadata".
        """
        try:
            self.catalogDB.upsert(upsertDataList)
        except:
            print("ERROR: Exception encountered in CatalogPostgREST.upsert(). Input:")
            print(upsertDataList)
            raise

    @staticmethod
    def getPreferredChunk():
        """
        Returns the preferred chunk size that catalog.Catalog.query() should use in requests.
        """
        return PREFERRED_CHUNK_SIZE
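# A hedged usage sketch for CatalogPostgREST, assuming the class above. The access
# point, API key, data source, and stage names are hypothetical placeholders.
def _example_catalog_usage():
    catalog = CatalogPostgREST(
        accessPoint="https://example.org/api/catalog",  # hypothetical endpoint
        apiKey="REPLACE_WITH_API_KEY"                   # hypothetical key
    )
    # Query the newest "rawjson" entries for a hypothetical data source, most recent first:
    entries = catalog.query(
        dataSource="bt",             # hypothetical data source name
        stage="rawjson",             # hypothetical repository/stage name
        base=None,
        ext="json",
        limit=CatalogPostgREST.getPreferredChunk(),
        reverse=True
    )
    for entry in entries:
        print(entry["collection_date"], entry["pointer"])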