def main():

    args = cli_args()
    
    auth = KNACK_CREDENTIALS[args.app_name]

    cfg_dataset = cfg[args.dataset]

    filters = knackutil.date_filter_on_or_after(
        args.last_run_date, cfg_dataset["modified_date_field_id"]
    )

    kn = knackpy_wrapper(cfg_dataset, auth, filters=filters)

    if not kn.data:
        return 0

    # Filter data for records that have been modified after the last
    # job run (milliseconds since epoch)
    last_run_timestamp = int(arrow.get(args.last_run_date).timestamp()) * 1000

    kn.data = filter_by_date(
        kn.data, cfg_dataset["modified_date_field"], last_run_timestamp
    )

    if not kn.data:
        return 0

    pgrest = Postgrest(cfg_dataset["pgrest_base_url"], auth=JOB_DB_API_TOKEN)

    for record in kn.data:
        # convert millisecond timestamp to ISO 8601
        record[cfg_dataset["modified_date_field"]] = arrow.get(
            record[cfg_dataset["modified_date_field"]] / 1000
        ).format()
    kn.data = datautil.lower_case_keys(kn.data)
    
    pgrest.upsert(kn.data)

    return len(kn.data)
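
# The filter_by_date() helper is defined elsewhere in this project and not shown
# above. A minimal sketch of what it might look like, assuming each record holds
# the modified-date field as a millisecond timestamp (hypothetical, for
# illustration only):
def filter_by_date(data, date_field, timestamp_cutoff):
    # keep only records modified at or after the cutoff
    return [record for record in data if record[date_field] >= timestamp_cutoff]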
Example #2
class PerfMetDB:
    """
    Represents a connection to the PostgREST instance of the performance metrics tables.
    """
    def __init__(self, accessPointJob, accessPointObs, apiKey, needsObs=False):
        """
        Initializes the connection to the PostgREST instance.
        
        @param accessPointJob: the PostgREST "etl_perfmet_job" table endpoint
        @param accessPointObs: the PostgREST "etl_perfmet_obs" table endpoint
        @param apiKey: the PostgREST API key needed to write to the endpoints
        @param needsObs: set this to True to enable the writing of observations.
        """
        self.jobDB = Postgrest(accessPointJob, auth=apiKey)
        self.obsDB = None
        if needsObs:
            self.obsDB = Postgrest(accessPointObs, auth=apiKey)

    def writeJob(self, perfMet):
        """
        Writes the job information to the job log.
        """
        metadata = {
            "data_source": perfMet.dataSource,
            "stage": perfMet.stage,
            "seconds": perfMet.processingTotal,
            "records": perfMet.records,
            "processing_date": str(perfMet.processingTime),
            "collection_start": str(date_util.localize(perfMet.collectTimeStart))
                if perfMet.collectTimeStart else None,
            "collection_end": str(date_util.localize(perfMet.collectTimeEnd))
                if perfMet.collectTimeEnd else None,
        }
        self.jobDB.upsert(metadata)

    def readAllJobs(self, timestampIn):
        """
        Reads all jobs activity for the given processing day of the timestamp.
        """
        day = date_util.roundDay(date_util.localize(timestampIn))
        command = {
            "select": "data_source,stage,seconds,records,processing_date,collection_start,collection_end",
            # PostgREST range filter: processing_date >= day and < the following day
            "processing_date": [
                "gte.%s" % str(day),
                "lt.%s" % str(date_util.localize(
                    day.replace(tzinfo=None) + datetime.timedelta(days=1))),
            ],
            "order": "data_source,stage",
        }
        return self.jobDB.select(params=command)

    def getRecentJobsDate(self):
        """
        Returns the most recent processing date for jobs.
        """
        command = {
            "select": "processing_date",
            "order": "processing_date.desc",
            "limit": 1
        }
        ret = self.jobDB.select(params=command)
        if ret and ret[0] and "processing_date" in ret[0]:
            ret = ret[0]["processing_date"]
        else:
            ret = None
        return ret

    def writeObs(self, perfMet):
        """
        Writes observations to the observations log.
        """
        if not perfMet.observations:
            return
        metadata = []
        for identifier, obs in perfMet.observations.items():
            # Localize timestamps and render them as strings if they are datetimes:
            minTimestamp = obs.minTimestamp
            if isinstance(minTimestamp, datetime.datetime):
                minTimestamp = str(date_util.localize(minTimestamp))
            maxTimestamp = obs.maxTimestamp
            if isinstance(maxTimestamp, datetime.datetime):
                maxTimestamp = str(date_util.localize(maxTimestamp))
            metadata.append({
                "data_source": perfMet.dataSource,
                "sensor_name": identifier[0],
                "data_type": identifier[1],
                "data": obs.observation,
                "expected": obs.expected,
                "collection_date": str(obs.collectionDate),
                "timestamp_min": minTimestamp,
                "timestamp_max": maxTimestamp
            })
        self.obsDB.upsert(metadata)

    def readAllObs(self,
                   timestampIn,
                   earlyDate=None,
                   dataSource=None,
                   obsType=None):
        """
        Reads all observations activity for the collection day of the given timestamp, or,
        if earlyDate is given, for the range from earlyDate up through the timestamp.
        """
        if not earlyDate:
            timestampIn = date_util.roundDay(date_util.localize(timestampIn))
            earlyDate = date_util.localize(
                timestampIn.replace(tzinfo=None) - datetime.timedelta(days=1))
            collDateClause = [
                "gte.%s" % str(timestampIn),
                "lt.%s" % str(
                    date_util.localize(
                        timestampIn.replace(tzinfo=None) +
                        datetime.timedelta(days=1)))
            ]
        else:
            collDateClause = [
                "gt.%s" % str(earlyDate),
                "lte.%s" % str(timestampIn)
            ]

        command = {
            "select": "data_source,sensor_name,data_type,data,expected,collection_date,timestamp_min,timestamp_max",
            "collection_date": collDateClause,
            "order": "data_type,sensor_name,collection_date",
        }
        if dataSource:
            command["data_source"] = "eq.%s" % dataSource
        if obsType:
            command["data_type"] = "eq.%s" % obsType
        return self.obsDB.select(params=command)
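
# A minimal usage sketch for PerfMetDB (not part of the original class). The
# endpoint URLs and API key are hypothetical placeholders; `perfMet` stands in
# for an object carrying the attributes referenced above (dataSource, stage,
# processingTotal, records, processingTime, collectTimeStart, collectTimeEnd,
# observations):
def _perfmetdb_usage_sketch(perfMet):
    db = PerfMetDB(
        accessPointJob="https://pgrest.example.com/etl_perfmet_job",
        accessPointObs="https://pgrest.example.com/etl_perfmet_obs",
        apiKey="secret-token",
        needsObs=True,
    )
    db.writeJob(perfMet)            # upsert one row into the job log
    db.writeObs(perfMet)            # upsert per-sensor observations
    print(db.getRecentJobsDate())   # most recent processing date, or None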
Example #3
class CatalogPostgREST:
    """
    Implements catalog access functions using PostgREST.
    """
    def __init__(self, accessPoint, apiKey):
        """
        Initializes the PostgREST access with a given access point URL and the API key.
        """
        self.catalogDB = Postgrest(accessPoint, auth=apiKey)

    def query(self,
              dataSource,
              stage,
              base,
              ext,
              earlyDate=None,
              lateDate=None,
              exactEarlyDate=False,
              limit=None,
              start=None,
              reverse=False):
        """
        Performs a query on the given datatype, data stage, base, ext, and optional early and late dates. Returns a list
        of dictionary objects, each a result.
        
        @param exactEarlyDate: Set this to true to query only on exact date defined by the earlyDate parameter
        @param limit Limits the output to a specific number of records. If None, then the driver default is used.
        @param start sets the start frome wnen doing a multi-chunk query.
        @param reverse will allow the results to be sorted in descending order.
        """
        # TODO: Do we need a query that will return a catalog entry that contains a given collection date (between collection_date
        # and collection_end)?

        # Specify query plus required parameters and sorting/pagination parameters:
        command = {
            "select": "collection_date,collection_end,processing_date,pointer,id_base,id_ext,metadata",
            "repository": "eq.%s" % stage,
            "data_source": "eq.%s" % dataSource,
            "order": ("collection_date.desc" if reverse else "collection_date.asc")
                + ",id_base.asc,id_ext.asc",
            "limit": 1 if limit is None else limit,
            "offset": 0 if start is None else start,
        }

        # Allow base and ext identifiers to be omitted, or to be a "match first part of string" query:
        if base is not None:
            if "%%" in base:
                command["id_base"] = "like.%s" % base.replace("%%", "*")
            else:
                command["id_base"] = "eq.%s" % base
        if ext is not None:
            if "%%" in ext:
                command["id_ext"] = "like.%s" % ext.replace("%%", "*")
            else:
                command["id_ext"] = "eq.%s" % ext

        # Collection date range: May need to use an array because there could be two constraints:
        collDateRange = []
        if earlyDate is not None:
            if exactEarlyDate:
                collDateRange.append("eq.%s" % str(earlyDate))
            else:
                collDateRange.append("gte.%s" % str(earlyDate))
        if lateDate is not None:
            collDateRange.append("lt.%s" % str(lateDate))
        if collDateRange:
            if len(collDateRange) == 1:
                command["collection_date"] = collDateRange[0]
            else:
                command["collection_date"] = collDateRange

        # Run the query:
        return self.catalogDB.select(params=command)

    def upsert(self, upsertDataList):
        """
        Performs an upsert operation on the given list of dictionary objects. Each dictionary object shall contain
        "repository", "data_source", "id_base", "id_ext", "pointer", "collection_date", "collection_end" (optional),
        "processing_date", and optionally "metadata".
        """
        try:
            self.catalogDB.upsert(upsertDataList)
        except Exception:
            print("ERROR: Exception encountered in CatalogPostgREST.upsert(). Input:")
            print(upsertDataList)
            raise

    @staticmethod
    def getPreferredChunk():
        """
        Returns the preferred chunk size that catalog.Catalog.query() should use in requests.
        """
        return PREFERRED_CHUNK_SIZE
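
# A minimal usage sketch for CatalogPostgREST (not part of the original class).
# The access point URL, API key, data source ("bt"), and stage ("ready") are
# hypothetical placeholders:
def _catalog_usage_sketch():
    catalog = CatalogPostgREST("https://pgrest.example.com/api/catalog",
                               apiKey="secret-token")
    # Up to 100 entries for the data source and stage, newest collection first:
    entries = catalog.query("bt", "ready", base=None, ext=None,
                            limit=100, reverse=True)
    for entry in entries:
        print(entry["collection_date"], entry["pointer"])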