Example #1
def _uploadObs(targetDate, observations):
    "Called by processObs."
    fields = KNACK_OBS_VIEW["fields"]
    for _, obs in observations.iterrows():
        record = {
            fields["data_source"]: obs["data_source"],
            fields["sensor_name"]: obs["sensor_name"],
            fields["data_type"]: obs["data_type"],
            fields["data"]: obs["data"],
            fields["expected"]: obs["expected"],
            fields["collection_date"]: localTimeStruct(obs["collection_date"])
        }
        if obs["timestamp_min"]:
            record[fields["timestamp_range"]] = {
                "times": [{
                    "from": localTimeStruct(obs["timestamp_min"]),
                    "to": localTimeStruct(obs["timestamp_max"])
                }]
            }
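            # Express the covered time range as fractional hours past local
            # midnight, clamped to the 0-24 hour window of the collection day: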
            day = date_util.roundDay(date_util.localize(
                obs["collection_date"]))
            record[fields["timestamp_range_min"]] = max(
                (date_util.localize(date_util.parseDate(obs["timestamp_min"]))
                 - day).total_seconds() / 3600, 0)
            record[fields["timestamp_range_max"]] = min(
                (date_util.localize(date_util.parseDate(obs["timestamp_max"]))
                 - day).total_seconds() / 3600, 24)
        regulate(lambda: record_view(record,
                                     app_id=config_app.KNACK_PERFMET_ID,
                                     api_key="knack",
                                     method="create",
                                     scene=KNACK_OBS_VIEW["scene"],
                                     view=KNACK_OBS_VIEW["view"]))
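
# The regulate() helper used above isn't defined in these examples. A minimal
# sketch of a throttle/retry wrapper consistent with its call sites (it always
# receives a zero-argument callable) might look like this; the retry count and
# delay are illustrative assumptions, not the real implementation:
import time

def _regulateSketch(apiCall, retries=3, delaySec=2.0):
    "Hypothetical stand-in for regulate(): retries a throttled Knack API call."
    for attempt in range(retries):
        try:
            return apiCall()
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(delaySec)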
Example #2
def localTimeStruct(timeStr):
    "Returns the 'specific times' time structure for Knack from the given time string."
    ourTime = date_util.localize(date_util.parseDate(timeStr))
    return {
        "date": ourTime.strftime("%m/%d/%Y"),
        "hours": int(ourTime.strftime("%I")),
        "minutes": ourTime.minute,
        "am_pm": ourTime.strftime("%p")
    }
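
# Hypothetical usage, assuming date_util's configured local time zone resolves
# the string below as-is:
#
#   >>> localTimeStruct("2023-06-01 14:30")
#   {'date': '06/01/2023', 'hours': 2, 'minutes': 30, 'am_pm': 'PM'}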
Example #3
    def _ingestArgs(self, args):
        """
        This is where arguments are ingested and set to Initializer class-level attributes. Override
        this and call the parent if custom arguments need to be processed.
        """
        # Local time zone:
        date_util.setLocalTimezone(config.getLocalTimezone())
        
        # Last run date:
        if hasattr(args, "last_run_date") and args.last_run_date:
            self.lastRunDate = date_util.parseDate(args.last_run_date, dateOnly=self.parseDateOnly)
            print("Last run date: %s" % str(self.lastRunDate))
        else:
            self.lastRunDate = None
    
        # Start date, or number of days back:
        if hasattr(args, "start_date") and args.start_date:
            try:
                dateEarliest = int(args.start_date)
                self.startDate = date_util.localize(arrow.now()
                    .replace(hour=0, minute=0, second=0, microsecond=0)
                    .shift(days=-dateEarliest).datetime)
            except ValueError:
                self.startDate = date_util.parseDate(args.start_date, dateOnly=self.parseDateOnly)
        else:
            self.startDate = None

        # End date:
        if hasattr(args, "end_date") and args.end_date:
            self.endDate = date_util.parseDate(args.end_date, dateOnly=self.parseDateOnly)
        else:
            self.endDate = None
            
        if self.startDate or self.endDate:
            dateStr = "INFO: Processing time range:"
            if self.startDate:
                dateStr += " " + str(self.startDate)
                if self.endDate:
                    if self.endDate == self.startDate:
                        dateStr += " only"
                    else:
                        dateStr += " up to " + str(self.endDate)
                else:
                    dateStr += " onward"
            print(dateStr + ".")
        if not self.lastRunDate and not self.startDate:
            raise Exception("A last_run_date or start_date must be specified.")
            
        # Force overwrite:
        if hasattr(args, "force"):
            self.forceOverwrite = args.force
            if self.forceOverwrite:
                print("INFO: Force mode is on: items will be overwritten.") 
            
        # Production mode:
        self.productionMode = config.electProductionMode(not args.debug) \
            if hasattr(args, "debug") else config.electProductionMode()
        if not self.productionMode:
            print("INFO: Debug mode is enabled.")
    
        # Debugging features:
        if hasattr(args, "simulate"):
            self.simulationMode = args.simulate
            if self.simulationMode:
                print("INFO: Simulated write mode is enabled.")
        if hasattr(args, "output_filepath"):
            self.writeFilePath = args.output_filepath
            if self.writeFilePath:
                print("INFO: Write file path is: %s" % self.writeFilePath)
            
        # Set up temporary output directory:
        if self.needsTempDir:
            self.tempDir = tempfile.mkdtemp()
            print("INFO: Created holding place: %s" % self.tempDir)
Example #4
def main():
    # Parse command-line parameter:
    parser = ArgumentParser(description=PROGRAM_DESC,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument(
        "-r",
        "--last_run_date",
        help=
        "last run date, in YYYY-MM-DD format with optional time zone offset (default: yesterday)"
    )
    args = parser.parse_args()

    date_util.setLocalTimezone(config_app.TIMEZONE)
    if args.last_run_date:
        lastRunDate = date_util.parseDate(args.last_run_date, dateOnly=True)
        print("perfmet_knack: Last run date: %s" % str(lastRunDate))
    else:
        lastRunDate = date_util.roundDay(
            date_util.localize(datetime.datetime.now() -
                               datetime.timedelta(days=1)))

    # Find the most recent day for performance metrics:
    print("Finding most recent processing date...")
    perfMetDB = config_app.createPerfmetConn()
    recent = perfMetDB.getRecentJobsDate()
    if not recent:
        print(
            "ERROR: No recent processing date is found in the performance metrics DB."
        )
        return -1
    recent = date_util.roundDay(date_util.localize(
        date_util.parseDate(recent)))
    if recent < lastRunDate:
        print("ERROR: No processing date exists after %s" % str(lastRunDate))
        return -1
    print("The most recent processing date is %s." % str(recent))

    # Perform the activities:
    print("Retrieving old Knack entries...")
    jobData = retrieveJobs()
    obsData = retrieveObservations()

    print("Deleting old Knack entries...")
    delete(jobData, obsData)

    print("Uploading new Knack Jobs entries...")
    # Perform all retrieval and custom processing to get data ready for Knack:
    jobs = perfMetDB.readAllJobs(recent)
    jobs = pd.DataFrame(jobs)
    jobs = jobs.sort_values("processing_date").groupby(
        ["data_source", "stage"]).tail(1)
    jobs["stage"].replace("Socrata Agg.", "Socrata", inplace=True)
    jobs["stage"].replace("Ingest", "a. Ingest", inplace=True)
    jobs["stage"].replace("Standardize", "b. Standardize", inplace=True)
    jobs["stage"].replace("Ready", "c. Ready", inplace=True)
    jobs["stage"].replace("Aggregate", "d. Aggregate", inplace=True)
    jobs["stage"].replace("Publish", "e. Publish", inplace=True)

    uploadJobs(jobs)

    # Deal with observations here:
    processObs(perfMetDB,
               jobs,
               "Bluetooth",
               "b. Standardize",
               "Unmatched Entries",
               calcExpected=True)
    processObs(perfMetDB,
               jobs,
               "Wavetronix",
               "b. Standardize",
               "Vehicle Counts",
               calcExpected=True)
    processObs(perfMetDB,
               jobs,
               "GRIDSMART",
               "b. Standardize",
               "Vehicle Counts",
               calcExpected=True)

    print("Done.")
    return 1
Example #5
def processObs(perfMetDB,
               jobs,
               dataSource,
               stage,
               obsType,
               sampleDays=SAMPLE_DAYS,
               calcExpected=False):
    "Reads observations from the database and prepares them for sending to Knack."
    print("Processing new Knack '%s' observations..." % dataSource)
    rec = jobs[(jobs["data_source"] == dataSource)
               & (jobs["stage"] == stage)].copy()
    if len(rec) == 0:
        print("WARNING: No entry for '%s'/'%s' was found in etl_perfmet_job!" %
              (dataSource, stage))
        return None

    # Get processing record that covers the latest date:
    rec.sort_values("collection_end", ascending=False, inplace=True)
    rec = rec.iloc[0]

    # Retrieve observations for the given date range:
    lateDate = date_util.localize(date_util.parseDate(rec["collection_end"]))
    earlyDate = date_util.localize(
        lateDate.replace(tzinfo=None) - datetime.timedelta(days=sampleDays))
    observations = perfMetDB.readAllObs(lateDate,
                                        earlyDate=earlyDate,
                                        dataSource=dataSource,
                                        obsType=obsType)
    if not observations:
        print("WARNING: No observations are found for '%s', type '%s'." %
              (dataSource, obsType))
        return None
    observations = pd.DataFrame(observations)
    observations["collection_date"] = observations["collection_date"].apply(
        lambda t: date_util.localize(date_util.parseDate(t)))
    observations.sort_values("collection_date", ascending=False, inplace=True)

    # Pick out the one that covers the latest date:
    yesterday = date_util.localize(
        lateDate.replace(tzinfo=None) - datetime.timedelta(days=1))
    # TODO: If we end up processing hourly, then we'll need to change this beginning time mechanism.
    obsSubset = observations[(observations["collection_date"] <= lateDate)
                             & (observations["collection_date"] >= yesterday)]
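    # Latest observation per sensor within the window, the latest overall, and
    # the per-sensor average used as the "expected" value when calcExpected is set: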
    obsSubset = obsSubset.loc[obsSubset.groupby("sensor_name")
                              ["collection_date"].idxmax()]
    maxes = observations.loc[observations.groupby("sensor_name")
                             ["collection_date"].idxmax()]
    avgs = observations.groupby("sensor_name")["data"].mean()
    ret = pd.DataFrame()
    for index, obs in maxes.iterrows():
        # TODO: There's probably some fancy merge that we could do to make this process easier.
        if index not in obsSubset.index:
            # No recent entry available for this day! Make a fake entry, which is the most recent entry that had data.
            # That way, we can see when the data stopped.
            rec = maxes.loc[index].copy()
            rec["data"] = -1
        else:
            rec = obsSubset.loc[index].copy()
        if calcExpected:
            rec["expected"] = avgs[obs["sensor_name"]]
        ret = pd.concat([ret, rec.to_frame().T])  # DataFrame.append() was removed in pandas 2.x
    _uploadObs(yesterday, ret)
    return ret
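
# In the spirit of the TODO above, a hypothetical vectorized replacement for the
# row-by-row loop in processObs() (an untested sketch over the same frames):
#
#   recent = maxes.index.isin(obsSubset.index)
#   ret = maxes.copy()
#   ret.loc[~recent, "data"] = -1  # flag sensors with no entry in the window
#   if calcExpected:
#       ret["expected"] = ret["sensor_name"].map(avgs)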