def _uploadObs(targetDate, observations):
    "Called by processObs."

    fields = KNACK_OBS_VIEW["fields"]
    for _, obs in observations.iterrows():
        record = {fields["data_source"]: obs["data_source"],
                  fields["sensor_name"]: obs["sensor_name"],
                  fields["data_type"]: obs["data_type"],
                  fields["data"]: obs["data"],
                  fields["expected"]: obs["expected"],
                  fields["collection_date"]: localTimeStruct(obs["collection_date"])}
        if obs["timestamp_min"]:
            record[fields["timestamp_range"]] = {"times": [{"from": localTimeStruct(obs["timestamp_min"]),
                                                            "to": localTimeStruct(obs["timestamp_max"])}]}
            day = date_util.roundDay(date_util.localize(obs["collection_date"]))
            record[fields["timestamp_range_min"]] = max((date_util.localize(date_util.parseDate(obs["timestamp_min"]))
                                                         - day).total_seconds() / 3600, 0)
            record[fields["timestamp_range_max"]] = min((date_util.localize(date_util.parseDate(obs["timestamp_max"]))
                                                         - day).total_seconds() / 3600, 24)
        regulate(lambda: record_view(record, app_id=config_app.KNACK_PERFMET_ID, api_key="knack", method="create",
                                     scene=KNACK_OBS_VIEW["scene"], view=KNACK_OBS_VIEW["view"]))
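# Note on the timestamp_range_min/max fields built above: they express the observation's
# time span as hours after local midnight of the collection day, clamped to [0, 24].
# A hypothetical span of 06:30 to 19:15 on the collection day maps to 6.5 and 19.25;
# a span starting before midnight clamps to 0, and one ending after midnight clamps to 24.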
def localTimeStruct(timeStr):
    "Returns the 'specific times' time structure for Knack from the given time string."

    ourTime = date_util.localize(date_util.parseDate(timeStr))
    return {"date": ourTime.strftime("%m/%d/%Y"),
            "hours": int(ourTime.strftime("%I")),
            "minutes": ourTime.minute,
            "am_pm": ourTime.strftime("%p")}
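# Illustrative sketch (hypothetical; never called by the ETL flow). Shows the shape of the
# Knack "specific times" structure returned by localTimeStruct(), assuming date_util parses
# ISO-8601 strings and the local time zone has already been set.
def _exampleLocalTimeStruct():
    "Hypothetical demonstration helper; not part of the upload path."
    result = localTimeStruct("2019-09-05T06:45:00")
    # Expected shape (the date string depends on the configured local time zone):
    # {"date": "09/05/2019", "hours": 6, "minutes": 45, "am_pm": "AM"}
    return result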
def _ingestArgs(self, args):
    """
    This is where arguments are ingested and set to Initializer class-level attributes. Override
    this and call the parent if custom arguments need to be processed.
    """
    # Local time zone:
    date_util.setLocalTimezone(config.getLocalTimezone())

    # Last run date:
    if hasattr(args, "last_run_date") and args.last_run_date:
        self.lastRunDate = date_util.parseDate(args.last_run_date, dateOnly=self.parseDateOnly)
        print("Last run date: %s" % str(self.lastRunDate))
    else:
        self.lastRunDate = None

    # Start date, or number of days back:
    if hasattr(args, "start_date") and args.start_date:
        try:
            dateEarliest = int(args.start_date)
            self.startDate = date_util.localize(arrow.now()
                                                .replace(hour=0, minute=0, second=0, microsecond=0)
                                                .shift(days=-dateEarliest).datetime)
        except ValueError:
            self.startDate = date_util.parseDate(args.start_date, dateOnly=self.parseDateOnly)
    else:
        self.startDate = None

    # End date:
    if hasattr(args, "end_date") and args.end_date:
        self.endDate = date_util.parseDate(args.end_date, dateOnly=self.parseDateOnly)
    else:
        self.endDate = None

    if self.startDate or self.endDate:
        dateStr = "INFO: Processing time range:"
        if self.startDate:
            dateStr += " " + str(self.startDate)
        if self.endDate:
            if self.endDate == self.startDate:
                dateStr += " only"
            else:
                dateStr += " up to " + str(self.endDate)
        else:
            dateStr += " onward"
        print(dateStr + ".")
    if not self.lastRunDate and not self.startDate:
        raise Exception("A last_run_date or start_date must be specified.")

    # Force overwrite:
    if hasattr(args, "force"):
        self.forceOverwrite = args.force
        if self.forceOverwrite:
            print("INFO: Force mode is on: items will be overwritten.")

    # Production mode:
    self.productionMode = config.electProductionMode(not args.debug) \
        if hasattr(args, "debug") else config.electProductionMode()
    if not self.productionMode:
        print("INFO: Debug mode is enabled.")

    # Debugging features:
    if hasattr(args, "simulate"):
        self.simulationMode = args.simulate
        if self.simulationMode:
            print("INFO: Simulated write mode is enabled.")
    if hasattr(args, "output_filepath"):
        self.writeFilePath = args.output_filepath
        if self.writeFilePath:
            print("INFO: Write file path is: %s" % self.writeFilePath)

    # Set up temporary output directory:
    if self.needsTempDir:
        self.tempDir = tempfile.mkdtemp()
        print("INFO: Created holding place: %s" % self.tempDir)
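# Illustrative sketch (hypothetical) of how _ingestArgs() interprets --start_date two ways:
# an integer is treated as a number of days back from today's local midnight, and anything
# else is parsed as a date. "app" stands for an already-constructed Initializer instance.
#
#   from argparse import Namespace
#   app._ingestArgs(Namespace(last_run_date=None, start_date="7",
#                             end_date=None, force=False, debug=True))
#   # -> startDate becomes local midnight seven days ago; endDate is None ("onward").
#   app._ingestArgs(Namespace(last_run_date=None, start_date="2020-03-01",
#                             end_date="2020-03-05", force=False, debug=True))
#   # -> startDate and endDate are the parsed dates; the INFO time-range line is printed.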
def main():
    # Parse command-line parameter:
    parser = ArgumentParser(description=PROGRAM_DESC, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-r", "--last_run_date",
                        help="last run date, in YYYY-MM-DD format with optional time zone offset (default: yesterday)")
    args = parser.parse_args()

    date_util.setLocalTimezone(config_app.TIMEZONE)
    if args.last_run_date:
        lastRunDate = date_util.parseDate(args.last_run_date, dateOnly=True)
        print("perfmet_knack: Last run date: %s" % str(lastRunDate))
    else:
        lastRunDate = date_util.roundDay(date_util.localize(datetime.datetime.now() - datetime.timedelta(days=1)))

    # Find the most recent day for performance metrics:
    print("Finding most recent processing date...")
    perfMetDB = config_app.createPerfmetConn()
    recent = perfMetDB.getRecentJobsDate()
    if not recent:
        print("ERROR: No recent processing date is found in the performance metrics DB.")
        return -1
    recent = date_util.roundDay(date_util.localize(date_util.parseDate(recent)))
    if recent < lastRunDate:
        print("ERROR: No processing date exists after %s" % str(lastRunDate))
        return -1
    print("The most recent processing date is %s." % str(recent))

    # Perform the activities:
    print("Retrieving old Knack entries...")
    jobData = retrieveJobs()
    obsData = retrieveObservations()

    print("Deleting old Knack entries...")
    delete(jobData, obsData)

    print("Uploading new Knack Jobs entries...")
    # Perform all retrieval and custom processing to get data ready for Knack. Keep only the
    # latest job per (data_source, stage), then prefix the stage names with letters so that
    # an alphabetical sort lists them in pipeline order:
    jobs = perfMetDB.readAllJobs(recent)
    jobs = pd.DataFrame(jobs)
    jobs = jobs.sort_values("processing_date").groupby(["data_source", "stage"]).tail(1)
    jobs["stage"].replace("Socrata Agg.", "Socrata", inplace=True)
    jobs["stage"].replace("Ingest", "a. Ingest", inplace=True)
    jobs["stage"].replace("Standardize", "b. Standardize", inplace=True)
    jobs["stage"].replace("Ready", "c. Ready", inplace=True)
    jobs["stage"].replace("Aggregate", "d. Aggregate", inplace=True)
    jobs["stage"].replace("Publish", "e. Publish", inplace=True)
    uploadJobs(jobs)

    # Deal with observations here:
    processObs(perfMetDB, jobs, "Bluetooth", "b. Standardize", "Unmatched Entries", calcExpected=True)
    processObs(perfMetDB, jobs, "Wavetronix", "b. Standardize", "Vehicle Counts", calcExpected=True)
    processObs(perfMetDB, jobs, "GRIDSMART", "b. Standardize", "Vehicle Counts", calcExpected=True)

    print("Done.")
    return 1
def processObs(perfMetDB, jobs, dataSource, stage, obsType, sampleDays=SAMPLE_DAYS, calcExpected=False):
    "Reads observations from the database and prepares them for sending to Knack."

    print("Processing new Knack '%s' observations..." % dataSource)
    rec = jobs[(jobs["data_source"] == dataSource) & (jobs["stage"] == stage)].copy()
    if len(rec) == 0:
        print("WARNING: No entry for '%s'/'%s' was found in etl_perfmet_job!" % (dataSource, stage))
        return None

    # Get processing record that covers the latest date:
    rec.sort_values("collection_end", ascending=False, inplace=True)
    rec = rec.iloc[0]

    # Retrieve observations for the given date range:
    lateDate = date_util.localize(date_util.parseDate(rec["collection_end"]))
    earlyDate = date_util.localize(lateDate.replace(tzinfo=None) - datetime.timedelta(days=sampleDays))
    observations = perfMetDB.readAllObs(lateDate, earlyDate=earlyDate, dataSource=dataSource, obsType=obsType)
    if not observations:
        print("WARNING: No observations are found for '%s', type '%s'." % (dataSource, obsType))
        return None
    observations = pd.DataFrame(observations)
    observations["collection_date"] = observations["collection_date"].apply(
        lambda t: date_util.localize(date_util.parseDate(t)))
    observations.sort_values("collection_date", ascending=False, inplace=True)

    # Pick out the one that covers the latest date:
    yesterday = date_util.localize(lateDate.replace(tzinfo=None) - datetime.timedelta(days=1))
    # TODO: If we end up processing hourly, then we'll need to change this beginning time mechanism.
    obsSubset = observations[(observations["collection_date"] <= lateDate)
                             & (observations["collection_date"] >= yesterday)]
    obsSubset = obsSubset.loc[obsSubset.groupby("sensor_name")["collection_date"].idxmax()]
    maxes = observations.loc[observations.groupby("sensor_name")["collection_date"].idxmax()]
    avgs = observations.groupby("sensor_name")["data"].mean()

    ret = pd.DataFrame()
    for index, obs in maxes.iterrows():
        # TODO: There's probably some fancy merge that we could do to make this process easier.
        if index not in obsSubset.index:
            # No recent entry available for this day! Make a fake entry, which is the most recent
            # entry that had data. That way, we can see when the data stopped.
            rec = maxes.loc[index].copy()
            rec["data"] = -1
        else:
            rec = obsSubset.loc[index].copy()
        if calcExpected:
            rec["expected"] = avgs[obs["sensor_name"]]
        ret = ret.append(rec)
    _uploadObs(yesterday, ret)
    return ret
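# Worked example (hypothetical values) of the selection logic in processObs(): suppose the
# "Bluetooth"/"b. Standardize" job's collection window ends at lateDate, and observations for
# sensor "ATD_BT_001" (a made-up name) exist across the sampleDays window. Per sensor, the
# function keeps the observation whose collection_date falls within the final day of the
# window; if a sensor has nothing in that day, its most recent older observation is reused
# with data set to -1, so the dashboard shows when the feed went silent. With calcExpected
# set, "expected" is that sensor's mean "data" value over the whole window, which serves as
# the baseline to compare the latest reading against.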