Example #1
 def writeObs(self, perfMet):
     """
     Writes observations to the observations log.
     """
     metadata = []
     if not perfMet.observations:
         return
     for identifier, obs in perfMet.observations.items():
         minTimestamp = obs.minTimestamp
         if minTimestamp:
             if isinstance(minTimestamp, datetime.datetime):
                 minTimestamp = str(date_util.localize(minTimestamp))
         maxTimestamp = obs.maxTimestamp
         if maxTimestamp:
             if isinstance(maxTimestamp, datetime.datetime):
                 maxTimestamp = str(date_util.localize(maxTimestamp))
         metadata.append({
             "data_source": perfMet.dataSource,
             "sensor_name": identifier[0],
             "data_type": identifier[1],
             "data": obs.observation,
             "expected": obs.expected,
             "collection_date": str(obs.collectionDate),
             "timestamp_min": minTimestamp,
             "timestamp_max": maxTimestamp
         })
     self.obsDB.upsert(metadata)
Example #2
def _uploadObs(targetDate, observations):
    "Called by processObs."
    fields = KNACK_OBS_VIEW["fields"]
    for _, obs in observations.iterrows():
        record = {
            fields["data_source"]: obs["data_source"],
            fields["sensor_name"]: obs["sensor_name"],
            fields["data_type"]: obs["data_type"],
            fields["data"]: obs["data"],
            fields["expected"]: obs["expected"],
            fields["collection_date"]: localTimeStruct(obs["collection_date"])
        }
        if obs["timestamp_min"]:
            record[fields["timestamp_range"]] = {
                "times": [{
                    "from": localTimeStruct(obs["timestamp_min"]),
                    "to": localTimeStruct(obs["timestamp_max"])
                }]
            }
            day = date_util.roundDay(date_util.localize(
                obs["collection_date"]))
            record[fields["timestamp_range_min"]] = max(
                (date_util.localize(date_util.parseDate(obs["timestamp_min"]))
                 - day).total_seconds() / 3600, 0)
            record[fields["timestamp_range_max"]] = min(
                (date_util.localize(date_util.parseDate(obs["timestamp_max"]))
                 - day).total_seconds() / 3600, 24)
        regulate(lambda: record_view(record,
                                     app_id=config_app.KNACK_PERFMET_ID,
                                     api_key="knack",
                                     method="create",
                                     scene=KNACK_OBS_VIEW["scene"],
                                     view=KNACK_OBS_VIEW["view"]))
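
Example #2's timestamp_range_min and timestamp_range_max fields are simply the observation's bounds expressed as hours past local midnight, clamped to the day. A minimal standalone sketch of that arithmetic, assuming the date_util helpers return timezone-aware local datetimes (zoneinfo and America/Chicago stand in here):

import datetime
from zoneinfo import ZoneInfo  # Python 3.9+

LOCAL_TZ = ZoneInfo("America/Chicago")  # assumption: the project's local time zone

def hoursIntoDay(timestamp, dayStart):
    "Hours elapsed from local midnight to the timestamp."
    return (timestamp - dayStart).total_seconds() / 3600

day = datetime.datetime(2020, 3, 2, tzinfo=LOCAL_TZ)
tsMin = datetime.datetime(2020, 3, 2, 6, 30, tzinfo=LOCAL_TZ)
tsMax = datetime.datetime(2020, 3, 3, 1, 0, tzinfo=LOCAL_TZ)   # spills past midnight

print(max(hoursIntoDay(tsMin, day), 0))   # 6.5 -> timestamp_range_min
print(min(hoursIntoDay(tsMax, day), 24))  # 24 -> timestamp_range_max (clamped)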
Example #3
 def query(self, stage, base, ext, earlyDate, lateDate, exactEarlyDate=False, limit=None, reverse=False):
     """
     Returns a generator of catalog entries sorted by date that match the given criteria.
     
     @param earlyDate: Set this to None to have no early date.
     @param lateDate: Set this to None to have no late date.
     @param exactEarlyDate: Set this to true to query only on exact date defined by the earlyDate parameter
     @param limit: Optional limit on query results
     """
     offset = 0
     while limit is None or offset < limit:
         results = self.dbConn.query(self.dataSource, stage, base, ext, earlyDate, lateDate, \
             exactEarlyDate=exactEarlyDate, limit=self.dbConn.getPreferredChunk(), start=offset, reverse=reverse)
         if results:
             for item in results:
                 if item["collection_date"]:
                     item["collection_date"] = date_util.localize(arrow.get(item["collection_date"]).datetime)
                 if item["collection_end"]:
                     item["collection_end"] = date_util.localize(arrow.get(item["collection_end"]).datetime)
                 if item["processing_date"]:
                     item["processing_date"] = date_util.localize(arrow.get(item["processing_date"]).datetime)
                 yield item
         if not results or len(results) < self.dbConn.getPreferredChunk():
             break
         offset += len(results)
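
The loop in Example #3 pages through the catalog one preferred-chunk at a time and stops at the first short (or empty) page, or once the offset passes the optional limit. A self-contained sketch of that pagination pattern, with a plain list standing in for dbConn.query:

def pagedQuery(records, chunkSize=4, limit=None):
    "Yield records chunk by chunk, stopping at a short page or once the offset reaches the limit."
    offset = 0
    while limit is None or offset < limit:
        page = records[offset:offset + chunkSize]  # stand-in for dbConn.query(..., start=offset)
        for item in page:
            yield item
        if len(page) < chunkSize:
            break  # short page: nothing left to fetch
        offset += len(page)

print(list(pagedQuery(list(range(10)))))           # [0, 1, ..., 9]
print(list(pagedQuery(list(range(10)), limit=4)))  # [0, 1, 2, 3]
# As in Example #3, the limit is only checked per page, so the final page may overshoot it.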
Example #4
    def retrieve(self, earlyDate=None, lateDate=None):
        """
        Returns Wavetronix records between the given dates. Returns dictionary of days with list of records as values.
        """
        ret = {}
        cursor = self.conn.cursor()
        sql = """SELECT a.DETID, b.INTID, CAST(CURDATETIME AS date), CURDATETIME, INTNAME, DETNAME, a.VOLUME, a.OCCUPANCY, SPEED, STATUS,
UPLOADSUCCESS,DETCOUNTCOMPARISON, DAILYCUMULATIVE FROM KITSDB.KITS.SYSDETHISTORYRM AS a, KITSDB.KITS.DETECTORSRM AS b"""
        datePart = self._buildDatePart(earlyDate, lateDate, includeWhere=True)
        if not datePart:
            sql += " WHERE "
        else:
            sql += datePart + " AND "
        sql += " a.DETID = b.DETID"
        sql += " ORDER BY CURDATETIME, INTNAME, DETNAME;"
        cursor.execute(sql)
        for row in cursor:
            rec = KITSDBRec(detID=int(row[0]),
                            intID=int(row[1]),
                            curDateTime=date_util.localize(row[3]),
                            intName=row[4],
                            detName=row[5],
                            volume=int(row[6]),
                            occupancy=int(row[7]),
                            speed=int(row[8]),
                            status=row[9],
                            uploadSuccess=int(row[10]),
                            detCountComparison=int(row[11]),
                            dailyCumulative=int(row[12]))
            ourDate = date_util.localize(
                datetime.datetime.strptime(row[2], "%Y-%m-%d"))
            if ourDate not in ret:
                ret[ourDate] = []
            ret[ourDate].append(rec)
        return ret
Example #5
    def doMainLoop(self):
        """
        Coordinates the main loop activity
        """
        # TODO: Add in benchmarking
        
        # TODO: Add in a preparation method call?
        
        # --- BEGIN STUFF. TODO: Support for loop over time period at intervals (with functional disable for that)?
        self.runCount = 1
        recsProcessed = 0
        self.processingDate = date_util.localize(arrow.now().datetime)
        
        # TODO: Exception handling with retry ability?
        
        recsProcessed += self.etlActivity()

        self.runCount += 1
        # --- END STUFF
        
        if self.perfmet:
            self.perfmet.logJob(recsProcessed)
        
        # TODO: Shutdown method call?
        
        return recsProcessed
Example #6
 def isWithin(self, cmpDate, cmpDateEnd):
     "Checks to see if the given date range overlaps the current position."
     if self.curIndex < len(self.items):
         endDate = self.items[self.curIndex].dateEnd
         if not cmpDateEnd:
             cmpDateEnd = date_util.localize(
                 cmpDate.replace(tzinfo=None) +
                 datetime.timedelta(days=1))
         if not endDate:
             endDate = date_util.localize(
                 self.items[self.curIndex].date.replace(tzinfo=None) +
                 datetime.timedelta(days=1))
         if not (cmpDateEnd <= self.items[self.curIndex].date
                 or cmpDate >= endDate):
             return True
     return False
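
The test in Example #6 is the usual half-open interval overlap check: two ranges overlap unless one ends at or before the other begins. A standalone sketch with plain datetimes:

import datetime

def rangesOverlap(startA, endA, startB, endB):
    "True if [startA, endA) and [startB, endB) share any moment in time."
    return not (endA <= startB or startA >= endB)

day = datetime.datetime(2020, 5, 1)
hour = datetime.timedelta(hours=1)

print(rangesOverlap(day, day + 6 * hour, day + 5 * hour, day + 24 * hour))  # True: one hour shared
print(rangesOverlap(day, day + 6 * hour, day + 6 * hour, day + 24 * hour))  # False: they only touch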
Example #7
 def getSearchableQueryList(self, stage, base, ext, earlyDate, lateDate, exactEarlyDate=False, singleLatest=False, baseDict=False):
     """
     Returns a _SearchableQueryList object that contains a list of catalog entries sorted by date that match the criteria.
     The getNextDateIndex() method can be called to identify the index that corresponds with the next date.
     
     @param earlyDate: Set this to None to have no early date.
     @param lateDate: Set this to None to have no late date.
     @param exactEarlyDate: Set this to true to query only on exact date defined by the earlyDate parameter
     @return A single _SearchableQueryList object
     """
     limit = None
     reverse = False
     if singleLatest:
         limit = 1
         reverse = True
     queryList = self.getQueryList(stage, base, ext, earlyDate, lateDate, exactEarlyDate, limit=limit, reverse=reverse)
     
     # Try to get the next one, for good measure:
     if lateDate:
         addedQueryItem = self.queryEarliest(stage, base, ext, date_util.localize(lateDate.replace(tzinfo=None) + datetime.timedelta(seconds=1)))
         if addedQueryItem:
             if not queryList:
                 queryList = [addedQueryItem]
             elif addedQueryItem["collection_date"] != queryList[-1]["collection_date"]:
                 queryList.append(addedQueryItem)
     if queryList:
         return self._SearchableQueryList([x["collection_date"] for x in queryList], queryList)
     return None
Example #8
 def doCompareLoop(self, provSrc, provTgt, baseExtKey=True):
     """
     Sets up and iterates through the compare loop, calling innerLoopActivity.
     
     @param provSrc: Specifies source providers as a last_update.LastUpdateProv object
     @param provTgt: Specifies the target provider as a last_update.LastUpdateProv object, or None for all sources
     @param baseExtKey: Set this to true to compare the presence of both base and ext; otherwise, just base is used.
     """
     comparator = last_update.LastUpdate(provSrc, provTgt,
                             force=self.forceOverwrite).configure(startDate=self.startDate,
                                                                  endDate=self.endDate,
                                                                  baseExtKey=baseExtKey)
     self.itemCount = 0
     self.prevDate = None
     for item in comparator.compare(lastRunDate=self.lastRunDate):
         if item.identifier.date != self.prevDate and self.storageTgt:
             self.storageTgt.flushCatalog()
         
         self.processingDate = date_util.localize(arrow.now().datetime)
         countIncr = self.innerLoopActivity(item)
         
         if countIncr:
             if not self.prevDate:
                 self.prevDate = item.identifier.date
          self.itemCount += countIncr

      if self.storageTgt:
          self.storageTgt.flushCatalog()
     return self.itemCount
Example #9
def localTimeStruct(timeStr):
    "Returns the 'specific times' time structure for Knack from the given time string."
    ourTime = date_util.localize(date_util.parseDate(timeStr))
    return {
        "date": ourTime.strftime("%m/%d/%Y"),
        "hours": int(ourTime.strftime("%I")),
        "minutes": ourTime.minute,
        "am_pm": ourTime.strftime("%p")
    }
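
localTimeStruct shapes a timestamp into the nested "specific times" structure that the Knack record in Example #2 expects. A sketch of the same conversion using only the standard library in place of date_util (the America/Chicago zone and the ISO-format input are assumptions):

import datetime
from zoneinfo import ZoneInfo  # Python 3.9+

def localTimeStructSketch(timeStr, tz="America/Chicago"):
    "Parse an ISO-format timestamp and shape it like the Knack 'specific times' structure."
    ourTime = datetime.datetime.fromisoformat(timeStr).astimezone(ZoneInfo(tz))
    return {
        "date": ourTime.strftime("%m/%d/%Y"),
        "hours": int(ourTime.strftime("%I")),  # 12-hour clock
        "minutes": ourTime.minute,
        "am_pm": ourTime.strftime("%p")
    }

print(localTimeStructSketch("2020-03-02T14:45:00-06:00"))
# {'date': '03/02/2020', 'hours': 2, 'minutes': 45, 'am_pm': 'PM'}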
Example #10
 def readAllJobs(self, timestampIn):
     """
     Reads all jobs activity for the given processing day of the timestamp.
     """
     day = date_util.roundDay(date_util.localize(timestampIn))
      command = {
          "select": "data_source,stage,seconds,records,processing_date,collection_start,collection_end",
          "processing_date": [
              "gte.%s" % str(day),
              "lt.%s" % str(date_util.localize(day.replace(tzinfo=None) + datetime.timedelta(days=1)))
          ],
          "order": "data_source,stage"
      }
     return self.jobDB.select(params=command)
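
The readAllJobs query above restricts processing_date to one day by pairing a gte. lower bound with a lt. bound one day later, i.e. a half-open [midnight, next midnight) window. A small sketch of building that filter pair (the gte./lt. prefixes are just whatever operator strings jobDB.select understands):

import datetime

def dayFilter(timestampIn):
    "Build the [midnight, next midnight) filter strings used to select one processing day."
    day = timestampIn.replace(hour=0, minute=0, second=0, microsecond=0)
    nextDay = day + datetime.timedelta(days=1)
    return ["gte.%s" % str(day), "lt.%s" % str(nextDay)]

print(dayFilter(datetime.datetime(2020, 3, 2, 13, 45)))
# ['gte.2020-03-02 00:00:00', 'lt.2020-03-03 00:00:00']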
Example #11
def _parseTimeShort(inTime):
    "Parses the time string as encountered in the Bluetooth source files."

    try:
        return str(
            date_util.localize(
                datetime.datetime.strptime(inTime, "%m/%d/%Y %I:%M %p")))
    except (ValueError, TypeError):
        return None
Example #12
 def __init__(self, sameDay=False):
     """
     Base constructor.
     
     @param sameDay: If True, allows a last update that happens "today" to be processed, if there is no end date specified.
     """
     self.startDate = None
     self.endDate = None
     self.sameDayDate = date_util.localize(datetime.datetime.now()).replace(hour=0, minute=0, second=0, microsecond=0) \
                     if not sameDay else None
Example #13
def makeHeader(areaBase, device, sameDay=False):
    """
    Utility function for unit data retriever classes that builds up a header
    """
    currentTime = date_util.localize(arrow.now().datetime)
    ourDay = currentTime.replace(hour=0, minute=0, second=0, microsecond=0)
    if not sameDay:
        ourDay = date_util.localize(
            ourDay.replace(tzinfo=None) - datetime.timedelta(days=1))

    targetFilename = "{}_{}_{}_unit_data".format(areaBase, device,
                                                 ourDay.strftime("%Y-%m-%d"))
    header = {
        "data_type": "{}_unit_data".format(device),
        "target_filename": targetFilename + ".json",
        "collection_date": str(ourDay),
        "processing_date": str(currentTime)
    }
    return header
Example #14
def fillDayRecords(ourDate, countsFileData, ident, receiver):
    "Caution: this mutates countsFileData."

    ourDateMax = date_util.localize(
        ourDate.replace(tzinfo=None) + datetime.timedelta(days=1))
    for item in countsFileData["data"]:
        timestamp = arrow.get(item["timestamp_adj"]).datetime
        if timestamp >= ourDate and timestamp < ourDateMax:
            # This record falls within range:
            item["zone"] = ident
            receiver.append(item)
Example #15
 def recordCollect(self, timestampIn, representsDay=False):
     """
     Tracks the maximum and minimum timestamps. Note that this performs comparisons without localization.
     """
     timestampEnd = date_util.localize(
         timestampIn.replace(tzinfo=None) +
         datetime.timedelta(days=1)) if representsDay else timestampIn
     if not self.collectTimeStart:
         self.collectTimeStart = timestampIn
     if not self.collectTimeEnd:
         self.collectTimeEnd = timestampEnd
     self.collectTimeStart = min(self.collectTimeStart, timestampIn)
     self.collectTimeEnd = max(self.collectTimeEnd, timestampEnd)
Example #16
 def writeJob(self, perfMet):
     """
     Writes the job information to the job log.
     """
      metadata = {
          "data_source": perfMet.dataSource,
          "stage": perfMet.stage,
          "seconds": perfMet.processingTotal,
          "records": perfMet.records,
          "processing_date": str(perfMet.processingTime),
          "collection_start": str(date_util.localize(perfMet.collectTimeStart))
                              if perfMet.collectTimeStart else None,
          "collection_end": str(date_util.localize(perfMet.collectTimeEnd))
                            if perfMet.collectTimeEnd else None
      }
     self.jobDB.upsert(metadata)
Example #17
 def query(self, earlyDate=None, lateDate=None):
     """
     Does a quick search to check for the presence of records. Returns dictionary of days with number of records as values.
     """
     ret = {}
     cursor = self.conn.cursor()
     sql = "SELECT CAST(CURDATETIME AS date), COUNT(1) FROM KITSDB.KITS.SYSDETHISTORYRM"
     sql += self._buildDatePart(earlyDate, lateDate, includeWhere=True)
     sql += " GROUP BY CAST(CURDATETIME AS date);"
     cursor.execute(sql)
     for row in cursor:
         ret[date_util.localize(
             datetime.datetime.strptime(row[0], "%Y-%m-%d"))] = int(row[1])
     return ret
Example #18
    def readAllObs(self,
                   timestampIn,
                   earlyDate=None,
                   dataSource=None,
                   obsType=None):
        """
        Reads all observations activity for the given collection day of the timestamp.
        """
        if not earlyDate:
            timestampIn = date_util.roundDay(date_util.localize(timestampIn))
            earlyDate = date_util.localize(
                timestampIn.replace(tzinfo=None) - datetime.timedelta(days=1))
            collDateClause = [
                "gte.%s" % str(timestampIn),
                "lt.%s" % str(
                    date_util.localize(
                        timestampIn.replace(tzinfo=None) +
                        datetime.timedelta(days=1)))
            ]
        else:
            collDateClause = [
                "gt.%s" % str(earlyDate),
                "lte.%s" % str(timestampIn)
            ]

        command = {
            "select": "data_source,sensor_name,data_type,data,expected,collection_date,timestamp_min,timestamp_max",
            "collection_date": collDateClause,
            "order": "data_type,sensor_name,collection_date"
        }
        if dataSource:
            command["data_source"] = "eq.%s" % dataSource
        if obsType:
            command["data_type"] = "eq.%s" % obsType
        return self.obsDB.select(params=command)
Example #19
    def _getLogDates(self):
        """
        _getLogDates internally retrieves the log date list from the device.
        """
        baseURL = self.device.getURL()
        try:
            webResponse = requests.get(baseURL + "counts.json")
        except:
            print("Problem base URL: %s" % baseURL, file=sys.stderr)
            raise

        countsAvail = webResponse.json()
        for item in countsAvail:
            self.avail.add(
                date_util.localize(datetime.datetime.strptime(
                    item, "%Y-%m-%d")))
Example #20
 def getLatestTimestamp(self, earlyDate=None, lateDate=None):
     """
     Performs a query to determine the latest timestamp encountered that sits between the date range (default: no bound).
     
     @return the latest timestamp, including date and time
     """
     cursor = self.conn.cursor()
     sql = "SELECT TOP 1 CURDATETIME FROM KITSDB.KITS.SYSDETHISTORYRM"
     sql += self._buildDatePart(earlyDate, lateDate, includeWhere=True)
     sql += " ORDER BY CURDATETIME DESC;"
     cursor.execute(sql)
     row = cursor.fetchone()
     if row:
         return date_util.localize(
             datetime.datetime.strptime(row[0], "%Y-%m-%d"))
     return None
Example #21
def wtStandardize(storageItem, filepathSrc, filenameTgt, processingDate):
    """
    Performs the actual Wavetronix standardization, which is basically doing a direct translation from CSV to JSON
    """
    # Define header:
    jsonHeader = {"data_type": "wavetronix",
                  "origin_filename": os.path.basename(filepathSrc),
                  "target_filename": filenameTgt,
                  "collection_date": str(storageItem.identifier.date),
                  "processing_date": str(processingDate)}

    # Read in the file:
    data = []
    perfWork = {} # This will be sensor -> [count, minTime, maxTime]
    with open(filepathSrc, "rt") as fileReader:
        reader = csv.DictReader(fileReader)
        for row in reader:
            data.append({"detID": int(row["detID"]),
                         "intID": int(row["intID"]),
                         "curDateTime": str(date_util.localize(datetime.datetime.strptime(row["curDateTime"], "%Y-%m-%d %H:%M:%S"))),
                         "intName": row["intName"],
                         "detName": row["detName"],
                         "volume": int(row["volume"]),
                         "occupancy": int(row["occupancy"]),
                         "speed": int(row["speed"]),
                         "status": row["status"],
                         "uploadSuccess": int(row["uploadSuccess"]),
                         "detCountComparison": int(row["detCountComparison"]),
                         "dailyCumulative": int(row["dailyCumulative"])})
            
            # Performance metrics:
            if row["intName"] and str(row["intName"] != "nan"):
                if row["intName"] not in perfWork:
                    perfWork[row["intName"]] = [0, row["curDateTime"], row["curDateTime"]]
                recs = perfWork[row["intName"]]
                recs[0] += int(row["volume"])
                if row["curDateTime"]:
                    if row["curDateTime"] < recs[1]:
                        recs[1] = row["curDateTime"]
                    elif row["curDateTime"] > recs[2]:
                        recs[2] = row["curDateTime"]
                
    # We're complete!
    ret = {"header": jsonHeader,
           "data": data}
    return ret, perfWork 
Example #22
    def prepare(self, startDate, endDate):
        """
        Initializes the query between the start date and the end date. If startDate and endDate are
        the same, then only results for that exact time are queried.
        """
        super().prepare(startDate, endDate)

        # Get the unique dates that are within the time range:
        ourDatesSet = set()
        for index in range(len(self.pattList)):
            ourDates = self.dateDirs[index].getDates()
            for ourDate in ourDates:
                ourDate = date_util.localOverwrite(
                    ourDate) if not self.assumeUTC else date_util.localize(
                        ourDate)
                if (not startDate or ourDate >= startDate) \
                        and (not endDate or ourDate < endDate or startDate == endDate and startDate == ourDate):
                    ourDatesSet.add(ourDate)
        self.dateList = list(ourDatesSet)
        self.dateList.sort()
Example #23
    def innerLoopActivity(self, item):
        """
        This is where the actual ETL activity is called for the given compare item.
        """
        print("%s: %s" % (item.label, self.storageSrc.repository))
        data = self.storageSrc.retrieveJSON(item.label)
        header = data["header"]
        
        # Collect movement information:
        movements = []
        for camera in data["site"]["site"]["CameraDevices"]:
            for zoneMask in camera["Fisheye"]["CameraMasks"]["ZoneMasks"]:
                if "Vehicle" in zoneMask:
                    movements.append({"zone_approach": zoneMask["Vehicle"]["ApproachType"],
                                      "turn_type": zoneMask["Vehicle"]["TurnType"],
                                      "zone": zoneMask["Vehicle"]["Id"]})
        
        # Process the counts:
        countData = pd.DataFrame(data["counts"])
        countData['heavy_vehicle'] = np.where(countData.vehicle_length < 17, 0, 1)
        # In the following line, we convert to UTC because there's a bug in the grouper that doesn't deal with
        # the end of daylight savings time.
        countData['timestamp'] = pd.to_datetime(countData["timestamp_adj"], utc=True)
        countData = countData.merge(pd.DataFrame(movements), on='zone')

        # Do the grouping:        
        colValues = [pd.Grouper(key='timestamp', freq=('%ds' % (self.args.agg * 60))), 'zone_approach', 'turn', 'heavy_vehicle']
        grouped = countData.groupby(colValues)
        volume = grouped.size().reset_index(name='volume')
        avgSpeed = grouped.agg({'speed': 'mean'}).round(3).reset_index().rename(columns={'speed': 'speed_avg'})
        stdSpeed = grouped.agg({'speed': 'std'}).fillna(0).round(3).reset_index().rename(columns={'speed': 'speed_std'})
        avgSecInZone = grouped.agg({'seconds_in_zone': 'mean'}).round(3).reset_index().rename(columns={'seconds_in_zone': 'seconds_in_zone_avg'})
        stdSecInZone = grouped.agg({'seconds_in_zone': 'std'}).round(3).fillna(0).reset_index().rename(columns={'seconds_in_zone': 'seconds_in_zone_std'})

        # Merging all information
        colValues[0] = "timestamp"
        summarized = volume.merge(avgSpeed, on=colValues).merge(stdSpeed, on=colValues).merge(avgSecInZone, on=colValues).merge(stdSecInZone, on=colValues)
        summarized = summarized[['timestamp', 'zone_approach', 'turn', 'heavy_vehicle',
                                'volume', 'speed_avg', 'speed_std', 'seconds_in_zone_avg', 'seconds_in_zone_std']]
        # While converting the timestamp to a string, we also convert it back to our local time zone to counter
        # the grouping/UTC workaround that was performed above.
        summarized["timestamp"] = summarized["timestamp"].dt.tz_convert(date_util.LOCAL_TIMEZONE).astype(str)
        
        # Update the header
        header["processing_date"] = str(date_util.localize(arrow.now().datetime))
        header["agg_interval_sec"] = self.args.agg * 60
        
        # Assemble together the aggregation file:
        newFileContents = {"header": header,
                           "data": summarized.apply(lambda x: x.to_dict(), axis=1).tolist(),
                           "site": data["site"],
                           "device": data["device"]}
        
        # Write the aggregation:
        catalogElement = self.storageTgt.createCatalogElement(item.identifier.base, "agg%d.json" % self.args.agg,
                                                              item.identifier.date, self.processingDate)
        self.storageTgt.writeJSON(newFileContents, catalogElement)
            
        # Performance metrics logging:
        self.perfmet.recordCollect(item.identifier.date, representsDay=True)
        
        return 1
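
The aggregation in Example #23 hinges on pandas' pd.Grouper, which bins the timestamp column into fixed intervals before the per-group size/mean/std calls. A toy sketch of that grouping on made-up counts (15-minute bins; the column names are illustrative, not the project's schema):

import pandas as pd

counts = pd.DataFrame({
    "timestamp": pd.to_datetime(["2020-03-02 00:03", "2020-03-02 00:07",
                                 "2020-03-02 00:20", "2020-03-02 00:22"], utc=True),
    "zone_approach": ["NORTHBOUND", "NORTHBOUND", "NORTHBOUND", "SOUTHBOUND"],
    "speed": [31.0, 28.0, 35.0, 22.0]
})

grouped = counts.groupby([pd.Grouper(key="timestamp", freq="15min"), "zone_approach"])
volume = grouped.size().reset_index(name="volume")
speedAvg = grouped.agg({"speed": "mean"}).round(3).reset_index().rename(columns={"speed": "speed_avg"})

summary = volume.merge(speedAvg, on=["timestamp", "zone_approach"])
print(summary)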
Example #24
    def _jsonizeWork(self, fileDict):
        n = len(fileDict)
        i = 0
        self.apiVersion = self.getAPIVersion(fileDict)
        self.setDataColumns()
        for key, value in fileDict.items():
            guid = key
            csvPath = value  # Recall this is a temporary location from unzipped
            collDateStr = str(self.item.identifier.date)
            targetFilename = (self.item.identifier.base + '_' + guid + '_' +
                              collDateStr.split()[0] + '.json')

            print(("Working on file {}").format(csvPath))
            # Initiate json object
            jsonData = {'header': self.header, 'data': None}
            # Add header information
            jsonData['header']['origin_filename'] = guid + '.csv'
            jsonData['header']['target_filename'] = targetFilename
            jsonData['header']['version'] = self.apiVersion
            jsonData['header']['guid'] = guid

            data = pd.read_csv(csvPath, header=None, names=self.columns)
            jsonData['data'] = data.apply(lambda x: x.to_dict(),
                                          axis=1).tolist()

            # Fix the time representation. First, find the time delta:
            errs = {}
            newData = []
            try:
                hostTimeUTC = self._getTime(
                    self.siteFile["datetime"]["HostTimeUTC"])
                deviceTime = self._getTime(
                    self.siteFile["datetime"]["DateTime"],
                    self.siteFile["datetime"]["TimeZoneId"].split()[0])
                timeDelta = hostTimeUTC - deviceTime

                # At this point, collect an indication of whether this file accounts for some of the previous day, or some of the
                # next day.
                collDatetime = self.item.identifier.date.replace(hour=0,
                                                                 minute=0,
                                                                 second=0,
                                                                 microsecond=0)
                timestamp = None
                if self.apiVersion == 8 and jsonData['data']:
                    timestamp = datetime.datetime.strptime(
                        collDateStr.split()[0] + " 000000", "%Y-%m-%d %H%M%S")
                    timestamp -= datetime.timedelta(
                        minutes=jsonData['data'][0]['utc_offset'])
                    timestamp = pytz.utc.localize(timestamp)
                    timestamp = date_util.localize(timestamp + timeDelta)
                elif self.apiVersion == 7:
                    print(
                        "WARNING: 'timestamp_adj' processing not provided for API v7!"
                    )
                    # TODO: Figure out the date parsing needed for this.
                elif self.apiVersion == 4:
                    timestamp = datetime.datetime.strptime(
                        collDateStr.split()[0] + " 000000", "%Y-%m-%d %H%M%S")
                    timestamp = pytz.utc.localize(timestamp)
                    timestamp = date_util.localize(timestamp + timeDelta)
                if timestamp:
                    if timestamp < collDatetime:
                        jsonData['header']['day_covered'] = -1
                    elif timestamp == collDatetime:
                        jsonData['header']['day_covered'] = 0
                    else:
                        jsonData['header']['day_covered'] = 1

                # Add in "timestamp_adj" for each data item:
                for item in jsonData['data']:
                    try:
                        if self.apiVersion == 8:
                            # TODO: The UTC Offset doesn't seem to reflect DST. Should we ignore it and blindly localize instead?
                            #       We can figure this out by seeing what the latest count is on a live download of the current day.
                            timestamp = datetime.datetime.strptime(collDateStr.split()[0] + " " \
                                + ("%06d" % int(float(item['timestamp']))) + "." + str(round((item['timestamp'] % 1) * 10) * 100000),
                                "%Y-%m-%d %H%M%S.%f")
                            timestamp -= datetime.timedelta(
                                minutes=item['utc_offset'])
                            timestamp = pytz.utc.localize(timestamp)
                            item['timestamp_adj'] = str(
                                date_util.localize(timestamp + timeDelta))
                        elif self.apiVersion == 7:
                            print(
                                "WARNING: 'timestamp_adj' processing not provided for API v7!"
                            )
                            # TODO: Figure out the date parsing needed for this.
                        elif self.apiVersion == 4:
                            timestamp = datetime.datetime.strptime(
                                item['timestamp'], "%Y%m%dT%H%M%S" +
                                (".%f" if "." in item['timestamp'] else ""))
                            timestamp = pytz.utc.localize(timestamp)
                            item['timestamp_adj'] = str(
                                date_util.localize(timestamp + timeDelta))

                            item['count_version'] = int(item['count_version'])
                        if timestamp:
                            # Performance metrics:
                            if not self.perfWork[1]:
                                self.perfWork = [0, timestamp, timestamp]
                            self.perfWork[0] += 1
                            if timestamp < self.perfWork[1]:
                                self.perfWork[1] = timestamp
                            if timestamp > self.perfWork[2]:
                                self.perfWork[2] = timestamp
                        newData.append(item)
                    except ValueError as exc:
                        err = "WARNING: Value parsing error: " + str(exc)
                        if err not in errs:
                            errs[err] = 0
                        errs[err] += 1
                jsonData['data'] = newData
                for err in errs:
                    print(err + " (" + str(errs[err]) + ")")
            except KeyError:
                print(
                    "WARNING: Time representation processing has malfunctioned. Correct time key may not be present in site file."
                )
            except ValueError as exc:
                print(
                    "WARNING: Time representation processing has malfunctioned. Value parsing error:"
                )
                print(exc)

            # Write to storage object:
            catalogElement = self.storageTgt.createCatalogElement(
                self.item.identifier.base,
                guid + ".json",
                self.item.identifier.date,
                processingDate=self.processingDate)
            self.storageTgt.writeJSON(jsonData,
                                      catalogElement,
                                      cacheCatalogFlag=True)

            i += 1
            print("JSON standardization saved as {}".format(targetFilename))
            print("File {} out of {} done!".format(i, n))
Example #25
def processObs(perfMetDB,
               jobs,
               dataSource,
               stage,
               obsType,
               sampleDays=SAMPLE_DAYS,
               calcExpected=False):
    "Reads observations from the database and prepares them for sending to Knack."
    print("Processing new Knack '%s' observations..." % dataSource)
    rec = jobs[(jobs["data_source"] == dataSource)
               & (jobs["stage"] == stage)].copy()
    if len(rec) == 0:
        print("WARNING: No entry for '%s'/'%s' was found in etl_perfmet_job!" %
              (dataSource, stage))
        return None

    # Get processing record that covers the latest date:
    rec.sort_values("collection_end", ascending=False, inplace=True)
    rec = rec.iloc[0]

    # Retrieve observations for the given date range:
    lateDate = date_util.localize(date_util.parseDate(rec["collection_end"]))
    earlyDate = date_util.localize(
        lateDate.replace(tzinfo=None) - datetime.timedelta(days=sampleDays))
    observations = perfMetDB.readAllObs(lateDate,
                                        earlyDate=earlyDate,
                                        dataSource=dataSource,
                                        obsType=obsType)
    if not observations:
        print("WARNING: No observations are found for '%s', type '%s'." %
              (dataSource, obsType))
        return None
    observations = pd.DataFrame(observations)
    observations["collection_date"] = observations["collection_date"].apply(
        lambda t: date_util.localize(date_util.parseDate(t)))
    observations.sort_values("collection_date", ascending=False, inplace=True)

    # Pick out the one that covers the latest date:
    yesterday = date_util.localize(
        lateDate.replace(tzinfo=None) - datetime.timedelta(days=1))
    # TODO: If we end up processing hourly, then we'll need to change this beginning time mechanism.
    obsSubset = observations[(observations["collection_date"] <= lateDate)
                             & (observations["collection_date"] >= yesterday)]
    obsSubset = obsSubset.loc[obsSubset.groupby("sensor_name")
                              ["collection_date"].idxmax()]
    maxes = observations.loc[observations.groupby("sensor_name")
                             ["collection_date"].idxmax()]
    avgs = observations.groupby("sensor_name")["data"].mean()
    ret = pd.DataFrame()
    for index, obs in maxes.iterrows():
        # TODO: There's probably some fancy merge that we could do to make this process easier.
        if index not in obsSubset.index:
            # No recent entry available for this day! Make a fake entry, which is the most recent entry that had data.
            # That way, we can see when the data stopped.
            rec = maxes.loc[index].copy()
            rec["data"] = -1
        else:
            rec = obsSubset.loc[index].copy()
        if calcExpected:
            rec["expected"] = avgs[obs["sensor_name"]]
        ret = ret.append(rec)
    _uploadObs(yesterday, ret)
    return ret
Example #26
def main():
    # Parse command-line parameter:
    parser = ArgumentParser(description=PROGRAM_DESC,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument(
        "-r",
        "--last_run_date",
        help=
        "last run date, in YYYY-MM-DD format with optional time zone offset (default: yesterday)"
    )
    args = parser.parse_args()

    date_util.setLocalTimezone(config_app.TIMEZONE)
    if args.last_run_date:
        lastRunDate = date_util.parseDate(args.last_run_date, dateOnly=True)
        print("perfmet_knack: Last run date: %s" % str(lastRunDate))
    else:
        lastRunDate = date_util.roundDay(
            date_util.localize(datetime.datetime.now() -
                               datetime.timedelta(days=1)))

    # Find the most recent day for performance metrics:
    print("Finding most recent processing date...")
    perfMetDB = config_app.createPerfmetConn()
    recent = perfMetDB.getRecentJobsDate()
    if not recent:
        print(
            "ERROR: No recent processing date is found in the performance metrics DB."
        )
        return -1
    recent = date_util.roundDay(date_util.localize(
        date_util.parseDate(recent)))
    if recent < lastRunDate:
        print("ERROR: No processing date exists after %s" % str(lastRunDate))
        return -1
    print("The most recent processing date is %s." % str(recent))

    # Perform the activities:
    print("Retrieving old Knack entries...")
    jobData = retrieveJobs()
    obsData = retrieveObservations()

    print("Deleting old Knack entries...")
    delete(jobData, obsData)

    print("Uploading new Knack Jobs entries...")
    # Perform all retrieval and custom processing to get data ready for Knack:
    jobs = perfMetDB.readAllJobs(recent)
    jobs = pd.DataFrame(jobs)
    jobs = jobs.sort_values("processing_date").groupby(
        ["data_source", "stage"]).tail(1)
    jobs["stage"].replace("Socrata Agg.", "Socrata", inplace=True)
    jobs["stage"].replace("Ingest", "a. Ingest", inplace=True)
    jobs["stage"].replace("Standardize", "b. Standardize", inplace=True)
    jobs["stage"].replace("Ready", "c. Ready", inplace=True)
    jobs["stage"].replace("Aggregate", "d. Aggregate", inplace=True)
    jobs["stage"].replace("Publish", "e. Publish", inplace=True)

    uploadJobs(jobs)

    # Deal with observations here:
    processObs(perfMetDB,
               jobs,
               "Bluetooth",
               "b. Standardize",
               "Unmatched Entries",
               calcExpected=True)
    processObs(perfMetDB,
               jobs,
               "Wavetronix",
               "b. Standardize",
               "Vehicle Counts",
               calcExpected=True)
    processObs(perfMetDB,
               jobs,
               "GRIDSMART",
               "b. Standardize",
               "Vehicle Counts",
               calcExpected=True)

    print("Done.")
    return 1
Example #27
    def _ingestArgs(self, args):
        """
        This is where arguments are ingested and set to Initializer class-level attributes. Override
        this and call the parent if custom arguments need to be processed.
        """
        # Local time zone:
        date_util.setLocalTimezone(config.getLocalTimezone())
        
        # Last run date:
        if hasattr(args, "last_run_date") and args.last_run_date:
            self.lastRunDate = date_util.parseDate(args.last_run_date, dateOnly=self.parseDateOnly)
            print("Last run date: %s" % str(self.lastRunDate))
        else:
            self.lastRunDate = None
    
        # Start date, or number of days back:
        if hasattr(args, "start_date") and args.start_date:
            try:
                dateEarliest = int(args.start_date)
                self.startDate = date_util.localize(arrow.now()
                    .replace(hour=0, minute=0, second=0, microsecond=0)
                    .shift(days=-dateEarliest).datetime)
            except ValueError:
                self.startDate = date_util.parseDate(args.start_date, dateOnly=self.parseDateOnly)
        else:
            self.startDate = None

        # End date:
        if hasattr(args, "end_date") and args.end_date:
            self.endDate = date_util.parseDate(args.end_date, dateOnly=self.parseDateOnly)
        else:
            self.endDate = None
            
        if self.startDate or self.endDate:
            dateStr = "INFO: Processing time range:"
            if self.startDate:
                dateStr += " " + str(self.startDate)
                if self.endDate:
                    if self.endDate == self.startDate:
                        dateStr += " only"
                    else:
                        dateStr += " up to " + str(self.endDate)
                else:
                    dateStr += " onward"
            print(dateStr + ".")
        if not self.lastRunDate and not self.startDate:
            raise Exception("A last_run_date or start_date must be specified.")
            
        # Force overwrite:
        if hasattr(args, "force"):
            self.forceOverwrite = args.force
            if self.forceOverwrite:
                print("INFO: Force mode is on: items will be overwritten.") 
            
        # Production mode:
        self.productionMode = config.electProductionMode(not args.debug) \
            if hasattr(args, "debug") else config.electProductionMode()
        if not self.productionMode:
            print("INFO: Debug mode is enabled.")
    
        # Debugging features:
        if hasattr(args, "simulate"):
            self.simulationMode = args.simulate
            if self.simulationMode:
                print("INFO: Simulated write mode is enabled.")
        if hasattr(args, "output_filepath"):
            self.writeFilePath = args.output_filepath
            if self.writeFilePath:
                print("INFO: Write file path is: %s" % self.writeFilePath)
            
        # Set up temporary output directory:
        if self.needsTempDir:
            self.tempDir = tempfile.mkdtemp()
            print("INFO: Created holding place: %s" % self.tempDir)
Example #28
    def _processDay(self, date):
        """
        The code is set up to collect all catalog entries for each day. Here, we analyze the alignment of logged
        entries with the actual time (e.g. there's clock drift or bad time zones), retrieve the records that we
        need, and then create a new time-aligned, completed JSON count for each intersection. 
        """
        count = 0

        # Obtain unit data:
        unitData = self.unitDataProv.retrieve(date)

        # Iterate through each intersection:
        sortedBases = sorted(self.bases)
        for base in sortedBases:
            print("== " + base + ": " + date.strftime("%Y-%m-%d") + " ==")

            # Step 1: Get site file:
            siteFileCatElem, newSiteFlag = self.siteFileCatElems.getForPrevDate(
                base, date, forceValid=True)
            if not siteFileCatElem:
                print("ERROR: No site file is found for '%s' for date %s." %
                      (base, str(date)))
                continue
            if not newSiteFlag:
                siteFile = self.siteFileCache[base]
            else:
                # Get site file from repository if needed:
                siteFile = json.loads(
                    self.storageSrc.retrieveBuffer(siteFileCatElem["pointer"]))
                self.siteFileCache[base] = siteFile

            # Step 2: Resolve the base to the units file:
            # Basically we need to take site.Location.Street1 and .Street2 and positively identify the
            # corresponding record in unit_data.devices[].primary_st and .cross_st.

            # Stage 0: First try to see if we are explicitly called out in config.KNACK_LOOKUPS.
            matchedDevice = None
            reverseFlag = False
            testStr = (siteFile["site"]["Location"]["Street1"].strip() + "_" +
                       siteFile["site"]["Location"]["Street2"].strip())
            if testStr in config_app.KNACK_LOOKUPS:
                for deviceItem in unitData["devices"]:
                    if deviceItem["atd_location_id"] == config_app.KNACK_LOOKUPS[testStr]:
                        matchedDevice = deviceItem
                        break
                else:
                    print(
                        "WARNING: The respective ID '%s' is not found for the 'KNACK_LOOKUPS' entry for '%s'."
                        % (config_app.KNACK_LOOKUPS[testStr], testStr))
            else:
                # Stage 1: Do fuzzy matching to match respective Knack entry:
                matchedDevice = None
                street1Sub = siteFile["site"]["Location"]["Street1"].strip()
                street2Sub = siteFile["site"]["Location"]["Street2"].strip()
                testStr = (street1Sub + " " + street2Sub).lower()

                compareList = []
                for deviceItem in unitData["devices"]:
                    if str(deviceItem["primary_st"]) == "nan" or str(
                            deviceItem["cross_st"]) == "nan":
                        continue
                    if not deviceItem["primary_st"]:
                        deviceItem["primary_st"] = ""
                    if not deviceItem["cross_st"]:
                        deviceItem["cross_st"] = ""
                    compareList.append(
                        _CompareEntry((deviceItem["primary_st"].strip() + " " +
                                       deviceItem["cross_st"].strip()).lower(),
                                      False, deviceItem))
                    compareList.append(
                        _CompareEntry(
                            (deviceItem["cross_st"].strip() + " " +
                             deviceItem["primary_st"].strip()).lower(), True,
                            deviceItem))
                winningEntry, maxRatio = _findFuzzyWinner(compareList, testStr)
                if maxRatio < MIN_MATCH_RATIO:
                    # Stage 2: Try fuzzy matching with "STREET_SYNONYMS" string substitutions if they're available.
                    if street1Sub in config_app.STREET_SYNONYMS:
                        street1Sub = config_app.STREET_SYNONYMS[street1Sub]
                    if street2Sub in config_app.STREET_SYNONYMS:
                        street2Sub = config_app.STREET_SYNONYMS[street2Sub]
                    testStr2 = (street1Sub + " " + street2Sub).lower()
                    if testStr != testStr2:
                        winningEntry, maxRatio = _findFuzzyWinner(
                            compareList, testStr2)
                    if maxRatio < MIN_MATCH_RATIO:
                        # Stage 3: Try matching IP addresses.
                        print(
                            "WARNING: No unit_data device could be discerned by name for GRIDSMART device '%s'."
                            % base)
                        if "device_net_addr" in siteFile["header"]:
                            netAddr = siteFile["header"]["device_net_addr"]
                            for deviceItem in unitData["devices"]:
                                if deviceItem["device_ip"] == netAddr:
                                    print(
                                        "INFO: Matched IP address %s: '%s/%s'."
                                        % (netAddr, deviceItem["primary_st"],
                                           deviceItem["cross_st"]))
                                    matchedDevice = deviceItem
                                    break
                            else:
                                # Stage 4: Try GPS coordinate matching.
                                print(
                                    "WARNING: Could not match by IP address.")
                                minDistance = None
                                minDistDevice = None
                                for deviceItem in unitData["devices"]:
                                    dist = gps_h.gps2feet(
                                        float(siteFile["site"]["Location"]
                                              ["Latitude"]),
                                        float(siteFile["site"]["Location"]
                                              ["Longitude"]),
                                        float(deviceItem["lat"]),
                                        float(deviceItem["lon"]))
                                    if minDistance is None or minDistance > dist:
                                        minDistance = dist
                                        minDistDevice = deviceItem

                                if minDistance is not None and minDistance < MAX_DIST:
                                    print(
                                        "Matched at %d feet to nearest GPS coords: '%s/%s'"
                                        % (minDistance, minDistDevice["primary_st"],
                                           minDistDevice["cross_st"]))
                                    matchedDevice = minDistDevice
                                else:
                                    print(
                                        "WARNING: Also could not match to nearest GPS coordinates."
                                    )
                    else:
                        print("INFO: Matched on substituted string key: '%s'" %
                              testStr2)
                if maxRatio >= MIN_MATCH_RATIO:
                    matchedDevice = winningEntry.item
                    reverseFlag = winningEntry.reverseFlag

                # Caution: This mutates the device file cache.
                if matchedDevice:
                    matchedDevice["reversed"] = reverseFlag

            # Step 3: Gather counts files:
            # Iterate through the GUID/approach files:
            countsReceiver = []
            repHeader = None
            dayDirErr = 0
            for cameraDeviceItem in siteFile["site"]["CameraDevices"]:
                print("Camera MAC address: %s" %
                      cameraDeviceItem["Fisheye"]["MACAddress"])
                if not cameraDeviceItem["Fisheye"]["IsConfigured"]:
                    print("Ignoring because it isn't configured.")
                    continue
                for zoneMaskItem in cameraDeviceItem["Fisheye"]["CameraMasks"][
                        "ZoneMasks"]:
                    if "Vehicle" not in zoneMaskItem:
                        continue
                    if not zoneMaskItem["Vehicle"]["IncludeInData"]:
                        continue
                    ident = zoneMaskItem["Vehicle"]["Id"]
                    guid = (ident[0:8] + "-" + ident[8:12] + "-" + ident[12:16] + "-" +
                            ident[16:20] + "-" + ident[20:])

                    # First, get the current day's file:
                    curDayCounts = getCountsFile(date, base, guid,
                                                 self.storageSrc)
                    if curDayCounts:
                        try:
                            fillDayRecords(date, curDayCounts, ident,
                                           countsReceiver)
                        except KeyError:
                            traceback.print_exc()
                            continue

                        # Next, figure out which supplemental day's file we need in order to get the full picture:
                        auxDate = None
                        if "day_covered" in curDayCounts["header"]:
                            if curDayCounts["header"]["day_covered"] == 1:
                                auxDate = date_util.localize(
                                    date.replace(tzinfo=None) -
                                    datetime.timedelta(days=1)
                                )  # We have to get some of yesterday.
                            elif curDayCounts["header"]["day_covered"] == -1:
                                auxDate = date_util.localize(
                                    date.replace(tzinfo=None) +
                                    datetime.timedelta(days=1)
                                )  # We have to get some of tomorrow.
                        else:
                            print(
                                "WARNING: 'day_covered' is missing from header; data from adjacent day may be missing."
                            )

                        auxDayCounts = None
                        header = curDayCounts["header"]
                        if auxDate:
                            del curDayCounts  # Memory management
                            auxDayCounts = getCountsFile(
                                auxDate, base, guid, self.storageSrc)
                            if auxDayCounts:
                                fillDayRecords(date, auxDayCounts, ident,
                                               countsReceiver)
                                """
                                try:
                                    fillDayRecords(date, auxDayCounts, ident, countsReceiver)
                                except KeyError:
                                    traceback.print_exc()
                                    continue
                                """
                            else:
                                print(
                                    "WARNING: GUID %s is not found for the auxiliary (%s) day file."
                                    % (guid, str(auxDate)))
                                dayDirErr = header["day_covered"]

                        # Store a representative header:
                        if not repHeader:
                            repHeader = header
                    else:
                        print(
                            "WARNING: GUID %s is not found for current day file."
                            % guid)

            # Completion checking:
            if dayDirErr == 1 and not self.args.ignore_prev:
                print(
                    "ERROR: No records from previous day were found, aborting. This can be ignored if the -p flag is specified."
                )
                continue
            elif dayDirErr == -1 and not self.args.ignore_next:
                print(
                    "ERROR: No records from next day were found, aborting. This can be ignored if the -n flag is specified."
                )
                continue

            if not countsReceiver:
                print("ERROR: No counts were found.")
            else:
                # Step 4: Write out compiled counts:
                countsReceiver.sort(key=lambda c: c["timestamp_adj"])

                header = {
                    "data_type": "gridsmart",
                    "zip_name": repHeader["zip_name"],
                    "collection_date": repHeader["collection_date"],
                    "processing_date": str(self.processingDate),
                    "version": repHeader["version"]
                }

                newFileContents = {
                    "header": header,
                    "counts": countsReceiver,
                    "site": siteFile,
                    "device": matchedDevice if matchedDevice else []
                }

                # TODO: Continue to see out how to positively resolve NORTHBOUND, EASTBOUND, etc. to street geometry.
                catalogElem = self.storageTgt.createCatalogElement(
                    base, "counts.json", date, self.processingDate)
                print("INFO: Writing: " + catalogElem["pointer"])
                self.storageTgt.writeJSON(newFileContents,
                                          catalogElem,
                                          cacheCatalogFlag=False)
                # We turned off cacheCatalogFlag because we're writing many files, and would have to specially handle the last day.

                # Performance metrics:
                self.perfmet.recordCollect(date, representsDay=True)

                # Increment count:
                count += 1

        self.bases.clear()
        return count
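
Stages 1 and 2 above pick the unit_data device whose concatenated street names best fuzzy-match the site file's streets, keeping the winner only if its ratio clears MIN_MATCH_RATIO; the _findFuzzyWinner helper itself is not shown in these examples. A hypothetical sketch of that "best ratio wins, subject to a floor" idea using difflib (the 0.9 threshold and the entry fields are assumptions):

import difflib
from collections import namedtuple

_CompareEntry = namedtuple("_CompareEntry", "key reverseFlag item")
MIN_MATCH_RATIO = 0.9  # assumption; the real threshold lives in the application config

def _findFuzzyWinnerSketch(compareList, testStr):
    "Return (entry, ratio) for the compare entry whose key is most similar to testStr."
    winner, maxRatio = None, 0.0
    for entry in compareList:
        ratio = difflib.SequenceMatcher(None, entry.key, testStr).ratio()
        if ratio > maxRatio:
            winner, maxRatio = entry, ratio
    return winner, maxRatio

compareList = [
    _CompareEntry("lamar blvd parmer ln", False, {"device_id": 17}),
    _CompareEntry("parmer ln lamar blvd", True, {"device_id": 17})
]
winner, ratio = _findFuzzyWinnerSketch(compareList, "lamar blvd parmer lane")
if ratio >= MIN_MATCH_RATIO:
    print("Matched:", winner.item, "reversed:", winner.reverseFlag, "ratio:", round(ratio, 3))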