Example no. 1
def __init__(self, accessPointJob, accessPointObs, apiKey, needsObs=False):
    """
    Initializes the connection to the PostgREST instance.

    @param accessPointJob: the PostgREST "etl_perfmet_job" table endpoint
    @param accessPointObs: the PostgREST "etl_perfmet_obs" table endpoint
    @param apiKey: the PostgREST API key needed to write to the endpoints
    @param needsObs: set this to True to enable the writing of observations.
    """
    self.jobDB = Postgrest(accessPointJob, auth=apiKey)
    self.obsDB = None
    if needsObs:
        self.obsDB = Postgrest(accessPointObs, auth=apiKey)
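The two attributes above wrap pypgrest's Postgrest client, which the rest of these examples use through its select() and upsert() methods. A minimal sketch of those two calls, assuming a placeholder endpoint, API key, and record (not a real service):

from pypgrest import Postgrest

# Placeholder endpoint and key for illustration only.
client = Postgrest("https://example.com/etl_perfmet_job", auth="secret-token")

# Read a few rows, newest first, using PostgREST-style query params.
rows = client.select(params={"limit": 5, "order": "processing_date.desc"})

# Insert-or-update a hypothetical job-log record.
client.upsert([{"data_source": "bt", "stage": "Ingest", "records": 1234}])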
Example no. 2
    def get_data(start):
        pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

        params = {
            "select":
            "trip_id,provider_id,start_latitude,start_longitude,end_latitude,end_longitude",
            "council_district_start": "is.null",
            "limit": interval,
            "offset": start
        }

        print('get data')
        trips = pgrest.select(params)

        print('split_trips')
        trips = split_trips(trips, field_map)

        print('create_points')
        trips = create_points(trips)

        print('point in grid')
        trips = point_in_poly(points=trips,
                              polys=grid,
                              null_val="OUT_OF_BOUNDS")

        print('point in council district')
        trips = point_in_poly(points=trips,
                              polys=districts,
                              row_property_key="council_district",
                              feature_property_key="district_n",
                              null_val=0)

        print('merge trips')
        trips = merge_trips(trips)

        print('reduce fields')
        trips = reduce_fields(trips, [
            field['name']
            for field in config.FIELDS if field.get('upload_postgrest')
        ])

        print('post trips')
        post_trips(pgrest, trips)
        return trips
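get_data() above pulls one chunk of unprocessed trips at a time, so a caller presumably advances the offset until nothing comes back. A minimal driver sketch under that assumption (get_data and interval are the names from the snippet above; the stopping condition is illustrative, not taken from the source):

start = 0
total = 0
while True:
    trips = get_data(start)        # process and post one chunk
    if not trips:
        break                      # no unprocessed trips left
    total += len(trips)
    start += interval              # interval is the chunk size used by get_data
print("{} trips processed".format(total))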
Example no. 3
def main():

    args = cli_args()
    
    auth = KNACK_CREDENTIALS[args.app_name]

    cfg_dataset = cfg[args.dataset]

    filters = knackutil.date_filter_on_or_after(
        args.last_run_date, cfg_dataset["modified_date_field_id"]
    )

    kn = knackpy_wrapper(cfg_dataset, auth, filters=filters)

    if not kn.data:
        return 0

    # Filter data for records that have been modified after the last
    # job run
    last_run_timestamp = arrow.get(args.last_run_date).timestamp * 1000

    kn.data = filter_by_date(
        kn.data, cfg_dataset["modified_date_field"], last_run_timestamp
    )

    if not kn.data:
        return 0

    pgrest = Postgrest(cfg_dataset["pgrest_base_url"], auth=JOB_DB_API_TOKEN)

    for record in kn.data:
        # convert millisecond timestamp to ISO format
        record[cfg_dataset["modified_date_field"]] = arrow.get(
            record[cfg_dataset["modified_date_field"]] / 1000
        ).format()

    kn.data = datautil.lower_case_keys(kn.data)
    
    pgrest.upsert(kn.data)

    return len(kn.data)
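The loop above converts Knack's millisecond timestamps to ISO-style strings with arrow before upserting. A small, self-contained illustration of that conversion (note that the `.timestamp * 1000` earlier in this script relies on the pre-1.0 arrow property; arrow 1.0+ exposes `.timestamp()` as a method, as used below):

import arrow

millis = 1_560_000_000_000                        # e.g. a Knack modified-date value
iso_string = arrow.get(millis / 1000).format()    # -> '2019-06-08 13:20:00+00:00'

# and back again, to compare against a last-run date expressed in milliseconds
back_to_millis = int(arrow.get(iso_string).timestamp() * 1000)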
Example no. 4
def main():
    def async_wrapper(end_time):
        try:
            t0 = time.time()

            data = get_data(client, end_time, interval, paging)

            elapsed = time.time() - t0

            logging.info('Request Time: {} seconds for {} records'.format(
                elapsed, len(data)))

            min_max(data)

            data = parse_routes(data)

            data = drop_dupes(data)

            data = [{
                field['name']: row[field['name']]
                for field in config.FIELDS if field.get('upsert_mds')
            } for row in data]

            data = floats_to_iso(data, [
                field['name']
                for field in config.FIELDS if field.get('datetime')
            ])

            if data:
                results = post_data(pgrest, data)
                return data

        except Exception as e:
            logging.error(e)
            logging.error("{} to {} FAILED".format(end_time - interval,
                                                   end_time))

    args = cli_args()

    provider_name = args.provider_name

    cfg = secrets.PROVIDERS[provider_name]

    pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

    start = args.start
    end = args.end

    if not start:
        # use the most recent record as the start date
        start = most_recent(pgrest, cfg["provider_id"])
        start = to_unix(start)

    if not end:
        # use current time as the end date
        end = int(datetime.today().timestamp())

    interval = cfg["interval"]
    paging = cfg["paging"]

    if cfg.pop("time_format") == "mills":
        # mills to unix
        start, end, interval = int(start * 1000), int(end * 1000), int(
            interval * 1000)

    if not cfg["client_params"].get("token"):
        auth_info = cfg.get("oauth")
        url = auth_info.pop("url")
        cfg["client_params"]["token"] = get_token(url, auth_info)

    client = ProviderClient(**cfg["client_params"])

    total = 0
    workers = 6
    pool = ThreadPool(workers)

    results = pool.map(async_wrapper, range(start, end, interval))

    pool.close()

    pool.join()

    for result in results:
        if result:
            total += len(result)

    logging.info(total)
    return total
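ThreadPool is used here without its import being shown; presumably it is the thread-backed pool from the standard library (multiprocessing.dummy.Pool or multiprocessing.pool.ThreadPool). A standalone sketch of the same map/close/join pattern with a stand-in worker:

from multiprocessing.dummy import Pool as ThreadPool

def fetch_window(start_time):
    # stand-in for async_wrapper(); just echoes the window start
    return [start_time]

pool = ThreadPool(6)
results = pool.map(fetch_window, range(0, 3600, 900))   # four 15-minute windows
pool.close()
pool.join()

total = sum(len(r) for r in results if r)
print(total)   # -> 4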
Example no. 5
def main():
    args = cli_args()

    cfg_dataset = cfg[args.dataset]

    limit = cfg_dataset.get("limit")

    if not args.last_run_date or args.replace:
        last_run_date = "1970-01-01T00:00:00"
    else:
        last_run_date = datetime.utcfromtimestamp(int(args.last_run_date))
        last_run_date = last_run_date.isoformat()

    pgrest = Postgrest(cfg_dataset["pgrest_base_url"], auth=JOB_DB_API_TOKEN)
    '''
    The `interval` is the number of records which will be processed on each
    loop. It serves as the `offset` parameter, so it's the means by which we
    chunk records.

    1000 matches the maximum number of records returned by our PostgREST
    instance, so each loop = 1 request to the source db (PostgREST). We have
    disabled pagination in our requests (see below), so it's very important
    that our `offset` does not exceed the number of records the API will
    return in one request.
    '''
    interval = 1000

    offset = 0

    records_processed = 0

    while True:
        '''
        Download records in chunks, posting each chunk to Socrata.

        Note that the `order` param ensures that each offset request
        returns the expected chunk of records, by preserving the order
        in which records are returned. Ordering slows down the response
        from PostgREST, but it's necessary to ensure consistent
        results.
        '''
        params = {
            cfg_dataset["modified_date_field"]: f"gte.{last_run_date}",
            "limit": limit,
            "order": "{}.asc".format(cfg_dataset["modified_date_field"]),
            "offset": offset
        }
        '''
        We disable `pagination` in this request because we are simulating
        pagination by manually passing an `offset` to each request.
        '''
        records = pgrest.select(params=params, pagination=False)

        print("got {} records".format(len(records)))

        if not records:
            break

        if args.destination[0] == "socrata":
            date_fields = cfg_dataset.get("date_fields")

            pub = socrata_pub(records,
                              cfg_dataset,
                              args.replace,
                              date_fields=date_fields)

            print("Published {} records.".format(len(records)))

        offset += interval

        records_processed += len(records)

    return records_processed
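For reference, the params dict built in the loop above maps onto PostgREST's query-string operators, so each select() is roughly equivalent to a plain HTTP GET. A sketch with placeholder URL, field name, and token (not the project's real values):

import requests

params = {
    "modified_date": "gte.1970-01-01T00:00:00",   # hypothetical modified_date_field
    "order": "modified_date.asc",
    "limit": 1000,
    "offset": 0,
}

# i.e. GET https://example.com/some_table?modified_date=gte.1970-01-01T00:00:00
#          &order=modified_date.asc&limit=1000&offset=0
resp = requests.get(
    "https://example.com/some_table",                   # placeholder endpoint
    params=params,
    headers={"Authorization": "Bearer secret-token"},   # PostgREST commonly uses a JWT
)
records = resp.json()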
Example no. 6
import requests
import pdb
import json
from datetime import datetime, timedelta
from pypgrest import Postgrest
from tdutils import argutil


from config.secrets import *

form_id = "44359e32-1a7f-41bd-b53e-3ebc039bd21a"
key = FULCRUM_CRED.get("api_key")

# create postgrest instance
pgrest = Postgrest(
    "http://transportation-data.austintexas.io/signal_pms", auth=JOB_DB_API_TOKEN
)


def get_pgrest_records():
    """Summary
    
    Returns:
        TYPE: Description
    """
    # the datetime conversion for modified_date is not right; the time part is missing

    params = {}
    results = pgrest.select(params=params)

    if len(results) != 0:
Example no. 7
import time
import copy

import _setpath
from config.secrets import *
from config.knack.config import SIGNAL_PMS_POSTGRE_KNACK as cfg

# from tdutils.pgrestutil import Postgrest
from tdutils import argutil, datautil
from pypgrest import Postgrest
import pandas as pd  # used below for DataFrame.from_dict

key = FULCRUM.get("api_key")

# create postgrest instance

pgrest = Postgrest(cfg.get("postgre_url"), auth=JOB_DB_API_TOKEN)


def get_postgre_records():
    """Summary
    get postgreSQL records for all signal pms currently in postgreSQL
    
    Returns:
        list: list of dictionaries of all pgrest records
    """
    params = {}

    postgre_records = pgrest.select(params=params)
    postgre_records_df = pd.DataFrame.from_dict(postgre_records)

    # temporary fix to remove duplicate pm records
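The trailing comment above mentions a temporary fix to remove duplicate PM records; one plausible sketch using the DataFrame already built in get_postgre_records() (the key column name is hypothetical and would need to match the real schema):

# continuing from postgre_records_df above
dedup_df = postgre_records_df.drop_duplicates(
    subset=["fulcrum_id"],   # hypothetical unique key for a PM record
    keep="last",             # keep the most recently returned row
)
postgre_records = dedup_df.to_dict(orient="records")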
Example no. 8
def main():

    args = cli_args()

    provider_name = args.provider_name

    cfg = secrets.PROVIDERS[provider_name]

    pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

    start = args.start
    end = args.end
    offset = cfg["time_offset_seconds"]

    if not start:
        # use the most recent record as the start date (minus the offset)
        start = most_recent(pgrest, cfg["provider_id"])
        start = to_unix(start)
        start = start - offset

    if not end:
        # use current time as the end date
        end = int(datetime.today().timestamp())

    interval = cfg["interval"]

    if cfg.get("time_format") == "mills":
        # mills to unix
        start, end, interval = int(start * 1000), int(end * 1000), int(
            interval * 1000)

    auth_type = cfg.get("auth_type")

    if not cfg.get("token") and auth_type.lower() != "httpbasicauth":
        token_res = get_token(cfg["auth_url"], cfg["auth_data"])
        cfg["token"] = token_res[cfg["auth_token_res_key"]]

    client_params = build_client_params(cfg)

    client = ProviderClient(**client_params)

    total = 0

    for i in range(start, end, interval):

        data = get_data(client, i, interval, cfg["paging"])

        print(i)

        if data:

            data = parse_routes(data)

            data = drop_dupes(data)

            data = [{
                field["name"]: row[field["name"]]
                for field in config.FIELDS if field.get("upload_mds")
            } for row in data]

            data = floats_to_iso(
                data,
                [
                    field["name"]
                    for field in config.FIELDS if field.get("datetime")
                ],
            )

            post_data(pgrest, data)

            total += len(data)

        else:
            continue

    return total
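A worked example of the time arithmetic above: when a provider reports in "mills" (milliseconds), start, end, and interval are all scaled up from unix seconds before the window loop runs.

# one hour of data in 15-minute windows, unix seconds (illustrative values)
start, end, interval = 1_559_347_200, 1_559_350_800, 900

# the "mills" branch scales everything to milliseconds
start, end, interval = int(start * 1000), int(end * 1000), int(interval * 1000)

windows = list(range(start, end, interval))
print(len(windows))   # -> 4 windows of 900_000 ms each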
Example no. 9
class PerfMetDB:
    """
    Represents a connection to the PostgREST instance of the performance metrics tables.
    """
    def __init__(self, accessPointJob, accessPointObs, apiKey, needsObs=False):
        """
        Initializes the connection to the PostgREST instance.
        
        @param accessPointJob: the PostgREST "etl_perfmet_job" table endpoint
        @param accessPointObs: the PostgREST "etl_perfmet_obs" table endpoint
        @param apiKey: the PostgREST API key needed to write to the endpoints
        @param needsObs: set this to True to enable the writing of observations.
        """
        self.jobDB = Postgrest(accessPointJob, auth=apiKey)
        self.obsDB = None
        if needsObs:
            self.obsDB = Postgrest(accessPointObs, auth=apiKey)

    def writeJob(self, perfMet):
        """
        Writes the job information to the job log.
        """
        metadata = {
            "data_source":
            perfMet.dataSource,
            "stage":
            perfMet.stage,
            "seconds":
            perfMet.processingTotal,
            "records":
            perfMet.records,
            "processing_date":
            str(perfMet.processingTime),
            "collection_start":
            str(date_util.localize(perfMet.collectTimeStart))
            if perfMet.collectTimeStart else None,
            "collection_end":
            str(date_util.localize(perfMet.collectTimeEnd))
            if perfMet.collectTimeEnd else None
        }
        self.jobDB.upsert(metadata)

    def readAllJobs(self, timestampIn):
        """
        Reads all jobs activity for the given processing day of the timestamp.
        """
        day = date_util.roundDay(date_util.localize(timestampIn))
        command = {
            "select":
            "data_source,stage,seconds,records,processing_date,collection_start,collection_end",
            "processing_date": [
                "gte.%s" % str(day),
                "lt.%s" % str(
                    date_util.localize(
                        day.replace(tzinfo=None) + datetime.timedelta(days=1)))
            ],
            "order":
            "data_source,stage"
        }
        return self.jobDB.select(params=command)

    def getRecentJobsDate(self):
        """
        Returns the most recent processing date for jobs.
        """
        command = {
            "select": "processing_date",
            "order": "processing_date.desc",
            "limit": 1
        }
        ret = self.jobDB.select(params=command)
        if ret and ret[0] and "processing_date" in ret[0]:
            ret = ret[0]["processing_date"]
        else:
            ret = None
        return ret

    def writeObs(self, perfMet):
        """
        Writes observations to the observations log.
        """
        metadata = []
        if not perfMet.observations:
            return
        for identifier, obs in perfMet.observations.items():
            minTimestamp = obs.minTimestamp
            if minTimestamp:
                if isinstance(minTimestamp, datetime.datetime):
                    minTimestamp = str(date_util.localize(minTimestamp))
            maxTimestamp = obs.maxTimestamp
            if maxTimestamp:
                if isinstance(maxTimestamp, datetime.datetime):
                    maxTimestamp = str(date_util.localize(maxTimestamp))
            metadata.append({
                "data_source": perfMet.dataSource,
                "sensor_name": identifier[0],
                "data_type": identifier[1],
                "data": obs.observation,
                "expected": obs.expected,
                "collection_date": str(obs.collectionDate),
                "timestamp_min": minTimestamp,
                "timestamp_max": maxTimestamp
            })
        self.obsDB.upsert(metadata)

    def readAllObs(self,
                   timestampIn,
                   earlyDate=None,
                   dataSource=None,
                   obsType=None):
        """
        Reads all observations activity for the given collection day of the timestamp.
        """
        if not earlyDate:
            timestampIn = date_util.roundDay(date_util.localize(timestampIn))
            earlyDate = date_util.localize(
                timestampIn.replace(tzinfo=None) - datetime.timedelta(days=1))
            collDateClause = [
                "gte.%s" % str(timestampIn),
                "lt.%s" % str(
                    date_util.localize(
                        timestampIn.replace(tzinfo=None) +
                        datetime.timedelta(days=1)))
            ]
        else:
            collDateClause = [
                "gt.%s" % str(earlyDate),
                "lte.%s" % str(timestampIn)
            ]

        command = {
            "select":
            "data_source,sensor_name,data_type,data,expected,collection_date,timestamp_min,timestamp_max",
            "collection_date": collDateClause,
            "order": "data_type,sensor_name,collection_date"
        }
        if dataSource:
            command["data_source"] = "eq.%s" % dataSource
        if obsType:
            command["data_type"] = "eq.%s" % obsType
        return self.obsDB.select(params=command)
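A minimal usage sketch of the class above; the endpoints and key are placeholders, not real services.

db = PerfMetDB(
    "https://example.com/etl_perfmet_job",   # placeholder job endpoint
    "https://example.com/etl_perfmet_obs",   # placeholder observations endpoint
    apiKey="secret-token",
    needsObs=True,
)

print(db.getRecentJobsDate())   # latest processing_date in the job log, or None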
Example no. 10
def main():
    def async_wrapper(interval):
        start = interval[0]
        end = interval[1]

        entry = "bird [{}, {}]".format(start, end)
        print(entry)

        try:
            t0 = time.time()

            print('get data')
            data = get_data(client, start, end)

            elapsed = time.time() - t0

            logging.info('Request Time: {} seconds for {} records'.format(
                elapsed, len(data)))

            data = parse_routes(data)

            data = drop_dupes(data)

            data = [{
                field['name']: row[field['name']]
                for field in config.FIELDS if field.get('upload_mds')
            } for row in data]

            data = floats_to_iso(data, [
                field['name']
                for field in config.FIELDS if field.get('datetime')
            ])

            if data:
                results = post_data(pgrest, data)
                return data

        except Exception:
            logging.error("#FAILED {}".format(entry))

    args = cli_args()

    provider_name = args.provider_name

    cfg = secrets.PROVIDERS[provider_name]

    pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

    if not cfg["client_params"].get("token"):
        auth_info = cfg.get("oauth")
        url = auth_info.pop("url")
        cfg["client_params"]["token"] = get_token(url, auth_info)

    client = ProviderClient(**cfg["client_params"])

    total = 0
    workers = 4
    pool = ThreadPool(workers)

    results = pool.map(async_wrapper, INTERVALS)

    pool.close()

    pool.join()

    for result in results:
        if result:
            total += len(result)

    logging.info(total)
    return total
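async_wrapper() above unpacks interval[0] and interval[1], so INTERVALS is presumably an iterable of (start, end) time pairs defined elsewhere in the script, e.g.:

# hypothetical shape of INTERVALS: unix-second (start, end) windows
INTERVALS = [
    (1559347200, 1559350800),   # one-hour window
    (1559350800, 1559354400),   # the next hour
]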
Example no. 11
def main():
    def split_trips(data, fields):
        """
        Split trip into separate rows (one for trip start and one fore trip end)
        """
        split = []
        for row in data:
            new_row = {
                "type": "start",
                "x": row[fields["trip_start_x_field"]],
                "y": row[fields["trip_start_y_field"]],
            }

            new_row.update(**row)

            split.append(new_row)

            new_row = {
                "type": "end",
                fields["trip_id_field"]: row[fields["trip_id_field"]],
                "x": row[fields["trip_end_x_field"]],
                "y": row[fields["trip_end_y_field"]],
            }

            new_row.update(**row)

            split.append(new_row)

        return split

    def create_points(data):
        """
        Create shapely geometry from list of dicts
        """
        for row in data:

            if row["x"] and row["y"]:
                try:
                    row["geometry"] = point.Point(float(row["x"]),
                                                  float(row["y"]))
                except (TypeError, ValueError):
                    row["geometry"] = None
            else:
                row["geometry"] = None

        return data

    def read_json(f):
        """
        Load (geo)JSON into memory
        """
        with open(f, "r") as fin:
            return json.loads(fin.read())

    def point_in_poly(
        points=None,
        polys=None,
        row_property_key="cell_id",
        feature_property_key="id",
        geom_key="geometry",
        null_val=None,
    ):
        """
        Get property of polygon that intersects input point. Assumes input polygons
        do not overlap.
    
        points: list of dicts with shapely point geometries
        
        polys: geojson polygon feature collection
        
        row_property_key: the property name of the input point that will be assigned a
            value from intersecting the polygon 
        
        feature_property_key: the property name of the intersecting polygon that will
            be assigned to intersecting point(s)

        geom_key: the name of the feature property which contains the geometry of the
            feature. Applies to both input points and polygons

        null_val: the value which will be assigned to `<row_property_key>` if no
            intersecting polygon is found.

        Returns points with updated <row_property_key>
        """
        start_time = time.time()
        count = 0

        # create grid cell index of polygon *bounding boxes*
        idx = index.Index()
        for pos, feature in enumerate(polys["features"]):
            idx.insert(pos, shape(feature[geom_key]).bounds)

        # find intersecting polygon
        for i, pt in enumerate(points):

            if pt[geom_key]:

                matched = False

                # iterate through polygon *bounding boxes* that intersect with point
                for intersect_pos in idx.intersection(pt[geom_key].coords[0]):

                    poly = shape(polys["features"][intersect_pos][geom_key])

                    # check if point intersects actual polygon
                    if pt[geom_key].intersects(poly):
                        pt.update({
                            row_property_key:
                            polys["features"][intersect_pos]["properties"]
                            [feature_property_key]
                        })
                        matched = True

                        # break because we assume there are no overlapping polygons
                        break

                if not matched:
                    #  assign the null value to points not contained by any polygon
                    count += 1
                    pt.update({row_property_key: null_val})
            else:
                count += 1
                pt.update({row_property_key: null_val})

        elapsed_time = time.time() - start_time
        print(f"{count} points outside the input poly(s)")
        print(f"{elapsed_time} seconds for point in poly")
        return points

    def merge_trips(trips, id_field="trip_id"):
        #  merge start and end trips to single trip with start and end cell
        new_trips = {}

        for trip in trips:
            trip_id = trip[id_field]
            trip_type = trip.get("type")

            geoid = trip.get("census_geoid")
            district = trip.get("council_district")

            if trip_type == "start":
                current_data = {
                    "census_geoid_start": geoid,
                    "council_district_start": district,
                }

            elif trip_type == "end":
                current_data = {
                    "census_geoid_end": geoid,
                    "council_district_end": district,
                }

            if trip_id in new_trips:
                new_trips[trip_id].update(current_data)

            else:
                new_trips[trip_id] = current_data
                new_trips[trip_id].update(**trip)

        return [new_trips[trip] for trip in new_trips.keys()]

    def reduce_fields(data, fieldnames):
        return [{key: row[key] for key in fieldnames} for row in data]

    def post_trips(client, trips):
        print("upsert!")
        print(len(trips))
        return client.upsert(trips)

    field_map = {
        "trip_id_field": "trip_id",
        "trip_start_y_field": "start_latitude",
        "trip_start_x_field": "start_longitude",
        "trip_end_y_field": "end_latitude",
        "trip_end_x_field": "end_longitude",
    }

    # move working directory to script location to
    # ensure relative paths work (in case script
    # is run by external launcher)
    if os.path.dirname(__file__):
        os.chdir(os.path.dirname(__file__))

    districts = read_json(config.DISTRICTS_GEOJSON)
    census_tracts = read_json(config.CENSUS_TRACTS_GEOJSON)

    # number of records that will be processed per request; this must not
    # exceed the PostgREST instance's max rows-per-request limit, which you need to know
    interval = 5000
    total = 0

    pgrest = Postgrest(secrets.PG["url"], auth=secrets.PG["token"])

    while True:
        # loop until request no longer yields trips

        params = {
            "select":
            "trip_id,provider_id,start_latitude,start_longitude,end_latitude,end_longitude",
            "census_geoid_start":
            "is.null",  # assume if census geoid is null the record has not been processed
            "limit": interval,
        }

        print("get data")

        trips = pgrest.select(params)

        if not trips:
            logging.info("All records processed.")
            break

        trips = split_trips(trips, field_map)

        trips = create_points(trips)

        trips = point_in_poly(
            points=trips,
            polys=census_tracts,
            row_property_key="census_geoid",
            feature_property_key="GEOID10",
            null_val="OUT_OF_BOUNDS",
        )

        trips = point_in_poly(
            points=trips,
            polys=districts,
            row_property_key="council_district",
            feature_property_key="district_n",
            null_val=0,
        )

        trips = merge_trips(trips)

        trips = reduce_fields(
            trips,
            [
                field["name"]
                for field in config.FIELDS if field.get("upload_postgrest")
            ],
        )

        post_trips(pgrest, trips)

        total += len(trips)

    return total
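A self-contained sketch of the spatial-join pattern point_in_poly() uses above: index only the polygon bounding boxes with rtree, then confirm candidates with an exact shapely intersection test. The toy polygon and its "district_n" property are illustrative.

from rtree import index
from shapely.geometry import Point, shape

polys = {
    "features": [{
        "geometry": {
            "type": "Polygon",
            "coordinates": [[[0, 0], [0, 10], [10, 10], [10, 0], [0, 0]]],
        },
        "properties": {"district_n": 1},
    }]
}

# index the polygon *bounding boxes* only
idx = index.Index()
for pos, feature in enumerate(polys["features"]):
    idx.insert(pos, shape(feature["geometry"]).bounds)

pt = Point(3.5, 4.2)
for pos in idx.intersection(pt.coords[0]):            # bounding-box candidates
    poly = shape(polys["features"][pos]["geometry"])
    if pt.intersects(poly):                           # exact intersection test
        print(polys["features"][pos]["properties"]["district_n"])   # -> 1
        break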
Example no. 12
def __init__(self, accessPoint, apiKey):
    """
    Initializes the PostgREST access with a given access point URL and the API key.
    """
    self.catalogDB = Postgrest(accessPoint, auth=apiKey)
Example no. 13
class CatalogPostgREST:
    """
    Implements catalog access functions using PostgREST.
    """
    def __init__(self, accessPoint, apiKey):
        """
        Initializes the PostgREST access with a given access point URL and the API key.
        """
        self.catalogDB = Postgrest(accessPoint, auth=apiKey)

    def query(self,
              dataSource,
              stage,
              base,
              ext,
              earlyDate=None,
              lateDate=None,
              exactEarlyDate=False,
              limit=None,
              start=None,
              reverse=False):
        """
        Performs a query on the given datatype, data stage, base, ext, and optional early and late dates. Returns a list
        of dictionary objects, each a result.
        
        @param exactEarlyDate: Set this to true to query only on exact date defined by the earlyDate parameter
        @param limit Limits the output to a specific number of records. If None, then the driver default is used.
        @param start sets the start frome wnen doing a multi-chunk query.
        @param reverse will allow the results to be sorted in descending order.
        """
        # TODO: Do we need a query that will return a catalog entry that contains a given collection date (between collection_date
        # and collection_end)?

        # Specify query plus required parameters and sorting/pagination parameters:
        command = {
            "select":
            "collection_date,collection_end,processing_date,pointer,id_base,id_ext,metadata",
            "repository":
            "eq.%s" % stage,
            "data_source":
            "eq.%s" % dataSource,
            "order":
            ("collection_date.asc" if not reverse else "collection_date.desc")
            + ",id_base.asc,id_ext.asc",
            "limit":
            1 if limit is None else limit,
            "offset":
            0 if start is None else start
        }

        # Allow base and ext identifiers to be omitted, or to be a "match first part of string" query:
        if base is not None:
            if "%%" in base:
                command["id_base"] = "like.%s" % base.replace("%%", "*")
            else:
                command["id_base"] = "eq.%s" % base
        if ext is not None:
            if "%%" in ext:
                command["id_ext"] = "like.%s" % ext.replace("%%", "*")
            else:
                command["id_ext"] = "eq.%s" % ext

        # Collection date range: May need to use an array because there could be two constraints:
        collDateRange = []
        if earlyDate is not None:
            if exactEarlyDate:
                collDateRange.append("eq.%s" % str(earlyDate))
            else:
                collDateRange.append("gte.%s" % str(earlyDate))
        if lateDate is not None:
            collDateRange.append("lt.%s" % str(lateDate))
        if collDateRange:
            if len(collDateRange) == 1:
                command["collection_date"] = collDateRange[0]
            else:
                command["collection_date"] = collDateRange

        # Run the query:
        return self.catalogDB.select(params=command)

    def upsert(self, upsertDataList):
        """
        Performs an upsert operation on the given list of dictionary objects. Each dictionary object shall contain
        "repository", "data_source", "id_base", "id_ext", "pointer", "collection_date", "collection_end" (optional),
        "processing_date", and optionally "metadata".
        """
        try:
            self.catalogDB.upsert(upsertDataList)
        except:
            print(
                "ERROR: Exception encountered in CatalogPostgREST.upsert(). Input:"
            )
            print(upsertDataList)
            raise

    @staticmethod
    def getPreferredChunk():
        """
        Retruns the preferred chunk size that catalog.Catalog.query() should used in requests.
        """
        return PREFERRED_CHUNK_SIZE
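A hypothetical usage sketch of the query above; the access point, key, and identifiers are placeholders, and base uses the "%%" prefix-match convention handled in the code.

catalog = CatalogPostgREST("https://example.com/catalog", apiKey="secret-token")

entries = catalog.query(
    dataSource="bt",           # becomes data_source=eq.bt
    stage="rawjson",           # becomes repository=eq.rawjson
    base="austin%%",           # becomes id_base=like.austin*
    ext="json",                # becomes id_ext=eq.json
    earlyDate="2020-01-01",    # collection_date gte.2020-01-01
    lateDate="2020-02-01",     # collection_date lt.2020-02-01
    limit=100,
)

for entry in entries:
    print(entry["pointer"], entry["collection_date"])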