def cluster_profiles(city): """Return the cluster profiles in `city`, when running a K-means algorithm between `day` and `day+window` Parameters ---------- city : str City of interest, either `bordeaux` or `lyon` Returns ------- dict Cluster profiles for each cluster, at each hour of the day """ query = cluster_profile_query(city) eng = db() rset = eng.execute(query) if not rset: logger.warning("rset is empty") return {"data": []} result = [] for cluster in (dict(zip(rset.keys(), row)) for row in rset): result.append({ "cluster_id": cluster['cluster_id'], "start": cluster['start'], 'stop': cluster['stop'], 'hour': list(range(24)), 'values': [cluster[h] for h in ["h{:02d}".format(i) for i in range(24)]] }) return {"data": result}
def station_clusters(city, station_ids=None, geojson=False): """Return the cluster IDs of shared-bike stations in `city`, when running a K-means algorithm between `day` and `day+window` Parameters ---------- city : str City of interest, either `bordeaux` or `lyon` station_ids : list of integer Shared-bike station IDs ; if None, all the city stations are considered geojson : boolean If true, returns the clustered stations under the GeoJSON format Returns ------- dict Cluster profiles for each cluster, at each hour of the day """ if station_ids is None: station_ids = get_station_ids(city) query = station_cluster_query(city) eng = db() rset = eng.execute(query, id_list=tuple(str(x) for x in station_ids)) if not rset: logger.warning("rset is empty") return {"data": []} data = {"data": [dict(zip(rset.keys(), row)) for row in rset]} if geojson: return clustered_station_geojson(data["data"]) return data
def daily_transaction_list(city, day, limit, order_by, window=0, backward=True): """Retrieve the daily transaction for the Bordeaux stations city: str day: date Data for this specific date limit: int order_by: str window: int (0 by default) Number of days to look around the specific date backward: bool (True by default) Get data before the date or not, according to the window number Return a list of dicts """ window = time_window(day, window, backward) query = daily_query_stations(city, limit, order_by) eng = db() rset = eng.execute( query, start=window.start, stop=window.stop, order_reference_date=window.order_reference_date).fetchall() return processing_daily_data(rset, window)
def cluster_profiles(city): """Return the cluster profiles in `city`, when running a K-means algorithm between `day` and `day+window` Parameters ---------- city : str City of interest, either `bordeaux` or `lyon` Returns ------- dict Cluster profiles for each cluster, at each hour of the day """ query = cluster_profile_query(city) eng = db() rset = eng.execute(query) if not rset: logger.warning("rset is empty") return {"data": []} result = [] for cluster in (dict(zip(rset.keys(), row)) for row in rset): result.append({"cluster_id": cluster['cluster_id'], "start": cluster['start'], 'stop': cluster['stop'], 'hour': list(range(24)), 'values': [cluster[h] for h in ["h{:02d}".format(i) for i in range(24)]]} ) return {"data": result}
def run(self): query = ("SELECT DISTINCT id AS station_id, timestamp AS ts, " "available_bikes AS nb_bikes, available_stands AS nb_stands, " "available_bikes::float / (available_bikes::float " "+ available_stands::float) AS probability " "FROM {schema}.{tablename} " "WHERE timestamp >= %(start)s " "AND timestamp < %(stop)s " "AND (available_bikes > 0 OR available_stands > 0) " "AND (status = 'open')" "ORDER BY id, timestamp" ";").format(schema=self.city, tablename='timeseries') eng = db() df = pd.io.sql.read_sql_query(query, eng, params={ "start": self.start, "stop": self.stop }) df.station_id = df.station_id.astype(int) if df.empty: raise Exception( "There is not any data to process in the DataFrame. " + "Please check the dates.") prediction_model = train_prediction_model(df, self.validation, self.frequency) self.output().makedirs() prediction_model.save_model(self.output().path)
def run(self): query = """SELECT id ,st_x(geom) as lat ,st_y(geom) as lon FROM {schema}.{table}; """.format(schema=self.city, table='stations') df = pd.io.sql.read_sql_query(query, db()) clusters = compute_geo_clusters(df) self.output().makedirs() path = self.output().path clusters['labels'].to_hdf(path, '/clusters') clusters['centroids'].to_hdf(path, '/centroids')
def latest_predictions(city, limit, geojson, freq='1H'): """Get bike availability predictions for a specific city. Parameters ---------- city : str Name of the city limit : int Max number of stations geosjon : bool Data in geojson? freq : str Time horizon Returns ------- dict """ query = """with latest as ( select station_id as id ,timestamp ,nb_bikes ,rank() over (partition by station_id order by timestamp desc) as rank from {city}.prediction where frequency=%(freq)s and timestamp >= %(min_date)s ) select P.id ,P.timestamp ,P.nb_bikes ,S.name ,S.nb_stations as nb_stands ,st_x(S.geom) as x ,st_y(S.geom) as y from latest as P join {city}.station as S using(id) where P.rank=1 order by id """.format(city=city) if limit is not None: query += " limit {limit}".format(limit=limit) eng = db() # avoid getting the full history min_date = datetime.now() - timedelta(days=2) rset = eng.execute(query, freq=freq, min_date=min_date, limit=limit) keys = rset.keys() result = [dict(zip(keys, row)) for row in rset] predict_date = max(x['timestamp'] for x in result) if geojson: return station_geojson( result, feature_list=['id', 'name', 'timestamp', 'nb_bikes', 'nb_stands']) return {"data": result, "date": predict_date}
def run(self): query = ("SELECT DISTINCT * FROM {schema}.{tablename} " "WHERE timestamp >= %(start)s AND timestamp < %(stop)s " "ORDER BY timestamp, id" ";").format(schema=self.city, tablename='timeseries') eng = db() query_params = {"start": self.date, "stop": self.date + timedelta(1)} df = pd.io.sql.read_sql_query(query, eng, params=query_params) transactions = (df.query("status == 'open'").groupby("id") ['available_bikes'].apply(lambda s: s.diff().abs().sum( )).dropna().to_frame().reset_index()) transactions = transactions.rename_axis( {"available_bikes": "transactions"}, axis=1) with self.output().open('w') as fobj: transactions.to_csv(fobj, index=False)
def timeseries(city, station_ids, start, stop): """Get timeseries data between two dates for a specific city and a list of station ids """ query = """SELECT T.* ,S.name as name FROM {schema}.{table} AS T LEFT JOIN {schema}.{station} AS S using(id) WHERE id IN %(id_list)s AND timestamp >= %(start)s AND timestamp < %(stop)s ORDER BY id,timestamp """.format(schema=config[city]['schema'], table=config['database']['timeseries'], station=config['database']['stations']) eng = db() rset = eng.execute(query, id_list=tuple(x for x in station_ids), start=start, stop=stop) return processing_timeseries(rset)
def run(self): query = ("SELECT id, timestamp, available_bikes " "FROM {}.timeseries " "WHERE timestamp >= %(start)s " "AND timestamp < %(stop)s;" "").format(self.city) eng = db() df = pd.io.sql.read_sql_query(query, eng, params={"start": self.start, "stop": self.stop}) df.columns = ["station_id", "ts", "nb_bikes"] clusters = compute_clusters(df) self.output().makedirs() path = self.output().path clusters['labels'].to_hdf(path, '/clusters') clusters['centroids'].to_hdf(path, '/centroids')
def timeseries(city, station_ids, start, stop): """Get timeseries data between two dates for a specific city and a list of station ids """ query = """SELECT T.* ,S.name as name FROM {schema}.{table} AS T LEFT JOIN {schema}.{station} AS S using(id) WHERE id IN %(id_list)s AND timestamp >= %(start)s AND timestamp < %(stop)s ORDER BY id,timestamp """.format(schema=city, table='timeseries', station='station') eng = db() rset = eng.execute(query, id_list=tuple(x for x in station_ids), start=start, stop=stop) return processing_timeseries(rset)
def stations(city, limit, geojson): """List of bicycle stations city: string limit: int geojson: boolean Return a list of dict, one dict by bicycle station """ query = _query_stations(city, limit) eng = db() rset = eng.execute(query) keys = rset.keys() result = [dict(zip(keys, row)) for row in rset] if geojson: return station_geojson(result) return {"data": result}
def run(self): query = """SELECT DISTINCT * FROM {schema}.{tablename} WHERE last_update >= %(start)s AND last_update < %(stop)s ORDER BY last_update,number""".format(schema=config["lyon"]["schema"], tablename=config['lyon']['table']) eng = db() df = pd.io.sql.read_sql_query(query, eng, params={"start": self.date, "stop": self.date + timedelta(1)}) transactions = (df.query("status == 'OPEN'") .groupby("number")['available_bikes'] .apply(lambda s: s.diff().abs().sum()) .dropna() .to_frame() .reset_index()) transactions = transactions.rename_axis({"available_bikes": "transactions"}, axis=1) with self.output().open('w') as fobj: transactions.to_csv(fobj, index=False)
def run(self): query = ("SELECT DISTINCT * FROM {schema}.{tablename} " "WHERE timestamp >= %(start)s AND timestamp < %(stop)s " "ORDER BY timestamp, id" ";").format(schema=self.city, tablename=config['database']['timeseries']) eng = db() query_params = {"start": self.date, "stop": self.date + timedelta(1)} df = pd.io.sql.read_sql_query(query, eng, params=query_params) transactions = (df.query("status == 'open'") .groupby("id")['available_bikes'] .apply(lambda s: s.diff().abs().sum()) .dropna() .to_frame() .reset_index()) transactions = transactions.rename_axis({"available_bikes": "transactions"}, axis=1) with self.output().open('w') as fobj: transactions.to_csv(fobj, index=False)
def specific_stations(city, ids): """List of specific bicycle stations. Parameters ---------- city : string ids : list Returns ------- list of dict One dict by bicycle station """ query = _query_stations(city, 1).replace("LIMIT 1", 'WHERE id IN %(id_list)s') eng = db() rset = eng.execute(query, id_list=tuple(str(x) for x in ids)).fetchall() if not rset: return [] return {"data": [dict(zip(x.keys(), x)) for x in rset]}
def latest_station_timewindow(city, start, stop): """Get latest available stations (by date) between `start` and `stop`. Parameters ---------- city : str The name of the city. start : datetime stop : datetime Returns ------- pd.DataFrame """ query = """with ranked as ( select distinct id as station_id , timestamp as ts , available_bikes as nb_bikes , available_stands as nb_stands , available_bikes::float / (available_bikes::float + available_stands::float) as probability -- rank over station_id and timestamp to take the most recent ones , rank() over (partition by id order by timestamp desc) as rk from {schema}.timeseries where timestamp >= %(start)s and timestamp < %(stop)s and (available_bikes > 0 or available_stands > 0) and status = 'open' order by id, timestamp ) select station_id ,ts ,nb_bikes ,nb_stands ,probability from ranked where rk=1 order by station_id; """.format(schema=city) eng = db() return pd.io.sql.read_sql_query(query, eng, params={"start": start, "stop": stop})
def run(self): query = """SELECT DISTINCT ident, type, state, available_bike, ts FROM {schema}.{tablename} WHERE ts >= %(start)s AND ts < %(stop)s ORDER BY ident,ts;""".format(schema=config["bordeaux"]["schema"], tablename=config['bordeaux']['table']) eng = db() df = pd.io.sql.read_sql_query(query, eng, params={ "start": self.date, "stop": self.date + timedelta(1) }) transactions = (df.query("state == 'CONNECTEE'").groupby("ident") ['available_bike'].apply(lambda s: s.diff().abs().sum( )).dropna().to_frame().reset_index()) transactions = transactions.rename_axis( {"available_bike": "transactions"}, axis=1) with self.output().open('w') as fobj: transactions.to_csv(fobj, index=False)
def get_station_ids(city): """Provides the list of shared-bike station IDs Parameters ---------- city : str City of interest, either `bordeaux` or `lyon` Returns ------- list of integers IDs of the shared-bike stations in the `city` """ query = ("SELECT id FROM {schema}.{table}" ";").format(schema=city, table='station') eng = db() rset = eng.execute(query).fetchall() if not rset: return [] return [row[0] for row in rset]
def daily_transaction_list(city, day, limit, order_by, window=0, backward=True): """Retrieve the daily transaction for the Bordeaux stations city: str day: date Data for this specific date limit: int order_by: str window: int (0 by default) Number of days to look around the specific date backward: bool (True by default) Get data before the date or not, according to the window number Return a list of dicts """ window = time_window(day, window, backward) query = daily_query_stations(city, limit, order_by) eng = db() rset = eng.execute(query, start=window.start, stop=window.stop, order_reference_date=window.order_reference_date).fetchall() return processing_daily_data(rset, window)
def get_station_ids(city): """Provides the list of shared-bike station IDs Parameters ---------- city : str City of interest, either `bordeaux` or `lyon` Returns ------- list of integers IDs of the shared-bike stations in the `city` """ query = ("SELECT id FROM {schema}.{table}" ";").format(schema=config[city]["schema"], table=config['database']['stations']) eng = db() rset = eng.execute(query).fetchall() if not rset: return [] return [row[0] for row in rset]
def daily_transaction(city, station_ids, day, window=0, backward=True): """Retrieve the daily transaction for the Bordeaux stations stations_ids: list of int List of ids station day: date Data for this specific date window: int (0 by default) Number of days to look around the specific date backward: bool (True by default) Get data before the date or not, according to the window number Return a list of dicts """ window = time_window(day, window, backward) query = daily_query(city) eng = db() rset = eng.execute(query, id_list=tuple(str(x) for x in station_ids), start=window.start, stop=window.stop).fetchall() return processing_daily_data(rset, window)
def cluster_profiles(city): """Return the cluster profiles in `city`, when running a K-means algorithm between `day` and `day+window` Parameters ---------- city : str City of interest, either `bordeaux` or `lyon` Returns ------- dict Cluster profiles for each cluster, at each hour of the day """ query = cluster_profile_query(city) eng = db() df = pd.io.sql.read_sql_query(query, eng) if df.empty: logger.warning("df is empty") return {"data": []} df = df.set_index('cluster_id') labels = find_cluster(df) result = [] for cluster_id, cluster in df.iterrows(): result.append({ "cluster_id": cluster_id, 'label': labels[cluster_id], "start": cluster['start'], 'stop': cluster['stop'], 'hour': list(range(24)), 'values': [cluster[h] for h in ["h{:02d}".format(i) for i in range(24)]] }) return {"data": result}
def run(self): query = ("SELECT DISTINCT id AS station_id, timestamp AS ts, " "available_bikes AS nb_bikes, available_stands AS nb_stands, " "available_bikes::float / (available_bikes::float " "+ available_stands::float) AS probability " "FROM {schema}.{tablename} " "WHERE timestamp >= %(start)s " "AND timestamp < %(stop)s " "AND (available_bikes > 0 OR available_stands > 0) " "AND (status = 'open')" "ORDER BY id, timestamp" ";").format(schema=self.city, tablename=config['database']['timeseries']) eng = db() df = pd.io.sql.read_sql_query(query, eng, params={"start": self.start, "stop": self.stop}) if df.empty: raise Exception("There is not any data to process in the DataFrame. " + "Please check the dates.") prediction_model = train_prediction_model(df, self.validation, self.frequency) self.output().makedirs() prediction_model.save_model(self.output().path)
def stations(city, limit, geojson): """List of bicycle stations Parameters ---------- city : string limit : int geojson : boolean Returns ------- list a list of dict, one dict by bicycle station """ query = _query_stations(city, limit) eng = db() rset = eng.execute(query) keys = rset.keys() result = [dict(zip(keys, row)) for row in rset] if geojson: return station_geojson( result, feature_list=['id', 'name', 'address', 'city', 'nb_stands']) return {"data": result}
def prediction_timeseries(city, station_ids, start, stop, values_num, with_current_values, freq='1H'): """Get bike availability predictions between `start` and `stop` dates for `city` at stations `station_ids` Parameters ---------- city : str City of interest station_ids : list of ints Ids of the stations that are considered start : datetime Begin of prediction period stop : datetime End of prediction period values_num : int Number of predict values with_current_values : bool Include the current values? Returns ------- list of dict """ # get predicted data query = """SELECT T.station_id AS id , T.timestamp AS timestamp , T.nb_bikes AS nb_bikes , S.nb_stations as nb_stands , S.name AS name FROM {city}.prediction AS T LEFT JOIN {city}.station AS S ON (T.station_id::varchar = S.id::varchar) WHERE id IN %(id_list)s AND timestamp >= %(start)s AND timestamp < %(stop)s AND frequency = %(freq)s ORDER BY id,timestamp;""".format(city=city) eng = db() rset_pred = eng.execute(query, id_list=tuple(x for x in station_ids), start=start, stop=stop, freq=freq) # current values rset_current = None if with_current_values: query = """select distinct id , timestamp , available_bikes as nb_bikes , S.nb_stations as nb_stands , S.name from {city}.timeseries as T left join {city}.station as S using(id) where id in %(id_list)s AND timestamp >= %(start)s and timestamp < %(stop)s """.format(city=city) rset_current = eng.execute(query, id_list=tuple(x for x in station_ids), start=start, stop=stop) pred = [dict(zip(x.keys(), x)) for x in rset_pred] # truncate the nth latest values and add the prediction time for data in pred[-values_num:]: data['at'] = freq current = [] if rset_current: current = [dict(zip(x.keys(), x)) for x in rset_current] for data in current: data['at'] = '0' return current + pred[-values_num:]