Example #1
0
def cluster_profiles(city):
    """Return the cluster profiles in `city`, when running a K-means algorithm
    between `day` and `day+window`

    Parameters
    ----------
    city : str
        City of interest, either `bordeaux` or `lyon`

    Returns
    -------
    dict
        Cluster profiles for each cluster, at each hour of the day
    """
    query = cluster_profile_query(city)
    eng = db()
    rset = eng.execute(query)
    if not rset:
        logger.warning("rset is empty")
        return {"data": []}
    result = []
    for cluster in (dict(zip(rset.keys(), row)) for row in rset):
        result.append({
            "cluster_id":
            cluster['cluster_id'],
            "start":
            cluster['start'],
            'stop':
            cluster['stop'],
            'hour':
            list(range(24)),
            'values':
            [cluster[h] for h in ["h{:02d}".format(i) for i in range(24)]]
        })
    return {"data": result}
Example #2
0
def station_clusters(city, station_ids=None, geojson=False):
    """Return the cluster IDs of shared-bike stations in `city`, when running a
    K-means algorithm between `day` and `day+window`

    Parameters
    ----------
    city : str
        City of interest, either `bordeaux` or `lyon`
    station_ids : list of integer
        Shared-bike station IDs ; if None, all the city stations are considered
    geojson : boolean
        If true, returns the clustered stations under the GeoJSON format

    Returns
    -------
    dict
        Cluster profiles for each cluster, at each hour of the day

    """
    if station_ids is None:
        station_ids = get_station_ids(city)
    query = station_cluster_query(city)
    eng = db()
    rset = eng.execute(query,
                       id_list=tuple(str(x) for x in station_ids))
    if not rset:
        logger.warning("rset is empty")
        return {"data": []}
    data = {"data": [dict(zip(rset.keys(), row)) for row in rset]}
    if geojson:
        return clustered_station_geojson(data["data"])
    return data
Example #3
0
def daily_transaction_list(city,
                           day,
                           limit,
                           order_by,
                           window=0,
                           backward=True):
    """Retrieve the daily transaction for the Bordeaux stations

    city: str
    day: date
        Data for this specific date
    limit: int
    order_by: str
    window: int (0 by default)
        Number of days to look around the specific date
    backward: bool (True by default)
        Get data before the date or not, according to the window number

    Return a list of dicts
    """
    window = time_window(day, window, backward)
    query = daily_query_stations(city, limit, order_by)
    eng = db()
    rset = eng.execute(
        query,
        start=window.start,
        stop=window.stop,
        order_reference_date=window.order_reference_date).fetchall()
    return processing_daily_data(rset, window)
Example #4
0
def station_clusters(city, station_ids=None, geojson=False):
    """Return the cluster IDs of shared-bike stations in `city`, when running a
    K-means algorithm between `day` and `day+window`

    Parameters
    ----------
    city : str
        City of interest, either `bordeaux` or `lyon`
    station_ids : list of integer
        Shared-bike station IDs ; if None, all the city stations are considered
    geojson : boolean
        If true, returns the clustered stations under the GeoJSON format

    Returns
    -------
    dict
        Cluster profiles for each cluster, at each hour of the day

    """
    if station_ids is None:
        station_ids = get_station_ids(city)
    query = station_cluster_query(city)
    eng = db()
    rset = eng.execute(query, id_list=tuple(str(x) for x in station_ids))
    if not rset:
        logger.warning("rset is empty")
        return {"data": []}
    data = {"data": [dict(zip(rset.keys(), row)) for row in rset]}
    if geojson:
        return clustered_station_geojson(data["data"])
    return data
Example #5
0
def cluster_profiles(city):
    """Return the cluster profiles in `city`, when running a K-means algorithm
    between `day` and `day+window`

    Parameters
    ----------
    city : str
        City of interest, either `bordeaux` or `lyon`

    Returns
    -------
    dict
        Cluster profiles for each cluster, at each hour of the day
    """
    query = cluster_profile_query(city)
    eng = db()
    rset = eng.execute(query)
    if not rset:
        logger.warning("rset is empty")
        return {"data": []}
    result = []
    for cluster in (dict(zip(rset.keys(), row)) for row in rset):
        result.append({"cluster_id": cluster['cluster_id'],
                       "start": cluster['start'],
                       'stop': cluster['stop'],
                       'hour': list(range(24)),
                       'values': [cluster[h] for h in ["h{:02d}".format(i) for i in range(24)]]}
        )
    return {"data": result}
Example #6
0
 def run(self):
     query = ("SELECT DISTINCT id AS station_id, timestamp AS ts, "
              "available_bikes AS nb_bikes, available_stands AS nb_stands, "
              "available_bikes::float / (available_bikes::float "
              "+ available_stands::float) AS probability "
              "FROM {schema}.{tablename} "
              "WHERE timestamp >= %(start)s "
              "AND timestamp < %(stop)s "
              "AND (available_bikes > 0 OR available_stands > 0) "
              "AND (status = 'open')"
              "ORDER BY id, timestamp"
              ";").format(schema=self.city, tablename='timeseries')
     eng = db()
     df = pd.io.sql.read_sql_query(query,
                                   eng,
                                   params={
                                       "start": self.start,
                                       "stop": self.stop
                                   })
     df.station_id = df.station_id.astype(int)
     if df.empty:
         raise Exception(
             "There is not any data to process in the DataFrame. " +
             "Please check the dates.")
     prediction_model = train_prediction_model(df, self.validation,
                                               self.frequency)
     self.output().makedirs()
     prediction_model.save_model(self.output().path)
Example #7
0
 def run(self):
     query = """SELECT id
           ,st_x(geom) as lat
           ,st_y(geom) as lon
         FROM {schema}.{table};
         """.format(schema=self.city, table='stations')
     df = pd.io.sql.read_sql_query(query, db())
     clusters = compute_geo_clusters(df)
     self.output().makedirs()
     path = self.output().path
     clusters['labels'].to_hdf(path, '/clusters')
     clusters['centroids'].to_hdf(path, '/centroids')
Example #8
0
def latest_predictions(city, limit, geojson, freq='1H'):
    """Get bike availability predictions for a specific city.

    Parameters
    ----------
    city : str
        Name of the city
    limit : int
        Max number of stations
    geosjon : bool
        Data in geojson?
    freq : str
        Time horizon

    Returns
    -------
    dict
    """
    query = """with latest as (
      select station_id as id
        ,timestamp
        ,nb_bikes
        ,rank() over (partition by station_id order by timestamp desc) as rank
      from {city}.prediction
      where frequency=%(freq)s
         and timestamp >= %(min_date)s
    )
    select P.id
      ,P.timestamp
      ,P.nb_bikes
      ,S.name
      ,S.nb_stations as nb_stands
      ,st_x(S.geom) as x
      ,st_y(S.geom) as y
    from latest as P
    join {city}.station as S using(id)
    where P.rank=1
    order by id
    """.format(city=city)
    if limit is not None:
        query += " limit {limit}".format(limit=limit)
    eng = db()
    # avoid getting the full history
    min_date = datetime.now() - timedelta(days=2)
    rset = eng.execute(query, freq=freq, min_date=min_date, limit=limit)
    keys = rset.keys()
    result = [dict(zip(keys, row)) for row in rset]
    predict_date = max(x['timestamp'] for x in result)
    if geojson:
        return station_geojson(
            result,
            feature_list=['id', 'name', 'timestamp', 'nb_bikes', 'nb_stands'])
    return {"data": result, "date": predict_date}
Example #9
0
 def run(self):
     query = ("SELECT DISTINCT * FROM {schema}.{tablename} "
              "WHERE timestamp >= %(start)s AND timestamp < %(stop)s "
              "ORDER BY timestamp, id"
              ";").format(schema=self.city, tablename='timeseries')
     eng = db()
     query_params = {"start": self.date, "stop": self.date + timedelta(1)}
     df = pd.io.sql.read_sql_query(query, eng, params=query_params)
     transactions = (df.query("status == 'open'").groupby("id")
                     ['available_bikes'].apply(lambda s: s.diff().abs().sum(
                     )).dropna().to_frame().reset_index())
     transactions = transactions.rename_axis(
         {"available_bikes": "transactions"}, axis=1)
     with self.output().open('w') as fobj:
         transactions.to_csv(fobj, index=False)
Example #10
0
def timeseries(city, station_ids, start, stop):
    """Get timeseries data between two dates for a specific city and a list of station ids
    """
    query = """SELECT T.*
      ,S.name as name
    FROM {schema}.{table} AS T
    LEFT JOIN {schema}.{station} AS S using(id)
    WHERE id IN %(id_list)s AND timestamp >= %(start)s AND timestamp < %(stop)s
    ORDER BY id,timestamp
    """.format(schema=config[city]['schema'],
               table=config['database']['timeseries'],
               station=config['database']['stations'])
    eng = db()
    rset = eng.execute(query, id_list=tuple(x for x in station_ids),
                       start=start, stop=stop)
    return processing_timeseries(rset)
Example #11
0
 def run(self):
     query = ("SELECT id, timestamp, available_bikes "
              "FROM {}.timeseries "
              "WHERE timestamp >= %(start)s "
              "AND timestamp < %(stop)s;"
              "").format(self.city)
     eng = db()
     df = pd.io.sql.read_sql_query(query, eng,
                                   params={"start": self.start,
                                           "stop": self.stop})
     df.columns = ["station_id", "ts", "nb_bikes"]
     clusters = compute_clusters(df)
     self.output().makedirs()
     path = self.output().path
     clusters['labels'].to_hdf(path, '/clusters')
     clusters['centroids'].to_hdf(path, '/centroids')
Example #12
0
def timeseries(city, station_ids, start, stop):
    """Get timeseries data between two dates for a specific city and a list of station ids
    """
    query = """SELECT T.*
      ,S.name as name
    FROM {schema}.{table} AS T
    LEFT JOIN {schema}.{station} AS S using(id)
    WHERE id IN %(id_list)s AND timestamp >= %(start)s AND timestamp < %(stop)s
    ORDER BY id,timestamp
    """.format(schema=city, table='timeseries', station='station')
    eng = db()
    rset = eng.execute(query,
                       id_list=tuple(x for x in station_ids),
                       start=start,
                       stop=stop)
    return processing_timeseries(rset)
Example #13
0
def stations(city, limit, geojson):
    """List of bicycle stations

    city: string
    limit: int
    geojson: boolean

    Return a list of dict, one dict by bicycle station
    """
    query = _query_stations(city, limit)
    eng = db()
    rset = eng.execute(query)
    keys = rset.keys()
    result = [dict(zip(keys, row)) for row in rset]
    if geojson:
        return station_geojson(result)
    return {"data": result}
Example #14
0
 def run(self):
     query = """SELECT DISTINCT * FROM {schema}.{tablename}
       WHERE last_update >= %(start)s AND last_update < %(stop)s
       ORDER BY last_update,number""".format(schema=config["lyon"]["schema"],
                                             tablename=config['lyon']['table'])
     eng = db()
     df = pd.io.sql.read_sql_query(query, eng, params={"start": self.date,
                                                       "stop": self.date + timedelta(1)})
     transactions = (df.query("status == 'OPEN'")
                     .groupby("number")['available_bikes']
                     .apply(lambda s: s.diff().abs().sum())
                     .dropna()
                     .to_frame()
                     .reset_index())
     transactions = transactions.rename_axis({"available_bikes": "transactions"}, axis=1)
     with self.output().open('w') as fobj:
         transactions.to_csv(fobj, index=False)
Example #15
0
def stations(city, limit, geojson):
    """List of bicycle stations

    city: string
    limit: int
    geojson: boolean

    Return a list of dict, one dict by bicycle station
    """
    query = _query_stations(city, limit)
    eng = db()
    rset = eng.execute(query)
    keys = rset.keys()
    result = [dict(zip(keys, row)) for row in rset]
    if geojson:
        return station_geojson(result)
    return {"data": result}
Example #16
0
 def run(self):
     query = ("SELECT DISTINCT * FROM {schema}.{tablename} "
              "WHERE timestamp >= %(start)s AND timestamp < %(stop)s "
              "ORDER BY timestamp, id"
              ";").format(schema=self.city,
                          tablename=config['database']['timeseries'])
     eng = db()
     query_params = {"start": self.date,
                     "stop": self.date + timedelta(1)}
     df = pd.io.sql.read_sql_query(query, eng, params=query_params)
     transactions = (df.query("status == 'open'")
                     .groupby("id")['available_bikes']
                     .apply(lambda s: s.diff().abs().sum())
                     .dropna()
                     .to_frame()
                     .reset_index())
     transactions = transactions.rename_axis({"available_bikes": "transactions"}, axis=1)
     with self.output().open('w') as fobj:
         transactions.to_csv(fobj, index=False)
Example #17
0
def specific_stations(city, ids):
    """List of specific bicycle stations.

    Parameters
    ----------
    city : string
    ids : list

    Returns
    -------
    list of dict
        One dict by bicycle station
    """
    query = _query_stations(city, 1).replace("LIMIT 1", 'WHERE id IN %(id_list)s')
    eng = db()
    rset = eng.execute(query, id_list=tuple(str(x) for x in ids)).fetchall()
    if not rset:
        return []
    return {"data": [dict(zip(x.keys(), x)) for x in rset]}
Example #18
0
def latest_station_timewindow(city, start, stop):
    """Get latest available stations (by date) between `start` and `stop`.

    Parameters
    ----------
    city : str
        The name of the city.
    start : datetime
    stop : datetime

    Returns
    -------
    pd.DataFrame
    """
    query = """with ranked as (
    select distinct id as station_id
      , timestamp as ts
      , available_bikes as nb_bikes
      , available_stands as nb_stands
      , available_bikes::float / (available_bikes::float + available_stands::float) as probability
      -- rank over station_id and timestamp to take the most recent ones
      , rank() over (partition by id order by timestamp desc) as rk
    from {schema}.timeseries
    where timestamp >= %(start)s
    and timestamp < %(stop)s
    and (available_bikes > 0 or available_stands > 0)
    and status = 'open'
    order by id, timestamp
      )

    select station_id
      ,ts
      ,nb_bikes
      ,nb_stands
      ,probability
    from ranked
    where rk=1
    order by station_id;
    """.format(schema=city)
    eng = db()
    return pd.io.sql.read_sql_query(query, eng,
                                    params={"start": start,
                                            "stop": stop})
Example #19
0
 def run(self):
     query = """SELECT DISTINCT ident, type, state, available_bike, ts
        FROM {schema}.{tablename}
        WHERE ts >= %(start)s AND ts < %(stop)s
        ORDER BY ident,ts;""".format(schema=config["bordeaux"]["schema"],
                                     tablename=config['bordeaux']['table'])
     eng = db()
     df = pd.io.sql.read_sql_query(query,
                                   eng,
                                   params={
                                       "start": self.date,
                                       "stop": self.date + timedelta(1)
                                   })
     transactions = (df.query("state == 'CONNECTEE'").groupby("ident")
                     ['available_bike'].apply(lambda s: s.diff().abs().sum(
                     )).dropna().to_frame().reset_index())
     transactions = transactions.rename_axis(
         {"available_bike": "transactions"}, axis=1)
     with self.output().open('w') as fobj:
         transactions.to_csv(fobj, index=False)
Example #20
0
def specific_stations(city, ids):
    """List of specific bicycle stations.

    Parameters
    ----------
    city : string
    ids : list

    Returns
    -------
    list of dict
        One dict by bicycle station
    """
    query = _query_stations(city, 1).replace("LIMIT 1",
                                             'WHERE id IN %(id_list)s')
    eng = db()
    rset = eng.execute(query, id_list=tuple(str(x) for x in ids)).fetchall()
    if not rset:
        return []
    return {"data": [dict(zip(x.keys(), x)) for x in rset]}
Example #21
0
def get_station_ids(city):
    """Provides the list of shared-bike station IDs

    Parameters
    ----------
    city : str
        City of interest, either `bordeaux` or `lyon`

    Returns
    -------
    list of integers
        IDs of the shared-bike stations in the `city`
    """
    query = ("SELECT id FROM {schema}.{table}"
             ";").format(schema=city, table='station')
    eng = db()
    rset = eng.execute(query).fetchall()
    if not rset:
        return []
    return [row[0] for row in rset]
Example #22
0
def daily_transaction_list(city, day, limit, order_by, window=0, backward=True):
    """Retrieve the daily transaction for the Bordeaux stations

    city: str
    day: date
        Data for this specific date
    limit: int
    order_by: str
    window: int (0 by default)
        Number of days to look around the specific date
    backward: bool (True by default)
        Get data before the date or not, according to the window number

    Return a list of dicts
    """
    window = time_window(day, window, backward)
    query = daily_query_stations(city, limit, order_by)
    eng = db()
    rset = eng.execute(query, start=window.start, stop=window.stop,
                       order_reference_date=window.order_reference_date).fetchall()
    return processing_daily_data(rset, window)
Example #23
0
def get_station_ids(city):
    """Provides the list of shared-bike station IDs

    Parameters
    ----------
    city : str
        City of interest, either `bordeaux` or `lyon`

    Returns
    -------
    list of integers
        IDs of the shared-bike stations in the `city`
    """
    query = ("SELECT id FROM {schema}.{table}"
             ";").format(schema=config[city]["schema"],
                         table=config['database']['stations'])
    eng = db()
    rset = eng.execute(query).fetchall()
    if not rset:
        return []
    return [row[0] for row in rset]
Example #24
0
def daily_transaction(city, station_ids, day, window=0, backward=True):
    """Retrieve the daily transaction for the Bordeaux stations

    stations_ids: list of int
        List of ids station
    day: date
        Data for this specific date
    window: int (0 by default)
        Number of days to look around the specific date
    backward: bool (True by default)
        Get data before the date or not, according to the window number

    Return a list of dicts
    """
    window = time_window(day, window, backward)
    query = daily_query(city)
    eng = db()
    rset = eng.execute(query,
                       id_list=tuple(str(x) for x in station_ids),
                       start=window.start, stop=window.stop).fetchall()
    return processing_daily_data(rset, window)
Example #25
0
def cluster_profiles(city):
    """Return the cluster profiles in `city`, when running a K-means algorithm
    between `day` and `day+window`

    Parameters
    ----------
    city : str
        City of interest, either `bordeaux` or `lyon`

    Returns
    -------
    dict
        Cluster profiles for each cluster, at each hour of the day
    """
    query = cluster_profile_query(city)
    eng = db()
    df = pd.io.sql.read_sql_query(query, eng)
    if df.empty:
        logger.warning("df is empty")
        return {"data": []}
    df = df.set_index('cluster_id')
    labels = find_cluster(df)
    result = []
    for cluster_id, cluster in df.iterrows():
        result.append({
            "cluster_id":
            cluster_id,
            'label':
            labels[cluster_id],
            "start":
            cluster['start'],
            'stop':
            cluster['stop'],
            'hour':
            list(range(24)),
            'values':
            [cluster[h] for h in ["h{:02d}".format(i) for i in range(24)]]
        })
    return {"data": result}
Example #26
0
def daily_transaction(city, station_ids, day, window=0, backward=True):
    """Retrieve the daily transaction for the Bordeaux stations

    stations_ids: list of int
        List of ids station
    day: date
        Data for this specific date
    window: int (0 by default)
        Number of days to look around the specific date
    backward: bool (True by default)
        Get data before the date or not, according to the window number

    Return a list of dicts
    """
    window = time_window(day, window, backward)
    query = daily_query(city)
    eng = db()
    rset = eng.execute(query,
                       id_list=tuple(str(x) for x in station_ids),
                       start=window.start,
                       stop=window.stop).fetchall()
    return processing_daily_data(rset, window)
Example #27
0
 def run(self):
     query = ("SELECT DISTINCT id AS station_id, timestamp AS ts, "
              "available_bikes AS nb_bikes, available_stands AS nb_stands, "
              "available_bikes::float / (available_bikes::float "
              "+ available_stands::float) AS probability "
              "FROM {schema}.{tablename} "
              "WHERE timestamp >= %(start)s "
              "AND timestamp < %(stop)s "
              "AND (available_bikes > 0 OR available_stands > 0) "
              "AND (status = 'open')"
              "ORDER BY id, timestamp"
              ";").format(schema=self.city,
                          tablename=config['database']['timeseries'])
     eng = db()
     df = pd.io.sql.read_sql_query(query, eng,
                                   params={"start": self.start,
                                           "stop": self.stop})
     if df.empty:
         raise Exception("There is not any data to process in the DataFrame. "
                         + "Please check the dates.")
     prediction_model = train_prediction_model(df, self.validation, self.frequency)
     self.output().makedirs()
     prediction_model.save_model(self.output().path)
Example #28
0
def stations(city, limit, geojson):
    """List of bicycle stations

    Parameters
    ----------
    city : string
    limit : int
    geojson : boolean

    Returns
    -------
    list
        a list of dict, one dict by bicycle station
    """
    query = _query_stations(city, limit)
    eng = db()
    rset = eng.execute(query)
    keys = rset.keys()
    result = [dict(zip(keys, row)) for row in rset]
    if geojson:
        return station_geojson(
            result,
            feature_list=['id', 'name', 'address', 'city', 'nb_stands'])
    return {"data": result}
Example #29
0
def prediction_timeseries(city,
                          station_ids,
                          start,
                          stop,
                          values_num,
                          with_current_values,
                          freq='1H'):
    """Get bike availability predictions between `start` and `stop` dates for
    `city` at stations `station_ids`

    Parameters
    ----------
    city : str
        City of interest
    station_ids : list of ints
        Ids of the stations that are considered
    start : datetime
        Begin of prediction period
    stop : datetime
        End of prediction period
    values_num : int
        Number of predict values
    with_current_values : bool
        Include the current values?
    Returns
    -------
    list of dict
    """
    # get predicted data
    query = """SELECT T.station_id AS id
         , T.timestamp AS timestamp
         , T.nb_bikes AS nb_bikes
         , S.nb_stations as nb_stands
         , S.name AS name
         FROM {city}.prediction AS T
         LEFT JOIN {city}.station AS S ON (T.station_id::varchar = S.id::varchar)
         WHERE id IN %(id_list)s
           AND timestamp >= %(start)s AND timestamp < %(stop)s
           AND frequency = %(freq)s
         ORDER BY id,timestamp;""".format(city=city)
    eng = db()
    rset_pred = eng.execute(query,
                            id_list=tuple(x for x in station_ids),
                            start=start,
                            stop=stop,
                            freq=freq)
    # current values
    rset_current = None
    if with_current_values:
        query = """select distinct id
           , timestamp
           , available_bikes as nb_bikes
           , S.nb_stations as nb_stands
           , S.name
        from {city}.timeseries as T
        left join {city}.station as S using(id)
        where id in %(id_list)s
           AND timestamp >= %(start)s and timestamp < %(stop)s
        """.format(city=city)
        rset_current = eng.execute(query,
                                   id_list=tuple(x for x in station_ids),
                                   start=start,
                                   stop=stop)
    pred = [dict(zip(x.keys(), x)) for x in rset_pred]
    # truncate the nth latest values and add the prediction time
    for data in pred[-values_num:]:
        data['at'] = freq
    current = []
    if rset_current:
        current = [dict(zip(x.keys(), x)) for x in rset_current]
        for data in current:
            data['at'] = '0'
    return current + pred[-values_num:]