Python db_to_df_limited Examples, views_utils.dbutils.db_to_df_limited Python Examples

Example #1

0

Show file

def fetch_data(model):
    """ Reads df_train and df_forecast from db

    df_train contains ids, outcomes and features for the training period.
    df_forecast contains only ids for the forecasting period.
    """

    def get_columns_train(model, ids):
        columns = []
        columns += ids
        columns.append(model['outcome'])
        columns += model['features']
        return columns

    def get_columns_forecast(model, ids):
        columns = ids
        return columns

    connectstring = model['table']['connectstring']
    schema   = model['table']['schema']
    table    = model['table']['table']
    timevar  = model['table']['timevar']
    groupvar = model['table']['groupvar']
    ids = [timevar, groupvar]

    df_train = dbutils.db_to_df_limited(
        connectstring = connectstring,
        schema   = schema,
        table    = table,
        columns  = get_columns_train(model, ids),
        timevar  = timevar,
        groupvar = groupvar,
        tmin     = model['train_start'],
        tmax     = model['train_end']
        )

    df_forecast = dbutils.db_to_df_limited(
        connectstring = connectstring,
        schema   = schema,
        table    = table,
        columns  = get_columns_forecast(model, ids),
        timevar  = timevar,
        groupvar = groupvar,
        tmin     = model['forecast_start'],
        tmax     = model['forecast_end']
        )

    return df_train, df_forecast

Example #2

0

Show file

def plot_world_average_with_actuals(df, connectstring, directory, timevar,
                                    groupvar):
    directory = directory + "/world_avg_w_actuals/"
    create_dirs([directory])

    columns = get_numeric_cols(df)
    t_start_actuals, t_end_actuals = get_times_actuals(df)

    cols_actuals = [
        "ged_dummy_sb", "ged_dummy_ns", "ged_dummy_os", "acled_dummy_pr"
    ]
    schema_actuals = "preflight"
    if groupvar == "pg_id":
        table_actuals = "flight_pgm"
    elif groupvar == "country_id":
        table_actuals = "flight_cm"

    df_actuals = dbutils.db_to_df_limited(connectstring,
                                          schema_actuals,
                                          table_actuals,
                                          columns=cols_actuals,
                                          timevar=timevar,
                                          groupvar=groupvar,
                                          tmin=t_start_actuals,
                                          tmax=t_end_actuals)

    df_months = fetch_df_months(connectstring)

    df = df.merge(df_actuals, left_index=True, right_index=True, how='outer')

    df_mean = df.groupby(level=0).mean()

    for c in columns:

        actual = maputils.match_plotvar_actual(c)

        path = directory + "world_" + str(c) + ".png"
        plt.figure()

        plt.plot(df_mean[actual], label='History')
        plt.plot(df_mean[c], linestyle='--', label=c)

        plt.xticks(*make_ticks(df, df_months), rotation=90)

        #title = "{}\n{}".format("World", c)
        #plt.title(title, loc='left')
        plt.legend()

        plt.tight_layout()
        plt.savefig(path)
        print("wrote", path)
        plt.close()

Example #3

0

Show file

def get_predictions_eval_test():
    schema_pred = "landed"
    schema_actuals = "preflight"

    table_ds = "ds_pgm_eval_test"
    table_osa = "osa_pgm_eval_test"
    table_ebma = "ebma_pgm_eval_test"

    table_actuals = "flight_pgm"

    timevar = "month_id"
    groupvar = "pg_id"
    ids = [timevar, groupvar]

    outcomes = [
        "ged_dummy_sb", "ged_dummy_ns", "ged_dummy_os", "acled_dummy_pr"
    ]

    df_ds = dbutils.db_to_df(connectstring, schema_pred, table_ds)
    df_osa = dbutils.db_to_df(connectstring, schema_pred, table_osa)
    df_ebma = dbutils.db_to_df(connectstring, schema_pred, table_ebma)
    df_ds.set_index(ids, inplace=True)
    df_osa.set_index(ids, inplace=True)
    df_ebma.set_index(ids, inplace=True)

    t_start_ds = df_ds.index.get_level_values(timevar).min()
    t_start_osa = df_osa.index.get_level_values(timevar).min()
    t_start_ebma = df_ebma.index.get_level_values(timevar).min()
    t_end_ds = df_ds.index.get_level_values(timevar).max()
    t_end_osa = df_osa.index.get_level_values(timevar).max()
    t_end_ebma = df_ebma.index.get_level_values(timevar).max()

    start_same = t_start_ds == t_start_osa == t_start_ebma
    end_same = t_end_ds == t_end_osa == t_end_ebma

    if not start_same and end_same:
        raise RuntimeError("The time indexes for ds, osa and ebma don't match")

    df = dbutils.db_to_df_limited(connectstring,
                                  schema_actuals,
                                  table_actuals,
                                  columns=outcomes + ids,
                                  timevar=timevar,
                                  groupvar=groupvar,
                                  tmin=t_start_ds,
                                  tmax=t_end_ds)

    df = df.merge(df_ds, left_index=True, right_index=True)
    df = df.merge(df_osa, left_index=True, right_index=True)
    df = df.merge(df_ebma, left_index=True, right_index=True)
    return df

Example #4

0

Show file

def get_data(connectstring, level="cm", runtype="fcast"):
    """ Get actuals and calibration period and testing period predictions

    Args:
        connectstring:
        level: cm or pgm
        runtype: fcast or eval
    """

    schema_actuals = "preflight"
    schema_predictions = "landed"

    table_actuals = "flight_" + level
    cols_actual = ["ged_dummy_" + t for t in ["sb", "ns", "os"]]
    cols_actual.append("acled_dummy_pr")

    timevar = "month_id"
    if level == "cm":
        groupvar = "country_id"
    elif level == "pgm":
        groupvar = "pg_id"
    ids = [timevar, groupvar]

    table_osa_calib = "_".join(["osa", level, runtype, "calib"])
    table_osa_test = "_".join(["osa", level, runtype, "test"])
    table_ds_calib = "_".join(["ds", level, runtype, "calib"])
    table_ds_test = "_".join(["ds", level, runtype, "test"])
    table_cl_calib = "_".join(["cl", level, runtype, "calib"])
    table_cl_test = "_".join(["cl", level, runtype, "test"])



    df_osa_calib = dbutils.db_to_df(connectstring, schema_predictions,
                                    table_osa_calib, ids=ids)
    df_osa_test = dbutils.db_to_df(connectstring, schema_predictions,
                                   table_osa_test, ids=ids)

    df_ds_calib = dbutils.db_to_df(connectstring, schema_predictions,
                                   table_ds_calib, ids=ids)
    df_ds_test = dbutils.db_to_df(connectstring, schema_predictions,
                                  table_ds_test, ids=ids)

    # only include cl for pgm level
    if level=="pgm":
        df_cl_calib = dbutils.db_to_df(connectstring, schema_predictions,
                                       table_cl_calib, ids=ids)
        df_cl_test = dbutils.db_to_df(connectstring, schema_predictions,
                                      table_cl_test, ids=ids)


    assert_equal_times(df_osa_calib, df_ds_calib, timevar)
    assert_equal_times(df_osa_test, df_ds_test, timevar)

    t_start_calib = df_osa_calib.index.get_level_values(timevar).min()
    t_end_calib = df_osa_calib.index.get_level_values(timevar).max()

    df_pred_calib = df_osa_calib.merge(df_ds_calib,
                                       left_index=True, right_index=True)

    df_pred_test = df_osa_test.merge(df_ds_test,
                                     left_index=True, right_index=True)

    # only include cl for pgm level
    if level == "pgm":
        df_pred_calib = df_pred_calib.merge(df_cl_calib,
                                            left_index=True, right_index=True)
        df_pred_test = df_pred_test.merge(df_cl_test,
                                            left_index=True, right_index=True)

    df_actuals = dbutils.db_to_df_limited(connectstring,
                                          schema_actuals, table_actuals,
                                          columns=cols_actual,
                                          timevar=timevar,
                                          groupvar=groupvar,
                                          tmin=t_start_calib,
                                          tmax=t_end_calib)

    for df in [df_actuals, df_pred_calib, df_pred_test]:
        df.sort_index(inplace=True)


    return df_actuals, df_pred_calib, df_pred_test

Example #5

0

Show file

File: heatmap_proportions.py Project: jemceach/OpenViEWS

df_cpgm = dbutils.db_to_df(connectstring,
                           schema="staging_test",
                           table="cpgm",
                           ids=[timevar, groupvar])

df = df_pgm.merge(df_cpgm, left_index=True, right_index=True)
df.reset_index(inplace=True)
df = df.merge(df_c, on=["country_id"])
df.set_index([timevar, groupvar], inplace=True)

tmin = df.index.get_level_values(timevar).min()
tmax = df.index.get_level_values(timevar).max()
df_actuals = dbutils.db_to_df_limited(connectstring,
                                      schema="preflight",
                                      table="flight_pgm",
                                      timevar="month_id",
                                      groupvar=groupvar,
                                      tmin=tmin,
                                      tmax=tmax,
                                      columns=outcomes.copy())

df_actuals.fillna(0, inplace=True)
print(outcomes)
df = df.merge(df_actuals, left_index=True, right_index=True)
df.reset_index(inplace=True)

df_months = fetch_df_months(connectstring)

df.set_index(['month_id'], inplace=True)
df = df.merge(df_months, left_index=True, right_index=True)
df.sort_index(inplace=True)
df['month'] = df['month'].astype(str)

Example #6

0

Show file

def plot_map_worker(local_settings, plotjob):

    connectstring = local_settings['connectstring']
    dir_plots = local_settings['dir_plots']
    dir_spatial_pgm = local_settings['dir_spatial_pgm']
    dir_spatial_cm = local_settings['dir_spatial_cm']

    plotvar         = plotjob['plotvar']
    varname_actual  = plotjob['varname_actual']
    schema_plotvar  = plotjob['schema_plotvar']
    schema_actual   = plotjob['schema_actual']
    table_plotvar   = plotjob['table_plotvar']
    table_actual    = plotjob['table_actual']
    timevar         = plotjob['timevar']
    groupvar        = plotjob['groupvar']
    variable_scale  = plotjob['variable_scale']
    projection      = plotjob['projection']
    crop            = plotjob['crop']
    run_id          = plotjob['run_id']

    path_shape_pg = dir_spatial_pgm + "/priogrid"
    path_shape_c = dir_spatial_cm + "/country"

    ids = [timevar, groupvar]

    print("Plotting variable {} from table {}.{}".format(plotvar,
        schema_plotvar, table_plotvar))

    print(json.dumps(plotjob, indent=4))

    df_plotvar = dbutils.db_to_df(connectstring, schema_plotvar, table_plotvar,
        [plotvar], ids)

    df_plotvar = restrict_prob_lower_bound(df_plotvar, 0.001)

    time_start, time_end = get_time_limits(df_plotvar, timevar)
    print("time_start:", time_start)
    print("time_end:", time_end)

    have_actual=False
    if varname_actual is not None:
        have_actual=True

    if groupvar == "pg_id":
        df_geo = fetch_df_geo_pgm(connectstring)
        df_geo = prune_priogrid(df_geo)
        size = get_figure_size(df_geo, scale=0.6)

    if have_actual and groupvar == "pg_id":
        df_actuals = dbutils.db_to_df_limited(connectstring, schema_actual,
                                               table_actual, [varname_actual],
                                               timevar, groupvar,
                                               time_start, time_end)

        df_event_coords = get_df_actuals_event_coords(df_actuals, df_geo,
                                                          varname_actual)


    elif groupvar == "country_id":
        df_geo = fetch_df_geo_c(crop)
        size = get_figure_size(df_geo, scale=0.6)

    if timevar == 'month_id':
        df_months = fetch_df_months(connectstring)

    plotvar_bounds = get_var_bounds(df_plotvar, plotvar)
    print("Bounds: ", plotvar_bounds)
    times = range(time_start, time_end+1)

    cmap = get_cmap(variable_scale)


    ticks = make_ticks(variable_scale)

    dir_schema  = dir_plots  + schema_plotvar + "/"
    dir_table   = dir_schema + table_plotvar  + "/"
    dir_plotvar = dir_table  + plotvar        + "/"
    create_dirs([dir_plots, dir_schema, dir_table, dir_plotvar])

    for t in times:
        print("Plotting for {}".format(t))
        df_plotvar_t = df_plotvar.loc[t]

        #fig = plt.figure(figsize = size)
        fig, ax = plt.subplots(figsize = size)

        print("Making basemap")
        map = get_basemap(projection, df_geo)

        print("Reading shape")

        if groupvar == "pg_id":
            map.readshapefile(path_shape_pg, 'GID', drawbounds=False)

        if groupvar == "country_id":
            map.readshapefile(path_shape_c, 'ID', drawbounds=False)

        if groupvar == "pg_id" and have_actual:
            events_t = get_events_t(df_event_coords, t)
            map = plot_events_on_map(map, events_t)

        print("Making collection")
        # Plot the probs
        collection = make_collection(map,
                                     df_plotvar_t,
                                     cmap,
                                     ticks['values'],
                                     variable_scale,
                                     plotvar_bounds,
                                     groupvar)
        ax.add_collection(collection)

        if groupvar == "pg_id":
           # The Africa limited shapefile
            map.readshapefile(path_shape_pg, 'GID', drawbounds=True)


        cbar_fontsize = size[1]/2

        if variable_scale in ["logodds", "prob"]:
            # if we're plotting logodds or probs set custom ticks
            cbar = plt.colorbar(collection,
                                ticks=ticks['values'],
                                fraction=0.046,
                                pad=0.04)
            cbar.ax.set_yticklabels(ticks['labels'], size=cbar_fontsize)
        else:
            # else use default colorbar for interval variables
            cbar = plt.colorbar(collection)

        map.drawmapboundary()

        if groupvar == "pg_id":
            map.readshapefile(path_shape_pg, 'GID', drawbounds=True)
            map.readshapefile(path_shape_c, 'ID', drawbounds=True, color='w',
                linewidth = 2)
            map.readshapefile(path_shape_c, 'ID', drawbounds=True, color='k',
                linewidth = 1)
        elif groupvar == "country_id":
            map.readshapefile(path_shape_c, 'ID', drawbounds=True)


        s_t_plotvar = "{}.{}".format(schema_plotvar, table_plotvar)
        text_box = "Modelname: {}\nRun: {}\nTable: {}".format(plotvar,
                                                                 run_id,
                                                                 s_t_plotvar)
        bbox = {'boxstyle' : 'square',  'facecolor' : "white"}
        lon_min = df_geo['longitude'].min()
        lon_max = df_geo['longitude'].max()
        lat_min = df_geo['latitude'].min()
        lat_max = df_geo['latitude'].max()

        w_eu = 6.705
        h_eu = 4.5
        w_erc = 4.7109375
        h_erc = 4.5
        w_views = w_eu + w_erc + 1
        h_views = 4.024

        lon_start_eu = lon_min + 1
        lon_end_eu = lon_start_eu + w_eu
        lon_start_erc = lon_end_eu + 1
        lon_end_erc = lon_start_erc + w_erc
        lon_start_views = lon_start_eu
        lon_end_views = lon_end_erc

        lat_start_eu = lat_min + 1
        lat_end_eu = lat_start_eu + h_eu
        lat_start_erc = lat_start_eu
        lat_end_erc = lat_start_erc + h_erc
        lat_start_views = lat_end_eu + 1
        lat_end_views = lat_start_views + h_views

        lon_min_textbox = lon_start_views
        lat_min_textbox = lat_end_views + 1

        box_logo_eu     = (lon_start_eu, lon_end_eu,
                           lat_start_eu, lat_end_eu)
        box_logo_erc    = (lon_start_erc, lon_end_erc,
                           lat_start_erc, lat_end_erc)
        box_logo_views  = (lon_start_views, lon_end_views,
                           lat_start_views, lat_end_views)

        plt.text(lon_min_textbox, lat_min_textbox,
                 text_box, bbox=bbox, fontsize=size[1]*0.5)

        logo_eu = mpimg.imread("/storage/static/logos/eu.png")
        logo_erc = mpimg.imread("/storage/static/logos/erc.png")
        logo_views = mpimg.imread("/storage/static/logos/views.png")

        plt.imshow(logo_erc, extent = box_logo_erc)
        plt.imshow(logo_eu, extent = box_logo_eu)
        plt.imshow(logo_views, extent = box_logo_views)

        text_title = "TITLE"
        if timevar == 'month_id':
            text_title = month_id_to_datestr(df_months, t)
        plt.figtext(0.5, 0.85, text_title, fontsize=size[1], ha='center')

        path = dir_plotvar + str(t) + ".png"
        plt.savefig(path, bbox_inches="tight")
        print("wrote", path)
        plt.close()

Example #7

0

Show file

File: eval_landed.py Project: jemceach/OpenViEWS

    t_end_ds = df_ds.index.get_level_values(timevar).max()
    t_end_osa = df_osa.index.get_level_values(timevar).max()
    t_end_ebma = df_ebma.index.get_level_values(timevar).max()

    start_same = t_start_ds == t_start_osa == t_start_ebma
    end_same = t_end_ds == t_end_osa == t_end_ebma
except:
    print("Error")

    if not start_same and end_same:
        raise RuntimeError("The time indexes for ds, osa and ebma don't match")

    df = dbutils.db_to_df_limited(connectstring,
                                  schema_actuals,
                                  table_actuals,
                                  columns=outcomes + ids,
                                  timevar=timevar,
                                  groupvar=groupvar,
                                  tmin=t_start_ds,
                                  tmax=t_end_ds)

    df = df.merge(df_ds, left_index=True, right_index=True)
    df = df.merge(df_osa, left_index=True, right_index=True)
    df = df.merge(df_ebma, left_index=True, right_index=True)

    df.to_hdf("temp.hdf5", key='data')

cols = df.columns
cols = [col.replace("_eval_test", "") for col in cols]
cols = [col.replace("_pgm", "") for col in cols]
cols = [col.replace("_downsampled", "_dsmp") for col in cols]
cols = [col.replace("_fullsample", "_fsmp") for col in cols]