def publish_results(connectstring, df, level, runtype):
    """Write calibrated test results to the landed schema.

    Parameters
    ----------
    connectstring : str
        Database connection string understood by dbutils.
    df : pandas.DataFrame
        Results to persist; the index is written along with the data.
    level : str
        Level of analysis, interpolated into the table name.
    runtype : str
        Run type, interpolated into the table name.

    The destination table is replaced if it already exists.
    """
    schema = "landed"
    # f-string instead of str.format — same resulting table name
    table = f"calibrated_{level}_{runtype}_test"
    dbutils.df_to_db(connectstring, df, schema, table,
                     if_exists="replace", write_index=True)
def publish_results(connectstring, df, runtype, period):
    """Write program-level results to the landed schema.

    Parameters
    ----------
    connectstring : str
        Database connection string understood by dbutils.
    df : pandas.DataFrame
        Results to persist; the index is written along with the data.
    runtype : str
        Run type, interpolated into the table name.
    period : str
        Period label, interpolated into the table name.

    The destination table is replaced if it already exists.
    """
    schema = "landed"
    # f-string instead of str.format — same resulting table name
    table = f"cl_pgm_{runtype}_{period}"
    dbutils.df_to_db(connectstring, df, schema, table,
                     if_exists="replace", write_index=True)
def getREIGN(schema="dataprep", table="reign", if_exists="replace"):
    """Fetch the most recent full REIGN dataset and push it to the database.

    Scrapes the REIGN landing page for the current CSV download link,
    reads the CSV, reverses a few logged variables, adds a log transform
    of tenure, and writes the result to ``schema.table``.

    NOTE(review): this function reads a module-level ``connectstring``
    that is not a parameter and is not defined in this chunk — confirm it
    exists at module scope where this function lives.
    """
    # Find the current download link on the REIGN landing page.
    url = 'https://oefdatascience.github.io/REIGN.github.io/menu/reign_current.html'
    html_doc = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html_doc, 'lxml')
    container = soup.find("div", {"class": "post-container"})
    # First hyperlink inside the post container is assumed to be the CSV link.
    link = container.find("a", href=True)['href']
    print("downloading data from {}".format(link))

    # Fetch the data.
    reign = pd.read_csv(link)

    # Exponentiate variables that have been logged upstream
    # (reason unknown per the original author; other floats left as-is).
    reign['lastelection'] = round(np.exp(reign['lastelection']))
    reign['loss'] = round(np.exp(reign['loss']))
    reign['irregular'] = round(np.exp(reign['irregular']))

    # Add transforms: log1p handles tenure_months == 0 safely.
    reign['tenure_log'] = np.log1p(reign.tenure_months)
    nomonths = reign['tenure_months'] == 0

    # Debug output: rows with zero tenure, plus a preview of the frame.
    print(reign[nomonths])
    print(reign.head())

    # Push to db.
    dbutils.df_to_db(connectstring, reign, schema, table, if_exists,
                     write_index=True)
logit_acled_ones_test_os, rf_canon_ones_test_sb, rf_canon_ones_test_ns, rf_canon_ones_test_os, rf_acled_ones_test_sb, rf_acled_ones_test_ns, rf_acled_ones_test_os) output_schema = "landed" output_table = "step_run1_test" models = [ logit_canon_full_test_sb, logit_canon_full_test_ns, logit_canon_full_test_os, logit_acled_full_test_sb, logit_acled_full_test_ns, logit_acled_full_test_os, logit_canon_ones_test_sb, logit_canon_ones_test_ns, logit_canon_ones_test_os, logit_acled_ones_test_sb, logit_acled_ones_test_ns, logit_acled_ones_test_os, rf_canon_ones_test_sb, rf_canon_ones_test_ns, rf_canon_ones_test_os, rf_acled_ones_test_sb, rf_acled_ones_test_ns, rf_acled_ones_test_os ] df_results = nstep.forecast_many(models) # Write forecast to db dbutils.df_to_db(connectstring, df_results, output_schema, output_table, if_exists="replace", write_index=True) print("test complete!")
# Load the fvp raw file, keep/rename the requested variables, and land it.
db_table = "fvp"
path_varlist = "../varlists/fvp/rawnames.txt"
path_renames = "../varlists/fvp/renames.txt"

varlist = datautils.read_varlist(path_varlist)
renames = datautils.read_renames(path_renames)

# NOTE(review): path_input, db_connectstring and db_schema are expected to
# be defined earlier in this file (not visible in this chunk) — confirm.
df = dbutils.file_to_df(path_input)
df.columns = df.columns.str.lower()

available = list(df.columns)
wanted_not_in_data = [col for col in varlist if col not in available]
wanted_in_data = [col for col in varlist if col in available]

if wanted_not_in_data:
    # Bug fix: the original message concatenated the paths with no
    # surrounding spaces ("... that aren't in../varlists/...").
    message = (f"There are variables in {path_varlist} "
               f"that aren't in {path_input}")
    # List the missing variables before raising so the log shows them.
    for v in wanted_not_in_data:
        print("\t", v)
    raise ValueError(message)

# Subset the df to only those requested and existing in the data
df = df[wanted_in_data]
df = datautils.apply_renames(df, renames)
dbutils.df_to_db(db_connectstring, df, db_schema, db_table,
                 if_exists="replace")
# Push a single HDF file to a database table, driven by CLI arguments.
import argparse  # added: used below but not imported in this chunk
import sys       # added: used below but not imported in this chunk

import pandas as pd

sys.path.append("..")
from views_utils import dbutils

# Command-line interface: source path and destination schema/table.
parser = argparse.ArgumentParser()
parser.add_argument("--path", type=str, help="dir_data")
parser.add_argument("--schema", type=str, help="schema")
parser.add_argument("--table", type=str, help="table")
args_main = parser.parse_args()
path = args_main.path
table = args_main.table
schema = args_main.schema

# Connection parameters for the views database.
prefix = "postgres"
db = "views"
uname = "VIEWSADMIN"
hostname = "VIEWSHOST"
port = "5432"
connectstring = dbutils.make_connectstring(prefix, db, uname, hostname, port)

df = pd.read_hdf(path)
print(f"read {path}")
dbutils.df_to_db(connectstring=connectstring, df=df, schema=schema,
                 table=table, if_exists="replace", write_index=True)
# Read selected transform columns, rescale each, and land the result.
table_input = "transforms_pgm_imp_1"
schema_output = "landed"
table_output = "rescaled_pgm"

ids = [timevar, groupvar]
vars_to_rescale = [spec['name'] for spec in rescales]
cols = ids + vars_to_rescale

df = dbutils.db_to_df(connectstring, schema_input, table_input,
                      columns=cols, ids=ids)

vars_rescaled = []
for spec in rescales:
    rescaled_name = spec['name'] + "_rescaled"
    vars_rescaled.append(rescaled_name)
    # scale_to_range receives the source column through the 'x' option;
    # note this intentionally mutates spec['opts'] in place.
    spec['opts']['x'] = df[spec['name']]
    df[rescaled_name] = scale_to_range(**spec['opts'])

# Keep only the rescaled columns before writing.
df = df[vars_rescaled]
dbutils.df_to_db(connectstring, df, schema_output, table_output,
                 if_exists="replace", write_index=True)
# Launch every imputed dataset (pgm/cm, imputations 1-5) into the database.
import sys

import pandas as pd

sys.path.insert(0, "../..")
from views_utils.dbutils import make_connectstring, df_to_db

# Connection settings for the views database.
prefix = "postgres"
db = "views"
uname = "VIEWSADMIN"
hostname = "VIEWSHOST"
port = "5432"
connectstring = make_connectstring(prefix, db, uname, hostname, port)

loas = ["pgm", "cm"]
schema = "launched"
if_exists = "replace"

for loa in loas:
    for imp in range(1, 6):
        table = f"{loa}_imp_ds_{imp}"
        path_input = (
            "/storage/runs/current/ds/results/"
            f"{loa}_transforms/{loa}_imp_{imp}.hdf5"
        )
        print(schema, table)
        print(path_input)
        df = pd.read_hdf(path_input)
        df_to_db(connectstring, df, schema, table, if_exists,
                 write_index=True)
uname="VIEWSADMIN") cols_actual = [ "ged_dummy_sb", "ged_dummy_ns", "ged_dummy_os", "acled_dummy_pr" ] # Get the country_id for each pgm df_country_keys = dbutils.db_to_df(connectstring, "staging_test", "cpgm") df_country_keys.set_index(["month_id", "pg_id"], inplace=True) df = df_country_keys.copy() df_actuals = dbutils.db_to_df(connectstring, "preflight", "flight_pgm", ids=["month_id", "pg_id"], columns=cols_actual) df_actuals = df_actuals.add_prefix("pgm_") df = df.merge(df_actuals, left_index=True, right_index=True) df_mean = df.groupby(["month_id", "country_id"]).mean() table_out = "agg_cm_actuals" print("table_out: {}".format(table_out)) dbutils.df_to_db(connectstring, df_mean, "landed", table_out, if_exists="replace", write_index=True)
"ensemble_pgm_fcast_test", "ensemble_pgm_eval_test" ] # ensemble for table in tables: path = f"/storage/runs/archive/r.2018.08.01/ensemble/results/{table}.hdf5" df_prev = pd.read_hdf(path) # set index if "cm" in table: df_prev.set_index(ids_cm, inplace=True) if "pgm" in table: df_prev.set_index(ids_pgm, inplace=True) # push table dbutils.df_to_db(connectstring, df_prev, schema, table, if_exists, write_index=True) #decay for level in ["cm", "pgm"]: path = f"/storage/runs/archive/r.2018.08.01/ds/transforms/{level}_transforms/data/{level}_imp_1.hdf5" table = f"transforms_{level}_imp_1" df_prev = pd.read_hdf(path) # isolate the decay columns cols = [col for col in df_prev if "decay" in col] if level == "cm": df_prev = df_prev[cols] if level == "pgm": df_prev = df_prev[cols] # push table
for year in range((ucdp_update.year + 1), (fcast_end.year + 1)): for month in range(1, 13): endvalues_base['year'] = year endvalues_base['month'] = month endvalues_base['month_id'] = endvalues_base['month_id'] + 1 # concatenate endvalues = pd.concat([endvalues, endvalues_base]) #concatenate that business vdem = pd.concat([vdem, endvalues]) print(endvalues.month_id.values) #testfile to inspect testcase = vdem[(vdem.country_id == 124) | (vdem.country_id == 125)] testcase.to_csv("testcase_long.csv") #cleanup and return the values for changemonths only return(vdem) # single new_vdem = monthifyVDEM() new_vdem.set_index(['country_id', 'month_id'], inplace=True) new_vdem.to_csv("vdem_monthly.csv") #push to DB connectstring = dbutils.make_connectstring(prefix="postgresql", db="views", uname="VIEWSADMIN", hostname="VIEWSHOST", port="5432") schema = "dataprep" if_exists = "replace" table = "vdem_regimechange" dbutils.df_to_db(connectstring, new_vdem, schema, table, if_exists, write_index=True)