def main():
    # Open final football file csv.
    final_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'games_combined.csv'),
                           low_memory=False)

    # Get unique set of clubs for both home and away teams.
    unique_home = final_df['home_club'].unique().tolist()
    unique_away = final_df['away_club'].unique().tolist()
    unique_clubs = unique_home + unique_away  # Combined list.
    # Deduplicate the union of home and away clubs and drop the first
    # element. Note that set order is not deterministic.
    unique_clubs = list(set(unique_clubs))[1:]
    unique_clubs = unique_clubs[0:150]  # !!Subset!!

    # Scraping via multiprocessing.
    dict_list = []
    with mp.Pool() as pool:
        out = pool.map(get_geodata, unique_clubs)
        dict_list.extend(out)

    # Dicts to dataframe.
    club_longlat_df = pd.DataFrame(dict_list)

    # Store as csv; merged to the game data in a later step.
    club_longlat_df.to_csv(ppj('OUT_DATA_FOOTBALL', 'club_longlat.csv'),
                           index=False)

def extract_true_factors():
    """Merge tables generated from simulated data, where columns 'fac1',
    'fac2', 'fac3' from **table_2** contain the factor ids in **table_1**.

    The output is one pandas dataframe (saved as pickle) that contains the
    multiindex (caseid, period) and the three (true) factors.

    """
    # Read in dataframes from Stata files.
    factor = pd.read_stata(
        ppj("OUT_DATA", "tables", "data_table_1.dta"),
        index_col='factor_id',
        columns=['factor_id', 'true_fac']
    )
    f_nr = ['fac1', 'fac2', 'fac3']
    case = pd.read_stata(
        ppj("OUT_DATA", "tables", "data_table_2.dta"),
        columns=f_nr + ['caseid', 't']
    )
    case.set_index(['caseid', 't'], inplace=True)

    # Join data at indices, generate one dataframe and save as pickle.
    for f in f_nr:
        case = case.join(factor, on=f, rsuffix='_' + f)

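# Illustration (a minimal sketch with made-up data, not part of the project):
# ``DataFrame.join(..., on=...)`` maps an id column onto the other frame's
# index, which is how the factor ids above are replaced by factor values.
import pandas as pd

factor = pd.DataFrame({'true_fac': [0.1, 0.5, 0.9]},
                      index=pd.Index([1, 2, 3], name='factor_id'))
case = pd.DataFrame({'fac1': [3, 1], 'fac2': [2, 2]})
case = case.join(factor, on='fac1')
# 'true_fac' now holds, for each row, the value whose factor_id equals the
# row's 'fac1' (0.9 and 0.1 here).
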
def mp_scraping(mtchday, game_df):
    """Check whether a CSV file for the specified *mtchday* ID already
    exists; if not, games corresponding to this ID are scraped via
    multiprocessing. The game URLs are stored in *game_df*, connecting
    matchday ID and game URLs. The resulting dataframe is saved as a CSV
    file.

    """
    # Check whether matchday games are already scraped.
    if not os.path.isfile(
            ppj('OUT_DATA_FOOTBALL_CSV', '{}.csv'.format(mtchday))):
        temp_urls = game_df[(game_df['mtchday_id'] == mtchday) & (
            game_df['doable'] == 1)]['game_url'].unique().tolist()

        # Scraping via multiprocessing.
        dict_list = []
        with mp.Pool() as pool:
            out = pool.map(scrape_game_data, temp_urls)
            dict_list.extend(out)  # To list.

        # Dicts to dataframe and save as CSV.
        df = pd.DataFrame(dict_list)
        df['mtchday_id'] = mtchday
        df.to_csv(ppj('OUT_DATA_FOOTBALL_CSV', '{}.csv'.format(mtchday)),
                  index=False)

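# A note on the multiprocessing pattern used above and in the other scraping
# scripts: workers passed to ``pool.map`` must be picklable, i.e. defined at
# module level. A hypothetical stand-in for ``scrape_game_data`` (the field
# names are assumptions) illustrates the expected contract of one flat dict
# per URL:
def scrape_game_data_stub(game_url):
    """Hypothetical worker: scrape one game page, return a flat dict."""
    # A real implementation would download and parse the page here.
    return {'game_url': game_url, 'home_club': None, 'away_club': None}
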
def main():
    # Load game-, player- and geo-data.
    plyr_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'plyr_nationality.csv'))
    longlat_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'club_longlat.csv'))
    games_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'games_combined.csv'),
                           low_memory=False)

    # Merge all files into the final csv: first geo- and game data.
    final_df = pd.merge(games_df, longlat_df, how='left', on='home_club')

    # Merge player nationality data onto the geo- and game data.
    final_df = merge_nationality(final_df, plyr_df)

    # Get relative ethnicity for each team.
    final_df = relative_nationality(final_df)

    # Split date into day, month and year and store in separate columns.
    date_data = r'(?P<fb_day>[^.]+)\.(?P<fb_month>[^.]+)\.(?P<fb_year>[^.]+)'
    final_df = pd.concat(
        [final_df, final_df['fb_date'].str.extract(date_data).astype(int)],
        axis=1)
    final_df['fb_year'] = final_df['fb_year'] + 2000  # To four-digit integer.

    # Save as CSV file.
    final_df.to_csv(ppj('OUT_DATA_FOOTBALL', 'games_final.csv'), index=False)

def save_reg(reg_list, data_name):
    """Save the regression results for premiums and claims data,
    respectively.

    Args:
        reg_list (list): a list consisting of the results of all regressions
        data_name (str): name of tables (see **wscript** file)

    """
    # Round the point estimates (a, m, b, n) to six decimals and the R^2
    # values to three; the list is ordered as the column labels below.
    for idx in [0, 2, 5, 7]:
        reg_list[idx] = round(reg_list[idx], 6)
    for idx in [4, 9]:
        reg_list[idx] = round(reg_list[idx], 3)

    result_df = pd.DataFrame(reg_list, columns=['results']).T
    result_df.columns = [
        'a', 'ta', 'm', 'tm', 'Rsp', 'b', 'tb', 'n', 'tn', 'Rsq'
    ]
    result_df.index.name = 'type'

    # Split into the premiums results (first five columns) and the claims
    # results.
    dfs = np.split(result_df, [5], axis=1)
    reg_p = dfs[0].T.rename(columns={'results': 'Premiums Data'}).T
    reg_q = dfs[1].T.rename(columns={'results': 'Claims Data'}).T

    reg_p.to_csv(ppj('OUT_TABLES', '{}_prem_reg.csv'.format(data_name)),
                 index=True, sep=',')
    reg_q.to_csv(ppj('OUT_TABLES', '{}_clam_reg.csv'.format(data_name)),
                 index=True, sep=',')

def prepare_data():
    """Merge tables generated from simulated data, where columns 'fac1',
    'fac2', 'fac3' from **table_2** contain the factor ids in **table_1**
    and columns 'x1', 'x2' from **table_2** contain control ids in
    **table_3**.

    The output is one pandas dataframe (saved as pickle) per factor, named
    'meas_facX' (X as 1, 2, 3), that contains the multiindex (caseid,
    period), the two controls, and three measurements.

    """
    # Read in dataframes from Stata files.
    factor = pd.read_stata(ppj("OUT_DATA", "tables", "data_table_1.dta"),
                           index_col='factor_id',
                           columns=['factor_id', 'meas1', 'meas2', 'meas3'])
    case = pd.read_stata(ppj("OUT_DATA", "tables", "data_table_2.dta"))
    case.set_index(['caseid', 't'], inplace=True)
    control = pd.read_stata(ppj("OUT_DATA", "tables", "data_table_3.dta"),
                            index_col='cont_id')

    # Join data at indices, generate one dataframe per factor
    # and save as pickle.
    c_nr = ['x1', 'x2']
    for nr, c in enumerate(c_nr):
        case = case.join(control, on=c, rsuffix='_' + str(nr + 1))

def itt_analysis_without_controls_as_table(name_df):
    """Load ITT analysis results, without controls, for a version of
    ``gate_final.csv``, format them and save them as .tex tabulars.

    Args:
        name_df (string): name of a version of ``gate_final.csv``.

    Returns:
        Save results to .tex files.

    """
    complete_no_controls_coeff = pd.read_csv(
        ppj("OUT_ANALYSIS", name_df + "_no_controls_coeff.csv"), index_col=0)
    complete_no_controls_summary = pd.read_csv(
        ppj("OUT_ANALYSIS", name_df + "_no_controls_summary.csv"),
        index_col=0)
    complete_no_controls_coeff = complete_no_controls_coeff.reindex(
        pretty_index_dict).rename(pretty_index_dict)
    complete_no_controls_coeff = complete_no_controls_coeff.dropna(how="all")
    complete_no_controls_coeff.to_latex(
        ppj("OUT_TABLES", "table_" + name_df + "_no_controls_coeff.tex"),
        float_format="{:.3g}".format,
        na_rep=" ",
        multicolumn_format="c",
    )
    complete_no_controls_summary.T.to_latex(
        ppj("OUT_TABLES", "table_" + name_df + "_no_controls_summary.tex"),
        float_format="{:.3g}".format,
        na_rep=" ",
        multicolumn_format="c",
    )

def itt_analysis_with_controls(name_df, in_dir):
    """Perform OLS regression, with controls, to estimate ITT on a version
    of ``gate_final.csv``.

    Args:
        name_df (string): name of a version of ``gate_final.csv``.
        in_dir (string): the directory in which the version of
            ``gate_final.csv`` is stored.

    Returns:
        Save regression results to .csv files.

    """
    gate_controls = pd.read_csv(ppj(in_dir, name_df + ".csv"))
    gate_controls = gate_controls.drop(
        [
            "gateid",
            "site",
            "completed_w2",
            "missing_cov",
            "missing_out",
            "hhincome",
            "white",
            "hhincome_50_74k",
            "philadelphia",
        ],
        axis=1,
    )
    complete_controls = generate_regression_output(gate_controls,
                                                   "hhincome_w2", type="OLS")
    complete_controls[0].to_csv(
        ppj("OUT_ANALYSIS", name_df + "_controls_coeff.csv"))
    complete_controls[1].to_csv(
        ppj("OUT_ANALYSIS", name_df + "_controls_summary.csv"))

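# ``generate_regression_output`` is defined elsewhere in the project. A
# minimal statsmodels sketch of what such a helper might look like, assuming
# it regresses an outcome column on all remaining columns; the return shape
# follows the OLS call above (coefficients, then summary), and other call
# sites may expect a different shape.
import pandas as pd
import statsmodels.api as sm


def generate_regression_output_stub(data, outcome, type="OLS"):
    """Hypothetical stand-in for ``generate_regression_output``."""
    exog = sm.add_constant(data.drop(outcome, axis=1))
    model = sm.OLS if type == "OLS" else sm.Logit
    results = model(data[outcome], exog, missing="drop").fit()
    coeff = pd.DataFrame({"coef": results.params,
                          "p-value": results.pvalues})
    summary = pd.DataFrame({"nobs": [results.nobs]})
    return coeff, summary
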
def welch_df_as_table():
    """Format ``welch_df.csv`` and save the result to ``table_welch_df.tex``.

    """
    welch_df = pd.read_csv(ppj("OUT_ANALYSIS", "welch_df.csv"),
                           index_col=[0], header=[0, 1])
    welch_df = welch_df.reindex(pretty_index_dict).rename(pretty_index_dict)
    welch_df = welch_df.dropna(how="all")
    welch_df = welch_df.rename(columns={
        "mean1": "Missing",
        "mean0": "No missing"
    })
    idx = pd.IndexSlice
    keep_format = [
        "Age",
        "Highest grade achieved",
        "Standardized autonomy index",
        "Standardized risk-tolerance index",
    ]
    rows_to_format = [
        item for item in welch_df.index if item not in keep_format
    ]
    subset = idx[rows_to_format, idx[:, ["Missing", "No missing"]]]
    welch_df = format_as_percentage(welch_df, subset)
    subset_stars = idx[:, idx[:, "p-value"]]
    correction = len(welch_df) - 1
    welch_df = assign_stars(welch_df, subset_stars, correction)
    welch_df.to_latex(
        ppj("OUT_TABLES", "table_welch_df.tex"),
        float_format="{:.3g}".format,
        na_rep=" ",
        multicolumn_format="c",
    )

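# ``assign_stars`` is defined elsewhere in the project. The call above passes
# a p-value column subset and ``correction = len(welch_df) - 1``, which is
# consistent with a Bonferroni-style helper along these (assumed) lines:
def assign_stars_stub(df, subset, correction=1):
    """Hypothetical stand-in for ``assign_stars``: format each selected
    p-value as a string with Bonferroni-adjusted significance stars."""
    def stars(p):
        for threshold, marker in [(0.01, "***"), (0.05, "**"), (0.1, "*")]:
            if p <= threshold / correction:
                return "{:.3g}{}".format(p, marker)
        return "{:.3g}".format(p)

    formatted = df.copy().astype(object)
    formatted.loc[subset] = df.loc[subset].applymap(stars)
    return formatted
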
def save_data(known_claim, guess_claim, nonadj_table, cpi_table):
    """Generate the final tables (including the table in the original
    paper).

    Args:
        known_claim (pd.DataFrame): known gross claims table
        guess_claim (pd.DataFrame): guessed claims based on the known
            growth rate
        nonadj_table (pd.DataFrame): claims/premiums/wealth with inflation
        cpi_table (pd.DataFrame): CPI data of the U.S.

    """
    # The claim tables are recomputed here, overwriting the passed
    # arguments.
    known_claim = _clam_fil()
    growth_mean = _known_growth(known_claim)
    guess_claim = _guess_clam(known_claim, growth_mean)
    clam_olt = _outlier_clam(known_claim, guess_claim)
    clam_olt.to_csv(ppj('OUT_DATA', 'claims_outlier.csv'), index=False,
                    sep=',')

    cpi_adj = cpi_adjust(nonadj_table, cpi_table)
    cpi_adj.to_csv(ppj('OUT_DATA', 'cpi_adjust.csv'), index=False, sep=',')

    stab = five_moving(nonadj_table, cpi_table)
    stab.to_csv(ppj('OUT_DATA', 'recent_table.csv'), index=False, sep=',')

    data_in_paper = raw_dict['paper_table']
    data_in_paper.to_csv(ppj('OUT_DATA', 'szpiro_table.csv'), index=False,
                         sep=',')

def main():
    # Read in both final datasets, containing all election and football
    # data.
    election_df = pd.read_csv(ppj('OUT_DATA_ELEC', 'elections_final.csv'),
                              low_memory=False)
    game_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'games_final.csv'),
                          low_memory=False)

    # Drop NaN values.
    election_df.dropna(subset=['elec_postal'], inplace=True)
    game_df.dropna(subset=['home_postal'], inplace=True)

    # Merge dataframes according to postal code and year column.
    final_df = pd.merge(election_df, game_df,
                        left_on=['elec_postal', 'elec_year'],
                        right_on=['home_postal', 'fb_year'])

    # Compute geodistance and keep matches within 20km.
    final_df['geo_dist'] = [
        get_geo_distance(final_df, x) for x in range(len(final_df))
    ]
    final_df = final_df[final_df['geo_dist'] < 20]

    # Compute time distance and keep matches within 14 days.
    final_df['time_dist'] = get_time_distance(final_df)
    final_df = final_df[final_df['time_dist'].between(0, 14, inclusive=True)]

    # Group by election id and date, which results in the final dataframe.
    final_df = final_df.groupby(['elec_off_name',
                                 'elec_id']).mean().reset_index()

    # Save to csv.
    final_df.to_csv(ppj('OUT_DATA', 'elections_games_final.csv'),
                    index=False)

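# ``get_geo_distance`` is defined elsewhere in the project. Given the
# longitude/latitude columns merged in above, a row-wise great-circle
# distance in kilometres could be computed with a standard haversine formula;
# the column names in this sketch are assumptions.
import math


def get_geo_distance_stub(df, row):
    """Hypothetical stand-in: haversine distance (km) between election
    office and home club for one row of *df*."""
    lat1, lon1 = df.loc[row, 'elec_lat'], df.loc[row, 'elec_long']
    lat2, lon2 = df.loc[row, 'home_lat'], df.loc[row, 'home_long']
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = (math.sin(dphi / 2) ** 2
         + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2) ** 2)
    return 2 * 6371 * math.asin(math.sqrt(a))  # Earth radius ~6371 km.
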
def plot_rich_poor(df_sorted):
    for c in range(2, 5, 2):
        plt.plot(
            df_sorted["time"],
            df_sorted.T.iloc[c],
            "--",
            label="Poor",
            color="orange",
            linewidth=2,
            scalex=False,
        )
        plt.plot(df_sorted["time"], df_sorted.T.iloc[c + 1], "--",
                 label="Rich", color="blue")
        plt.legend(loc=8)
        plt.xticks(df_sorted["time"][::8], rotation=70)
        plt.xlabel("Time")
        plt.ylabel("Real Consumption")

        # Save plot and clean figure.
        if c == 2:
            plt.savefig(ppj("OUT_FIGURES", "agg_rich_vs_poor"),
                        bbox_inches="tight")
        else:
            plt.savefig(ppj("OUT_FIGURES", "het_rich_vs_poor"),
                        bbox_inches="tight")
        plt.clf()

def main():
    # Read in combined election csv.
    elec_df = pd.read_csv(ppj('OUT_DATA_ELEC', 'elections_combined.csv'),
                          low_memory=False)

    # Election office name plus municipality name as search name.
    srch_term_list = get_srch_term_list(elec_df)

    # Google Maps search via multiprocessing.
    dict_list = []
    with mp.Pool() as pool:
        out = pool.map(gmaps_elec_offices, srch_term_list)
        dict_list.extend(out)

    # Dicts to dataframe.
    long_lat_df = pd.DataFrame(dict_list)
    long_lat_df.to_csv(ppj('OUT_DATA_ELEC', 'elec_off_longlat.csv'),
                       index=False)

    # Merge latitude and longitude data to combined election csv.
    elec_final_df = pd.merge(elec_df, long_lat_df, how='left',
                             on='srch_term')
    elec_final_df.to_csv(ppj('OUT_DATA_ELEC', 'elections_final.csv'),
                         index=False)

    # Final data without postal ballots.
    elec_final_df = elec_final_df[elec_final_df['postal_vote'] == 0]
    elec_final_df.to_csv(ppj('OUT_DATA_ELEC',
                             'elections_final_wo_postal.csv'), index=False)

def main():
    # Load game URL data and get unique ID list.
    game_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'game_urls.csv'))
    mtchday_list = game_df['mtchday_id'].unique().tolist()

    # Run multiprocessed scraping by matchday ID.
    for mtchday in mtchday_list:
        mp_scraping(mtchday, game_df)

    # Create .txt file to indicate end of scraping process.
    open(ppj('OUT_DATA_FOOTBALL_CSV', 'scraping_finished.txt'), 'a').close()

def save_report(rep_dict, data_name, h_test, level):
    """Save the reports in text files.

    Args:
        rep_dict (dict): regression results to be reported.
        data_name (str): name of tables (see **wscript** file)
        h_test (int): the hypothesized value of h to test against (saved in
            **values_in_interest.json**)
        level (float64): ((1 - level) * 100)% is the significance level.

    """
    data = pd.read_csv(ppj('OUT_DATA', '{}.csv'.format(data_name)))
    y, x = gen_xy(data, h_test)
    with open(ppj('OUT_ANALYSIS',
                  'report_{}.txt'.format(data_name)), 'w') as text:
        for key in y:
            if np.absolute(rep_dict['{}_th0'.format(key)]) \
                    < rep_dict['tstat']:
                text.write(
                    'For {} ({}):\n'
                    'h = {} is not significantly different from {}, '
                    'with t-value {} ({}) given significance level {}%.\n'
                    '1st estimator is {} (t={})\n'
                    '2nd estimator is {} (t={})\n'
                    'R^2 is {}\n'
                    'c_p = {}\n'
                    '{} <= c_p <= {} (confidence band)\n'
                    '\n'.format(
                        data_name, key,
                        rep_dict['{}_hhat'.format(key)],
                        rep_dict['htest'],
                        rep_dict['{}_th0'.format(key)].round(4),
                        rep_dict['tstat'].round(2),
                        int((1 - level) * 100),
                        rep_dict['{}_ahat'.format(key)].round(6),
                        rep_dict['{}_ta'.format(key)],
                        rep_dict['{}_mhat'.format(key)].round(6),
                        rep_dict['{}_tm'.format(key)],
                        rep_dict['{}_RS'.format(key)],
                        rep_dict['{}_rra'.format(key)],
                        rep_dict['{}_lb'.format(key)],
                        rep_dict['{}_ub'.format(key)]))
            else:
                text.write(
                    'For {} ({}):\n'
                    'h = {} is likely not {} with the t-value {} ({})\n'
                    '\n'.format(
                        data_name, key,
                        rep_dict['{}_hhat'.format(key)],
                        rep_dict['htest'],
                        rep_dict['{}_th0'.format(key)].round(4),
                        rep_dict['tstat'].round(2)))

def main():
    # Read in relevant files.
    elec_mun_df = pd.read_csv(ppj('OUT_DATA_ELEC', 'election_mun.csv'))
    elec_url_df = pd.read_csv(ppj('OUT_DATA_ELEC', 'election_data.csv'))

    # Merge files containing election and municipal information.
    elec_df = pd.merge(elec_url_df, elec_mun_df, how='left', on='mun_url')

    # Create election id.
    elec_df = create_election_id(elec_df)

    # Save as csv.
    elec_df.to_csv(ppj('OUT_DATA_ELEC', 'election_id_data.csv'),
                   index=False)

def expand_voting_files(elec_master_df):
    """Download all CSV files from the corresponding download URL.

    Each file is expanded by columns containing ID, municipality name,
    voting level, and state information. Further, a list containing all
    occurring column names is created.

    """
    # List to store all column names.
    colnames_list = ['']

    # Loop through download urls.
    dwnld_url_list = elec_master_df['dwnld_url'].tolist()
    for i, export_url in enumerate(dwnld_url_list):
        # Create file name from election ID.
        file_name = elec_master_df.loc[i, 'elec_id']

        # Download file to separate folder.
        urlretrieve(export_url,
                    ppj('OUT_DATA_ELEC_CSV', '{}.csv'.format(file_name)))

        # Read in downloaded file.
        temp_df = pd.read_csv(ppj('OUT_DATA_ELEC_CSV',
                                  '{}.csv'.format(file_name)), sep=';')

        # Convert column names to ASCII.
        temp_df.columns = [unidecode(x).lower() for x in temp_df.columns]
        temp_df.rename(columns={'name': 'elec_off_name'}, inplace=True)

        # Expand election results with election information.
        temp_df['elec_id'] = elec_master_df.loc[i, 'elec_id']
        temp_df['mun_clearname'] = elec_master_df.loc[i, 'mun_clearname']
        temp_df['state'] = elec_master_df.loc[i, 'state']
        temp_df['elec_year'] = elec_master_df.loc[i, 'elec_year']
        temp_df['elec_date'] = elec_master_df.loc[i, 'elec_date']

        # Get columns of temp_df and append those to overall columns list.
        temp_columns = list(temp_df)
        for column in temp_columns:
            if column not in colnames_list:
                colnames_list.append(column)

        # Overwrite original csv file.
        temp_df.to_csv(ppj('OUT_DATA_ELEC_CSV',
                           '{}.csv'.format(file_name)), index=False)

    # Create .txt file to indicate finishing of download process.
    open(ppj('OUT_DATA_ELEC_CSV', 'election_dwnld_finished.txt'),
         'a').close()

    return colnames_list

def create_heatmap_nan():
    """Create nullity correlation heatmap and save the plot to
    ``heatmap_nan.png`` in the "OUT_FIGURES" directory.

    """
    index_category = pd.Index(new_labels)
    sorted_by_category = gate_plot[index_category]
    heatmap_nan = msno.heatmap(sorted_by_category, vmin=0, cmap="OrRd")
    heatmap_nan.get_xticklabels()[16].set_fontweight("bold")
    heatmap_nan.get_yticklabels()[16].set_fontweight("bold")

    # When plotting heatmaps with seaborn (on which the "missingno" library
    # builds), the first and the last row are cut in half because of a
    # regression in matplotlib between versions 3.1.0 and 3.1.1.
    # We correct it this way:
    bottom, top = heatmap_nan.get_ylim()
    heatmap_nan.set_ylim(bottom + 0.5, top - 0.5)

    positions = np.array([1, 3, 5, 8, 10, 14, 16])
    labels = [
        "BACKGROUND",
        "HOUSEHOLD",
        "FINANCE",
        "HEALTH",
        "EMPLOYMENT",
        "PERSONALITY",
    ]
    heatmap_nan.hlines(positions, xmin=0, xmax=positions, lw=8,
                       color="white")
    for position, label in zip(positions, labels):
        heatmap_nan.text(position + 0.35, position + 0.35, label,
                         fontsize=14)
    heatmap_nan.figure.savefig(ppj("OUT_FIGURES", "heatmap_nan.png"),
                               bbox_inches="tight")

def main():
    # Read in final dataset, containing all games and player data.
    game_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'games_combined.csv'))
    unique_plyrs = get_unique_plyrs(game_df)

    # Scraping via multiprocessing.
    dict_list = []
    with mp.Pool() as pool:
        out = pool.map(get_age_nat, unique_plyrs)
        dict_list.extend(out)
    plyr_df = pd.DataFrame(dict_list)  # Dicts to df.

    # Save player url, age and nationality in separate csv file.
    plyr_df.to_csv(ppj('OUT_DATA_FOOTBALL', 'plyr_nationality.csv'),
                   index=False)

def make_dot_file(ctx):
    # Lazy load module.
    from bld.project_paths import project_paths_join as ppj

    # Select task groups, dropping the first, which holds the project paths.
    groups = [group for group in ctx.groups if len(group) != 0]
    groups = groups[1:]

    # Create the DAG.
    dag = digraph()
    for group in groups:
        for taskgen in group:
            name = taskgen.get_name()
            add_nodes(dag, [name])

            # Add dependencies.
            deps = Utils.to_list(getattr(taskgen, "deps", []))
            for dep in deps:
                dep = Path(dep).name
                add_nodes(dag, [dep])
                add_edges(dag, [(dep, name)])

            # Write targets.
            targets = Utils.to_list(getattr(taskgen, "target", []))
            for target in targets:
                target = Path(target).name
                add_nodes(dag, [target])
                add_edges(dag, [(name, target)])

    dag = apply_styles(dag, styles)

    # Save DAG.
    dag.render(ppj("OUT_FIGURES", "dag"))

def main():
    # Load previously scraped matchday files, containing municipal URLs.
    elec_df = pd.read_csv(ppj("OUT_DATA_ELEC", "election_mun.csv"))

    # List to store resulting election dictionaries.
    dict_list = []

    # Multiprocessed scraping.
    with mp.Pool() as pool:
        out = pool.map(scrape_elec_data, elec_df.mun_url.values)
        out = list(itertools.chain.from_iterable(out))  # Flatten list.
        dict_list.extend(out)  # Extend dictionary list.

    # Create dataframe from dictionaries and save as csv.
    df = pd.DataFrame(dict_list)
    df.to_csv(ppj("OUT_DATA_ELEC", "election_data.csv"), index=False)

def plot_agg_het(df_sorted):
    for t in [2, 3]:
        plt.plot(
            df_sorted["time"],
            df_sorted.T.iloc[t],
            "--",
            label="Aggregate CPI",
            color="orange",
            linewidth=2,
            scalex=False,
        )
        plt.plot(
            df_sorted["time"],
            df_sorted.T.iloc[t + 2],
            "--",
            label="Heterogeneous CPI",
            color="blue",
        )
        plt.legend(loc=8)
        plt.xticks(df_sorted["time"][::8], rotation=70)
        plt.xlabel("Time")
        plt.ylabel("Real Consumption")

        # Save plot and clean figure.
        plt.savefig(
            ppj("OUT_FIGURES",
                "comparison_agg_vs_het_" + df_sorted.columns[t][-7:-4]),
            bbox_inches="tight",
        )
        plt.clf()

def plot_transition():
    """Plot results for transitional dynamics."""
    # Load results.
    plot_x = np.arange(duration_transition + 1)
    plot_y = np.array([
        results_transition["aggregate_capital"],
        results_transition["aggregate_labor"],
    ])

    # Create figure and plot.
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    line1 = ax1.plot(plot_x, plot_y[0, :], color="tab:blue", label="assets")
    ax2 = ax1.twinx()
    line2 = ax2.plot(plot_x, plot_y[1, :], color="tab:orange",
                     label="human capital")
    lines = line1 + line2
    labels = [line.get_label() for line in lines]
    ax1.legend(lines, labels, loc=0)
    ax1.set(xlabel="transition period", ylabel="assets", ybound=[0, 15.0])
    ax2.set(ylabel="human capital", ybound=[0, hc_max])

    # Save figure.
    fig.savefig(ppj("OUT_FIGURES", "results_transition.png"))

def regression_tables(macro_ind):
    """Create a regression table as LaTeX for each variable in *macro_ind*.

    Args:
        macro_ind: list of names of the variables of interest, as strings.

    """
    for ind in macro_ind:
        res1 = smf.ols(formula=ind + " ~ fraction", data=df).fit()
        res2 = smf.ols(formula=ind + " ~ mean", data=df).fit()
        res3 = smf.ols(formula=ind + " ~ var", data=df).fit()
        # The original joint specification repeated 'var'; all three
        # sentiment measures are presumably intended.
        res4 = smf.ols(formula=ind + " ~ fraction + mean + var",
                       data=df).fit()

        with open(ppj("OUT_ANALYSIS", ind + "_on_sentiment.txt"),
                  "w") as textfile:
            textfile.write(
                summary_col(
                    [res1, res2, res3, res4],
                    stars=True,
                    float_format="%0.2f",
                    model_names=["\n(0)", "\n(1)", "\n(2)", "\n(3)"],
                    info_dict={
                        "N": lambda x: "{:d}".format(int(x.nobs)),
                        "R2": lambda x: f"{x.rsquared:.2f}",
                    },
                ).as_latex())

def expected_predict():
    out = {}
    for i in DB:
        with open(ppj("OUT_MODEL_SPECS", f"{i}_rid.json")) as f:
            data = json.load(f)
        out[f"{i}"] = np.array(data)
    return out

def extract_sources():
    df = read_parquet_in_date_chunks(ppj("OUT_DATA", "tweets-cleaned"))
    # Take the host part of each URL and count occurrences; ``regex=False``
    # so the dot in "www." is matched literally.
    sources = (
        df.urls.str.split("/", n=3, expand=True)[2]
        .str.replace("www.", "", regex=False)
        .value_counts()
    )
    return sources

def plot_locations(locations_by_round, model_name):
    """Plot the distribution of agents after cycle_num rounds of the loop."""
    n_cycles = len(locations_by_round)
    nrows = int(np.ceil(n_cycles / 2 - 0.01))
    figsize = (2 * 3, nrows * 2)
    fig, axes = plt.subplots(nrows=nrows, ncols=2, figsize=figsize)
    fig.subplots_adjust(
        left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.25,
        hspace=0.25
    )
    for item, ax in np.ndenumerate(axes):
        n_cycle = item[0] * 2 + item[1]
        if n_cycle == n_cycles:
            # Remove last element if number of cycles is uneven.
            fig.delaxes(ax)
            break
        locs = locations_by_round[n_cycle]
        ax.set_title("Cycle {}".format(n_cycle))
        # Booleans, not the former string values "off", which are truthy
        # and leave the labels on.
        ax.tick_params(labelbottom=False, labelleft=False)
        ax.set_facecolor("azure")
        ax.plot(
            locs[0][:, 0], locs[0][:, 1], "o", markerfacecolor="orange",
            **PLOT_ARGS
        )
        ax.plot(locs[1][:, 0], locs[1][:, 1], "o", markerfacecolor="green",
                **PLOT_ARGS)
    fig.savefig(ppj("OUT_FIGURES", "schelling_{}.png".format(model_name)))

def plot_locations(locations_by_round, model_name):
    """Plot the distribution of agents after cycle_num rounds of the loop."""
    n_cycles = len(locations_by_round)
    nrows = int(np.ceil(n_cycles / 2 - 0.01))
    figsize = (2 * 3, nrows * 2)
    fig, axes = plt.subplots(nrows=nrows, ncols=2, figsize=figsize)
    fig.subplots_adjust(
        left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.25,
        hspace=0.25
    )
    for item, ax in np.ndenumerate(axes):
        n_cycle = item[0] * 2 + item[1]
        if n_cycle == n_cycles:
            # Remove last element if number of cycles is uneven.
            fig.delaxes(ax)
            break
        locs = locations_by_round[n_cycle]
        ax.set_title(f"Cycle {n_cycle}")
        ax.tick_params(labelbottom=False, labelleft=False)
        ax.set_facecolor("azure")
        ax.plot(
            locs[0][:, 0], locs[0][:, 1], "o", markerfacecolor="orange",
            **PLOT_ARGS
        )
        ax.plot(locs[1][:, 0], locs[1][:, 1], "o", markerfacecolor="green",
                **PLOT_ARGS)
    fig.savefig(ppj("OUT_FIGURES", f"schelling_{model_name}.png"))

def main():
    # Main URL to start from.
    main_url = 'https://www.fupa.net'

    # Regions to scrape.
    regions = ['mittelrhein']
    # regions = ['mittelrhein', 'niederrhein', 'ruhrgebiet', 'westrhein']

    # Initialize dictionary and pandas dataframe to store league data.
    matchday_dict = dict()
    matchday_df = pd.DataFrame()

    # Loop through all predefined regions to get the districts within each
    # region.
    for region in regions:
        district_url_list = get_district_list(region, main_url,
                                              matchday_dict)

        # Loop through districts to get leagues within each district.
        for district_url in district_url_list:
            leagues_url_list = get_league_list(district_url, matchday_dict,
                                               main_url)

            # Loop through each single league to get the list of all past
            # seasons within a league.
            for league_url in leagues_url_list:
                seasons_list = get_seaons_list(league_url, matchday_dict)

                # Get matchday url for all seasons within a league, from
                # which to start scraping.
                for season in seasons_list:
                    matchday_df = get_matchday_url(season, matchday_dict,
                                                   matchday_df)

    # Save matchday dataframe as .csv file.
    matchday_df.to_csv(ppj('OUT_DATA_FOOTBALL', 'matchday_data.csv'))

def figure_maker_func(turkstat_data, analysis_data, freq):
    """Generate figures. Depending on the frequency of the data, an
    appropriate title is assigned.

    Args:
        turkstat_data (pd.DataFrame): the TurkStat dataset containing
            percentage changes, with dates on the index
        analysis_data (pd.DataFrame): the dataset containing percentage
            price changes, with dates on the index
        freq (str): frequency of the data

    Returns:
        Saves the figure under a frequency-specific file name.

    """
    if freq == "yearly":
        title = "Percentage Price Change Compared to the Same Month Last Year"
    elif freq == "monthly":
        title = "Monthly Percentage Price Change"

    plt.figure(figsize=[14, 5])
    plt.title(title)
    plt.ylabel("Percentage Change (%)")
    plt.plot(turkstat_data, marker="s", color="blue", linestyle="dashed")
    plt.plot(analysis_data, marker="s", color="red")
    plt.legend(labels=["Accommodation Services (TurkStat)",
                       "Analysis (Airbnb)"])
    plt.grid()
    plt.savefig(ppj("OUT_FIGURES", f"{freq}_change_figure.png"))

def create_logistic_dataframe(name_df):
    """Check the missing-values mechanism of dataset ``gate_final.csv`` via
    logistic regression and save the results to ``logistic_df.csv``.

    """
    gate_logistic = name_df.drop(
        [
            "gateid",
            "hhincome",
            "hhincome_w2",
            "completed_w2",
            "site",
            "philadelphia",
            "white",
            "hhincome_50_74k",
            "worked_for_relatives_friends_se",
        ],
        axis=1,
    )
    results_cov = generate_regression_output(
        gate_logistic.drop("missing_out", axis=1), "missing_cov",
        type="Logit")
    results_out = generate_regression_output(gate_logistic, "missing_out",
                                             type="Logit")
    logistic_df = pd.concat(
        [results_cov, results_out],
        axis=1,
        keys=["Missing in covariates", "Missing in outcome"],
        sort=False,
    )
    logistic_df.to_csv(ppj("OUT_ANALYSIS", "logistic_df.csv"))

def setup_agents(model):
    """Load the simulated initial locations and return a list that holds
    all agents.

    """
    initial_locations = np.loadtxt(ppj("OUT_DATA", "initial_locations.csv"),
                                   delimiter=",")
    initial_locations = initial_locations.reshape(2, model["n_types"], 30000)

    agents = []
    for typ in range(model["n_types"]):
        for i in range(model["n_agents_by_type"][typ]):
            agents.append(
                Agent(
                    typ=typ,
                    initial_location=initial_locations[typ, :, i],
                    n_neighbours=model["n_neighbours"],
                    require_same_type=model["require_same_type"],
                    max_moves=model["max_moves"],
                )
            )
    return agents

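# ``Agent`` is defined elsewhere in the project. A hypothetical stub showing
# the constructor contract assumed by ``setup_agents`` above; the
# ``location`` attribute is the one read by the run loop later in this
# section, everything else is an assumption.
class AgentStub:
    """Hypothetical stand-in for the Schelling ``Agent`` class."""

    def __init__(self, typ, initial_location, n_neighbours,
                 require_same_type, max_moves):
        self.typ = typ
        self.location = initial_location
        self.n_neighbours = n_neighbours
        self.require_same_type = require_same_type
        self.max_moves = max_moves
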
        n_cycle = item[0] * 2 + item[1]
        if n_cycle == n_cycles:
            # Remove last element if number of cycles is uneven.
            fig.delaxes(ax)
            break
        locs = locations_by_round[n_cycle]
        ax.set_title("Cycle {}".format(n_cycle))
        ax.tick_params(labelbottom=False, labelleft=False)
        ax.set_facecolor("azure")
        ax.plot(
            locs[0][:, 0], locs[0][:, 1], "o", markerfacecolor="orange",
            **PLOT_ARGS
        )
        ax.plot(locs[1][:, 0], locs[1][:, 1], "o", markerfacecolor="green",
                **PLOT_ARGS)

    fig.savefig(ppj("OUT_FIGURES", "schelling_{}.png".format(model_name)))


if __name__ == "__main__":
    model_name = sys.argv[1]
    model = json.load(
        open(ppj("IN_MODEL_SPECS", model_name + ".json"), encoding="utf-8")
    )

    # Load locations after each round.
    with open(
        ppj("OUT_ANALYSIS", "schelling_{}.pickle".format(model_name)), "rb"
    ) as in_file:
        locations_by_round = pickle.load(in_file)

    plot_locations(locations_by_round, model_name)

def save_data(sample):
    sample.tofile(ppj("OUT_DATA", "initial_locations.csv"), sep=",")

import json
import sys

import pandas as pd
from pandas import DataFrame

from bld.project_paths import project_paths_join as ppj
from skillmodels import SkillModel

if __name__ == '__main__':
    model_name, dataset_name, estimator = sys.argv[1:4]

    # Load the model dict from a json file in src.model_specs.
    with open(ppj('IN_MODEL_SPECS', '{}.json'.format(model_name))) as j:
        model_dict = json.load(j)

    # Load the dataset from a dta file in bld.out.data.
    dataset = pd.read_stata(ppj('OUT_DATA', '{}.dta'.format(dataset_name)))

    # Create an instance of SkillModel.
    mod = SkillModel(model_dict=model_dict, dataset=dataset,
                     estimator=estimator, model_name=model_name,
                     dataset_name=dataset_name)

    # Call its fit method to estimate the model.
    res = mod.fit()

    # Create a pandas DataFrame containing the parameters, standard errors,
    # p-values and t-values.
    df = DataFrame(data=res.params, columns=['params'],
                   index=res.param_names)
    df['se'] = res.bse
    df['pvalues'] = res.pvalues
    df['tvalues'] = res.tvalues

import sys

import pandas as pd

from bld.project_paths import project_paths_join as ppj

if __name__ == '__main__':
    dataset_name = sys.argv[1]
    data = pd.read_stata(ppj('IN_DATA', '{}.dta'.format(dataset_name)))

    # Shift ids and periods to be zero-based and drop unused columns.
    data['id'] = data['caseid'] - 1
    data['period'] = data['period'] - 1
    data = data.drop(['dy1', 'dy2', 'dy3', 'dy4', 'dy5', 'dy6', 'dQ1'],
                     axis=1)
    data.to_stata(ppj("OUT_DATA", "{}_ready.dta".format(dataset_name)))

        bottom=0.05, top=0.95, wspace=0.25, hspace=0.25
    )

    for item, ax in np.ndenumerate(axes):
        n_cycle = item[0] * 2 + item[1]
        if n_cycle == n_cycles:
            # Remove last element if number of cycles is uneven.
            fig.delaxes(ax)
            break
        locs = locations_by_round[n_cycle]
        ax.set_title("Cycle {}".format(n_cycle))
        ax.tick_params(labelbottom=False, labelleft=False)
        # ``set_axis_bgcolor`` was removed in matplotlib 2.0; use
        # ``set_facecolor`` instead.
        ax.set_facecolor("azure")
        ax.plot(locs[0][:, 0], locs[0][:, 1], "o",
                markerfacecolor="orange", **PLOT_ARGS)
        ax.plot(locs[1][:, 0], locs[1][:, 1], "o",
                markerfacecolor="green", **PLOT_ARGS)

    fig.savefig(ppj("OUT_FIGURES", "schelling_{}.png".format(model_name)))


if __name__ == "__main__":
    model_name = sys.argv[1]
    model = json.load(
        open(ppj("IN_MODEL_SPECS", model_name + ".json"), encoding="utf-8"))

    # Load locations after each round.
    with open(ppj("OUT_ANALYSIS",
                  "schelling_{}.pickle".format(model_name)),
              "rb") as in_file:
        locations_by_round = pickle.load(in_file)

    plot_locations(locations_by_round, model_name)

    ax.set_ylim(0, y_max)
    ax.set_autoscale_on(False)
    ax.legend([axes[col] for col in colnames], colnames, fontsize=14)
    plt.title(
        title, fontsize=28, y=1 + 2.5 / nr_rows, x=-0.6, loc="left",
        weight="bold"
    )
    plt.savefig(path, bbox_inches="tight", pad_inches=0.8)
    plt.close(fig)


if __name__ == "__main__":
    model, dataset = sys.argv[1:3]
    true_path = ppj("LIBRARY", "true_{}_results.csv")
    true_df = pd.read_csv(true_path.format(model), index_col="index")
    estimated_path = ppj("OUT_ANALYSIS", "{}_{}/results_df.csv")
    estimated_df = pd.read_csv(estimated_path.format(model, dataset),
                               index_col="index")

    df = pd.concat([estimated_df, true_df], axis=1, sort=False)
    df["Fortran"] = df["chs_params"] - df["true_value"]
    df["Python"] = df["params"] - df["true_value"]

    plot_results_comparison(
        df=df,
        colnames=["Python", "Fortran"],
        path=ppj("OUT_ANALYSIS",
                 "{}_{}/comparison_plot".format(model, dataset)),
        title=(
            "Comparison of Python and Fortran results\nBars show "
            "deviation from true population Parameters"
        ),
    )

    [.925, 0.04, 0.75],
    [.925, 0.04, 0.75]],
    np.zeros((3, 0))]

true_loadings = np.arange(start=0.5, stop=1.85, step=0.05)
true_intercepts = np.arange(start=-0.665, stop=0.665, step=0.05)
true_X_zero = np.array([10, 15, 30])
true_cov_matrix = np.array([[2.0, 0.05, 0.1],
                            [0.05, 4.0, 0.0],
                            [0.1, 0.0, 9.0]])
nobs = 8000
base_meas_sd = 0.7
base_trans_sd = 1.0
true_meas_sd = true_loadings * base_meas_sd
true_trans_sd = [[0.4, 0.5], [0.4, 0.5], [0.4, 0.5]]

large_df = generate_test_data(
    nobs=nobs, factors=factor_names, periods=periods,
    included_positions=included_positions, meas_names=meas_names,
    initial_mean=true_X_zero, initial_cov=true_cov_matrix,
    intercepts=true_intercepts, loadings=true_loadings,
    meas_sd=true_meas_sd, gammas=true_gammas, trans_sd=true_trans_sd)

out_path = ppj('OUT_DATA', 'ns_translog_data.dta')
large_df.to_stata(out_path)

            if not (agent.location == old_location).all():
                someone_moved = True
        _store_locations_by_round(locations_by_round[-1], agents)

        # We are done if everybody is happy.
        if not someone_moved:
            break

    if someone_moved:
        logging.info(
            "No convergence achieved after {} iterations".format(
                model["max_iterations"]))

    return locations_by_round


if __name__ == "__main__":
    model_name = sys.argv[1]
    model = json.load(
        open(ppj("IN_MODEL_SPECS", model_name + ".json"), encoding="utf-8"))

    logging.basicConfig(
        filename=ppj("OUT_ANALYSIS", "log",
                     "schelling_{}.log".format(model_name)),
        filemode="w",
        level=logging.INFO,
    )
    np.random.seed(model["rng_seed"])
    logging.info(model["rng_seed"])

    # Load initial locations and set up agents.
    agents = setup_agents(model)

    # Run the main analysis.
    locations_by_round = run_analysis(agents, model)

    # Store list with locations after each round.
    with open(ppj("OUT_ANALYSIS",
                  "schelling_{}.pickle".format(model_name)),
              "wb") as out_file:
        pickle.dump(locations_by_round, out_file)
