Example #1
def main():
    # Open the final football csv file.
    final_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'games_combined.csv'),
                           low_memory=False)

    # Get unique set of clubs for both home and away teams.
    unique_home = final_df['home_club'].unique().tolist()
    unique_away = final_df['away_club'].unique().tolist()
    unique_clubs = unique_home + unique_away  # Combined list.
    unique_clubs = [c for c in set(unique_clubs) if pd.notna(c)]  # Unique union, NaN dropped.

    unique_clubs = unique_clubs[0:150]  # !!Subset!!

    # Scraping via multiprocessing.
    dict_list = []
    with mp.Pool() as pool:
        out = pool.map(get_geodata, unique_clubs)
        dict_list.extend(out)

    # Dicts to dataframe.
    club_longlat_df = pd.DataFrame(dict_list)

    # Store club coordinates as csv.
    club_longlat_df.to_csv(ppj('OUT_DATA_FOOTBALL', 'club_longlat.csv'),
                           index=False)
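The worker get_geodata is not shown on this page. As a rough sketch only: a pool.map-compatible function that returns one dict per club, assuming the geopy geocoder; the coordinate keys are illustrative, and only 'home_club' is implied by the merge in Example #4.

from geopy.geocoders import Nominatim


def get_geodata(club_name):
    """Hypothetical sketch: geocode a club name into one result dict."""
    geolocator = Nominatim(user_agent="football-project")
    location = geolocator.geocode(club_name)
    return {
        'home_club': club_name,
        'home_lat': location.latitude if location else None,
        'home_long': location.longitude if location else None,
    }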
Example #2
def extract_true_factors():
    """Merge tables generated from simulated data, where columns 'fac1',
    'fac2', 'fac3' from **table_2** contain the factor ids in **table_1**.
    
    The output is one pandas dataframe (saved as pickle) that contains the
    multiindex (caseid, period) and the three (true) factors.
    
    """

    # Read in dataframes from Stata files.
    factor = pd.read_stata(
        ppj("OUT_DATA", "tables", "data_table_1.dta"),
        index_col='factor_id',
        columns=['factor_id', 'true_fac'])
    f_nr = ['fac1', 'fac2', 'fac3']
    case = pd.read_stata(
        ppj("OUT_DATA", "tables", "data_table_2.dta"),
        columns=f_nr + ['caseid', 't'])
    case.set_index(['caseid', 't'], inplace=True)

    # Join data at indices, generate one dataframe and save as pickle.
    for f in f_nr:
        case = case.join(factor, on=f, rsuffix='_' + f)

    # Save the joined true factors as pickle (output file name assumed).
    case.to_pickle(ppj("OUT_DATA", "true_factors.pkl"))
Example #3
def mp_scraping(mtchday, game_df):
    """
    Checks wheter CSV file for specified *mtchday* ID already exists, if not
    games corresponding to this ID are scraped via multiprocessing. The game
    URLs are stored in *game_df* connecting matchday ID and game URLs. The 
    resulting dataframe is saved as a CSV file.
    """

    # Check wheter matchday games are already scraped.
    if not os.path.isfile(
            ppj('OUT_DATA_FOOTBALL_CSV', '{}.csv'.format(mtchday))):
        temp_urls = game_df[(game_df['mtchday_id'] == mtchday) & (
            game_df['doable'] == 1)]['game_url'].unique().tolist()

        # Scraping via multiprocessing.
        dict_list = []
        with mp.Pool() as pool:
            out = pool.map(scrape_game_data, temp_urls)
            dict_list.extend(out)  # To list.

        # Dicts to dataframe and save as CSV.
        df = pd.DataFrame(dict_list)
        df['mtchday_id'] = mtchday
        df.to_csv(ppj('OUT_DATA_FOOTBALL_CSV', '{}.csv'.format(mtchday)),
                  index=False)
Example #4
def main():
    # Load game-, player- and geo-data
    plyr_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'plyr_nationality.csv'))
    longlat_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'club_longlat.csv'))
    games_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'games_combined.csv'),
                           low_memory=False)

    # Merge all files to final csv.
    # Merge geo- and game data.
    final_df = pd.merge(games_df, longlat_df, how='left', on='home_club')

    # Merge geo- and game data with player nationality data.
    final_df = merge_nationality(final_df, plyr_df)

    # Get relative nationality shares for each team.
    final_df = relative_nationality(final_df)

    # Split date into day, month and year and store in separate columns.
    date_data = r'(?P<fb_day>[^.]+)\.(?P<fb_month>[^.]+)\.(?P<fb_year>[^.]+)'
    final_df = pd.concat(
        [final_df, final_df['fb_date'].str.extract(date_data).astype(int)],
        axis=1)
    final_df['fb_year'] = final_df['fb_year'] + 2000  # To four digit integer.

    # Save as CSV file.
    final_df.to_csv(ppj('OUT_DATA_FOOTBALL', 'games_final.csv'), index=False)
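As an aside, the named groups in date_data split a dd.mm.yy string into three columns that astype(int) then converts; a self-contained illustration with a made-up date:

import pandas as pd

dates = pd.Series(['24.08.19'])
pattern = r'(?P<fb_day>[^.]+)\.(?P<fb_month>[^.]+)\.(?P<fb_year>[^.]+)'
print(dates.str.extract(pattern).astype(int))
#    fb_day  fb_month  fb_year
# 0      24         8       19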
Example #5
def save_reg(reg_list, data_name):
    """
    Save the regression results for premiums and claims data, respectively.

    Args:
        reg_list (list): a list consisting of the results of all regressions

        data_name (str): name of tables (see **wscript** file)

    """
    for idx in [0, 2, 5, 7]:
        reg_list[idx] = round(reg_list[idx], 6)
    for idx in [4, 9]:
        reg_list[idx] = round(reg_list[idx], 3)

    result_df = pd.DataFrame(reg_list, columns=['results']).T
    result_df.columns = [
        'a', 'ta', 'm', 'tm', 'Rsp', 'b', 'tb', 'n', 'tn', 'Rsq'
    ]
    result_df.index.name = 'type'
    dfs = np.split(result_df, [5], axis=1)
    reg_p = dfs[0].T.rename(columns={'results': 'Premiums Data'}).T
    reg_q = dfs[1].T.rename(columns={'results': 'Claims Data'}).T

    reg_p.to_csv(ppj('OUT_TABLES', '{}_prem_reg.csv'.format(data_name)),
                 index=True,
                 sep=',')
    reg_q.to_csv(ppj('OUT_TABLES', '{}_clam_reg.csv'.format(data_name)),
                 index=True,
                 sep=',')
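np.split with the single split point [5] divides the ten result columns into the first five (premiums) and the last five (claims); a quick illustration of the split semantics:

import numpy as np

arr = np.arange(10).reshape(1, 10)
left, right = np.split(arr, [5], axis=1)
print(left.shape, right.shape)  # (1, 5) (1, 5)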
Example #6
def prepare_data():
    """Merge tables generated from simulated data, where columns 'fac1',
    'fac2', 'fac3' from **table_2** contain the factor ids in **table_1**
    and columns 'x1', 'x2' from **table_2** contain control ids in **table_3**.
    
    The output are one pandas dataframe (saved as pickle) per factor, named
    'meas_facX' (X as 1, 2, 3), that contains the multiindex (caseid, period),
    the two controls, and three measurements.
    
    """

    # Read in dataframes from Stata files.
    factor = pd.read_stata(ppj("OUT_DATA", "tables", "data_table_1.dta"),
                           index_col='factor_id',
                           columns=['factor_id', 'meas1', 'meas2', 'meas3'])
    case = pd.read_stata(ppj("OUT_DATA", "tables", "data_table_2.dta"))
    case.set_index(['caseid', 't'], inplace=True)
    control = pd.read_stata(ppj("OUT_DATA", "tables", "data_table_3.dta"),
                            index_col='cont_id')

    # Join data at indices, generate one dataframe per factor
    # and save as pickle.
    c_nr = ['x1', 'x2']
    for nr, c in enumerate(c_nr):
        case = case.join(control, on=c, rsuffix='_' + str(nr + 1))

    # One dataframe per factor, as described in the docstring
    # (pickle file names assumed).
    for x, f in enumerate(['fac1', 'fac2', 'fac3'], start=1):
        meas = case.join(factor, on=f)
        meas.to_pickle(ppj("OUT_DATA", "meas_fac{}.pkl".format(x)))
Example #7
def itt_analysis_without_controls_as_table(name_df):
    """Load ITT analysis results, without controls, for a version of
    ``gate_final.csv``, format them and save them as .tex tabulars.

    Args:
        name_df (string): name of a version of ``gate_final.csv``.

    Returns:
         None. Results are saved to .tex files.

    """
    complete_no_controls_coeff = pd.read_csv(ppj(
        "OUT_ANALYSIS", name_df + "_no_controls_coeff.csv"),
                                             index_col=0)
    complete_no_controls_summary = pd.read_csv(ppj(
        "OUT_ANALYSIS", name_df + "_no_controls_summary.csv"),
                                               index_col=0)
    complete_no_controls_coeff = complete_no_controls_coeff.reindex(
        pretty_index_dict).rename(pretty_index_dict)
    complete_no_controls_coeff = complete_no_controls_coeff.dropna(how="all")
    complete_no_controls_coeff.to_latex(
        ppj("OUT_TABLES", "table_" + name_df + "_no_controls_coeff.tex"),
        float_format="{:.3g}".format,
        na_rep=" ",
        multicolumn_format="c",
    )
    complete_no_controls_summary.T.to_latex(
        ppj("OUT_TABLES", "table_" + name_df + "_no_controls_summary.tex"),
        float_format="{:.3g}".format,
        na_rep=" ",
        multicolumn_format="c",
    )
Example #8
def itt_analysis_with_controls(name_df, in_dir):
    """Perform OLS regression, with controls, to estimate ITT on a version of
    ``gate_final.csv``.

    Args:
        name_df (string): name of a version of ``gate_final.csv``.
        in_dir (string): the directory in which the version of ``gate_final.csv``
            is stored.

    Returns:
         None. Regression results are saved to .csv files.

    """
    gate_controls = pd.read_csv(ppj(in_dir, name_df + ".csv"))
    gate_controls = gate_controls.drop(
        [
            "gateid",
            "site",
            "completed_w2",
            "missing_cov",
            "missing_out",
            "hhincome",
            "white",
            "hhincome_50_74k",
            "philadelphia",
        ],
        axis=1,
    )
    complete_controls = generate_regression_output(gate_controls,
                                                   "hhincome_w2",
                                                   type="OLS")
    complete_controls[0].to_csv(
        ppj("OUT_ANALYSIS", name_df + "_controls_coeff.csv"))
    complete_controls[1].to_csv(
        ppj("OUT_ANALYSIS", name_df + "_controls_summary.csv"))
Example #9
def welch_df_as_table():
    """Format ``welch_df.csv`` and save the result to ``table_welch.tex``.

    """
    welch_df = pd.read_csv(ppj("OUT_ANALYSIS", "welch_df.csv"),
                           index_col=[0],
                           header=[0, 1])
    welch_df = welch_df.reindex(pretty_index_dict).rename(pretty_index_dict)
    welch_df = welch_df.dropna(how="all")
    welch_df = welch_df.rename(columns=({
        "mean1": "Missing",
        "mean0": "No missing"
    }))
    idx = pd.IndexSlice
    keep_format = [
        "Age",
        "Highest grade achieved",
        "Standardized autonomy index",
        "Standardized risk-tolerance index",
    ]
    rows_to_format = [
        item for item in welch_df.index if item not in keep_format
    ]
    subset = idx[rows_to_format, idx[:, ["Missing", "No missing"]]]
    welch_df = format_as_percentage(welch_df, subset)
    subset_stars = idx[:, idx[:, "p-value"]]
    correction = len(welch_df) - 1
    welch_df = assign_stars(welch_df, subset_stars, correction)
    welch_df.to_latex(
        ppj("OUT_TABLES", "table_welch_df.tex"),
        float_format="{:.3g}".format,
        na_rep=" ",
        multicolumn_format="c",
    )
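Example #10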
def save_data(known_claim, guess_claim, nonadj_table, cpi_table):
    """
    Generate the final tables (including the table in original paper).

    Arg:
        known_claim (pd.DataFrame): known gross claims table

        guess_claim (pd.DataFrame):
        guessed claims based on the known growth rate

        nonadj_table (pd.DataFrame): claims/premiums/wealth with inflation

        cpi_table (pd.DataFrame): CPI data of the U.S.

    """
    known_claim = _clam_fil()
    growth_mean = _known_growth(known_claim)
    guess_claim = _guess_clam(known_claim, growth_mean)
    clam_olt = _outlier_clam(known_claim, guess_claim)
    clam_olt.to_csv(ppj('OUT_DATA', 'claims_outlier.csv'),
                    index=False, sep=',')

    cpi_adj = cpi_adjust(nonadj_table, cpi_table)
    cpi_adj.to_csv(ppj('OUT_DATA', 'cpi_adjust.csv'),
                   index=False, sep=',')

    stab = five_moving(nonadj_table, cpi_table)
    stab.to_csv(ppj('OUT_DATA', 'recent_table.csv'), index=False, sep=',')

    data_in_paper = raw_dict['paper_table']
    data_in_paper.to_csv(
        ppj('OUT_DATA', 'szpiro_table.csv'), index=False, sep=',')
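Example #11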
def main():
    # Read in both final datasets, containing all election and football data.
    election_df = pd.read_csv(ppj('OUT_DATA_ELEC', 'elections_final.csv'),
                              low_memory=False)
    game_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'games_final.csv'),
                          low_memory=False)

    # Drop NaN values.
    election_df.dropna(subset=['elec_postal'], inplace=True)
    game_df.dropna(subset=['home_postal'], inplace=True)

    # Merge dataframes according to postal code and year column.
    final_df = pd.merge(election_df,
                        game_df,
                        left_on=['elec_postal', 'elec_year'],
                        right_on=['home_postal', 'fb_year'])

    # Compute geodistance and keep matches within 20km.
    final_df['geo_dist'] = [
        get_geo_distance(final_df, x) for x in range(len(final_df))
    ]
    final_df = final_df[final_df['geo_dist'] < 20]

    # Compute time distance and keep matches within 14 days.
    final_df['time_dist'] = get_time_distance(final_df)
    final_df = final_df[final_df['time_dist'].between(0, 14, inclusive=True)]

    # Group by election office name and id, which yields the final dataframe.
    final_df = final_df.groupby(['elec_off_name',
                                 'elec_id']).mean().reset_index()

    # Save to csv.
    final_df.to_csv(ppj('OUT_DATA', 'elections_games_final.csv'), index=False)
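The helper get_geo_distance is not shown on this page. A minimal haversine sketch that matches the positional calls above; the coordinate column names are assumptions, not from the source:

import numpy as np


def get_geo_distance(df, i):
    """Hypothetical sketch: great-circle distance in km for row i."""
    row = df.iloc[i]
    lat1, lon1 = np.radians([row['elec_lat'], row['elec_long']])
    lat2, lon2 = np.radians([row['home_lat'], row['home_long']])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 6371 * 2 * np.arcsin(np.sqrt(a))  # Earth radius of ~6371 km.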
Example #12
def plot_rich_poor(df_sorted):
    for c in range(2, 5, 2):
        plt.plot(
            df_sorted["time"],
            df_sorted.T.iloc[c],
            "--",
            label="Poor ",
            color="orange",
            linewidth=2,
            scalex=False,
        )
        plt.plot(df_sorted["time"],
                 df_sorted.T.iloc[c + 1],
                 "--",
                 label="Rich",
                 color="blue")
        plt.legend(loc=8)
        plt.xticks(df_sorted["time"][::8], rotation=70)
        plt.xlabel("Time")
        plt.ylabel("Real Consumption")

        # save plot and clean figure
        if c == 2:
            plt.savefig(ppj("OUT_FIGURES", "agg_rich_vs_poor"),
                        bbox_inches="tight")
        else:
            plt.savefig(ppj("OUT_FIGURES", "het_rich_vs_poor"),
                        bbox_inches="tight")
        plt.clf()
Example #13
def main():
    # Read in combined election csv.
    elec_df = pd.read_csv(ppj('OUT_DATA_ELEC', 'elections_combined.csv'),
                          low_memory=False)

    # Election office name plus municipality name as search name.
    srch_term_list = get_srch_term_list(elec_df)

    # Google maps search via multiprocessing.
    dict_list = []
    with mp.Pool() as pool:
        out = pool.map(gmaps_elec_offices, srch_term_list)
        dict_list.extend(out)

    # Dicts to dataframe.
    long_lat_df = pd.DataFrame(dict_list)
    long_lat_df.to_csv(ppj('OUT_DATA_ELEC', 'elec_off_longlat.csv'),
                       index=False)

    # Merge latitude and longitude data to combined election csv.
    elec_final_df = pd.merge(elec_df, long_lat_df, how='left', on='srch_term')
    elec_final_df.to_csv(ppj('OUT_DATA_ELEC', 'elections_final.csv'),
                         index=False)

    # Final data without postal ballots.
    elec_final_df = elec_final_df[elec_final_df['postal_vote'] == 0]
    elec_final_df.to_csv(ppj('OUT_DATA_ELEC', 'elections_final_wo_postal.csv'),
                         index=False)
Example #14
def main():
    # Load game URL data and get unique ID list.
    game_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'game_urls.csv'))
    mtchday_list = game_df['mtchday_id'].unique().tolist()

    # Run multiprocessed scraping by matchday ID.
    for mtchday in mtchday_list:
        mp_scraping(mtchday, game_df)

    # Create .txt file to indicate end of scraping process.
    open(ppj('OUT_DATA_FOOTBALL_CSV', 'scraping_finished.txt'), 'a').close()
Example #15
def save_report(rep_dict, data_name, h_test, level):
    """
    Save the reports in text files.

    Args:
        rep_dict (dict): regression results to be reported.

        data_name (str): name of tables (see **wscript** file)

        h_test (int): hypothesized value of h to be tested (the value of
            interest is 1; saved in **values_in_interest.json**)

        level (float64): ((1 - level) * 100)% is the significance level.

    """
    data = pd.read_csv(ppj('OUT_DATA', '{}.csv'.format(data_name)))
    y, x = gen_xy(data, h_test)

    with open(ppj('OUT_ANALYSIS', 'report_{}.txt'.format(data_name)),
              'w') as text:
        for key in y:
            if np.absolute(rep_dict['{}_th0'.format(key)]) < rep_dict['tstat']:
                text.write(
                    'For {} ({}):\n'
                    'h = {} is not significantly different from {}, '
                    'with t-value {}({}) given significance level {}%.\n'
                    '1st estimator is {} (t={})\n2nd estimator is {} (t={})\n'
                    'R^2 is {}\n'
                    'c_p = {}\n'
                    '{} <= c_p <= {} (confidence band)\n'
                    '\n'.format(data_name, key,
                                rep_dict['{}_hhat'.format(key)],
                                rep_dict['htest'],
                                rep_dict['{}_th0'.format(key)].round(4),
                                rep_dict['tstat'].round(2),
                                int((1 - level) * 100),
                                rep_dict['{}_ahat'.format(key)].round(6),
                                rep_dict['{}_ta'.format(key)],
                                rep_dict['{}_mhat'.format(key)].round(6),
                                rep_dict['{}_tm'.format(key)],
                                rep_dict['{}_RS'.format(key)],
                                rep_dict['{}_rra'.format(key)],
                                rep_dict['{}_lb'.format(key)],
                                rep_dict['{}_ub'.format(key)]))

            else:
                text.write('For {} ({}):\n'
                           'h = {} is likely not {} with the t-value {} ({})\n'
                           '\n'.format(data_name, key,
                                       rep_dict['{}_hhat'.format(key)],
                                       rep_dict['htest'],
                                       rep_dict['{}_th0'.format(key)].round(4),
                                       rep_dict['tstat'].round(2)))
Example #16
def main():
    # Read in relevant files.
    elec_mun_df = pd.read_csv(ppj('OUT_DATA_ELEC', 'election_mun.csv'))
    elec_url_df = pd.read_csv(ppj('OUT_DATA_ELEC', 'election_data.csv'))

    # Merge files containing election and municipal information.
    elec_df = pd.merge(elec_url_df, elec_mun_df, how='left', on='mun_url')

    # Create election id.
    elec_df = create_election_id(elec_df)

    # Save as csv.
    elec_df.to_csv(ppj('OUT_DATA_ELEC', 'election_id_data.csv'), index=False)
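Example #17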
def expand_voting_files(elec_master_df):
    """
    Downloads all CSV files from the corresponding download url. 
    Each file is expanded by columns containing ID, municipality 
    name, voting level, and state information. Further, a list 
    containing all occurring column names is created.
    """

    # List to store all column names.
    colnames_list = ['']

    # Loop through download urls.
    dwnld_url_list = elec_master_df['dwnld_url'].tolist()
    for i, export_url in enumerate(dwnld_url_list):

        # Create file name from election ID.
        file_name = elec_master_df.loc[i, 'elec_id']
        # Download file to separate folder.
        urlretrieve(export_url,
                    ppj('OUT_DATA_ELEC_CSV', '{}.csv'.format(file_name)))

        # Read in downloaded file.
        temp_df = pd.read_csv(ppj('OUT_DATA_ELEC_CSV',
                                  '{}.csv'.format(file_name)),
                              sep=';')

        # Convert column names to ASCII.
        temp_df.columns = [unidecode(x).lower() for x in temp_df.columns]
        temp_df.rename(columns={'name': 'elec_off_name'}, inplace=True)

        # Expand election results with election information.
        temp_df['elec_id'] = elec_master_df.loc[i, 'elec_id']
        temp_df['mun_clearname'] = elec_master_df.loc[i, 'mun_clearname']
        temp_df['state'] = elec_master_df.loc[i, 'state']
        temp_df['elec_year'] = elec_master_df.loc[i, 'elec_year']
        temp_df['elec_date'] = elec_master_df.loc[i, 'elec_date']

        # Get columns of temp_df and append those to overall columns list.
        temp_columns = list(temp_df)
        for columns in temp_columns:
            if columns not in colnames_list:
                colnames_list.append(columns)

        # Overwrite original csv file.
        temp_df.to_csv(ppj('OUT_DATA_ELEC_CSV', '{}.csv'.format(file_name)),
                       index=False)

    # Create .txt file to indicate the download process has finished.
    open(ppj('OUT_DATA_ELEC_CSV', 'election_dwnld_finished.txt'), 'a').close()

    return colnames_list
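unidecode transliterates the accented characters common in German column headers to plain ASCII before lowercasing; a one-line illustration with a made-up header:

from unidecode import unidecode

print(unidecode('Gültige Stimmen').lower())  # gultige stimmen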
Example #18
def create_heatmap_nan():
    """Create nullity correlation heatmap and save the plot to ``matrix_nan.png``
    in the "OUT_DATA" directory.

    """
    index_category = pd.Index(new_labels)
    sorted_by_category = gate_plot[index_category]
    heatmap_nan = msno.heatmap(sorted_by_category, vmin=0, cmap="OrRd")
    heatmap_nan.get_xticklabels()[16].set_fontweight("bold")
    heatmap_nan.get_yticklabels()[16].set_fontweight("bold")
    # Note: when plotting heatmaps with seaborn (on which the "missingno"
    # library builds), the first and the last row are cut in half because
    # of a regression in matplotlib between 3.1.0 and 3.1.1.
    # We correct for it this way:
    bottom, top = heatmap_nan.get_ylim()
    heatmap_nan.set_ylim(bottom + 0.5, top - 0.5)
    positions = np.array([1, 3, 5, 8, 10, 14, 16])
    labels = [
        "BACKGROUND",
        "HOUSEHOLD",
        "FINANCE",
        "HEALTH",
        "EMPLOYMENT",
        "PERSONALITY",
    ]
    heatmap_nan.hlines(positions, xmin=0, xmax=positions, lw=8, color="white")
    for position, label in zip(positions, labels):
        heatmap_nan.text(position + 0.35, position + 0.35, label, fontsize=14)
    heatmap_nan.figure.savefig(ppj("OUT_FIGURES", "heatmap_nan.png"),
                               bbox_inches="tight")
def main():
    # Read in final dataset, containing all games and player data.
    game_df = pd.read_csv(ppj('OUT_DATA_FOOTBALL', 'games_combined.csv'))
    unique_plyrs = get_unique_plyrs(game_df)

    # Scraping via multiprocessing.
    dict_list = []
    with mp.Pool() as pool:
        out = pool.map(get_age_nat, unique_plyrs)
        dict_list.extend(out)

    plyr_df = pd.DataFrame(dict_list)  # Dicts to df.

    # Save player url, age and nationality in a separate csv file.
    plyr_df.to_csv(ppj('OUT_DATA_FOOTBALL', 'plyr_nationality.csv'),
                   index=False)
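Example #20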
def make_dot_file(ctx):
    # Lazy load module
    from bld.project_paths import project_paths_join as ppj

    # Select task groups; drop the first, which holds the project paths.
    groups = [group for group in ctx.groups if len(group) != 0]
    groups = groups[1:]
    # Create the DAG.
    dag = digraph()
    for group in groups:
        for taskgen in group:
            name = taskgen.get_name()

            add_nodes(dag, [name])
            # Add dependencies
            deps = Utils.to_list(getattr(taskgen, "deps", []))
            for dep in deps:
                dep = Path(dep).name
                add_nodes(dag, [dep])
                add_edges(dag, [(dep, name)])

            # Write targets
            targets = Utils.to_list(getattr(taskgen, "target", []))
            for target in targets:
                target = Path(target).name
                add_nodes(dag, [target])
                add_edges(dag, [(name, target)])

    dag = apply_styles(dag, styles)

    # Save DAG
    dag.render(ppj("OUT_FIGURES", "dag"))
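Example #21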
def main():
    # Load previously scraped matchday files, containing municipal URLs.
    elec_df = pd.read_csv(ppj("OUT_DATA_ELEC", "election_mun.csv"))

    # List to store resulting election dictionaries.
    dict_list = []

    # Multiprocessed scraping.
    with mp.Pool() as pool:
        out = pool.map(scrape_elec_data, elec_df.mun_url.values)
        out = list(itertools.chain.from_iterable(out))  # Flatten list.
        dict_list.extend(out)  # Extend dictionaries to list.

    # Create dataframe from dictionaries and save as csv.
    df = pd.DataFrame(dict_list)
    df.to_csv(ppj("OUT_DATA_ELEC", "election_data.csv"), index=False)
Example #22
def plot_agg_het(df_sorted):
    for t in [2, 3]:
        plt.plot(
            df_sorted["time"],
            df_sorted.T.iloc[t],
            "--",
            label="Aggregate CPI ",
            color="orange",
            linewidth=2,
            scalex=False,
        )
        plt.plot(
            df_sorted["time"],
            df_sorted.T.iloc[t + 2],
            "--",
            label="Heterogeneous CPI",
            color="blue",
        )
        plt.legend(loc=8)
        plt.xticks(df_sorted["time"][::8], rotation=70)
        plt.xlabel("Time")
        plt.ylabel("Real Consumption")

        # save plot and clean figure
        plt.savefig(
            ppj("OUT_FIGURES",
                "comparison_agg_vs_het_" + df_sorted.columns[t][-7:-4]),
            bbox_inches="tight",
        )
        plt.clf()
Example #23
def plot_transition():
    """Plot results for transitional dynamics."""

    # Load results
    plot_x = np.arange(duration_transition + 1)
    plot_y = np.array([
        results_transition["aggregate_capital"],
        results_transition["aggregate_labor"]
    ])

    # Create figure and plot
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    line1 = ax1.plot(plot_x, plot_y[0, :], color="tab:blue", label="assets")
    ax2 = ax1.twinx()
    line2 = ax2.plot(plot_x,
                     plot_y[1, :],
                     color="tab:orange",
                     label="human capital")
    lines = line1 + line2
    labels = [line.get_label() for line in lines]
    ax1.legend(lines, (labels), loc=0)
    ax1.set(xlabel="transition period", ylabel="assets", ybound=[0, 15.0])
    ax2.set(ylabel="human capital", ybound=[0, hc_max])

    # Save figure
    fig.savefig(ppj("OUT_FIGURES", "results_transition.png"))
Example #24
def regression_tables(macro_ind):
    """creates a regression table as latex for each variable in macro_ind
        macro_ind: list of name of variabels of interest as strings
    """
    for ind in macro_ind:

        mod = smf.ols(formula=ind + "~ fraction ", data=df)
        res1 = mod.fit()

        mod = smf.ols(formula=ind + "~ mean ", data=df)
        res2 = mod.fit()

        mod = smf.ols(formula=ind + "~ var ", data=df)
        res3 = mod.fit()

        mod = smf.ols(formula=ind + "~ var + mean + var ", data=df)
        res4 = mod.fit()

        with open(ppj("OUT_ANALYSIS", ind + "_on_sentiment.txt"),
                  "w") as textfile:
            textfile.write(
                summary_col(
                    [res1, res2, res3, res4],
                    stars=True,
                    float_format="%0.2f",
                    model_names=["\n(0)", "\n(1)", "\n(2)", "\n(3)"],
                    info_dict={
                        "N": lambda x: "{:d}".format(int(x.nobs)),
                        "R2": lambda x: f"{x.rsquared:.2f}",
                    },
                ).as_latex())
Example #25
def expected_predict():
    out = {}
    for i in DB:
        with open(ppj("OUT_MODEL_SPECS", f"{i}_rid.json")) as f:
            data = json.load(f)
        out[f"{i}"] = np.array(data)
    return out
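Example #26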
def make_dot_file(ctx):
    # Lazy load module
    from bld.project_paths import project_paths_join as ppj

    # Select task groups; drop the first, which holds the project paths.
    groups = [group for group in ctx.groups if len(group) != 0]
    groups = groups[1:]
    # Create the DAG.
    dag = digraph()
    for group in groups:
        for taskgen in group:
            name = taskgen.get_name()

            add_nodes(dag, [name])
            # Add dependencies
            deps = Utils.to_list(getattr(taskgen, "deps", []))
            for dep in deps:
                dep = Path(dep).name
                add_nodes(dag, [dep])
                add_edges(dag, [(dep, name)])

            # Write targets
            targets = Utils.to_list(getattr(taskgen, "target", []))
            for target in targets:
                target = Path(target).name
                add_nodes(dag, [target])
                add_edges(dag, [(name, target)])

    dag = apply_styles(dag, styles)

    # Save DAG
    dag.render(ppj("OUT_FIGURES", "dag"))
Example #27
def extract_sources():
    df = read_parquet_in_date_chunks(ppj("OUT_DATA", "tweets-cleaned"))

    sources = (df.urls.str.split("/", n=3, expand=True)[2].str.replace(
        "www.", "").value_counts())

    return sources
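The chained expression takes element [2] of 'scheme://domain/rest' (the domain), strips a leading 'www.', and counts occurrences; a small illustration with made-up URLs:

import pandas as pd

urls = pd.Series(['https://www.nytimes.com/a/b', 'https://twitter.com/c'])
domains = urls.str.split('/', n=3, expand=True)[2].str.replace('www.', '')
print(domains.value_counts())  # nytimes.com: 1, twitter.com: 1

Example #28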
def plot_locations(locations_by_round, model_name):
    "Plot the distribution of agents after cycle_num rounds of the loop."
    n_cycles = len(locations_by_round)
    nrows = int(np.ceil(n_cycles / 2 - 0.01))
    figsize = (2 * 3, nrows * 2)
    fig, axes = plt.subplots(nrows=nrows, ncols=2, figsize=figsize)
    fig.subplots_adjust(
        left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.25, hspace=0.25
    )
    for item, ax in np.ndenumerate(axes):
        n_cycle = item[0] * 2 + item[1]
        if n_cycle == n_cycles:
            # Remove last element if number of cycles is uneven
            fig.delaxes(ax)
            break
        locs = locations_by_round[n_cycle]
        ax.set_title("Cycle {}".format(n_cycle))
        ax.tick_params(labelbottom="off", labelleft="off")
        ax.set_facecolor("azure")
        ax.plot(
            locs[0][:, 0], locs[0][:, 1], "o", markerfacecolor="orange", **PLOT_ARGS
        )
        ax.plot(locs[1][:, 0], locs[1][:, 1], "o", markerfacecolor="green", **PLOT_ARGS)

    fig.savefig(ppj("OUT_FIGURES", "schelling_{}.png".format(model_name)))
def plot_locations(locations_by_round, model_name):
    """Plot the distribution of agents after cycle_num rounds of the loop."""
    n_cycles = len(locations_by_round)
    nrows = int(np.ceil(n_cycles / 2 - 0.01))
    figsize = (2 * 3, nrows * 2)
    fig, axes = plt.subplots(nrows=nrows, ncols=2, figsize=figsize)
    fig.subplots_adjust(
        left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.25, hspace=0.25
    )
    for item, ax in np.ndenumerate(axes):
        n_cycle = item[0] * 2 + item[1]
        if n_cycle == n_cycles:
            # Remove last element if number of cycles is uneven
            fig.delaxes(ax)
            break
        locs = locations_by_round[n_cycle]
        ax.set_title(f"Cycle {n_cycle}")
        ax.tick_params(labelbottom=False, labelleft=False)
        ax.set_facecolor("azure")
        ax.plot(
            locs[0][:, 0], locs[0][:, 1], "o", markerfacecolor="orange", **PLOT_ARGS
        )
        ax.plot(locs[1][:, 0], locs[1][:, 1], "o", markerfacecolor="green", **PLOT_ARGS)

    fig.savefig(ppj("OUT_FIGURES", f"schelling_{model_name}.png"))
Example #30
def main():
    # Main URL to start from.
    main_url = 'https://www.fupa.net'
    # Regions to scrape.
    regions = ['mittelrhein']
    # regions = ['mittelrhein', 'niederrhein', 'ruhrgebiet', 'westrhein']

    # Initialize dictionary and pandas dataframe to store league data.
    matchday_dict = dict()
    matchday_df = pd.DataFrame()

    # Loop through all predefined regions to get the districts within
    # each region.
    for region in regions:
        district_url_list = get_district_list(region, main_url, matchday_dict)

        # Loop through districts to get leagues within each district.
        for district_url in district_url_list:
            leagues_url_list = get_league_list(
                district_url, matchday_dict, main_url)

            # Loop through each single league to get list of all past seasons
            # within a league.
            for league_url in leagues_url_list:
                seasons_list = get_seaons_list(league_url, matchday_dict)

                # Get matchday url for all seasons within a league, from which
                # to start scraping.
                for season in seasons_list:
                    matchday_df = get_matchday_url(
                        season, matchday_dict, matchday_df)

    # Save matchday dataframe as .csv file.
    matchday_df.to_csv(ppj('OUT_DATA_FOOTBALL', 'matchday_data.csv'))
Example #31
def figure_maker_func(turkstat_data, analysis_data, freq):
    """Function for generating figures.

    Depending on the frequency of data appropriate title is assigned.

    Args:
        | turkstat_data(pd.Dataframe): the TurkStat dataset containing percentage
            changes with dates on the index
        | analysis_data (pd.Dataframe): the dataset containing percentage price
            changes with dates on the index
        | freq (str): frequency of data

    Returns:
        the figure and also saves it with appropriate title
    """

    if freq == "yearly":
        title = "Percentage Price Change Compared the Last Year's Same Month"
    elif freq == "monthly":
        title = "Monthly Percentage Price Change"

    plt.figure(figsize=[14, 5])
    plt.title(title)
    plt.ylabel("Percentage Change (%)")
    plt.plot(turkstat_data, "r-s", color="blue", linestyle="dashed")
    plt.plot(analysis_data, "r-s", color="red")
    plt.legend(labels=["Accommodation Services (TurkStat)",
                       "Analysis (Airbnb)"])
    plt.grid()

    plt.savefig(ppj("OUT_FIGURES", f"{freq}_change_figure.png"))
Example #32
def create_logistic_dataframe(name_df):
    """Check for missing values mechanism of dataset ``gate_final.csv`` via logistic
    regression and save results to ``logistic_df.csv``.

    """
    gate_logistic = name_df.drop(
        [
            "gateid",
            "hhincome",
            "hhincome_w2",
            "completed_w2",
            "site",
            "philadelphia",
            "white",
            "hhincome_50_74k",
            "worked_for_relatives_friends_se",
        ],
        axis=1,
    )
    results_cov = generate_regression_output(gate_logistic.drop("missing_out",
                                                                axis=1),
                                             "missing_cov",
                                             type="Logit")
    results_out = generate_regression_output(gate_logistic,
                                             "missing_out",
                                             type="Logit")
    logistic_df = pd.concat(
        [results_cov, results_out],
        axis=1,
        keys=["Missing in covariates", "Missing in outcome"],
        sort=False,
    )
    logistic_df.to_csv(ppj("OUT_ANALYSIS", "logistic_df.csv"))
Example #33
def setup_agents(model):
    """Load the simulated initial locations and return a list
    that holds all agents.

    """

    initial_locations = np.loadtxt(ppj("OUT_DATA", "initial_locations.csv"), delimiter=",")
    initial_locations = initial_locations.reshape(2, model["n_types"], 30000)

    agents = []
    for typ in range(model["n_types"]):
        for i in range(model["n_agents_by_type"][typ]):
            agents.append(
                Agent(
                    typ=typ,
                    initial_location=initial_locations[typ, :, i],
                    n_neighbours=model["n_neighbours"],
                    require_same_type=model["require_same_type"],
                    max_moves=model["max_moves"]
                )
            )

    return agents
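Example #34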
def plot_locations(locations_by_round, model_name):
    "Plot the distribution of agents after cycle_num rounds of the loop."
    n_cycles = len(locations_by_round)
    nrows = int(np.ceil(n_cycles / 2 - 0.01))
    figsize = (2 * 3, nrows * 2)
    fig, axes = plt.subplots(nrows=nrows, ncols=2, figsize=figsize)
    fig.subplots_adjust(
        left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.25, hspace=0.25
    )
    for item, ax in np.ndenumerate(axes):
        n_cycle = item[0] * 2 + item[1]
        if n_cycle == n_cycles:
            # Remove last element if number of cycles is uneven
            fig.delaxes(ax)
            break
        locs = locations_by_round[n_cycle]
        ax.set_title("Cycle {}".format(n_cycle))
        ax.tick_params(labelbottom="off", labelleft="off")
        ax.set_facecolor("azure")
        ax.plot(
            locs[0][:, 0], locs[0][:, 1], "o", markerfacecolor="orange", **PLOT_ARGS
        )
        ax.plot(locs[1][:, 0], locs[1][:, 1], "o", markerfacecolor="green", **PLOT_ARGS)

    fig.savefig(ppj("OUT_FIGURES", "schelling_{}.png".format(model_name)))


if __name__ == "__main__":
    model_name = sys.argv[1]
    model = json.load(
        open(ppj("IN_MODEL_SPECS", model_name + ".json"), encoding="utf-8")
    )

    # Load locations after each round
    with open(
        ppj("OUT_ANALYSIS", "schelling_{}.pickle".format(model_name)), "rb"
    ) as in_file:
        locations_by_round = pickle.load(in_file)

    plot_locations(locations_by_round, model_name)
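Example #35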
def save_data(sample):
    sample.tofile(ppj("OUT_DATA", "initial_locations.csv"), sep=",")
Example #36
from skillmodels import SkillModel
from bld.project_paths import project_paths_join as ppj
import pandas as pd
from pandas import DataFrame
import json
import sys


if __name__ == '__main__':
    model_name, dataset_name, estimator = sys.argv[1:4]

    # load the model dict from a json file in src.model_specs
    with open(ppj('IN_MODEL_SPECS', '{}.json'.format(model_name))) as j:
        model_dict = json.load(j)

    # load the dataset from a dta file in bld.out.data
    dataset = pd.read_stata(ppj('OUT_DATA', '{}.dta'.format(dataset_name)))

    # create an instance of SkillModel
    mod = SkillModel(model_dict=model_dict, dataset=dataset,
                     estimator=estimator,
                     model_name=model_name, dataset_name=dataset_name)

    # call its fit method to estimate the model
    res = mod.fit()

    # create a pandas DataFrame containing the parameters and standard errors
    df = DataFrame(data=res.params, columns=['params'], index=res.param_names)
    df['se'] = res.bse
    df['pvalues'] = res.pvalues
    df['tvalues'] = res.tvalues

    # save the results as csv (output path assumed; it matches the path
    # the comparison script in Example #39 reads from)
    df.to_csv(ppj('OUT_ANALYSIS',
                  '{}_{}/results_df.csv'.format(model_name, dataset_name)))
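Example #37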
import pandas as pd
import sys
from bld.project_paths import project_paths_join as ppj

if __name__ == '__main__':
    dataset_name = sys.argv[1]
    data = pd.read_stata(ppj('IN_DATA', '{}.dta'.format(dataset_name)))

    data['id'] = data['caseid'] - 1
    data['period'] = data['period'] - 1
    data = data.drop(
        ['dy1', 'dy2', 'dy3', 'dy4', 'dy5', 'dy6', 'dQ1'], axis=1)

    data.to_stata(ppj("OUT_DATA", "{}_ready.dta".format(dataset_name)))
Example #38
def plot_locations(locations_by_round, model_name):
    """Plot the distribution of agents after cycle_num rounds of the loop."""
    n_cycles = len(locations_by_round)
    nrows = int(np.ceil(n_cycles / 2 - 0.01))
    figsize = (2 * 3, nrows * 2)
    fig, axes = plt.subplots(nrows=nrows, ncols=2, figsize=figsize)
    fig.subplots_adjust(
        left=0.05,
        right=0.95,
        bottom=0.05,
        top=0.95,
        wspace=0.25,
        hspace=0.25
    )
    for item, ax in np.ndenumerate(axes):
        n_cycle = item[0] * 2 + item[1]
        if n_cycle == n_cycles:
            # Remove last element if number of cycles is uneven
            fig.delaxes(ax)
            break
        locs = locations_by_round[n_cycle]
        ax.set_title("Cycle {}".format(n_cycle))
        ax.tick_params(labelbottom="off", labelleft="off")
        ax.set_axis_bgcolor("azure")
        ax.plot(locs[0][:, 0], locs[0][:, 1], "o", markerfacecolor="orange", **PLOT_ARGS)
        ax.plot(locs[1][:, 0], locs[1][:, 1], "o", markerfacecolor="green", **PLOT_ARGS)

    fig.savefig(ppj("OUT_FIGURES", "schelling_{}.png".format(model_name)))


if __name__ == "__main__":
    model_name = sys.argv[1]
    model = json.load(open(ppj("IN_MODEL_SPECS", model_name + ".json"), encoding="utf-8"))

    # Load locations after each round
    with open(ppj("OUT_ANALYSIS", "schelling_{}.pickle".format(model_name)), "rb") as in_file:
        locations_by_round = pickle.load(in_file)

    plot_locations(locations_by_round, model_name)
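Example #39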
    ax.set_ylim(0, y_max)
    ax.set_autoscale_on(False)

    ax.legend([axes[col] for col in colnames], colnames, fontsize=14)

    plt.title(
        title, fontsize=28, y=1 + 2.5 / nr_rows, x=-0.6, loc="left", weight="bold"
    )

    plt.savefig(path, bbox_inches="tight", pad_inches=0.8)
    plt.close(fig)


if __name__ == "__main__":
    model, dataset = sys.argv[1:3]
    true_path = ppj("LIBRARY", "true_{}_results.csv")
    true_df = pd.read_csv(true_path.format(model), index_col="index")
    estimated_path = ppj("OUT_ANALYSIS", "{}_{}/results_df.csv")
    estimated_df = pd.read_csv(estimated_path.format(model, dataset), index_col="index")
    df = pd.concat([estimated_df, true_df], axis=1, sort=False)

    df["Fortran"] = df["chs_params"] - df["true_value"]
    df["Python"] = df["params"] - df["true_value"]

    plot_results_comparison(
        df=df,
        colnames=["Python", "Fortran"],
        path=ppj("OUT_ANALYSIS", "{}_{}/comparison_plot".format(model, dataset)),
        title=(
            "Comparison of Python and Fortran results\nBars show "
            "deviation from true population Parameters"
        ),
    )
    true_gammas = [
        [[.925, 0.04, 0.75],
         [.925, 0.04, 0.75]],
        np.zeros((3, 0))]

    true_loadings = np.arange(start=0.5, stop=1.85, step=0.05)
    true_intercepts = np.arange(start=-0.665, stop=0.665, step=0.05)
    true_X_zero = np.array([10, 15, 30])
    true_cov_matrix = np.array([[2.0, 0.05, 0.1],
                                [0.05, 4.0, 0.0],
                                [0.1, 0.0, 9.0]])

    nobs = 8000
    base_meas_sd = 0.7
    base_trans_sd = 1.0

    true_meas_sd = true_loadings * base_meas_sd
    true_trans_sd = [[0.4, 0.5], [0.4, 0.5], [0.4, 0.5]]

    large_df = generate_test_data(
        nobs=nobs, factors=factor_names, periods=periods,
        included_positions=included_positions,
        meas_names=meas_names,
        initial_mean=true_X_zero, initial_cov=true_cov_matrix,
        intercepts=true_intercepts, loadings=true_loadings,
        meas_sd=true_meas_sd, gammas=true_gammas,
        trans_sd=true_trans_sd)

    out_path = ppj('OUT_DATA', 'ns_translog_data.dta')
    large_df.to_stata(out_path)
Example #41
            if not (agent.location == old_location).all():
                someone_moved = True
        _store_locations_by_round(locations_by_round[-1], agents)
        # We are done if everybody is happy.
        if not someone_moved:
            break

    if someone_moved:
        logging.info("No convergence achieved after {} iterations".format(model["max_iterations"]))

    return locations_by_round


if __name__ == "__main__":
    model_name = sys.argv[1]
    model = json.load(open(ppj("IN_MODEL_SPECS", model_name + ".json"), encoding="utf-8"))

    logging.basicConfig(
        filename=ppj("OUT_ANALYSIS", "log", "schelling_{}.log".format(model_name)),
        filemode="w",
        level=logging.INFO
    )
    np.random.seed(model["rng_seed"])
    logging.info(model["rng_seed"])

    # Load initial locations and setup agents
    agents = setup_agents(model)
    # Run the main analysis
    locations_by_round = run_analysis(agents, model)
    # Store list with locations after each round
    with open(ppj("OUT_ANALYSIS", "schelling_{}.pickle".format(model_name)), "wb") as out_file:
        pickle.dump(locations_by_round, out_file)