def load_results():
    """Load the app9 outputs: the actual likelihood value, the simulated
    probability column, and the sampled likelihood column."""
    actual_likelihood = load_output("app9_likelihood.csv").iloc[0]["values"]
    probabilities = load_output("app9_probabilities.csv")["probability"]
    sample = load_output("app9_sample.csv")["likelihood"]
    return actual_likelihood, probabilities, sample
def load_results():
    """Load the app5 outputs: the actual total entropy value plus the
    probability and entropy columns of the top-candidate tables."""
    actual_total_ent = load_output("app5_actual_total_ent.csv")["values"][0]
    probabilities = load_output("app5_ent_in_top_probs.csv")["probability"]
    entropies = load_output("app5_ent_in_top_entropies.csv")["entropy"]
    return actual_total_ent, probabilities, entropies
def _get_merged_data() -> pd.DataFrame:
    """
    Merges 2019 merged data if needed or retrieves the version already
    cached on disk.

    Hidden as it does not respect hooks which could be confusing.
    (This could be easily relaxed via requiring idempotency from hooks.)

    Returns:
        pd.DataFrame: all sheets of the 2019 results workbook concatenated.
    """
    if output_exists("merged.csv"):
        return load_output("merged.csv")

    filename = r'HU/EP_2019_szavaz_k_ri_eredm_ny.xlsx'
    print("file: %s" % filename)
    print("reading sheet names ...")
    xls = pd.ExcelFile(filename)
    print("found %d sheets" % len(xls.sheet_names))
    dfs = []
    for name in xls.sheet_names:
        print("reading", name)
        # Parse through the already-open workbook instead of re-reading
        # (and re-parsing) the file from disk once per sheet.
        sheet_df = xls.parse(sheet_name=name)
        print("read %d rows" % len(sheet_df))
        dfs.append(sheet_df)
    df = pd.concat(dfs)
    save_output(df, "merged.csv")
    return df
def get_act_data():
    """Return the 'act' data frame, computing and caching it on first use."""
    if output_exists(ACT_FILENAME):
        return load_output(ACT_FILENAME)
    df_act = check_equalities_2()
    save_output(df_act, ACT_FILENAME)
    return df_act
def get_2019_cleaned_data():
    """
    Return the cleaned 2019 per-ward data frame, computing and caching it
    on the first call.  Applies the capital-district name translation and
    the registered processing hooks on every call.
    """
    if not output_exists("cleaned.csv"):
        df = _get_merged_data()
        df.columns = [
            "Unnamed", "Megye", "Telepules", "Szavazokor",
            "Nevjegyzekben", "Megjelent", "Belyegzetlen", "Lebelyegzett",
            "Elteres megjelentektol", "Ervenytelen", "Ervenyes",
            # parties
            "MSZP", "MKKP", "Jobbik", "Fidesz", "Momentum", "DK",
            "Mi Hazank", "Munkaspart", "LMP"
        ]
        # There is a mostly NaN line at the end of the Budapest sheet
        # (someone left a total count in) --> remove it.
        nan_row_positions = np.where(np.isnan(df.Ervenyes))[0]
        # BUG FIX: the original checked len() of the tuple returned by
        # np.where (which is always 1), not the number of NaN rows found,
        # so unexpected extra NaN rows would not raise this Exception.
        if len(nan_row_positions) != 1 or nan_row_positions[0] != 1405:
            raise Exception("Only a certain NaN line was expected, please "
                            "check the data.")
        # NOTE(review): drop() removes by index *label*; after pd.concat
        # without ignore_index labels can repeat across sheets.  The check
        # above pins the single expected label, 1405 -- TODO confirm the
        # label is unique in the concatenated frame.
        df.drop(nan_row_positions[0], inplace=True)
        save_output(df, "cleaned.csv")
    else:
        df = load_output("cleaned.csv")
    df["Telepules"] = _translate_capital_district_name(df["Telepules"])
    df = _apply_processing_hooks(df, get_2019_cleaned_data)
    return df
def load_results():
    """
    Returns four data frames.

    df_comparative: comparative data sorted by p_both ie. both years being
        regular, ascending ~ (1 - p_either)
    suspects: top suspects based on p_both
    df_comparative_sorted_by_both: comparative data sorted by p_both_incorr
        ie. both being incorrect, descending
    suspects2: top suspects based on p_both_incorr
    df_comparative_sorted_by_all_3: comparative data sorted by p_all_3
        i.e. all 3 (2014, 2018, 2019) being regular
    suspects3: top candidates from df_comparative_sorted_by_all_3.

    Other criteria: at least 8 electoral wards and at least 100 Fidesz
    party votes (in 2019) in each ward in each suspect settlement in order
    to be considered.
    """
    output_names = [
        "app6_comparative_result.csv",
        "app6_suspects.csv",
        "app6_comparative_result_sort_by_both_incorr.csv",
        "app6_suspects2.csv",
        "app6_comparative_sorted_by_all_3.csv",
        "app6_suspects3.csv",
    ]
    return tuple(load_output(name) for name in output_names)
def load_results():
    """Load the app8 outputs: the twin-probability table and its suspects."""
    df_p_twins = load_output("app8_twins.csv")
    suspects = load_output("app8_suspects.csv")
    return df_p_twins, suspects
def load_results():
    """Load the app7 outputs: the joint Fidesz-Jobbik table and its
    suspects."""
    df_fidesz_jobbik_joint = load_output("app7_fidesz_jobbik_joint.csv")
    suspects = load_output("app7_suspects.csv")
    return df_fidesz_jobbik_joint, suspects
def generate_data():
    """
    Build the 2014/2018/2019 comparative entropy-probability table and the
    derived suspect lists, then persist everything via save_results().

    Reads the module-level ``df`` (2019 per-ward data) and the cached 2018
    and 2014 per-settlement probability tables.  Suspect criteria: at least
    8 electoral wards (count_2018) and at least 100 Fidesz votes in every
    ward (min_votes_2019).
    """
    agg_cols = OrderedDict([("ld_Fidesz", [get_entropy, len]),
                            ("Fidesz", min)])
    df_Fidesz_ent = df.groupby(["Telepules"]).aggregate(agg_cols)
    df_Fidesz_ent.reset_index(inplace=True)
    df_Fidesz_ent.columns = ["Settlement", "ld_entropy", "count", "min"]
    # So two things can be of interest: settlements with >= 50% chance of
    # both years being incorrect and those with < 5% chance of both being
    # correct.
    df_Fidesz_ent["count"] = df_Fidesz_ent["count"].astype(int)
    df_Fidesz_ent["prob_of_entr"] = df_Fidesz_ent.apply(calc_prob, axis=1)
    df_Fidesz_ent.sort_values(["prob_of_entr"], inplace=True)

    df_Fidesz_ent_2018 = load_output("Fidesz_entr_prob_2018.csv")
    df_Fidesz_ent_2014 = load_2014_data()

    df_comparative = pd.merge(df_Fidesz_ent, df_Fidesz_ent_2018,
                              how="inner", on=["Settlement"])
    # Suffix the 2014 columns up front so they cannot collide on merge.
    df_Fidesz_ent_2014.columns = [
        column + "_z" if column != "Settlement" else column
        for column in df_Fidesz_ent_2014.columns
    ]
    df_comparative = pd.merge(df_comparative, df_Fidesz_ent_2014,
                              how="inner", on=["Settlement"],
                              suffixes=["", "_z"])
    df_comparative.columns = ["Settlement",
                              "ld_Entropy_2019", "count_2019",
                              "min_votes_2019", "p_2019",
                              "ld_entropy_2018", "min_votes_2018",
                              "count_2018", "p_2018",
                              "ld_Entropy_2014", "count_2014",
                              "min_votes_2014", "p_2014"]
    # p_all_3 / p_all_2: prob. of all covered years being 'regular'
    # p_both_incorr: prob. of both 2018 and 2019 being 'irregular'
    df_comparative["p_all_3"] = \
        df_comparative.p_2019 * df_comparative.p_2018 * df_comparative.p_2014
    df_comparative["p_all_2"] = df_comparative.p_2019 * df_comparative.p_2018
    df_comparative["p_both_incorr"] = \
        (1 - df_comparative.p_2019) * (1 - df_comparative.p_2018)
    df_comparative.sort_values(["p_all_2"], inplace=True)

    suspects = df_comparative.loc[
        (df_comparative.p_all_2 < 0.1) &
        (df_comparative.min_votes_2019 >= 100) &
        (df_comparative.count_2018 >= 8)
    ]
    df_comparative_sorted_by_both = \
        df_comparative.sort_values(["p_both_incorr"], ascending=[False])
    suspects2 = df_comparative_sorted_by_both.loc[
        (df_comparative_sorted_by_both.p_both_incorr >= 0.5) &
        (df_comparative_sorted_by_both.min_votes_2019 >= 100) &
        (df_comparative_sorted_by_both.count_2018 >= 8)
    ]
    df_comparative_sorted_by_all_3 = df_comparative.sort_values(["p_all_3"])
    # BUG FIX: the vote/ward-count conditions here previously referenced
    # df_comparative_sorted_by_both (a copy-paste slip); the result was only
    # correct through pandas index alignment.  Filter on the frame that is
    # actually being sliced.
    suspects3 = df_comparative_sorted_by_all_3.loc[
        (df_comparative_sorted_by_all_3.p_all_3 <= 0.1) &
        (df_comparative_sorted_by_all_3.min_votes_2019 >= 100) &
        (df_comparative_sorted_by_all_3.count_2018 >= 8)
    ]
    save_results(df_comparative, suspects, df_comparative_sorted_by_both,
                 suspects2, df_comparative_sorted_by_all_3, suspects3)