Code Example #1
def load_results():
    actual_likelihood = \
        load_output("app9_likelihood.csv").iloc[0]["values"]
    probabilities = \
        load_output("app9_probabilities.csv").probability
    sample = load_output("app9_sample.csv").likelihood

    return actual_likelihood, probabilities, sample
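
These examples rely on small I/O helpers (output_exists, save_output, load_output) defined elsewhere in the source. A minimal sketch of what they might look like, assuming the outputs are CSV files kept in a single directory (the OUTPUT_DIR name and the exact signatures are assumptions, not the repository's actual API):

import os

import pandas as pd

# Assumed location of the cached CSV outputs (not taken from the source).
OUTPUT_DIR = "output"


def output_exists(filename):
    # True if a previously saved result is already cached on disk.
    return os.path.exists(os.path.join(OUTPUT_DIR, filename))


def save_output(df, filename):
    # Persist an intermediate result so later runs can skip recomputation.
    df.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)


def load_output(filename):
    # Read a previously saved result back into a DataFrame.
    return pd.read_csv(os.path.join(OUTPUT_DIR, filename))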
Code Example #2
def load_results():
    df = load_output("app5_actual_total_ent.csv")
    actual_total_ent = df["values"][0]

    df = load_output("app5_ent_in_top_probs.csv")
    probabilities = df.probability

    df = load_output("app5_ent_in_top_entropies.csv")
    entropies = df.entropy

    return actual_total_ent, probabilities, entropies
Code Example #3
def _get_merged_data() -> pd.DataFrame:
    """ Merges 2019 merged data if needed or retrieves the version
        already cached on disk

        Hidden as it does not respect hooks which could be confusing.
        (This could be easily relaxed via requiring idempotency from hooks.)
    """
    if not output_exists("merged.csv"):
        filename = r'HU/EP_2019_szavaz_k_ri_eredm_ny.xlsx'
        print("file: %s" % filename)
        print("reading sheet names ...")
        xls = pd.ExcelFile(filename)
        print("found %d sheets" % len(xls.sheet_names))

        dfs = []

        for name in xls.sheet_names:
            print("reading", name)
            df = pd.read_excel(filename, sheet_name=name)
            print("read %d rows" % len(df))
            dfs.append(df)

        df = pd.concat(dfs)
        save_output(df, "merged.csv")
    else:
        df = load_output("merged.csv")
    return df
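
As an aside, pandas can read every sheet in one call by passing sheet_name=None, which returns a dict mapping sheet names to DataFrames; a minimal equivalent of the loop above (same filename, progress printing omitted):

import pandas as pd

# sheet_name=None loads all sheets at once as {sheet_name: DataFrame}.
sheets = pd.read_excel(filename, sheet_name=None)
df = pd.concat(sheets.values())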
Code Example #4
def get_act_data():
    if not output_exists(ACT_FILENAME):
        df_act = check_equalities_2()
        save_output(df_act, ACT_FILENAME)
    else:
        df_act = load_output(ACT_FILENAME)
    return df_act
Code Example #5
def get_2019_cleaned_data():
    if not output_exists("cleaned.csv"):
        df = _get_merged_data()
        """ there is a mostly NaN row (someone left a total count in)
            --> remove!
        """
        df.columns = [
            "Unnamed",
            "Megye",
            "Telepules",
            "Szavazokor",
            "Nevjegyzekben",
            "Megjelent",
            "Belyegzetlen",
            "Lebelyegzett",
            "Elteres megjelentektol",
            "Ervenytelen",
            "Ervenyes",
            # parties
            "MSZP",
            "MKKP",
            "Jobbik",
            "Fidesz",
            "Momentum",
            "DK",
            "Mi Hazank",
            "Munkaspart",
            "LMP"
        ]

        # There is a mostly-NaN line at the end of the Budapest sheet; remove it.
        # np.where returns a tuple of index arrays, so take its first element.
        nan_line_idxs = np.where(np.isnan(df.Ervenyes))[0]
        if len(nan_line_idxs) != 1 or nan_line_idxs[0] != 1405:
            raise Exception("Only a certain NaN line was expected, please "
                            "check the data.")
        df.drop(df.index[nan_line_idxs], inplace=True)
        save_output(df, "cleaned.csv")
    else:
        df = load_output("cleaned.csv")

    df["Telepules"] = _translate_capital_district_name(df["Telepules"])
    df = _apply_processing_hooks(df, get_2019_cleaned_data)
    return df
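
Note that np.where returns a tuple of index arrays (one per dimension) rather than a flat array, which is why the NaN check above indexes into its first element; a small self-contained illustration:

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0])
idxs = np.where(np.isnan(s))  # (array([1]),) - a 1-tuple of positions
assert len(idxs) == 1         # always 1 for 1-D input
assert len(idxs[0]) == 1 and idxs[0][0] == 1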
Code Example #6
def load_results():
    """
    Returns four data frames.

    df_comparative: comparative data sorted by p_both ie. both
        years being regular, ascending ~ (1 - p_either)
    suspects: top suspects based on p_both
    df_comparative_sorted_by_both: comparative data sorted by
        p_both_incorr ie. both being incorrect, descending
    suspects2: top suspects based on p_both_incorr
    df_comparative_sorted_by_all_3: comparative data sorted by
        p_all_3 i.e. all 3 (2014, 2018, 2019) being regular
    suspects3: top candidates from df_comparative_sorted_by_all_3.

    Other criteria: at least 8 electoral wards and at least 100 Fidesz
        party votes (in 2019) in each ward in each suspect settlement
        in order to be considered,
    """

    df_comparative = load_output("app6_comparative_result.csv")
    suspects = load_output("app6_suspects.csv")
    df_comparative_sorted_by_both = load_output(
        "app6_comparative_result_sort_by_both_incorr.csv"
    )
    suspects2 = load_output("app6_suspects2.csv")

    df_comparative_sorted_by_all_3 = load_output(
        "app6_comparative_sorted_by_all_3.csv"
    )
    suspects3 = load_output("app6_suspects3.csv")

    return (
        df_comparative,
        suspects,
        df_comparative_sorted_by_both,
        suspects2,
        df_comparative_sorted_by_all_3,
        suspects3,
    )
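
For intuition, with made-up numbers: a settlement with p_2019 = 0.2 and p_2018 = 0.3 gets p_both = 0.2 * 0.3 = 0.06 (both years regular) but p_both_incorr = (1 - 0.2) * (1 - 0.3) = 0.56 (both years irregular). The two rankings need not agree, which is why both sort orders are kept; the actual cutoffs appear in Code Example #9.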
Code Example #7
def load_results():
    df_p_twins = load_output("app8_twins.csv")
    suspects = load_output("app8_suspects.csv")
    return df_p_twins, suspects
Code Example #8
def load_results():
    df_fidesz_jobbik_joint = load_output("app7_fidesz_jobbik_joint.csv")
    suspects = load_output("app7_suspects.csv")
    return df_fidesz_jobbik_joint, suspects
Code Example #9
def generate_data():
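    # Assumes `from collections import OrderedDict`, a module-level `df`
    # holding the per-ward 2019 results, and helpers defined elsewhere in
    # the source (get_entropy, calc_prob, load_2014_data, save_results).
    # ld_Fidesz appears to hold the last digit of each ward's Fidesz count.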
    agg_cols = OrderedDict([("ld_Fidesz", [get_entropy, len]),
                            ("Fidesz", min)])

    df_Fidesz_ent = df.groupby(["Telepules"]).aggregate(agg_cols)
    df_Fidesz_ent.reset_index(inplace=True)
    df_Fidesz_ent.columns = ["Settlement", "ld_entropy", "count", "min"]

    """
    So two things cab be of interest:
    settlements with >= 50% of both being incorrect
    and those with   < 5% of both being correct
    """

    df_Fidesz_ent["count"] = df_Fidesz_ent["count"].astype(int)
    df_Fidesz_ent["prob_of_entr"] = df_Fidesz_ent.apply(calc_prob, axis=1)

    df_Fidesz_ent.sort_values(["prob_of_entr"], inplace=True)

    df_Fidesz_ent_2018 = load_output("Fidesz_entr_prob_2018.csv")
    df_Fidesz_ent_2014 = load_2014_data()

    df_comparative = pd.merge(df_Fidesz_ent, df_Fidesz_ent_2018,
                              how="inner", on=["Settlement"])
    df_Fidesz_ent_2014.columns = [
        column + "_z" if column != "Settlement" else column
        for column in df_Fidesz_ent_2014.columns
    ]
    df_comparative = pd.merge(df_comparative, df_Fidesz_ent_2014,
                              how="inner", on=["Settlement"], suffixes=["", "_z"])

    df_comparative.columns = ["Settlement", "ld_Entropy_2019", "count_2019",
                              "min_votes_2019", "p_2019",
                              "ld_entropy_2018", "min_votes_2018",
                              "count_2018", "p_2018",
                              "ld_Entropy_2014", "count_2014",
                              "min_votes_2014", "p_2014"]

    df_comparative["p_all_3"] = \
        df_comparative.p_2019 * df_comparative.p_2018 * df_comparative.p_2014
    df_comparative["p_all_2"] = df_comparative.p_2019 * df_comparative.p_2018
    df_comparative["p_both_incorr"] = \
        (1 - df_comparative.p_2019) * (1 - df_comparative.p_2018)
    df_comparative.sort_values(["p_all_2"], inplace=True)

    # p_both: prob. of both being 'regular'
    # p_both_incorr: prob. of both being 'irregular'

    suspects = df_comparative.loc[(df_comparative.p_all_2 < 0.1) &
                                  (df_comparative.min_votes_2019 >= 100) &
                                  (df_comparative.count_2018 >= 8)]
    df_comparative_sorted_by_both = \
        df_comparative.sort_values(["p_both_incorr"], ascending=[False])
    suspects2 = df_comparative_sorted_by_both.loc[
        (df_comparative_sorted_by_both.p_both_incorr >= 0.5) &
        (df_comparative_sorted_by_both.min_votes_2019 >= 100) &
        (df_comparative_sorted_by_both.count_2018 >= 8)
    ]

    df_comparative_sorted_by_all_3 = df_comparative.sort_values(["p_all_3"])
    suspects3 = df_comparative_sorted_by_all_3.loc[
        (df_comparative_sorted_by_all_3.p_all_3 <= 0.1) &
        (df_comparative_sorted_by_all_3.min_votes_2019 >= 100) &
        (df_comparative_sorted_by_all_3.count_2018 >= 8)
    ]

    save_results(df_comparative, suspects,
                 df_comparative_sorted_by_both, suspects2,
                 df_comparative_sorted_by_all_3, suspects3)
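
get_entropy and calc_prob are not shown in this section. Given that get_entropy aggregates the ld_Fidesz column, it is presumably an empirical entropy of the last-digit distribution; a minimal sketch under that assumption (not the source's actual definition):

import numpy as np
import pandas as pd


def get_entropy(values):
    # Empirical Shannon entropy (in nats) of the observed value
    # distribution; assumed behavior, sketched for illustration only.
    counts = values.value_counts()
    p = counts / counts.sum()
    return float(-(p * np.log(p)).sum())


# A uniform digit sample has higher entropy than a skewed one:
print(get_entropy(pd.Series([0, 1, 2, 3])))  # log(4) ~ 1.386
print(get_entropy(pd.Series([7, 7, 7, 3])))  # ~ 0.562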