예제 #1
0
def IQR(regions=None, countries=None):
    """Find outlying regions in every non-descriptive column (IQR rule).

    A value counts as an outlier when it lies more than 1.5 times the
    interquartile range below the first quartile or above the third
    quartile. Quartiles are computed per column over the selected rows.

    Args:
        regions (pd.DataFrame, optional): Data to work on. If not given,
            the frame returned by ``_src.regions_df()`` is used.
        countries (str or iterable of str, optional): Country value(s) to
            restrict the data to, matched against the ``Country`` column.
            ``None``, an empty string or an empty collection means no
            restriction.

    Returns:
        dict: Maps each column name other than ``Region``, ``Code`` and
        ``Country`` (in the frame's column order) to the sub-frame of rows
        flagged as outliers in that column.
    """
    # fetch region dataframe
    regions = _src.regions_df() if regions is None else regions

    # subset country
    if countries is not None:
        if isinstance(countries, str):
            # a bare string is one country, not an iterable of characters
            countries = [countries] if countries else []
        # materialise before the emptiness check: numpy arrays and pandas
        # Series have no unambiguous truth value
        countries = list(countries)
        if countries:
            regions = regions[regions.Country.isin(countries)]

    descriptive_cols = {'Region', 'Code', 'Country'}

    def _IQR(x):
        """Return a boolean mask marking the IQR outliers of vector ``x``."""
        if len(x) == 0:
            # nothing to compute quartiles from; empty mask, same index
            return x.astype(bool)
        q1, q3 = np.percentile(x, [25, 75])
        iqr = q3 - q1
        low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        return (x < low) | (x > high)

    # keep the frame's column order so the result is deterministic
    value_cols = [c for c in regions.columns if c not in descriptive_cols]

    # outliers per column
    return {c: regions[_IQR(regions[c])] for c in value_cols}
예제 #2
0
def popdensity_boxplot_noPRG(name=None, regions_df=None):
    """Draw the population-density plot with Prague (code ``CZ010``) left out.

    Args:
        name: Forwarded unchanged to ``popdensity_boxplot``.
        regions_df (pd.DataFrame, optional): Region data with a ``Code``
            column. If not given, ``_src.regions_df()`` is used.
    """
    # fall back to the default data set
    if regions_df is None:
        regions_df = _src.regions_df()

    # drop the Prague row(s) before plotting
    without_prague = regions_df[regions_df.Code != "CZ010"]

    popdensity_boxplot(name=name, regions_df=without_prague)
예제 #3
0
def administrative_divisions_similar(pi=True, alpha=.05):
    """Compare country pairs on the means of their regions' attributes.

    For every country pair and each of population, area and density, a
    two-sample t-test is run on the regions of the two countries. Levene's
    test on the same samples decides beforehand whether the t-test assumes
    equal variances. Only the t-test outcome is reported.

    H0: mu1 = mu2
    HA: mu1 != mu2

    Args:
        pi (bool): If True (the default), return the p-values.
                   If False, return for each test whether its p-value
                   exceeds ``alpha``.
        alpha (float): Significance level, used both for the variance
                       pre-test and for the final decision.

    Returns:
        pd.DataFrame: One row per country pair with columns ``Country1``,
        ``Country2`` and one column per attribute.
    """
    attributes = ['Population', 'Area', 'Density']
    regions_df = _src.regions_df()
    pi_df = _src.regions_countries_pairs(attributes)

    # one t-test per (attribute, country pair)
    for attribute in attributes:
        for idx, pair in pi_df.iterrows():
            # samples: the attribute over the regions of either country
            in_first = regions_df.Country == pair.Country1
            in_second = regions_df.Country == pair.Country2
            sample1 = regions_df[in_first][attribute]
            sample2 = regions_df[in_second][attribute]

            # variance pre-test (Levene) picks the t-test variant
            _, variance_pvalue = stats.levene(sample1, sample2)
            same_variance = variance_pvalue > alpha

            # the test itself; store its p-value
            ttest = stats.ttest_ind(sample1, sample2, equal_var=same_variance)
            pi_df.at[idx, attribute] = ttest.pvalue

    # p-values requested
    if pi:
        return pi_df

    # otherwise turn p-values into decisions
    country_cols = ['Country1', 'Country2']
    decisions = pd.concat(
        [pi_df[country_cols], pi_df[attributes] > alpha],
        axis=1,
        ignore_index=True)
    decisions.columns = country_cols + attributes
    return decisions
예제 #4
0
def regions_normally_distributed(pi=True, alpha=.05):
    """Test per country whether its regions' attributes look normal.

    Runs the Shapiro-Wilk test on the population, area and density of the
    regions of every country.

    H0: Regions of country are distributed normally in the attribute.
    HA: They are not.

    Args:
        pi (bool): If True (the default), return the p-values.
                   If False, return for each test whether its p-value
                   exceeds ``alpha``.
        alpha (float): Significance level.

    Returns:
        pd.DataFrame: One row per country with a ``Country`` column and
        one column per attribute.
    """
    attributes = ['Population', 'Area', 'Density']
    regions_df = _src.regions_df()

    # result frame: one row per distinct country, attribute cells unset
    countries = regions_df.Country.unique()
    placeholders = {
        attribute: [None] * len(countries) for attribute in attributes
    }
    pi_df = pd.DataFrame({'Country': list(countries), **placeholders})

    # one Shapiro-Wilk test per (attribute, country)
    for attribute in attributes:
        for idx, row in pi_df.iterrows():
            of_country = regions_df.Country == row.Country
            sample = regions_df[of_country][attribute]
            pi_df.at[idx, attribute] = stats.shapiro(sample).pvalue

    # p-values requested
    if pi:
        return pi_df

    # otherwise turn p-values into decisions
    decisions = pd.concat(
        [pi_df[['Country']], pi_df[attributes] > alpha],
        axis=1,
        ignore_index=True)
    decisions.columns = ['Country'] + attributes
    return decisions
예제 #5
0
def area_population_scatter(name=None, regions_df=None):
    """Draw a joint plot of region area against population, coloured by country.

    Args:
        name (optional): Target passed to ``plt.savefig``. If None, the
            figure is shown with ``plt.show()`` instead of being saved.
        regions_df (pd.DataFrame, optional): Data with ``Area``,
            ``Population`` and ``Country`` columns. If not given,
            ``_src.regions_df()`` is used.
    """
    # fall back to the default data set
    if regions_df is None:
        regions_df = _src.regions_df()

    # joint plot with one hue per country
    grid = sns.jointplot(
        data=regions_df,
        x="Area",
        y="Population",
        hue="Country",
    )

    # fixed axis limits, applied through the marginal axes
    grid.ax_marg_x.set_xlim(-1.5 * 10**4, 1.25 * 10**5)
    grid.ax_marg_y.set_ylim(-5 * 10**5, 6 * 10**6)

    # save when a target is given, otherwise display
    if name is not None:
        plt.savefig(name)
    else:
        plt.show()
예제 #6
0
def popdensity_boxplot(name=None, regions_df=None):
    """Plot region density per country as a violin plot with the data points.

    Args:
        name (optional): Target passed to ``plt.savefig``. If None, the
            figure is shown with ``plt.show()`` instead of being saved.
        regions_df (pd.DataFrame, optional): Data with ``Country`` and
            ``Density`` columns. If not given, ``_src.regions_df()`` is
            used.
    """
    # default data if not given
    regions_df = regions_df if regions_df is not None else _src.regions_df()

    # NOTE: this updates matplotlib's global rcParams, not just this figure
    plt.rcParams.update({'font.size': 20})

    # distribution per country, with the individual regions drawn on top
    sns.violinplot(x="Country", y="Density", data=regions_df)
    sns.stripplot(x="Country",
                  y="Density",
                  color='black',
                  size=6,
                  alpha=0.8,
                  data=regions_df)

    if name is None:
        plt.show()
    else:
        # save under the requested name (previously always "density.png",
        # which silently ignored the argument)
        plt.savefig(name)