def IQR(regions=None, countries=None):
    """Find outlying regions in every non-descriptive column (IQR rule).

    A value is flagged as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of its column.

    Args:
        regions (pd.DataFrame, optional): Region data to work on. If not
            given, the frame returned by ``_src.regions_df()`` is used.
        countries (str or iterable of str, optional): Country or countries
            to restrict the data to, matched against the ``Country``
            column. ``None`` or an empty collection keeps all rows.

    Returns:
        dict: Maps each column name other than 'Region', 'Code' and
        'Country' to the rows of the (possibly subsetted) frame whose
        value in that column is an outlier. Keys follow the column order.
    """
    # fetch region dataframe
    if regions is None:
        regions = _src.regions_df()

    # subset country
    if countries is not None:
        # a bare string would otherwise be split into single characters
        if isinstance(countries, str):
            countries = [countries]
        # materialize first: truth-testing a Series/ndarray directly raises
        countries = list(countries)
        if countries:
            regions = regions[regions.Country.isin(countries)]

    descriptive_cols = {'Region', 'Code', 'Country'}

    def _IQR(x):
        """Return a boolean mask of the IQR outliers in vector x."""
        # np.percentile does not need sorted input
        q1, q3 = np.percentile(x, [25, 75])
        iqr = q3 - q1
        low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        return (x < low) | (x > high)

    # skip descriptive columns; iterate in column order so the result
    # has a deterministic key order
    return {
        c: regions[_IQR(regions[c])]
        for c in regions.columns
        if c not in descriptive_cols
    }
def popdensity_boxplot_noPRG(name=None, regions_df=None):
    """Draw the population-density plot with Prague (code "CZ010") left out.

    Args:
        name (optional): Forwarded unchanged to ``popdensity_boxplot``.
        regions_df (pd.DataFrame, optional): Region data to plot. If not
            given, the frame returned by ``_src.regions_df()`` is used.
    """
    # fall back to the full region data
    if regions_df is None:
        regions_df = _src.regions_df()

    # keep every region except the one coded CZ010 (Prague)
    without_prague = regions_df[regions_df.Code != "CZ010"]

    # delegate the actual drawing
    popdensity_boxplot(name=name, regions_df=without_prague)
def administrative_divisions_similar(pi=True, alpha=.05):
    """Test whether pairs of countries have equal region means.

    For each country pair and each of population, area and density, a
    two-sample t-test is run on the regions of both countries. A Levene
    test is run first; its p-value compared with ``alpha`` decides whether
    the t-test assumes equal variances. Only the t-test p-value is kept.

    H0: mu1 = mu2
    HA: mu1 != mu2

    Args:
        pi (bool): If True, return the p-values. If False, return for each
            test whether the p-value exceeds ``alpha`` (H0 not rejected).
            Defaults to True.
        alpha (float): Significance level.

    Returns:
        pd.DataFrame: Columns 'Country1', 'Country2' and one column per
        attribute, holding p-values or booleans depending on ``pi``.
    """
    attributes = ['Population', 'Area', 'Density']
    regions_df = _src.regions_df()
    # frame of country pairs; the loop below reads its Country1/Country2
    # columns and fills in the attribute columns
    pi_df = _src.regions_countries_pairs(attributes)

    def _attribute_of(country, attribute):
        """Select one attribute for all regions of a single country."""
        return regions_df[regions_df.Country == country][attribute]

    for attribute in attributes:
        for idx, pair in pi_df.iterrows():
            first = _attribute_of(pair.Country1, attribute)
            second = _attribute_of(pair.Country2, attribute)

            # variance-equality pre-test picks the t-test variant
            _, levene_p = stats.levene(first, second)
            ttest = stats.ttest_ind(first, second,
                                    equal_var=levene_p > alpha)

            pi_df.at[idx, attribute] = ttest.pvalue

    # raw p-values requested
    if pi:
        return pi_df

    # otherwise report whether H0 holds at the given level
    country_cols = pi_df[['Country1', 'Country2']]
    h0_holds = pi_df[attributes] > alpha
    decisions = pd.concat([country_cols, h0_holds],
                          axis=1, ignore_index=True)
    decisions.columns = ['Country1', 'Country2', *attributes]
    return decisions
def regions_normally_distributed(pi=True, alpha=.05):
    """Test per country whether its regions are normally distributed.

    Runs the Shapiro-Wilk test on population, area and density of the
    regions of every country.

    H0: Regions of the country are normally distributed in the attribute.
    HA: They are not.

    Args:
        pi (bool): If True, return the p-values. If False, return for each
            test whether the p-value exceeds ``alpha`` (H0 not rejected).
            Defaults to True.
        alpha (float): Significance level.

    Returns:
        pd.DataFrame: Column 'Country' and one column per attribute,
        holding p-values or booleans depending on ``pi``.
    """
    attributes = ['Population', 'Area', 'Density']
    regions_df = _src.regions_df()
    countries = regions_df.Country.unique()

    # one row per country, attribute cells start out empty
    n_countries = len(countries)
    columns = {'Country': list(countries)}
    for attribute in attributes:
        columns[attribute] = [None] * n_countries
    pi_df = pd.DataFrame(columns)

    # fill in the Shapiro-Wilk p-value for each country/attribute
    for attribute in attributes:
        for idx, row in pi_df.iterrows():
            sample = regions_df[regions_df.Country == row.Country][attribute]
            pi_df.at[idx, attribute] = stats.shapiro(sample).pvalue

    # raw p-values requested
    if pi:
        return pi_df

    # otherwise report whether H0 holds at the given level
    h0_holds = pi_df[attributes] > alpha
    decisions = pd.concat([pi_df[['Country']], h0_holds],
                          axis=1, ignore_index=True)
    decisions.columns = ['Country', *attributes]
    return decisions
def area_population_scatter(name=None, regions_df=None):
    """Draw a joint plot of region area against population, colored by country.

    Args:
        name (optional): Target passed to ``plt.savefig``. If None, the
            figure is shown with ``plt.show()`` instead of being saved.
        regions_df (pd.DataFrame, optional): Region data to plot. If not
            given, the frame returned by ``_src.regions_df()`` is used.
    """
    # fixed axis ranges applied to the marginal axes
    area_limits = (-15_000.0, 125_000.0)
    population_limits = (-500_000, 6_000_000)

    # fall back to the full region data
    if regions_df is None:
        regions_df = _src.regions_df()

    grid = sns.jointplot(data=regions_df, x="Area", y="Population",
                         hue="Country")
    grid.ax_marg_x.set_xlim(*area_limits)
    grid.ax_marg_y.set_ylim(*population_limits)

    # save when a target is given, otherwise display
    if name is not None:
        plt.savefig(name)
        return
    plt.show()
def popdensity_boxplot(name=None, regions_df=None):
    """Draw a per-country violin plot of region density with data points.

    Each region is additionally drawn as a black point on top of the
    violins.

    Args:
        name (optional): Target passed to ``plt.savefig``. If None, the
            figure is shown with ``plt.show()`` instead of being saved.
        regions_df (pd.DataFrame, optional): Region data to plot. If not
            given, the frame returned by ``_src.regions_df()`` is used.
    """
    # default data if not given
    if regions_df is None:
        regions_df = _src.regions_df()

    # NOTE: this updates the global rcParams, so the font size stays
    # changed for plots drawn after this call as well
    plt.rcParams.update({'font.size': 20})

    sns.violinplot(x="Country", y="Density", data=regions_df)
    sns.stripplot(x="Country", y="Density", color='black', size=6,
                  alpha=0.8, data=regions_df)

    if name is None:
        plt.show()
    else:
        # save to the requested target; previously the hard-coded
        # "density.png" was written regardless of `name`
        plt.savefig(name)