]] df = pd.concat([df, missing]) df = df[~df.continent_code.isnull()].reset_index(drop=True) df["level"] = "country" df = df.rename( columns={ "code_col": "code", "name_es_col": "name_es", "continent_code": "parent_code", }) assert df.loc[6, "name"] is pd.np.nan df.loc[6, "name"] = u"Netherlands Antilles" regions = pd.read_table( "../Mexico/in/Mexico Country codes - continents - Continents - Regions.tsv", encoding="utf-8", ) df = pd.concat([df, regions]).reset_index(drop=True) h = Hierarchy(["region", "country"]) parent_id_table = parent_code_table_to_parent_id_table(df, h) c = Classification(parent_id_table, h) c.to_csv("out/locations_international_dane.csv") # c.to_stata("out/locations_international_dane.dta")
) if __name__ == "__main__": names = pd.read_table("./in/AgProducts_Names.tsv", encoding="utf-8", dtype={"code": str}) hierarchy = pd.read_table("./in/AgProducts_Hierarchy.tsv", encoding="utf-8") hierarchy.columns = [ "level3_code", "level2_code", "level1_code", "level0_code" ] fields = {"level0": [], "level1": [], "level2": [], "level3": []} h = Hierarchy(["level0", "level1", "level2", "level3"]) parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields) parent_code_table.code = parent_code_table.code.astype(str) parent_code_table = parent_code_table.merge(names, on=["code", "level"]) parent_id_table = parent_code_table_to_parent_id_table( parent_code_table, h) parent_id_table["name"] = parent_id_table.name_en parent_id_table = parent_id_table[[ "code", "name", "level", "name_en", "name_es", "parent_id" ]] c = Classification(parent_id_table, h) c.table.code = c.table.code.str.lower()
# Four-digit products: the parent is the two-digit prefix of the code.
four_digit["parent_code"] = four_digit.code.apply(lambda x: x[:2])
four_digit = four_digit.drop("community", axis=1)
four_digit["level"] = "4digit"

# Two-digit rows are taken by position from hs4 (rows 1241 to 1338).
# .copy() makes the frame independent of hs4, so the column assignments
# below do not operate on a slice (no SettingWithCopyWarning, and hs4 is
# guaranteed to stay untouched for the `section` selection further down).
two_digit = hs4.iloc[1241:1339].copy()
two_digit["code"] = two_digit.code.astype(str).str.zfill(2)
# For two-digit rows the "community" column is used as the parent code.
two_digit = two_digit.rename(columns={"community": "parent_code"})
two_digit["parent_code"] = two_digit.parent_code.astype(str).str.zfill(3)
two_digit["level"] = "2digit"

# Everything from row 1339 onwards is a section; sections have no parent.
section = hs4.iloc[1339:].drop("community", axis=1)
section["code"] = section.code.astype(str).str.zfill(3)
section["parent_code"] = None
section["level"] = "section"

# Stack the three levels from the top of the hierarchy down.
hs_clean = pd.concat([section, two_digit, four_digit])
hs_clean = hs_clean.reset_index(drop=True)

h = Hierarchy(["section", "2digit", "4digit"])
hs_clean = parent_code_table_to_parent_id_table(hs_clean, h)
c = Classification(hs_clean, h)

# community = pd.read_table("in/hs4_community.tsv", encoding="utf-8")
# hs4 = hs4.merge(community, left_on="community", right_on="code", how="inner")

# weird bug where pandas infer_type was returning mixed instead of string
c.table.code = c.table.code.astype(str)

c.to_csv("out/hs92_atlas.csv")
c.to_stata("out/hs92_atlas.dta")
# Read the DIVIPOLA export and label its columns.
df = pd.read_table("in/DIVIPOLA_20150331.txt", encoding="utf-16")
df.columns = [
    "department_code",
    "municipality_code",
    "population_center_code",
    "department_name",
    "municipality_name",
    "population_center_name",
    "population_center_type",
    "longitude",
    "",
    "latitude",
    "district",
    "municipality_type",
    "metro_area",
]

# Only the code/name pair of each level is needed.
kept_columns = [
    "department_code", "department_name",
    "municipality_code", "municipality_name",
    "population_center_code", "population_center_name",
]
df = df[kept_columns]

# Codes become zero-padded strings of a fixed width per level.
code_widths = (
    ("department_code", 2),
    ("municipality_code", 5),
    ("population_center_code", 8),
)
for column, width in code_widths:
    df[column] = df[column].astype(str).str.zfill(width)

# Names are title-cased.
for column in ("department_name", "municipality_name",
               "population_center_name"):
    df[column] = df[column].str.title()

h = Hierarchy(["department", "municipality", "population_center"])

parent_code_table = repeated_table_to_parent_id_table(df, h)
parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

c = Classification(parent_id_table, h)

c.to_csv("out/locations_colombia_dane.csv")
c.to_stata("out/locations_colombia_dane.dta")
if __name__ == "__main__":
    # Exactly two arguments are required: the input file and the prefix of
    # the output files. An explicit check is used rather than `assert` so the
    # validation also runs under `python -O` and prints a usage message.
    if len(sys.argv) != 3:
        sys.exit("usage: {0} <input_file> <new_file_prefix>".format(
            sys.argv[0]))

    file_name = sys.argv[1]
    new_file_prefix = sys.argv[2]

    df = pd.read_table(file_name, encoding="utf-16")
    df = parse_dane(df)

    # Keep only the first row seen for each code.
    df = df[~df.duplicated(["code"])]
    df = df.reset_index(drop=True)
    df.columns = ["name", "level", "code"]

    df.name = df.name.str.title()

    from classification import (parent_code_table_to_parent_id_table,
                                Classification, Hierarchy,
                                ordered_table_to_parent_code_table)

    h = Hierarchy(DANE_HIERARCHY)
    df = ordered_table_to_parent_code_table(df, h)
    df = parent_code_table_to_parent_id_table(df, h)
    c = Classification(df, h)

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv(new_file_prefix + ".csv")
    c.to_stata(new_file_prefix + ".dta")
from classification import (
    Hierarchy,
    repeated_table_to_parent_id_table,
    parent_code_table_to_parent_id_table,
    spread_out_entries,
    sort_by_code_and_level,
    Classification,
)

if __name__ == "__main__":
    df = pd.read_csv("./in/NACE_Rev2_custom_hierarchy.csv")
    df.columns = ["level", "code", "parent_code", "name"]

    # The level column holds a depth number; replace it with the level name.
    df["level"] = df["level"].astype(str)
    depth_to_level = (("1", "section"), ("2", "division"), ("3", "group"))
    for depth, level_name in depth_to_level:
        df.loc[df.level == depth, "level"] = level_name

    h = Hierarchy([level_name for _, level_name in depth_to_level])

    ordered = sort_by_code_and_level(df, h)
    with_parent_ids = parent_code_table_to_parent_id_table(ordered, h)

    # Starting offset handed to spread_out_entries for each level.
    level_starts = {"section": 0, "division": 100, "group": 300}
    spread = spread_out_entries(with_parent_ids, level_starts, h)

    c = Classification(spread, h)
    c.to_csv("./out/nace_industries.csv")
Classification) if __name__ == "__main__": sinco = pd.read_csv("in/SINCO_2011.csv", header=None, encoding="latin-1") sinco.columns = ["data"] sinco = sinco[~sinco.data.str.startswith("INEGI.")] sinco = sinco[~sinco.data.str.startswith(u"Clave Descripción")] for index, row in reversed( list(sinco[~sinco.data.str.match("^\d* ")].iterrows())): sinco.ix[index - 1] += (" " + sinco.ix[index]) sinco = sinco[sinco.data.str.match("^\d* ")] sinco = sinco.data.str.split(" ", 1).apply(pd.Series, 1) sinco.columns = ["code", "name"] sinco["level"] = sinco["code"].apply(lambda x: str(len(x)) + "digit") h = Hierarchy(["1digit", "2digit", "3digit", "4digit"]) parent_code_table = ordered_table_to_parent_code_table(sinco, h) parent_id_table = parent_code_table_to_parent_id_table( parent_code_table, h) c = Classification(parent_id_table, h) c.to_csv("out/occupations_sinco_2011.csv") c.to_stata("out/occupations_sinco_2011.dta")
repeated_table_to_parent_id_table, parent_code_table_to_parent_id_table, Classification, ) if __name__ == "__main__": names = pd.read_table("./in/Livestock_Names.tsv", encoding="utf-8", dtype={"code": str}) hierarchy = pd.read_table("./in/Livestock_Hierarchy.tsv", encoding="utf-8") hierarchy.columns = ["level1_code", "level0_code"] fields = {"level0": [], "level1": []} h = Hierarchy(["level0", "level1"]) parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields) parent_code_table.code = parent_code_table.code.astype(str) parent_code_table = parent_code_table.merge(names, on=["code", "level"]) parent_id_table = parent_code_table_to_parent_id_table( parent_code_table, h) parent_id_table["name"] = parent_id_table.name_en parent_id_table = parent_id_table[[ "code", "name", "level", "name_en", "name_es",
"city_name", "city_code" ]] metro_areas.columns = ["name", "code"] metro_areas["parent_code"] = metro_areas.code.str.slice(0, 2) metro_areas["level"] = "msa" df.loc[df.level == "department", "parent_code"] = "COL" df = pd.concat([pd.DataFrame(colombia).T, df, metro_areas]) df = df.sort(["level", "code"], ascending=True) df = df.reset_index(drop=True) h = Hierarchy(["country", "department", "msa", "municipality"]) parent_id_table = parent_code_table_to_parent_id_table(df, h) parent_id_table["name_es"] = parent_id_table.name parent_id_table["name_short_en"] = parent_id_table.name parent_id_table["name_short_es"] = parent_id_table.name # Work around issue where parent_code_table_to_parent_id_table breaks # because the parent of munis are not msas depts = df[df.level == "department"] depts = depts[["code"]].reset_index().set_index("code") lookup_table = depts.to_dict()["index"] def fill_parents(row): if row.level == "municipality" and pd.isnull(row.parent_id): row.parent_id = lookup_table[row.code[:2]] return row
# Level names, ordered from the top of the hierarchy downwards.
levels = ["state", "municipality", "locality"]

# Make each code globally unique by prefixing it with its parent's code.
# Order matters: the locality code uses the already-prefixed municipality code.
df["municipality_code"] = df["state_code"] + df["municipality_code"]
df["locality_code"] = df["municipality_code"] + df["locality_code"]

# Title-case the name of every level.
for level in levels:
    df[level + "_name"] = df[level + "_name"].str.title()

# "<level>_name" becomes "name_en_<level>".
df = df.rename(
    columns={level + "_name": "name_en_" + level for level in levels})

h = Hierarchy(levels)

parent_code_table = repeated_table_to_parent_id_table(
    df,
    h,
    level_fields={level: ["name_en_" + level] for level in levels},
)

# TODO: This isn't the official classification level name but this makes
# compatibility between colombia and mexico way easier
is_state = parent_code_table.level == "state"
parent_code_table.loc[is_state, "level"] = "department"
"latitude", "longitude", "altitude", "map_code", "ambito", "population_total", "population_male", "population_female", "dwellings_occupied"] df = df[["state_code", "state_name", "municipality_code", "municipality_name", "locality_code", "locality_name"]] df.state_code = df.state_code.astype(str).str.zfill(2) df.municipality_code = df.municipality_code.astype(str).str.zfill(3) df.locality_code = df.locality_code.astype(str).str.zfill(4) df.municipality_code = df.state_code + df.municipality_code df.locality_code = df.municipality_code + df.locality_code df.state_name = df.state_name.str.title() df.municipality_name = df.municipality_name.str.title() df.locality_name = df.locality_name.str.title() h = Hierarchy(["state", "municipality", "locality"]) parent_code_table = repeated_table_to_parent_id_table(df, h) parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h) c = Classification(parent_id_table, h) c.to_csv("out/locations_mexico_inegi.csv") c.to_stata("out/locations_mexico_inegi.dta")
selected_rows = df.name_english.str.contains(regex) df["tag_en_" + name] = False df.loc[selected_rows, "tag_en_" + name] = True df.name_english = df.name_english.map(lambda x: re.sub(regex, "", x)) for name, regex in regexes.items(): selected_rows = df.name_spanish.str.contains(regex) df["tag_sp_" + name] = False df.loc[selected_rows, "tag_sp_" + name] = True df.name_spanish = df.name_spanish.map(lambda x: re.sub(regex, "", x)) # Replace trailing comma and space df.name_spanish = df.name_spanish.str.replace(", $", "") df.name_english = df.name_english.str.replace(", $", "") h = Hierarchy( ["twodigit", "threedigit", "fourdigit", "fivedigit", "sixdigit"]) df.loc[df.code.str.len() == 2, "level"] = "twodigit" df.loc[df.code.str.len() == 3, "level"] = "threedigit" df.loc[df.code.str.len() == 4, "level"] = "fourdigit" df.loc[df.code.str.len() == 5, "level"] = "fivedigit" df.loc[df.code.str.len() == 6, "level"] = "sixdigit" spanish = df[["code", "level", "name_spanish"]] spanish.columns = ["code", "level", "name_es"] # make sure this is the hand-fixed version assert df.loc[304, "code"] == '31' df = df[["code", "name_english", "level"]] df.columns = ["code", "name", "level"]
row["level"] = "country" row["parent_code"] = pd.np.nan elif row.code.endswith("0000"): row["level"] = "department" row["parent_code"] = "000000" elif row.code.endswith("00"): row["level"] = "province" row["parent_code"] = row["code"][:2] + "0000" else: row["level"] = "district" row["parent_code"] = row["code"][:4] + "00" return row df = df.apply(fix_levels, axis=1) h = Hierarchy(["country", "department", "province", "district"]) df.level = df.level.astype("category", categories=h, ordered=True) df = df.sort_values(by=["level", "code"]) df.level = df.level.astype(str) df = df.reset_index(drop=True) parent_id_table = parent_code_table_to_parent_id_table(df, h) # TODO: This isn't the official classification level name but this makes # compatibility between colombia and mexico way easier # parent_code_table.loc[parent_code_table.level == "state", "level"] = "department" # Drop the "locality" level since we don't use it # parent_code_table = parent_code_table[parent_code_table.level != "locality"]
import pandas as pd from classification import ( Hierarchy, repeated_table_to_parent_id_table, parent_code_table_to_parent_id_table, Classification, ) if __name__ == "__main__": df = pd.read_table("in/Col_occupations_SOC_2010 - Hierarchy.tsv", encoding="utf-8") h = Hierarchy([ "major_group", "minor_group", "broad_occupation", "detailed_occupation" ]) fields = { "major_group": ["name_en_major_group", "name_es_major_group"], "minor_group": [ "name_en_minor_group", "name_es_minor_group", "name_short_es_minor_group", ], "broad_occupation": ["name_en_broad_occupation", "name_es_broad_occupation"], "detailed_occupation": [ "name_en_detailed_occupation", "name_es_detailed_occupation", "name_short_es_detailed_occupation", ],
return x trans = trans.apply(fill_code, axis=1) # Prospedia specific trans = trans[trans.level != "section"] df = pd.read_table("./in/prospedia_hs_structure.txt") df.columns = ["4digit_code", "2digit_code", "prospedia_section_code"] df["4digit_code"] = df["4digit_code"].astype(str).str.zfill(4) df["4digit_name"] = None df["2digit_code"] = df["2digit_code"].astype(str).str.zfill(2) df["2digit_name"] = None df["prospedia_section_name"] = None df["prospedia_section_code"] = df["prospedia_section_code"].astype(str).str.zfill(1) h = Hierarchy(["prospedia_section", "2digit", "4digit"]) parent_code_table = repeated_table_to_parent_id_table(df, h) parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h) parent_id_table = parent_id_table.merge(trans, on=["level", "code"]) parent_id_table.name = parent_id_table.name_en assert parent_id_table.name.isnull().sum() == 3 parent_id_table.loc[parent_id_table.name.isnull(), "name"] = u"No name" assert parent_id_table.name.isnull().sum() == 0 c = Classification(parent_id_table, h) c.to_csv("out/products_mexico_prospedia.csv") c.to_stata("out/products_mexico_prospedia.dta")
"./in/col_industry_name_category_master - Hierarchy.tsv", encoding="utf-8") df.class_code = df.class_code.astype(int).astype(str).str.zfill(4) df.division_code = df.division_code.astype(int).astype(str).str.zfill(2) df.section_code = df.section_code.astype(int).astype(str).str.zfill(1) names = pd.read_table("./in/col_industry_name_category_master - Names.tsv", encoding="utf-8") names.loc[names.level == "section", "code"] = names.code.astype(str) names.loc[names.level == "division", "code"] = names.code.astype(str).str.zfill(2) names.loc[names.level == "class", "code"] = names.code.astype(str).str.zfill(4) h = Hierarchy(["section", "division", "class"]) parent_code_table = (repeated_table_to_parent_id_table( df, h, level_fields={ "section": [], "division": [], "class": [] }).sort_values(by=["level", "code"]).reset_index(drop=True)) parent_id_table = parent_code_table_to_parent_id_table( parent_code_table, h) parent_id_table = parent_id_table.merge(names) parent_id_table["name"] = parent_id_table.name_en c = Classification(parent_id_table, h)