Ejemplo n.º 1
0
    ]]

    # Append the rows held in `missing`, then keep only the entries that carry
    # a continent code.
    df = pd.concat([df, missing])
    df = df[~df.continent_code.isnull()].reset_index(drop=True)

    df["level"] = "country"

    df = df.rename(
        columns={
            "code_col": "code",
            "name_es_col": "name_es",
            "continent_code": "parent_code",
        })

    # Sanity check before the hand fix below: row 6 must still be missing its
    # name. pd.isnull is used rather than an identity comparison with
    # `pd.np.nan`: the `pd.np` alias no longer exists in current pandas, and
    # `is` only matches one specific NaN object.
    assert pd.isnull(df.loc[6, "name"])
    df.loc[6, "name"] = u"Netherlands Antilles"

    # Region rows are read from the Mexico inputs and stacked under the
    # country rows.
    regions = pd.read_table(
        "../Mexico/in/Mexico Country codes - continents - Continents - Regions.tsv",
        encoding="utf-8",
    )
    df = pd.concat([df, regions]).reset_index(drop=True)

    h = Hierarchy(["region", "country"])
    parent_id_table = parent_code_table_to_parent_id_table(df, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_international_dane.csv")
    # c.to_stata("out/locations_international_dane.dta")
Ejemplo n.º 2
0
)

if __name__ == "__main__":
    # Build the agricultural products classification from a names table and a
    # hierarchy table read from ./in.

    # Codes are read as strings so they can be merged on "code" below.
    names = pd.read_table("./in/AgProducts_Names.tsv",
                          encoding="utf-8",
                          dtype={"code": str})

    hierarchy = pd.read_table("./in/AgProducts_Hierarchy.tsv",
                              encoding="utf-8")
    # Rename the four columns positionally, deepest level (level3) first.
    hierarchy.columns = [
        "level3_code", "level2_code", "level1_code", "level0_code"
    ]

    # No extra per-level fields are taken from the hierarchy table.
    fields = {"level0": [], "level1": [], "level2": [], "level3": []}

    h = Hierarchy(["level0", "level1", "level2", "level3"])
    parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
    # Cast to str so the merge key has the same dtype as names.code.
    parent_code_table.code = parent_code_table.code.astype(str)

    # Default (inner) merge: only (code, level) pairs present in both tables
    # are kept.
    parent_code_table = parent_code_table.merge(names, on=["code", "level"])

    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)
    # The English name doubles as the default display name.
    parent_id_table["name"] = parent_id_table.name_en

    # Keep only these columns, in this order.
    parent_id_table = parent_id_table[[
        "code", "name", "level", "name_en", "name_es", "parent_id"
    ]]

    c = Classification(parent_id_table, h)
    # Lower-case the codes in the classification table.
    c.table.code = c.table.code.str.lower()
Ejemplo n.º 3
0
    # Four-digit rows: the parent code is the first two characters of the
    # row's own code.
    four_digit["parent_code"] = four_digit.code.apply(lambda x: x[:2])
    four_digit = four_digit.drop("community", axis=1)
    four_digit["level"] = "4digit"

    # Two-digit rows live at fixed positions in hs4. Take an explicit copy so
    # the column assignments below write to an independent frame instead of a
    # slice of hs4 (avoids SettingWithCopyWarning and ambiguous write-through).
    two_digit = hs4.iloc[1241:1339].copy()
    two_digit["code"] = two_digit.code.astype(str).str.zfill(2)
    # For two-digit rows the "community" column holds the parent code.
    two_digit = two_digit.rename(columns={"community": "parent_code"})
    two_digit["parent_code"] = two_digit.parent_code.astype(str).str.zfill(3)
    two_digit["level"] = "2digit"

    # Remaining rows are the top level; drop() already returns a new frame.
    section = hs4.iloc[1339:].drop("community", axis=1)
    section["code"] = section.code.astype(str).str.zfill(3)
    section["parent_code"] = None
    section["level"] = "section"

    # Stack the levels from the top of the hierarchy down.
    hs_clean = pd.concat([section, two_digit, four_digit])
    hs_clean = hs_clean.reset_index(drop=True)

    h = Hierarchy(["section", "2digit", "4digit"])
    hs_clean = parent_code_table_to_parent_id_table(hs_clean, h)
    c = Classification(hs_clean, h)

    # community = pd.read_table("in/hs4_community.tsv", encoding="utf-8")
    # hs4 = hs4.merge(community, left_on="community", right_on="code", how="inner")

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv("out/hs92_atlas.csv")
    c.to_stata("out/hs92_atlas.dta")
Ejemplo n.º 4
0
    # Load the DIVIPOLA listing and name its columns positionally.
    df = pd.read_table("in/DIVIPOLA_20150331.txt", encoding="utf-16")
    df.columns = ["department_code", "municipality_code",
                  "population_center_code", "department_name",
                  "municipality_name", "population_center_name",
                  "population_center_type", "longitude", "", "latitude",
                  "district", "municipality_type", "metro_area"]

    # Keep only the code/name pairs of the three levels. The explicit copy
    # makes the column assignments below operate on an independent frame
    # rather than on a column subset of the original (SettingWithCopyWarning).
    df = df[["department_code", "department_name", "municipality_code",
             "municipality_name", "population_center_code",
             "population_center_name"]].copy()

    # Left-pad the codes with zeros to fixed widths of 2, 5 and 8 characters.
    df.department_code = df.department_code.astype(str).str.zfill(2)
    df.municipality_code = df.municipality_code.astype(str).str.zfill(5)
    df.population_center_code = df.population_center_code.astype(str).str.zfill(8)

    # Normalise the names to title case.
    df.department_name = df.department_name.str.title()
    df.municipality_name = df.municipality_name.str.title()
    df.population_center_name = df.population_center_name.str.title()

    h = Hierarchy(["department", "municipality", "population_center"])

    parent_code_table = repeated_table_to_parent_id_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_colombia_dane.csv")
    c.to_stata("out/locations_colombia_dane.dta")
Ejemplo n.º 5
0

if __name__ == "__main__":
    # Validate the command line explicitly. An assert would be stripped when
    # Python runs with -O and would give the user no hint about the expected
    # arguments.
    if len(sys.argv) != 3:
        sys.exit("usage: {} INPUT_FILE OUTPUT_PREFIX".format(sys.argv[0]))

    file_name = sys.argv[1]
    new_file_prefix = sys.argv[2]

    # Parse the raw listing, keep the first row for each code and name the
    # three resulting columns positionally.
    df = pd.read_table(file_name, encoding="utf-16")
    df = parse_dane(df)
    df = df[~df.duplicated(["code"])]
    df = df.reset_index(drop=True)
    df.columns = ["name", "level", "code"]

    df.name = df.name.str.title()

    from classification import (parent_code_table_to_parent_id_table,
                                Classification, Hierarchy,
                                ordered_table_to_parent_code_table)

    h = Hierarchy(DANE_HIERARCHY)
    df = ordered_table_to_parent_code_table(df, h)
    df = parent_code_table_to_parent_id_table(df, h)
    c = Classification(df, h)

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    # Write both output formats next to each other under the given prefix.
    c.to_csv(new_file_prefix + ".csv")
    c.to_stata(new_file_prefix + ".dta")
Ejemplo n.º 6
0
from classification import (
    Hierarchy,
    repeated_table_to_parent_id_table,
    parent_code_table_to_parent_id_table,
    spread_out_entries,
    sort_by_code_and_level,
    Classification,
)

if __name__ == "__main__":
    # Read the custom NACE hierarchy and name its four columns positionally.
    df = pd.read_csv("./in/NACE_Rev2_custom_hierarchy.csv")
    df.columns = ["level", "code", "parent_code", "name"]

    # Translate the level markers "1"/"2"/"3" into named hierarchy levels.
    df.level = df.level.astype(str)
    level_names = {"1": "section", "2": "division", "3": "group"}
    for marker, level_name in level_names.items():
        df.loc[df.level == marker, "level"] = level_name

    h = Hierarchy(list(level_names.values()))

    # Order the rows, then resolve parent codes into parent ids.
    df = parent_code_table_to_parent_id_table(sort_by_code_and_level(df, h), h)

    # Starting offset handed to spread_out_entries for each level.
    df = spread_out_entries(
        df, {"section": 0, "division": 100, "group": 300}, h)

    c = Classification(df, h)
    c.to_csv("./out/nace_industries.csv")
Ejemplo n.º 7
0
                            Classification)

if __name__ == "__main__":

    # Every line of the source file is loaded as a single text column.
    sinco = pd.read_csv("in/SINCO_2011.csv", header=None, encoding="latin-1")

    sinco.columns = ["data"]
    # Drop the lines that start with "INEGI." or with the column-title text.
    sinco = sinco[~sinco.data.str.startswith("INEGI.")]
    # Copy so the in-place edits below act on an independent frame rather than
    # on a filtered view of the original.
    sinco = sinco[~sinco.data.str.startswith(u"Clave Descripción")].copy()

    # An entry line starts with (optional) digits followed by a space. Raw
    # string: "\d" in a normal literal is an invalid escape sequence.
    entry_pattern = r"^\d* "

    # Lines that do not match are continuations of the preceding line. Walk
    # them bottom-up so that runs of several continuation lines collapse into
    # the entry above them. `.loc` replaces the removed `.ix` indexer; as
    # before, the preceding line is looked up by label `index - 1`, so a
    # KeyError is raised if that label was filtered out above.
    continuation_labels = sinco.index[~sinco.data.str.match(entry_pattern)]
    for index in reversed(continuation_labels.tolist()):
        sinco.loc[index - 1, "data"] += " " + sinco.loc[index, "data"]

    sinco = sinco[sinco.data.str.match(entry_pattern)]

    # Split on the first space only: code on the left, name on the right.
    # `n` is passed by keyword (positional use is rejected by current pandas)
    # and expand=True yields the two-column frame directly.
    sinco = sinco.data.str.split(" ", n=1, expand=True)
    sinco.columns = ["code", "name"]

    # The level name is derived from the number of characters in the code.
    sinco["level"] = sinco["code"].apply(lambda x: str(len(x)) + "digit")
    h = Hierarchy(["1digit", "2digit", "3digit", "4digit"])

    parent_code_table = ordered_table_to_parent_code_table(sinco, h)
    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/occupations_sinco_2011.csv")
    c.to_stata("out/occupations_sinco_2011.dta")
Ejemplo n.º 8
0
    repeated_table_to_parent_id_table,
    parent_code_table_to_parent_id_table,
    Classification,
)

if __name__ == "__main__":
    names = pd.read_table("./in/Livestock_Names.tsv",
                          encoding="utf-8",
                          dtype={"code": str})

    hierarchy = pd.read_table("./in/Livestock_Hierarchy.tsv", encoding="utf-8")
    hierarchy.columns = ["level1_code", "level0_code"]

    fields = {"level0": [], "level1": []}

    h = Hierarchy(["level0", "level1"])
    parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
    parent_code_table.code = parent_code_table.code.astype(str)

    parent_code_table = parent_code_table.merge(names, on=["code", "level"])

    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)
    parent_id_table["name"] = parent_id_table.name_en

    parent_id_table = parent_id_table[[
        "code",
        "name",
        "level",
        "name_en",
        "name_es",
Ejemplo n.º 9
0
                                    "city_name", "city_code"
                                ]]

    metro_areas.columns = ["name", "code"]

    # A metro area's parent code is the first two characters of its own code.
    metro_areas["parent_code"] = metro_areas.code.str.slice(0, 2)
    metro_areas["level"] = "msa"

    # Departments hang directly off the country row.
    df.loc[df.level == "department", "parent_code"] = "COL"

    df = pd.concat([pd.DataFrame(colombia).T, df, metro_areas])

    # DataFrame.sort was removed from pandas; sort_values is the replacement
    # and orders by the same columns.
    df = df.sort_values(["level", "code"], ascending=True)
    df = df.reset_index(drop=True)

    h = Hierarchy(["country", "department", "msa", "municipality"])
    parent_id_table = parent_code_table_to_parent_id_table(df, h)
    # Only one name is available, so it is reused for every name column.
    parent_id_table["name_es"] = parent_id_table.name
    parent_id_table["name_short_en"] = parent_id_table.name
    parent_id_table["name_short_es"] = parent_id_table.name

    # Work around issue where parent_code_table_to_parent_id_table breaks
    # because the parent of munis are not msas
    # Build a {department code -> row index in df} lookup.
    depts = df[df.level == "department"]
    depts = depts[["code"]].reset_index().set_index("code")
    lookup_table = depts.to_dict()["index"]

    def fill_parents(row):
        """Give a municipality row without a parent id its department's id.

        The id is looked up in `lookup_table` under the first two characters
        of the row's code. Every other row is returned untouched.
        """
        orphan_municipality = (row.level == "municipality"
                               and pd.isnull(row.parent_id))
        if not orphan_municipality:
            return row
        row.parent_id = lookup_table[row.code[:2]]
        return row
Ejemplo n.º 10
0
    # Make the codes hierarchical: each level's code is prefixed with its
    # parent's (already prefixed) code.
    df.municipality_code = df.state_code + df.municipality_code
    df.locality_code = df.municipality_code + df.locality_code

    # Normalise the names to title case.
    df.state_name = df.state_name.str.title()
    df.municipality_name = df.municipality_name.str.title()
    df.locality_name = df.locality_name.str.title()

    # Rename the name columns to the name_en_<level> names that are passed as
    # level_fields below.
    df = df.rename(
        columns={
            "state_name": "name_en_state",
            "municipality_name": "name_en_municipality",
            "locality_name": "name_en_locality",
        })

    h = Hierarchy(["state", "municipality", "locality"])

    # One extra field per level: its English name column.
    parent_code_table = repeated_table_to_parent_id_table(
        df,
        h,
        level_fields={
            "state": ["name_en_state"],
            "municipality": ["name_en_municipality"],
            "locality": ["name_en_locality"],
        },
    )

    # TODO: This isn't the official classification level name but this makes
    # compatibility between colombia and mexico way easier
    parent_code_table.loc[parent_code_table.level == "state",
                          "level"] = "department"
Ejemplo n.º 11
0
                  "latitude", "longitude", "altitude",
                  "map_code", "ambito",
                  "population_total", "population_male", "population_female",
                  "dwellings_occupied"]

    # Keep only the code/name pairs of the three levels. The explicit copy
    # makes the column assignments below operate on an independent frame
    # rather than on a column subset of the original (SettingWithCopyWarning).
    df = df[["state_code", "state_name", "municipality_code",
             "municipality_name", "locality_code",
             "locality_name"]].copy()

    # Left-pad the codes with zeros to fixed widths of 2, 3 and 4 characters.
    df.state_code = df.state_code.astype(str).str.zfill(2)
    df.municipality_code = df.municipality_code.astype(str).str.zfill(3)
    df.locality_code = df.locality_code.astype(str).str.zfill(4)

    # Make the codes hierarchical: each level's code is prefixed with its
    # parent's (already prefixed) code.
    df.municipality_code = df.state_code + df.municipality_code
    df.locality_code = df.municipality_code + df.locality_code

    # Normalise the names to title case.
    df.state_name = df.state_name.str.title()
    df.municipality_name = df.municipality_name.str.title()
    df.locality_name = df.locality_name.str.title()

    h = Hierarchy(["state", "municipality", "locality"])

    parent_code_table = repeated_table_to_parent_id_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_mexico_inegi.csv")
    c.to_stata("out/locations_mexico_inegi.dta")
Ejemplo n.º 12
0
        selected_rows = df.name_english.str.contains(regex)
        df["tag_en_" + name] = False
        df.loc[selected_rows, "tag_en_" + name] = True
        df.name_english = df.name_english.map(lambda x: re.sub(regex, "", x))

    # For every tag regex: flag the rows whose Spanish name matches it in a
    # boolean tag_sp_<name> column, then strip the matched text from the name.
    for name, regex in regexes.items():
        selected_rows = df.name_spanish.str.contains(regex)
        df["tag_sp_" + name] = False
        df.loc[selected_rows, "tag_sp_" + name] = True
        df.name_spanish = df.name_spanish.map(lambda x: re.sub(regex, "", x))

    # Replace trailing comma and space
    # ", $" is a regular expression (the "$" anchors the end of the string).
    # Current pandas treats the pattern as a literal unless regex=True is
    # given, which would turn these two lines into silent no-ops.
    df.name_spanish = df.name_spanish.str.replace(", $", "", regex=True)
    df.name_english = df.name_english.str.replace(", $", "", regex=True)

    h = Hierarchy(
        ["twodigit", "threedigit", "fourdigit", "fivedigit", "sixdigit"])

    # The level is determined by the number of characters in the code.
    df.loc[df.code.str.len() == 2, "level"] = "twodigit"
    df.loc[df.code.str.len() == 3, "level"] = "threedigit"
    df.loc[df.code.str.len() == 4, "level"] = "fourdigit"
    df.loc[df.code.str.len() == 5, "level"] = "fivedigit"
    df.loc[df.code.str.len() == 6, "level"] = "sixdigit"

    # Set the Spanish names aside in their own table before df is narrowed to
    # the English names.
    spanish = df[["code", "level", "name_spanish"]]
    spanish.columns = ["code", "level", "name_es"]

    # make sure this is the hand-fixed version
    assert df.loc[304, "code"] == '31'

    df = df[["code", "name_english", "level"]]
    df.columns = ["code", "name", "level"]
Ejemplo n.º 13
0
            row["level"] = "country"
            row["parent_code"] = pd.np.nan
        elif row.code.endswith("0000"):
            row["level"] = "department"
            row["parent_code"] = "000000"
        elif row.code.endswith("00"):
            row["level"] = "province"
            row["parent_code"] = row["code"][:2] + "0000"
        else:
            row["level"] = "district"
            row["parent_code"] = row["code"][:4] + "00"
        return row

    df = df.apply(fix_levels, axis=1)

    h = Hierarchy(["country", "department", "province", "district"])

    # Temporarily turn `level` into an ordered categorical so that the sort
    # below follows the hierarchy order instead of alphabetical order.
    # astype("category", categories=..., ordered=...) is no longer accepted by
    # pandas; the supported form is an explicit CategoricalDtype. As before,
    # this relies on `h` being iterable over its level names.
    level_dtype = pd.api.types.CategoricalDtype(categories=list(h),
                                                ordered=True)
    df.level = df.level.astype(level_dtype)

    df = df.sort_values(by=["level", "code"])

    # Back to plain strings once the rows are in order.
    df.level = df.level.astype(str)
    df = df.reset_index(drop=True)
    parent_id_table = parent_code_table_to_parent_id_table(df, h)

    # TODO: This isn't the official classification level name but this makes
    # compatibility between colombia and mexico way easier
    # parent_code_table.loc[parent_code_table.level == "state", "level"] = "department"

    # Drop the "locality" level since we don't use it
    # parent_code_table = parent_code_table[parent_code_table.level != "locality"]
Ejemplo n.º 14
0
import pandas as pd

from classification import (
    Hierarchy,
    repeated_table_to_parent_id_table,
    parent_code_table_to_parent_id_table,
    Classification,
)

if __name__ == "__main__":

    df = pd.read_table("in/Col_occupations_SOC_2010 - Hierarchy.tsv",
                       encoding="utf-8")

    h = Hierarchy([
        "major_group", "minor_group", "broad_occupation", "detailed_occupation"
    ])
    fields = {
        "major_group": ["name_en_major_group", "name_es_major_group"],
        "minor_group": [
            "name_en_minor_group",
            "name_es_minor_group",
            "name_short_es_minor_group",
        ],
        "broad_occupation":
        ["name_en_broad_occupation", "name_es_broad_occupation"],
        "detailed_occupation": [
            "name_en_detailed_occupation",
            "name_es_detailed_occupation",
            "name_short_es_detailed_occupation",
        ],
Ejemplo n.º 15
0
        return x

    trans = trans.apply(fill_code, axis=1)

    # Prospedia specific
    # The "section" rows of the translation table are dropped.
    trans = trans[trans.level != "section"]
    df = pd.read_table("./in/prospedia_hs_structure.txt")
    df.columns = ["4digit_code", "2digit_code", "prospedia_section_code"]
    # Codes become zero-padded strings; the name columns start out empty and
    # are filled from `trans` after the merge below.
    df["4digit_code"] = df["4digit_code"].astype(str).str.zfill(4)
    df["4digit_name"] = None
    df["2digit_code"] = df["2digit_code"].astype(str).str.zfill(2)
    df["2digit_name"] = None
    df["prospedia_section_name"] = None
    df["prospedia_section_code"] = df["prospedia_section_code"].astype(str).str.zfill(1)

    h = Hierarchy(["prospedia_section", "2digit", "4digit"])

    parent_code_table = repeated_table_to_parent_id_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    # Default (inner) merge: only (level, code) pairs present in both tables
    # are kept.
    parent_id_table = parent_id_table.merge(trans, on=["level", "code"])
    # Bracket assignment always writes the column. Attribute assignment would
    # silently set a plain Python attribute if the column were ever missing.
    parent_id_table["name"] = parent_id_table.name_en

    # Exactly three entries are expected to lack an English name; they get a
    # placeholder so that no name is left null.
    assert parent_id_table.name.isnull().sum() == 3
    parent_id_table.loc[parent_id_table.name.isnull(), "name"] = u"No name"
    assert parent_id_table.name.isnull().sum() == 0

    c = Classification(parent_id_table, h)

    c.to_csv("out/products_mexico_prospedia.csv")
    c.to_stata("out/products_mexico_prospedia.dta")
Ejemplo n.º 16
0
        "./in/col_industry_name_category_master - Hierarchy.tsv",
        encoding="utf-8")

    # Normalise the hierarchy codes: cast to int, then to zero-padded strings
    # of width 4 (class), 2 (division) and 1 (section).
    df.class_code = df.class_code.astype(int).astype(str).str.zfill(4)
    df.division_code = df.division_code.astype(int).astype(str).str.zfill(2)
    df.section_code = df.section_code.astype(int).astype(str).str.zfill(1)

    names = pd.read_table("./in/col_industry_name_category_master - Names.tsv",
                          encoding="utf-8")
    # Give the codes in the names table the same string format per level as
    # the hierarchy codes above.
    names.loc[names.level == "section", "code"] = names.code.astype(str)
    names.loc[names.level == "division",
              "code"] = names.code.astype(str).str.zfill(2)
    names.loc[names.level == "class",
              "code"] = names.code.astype(str).str.zfill(4)

    h = Hierarchy(["section", "division", "class"])
    # No extra per-level fields are taken from the hierarchy table; the result
    # is sorted by level and code and re-indexed from zero.
    parent_code_table = (repeated_table_to_parent_id_table(
        df, h, level_fields={
            "section": [],
            "division": [],
            "class": []
        }).sort_values(by=["level", "code"]).reset_index(drop=True))

    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)
    # NOTE(review): no `on=` is given, so pandas joins on every column the two
    # frames have in common -- presumably "code" and "level"; confirm that no
    # other column is shared.
    parent_id_table = parent_id_table.merge(names)

    # The English name doubles as the default display name.
    parent_id_table["name"] = parent_id_table.name_en

    c = Classification(parent_id_table, h)