Esempio n. 1
0
    ]]

    # Append the rows held in `missing`, then keep only the rows that carry a
    # continent code and renumber the index from 0.
    df = pd.concat([df, missing])
    df = df[~df.continent_code.isnull()].reset_index(drop=True)

    # Every row that survives the filter above is labelled as a country.
    df["level"] = "country"

    # Rename to the code / name_es / parent_code column names; the continent
    # code becomes the parent code of each country row.
    df = df.rename(
        columns={
            "code_col": "code",
            "name_es_col": "name_es",
            "continent_code": "parent_code",
        })

    # Guard for the hand-patch below: row 6 must currently have no name.
    # pd.isnull() replaces the former `is pd.np.nan` identity test -- the
    # `pandas.np` alias no longer exists in current pandas, and an identity
    # comparison against NaN is brittle anyway.
    assert pd.isnull(df.loc[6, "name"])
    df.loc[6, "name"] = u"Netherlands Antilles"

    # Append the rows of the regions table to the country rows.
    # NOTE(review): presumably these are the "region" level rows that the
    # countries' parent_code values point at -- confirm against the TSV.
    regions = pd.read_table(
        "../Mexico/in/Mexico Country codes - continents - Continents - Regions.tsv",
        encoding="utf-8",
    )
    df = pd.concat([df, regions]).reset_index(drop=True)

    # Two-level hierarchy: region -> country.
    h = Hierarchy(["region", "country"])
    parent_id_table = parent_code_table_to_parent_id_table(df, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_international_dane.csv")
    # c.to_stata("out/locations_international_dane.dta")
Esempio n. 2
0
    # A 4-digit code's parent is its first two characters.
    four_digit["parent_code"] = four_digit.code.apply(lambda x: x[:2])
    four_digit = four_digit.drop("community", axis=1)
    four_digit["level"] = "4digit"

    # Rows 1241..1338 (positional) are used as the 2-digit level.
    # Take an explicit copy: assigning columns to a bare .iloc slice of hs4
    # triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether
    # the slice or the parent frame is being modified.
    two_digit = hs4.iloc[1241:1339].copy()
    two_digit["code"] = two_digit.code.astype(str).str.zfill(2)
    # The "community" column is reused as the parent code; it is zero-padded
    # to 3 characters, the same padding applied to the section codes below.
    two_digit = two_digit.rename(columns={"community": "parent_code"})
    two_digit["parent_code"] = two_digit.parent_code.astype(str).str.zfill(3)
    two_digit["level"] = "2digit"

    # Everything from positional row 1339 onward is used as the section
    # level; sections are the top of the hierarchy and have no parent.
    section = hs4.iloc[1339:].drop("community", axis=1)
    section["code"] = section.code.astype(str).str.zfill(3)
    section["parent_code"] = None
    section["level"] = "section"

    # Stack the levels from the top of the hierarchy down.
    hs_clean = pd.concat([section, two_digit, four_digit])
    hs_clean = hs_clean.reset_index(drop=True)

    h = Hierarchy(["section", "2digit", "4digit"])
    hs_clean = parent_code_table_to_parent_id_table(hs_clean, h)
    c = Classification(hs_clean, h)

    # community = pd.read_table("in/hs4_community.tsv", encoding="utf-8")
    # hs4 = hs4.merge(community, left_on="community", right_on="code", how="inner")

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv("out/hs92_atlas.csv")
    c.to_stata("out/hs92_atlas.dta")
Esempio n. 3
0
    # Strip a trailing ", " from both name columns.
    # regex=True is passed explicitly: the pattern relies on the `$` anchor,
    # and recent pandas versions treat the pattern as a literal string by
    # default, which would silently leave the trailing comma in place.
    df["name_spanish"] = df["name_spanish"].str.replace(", $", "", regex=True)
    df["name_english"] = df["name_english"].str.replace(", $", "", regex=True)

    h = Hierarchy(["twodigit", "threedigit", "fourdigit", "fivedigit", "sixdigit"])

    # The level of a row is determined purely by the length of its code.
    code_length = df.code.str.len()
    for n_digits, level_name in [
        (2, "twodigit"),
        (3, "threedigit"),
        (4, "fourdigit"),
        (5, "fivedigit"),
        (6, "sixdigit"),
    ]:
        df.loc[code_length == n_digits, "level"] = level_name

    # Keep the Spanish names aside, keyed by (code, level), so they can be
    # merged back in once the parent ids have been computed.
    spanish = df[["code", "level", "name_spanish"]].rename(
        columns={"name_spanish": "name_es"}
    )

    # make sure this is the hand-fixed version
    assert df.loc[304, "code"] == "31"

    # Only code, English name (as "name") and level go into the hierarchy
    # helpers below.
    df = df[["code", "name_english", "level"]].rename(
        columns={"name_english": "name"}
    )

    parent_code_table = ordered_table_to_parent_code_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    # Re-attach the Spanish names set aside above.
    parent_id_table = parent_id_table.merge(spanish, on=["level", "code"])

    c = Classification(parent_id_table, h)

    c.to_csv("out/industries_mexico_scian_2007.csv")
    c.to_stata("out/industries_mexico_scian_2007.dta")
Esempio n. 4
0
    # Remove every name row that belongs to the 5-digit level.
    names = names.loc[names["level"] != "5digit"]
    # Discard the first column of the hierarchy table and de-duplicate what
    # is left.
    hierarchy = hierarchy.iloc[:, 1:].drop_duplicates()

    # One (initially empty) list per level, handed to the helper below.
    level_names = ("section", "2digit", "3digit", "4digit")
    fields = {level: [] for level in level_names}

    h = Hierarchy(list(level_names))
    parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
    # Codes are cast to str before the merge with `names` on (code, level).
    parent_code_table["code"] = parent_code_table["code"].astype(str)
    parent_code_table = parent_code_table.merge(names, on=["code", "level"])

    # Order rows by hierarchy level first (not necessarily alphabetical).
    parent_code_table = sort_by_code_and_level(parent_code_table, h)

    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)
    # The generic "name" column mirrors the English name.
    parent_id_table["name"] = parent_id_table["name_en"]

    # Final column selection and order.
    output_columns = [
        "code",
        "name",
        "level",
        "name_en",
        "name_es",
        "name_short_en",
        "name_short_es",
        "parent_id",
    ]
    parent_id_table = parent_id_table[output_columns]

if __name__ == "__main__":
    # Script entry point: read a UTF-16 table from the path given as the first
    # argument, build a classification from it, and write
    # <output_prefix>.csv and <output_prefix>.dta.
    from classification import (parent_code_table_to_parent_id_table,
                                Classification, Hierarchy,
                                ordered_table_to_parent_code_table)

    # Validate the command line explicitly. A bare `assert` is stripped when
    # Python runs with -O and gives the user no hint about the expected
    # arguments.
    if len(sys.argv) != 3:
        sys.exit("usage: {} <input_file> <output_prefix>".format(sys.argv[0]))

    file_name = sys.argv[1]
    new_file_prefix = sys.argv[2]

    df = pd.read_table(file_name, encoding="utf-16")
    df = parse_dane(df)
    # Keep only the first row for each code, then renumber the index from 0.
    df = df.drop_duplicates(subset=["code"]).reset_index(drop=True)
    # NOTE(review): assumes parse_dane returns exactly three columns in
    # (name, level, code) order -- confirm against parse_dane.
    df.columns = ["name", "level", "code"]

    # Title-case the names.
    df["name"] = df["name"].str.title()

    h = Hierarchy(DANE_HIERARCHY)
    df = ordered_table_to_parent_code_table(df, h)
    df = parent_code_table_to_parent_id_table(df, h)
    c = Classification(df, h)

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv(new_file_prefix + ".csv")
    c.to_stata(new_file_prefix + ".dta")
Esempio n. 6
0
    # The parent of a 4-digit code is given by its first two characters.
    four_digit["parent_code"] = four_digit.code.apply(lambda x: x[:2])
    four_digit = four_digit.drop("community", axis=1)
    four_digit["level"] = "4digit"

    # Positional rows 1241..1338 are used as the 2-digit level.
    # .copy() makes this an independent frame: assigning columns to a bare
    # .iloc slice of hs4 raises pandas' SettingWithCopyWarning and makes it
    # unclear whether hs4 itself is affected.
    two_digit = hs4.iloc[1241:1339].copy()
    two_digit["code"] = two_digit.code.astype(str).str.zfill(2)
    # "community" doubles as the parent code here; it is padded to 3
    # characters, matching the padding of the section codes below.
    two_digit = two_digit.rename(columns={"community": "parent_code"})
    two_digit["parent_code"] = two_digit.parent_code.astype(str).str.zfill(3)
    two_digit["level"] = "2digit"

    # Positional rows from 1339 onward are used as the section level, which
    # sits at the top of the hierarchy and therefore has no parent.
    section = hs4.iloc[1339:].drop("community", axis=1)
    section["code"] = section.code.astype(str).str.zfill(3)
    section["parent_code"] = None
    section["level"] = "section"

    # Concatenate the levels from the top of the hierarchy down.
    hs_clean = pd.concat([section, two_digit, four_digit])
    hs_clean = hs_clean.reset_index(drop=True)

    h = Hierarchy(["section", "2digit", "4digit"])
    hs_clean = parent_code_table_to_parent_id_table(hs_clean, h)
    c = Classification(hs_clean, h)

    #community = pd.read_table("in/hs4_community.tsv", encoding="utf-8")
    #hs4 = hs4.merge(community, left_on="community", right_on="code", how="inner")

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv("out/hs92_atlas.csv")
    c.to_stata("out/hs92_atlas.dta")