예제 #1
0
        "2digit_code",
        "section_code",
    ]

    # Load the services hierarchy file; "code" is forced to str so pandas does
    # not infer a numeric dtype for it.
    services = pd.read_csv(
        "./in/Services_Hierarchy.csv", encoding="utf-8", dtype={"code": str}
    )

    # Drop the 5-digit level.
    names = names[names.level != "5digit"]
    # Discard the first column of `hierarchy` (presumably the 5-digit code
    # column -- confirm against the code that builds `hierarchy`), then
    # collapse the rows that became duplicates as a result.
    hierarchy = hierarchy.iloc[:, 1:].drop_duplicates()

    # Every level is given an empty list of extra fields.
    fields = {"section": [], "2digit": [], "3digit": [], "4digit": []}

    h = Hierarchy(["section", "2digit", "3digit", "4digit"])
    parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
    # Cast codes to str before merging with `names` on (code, level).
    parent_code_table.code = parent_code_table.code.astype(str)
    # DataFrame.merge defaults to an inner join, so rows without a matching
    # (code, level) pair in `names` are dropped here.
    parent_code_table = parent_code_table.merge(names, on=["code", "level"])

    # Sort by level order (not necessarily alphabetical)
    parent_code_table = sort_by_code_and_level(parent_code_table, h)

    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)
    # Use the English name as the generic "name" column.
    parent_id_table["name"] = parent_id_table.name_en

    parent_id_table = parent_id_table[
        [
            "code",
            "name",
            "level",
            "name_en",
예제 #2
0
    # Column labels for the raw DIVIPOLA export, in file order. The ninth
    # label is an empty string, exactly as in the original assignment.
    raw_columns = [
        "department_code", "municipality_code", "population_center_code",
        "department_name", "municipality_name", "population_center_name",
        "population_center_type", "longitude", "", "latitude",
        "district", "municipality_type", "metro_area",
    ]
    df = pd.read_table("in/DIVIPOLA_20150331.txt", encoding="utf-16")
    df.columns = raw_columns

    # Keep only the code/name pair of each level, ordered level by level.
    kept_columns = [
        "department_code", "department_name",
        "municipality_code", "municipality_name",
        "population_center_code", "population_center_name",
    ]
    df = df[kept_columns]

    # Turn each code column into a zero-padded string of fixed width.
    code_widths = (
        ("department_code", 2),
        ("municipality_code", 5),
        ("population_center_code", 8),
    )
    for column, width in code_widths:
        df[column] = df[column].astype(str).str.zfill(width)

    # Title-case every name column.
    for column in ("department_name", "municipality_name",
                   "population_center_name"):
        df[column] = df[column].str.title()

    h = Hierarchy(["department", "municipality", "population_center"])

    parent_code_table = repeated_table_to_parent_id_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)

    c = Classification(parent_id_table, h)

    # Write the classification out in both CSV and Stata formats.
    c.to_csv("out/locations_colombia_dane.csv")
    c.to_stata("out/locations_colombia_dane.dta")
예제 #3
0
    # Title-case the locality names.
    df.locality_name = df.locality_name.str.title()

    # Rename the name columns to the "name_en_<level>" labels that are listed
    # per level in `level_fields` below.
    df = df.rename(
        columns={
            "state_name": "name_en_state",
            "municipality_name": "name_en_municipality",
            "locality_name": "name_en_locality",
        })

    h = Hierarchy(["state", "municipality", "locality"])

    # Each level contributes exactly one extra field: its English name.
    parent_code_table = repeated_table_to_parent_id_table(
        df,
        h,
        level_fields={
            "state": ["name_en_state"],
            "municipality": ["name_en_municipality"],
            "locality": ["name_en_locality"],
        },
    )

    # TODO: This isn't the official classification level name but this makes
    # compatibility between colombia and mexico way easier
    # Relabel every row whose level is "state" as "department".
    parent_code_table.loc[parent_code_table.level == "state",
                          "level"] = "department"

    # Drop the "locality" level since we don't use it
    parent_code_table = parent_code_table[
        parent_code_table.level != "locality"]

    # This adds a highest level element that represents the whole country
예제 #4
0
        fix_spanish_title_case, na_action="ignore")

    h = Hierarchy(["department", "municipality", "population_center"])

    # Rename the name columns to the "name_<level>" labels that are listed per
    # level in `level_fields` below.
    df = df.rename(
        columns={
            "department_name": "name_department",
            "municipality_name": "name_municipality",
            "population_center_name": "name_population_center",
        })

    # Each level contributes exactly one extra field: its name.
    parent_code_table = repeated_table_to_parent_id_table(
        df,
        h,
        level_fields={
            "department": ["name_department"],
            "municipality": ["name_municipality"],
            "population_center": ["name_population_center"],
        },
    )
    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)

    # Reorder columns to keep diff clean
    # `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is the
    # label-based equivalent for selecting these columns by name.
    parent_id_table = parent_id_table.loc[:, [
        "code", "name", "level", "parent_id"
    ]]

    c = Classification(parent_id_table, h)

    c.to_csv("out/locations_colombia_dane.csv")
예제 #5
0
            "name_short_es_minor_group",
        ],
        "broad_occupation":
        ["name_en_broad_occupation", "name_es_broad_occupation"],
        "detailed_occupation": [
            "name_en_detailed_occupation",
            "name_es_detailed_occupation",
            "name_short_es_detailed_occupation",
        ],
    }

    # TODO: no short names for these
    # Placeholder columns holding empty strings for the two levels that have
    # no Spanish short name.
    df["name_short_es_broad_occupation"] = ""
    df["name_short_es_major_group"] = ""

    df = repeated_table_to_parent_id_table(df, h, fields)

    # The English short name is a plain copy of the English name.
    df["name_short_en"] = df["name_en"]
    # Fall back to the full Spanish name where the short one is missing.
    # NOTE(review): fillna only replaces missing values, not "" -- confirm
    # whether the empty-string placeholders above reach this column as "".
    df["name_short_es"] = df["name_short_es"].fillna(df.name_es)

    # Sanity checks on the trailing zeros of the codes at each level.
    # NOTE(review): the first check only requires that not *every* detailed
    # occupation code ends in "0"; if the intent is that *none* do, it should
    # use .any() instead of .all() -- confirm.
    assert not df[df.level == "detailed_occupation"].code.str.endswith(
        "0").all()
    assert df[df.level == "broad_occupation"].code.str.endswith("0").all()
    assert df[df.level == "major_group"].code.str.endswith("000").all()
    assert df[df.level == "minor_group"].code.str.endswith("00").all()

    # Use the English name as the generic "name" column.
    df["name"] = df.name_en
    df = parent_code_table_to_parent_id_table(df, h)

    c = Classification(df, h)
예제 #6
0
    # Normalise each code column: cast to int, then to a zero-padded string of
    # the width used at that level.
    for column, width in (("class_code", 4), ("division_code", 2),
                          ("section_code", 1)):
        df[column] = df[column].astype(int).astype(str).str.zfill(width)

    names = pd.read_table(
        "./in/col_industry_name_category_master - Names.tsv",
        encoding="utf-8",
    )
    # Give the codes in the names table the same string form, level by level.
    # Section codes are only cast to str; the other two are also zero-padded.
    names.loc[names.level == "section", "code"] = names.code.astype(str)
    for level_name, width in (("division", 2), ("class", 4)):
        names.loc[names.level == level_name,
                  "code"] = names.code.astype(str).str.zfill(width)

    h = Hierarchy(["section", "division", "class"])

    # No extra per-level fields are requested; names are merged in afterwards.
    no_extra_fields = {"section": [], "division": [], "class": []}
    parent_code_table = repeated_table_to_parent_id_table(
        df, h, level_fields=no_extra_fields)
    parent_code_table = parent_code_table.sort_values(by=["level", "code"])
    parent_code_table = parent_code_table.reset_index(drop=True)

    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)
    # With no explicit keys, merge joins on the columns both frames share.
    parent_id_table = parent_id_table.merge(names)

    # Use the English name as the generic "name" column.
    parent_id_table["name"] = parent_id_table["name_en"]

    c = Classification(parent_id_table, h)

    # Write the classification out in both CSV and Stata formats.
    c.to_csv("out/industries_colombia_isic_prosperia.csv")
    c.to_stata("out/industries_colombia_isic_prosperia.dta")