コード例 #1
0
ファイル: clean.py プロジェクト: alfredoagg/classifications
    # Strip a trailing ", " from both name columns. The pattern is a regular
    # expression ("$" anchors it to the end of the string), so regex=True is
    # passed explicitly: pandas >= 2.0 treats the pattern as a literal string
    # by default, which would silently leave the names untouched.
    for name_column in ("name_spanish", "name_english"):
        df[name_column] = df[name_column].str.replace(r", $", "", regex=True)

    # Level names ordered from the shortest codes to the longest, paired with
    # the code length that identifies each level.
    levels_by_code_length = (
        (2, "twodigit"),
        (3, "threedigit"),
        (4, "fourdigit"),
        (5, "fivedigit"),
        (6, "sixdigit"),
    )
    h = Hierarchy([level for _, level in levels_by_code_length])

    # Tag every row with the level implied by the length of its code; rows
    # whose code has any other length are left as they are.
    code_length = df["code"].str.len()
    for n_chars, level in levels_by_code_length:
        df.loc[code_length == n_chars, "level"] = level

    # Keep the Spanish names aside (as "name_es") so they can be merged back
    # into the final table below.
    spanish = df[["code", "level", "name_spanish"]].rename(
        columns={"name_spanish": "name_es"})

    # make sure this is the hand-fixed version
    assert df.loc[304, "code"] == "31", (
        "row 304 does not have code '31': "
        "input does not look like the hand-fixed version")

    # The table handed to the classification helpers uses the English name
    # as its "name" column.
    df = df[["code", "name_english", "level"]].rename(
        columns={"name_english": "name"})

    parent_code_table = ordered_table_to_parent_code_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    # Re-attach the Spanish names; (level, code) is the join key.
    parent_id_table = parent_id_table.merge(spanish, on=["level", "code"])

    c = Classification(parent_id_table, h)

    c.to_csv("out/industries_mexico_scian_2007.csv")
    c.to_stata("out/industries_mexico_scian_2007.dta")
コード例 #2
0
ファイル: clean.py プロジェクト: alfredoagg/classifications
                            Classification)

if __name__ == "__main__":
    # Build the SINCO 2011 classification from the raw one-column text dump
    # in "in/SINCO_2011.csv" and export it as CSV and Stata files.

    sinco = pd.read_csv("in/SINCO_2011.csv", header=None, encoding="latin-1")
    sinco.columns = ["data"]

    # Drop the lines starting with "INEGI." or "Clave Descripción". The
    # result is copied so that the in-place edits below operate on an
    # independent frame rather than on a slice of the original.
    sinco = sinco[~sinco.data.str.startswith("INEGI.")]
    sinco = sinco[~sinco.data.str.startswith(u"Clave Descripción")].copy()

    # An entry line starts with digits followed by a space; every other line
    # is treated as the continuation of the line just before it.
    entry_pattern = r"^\d* "
    continuation_labels = sinco.index[~sinco.data.str.match(entry_pattern)]

    # Fold continuation lines into their predecessor, walking bottom-up so
    # that an entry wrapped over several lines is accumulated correctly.
    # `.loc` replaces `.ix`, which was removed in pandas 1.0; lookups are by
    # index label, so the line labelled `index - 1` must still be present.
    for index in continuation_labels[::-1]:
        sinco.loc[index - 1, "data"] += " " + sinco.loc[index, "data"]

    # Only the (now complete) entry lines are kept.
    sinco = sinco[sinco.data.str.match(entry_pattern)]

    # Split each line at the first space into the code and its name.
    sinco = sinco.data.str.split(" ", n=1, expand=True)
    sinco.columns = ["code", "name"]

    # The level name is the code length followed by "digit", e.g. "3digit".
    sinco["level"] = sinco["code"].str.len().astype(str) + "digit"
    h = Hierarchy(["1digit", "2digit", "3digit", "4digit"])

    parent_code_table = ordered_table_to_parent_code_table(sinco, h)
    parent_id_table = parent_code_table_to_parent_id_table(
        parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/occupations_sinco_2011.csv")
    c.to_stata("out/occupations_sinco_2011.dta")
コード例 #3
0

if __name__ == "__main__":
    # Command-line entry point: parse a DANE table and export it as a
    # classification in CSV and Stata formats.
    #
    # Usage: <script> <input_file> <output_prefix>

    # Validate the arguments explicitly: an `assert` would be stripped when
    # Python runs with -O and gives no hint about the expected usage.
    if len(sys.argv) != 3:
        sys.exit("usage: %s <input_file> <output_prefix>" % sys.argv[0])

    file_name = sys.argv[1]
    new_file_prefix = sys.argv[2]

    df = pd.read_table(file_name, encoding="utf-16")
    df = parse_dane(df)

    # Keep only the first row for each code, then renumber the rows.
    df = df.drop_duplicates(subset=["code"]).reset_index(drop=True)
    df.columns = ["name", "level", "code"]

    # Title-case the names. Item access is used instead of `df.name` so the
    # column can never be confused with a DataFrame attribute.
    df["name"] = df["name"].str.title()

    from classification import (parent_code_table_to_parent_id_table,
                                Classification, Hierarchy,
                                ordered_table_to_parent_code_table)

    h = Hierarchy(DANE_HIERARCHY)
    df = ordered_table_to_parent_code_table(df, h)
    df = parent_code_table_to_parent_id_table(df, h)
    c = Classification(df, h)

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv(new_file_prefix + ".csv")
    c.to_stata(new_file_prefix + ".dta")
コード例 #4
0
ファイル: clean.py プロジェクト: alfredoagg/classifications
from classification import (Hierarchy, ordered_table_to_parent_code_table,
                            parent_code_table_to_parent_id_table,
                            Classification)

if __name__ == "__main__":
    # Build the SINCO 2011 classification from the raw one-column text dump
    # in "in/SINCO_2011.csv" and export it as CSV and Stata files.

    sinco = pd.read_csv("in/SINCO_2011.csv", header=None, encoding="latin-1")
    sinco.columns = ["data"]

    # Drop the lines starting with "INEGI." or "Clave Descripción". The
    # result is copied so that the in-place edits below operate on an
    # independent frame rather than on a slice of the original.
    sinco = sinco[~sinco.data.str.startswith("INEGI.")]
    sinco = sinco[~sinco.data.str.startswith(u"Clave Descripción")].copy()

    # An entry line starts with digits followed by a space; every other line
    # is treated as the continuation of the line just before it.
    entry_pattern = r"^\d* "
    continuation_labels = sinco.index[~sinco.data.str.match(entry_pattern)]

    # Fold continuation lines into their predecessor, walking bottom-up so
    # that an entry wrapped over several lines is accumulated correctly.
    # `.loc` replaces `.ix`, which was removed in pandas 1.0; lookups are by
    # index label, so the line labelled `index - 1` must still be present.
    for index in continuation_labels[::-1]:
        sinco.loc[index - 1, "data"] += " " + sinco.loc[index, "data"]

    # Only the (now complete) entry lines are kept.
    sinco = sinco[sinco.data.str.match(entry_pattern)]

    # Split each line at the first space into the code and its name.
    sinco = sinco.data.str.split(" ", n=1, expand=True)
    sinco.columns = ["code", "name"]

    # The level name is the code length followed by "digit", e.g. "3digit".
    sinco["level"] = sinco["code"].str.len().astype(str) + "digit"
    h = Hierarchy(["1digit", "2digit", "3digit", "4digit"])

    parent_code_table = ordered_table_to_parent_code_table(sinco, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/occupations_sinco_2011.csv")
    c.to_stata("out/occupations_sinco_2011.dta")