# Drop every trip whose origin or destination commune is the literal code
# "aaaaa" (presumably a placeholder for an unknown commune - TODO confirm).
# The commented-out lines are earlier attempts at the same filter.
#df_trips.drop([df_trips["origin_commune_id"]=="aaaaa"], inplace=True)
#df_trips.drop([df_trips["destination_commune_id"]=="aaaaa"], inplace=True)
#df_trips = df_trips[~df_trips["origin_commune_id"].isin(["aaaaa"]) | ~df_trips["destination_commune_id"].isin(["aaaaa"])]
df_trips = df_trips[~df_trips["origin_commune_id"].isin(["aaaaa"])]
df_trips = df_trips[~df_trips["destination_commune_id"].isin(["aaaaa"])]

# Clean employment
df_persons["employed"] = df_persons["P9"].isin([1, 2, 3]  )  # P9 codes: 1 = full time, 2 = part time, 3 = apprenticeship,
# internship, training

# Studies
df_persons["studies"] = df_persons["P9"].isin([4, 5]  )  # P9 codes: 4 = student, 5 = pupil up to the baccalaureat

# Calculate consumption units
# check_household_size is called first on both frames; the consumption units
# computed from the persons are then joined onto the households by household_id.
hts.check_household_size(df_households, df_persons)
df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id")

# Socioprofessional class
# Missing PCSD values are stored as class 0.
df_persons["socioprofessional_class"] = df_persons["PCSD"].fillna(00).astype(int)

# Number of vehicles
df_households["number_of_vehicles"] = df_households["M6"] + df_households["M14"  ]  # M6: private cars
# M14: number of motorised two/three-wheelers
# NOTE(review): the np.int alias was removed in NumPy 1.24; the builtin int is
# the drop-in replacement if this code has to run on a current NumPy.
df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(np.int)
df_households["number_of_bikes"] = df_households["M21"].astype(np.int)

# License
# A person counts as licensed only when P7 equals 1.
df_persons["has_license"] = df_persons["P7"] == 1

# Car availability
df_persons = pd.merge(
def execute(context):
    """Clean the raw ENTD tables into household, person and trip frames.

    Reads the six tables of the ``data.hts.entd.raw`` stage, assigns new
    consecutive ``household_id`` / ``person_id`` / ``trip_id`` values and
    derives the cleaned attributes (weights, age, sex, departement,
    employment, studies, vehicles, license, subscription, income class,
    trip purpose, mode, distance, times, durations). Trips are restricted to
    reference days, to weekdays and to the first day of each person.

    Args:
        context: Pipeline context; only ``context.stage`` is called here.

    Returns:
        tuple: ``(df_households, df_persons, df_trips)``.
    """
    (df_individu, df_tcm_individu, df_menage, df_tcm_menage, df_deploc,
     df_comm) = context.stage("data.hts.entd.raw")

    def to_departement_id(codes):
        # Non-numeric departement codes become -1.
        return pd.to_numeric(codes, errors="coerce").fillna(-1).astype(int)

    # Make copies
    df_persons = pd.DataFrame(df_tcm_individu, copy=True)
    df_households = pd.DataFrame(df_tcm_menage, copy=True)
    df_trips = pd.DataFrame(df_deploc, copy=True)

    # Get weights for persons that actually have trips
    df_persons = pd.merge(
        df_persons,
        df_trips[["IDENT_IND", "PONDKI"]].drop_duplicates("IDENT_IND"),
        on="IDENT_IND",
        how="left")
    df_persons["is_kish"] = ~df_persons["PONDKI"].isna()
    df_persons["trip_weight"] = df_persons["PONDKI"].fillna(0.0)

    # Important: If someone did not have any trips on the reference day, ENTD asked
    # for another day. With this flag we make sure that we only cover "reference days".
    f = df_trips["V2_MOBILREF"] == 1
    df_trips = df_trips[f]
    print("Filtering out %d non-reference day trips" % np.count_nonzero(~f))

    # Merge in additional information from ENTD
    df_households = pd.merge(
        df_households,
        df_menage[[
            "idENT_MEN", "V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO",
            "V1_JNBVELOADT"
        ]],
        on="idENT_MEN",
        how="left")

    df_persons = pd.merge(
        df_persons,
        df_individu[[
            "IDENT_IND", "V1_GPERMIS", "V1_GPERMIS2R", "V1_ICARTABON"
        ]],
        on="IDENT_IND",
        how="left")

    # Transform original IDs to integer (they are hierarchical).
    # The builtin int replaces the np.int alias removed in NumPy 1.24.
    df_persons["entd_person_id"] = df_persons["IDENT_IND"].astype(int)
    df_persons["entd_household_id"] = df_persons["IDENT_MEN"].astype(int)
    df_households["entd_household_id"] = df_households["idENT_MEN"].astype(int)
    df_trips["entd_person_id"] = df_trips["IDENT_IND"].astype(int)

    # Construct new IDs for households, persons and trips (which are unique globally)
    df_households["household_id"] = np.arange(len(df_households))

    df_persons = pd.merge(
        df_persons,
        df_households[["entd_household_id", "household_id"]],
        on="entd_household_id")
    df_persons["person_id"] = np.arange(len(df_persons))

    df_trips = pd.merge(
        df_trips,
        df_persons[["entd_person_id", "person_id", "household_id"]],
        on=["entd_person_id"])
    df_trips["trip_id"] = np.arange(len(df_trips))

    # Weight
    df_persons["person_weight"] = df_persons["PONDV1"].astype(float)
    df_households["household_weight"] = df_households["PONDV1"].astype(float)

    # Clean age
    df_persons.loc[:, "age"] = df_persons["AGE"]

    # Clean sex
    df_persons.loc[df_persons["SEXE"] == 1, "sex"] = "male"
    df_persons.loc[df_persons["SEXE"] == 2, "sex"] = "female"
    df_persons["sex"] = df_persons["sex"].astype("category")

    # Household size
    df_households["household_size"] = df_households["NPERS"]

    # Clean departement
    df_households["departement_id"] = to_departement_id(df_households["DEP"])
    df_persons["departement_id"] = to_departement_id(df_persons["DEP"])
    df_trips["origin_departement_id"] = to_departement_id(
        df_trips["V2_MORIDEP"])
    df_trips["destination_departement_id"] = to_departement_id(
        df_trips["V2_MDESDEP"])

    # Clean employment
    df_persons["employed"] = df_persons["SITUA"].isin([1, 2])

    # Studies
    # Many < 14 year old have NaN
    df_persons["studies"] = df_persons["ETUDES"].fillna(1) == 1

    # Number of vehicles: cars, motorcycles and mopeds; missing counts are 0.
    # V1_JNBAUTVEH and V1_JNBCCVUL are deliberately not added.
    df_households["number_of_vehicles"] = 0
    for count_column in ("V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO"):
        df_households["number_of_vehicles"] += df_households[
            count_column].fillna(0)
    df_households["number_of_vehicles"] = df_households[
        "number_of_vehicles"].astype(int)

    df_households["number_of_bikes"] = df_households["V1_JNBVELOADT"].fillna(
        0).astype(int)

    # License (car or motorised two-wheeler)
    df_persons["has_license"] = (df_persons["V1_GPERMIS"]
                                 == 1) | (df_persons["V1_GPERMIS2R"] == 1)

    # Has subscription
    df_persons["has_pt_subscription"] = df_persons["V1_ICARTABON"] == 1

    # Household income: the class is the position of the matching label
    # prefix in this table; households matching no prefix keep -1.
    income_prefixes = (
        "Moins de 400",
        "De 400",
        "De 600",
        "De 800",
        "De 1 000",
        "De 1 200",
        "De 1 500",
        "De 1 800",
        "De 2 000",
        "De 2 500",
        "De 3 000",
        "De 4 000",
        "De 6 000",
        "10 000",
    )
    df_households["income_class"] = -1
    income_labels = df_households["TrancheRevenuMensuel"]
    for income_class, prefix in enumerate(income_prefixes):
        # na=False keeps the mask boolean when an income label is missing.
        df_households.loc[income_labels.str.startswith(prefix, na=False),
                          "income_class"] = income_class
    df_households["income_class"] = df_households["income_class"].astype(int)

    # Trip purpose
    # NOTE(review): the other survey stages in this file name the column
    # "preceding_purpose"; the spelling here is kept to leave the output
    # schema unchanged - confirm which one downstream code expects.
    df_trips["following_purpose"] = "other"
    df_trips["preceeding_purpose"] = "other"

    # The string conversion does not depend on the prefix, so do it once.
    destination_motive = df_trips["V2_MMOTIFDES"].astype(str)
    origin_motive = df_trips["V2_MMOTIFORI"].astype(str)

    for prefix, activity_type in PURPOSE_MAP:
        df_trips.loc[destination_motive.str.startswith(prefix),
                     "following_purpose"] = activity_type
        df_trips.loc[origin_motive.str.startswith(prefix),
                     "preceeding_purpose"] = activity_type

    df_trips["following_purpose"] = df_trips["following_purpose"].astype(
        "category")
    df_trips["preceeding_purpose"] = df_trips["preceeding_purpose"].astype(
        "category")

    # Trip mode: anything not matched by MODES_MAP stays "pt"
    df_trips["mode"] = "pt"

    mode_codes = df_trips["V2_MTP"].astype(str)
    for prefix, mode in MODES_MAP:
        df_trips.loc[mode_codes.str.startswith(prefix), "mode"] = mode

    df_trips["mode"] = df_trips["mode"].astype("category")

    # Further trip attributes
    df_trips["routed_distance"] = df_trips["V2_MDISTTOT"] * 1000.0
    df_trips["routed_distance"] = df_trips["routed_distance"].fillna(
        0.0)  # This should be just one within Île-de-France

    # Only leave weekday trips
    f = df_trips["V2_TYPJOUR"] == 1
    print("Removing %d trips on weekends" % np.count_nonzero(~f))
    df_trips = df_trips[f]

    # Only leave one day per person (the smallest IDENT_JOUR)
    initial_count = len(df_trips)

    df_first_day = df_trips[[
        "person_id", "IDENT_JOUR"
    ]].sort_values(by=["person_id", "IDENT_JOUR"]).drop_duplicates("person_id")
    df_trips = pd.merge(df_trips,
                        df_first_day,
                        how="inner",
                        on=["person_id", "IDENT_JOUR"])

    final_count = len(df_trips)
    print("Removed %d trips for non-primary days" %
          (initial_count - final_count))

    # Trip flags
    df_trips = hts.compute_first_last(df_trips)

    # Trip times
    df_trips["departure_time"] = df_trips["V2_MORIHDEP"].apply(
        convert_time).astype(float)
    df_trips["arrival_time"] = df_trips["V2_MDESHARR"].apply(
        convert_time).astype(float)
    df_trips = hts.fix_trip_times(df_trips)

    # Durations
    df_trips["trip_duration"] = df_trips["arrival_time"] - df_trips[
        "departure_time"]
    hts.compute_activity_duration(df_trips)

    # Add weight to trips
    df_trips["trip_weight"] = df_trips["PONDKI"]

    # Chain length: -1 marks persons without trip information, 0 marks
    # persons flagged is_kish that have no remaining trips.
    df_persons = pd.merge(
        df_persons,
        df_trips[["person_id", "NDEP"]].drop_duplicates("person_id").rename(
            columns={"NDEP": "number_of_trips"}),
        on="person_id",
        how="left")
    df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(
        -1).astype(int)
    df_persons.loc[(df_persons["number_of_trips"] == -1)
                   & df_persons["is_kish"], "number_of_trips"] = 0

    # Passenger attribute
    df_persons["is_passenger"] = df_persons["person_id"].isin(
        df_trips[df_trips["mode"] == "car_passenger"]["person_id"].unique())

    # Calculate consumption units
    hts.check_household_size(df_households, df_persons)
    df_households = pd.merge(df_households,
                             hts.calculate_consumption_units(df_persons),
                             on="household_id")

    # Socioprofessional class: first digit of CS24, missing values become 8
    df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(
        80).astype(int) // 10

    return df_households, df_persons, df_trips
Exemple #3
0
def execute(context):
    """Clean the raw EGT tables into household, person and trip frames.

    Reads the three tables of the ``data.hts.egt.raw`` stage, assigns new
    consecutive ``household_id`` / ``person_id`` / ``trip_id`` values,
    derives the cleaned attributes and finally drops every person (with
    their trips and, where no person remains, their household) who has a
    trip with a missing departure or arrival time.

    Args:
        context: Pipeline context; only ``context.stage`` is called here.

    Returns:
        tuple: ``(df_households, df_persons, df_trips)``.
    """
    df_households, df_persons, df_trips = context.stage("data.hts.egt.raw")

    # Make copies
    df_households = pd.DataFrame(df_households, copy=True)
    df_persons = pd.DataFrame(df_persons, copy=True)
    df_trips = pd.DataFrame(df_trips, copy=True)

    # Transform original IDs to integer (they are hierarchical).
    # The builtin int replaces the np.int alias removed in NumPy 1.24.
    df_households["egt_household_id"] = df_households["NQUEST"].astype(int)
    df_persons["egt_person_id"] = df_persons["NP"].astype(int)
    df_persons["egt_household_id"] = df_persons["NQUEST"].astype(int)
    df_trips["egt_person_id"] = df_trips["NP"].astype(int)
    df_trips["egt_household_id"] = df_trips["NQUEST"].astype(int)
    df_trips["egt_trip_id"] = df_trips["ND"].astype(int)

    # Construct new IDs for households, persons and trips (which are unique globally)
    df_households["household_id"] = np.arange(len(df_households))

    df_persons = pd.merge(
        df_persons,
        df_households[["egt_household_id", "household_id"]],
        on="egt_household_id")
    df_persons["person_id"] = np.arange(len(df_persons))

    df_trips = pd.merge(
        df_trips,
        df_persons[[
            "egt_person_id", "egt_household_id", "person_id", "household_id"
        ]],
        on=["egt_person_id", "egt_household_id"])
    df_trips["trip_id"] = np.arange(len(df_trips))

    # Trip flags
    df_trips = hts.compute_first_last(df_trips)

    # Weight
    df_persons["person_weight"] = df_persons["POIDSP"].astype(float)
    df_households["household_weight"] = df_households["POIDSM"].astype(float)

    # Clean age
    df_persons["age"] = df_persons["AGE"].astype(int)

    # Clean sex
    df_persons.loc[df_persons["SEXE"] == 1, "sex"] = "male"
    df_persons.loc[df_persons["SEXE"] == 2, "sex"] = "female"
    df_persons["sex"] = df_persons["sex"].astype("category")

    # Household size
    df_households["household_size"] = df_households["MNP"].astype(int)

    # Clean departement (kept as string categories)
    df_persons["departement_id"] = df_persons["RESDEP"].astype(str).astype(
        "category")
    df_households["departement_id"] = df_households["RESDEP"].astype(
        str).astype("category")
    df_trips["origin_departement_id"] = df_trips["ORDEP"].astype(str).astype(
        "category")
    df_trips["destination_departement_id"] = df_trips["DESTDEP"].astype(
        str).astype("category")

    # Clean employment
    df_persons["employed"] = df_persons["OCCP"].isin([1.0, 2.0])

    # Studies
    df_persons["studies"] = df_persons["OCCP"].isin([3.0, 4.0, 5.0])

    # Number of vehicles: motorised two-wheelers plus NB_VD
    df_households["number_of_vehicles"] = (
        df_households["NB_2RM"] + df_households["NB_VD"]).astype(int)
    df_households["number_of_bikes"] = df_households["NB_VELO"].astype(int)

    # License (car or motorised two-wheeler)
    df_persons["has_license"] = (df_persons["PERMVP"]
                                 == 1) | (df_persons["PERM2RM"] == 1)

    # Has subscription
    df_persons["has_pt_subscription"] = df_persons["ABONTC"] > 1

    # Household income: REVENU shifted to start at 0; the shifted values
    # 10 and 11 as well as missing values are mapped to -1.
    df_households["income_class"] = df_households["REVENU"] - 1
    df_households.loc[df_households["income_class"].isin([10.0, 11.0, np.nan]),
                      "income_class"] = -1
    df_households["income_class"] = df_households["income_class"].astype(int)

    # Trip purpose: anything not listed in PURPOSE_MAP stays "other"
    df_trips["following_purpose"] = "other"
    df_trips["preceding_purpose"] = "other"

    for category, purpose in PURPOSE_MAP.items():
        df_trips.loc[df_trips["DESTMOT_H9"] == category,
                     "following_purpose"] = purpose
        df_trips.loc[df_trips["ORMOT_H9"] == category,
                     "preceding_purpose"] = purpose

    df_trips["following_purpose"] = df_trips["following_purpose"].astype(
        "category")
    df_trips["preceding_purpose"] = df_trips["preceding_purpose"].astype(
        "category")

    # Trip mode: anything not listed in MODES_MAP stays "pt"
    df_trips["mode"] = "pt"

    for category, mode in MODES_MAP.items():
        df_trips.loc[df_trips["MODP_H7"] == category, "mode"] = mode

    df_trips["mode"] = df_trips["mode"].astype("category")

    # Further trip attributes
    df_trips["euclidean_distance"] = df_trips["DPORTEE"] * 1000.0

    # Trip times in seconds, built from the hour and minute columns
    df_trips["departure_time"] = (df_trips["ORH"] * 3600.0 +
                                  df_trips["ORM"] * 60.0)
    df_trips["arrival_time"] = (df_trips["DESTH"] * 3600.0 +
                                df_trips["DESTM"] * 60.0)
    df_trips = hts.fix_trip_times(df_trips)

    # Durations
    df_trips["trip_duration"] = df_trips["arrival_time"] - df_trips[
        "departure_time"]
    hts.compute_activity_duration(df_trips)

    # Add weight to trips (the weight of the person making the trip)
    df_trips = pd.merge(
        df_trips,
        df_persons[["person_id", "person_weight"]],
        on="person_id",
        how="left").rename(columns={"person_weight": "trip_weight"})
    df_persons["trip_weight"] = df_persons["person_weight"]

    # Chain length
    df_persons["number_of_trips"] = df_persons["NBDEPL"].fillna(0).astype(int)

    # Passenger attribute
    df_persons["is_passenger"] = df_persons["person_id"].isin(
        df_trips[df_trips["mode"] == "car_passenger"]["person_id"].unique())

    # Calculate consumption units
    hts.check_household_size(df_households, df_persons)
    df_households = pd.merge(df_households,
                             hts.calculate_consumption_units(df_persons),
                             on="household_id")

    # Socioprofessional class (missing values become 8)
    df_persons["socioprofessional_class"] = df_persons["CS8"].fillna(8).astype(
        int)

    # Drop people that have NaN departure or arrival times in any trip
    f = df_trips["departure_time"].isna()
    f |= df_trips["arrival_time"].isna()

    f = df_persons["person_id"].isin(df_trips[f]["person_id"])

    nan_count = np.count_nonzero(f)
    total_count = len(df_persons)

    print(
        "Dropping %d/%d persons because of NaN values in departure and arrival times"
        % (nan_count, total_count))

    # Keep trips and households consistent with the remaining persons
    df_persons = df_persons[~f]
    df_trips = df_trips[df_trips["person_id"].isin(
        df_persons["person_id"].unique())]
    df_households = df_households[df_households["household_id"].isin(
        df_persons["household_id"])]

    # Fix activity types (because of inconsistent EGT data and removing in the timing fixing step)
    hts.fix_activity_types(df_trips)

    return df_households, df_persons, df_trips
Exemple #4
0
def execute(context):
    """Clean the raw EMC2 tables into household, person and trip frames.

    Reads the three tables of the ``data.hts.emc2.raw`` stage, assigns new
    consecutive ``household_id`` / ``person_id`` / ``trip_id`` values,
    attaches commune and departement codes through ``df_zone`` and derives
    the cleaned attributes. Persons with a missing departure or arrival time
    in any trip are dropped together with their trips.

    ``df_zone`` is not defined in this function; it has to be available in
    the enclosing module scope and provide the columns ``ZF`` and ``INSEE``.

    Args:
        context: Pipeline context; only ``context.stage`` is called here.

    Returns:
        tuple: ``(df_households, df_persons, df_trips)``.
    """
    df_households, df_persons, df_trips = context.stage("data.hts.emc2.raw")

    # Make copies
    df_households = pd.DataFrame(df_households, copy=True)
    df_persons = pd.DataFrame(df_persons, copy=True)
    df_trips = pd.DataFrame(df_trips, copy=True)

    # Zone codes are cut down by integer division by 100 so they can be
    # joined against df_zone["ZF"] below.
    df_households['ZFM'] = df_households['ZFM'].map(lambda x: int(x) // 100)
    df_persons['ZFP'] = df_persons['ZFP'].map(lambda x: int(x) // 100)
    df_trips['ZFD'] = df_trips['ZFD'].map(lambda x: int(x) // 100)
    df_trips['D3'] = df_trips['D3'].map(lambda x: int(x // 100))
    df_trips['D7'] = df_trips['D7'].map(lambda x: int(x // 100))

    # Construct new IDs for households, persons and trips (which are unique globally)
    df_households["household_id"] = np.arange(len(df_households))

    df_persons = pd.merge(df_persons,
                          df_households[["MID", "household_id"]],
                          on="MID")
    df_persons["person_id"] = np.arange(len(df_persons))

    df_trips = pd.merge(
        df_trips,
        df_persons[["PER", "MID", "person_id", "household_id"]],
        on=["PER", "MID"])
    df_trips["trip_id"] = np.arange(len(df_trips))

    # Trip flags
    df_trips = hts.compute_first_last(df_trips)

    # Weight
    df_households["household_weight"] = df_households["COEM"]
    df_persons["person_weight"] = df_persons["COE2"]

    # Clean age (builtin int replaces the np.int alias removed in NumPy 1.24)
    df_persons["age"] = df_persons["P4"].astype(int)

    # Clean sex
    df_persons.loc[df_persons["P2"] == 1, "sex"] = "male"
    df_persons.loc[df_persons["P2"] == 2, "sex"] = "female"
    df_persons["sex"] = df_persons["sex"].astype("category")

    # Household size: look the count up by household_id instead of relying
    # on the row index of df_households matching the household IDs.
    persons_per_household = df_persons.groupby("household_id").size()
    df_households["household_size"] = df_households["household_id"].map(
        persons_per_household)

    # Clean commune
    df_persons = df_persons.merge(
        df_zone, left_on='ZFP', right_on='ZF',
        how='left').rename(columns={'INSEE': 'commune_id'})
    df_households = df_households.merge(
        df_zone, left_on='ZFM', right_on='ZF',
        how='left').rename(columns={'INSEE': 'commune_id'})

    # Zones without a match get commune 0. Assign the result back: an
    # in-place fillna on a selected column does not update the frame when
    # pandas Copy-on-Write is active.
    df_persons["commune_id"] = df_persons["commune_id"].fillna(0)
    df_households["commune_id"] = df_households["commune_id"].fillna(0)

    df_trips = df_trips.merge(
        df_zone, left_on='D3', right_on='ZF',
        how='left').rename(columns={'INSEE': 'origin_commune_id'})
    df_trips = df_trips.merge(
        df_zone, left_on='D7', right_on='ZF',
        how='left').rename(columns={'INSEE': 'destination_commune_id'})

    df_trips["origin_commune_id"] = df_trips["origin_commune_id"].fillna(0)
    df_trips["destination_commune_id"] = df_trips[
        "destination_commune_id"].fillna(0)

    df_persons["commune_id"] = df_persons["commune_id"].astype("category")
    df_households["commune_id"] = df_households["commune_id"].astype(
        "category")
    df_trips["origin_commune_id"] = df_trips["origin_commune_id"].astype(
        "category")
    df_trips["destination_commune_id"] = df_trips[
        "destination_commune_id"].astype("category")

    # Clean departement: the commune code divided by 1000, truncated
    df_persons["departement_id"] = (
        df_persons["commune_id"].astype(int) /
        1000).astype(int).astype(str).astype("category")
    df_households["departement_id"] = (
        df_households["commune_id"].astype(int) /
        1000).astype(int).astype(str).astype("category")
    df_trips["origin_departement_id"] = (
        df_trips["origin_commune_id"].astype(int) /
        1000).astype(int).astype(str).astype("category")
    df_trips["destination_departement_id"] = (
        df_trips["destination_commune_id"].astype(int) /
        1000).astype(int).astype(str).astype("category")

    # Clean employment
    occupation = df_persons["P9"].astype('Float32')
    df_persons["employed"] = occupation.isin([1.0, 2.0])

    # Studies
    df_persons["studies"] = occupation.isin([3.0, 4.0, 5.0])

    # Number of vehicles
    df_households["number_of_vehicles"] = (
        df_households["M6"] + df_households["M14"]).astype(int)
    df_households["number_of_bikes"] = df_households["M21"].astype(int)

    # License
    df_persons["has_license"] = (df_persons["P7"] == 1)

    # Has subscription
    df_persons["has_pt_subscription"] = (df_persons["P12"] != 4)

    # Household income: no income column is read here, every household
    # gets the constant class 0.
    df_households["income_class"] = 0

    # Trip purpose: PURPOSE_MAP maps a purpose to the survey codes it covers
    df_trips["following_purpose"] = "other"
    df_trips["preceding_purpose"] = "other"

    for purpose, category in PURPOSE_MAP.items():
        df_trips.loc[df_trips["D2A"].isin(category),
                     "preceding_purpose"] = purpose
        df_trips.loc[df_trips["D5A"].isin(category),
                     "following_purpose"] = purpose

    df_trips["preceding_purpose"] = df_trips["preceding_purpose"].astype(
        "category")
    df_trips["following_purpose"] = df_trips["following_purpose"].astype(
        "category")

    # Trip mode: anything not listed in MODES_MAP stays "walk"
    df_trips["mode"] = "walk"

    for mode, category in MODES_MAP.items():
        df_trips.loc[df_trips["MODP"].isin(category), "mode"] = mode

    df_trips["mode"] = df_trips["mode"].astype("category")

    # Further trip attributes
    df_trips["euclidean_distance"] = df_trips["DOIB"]

    # Trip times in seconds, built from the hour and minute columns
    df_trips["departure_time"] = df_trips["D4A"] * 3600 + df_trips["D4B"] * 60
    df_trips["arrival_time"] = df_trips["D8A"] * 3600 + df_trips["D8B"] * 60
    df_trips = hts.fix_trip_times(df_trips)

    # Durations
    df_trips["trip_duration"] = df_trips["arrival_time"] - df_trips[
        "departure_time"]
    hts.compute_activity_duration(df_trips)

    # Add weight to trips (the weight of the person making the trip)
    df_trips = pd.merge(
        df_trips,
        df_persons[["person_id", "person_weight"]],
        on="person_id",
        how="left").rename(columns={"person_weight": "trip_weight"})
    df_persons["trip_weight"] = df_persons["person_weight"]

    # Chain length: counted per person_id; persons without any trip get 0
    # (instead of NaN) so the column is a proper integer column.
    trips_per_person = df_trips.groupby("person_id").size()
    df_persons["number_of_trips"] = df_persons["person_id"].map(
        trips_per_person).fillna(0).astype(int)

    # Passenger attribute
    df_persons["is_passenger"] = df_persons["person_id"].isin(
        df_trips[df_trips["mode"] == "car_passenger"]["person_id"].unique())

    # Calculate consumption units
    hts.check_household_size(df_households, df_persons)
    df_households = pd.merge(df_households,
                             hts.calculate_consumption_units(df_persons),
                             on="household_id")

    # Socioprofessional class: missing values and classes 7, 8, 9 become 8,
    # then persons with P9 == 7 are set to class 7.
    df_persons["socioprofessional_class"] = df_persons["P11"].fillna(8).astype(
        int)
    df_persons.loc[df_persons["socioprofessional_class"].isin([7, 8, 9]),
                   "socioprofessional_class"] = 8
    df_persons.loc[df_persons["P9"] == 7, "socioprofessional_class"] = 7

    # Drop people that have NaN departure or arrival times in any trip
    f = df_trips["departure_time"].isna()
    f |= df_trips["arrival_time"].isna()

    f = df_persons["person_id"].isin(df_trips[f]["person_id"])

    nan_count = np.count_nonzero(f)
    total_count = len(df_persons)

    print(
        "Dropping %d/%d persons because of NaN values in departure and arrival times"
        % (nan_count, total_count))

    # Keep trips and households consistent with the remaining persons
    df_persons = df_persons[~f]
    df_trips = df_trips[df_trips["person_id"].isin(
        df_persons["person_id"].unique())]
    df_households = df_households[df_households["household_id"].isin(
        df_persons["household_id"])]

    # Fix activity types (because of inconsistent survey data and removing in the timing fixing step)
    hts.fix_activity_types(df_trips)

    return df_households, df_persons, df_trips
Exemple #5
0
def execute(context):
    """Clean the raw census table into one row per person.

    Reads ``census.hdf`` from the ``data.census.raw`` path, builds household
    and person IDs, validates commune and IRIS codes against the
    ``data.spatial.codes`` stage and derives the cleaned attributes.

    Args:
        context: Pipeline context; ``context.path`` and ``context.stage``
            are called here.

    Returns:
        pandas.DataFrame: One row per person with the columns listed in the
        final selection of this function.

    Raises:
        RuntimeError: If the census contains a commune or IRIS code (other
            than "undefined") that is missing from the requested codes.
    """
    df = pd.read_hdf("%s/census.hdf" % context.path("data.census.raw"))

    # Construct household IDs for persons with NUMMI != Z.
    # The explicit copy avoids writing new columns into a slice of df.
    df_household_ids = df[["CANTVILLE", "NUMMI"]]
    df_household_ids = df_household_ids[df_household_ids["NUMMI"] !=
                                        "Z"].copy()
    df_household_ids["temporary"] = df_household_ids[
        "CANTVILLE"] + df_household_ids["NUMMI"]
    df_household_ids = df_household_ids.drop_duplicates("temporary")
    df_household_ids["household_id"] = np.arange(len(df_household_ids))
    df = pd.merge(df, df_household_ids, on=["CANTVILLE", "NUMMI"], how="left")

    # Fill up undefined household ids (those where NUMMI == Z): every such
    # person gets an own ID. Numbering starts one past the largest existing
    # ID so the first generated ID cannot collide with a real household.
    f = df["household_id"].isna()
    first_free_id = 0 if f.all() else int(df["household_id"].max()) + 1
    df.loc[f, "household_id"] = np.arange(np.count_nonzero(f)) + first_free_id
    # The builtin int replaces the np.int alias removed in NumPy 1.24.
    df["household_id"] = df["household_id"].astype(int)

    # Put person IDs
    df["person_id"] = np.arange(len(df))

    # Sorting
    df = df.sort_values(by=["household_id", "person_id"])

    # Spatial information
    df["departement_id"] = df["DEPT"].astype("category")

    # The commune is the first five characters of the IRIS code
    df["commune_id"] = df["IRIS"].str[:5]
    f_undefined = df["commune_id"].str.contains("Z")
    df.loc[f_undefined, "commune_id"] = "undefined"
    df["commune_id"] = df["commune_id"].astype("category")

    df["iris_id"] = df["IRIS"]
    f_undefined = df["iris_id"].str.contains("Z") | df["iris_id"].str.contains(
        "X")
    df.loc[f_undefined, "iris_id"] = "undefined"
    df["iris_id"] = df["iris_id"].astype("category")

    # Verify with requested codes. "undefined" is the only code allowed to
    # be missing from the requested ones; it is removed before the check so
    # data without any undefined entry passes as well.
    df_codes = context.stage("data.spatial.codes")

    excess_communes = set(df["commune_id"].unique()) - set(
        df_codes["commune_id"].unique()) - {"undefined"}
    if excess_communes:
        raise RuntimeError("Found additional communes: %s" % excess_communes)

    excess_iris = set(df["iris_id"].unique()) - set(
        df_codes["iris_id"].unique()) - {"undefined"}
    if excess_iris:
        raise RuntimeError("Found additional IRIS: %s" % excess_iris)

    # Age: strip the zero padding, keeping a single "0" for "000"
    df["age"] = df["AGED"].apply(
        lambda x: "0" if x == "000" else x.lstrip("0")).astype(int)

    # Clean COUPLE
    df["couple"] = df["COUPLE"] == "1"

    # Clean TRANS: codes "1" and "Z" have no commute mode
    df.loc[df["TRANS"] == "1", "commute_mode"] = np.nan
    df.loc[df["TRANS"] == "2", "commute_mode"] = "walk"
    df.loc[df["TRANS"] == "3", "commute_mode"] = "bike"
    df.loc[df["TRANS"] == "4", "commute_mode"] = "car"
    df.loc[df["TRANS"] == "5", "commute_mode"] = "pt"
    df.loc[df["TRANS"] == "Z", "commute_mode"] = np.nan
    df["commute_mode"] = df["commute_mode"].astype("category")

    # Weight
    df["weight"] = df["IPONDI"].astype(float)

    # Clean SEXE
    df.loc[df["SEXE"] == "1", "sex"] = "male"
    df.loc[df["SEXE"] == "2", "sex"] = "female"
    df["sex"] = df["sex"].astype("category")

    # Clean employment
    df["employed"] = df["TACT"] == "11"

    # Studies
    df["studies"] = df["ETUD"] == "1"

    # Number of vehicles: VOIT plus DEROU, with the letter codes
    # ("Z", "X", and "U" for DEROU) counted as 0
    df["number_of_vehicles"] = df["VOIT"].apply(
        lambda x: str(x).replace("Z", "0").replace("X", "0")).astype(int)

    df["number_of_vehicles"] += df["DEROU"].apply(lambda x: str(x).replace(
        "U", "0").replace("Z", "0").replace("X", "0")).astype(int)

    # Household size: number of person rows per household
    df_size = df[[
        "household_id"
    ]].groupby("household_id").size().reset_index(name="household_size")
    df = pd.merge(df, df_size, on="household_id")

    # Socioprofessional category
    df["socioprofessional_class"] = df["CS1"].astype(int)

    # Place of work or education
    df["work_outside_region"] = df["ILT"].isin(("4", "5", "6"))
    df["education_outside_region"] = df["ILETUD"].isin(("4", "5", "6"))

    # Consumption units
    df = pd.merge(df, hts.calculate_consumption_units(df), on="household_id")

    return df[[
        "person_id", "household_id", "weight", "iris_id", "commune_id",
        "departement_id", "age", "sex", "couple", "commute_mode", "employed",
        "studies", "number_of_vehicles", "household_size",
        "work_outside_region", "education_outside_region", "consumption_units",
        "socioprofessional_class"
    ]]
def execute(context):
    """Clean the raw tables delivered by the ``data.hts.edgt_lyon.raw_cerema`` stage.

    The stage result is unpacked into household, person, trip and spatial
    tables. The function then builds globally unique IDs, derives
    socio-demographic attributes, maps trip purposes and modes through
    ``PURPOSE_MAP`` / ``MODES_MAP``, converts trip times to seconds and
    attaches weights.

    Args:
        context: Pipeline context; only ``context.stage(...)`` is used here.

    Returns:
        A tuple ``(df_households, df_persons, df_trips)`` of cleaned
        DataFrames.

    Raises:
        AssertionError: If a trip purpose is left unmapped, or a departure or
            arrival time is missing after the timing fix.
    """
    df_households, df_persons, df_trips, df_spatial = context.stage(
        "data.hts.edgt_lyon.raw_cerema")

    # Merge departement into households: the departement is taken as the
    # first two characters of the commune code, keyed by zone (as string).
    df_spatial = df_spatial[["ZF__2015", "DepCom"]].copy()
    df_spatial["MP2"] = df_spatial["ZF__2015"].astype(str)
    df_spatial["departement_id"] = df_spatial["DepCom"].str[:2]
    df_spatial = df_spatial[["MP2", "departement_id"]]

    # Left merge: every household is kept; those whose MP2 zone has no match
    # in the spatial table end up with departement_id "unknown".
    df_households = pd.merge(df_households, df_spatial, on="MP2", how="left")
    df_households["departement_id"] = df_households["departement_id"].fillna(
        "unknown")

    # Transform original IDs to integer (they are hierarchical).
    # The builtin int/float are used instead of the np.int/np.float aliases,
    # which were removed in NumPy 1.24.
    df_households["edgt_household_id"] = (df_households["ECH"] +
                                          df_households["MP2"]).astype(int)
    df_persons["edgt_person_id"] = df_persons["PER"].astype(int)
    df_persons["edgt_household_id"] = (df_persons["ECH"] +
                                       df_persons["PP2"]).astype(int)
    df_trips["edgt_person_id"] = df_trips["PER"].astype(int)
    df_trips["edgt_household_id"] = (df_trips["ECH"] +
                                     df_trips["DP2"]).astype(int)
    df_trips["edgt_trip_id"] = df_trips["NDEP"].astype(int)

    # Construct new IDs for households, persons and trips (which are unique globally)
    df_households["household_id"] = np.arange(len(df_households))

    # Default (inner) merge: persons without a matching household are dropped.
    # Sorting before numbering makes person_id follow household order.
    df_persons = pd.merge(
        df_persons,
        df_households[["edgt_household_id", "household_id", "departement_id"]],
        on=["edgt_household_id"
            ]).sort_values(by=["household_id", "edgt_person_id"])
    df_persons["person_id"] = np.arange(len(df_persons))

    # Same pattern for trips: inner merge on the original person/household
    # keys, then number the trips in sorted order.
    df_trips = pd.merge(
        df_trips,
        df_persons[[
            "edgt_person_id", "edgt_household_id", "person_id", "household_id"
        ]],
        on=["edgt_person_id", "edgt_household_id"
            ]).sort_values(by=["household_id", "person_id", "edgt_trip_id"])
    df_trips["trip_id"] = np.arange(len(df_trips))

    # Trip flags
    df_trips = hts.compute_first_last(df_trips)

    # Weight
    df_persons["person_weight"] = df_persons["COEP"].astype(float)
    df_households["household_weight"] = df_households["COEM"].astype(float)

    # Clean age
    df_persons["age"] = df_persons["P4"].astype(int)

    # Clean sex (rows with any other P2 value stay missing)
    df_persons.loc[df_persons["P2"] == 1, "sex"] = "male"
    df_persons.loc[df_persons["P2"] == 2, "sex"] = "female"
    df_persons["sex"] = df_persons["sex"].astype("category")

    # Household size: number of person rows per household
    df_size = df_persons.groupby("household_id").size().reset_index(
        name="household_size")
    df_households = pd.merge(df_households, df_size, on="household_id")

    # Clean departement: look up origin (D3) and destination (D7) zones in
    # the same spatial table; unmatched zones become "unknown".
    df_trips = pd.merge(
        df_trips,
        df_spatial.rename(columns={
            "MP2": "D3",
            "departement_id": "origin_departement_id"
        }),
        on="D3",
        how="left")

    df_trips = pd.merge(
        df_trips,
        df_spatial.rename(columns={
            "MP2": "D7",
            "departement_id": "destination_departement_id"
        }),
        on="D7",
        how="left")

    df_trips["origin_departement_id"] = df_trips[
        "origin_departement_id"].fillna("unknown")
    df_trips["destination_departement_id"] = df_trips[
        "destination_departement_id"].fillna("unknown")

    df_households["departement_id"] = df_households["departement_id"].astype(
        "category")
    df_persons["departement_id"] = df_persons["departement_id"].astype(
        "category")
    df_trips["origin_departement_id"] = df_trips[
        "origin_departement_id"].astype("category")
    df_trips["destination_departement_id"] = df_trips[
        "destination_departement_id"].astype("category")

    # Clean employment
    df_persons["employed"] = df_persons["P7"].isin(["1", "2"])

    # Studies
    df_persons["studies"] = df_persons["P7"].isin(["3", "4", "5"])

    # Number of vehicles (sum of the M6 and M5 household counts)
    df_households[
        "number_of_vehicles"] = df_households["M6"] + df_households["M5"]
    df_households["number_of_vehicles"] = df_households[
        "number_of_vehicles"].astype(int)
    df_households["number_of_bikes"] = df_households["M7"].astype(int)

    # License
    df_persons["has_license"] = df_persons["P5"] == "1"

    # Has subscription
    df_persons["has_pt_subscription"] = df_persons["P10"].isin(["1", "2", "3"])

    # Trip purpose: start from a sentinel so that unmapped codes are detected
    # by the assertions below.
    df_trips["following_purpose"] = "invalid"
    df_trips["preceding_purpose"] = "invalid"

    for purpose, values in PURPOSE_MAP.items():
        df_trips.loc[df_trips["D5A"].isin(values),
                     "following_purpose"] = purpose
        df_trips.loc[df_trips["D2A"].isin(values),
                     "preceding_purpose"] = purpose

    assert np.count_nonzero(df_trips["following_purpose"] == "invalid") == 0
    assert np.count_nonzero(df_trips["preceding_purpose"] == "invalid") == 0

    df_trips["following_purpose"] = df_trips["following_purpose"].astype(
        "category")
    df_trips["preceding_purpose"] = df_trips["preceding_purpose"].astype(
        "category")

    # Trip mode (trips whose MODP is in no MODES_MAP entry stay missing)
    for mode, values in MODES_MAP.items():
        df_trips.loc[df_trips["MODP"].isin(values), "mode"] = mode

    # NOTE(review): this repeats the purpose check above; presumably it was
    # meant to verify that every trip received a mode - confirm before
    # changing, since a stricter check may fail on existing data.
    assert np.count_nonzero(df_trips["following_purpose"] == "invalid") == 0
    df_trips["mode"] = df_trips["mode"].astype("category")

    # Further trip attributes
    df_trips["euclidean_distance"] = df_trips["DOIB"]
    df_trips["routed_distance"] = df_trips["DIST"]

    # Trip times: D4 / D8 are split as HHMM (hundreds = hour, remainder =
    # minute) and converted to seconds.
    df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100)  # hour
    df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100)  # minute

    df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100)  # hour
    df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100)  # minute

    df_trips = df_trips.sort_values(
        by=["household_id", "person_id", "trip_id"])
    df_trips = hts.fix_trip_times(df_trips)

    # Durations
    df_trips["trip_duration"] = df_trips["arrival_time"] - df_trips[
        "departure_time"]
    # The return value is not used; presumably this updates df_trips in
    # place - verify against the hts helper.
    hts.compute_activity_duration(df_trips)

    # Add weight to trips: each trip carries its person's COEQ value
    df_trips = pd.merge(df_trips,
                        df_persons[["person_id", "COEQ"]],
                        on="person_id",
                        how="left").rename(columns={"COEQ": "trip_weight"})
    df_persons["trip_weight"] = df_persons["COEQ"]

    # Chain length: persons without any trip row get -1
    df_count = df_trips[[
        "person_id"
    ]].groupby("person_id").size().reset_index(name="number_of_trips")
    df_persons = pd.merge(df_persons, df_count, on="person_id", how="left")
    df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(
        -1).astype(int)

    # Passenger attribute: True if the person has at least one
    # "car_passenger" trip
    df_persons["is_passenger"] = df_persons["person_id"].isin(
        df_trips[df_trips["mode"] == "car_passenger"]["person_id"].unique())

    # Calculate consumption units
    hts.check_household_size(df_households, df_persons)
    df_households = pd.merge(df_households,
                             hts.calculate_consumption_units(df_persons),
                             on="household_id")

    # Socioprofessional class (missing P9 is replaced by 8)
    df_persons["socioprofessional_class"] = df_persons["P9"].fillna(8).astype(
        int)

    # Check departure and arrival times
    assert np.count_nonzero(df_trips["departure_time"].isna()) == 0
    assert np.count_nonzero(df_trips["arrival_time"].isna()) == 0

    # Fix activity types (because of inconsistent survey data and removals in
    # the timing fixing step)
    hts.fix_activity_types(df_trips)

    return df_households, df_persons, df_trips