#df_trips.drop([df_trips["origin_commune_id"]=="aaaaa"], inplace=True) #df_trips.drop([df_trips["destination_commune_id"]=="aaaaa"], inplace=True) #df_trips = df_trips[~df_trips["origin_commune_id"].isin(["aaaaa"]) | ~df_trips["destination_commune_id"].isin(["aaaaa"])] df_trips = df_trips[~df_trips["origin_commune_id"].isin(["aaaaa"])] df_trips = df_trips[~df_trips["destination_commune_id"].isin(["aaaaa"])] # Clean employment df_persons["employed"] = df_persons["P9"].isin([1, 2, 3] )##1: temps plein, 2: temps partiel et 3: apprentit, # stage, formation # Studies df_persons["studies"] = df_persons["P9"].isin([4, 5] )##4:etudiant 5: scolaire jusqu'au BAC # Calculate consumption units hts.check_household_size(df_households, df_persons) df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id") # Socioprofessional class df_persons["socioprofessional_class"] = df_persons["PCSD"].fillna(00).astype(int) # Number of vehicles df_households["number_of_vehicles"] = df_households["M6"] + df_households["M14" ]##M6: Voiture particulière # M14: Nbre de 2/3 roues motorisées df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(np.int) df_households["number_of_bikes"] = df_households["M21"].astype(np.int) # License df_persons["has_license"] = df_persons["P7"] == 1 # Car availability df_persons = pd.merge(
def execute(context):
    """Clean the raw ENTD tables into household, person and trip frames.

    Reads the six tables produced by the ``data.hts.entd.raw`` stage, works
    on copies of the TCM person/household tables and the trip table, and
    derives the harmonised columns (IDs, weights, socio-demographics, trip
    purposes, modes and times).

    Args:
        context: Pipeline context; only ``context.stage`` is used.

    Returns:
        Tuple ``(df_households, df_persons, df_trips)``.
    """
    # df_comm is part of the stage output but is not used here.
    (df_individu, df_tcm_individu, df_menage, df_tcm_menage, df_deploc,
     df_comm) = context.stage("data.hts.entd.raw")

    # Make copies
    df_persons = pd.DataFrame(df_tcm_individu, copy=True)
    df_households = pd.DataFrame(df_tcm_menage, copy=True)
    df_trips = pd.DataFrame(df_deploc, copy=True)

    # Get weights for persons that actually have trips
    df_persons = pd.merge(
        df_persons,
        df_trips[["IDENT_IND", "PONDKI"]].drop_duplicates("IDENT_IND"),
        on="IDENT_IND", how="left")
    df_persons["is_kish"] = ~df_persons["PONDKI"].isna()
    df_persons["trip_weight"] = df_persons["PONDKI"].fillna(0.0)

    # Important: If someone did not have any trips on the reference day, ENTD asked
    # for another day. With this flag we make sure that we only cover "reference days".
    f = df_trips["V2_MOBILREF"] == 1
    # Explicit copy: columns are added to the filtered frame below.
    df_trips = df_trips[f].copy()
    print("Filtering out %d non-reference day trips" % np.count_nonzero(~f))

    # Merge in additional information from ENTD
    df_households = pd.merge(
        df_households,
        df_menage[[
            "idENT_MEN", "V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO",
            "V1_JNBVELOADT"
        ]],
        on="idENT_MEN", how="left")

    df_persons = pd.merge(
        df_persons,
        df_individu[[
            "IDENT_IND", "V1_GPERMIS", "V1_GPERMIS2R", "V1_ICARTABON"
        ]],
        on="IDENT_IND", how="left")

    # Transform original IDs to integer (they are hierarchical).
    # The builtin int/float/str replace the np.int/np.float/np.str aliases,
    # which no longer exist in current NumPy releases.
    df_persons["entd_person_id"] = df_persons["IDENT_IND"].astype(int)
    df_persons["entd_household_id"] = df_persons["IDENT_MEN"].astype(int)
    df_households["entd_household_id"] = df_households["idENT_MEN"].astype(int)
    df_trips["entd_person_id"] = df_trips["IDENT_IND"].astype(int)

    # Construct new IDs for households, persons and trips (which are unique globally)
    df_households["household_id"] = np.arange(len(df_households))

    df_persons = pd.merge(
        df_persons,
        df_households[["entd_household_id", "household_id"]],
        on="entd_household_id")
    df_persons["person_id"] = np.arange(len(df_persons))

    df_trips = pd.merge(
        df_trips,
        df_persons[["entd_person_id", "person_id", "household_id"]],
        on=["entd_person_id"])
    df_trips["trip_id"] = np.arange(len(df_trips))

    # Weight
    df_persons["person_weight"] = df_persons["PONDV1"].astype(float)
    df_households["household_weight"] = df_households["PONDV1"].astype(float)

    # Clean age
    df_persons.loc[:, "age"] = df_persons["AGE"]

    # Clean sex
    df_persons.loc[df_persons["SEXE"] == 1, "sex"] = "male"
    df_persons.loc[df_persons["SEXE"] == 2, "sex"] = "female"
    df_persons["sex"] = df_persons["sex"].astype("category")

    # Household size
    df_households["household_size"] = df_households["NPERS"]

    # Clean departement: codes that are not numeric become -1
    def to_departement(codes):
        return pd.to_numeric(codes, errors="coerce").fillna(-1).astype(int)

    df_households["departement_id"] = to_departement(df_households["DEP"])
    df_persons["departement_id"] = to_departement(df_persons["DEP"])
    df_trips["origin_departement_id"] = to_departement(df_trips["V2_MORIDEP"])
    df_trips["destination_departement_id"] = to_departement(
        df_trips["V2_MDESDEP"])

    # Clean employment
    df_persons["employed"] = df_persons["SITUA"].isin([1, 2])

    # Studies
    # Many < 14 year old have NaN
    df_persons["studies"] = df_persons["ETUDES"].fillna(1) == 1

    # Number of vehicles: cars, motorcycles and mopeds. V1_JNBAUTVEH and
    # V1_JNBCCVUL were disabled in the original code and stay excluded.
    df_households["number_of_vehicles"] = 0
    for column in ("V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO"):
        df_households["number_of_vehicles"] += df_households[column].fillna(0)
    df_households["number_of_vehicles"] = df_households[
        "number_of_vehicles"].astype(int)

    df_households["number_of_bikes"] = df_households["V1_JNBVELOADT"].fillna(
        0).astype(int)

    # License
    df_persons["has_license"] = (df_persons["V1_GPERMIS"] == 1) | (
        df_persons["V1_GPERMIS2R"] == 1)

    # Has subscription
    df_persons["has_pt_subscription"] = df_persons["V1_ICARTABON"] == 1

    # Household income: the position of the matching label prefix is the
    # income class; labels matching no prefix keep -1.
    income_prefixes = (
        "Moins de 400", "De 400", "De 600", "De 800", "De 1 000", "De 1 200",
        "De 1 500", "De 1 800", "De 2 000", "De 2 500", "De 3 000",
        "De 4 000", "De 6 000", "10 000",
    )
    income_labels = df_households["TrancheRevenuMensuel"]
    df_households["income_class"] = -1
    for income_class, prefix in enumerate(income_prefixes):
        df_households.loc[income_labels.str.startswith(prefix),
                          "income_class"] = income_class
    df_households["income_class"] = df_households["income_class"].astype(int)

    # Trip purpose
    # NOTE(review): the column is spelled "preceeding_purpose" here, while
    # the other survey cleaners in this file write "preceding_purpose".
    # Confirm which spelling downstream code expects before renaming.
    df_trips["following_purpose"] = "other"
    df_trips["preceeding_purpose"] = "other"

    # The string conversion does not depend on the loop variable, so do it once.
    destination_codes = df_trips["V2_MMOTIFDES"].astype(str)
    origin_codes = df_trips["V2_MMOTIFORI"].astype(str)

    for prefix, activity_type in PURPOSE_MAP:
        df_trips.loc[destination_codes.str.startswith(prefix),
                     "following_purpose"] = activity_type
        df_trips.loc[origin_codes.str.startswith(prefix),
                     "preceeding_purpose"] = activity_type

    df_trips["following_purpose"] = df_trips["following_purpose"].astype(
        "category")
    df_trips["preceeding_purpose"] = df_trips["preceeding_purpose"].astype(
        "category")

    # Trip mode ("pt" unless a prefix from MODES_MAP matches)
    df_trips["mode"] = "pt"
    mode_codes = df_trips["V2_MTP"].astype(str)

    for prefix, mode in MODES_MAP:
        df_trips.loc[mode_codes.str.startswith(prefix), "mode"] = mode

    df_trips["mode"] = df_trips["mode"].astype("category")

    # Further trip attributes
    df_trips["routed_distance"] = df_trips["V2_MDISTTOT"] * 1000.0
    # This should be just one within Île-de-France
    df_trips["routed_distance"] = df_trips["routed_distance"].fillna(0.0)

    # Only leave weekday trips
    f = df_trips["V2_TYPJOUR"] == 1
    print("Removing %d trips on weekends" % np.count_nonzero(~f))
    df_trips = df_trips[f]

    # Only leave one day per person (the smallest IDENT_JOUR of each person)
    initial_count = len(df_trips)

    df_first_day = df_trips[["person_id", "IDENT_JOUR"]].sort_values(
        by=["person_id", "IDENT_JOUR"]).drop_duplicates("person_id")
    df_trips = pd.merge(df_trips, df_first_day, how="inner",
                        on=["person_id", "IDENT_JOUR"])

    final_count = len(df_trips)
    print("Removed %d trips for non-primary days" %
          (initial_count - final_count))

    # Trip flags
    df_trips = hts.compute_first_last(df_trips)

    # Trip times
    df_trips["departure_time"] = df_trips["V2_MORIHDEP"].apply(
        convert_time).astype(float)
    df_trips["arrival_time"] = df_trips["V2_MDESHARR"].apply(
        convert_time).astype(float)
    df_trips = hts.fix_trip_times(df_trips)

    # Durations
    df_trips["trip_duration"] = (df_trips["arrival_time"] -
                                 df_trips["departure_time"])
    hts.compute_activity_duration(df_trips)

    # Add weight to trips
    df_trips["trip_weight"] = df_trips["PONDKI"]

    # Chain length: -1 marks persons without trip information, except kish
    # persons, who get 0.
    df_chain = df_trips[["person_id", "NDEP"]].drop_duplicates(
        "person_id").rename(columns={"NDEP": "number_of_trips"})
    df_persons = pd.merge(df_persons, df_chain, on="person_id", how="left")
    df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(
        -1).astype(int)
    df_persons.loc[(df_persons["number_of_trips"] == -1)
                   & df_persons["is_kish"], "number_of_trips"] = 0

    # Passenger attribute
    df_persons["is_passenger"] = df_persons["person_id"].isin(
        df_trips[df_trips["mode"] == "car_passenger"]["person_id"].unique())

    # Calculate consumption units
    hts.check_household_size(df_households, df_persons)
    df_households = pd.merge(df_households,
                             hts.calculate_consumption_units(df_persons),
                             on="household_id")

    # Socioprofessional class: first digit of CS24, missing values become 8
    df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(
        80).astype(int) // 10

    return df_households, df_persons, df_trips
def execute(context):
    """Clean the raw EGT tables into household, person and trip frames.

    Reads the three tables produced by the ``data.hts.egt.raw`` stage, works
    on copies, derives the harmonised columns and finally drops every person
    (with their trips and households) who has a trip with a missing
    departure or arrival time.

    Args:
        context: Pipeline context; only ``context.stage`` is used.

    Returns:
        Tuple ``(df_households, df_persons, df_trips)``.
    """
    df_households, df_persons, df_trips = context.stage("data.hts.egt.raw")

    # Make copies
    df_households = pd.DataFrame(df_households, copy=True)
    df_persons = pd.DataFrame(df_persons, copy=True)
    df_trips = pd.DataFrame(df_trips, copy=True)

    # Transform original IDs to integer (they are hierarchical).
    # The builtin int/float replace the np.int/np.float aliases, which no
    # longer exist in current NumPy releases.
    df_households["egt_household_id"] = df_households["NQUEST"].astype(int)
    df_persons["egt_person_id"] = df_persons["NP"].astype(int)
    df_persons["egt_household_id"] = df_persons["NQUEST"].astype(int)
    df_trips["egt_person_id"] = df_trips["NP"].astype(int)
    df_trips["egt_household_id"] = df_trips["NQUEST"].astype(int)
    df_trips["egt_trip_id"] = df_trips["ND"].astype(int)

    # Construct new IDs for households, persons and trips (which are unique globally)
    df_households["household_id"] = np.arange(len(df_households))

    df_persons = pd.merge(
        df_persons,
        df_households[["egt_household_id", "household_id"]],
        on="egt_household_id")
    df_persons["person_id"] = np.arange(len(df_persons))

    df_trips = pd.merge(
        df_trips,
        df_persons[[
            "egt_person_id", "egt_household_id", "person_id", "household_id"
        ]],
        on=["egt_person_id", "egt_household_id"])
    df_trips["trip_id"] = np.arange(len(df_trips))

    # Trip flags
    df_trips = hts.compute_first_last(df_trips)

    # Weight
    df_persons["person_weight"] = df_persons["POIDSP"].astype(float)
    df_households["household_weight"] = df_households["POIDSM"].astype(float)

    # Clean age
    df_persons["age"] = df_persons["AGE"].astype(int)

    # Clean sex
    df_persons.loc[df_persons["SEXE"] == 1, "sex"] = "male"
    df_persons.loc[df_persons["SEXE"] == 2, "sex"] = "female"
    df_persons["sex"] = df_persons["sex"].astype("category")

    # Household size
    df_households["household_size"] = df_households["MNP"].astype(int)

    # Clean departement
    df_persons["departement_id"] = df_persons["RESDEP"].astype(str).astype(
        "category")
    df_households["departement_id"] = df_households["RESDEP"].astype(
        str).astype("category")
    df_trips["origin_departement_id"] = df_trips["ORDEP"].astype(str).astype(
        "category")
    df_trips["destination_departement_id"] = df_trips["DESTDEP"].astype(
        str).astype("category")

    # Clean employment
    df_persons["employed"] = df_persons["OCCP"].isin([1.0, 2.0])

    # Studies
    df_persons["studies"] = df_persons["OCCP"].isin([3.0, 4.0, 5.0])

    # Number of vehicles
    df_households["number_of_vehicles"] = (df_households["NB_2RM"] +
                                           df_households["NB_VD"])
    df_households["number_of_vehicles"] = df_households[
        "number_of_vehicles"].astype(int)
    df_households["number_of_bikes"] = df_households["NB_VELO"].astype(int)

    # License
    df_persons["has_license"] = (df_persons["PERMVP"] == 1) | (
        df_persons["PERM2RM"] == 1)

    # Has subscription
    df_persons["has_pt_subscription"] = df_persons["ABONTC"] > 1

    # Household income: shifted to start at zero; shifted values 10 and 11
    # and missing values are mapped to -1.
    df_households["income_class"] = df_households["REVENU"] - 1
    df_households.loc[
        df_households["income_class"].isin([10.0, 11.0, np.nan]),
        "income_class"] = -1
    df_households["income_class"] = df_households["income_class"].astype(int)

    # Trip purpose
    df_trips["following_purpose"] = "other"
    df_trips["preceding_purpose"] = "other"

    for category, purpose in PURPOSE_MAP.items():
        df_trips.loc[df_trips["DESTMOT_H9"] == category,
                     "following_purpose"] = purpose
        df_trips.loc[df_trips["ORMOT_H9"] == category,
                     "preceding_purpose"] = purpose

    df_trips["following_purpose"] = df_trips["following_purpose"].astype(
        "category")
    df_trips["preceding_purpose"] = df_trips["preceding_purpose"].astype(
        "category")

    # Trip mode ("pt" unless a category from MODES_MAP matches)
    df_trips["mode"] = "pt"

    for category, mode in MODES_MAP.items():
        df_trips.loc[df_trips["MODP_H7"] == category, "mode"] = mode

    df_trips["mode"] = df_trips["mode"].astype("category")

    # Further trip attributes
    df_trips["euclidean_distance"] = df_trips["DPORTEE"] * 1000.0

    # Trip times (hours and minutes converted to seconds)
    df_trips["departure_time"] = (df_trips["ORH"] * 3600.0 +
                                  df_trips["ORM"] * 60.0)
    df_trips["arrival_time"] = (df_trips["DESTH"] * 3600.0 +
                                df_trips["DESTM"] * 60.0)
    df_trips = hts.fix_trip_times(df_trips)

    # Durations
    df_trips["trip_duration"] = (df_trips["arrival_time"] -
                                 df_trips["departure_time"])
    hts.compute_activity_duration(df_trips)

    # Add weight to trips
    df_trips = pd.merge(
        df_trips, df_persons[["person_id", "person_weight"]],
        on="person_id", how="left",
    ).rename(columns={"person_weight": "trip_weight"})
    df_persons["trip_weight"] = df_persons["person_weight"]

    # Chain length
    df_persons["number_of_trips"] = df_persons["NBDEPL"].fillna(0).astype(int)

    # Passenger attribute
    df_persons["is_passenger"] = df_persons["person_id"].isin(
        df_trips[df_trips["mode"] == "car_passenger"]["person_id"].unique())

    # Calculate consumption units
    hts.check_household_size(df_households, df_persons)
    df_households = pd.merge(df_households,
                             hts.calculate_consumption_units(df_persons),
                             on="household_id")

    # Socioprofessional class
    df_persons["socioprofessional_class"] = df_persons["CS8"].fillna(
        8).astype(int)

    # Drop people that have NaN departure or arrival times in trips
    f = df_trips["departure_time"].isna()
    f |= df_trips["arrival_time"].isna()
    f = df_persons["person_id"].isin(df_trips[f]["person_id"])

    nan_count = np.count_nonzero(f)
    total_count = len(df_persons)

    print(
        "Dropping %d/%d persons because of NaN values in departure and arrival times"
        % (nan_count, total_count))

    df_persons = df_persons[~f]
    df_trips = df_trips[df_trips["person_id"].isin(
        df_persons["person_id"].unique())]
    df_households = df_households[df_households["household_id"].isin(
        df_persons["household_id"])]

    # Fix activity types (because of inconsistent EGT data and removing in the timing fixing step)
    hts.fix_activity_types(df_trips)

    return df_households, df_persons, df_trips
def execute(context):
    """Clean the raw EMC2 tables into household, person and trip frames.

    Reads the three tables produced by the ``data.hts.emc2.raw`` stage, works
    on copies, derives the harmonised columns and finally drops every person
    (with their trips and households) who has a trip with a missing
    departure or arrival time.

    Args:
        context: Pipeline context; only ``context.stage`` is used.

    Returns:
        Tuple ``(df_households, df_persons, df_trips)``.
    """
    df_households, df_persons, df_trips = context.stage("data.hts.emc2.raw")

    # Make copies
    df_households = pd.DataFrame(df_households, copy=True)
    df_persons = pd.DataFrame(df_persons, copy=True)
    df_trips = pd.DataFrame(df_trips, copy=True)

    # Strip the last two digits of the zone codes so they can be joined with
    # the "ZF" column of df_zone below.
    df_households["ZFM"] = df_households["ZFM"].map(lambda x: int(x) // 100)
    df_persons["ZFP"] = df_persons["ZFP"].map(lambda x: int(x) // 100)
    df_trips["ZFD"] = df_trips["ZFD"].map(lambda x: int(x) // 100)
    df_trips["D3"] = df_trips["D3"].map(lambda x: int(x // 100))
    df_trips["D7"] = df_trips["D7"].map(lambda x: int(x // 100))

    # Construct new IDs for households, persons and trips (which are unique globally)
    df_households["household_id"] = np.arange(len(df_households))

    df_persons = pd.merge(df_persons,
                          df_households[["MID", "household_id"]],
                          on="MID")
    df_persons["person_id"] = np.arange(len(df_persons))

    df_trips = pd.merge(
        df_trips,
        df_persons[["PER", "MID", "person_id", "household_id"]],
        on=["PER", "MID"])
    df_trips["trip_id"] = np.arange(len(df_trips))

    # Trip flags
    df_trips = hts.compute_first_last(df_trips)

    # Weight
    df_households["household_weight"] = df_households["COEM"]
    df_persons["person_weight"] = df_persons["COE2"]

    # Clean age (builtin int replaces the removed np.int alias)
    df_persons["age"] = df_persons["P4"].astype(int)

    # Clean sex
    df_persons.loc[df_persons["P2"] == 1, "sex"] = "male"
    df_persons.loc[df_persons["P2"] == 2, "sex"] = "female"
    df_persons["sex"] = df_persons["sex"].astype("category")

    # Household size: look the count up through household_id instead of
    # relying on the frame's index happening to equal household_id.
    persons_per_household = df_persons.groupby("household_id").size()
    df_households["household_size"] = df_households["household_id"].map(
        persons_per_household)

    # Clean commune
    # NOTE(review): df_zone is not defined inside this function; it has to
    # come from module scope. Confirm that it is available there.
    df_persons = df_persons.merge(
        df_zone, left_on="ZFP", right_on="ZF",
        how="left").rename(columns={"INSEE": "commune_id"})
    df_households = df_households.merge(
        df_zone, left_on="ZFM", right_on="ZF",
        how="left").rename(columns={"INSEE": "commune_id"})

    # Assign the filled column back: an in-place fillna on a column
    # selection is chained assignment and is not guaranteed to update the
    # parent frame.
    df_persons["commune_id"] = df_persons["commune_id"].fillna(0)
    df_households["commune_id"] = df_households["commune_id"].fillna(0)

    df_trips = df_trips.merge(
        df_zone, left_on="D3", right_on="ZF",
        how="left").rename(columns={"INSEE": "origin_commune_id"})
    df_trips = df_trips.merge(
        df_zone, left_on="D7", right_on="ZF",
        how="left").rename(columns={"INSEE": "destination_commune_id"})

    df_trips["origin_commune_id"] = df_trips["origin_commune_id"].fillna(0)
    df_trips["destination_commune_id"] = df_trips[
        "destination_commune_id"].fillna(0)

    df_persons["commune_id"] = df_persons["commune_id"].astype("category")
    df_households["commune_id"] = df_households["commune_id"].astype(
        "category")
    df_trips["origin_commune_id"] = df_trips["origin_commune_id"].astype(
        "category")
    df_trips["destination_commune_id"] = df_trips[
        "destination_commune_id"].astype("category")

    # Clean departement: the commune code divided by 1000 and truncated
    def to_departement(commune_ids):
        return (commune_ids.astype(int) /
                1000).astype(int).astype(str).astype("category")

    df_persons["departement_id"] = to_departement(df_persons["commune_id"])
    df_households["departement_id"] = to_departement(
        df_households["commune_id"])
    df_trips["origin_departement_id"] = to_departement(
        df_trips["origin_commune_id"])
    df_trips["destination_departement_id"] = to_departement(
        df_trips["destination_commune_id"])

    # Clean employment
    occupation = df_persons["P9"].astype("Float32")
    df_persons["employed"] = occupation.isin([1.0, 2.0])

    # Studies
    df_persons["studies"] = occupation.isin([3.0, 4.0, 5.0])

    # Number of vehicles
    df_households["number_of_vehicles"] = (df_households["M6"] +
                                           df_households["M14"])
    df_households["number_of_vehicles"] = df_households[
        "number_of_vehicles"].astype(int)
    df_households["number_of_bikes"] = df_households["M21"].astype(int)

    # License
    df_persons["has_license"] = (df_persons["P7"] == 1)

    # Has subscription
    df_persons["has_pt_subscription"] = (df_persons["P12"] != 4)

    # Household income: no income class is derived here (the REVENU-based
    # derivation was disabled in the original code), so every household is
    # assigned class 0.
    df_households["income_class"] = 0

    # Trip purpose
    df_trips["following_purpose"] = "other"
    df_trips["preceding_purpose"] = "other"

    for purpose, category in PURPOSE_MAP.items():
        df_trips.loc[df_trips["D2A"].isin(category),
                     "preceding_purpose"] = purpose
        df_trips.loc[df_trips["D5A"].isin(category),
                     "following_purpose"] = purpose

    df_trips["preceding_purpose"] = df_trips["preceding_purpose"].astype(
        "category")
    df_trips["following_purpose"] = df_trips["following_purpose"].astype(
        "category")

    # Trip mode ("walk" unless a category from MODES_MAP matches)
    df_trips["mode"] = "walk"

    for mode, category in MODES_MAP.items():
        df_trips.loc[df_trips["MODP"].isin(category), "mode"] = mode

    df_trips["mode"] = df_trips["mode"].astype("category")

    # Further trip attributes
    df_trips["euclidean_distance"] = df_trips["DOIB"]

    # Trip times (hours and minutes converted to seconds)
    df_trips["departure_time"] = df_trips["D4A"] * 3600 + df_trips["D4B"] * 60
    df_trips["arrival_time"] = df_trips["D8A"] * 3600 + df_trips["D8B"] * 60
    df_trips = hts.fix_trip_times(df_trips)

    # Durations
    df_trips["trip_duration"] = (df_trips["arrival_time"] -
                                 df_trips["departure_time"])
    hts.compute_activity_duration(df_trips)

    # Add weight to trips
    df_trips = pd.merge(
        df_trips, df_persons[["person_id", "person_weight"]],
        on="person_id", how="left",
    ).rename(columns={"person_weight": "trip_weight"})
    df_persons["trip_weight"] = df_persons["person_weight"]

    # Chain length: looked up through person_id rather than by index
    # alignment. Persons without any trip keep NaN, as before.
    trips_per_person = df_trips.groupby("person_id").size()
    df_persons["number_of_trips"] = df_persons["person_id"].map(
        trips_per_person)

    # Passenger attribute
    df_persons["is_passenger"] = df_persons["person_id"].isin(
        df_trips[df_trips["mode"] == "car_passenger"]["person_id"].unique())

    # Calculate consumption units
    hts.check_household_size(df_households, df_persons)
    df_households = pd.merge(df_households,
                             hts.calculate_consumption_units(df_persons),
                             on="household_id")

    # Socioprofessional class: classes 7, 8 and 9 are merged into 8, then
    # persons with P9 == 7 are assigned class 7.
    df_persons["socioprofessional_class"] = df_persons["P11"].fillna(
        8).astype(int)
    df_persons.loc[df_persons["socioprofessional_class"].isin([7, 8, 9]),
                   "socioprofessional_class"] = 8
    df_persons.loc[df_persons["P9"] == 7, "socioprofessional_class"] = 7

    # Drop people that have NaN departure or arrival times in trips
    f = df_trips["departure_time"].isna()
    f |= df_trips["arrival_time"].isna()
    f = df_persons["person_id"].isin(df_trips[f]["person_id"])

    nan_count = np.count_nonzero(f)
    total_count = len(df_persons)

    print(
        "Dropping %d/%d persons because of NaN values in departure and arrival times"
        % (nan_count, total_count))

    df_persons = df_persons[~f]
    df_trips = df_trips[df_trips["person_id"].isin(
        df_persons["person_id"].unique())]
    df_households = df_households[df_households["household_id"].isin(
        df_persons["household_id"])]

    # Fix activity types (because of inconsistent EGT data and removing in the timing fixing step)
    hts.fix_activity_types(df_trips)

    return df_households, df_persons, df_trips
def execute(context):
    """Clean the raw census table into one row per person.

    Loads ``census.hdf`` from the ``data.census.raw`` path, assigns household
    and person IDs, validates commune and IRIS codes against the
    ``data.spatial.codes`` stage and derives the harmonised person
    attributes.

    Args:
        context: Pipeline context; ``context.path`` and ``context.stage``
            are used.

    Returns:
        DataFrame restricted to the cleaned columns listed in the final
        selection.

    Raises:
        RuntimeError: If the census contains a commune or IRIS code (other
            than the "undefined" placeholder) that is absent from the
            spatial codes.
    """
    df = pd.read_hdf("%s/census.hdf" % context.path("data.census.raw"))

    # Construct household IDs for persons with NUMMI != Z.
    # Explicit copy: columns are added to the filtered frame below.
    df_household_ids = df[["CANTVILLE", "NUMMI"]]
    df_household_ids = df_household_ids[
        df_household_ids["NUMMI"] != "Z"].copy()
    df_household_ids["temporary"] = (df_household_ids["CANTVILLE"] +
                                     df_household_ids["NUMMI"])
    df_household_ids = df_household_ids.drop_duplicates("temporary")
    df_household_ids["household_id"] = np.arange(len(df_household_ids))
    df = pd.merge(df, df_household_ids, on=["CANTVILLE", "NUMMI"], how="left")

    # Fill up undefined household ids (those where NUMMI == Z). Every such
    # person gets a household of their own. Numbering starts one past the
    # largest existing ID, so the first synthetic household does not collide
    # with the last regular one.
    f = df["household_id"].isna()
    first_free_id = 0 if f.all() else int(df["household_id"].max()) + 1
    df.loc[f, "household_id"] = np.arange(np.count_nonzero(f)) + first_free_id
    # Builtin int/float replace the removed np.int/np.float aliases.
    df["household_id"] = df["household_id"].astype(int)

    # Put person IDs
    df["person_id"] = np.arange(len(df))

    # Sorting
    df = df.sort_values(by=["household_id", "person_id"])

    # Spatial information
    df["departement_id"] = df["DEPT"].astype("category")

    # The commune code is the first five characters of the IRIS code
    df["commune_id"] = df["IRIS"].str[:5]
    f_undefined = df["commune_id"].str.contains("Z")
    df.loc[f_undefined, "commune_id"] = "undefined"
    df["commune_id"] = df["commune_id"].astype("category")

    df["iris_id"] = df["IRIS"]
    f_undefined = (df["iris_id"].str.contains("Z")
                   | df["iris_id"].str.contains("X"))
    df.loc[f_undefined, "iris_id"] = "undefined"
    df["iris_id"] = df["iris_id"].astype("category")

    # Verify with requested codes. Only the "undefined" placeholder may be
    # missing from the spatial codes. A census without any undefined entry is
    # valid as well, so the placeholder is removed before testing for excess.
    df_codes = context.stage("data.spatial.codes")

    excess_communes = (set(df["commune_id"].unique()) -
                       set(df_codes["commune_id"].unique()) - {"undefined"})
    if excess_communes:
        raise RuntimeError("Found additional communes: %s" % excess_communes)

    excess_iris = (set(df["iris_id"].unique()) -
                   set(df_codes["iris_id"].unique()) - {"undefined"})
    if excess_iris:
        raise RuntimeError("Found additional IRIS: %s" % excess_iris)

    # Age (zero-padded strings)
    df["age"] = df["AGED"].apply(
        lambda x: "0" if x == "000" else x.lstrip("0")).astype(int)

    # Clean COUPLE
    df["couple"] = df["COUPLE"] == "1"

    # Clean TRANS: codes "1", "Z" and anything not listed stay missing
    commute_modes = {"2": "walk", "3": "bike", "4": "car", "5": "pt"}
    df["commute_mode"] = df["TRANS"].map(commute_modes).astype("category")

    # Weight
    df["weight"] = df["IPONDI"].astype(float)

    # Clean SEXE
    df.loc[df["SEXE"] == "1", "sex"] = "male"
    df.loc[df["SEXE"] == "2", "sex"] = "female"
    df["sex"] = df["sex"].astype("category")

    # Clean employment
    df["employed"] = df["TACT"] == "11"

    # Studies
    df["studies"] = df["ETUD"] == "1"

    # Number of vehicles: the placeholder letters count as zero
    df["number_of_vehicles"] = df["VOIT"].apply(
        lambda x: str(x).replace("Z", "0").replace("X", "0")).astype(int)
    df["number_of_vehicles"] += df["DEROU"].apply(
        lambda x: str(x).replace("U", "0").replace("Z", "0").replace(
            "X", "0")).astype(int)

    # Household size
    df_size = df[["household_id"]].groupby("household_id").size().reset_index(
        name="household_size")
    df = pd.merge(df, df_size, on="household_id")

    # Socioprofessional category
    df["socioprofessional_class"] = df["CS1"].astype(int)

    # Place of work or education
    df["work_outside_region"] = df["ILT"].isin(("4", "5", "6"))
    df["education_outside_region"] = df["ILETUD"].isin(("4", "5", "6"))

    # Consumption units
    df = pd.merge(df, hts.calculate_consumption_units(df), on="household_id")

    return df[[
        "person_id", "household_id", "weight", "iris_id", "commune_id",
        "departement_id", "age", "sex", "couple", "commute_mode", "employed",
        "studies", "number_of_vehicles", "household_size",
        "work_outside_region", "education_outside_region",
        "consumption_units", "socioprofessional_class"
    ]]
def execute(context):
    """Clean the raw EDGT Lyon (CEREMA) tables into households, persons, trips.

    Reads the four tables produced by the ``data.hts.edgt_lyon.raw_cerema``
    stage, attaches departement codes from the spatial table and derives the
    harmonised columns. Unmapped purposes, unmapped modes and missing trip
    times are treated as errors.

    Args:
        context: Pipeline context; only ``context.stage`` is used.

    Returns:
        Tuple ``(df_households, df_persons, df_trips)``.

    Raises:
        AssertionError: If a trip has a purpose or mode that the maps do not
            cover, or a missing departure or arrival time.
    """
    df_households, df_persons, df_trips, df_spatial = context.stage(
        "data.hts.edgt_lyon.raw_cerema")

    # Work on copies so the staged persons and trips frames are not mutated
    # by the column assignments below. df_households is replaced by a merge
    # result before any column is written.
    df_persons = pd.DataFrame(df_persons, copy=True)
    df_trips = pd.DataFrame(df_trips, copy=True)

    # Merge departement into households
    df_spatial = df_spatial[["ZF__2015", "DepCom"]].copy()
    df_spatial["MP2"] = df_spatial["ZF__2015"].astype(str)
    df_spatial["departement_id"] = df_spatial["DepCom"].str[:2]
    df_spatial = df_spatial[["MP2", "departement_id"]]

    # Households whose zone has no match in the spatial table end up with
    # the departement "unknown".
    df_households = pd.merge(df_households, df_spatial, on="MP2", how="left")
    df_households["departement_id"] = df_households["departement_id"].fillna(
        "unknown")

    # Transform original IDs to integer (they are hierarchical).
    # The builtin int/float replace the np.int/np.float aliases, which no
    # longer exist in current NumPy releases.
    df_households["edgt_household_id"] = (df_households["ECH"] +
                                          df_households["MP2"]).astype(int)
    df_persons["edgt_person_id"] = df_persons["PER"].astype(int)
    df_persons["edgt_household_id"] = (df_persons["ECH"] +
                                       df_persons["PP2"]).astype(int)
    df_trips["edgt_person_id"] = df_trips["PER"].astype(int)
    df_trips["edgt_household_id"] = (df_trips["ECH"] +
                                     df_trips["DP2"]).astype(int)
    df_trips["edgt_trip_id"] = df_trips["NDEP"].astype(int)

    # Construct new IDs for households, persons and trips (which are unique globally)
    df_households["household_id"] = np.arange(len(df_households))

    df_persons = pd.merge(
        df_persons,
        df_households[["edgt_household_id", "household_id", "departement_id"]],
        on=["edgt_household_id"],
    ).sort_values(by=["household_id", "edgt_person_id"])
    df_persons["person_id"] = np.arange(len(df_persons))

    df_trips = pd.merge(
        df_trips,
        df_persons[[
            "edgt_person_id", "edgt_household_id", "person_id", "household_id"
        ]],
        on=["edgt_person_id", "edgt_household_id"],
    ).sort_values(by=["household_id", "person_id", "edgt_trip_id"])
    df_trips["trip_id"] = np.arange(len(df_trips))

    # Trip flags
    df_trips = hts.compute_first_last(df_trips)

    # Weight
    df_persons["person_weight"] = df_persons["COEP"].astype(float)
    df_households["household_weight"] = df_households["COEM"].astype(float)

    # Clean age
    df_persons["age"] = df_persons["P4"].astype(int)

    # Clean sex
    df_persons.loc[df_persons["P2"] == 1, "sex"] = "male"
    df_persons.loc[df_persons["P2"] == 2, "sex"] = "female"
    df_persons["sex"] = df_persons["sex"].astype("category")

    # Household size
    df_size = df_persons.groupby("household_id").size().reset_index(
        name="household_size")
    df_households = pd.merge(df_households, df_size, on="household_id")

    # Clean departement
    df_trips = pd.merge(
        df_trips,
        df_spatial.rename(columns={
            "MP2": "D3",
            "departement_id": "origin_departement_id"
        }),
        on="D3", how="left")
    df_trips = pd.merge(
        df_trips,
        df_spatial.rename(columns={
            "MP2": "D7",
            "departement_id": "destination_departement_id"
        }),
        on="D7", how="left")

    df_trips["origin_departement_id"] = df_trips[
        "origin_departement_id"].fillna("unknown")
    df_trips["destination_departement_id"] = df_trips[
        "destination_departement_id"].fillna("unknown")

    df_households["departement_id"] = df_households["departement_id"].astype(
        "category")
    df_persons["departement_id"] = df_persons["departement_id"].astype(
        "category")
    df_trips["origin_departement_id"] = df_trips[
        "origin_departement_id"].astype("category")
    df_trips["destination_departement_id"] = df_trips[
        "destination_departement_id"].astype("category")

    # Clean employment
    df_persons["employed"] = df_persons["P7"].isin(["1", "2"])

    # Studies
    df_persons["studies"] = df_persons["P7"].isin(["3", "4", "5"])

    # Number of vehicles
    df_households["number_of_vehicles"] = (df_households["M6"] +
                                           df_households["M5"])
    df_households["number_of_vehicles"] = df_households[
        "number_of_vehicles"].astype(int)
    df_households["number_of_bikes"] = df_households["M7"].astype(int)

    # License
    df_persons["has_license"] = df_persons["P5"] == "1"

    # Has subscription
    df_persons["has_pt_subscription"] = df_persons["P10"].isin(
        ["1", "2", "3"])

    # Trip purpose: "invalid" is a sentinel that must be fully overwritten
    df_trips["following_purpose"] = "invalid"
    df_trips["preceding_purpose"] = "invalid"

    for purpose, values in PURPOSE_MAP.items():
        df_trips.loc[df_trips["D5A"].isin(values),
                     "following_purpose"] = purpose
        df_trips.loc[df_trips["D2A"].isin(values),
                     "preceding_purpose"] = purpose

    assert np.count_nonzero(df_trips["following_purpose"] == "invalid") == 0
    assert np.count_nonzero(df_trips["preceding_purpose"] == "invalid") == 0

    df_trips["following_purpose"] = df_trips["following_purpose"].astype(
        "category")
    df_trips["preceding_purpose"] = df_trips["preceding_purpose"].astype(
        "category")

    # Trip mode
    for mode, values in MODES_MAP.items():
        df_trips.loc[df_trips["MODP"].isin(values), "mode"] = mode

    # Trips whose MODP is not covered by MODES_MAP are left missing. Check
    # the mode column itself (the original re-checked following_purpose here).
    assert np.count_nonzero(df_trips["mode"].isna()) == 0
    df_trips["mode"] = df_trips["mode"].astype("category")

    # Further trip attributes
    df_trips["euclidean_distance"] = df_trips["DOIB"]
    df_trips["routed_distance"] = df_trips["DIST"]

    # Trip times: D4 / D8 hold the hour in the hundreds and the minute in
    # the remainder.
    df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100)  # hour
    df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100)  # minute

    df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100)  # hour
    df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100)  # minute

    df_trips = df_trips.sort_values(
        by=["household_id", "person_id", "trip_id"])
    df_trips = hts.fix_trip_times(df_trips)

    # Durations
    df_trips["trip_duration"] = (df_trips["arrival_time"] -
                                 df_trips["departure_time"])
    hts.compute_activity_duration(df_trips)

    # Add weight to trips
    df_trips = pd.merge(
        df_trips, df_persons[["person_id", "COEQ"]],
        on="person_id", how="left",
    ).rename(columns={"COEQ": "trip_weight"})
    df_persons["trip_weight"] = df_persons["COEQ"]

    # Chain length: persons without any trip get -1
    df_count = df_trips[["person_id"]].groupby("person_id").size().reset_index(
        name="number_of_trips")
    df_persons = pd.merge(df_persons, df_count, on="person_id", how="left")
    df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(
        -1).astype(int)

    # Passenger attribute
    df_persons["is_passenger"] = df_persons["person_id"].isin(
        df_trips[df_trips["mode"] == "car_passenger"]["person_id"].unique())

    # Calculate consumption units
    hts.check_household_size(df_households, df_persons)
    df_households = pd.merge(df_households,
                             hts.calculate_consumption_units(df_persons),
                             on="household_id")

    # Socioprofessional class
    df_persons["socioprofessional_class"] = df_persons["P9"].fillna(
        8).astype(int)

    # Check departure and arrival times
    assert np.count_nonzero(df_trips["departure_time"].isna()) == 0
    assert np.count_nonzero(df_trips["arrival_time"].isna()) == 0

    # Fix activity types (because of inconsistent EGT data and removing in the timing fixing step)
    hts.fix_activity_types(df_trips)

    return df_households, df_persons, df_trips