def get_full_cols():
    """Return the set of all columns used by all the models.

    Every entry of ``FILE_COLUMNS`` points to a columns file; each file is
    loaded with ``_load_file_from_pickle`` and its columns are merged into
    a single, duplicate-free set.
    """
    all_columns = set()
    for columns_file in FILE_COLUMNS.values():
        all_columns.update(_load_file_from_pickle(columns_file))
    return all_columns
def convert_numeric_cols(df):
    """Coerce every non-categorical column of ``df`` to a numeric dtype.

    A column counts as categorical when it is a key of the categorical
    mapper loaded from ``MAPPER_FOLDER / MAPPERS["categorical"]``; every
    other column is passed through ``pd.to_numeric``.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame, with the non-categorical columns converted.
        Values that cannot be parsed become NaN (``errors="coerce"``).
    """
    mapper_file = Path(MAPPER_FOLDER) / MAPPERS["categorical"]
    categorical_mapper = _load_file_from_pickle(mapper_file)

    # Anything without a categorical mapping is treated as numeric.
    to_float = set(df.columns) - set(categorical_mapper.keys())
    for col in to_float:
        # Assign through df[col] so the column is replaced and takes the
        # numeric dtype. `df.loc[:, col] = ...` writes into the existing
        # column in recent pandas, which can leave an object column as
        # object dtype.
        df[col] = pd.to_numeric(df[col], errors="coerce")
    return df
def features_formatter(df):
    """Make specific features transformations.

    Builds the derived claim / insured / policy features, normalises the
    postal-code columns and maps them to location clusters, applies the
    categorical mapper and finally keeps only the columns needed by the
    models (plus a few identifier / summary columns).

    Args:
        df: Raw merged DataFrame. The feature-creation steps write new
            columns into it, so the caller's frame is modified.

    Returns:
        A new DataFrame restricted to the model columns, with nulls
        replaced by -1. Columns are returned in sorted order so the
        layout is the same on every run.
    """
    date_format = "%d/%m/%Y"

    # --- condition flags -------------------------------------------------
    # NOTE(review): in "^cond_*" the `_*` means "zero or more underscores",
    # so this selects every column whose name starts with "cond". If only
    # the "cond_" prefix was intended the pattern should be "^cond_" -
    # confirm before changing.
    cond_cols = df.filter(regex="^cond_*").columns
    df["total_condicion"] = df[cond_cols].sum(axis=1)
    df["es_gte_5"] = df["total_condicion"] >= 5

    # --- claim (siniestro) features --------------------------------------
    claim_dates = ['FECHA_SINI', 'FEC_DENU_SINI', 'FECHA_NAC_ASEG',
                   'FECHA_NAC_TERC', "FECHA_FORMAL"]
    for col in claim_dates:
        # Unparseable dates become NaT instead of raising.
        df[col] = pd.to_datetime(df[col], format=date_format, errors="coerce")

    if "MCA_COASEG" in df.columns:
        df["MCA_COASEG"] = df["MCA_COASEG"] == "SI"

    df["dist_fformal_fsini"] = (df["FECHA_FORMAL"] - df["FECHA_SINI"]).dt.days
    df["dist_fformal_fdenu"] = (df["FECHA_FORMAL"] - df["FEC_DENU_SINI"]).dt.days
    df["dias_entre_denu_y_sini"] = (df["FEC_DENU_SINI"] - df["FECHA_SINI"]).dt.days
    # Ages are plain year differences (month/day are not taken into account).
    df["edad_aseg"] = df["FECHA_SINI"].dt.year - df['FECHA_NAC_ASEG'].dt.year
    df["edad_terc"] = df["FECHA_SINI"].dt.year - df['FECHA_NAC_TERC'].dt.year
    df["existe_FECHA_FORMAL"] = df["FECHA_FORMAL"].notna()

    # --- insured (asegurado) features ------------------------------------
    # Assign the result back: an in-place replace on the selected column is
    # a chained operation and does not update `df` under Copy-on-Write.
    df["TIPO_ACTIVIDAD"] = df["TIPO_ACTIVIDAD"].replace("SinDato", np.nan)

    # --- policy validity (vigabt) features -------------------------------
    policy_dates = ['FECHA_VIG_ORIG_POL', 'FECHA_VIG_POL']
    for col in policy_dates:
        df[col] = pd.to_datetime(df[col], format=date_format, errors="coerce")

    df["cambio_cobro"] = (
        (df["COD_COBRO"] != df["COD_COBRO_ANTERIOR"])
        & (df["COD_COBRO_ANTERIOR"].notna())
    )
    df["ANTIG_calc"] = (df["FECHA_VIG_POL"] - df["FECHA_VIG_ORIG_POL"]).dt.days
    df["CONV_COMISIONARIO"] = df["CONV_COMISIONARIO"].astype(str)

    # --- generic transformations -----------------------------------------
    # String form, left-padded with zeros to 3 characters.
    df["TIPO_EXPED"] = df["TIPO_EXPED"].astype(str).str.zfill(3)

    # creating datetime features
    expand_datetime(df, ["FECHA_SINI"], drop=False, time=True, inplace=True)

    # policy age: rename ANTIG_calc as ANTIG_pol
    df.rename(columns={"ANTIG_calc": "ANTIG_pol"}, inplace=True)

    # --- postal codes ----------------------------------------------------
    post_cols = ["COD_POST_POLIZA", "COD_POST_OCURRENCIA", "COD_POST_TERC"]
    for col in post_cols:
        # Flag the over-long codes while the column is still numeric. The
        # mask is positional, so no filtered copy of the frame is built and
        # duplicated index labels cannot select extra rows.
        too_long = (df[col] >= 1000000).to_numpy(dtype=bool, na_value=False)
        codes = df[col].astype(str).str.replace(".0", "", regex=False)
        # removing last 3 digits of the over-long codes
        codes = codes.mask(too_long, codes.str[:-3])
        # returning to float for the mapper
        df[col] = pd.to_numeric(codes, errors="coerce")

    # COD_POSTAL mapper: one new column per (postal column, cluster mapping)
    mapper_path = Path(MAPPER_FOLDER)
    cluster_mapper = _load_file_from_pickle(
        mapper_path / MAPPERS["cluster_location"])
    for col in post_cols:
        for k, v in cluster_mapper.items():
            df[col + "_" + k] = df[col].map(v)

    # number of conditions activated (recomputed because columns were added)
    cond_cols = df.filter(regex="^cond_*").columns
    df["cant_cond"] = (df[cond_cols] > 0).sum(axis=1)

    # --- categorical mapper ----------------------------------------------
    full_cols = get_full_cols()
    # add missing columns
    df = add_missing_columns(df, full_cols)

    categorical_mapper = _load_file_from_pickle(
        mapper_path / MAPPERS["categorical"])
    cat_cols = set(full_cols).intersection(categorical_mapper.keys())
    for col in cat_cols:
        df[col] = df[col].map(categorical_mapper[col])

    # --- reduce number of variables used ---------------------------------
    # These identifier / summary columns are always kept. A sorted list is
    # used because pandas no longer accepts a set as a column indexer, and
    # it makes the column order reproducible.
    always_keep = {"NUM_SECU_EXPED", "TIPO_EXPED", "total_condicion", "cant_cond"}
    selected_cols = sorted(set(full_cols) | always_keep)
    df = df[selected_cols].copy()

    # replace nulls by -1
    df.fillna(-1, inplace=True)

    to_float = {
        "CODIGO_VEHICULO", "COD_RIES_sini", "CONDICION_ROBO_EXP50",
        "CODIGO_REAPERTURA", "ESTAD_VEH_ASEG", "CODIGO_BAJA", "COD_RAMO_sini",
    }
    for col in to_float.intersection(df.columns):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df
def load_mapper(mapper_file):
    """Load and return the categorical feature mapper.

    Thin wrapper: ``mapper_file`` is handed to ``_load_file_from_pickle``
    and whatever that helper returns is passed back unchanged.
    """
    mapper = _load_file_from_pickle(mapper_file)
    return mapper
def load_columns(columns_file):
    """Load the array of columns used by the dumped model.

    Thin wrapper: ``columns_file`` is handed to ``_load_file_from_pickle``
    and whatever that helper returns is passed back unchanged.
    """
    columns = _load_file_from_pickle(columns_file)
    return columns
def load_model(model_file):
    """Load the dumped model.

    Thin wrapper: ``model_file`` is handed to ``_load_file_from_pickle``
    and whatever that helper returns is passed back unchanged.
    """
    model = _load_file_from_pickle(model_file)
    return model
def transform_data():
    """Transform the merged dataset and save the result next to it.

    Reads ``merged.feather`` from ``OUTPUT_FOLDER``, normalises the postal
    codes and maps them to location clusters, applies the categorical
    mapper, keeps only the columns needed by the models (plus a few
    identifier / summary columns), replaces nulls by -1 and writes the
    result to ``merged_transformed.feather`` in the same folder. Progress
    is reported on stdout.

    Returns:
        The transformed DataFrame. Columns are in sorted order so the
        saved file has the same layout on every run.
    """
    print("Formatting data: ", end="")

    output_path = Path(OUTPUT_FOLDER)
    df = pd.read_feather(output_path / "merged.feather")

    # String form, left-padded with zeros to 3 characters.
    df["TIPO_EXPED"] = df["TIPO_EXPED"].astype(str).str.zfill(3)

    # creating datetime features
    expand_datetime(df, ["FECHA_SINI"], drop=False, time=True, inplace=True)

    # policy age: rename ANTIG_calc as ANTIG_pol
    df.rename(columns={"ANTIG_calc": "ANTIG_pol"}, inplace=True)

    # --- postal codes ----------------------------------------------------
    post_cols = ["COD_POST_POLIZA", "COD_POST_OCURRENCIA", "COD_POST_TERC"]
    for col in post_cols:
        # Flag the over-long codes while the column is still numeric. The
        # mask is positional, so no filtered copy of the frame is built.
        too_long = (df[col] >= 1000000).to_numpy(dtype=bool, na_value=False)
        codes = df[col].astype(str).str.replace(".0", "", regex=False)
        # removing last 3 digits of the over-long codes
        codes = codes.mask(too_long, codes.str[:-3])
        # returning to float for the mapper
        df[col] = pd.to_numeric(codes, errors="coerce")

    # COD_POSTAL mapper: one new column per (postal column, cluster mapping)
    mapper_path = Path(MAPPER_FOLDER)
    cluster_mapper = _load_file_from_pickle(
        mapper_path / MAPPERS["cluster_location"])
    for col in post_cols:
        for k, v in cluster_mapper.items():
            df[col + "_" + k] = df[col].map(v)

    # create column number of conditions activated
    # NOTE(review): in "^cond_*" the `_*` means "zero or more underscores",
    # so this selects every column whose name starts with "cond" - confirm
    # that "^cond_" was not the intended pattern.
    cond_cols = df.filter(regex="^cond_*").columns
    df["cant_cond"] = (df[cond_cols] > 0).sum(axis=1)

    # --- categorical mapper ----------------------------------------------
    full_cols = get_full_cols()
    # add missing columns
    df = add_missing_columns(df, full_cols)

    categorical_mapper = _load_file_from_pickle(
        mapper_path / MAPPERS["categorical"])
    cat_cols = set(full_cols).intersection(categorical_mapper.keys())
    for col in cat_cols:
        df[col] = df[col].map(categorical_mapper[col])

    # --- reduce number of variables used ---------------------------------
    # These identifier / summary columns are always kept. A sorted list is
    # used because pandas no longer accepts a set as a column indexer, and
    # it makes the column order reproducible.
    # NOTE(review): "total_condicion" is not created in this function;
    # presumably it already exists in merged.feather - confirm.
    always_keep = {"NUM_SECU_EXPED", "TIPO_EXPED", "total_condicion", "cant_cond"}
    selected_cols = sorted(set(full_cols) | always_keep)
    df = df[selected_cols].copy()

    # replace nulls by -1
    df.fillna(-1, inplace=True)

    to_float = {
        "CODIGO_VEHICULO", "COD_RIES_sini", "CONDICION_ROBO_EXP50",
        "CODIGO_REAPERTURA", "ESTAD_VEH_ASEG", "CODIGO_BAJA", "COD_RAMO_sini",
    }
    for col in to_float.intersection(df.columns):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # save file
    save_file = output_path / "merged_transformed.feather"
    df.to_feather(save_file)
    print("... OK")

    return df
from configs import FILE_COLUMNS
from helpers import _load_file_from_pickle
from data_preparation import get_full_cols

if __name__ == "__main__":
    # Debug helper: print every FILE_COLUMNS entry whose loaded column
    # list contains the column named in `val`.
    print("all columns")
    val = "MCA_POLIZA_VIP"
    for k, columns_file in FILE_COLUMNS.items():
        if val not in _load_file_from_pickle(columns_file):
            continue
        print(f"{k} -> {val}")