def _transform_data(self, df, df_base, bl_processed):
    """Transform original dataset.

    :param df: Input DataFrame.
    :param df_base: Base (historical) DataFrame appended in daily mode.
    :param bl_processed: Processed blacklist DataFrame with previously fraudulent claims.
    :return: Transformed DataFrame.
    """
    if self._is_diario:
        df = df.withColumn('TEST', lit(1))
        df_base = df_base.withColumn('TEST', lit(0))
        df = df.union(df_base)

    # Cast key variables and rename headers
    exprs = [df[column].alias(column.replace('"', '')) for column in df.columns]
    df = df.select(*exprs)
    exprs = [df[column].alias(column.replace(' ', '')) for column in df.columns]
    df = df.select(*exprs)
    df = df.withColumnRenamed('hist_siniestro_poliza_otro_id_siniestro', 'id_siniestro')
    df = df.withColumnRenamed('auditCodigoSiniestroReferencia', 'id_siniestro_ref')
    df = df.withColumn('id_siniestro_ref', df.id_siniestro_ref.cast(IntegerType()))
    df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))
    df = df.dropna(subset=['id_siniestro_ref'])
    df = df.dropna(subset=['id_siniestro'])

    # DATE VARIABLES FORMAT
    fecha_variables = ["hist_siniestro_poliza_otro_fecha_ocurrencia",
                       "hist_siniestro_poliza_otro_fecha_terminado",
                       "auditFechaAperturaSiniestroReferencia"]
    func = udf(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'), DateType())
    for col in fecha_variables:
        df = df.fillna({col: '1900/01/01'})
        df = df.withColumn(col, func(df[col]))
        df = df.withColumn(col, when(df[col] == '1900-01-01', None).otherwise(df[col]))
        df = df.filter(df[col] <= time.strftime('%Y-%m-%d'))

    # We check that the claim on the other policy occurred before the reference claim,
    # because we only want past values.
    df = df.filter(df['auditFechaAperturaSiniestroReferencia'] >=
                   df['hist_siniestro_poliza_otro_fecha_ocurrencia'])

    # COUNT POLIZA-VERSION: we count how many previous claims the customer has. Each row of the
    # table refers to a single claim, so counting rows per reference claim gives the total.
    df = df.withColumn('hist_sin_poliza_otro_count_version', lit(1))
    w = (Window().partitionBy(df.id_siniestro_ref).rowsBetween(-sys.maxsize, sys.maxsize))
    df = df.withColumn('hist_sin_poliza_otro_count',
                       count_(df.hist_sin_poliza_otro_count_version).over(w))

    # COUNT POLIZAS: we count how many policies the customer has. We build an auxiliary table
    # so we can group at policy level.
    count_poliza = df.select(['id_siniestro_ref', 'hist_siniestro_poliza_otro_id_poliza'])
    count_poliza = count_poliza.dropDuplicates()
    count_poliza = count_poliza.withColumnRenamed('hist_siniestro_poliza_otro_id_poliza',
                                                  'hist_sin_poliza_otro_count_polizas')
    # Window re-derived on the auxiliary table so the count of distinct policies is computed there
    w_poliza = (Window().partitionBy(count_poliza.id_siniestro_ref)
                .rowsBetween(-sys.maxsize, sys.maxsize))
    count_poliza = count_poliza.withColumn(
        'hist_sin_poliza_otro_count_polizas',
        count_(count_poliza['id_siniestro_ref']).over(w_poliza))
    count_poliza = count_poliza.dropDuplicates(subset=['id_siniestro_ref'])
    df = df.join(count_poliza, on='id_siniestro_ref', how='left')

    # SINIESTROS/POLIZAS: ratio of number of claims to number of policies
    df = df.withColumn('hist_siniestro_poliza_otro_siniestros_polizas',
                       df['hist_sin_poliza_otro_count'] / df['hist_sin_poliza_otro_count_polizas'])

    # FUE UN SINIESTRO FRAUDULENTO? We check whether the id_siniestro is associated with a
    # previous fraudulent claim.
    bl_processed = bl_processed.select('id_siniestro').dropDuplicates(subset=['id_siniestro'])
    bl_processed = bl_processed.withColumn('hist_sin_poliza_otro_fraude', lit(1))
    df = df.join(bl_processed, on='id_siniestro', how='left')
    df = df.withColumn('hist_sin_poliza_otro_fraude',
                       when(df['hist_sin_poliza_otro_fraude'].isNull(), 0)
                       .otherwise(df['hist_sin_poliza_otro_fraude']))

    # POR PRODUCTO: we group the product number by the categories predefined in tabla_productos,
    # which allows a better classification. The product label format is pre-processed so the
    # values match.
    types = df.select('hist_siniestro_poliza_otro_id_producto').distinct().collect()
    types = [ty['hist_siniestro_poliza_otro_id_producto'] for ty in types]
    types_list = [when(df['hist_siniestro_poliza_otro_id_producto'] == ty, 1).otherwise(0)
                  .alias('d_hist_sin_poliza_otro_producto_' + ty) for ty in types]
    df = df.select(list(df.columns) + types_list)
    df = df.drop('hist_siniestro_poliza_otro_id_producto')

    # DUMMIES: we accumulate the dummy variables to get them at cod_filiacion level
    types = ['d_hist_sin_poliza_otro_producto_' + x for x in types]
    var_dummies = ["hist_siniestro_poliza_otro_bbdd",
                   "hist_siniestro_poliza_otro_unidad_investigacion",
                   "hist_siniestro_poliza_otro_incidencia_tecnica",
                   "hist_siniestro_poliza_otro_incidencia_tecnica_positiva",
                   "hist_siniestro_poliza_otro_incidencias",
                   "hist_siniestro_poliza_otro_cobertura"] + types
    for col in var_dummies:
        df = df.withColumn(col + '_count', sum_(df[col]).over(w))
        df = df.drop(col)

    # FECHAS: we have two dates, fecha_ocurrencia and fecha_terminado. We have to take into
    # account claims that are not finished. If the claim is not finished we impute today's date
    # and create a variable that flags the situation.
    df = df.withColumn('hist_siniestro_poliza_otro_no_terminado',
                       when(df['hist_siniestro_poliza_otro_fecha_terminado'].isNull(), 1)
                       .otherwise(0))
    df = df.fillna({'hist_siniestro_poliza_otro_fecha_terminado': time.strftime('%Y-%m-%d')})

    # Claim duration: we calculate the cumulated duration and the average duration.
    df = df.withColumn('hist_poliza_otro_fecha_apertura_terminado',
                       datediff('hist_siniestro_poliza_otro_fecha_terminado',
                                'hist_siniestro_poliza_otro_fecha_ocurrencia'))
    df = df.withColumn('hist_poliza_otro_fecha_apertura_terminado',
                       sum_(df['hist_poliza_otro_fecha_apertura_terminado']).over(w))
    df = df.withColumn('hist_poliza_otro_duracion_promedio_sin',
                       df['hist_poliza_otro_fecha_apertura_terminado'] /
                       df['hist_sin_poliza_otro_count'])

    # ULTIMO SINIESTRO DE LA POLIZA
    df = df.withColumnRenamed('hist_siniestro_poliza_otro_fecha_ocurrencia',
                              'hist_siniestro_poliza_otro_ultimo_fecha_ocurrencia')
    df = df.orderBy('hist_siniestro_poliza_otro_ultimo_fecha_ocurrencia', ascending=False)

    # CARGA SINIESTRAL
    # Outlier: first we compute the outlier count per customer-claim to capture the intra effect
    df = df.withColumnRenamed('coste_del_siniestro_por_rol',
                              'hist_siniestro_poliza_otro_carga_siniestral')
    df = df.fillna({'hist_siniestro_poliza_otro_carga_siniestral': 0})
    df = df.withColumn('hist_siniestro_poliza_otro_carga_siniestral',
                       df.hist_siniestro_poliza_otro_carga_siniestral.cast(FloatType()))
    # Build the outlier flag at claim level, then sum the outlier cases per id_siniestro_ref
    df = outliers.Outliers.outliers_mad(df, 'hist_siniestro_poliza_otro_carga_siniestral',
                                        not_count_zero=True)
    df = df.withColumn('hist_siniestro_poliza_otro_carga_siniestral_mad_outlier_count',
                       sum_(df['hist_siniestro_poliza_otro_carga_siniestral_mad_outlier']).over(w))
    df = df.withColumn('hist_siniestro_poliza_otro_carga_siniestral_mad_outlier_promedio',
                       df['hist_siniestro_poliza_otro_carga_siniestral_mad_outlier_count'] /
                       df['hist_sin_poliza_otro_count'])
    df = df.drop('hist_siniestro_poliza_otro_carga_siniestral_mad_outlier')

    # We calculate the sum and the average per claim
    df = df.withColumn('hist_siniestro_poliza_otro_carga_siniestral_count',
                       sum_(df['hist_siniestro_poliza_otro_carga_siniestral']).over(w))
    df = df.withColumn('hist_siniestro_poliza_otro_carga_siniestral_promedio',
                       df['hist_siniestro_poliza_otro_carga_siniestral_count'] /
                       df['hist_sin_poliza_otro_count'])

    # COBERTURAS
    # mayor a 3: more than 3 coverages is treated as an outlier, since the mean is around 1.28
    df = df.withColumn('hist_sin_poliza_otro_mayor3coberturas',
                       when(df["hist_siniestro_poliza_otro_coberturas_involucradas"] > 3, 1)
                       .otherwise(0))
    df = df.withColumn('hist_sin_poliza_otro_mayor3coberturas',
                       sum_(df['hist_sin_poliza_otro_mayor3coberturas']).over(w))
    # promedio: average per claim
    df = df.withColumn('hist_sin_poliza_otro_cober_sum',
                       sum_(df['hist_siniestro_poliza_otro_coberturas_involucradas']).over(w))
    df = df.withColumn('hist_sin_poliza_otro_cober_promedio',
                       df["hist_sin_poliza_otro_cober_sum"] / df['hist_sin_poliza_otro_count'])
    # pagadas-cubiertas: calculated at the customer's cumulated level, not at claim level
    df = df.withColumn('hist_siniestro_poliza_otro_coberturas_involucradas_pagadas_sum',
                       sum_(df['hist_siniestro_poliza_otro_coberturas_involucradas_pagadas']).over(w))
    df = df.withColumn('hist_sin_poliza_otro_pagas_cubiertas',
                       df["hist_siniestro_poliza_otro_coberturas_involucradas_pagadas_sum"] /
                       df['hist_sin_poliza_otro_cober_sum'])
    # no pagadas: at claim level, counting the total of unpaid coverages
    df = df.withColumn('hist_sin_poliza_otro_cob_no_pagas',
                       when(df['hist_siniestro_poliza_otro_coberturas_involucradas_pagadas'] == 0,
                            1).otherwise(0))
    df = df.withColumn('hist_sin_poliza_otro_cob_no_pagas',
                       sum_(df['hist_sin_poliza_otro_cob_no_pagas']).over(w))

    # DELETE VARIABLES: we drop variables that are not relevant or have already been transformed
    del_variables = ['hist_siniestro_poliza_otro_id_poliza',
                     'hist_siniestro_poliza_otro_id_producto',
                     'hist_siniestro_poliza_otro_version',
                     'hist_siniestro_poliza_otro_id_siniestro',
                     'hist_siniestro_poliza_otro_fecha_terminado',
                     'hist_siniestro_poliza_otro_bbdd',
                     'hist_siniestro_poliza_otro_unidad_investigacion',
                     'hist_siniestro_poliza_otro_incidencia_tecnica',
                     'hist_siniestro_poliza_otro_incidencia_tecnica_positiva',
                     'hist_siniestro_poliza_otro_incidencias',
                     'hist_siniestro_poliza_otro_cobertura',
                     'hist_siniestro_poliza_otro_carga_siniestral',
                     'hist_siniestro_poliza_otro_coberturas_involucradas',
                     'hist_siniestro_poliza_otro_coberturas_involucradas_pagadas',
                     'id_fiscal', 'hist_sin_poliza_otro_count_version',
                     'Agrupación productos', 'Producto',
                     'auditFechaAperturaSiniestroReferencia', 'cliente_codfiliacion',
                     'audit_siniestro_codigo_compania', 'id_siniestro']
    df = df.drop(*del_variables)
    df = df.withColumnRenamed('id_siniestro_ref', 'id_siniestro')
    df = df.dropDuplicates(subset=['id_siniestro'])

    # OUTLIER: we calculate the outliers on the claims/policies ratio.
    df = outliers.Outliers.outliers_mad(df, 'hist_siniestro_poliza_otro_siniestros_polizas',
                                        not_count_zero=False)

    if self._is_diario:
        df = df.filter(df['TEST'] == 1)
        df = df.drop('TEST')

    return df
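# The MAD outlier flags above come from a project helper (outliers.Outliers.outliers_mad) that is
# not defined in this module. A minimal sketch of what such a helper could look like, assuming it
# appends a '<column>_mad_outlier' 0/1 flag and that not_count_zero excludes zero values from the
# median estimation (both inferred from how it is called above); this is illustrative, not the
# project's actual implementation.
from pyspark.sql import functions as F


def outliers_mad_sketch(df, column, not_count_zero=True, threshold=3.5):
    """Hypothetical sketch: flag |modified z-score| > threshold using the median absolute deviation."""
    base = df.filter(F.col(column) != 0) if not_count_zero else df
    median = base.approxQuantile(column, [0.5], 0.01)[0]
    mad = (base.withColumn('_abs_dev', F.abs(F.col(column) - F.lit(median)))
           .approxQuantile('_abs_dev', [0.5], 0.01)[0])
    # 0.6745 rescales the MAD so the score is comparable with a standard z-score
    modified_z = 0.6745 * (F.col(column) - F.lit(median)) / F.lit(mad if mad else 1.0)
    return df.withColumn(column + '_mad_outlier',
                         F.when(F.abs(modified_z) > threshold, 1).otherwise(0))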
def _transform(self, df, auxiliar_train):
    if not self.train_file:
        auxiliar_train = auxiliar_train.drop('WinningBid')
        auxiliar_train = auxiliar_train.withColumn('test', lit(0))
        df = df.withColumn('test', lit(1))
        df = auxiliar_train.union(df)
        del auxiliar_train

    # We create the time as index
    split_col = split(df['ApproximateDate'], ' ')
    df = df.withColumn('time', split_col.getItem(1))  # time

    # Hour index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'), IntegerType())
    df = df.withColumn('hms_index', func_index(df['time']))

    # We order by UserID-date
    df = df.orderBy(['UserID', 'hms_index'])

    # We check null values
    df.select([count_(when(isnan(c), c)).alias(c) for c in df.columns]).show()

    # Rank of users by how many times they have seen an ad in the past
    w = (Window().partitionBy(df.UserID).orderBy('time')
         .rowsBetween(Window.unboundedPreceding, 0))
    df = df.withColumn('user_id_acumulative', count_(df['UserID']).over(w))

    # Number of ads per user per second
    df = df.withColumn('key_id', concat(df['UserID'], lit(' '), df['hms_index']))
    w = (Window().partitionBy(df.key_id).orderBy('hms_index')
         .rowsBetween(-sys.maxsize, sys.maxsize))
    df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

    # Number of ads per user
    df_group = df.groupby(['key_id']).agg(count_('key_id').alias('count_ads'))
    split_col = split(df_group['key_id'], ' ')
    df_group = df_group.withColumn('UserID', split_col.getItem(0))
    w = (Window().partitionBy(df_group.UserID).orderBy('key_id')
         .rowsBetween(Window.unboundedPreceding, 0))
    df_group = df_group.withColumn('number_ads_user', sum_(df_group.count_ads).over(w))
    df_group = df_group.select(['key_id', 'number_ads_user'])
    df = df.join(df_group, how='left', on='key_id')
    del df_group

    # Number of users per second
    w = (Window().partitionBy(df.ApproximateDate).rowsBetween(-sys.maxsize, sys.maxsize))
    df = df.withColumn('number_user_second', approx_count_distinct(df.UserID).over(w))

    # Number of ads per second
    df = df.withColumn('number_ads_second', count_(df.ApproximateDate).over(w))

    # Browser dummy transformation
    types = df.select('Browser').distinct().collect()
    types = [val['Browser'] for val in types]
    new_cols = [when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty) for ty in types]
    df = df.select(df.columns + new_cols)

    # Decompose date variables
    df = df.withColumn('date', to_date(df['ApproximateDate']))  # date
    df = df.withColumn('month', month(df['ApproximateDate']))  # month
    df = df.withColumn('day', dayofmonth(df['ApproximateDate']))  # day
    df = df.withColumn('weekday', dayofweek(df['ApproximateDate']))  # weekday (1=Sunday in Spark)
    df = df.withColumn('hour', hour(df['time']))  # hour
    df = df.withColumn('minute', minute(df['time']))  # minute

    # Peak hours
    df = df.withColumn('peak6am8am', when(df['hour'].between(6, 8), 1).otherwise(0))
    df = df.withColumn('peak14pm16pm', when(df['hour'].between(14, 16), 1).otherwise(0))

    # Minute index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'), IntegerType())
    df = df.withColumn('hm_index', func_index(df['time']))

    # Convert to a time series by minute
    df_time_serie_ads = df.select(['hms_index', 'hm_index', 'number_user_second',
                                   'number_ads_second']).drop_duplicates()
    df_time_serie_user = df.select(['UserID', 'hm_index']).drop_duplicates()

    # Group by the values
    df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(approx_count_distinct('UserID'))
    df_time_serie_ads = (df_time_serie_ads.groupBy('hm_index')
                         .agg({'number_ads_second': 'sum'})
                         .drop_duplicates(subset=['hm_index']))

    # Join ads-users per minute
    df_time_serie = df_time_serie_ads.join(df_time_serie_user, how='left', on='hm_index')
    del df_time_serie_ads, df_time_serie_user

    # Rename columns
    df_time_serie = (df_time_serie
                     .withColumnRenamed('sum(number_ads_second)', 'number_ads_minute')
                     .withColumnRenamed('approx_count_distinct(UserID)', 'number_user_minute'))

    # Resample the range of minutes
    resample_range = list(range(
        df_time_serie.select(min_(col('hm_index'))).limit(1).collect()[0][0],
        df_time_serie.select(max_(col('hm_index'))).limit(1).collect()[0][0] + 1, 1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = (resample_range.join(df_time_serie, how='left',
                                         on=resample_range.value == df_time_serie.hm_index)
                     .drop(*['hm_index']).fillna(0))

    # Create lags by minute
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_min_lag > 0:
        df_time_serie = df_time_serie.select(
            '*', lag('number_user_minute').over(w).alias('ar1_number_user_minute'))
        df_time_serie = df_time_serie.select(
            '*', lag('number_ads_minute').over(w).alias('ar1_number_ads_minute'))
        if self.ar_min_lag > 1:
            for l in range(2, self.ar_min_lag + 1, 1):
                df_time_serie = df_time_serie.select(
                    '*', lag('ar' + str(l - 1) + '_number_user_minute').over(w)
                    .alias('ar' + str(l) + '_number_user_minute'))
                df_time_serie = df_time_serie.select(
                    '*', lag('ar' + str(l - 1) + '_number_ads_minute').over(w)
                    .alias('ar' + str(l) + '_number_ads_minute'))

    # Remove the lagged null values
    df_time_serie = df_time_serie.dropna()

    # Join and remove lag null values of the first minute
    # (the minute index now lives in the 'value' column of df_time_serie)
    df = df.orderBy(['UserID', 'hms_index'])
    df = df.join(df_time_serie.orderBy(['value']), how='left',
                 on=df.hm_index == df_time_serie.value).drop('value')

    # Convert to a time series and resample by second
    df_time_serie = df.select(['hms_index', 'number_user_second',
                               'number_ads_second']).drop_duplicates()
    resample_range = list(range(
        df_time_serie.select(min_(col('hms_index'))).limit(1).collect()[0][0],
        df_time_serie.select(max_(col('hms_index'))).limit(1).collect()[0][0] + 1, 1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = (resample_range.join(df_time_serie, how='left',
                                         on=resample_range.value == df_time_serie.hms_index)
                     .drop(*['hms_index']).fillna(0))

    # Create lags
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_lags > 0:
        df_time_serie = df_time_serie.select(
            '*', lag('number_user_second').over(w).alias('ar1_number_user_second'))
        df_time_serie = df_time_serie.select(
            '*', lag('number_ads_second').over(w).alias('ar1_number_ads_second'))
        if self.ar_lags > 1:
            for l in range(2, self.ar_lags + 1, 1):
                df_time_serie = df_time_serie.select(
                    '*', lag('ar' + str(l - 1) + '_number_user_second').over(w)
                    .alias('ar' + str(l) + '_number_user_second'))
                df_time_serie = df_time_serie.select(
                    '*', lag('ar' + str(l - 1) + '_number_ads_second').over(w)
                    .alias('ar' + str(l) + '_number_ads_second'))

    # Create moving averages
    if self.ma_ss_lag is not None:
        # Get the time back from the second index
        func_index = udf(lambda x: auxiliar_func.num_to_time(x), StringType())
        df_time_serie = df_time_serie.withColumn('time', func_index(df_time_serie['value']))

        # MA terms per second (average over the last xx seconds)
        for lag_val in self.ma_ss_lag:
            # Range to take into account
            w = (Window.orderBy(df_time_serie['value']).rangeBetween(-lag_val, 0))
            # MA variables
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                avg('number_user_second').over(w))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                avg('number_ads_second').over(w))
            # Increasing id
            df_time_serie = df_time_serie.withColumn('rn', monotonically_increasing_id())
            # Replace the first values by null
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) + '_number_user_second']))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) + '_number_ads_second']))
            # Get the average by minute
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_user_second',
                df_time_serie['ma_seconds_' + str(lag_val) + '_number_user_second'] * 60)
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_ads_second',
                df_time_serie['ma_seconds_' + str(lag_val) + '_number_ads_second'] * 60)
        df_time_serie = df_time_serie.drop(*['rn'])

    # Remove the lagged null values
    df_time_serie = df_time_serie.drop(*['time', 'number_user_second',
                                         'number_ads_second']).dropna()

    # Join and remove lag null values of the first second
    df = df.join(df_time_serie.orderBy(['value']), how='left',
                 on=df.hms_index == df_time_serie.value).drop('value').dropna()

    if self.train_file and not self.variable_analysis:
        df = df.select(['key_id', 'hms_index', 'number_ads_user', 'number_user_second',
                        'number_ads_second', 'number_ads_user_second', 'peak6am8am',
                        'peak14pm16pm', 'user_id_acumulative'] +
                       [x for x in df.columns if x.startswith('d_browser')] +
                       [x for x in df.columns if x.startswith('ar')] +
                       [x for x in df.columns if x.startswith('ma_')] + ['WinningBid'])

    if not self.train_file:
        df = df.filter(df['test'] == 1)
        df = df.select(['UserID', 'key_id', 'number_ads_user', 'hms_index', 'number_user_second',
                        'number_ads_second', 'number_ads_user_second', 'peak6am8am',
                        'peak14pm16pm', 'user_id_acumulative'] +
                       [x for x in df.columns if x.startswith('d_browser')] +
                       [x for x in df.columns if x.startswith('ar')] +
                       [x for x in df.columns if x.startswith('ma_')])

    df = df.orderBy(['hms_index', 'UserID'])
    df.show()
    return df
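# The hms_index/hm_index columns rely on auxiliar_func.time_to_num and auxiliar_func.num_to_time,
# which are not part of this module. Based purely on how they are used above (an 'HH:MM:SS' string
# mapped to an integer second or minute index, and back again), a minimal sketch might look like
# this; the real helpers may differ.
def time_to_num(time_str, index='hms'):
    """Hypothetical sketch: map 'HH:MM:SS' to seconds since midnight ('hms') or minutes ('hm')."""
    h, m, s = (int(part) for part in time_str.split(':'))
    if index == 'hms':
        return h * 3600 + m * 60 + s
    if index == 'hm':
        return h * 60 + m
    raise ValueError("index must be 'hms' or 'hm'")


def num_to_time(seconds):
    """Hypothetical sketch: inverse of time_to_num(..., index='hms')."""
    h, remainder = divmod(int(seconds), 3600)
    m, s = divmod(remainder, 60)
    return '{:02d}:{:02d}:{:02d}'.format(h, m, s)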
def checklist5(df_reserva, df_id, df_reserva_new=None, df_id_new=None):
    """2 siniestros de robo con joyas del mismo asegurado.

    :return: DataFrame with the columns 'id_siniestro', 'checklist5_poliza' and 'checklist5_nif',
        where 'checklist5_*' counts how many claims (by policy / by NIF) belong to the JOYAS
        coverage.
    """
    exprs = [df_id[column].alias(column.replace('"', '')) for column in df_id.columns]
    df_id = df_id.select(*exprs)
    exprs = [df_id[column].alias(column.replace(' ', '')) for column in df_id.columns]
    df_id = df_id.select(*exprs)

    df_reserva = df_reserva.select(['id_siniestro', 'id_poliza', 'po_res_cobertura'])
    df_id = df_id.select(['id_siniestro', 'id_fiscal'])

    if df_reserva_new is not None:
        df_reserva_new = df_reserva_new.select(['id_siniestro', 'id_poliza', 'po_res_cobertura'])
        df_reserva = df_reserva.union(df_reserva_new)

    df_reserva = df_reserva.dropDuplicates(subset=['id_siniestro', 'po_res_cobertura'])
    df_reserva = df_reserva.withColumn(
        'po_res_cobertura',
        when(df_reserva['po_res_cobertura'].contains('JOY'), 'INCIDENCIA')
        .otherwise(df_reserva['po_res_cobertura']))
    df_reserva = df_reserva.withColumn(
        'po_res_cobertura',
        when(df_reserva['po_res_cobertura'].contains('ESPECIAL'), 'INCIDENCIA')
        .otherwise(df_reserva['po_res_cobertura']))
    df_reserva = df_reserva.filter(df_reserva['po_res_cobertura'] == 'INCIDENCIA')

    # We merge with the ID table by claim
    if df_id_new is not None:
        exprs = [df_id_new[column].alias(column.replace('"', '')) for column in df_id_new.columns]
        df_id_new = df_id_new.select(*exprs)
        exprs = [df_id_new[column].alias(column.replace(' ', '')) for column in df_id_new.columns]
        df_id_new = df_id_new.select(*exprs)
        df_id_new = df_id_new.select(['id_siniestro', 'id_fiscal'])
        df_id = df_id.union(df_id_new)

    df_reserva = df_reserva.withColumn('id_siniestro', df_reserva.id_siniestro.cast(IntegerType()))
    df_id = df_id.withColumn('id_siniestro', df_id.id_siniestro.cast(IntegerType()))
    reserva_cobertura = df_reserva.join(df_id, 'id_siniestro', how='left')

    # We calculate the COUNT of JOYAS claims
    reserva_cobertura = reserva_cobertura.dropDuplicates(subset=['id_siniestro'])

    # Now that we have the values by claim, we group by id_poliza and by NIF
    # (windows partitioned per policy and per NIF, as the docstring describes)
    w_poliza = (Window().partitionBy('id_poliza').rowsBetween(-sys.maxsize, sys.maxsize))
    w_nif = (Window().partitionBy('id_fiscal').rowsBetween(-sys.maxsize, sys.maxsize))
    reserva_cobertura = reserva_cobertura.withColumn(
        'checklist5_poliza', count_(reserva_cobertura['id_poliza']).over(w_poliza))
    reserva_cobertura = reserva_cobertura.withColumn(
        'checklist5_nif', count_(reserva_cobertura['id_fiscal']).over(w_nif))
    reserva_cobertura = reserva_cobertura.drop(*['id_poliza', 'id_fiscal', 'po_res_cobertura'])

    return reserva_cobertura
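# A quick way to sanity-check checklist5 is to feed it a couple of in-memory rows. The claims,
# policies and NIFs below are made up for illustration, and `spark` is an assumed SparkSession.
# Claims 101 and 102 share policy P1 and NIF1 and carry a JOY coverage, so with the per-policy and
# per-NIF windows above both checklist5_poliza and checklist5_nif should come out as 2 for them,
# while claim 103 (INCENDIO) is filtered out.
def _checklist5_example(spark):
    reserva = spark.createDataFrame(
        [('101', 'P1', 'ROBO JOYAS'), ('102', 'P1', 'ROBO JOYAS'), ('103', 'P2', 'INCENDIO')],
        ['id_siniestro', 'id_poliza', 'po_res_cobertura'])
    ids = spark.createDataFrame(
        [('101', 'NIF1'), ('102', 'NIF1'), ('103', 'NIF2')],
        ['id_siniestro', 'id_fiscal'])
    checklist5(reserva, ids).show()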
def transform_data(self, df, df_reserva, df_reserva_new, df_fecha, init_date_new_,
                   init_date_historic_):
    """Transform original dataset.

    :param df: Input DataFrame.
    :param df_reserva: Reserve DataFrame (historical).
    :param df_reserva_new: Reserve DataFrame (new claims).
    :param df_fecha: Claim dates DataFrame.
    :param init_date_new_: Minimum date for new claims.
    :param init_date_historic_: Maximum look-back date for historical data.
    :return: Transformed DataFrame.
    """
    # Cast key variables and rename headers
    df = df.withColumnRenamed('auditCodigoSiniestroReferencia', 'id_siniestro')
    df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))

    # CONSERVED VARIABLES: we drop the variables that are not well defined or are wrongly defined.
    var_conserved = ["id_siniestro", 'id_poliza', 'version_poliza', "fecha_poliza_emision",
                     "fecha_poliza_efecto_natural", "fecha_poliza_efecto_mvto",
                     "fecha_poliza_vto_movimiento", "fecha_poliza_vto_natural",
                     "fecha_siniestro_ocurrencia", 'fecha_siniestro_comunicacion',
                     "fecha_primera_visita_peritaje", "fecha_ultima_visita_peritaje"]
    df = df.select(*var_conserved)

    # We fill siniestro_comunicacion with siniestro_ocurrencia
    df = df.withColumn('fecha_siniestro_comunicacion',
                       coalesce('fecha_siniestro_comunicacion', 'fecha_siniestro_ocurrencia'))

    # STRIP dates: YEAR, MONTH, WEEKDAY, DAY
    var_fecha = ["fecha_poliza_emision", "fecha_poliza_efecto_natural", "fecha_poliza_efecto_mvto",
                 "fecha_poliza_vto_movimiento", "fecha_poliza_vto_natural",
                 "fecha_siniestro_ocurrencia", 'fecha_primera_visita_peritaje',
                 'fecha_ultima_visita_peritaje', 'fecha_siniestro_comunicacion']
    func = udf(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'), DateType())
    for col in var_fecha:
        year_name = str(col) + '_year'
        month_name = str(col) + '_month'
        day_name = str(col) + '_day'
        weekday_name = str(col) + '_weekday'
        df = df.fillna({col: '1900/01/01'})
        df = df.withColumn(col, func(df[col]))
        df = df.withColumn(col, when(df[col] == '1900-01-01', None).otherwise(df[col]))
        df = df.withColumn(year_name, year(df[col]))
        df = df.withColumn(month_name, month(df[col]))
        df = df.withColumn(day_name, dayofmonth(df[col]))
        # date_format 'u' returns 1=Monday..7=Sunday; subtract 1 to get 0=Monday, 1=Tuesday, ...
        df = df.withColumn(weekday_name, date_format(col, 'u') - 1)
        df = df.withColumn(weekday_name, df[weekday_name].cast(IntegerType()))

    # Filtering by the INIT_DATE parameter
    df = df.filter(df['fecha_siniestro_ocurrencia'] >= init_date_historic_)

    # CHECKLIST 6a
    df = df.withColumn('checklist6a', lit(0))
    df = df.withColumn('checklist6a_PP', lit(0))

    # CHECKLIST 6b
    if self._is_diario:
        # Filtering new claims by INIT_DATE
        df = df.filter(df['fecha_siniestro_comunicacion'] >= init_date_new_)
        auxiliar_list = checklist_spark.checklist6b(df, df_fecha, df_reserva_new, df_reserva)
    else:
        auxiliar_list = checklist_spark.checklist6b(None, df, None, df_reserva)
    if auxiliar_list:
        r = Row('id_siniestro_c', 'checklist_6b')
        df_claims = self._spark.createDataFrame(r(i, x) for i, x in auxiliar_list)
        df = df.join(df_claims, df.id_siniestro == df_claims.id_siniestro_c, how='left')
        del df_claims, r, auxiliar_list
        df = df.drop('id_siniestro_c')
        df = df.fillna({'checklist_6b': 0})
    else:
        df = df.withColumn('checklist_6b', lit(0))

    # CHECKLIST 7
    if self._is_diario:
        auxiliar_list = checklist_spark.checklist_7(df, df_fecha, df_reserva_new, df_reserva)
    else:
        auxiliar_list = checklist_spark.checklist_7(None, df, None, df_reserva)
    if auxiliar_list:
        r = Row('id_siniestro', 'checklist_7')
        df_claims = self._spark.createDataFrame(r(i, x) for i, x in auxiliar_list)
        del auxiliar_list, r
        df = df.join(df_claims, on='id_siniestro', how='left')
        del df_claims
        df = df.drop('id_siniestro_c')
        df = df.fillna({'checklist_7': 0})
    else:
        df = df.withColumn('checklist_7', lit(0))

    # CHECKLIST 14
    if self._is_diario:
        auxiliar_list = checklist_spark.checklist_14(df, df_fecha, df_reserva_new, df_reserva)
    else:
        auxiliar_list = checklist_spark.checklist_14(None, df, None, df_reserva)
    if auxiliar_list:
        r = Row('id_siniestro_c', 'checklist_14')
        df_claims = self._spark.createDataFrame(r(i, x) for i, x in auxiliar_list)
        w = (Window().partitionBy(df_claims.id_siniestro_c).rowsBetween(-sys.maxsize, sys.maxsize))
        df_claims = df_claims.withColumn('checklist_14_coberturas_repetidas',
                                         sum_(df_claims.checklist_14).over(w))
        df_claims = df_claims.withColumn('checklist_14_siniestros_involucrados',
                                         count_(df_claims.checklist_14).over(w))
        df_claims = df_claims.dropDuplicates(subset=['id_siniestro_c'])
        df_claims = df_claims.drop('checklist_14')
        df = df.join(df_claims, df.id_siniestro == df_claims.id_siniestro_c, how='left')
        del df_claims, r, auxiliar_list
        df = df.drop('id_siniestro_c')
        df = df.fillna({'checklist_14_coberturas_repetidas': 0})
        df = df.fillna({'checklist_14_siniestros_involucrados': 0})
    else:
        df = df.withColumn('checklist_14_coberturas_repetidas', lit(0))
        df = df.withColumn('checklist_14_siniestros_involucrados', lit(0))

    # COMPLEX / NON-COMPLEX VARIABLES: we define two types of dates. For the dates we want in
    # more detail we generate every possible variable; the less complex dates get more
    # aggregated variables.
    var_fecha_complex = ["fecha_siniestro_ocurrencia"]
    var_fecha_less_complex = ["fecha_poliza_efecto_natural", "fecha_poliza_vto_natural"]

    for i in var_fecha_complex:
        # We create dummies
        col_names = [str(i) + '_year', str(i) + '_month', str(i) + '_weekday']
        for col in col_names:
            types = df.select(col).distinct().collect()
            types = [ty[col] for ty in types]
            type_list = [when(df[col] == ty, 1).otherwise(0).alias('d_' + col + '_' + str(ty))
                         for ty in types]
            df = df.select(list(df.columns) + type_list)
        # Day ranges
        day = str(i) + '_day'
        df = df.withColumn(day + '1_10', when(df[day].between(1, 10), 1).otherwise(0))
        df = df.withColumn(day + '10_20', when(df[day].between(11, 20), 1).otherwise(0))
        df = df.withColumn(day + '20_30', when(df[day].between(21, 31), 1).otherwise(0))

    for i in var_fecha_less_complex:
        # Month in holiday period
        df = df.withColumn(str(i) + '_month_holiday',
                           when(df[str(i) + '_month'].isin([1, 8, 12]), 1).otherwise(0))
        # Day ranges
        day = str(i) + '_day'
        df = df.withColumn(day + '1_10', when(df[day].between(1, 10), 1).otherwise(0))
        df = df.withColumn(day + '10_20', when(df[day].between(11, 20), 1).otherwise(0))
        df = df.withColumn(day + '20_30', when(df[day].between(21, 31), 1).otherwise(0))
        # Weekend or Monday (weekday was shifted above to 0=Monday ... 6=Sunday,
        # so the weekend is 5/6 and Monday is 0)
        df = df.withColumn(str(i) + '_weekday_weekend',
                           when(df[str(i) + '_weekday'].isin([5, 6]), 1).otherwise(0))
        df = df.withColumn(str(i) + '_weekday_monday',
                           when(df[str(i) + '_weekday'] == 0, 1).otherwise(0))

    # FIRST DELETE: we delete generated variables that are not relevant or are too specific.
    del_variables = ['fecha_poliza_emision_year', 'fecha_poliza_emision_month',
                     'fecha_poliza_emision_day', 'fecha_poliza_emision_weekday',
                     'fecha_poliza_efecto_natural_year', 'fecha_poliza_efecto_natural_month',
                     'fecha_poliza_efecto_natural_day', 'fecha_poliza_efecto_natural_weekday',
                     'fecha_poliza_efecto_mvto_year', 'fecha_poliza_efecto_mvto_month',
                     'fecha_poliza_efecto_mvto_day', 'fecha_poliza_efecto_mvto_weekday',
                     'fecha_poliza_vto_movimiento_year', 'fecha_poliza_vto_movimiento_month',
                     'fecha_poliza_vto_movimiento_day', 'fecha_poliza_vto_movimiento_weekday',
                     'fecha_poliza_vto_natural_year', 'fecha_poliza_vto_natural_month',
                     'fecha_poliza_vto_natural_day', 'fecha_poliza_vto_natural_weekday',
                     'fecha_siniestro_ocurrencia_year', 'fecha_siniestro_ocurrencia_month',
                     'fecha_siniestro_ocurrencia_day', 'fecha_siniestro_ocurrencia_weekday',
                     'fecha_primera_visita_peritaje_year', 'fecha_primera_visita_peritaje_month',
                     'fecha_primera_visita_peritaje_day', 'fecha_primera_visita_peritaje_weekday',
                     'fecha_ultima_visita_peritaje_year', 'fecha_ultima_visita_peritaje_month',
                     'fecha_ultima_visita_peritaje_day', 'fecha_ultima_visita_peritaje_weekday',
                     'fecha_siniestro_comunicacion_year', 'fecha_siniestro_comunicacion_month',
                     'fecha_siniestro_comunicacion_day', 'fecha_siniestro_comunicacion_weekday',
                     'id_poliza', 'hogar_poblacion', 'version_poliza']
    df = df.drop(*del_variables)

    # FECHAS LOGICAS: we create different kinds of date variables that can be relevant to the
    # fraud analysis.

    # Difference between the first policy issue date and the last natural expiry date
    df = df.withColumn('fecha_diferencia_vto_emision',
                       datediff(df['fecha_poliza_vto_natural'], df['fecha_poliza_emision']))

    # If fecha efecto < fecha emision => d = 1
    df = df.withColumn('fecha_indicador_efecto_emision',
                       when(df['fecha_poliza_emision'] > df['fecha_poliza_efecto_natural'], 1)
                       .otherwise(0))

    # Difference between claim and policy effect date: 5, 15, 30 days
    df = df.withColumn('fecha_diferencia_siniestro_efecto',
                       datediff(df['fecha_siniestro_ocurrencia'],
                                df['fecha_poliza_efecto_natural']))
    days_var = [5, 15, 30]
    for col in days_var:
        df = df.withColumn('fecha_diferencia_siniestro_efecto_' + str(col),
                           when(df['fecha_diferencia_siniestro_efecto'] <= col, 1).otherwise(0))

    # Difference between claim and first issue date: 5, 15, 30 days
    df = df.withColumn('fecha_diferencia_siniestro_emision',
                       datediff(df['fecha_siniestro_ocurrencia'], df['fecha_poliza_emision']))
    for col in days_var:
        df = df.withColumn('fecha_diferencia_siniestro_emision_' + str(col),
                           when(df['fecha_diferencia_siniestro_emision'] <= col, 1).otherwise(0))

    # Difference between claim and natural expiry date: 5, 15, 30 days
    df = df.withColumn('fecha_diferencia_siniestro_vto_natural',
                       datediff(df['fecha_poliza_vto_natural'], df['fecha_siniestro_ocurrencia']))
    for col in days_var:
        df = df.withColumn('fecha_diferencia_siniestro_vto_natural_' + str(col),
                           when(df['fecha_diferencia_siniestro_vto_natural'] <= col, 1)
                           .otherwise(0))

    # If fecha comunicacion exceeds fecha ocurrencia by 7 days or more, d = 1
    df = df.withColumn('fecha_diferencia_siniestro_comunicacion',
                       datediff(df['fecha_siniestro_comunicacion'],
                                df['fecha_siniestro_ocurrencia']))
    df = df.withColumn('fecha_diferencia_comunicacion_outlier',
                       when(df['fecha_diferencia_siniestro_comunicacion'] >= 7, 1).otherwise(0))
    df = df.drop('fecha_siniestro_comunicacion')
    df = df.dropDuplicates(subset=['id_siniestro'])

    return df
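# The distinct-collect plus when/otherwise pattern used above for the date dummies (and for
# browsers and products in the other transforms) can be factored into a small helper. This is
# only an illustrative refactor, not part of the pipeline; `with_dummies` and its arguments are
# hypothetical names.
from pyspark.sql import functions as F


def with_dummies(df, column, prefix):
    """Illustrative helper: add one 0/1 column per distinct value of `column`, named prefix+value."""
    values = [row[column] for row in df.select(column).distinct().collect()]
    dummy_cols = [F.when(F.col(column) == v, 1).otherwise(0).alias(prefix + str(v))
                  for v in values]
    return df.select(list(df.columns) + dummy_cols)

# e.g. df = with_dummies(df, 'fecha_siniestro_ocurrencia_month', 'd_fecha_siniestro_ocurrencia_month_')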
def _transform_data(self, df, df_base, bl_processed):
    """Transform original dataset.

    :param df: Input DataFrame.
    :param df_base: Base (historical) DataFrame appended in daily mode.
    :param bl_processed: Processed blacklist DataFrame with previously fraudulent claims.
    :return: Transformed DataFrame.
    """
    if self._is_diario:
        df = df.withColumn('TEST', lit(1))
        df_base = df_base.withColumn('TEST', lit(0))
        df = df.union(df_base)

    # Cast key variables and rename headers
    df = df.withColumnRenamed('auditCodigoSiniestroReferencia', 'id_siniestro_ref')
    df = df.withColumn('id_siniestro_ref', df.id_siniestro_ref.cast(IntegerType()))
    df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))
    df = df.dropna(subset=['id_siniestro_ref'])
    df = df.dropna(subset=['id_siniestro'])

    # DATE VARIABLES FORMAT
    fecha_variables = ['hist_siniestro_otro_fecha_ocurrencia', 'hist_siniestro_fecha_terminado']
    func = udf(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'), DateType())
    for col in fecha_variables:
        df = df.fillna({col: '1900/01/01'})
        df = df.withColumn(col, func(df[col]))
        df = df.withColumn(col, when(df[col] == '1900-01-01', None).otherwise(df[col]))
        df = df.filter(df[col] <= time.strftime('%Y-%m-%d'))

    # COUNT: how many claims each id_siniestro_ref has
    df = df.withColumn('hist_sin_otros_count_version', lit(1))
    w = (Window().partitionBy(df.id_siniestro_ref).rowsBetween(-sys.maxsize, sys.maxsize))
    df = df.withColumn('hist_sin_otros_count', count_(df.hist_sin_otros_count_version).over(w))

    # SINIESTRO TERMINADO: we transform the category siniestro_sit into dummy variables
    types = df.select('hist_siniestro_otro_sit').distinct().collect()
    types = [ty['hist_siniestro_otro_sit'] for ty in types]
    type_list = [when(df['hist_siniestro_otro_sit'] == ty, 1).otherwise(0)
                 .alias('d_hist_siniestro_otro_sit_' + ty) for ty in types]
    df = df.select(list(df.columns) + type_list)

    # DUMMIES ACUMULATIVAS
    types = ['d_hist_siniestro_otro_sit_' + x for x in types]
    var_dummies = ["hist_siniestro_otro_rehusado", "hist_siniestro_otro_bbdd",
                   "hist_siniestro_otro_unidad_investigacion",
                   "hist_siniestro_otro_incidencia_tecnica",
                   "hist_siniestro_otro_incidencia_tecnica_positiva",
                   "hist_siniestro_otro_incidencias", "hist_siniestro_otro_cobertura",
                   "hist_siniestro_otro_rehabilitado"] + types
    for col in var_dummies:
        df = df.withColumn(col + '_count', sum_(df[col]).over(w))
        df = df.drop(col)

    # DATE VARIABLES
    # Duración = Fecha_Terminado - Fecha_Ocurrencia
    df = df.withColumn('hist_otros_fecha_apertura_terminado',
                       datediff('hist_siniestro_fecha_terminado',
                                'hist_siniestro_otro_fecha_ocurrencia'))
    df = df.withColumn('hist_otros_fecha_apertura_terminado',
                       sum_(df['hist_otros_fecha_apertura_terminado']).over(w))
    # Duración promedio
    df = df.withColumn('hist_otros_duracion_promedio_sin',
                       df['hist_otros_fecha_apertura_terminado'] / df['hist_sin_otros_count'])

    # Último siniestro de la póliza: we keep the first row, which is the latest claim.
    df = df.withColumnRenamed('hist_siniestro_otro_fecha_ocurrencia',
                              'hist_siniestro_otro_ultimo_fecha_ocurrencia')
    df = df.orderBy('hist_siniestro_otro_ultimo_fecha_ocurrencia', ascending=False)

    # FUE UN SINIESTRO FRAUDULENTO? We check whether the id_siniestro is associated with a
    # previous fraudulent claim.
    bl_processed = bl_processed.select('id_siniestro').dropDuplicates(subset=['id_siniestro'])
    bl_processed = bl_processed.withColumn('hist_sin_otro_fraude_count', lit(1))
    df = df.join(bl_processed, on='id_siniestro', how='left')
    df = df.withColumn('hist_sin_otro_fraude_count',
                       when(df['hist_sin_otro_fraude_count'].isNull(), 0)
                       .otherwise(df['hist_sin_otro_fraude_count']))
    df = df.withColumn('hist_sin_otro_fraude_count',
                       sum_(df['hist_sin_otro_fraude_count']).over(w))

    # CARGA SINIESTRAL
    df = df.withColumnRenamed('coste_del_siniestro_por_rol', 'hist_siniestro_carga_siniestral')
    df = df.fillna({'hist_siniestro_carga_siniestral': 0})
    df = df.withColumn('hist_siniestro_carga_siniestral',
                       df.hist_siniestro_carga_siniestral.cast(FloatType()))
    # Build the outlier flag at claim level, then sum the outlier cases per id_siniestro_ref
    df = outliers.Outliers.outliers_mad(df, 'hist_siniestro_carga_siniestral',
                                        not_count_zero=True)
    df = df.withColumn('hist_siniestro_carga_siniestral_mad_outlier_count',
                       sum_(df['hist_siniestro_carga_siniestral_mad_outlier']).over(w))
    # Suma total de la carga siniestral
    df = df.withColumn('hist_siniestro_carga_siniestral_sum',
                       sum_(df['hist_siniestro_carga_siniestral']).over(w))
    # Promedio
    df = df.withColumn('hist_sin_carga_siniestral_promedio',
                       df['hist_siniestro_carga_siniestral_sum'] / df['hist_sin_otros_count'])

    # COBERTURAS
    # Outliers (note: the MAD flag is immediately overwritten by the fixed ">3" rule below)
    df = outliers.Outliers.outliers_mad(df, 'hist_siniestro_coberturas_involucradas',
                                        not_count_zero=True)
    df = df.withColumn('hist_siniestro_coberturas_involucradas_mad_outlier',
                       when(df['hist_siniestro_coberturas_involucradas'] > 3, 1).otherwise(0))
    df = df.withColumn('hist_siniestro_coberturas_involucradas_mad_outlier_count',
                       sum_(df['hist_siniestro_coberturas_involucradas_mad_outlier']).over(w))
    df = df.drop('hist_siniestro_coberturas_involucradas_mad_outlier')
    # Promedio
    df = df.withColumn('hist_sin_otros_cober_sum',
                       sum_(df['hist_siniestro_coberturas_involucradas']).over(w))
    df = df.withColumn('hist_sin_otros_cober_promedio',
                       df['hist_sin_otros_cober_sum'] / df['hist_sin_otros_count'])
    # Pagadas-cubiertas
    df = df.withColumn('hist_siniestro_coberturas_involucradas_pagadas_sum',
                       sum_(df['hist_siniestro_coberturas_involucradas_pagadas']).over(w))
    df = df.withColumn('hist_sin_otros_pagas_cubiertas',
                       df['hist_siniestro_coberturas_involucradas_pagadas_sum'] /
                       df['hist_sin_otros_cober_sum'])
    # No pagadas
    df = df.withColumn('hist_sin_otros_cob_no_pagas',
                       df['hist_siniestro_coberturas_involucradas'] -
                       df['hist_siniestro_coberturas_involucradas_pagadas'])
    df = df.withColumn('hist_sin_otros_cob_no_pagas',
                       sum_(df['hist_sin_otros_cob_no_pagas']).over(w))

    # DELETE VARIABLES
    del_variables = ['id_fiscal', 'hist_siniestro_otro_descripcion', "hist_siniestro_duracion",
                     'hist_siniestro_fecha_terminado', 'hist_sin_otros_count_version',
                     'hist_siniestro_otro_oficina_productora', 'hist_siniestro_carga_siniestral',
                     'hist_siniestro_coberturas_involucradas',
                     'hist_siniestro_coberturas_involucradas_pagadas',
                     'hist_otros_fecha_apertura_terminado',
                     'auditFechaAperturaSiniestroReferencia', 'id_siniestro', 'id_poliza',
                     'version_poliza', 'audit_siniestro_producto_tecnico',
                     'audit_siniestro_codigo_compania', 'hist_siniestro_otro_sit']
    df = df.drop(*del_variables)
    df = df.withColumnRenamed('id_siniestro_ref', 'id_siniestro')
    # We keep the first row of each claim
    df = df.dropDuplicates(subset=['id_siniestro'])

    if self._is_diario:
        df = df.filter(df['TEST'] == 1)
        df = df.drop('TEST')

    return df
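# Both _transform_data variants lean on the same idiom: aggregate over an unbounded window
# partitioned by id_siniestro_ref, then keep a single row per reference claim. A condensed,
# illustrative version of that pattern follows; 'some_flag' and 'amount' are placeholder columns
# and the function name is hypothetical (it reuses the module's Window/sum_/sys imports).
def _aggregate_per_reference_claim(df):
    w = Window.partitionBy('id_siniestro_ref').rowsBetween(-sys.maxsize, sys.maxsize)
    return (df.withColumn('some_flag_count', sum_('some_flag').over(w))
              .withColumn('amount_sum', sum_('amount').over(w))
              .dropDuplicates(subset=['id_siniestro_ref']))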
.withColumn("created", clean_string_for_date_udf(col_("created"))) \ .withColumn("removed", clean_string_for_date_udf(col_("removed"))) \ .withColumn("create_cohort", create_cohort_id_udf(col_("created"))) \ .withColumn("remove_cohort", create_cohort_id_udf(col_("removed"))) \ create_window = Window.partitionBy().orderBy("created") line_decay_df = line_decay_df.withColumn("prev_created", lag_(line_decay_df.created).over(create_window)) \ .withColumn("time_between", minutes_between_udf(col_("created"), col_("prev_created"))) line_decay_df.show() created_in_cohorts_df = line_decay_df.groupBy("create_cohort")\ .agg( \ count_(lit_(1)).alias("total_in_cohort"),\ avg_(col_("time_between")).alias("avg_time_between")\ ) removed_in_cohorts_df = line_decay_df.groupBy("create_cohort", "remove_cohort")\ .count()\ .withColumnRenamed("count", "removed_in_this_cohort") lifespand_days_in_cohort_df = line_decay_df.groupBy("create_cohort", "remove_cohort")\ .agg(max_(col_("lifespan")))\ .withColumnRenamed("max(lifespan)", "lifespan") \ lifespand_days_in_cohort_df.show() removed_cohorts_with_lifespan_df = removed_in_cohorts_df.join(lifespand_days_in_cohort_df, \ (removed_in_cohorts_df.create_cohort == lifespand_days_in_cohort_df.create_cohort) & \