Code example #1
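This snippet is a method body lifted from a larger ETL module, so its imports are not shown. Judging from the names it uses, the original file presumably starts with something along these lines (module paths and aliases are assumptions, not taken from the source):

import datetime
import sys
import time

from pyspark.sql import Window
from pyspark.sql.functions import (lit, when, udf, datediff,
                                   count as count_, sum as sum_)
from pyspark.sql.types import IntegerType, FloatType, DateType

# Project-specific helper assumed to expose Outliers.outliers_mad();
# its real module path is not shown in this listing.
import outliers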
    def _transform_data(self, df, df_base, bl_processed):
        """Transform original dataset.

        :param df: Input DataFrame.
        :param df_base: Historical base DataFrame appended in daily (diario) mode.
        :param bl_processed: DataFrame of claims previously flagged as fraudulent.
        :return: Transformed DataFrame.
        """

        if self._is_diario:
            df = df.withColumn('TEST', lit(1))
            df_base = df_base.withColumn('TEST', lit(0))
            df = df.union(df_base)

        # Cast key variables and rename headers
        exprs = [
            df[column].alias(column.replace('"', '')) for column in df.columns
        ]
        df = df.select(*exprs)
        exprs = [
            df[column].alias(column.replace(' ', '')) for column in df.columns
        ]
        df = df.select(*exprs)

        df = df.withColumnRenamed('hist_siniestro_poliza_otro_id_siniestro',
                                  'id_siniestro')
        df = df.withColumnRenamed('auditCodigoSiniestroReferencia',
                                  'id_siniestro_ref')
        df = df.withColumn('id_siniestro_ref',
                           df.id_siniestro_ref.cast(IntegerType()))
        df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))
        df = df.dropna(subset=['id_siniestro_ref'])
        df = df.dropna(subset=['id_siniestro'])

        # DATE VARIABLES FORMAT
        fecha_variables = [
            "hist_siniestro_poliza_otro_fecha_ocurrencia",
            "hist_siniestro_poliza_otro_fecha_terminado",
            "auditFechaAperturaSiniestroReferencia"
        ]
        func = udf(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'),
                   DateType())
        for col in fecha_variables:
            df = df.fillna({col: '1900/01/01'})
            df = df.withColumn(col, func(df[col]))
            df = df.withColumn(
                col,
                when(df[col] == '1900-01-01', None).otherwise(df[col]))
            df = df.filter(df[col] <= time.strftime('%Y-%m-%d'))

        # We keep only claims in the other policy that occurred before the reference claim,
        # because we only want the past values
        df = df.filter(df['auditFechaAperturaSiniestroReferencia'] >=
                       df['hist_siniestro_poliza_otro_fecha_ocurrencia'])

        # COUNT POLIZA-VERSION: We count how many previous claims the customer has. We simply count the rows
        # per reference claim, because each row refers to a unique claim
        df = df.withColumn('hist_sin_poliza_otro_count_version', lit(1))
        w = (Window().partitionBy(df.id_siniestro_ref).rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn(
            'hist_sin_poliza_otro_count',
            count_(df.hist_sin_poliza_otro_count_version).over(w))

        # COUNT POLIZAS: We count how many policies the customer has. We build a separate table so we can
        # group at the policy level.
        count_poliza = df.select(
            ['id_siniestro_ref', 'hist_siniestro_poliza_otro_id_poliza'])
        count_poliza = count_poliza.dropDuplicates()
        count_poliza = count_poliza.withColumnRenamed(
            'hist_siniestro_poliza_otro_id_poliza',
            'hist_sin_poliza_otro_count_polizas')
        count_poliza = count_poliza.withColumn(
            'hist_sin_poliza_otro_count_polizas',
            count_(df['id_siniestro_ref']).over(w))
        count_poliza = count_poliza.dropDuplicates(subset=['id_siniestro_ref'])
        df = df.join(count_poliza, on='id_siniestro_ref', how='left')

        # SINIESTROS/POLIZAS: Here we calculate the ratio nº claims / nº policies
        df = df.withColumn(
            'hist_siniestro_poliza_otro_siniestros_polizas',
            df['hist_sin_poliza_otro_count'] /
            df['hist_sin_poliza_otro_count_polizas'])

        # FUE UN SINIESTRO FRAUDULENTO? We check whether the id_siniestro is associated with a previously known fraudulent claim
        bl_processed = bl_processed.select('id_siniestro').dropDuplicates(
            subset=['id_siniestro'])
        bl_processed = bl_processed.withColumn('hist_sin_poliza_otro_fraude',
                                               lit(1))
        df = df.join(bl_processed, on='id_siniestro', how='left')
        df = df.withColumn(
            'hist_sin_poliza_otro_fraude',
            when(df['hist_sin_poliza_otro_fraude'].isNull(),
                 0).otherwise(df['hist_sin_poliza_otro_fraude']))

        # POR PRODUCTO: We group the product id into predefined categories from tabla_productos, which allows a
        # better classification. The product label format has to be pre-processed here so that the values match.
        types = df.select(
            'hist_siniestro_poliza_otro_id_producto').distinct().collect()
        types = [ty['hist_siniestro_poliza_otro_id_producto'] for ty in types]
        types_list = [
            when(df['hist_siniestro_poliza_otro_id_producto'] == ty,
                 1).otherwise(0).alias('d_hist_sin_poliza_otro_producto_' + ty)
            for ty in types
        ]
        df = df.select(list(df.columns) + types_list)
        df = df.drop('hist_siniestro_poliza_otro_id_producto')

        # DUMMIES: We accumulate the dummy variables to get the values at cod_filiacion level
        types = ['d_hist_sin_poliza_otro_producto_' + x for x in types]
        var_dummies = [
            "hist_siniestro_poliza_otro_bbdd",
            "hist_siniestro_poliza_otro_unidad_investigacion",
            "hist_siniestro_poliza_otro_incidencia_tecnica",
            "hist_siniestro_poliza_otro_incidencia_tecnica_positiva",
            "hist_siniestro_poliza_otro_incidencias",
            "hist_siniestro_poliza_otro_cobertura"
        ] + types
        for col in var_dummies:
            df = df.withColumn(col + '_count', sum_(df[col]).over(w))
            df = df.drop(col)

        # FECHAS: We have two dates, fecha_ocurrencia and fecha_terminado. We have to take into account claims
        # that are not finished. If the claim is not finished we impute today's date
        # and create a variable that indicates the situation.
        df = df.withColumn(
            'hist_siniestro_poliza_otro_no_terminado',
            when(df['hist_siniestro_poliza_otro_fecha_terminado'].isNull(),
                 1).otherwise(0))
        df = df.fillna({
            'hist_siniestro_poliza_otro_fecha_terminado':
            time.strftime('%Y-%m-%d')
        })

        # Claim duration: We calculate the cumulative duration and the average duration.
        df = df.withColumn(
            'hist_poliza_otro_fecha_apertura_terminado',
            datediff('hist_siniestro_poliza_otro_fecha_terminado',
                     'hist_siniestro_poliza_otro_fecha_ocurrencia'))
        df = df.withColumn(
            'hist_poliza_otro_fecha_apertura_terminado',
            sum_(df['hist_poliza_otro_fecha_apertura_terminado']).over(w))
        df = df.withColumn(
            'hist_poliza_otro_duracion_promedio_sin',
            df['hist_poliza_otro_fecha_apertura_terminado'] /
            df['hist_sin_poliza_otro_count'])

        # ULTIMO SINIESTRO DE LA POLIZA: last claim of the policy (we keep the most recent occurrence date)
        df = df.withColumnRenamed(
            'hist_siniestro_poliza_otro_fecha_ocurrencia',
            'hist_siniestro_poliza_otro_ultimo_fecha_ocurrencia')
        df = df.orderBy('hist_siniestro_poliza_otro_ultimo_fecha_ocurrencia',
                        ascending=False)

        # CARGA SINIESTRAL (claim cost)
        # Outlier: First we count the outliers per customer-claim so we can capture the intra-customer effect
        df = df.withColumnRenamed(
            'coste_del_siniestro_por_rol',
            'hist_siniestro_poliza_otro_carga_siniestral')
        df = df.fillna({'hist_siniestro_poliza_otro_carga_siniestral': 0})
        df = df.withColumn(
            'hist_siniestro_poliza_otro_carga_siniestral',
            df.hist_siniestro_poliza_otro_carga_siniestral.cast(FloatType()))

        # We build the outlier flag at claim level, then sum the outlier cases per id_siniestro_ref
        df = outliers.Outliers.outliers_mad(
            df,
            'hist_siniestro_poliza_otro_carga_siniestral',
            not_count_zero=True)
        df = df.withColumn(
            'hist_siniestro_poliza_otro_carga_siniestral_mad_outlier_count',
            sum_(df['hist_siniestro_poliza_otro_carga_siniestral_mad_outlier']
                 ).over(w))
        df = df.withColumn(
            'hist_siniestro_poliza_otro_carga_siniestral_mad_outlier_promedio',
            df['hist_siniestro_poliza_otro_carga_siniestral_mad_outlier_count']
            / df['hist_sin_poliza_otro_count'])
        df = df.drop('hist_siniestro_poliza_otro_carga_siniestral_mad_outlier')

        # We calculate the cumulative sum and the average per claim
        df = df.withColumn(
            'hist_siniestro_poliza_otro_carga_siniestral_count',
            sum_(df['hist_siniestro_poliza_otro_carga_siniestral']).over(w))
        df = df.withColumn(
            'hist_siniestro_poliza_otro_carga_siniestral_promedio',
            df['hist_siniestro_poliza_otro_carga_siniestral_count'] /
            df['hist_sin_poliza_otro_count'])

        # COBERTURAS (coverages)
        # mayor a 3: we flag more than 3 coverages as an outlier, because the mean is concentrated around 1.28
        df = df.withColumn(
            'hist_sin_poliza_otro_mayor3coberturas',
            when(df["hist_siniestro_poliza_otro_coberturas_involucradas"] > 3,
                 1).otherwise(0))
        df = df.withColumn(
            'hist_sin_poliza_otro_mayor3coberturas',
            sum_(df['hist_sin_poliza_otro_mayor3coberturas']).over(w))

        # promedio: average number of coverages per claim
        df = df.withColumn(
            'hist_sin_poliza_otro_cober_sum',
            sum_(
                df['hist_siniestro_poliza_otro_coberturas_involucradas']).over(
                    w))
        df = df.withColumn(
            'hist_sin_poliza_otro_cober_promedio',
            df["hist_sin_poliza_otro_cober_sum"] /
            df['hist_sin_poliza_otro_count'])

        # pagas-cubiertas: We calculate this at the customer cumulative level and not at the claim level
        df = df.withColumn(
            'hist_siniestro_poliza_otro_coberturas_involucradas_pagadas_sum',
            sum_(df[
                'hist_siniestro_poliza_otro_coberturas_involucradas_pagadas']).
            over(w))
        df = df.withColumn(
            'hist_sin_poliza_otro_pagas_cubiertas',
            df["hist_siniestro_poliza_otro_coberturas_involucradas_pagadas_sum"]
            / df['hist_sin_poliza_otro_cober_sum'])

        # no pagas: Here we work at the claim level, counting claims where no coverage was paid
        df = df.withColumn(
            'hist_sin_poliza_otro_cob_no_pagas',
            when(
                df['hist_siniestro_poliza_otro_coberturas_involucradas_pagadas']
                == 0, 1).otherwise(0))
        df = df.withColumn(
            'hist_sin_poliza_otro_cob_no_pagas',
            sum_(df['hist_sin_poliza_otro_cob_no_pagas']).over(w))

        # DELETE VARIABLES: We delete variables that are not relevant or have been transformed
        del_variables = [
            'hist_siniestro_poliza_otro_id_poliza',
            'hist_siniestro_poliza_otro_id_producto',
            'hist_siniestro_poliza_otro_version',
            'hist_siniestro_poliza_otro_id_siniestro',
            'hist_siniestro_poliza_otro_fecha_terminado',
            'hist_siniestro_poliza_otro_bbdd',
            'hist_siniestro_poliza_otro_unidad_investigacion',
            'hist_siniestro_poliza_otro_incidencia_tecnica',
            'hist_siniestro_poliza_otro_incidencia_tecnica_positiva',
            'hist_siniestro_poliza_otro_incidencias',
            'hist_siniestro_poliza_otro_cobertura',
            'hist_siniestro_poliza_otro_carga_siniestral',
            'hist_siniestro_poliza_otro_coberturas_involucradas',
            'hist_siniestro_poliza_otro_coberturas_involucradas_pagadas',
            'id_fiscal', 'hist_sin_poliza_otro_count_version',
            'Agrupación productos', 'Producto',
            'auditFechaAperturaSiniestroReferencia', 'cliente_codfiliacion',
            'audit_siniestro_codigo_compania', 'id_siniestro'
        ]

        df = df.drop(*del_variables)
        df = df.withColumnRenamed('id_siniestro_ref', 'id_siniestro')
        df = df.dropDuplicates(subset=['id_siniestro'])

        # OUTLIER: We compute outliers on the claims/policies ratio.
        df = outliers.Outliers.outliers_mad(
            df,
            'hist_siniestro_poliza_otro_siniestros_polizas',
            not_count_zero=False)

        if self._is_diario:
            df = df.filter(df['TEST'] == 1)
            df = df.drop('TEST')

        return df
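The pattern that repeats throughout this method is an unbounded window partitioned by the reference claim, over which count_ and sum_ turn row-level values into customer-level cumulative features. A minimal, self-contained sketch of that pattern with toy column names (not the project's schema):

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import count as count_, sum as sum_, lit

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(1, 10.0), (1, 5.0), (2, 7.0)],
                            ['id_ref', 'cost'])

# Every row sees all rows of its id_ref group.
w = (Window.partitionBy('id_ref')
           .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))

toy = toy.withColumn('n_claims', count_(lit(1)).over(w))
toy = toy.withColumn('total_cost', sum_('cost').over(w))
toy.show()  # id_ref=1 rows get n_claims=2, total_cost=15.0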
Code example #2
File: etl.py  Project: sebalp1987/aller_media_test
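This method also omits the imports of its module. Based on the functions it calls, the file header presumably contains something like the following (aliases and the auxiliar_func module are assumptions):

import sys

from pyspark.sql import Window
from pyspark.sql.functions import (
    lit, when, udf, col, split, isnan, concat, lag, avg, to_date, month,
    dayofmonth, dayofweek, hour, minute, monotonically_increasing_id,
    approx_count_distinct, count as count_, sum as sum_,
    min as min_, max as max_)
from pyspark.sql.types import IntegerType, StringType

# Project helper assumed to provide time_to_num() and num_to_time().
import auxiliar_func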
    def _transform(self, df, auxiliar_train):

        if not self.train_file:
            auxiliar_train = auxiliar_train.drop('WinningBid')
            auxiliar_train = auxiliar_train.withColumn('test', lit(0))
            df = df.withColumn('test', lit(1))
            df = auxiliar_train.union(df)
            del auxiliar_train

        # We extract the time component to use as an index
        split_col = split(df['ApproximateDate'], ' ')
        df = df.withColumn('time', split_col.getItem(1))  # time

        # Hour Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'),
                         IntegerType())
        df = df.withColumn('hms_index', func_index(df['time']))

        # We order by UserID and time index
        df = df.orderBy(['UserID', 'hms_index'])

        # We check Null Values
        df.select([count_(when(isnan(c), c)).alias(c)
                   for c in df.columns]).show()

        # For each user, a running count of how many ads they have seen so far
        w = (Window().partitionBy(df.UserID).orderBy('time').rowsBetween(
            Window.unboundedPreceding, 0))
        df = df.withColumn('user_id_acumulative', count_(df['UserID']).over(w))

        # Number of Ads/User/Second
        df = df.withColumn('key_id',
                           concat(df['UserID'], lit(' '), df['hms_index']))
        w = (Window().partitionBy(df.key_id).orderBy('hms_index').rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

        # Number of Ads/User
        df_group = df.groupby(['key_id'
                               ]).agg(count_('key_id').alias('count_ads'))
        split_col = split(df_group['key_id'], ' ')
        df_group = df_group.withColumn('UserID', split_col.getItem(0))  # UserID
        w = (Window().partitionBy(
            df_group.UserID).orderBy('key_id').rowsBetween(
                Window.unboundedPreceding, 0))
        df_group = df_group.withColumn('number_ads_user',
                                       sum_(df_group.count_ads).over(w))
        df_group = df_group.select(['key_id', 'number_ads_user'])
        df = df.join(df_group, how='left', on='key_id')
        del df_group

        # Number of Users/Second
        w = (Window().partitionBy(df.ApproximateDate).rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_user_second',
                           approx_count_distinct(df.UserID).over(w))

        # Number of Ads/Second
        df = df.withColumn('number_ads_second',
                           count_(df.ApproximateDate).over(w))

        # Browser Dummy Transformation
        types = df.select('Browser').distinct().collect()
        types = [val['Browser'] for val in types]
        new_cols = [
            when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty)
            for ty in types
        ]
        df = df.select(df.columns + new_cols)

        # Decompose Date Variables
        df = df.withColumn('date', to_date(df['ApproximateDate']))  # date
        df = df.withColumn('month', month(df['ApproximateDate']))  # month
        df = df.withColumn('day', dayofmonth(df['ApproximateDate']))  # day
        df = df.withColumn('weekday', dayofweek(
            df['ApproximateDate']))  # weekday (Spark dayofweek: 1=Sunday ... 7=Saturday)

        df = df.withColumn('hour', hour(df['time']))  # hour
        df = df.withColumn('minute', minute(df['time']))  # minute

        # Peak Hour
        df = df.withColumn('peak6am8am',
                           when(df['hour'].between(6, 8), 1).otherwise(0))
        df = df.withColumn('peak14pm16pm',
                           when(df['hour'].between(14, 16), 1).otherwise(0))

        # Minute Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'),
                         IntegerType())
        df = df.withColumn('hm_index', func_index(df['time']))

        # Convert to time-series by Minute
        # We reduce to minutes
        df_time_serie_ads = df.select([
            'hms_index', 'hm_index', 'number_user_second', 'number_ads_second'
        ]).drop_duplicates()
        df_time_serie_user = df.select(['UserID',
                                        'hm_index']).drop_duplicates()

        # Group-by the values
        df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(
            approx_count_distinct('UserID'))
        df_time_serie_ads = df_time_serie_ads.groupBy('hm_index').agg({
            'number_ads_second':
            'sum'
        }).drop_duplicates(subset=['hm_index'])

        # Join ads-users per minute
        df_time_serie = df_time_serie_ads.join(df_time_serie_user,
                                               how='left',
                                               on='hm_index')
        del df_time_serie_ads, df_time_serie_user

        # Rename columns
        df_time_serie = df_time_serie.withColumnRenamed(
            'sum(number_ads_second)', 'number_ads_minute').withColumnRenamed(
                'approx_count_distinct(UserID)', 'number_user_minute')

        # Resample Range of Minutes
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hm_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hm_index'))).limit(1).collect()[0][0] + 1, 1))

        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hm_index).drop(
                *['hm_index']).fillna(0)

        # Create Lags By Minutes
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_min_lag > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_minute').over(w).alias(
                    'ar1_number_user_minute'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_minute').over(w).alias(
                    'ar1_number_ads_minute'))

            if self.ar_min_lag > 1:
                for l in range(2, self.ar_min_lag + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_minute').over(
                            w).alias('ar' + str(l) + '_number_user_minute'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_minute').over(
                            w).alias('ar' + str(l) + '_number_ads_minute'))

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.dropna()

        # join and remove lag Null values of the first minute
        df = df.orderBy(['UserID', 'hms_index'])
        df = df.join(df_time_serie.orderBy(['hm_index']),
                     how='left',
                     on=df.hm_index == df_time_serie.value).drop('value')

        # Convert to time-series and resample by Seconds
        df_time_serie = df.select(
            ['hms_index', 'number_user_second',
             'number_ads_second']).drop_duplicates()
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hms_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hms_index'))).limit(1).collect()[0][0] + 1, 1))
        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hms_index).drop(
                *['hms_index']).fillna(0)

        # Create lags
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_lags > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_second').over(w).alias(
                    'ar1_number_user_second'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_second').over(w).alias(
                    'ar1_number_ads_second'))

            if self.ar_lags > 1:
                for l in range(2, self.ar_lags + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_second').over(
                            w).alias('ar' + str(l) + '_number_user_second'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_second').over(
                            w).alias('ar' + str(l) + '_number_ads_second'))

        # Create Moving Average
        if self.ma_ss_lag is not None:

            # Get hour from index
            func_index = udf(lambda x: auxiliar_func.num_to_time(x),
                             StringType())
            df_time_serie = df_time_serie.withColumn(
                'time', func_index(df_time_serie['value']))

            # minute MA terms (Average per second last xx seconds)
            if self.ma_ss_lag is not None:
                for lag_val in self.ma_ss_lag:
                    # range to take into account
                    w = (Window.orderBy(df_time_serie['value']).rangeBetween(
                        -lag_val, 0))
                    # MA variables
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        avg('number_user_second').over(w))
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        avg('number_ads_second').over(w))

                    # Increasing ID
                    df_time_serie = df_time_serie.withColumn(
                        'rn', monotonically_increasing_id())

                    # Replace first values by Null
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_user_second']))

                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_ads_second']))

                    # Get the average by Minute
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_user_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_user_second'] * 60)
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_ads_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_ads_second'] * 60)
                df_time_serie = df_time_serie.drop(*['rn'])

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.drop(
            *['time', 'number_user_second', 'number_ads_second']).dropna()
        # join and remove lag Null values of the first minute
        df = df.join(
            df_time_serie.orderBy(['value']),
            how='left',
            on=df.hms_index == df_time_serie.value).drop('value').dropna()

        if self.train_file and not self.variable_analysis:
            df = df.select([
                'key_id', 'hms_index', 'number_ads_user', 'number_user_second',
                'number_ads_second', 'number_ads_user_second', 'peak6am8am',
                'peak14pm16pm', 'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')] +
                           ['WinningBid'])

        if not self.train_file:
            df = df.filter(df['test'] == 1)
            df = df.select([
                'UserID', 'key_id', 'number_ads_user', 'hms_index',
                'number_user_second', 'number_ads_second',
                'number_ads_user_second', 'peak6am8am', 'peak14pm16pm',
                'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')])

        df = df.orderBy(['hms_index', 'UserID'])
        df.show()
        return df
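The autoregressive and moving-average features above are built by repeatedly applying lag() and avg() over windows ordered by the time index. A compact sketch of the same idea on a toy one-column series (names illustrative only):

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import lag, avg, col

spark = SparkSession.builder.getOrCreate()
ts = spark.createDataFrame([(t, float(t * 2)) for t in range(6)],
                           ['t', 'y'])

w = Window.orderBy(col('t'))              # one ordered partition
for k in range(1, 3):                     # ar1_y, ar2_y
    ts = ts.withColumn('ar%d_y' % k, lag('y', k).over(w))

# Moving average over the current and the two previous time steps.
w_ma = Window.orderBy(col('t')).rangeBetween(-2, 0)
ts = ts.withColumn('ma3_y', avg('y').over(w_ma))
ts.dropna().show()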
Code example #3
def checklist5(df_reserva, df_id, df_reserva_new=None, df_id_new=None):
    """
    Two theft claims involving jewellery (JOYAS) for the same insured.
    :return: Returns a DataFrame with the columns 'id_siniestro', 'checklist5_poliza', 'checklist5_nif', where
    'checklist5_*' counts how many claims (by policy / NIF) belong to the JOYAS coverage
    """
    exprs = [
        df_id[column].alias(column.replace('"', ''))
        for column in df_id.columns
    ]
    df_id = df_id.select(*exprs)
    exprs = [
        df_id[column].alias(column.replace(' ', ''))
        for column in df_id.columns
    ]
    df_id = df_id.select(*exprs)

    df_reserva = df_reserva.select(
        ['id_siniestro', 'id_poliza', 'po_res_cobertura'])
    df_id = df_id.select(['id_siniestro', 'id_fiscal'])
    if df_reserva_new is not None:
        df_reserva_new = df_reserva_new.select(
            ['id_siniestro', 'id_poliza', 'po_res_cobertura'])
        df_reserva = df_reserva.union(df_reserva_new)

    df_reserva = df_reserva.dropDuplicates(
        subset=['id_siniestro', 'po_res_cobertura'])
    df_reserva = df_reserva.withColumn(
        'po_res_cobertura',
        when(df_reserva['po_res_cobertura'].contains('JOY'),
             'INCIDENCIA').otherwise(df_reserva['po_res_cobertura']))
    df_reserva = df_reserva.withColumn(
        'po_res_cobertura',
        when(df_reserva['po_res_cobertura'].contains('ESPECIAL'),
             'INCIDENCIA').otherwise(df_reserva['po_res_cobertura']))
    df_reserva = df_reserva.filter(
        df_reserva['po_res_cobertura'] == 'INCIDENCIA')

    # We merge with the ID table by claim
    if df_id_new is not None:
        exprs = [
            df_id_new[column].alias(column.replace('"', ''))
            for column in df_id_new.columns
        ]
        df_id_new = df_id_new.select(*exprs)
        exprs = [
            df_id_new[column].alias(column.replace(' ', ''))
            for column in df_id_new.columns
        ]
        df_id_new = df_id_new.select(*exprs)
        df_id_new = df_id_new.select(['id_siniestro', 'id_fiscal'])
        df_id = df_id.union(df_id_new)

    df_reserva = df_reserva.withColumn(
        'id_siniestro', df_reserva.id_siniestro.cast(IntegerType()))
    df_id = df_id.withColumn('id_siniestro',
                             df_id.id_siniestro.cast(IntegerType()))

    reserva_cobertura = df_reserva.join(df_id, 'id_siniestro', how='left')

    # We calculate the COUNT of JOYAS
    reserva_cobertura = reserva_cobertura.dropDuplicates(
        subset=['id_siniestro'])

    # Now that we have the values by claim, we group by id_poliza and by nif
    w_poliza = (Window().partitionBy('id_poliza').rowsBetween(
        -sys.maxsize, sys.maxsize))
    w_nif = (Window().partitionBy('id_fiscal').rowsBetween(
        -sys.maxsize, sys.maxsize))
    reserva_cobertura = reserva_cobertura.withColumn(
        'checklist5_poliza',
        count_(reserva_cobertura['id_poliza']).over(w_poliza))
    reserva_cobertura = reserva_cobertura.withColumn(
        'checklist5_nif',
        count_(reserva_cobertura['id_fiscal']).over(w_nif))

    reserva_cobertura = reserva_cobertura.drop(
        *['id_poliza', 'id_fiscal', 'po_res_cobertura'])

    return reserva_cobertura
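A hedged usage sketch: in a batch run the helper would be called with just the historical tables, while in a daily run the *_new extracts are passed as well and appended inside the function (the variable names below are placeholders):

# Batch (historical) run:
checklist5_df = checklist5(df_reserva, df_id)

# Daily run, also folding in the newly arrived extracts:
checklist5_df = checklist5(df_reserva, df_id,
                           df_reserva_new=df_reserva_new,
                           df_id_new=df_id_new)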
Code example #4
File: fecha.py  Project: sebalp1987/fraud_detection
    def transform_data(self, df, df_reserva, df_reserva_new, df_fecha,
                       init_date_new_, init_date_historic_):
        """Transform original dataset.

        :param df: Input DataFrame.
        :param df_reserva: Historical reserva DataFrame.
        :param df_reserva_new: New (daily) reserva DataFrame.
        :param df_fecha: Fecha DataFrame used by the checklist helpers.
        :param init_date_new_: Minimum date for new claims.
        :param init_date_historic_: Earliest occurrence date kept for historical claims.
        :return: Transformed DataFrame.
        """
        # Cast key variables and rename headers
        df = df.withColumnRenamed('auditCodigoSiniestroReferencia',
                                  'id_siniestro')
        df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))

        # CONSERVED VARIABLES: We keep only well-defined variables and drop the ones that are badly defined.
        var_conserved = [
            "id_siniestro", 'id_poliza', 'version_poliza',
            "fecha_poliza_emision", "fecha_poliza_efecto_natural",
            "fecha_poliza_efecto_mvto", "fecha_poliza_vto_movimiento",
            "fecha_poliza_vto_natural", "fecha_siniestro_ocurrencia",
            'fecha_siniestro_comunicacion', "fecha_primera_visita_peritaje",
            "fecha_ultima_visita_peritaje"
        ]

        df = df.select(*var_conserved)

        # We fill siniestro_comunicacion with siniestro_ocurrencia
        df = df.withColumn(
            'fecha_siniestro_comunicacion',
            coalesce('fecha_siniestro_comunicacion',
                     'fecha_siniestro_ocurrencia'))

        # STRIP dates: YEAR, MONTH, WEEKDAY, DAY
        var_fecha = [
            "fecha_poliza_emision", "fecha_poliza_efecto_natural",
            "fecha_poliza_efecto_mvto", "fecha_poliza_vto_movimiento",
            "fecha_poliza_vto_natural", "fecha_siniestro_ocurrencia",
            'fecha_primera_visita_peritaje', 'fecha_ultima_visita_peritaje',
            'fecha_siniestro_comunicacion'
        ]

        func = udf(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'),
                   DateType())

        for col in var_fecha:
            year_name = str(col) + '_year'
            month_name = str(col) + '_month'
            day_name = str(col) + '_day'
            weekday_name = str(col) + '_weekday'
            df = df.fillna({col: '1900/01/01'})
            df = df.withColumn(col, func(df[col]))
            df = df.withColumn(
                col,
                when(df[col] == '1900-01-01', None).otherwise(df[col]))
            df = df.withColumn(year_name, year(df[col]))
            df = df.withColumn(month_name, month(df[col]))
            df = df.withColumn(day_name, dayofmonth(df[col]))
            df = df.withColumn(weekday_name,
                               date_format(col, 'u') -
                               1)  # We adapt to (0=Monday, 1=Tuesday...)
            df = df.withColumn(weekday_name,
                               df[weekday_name].cast(IntegerType()))

        # Filtering by INIT_DATE parameter
        df = df.filter(df['fecha_siniestro_ocurrencia'] >= init_date_historic_)

        # CHECKLIST 6a
        df = df.withColumn('checklist6a', lit(0))
        df = df.withColumn('checklist6a_PP', lit(0))

        # CHECKLIST 6b
        if self._is_diario:
            # Filtering new Claims INIT_DATE
            df = df.filter(
                df['fecha_siniestro_comunicacion'] >= init_date_new_)
            auxiliar_list = checklist_spark.checklist6b(
                df, df_fecha, df_reserva_new, df_reserva)

        else:
            auxiliar_list = checklist_spark.checklist6b(
                None, df, None, df_reserva)

        if auxiliar_list:
            r = Row('id_siniestro_c', 'checklist_6b')
            df_claims = self._spark.createDataFrame(
                r(i, x) for i, x in auxiliar_list)
            df = df.join(df_claims,
                         df.id_siniestro == df_claims.id_siniestro_c,
                         how='left')
            del df_claims, r, auxiliar_list

            df = df.drop('id_siniestro_c')
            df = df.fillna({'checklist_6b': 0})
        else:
            df = df.withColumn('checklist_6b', lit(0))

        # CHECKLIST 7
        if self._is_diario:
            auxiliar_list = checklist_spark.checklist_7(
                df, df_fecha, df_reserva_new, df_reserva)
        else:
            auxiliar_list = checklist_spark.checklist_7(
                None, df, None, df_reserva)

        if auxiliar_list:
            r = Row('id_siniestro', 'checklist_7')
            df_claims = self._spark.createDataFrame(
                r(i, x) for i, x in auxiliar_list)
            del auxiliar_list, r

            df = df.join(df_claims, on='id_siniestro', how='left')
            del df_claims
            df = df.drop('id_siniestro_c')
            df = df.fillna({'checklist_7': 0})
        else:
            df = df.withColumn('checklist_7', lit(0))

        # CHECKLIST 14
        if self._is_diario:
            auxiliar_list = checklist_spark.checklist_14(
                df, df_fecha, df_reserva_new, df_reserva)
        else:
            auxiliar_list = checklist_spark.checklist_14(
                None, df, None, df_reserva)

        if auxiliar_list:
            r = Row('id_siniestro_c', 'checklist_14')
            df_claims = self._spark.createDataFrame(
                r(i, x) for i, x in auxiliar_list)

            w = (Window().partitionBy(df_claims.id_siniestro_c).rowsBetween(
                -sys.maxsize, sys.maxsize))
            df_claims = df_claims.withColumn(
                'checklist_14_coberturas_repetidas',
                sum_(df_claims.checklist_14).over(w))
            df_claims = df_claims.withColumn(
                'checklist_14_siniestros_involucrados',
                count_(df_claims.checklist_14).over(w))
            df_claims = df_claims.dropDuplicates(subset=['id_siniestro_c'])
            df_claims = df_claims.drop('checklist_14')
            df = df.join(df_claims,
                         df.id_siniestro == df_claims.id_siniestro_c,
                         how='left')
            del df_claims, r, auxiliar_list
            df = df.drop('id_siniestro_c')
            df = df.fillna({'checklist_14_coberturas_repetidas': 0})
            df = df.fillna({'checklist_14_siniestros_involucrados': 0})
        else:
            df = df.withColumn('checklist_14_coberturas_repetidas', lit(0))
            df = df.withColumn('checklist_14_siniestros_involucrados', lit(0))

        # COMPLEX / NON-COMPLEX VARIABLES: We define two groups of dates. For the dates we want in more detail we
        # generate every possible variable; the non-complex dates get more aggregated variables.
        var_fecha_complex = ["fecha_siniestro_ocurrencia"]
        var_fecha_less_complex = [
            "fecha_poliza_efecto_natural", "fecha_poliza_vto_natural"
        ]

        for i in var_fecha_complex:
            # We create dummies
            col_names = [
                str(i) + '_year',
                str(i) + '_month',
                str(i) + '_weekday'
            ]
            for col in col_names:
                types = df.select(col).distinct().collect()
                types = [ty[col] for ty in types]
                type_list = [
                    when(df[col] == ty,
                         1).otherwise(0).alias('d_' + col + '_' + str(ty))
                    for ty in types
                ]
                df = df.select(list(df.columns) + type_list)

            # days range
            day = str(i) + '_day'
            df = df.withColumn(day + '1_10',
                               when(df[day].between(1, 10), 1).otherwise(0))
            df = df.withColumn(day + '10_20',
                               when(df[day].between(11, 20), 1).otherwise(0))
            df = df.withColumn(day + '20_30',
                               when(df[day].between(21, 31), 1).otherwise(0))

        for i in var_fecha_less_complex:
            # month in holiday
            df = df.withColumn(
                str(i) + '_month_holiday',
                when(df[str(i) + '_month'].isin([1, 8, 12]), 1).otherwise(0))

            # days range
            day = str(i) + '_day'
            df = df.withColumn(day + '1_10',
                               when(df[day].between(1, 10), 1).otherwise(0))
            df = df.withColumn(day + '10_20',
                               when(df[day].between(11, 20), 1).otherwise(0))
            df = df.withColumn(day + '20_30',
                               when(df[day].between(21, 31), 1).otherwise(0))

            # weekend or monday (after the 0=Monday adjustment, Saturday=5 and Sunday=6)
            df = df.withColumn(
                str(i) + '_weekday_weekend',
                when(df[str(i) + '_weekday'].isin([5, 6]), 1).otherwise(0))
            df = df.withColumn(
                str(i) + '_weekday_monday',
                when(df[str(i) + '_weekday'] == 0, 1).otherwise(0))

        # FIRST DELETE: We delete variables generated above that are not relevant or are
        # too specific.

        del_variables = [
            'fecha_poliza_emision_year', 'fecha_poliza_emision_month',
            'fecha_poliza_emision_day', 'fecha_poliza_emision_weekday',
            'fecha_poliza_efecto_natural_year',
            'fecha_poliza_efecto_natural_month',
            'fecha_poliza_efecto_natural_day',
            'fecha_poliza_efecto_natural_weekday',
            'fecha_poliza_efecto_mvto_year', 'fecha_poliza_efecto_mvto_month',
            'fecha_poliza_efecto_mvto_day', 'fecha_poliza_efecto_mvto_weekday',
            'fecha_poliza_vto_movimiento_year',
            'fecha_poliza_vto_movimiento_month',
            'fecha_poliza_vto_movimiento_day',
            'fecha_poliza_vto_movimiento_weekday',
            'fecha_poliza_vto_natural_year', 'fecha_poliza_vto_natural_month',
            'fecha_poliza_vto_natural_day', 'fecha_poliza_vto_natural_weekday',
            'fecha_siniestro_ocurrencia_year',
            'fecha_siniestro_ocurrencia_month',
            'fecha_siniestro_ocurrencia_day',
            'fecha_siniestro_ocurrencia_weekday',
            'fecha_primera_visita_peritaje_year',
            'fecha_primera_visita_peritaje_month',
            'fecha_primera_visita_peritaje_day',
            'fecha_primera_visita_peritaje_weekday',
            'fecha_ultima_visita_peritaje_year',
            'fecha_ultima_visita_peritaje_month',
            'fecha_ultima_visita_peritaje_day',
            'fecha_ultima_visita_peritaje_weekday',
            'fecha_siniestro_comunicacion_year',
            'fecha_siniestro_comunicacion_month',
            'fecha_siniestro_comunicacion_day',
            'fecha_siniestro_comunicacion_weekday', 'id_poliza',
            'hogar_poblacion', 'version_poliza'
        ]
        df = df.drop(*del_variables)

        # FECHAS LOGICAS: We create several derived date variables that can be relevant for fraud analysis.
        # Difference between the first policy issue date and the last natural expiry date
        df = df.withColumn(
            'fecha_diferencia_vto_emision',
            datediff(df['fecha_poliza_vto_natural'],
                     df['fecha_poliza_emision']))

        # if fecha efecto < fecha emision => d = 1
        df = df.withColumn(
            'fecha_indicador_efecto_emision',
            when(
                df['fecha_poliza_emision'] > df['fecha_poliza_efecto_natural'],
                1).otherwise(0))

        # Difference between claim occurrence and policy effect date: 5, 15, 30 day flags
        df = df.withColumn(
            'fecha_diferencia_siniestro_efecto',
            datediff(df['fecha_siniestro_ocurrencia'],
                     df['fecha_poliza_efecto_natural']))
        days_var = [5, 15, 30]
        for col in days_var:
            df = df.withColumn(
                'fecha_diferencia_siniestro_efecto_' + str(col),
                when(df['fecha_diferencia_siniestro_efecto'] <= col,
                     1).otherwise(0))

        # Difference between claim occurrence and first issue date: 5, 15, 30 day flags
        df = df.withColumn(
            'fecha_diferencia_siniestro_emision',
            datediff(df['fecha_siniestro_ocurrencia'],
                     df['fecha_poliza_emision']))
        for col in days_var:
            df = df.withColumn(
                'fecha_diferencia_siniestro_emision_' + str(col),
                when(df['fecha_diferencia_siniestro_emision'] <= col,
                     1).otherwise(0))

        # Difference between claim occurrence and natural expiry date: 5, 15, 30 day flags
        df = df.withColumn(
            'fecha_diferencia_siniestro_vto_natural',
            datediff(df['fecha_poliza_vto_natural'],
                     df['fecha_siniestro_ocurrencia']))
        for col in days_var:
            df = df.withColumn(
                'fecha_diferencia_siniestro_vto_natural_' + str(col),
                when(df['fecha_diferencia_siniestro_vto_natural'] <= col,
                     1).otherwise(0))

        # If the communication date is 7 or more days after the occurrence date, d = 1
        df = df.withColumn(
            'fecha_diferencia_siniestro_comunicacion',
            datediff(df['fecha_siniestro_comunicacion'],
                     df['fecha_siniestro_ocurrencia']))
        df = df.withColumn(
            'fecha_diferencia_comunicacion_outlier',
            when(df['fecha_diferencia_siniestro_comunicacion'] >= 7,
                 1).otherwise(0))
        df = df.drop('fecha_siniestro_comunicacion')

        df = df.dropDuplicates(subset=['id_siniestro'])

        return df
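The checklist blocks above all follow the same round trip: a helper returns a plain Python list of (claim id, flag) pairs, which is turned into a two-column DataFrame via Row and left-joined back onto df, with missing flags filled with 0. A minimal sketch of that round trip (the checklist name and values are illustrative):

from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ['id_siniestro'])

auxiliar_list = [(1, 1), (3, 1)]              # (claim id, flag) pairs
r = Row('id_siniestro_c', 'checklist_x')
df_claims = spark.createDataFrame(r(i, x) for i, x in auxiliar_list)

df = df.join(df_claims, df.id_siniestro == df_claims.id_siniestro_c,
             how='left').drop('id_siniestro_c')
df = df.fillna({'checklist_x': 0})
df.show()  # ids 1 and 3 get flag 1, id 2 gets 0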
Code example #5
    def _transform_data(self, df, df_base, bl_processed):
        """Transform original dataset.

        :param df: Input DataFrame.
        :param df_base: Historical base DataFrame appended in daily (diario) mode.
        :param bl_processed: DataFrame of claims previously flagged as fraudulent.
        :return: Transformed DataFrame.
        """

        if self._is_diario:
            df = df.withColumn('TEST', lit(1))
            df_base = df_base.withColumn('TEST', lit(0))
            df = df.union(df_base)

        # Cast key variables and rename headers
        df = df.withColumnRenamed('auditCodigoSiniestroReferencia', 'id_siniestro_ref')
        df = df.withColumn('id_siniestro_ref', df.id_siniestro_ref.cast(IntegerType()))
        df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))
        df = df.dropna(subset=['id_siniestro_ref'])
        df = df.dropna(subset=['id_siniestro'])

        # DATE VARIABLES FORMAT
        fecha_variables = ['hist_siniestro_otro_fecha_ocurrencia', 'hist_siniestro_fecha_terminado']
        func = udf(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'), DateType())
        for col in fecha_variables:
            df = df.fillna({col: '1900/01/01'})
            df = df.withColumn(col, func(df[col]))
            df = df.withColumn(col, when(df[col] == '1900-01-01', None).otherwise(df[col]))
            df = df.filter(df[col] <= time.strftime('%Y-%m-%d'))

        # COUNT ID_SINIESTRO_REF: how many claims each reference claim has
        df = df.withColumn('hist_sin_otros_count_version', lit(1))
        w = (Window().partitionBy(df.id_siniestro_ref).rowsBetween(-sys.maxsize, sys.maxsize))
        df = df.withColumn('hist_sin_otros_count', count_(df.hist_sin_otros_count_version).over(w))

        # SINIESTRO TERMINADO: We transform the categorical variable siniestro_sit into dummy variables
        types = df.select('hist_siniestro_otro_sit').distinct().collect()
        types = [ty['hist_siniestro_otro_sit'] for ty in types]
        type_list = [when(df['hist_siniestro_otro_sit'] == ty, 1).otherwise(0).alias('d_hist_siniestro_otro_sit_' + ty)
                     for ty in types]
        df = df.select(list(df.columns) + type_list)

        # CUMULATIVE DUMMIES
        types = ['d_hist_siniestro_otro_sit_' + x for x in types]
        var_dummies = ["hist_siniestro_otro_rehusado", "hist_siniestro_otro_bbdd",
                       "hist_siniestro_otro_unidad_investigacion", "hist_siniestro_otro_incidencia_tecnica",
                       "hist_siniestro_otro_incidencia_tecnica_positiva", "hist_siniestro_otro_incidencias",
                       "hist_siniestro_otro_cobertura", "hist_siniestro_otro_rehabilitado"] + types
        for col in var_dummies:
            df = df.withColumn(col + '_count', sum_(df[col]).over(w))
            df = df.drop(col)

        # DATE VARIABLES
        # Duration = fecha_terminado - fecha_ocurrencia
        df = df.withColumn('hist_otros_fecha_apertura_terminado', datediff('hist_siniestro_fecha_terminado',
                                                                           'hist_siniestro_otro_fecha_ocurrencia'))

        df = df.withColumn('hist_otros_fecha_apertura_terminado',
                           sum_(df['hist_otros_fecha_apertura_terminado']).over(w))

        # Average duration
        df = df.withColumn('hist_otros_duracion_promedio_sin', df['hist_otros_fecha_apertura_terminado'] /
                           df['hist_sin_otros_count'])

        # Last claim of the policy: we keep the first row, which is the most recent claim.
        df = df.withColumnRenamed('hist_siniestro_otro_fecha_ocurrencia', 'hist_siniestro_otro_ultimo_fecha_ocurrencia')
        df = df.orderBy('hist_siniestro_otro_ultimo_fecha_ocurrencia', ascending=False)

        # FUE UN SINIESTRO FRAUDULENTO? We check whether the id_siniestro is associated with a previously known fraudulent claim
        bl_processed = bl_processed.select('id_siniestro').dropDuplicates(subset=['id_siniestro'])
        bl_processed = bl_processed.withColumn('hist_sin_otro_fraude_count', lit(1))
        df = df.join(bl_processed, on='id_siniestro', how='left')
        df = df.withColumn('hist_sin_otro_fraude_count', when(df['hist_sin_otro_fraude_count'].isNull(), 0).otherwise(
            df['hist_sin_otro_fraude_count']))
        df = df.withColumn('hist_sin_otro_fraude_count', sum_(df['hist_sin_otro_fraude_count']).over(w))

        # CARGA SINIESTRAL (claim cost)
        df = df.withColumnRenamed('coste_del_siniestro_por_rol', 'hist_siniestro_carga_siniestral')
        df = df.fillna({'hist_siniestro_carga_siniestral': 0})
        df = df.withColumn('hist_siniestro_carga_siniestral', df.hist_siniestro_carga_siniestral.cast(FloatType()))

        # We build the outlier flag at claim level, then sum the outlier cases per id_siniestro_ref
        df = outliers.Outliers.outliers_mad(df, 'hist_siniestro_carga_siniestral', not_count_zero=True)
        df = df.withColumn('hist_siniestro_carga_siniestral_mad_outlier_count',
                           sum_(df['hist_siniestro_carga_siniestral_mad_outlier']).over(w))

        # Total sum: we sum the total claim cost
        df = df.withColumn('hist_siniestro_carga_siniestral_sum', sum_(df['hist_siniestro_carga_siniestral']).over(w))

        # promedio (average per claim)
        df = df.withColumn('hist_sin_carga_siniestral_promedio', df['hist_siniestro_carga_siniestral_sum']
                           / df['hist_sin_otros_count'])

        # COBERTURAS
        # Outliers
        df = outliers.Outliers.outliers_mad(df, 'hist_siniestro_coberturas_involucradas', not_count_zero=True)
        df = df.withColumn('hist_siniestro_coberturas_involucradas_mad_outlier',
                           when(df['hist_siniestro_coberturas_involucradas'] > 3, 1).otherwise(0))
        df = df.withColumn('hist_siniestro_coberturas_involucradas_mad_outlier_count',
                           sum_(df['hist_siniestro_coberturas_involucradas_mad_outlier']).over(w))
        df = df.drop('hist_siniestro_coberturas_involucradas_mad_outlier')

        # promedio (average per claim)
        df = df.withColumn('hist_sin_otros_cober_sum', sum_(df['hist_siniestro_coberturas_involucradas']).over(w))
        df = df.withColumn('hist_sin_otros_cober_promedio', df['hist_sin_otros_cober_sum'] / df['hist_sin_otros_count'])

        # pagas-cubiertas
        df = df.withColumn('hist_siniestro_coberturas_involucradas_pagadas_sum',
                           sum_(df['hist_siniestro_coberturas_involucradas_pagadas']).over(w))
        df = df.withColumn('hist_sin_otros_pagas_cubiertas',
                           df['hist_siniestro_coberturas_involucradas_pagadas_sum'] / df['hist_sin_otros_cober_sum'])


        # no-pagas
        df = df.withColumn('hist_sin_otros_cob_no_pagas', df['hist_siniestro_coberturas_involucradas'] -
                           df['hist_siniestro_coberturas_involucradas_pagadas'])
        df = df.withColumn('hist_sin_otros_cob_no_pagas', sum_(df['hist_sin_otros_cob_no_pagas']).over(w))

        # DELETE VARIABLES
        del_variables = ['id_fiscal', 'hist_siniestro_otro_descripcion', "hist_siniestro_duracion",
                         'hist_siniestro_fecha_terminado', 'hist_sin_otros_count_version',
                         'hist_siniestro_otro_oficina_productora',
                         'hist_siniestro_carga_siniestral', 'hist_siniestro_coberturas_involucradas',
                         'hist_siniestro_coberturas_involucradas_pagadas',
                         'hist_otros_fecha_apertura_terminado',
                         'auditFechaAperturaSiniestroReferencia', 'id_siniestro', 'id_poliza', 'version_poliza',
                         'audit_siniestro_producto_tecnico',
                         'audit_siniestro_codigo_compania', 'hist_siniestro_otro_sit'
                         ]

        df = df.drop(*del_variables)
        df = df.withColumnRenamed('id_siniestro_ref', 'id_siniestro')

        # We keep the first row for each claim
        df = df.dropDuplicates(subset=['id_siniestro'])

        if self._is_diario:
            df = df.filter(df['TEST'] == 1)
            df = df.drop('TEST')

        return df
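outliers.Outliers.outliers_mad is a project helper that does not appear in this listing; from the way it is called it appends a 0/1 column named '<column>_mad_outlier'. A hypothetical sketch of such a median-absolute-deviation flag in PySpark is given below; the real helper, including the exact meaning of not_count_zero, may well differ:

from pyspark.sql import functions as F

def outliers_mad_sketch(df, c, threshold=3.5, not_count_zero=True):
    """Flag rows whose modified z-score for column c exceeds threshold."""
    # Assumption: not_count_zero excludes zero values from the statistics.
    base = df.filter(F.col(c) != 0) if not_count_zero else df
    median = base.approxQuantile(c, [0.5], 0.001)[0]
    mad = (base.withColumn('_dev', F.abs(F.col(c) - F.lit(median)))
               .approxQuantile('_dev', [0.5], 0.001)[0])
    score = 0.6745 * F.abs(F.col(c) - F.lit(median)) / F.lit(mad or 1.0)
    return df.withColumn(c + '_mad_outlier',
                         F.when(score > threshold, 1).otherwise(0))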
Code example #6
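Code example #6 is a module-level fragment, truncated at both the start and the end. It relies on trailing-underscore aliases and on three project UDFs whose definitions are not shown; they are presumably declared roughly as follows (a sketch, the actual implementations are unknown):

from pyspark.sql import Window
from pyspark.sql.functions import (col as col_, lit as lit_, lag as lag_,
                                   count as count_, avg as avg_, max as max_)

# Project UDFs assumed to exist earlier in the original file:
#   clean_string_for_date_udf(col)   -> normalised date string
#   create_cohort_id_udf(col)        -> cohort id derived from a date
#   minutes_between_udf(col1, col2)  -> minutes elapsed between two dates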
    .withColumn("created", clean_string_for_date_udf(col_("created"))) \
    .withColumn("removed", clean_string_for_date_udf(col_("removed"))) \
    .withColumn("create_cohort", create_cohort_id_udf(col_("created"))) \
    .withColumn("remove_cohort", create_cohort_id_udf(col_("removed"))) \

create_window = Window.partitionBy().orderBy("created")
line_decay_df = line_decay_df.withColumn("prev_created", lag_(line_decay_df.created).over(create_window)) \
    .withColumn("time_between", minutes_between_udf(col_("created"), col_("prev_created")))

line_decay_df.show()



created_in_cohorts_df = line_decay_df.groupBy("create_cohort")\
    .agg( \
        count_(lit_(1)).alias("total_in_cohort"),\
        avg_(col_("time_between")).alias("avg_time_between")\
        )

removed_in_cohorts_df = line_decay_df.groupBy("create_cohort", "remove_cohort")\
    .count()\
    .withColumnRenamed("count", "removed_in_this_cohort")

lifespand_days_in_cohort_df = line_decay_df.groupBy("create_cohort", "remove_cohort")\
    .agg(max_(col_("lifespan")))\
    .withColumnRenamed("max(lifespan)", "lifespan") \

lifespand_days_in_cohort_df.show()

removed_cohorts_with_lifespan_df = removed_in_cohorts_df.join(lifespand_days_in_cohort_df, \
                                                              (removed_in_cohorts_df.create_cohort == lifespand_days_in_cohort_df.create_cohort) & \