Example #1
def glean_2(invoice_df, line_item_df):
    invoice = invoice_df.alias('invoice')
    line_item = line_item_df.alias('line_item')
    joined_table = invoice.join(line_item,
                                invoice.invoice_id == line_item.invoice_id,
                                how='left')
    glean2 = joined_table.groupBy(
        invoice.invoice_id, invoice.canonical_vendor_id, invoice.invoice_date,
        invoice.period_end_date).agg(
            max_('line_item.period_end_date').alias('max_line_end_date'))
    glean2 = glean2.withColumn(
        'end_date', funcs.greatest('max_line_end_date',
                                   invoice.period_end_date))
    glean2 = glean2.withColumn('difference',
                               funcs.datediff('end_date', 'invoice_date'))
    glean2 = glean2[glean2['difference'] > 90]
    glean2 = glean2.withColumn('glean_location', funcs.lit('invoice'))
    glean2 = glean2.withColumn("glean_type", funcs.lit('accrual_alert'))
    glean2 = glean2.withColumn(
        "glean_text",
        funcs.concat(
            funcs.lit('Line items from vendor '),
            funcs.col('canonical_vendor_id'),
            funcs.lit(' in this invoice cover future periods (through '),
            funcs.col('end_date'), funcs.lit(' )')))
    glean2 = glean2.withColumn('glean_date', funcs.col('invoice_date'))
    return glean2
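
A minimal sketch of the imports this snippet appears to assume (neither alias is shown in the excerpt):

from pyspark.sql import functions as funcs
from pyspark.sql.functions import max as max_
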
Example #2
    def main(self, sc, *args):
        spark_sql = SparkSession.builder.getOrCreate()

        raw_data = self.requires().load_dataframe(spark_sql)

        max_dates = (raw_data.groupBy().agg(max_('rent_date'),
                                            max_('return_date')).first())
        max_date = min(max_dates.asDict().values())

        testing_end_date = max_date
        validation_end_date = testing_start_date = testing_end_date - pd.Timedelta(
            self.test_period).to_pytimedelta()
        training_end_date = validation_start_date = validation_end_date - pd.Timedelta(
            self.validation_period).to_pytimedelta()

        if not self.output()['training'].exists():
            (raw_data.filter(raw_data.rent_date < training_end_date).write.csv(
                self.output()['training'].path, header='true'))

        if not self.output()['validation'].exists():
            (raw_data.filter(
                raw_data.rent_date >= validation_start_date).filter(
                    raw_data.rent_date < validation_end_date).write.csv(
                        self.output()['validation'].path, header='true'))

        if not self.output()['testing'].exists():
            (raw_data.filter(raw_data.rent_date >= testing_start_date).filter(
                raw_data.rent_date <= testing_end_date).write.csv(
                    self.output()['testing'].path, header='true'))

        with self.output()['metadata'].open('w') as f:
            json.dump(
                {
                    'training_end_date': training_end_date.isoformat(),
                    'validation_start_date': validation_start_date.isoformat(),
                    'validation_end_date': validation_end_date.isoformat(),
                    'testing_start_date': testing_start_date.isoformat(),
                    'testing_end_date': testing_end_date.isoformat(),
                },
                f,
                indent=2,
            )
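
For reference, a small sketch of the date arithmetic used above, assuming `test_period` and `validation_period` are pandas offset strings such as '30 days' (their real values are not shown in the excerpt):

import datetime

import pandas as pd

max_date = datetime.datetime(2020, 3, 31)
testing_end_date = max_date
# pd.Timedelta parses the offset string; to_pytimedelta() converts it to a plain datetime.timedelta
testing_start_date = testing_end_date - pd.Timedelta('30 days').to_pytimedelta()
validation_start_date = testing_start_date - pd.Timedelta('30 days').to_pytimedelta()
print(testing_start_date, validation_start_date)
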
Example #3
           substring(concat(lit('0'), split_date.getItem(0)), -2, 2), lit("-"),
           substring(concat(lit('0'), split_date.getItem(1)), -2, 2), lit(" "),
           concat(split_date.getItem(3), lit(':00'))))
online_retail = online_retail.withColumn(
    "InvoiceDate", online_retail.InvoiceDate.cast(TimestampType()))

## remove the null values and junk values
online_retail = online_retail.na.drop(
    subset=["CustomerID", "InvoiceDate", "UnitPrice", "Quantity"])
online_retail = online_retail.filter(online_retail.UnitPrice > 0).filter(
    online_retail.Quantity > 0)

## group by CustomerID to get the last invoice data for each customer
rfm = online_retail\
.groupBy("CustomerID")\
.agg(max_("InvoiceDate").alias('LastDate'))

## get the most recent date in the data
max_date = online_retail.agg({"InvoiceDate": "max"}).collect()[0][0]
rfm = rfm.withColumn("RecentDate", lit(max_date))

## compute the days since last invoice for each customer
rfm = rfm.withColumn("recency", datediff(col("RecentDate"), col("LastDate")))
rfm = rfm.select("CustomerID", "recency")

## group by CustomerID to get monetary and frequency
online_retail = online_retail.withColumn("TotalValue",
                                         col("Quantity") * col("UnitPrice"))
monetary = online_retail.groupBy("CustomerID").agg(
    sum("TotalValue").alias('monetary'))
freq = online_retail.groupBy("CustomerID").agg(
Example #4
    def _transform(self, df, auxiliar_train):

        if not self.train_file:
            auxiliar_train = auxiliar_train.drop('WinningBid')
            auxiliar_train = auxiliar_train.withColumn('test', lit(0))
            df = df.withColumn('test', lit(1))
            df = auxiliar_train.union(df)
            del auxiliar_train

        # We create the time as Index
        split_col = split(df['ApproximateDate'], ' ')
        df = df.withColumn('time', split_col.getItem(1))  # time

        # Hour Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'),
                         IntegerType())
        df = df.withColumn('hms_index', func_index(df['time']))

        # We order by UserId-Date
        df = df.orderBy(['UserID', 'hms_index'])

        # We check Null Values
        df.select([count_(when(isnan(c), c)).alias(c)
                   for c in df.columns]).show()

        # We create a rank of users by how many times in the past saw an ad
        w = (Window().partitionBy(df.UserID).orderBy('time').rowsBetween(
            Window.unboundedPreceding, 0))
        df = df.withColumn('user_id_acumulative', count_(df['UserID']).over(w))

        # Number of Ads/User/Second
        df = df.withColumn('key_id',
                           concat(df['UserID'], lit(' '), df['hms_index']))
        w = (Window().partitionBy(df.key_id).orderBy('hms_index').rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

        # Number of Ads/User
        df_group = df.groupby('key_id').agg(count_('key_id').alias('count_ads'))
        split_col = split(df_group['key_id'], ' ')
        df_group = df_group.withColumn('UserID', split_col.getItem(0))  # time
        w = (Window().partitionBy(
            df_group.UserID).orderBy('key_id').rowsBetween(
                Window.unboundedPreceding, 0))
        df_group = df_group.withColumn('number_ads_user',
                                       sum_(df_group.count_ads).over(w))
        df_group = df_group.select(['key_id', 'number_ads_user'])
        df = df.join(df_group, how='left', on='key_id')
        del df_group

        # Number of Users/Second
        w = (Window().partitionBy(df.ApproximateDate).rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_user_second',
                           approx_count_distinct(df.UserID).over(w))

        # Number of Ads/Second
        df = df.withColumn('number_ads_second',
                           count_(df.ApproximateDate).over(w))

        # Browser Dummy Transformation
        types = df.select('Browser').distinct().collect()
        types = [val['Browser'] for val in types]
        new_cols = [
            when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty)
            for ty in types
        ]
        df = df.select(df.columns + new_cols)

        # Decompose Date Variables
        df = df.withColumn('date', to_date(df['ApproximateDate']))  # date
        df = df.withColumn('month', month(df['ApproximateDate']))  # month
        df = df.withColumn('day', dayofmonth(df['ApproximateDate']))  # day
        df = df.withColumn('weekday', dayofweek(
            df['ApproximateDate']))  # weekday, 1 = Sunday ... 7 = Saturday

        df = df.withColumn('hour', hour(df['time']))  # hour
        df = df.withColumn('minute', minute(df['time']))  # minute

        # Peak Hour
        df = df.withColumn('peak6am8am',
                           when(df['hour'].between(6, 8), 1).otherwise(0))
        df = df.withColumn('peak14pm16pm',
                           when(df['hour'].between(14, 16), 1).otherwise(0))

        # Minute Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'),
                         IntegerType())
        df = df.withColumn('hm_index', func_index(df['time']))

        # Convert to time-series by Minute
        # We reduce to minutes
        df_time_serie_ads = df.select([
            'hms_index', 'hm_index', 'number_user_second', 'number_ads_second'
        ]).drop_duplicates()
        df_time_serie_user = df.select(['UserID',
                                        'hm_index']).drop_duplicates()

        # Group-by the values
        df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(
            approx_count_distinct('UserID'))
        df_time_serie_ads = df_time_serie_ads.groupBy('hm_index').agg({
            'number_ads_second':
            'sum'
        }).drop_duplicates(subset=['hm_index'])

        # Join ads-users per minute
        df_time_serie = df_time_serie_ads.join(df_time_serie_user,
                                               how='left',
                                               on='hm_index')
        del df_time_serie_ads, df_time_serie_user

        # Rename columns
        df_time_serie = df_time_serie.withColumnRenamed(
            'sum(number_ads_second)', 'number_ads_minute').withColumnRenamed(
                'approx_count_distinct(UserID)', 'number_user_minute')

        # Resample Range of Minutes
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hm_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hm_index'))).limit(1).collect()[0][0] + 1, 1))

        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hm_index).drop(
                *['hm_index']).fillna(0)

        # Create Lags By Minutes
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_min_lag > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_minute').over(w).alias(
                    'ar1_number_user_minute'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_minute').over(w).alias(
                    'ar1_number_ads_minute'))

            if self.ar_min_lag > 1:
                for l in range(2, self.ar_min_lag + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_minute').over(
                            w).alias('ar' + str(l) + '_number_user_minute'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_minute').over(
                            w).alias('ar' + str(l) + '_number_ads_minute'))

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.dropna()

        # join and remove lag Null values of the first minute
        df = df.orderBy(['UserID', 'hms_index'])
        df = df.join(df_time_serie.orderBy(['hm_index']),
                     how='left',
                     on=df.hm_index == df_time_serie.value).drop('value')

        # Convert to time-series and resample by Seconds
        df_time_serie = df.select(
            ['hms_index', 'number_user_second',
             'number_ads_second']).drop_duplicates()
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hms_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hms_index'))).limit(1).collect()[0][0] + 1, 1))
        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hms_index).drop(
                *['hms_index']).fillna(0)

        # Create lags
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_lags > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_second').over(w).alias(
                    'ar1_number_user_second'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_second').over(w).alias(
                    'ar1_number_ads_second'))

            if self.ar_lags > 1:
                for l in range(2, self.ar_lags + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_second').over(
                            w).alias('ar' + str(l) + '_number_user_second'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_second').over(
                            w).alias('ar' + str(l) + '_number_ads_second'))

        # Create Moving Average
        if self.ma_ss_lag is not None:

            # Get hour from index
            func_index = udf(lambda x: auxiliar_func.num_to_time(x),
                             StringType())
            df_time_serie = df_time_serie.withColumn(
                'time', func_index(df_time_serie['value']))

            # minute MA terms (Average per second last xx seconds)
            if self.ma_ss_lag is not None:
                for lag_val in self.ma_ss_lag:
                    # range to take into account
                    w = (Window.orderBy(df_time_serie['value']).rangeBetween(
                        -lag_val, 0))
                    # MA variables
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        avg('number_user_second').over(w))
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        avg('number_ads_second').over(w))

                    # Increasing ID
                    df_time_serie = df_time_serie.withColumn(
                        'rn', monotonically_increasing_id())

                    # Replace first values by Null
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_user_second']))

                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_ads_second']))

                    # Get the average by Minute
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_user_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_user_second'] * 60)
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_ads_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_ads_second'] * 60)
                df_time_serie = df_time_serie.drop(*['rn'])

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.drop(
            *['time', 'number_user_second', 'number_ads_second']).dropna()
        # join and remove lag Null values of the first minute
        df = df.join(
            df_time_serie.orderBy(['value']),
            how='left',
            on=df.hms_index == df_time_serie.value).drop('value').dropna()

        if self.train_file and not self.variable_analysis:
            df = df.select([
                'key_id', 'hms_index', 'number_ads_user', 'number_user_second',
                'number_ads_second', 'number_ads_user_second', 'peak6am8am',
                'peak14pm16pm', 'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')] +
                           ['WinningBid'])

        if not self.train_file:
            df = df.filter(df['test'] == 1)
            df = df.select([
                'UserID', 'key_id', 'number_ads_user', 'hms_index',
                'number_user_second', 'number_ads_second',
                'number_ads_user_second', 'peak6am8am', 'peak14pm16pm',
                'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')])

        df = df.orderBy(['hms_index', 'UserID'])
        df.show()
        return df
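
The autoregressive blocks above build `ar1 ... arN` columns by repeatedly applying `lag` over an ordered window. A self-contained sketch of that pattern with toy data (the column names here are illustrative only):

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import lag

spark = SparkSession.builder.getOrCreate()
ts = spark.createDataFrame([(1, 10), (2, 12), (3, 9)], ['value', 'number_user_second'])

# single ordered window; each lag column looks one row further back
w = Window.orderBy('value')
for i in range(1, 3):
    source = 'number_user_second' if i == 1 else 'ar' + str(i - 1) + '_number_user_second'
    ts = ts.withColumn('ar' + str(i) + '_number_user_second', lag(source).over(w))
ts.show()
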
Example #5
#print line_decay_df.schema

#all_lines_by_creator = line_decay_df.groupBy(col_("creator"), \
#                                             trunc_(col_("created"), 'mon').alias("cohort"))\
#    .count()\
#    .withColumnRenamed("count", "lines_created")

total_active_lines = line_decay_df.filter(col_("removed").isNull()).count()

all_lines_by_creator = line_decay_df.groupBy(col_("creator"))\
    .count()\
    .withColumnRenamed("count", "lines_created")

author_agg = line_decay_df.groupBy(col_("creator"))\
    .agg(min_(col_("created")).alias("author_first"), \
         max_(col_("created")).alias("author_last"),
         avg_(col_("lifespan")).alias("avg_lifespan"))

#author_last = line_decay_df.groupBy(col_("creator"))\
#    .agg(max_(col_("created")))\
#    .withColumnRenamed("max(created)", "author_last")

removed_lines_by_creator = line_decay_df.filter(col_("removed").isNotNull())\
    .groupBy(col_("creator"))\
    .count()\
    .withColumnRenamed("count", "lines_removed")

active_lines_by_creator = line_decay_df.filter(col_("removed").isNull())\
    .groupBy(col_("creator"))\
    .count()\
    .withColumnRenamed("count", "lines_active")
Example #6
# df_pages.show()

df_categories = df.selectExpr('id', 'title', 'explode(categories) as category',
                              'last_modify_date')
df_links = df.selectExpr('id', 'title', 'explode(page_links) as link',
                         'last_modify_date')

# df_links.show()

joined_df = df_links.alias("dl")\
    .join(df_pages.alias("pg"), col('dl.link') == col('pg.title'))\
    .selectExpr('dl.*','pg.last_modify_date as link_modify_date')\
    .filter(col('link_modify_date') > col('dl.last_modify_date'))\
    .withColumn('outdate_time',(unix_timestamp(col('link_modify_date'),TIME_FORMAT) - unix_timestamp(col('dl.last_modify_date'),TIME_FORMAT)))\
    .groupBy('title')\
    .agg(max_(col('outdate_time')).alias('max_outdate_time'))

joined_df.printSchema()
# joined_df.show()

cat_out_dated = df_categories.alias('ct')\
    .join(joined_df.alias('jd'),expr('ct.title = jd.title'))\
    .select('ct.category','ct.title','jd.max_outdate_time')

# cat_out_dated.sort('ct.category').show()

cat_out_dated_grouped = cat_out_dated\
    .withColumn("row_number",row_number().over(Window.partitionBy('ct.category').orderBy(desc('jd.max_outdate_time'))))\
    .where('row_number = 1')\
    .drop('row_number')
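
The last step keeps, for each category, the single title with the largest max_outdate_time. A self-contained sketch of that keep-top-row-per-group idiom with toy data:

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import desc, row_number

spark = SparkSession.builder.getOrCreate()
rows = [('sports', 'A', 10), ('sports', 'B', 30), ('news', 'C', 5)]
toy = spark.createDataFrame(rows, ['category', 'title', 'max_outdate_time'])

w = Window.partitionBy('category').orderBy(desc('max_outdate_time'))
top_per_category = (toy.withColumn('row_number', row_number().over(w))
                       .where('row_number = 1')
                       .drop('row_number'))
top_per_category.show()
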
Example #7
      .csv("/opt/spark-data/weather")

    # Watermark data
    watermarkedBeamformedDF = beamformedDF.withWatermark(
        "beamformedtimestamp", "1 hours")
    watermarkedWeatherDF = weatherDF.withWatermark("weathertimestamp",
                                                   "2 hours")

    # Joining dataframes
    joinedDF = watermarkedBeamformedDF.join(
        watermarkedWeatherDF,
        expr("""
      hourly_beamformedtimestamp = weathertimestamp AND
      beamformedtimestamp >= weathertimestamp AND
      beamformedtimestamp <= weathertimestamp + interval 1 hour
      """), "leftOuter")

    # Sample operation on joined data
    maxTimeStamp = joinedDF.groupby("beamformedtimestamp",
                                    "weathertimestamp").agg(
                                        max_("beamformedtimestamp"))

    # Start running the query that prints the running counts to the console
    query = maxTimeStamp \
      .writeStream \
      .outputMode('complete') \
      .format('console') \
      .start()

    query.awaitTermination()
Example #8
|  Bob|                                1|
+-----+---------------------------------+
'''
##Select maximum date i.e. latest date
#Source: https://stackoverflow.com/questions/38377894/how-to-get-maxdate-from-given-set-of-data-grouped-by-some-fields-using-pyspark

from pyspark.sql.functions import col, max as max_

df = sc.parallelize([
    ("2016-04-06 16:36", 1234, 111, 1),
    ("2016-04-06 17:35", 1234, 111, 5),
]).toDF(["datetime", "userId", "memberId", "value"])

(df.withColumn("datetime", col("datetime").cast("timestamp"))
    .groupBy("userId", "memberId")
    .agg(max_("datetime")))


#After writing a dataframe it is read back with columns _c0, _c1, _c2, ...; this script renames _c0, _c1, _c2, ... to the names given in the list li
li=[<colnames>]
st=''
for i in range(len(li)):
	st=st+".withColumnRenamed('_c"+str(i)+"','"+str(li[i])+"')"



#Typecasting to date format
#Source: https://stackoverflow.com/questions/38080748/convert-pyspark-string-to-date-format
from datetime import datetime
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
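
The excerpt is cut off after these imports; a hedged sketch of how such a string-to-date conversion is commonly finished, with placeholder dataframe and column names (the udf route matches the imports above, `to_date` is the built-in alternative):

parse_date = udf(lambda s: datetime.strptime(s, '%Y-%m-%d').date(), DateType())
df = df.withColumn('date', parse_date(col('date_string')))

# built-in alternative, no udf required
from pyspark.sql.functions import to_date
df = df.withColumn('date', to_date(col('date_string'), 'yyyy-MM-dd'))
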
Example #9
def run_spark_job(spark_args):
    """
    :param spark_args:
    :type spark_args:
    :return:
    :rtype:
    """
    storage_account_name = spark_args.storage_account_name
    input_container = spark_args.input_container_name
    output_container = spark_args.output_container_name
    input_folder = spark_args.input_folder_path
    output_folder = spark_args.output_folder_path
    application_id = spark_args.application_id
    directory_id = spark_args.directory_id
    adb_secret_scope = spark_args.adb_secret_scope_name
    adb_sp_client_key_secret_name = spark_args.adb_sp_client_key_secret_name
    database = spark_args.jdbc_database
    jdbcHost = spark_args.jdbc_host
    jdbcPort = spark_args.jdbc_port

    if storage_account_name is None:
        raise ValueError("Missing storage_account_name parameter!")
    if output_container is None:
        raise ValueError("Missing output_container_name parameter!")
    if output_folder is None:
        raise ValueError("Missing output_folder_path parameter!")
    if application_id is None:
        raise ValueError("Missing application_id parameter!")
    if directory_id is None:
        raise ValueError("Missing directory_id parameter!")
    if adb_secret_scope is None:
        raise ValueError("Missing adb_secret_scope_name parameter!")
    if adb_sp_client_key_secret_name is None:
        raise ValueError("Missing adb_sp_client_key_secret_name parameter!")
    if database is None:
        raise ValueError("Missing database parameter!")
    if jdbcHost is None:
        raise ValueError("Missing jdbcHost parameter!")
    if jdbcPort is None:
        raise ValueError("Missing jdbcPort parameter!")

    client_secret = SERVICE_PRINCIPAL_SECRET if SERVICE_PRINCIPAL_SECRET is not None else \
        dbutils.secrets.get(scope=adb_secret_scope, key=adb_sp_client_key_secret_name)

    # conf  = SparkConf().setAppName("CalendarProcesor").setMaster("local")
    spark = SparkSession.builder.master("local").getOrCreate()
    # spark = SparkSession.builder.config(conf).getOrCreate()
    # spark.conf.set("spark.app.name", "CalendarProcesor")
    spark.sparkContext.setLogLevel("ERROR")

    spark.conf.set(
        f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net",
        "OAuth")
    spark.conf.set(
        f"fs.azure.account.oauth.provider.type.{storage_account_name}.dfs.core.windows.net",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
    spark.conf.set(
        f"fs.azure.account.oauth2.client.id.{storage_account_name}.dfs.core.windows.net",
        application_id)
    spark.conf.set(
        f"fs.azure.account.oauth2.client.secret.{storage_account_name}.dfs.core.windows.net",
        client_secret)
    spark.conf.set(
        f"fs.azure.account.oauth2.client.endpoint.{storage_account_name}.dfs.core.windows.net",
        f"https://login.microsoftonline.com/{directory_id}/oauth2/token")

    spark.sparkContext.setJobGroup("Running calendar processing",
                                   "[calendar_events_attendance_extractor]")

    logger.info(
        "[calendar_events_attendance_extractor] Preparing the jobs for analyzing the calendar data"
    )
    container_client_credential = ClientSecretCredential(
        tenant_id=directory_id,
        client_id=application_id,
        client_secret=client_secret)

    blob_service_client = BlobServiceClient(
        account_url=f"https://{storage_account_name}.blob.core.windows.net",
        credential=container_client_credential)

    container_client = blob_service_client.get_container_client(
        input_container)

    if input_container is not None and input_folder is not None:

        input_folder_name = input_folder
        list_of_json_files_from_folder = []
        if input_folder_name.startswith("/"):
            input_folder_name = input_folder_name[1:]

        # next will determine the latest folder to be used
        for entry in container_client.list_blobs(
                name_starts_with=input_folder_name + "/"):
            list_of_json_files_from_folder.append(entry.name)

        collected_folders = []
        for entry in list_of_json_files_from_folder:
            if "events_" not in entry: continue
            tokens = entry.split("/")
            standard_folder_name = tokens[1]
            try:
                tag = int(standard_folder_name.replace("events_", ""))
                collected_folders.append(tag)
            except Exception:
                pass
        if not len(collected_folders):
            raise Exception("Could not retrieve the latest run")
        collected_folders.sort(reverse=True)
        input_folder_name = input_folder_name + "/" + "events_" + str(
            collected_folders[0])

        wasb_file_path = f"abfss://{input_container}@{storage_account_name}.dfs.core.windows.net/{input_folder_name}"
        logger.info(
            f"[calendar_events_attendance_extractor] input wasb_file_path: {wasb_file_path}"
        )

        input_df = spark.read.json(wasb_file_path)

        input_df = input_df.filter(
            input_df.subject.contains("Watercooler Event"))

        spark_res_rdd = input_df.rdd.mapPartitions(process_spark_partitions)

        if not spark_res_rdd.isEmpty():
            spark_res_df = spark_res_rdd.map(lambda x: Row(**x)).toDF()

            from pyspark.sql.functions import max as max_

            grouped_events_df = spark_res_df.groupBy(
                "groupDisplayName", "email_address").agg(
                    max_("lastModifiedDateTime").alias("lastModifiedDateTime"))

            grouped_events_df = grouped_events_df.join(
                spark_res_df,
                ["groupDisplayName", "email_address", "lastModifiedDateTime"],
                "inner")

            table = f"""
            (SELECT day, group_name, display_name
            FROM [{database}].dbo.groups_per_day) foo
            """

            groups_per_day_df = generate_dataframe_from_table(
                spark, spark_args, table)

            events_attendance_df = grouped_events_df.join(
                groups_per_day_df, grouped_events_df.groupDisplayName ==
                groups_per_day_df.display_name, "inner")

            events_attendance_df = events_attendance_df.drop(
                "groupDisplayName")

            write_attendance_info(input_df=events_attendance_df,
                                  output_folder=output_folder,
                                  output_container=output_container,
                                  storage_account_name=storage_account_name,
                                  container_client=container_client)

    else:
        raise ValueError(
            "Not enough arguments given in order to read input data: jdbc-database & input-container are missing."
        )
Example #10
df_monthly_ts = df.withColumn("yearmonth", f.concat(f.year("editTime"), f.lit('-'), format_string("%02d", f.month("editTime"))))\
    .withColumn("yearmonth", col("yearmonth").cast("timestamp"))
df_monthly_ts = df_monthly_ts.groupBy("yearmonth",
                                      "title").count().orderBy(desc("count"))

df = df.withColumn(
    "yearmonth",
    f.concat(f.year("editTime"), f.lit('-'),
             format_string("%02d", f.month("editTime"))))
df_monthly = df.groupBy("yearmonth", "title").count().orderBy(desc("count"))
print("Number of edits per month over all articles: ")
df_monthly.select("title", "yearmonth", "count").show()

min_date, max_date = df_monthly_ts.select(
    min_("yearmonth").cast("long"),
    max_("yearmonth").cast("long")).first()

data = [(min_date, max_date)]
df_dates = spark.createDataFrame(data, ["minDate", "maxDate"])
df_min_max_date = df_dates.withColumn(
    "minDate",
    col("minDate").cast("timestamp")).withColumn(
        "maxDate",
        col("maxDate").cast("timestamp"))

df_formatted_ts = df_min_max_date.withColumn("monthsDiff", f.months_between("maxDate", "minDate"))\
    .withColumn("repeat", f.expr("split(repeat(',', monthsDiff), ',')"))\
    .select("*", f.posexplode("repeat").alias("date", "val"))\
    .withColumn("date", f.expr("add_months(minDate, date)"))\
    .withColumn("yearmonth", f.concat(f.year("date"), f.lit('-'), format_string("%02d", f.month("date"))))\
    .select('yearmonth')
Example #11
common_df = sqlContext.createDataFrame(rdd_join, schema)

# Working with TRADES data:
trade = common_df.filter(common_df.rec_type == "T")

# Selecting most important columns to save space
trade_df = trade.select('trade_dt', 'symbol', 'exchange', 'event_tm',
                        'event_seq_nb', 'trade_pr', 'execution_id',
                        'trade_size', 'arrival_tm')

# Grouping by to get the latest value
trade_gb = trade_df.withColumn("arrival_tm",
                               col("arrival_tm").cast("timestamp")).groupBy(
                                   'trade_dt', 'symbol', 'exchange',
                                   'event_tm',
                                   'event_seq_nb').agg(max_('arrival_tm'))

# Joining the data with the latest arrival_tm to get the rest of the columns
trade_complete = trade_gb.join(
    trade, ['trade_dt', 'symbol', 'exchange', 'event_tm', 'event_seq_nb'],
    how='left')
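
# Editorial note (not part of the original snippet): the left join above matches only on the five
# key columns, so every trade row comes back rather than just the latest arrival. Aliasing the
# aggregate, e.g. .agg(max_('arrival_tm').alias('arrival_tm')), and joining on all six columns is
# one way to keep only the latest record per key.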

# Working with QUOTES data:
quotes = common_df.filter(common_df.rec_type == "Q")

# Selecting most important columns to save space
quotes_df = quotes.select('trade_dt', 'symbol', 'exchange', 'event_tm',
                          'event_seq_nb', 'trade_pr', 'execution_id',
                          'trade_size', 'arrival_tm')

# Grouping by to get the latest value
Example #12
def process_demographic_data(spark, input_data, output_data):
    """
    Process the demographic data by dropping duplicate rows and creating a new column 'Major Race' based on a group-by aggregation
    
    Parameters:
    spark: the spark session
    input_data: the path of the input folder of the data in the local machine
    output_data: the output folder in S3 Bucket
    """
    
    demo_data = input_data+'us-cities-demographics.csv'
    demo_df = spark.read.format('csv').options(header='true',sep=';').load(demo_data)

    demo_df = demo_df.select('City', 'State','Median Age','Male Population','Female Population','Total Population','Foreign-born','State Code','Race','Count').drop_duplicates(subset=['City', 'State','Race'])
    
    demo_df= demo_df.withColumn("Count",col("Count").cast(IntegerType()))
    
    # Using group by to know the major race of every city
    group_df= demo_df.groupby('City', 'State','Median Age','Male Population','Female Population','Total Population','Foreign-born','State Code').pivot('Race').agg(max_('Count'))

    group_df = group_df.na.fill({'Hispanic or Latino':0, 'White':0, 'Asian':0, 'Black or African-American':0, 'American Indian and Alaska Native':0})
    cols = group_df.columns[8:13]
    maxcol = F.udf(lambda row: cols[row.index(max(row))], StringType())  
    group_df = group_df.withColumn("Major Race", maxcol(F.struct([group_df[x] for x in group_df.columns[8:13]])))
 
    group_df.write.option("header","true").csv(output_data+'demographic_data/')
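
A small self-contained sketch of the `pivot` + `agg(max_)` step used above, with toy data in place of the full demographics file:

from pyspark.sql import SparkSession
from pyspark.sql.functions import max as max_

spark = SparkSession.builder.getOrCreate()
rows = [('NYC', 'NY', 'White', 3), ('NYC', 'NY', 'Asian', 5), ('Boston', 'MA', 'White', 2)]
demo = spark.createDataFrame(rows, ['City', 'State', 'Race', 'Count'])

# one row per (City, State); one column per race value, holding the max Count observed
pivoted = demo.groupby('City', 'State').pivot('Race').agg(max_('Count'))
pivoted.show()
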
Example #13
line_decay_df.show()



created_in_cohorts_df = line_decay_df.groupBy("create_cohort")\
    .agg( \
        count_(lit_(1)).alias("total_in_cohort"),\
        avg_(col_("time_between")).alias("avg_time_between")\
        )

removed_in_cohorts_df = line_decay_df.groupBy("create_cohort", "remove_cohort")\
    .count()\
    .withColumnRenamed("count", "removed_in_this_cohort")

lifespand_days_in_cohort_df = line_decay_df.groupBy("create_cohort", "remove_cohort")\
    .agg(max_(col_("lifespan")))\
    .withColumnRenamed("max(lifespan)", "lifespan") \

lifespand_days_in_cohort_df.show()

removed_cohorts_with_lifespan_df = removed_in_cohorts_df.join(lifespand_days_in_cohort_df, \
                                                              (removed_in_cohorts_df.create_cohort == lifespand_days_in_cohort_df.create_cohort) & \
                                                              (removed_in_cohorts_df.remove_cohort == lifespand_days_in_cohort_df.remove_cohort)) \
        .drop(lifespand_days_in_cohort_df.create_cohort) \
        .drop(lifespand_days_in_cohort_df.remove_cohort)

cohort_df = created_in_cohorts_df.join(removed_cohorts_with_lifespan_df, "create_cohort", "left_outer") \
    .orderBy("create_cohort", "lifespan")

raw_data = cohort_df.collect()
processed_data = []
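
A side note on the `withColumnRenamed("max(lifespan)", "lifespan")` step above: aliasing inside `agg` (as Example #5 already does) avoids depending on the auto-generated column name. An equivalent sketch using the same aliased imports and dataframe:

lifespand_days_in_cohort_df = line_decay_df.groupBy("create_cohort", "remove_cohort") \
    .agg(max_(col_("lifespan")).alias("lifespan"))
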
Example #14
    def _transform_data(self, df, df_base, df_processed, id_base, id_new, bl_processed, aux_perceptor, aux_servicios):
        """Transform original dataset.

        :param df: Input DataFrame.
        :param bl_processed: Blacklist DataFrame.
        :return: Transformed DataFrame.
        """
        # Cast key variables and rename headers
        df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))

        if self._is_diario:
            checklist5 = checklist_spark.checklist5(df_reserva=df_base, df_reserva_new=df, df_id=id_base,
                                                    df_id_new=id_new)
        else:
            checklist5 = checklist_spark.checklist5(df_reserva=df, df_id=id_base)

        df = df.join(checklist5, on='id_siniestro', how='left')
        df = df.fillna({'checklist5_poliza': 0, 'checklist5_nif': 0})
        del checklist5

        # INITIAL INDEMNITY RESERVE
        reserva_indem = df.select(['id_siniestro', 'po_res_cobertura_id', 'po_res_indem'])
        reserva_indem = reserva_indem.dropDuplicates(subset=['id_siniestro', 'po_res_cobertura_id', 'po_res_indem'])
        reserva_indem = reserva_indem.drop('po_res_cobertura_id')
        reserva_indem = reserva_indem.withColumn('po_res_indem', reserva_indem.po_res_indem.cast(FloatType()))
        reserva_indem = reserva_indem.groupBy(['id_siniestro']).agg(sum_('po_res_indem').alias('po_res_indem'))
        reserva_indem = reserva_indem.withColumn('po_res_indem_mayor_5000',
                                                 when(reserva_indem['po_res_indem'] >= 5000, 1).otherwise(0))
        df = df.drop('po_res_indem')
        df = df.join(reserva_indem, on='id_siniestro', how='left')
        del reserva_indem

        # INITIAL EXPENSE RESERVE
        reserva_gasto = df.select(['id_siniestro', 'po_res_cobertura_id', 'po_res_gasto'])
        reserva_gasto = reserva_gasto.dropDuplicates(subset=['id_siniestro', 'po_res_cobertura_id', 'po_res_gasto'])
        reserva_gasto = reserva_gasto.drop('po_res_cobertura_id')
        reserva_gasto = reserva_gasto.withColumn('po_res_gasto', reserva_gasto.po_res_gasto.cast(FloatType()))
        reserva_gasto = reserva_gasto.groupBy('id_siniestro').agg(sum_('po_res_gasto').alias('po_res_gasto'))
        reserva_gasto = reserva_gasto.withColumn('po_res_gasto_mayor_1000',
                                                 when(reserva_gasto['po_res_gasto'] >= 1000, 1).otherwise(0))
        reserva_gasto = reserva_gasto.drop('po_res_gasto')
        df = df.join(reserva_gasto, on='id_siniestro', how='left')
        del reserva_gasto

        # COUNT OF POLICIES PER CLAIM
        df = df.withColumn('pondera_siniestro', lit(1))
        w = (Window().partitionBy('id_siniestro').rowsBetween(-sys.maxsize, sys.maxsize))
        df = df.withColumn('po_reserva_pagoxsiniestro_count', sum_(df['pondera_siniestro']).over(w))
        df = df.withColumn('po_reserva_indemxsiniestro_count', sum_(df['po_pago_indicador_indem']).over(w))
        # CANCELLED INDEMNITY PAYMENTS: when the cancellation flag == 1, it marks both the payment and its cancellation
        df = df.withColumn('po_pago_indemnizacion_importe_neto',
                           when(df['po_pago_es_anulacion'] == 1, 1).otherwise(df['po_pago_indemnizacion_importe_neto']))

        # GASTO_INDEM_ANULADOS: when the expense cancellation flag == 1, it marks both the payment and its cancellation
        df = df.withColumn('po_gasto_indemnizacion_importe_neto',
                           when(df['po_gasto_es_anulacion'] == 1, 1).otherwise(df['po_gasto_es_anulacion']))

        # PAYMENTS: sum of the net invoice amount + the net indemnity payments
        df = df.withColumn('po_pago_importe_neto', df['po_pago_factura_importe_neto'].cast(FloatType()) + df[
            'po_pago_indemnizacion_importe_neto'].cast(FloatType()))

        # EXPENSES:
        df = df.withColumn('po_gasto_importe_neto', df['po_gasto_factura_importe_neto'].cast(FloatType()) +
                           df['po_gasto_indemnizacion_importe_neto'].cast(FloatType()))

        # INSURED PAYMENT: if the person is not the insured, set the insured's amounts to 0
        df = df.withColumn('po_pago_importe_neto_ASEGURADO',
                           when(df['persona_objeto_asegurado'].cast(IntegerType()) == 0, 0).otherwise(
                               df['po_pago_importe_neto']))

        # PERCENTAGE ACTUALLY RECEIVED BY THE INSURED: importe_neto_asegurado / importe_total
        df = df.withColumn('po_pago_importe_porcentual_ASEGURADO', df['po_pago_importe_neto_ASEGURADO']
                           / (df['po_pago_importe_neto'] + 1))

        # IBAN Blacklist
        bl_processed_iban = bl_processed.filter(~((bl_processed['iban'].isNull()) | (bl_processed['iban'] == '?')))
        bl_processed_iban = bl_processed_iban.select('iban')
        bl_processed_iban = bl_processed_iban.dropDuplicates(subset=['iban'])
        df = df.join(bl_processed_iban, df.po_pago_IBAN == bl_processed_iban.iban, how='left')
        df = df.withColumn('peritaje_pago_iban_blacklist', when(df['iban'].isNull(), 0).otherwise(1))
        df = df.drop('iban')
        df = df.join(bl_processed_iban, df.po_gasto_IBAN == bl_processed_iban.iban, how='left')
        df = df.withColumn('peritaje_gasto_iban_blacklist', when(df['iban'].isNull(), 0).otherwise(1))
        df = df.drop('iban')
        del bl_processed_iban

        # INT Variables
        # Aggregate the INT values per claim and store them in the list of INT variables
        int_var = ['peritaje_pago_iban_blacklist', 'peritaje_gasto_iban_blacklist']
        int_outliers = []

        for col in int_var:
            count = col + '_count'
            df = df.withColumn(count, sum_(df[col]).over(w))
            int_outliers.append(count)
            df = df.drop(col)

        # RESERVABLE PERSON-OBJECT
        df = df.withColumn('id_persona_objeto_reservable_max',
                           max_(df['id_persona_objeto_reservable'].cast(IntegerType())).over(w))
        df = df.drop('id_persona_objeto_reservable')

        # CATEGORICAL VARIABLE
        # Redefine the pago_gasto_codigo variable as categorical
        asegurado = STRING.Parameters.Asegurado_Beneficiario_Perjudicado
        profesional = STRING.Parameters.Profesional_Legal
        detective = STRING.Parameters.Detective
        perito = STRING.Parameters.Perito
        reparador = STRING.Parameters.Reparador
        otros = STRING.Parameters.todos

        df = df.withColumn('po_pago_gasto_codigo', df.po_pago_gasto_codigo.cast(StringType()))
        df = df.withColumn('po_pago_gasto_codigo',
                           when(df['po_pago_gasto_codigo'].isin(otros), df['po_pago_gasto_codigo']).otherwise(
                               'Otros'))
        df = df.withColumn('po_pago_gasto_codigo',
                           when(df['po_pago_gasto_codigo'].isin(asegurado),
                                'Asegurado_Beneficiario_Perjudicado').otherwise(df['po_pago_gasto_codigo']))
        df = df.withColumn('po_pago_gasto_codigo',
                           when(df['po_pago_gasto_codigo'].isin(profesional),
                                'Profesional_Legal').otherwise(df['po_pago_gasto_codigo']))
        df = df.withColumn('po_pago_gasto_codigo',
                           when(df['po_pago_gasto_codigo'].isin(detective),
                                'Detective').otherwise(df['po_pago_gasto_codigo']))
        df = df.withColumn('po_pago_gasto_codigo',
                           when(df['po_pago_gasto_codigo'].isin(perito),
                                'Perito').otherwise(df['po_pago_gasto_codigo']))
        df = df.withColumn('po_pago_gasto_codigo',
                           when(df['po_pago_gasto_codigo'].isin(reparador),
                                'Reparador').otherwise(df['po_pago_gasto_codigo']))

        # GUARANTEE
        garantia = STRING.Parameters.dict_garantias
        funct = udf(lambda x: f.replace_dict_contain(x, key_values=garantia), StringType())
        df = df.withColumn('po_res_garantia', funct(df['po_res_garantia']))

        # COVERAGE
        cobertura_1 = STRING.Parameters.dict_cobertura_1
        funct = udf(lambda x: f.replace_dict_contain(x, key_values=cobertura_1), StringType())
        df = df.withColumn('po_res_cobertura', funct(df['po_res_cobertura']))
        df = df.withColumn('po_res_cobertura',
                           when(df['po_res_cobertura'] == 'DE', 'ELECTRICIDAD').otherwise(df['po_res_cobertura']))
        cobertura_2 = STRING.Parameters.dict_cobertura_2
        funct = udf(lambda x: f.replace_dict_contain(x, key_values=cobertura_2), StringType())
        df = df.withColumn('po_res_cobertura', funct(df['po_res_cobertura']))

        # Convert all categorical variables into dummies
        categorical_var = ['po_res_garantia', 'po_res_cobertura', 'po_res_situacion',
                           'po_pago_medio', 'po_pago_gasto_codigo']
        variable_dummy = []

        funct = udf(lambda x: f.normalize_string(x), StringType())
        for col in categorical_var:
            df = df.withColumn(col, df[col].cast(StringType()))
            df = df.withColumn(col, funct(df[col]))
            types = df.select(col).distinct().collect()
            types = [ty[col] for ty in types]
            types_list = [when(df[col] == ty, 1).otherwise(0).alias('d_' + col + '_' + ty) for ty in types if
                          ty not in [None, '0', 0]]
            df = df.select(list(df.columns) + types_list)
            variable_dummy += ['d_' + col + '_' + ty for ty in types if ty not in [None, '0', 0]]
            df = df.drop(col)

        # Aggregate the dummies per claim
        variable_dummy += ['po_pago_indicador_indem', 'po_pago_rehusado', 'po_pago_gasto_codigo_detective',
                           'persona_objeto_asegurado']

        for col in variable_dummy:
            name = col + '_count'
            df = df.fillna({col: 0})
            df = df.withColumn(col, df[col].cast(IntegerType()))
            df = df.withColumn(name, sum_(df[col]).over(w))
            df = df.drop(col)

        # Rescale the situation dummies so they are expressed as percentages:
        for col in list(df.columns):
            if col.startswith('d_po_res_situacion'):
                df = df.withColumn(col, df[col] / df['po_reserva_pagoxsiniestro_count'])

        # Rescale po_pago_indicador_indem to express it as a percentage
        df = df.withColumn('po_pago_indicador_indem_count', df['po_pago_indicador_indem_count'] / df[
            'po_reserva_pagoxsiniestro_count'])

        # FLOAT VARIABLES
        # Aggregate the FLOAT variables per claim and keep them for outlier analysis
        variable_float_perceptor = ['po_pago_factura_importe_neto', 'po_pago_indemnizacion_importe_neto',
                                    'po_gasto_factura_importe_neto', 'po_gasto_indemnizacion_importe_neto',
                                    'po_pago_importe_neto', 'po_gasto_importe_neto',
                                    'po_pago_importe_neto_ASEGURADO',
                                    'po_pago_importe_porcentual_ASEGURADO',
                                    ]
        float_outliers = []
        for col in variable_float_perceptor:
            name = col + '_count'
            df = df.fillna({col: 0})
            df = df.withColumn(col, df[col].cast(FloatType()))
            df = df.withColumn(name, sum_(df[col]).over(w))
            float_outliers.append(name)

        # Porcentual_Asegurado: adjust it so it reflects the percentage value
        df = df.withColumn('po_pago_importe_porcentual_ASEGURADO_count',
                           df['po_pago_importe_porcentual_ASEGURADO_count'] / df['po_reserva_indemxsiniestro_count'])

        # PAYEE AND SERVICES TABLE
        df = df.withColumn('po_pago_perceptor',
                           when(df['po_pago_perceptor'].startswith('AIDE ASISTENCIA'), 'AIDE ASISTENCIA').otherwise(
                               df['po_pago_perceptor']))

        if self._is_diario:
            df = df.join(aux_perceptor, on='po_pago_perceptor', how='left')
            df = df.join(aux_servicios.drop('id_siniestro'), on='po_gasto_perceptor', how='left')
            for col in list(aux_perceptor.columns) + list(aux_servicios.columns):
                if col not in ['po_pago_perceptor', 'po_gasto_perceptor']:
                    df = df.fillna({col: 0})
            del aux_perceptor
            del aux_servicios

        else:
            # PAYEE TABLE
            df.withColumn('po_pago_perceptor',
                          when(df['po_pago_perceptor'].isin([0, '0']), None).otherwise(df['po_pago_perceptor']))
            w_perceptor = (Window().partitionBy('po_pago_perceptor').rowsBetween(-sys.maxsize, sys.maxsize))

            # By payments
            df = df.withColumn('pondera_perceptor', when(df['po_pago_perceptor'].isNotNull(), 1).otherwise(0))
            df = df.withColumn('po_pagos_total_countxperceptor', sum_(df['pondera_perceptor']).over(w_perceptor))
            df = df.drop('pondera_perceptor')

            # Payment cancellation
            df = df.fillna({'po_pago_es_anulacion': 0})
            df = df.withColumn('po_pago_es_anulacion_countxperceptor',
                               sum_(df['po_pago_es_anulacion'].cast(IntegerType())).over(w_perceptor))
            df = df.drop('po_pago_es_anulacion')

            variable_float_perceptor = ['po_pago_factura_importe_neto', 'po_pago_indemnizacion_importe_neto']
            # Note: negative values are expected; they are recoveries from other companies
            for col in variable_float_perceptor:
                name = col + '_countxperceptor'
                df = df.withColumn(name, sum_(df[col].cast(FloatType())).over(w_perceptor))
                df = df.drop(col)

            # Count at claim level
            df = df.withColumn('po_pagoxsiniestro_countxperceptor',
                               size(collect_set(df['id_siniestro']).over(w_perceptor)))

            # Get the average levels per payee-claim
            for col in ['po_pago_factura_importe_neto_countxperceptor',
                        'po_pago_indemnizacion_importe_neto_countxperceptor']:
                df = df.withColumn(col + 'xpromediosiniestro', df[col] / df['po_pagoxsiniestro_countxperceptor'])

            # Does the payee appear in the blacklist, and how many times?
            # WAS IT A FRAUDULENT CLAIM? We check if the id_siniestro is associated with a previous fraudulent claim
            bl_processed = bl_processed.select('id_siniestro').dropDuplicates(subset=['id_siniestro'])
            bl_processed = bl_processed.withColumn('po_reserva_perceptor_fraud', lit(1))
            df = df.join(bl_processed, on='id_siniestro', how='left')
            df = df.withColumn('po_reserva_perceptor_fraud',
                               when(df['po_reserva_perceptor_fraud'].isNull(), 0).otherwise(
                                   df['po_reserva_perceptor_fraud']))

            perc_aux = df.select(['po_pago_perceptor', 'id_siniestro', 'po_reserva_perceptor_fraud']).dropDuplicates(
                subset=['po_pago_perceptor', 'id_siniestro'])
            perc_aux = perc_aux.groupBy('po_pago_perceptor').agg(
                sum_('po_reserva_perceptor_fraud').alias('po_fraude_countxperceptor'))
            df = df.join(perc_aux, on='po_pago_perceptor', how='left')
            del perc_aux

            df = df.withColumn('po_fraude_porcentaje_perceptor',
                               df['po_fraude_countxperceptor'] / df['po_pagoxsiniestro_countxperceptor'])

            df.select(['po_pago_perceptor', 'po_pagos_total_countxperceptor', 'po_pago_es_anulacion_countxperceptor',
                       'po_pago_factura_importe_neto_countxperceptor',
                       'po_pago_indemnizacion_importe_neto_countxperceptor', 'po_pagoxsiniestro_countxperceptor',
                       'po_pago_factura_importe_neto_countxperceptorxpromediosiniestro',
                       'po_pago_indemnizacion_importe_neto_countxperceptorxpromediosiniestro',
                       'po_fraude_countxperceptor', 'po_fraude_porcentaje_perceptor']
                      ).dropDuplicates(subset=['po_pago_perceptor']).coalesce(1).write.mode("overwrite").option(
                "header", "true").option("sep", ";").csv(STRING.training_auxiliar_perceptor)

            # SERVICES TABLE
            df = df.withColumn('po_gasto_perceptor',
                               when(df['po_gasto_perceptor'].isin([0, '0']), None).otherwise(df['po_gasto_perceptor']))
            w_servicio = (Window().partitionBy('po_gasto_perceptor').rowsBetween(-sys.maxsize, sys.maxsize))

            # By payments
            df = df.withColumn('pondera_servicio', when(df['po_gasto_perceptor'].isNotNull(), 1).otherwise(0))
            df = df.withColumn('po_pagos_total_countxservicios', sum_(df['pondera_servicio']).over(w_servicio))
            df = df.drop('pondera_servicio')

            variable_float_servicio = ['po_gasto_factura_importe_neto', 'po_gasto_indemnizacion_importe_neto']
            # Note: negative values are expected; they are recoveries from other companies
            for col in variable_float_servicio:
                name = col + '_countxservicios'
                df = df.withColumn(name, sum_(df[col].cast(FloatType())).over(w_servicio))
                df = df.drop(col)

            # Count at claim level
            df = df.withColumn('po_pagoxsiniestro_countxservicios',
                               size(collect_set(df['id_siniestro']).over(w_servicio)))

            # Get the overall average per service
            for col in ['po_gasto_factura_importe_neto_countxservicios',
                        'po_gasto_indemnizacion_importe_neto_countxservicios']:
                df = df.withColumn(col + 'xpromediosiniestro', df[col] / df['po_pagoxsiniestro_countxservicios'])

            # Does the service provider appear in the blacklist, and how many times?
            # WAS IT A FRAUDULENT CLAIM? We check if the id_siniestro is associated with a previous fraudulent claim
            bl_processed = bl_processed.withColumn('po_reserva_servicios_fraud', lit(1))
            df = df.join(bl_processed[['id_siniestro', 'po_reserva_servicios_fraud']], on='id_siniestro', how='left')
            df = df.withColumn('po_reserva_servicios_fraud',
                               when(df['po_reserva_servicios_fraud'].isNull(), 0).otherwise(
                                   df['po_reserva_servicios_fraud']))
            del bl_processed

            serv_aux = df.select(['po_gasto_perceptor', 'id_siniestro', 'po_reserva_servicios_fraud']).dropDuplicates(
                subset=['po_gasto_perceptor', 'id_siniestro'])
            serv_aux = serv_aux.groupBy('po_gasto_perceptor').agg(
                sum_('po_reserva_servicios_fraud').alias('po_fraude_countxservicios'))
            df = df.join(serv_aux, on='po_gasto_perceptor', how='left')
            del serv_aux

            df = df.withColumn('po_fraude_porcentaje_servicios',
                               df['po_fraude_countxservicios'] / df['po_pagoxsiniestro_countxservicios'])

            df.select(['po_gasto_perceptor', 'id_siniestro', 'po_pagos_total_countxservicios',
                       'po_gasto_factura_importe_neto_countxservicios',
                       'po_gasto_indemnizacion_importe_neto_countxservicios', 'po_pagoxsiniestro_countxservicios',
                       'po_gasto_factura_importe_neto_countxserviciosxpromediosiniestro',
                       'po_gasto_indemnizacion_importe_neto_countxserviciosxpromediosiniestro',
                       'po_fraude_countxservicios', 'po_fraude_porcentaje_servicios']
                      ).dropDuplicates(subset=['po_gasto_perceptor']).coalesce(1).write.mode("overwrite").option(
                "header", "true").option("sep", ";").csv(STRING.training_auxiliar_servicios)


        df = df.drop(*['po_pago_perceptor', 'po_gasto_perceptor'])
        df = df.dropDuplicates(subset=['id_siniestro'])
        df = df.fillna({'po_fraude_countxservicios': 0, 'po_fraude_porcentaje_servicios': 0})

        if self._is_diario:
            df_processed = df_processed.select(int_outliers + float_outliers)
            for col in int_outliers + float_outliers:
                df = outliers.Outliers.outliers_test_values(df, df_processed, col, not_count_zero=True)
        else:
            for col in int_outliers + float_outliers:
                df = outliers.Outliers.outliers_mad(df, col, not_count_zero=True)

        # DELETE VARIABLES
        del_variables = ['id_poliza', 'version_poliza', "po_res_garantia_id",
                         "po_res_cobertura_id", 'po_res_limite',
                         "po_pago_IBAN", "po_pago_emision", "po_pago_factura_fecha_emision",
                         'po_pago_factura_importe_neto', 'po_pago_indemnizacion_importe_neto',
                         'po_gasto_IBAN', 'po_gasto_emision', 'po_gasto_factura_fecha_emision',
                         'po_pago_es_anulacion', 'po_gasto_es_anulacion', 'pondera_siniestro',
                         'po_gasto_perceptor', 'po_gasto_factura_importe_neto', 'po_gasto_indemnizacion_importe_neto',
                         'po_pago_importe_neto', 'po_gasto_importe_neto',
                         'po_pago_importe_neto_ASEGURADO', 'po_pago_importe_porcentual_ASEGURADO',
                         'audit_poliza_producto_tecnico',
                         'audit_siniestro_codigo_compania', 'po_reserva_indemxsiniestro_count'
                         ]

        df = df.drop(*del_variables)

        return df
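
The categorical-to-dummy loop above (collect the distinct values, then emit one `when(...).otherwise(0)` column per value) is the same pattern Example #4 uses for the Browser column. A self-contained sketch with toy data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import when

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(1, 'Perito'), (2, 'Detective'), (3, 'Perito')],
                            ['id', 'po_pago_gasto_codigo'])

values = [r['po_pago_gasto_codigo'] for r in toy.select('po_pago_gasto_codigo').distinct().collect()]
dummies = [when(toy['po_pago_gasto_codigo'] == v, 1).otherwise(0).alias('d_' + v) for v in values]
toy = toy.select(toy.columns + dummies)
toy.show()
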
Example #15
########### SOLUTION TO PART 1 #############
## This returns an aggregated dataframe with the client_port and all the activities, in the same time interval
df_time.orderBy("client_port", "Date", "Interval").show()

########### SOLUTION TO PART 2###############
## Session time = In every session interval, for every IP, difference between the maximum and minimum Time of request

from pyspark.sql.functions import col, max as max_
from pyspark.sql.functions import col, min as min_

## find maximum timestamp for every ip in every timestamp
dfmaxtime = df_time.withColumn("timestamp",
                               col("timestamp").cast("timestamp")).groupBy(
                                   "client_port", "Date",
                                   "Interval").agg(max_("Time"))

## find minimum timestamp for every ip in every timestamp
dfmintime = df_time.withColumn("timestamp",
                               col("timestamp").cast("timestamp")).groupBy(
                                   "client_port", "Date",
                                   "Interval").agg(min_("Time"))

## merge the two dataframes and calculate the difference between the timestamps
dftime = dfmaxtime.join(dfmintime, ['client_port', 'Date', 'Interval'])
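
## editorial sketch (not part of the original solution): both extremes can also be taken in one
## aggregation and differenced directly, which avoids the extra join; assuming Time is a timestamp:
# df_session = df_time.groupBy("client_port", "Date", "Interval") \
#     .agg((unix_timestamp(max_("Time")) - unix_timestamp(min_("Time"))).alias("session_seconds"))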

## changing column names
dftime = dftime.select(
    col("client_port").alias("client_port"),
    col("Date").alias("Date"),
    col("Interval").alias("Interval"),
strains_US=strains_US \
 .withColumn("new_date",date_format_udf(col("date"))) \
 .drop("date") \
 .withColumnRenamed("new_date","date")

#converting to date type
strains_US=strains_US \
.withColumn("new_date",to_date("date","yyyy-mm-dd")) \
.drop("date") \
.withColumnRenamed("new_date","date")

# COMMAND ----------

import datetime
from pyspark.sql.functions import col, max as max_, min as min_
strains_US.agg(max_("date")).show()
strains_US.agg(min_("date")).show()

#All strains in USA were collected between 01-01-2020 and 01-31-2020

# COMMAND ----------

#1) UDF to categorize strains based on date collected


def categorize(x):
    if x <= datetime.datetime.strptime("2020-01-10", "%Y-%m-%d").date():
        return 1
    elif x <= datetime.datetime.strptime("2020-01-20", "%Y-%m-%d").date():
        return 2
    else:
Example #17
start_time = time.time()
df_monthly = df.groupBy("yearmonth", "title").count().orderBy(desc("count"))
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mgroupby2')

print("Number of edits per month over all articles: ")

start_time = time.time()
df_monthly.select("title", "yearmonth", "count").show()
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mselectshow')

start_time = time.time()
min_date, max_date = df_monthly_ts.select(min_("yearmonth").cast("long"), max_("yearmonth").cast("long")).first()
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mselect2')

data = [(min_date, max_date)]

start_time = time.time()
df_dates = spark.createDataFrame(data, ["minDate", "maxDate"])
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mcreateframe')

start_time = time.time()
df_min_max_date = df_dates.withColumn("minDate", col("minDate").cast("timestamp")).withColumn("maxDate", col("maxDate").cast("timestamp"))
end_time = time.time()
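
The repeated start_time / end_time / stf bookkeeping in this example could be wrapped in a helper; a sketch under the assumption that `stf` simply records a labeled duration (its definition is not shown in the excerpt):

import time
from contextlib import contextmanager


@contextmanager
def timed(worker_count, file_count, label):
    """Time the wrapped block and forward the duration to stf()."""
    start = time.time()
    yield
    stf(worker_count, file_count, time.time() - start, label)


# usage, equivalent to the manual bookkeeping above
with timed(worker_count, file_count, 'mgroupby2'):
    df_monthly = df.groupBy("yearmonth", "title").count().orderBy(desc("count"))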