def glean_2(invoice_df, line_item_df):
    invoice = invoice_df.alias('invoice')
    line_item = line_item_df.alias('line_item')
    joined_table = invoice.join(
        line_item, invoice.invoice_id == line_item.invoice_id, how='left')
    glean2 = joined_table.groupBy(
        invoice.invoice_id, invoice.canonical_vendor_id, invoice.invoice_date,
        invoice.period_end_date).agg(
            max_('line_item.period_end_date').alias('max_line_end_date'))
    glean2 = glean2.withColumn(
        'end_date', funcs.greatest('max_line_end_date', invoice.period_end_date))
    glean2 = glean2.withColumn('difference',
                               funcs.datediff('end_date', 'invoice_date'))
    glean2 = glean2[glean2['difference'] > 90]
    glean2 = glean2.withColumn('glean_location', funcs.lit('invoice'))
    glean2 = glean2.withColumn("glean_type", funcs.lit('accrual_alert'))
    glean2 = glean2.withColumn(
        "glean_text",
        funcs.concat(
            funcs.lit('Line items from vendor '),
            funcs.col('canonical_vendor_id'),
            funcs.lit(' in this invoice cover future periods (through '),
            funcs.col('end_date'), funcs.lit(' )')))
    glean2 = glean2.withColumn('glean_date', funcs.col('invoice_date'))
    return glean2
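# A minimal sketch of how glean_2 might be exercised, assuming `funcs` is the
# pyspark.sql.functions module and `max_` is its `max` (the aliases used above).
# The toy DataFrames below are hypothetical and not taken from the original project.
from datetime import date
from pyspark.sql import SparkSession, functions as funcs
from pyspark.sql.functions import max as max_

spark = SparkSession.builder.getOrCreate()
invoice_df = spark.createDataFrame(
    [(1, 'vendor_1', date(2020, 1, 1), date(2020, 1, 31))],
    ['invoice_id', 'canonical_vendor_id', 'invoice_date', 'period_end_date'])
line_item_df = spark.createDataFrame(
    [(1, date(2020, 6, 30))], ['invoice_id', 'period_end_date'])
glean_2(invoice_df, line_item_df).show(truncate=False)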
def main(self, sc, *args):
    spark_sql = SparkSession.builder.getOrCreate()
    raw_data = self.requires().load_dataframe(spark_sql)
    max_dates = (raw_data.groupBy().agg(max_('rent_date'),
                                        max_('return_date')).first())
    max_date = min(max_dates.asDict().values())

    testing_end_date = max_date
    validation_end_date = testing_start_date = testing_end_date - pd.Timedelta(
        self.test_period).to_pytimedelta()
    training_end_date = validation_start_date = validation_end_date - pd.Timedelta(
        self.validation_period).to_pytimedelta()

    if not self.output()['training'].exists():
        (raw_data.filter(raw_data.rent_date < training_end_date).write.csv(
            self.output()['training'].path, header='true'))
    if not self.output()['validation'].exists():
        (raw_data.filter(
            raw_data.rent_date >= validation_start_date).filter(
                raw_data.rent_date < validation_end_date).write.csv(
                    self.output()['validation'].path, header='true'))
    if not self.output()['testing'].exists():
        (raw_data.filter(raw_data.rent_date >= testing_start_date).filter(
            raw_data.rent_date <= testing_end_date).write.csv(
                self.output()['testing'].path, header='true'))

    with self.output()['metadata'].open('w') as f:
        json.dump(
            {
                'training_end_date': training_end_date.isoformat(),
                'validation_start_date': validation_start_date.isoformat(),
                'validation_end_date': validation_end_date.isoformat(),
                'testing_start_date': testing_start_date.isoformat(),
                'testing_end_date': testing_end_date.isoformat(),
            },
            f,
            indent=2,
        )
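# The method above assumes a Luigi-style task in which self.requires() exposes
# load_dataframe() and self.output() returns a dict of targets keyed by split name.
# A hypothetical sketch of such an output() (class name, parameters, and paths are
# assumptions, not from the source):
import luigi


class SplitRentalData(luigi.Task):  # hypothetical task name
    test_period = luigi.Parameter(default='30 days')
    validation_period = luigi.Parameter(default='30 days')

    def output(self):
        # One target per key read in main(); local paths are placeholders.
        return {
            'training': luigi.LocalTarget('data/training'),
            'validation': luigi.LocalTarget('data/validation'),
            'testing': luigi.LocalTarget('data/testing'),
            'metadata': luigi.LocalTarget('data/metadata.json'),
        }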
        substring(concat(lit('0'), split_date.getItem(0)), -2, 2), lit("-"),
        substring(concat(lit('0'), split_date.getItem(1)), -2, 2), lit(" "),
        concat(split_date.getItem(3), lit(':00'))))
online_retail = online_retail.withColumn(
    "InvoiceDate", online_retail.InvoiceDate.cast(TimestampType()))

## remove the null values and junk values
online_retail = online_retail.na.drop(
    subset=["CustomerID", "InvoiceDate", "UnitPrice", "Quantity"])
online_retail = online_retail.filter(online_retail.UnitPrice > 0).filter(
    online_retail.Quantity > 0)

## group by CustomerID to get the last invoice date for each customer
rfm = online_retail\
    .groupBy("CustomerID")\
    .agg(max_("InvoiceDate").alias('LastDate'))

## get the most recent date in the data
max_date = online_retail.agg({"InvoiceDate": "max"}).collect()[0][0]
rfm = rfm.withColumn("RecentDate", lit(max_date))

## compute the days since the last invoice for each customer
rfm = rfm.withColumn("recency", datediff(col("RecentDate"), col("LastDate")))
rfm = rfm.select("CustomerID", "recency")

## group by CustomerID to get monetary and frequency
online_retail = online_retail.withColumn("TotalValue",
                                         col("Quantity") * col("UnitPrice"))
monetary = online_retail.groupBy("CustomerID").agg(
    sum("TotalValue").alias('monetary'))
freq = online_retail.groupBy("CustomerID").agg(
def _transform(self, df, auxiliar_train):
    if not self.train_file:
        auxiliar_train = auxiliar_train.drop('WinningBid')
        auxiliar_train = auxiliar_train.withColumn('test', lit(0))
        df = df.withColumn('test', lit(1))
        df = auxiliar_train.union(df)
        del auxiliar_train

    # We create the time as Index
    split_col = split(df['ApproximateDate'], ' ')
    df = df.withColumn('time', split_col.getItem(1))  # time

    # Hour Index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'),
                     IntegerType())
    df = df.withColumn('hms_index', func_index(df['time']))

    # We order by UserID-Date
    df = df.orderBy(['UserID', 'hms_index'])

    # We check Null Values
    df.select([count_(when(isnan(c), c)).alias(c) for c in df.columns]).show()

    # We create a rank of users by how many times they saw an ad in the past
    w = (Window().partitionBy(df.UserID).orderBy('time').rowsBetween(
        Window.unboundedPreceding, 0))
    df = df.withColumn('user_id_acumulative', count_(df['UserID']).over(w))

    # Number of Ads/User/Second
    df = df.withColumn('key_id',
                       concat(df['UserID'], lit(' '), df['hms_index']))
    w = (Window().partitionBy(df.key_id).orderBy('hms_index').rowsBetween(
        -sys.maxsize, sys.maxsize))
    df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

    # Number of Ads/User
    df_group = df.groupby(['key_id']).agg(count_('key_id').alias('count_ads'))
    split_col = split(df_group['key_id'], ' ')
    df_group = df_group.withColumn('UserID', split_col.getItem(0))  # UserID
    w = (Window().partitionBy(df_group.UserID).orderBy('key_id').rowsBetween(
        Window.unboundedPreceding, 0))
    df_group = df_group.withColumn('number_ads_user',
                                   sum_(df_group.count_ads).over(w))
    df_group = df_group.select(['key_id', 'number_ads_user'])
    df = df.join(df_group, how='left', on='key_id')
    del df_group

    # Number of Users/Second
    w = (Window().partitionBy(df.ApproximateDate).rowsBetween(
        -sys.maxsize, sys.maxsize))
    df = df.withColumn('number_user_second',
                       approx_count_distinct(df.UserID).over(w))

    # Number of Ads/Second
    df = df.withColumn('number_ads_second', count_(df.ApproximateDate).over(w))

    # Browser Dummy Transformation
    types = df.select('Browser').distinct().collect()
    types = [val['Browser'] for val in types]
    new_cols = [
        when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty)
        for ty in types
    ]
    df = df.select(df.columns + new_cols)

    # Decompose Date Variables
    df = df.withColumn('date', to_date(df['ApproximateDate']))  # date
    df = df.withColumn('month', month(df['ApproximateDate']))  # month
    df = df.withColumn('day', dayofmonth(df['ApproximateDate']))  # day
    df = df.withColumn('weekday', dayofweek(
        df['ApproximateDate']))  # weekday, 1=Sunday in Spark
    df = df.withColumn('hour', hour(df['time']))  # hour
    df = df.withColumn('minute', minute(df['time']))  # minute

    # Peak Hour
    df = df.withColumn('peak6am8am',
                       when(df['hour'].between(6, 8), 1).otherwise(0))
    df = df.withColumn('peak14pm16pm',
                       when(df['hour'].between(14, 16), 1).otherwise(0))

    # Minute Index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'),
                     IntegerType())
    df = df.withColumn('hm_index', func_index(df['time']))

    # Convert to time-series by Minute
    # We reduce to minutes
    df_time_serie_ads = df.select([
        'hms_index', 'hm_index', 'number_user_second', 'number_ads_second'
    ]).drop_duplicates()
    df_time_serie_user = df.select(['UserID', 'hm_index']).drop_duplicates()

    # Group-by the values
    df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(
        approx_count_distinct('UserID'))
    df_time_serie_ads = df_time_serie_ads.groupBy('hm_index').agg({
        'number_ads_second': 'sum'
    }).drop_duplicates(subset=['hm_index'])
    # Join ads-users per minute
    df_time_serie = df_time_serie_ads.join(df_time_serie_user,
                                           how='left',
                                           on='hm_index')
    del df_time_serie_ads, df_time_serie_user

    # Rename columns
    df_time_serie = df_time_serie.withColumnRenamed(
        'sum(number_ads_second)', 'number_ads_minute').withColumnRenamed(
            'approx_count_distinct(UserID)', 'number_user_minute')

    # Resample Range of Minutes
    resample_range = list(
        range(
            df_time_serie.select(min_(
                col('hm_index'))).limit(1).collect()[0][0],
            df_time_serie.select(max_(
                col('hm_index'))).limit(1).collect()[0][0] + 1, 1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = resample_range.join(
        df_time_serie,
        how='left',
        on=resample_range.value == df_time_serie.hm_index).drop(
            *['hm_index']).fillna(0)

    # Create Lags By Minutes
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_min_lag > 0:
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_user_minute').over(w).alias('ar1_number_user_minute'))
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_ads_minute').over(w).alias('ar1_number_ads_minute'))
        if self.ar_min_lag > 1:
            for l in range(2, self.ar_min_lag + 1, 1):
                df_time_serie = df_time_serie.select(
                    '*',
                    lag('ar' + str(l - 1) + '_number_user_minute').over(
                        w).alias('ar' + str(l) + '_number_user_minute'))
                df_time_serie = df_time_serie.select(
                    '*',
                    lag('ar' + str(l - 1) + '_number_ads_minute').over(
                        w).alias('ar' + str(l) + '_number_ads_minute'))

    # Remove the lagged Null Values
    df_time_serie = df_time_serie.dropna()

    # join and remove lag Null values of the first minute
    df = df.orderBy(['UserID', 'hms_index'])
    df = df.join(df_time_serie.orderBy(['hm_index']),
                 how='left',
                 on=df.hm_index == df_time_serie.value).drop('value')

    # Convert to time-series and resample by Seconds
    df_time_serie = df.select(
        ['hms_index', 'number_user_second', 'number_ads_second']).drop_duplicates()
    resample_range = list(
        range(
            df_time_serie.select(min_(
                col('hms_index'))).limit(1).collect()[0][0],
            df_time_serie.select(max_(
                col('hms_index'))).limit(1).collect()[0][0] + 1, 1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = resample_range.join(
        df_time_serie,
        how='left',
        on=resample_range.value == df_time_serie.hms_index).drop(
            *['hms_index']).fillna(0)

    # Create lags
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_lags > 0:
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_user_second').over(w).alias('ar1_number_user_second'))
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_ads_second').over(w).alias('ar1_number_ads_second'))
        if self.ar_lags > 1:
            for l in range(2, self.ar_lags + 1, 1):
                df_time_serie = df_time_serie.select(
                    '*',
                    lag('ar' + str(l - 1) + '_number_user_second').over(
                        w).alias('ar' + str(l) + '_number_user_second'))
                df_time_serie = df_time_serie.select(
                    '*',
                    lag('ar' + str(l - 1) + '_number_ads_second').over(
                        w).alias('ar' + str(l) + '_number_ads_second'))

    # Create Moving Average
    if self.ma_ss_lag is not None:
        # Get hour from index
        func_index = udf(lambda x: auxiliar_func.num_to_time(x), StringType())
        df_time_serie = df_time_serie.withColumn(
            'time', func_index(df_time_serie['value']))

        # minute MA terms (average per second over the last xx seconds)
        for lag_val in self.ma_ss_lag:
            # range to take into account
            w = (Window.orderBy(df_time_serie['value']).rangeBetween(-lag_val, 0))
            # MA variables
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                avg('number_user_second').over(w))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                avg('number_ads_second').over(w))

            # Increasing ID
            df_time_serie = df_time_serie.withColumn(
                'rn', monotonically_increasing_id())

            # Replace first values by Null
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) +
                                  '_number_user_second']))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) +
                                  '_number_ads_second']))

            # Get the average by Minute
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_user_second',
                df_time_serie['ma_seconds_' + str(lag_val) +
                              '_number_user_second'] * 60)
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_ads_second',
                df_time_serie['ma_seconds_' + str(lag_val) +
                              '_number_ads_second'] * 60)
            df_time_serie = df_time_serie.drop(*['rn'])

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.drop(
            *['time', 'number_user_second', 'number_ads_second']).dropna()

        # join and remove lag Null values of the first minute
        df = df.join(df_time_serie.orderBy(['value']),
                     how='left',
                     on=df.hms_index == df_time_serie.value).drop('value').dropna()

    if self.train_file and not self.variable_analysis:
        df = df.select(['key_id', 'hms_index', 'number_ads_user',
                        'number_user_second', 'number_ads_second',
                        'number_ads_user_second', 'peak6am8am', 'peak14pm16pm',
                        'user_id_acumulative'] +
                       [x for x in df.columns if x.startswith('d_browser')] +
                       [x for x in df.columns if x.startswith('ar')] +
                       [x for x in df.columns if x.startswith('ma_')] +
                       ['WinningBid'])

    if not self.train_file:
        df = df.filter(df['test'] == 1)
        df = df.select(['UserID', 'key_id', 'number_ads_user', 'hms_index',
                        'number_user_second', 'number_ads_second',
                        'number_ads_user_second', 'peak6am8am', 'peak14pm16pm',
                        'user_id_acumulative'] +
                       [x for x in df.columns if x.startswith('d_browser')] +
                       [x for x in df.columns if x.startswith('ar')] +
                       [x for x in df.columns if x.startswith('ma_')])

    df = df.orderBy(['hms_index', 'UserID'])
    df.show()
    return df
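# The helper auxiliar_func.time_to_num is not shown in this snippet; judging from how
# it is called (an integer index built from an 'HH:MM:SS' string at second or minute
# resolution), a hypothetical reconstruction could look like this:
def time_to_num(time_str, index='hms'):
    """Map an 'HH:MM:SS' string to an integer index (sketch, not the original code)."""
    h, m, s = (int(part) for part in time_str.split(':'))
    if index == 'hms':
        return h * 3600 + m * 60 + s
    if index == 'hm':
        return h * 60 + m
    return h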
#print line_decay_df.schema
#all_lines_by_creator = line_decay_df.groupBy(col_("creator"), \
#                                             trunc_(col_("created"), 'mon').alias("cohort"))\
#    .count()\
#    .withColumnRenamed("count", "lines_created")

total_active_lines = line_decay_df.filter(col_("removed").isNull()).count()

all_lines_by_creator = line_decay_df.groupBy(col_("creator"))\
    .count()\
    .withColumnRenamed("count", "lines_created")

author_agg = line_decay_df.groupBy(col_("creator"))\
    .agg(min_(col_("created")).alias("author_first"), \
         max_(col_("created")).alias("author_last"),
         avg_(col_("lifespan")).alias("avg_lifespan"))

#author_last = line_decay_df.groupBy(col_("creator"))\
#    .agg(max_(col_("created")))\
#    .withColumnRenamed("max(created)", "author_last")

removed_lines_by_creator = line_decay_df.filter(col_("removed").isNotNull())\
    .groupBy(col_("creator"))\
    .count()\
    .withColumnRenamed("count", "lines_removed")

active_lines_by_creator = line_decay_df.filter(col_("removed").isNull())\
    .groupBy(col_("creator"))\
    .count()\
    .withColumnRenamed("count", "lines_active")
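# The *_-suffixed names above are assumed to be pyspark.sql.functions imported under
# non-shadowing aliases; a sketch of the imports this snippet would need:
from pyspark.sql.functions import (col as col_, min as min_, max as max_,
                                   avg as avg_, trunc as trunc_)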
# df_pages.show()
df_categories = df.selectExpr('id', 'title', 'explode(categories) as category',
                              'last_modify_date')
df_links = df.selectExpr('id', 'title', 'explode(page_links) as link',
                         'last_modify_date')
# df_links.show()

joined_df = df_links.alias("dl")\
    .join(df_pages.alias("pg"), col('dl.link') == col('pg.title'))\
    .selectExpr('dl.*', 'pg.last_modify_date as link_modify_date')\
    .filter(col('link_modify_date') > col('dl.last_modify_date'))\
    .withColumn('outdate_time',
                (unix_timestamp(col('link_modify_date'), TIME_FORMAT) -
                 unix_timestamp(col('dl.last_modify_date'), TIME_FORMAT)))\
    .groupBy('title')\
    .agg(max_(col('outdate_time')).alias('max_outdate_time'))
joined_df.printSchema()
# joined_df.show()

cat_out_dated = df_categories.alias('ct')\
    .join(joined_df.alias('jd'), expr('ct.title = jd.title'))\
    .select('ct.category', 'ct.title', 'jd.max_outdate_time')
# cat_out_dated.sort('ct.category').show()

cat_out_dated_grouped = cat_out_dated\
    .withColumn("row_number",
                row_number().over(Window.partitionBy('ct.category')
                                  .orderBy(desc('jd.max_outdate_time'))))\
    .where('row_number = 1')\
    .drop('row_number')
    .csv("/opt/spark-data/weather")

# Watermark data
watermarkedBeamformedDF = beamformedDF.withWatermark("beamformedtimestamp",
                                                     "1 hours")
watermarkedWeatherDF = weatherDF.withWatermark("weathertimestamp", "2 hours")

# Joining dataframes
joinedDF = watermarkedBeamformedDF.join(
    watermarkedWeatherDF,
    expr("""
        hourly_beamformedtimestamp = weathertimestamp AND
        beamformedtimestamp >= weathertimestamp AND
        beamformedtimestamp <= weathertimestamp + interval 1 hour
        """),
    "leftOuter")

# Sample operation on joined data
maxTimeStamp = joinedDF.groupby("beamformedtimestamp",
                                "weathertimestamp").agg(
                                    max_("beamformedtimestamp"))

# Start running the query that prints the running counts to the console
query = maxTimeStamp \
    .writeStream \
    .outputMode('complete') \
    .format('console') \
    .start()

query.awaitTermination()
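# The join condition above references hourly_beamformedtimestamp, presumably the
# beamformed timestamp truncated to the hour; a hypothetical derivation (applied to
# beamformedDF before the watermark/join) might be:
from pyspark.sql.functions import date_trunc

beamformedDF = beamformedDF.withColumn(
    "hourly_beamformedtimestamp", date_trunc("hour", "beamformedtimestamp"))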
| Bob|    1|
+-----+---------------------------------+
'''

## Select maximum date i.e. latest date
# Source: https://stackoverflow.com/questions/38377894/how-to-get-maxdate-from-given-set-of-data-grouped-by-some-fields-using-pyspark
from pyspark.sql.functions import col, max as max_

df = sc.parallelize([
    ("2016-04-06 16:36", 1234, 111, 1),
    ("2016-04-06 17:35", 1234, 111, 5),
]).toDF(["datetime", "userId", "memberId", "value"])

(df.withColumn("datetime", col("datetime").cast("timestamp"))
    .groupBy("userId", "memberId")
    .agg(max_("datetime")))

# After writing a dataframe it is read back with columns _c0, _c1, _c2, ...;
# this builds a chain of withColumnRenamed calls to rename them to the names given in list li
li = [<colnames>]
st = ''
for i in range(len(li)):
    st = st + ".withColumnRenamed('_c" + str(i) + "','" + str(li[i]) + "')"

# Typecasting to date format
# Source: https://stackoverflow.com/questions/38080748/convert-pyspark-string-to-date-format
from datetime import datetime
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
def run_spark_job(spark_args):
    """Run the calendar events attendance extraction job.

    :param spark_args: parsed command-line arguments for the job
    """
    storage_account_name = spark_args.storage_account_name
    input_container = spark_args.input_container_name
    output_container = spark_args.output_container_name
    input_folder = spark_args.input_folder_path
    output_folder = spark_args.output_folder_path
    application_id = spark_args.application_id
    directory_id = spark_args.directory_id
    adb_secret_scope = spark_args.adb_secret_scope_name
    adb_sp_client_key_secret_name = spark_args.adb_sp_client_key_secret_name
    database = spark_args.jdbc_database
    jdbcHost = spark_args.jdbc_host
    jdbcPort = spark_args.jdbc_port

    if storage_account_name is None:
        raise ValueError("Missing storage_account_name parameter!")
    if output_container is None:
        raise ValueError("Missing output_container_name parameter!")
    if output_folder is None:
        raise ValueError("Missing output_folder_path parameter!")
    if application_id is None:
        raise ValueError("Missing application_id parameter!")
    if directory_id is None:
        raise ValueError("Missing directory_id parameter!")
    if adb_secret_scope is None:
        raise ValueError("Missing adb_secret_scope_name parameter!")
    if adb_sp_client_key_secret_name is None:
        raise ValueError("Missing adb_sp_client_key_secret_name parameter!")
    if database is None:
        raise ValueError("Missing database parameter!")
    if jdbcHost is None:
        raise ValueError("Missing jdbcHost parameter!")
    if jdbcPort is None:
        raise ValueError("Missing jdbcPort parameter!")

    client_secret = SERVICE_PRINCIPAL_SECRET if SERVICE_PRINCIPAL_SECRET is not None else \
        dbutils.secrets.get(scope=adb_secret_scope, key=adb_sp_client_key_secret_name)

    # conf = SparkConf().setAppName("CalendarProcesor").setMaster("local")
    spark = SparkSession.builder.master("local").getOrCreate()
    # spark = SparkSession.builder.config(conf).getOrCreate()
    # spark.conf.set("spark.app.name", "CalendarProcesor")
    spark.sparkContext.setLogLevel("ERROR")

    spark.conf.set(
        f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net",
        "OAuth")
    spark.conf.set(
        f"fs.azure.account.oauth.provider.type.{storage_account_name}.dfs.core.windows.net",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
    spark.conf.set(
        f"fs.azure.account.oauth2.client.id.{storage_account_name}.dfs.core.windows.net",
        application_id)
    spark.conf.set(
        f"fs.azure.account.oauth2.client.secret.{storage_account_name}.dfs.core.windows.net",
        client_secret)
    spark.conf.set(
        f"fs.azure.account.oauth2.client.endpoint.{storage_account_name}.dfs.core.windows.net",
        f"https://login.microsoftonline.com/{directory_id}/oauth2/token")

    spark.sparkContext.setJobGroup("Running calendar processing",
                                   "[calendar_events_attendance_extractor]")
    logger.info(
        "[calendar_events_attendance_extractor] Preparing the jobs for analyzing the calendar data"
    )

    container_client_credential = ClientSecretCredential(
        tenant_id=directory_id,
        client_id=application_id,
        client_secret=client_secret)
    blob_service_client = BlobServiceClient(
        account_url=f"https://{storage_account_name}.blob.core.windows.net",
        credential=container_client_credential)
    container_client = blob_service_client.get_container_client(input_container)

    if input_container is not None and input_folder is not None:
        input_folder_name = input_folder
        list_of_json_files_from_folder = []
        if input_folder_name.startswith("/"):
            input_folder_name = input_folder_name[1:]
        # next will determine the latest folder to be used
        for entry in container_client.list_blobs(
                name_starts_with=input_folder_name + "/"):
            list_of_json_files_from_folder.append(entry.name)

        collected_folders = []
        for entry in list_of_json_files_from_folder:
            if "events_" not in entry:
                continue
            tokens = entry.split("/")
            standard_folder_name = tokens[1]
            try:
                tag = int(standard_folder_name.replace("events_", ""))
                collected_folders.append(tag)
            except Exception:
                pass

        if not len(collected_folders):
            raise Exception("Could not retrieve the latest run")

        collected_folders.sort(reverse=True)
        input_folder_name = input_folder_name + "/" + "events_" + str(
            collected_folders[0])

        wasb_file_path = f"abfss://{input_container}@{storage_account_name}.dfs.core.windows.net/{input_folder_name}"
        logger.info(
            f"[calendar_events_attendance_extractor] input wasb_file_path: {wasb_file_path}"
        )

        input_df = spark.read.json(wasb_file_path)
        input_df = input_df.filter(input_df.subject.contains("Watercooler Event"))
        spark_res_rdd = input_df.rdd.mapPartitions(process_spark_partitions)

        if not spark_res_rdd.isEmpty():
            spark_res_df = spark_res_rdd.map(lambda x: Row(**x)).toDF()
            from pyspark.sql.functions import max as max_
            grouped_events_df = spark_res_df.groupBy(
                "groupDisplayName", "email_address").agg(
                    max_("lastModifiedDateTime").alias("lastModifiedDateTime"))
            grouped_events_df = grouped_events_df.join(
                spark_res_df,
                ["groupDisplayName", "email_address", "lastModifiedDateTime"],
                "inner")

            table = f"""
            (SELECT day, group_name, display_name
             FROM [{database}].dbo.groups_per_day) foo
            """
            groups_per_day_df = generate_dataframe_from_table(spark, spark_args, table)

            events_attendance_df = grouped_events_df.join(
                groups_per_day_df,
                grouped_events_df.groupDisplayName == groups_per_day_df.display_name,
                "inner")
            events_attendance_df = events_attendance_df.drop("groupDisplayName")

            write_attendance_info(input_df=events_attendance_df,
                                  output_folder=output_folder,
                                  output_container=output_container,
                                  storage_account_name=storage_account_name,
                                  container_client=container_client)
    else:
        raise ValueError(
            "Not enough arguments given in order to read input data: jdbc-database & input-container are missing."
        )
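# generate_dataframe_from_table is defined elsewhere in the project; a hypothetical
# sketch of a JDBC-based implementation consistent with the jdbc_* arguments above
# (authentication options omitted):
def generate_dataframe_from_table(spark, spark_args, table):
    jdbc_url = (f"jdbc:sqlserver://{spark_args.jdbc_host}:{spark_args.jdbc_port};"
                f"database={spark_args.jdbc_database}")
    return (spark.read.format("jdbc")
            .option("url", jdbc_url)
            .option("dbtable", table)
            .load())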
df_monthly_ts = df.withColumn(
    "yearmonth",
    f.concat(f.year("editTime"), f.lit('-'),
             format_string("%02d", f.month("editTime"))))\
    .withColumn("yearmonth", col("yearmonth").cast("timestamp"))
df_monthly_ts = df_monthly_ts.groupBy("yearmonth",
                                      "title").count().orderBy(desc("count"))

df = df.withColumn(
    "yearmonth",
    f.concat(f.year("editTime"), f.lit('-'),
             format_string("%02d", f.month("editTime"))))
df_monthly = df.groupBy("yearmonth", "title").count().orderBy(desc("count"))

print("Number of edits per month over all articles: ")
df_monthly.select("title", "yearmonth", "count").show()

min_date, max_date = df_monthly_ts.select(
    min_("yearmonth").cast("long"),
    max_("yearmonth").cast("long")).first()
data = [(min_date, max_date)]
df_dates = spark.createDataFrame(data, ["minDate", "maxDate"])
df_min_max_date = df_dates.withColumn(
    "minDate", col("minDate").cast("timestamp")).withColumn(
        "maxDate", col("maxDate").cast("timestamp"))

df_formatted_ts = df_min_max_date.withColumn(
        "monthsDiff", f.months_between("maxDate", "minDate"))\
    .withColumn("repeat", f.expr("split(repeat(',', monthsDiff), ',')"))\
    .select("*", f.posexplode("repeat").alias("date", "val"))\
    .withColumn("date", f.expr("add_months(minDate, date)"))\
    .withColumn("yearmonth",
                f.concat(f.year("date"), f.lit('-'),
                         format_string("%02d", f.month("date"))))\
    .select('yearmonth')
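# A common follow-up (an assumption, not part of the original snippet) is to use the
# generated month range in df_formatted_ts to surface months with zero edits per title:
df_filled = (df_formatted_ts
             .crossJoin(df_monthly.select("title").distinct())
             .join(df_monthly, ["yearmonth", "title"], "left")
             .fillna(0, subset=["count"]))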
common_df = sqlContext.createDataFrame(rdd_join, schema)

# Working with TRADES data:
trade = common_df.filter(common_df.rec_type == "T")

# Selecting most important columns to save space
trade_df = trade.select('trade_dt', 'symbol', 'exchange', 'event_tm',
                        'event_seq_nb', 'trade_pr', 'execution_id',
                        'trade_size', 'arrival_tm')

# Grouping by to get the latest value
trade_gb = trade_df.withColumn(
    "arrival_tm", col("arrival_tm").cast("timestamp")).groupBy(
        'trade_dt', 'symbol', 'exchange', 'event_tm',
        'event_seq_nb').agg(max_('arrival_tm'))

# Joining the data with the latest arrival_tm to get the rest of the columns
trade_complete = trade_gb.join(
    trade, ['trade_dt', 'symbol', 'exchange', 'event_tm', 'event_seq_nb'],
    how='left')

# Working with QUOTES data:
quotes = common_df.filter(common_df.rec_type == "Q")

# Selecting most important columns to save space
quotes_df = quotes.select('trade_dt', 'symbol', 'exchange', 'event_tm',
                          'event_seq_nb', 'trade_pr', 'execution_id',
                          'trade_size', 'arrival_tm')

# Grouping by to get the latest value
def process_demographic_data(spark, input_data, output_data):
    """
    Process the demographic data by dropping duplicate rows and creating a new
    column 'Major Race' based on a group-by.

    Parameters:
        spark: the spark session
        input_data: the path of the input folder of the data on the local machine
        output_data: the output folder in the S3 bucket
    """
    demo_data = input_data + 'us-cities-demographics.csv'
    demo_df = spark.read.format('csv').options(header='true',
                                               sep=';').load(demo_data)

    demo_df = demo_df.select(
        'City', 'State', 'Median Age', 'Male Population', 'Female Population',
        'Total Population', 'Foreign-born', 'State Code', 'Race',
        'Count').drop_duplicates(subset=['City', 'State', 'Race'])
    demo_df = demo_df.withColumn("Count", col("Count").cast(IntegerType()))

    # Using group by to know the major race of every city
    group_df = demo_df.groupby(
        'City', 'State', 'Median Age', 'Male Population', 'Female Population',
        'Total Population', 'Foreign-born',
        'State Code').pivot('Race').agg(max_('Count'))
    group_df = group_df.na.fill({
        'Hispanic or Latino': 0,
        'White': 0,
        'Asian': 0,
        'Black or African-American': 0,
        'American Indian and Alaska Native': 0
    })

    cols = group_df.columns[8:13]
    maxcol = F.udf(lambda row: cols[row.index(max(row))], StringType())
    group_df = group_df.withColumn(
        "Major Race",
        maxcol(F.struct([group_df[x] for x in group_df.columns[8:13]])))

    group_df.write.option("header", "true").csv(output_data + 'demographic_data/')
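# Imports assumed by process_demographic_data (a sketch based on the names used above):
from pyspark.sql import functions as F
from pyspark.sql.functions import col, max as max_
from pyspark.sql.types import IntegerType, StringType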
line_decay_df.show()

created_in_cohorts_df = line_decay_df.groupBy("create_cohort")\
    .agg( \
        count_(lit_(1)).alias("total_in_cohort"), \
        avg_(col_("time_between")).alias("avg_time_between") \
    )

removed_in_cohorts_df = line_decay_df.groupBy("create_cohort", "remove_cohort")\
    .count()\
    .withColumnRenamed("count", "removed_in_this_cohort")

lifespand_days_in_cohort_df = line_decay_df.groupBy("create_cohort", "remove_cohort")\
    .agg(max_(col_("lifespan")))\
    .withColumnRenamed("max(lifespan)", "lifespan")

lifespand_days_in_cohort_df.show()

removed_cohorts_with_lifespan_df = removed_in_cohorts_df.join(lifespand_days_in_cohort_df, \
        (removed_in_cohorts_df.create_cohort == lifespand_days_in_cohort_df.create_cohort) & \
        (removed_in_cohorts_df.remove_cohort == lifespand_days_in_cohort_df.remove_cohort)) \
    .drop(lifespand_days_in_cohort_df.create_cohort) \
    .drop(lifespand_days_in_cohort_df.remove_cohort)

cohort_df = created_in_cohorts_df.join(removed_cohorts_with_lifespan_df, "create_cohort", "left_outer") \
    .orderBy("create_cohort", "lifespan")

raw_data = cohort_df.collect()
processed_data = []
def _transform_data(self, df, df_base, df_processed, id_base, id_new, bl_processed,
                    aux_perceptor, aux_servicios):
    """Transform original dataset.

    :param df: Input DataFrame.
    :param bl_processed: processed blacklist DataFrame.
    :return: Transformed DataFrame.
    """
    # Cast key variables and rename headers
    df = df.withColumn('id_siniestro', df.id_siniestro.cast(IntegerType()))

    if self._is_diario:
        checklist5 = checklist_spark.checklist5(df_reserva=df_base,
                                                df_reserva_new=df,
                                                df_id=id_base,
                                                df_id_new=id_new)
    else:
        checklist5 = checklist_spark.checklist5(df_reserva=df, df_id=id_base)

    df = df.join(checklist5, on='id_siniestro', how='left')
    df = df.fillna({'checklist5_poliza': 0, 'checklist5_nif': 0})
    del checklist5

    # INITIAL INDEMNITY RESERVE
    reserva_indem = df.select(['id_siniestro', 'po_res_cobertura_id', 'po_res_indem'])
    reserva_indem = reserva_indem.dropDuplicates(
        subset=['id_siniestro', 'po_res_cobertura_id', 'po_res_indem'])
    reserva_indem = reserva_indem.drop('po_res_cobertura_id')
    reserva_indem = reserva_indem.withColumn(
        'po_res_indem', reserva_indem.po_res_indem.cast(FloatType()))
    reserva_indem = reserva_indem.groupBy(['id_siniestro']).agg(
        sum_('po_res_indem').alias('po_res_indem'))
    reserva_indem = reserva_indem.withColumn(
        'po_res_indem_mayor_5000',
        when(reserva_indem['po_res_indem'] >= 5000, 1).otherwise(0))
    df = df.drop('po_res_indem')
    df = df.join(reserva_indem, on='id_siniestro', how='left')
    del reserva_indem

    # INITIAL EXPENSE RESERVE
    reserva_gasto = df.select(['id_siniestro', 'po_res_cobertura_id', 'po_res_gasto'])
    reserva_gasto = reserva_gasto.dropDuplicates(
        subset=['id_siniestro', 'po_res_cobertura_id', 'po_res_gasto'])
    reserva_gasto = reserva_gasto.drop('po_res_cobertura_id')
    reserva_gasto = reserva_gasto.withColumn(
        'po_res_gasto', reserva_gasto.po_res_gasto.cast(FloatType()))
    reserva_gasto = reserva_gasto.groupBy('id_siniestro').agg(
        sum_('po_res_gasto').alias('po_res_gasto'))
    reserva_gasto = reserva_gasto.withColumn(
        'po_res_gasto_mayor_1000',
        when(reserva_gasto['po_res_gasto'] >= 1000, 1).otherwise(0))
    reserva_gasto = reserva_gasto.drop('po_res_gasto')
    df = df.join(reserva_gasto, on='id_siniestro', how='left')
    del reserva_gasto

    # COUNT OF POLICIES PER CLAIM
    df = df.withColumn('pondera_siniestro', lit(1))
    w = (Window().partitionBy('id_siniestro').rowsBetween(-sys.maxsize, sys.maxsize))
    df = df.withColumn('po_reserva_pagoxsiniestro_count',
                       sum_(df['pondera_siniestro']).over(w))
    df = df.withColumn('po_reserva_indemxsiniestro_count',
                       sum_(df['po_pago_indicador_indem']).over(w))

    # CANCELLED INDEMNITY PAYMENTS: when the cancellation flag is == 1, mark both the
    # payment and its cancellation
    df = df.withColumn(
        'po_pago_indemnizacion_importe_neto',
        when(df['po_pago_es_anulacion'] == 1, 1).otherwise(
            df['po_pago_indemnizacion_importe_neto']))

    # CANCELLED INDEMNITY EXPENSES: when the expense cancellation flag is == 1, mark
    # both the payment and its cancellation
    df = df.withColumn(
        'po_gasto_indemnizacion_importe_neto',
        when(df['po_gasto_es_anulacion'] == 1, 1).otherwise(df['po_gasto_es_anulacion']))

    # PAYMENTS: add the net invoice amount + the net indemnity payments
    df = df.withColumn(
        'po_pago_importe_neto',
        df['po_pago_factura_importe_neto'].cast(FloatType()) +
        df['po_pago_indemnizacion_importe_neto'].cast(FloatType()))

    # EXPENSES:
    df = df.withColumn(
        'po_gasto_importe_neto',
        df['po_gasto_factura_importe_neto'].cast(FloatType()) +
        df['po_gasto_indemnizacion_importe_neto'].cast(FloatType()))

    # INSURED'S PAYMENT: if the person is not the insured, set the insured's amounts to 0
    df = df.withColumn(
        'po_pago_importe_neto_ASEGURADO',
        when(df['persona_objeto_asegurado'].cast(IntegerType()) == 0, 0).otherwise(
            df['po_pago_importe_neto']))

    # PERCENTAGE AMOUNT ACTUALLY COLLECTED BY THE INSURED:
    # importe_neto_asegurado / importe_total
    df = df.withColumn(
        'po_pago_importe_porcentual_ASEGURADO',
        df['po_pago_importe_neto_ASEGURADO'] / (df['po_pago_importe_neto'] + 1))

    # IBAN Blacklist
    bl_processed_iban = bl_processed.filter(~((bl_processed['iban'].isNull()) |
                                              (bl_processed['iban'] == '?')))
    bl_processed_iban = bl_processed_iban.select('iban')
    bl_processed_iban = bl_processed_iban.dropDuplicates(subset=['iban'])
    df = df.join(bl_processed_iban, df.po_pago_IBAN == bl_processed_iban.iban, how='left')
    df = df.withColumn('peritaje_pago_iban_blacklist',
                       when(df['iban'].isNull(), 0).otherwise(1))
    df = df.drop('iban')
    df = df.join(bl_processed_iban, df.po_gasto_IBAN == bl_processed_iban.iban, how='left')
    df = df.withColumn('peritaje_gasto_iban_blacklist',
                       when(df['iban'].isNull(), 0).otherwise(1))
    df = df.drop('iban')
    del bl_processed_iban

    # INT Variables
    # Aggregate the INT values per claim and store them in the list of INTs
    int_var = ['peritaje_pago_iban_blacklist', 'peritaje_gasto_iban_blacklist']
    int_outliers = []
    for col in int_var:
        count = col + '_count'
        df = df.withColumn(count, sum_(df[col]).over(w))
        int_outliers.append(count)
        df = df.drop(col)

    # RESERVABLE PERSON/OBJECT
    df = df.withColumn(
        'id_persona_objeto_reservable_max',
        max_(df['id_persona_objeto_reservable'].cast(IntegerType())).over(w))
    df = df.drop('id_persona_objeto_reservable')

    # CATEGORICAL VARIABLE
    # Redefine pago_gasto_codigo as a categorical variable
    asegurado = STRING.Parameters.Asegurado_Beneficiario_Perjudicado
    profesional = STRING.Parameters.Profesional_Legal
    detective = STRING.Parameters.Detective
    perito = STRING.Parameters.Perito
    reparador = STRING.Parameters.Reparador
    otros = STRING.Parameters.todos
    df = df.withColumn('po_pago_gasto_codigo', df.po_pago_gasto_codigo.cast(StringType()))
    df = df.withColumn(
        'po_pago_gasto_codigo',
        when(df['po_pago_gasto_codigo'].isin(otros),
             df['po_pago_gasto_codigo']).otherwise('Otros'))
    df = df.withColumn(
        'po_pago_gasto_codigo',
        when(df['po_pago_gasto_codigo'].isin(asegurado),
             'Asegurado_Beneficiario_Perjudicado').otherwise(df['po_pago_gasto_codigo']))
    df = df.withColumn(
        'po_pago_gasto_codigo',
        when(df['po_pago_gasto_codigo'].isin(profesional),
             'Profesional_Legal').otherwise(df['po_pago_gasto_codigo']))
    df = df.withColumn(
        'po_pago_gasto_codigo',
        when(df['po_pago_gasto_codigo'].isin(detective),
             'Detective').otherwise(df['po_pago_gasto_codigo']))
    df = df.withColumn(
        'po_pago_gasto_codigo',
        when(df['po_pago_gasto_codigo'].isin(perito),
             'Perito').otherwise(df['po_pago_gasto_codigo']))
    df = df.withColumn(
        'po_pago_gasto_codigo',
        when(df['po_pago_gasto_codigo'].isin(reparador),
             'Reparador').otherwise(df['po_pago_gasto_codigo']))

    # GARANTIA (guarantee)
    garantia = STRING.Parameters.dict_garantias
    funct = udf(lambda x: f.replace_dict_contain(x, key_values=garantia), StringType())
    df = df.withColumn('po_res_garantia', funct(df['po_res_garantia']))

    # COBERTURA (coverage)
    cobertura_1 = STRING.Parameters.dict_cobertura_1
    funct = udf(lambda x: f.replace_dict_contain(x, key_values=cobertura_1), StringType())
    df = df.withColumn('po_res_cobertura', funct(df['po_res_cobertura']))
    df = df.withColumn(
        'po_res_cobertura',
        when(df['po_res_cobertura'] == 'DE',
             'ELECTRICIDAD').otherwise(df['po_res_cobertura']))
    cobertura_2 = STRING.Parameters.dict_cobertura_2
    funct = udf(lambda x: f.replace_dict_contain(x, key_values=cobertura_2), StringType())
    df = df.withColumn('po_res_cobertura', funct(df['po_res_cobertura']))

    # Convert all the categorical variables into dummies
    categorical_var = ['po_res_garantia', 'po_res_cobertura', 'po_res_situacion',
                       'po_pago_medio', 'po_pago_gasto_codigo']
    variable_dummy = []
    funct = udf(lambda x: f.normalize_string(x), StringType())
    for col in categorical_var:
        df = df.withColumn(col, df[col].cast(StringType()))
        df = df.withColumn(col, funct(df[col]))
        types = df.select(col).distinct().collect()
        types = [ty[col] for ty in types]
        types_list = [when(df[col] == ty, 1).otherwise(0).alias('d_' + col + '_' + ty)
                      for ty in types if ty not in [None, '0', 0]]
        df = df.select(list(df.columns) + types_list)
        variable_dummy += ['d_' + col + '_' + ty for ty in types if ty not in [None, '0', 0]]
        df = df.drop(col)

    # Aggregate the dummies per claim
    variable_dummy += ['po_pago_indicador_indem', 'po_pago_rehusado',
                       'po_pago_gasto_codigo_detective', 'persona_objeto_asegurado']
    for col in variable_dummy:
        name = col + '_count'
        df = df.fillna({col: 0})
        df = df.withColumn(col, df[col].cast(IntegerType()))
        df = df.withColumn(name, sum_(df[col]).over(w))
        df = df.drop(col)

    # Rescale the situation dummies so they are expressed as percentages:
    for col in list(df.columns):
        if col.startswith('d_po_res_situacion'):
            df = df.withColumn(col, df[col] / df['po_reserva_pagoxsiniestro_count'])

    # Rescale po_pago_indicador_indem so it is expressed as a percentage
    df = df.withColumn(
        'po_pago_indicador_indem_count',
        df['po_pago_indicador_indem_count'] / df['po_reserva_pagoxsiniestro_count'])

    # FLOAT VARIABLES
    # Aggregate the FLOAT variables per claim and keep them for outlier analysis
    variable_float_perceptor = ['po_pago_factura_importe_neto',
                                'po_pago_indemnizacion_importe_neto',
                                'po_gasto_factura_importe_neto',
                                'po_gasto_indemnizacion_importe_neto',
                                'po_pago_importe_neto',
                                'po_gasto_importe_neto',
                                'po_pago_importe_neto_ASEGURADO',
                                'po_pago_importe_porcentual_ASEGURADO',
                                ]
    float_outliers = []
    for col in variable_float_perceptor:
        name = col + '_count'
        df = df.fillna({col: 0})
        df = df.withColumn(col, df[col].cast(FloatType()))
        df = df.withColumn(name, sum_(df[col]).over(w))
        float_outliers.append(name)

    # Porcentual_Asegurado: rescale it so it reflects the percentage value
    df = df.withColumn(
        'po_pago_importe_porcentual_ASEGURADO_count',
        df['po_pago_importe_porcentual_ASEGURADO_count'] /
        df['po_reserva_indemxsiniestro_count'])

    # PAYEE (PERCEPTOR) AND SERVICES TABLE
    df = df.withColumn(
        'po_pago_perceptor',
        when(df['po_pago_perceptor'].startswith('AIDE ASISTENCIA'),
             'AIDE ASISTENCIA').otherwise(df['po_pago_perceptor']))

    if self._is_diario:
        df = df.join(aux_perceptor, on='po_pago_perceptor', how='left')
        df = df.join(aux_servicios.drop('id_siniestro'), on='po_gasto_perceptor', how='left')
        for col in list(aux_perceptor.columns) + list(aux_servicios.columns):
            if col not in ['po_pago_perceptor', 'po_gasto_perceptor']:
                df = df.fillna({col: 0})
        del aux_perceptor
        del aux_servicios
    else:
        # PAYEE (PERCEPTOR) TABLE
        df = df.withColumn(
            'po_pago_perceptor',
            when(df['po_pago_perceptor'].isin([0, '0']),
                 None).otherwise(df['po_pago_perceptor']))
        w_perceptor = (Window().partitionBy('po_pago_perceptor').rowsBetween(
            -sys.maxsize, sys.maxsize))
        # By payments
        df = df.withColumn('pondera_perceptor',
                           when(df['po_pago_perceptor'].isNotNull(), 1).otherwise(0))
        df = df.withColumn('po_pagos_total_countxperceptor',
                           sum_(df['pondera_perceptor']).over(w_perceptor))
        df = df.drop('pondera_perceptor')
        # Payment cancellation
        df = df.fillna({'po_pago_es_anulacion': 0})
        df = df.withColumn(
            'po_pago_es_anulacion_countxperceptor',
            sum_(df['po_pago_es_anulacion'].cast(IntegerType())).over(w_perceptor))
        df = df.drop('po_pago_es_anulacion')

        variable_float_perceptor = ['po_pago_factura_importe_neto',
                                    'po_pago_indemnizacion_importe_neto']
        # Note: negative values are fine because they are recoveries from other companies
        for col in variable_float_perceptor:
            name = col + '_countxperceptor'
            df = df.withColumn(name, sum_(df[col].cast(FloatType())).over(w_perceptor))
            df = df.drop(col)

        # Count at claim level
        df = df.withColumn('po_pagoxsiniestro_countxperceptor',
                           size(collect_set(df['id_siniestro']).over(w_perceptor)))

        # Get the average levels per payee-claim
        for col in ['po_pago_factura_importe_neto_countxperceptor',
                    'po_pago_indemnizacion_importe_neto_countxperceptor']:
            df = df.withColumn(col + 'xpromediosiniestro',
                               df[col] / df['po_pagoxsiniestro_countxperceptor'])

        # Does the payee appear in the blacklist, and how many times?
        # WAS IT A FRAUDULENT CLAIM? We check whether the id_siniestro is associated
        # with a previous fraud claim
        bl_processed = bl_processed.select('id_siniestro').dropDuplicates(
            subset=['id_siniestro'])
        bl_processed = bl_processed.withColumn('po_reserva_perceptor_fraud', lit(1))
        df = df.join(bl_processed, on='id_siniestro', how='left')
        df = df.withColumn(
            'po_reserva_perceptor_fraud',
            when(df['po_reserva_perceptor_fraud'].isNull(), 0).otherwise(
                df['po_reserva_perceptor_fraud']))
        perc_aux = df.select([
            'po_pago_perceptor', 'id_siniestro', 'po_reserva_perceptor_fraud'
        ]).dropDuplicates(subset=['po_pago_perceptor', 'id_siniestro'])
        perc_aux = perc_aux.groupBy('po_pago_perceptor').agg(
            sum_('po_reserva_perceptor_fraud').alias('po_fraude_countxperceptor'))
        df = df.join(perc_aux, on='po_pago_perceptor', how='left')
        del perc_aux
        df = df.withColumn(
            'po_fraude_porcentaje_perceptor',
            df['po_fraude_countxperceptor'] / df['po_pagoxsiniestro_countxperceptor'])

        df.select(['po_pago_perceptor', 'po_pagos_total_countxperceptor',
                   'po_pago_es_anulacion_countxperceptor',
                   'po_pago_factura_importe_neto_countxperceptor',
                   'po_pago_indemnizacion_importe_neto_countxperceptor',
                   'po_pagoxsiniestro_countxperceptor',
                   'po_pago_factura_importe_neto_countxperceptorxpromediosiniestro',
                   'po_pago_indemnizacion_importe_neto_countxperceptorxpromediosiniestro',
                   'po_fraude_countxperceptor',
                   'po_fraude_porcentaje_perceptor']).dropDuplicates(
            subset=['po_pago_perceptor']).coalesce(1).write.mode("overwrite").option(
            "header", "true").option("sep", ";").csv(STRING.training_auxiliar_perceptor)

        # SERVICES TABLE
        df = df.withColumn(
            'po_gasto_perceptor',
            when(df['po_gasto_perceptor'].isin([0, '0']),
                 None).otherwise(df['po_gasto_perceptor']))
        w_servicio = (Window().partitionBy('po_gasto_perceptor').rowsBetween(
            -sys.maxsize, sys.maxsize))
        # By payments
        df = df.withColumn('pondera_servicio',
                           when(df['po_gasto_perceptor'].isNotNull(), 1).otherwise(0))
        df = df.withColumn('po_pagos_total_countxservicios',
                           sum_(df['pondera_servicio']).over(w_servicio))
        df = df.drop('pondera_servicio')

        variable_float_servicio = ['po_gasto_factura_importe_neto',
                                   'po_gasto_indemnizacion_importe_neto']
        # Note: negative values are fine because they are recoveries from other companies
        for col in variable_float_servicio:
            name = col + '_countxservicios'
            df = df.withColumn(name, sum_(df[col].cast(FloatType())).over(w_servicio))
            df = df.drop(col)

        # Count at claim level
        df = df.withColumn('po_pagoxsiniestro_countxservicios',
                           size(collect_set(df['id_siniestro']).over(w_servicio)))

        # Get the overall average per service
        for col in ['po_gasto_factura_importe_neto_countxservicios',
                    'po_gasto_indemnizacion_importe_neto_countxservicios']:
            df = df.withColumn(col + 'xpromediosiniestro',
                               df[col] / df['po_pagoxsiniestro_countxservicios'])

        # Does the payee appear in the blacklist, and how many times?
        # WAS IT A FRAUDULENT CLAIM? We check whether the id_siniestro is associated
        # with a previous fraud claim
        bl_processed = bl_processed.withColumn('po_reserva_servicios_fraud', lit(1))
        df = df.join(bl_processed[['id_siniestro', 'po_reserva_servicios_fraud']],
                     on='id_siniestro', how='left')
        df = df.withColumn(
            'po_reserva_servicios_fraud',
            when(df['po_reserva_servicios_fraud'].isNull(), 0).otherwise(
                df['po_reserva_servicios_fraud']))
        del bl_processed
        serv_aux = df.select([
            'po_gasto_perceptor', 'id_siniestro', 'po_reserva_servicios_fraud'
        ]).dropDuplicates(subset=['po_gasto_perceptor', 'id_siniestro'])
        serv_aux = serv_aux.groupBy('po_gasto_perceptor').agg(
            sum_('po_reserva_servicios_fraud').alias('po_fraude_countxservicios'))
        df = df.join(serv_aux, on='po_gasto_perceptor', how='left')
        del serv_aux
        df = df.withColumn(
            'po_fraude_porcentaje_servicios',
            df['po_fraude_countxservicios'] / df['po_pagoxsiniestro_countxservicios'])

        df.select(['po_gasto_perceptor', 'id_siniestro',
                   'po_pagos_total_countxservicios',
                   'po_gasto_factura_importe_neto_countxservicios',
                   'po_gasto_indemnizacion_importe_neto_countxservicios',
                   'po_pagoxsiniestro_countxservicios',
                   'po_gasto_factura_importe_neto_countxserviciosxpromediosiniestro',
                   'po_gasto_indemnizacion_importe_neto_countxserviciosxpromediosiniestro',
                   'po_fraude_countxservicios',
                   'po_fraude_porcentaje_servicios']).dropDuplicates(
            subset=['po_gasto_perceptor']).coalesce(1).write.mode("overwrite").option(
            "header", "true").option("sep", ";").csv(STRING.training_auxiliar_servicios)

    df = df.drop(*['po_pago_perceptor', 'po_gasto_perceptor'])
    df = df.dropDuplicates(subset=['id_siniestro'])
    df = df.fillna({'po_fraude_countxservicios': 0, 'po_fraude_porcentaje_servicios': 0})

    if self._is_diario:
        df_processed = df_processed.select(int_outliers + float_outliers)
        for col in int_outliers + float_outliers:
            df = outliers.Outliers.outliers_test_values(df, df_processed, col,
                                                        not_count_zero=True)
    else:
        for col in int_outliers + float_outliers:
            df = outliers.Outliers.outliers_mad(df, col, not_count_zero=True)

    # DELETE VARIABLES
    del_variables = ['id_poliza', 'version_poliza', "po_res_garantia_id",
                     "po_res_cobertura_id", 'po_res_limite', "po_pago_IBAN",
                     "po_pago_emision", "po_pago_factura_fecha_emision",
                     'po_pago_factura_importe_neto', 'po_pago_indemnizacion_importe_neto',
                     'po_gasto_IBAN', 'po_gasto_emision', 'po_gasto_factura_fecha_emision',
                     'po_pago_es_anulacion', 'po_gasto_es_anulacion', 'pondera_siniestro',
                     'po_gasto_perceptor', 'po_gasto_factura_importe_neto',
                     'po_gasto_indemnizacion_importe_neto', 'po_pago_importe_neto',
                     'po_gasto_importe_neto', 'po_pago_importe_neto_ASEGURADO',
                     'po_pago_importe_porcentual_ASEGURADO',
                     'audit_poliza_producto_tecnico', 'audit_siniestro_codigo_compania',
                     'po_reserva_indemxsiniestro_count']
    df = df.drop(*del_variables)

    return df
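# f.replace_dict_contain is a project helper that is not shown here; a hypothetical
# sketch consistent with how it is used above (map a value to a canonical label when
# it contains one of the dictionary keys):
def replace_dict_contain(x, key_values):
    """Return the label of the first key contained in x (sketch, not the original)."""
    if x is None:
        return x
    for key, label in key_values.items():
        if key in x:
            return label
    return x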
########### SOLUTION TO PART 1 #############
## This returns an aggregated dataframe with the client_port and all the activities, in the same time interval
df_time.orderBy("client_port", "Date", "Interval").show()

########### SOLUTION TO PART 2 ###############
## Session time = in every session interval, for every IP, the difference between the
## maximum and minimum Time of request
from pyspark.sql.functions import col, max as max_
from pyspark.sql.functions import col, min as min_

## find the maximum timestamp for every IP in every interval
dfmaxtime = df_time.withColumn("timestamp", col("timestamp").cast("timestamp")).groupBy(
    "client_port", "Date", "Interval").agg(max_("Time"))

## find the minimum timestamp for every IP in every interval
dfmintime = df_time.withColumn("timestamp", col("timestamp").cast("timestamp")).groupBy(
    "client_port", "Date", "Interval").agg(min_("Time"))

## merge the two dataframes and calculate the difference between the timestamps
dftime = dfmaxtime.join(dfmintime, ['client_port', 'Date', 'Interval'])

## changing column names
dftime = dftime.select(
    col("client_port").alias("client_port"),
    col("Date").alias("Date"),
    col("Interval").alias("Interval"),
strains_US = strains_US \
    .withColumn("new_date", date_format_udf(col("date"))) \
    .drop("date") \
    .withColumnRenamed("new_date", "date")

# converting to date type (note: "MM" is the month pattern; lowercase "mm" means minutes)
strains_US = strains_US \
    .withColumn("new_date", to_date("date", "yyyy-MM-dd")) \
    .drop("date") \
    .withColumnRenamed("new_date", "date")

# COMMAND ----------

import datetime
from pyspark.sql.functions import col, max as max_, min as min_

strains_US.agg(max_("date")).show()
strains_US.agg(min_("date")).show()
# All strains in the USA were collected between 01-01-2020 and 01-31-2020

# COMMAND ----------

# 1) UDF to categorize strains based on date collected
def categorize(x):
    if x <= datetime.datetime.strptime("2020-01-10", "%Y-%m-%d").date():
        return 1
    elif x <= datetime.datetime.strptime("2020-01-20", "%Y-%m-%d").date():
        return 2
    else:
start_time = time.time()
df_monthly = df.groupBy("yearmonth", "title").count().orderBy(desc("count"))
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mgroupby2')

print("Number of edits per month over all articles: ")
start_time = time.time()
df_monthly.select("title", "yearmonth", "count").show()
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mselectshow')

start_time = time.time()
min_date, max_date = df_monthly_ts.select(
    min_("yearmonth").cast("long"),
    max_("yearmonth").cast("long")).first()
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mselect2')

data = [(min_date, max_date)]
start_time = time.time()
df_dates = spark.createDataFrame(data, ["minDate", "maxDate"])
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mcreateframe')

start_time = time.time()
df_min_max_date = df_dates.withColumn(
    "minDate", col("minDate").cast("timestamp")).withColumn(
        "maxDate", col("maxDate").cast("timestamp"))
end_time = time.time()