## Session time = In every session interval, for every IP, difference between the maximum and minimum Time of request

from pyspark.sql import functions as F
from pyspark.sql.functions import col, max as max_, min as min_

## find maximum timestamp for every IP in every interval
dfmaxtime = df_time.withColumn("timestamp", col("timestamp").cast("timestamp")) \
                   .groupBy("client_port", "Date", "Interval") \
                   .agg(max_("Time"))

## find minimum timestamp for every IP in every interval
dfmintime = df_time.withColumn("timestamp", col("timestamp").cast("timestamp")) \
                   .groupBy("client_port", "Date", "Interval") \
                   .agg(min_("Time"))

## merge the two dataframes so the difference between the timestamps can be calculated
dftime = dfmaxtime.join(dfmintime, ['client_port', 'Date', 'Interval'])

## changing column names
dftime = dftime.select(
    col("client_port").alias("client_port"),
    col("Date").alias("Date"),
    col("Interval").alias("Interval"),
    col("max(Time)").alias("Time1"),
    col("min(Time)").alias("Time2"))

## Concatenating Date and Time so that they can be parsed as a timestamp
dftime = dftime.withColumn('TimeCon1', F.concat(F.col('Date'), F.lit('T'), F.col('Time1')))
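## A minimal sketch of the remaining step, assuming the snippet continues the same way
## for the minimum time: build 'TimeCon2' and take the difference of the two parsed
## timestamps as the session time in seconds ('TimeCon2' and 'SessionTime' are assumed names).
dftime = dftime.withColumn('TimeCon2', F.concat(F.col('Date'), F.lit('T'), F.col('Time2')))
dftime = dftime.withColumn(
    'SessionTime',
    F.unix_timestamp(F.col('TimeCon1').cast('timestamp'))
    - F.unix_timestamp(F.col('TimeCon2').cast('timestamp')))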
    .withColumn("new_date", date_format_udf(col("date"))) \
    .drop("date") \
    .withColumnRenamed("new_date", "date")

# converting to date type
strains_US = strains_US \
    .withColumn("new_date", to_date("date", "yyyy-MM-dd")) \
    .drop("date") \
    .withColumnRenamed("new_date", "date")

# COMMAND ----------

import datetime
from pyspark.sql.functions import col, max as max_, min as min_

strains_US.agg(max_("date")).show()
strains_US.agg(min_("date")).show()
# All strains in the USA were collected between 2020-01-01 and 2020-01-31

# COMMAND ----------

# 1) UDF to categorize strains based on date collected
def categorize(x):
    if x <= datetime.datetime.strptime("2020-01-10", "%Y-%m-%d").date():
        return 1
    elif x <= datetime.datetime.strptime("2020-01-20", "%Y-%m-%d").date():
        return 2
    else:
        return 3
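# COMMAND ----------

# A minimal sketch of applying the categorizer above as a Spark UDF; the output
# column name "collection_period" is an assumption, not from the original notebook.
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

categorize_udf = udf(categorize, IntegerType())
strains_US = strains_US.withColumn("collection_period", categorize_udf(col("date")))
strains_US.select("date", "collection_period").show(5)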
def _transform(self, df, auxiliar_train):
    if not self.train_file:
        auxiliar_train = auxiliar_train.drop('WinningBid')
        auxiliar_train = auxiliar_train.withColumn('test', lit(0))
        df = df.withColumn('test', lit(1))
        df = auxiliar_train.union(df)
        del auxiliar_train

    # We create the time as Index
    split_col = split(df['ApproximateDate'], ' ')
    df = df.withColumn('time', split_col.getItem(1))  # time

    # Hour Index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'),
                     IntegerType())
    df = df.withColumn('hms_index', func_index(df['time']))

    # We order by UserID-Date
    df = df.orderBy(['UserID', 'hms_index'])

    # We check Null Values
    df.select([count_(when(isnan(c), c)).alias(c) for c in df.columns]).show()

    # We create a rank of users by how many times in the past they saw an ad
    w = (Window().partitionBy(df.UserID).orderBy('time').rowsBetween(
        Window.unboundedPreceding, 0))
    df = df.withColumn('user_id_acumulative', count_(df['UserID']).over(w))

    # Number of Ads/User/Second
    df = df.withColumn('key_id',
                       concat(df['UserID'], lit(' '), df['hms_index']))
    w = (Window().partitionBy(df.key_id).orderBy('hms_index').rowsBetween(
        -sys.maxsize, sys.maxsize))
    df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

    # Number of Ads/User
    df_group = df.groupby(['key_id']).agg(count_('key_id').alias('count_ads'))
    split_col = split(df_group['key_id'], ' ')
    df_group = df_group.withColumn('UserID', split_col.getItem(0))
    w = (Window().partitionBy(df_group.UserID).orderBy('key_id').rowsBetween(
        Window.unboundedPreceding, 0))
    df_group = df_group.withColumn('number_ads_user',
                                   sum_(df_group.count_ads).over(w))
    df_group = df_group.select(['key_id', 'number_ads_user'])
    df = df.join(df_group, how='left', on='key_id')
    del df_group

    # Number of Users/Second
    w = (Window().partitionBy(df.ApproximateDate).rowsBetween(
        -sys.maxsize, sys.maxsize))
    df = df.withColumn('number_user_second',
                       approx_count_distinct(df.UserID).over(w))
    # Number of Ads/Second
    df = df.withColumn('number_ads_second', count_(df.ApproximateDate).over(w))

    # Browser Dummy Transformation
    types = df.select('Browser').distinct().collect()
    types = [val['Browser'] for val in types]
    new_cols = [
        when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty)
        for ty in types
    ]
    df = df.select(df.columns + new_cols)

    # Decompose Date Variables
    df = df.withColumn('date', to_date(df['ApproximateDate']))      # date
    df = df.withColumn('month', month(df['ApproximateDate']))       # month
    df = df.withColumn('day', dayofmonth(df['ApproximateDate']))    # day
    df = df.withColumn('weekday',
                       dayofweek(df['ApproximateDate']))            # weekday 1=Sunday
    df = df.withColumn('hour', hour(df['time']))                    # hour
    df = df.withColumn('minute', minute(df['time']))                # minute

    # Peak Hour
    df = df.withColumn('peak6am8am',
                       when(df['hour'].between(6, 8), 1).otherwise(0))
    df = df.withColumn('peak14pm16pm',
                       when(df['hour'].between(14, 16), 1).otherwise(0))

    # Minute Index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'),
                     IntegerType())
    df = df.withColumn('hm_index', func_index(df['time']))

    # Convert to time-series by Minute: we reduce to minutes
    df_time_serie_ads = df.select([
        'hms_index', 'hm_index', 'number_user_second', 'number_ads_second'
    ]).drop_duplicates()
    df_time_serie_user = df.select(['UserID', 'hm_index']).drop_duplicates()

    # Group-by the values
    df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(
        approx_count_distinct('UserID'))
    df_time_serie_ads = df_time_serie_ads.groupBy('hm_index').agg({
        'number_ads_second': 'sum'
    }).drop_duplicates(subset=['hm_index'])
    # Join ads-users per minute
    df_time_serie = df_time_serie_ads.join(df_time_serie_user,
                                           how='left',
                                           on='hm_index')
    del df_time_serie_ads, df_time_serie_user

    # Rename columns
    df_time_serie = df_time_serie.withColumnRenamed(
        'sum(number_ads_second)', 'number_ads_minute').withColumnRenamed(
            'approx_count_distinct(UserID)', 'number_user_minute')

    # Resample Range of Minutes
    resample_range = list(
        range(
            df_time_serie.select(min_(col('hm_index'))).limit(1).collect()[0][0],
            df_time_serie.select(max_(col('hm_index'))).limit(1).collect()[0][0] + 1,
            1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = resample_range.join(
        df_time_serie,
        how='left',
        on=resample_range.value == df_time_serie.hm_index).drop(
            *['hm_index']).fillna(0)

    # Create Lags By Minutes
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_min_lag > 0:
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_user_minute').over(w).alias('ar1_number_user_minute'))
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_ads_minute').over(w).alias('ar1_number_ads_minute'))
        if self.ar_min_lag > 1:
            for l in range(2, self.ar_min_lag + 1, 1):
                df_time_serie = df_time_serie.select(
                    '*',
                    lag('ar' + str(l - 1) + '_number_user_minute').over(w).alias(
                        'ar' + str(l) + '_number_user_minute'))
                df_time_serie = df_time_serie.select(
                    '*',
                    lag('ar' + str(l - 1) + '_number_ads_minute').over(w).alias(
                        'ar' + str(l) + '_number_ads_minute'))

    # Remove the lagged Null Values
    df_time_serie = df_time_serie.dropna()

    # Join and remove lag Null values of the first minute
    # ('value' is the resampled minute index; 'hm_index' was dropped after the resample join)
    df = df.orderBy(['UserID', 'hms_index'])
    df = df.join(df_time_serie.orderBy(['value']),
                 how='left',
                 on=df.hm_index == df_time_serie.value).drop('value')

    # Convert to time-series and resample by Seconds
    df_time_serie = df.select(
        ['hms_index', 'number_user_second', 'number_ads_second']).drop_duplicates()
    resample_range = list(
        range(
            df_time_serie.select(min_(col('hms_index'))).limit(1).collect()[0][0],
            df_time_serie.select(max_(col('hms_index'))).limit(1).collect()[0][0] + 1,
            1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = resample_range.join(
        df_time_serie,
        how='left',
        on=resample_range.value == df_time_serie.hms_index).drop(
            *['hms_index']).fillna(0)

    # Create lags
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_lags > 0:
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_user_second').over(w).alias('ar1_number_user_second'))
        df_time_serie = df_time_serie.select(
            '*',
            lag('number_ads_second').over(w).alias('ar1_number_ads_second'))
        if self.ar_lags > 1:
            for l in range(2, self.ar_lags + 1, 1):
                df_time_serie = df_time_serie.select(
                    '*',
                    lag('ar' + str(l - 1) + '_number_user_second').over(w).alias(
                        'ar' + str(l) + '_number_user_second'))
                df_time_serie = df_time_serie.select(
                    '*',
                    lag('ar' + str(l - 1) + '_number_ads_second').over(w).alias(
                        'ar' + str(l) + '_number_ads_second'))

    # Create Moving Average
    if self.ma_ss_lag is not None:
        # Get hour from index
        func_index = udf(lambda x: auxiliar_func.num_to_time(x), StringType())
        df_time_serie = df_time_serie.withColumn(
            'time', func_index(df_time_serie['value']))

        # Second-level MA terms (average per second over the last xx seconds)
        for lag_val in self.ma_ss_lag:
            # range to take into account
            w = (Window.orderBy(df_time_serie['value']).rangeBetween(-lag_val, 0))
            # MA variables
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                avg('number_user_second').over(w))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                avg('number_ads_second').over(w))

            # Increasing ID
            df_time_serie = df_time_serie.withColumn(
                'rn', monotonically_increasing_id())

            # Replace first values by Null
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) +
                                  '_number_user_second']))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) +
                                  '_number_ads_second']))

            # Get the average by Minute
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_user_second',
                df_time_serie['ma_seconds_' + str(lag_val) +
                              '_number_user_second'] * 60)
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_ads_second',
                df_time_serie['ma_seconds_' + str(lag_val) +
                              '_number_ads_second'] * 60)
            df_time_serie = df_time_serie.drop(*['rn'])

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.drop(
            *['time', 'number_user_second', 'number_ads_second']).dropna()

    # Join and remove lag Null values of the first second
    df = df.join(
        df_time_serie.orderBy(['value']),
        how='left',
        on=df.hms_index == df_time_serie.value).drop('value').dropna()

    if self.train_file and not self.variable_analysis:
        df = df.select([
            'key_id', 'hms_index', 'number_ads_user', 'number_user_second',
            'number_ads_second', 'number_ads_user_second', 'peak6am8am',
            'peak14pm16pm', 'user_id_acumulative'
        ] + [x for x in df.columns if x.startswith('d_browser')] +
                       [x for x in df.columns if x.startswith('ar')] +
                       [x for x in df.columns if x.startswith('ma_')] +
                       ['WinningBid'])

    if not self.train_file:
        df = df.filter(df['test'] == 1)
        df = df.select([
            'UserID', 'key_id', 'number_ads_user', 'hms_index',
            'number_user_second', 'number_ads_second', 'number_ads_user_second',
            'peak6am8am', 'peak14pm16pm', 'user_id_acumulative'
        ] + [x for x in df.columns if x.startswith('d_browser')] +
                       [x for x in df.columns if x.startswith('ar')] +
                       [x for x in df.columns if x.startswith('ma_')])

    df = df.orderBy(['hms_index', 'UserID'])
    df.show()
    return df
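# The helpers auxiliar_func.time_to_num / num_to_time are called above but not shown
# in this excerpt. A hypothetical sketch of what they could look like (names, signatures
# and behaviour are assumptions, not the project's actual implementation):
def time_to_num(time_str, index='hms'):
    # 'HH:MM:SS' -> integer index (seconds since midnight for 'hms', minutes for 'hm')
    h, m, s = (int(part) for part in time_str.split(':'))
    if index == 'hms':
        return h * 3600 + m * 60 + s
    if index == 'hm':
        return h * 60 + m
    raise ValueError('unknown index: ' + index)

def num_to_time(num):
    # integer seconds since midnight -> 'HH:MM:SS'
    h, rest = divmod(int(num), 3600)
    m, s = divmod(rest, 60)
    return '{:02d}:{:02d}:{:02d}'.format(h, m, s)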
# line_decay_df.show()
# print(line_decay_df.schema)

# all_lines_by_creator = line_decay_df.groupBy(col_("creator"), \
#                                              trunc_(col_("created"), 'mon').alias("cohort"))\
#                                     .count()\
#                                     .withColumnRenamed("count", "lines_created")

total_active_lines = line_decay_df.filter(col_("removed").isNull()).count()

all_lines_by_creator = line_decay_df.groupBy(col_("creator"))\
                                    .count()\
                                    .withColumnRenamed("count", "lines_created")

author_agg = line_decay_df.groupBy(col_("creator"))\
                          .agg(min_(col_("created")).alias("author_first"),
                               max_(col_("created")).alias("author_last"),
                               avg_(col_("lifespan")).alias("avg_lifespan"))

# author_last = line_decay_df.groupBy(col_("creator"))\
#                            .agg(max_(col_("created")))\
#                            .withColumnRenamed("max(created)", "author_last")

removed_lines_by_creator = line_decay_df.filter(col_("removed").isNotNull())\
                                        .groupBy(col_("creator"))\
                                        .count()\
                                        .withColumnRenamed("count", "lines_removed")

active_lines_by_creator = line_decay_df.filter(col_("removed").isNull())\
                                       .groupBy(col_("creator"))\
                                       .count()\
                                       .withColumnRenamed("count", "lines_active")
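# A possible follow-up, not present in the original: join the per-author frames into a
# single summary per creator (the name 'author_summary' and the fillna defaults are assumptions).
author_summary = all_lines_by_creator.join(author_agg, "creator", "left") \
                                     .join(removed_lines_by_creator, "creator", "left") \
                                     .join(active_lines_by_creator, "creator", "left") \
                                     .fillna(0, subset=["lines_removed", "lines_active"])
author_summary.show()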
""" from datetime import datetime, timedelta from pyspark.sql.functions import min as min_ from delta.tables import * date = datetime.strptime(getArgument("exec_date"), '%Y-%m-%d') # Loading already aggregated table minimum_date = DeltaTable.forPath(spark, 's3://prod-delta/processed/minimum_date') # Loading filtered origin table new_ids = spark.read.format('delta') \ .load(f's3://prod-historical/processed/historical/year={date.year}/month={date.month}/day={date.day}') \ .selectExpr('userid AS personid_m' , 'properties_product_guid AS deviceid_m' , 'time_stamp') new_ids = new_ids.groupBy('personid_m', 'deviceid_m') \ .agg(min_('time_stamp')) \ .withColumnRenamed('min(time_stamp)', 'createdon_madrid') new_ids = new_ids.where( 'personid_m is not null AND personid_m != "" AND deviceid_m is not null') minimum_date.alias("minimum_date").merge( new_ids.alias("new_ids"), "minimum_date.personid_m = new_ids.personid_m AND minimum_date.deviceid_m = new_ids.deviceid_m") \ .whenNotMatchedInsertAll() \ .execute()
df_monthly_ts = df.withColumn("yearmonth",
                              f.concat(f.year("editTime"), f.lit('-'),
                                       format_string("%02d", f.month("editTime"))))\
                  .withColumn("yearmonth", col("yearmonth").cast("timestamp"))
df_monthly_ts = df_monthly_ts.groupBy("yearmonth", "title").count().orderBy(desc("count"))

df = df.withColumn(
    "yearmonth",
    f.concat(f.year("editTime"), f.lit('-'),
             format_string("%02d", f.month("editTime"))))
df_monthly = df.groupBy("yearmonth", "title").count().orderBy(desc("count"))

print("Number of edits per month over all articles: ")
df_monthly.select("title", "yearmonth", "count").show()

min_date, max_date = df_monthly_ts.select(
    min_("yearmonth").cast("long"),
    max_("yearmonth").cast("long")).first()
data = [(min_date, max_date)]
df_dates = spark.createDataFrame(data, ["minDate", "maxDate"])
df_min_max_date = df_dates.withColumn("minDate", col("minDate").cast("timestamp"))\
                          .withColumn("maxDate", col("maxDate").cast("timestamp"))

df_formatted_ts = df_min_max_date.withColumn("monthsDiff", f.months_between("maxDate", "minDate"))\
    .withColumn("repeat", f.expr("split(repeat(',', monthsDiff), ',')"))\
    .select("*", f.posexplode("repeat").alias("date", "val"))\
    .withColumn("date", f.expr("add_months(minDate, date)"))\
    .withColumn("yearmonth", f.concat(f.year("date"), f.lit('-'),
                                      format_string("%02d", f.month("date"))))
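# A possible next step, assuming the generated month range is meant to fill gaps in the
# per-month counts (the join below and the name df_all_months are assumptions):
df_all_months = df_formatted_ts.select("yearmonth") \
    .join(df_monthly, on="yearmonth", how="left") \
    .fillna(0, subset=["count"])
df_all_months.orderBy("yearmonth").show()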
start_time = time.time()
df_monthly = df.groupBy("yearmonth", "title").count().orderBy(desc("count"))
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mgroupby2')

print("Number of edits per month over all articles: ")
start_time = time.time()
df_monthly.select("title", "yearmonth", "count").show()
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mselectshow')

start_time = time.time()
min_date, max_date = df_monthly_ts.select(min_("yearmonth").cast("long"),
                                          max_("yearmonth").cast("long")).first()
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mselect2')

data = [(min_date, max_date)]
start_time = time.time()
df_dates = spark.createDataFrame(data, ["minDate", "maxDate"])
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mcreateframe')

start_time = time.time()
df_min_max_date = df_dates.withColumn("minDate", col("minDate").cast("timestamp"))\
                          .withColumn("maxDate", col("maxDate").cast("timestamp"))
end_time = time.time()
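# stf() is the timing logger used above but not defined in this excerpt. A hypothetical
# sketch of what it might do (the CSV file name and column order are assumptions):
import csv

def stf(worker_count, file_count, duration, label):
    # Append one row per measured step so runs can be compared later.
    with open('spark_timings.csv', 'a', newline='') as fh:
        csv.writer(fh).writerow([worker_count, file_count, label, duration])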