Example #1
## Session time = for every IP and session interval, the difference between the maximum and minimum request Time

from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_date, max as max_, min as min_

## find the maximum request Time for every IP in every interval
dfmaxtime = df_time.withColumn("timestamp",
                               col("timestamp").cast("timestamp")).groupBy(
                                   "client_port", "Date",
                                   "Interval").agg(max_("Time"))

## find the minimum request Time for every IP in every interval
dfmintime = df_time.withColumn("timestamp",
                               col("timestamp").cast("timestamp")).groupBy(
                                   "client_port", "Date",
                                   "Interval").agg(min_("Time"))

## join the two dataframes on IP, Date and Interval so the maximum and minimum Times sit side by side
dftime = dfmaxtime.join(dfmintime, ['client_port', 'Date', 'Interval'])

## changing column names
dftime = dftime.withColumnRenamed("max(Time)", "Time1") \
               .withColumnRenamed("min(Time)", "Time2")

## Concatenating Date and Time so that it can be parsed as a timestamp
dftime = dftime.withColumn('TimeCon1',
                           F.concat(F.col('Date'), F.lit('T'), F.col('Time1')))
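
## Computing the session time itself (a sketch; TimeCon2 and SessionTime are illustrative
## names): build the matching timestamp string for Time2, cast both to timestamps and
## subtract them as epoch seconds.
dftime = dftime.withColumn('TimeCon2',
                           F.concat(F.col('Date'), F.lit('T'), F.col('Time2')))
dftime = dftime.withColumn(
    'SessionTime',
    F.col('TimeCon1').cast('timestamp').cast('long') -
    F.col('TimeCon2').cast('timestamp').cast('long'))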
 .withColumn("new_date",date_format_udf(col("date"))) \
 .drop("date") \
 .withColumnRenamed("new_date","date")

# converting to date type ("MM" is months in Spark date patterns; lowercase "mm" would mean minutes)
strains_US = strains_US \
    .withColumn("new_date", to_date("date", "yyyy-MM-dd")) \
    .drop("date") \
    .withColumnRenamed("new_date", "date")

# COMMAND ----------

import datetime
from pyspark.sql.functions import col, max as max_, min as min_
strains_US.agg(max_("date")).show()
strains_US.agg(min_("date")).show()

# All US strains in this data were collected between 2020-01-01 and 2020-01-31
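
# The same check can be done in one pass instead of two separate aggregations (a sketch over
# the same strains_US DataFrame; the aliases are illustrative):
strains_US.select(min_("date").alias("first_collected"),
                  max_("date").alias("last_collected")).show()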

# COMMAND ----------

#1) UDF to categorize strains based on date collected


def categorize(x):
    if x <= datetime.datetime.strptime("2020-01-10", "%Y-%m-%d").date():
        return 1
    elif x <= datetime.datetime.strptime("2020-01-20", "%Y-%m-%d").date():
        return 2
    else:
        return 3
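
# To use categorize on the DataFrame it has to be wrapped as a Spark UDF. A minimal sketch
# (assuming strains_US["date"] is already DateType; "period" is just an illustrative name):
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

categorize_udf = udf(categorize, IntegerType())
strains_US = strains_US.withColumn("period", categorize_udf(col("date")))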
Example #3
    def _transform(self, df, auxiliar_train):

        if not self.train_file:
            auxiliar_train = auxiliar_train.drop('WinningBid')
            auxiliar_train = auxiliar_train.withColumn('test', lit(0))
            df = df.withColumn('test', lit(1))
            df = auxiliar_train.union(df)
            del auxiliar_train

        # We extract the time-of-day component to use as an index
        split_col = split(df['ApproximateDate'], ' ')
        df = df.withColumn('time', split_col.getItem(1))  # time

        # Hour Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'),
                         IntegerType())
        df = df.withColumn('hms_index', func_index(df['time']))

        # We order by UserID and time index
        df = df.orderBy(['UserID', 'hms_index'])

        # We check for NaN values per column
        df.select([count_(when(isnan(c), c)).alias(c)
                   for c in df.columns]).show()

        # We create a running count of how many ads each user has seen so far
        w = (Window().partitionBy(df.UserID).orderBy('time').rowsBetween(
            Window.unboundedPreceding, 0))
        df = df.withColumn('user_id_acumulative', count_(df['UserID']).over(w))

        # Number of Ads/User/Second
        df = df.withColumn('key_id',
                           concat(df['UserID'], lit(' '), df['hms_index']))
        w = (Window().partitionBy(df.key_id).orderBy('hms_index').rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

        # Number of Ads/User
        df_group = df.groupby('key_id').agg(count_('key_id').alias('count_ads'))
        split_col = split(df_group['key_id'], ' ')
        df_group = df_group.withColumn('UserID', split_col.getItem(0))  # UserID
        w = (Window().partitionBy(
            df_group.UserID).orderBy('key_id').rowsBetween(
                Window.unboundedPreceding, 0))
        df_group = df_group.withColumn('number_ads_user',
                                       sum_(df_group.count_ads).over(w))
        df_group = df_group.select(['key_id', 'number_ads_user'])
        df = df.join(df_group, how='left', on='key_id')
        del df_group

        # Number of Users/Second
        w = (Window().partitionBy(df.ApproximateDate).rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_user_second',
                           approx_count_distinct(df.UserID).over(w))

        # Number of Ads/Second
        df = df.withColumn('number_ads_second',
                           count_(df.ApproximateDate).over(w))

        # Browser Dummy Transformation
        types = df.select('Browser').distinct().collect()
        types = [val['Browser'] for val in types]
        new_cols = [
            when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty)
            for ty in types
        ]
        df = df.select(df.columns + new_cols)

        # Decompose Date Variables
        df = df.withColumn('date', to_date(df['ApproximateDate']))  # date
        df = df.withColumn('month', month(df['ApproximateDate']))  # month
        df = df.withColumn('day', dayofmonth(df['ApproximateDate']))  # day
        df = df.withColumn('weekday', dayofweek(
            df['ApproximateDate']))  # weekday 1=Sunday ... 7=Saturday

        df = df.withColumn('hour', hour(df['time']))  # hour
        df = df.withColumn('minute', minute(df['time']))  # minute

        # Peak Hour
        df = df.withColumn('peak6am8am',
                           when(df['hour'].between(6, 8), 1).otherwise(0))
        df = df.withColumn('peak14pm16pm',
                           when(df['hour'].between(14, 16), 1).otherwise(0))

        # Minute Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'),
                         IntegerType())
        df = df.withColumn('hm_index', func_index(df['time']))

        # Convert to time-series by Minute
        # We reduce to minutes
        df_time_serie_ads = df.select([
            'hms_index', 'hm_index', 'number_user_second', 'number_ads_second'
        ]).drop_duplicates()
        df_time_serie_user = df.select(['UserID',
                                        'hm_index']).drop_duplicates()

        # Group-by the values
        df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(
            approx_count_distinct('UserID'))
        df_time_serie_ads = df_time_serie_ads.groupBy('hm_index').agg({
            'number_ads_second':
            'sum'
        }).drop_duplicates(subset=['hm_index'])

        # Join ads-users per minute
        df_time_serie = df_time_serie_ads.join(df_time_serie_user,
                                               how='left',
                                               on='hm_index')
        del df_time_serie_ads, df_time_serie_user

        # Rename columns
        df_time_serie = df_time_serie.withColumnRenamed(
            'sum(number_ads_second)', 'number_ads_minute').withColumnRenamed(
                'approx_count_distinct(UserID)', 'number_user_minute')

        # Resample Range of Minutes
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hm_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hm_index'))).limit(1).collect()[0][0] + 1, 1))

        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hm_index).drop(
                *['hm_index']).fillna(0)

        # Create Lags By Minutes
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_min_lag > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_minute').over(w).alias(
                    'ar1_number_user_minute'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_minute').over(w).alias(
                    'ar1_number_ads_minute'))

            if self.ar_min_lag > 1:
                for l in range(2, self.ar_min_lag + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_minute').over(
                            w).alias('ar' + str(l) + '_number_user_minute'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_minute').over(
                            w).alias('ar' + str(l) + '_number_ads_minute'))

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.dropna()

        # join and remove lag Null values of the first minute
        df = df.orderBy(['UserID', 'hms_index'])
        df = df.join(df_time_serie.orderBy(['hm_index']),
                     how='left',
                     on=df.hm_index == df_time_serie.value).drop('value')

        # Convert to time-series and resample by Seconds
        df_time_serie = df.select(
            ['hms_index', 'number_user_second',
             'number_ads_second']).drop_duplicates()
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hms_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hms_index'))).limit(1).collect()[0][0] + 1, 1))
        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hms_index).drop(
                *['hms_index']).fillna(0)

        # Create lags
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_lags > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_second').over(w).alias(
                    'ar1_number_user_second'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_second').over(w).alias(
                    'ar1_number_ads_second'))

            if self.ar_lags > 1:
                for l in range(2, self.ar_lags + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_second').over(
                            w).alias('ar' + str(l) + '_number_user_second'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_second').over(
                            w).alias('ar' + str(l) + '_number_ads_second'))

        # Create Moving Average
        if self.ma_ss_lag is not None:

            # Get hour from index
            func_index = udf(lambda x: auxiliar_func.num_to_time(x),
                             StringType())
            df_time_serie = df_time_serie.withColumn(
                'time', func_index(df_time_serie['value']))

            # minute MA terms (Average per second last xx seconds)
            if self.ma_ss_lag is not None:
                for lag_val in self.ma_ss_lag:
                    # range to take into account
                    w = (Window.orderBy(df_time_serie['value']).rangeBetween(
                        -lag_val, 0))
                    # MA variables
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        avg('number_user_second').over(w))
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        avg('number_ads_second').over(w))

                    # Increasing ID
                    df_time_serie = df_time_serie.withColumn(
                        'rn', monotonically_increasing_id())

                    # Replace first values by Null
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_user_second']))

                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_ads_second']))

                    # Get the average by Minute
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_user_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_user_second'] * 60)
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_ads_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_ads_second'] * 60)
                df_time_serie = df_time_serie.drop(*['rn'])

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.drop(
            *['time', 'number_user_second', 'number_ads_second']).dropna()
        # join and remove lag Null values of the first minute
        df = df.join(
            df_time_serie.orderBy(['value']),
            how='left',
            on=df.hms_index == df_time_serie.value).drop('value').dropna()

        if self.train_file and not self.variable_analysis:
            df = df.select([
                'key_id', 'hms_index', 'number_ads_user', 'number_user_second',
                'number_ads_second', 'number_ads_user_second', 'peak6am8am',
                'peak14pm16pm', 'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')] +
                           ['WinningBid'])

        if not self.train_file:
            df = df.filter(df['test'] == 1)
            df = df.select([
                'UserID', 'key_id', 'number_ads_user', 'hms_index',
                'number_user_second', 'number_ads_second',
                'number_ads_user_second', 'peak6am8am', 'peak14pm16pm',
                'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')])

        df = df.orderBy(['hms_index', 'UserID'])
        df.show()
        return df
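
# The autoregressive features above all follow the same pattern: an ordered window plus lag().
# A stripped-down sketch of that pattern on a toy DataFrame (all names here are illustrative):
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, lag

spark = SparkSession.builder.getOrCreate()
ts = spark.createDataFrame([(1, 10), (2, 12), (3, 9)],
                           ['value', 'number_user_second'])
w = Window.orderBy(col('value'))
ts = ts.withColumn('ar1_number_user_second', lag('number_user_second').over(w))
ts.show()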
Example #4
#line_decay_df.show()
#print line_decay_df.schema

#all_lines_by_creator = line_decay_df.groupBy(col_("creator"), \
#                                             trunc_(col_("created"), 'mon').alias("cohort"))\
#    .count()\
#    .withColumnRenamed("count", "lines_created")

total_active_lines = line_decay_df.filter(col_("removed").isNull()).count()

all_lines_by_creator = line_decay_df.groupBy(col_("creator"))\
    .count()\
    .withColumnRenamed("count", "lines_created")

author_agg = line_decay_df.groupBy(col_("creator"))\
    .agg(min_(col_("created")).alias("author_first"), \
         max_(col_("created")).alias("author_last"),
         avg_(col_("lifespan")).alias("avg_lifespan"))
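
# The lines_created count and the author_agg statistics above could also be collected in a
# single pass (a sketch, reusing the same line_decay_df columns and function aliases):
from pyspark.sql.functions import count as count_

author_stats = line_decay_df.groupBy(col_("creator")) \
    .agg(count_("*").alias("lines_created"),
         min_(col_("created")).alias("author_first"),
         max_(col_("created")).alias("author_last"),
         avg_(col_("lifespan")).alias("avg_lifespan"))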

#author_last = line_decay_df.groupBy(col_("creator"))\
#    .agg(max_(col_("created")))\
#    .withColumnRenamed("max(created)", "author_last")

removed_lines_by_creator = line_decay_df.filter(col_("removed").isNotNull())\
    .groupBy(col_("creator"))\
    .count()\
    .withColumnRenamed("count", "lines_removed")

active_lines_by_creator = line_decay_df.filter(col_("removed").isNull())\
    .groupBy(col_("creator"))\
    .count()
Example #5
"""
from datetime import datetime, timedelta
from pyspark.sql.functions import min as min_
from delta.tables import *

date = datetime.strptime(getArgument("exec_date"), '%Y-%m-%d')

# Loading already aggregated table
minimum_date = DeltaTable.forPath(spark,
                                  's3://prod-delta/processed/minimum_date')

# Loading filtered origin table
new_ids = spark.read.format('delta') \
    .load(f's3://prod-historical/processed/historical/year={date.year}/month={date.month}/day={date.day}') \
    .selectExpr('userid AS personid_m'
                , 'properties_product_guid AS deviceid_m'
                , 'time_stamp')

new_ids = new_ids.groupBy('personid_m', 'deviceid_m') \
    .agg(min_('time_stamp')) \
    .withColumnRenamed('min(time_stamp)', 'createdon_madrid')

new_ids = new_ids.where(
    'personid_m is not null AND personid_m != "" AND deviceid_m is not null')

minimum_date.alias("minimum_date").merge(
    new_ids.alias("new_ids"),
    "minimum_date.personid_m = new_ids.personid_m AND minimum_date.deviceid_m = new_ids.deviceid_m") \
    .whenNotMatchedInsertAll() \
    .execute()
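
# After the merge, the target table can be read back to confirm that the new pairs are present
# (a sketch; the path and column names are the ones used above):
merged = DeltaTable.forPath(spark, 's3://prod-delta/processed/minimum_date').toDF()
merged.join(new_ids, on=['personid_m', 'deviceid_m'], how='inner').count()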
Example #6
df_monthly_ts = df.withColumn("yearmonth", f.concat(f.year("editTime"), f.lit('-'), format_string("%02d", f.month("editTime"))))\
    .withColumn("yearmonth", col("yearmonth").cast("timestamp"))
df_monthly_ts = df_monthly_ts.groupBy("yearmonth",
                                      "title").count().orderBy(desc("count"))

df = df.withColumn(
    "yearmonth",
    f.concat(f.year("editTime"), f.lit('-'),
             format_string("%02d", f.month("editTime"))))
df_monthly = df.groupBy("yearmonth", "title").count().orderBy(desc("count"))
print("Number of edits per month over all articles: ")
df_monthly.select("title", "yearmonth", "count").show()

min_date, max_date = df_monthly_ts.select(
    min_("yearmonth").cast("long"),
    max_("yearmonth").cast("long")).first()

data = [(min_date, max_date)]
df_dates = spark.createDataFrame(data, ["minDate", "maxDate"])
df_min_max_date = df_dates.withColumn(
    "minDate",
    col("minDate").cast("timestamp")).withColumn(
        "maxDate",
        col("maxDate").cast("timestamp"))

df_formatted_ts = df_min_max_date.withColumn("monthsDiff", f.months_between("maxDate", "minDate"))\
    .withColumn("repeat", f.expr("split(repeat(',', monthsDiff), ',')"))\
    .select("*", f.posexplode("repeat").alias("date", "val"))\
    .withColumn("date", f.expr("add_months(minDate, date)"))\
    .withColumn("yearmonth", f.concat(f.year("date"), f.lit('-'), format_string("%02d", f.month("date"))))\
Example #7
start_time = time.time()
df_monthly = df.groupBy("yearmonth", "title").count().orderBy(desc("count"))
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mgroupby2')
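
# The repeated start/stop/report blocks in this snippet could be folded into a small helper,
# e.g. a context manager (a sketch; stf, worker_count and file_count are the ones used here):
from contextlib import contextmanager

@contextmanager
def timed(label):
    start = time.time()
    yield
    stf(worker_count, file_count, time.time() - start, label)

# usage:
# with timed('mselectshow'):
#     df_monthly.select("title", "yearmonth", "count").show()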

print("Number of edits per month over all articles: ")

start_time = time.time()
df_monthly.select("title", "yearmonth", "count").show()
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mselectshow')

start_time = time.time()
min_date, max_date = df_monthly_ts.select(min_("yearmonth").cast("long"), max_("yearmonth").cast("long")).first()
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mselect2')

data = [(min_date, max_date)]

start_time = time.time()
df_dates = spark.createDataFrame(data, ["minDate", "maxDate"])
end_time = time.time()
duration = end_time - start_time
stf(worker_count, file_count, duration, 'mcreateframe')

start_time = time.time()
df_min_max_date = df_dates.withColumn("minDate", col("minDate").cast("timestamp")).withColumn("maxDate", col("maxDate").cast("timestamp"))
end_time = time.time()