def task_a_3_step_3_final(spark):
    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "popular-topics-by-country_step-2").parse_json(a3_struct_common) \
        .withWatermark("timetamp_start", "1 minute").groupBy(
        "timetamp_start",
        "timetamp_end"
    ).agg(
        F.collect_list(
            F.create_map(
                [
                    "country_name",
                    F.create_map(
                        [
                            "topic_name_exp",
                            "topic_sum"
                        ]
                    )
                ]
            )
        ).alias("statistics")
    ).select(
        F.struct(
            F.concat(F.hour('timetamp_start'), lit(":"), F.minute('timetamp_start')).alias("time_start"),
            F.concat(F.hour('timetamp_end'), lit(":"), F.minute('timetamp_end')).alias("time_end"),
            col('statistics')
        ).alias("res")
    ).send_to_kafka(config.BOOTSTRAP_SERVERS, "popular-topics-by-country", config.LOG_PREFIX)

    return result
Example #2
def task_a_2_step_1_final(spark):
    a2_struct = T.StructType([
        T.StructField("datetime_start", T.TimestampType()),
        T.StructField("datetime_end", T.TimestampType()),
        T.StructField("map_topics", T.MapType(
            T.StringType(),
            T.ArrayType(T.StringType())
        ))
    ])

    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "topics-by-state_step-0").parse_json(a2_struct) \
        .withWatermark("datetime_end", "1 minute").groupBy(
        F.window("datetime_end", "3 hour", "1 hour")
    ) \
        .agg(
        F.first("window.start").alias("timestamp_start"),
        F.first("window.end").alias("timestamp_end"),
        F.collect_list("map_topics").alias("statistics")
    ) \
        .select(
        F.struct(
            F.concat(F.hour('timestamp_start'), lit(":"), F.minute('timestamp_start')).alias("time_start"),
            F.concat(F.hour('timestamp_end'), lit(":"), F.minute('timestamp_end')).alias("time_end"),
            concat_maps_udf(col('statistics')).alias("statistics")
        ).alias("res")
    ).send_to_kafka(config.BOOTSTRAP_SERVERS, "topics-by-state", config.LOG_PREFIX)

    return result
Example #3
def __appendAggKey(tsdf, freq = None):
    """
    :param tsdf: TSDF object as input
    :param freq: frequency at which to upsample
    :return: return a TSDF with a new aggregate key (called agg_key)
    """
    df = tsdf.df
    checkAllowableFreq(freq)

    # compute timestamp columns
    sec_col = f.second(f.col(tsdf.ts_col))
    min_col = f.minute(f.col(tsdf.ts_col))
    hour_col = f.hour(f.col(tsdf.ts_col))

    if (freq == SEC):
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(" "), f.lpad(hour_col, 2, '0'), f.lit(':'), f.lpad(min_col, 2, '0'), f.lit(':'), f.lpad(sec_col, 2, '0')).cast("timestamp")
    elif (freq == MIN):
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(' '), f.lpad(hour_col, 2, '0'), f.lit(':'), f.lpad(min_col, 2, '0'), f.lit(':'), f.lit('00')).cast("timestamp")
    elif (freq == HR):
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(' '), f.lpad(hour_col, 2, '0'), f.lit(':'), f.lit('00'), f.lit(':'), f.lit('00')).cast("timestamp")
    elif (freq == DAY):
        agg_key = f.col(tsdf.ts_col).cast("date").cast("timestamp")

    df = df.withColumn("agg_key", agg_key)
    return tempo.TSDF(df, tsdf.ts_col, partition_cols = tsdf.partitionCols)
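As an aside (not part of the tempo helper above), the same per-frequency truncation can be expressed with F.date_trunc, which zeroes out everything below the given unit; a minimal sketch with assumed names:

import pyspark.sql.functions as f

def append_agg_key_trunc(df, ts_col, unit):
    # unit is one of "second", "minute", "hour", "day"; date_trunc truncates the
    # timestamp to that unit, matching the string-concat logic above
    return df.withColumn("agg_key", f.date_trunc(unit, f.col(ts_col)))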
Example #4
def createDimDate(df):
    '''
    Creates date dimensional table from DateTime field in upstream dataframe
    :param df:
    :return: Date Dimensional Dataframe
    '''
    df = df.withColumn("rawKey", F.col('rawKey'))
    df = df.withColumn("year", F.year(F.col('DateTime')))
    df = df.withColumn("month", F.month(F.col('DateTime')))
    df = df.withColumn("dayofmonth", F.dayofmonth(F.col('DateTime')))
    df = df.withColumn("dayofweek", F.dayofweek(F.col('DateTime')))
    df = df.withColumn("dayofyear", F.dayofyear(F.col('DateTime')))
    df = df.withColumn("hour", F.hour(F.col('DateTime')))
    df = df.withColumn("minute", F.minute(F.col('DateTime')))
    df = df.withColumn("dateMinute", F.date_format(F.col("DateTime"), "yyyyMMddHHmm"))
    df = df.withColumn("quarter", F.quarter(F.col('DateTime')))
    df = df.withColumn("date", F.to_date(F.col('DateTime')))
    df.createOrReplaceTempView('tempDimDateTable')
    dimDateDF = spark.sql(" SELECT * FROM \
                    (select rawKey,dateMinute,dateTime, date,year, month,dayofmonth,dayofweek,dayofyear,hour, minute,quarter \
                    from tempDimDateTable \
                    group by rawKey,dateMinute,dateTime, date,year, month,dayofmonth,dayofweek,dayofyear,hour, minute,quarter \
                    order by dateMinute ASC) \
                    ")

    # Generating dateKey field
    dimDateDF = dimDateDF.withColumn('dateKey', F.monotonically_increasing_id() + 1)
    # Creating dataframe including date field which will help to generate Fact table
    factHelperDateDF = dimDateDF.select(F.col('rawKey'), F.col('dateKey'), F.col('dateMinute'))
    # Dropping unnecessary rawKey field
    dimDateDF = dimDateDF.drop(F.col('rawKey'))
    return dimDateDF, factHelperDateDF
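For reference, the temp view and GROUP BY above only de-duplicate rows; the same step could be written with the DataFrame API directly on the df built inside the function (a sketch, same assumed column names):

dimDateDF = df.select('rawKey', 'dateMinute', 'DateTime', 'date', 'year', 'month',
                      'dayofmonth', 'dayofweek', 'dayofyear', 'hour', 'minute', 'quarter') \
    .dropDuplicates() \
    .orderBy('dateMinute')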
Example #5
    def _transform(self, df):

        time_variable = self.getColumn()
        new_time_variable = time_variable + '_new'

        # code from tawab. Convert all times in a same format.

        df = df.withColumn(
            new_time_variable,
            self.udf_date_formatting()(
                funct.col(time_variable).cast("String")))
        df = df.withColumn(
            new_time_variable,
            funct.from_unixtime(
                funct.unix_timestamp(new_time_variable,
                                     self.time_format)).cast(TimestampType()))

        df = df.withColumn(time_variable + '_year',
                           funct.year(new_time_variable))
        df = df.withColumn(time_variable + '_month',
                           funct.month(new_time_variable))
        df = df.withColumn(time_variable + '_day',
                           funct.dayofmonth(new_time_variable))
        df = df.withColumn(time_variable + '_dayofweek',
                           funct.dayofweek(new_time_variable))
        df = df.withColumn(time_variable + '_hour',
                           funct.hour(new_time_variable))
        df = df.withColumn(time_variable + '_minutes',
                           funct.minute(new_time_variable))
        df = df.withColumn(time_variable + '_seconds',
                           funct.second(new_time_variable))

        df = df.drop(new_time_variable)
        df = df.drop(time_variable)
        return df
Example #6
    def vwap(self, frequency='m', volume_col="volume", price_col="price"):
        # set pre_vwap as self or enrich with the frequency
        pre_vwap = self.df
        print('input schema: ', pre_vwap.printSchema())
        if frequency == 'm':
            pre_vwap = self.df.withColumn(
                "time_group",
                f.concat(f.lpad(f.hour(f.col(self.ts_col)), 2, '0'),
                         f.lit(':'),
                         f.lpad(f.minute(f.col(self.ts_col)), 2, '0')))
        elif frequency == 'H':
            pre_vwap = self.df.withColumn(
                "time_group",
                f.concat(f.lpad(f.hour(f.col(self.ts_col)), 2, '0')))
        elif frequency == 'D':
            pre_vwap = self.df.withColumn(
                "time_group",
                f.concat(f.lpad(f.dayofmonth(f.col(self.ts_col)), 2, '0')))

        group_cols = ['time_group']
        if self.partitionCols:
            group_cols.extend(self.partitionCols)
        vwapped = (pre_vwap.withColumn(
            "dllr_value",
            f.col(price_col) * f.col(volume_col)).groupby(group_cols).agg(
                f.sum('dllr_value').alias("dllr_value"),
                f.sum(volume_col).alias(volume_col),
                f.max(price_col).alias("_".join(["max", price_col]))).withColumn(
                    "vwap",
                    f.col("dllr_value") / f.col(volume_col)))

        return TSDF(vwapped, self.ts_col, self.partitionCols)
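For reference, the aggregation above is just vwap = sum(price * volume) / sum(volume) per minute bucket; a stand-alone sketch against a plain DataFrame (df, ts, price and volume are assumed names, not part of the class above):

import pyspark.sql.functions as f

vwap_df = (df.withColumn("time_group", f.date_format(f.col("ts"), "HH:mm"))
             .groupBy("time_group")
             .agg((f.sum(f.col("price") * f.col("volume")) / f.sum("volume")).alias("vwap")))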
Example #7
def get_time_to_purchase(timeframe, partner, premium, purchase): # todo
    """
    returns the distribution of time it takes a user to achieve a purchase
    input: a period (string), a partner (string), 2 spark dataframes (first and last one in the workflow)
    output: a dictionary with the bucket and the values associated
    """
    keys = [0, 20, 40, 60, 120, 180, 240, 300, 600]
    result = collections.OrderedDict((key, 0) for key in keys)
    timeframe_is = get_date(timeframe)
    purchase_renam = (purchase.filter(purchase.keen.timestamp >= timeframe_is)
                      .filter(purchase.search_info.partner_id == partner)
                      .withColumnRenamed('keen', 'keen_purchase')
                      .withColumnRenamed('flight', 'flight_purchase'))
    premium_renam = (premium.filter(premium.keen.timestamp >= timeframe_is)
                     .filter(premium.search_info.partner_id == partner)
                     .withColumnRenamed('keen', 'keen_premium')
                     .withColumnRenamed('flight', 'flight_premium'))
    joined_df = purchase_renam.join(premium_renam, purchase_renam.search_info.search_id == premium_renam.search_info.search_id, 'inner')
    joined_df = joined_df.withColumn("time_to_purchase", (minute(joined_df.keen_purchase.timestamp) - minute(joined_df.keen_premium.timestamp)) * 60 + second(joined_df.keen_purchase.timestamp) - second(joined_df.keen_premium.timestamp))
    times = joined_df.groupBy("time_to_purchase").sum("purchase.quantity").collect()
    for row in times:
        for i in range(len(keys) - 1):
            if row[0] > keys[i] and row[0] <= keys[i+1]:
                result[keys[i+1]] += row[1]
    result.pop(0)
    return result
Example #8
  def _transform(self, df):
    input = self.getInputCol()

    df = df.withColumn("dt_day", F.dayofmonth(input))
    df = df.withColumn("dt_hour", F.hour(input))
    df = df.withColumn("dt_minute", F.minute(input))
    df = df.withColumn("dt_second", F.second(input))

    df = df.withColumn("dt_dayofyear", F.dayofyear(input))
    df = df.withColumn("dt_dayofweek", F.dayofweek(input))
    df = df.withColumn("dt_weekofyear", F.weekofyear(input))

    return df
Example #9
def timestamp_to_date(data):
    # Generate detailed time fields (hour, minute, day of week) from the timestamp value
    data = data.withColumn("normal_type",
                           data["timestamp"].cast(TimestampType()))
    godzina = data.withColumn('godzina',
                              hour(data['normal_type']).cast(StringType()))
    minuta = godzina.withColumn(
        'minuta',
        minute(godzina['normal_type']).cast(StringType()))
    data = minuta.withColumn(
        "dzien",
        dayofweek(minuta["normal_type"]).cast(StringType()))
    return data
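A minimal usage sketch for the helper above, assuming the input carries epoch seconds in a "timestamp" column:

from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, minute, dayofweek    # imports the helper relies on
from pyspark.sql.types import TimestampType, StringType

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([(1718445600,)], ["timestamp"])   # epoch seconds cast cleanly to TimestampType
timestamp_to_date(sample).show(truncate=False)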
Example #10
def main(spark):
    df = createDataframe(spark)
    df.show(truncate=False)

    df = df.withColumn("date", F.date_format(F.col("time"), "yyyy-MM-dd HH:mm:ss.SSSS")) \
        .withColumn("h", F.hour(F.col("date"))) \
        .withColumn("m", F.minute(F.col("date"))) \
        .withColumn("s", F.second(F.col("date"))) \
        .withColumn("event", F.expr("h*3600 + m*60 +s")) \
        .drop("date","h","m","s")

    df.show(truncate=False)

    inRange = F.udf(in_range, BooleanType())

    df = df.withColumn("between", inRange(F.col("range"), F.col("event")))

    df.show(truncate=False)
Example #11
def task_2(json_parsed_df):
    result = json_parsed_df.withWatermark("timestamp", "1 minute").groupBy(
        window("timestamp", "1 minute", "1 minute")
    ).agg(
        struct(
            F.month('window.end').alias('month'),
            F.dayofmonth('window.end').alias('day_of_the_month'),
            F.hour('window.end').alias('hour'),
            F.minute('window.end').alias("minute"),
            F.collect_list('group_city').alias('cities')
        ).alias('res')
    ).select(F.to_json('res').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", ",".join(config.BOOTSTRAP_SERVERS)) \
        .option("topic", "US-cities-every-minute") \
        .option("checkpointLocation", f"{config.LOG_PREFIX}topic_2")

    return result
def process_task_2(df):
    records = df \
        .withWatermark("timestamp", "1 minute") \
        .groupBy(
        F.window("timestamp", "1 minute", "1 minute")
    ).agg(
        struct(
            F.month('window.end').alias('month'),
            F.dayofmonth('window.end').alias('day_of_the_month'),
            F.hour('window.end').alias('hour'),
            F.minute('window.end').alias("minute"),
            F.collect_list('group_city').alias('cities')
        ).alias("result")
    ).select(F.to_json("result").alias("value")).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", ",".join(SERVERS)) \
        .option("topic", "US-cities-every-minute")

    return records
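Both helpers above return a configured DataStreamWriter without starting it; a typical caller would look roughly like this (records_df and the checkpoint path are assumed):

writer = process_task_2(records_df)   # records_df: streaming DataFrame with "timestamp" and "group_city" columns
query = writer.option("checkpointLocation", "checkpoints/us_cities").start()
query.awaitTermination()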
Example #13
    def preprocess(self):
        # drop location is 0
        self.df = self.df.filter(self.df.Start_Lat  != 0.0 )\
            .filter(self.df.Start_Lon  != 0.0)\
            .filter(self.df.End_Lon  != 0.0)\
            .filter(self.df.End_Lat  != 0.0)

        self.df = self.df\
            .withColumn(self.datetime_columnname, F.unix_timestamp(F.col(self.datetime_columnname),"yyyy-MM-dd HH:mm:ss").cast(TimestampType()))\
            .withColumn("year", F.year(F.col(self.datetime_columnname)))\
            .withColumn("month", F.month(F.col(self.datetime_columnname)))\
            .withColumn("day", F.dayofmonth(F.col(self.datetime_columnname)))\
            .withColumn("hour", F.hour(F.col(self.datetime_columnname)))\
            .withColumn("hour", F.hour(F.col(self.datetime_columnname)))\
            .withColumn("minute", F.minute(F.col(self.datetime_columnname)))\
            .withColumn("Date", F.to_date(F.col(self.datetime_columnname)))\
            .withColumn("pickup_time", F.round(F.col("hour") + F.col("minute")/60))\
            .withColumn("dayOfTheWeek", F.dayofweek(F.col("Date")))\
            .withColumn("isWeekend", self.isWeekendUDF(F.col("dayOfTheWeek")))\
            .withColumn("isHoliday", self.isHolidayUDF(F.col("Date")))\
            .withColumn("isCashPaid", self.isHolidayUDF(F.col("Payment_Type")))
Example #14
 def lowest_avg_idle_user(self, df):
     '''
     Find the average idle hours for each user, then find the users who are idle less than the overall average.
     '''
     df_idle = df.drop('working_hour', 'start_time',
                       'end_time')  # keep only the idle_time column (per user)
     # Find average idle hour for each user
     df_avg = df_idle.groupBy('user_name').agg(
         sqlFun.from_unixtime(
             sqlFun.avg(sqlFun.unix_timestamp('idle_time')),
             'hh:mm:ss').alias('avg_time'))
     # Convert all into hour
     df_avg_hours = df_avg.withColumn(
         'avg_hour',
         (hour(df_avg['avg_time']) * 3600 +
          minute(df_avg['avg_time']) * 60 + second(df_avg['avg_time'])) /
         3600)
     #calculating average hours
     total_avg_idle_hour = df_avg_hours.select(
         avg('avg_hour')).collect()[0][0]
     lowest_idle_users = df_avg_hours.filter(
         df_avg_hours['avg_hour'] < total_avg_idle_hour).select('user_name')
     return lowest_idle_users
def load_data( spark ):
    rdd = spark.read.csv(
        files,
        header='false',
        timestampFormat='MM/dd/yyyy HH:mm:ss',
        schema=schema_struct,
        inferSchema='false'
    )

    #print rdd.take(1)

    station_time = (
        rdd.groupBy([
            'station',
            hour("timestamp").alias("hour"),
            minute("timestamp").alias("minute")
        ]).agg(
            mean("totalflow").alias("flow_mean"),
            stddev("totalflow").alias("flow_std"),
            count("totalflow").alias("flow_count"),
            psmax("totalflow").alias("flow_max"),
            psmin("totalflow").alias("flow_min")
        )
    )

    df = station_time.toPandas()

    #print df.station.unique().shape

    df['flow_std_plus_mean'] = df.flow_mean + df.flow_std
    df['flow_std_minus_mean'] = df.flow_mean - df.flow_std

    df['time'] = df.apply(lambda x:time(int(x.hour),int(x.minute)),axis = 1)

    df.sort_values('time',inplace=True)

    return df
Example #16
 def minute(self) -> "ps.Series":
     """
     The minutes of the datetime.
     """
     return self._data.spark.transform(lambda c: F.minute(c).cast(LongType()))
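A minimal usage sketch for the accessor above, assuming pyspark.pandas is importable as ps:

import pandas as pd
import pyspark.pandas as ps

psser = ps.from_pandas(pd.Series(pd.to_datetime(["2024-01-01 10:15:00",
                                                 "2024-01-01 10:47:30"])))
print(psser.dt.minute)   # 15 and 47, backed by F.minute(...).cast(LongType())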
Example #17
# Using previous dataframe, extracts the year as an integer from a given date/timestamp/string.
# Similar methods: month, dayofweek, minute, second
# Expected:
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+
# |               id|                  ts|  date string|    time string|  date_new|year|month|dayofweek|minute|second|
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+
# |UA000000107379500|2020-07-04 16:09:...|July 04, 2020|16:09:06.592107|04-09-2020|2020|    7|        7|     9|     6|
# |UA000000107359357|2020-07-04 15:36:...|July 04, 2020|15:36:51.756535|04-36-2020|2020|    7|        7|    36|    51|
# |UA000000107375547|2020-07-04 16:06:...|July 04, 2020|16:06:55.459100|04-06-2020|2020|    7|        7|     6|    55|
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+

# Answer
df = (df.withColumn("year", F.year(F.col("ts"))).withColumn(
    "month",
    F.month(F.col("ts"))).withColumn("dayofweek", F.dayofweek(
        F.col("ts"))).withColumn("minute", F.minute(F.col("ts"))).withColumn(
            "second", F.second(F.col("ts"))))
df.show()

# COMMAND ----------

# Converts the column into DateType with name "date" by casting rules to DateType (use function to_date).
# Then create a column plus_two_days that adds 2 days to the date. Select "date" and "plus_two_days"
# Expected:
# +----------+-------------+
# |      date|plus_two_days|
# +----------+-------------+
# |2020-07-04|   2020-07-06|
# |2020-07-04|   2020-07-06|
# |2020-07-04|   2020-07-06|
# +----------+-------------+
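The answer cell is missing from the snippet above; a minimal sketch of one way to complete it, reusing the df with the "ts" column from the previous answer:

df_dates = (df.withColumn("date", F.to_date(F.col("ts")))
              .withColumn("plus_two_days", F.date_add(F.col("date"), 2))
              .select("date", "plus_two_days"))
df_dates.show()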
Example #18

'''Now we drop year,month,day,hour,minute,date,time columns as we will again try to create these from timestamp column that we created'''
df_nycflights = df_nycflights. \
                drop('year'). \
                drop('month'). \
                drop('day'). \
                drop('hour'). \
                drop('minute'). \
                drop('date'). \
                drop('time')

df_nycflights.show() 

'''Now we extract the fields back'''
df_nycflights = df_nycflights. \
                withColumn('year',year(df_nycflights.timestamp)). \
                withColumn('month',month(df_nycflights.timestamp)). \
                withColumn('day',dayofmonth(df_nycflights.timestamp)). \
                withColumn('hour',hour(df_nycflights.timestamp)). \
                withColumn('minute',minute(df_nycflights.timestamp))  

df_nycflights.show()

'''Now few operations on timestamp '''
df_nycflights = df_nycflights.\
                withColumn('date_sub',date_sub(df_nycflights.timestamp ,10)). \
                withColumn('date_add',date_add(df_nycflights.timestamp ,10)). \
                withColumn('months_between',months_between(df_nycflights.timestamp,df_nycflights.timestamp))

df_nycflights.show()                 
Example #19
 def extract_datetime_info(self, datetime_col, info_to_extract):
     self._data_frame = self._data_frame.withColumn(
         datetime_col + '_temp', self.to_date_(datetime_col))
     timestamped = datetime_col + "_timestamped"
     # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
     uniqueVals = self._data_frame.select(
         datetime_col + '_temp').distinct().na.drop().limit(10).collect()
     try:
         date_format = self._metaHelperInstance.get_datetime_format(
             uniqueVals)
         to_date_udf = udf(
             lambda x: datetime.strptime(x, date_format)
             if x != None else x, DateType())
         self._data_frame = self._data_frame.withColumn(
             datetime_col + '_temp',
             to_date_udf(self._data_frame[datetime_col +
                                          '_temp']).alias(datetime_col +
                                                          '_temp'))
         if info_to_extract == "year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_year",
                 year(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "month_of_year":
             dict = {
                 1: "January",
                 2: "February",
                 3: "March",
                 4: "April",
                 5: "May",
                 6: "June",
                 7: "July",
                 8: "August",
                 9: "September",
                 10: "October",
                 11: "November",
                 12: "December"
             }
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_month",
                 month(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_etf_month_of_year",
                 self.month_to_string(dict)(col(datetime_col + "_month")))
         if info_to_extract == "day_of_month":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_day_of_month",
                 dayofmonth(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "day_of_year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_day_of_year",
                 dayofyear(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "day_of_week":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_etf_day_of_week",
                 dayofweek(datetime_col + '_temp'))
         if info_to_extract == "week_of_year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_week_of_year",
                 weekofyear(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "hour":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_hour",
                 hour(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "minute":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_minute",
                 minute(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "date":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_date",
                 to_timestamp(self._data_frame[datetime_col + '_temp'],
                              "dd/MM/yyyy").cast("date"))
         else:
             pass
     except TypeError:
         if info_to_extract == "year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_year",
                 year(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "month_of_year":
             dict = {
                 1: "January",
                 2: "February",
                 3: "March",
                 4: "April",
                 5: "May",
                 6: "June",
                 7: "July",
                 8: "August",
                 9: "September",
                 10: "October",
                 11: "November",
                 12: "December"
             }
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_month",
                 month(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_etf_month_of_year",
                 self.month_to_string(dict)(col(datetime_col + "_month")))
         if info_to_extract == "day_of_month":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_day_of_month",
                 dayofmonth(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "day_of_year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_day_of_year",
                 dayofyear(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "day_of_week":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_etf_day_of_week",
                 dayofweek(datetime_col + '_temp'))
         if info_to_extract == "week_of_year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_week_of_year",
                 weekofyear(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "hour":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_hour",
                 hour(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "minute":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_minute",
                 minute(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "date":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_date",
                 to_timestamp(self._data_frame[datetime_col + '_temp'],
                              "dd/MM/yyyy").cast("date"))
         else:
             pass
     self._data_frame = self._data_frame.drop(datetime_col + '_temp')
     # self._data_frame = self._data_frame.withColumn(datetime_col, to_timestamp(self._data_frame[datetime_col +'_temp'], "dd/MM/yyyy"))
     # self._data_frame = self._data_frame.withColumn(datetime_col, F.from_unixtime(F.unix_timestamp(self._data_frame[datetime_col +'_temp']), "dd/MM/yyyy"))
     return self._data_frame
Example #20
 def _transform(self, df):
     self.check_input_type(df.schema)
     return df.withColumn(self.outputCol, F.minute(df[self.inputCol]))
    def convert_timezone(item):
        from_zone = tz.gettz('UTC')
        to_zone = tz.gettz('America/New_York')
        dt = parser.parse(item['timestamp'])
        utc = dt.replace(tzinfo=from_zone)
        return utc.astimezone(to_zone)
    
    if dtype == "sql":
        return Row(id=d[0], time=convert_timezone(d[1]))
    elif dtype == "pandas":
        return convert_timezone(d[1])

n_parts  = 10
rdd      = sc.textFile(data_path).repartition(n_parts).cache() # partitionBy fails here, need to use repartition()
filtered = (rdd.map(make_json)
               .filter( lambda x: filter_tweets(x,p) )
               .map( get_relevant_fields, preservesPartitioning=True )
            )

data = filtered.map( lambda x: update_tz(x,'sql'), preservesPartitioning=True )
df = sqlContext.createDataFrame(data).cache()
counts = df.groupby(sqlfunc.minute("time")).count().collect()
minutes,cts = zip(*counts)
minutes = [m if m<59 else -1 for m in minutes] # for some reason 7:59 tweets get included in the 8-815 range

plt.bar(minutes,cts)
plt.xlabel("Minutes from 8-830pm", fontsize=16)
plt.ylabel("Tweet frequency", fontsize=16)
plt.savefig(path+'sep16-8pm-hist.png')

long_min = -123.0137
lat_max = 37.8324
long_max = -122.3549

x_res = (long_max - long_min) / 20
y_res = (lat_max - lat_min) / 20

dfTrainRaw = dfTrainRaw.drop('Date')

dfTrainRaw = dfTrainRaw.select('IncidntNum', 'Category', 'Descript', 'Dates',
                               'DayOfWeek', 'PdDistrict', 'Resolution',
                               'Address', 'X', 'Y', 'Location', 'PdId',
                               function.year("Dates").alias('Year'),
                               function.month("Dates").alias('Month'),
                               function.hour("Time").alias('Hour'),
                               function.minute("Time").alias('Minute'),
                               function.dayofmonth("Dates").alias('Day'))

dfTrainRaw = dfTrainRaw.filter(dfTrainRaw.X < -122.3549)

dfMain = dfTrainRaw
dfTrain = dfTrainRaw.filter(dfTrainRaw.Year <= 2015)
dfTest = dfTrainRaw.filter(dfTrainRaw.Year > 2015)

#Preprocessing Train and Test
print("=======TRAIN=======")
dfTrain = preprocess(dfTrain)
dfTrain.show(5)

print("=======TEST=======")
dfTest = preprocess(dfTest)
Example #23
    def _transform(self, df, auxiliar_train):

        if not self.train_file:
            auxiliar_train = auxiliar_train.drop('WinningBid')
            auxiliar_train = auxiliar_train.withColumn('test', lit(0))
            df = df.withColumn('test', lit(1))
            df = auxiliar_train.union(df)
            del auxiliar_train

        # We create the time as Index
        split_col = split(df['ApproximateDate'], ' ')
        df = df.withColumn('time', split_col.getItem(1))  # time

        # Hour Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'),
                         IntegerType())
        df = df.withColumn('hms_index', func_index(df['time']))

        # We order by UserId-Date
        df = df.orderBy(['UserID', 'hms_index'])

        # We check Null Values
        df.select([count_(when(isnan(c), c)).alias(c)
                   for c in df.columns]).show()

        # We create a rank of users by how many times in the past saw an ad
        w = (Window().partitionBy(df.UserID).orderBy('time').rowsBetween(
            Window.unboundedPreceding, 0))
        df = df.withColumn('user_id_acumulative', count_(df['UserId']).over(w))

        # Number of Ads/User/Second
        df = df.withColumn('key_id',
                           concat(df['UserID'], lit(' '), df['hms_index']))
        w = (Window().partitionBy(df.key_id).orderBy('hms_index').rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

        # Number of Ads/User
        df_group = df.groupby(['key_id'
                               ]).agg(count_('key_id').alias('count_ads'))
        split_col = split(df_group['key_id'], ' ')
        df_group = df_group.withColumn('UserID', split_col.getItem(0))  # time
        w = (Window().partitionBy(
            df_group.UserID).orderBy('key_id').rowsBetween(
                Window.unboundedPreceding, 0))
        df_group = df_group.withColumn('number_ads_user',
                                       sum_(df_group.count_ads).over(w))
        df_group = df_group.select(['key_id', 'number_ads_user'])
        df = df.join(df_group, how='left', on='key_id')
        del df_group

        # Number of Users/Second
        w = (Window().partitionBy(df.ApproximateDate).rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_user_second',
                           approx_count_distinct(df.UserID).over(w))

        # Number of Ads/Second
        df = df.withColumn('number_ads_second',
                           count_(df.ApproximateDate).over(w))

        # Browser Dummy Transformation
        types = df.select('Browser').distinct().collect()
        types = [val['Browser'] for val in types]
        new_cols = [
            when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty)
            for ty in types
        ]
        df = df.select(df.columns + new_cols)

        # Decompose Date Variables
        df = df.withColumn('date', to_date(df['ApproximateDate']))  # date
        df = df.withColumn('month', month(df['ApproximateDate']))  # month
        df = df.withColumn('day', dayofmonth(df['ApproximateDate']))  # day
        df = df.withColumn('weekday', dayofweek(
            df['ApproximateDate']))  # weekday 1=Monday

        df = df.withColumn('hour', hour(df['time']))  # hour
        df = df.withColumn('minute', minute(df['time']))  # minute

        # Peak Hour
        df = df.withColumn('peak6am8am',
                           when(df['hour'].between(6, 8), 1).otherwise(0))
        df = df.withColumn('peak14pm16pm',
                           when(df['hour'].between(14, 16), 1).otherwise(0))

        # Minute Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'),
                         IntegerType())
        df = df.withColumn('hm_index', func_index(df['time']))

        # Convert to time-series by Minute
        # We reduce to minutes
        df_time_serie_ads = df.select([
            'hms_index', 'hm_index', 'number_user_second', 'number_ads_second'
        ]).drop_duplicates()
        df_time_serie_user = df.select(['UserID',
                                        'hm_index']).drop_duplicates()

        # Group-by the values
        df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(
            approx_count_distinct('UserID'))
        df_time_serie_ads = df_time_serie_ads.groupBy('hm_index').agg({
            'number_ads_second':
            'sum'
        }).drop_duplicates(subset=['hm_index'])

        # Join ads-users per minute
        df_time_serie = df_time_serie_ads.join(df_time_serie_user,
                                               how='left',
                                               on='hm_index')
        del df_time_serie_ads, df_time_serie_user

        # Rename columns
        df_time_serie = df_time_serie.withColumnRenamed(
            'sum(number_ads_second)', 'number_ads_minute').withColumnRenamed(
                'approx_count_distinct(UserID)', 'number_user_minute')

        # Resample Range of Minutes
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hm_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hm_index'))).limit(1).collect()[0][0] + 1, 1))

        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hm_index).drop(
                *['hm_index']).fillna(0)

        # Create Lags By Minutes
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_min_lag > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_minute').over(w).alias(
                    'ar1_number_user_minute'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_minute').over(w).alias(
                    'ar1_number_ads_minute'))

            if self.ar_min_lag > 1:
                for l in range(2, self.ar_min_lag + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_minute').over(
                            w).alias('ar' + str(l) + '_number_user_minute'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_minute').over(
                            w).alias('ar' + str(l) + '_number_ads_minute'))

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.dropna()

        # join and remove lag Null values of the first minute
        df = df.orderBy(['UserID', 'hms_index'])
        df = df.join(df_time_serie.orderBy(['hm_index']),
                     how='left',
                     on=df.hm_index == df_time_serie.value).drop('value')

        # Convert to time-series and resample by Seconds
        df_time_serie = df.select(
            ['hms_index', 'number_user_second',
             'number_ads_second']).drop_duplicates()
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hms_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hms_index'))).limit(1).collect()[0][0] + 1, 1))
        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hms_index).drop(
                *['hms_index']).fillna(0)

        # Create lags
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_lags > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_second').over(w).alias(
                    'ar1_number_user_second'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_second').over(w).alias(
                    'ar1_number_ads_second'))

            if self.ar_lags > 1:
                for l in range(2, self.ar_lags + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_second').over(
                            w).alias('ar' + str(l) + '_number_user_second'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_second').over(
                            w).alias('ar' + str(l) + '_number_ads_second'))

        # Create Moving Average
        if self.ma_ss_lag is not None:

            # Get hour from index
            func_index = udf(lambda x: auxiliar_func.num_to_time(x),
                             StringType())
            df_time_serie = df_time_serie.withColumn(
                'time', func_index(df_time_serie['value']))

            # minute MA terms (Average per second last xx seconds)
            if self.ma_ss_lag is not None:
                for lag_val in self.ma_ss_lag:
                    # range to take into account
                    w = (Window.orderBy(df_time_serie['value']).rangeBetween(
                        -lag_val, 0))
                    # MA variables
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        avg('number_user_second').over(w))
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        avg('number_ads_second').over(w))

                    # Increasing ID
                    df_time_serie = df_time_serie.withColumn(
                        'rn', monotonically_increasing_id())

                    # Replace first values by Null
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_user_second']))

                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_ads_second']))

                    # Get the average by Minute
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_user_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_user_second'] * 60)
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_ads_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_ads_second'] * 60)
                df_time_serie = df_time_serie.drop(*['rn'])

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.drop(
            *['time', 'number_user_second', 'number_ads_second']).dropna()
        # join and remove lag Null values of the first minute
        df = df.join(
            df_time_serie.orderBy(['value']),
            how='left',
            on=df.hms_index == df_time_serie.value).drop('value').dropna()

        if self.train_file and not self.variable_analysis:
            df = df.select([
                'key_id', 'hms_index', 'number_ads_user', 'number_user_second',
                'number_ads_second', 'number_ads_user_second', 'peak6am8am',
                'peak14pm16pm', 'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')] +
                           ['WinningBid'])

        if not self.train_file:
            df = df.filter(df['test'] == 1)
            df = df.select([
                'UserID', 'key_id', 'number_ads_user', 'hms_index',
                'number_user_second', 'number_ads_second',
                'number_ads_user_second', 'peak6am8am', 'peak14pm16pm',
                'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')])

        df = df.orderBy(['hms_index', 'UserID'])
        df.show()
        return df
Example #24
# Cast the string column "time" to a timestamp

timeFormatUDF = F.udf(lambda ts: timeFormat(ts))

dataset = dataset.withColumn(
    "time",
    timeFormatUDF(F.col("time")).cast(TimestampType()))

# Split the "time" column into its date/time parts

dataset = dataset.withColumn("year", F.year(F.col("time")))
dataset = dataset.withColumn("month", F.month(F.col("time")))
dataset = dataset.withColumn("day", F.dayofmonth(F.col("time")))
dataset = dataset.withColumn("hour", F.hour(F.col("time")))
dataset = dataset.withColumn("minute", F.minute(F.col("time")))
dataset = dataset.withColumn("second", F.second(F.col("time")))

# Split MCC, MNC and MSIN out of the IMSI column

dataset = dataset.withColumn('mcc', dataset.imsi.substr(1, 3))
dataset = dataset.withColumn('mnc', dataset.imsi.substr(4, 2))
dataset = dataset.withColumn('msin', dataset.imsi.substr(6, 10))

# Split TAC, SNR and CD out of the IMEI column
# IMEI format: TAC -- Serial_Number (14 digits)

dataset = dataset.withColumn('tac', dataset.imei.substr(1, 8))
dataset = dataset.withColumn('snr', dataset.imei.substr(9, 6))

# Scale the "year" column with MinMaxScaler to the range [0,1]
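The snippet ends before that scaling step; a minimal sketch of what the comment describes, using VectorAssembler plus MinMaxScaler (the *_vec / *_scaled column names are assumed):

from pyspark.ml.feature import VectorAssembler, MinMaxScaler

assembler = VectorAssembler(inputCols=["year"], outputCol="year_vec")
scaler = MinMaxScaler(min=0.0, max=1.0, inputCol="year_vec", outputCol="year_scaled")
dataset = assembler.transform(dataset)
dataset = scaler.fit(dataset).transform(dataset)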
Example #25
def process_data(spark):
    """
    Read from S3 and process bike share data into dimensional tables.
    
    the bike share data (as CSVs) is read from a public S3 bucket to dataframes.
    the data is transformed using pyspark.sql functions
    finally data is saved back to the same S3 bucket in parquet format
    
    Parameters:
        spark: Spark session    
    """

    # read from S3 to dataframes
    st_station_df = spark.read.csv('s3://omar-dend/station.csv', header=True)
    st_weather_df = spark.read.csv('s3://omar-dend/weather.csv', header=True)
    st_trip_df = spark.read.csv('s3://omar-dend/trip.csv', header=True)
    st_status_df = spark.read.csv('s3://omar-dend/status.csv', header=True)
    st_city_df = spark.read.csv('s3://omar-dend/city.csv', header=True)

    # save counts to ensure later that all rows are present
    station_count = st_station_df.count()
    weather_count = st_weather_df.count()

    # adding timestamp to all the dataframes to standardize datetime
    st_station_df = st_station_df.withColumn(
        'datetime',
        F.to_timestamp(st_station_df.installation_date, 'MM/dd/yyyy'))

    st_weather_df = st_weather_df.withColumn(
        'datetime', F.to_timestamp(st_weather_df.date, 'MM/dd/yyyy'))

    st_trip_df = st_trip_df.withColumn(
        'datetime_start',
        F.to_timestamp(st_trip_df.start_date, 'MM/dd/yyyy HH:mm'))
    st_trip_df = st_trip_df.withColumn(
        'datetime_end', F.to_timestamp(st_trip_df.end_date,
                                       'MM/dd/yyyy HH:mm'))

    st_status_df = st_status_df.withColumn(
        'datetime', F.to_timestamp(st_status_df.time, 'yyyy/MM/dd HH:mm:ss'))

    # create dim_weather
    weather_df = st_weather_df.select('max_temperature_f', 'mean_temperature_f', 'min_temperature_f',
                                      'max_humidity', 'mean_humidity', 'min_humidity',
                                      'max_wind_Speed_mph', 'mean_wind_speed_mph',
                                      'precipitation_inches',
                                      'events', 'zip_code', 'datetime')\
                                    .dropDuplicates()

    # create dim_station
    station_df = st_station_df.select(
        F.col('id').alias('station_id'),
        F.col('name').alias('station_name'), 'lat', 'long', 'dock_count',
        'city',
        F.col('datetime').alias('installation_datetime'))

    station_df = station_df.join(st_city_df, station_df.city == st_city_df.city, 'left')\
                                .drop('city')\
                                .dropDuplicates()

    # make sure none of the station or weather data was dropped by mistake
    station_dim_count = station_df.count()
    weather_dim_count = weather_df.count()

    if station_dim_count != station_count or weather_dim_count != weather_count:
        raise Exception('Some dimensional rows are missing')
    else:
        print('All is good')

    # load (save) dim_station to S3 in parquet format
    station_df.write.mode('overwrite')\
        .parquet('s3://omar-dend/dim_station')

    # load (save) dim_weather to S3 in parquet format partitioned by zip_code
    weather_df.write.mode('overwrite')\
        .partitionBy('zip_code')\
        .parquet('s3://omar-dend/dim_weather')

    # create dim_time
    # union datetimes from every source so dim_time covers all of them
    time_df = st_station_df.select('datetime')\
        .union(st_weather_df.select('datetime'))\
        .union(st_trip_df.select(F.col('datetime_start').alias('datetime')))\
        .union(st_trip_df.select(F.col('datetime_end').alias('datetime')))\
        .union(st_status_df.select('datetime'))\
        .withColumn('second', F.second('datetime'))\
        .withColumn('minute', F.minute('datetime'))\
        .withColumn('hour', F.hour('datetime'))\
        .withColumn('day', F.dayofmonth('datetime'))\
        .withColumn('week', F.weekofyear('datetime'))\
        .withColumn('month', F.month('datetime'))\
        .withColumn('year', F.year('datetime'))\
        .withColumn('weekday', F.dayofweek('datetime'))\
        .dropDuplicates()

    # load (save) dim_time to S3 in parquet format partitioned by year & month
    time_df.write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet('s3://omar-dend/dim_time')

    # create fact_trip
    trip_df = st_trip_df.select(F.col('id').alias('trip_id'), 'duration', 'bike_id',
                            'subscription_type',
                            'start_station_id', 'end_station_id',
                            'datetime_start', 'datetime_end')\
                            .dropDuplicates()

    # load (save) fact_trip to S3 in parquet format
    trip_df.write.mode('overwrite')\
        .parquet('s3://omar-dend/fact_trip')

    # create fact_status
    status_df = st_status_df.select('station_id', 'bikes_available',
                                'docks_available', 'datetime')\
                                .dropDuplicates()

    # load (save) fact_status to S3 in parquet format partitioned by station_id
    status_df.write.mode('overwrite')\
        .partitionBy('station_id')\
        .parquet('s3://omar-dend/fact_status')
Example #26
            mapping.append((field.name, field.dataType.typeName(), field.name,
                            field.dataType.typeName()))
    dyf = dyf.apply_mapping(mapping)

    # Add partition columns
    df = dyf.toDF()
    if 'year' in partition_keys:
        df = df.withColumn('year', year(timestamp_column_name))
    if 'month' in partition_keys:
        df = df.withColumn('month', month(timestamp_column_name))
    if 'day' in partition_keys:
        df = df.withColumn('day', dayofmonth(timestamp_column_name))
    if 'hour' in partition_keys:
        df = df.withColumn('hour', hour(timestamp_column_name))
    if 'minute' in partition_keys:
        df = df.withColumn('minute', minute(timestamp_column_name))

    df = df.drop(col(tmp_timestamp_column_name))
    dyf = DynamicFrame.fromDF(df, glue_context, "add_partitions")

# Write DynamicFrame to S3 in glueparquet format
sink = glue_context.getSink(connection_type="s3",
                            path=output_path,
                            enableUpdateCatalog=True,
                            partitionKeys=partition_keys)
sink.setFormat("glueparquet")
sink.setCatalogInfo(catalogDatabase=output_database,
                    catalogTableName=output_table)
sink.writeFrame(dyf)

job.commit()
Example #27
the_day = (sorted([
    x.dayofyear for x in (df_time_ma.select(
        dayofyear('timestamp').alias('dayofyear')).distinct().take(5))
]))[0]  # min doesn't work on javalist
the_hours = 12
the_title = ("Data for asset {}, variable {} for day {}, {} hours".format(
    the_asset, the_variable, the_day, the_hours))

# currently exports to pandas for visualization and export in CSV format, later on the pyspark dataframe is exported in CSV
test_df = df_time_ma.filter(df_time_ma.asset == the_asset).filter(
    df_time_ma.variable == the_variable).filter(
        dayofyear('timestamp') == the_day).filter(
            hour('timestamp') <= the_hours).cache()
test_df_1s = test_df.toPandas()
test_df_60s = test_df.filter(second(df_time_ma.timestamp) == 0).toPandas()
test_df_10m = test_df.filter(minute(df_time_ma.timestamp) % 10 == 0).filter(
    second(df_time_ma.timestamp) == 0).toPandas()

plt.figure(figsize=(12, 4))
plt.plot(test_df_1s.timestamp, test_df_1s.ma, 'b')
plt.plot(test_df_60s.timestamp, test_df_60s.ma, 'r')
plt.plot(test_df_10m.timestamp, test_df_10m.ma, 'g')
plt.grid()
plt.title(the_title)
plt.legend(['1s', '60s', '10m'])
display(plt.gcf())

# COMMAND ----------

from itertools import chain
from pyspark.sql.functions import create_map, lit, round
def start_stream(args):
    validate_params(args)
    _, brokers, topic = args

    spark = create_spark_session()

    json = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .load()

    json.printSchema()

    # Explicitly set schema
    schema = StructType([
        StructField("symbol", StringType(), False),
        StructField("timestamp", TimestampType(), False),
        StructField("price", DoubleType(), False)
    ])

    json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"}
    stocks_json = json \
        .select(from_json(F.col("value").cast("string"), schema, json_options).alias("content"))

    stocks_json.printSchema()

    stocks = stocks_json.select("content.*")

    ####################################
    # Stream to Parquet
    ####################################
    query = stocks \
        .withColumn('year', year(F.col('timestamp'))) \
        .withColumn('month', month(F.col('timestamp'))) \
        .withColumn('day', dayofmonth(F.col('timestamp'))) \
        .withColumn('hour', hour(F.col('timestamp'))) \
        .withColumn('minute', minute(F.col('timestamp'))) \
        .writeStream \
        .format('parquet') \
        .partitionBy('year', 'month', 'day', 'hour', 'minute') \
        .option('startingOffsets', 'earliest') \
        .option('checkpointLocation', '/dataset/checkpoint') \
        .option('path', '/dataset/streaming.parquet') \
        .trigger(processingTime='30 seconds') \
        .start()

    query.awaitTermination()

    # avg_pricing = stocks \
    #     .groupBy(F.col("symbol")) \
    #     .agg(F.avg(F.col("price")).alias("avg_price"))

    ####################################
    # Console Output
    ####################################
    # query2 = avg_pricing.writeStream \
    #     .outputMode('complete') \
    #     .format("console") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()

    # query2.awaitTermination()

    ####################################
    # Table in Memory
    ####################################
    # query3 = avg_pricing \
    #     .writeStream \
    #     .queryName("avgPricing") \
    #     .outputMode("complete") \
    #     .format("memory") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    #
    # while True:
    #     print('\n' + '_' * 30)
    #     # interactively query in-memory table
    #     spark.sql('SELECT * FROM avgPricing').show()
    #     print(query3.lastProgress)
    #     sleep(10)

    # query3.awaitTermination()

    ####################################
    # Writing to Postgres
    ####################################

    # Simple insert
    # query = stream_to_postgres(stocks)
    # query.awaitTermination()

    # Average Price Aggregation
    # query = stream_aggregation_to_postgres(stocks)
    # query.awaitTermination()

    # Final Average Price Aggregation with Timestamp columns
    # query = stream_aggregation_to_postgres_final(stocks)
    # query.awaitTermination()

    pass
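Once the stream has produced output, the partitioned parquet can be read back and pruned on the partition columns; a small sketch assuming a SparkSession named spark and the path used above (the filter values are only illustrative):

parquet_df = spark.read.parquet('/dataset/streaming.parquet')
parquet_df.filter((parquet_df.year == 2024) & (parquet_df.month == 1)).show()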
Example #29
# keep rows where the observed Speed is below 60% of the Reference speed
dfinrixm = dfinrix.filter("Speed < 0.6 * Reference")

# COMMAND ----------

# cast the first 19 characters of CentralTime to a timestamp column NT and
# drop the columns that are no longer needed
dftimestamp = dfinrixm.withColumn(
    'NT', substring('CentralTime', 1, 19).astype("Timestamp")
).drop("C-Value", "SegmentClosed", "Score", "Speed", "Average",
       "Reference", "Travel", "Time")

# COMMAND ----------

# msm = minutes since midnight, computed from the NT timestamp
dfmsm = dftimestamp.withColumn(
    "msm",
    hour(dftimestamp.NT) * 60 + minute(dftimestamp.NT)).drop("CentralTime")

# COMMAND ----------

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, ArrayType, StructType

# COMMAND ----------

dfb = dfmsm.sort("Code", "msm")

# COMMAND ----------
Example #30
0
def main():

    spark = SparkSession \
          .builder \
          .appName("spark_streaming_app") \
          .getOrCreate()

    df = (spark.readStream.format('kafka').option(
        'kafka.bootstrap.servers',
        '104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092').option(
            'subscribe', 'stream_data').option('startingOffsets',
                                               'earliest').load())

    df = df.selectExpr('CAST(value as STRING)')

    df = df.select(from_json(col('value'), data_schema).alias('df'))

    # map the raw group_state value to its entry in the `states` lookup
    # (the `states` dict is defined elsewhere in the original script)
    func1 = udf(lambda x: states[x.upper()], StringType())

    df = df.filter(col('df.group.group_country') == 'us') \
        .select('df') \
        .withColumn('group_state', func1('df.group.group_state')) \
        .withColumn('time', from_unixtime(col('df.event.time') / 1000))

    df2 = df.select(
        struct(
            struct(
                col('df.event.event_name'),
                col('df.event.event_id'),
                col('time'),
            ).alias('event'), col('df.group.group_city'),
            col('df.group.group_country'), col('df.group.group_id'),
            col('df.group.group_name'), col('group_state')).alias('value'))

    stream2 = df2.select(to_json('value').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers",'104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \
        .option("topic", "US-meetups") \
        .option("checkpointLocation", "US-metups-checkpoint")

    stream2 = stream2.start()

    df3 = df.withColumn('timestamp', to_timestamp('time')) \
        .withWatermark('timestamp', '1 minute') \
        .groupBy(window('timestamp', '1 minute')) \
        .agg(struct(month('window.end').alias('month'),
                    dayofmonth('window.end').alias('day_of_the_month'),
                    hour('window.end').alias('hour'),
                    minute('window.end').alias('minute'),
                    collect_set('df.group.group_city').alias('cities')).alias('value')) \
        .select('value')

    stream3 = df3.select(to_json('value').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers",'104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \
        .option("topic", "US-cities-every-minute") \
        .option("checkpointLocation", "US-cities-every-minute-checkpoint")

    stream3 = stream3.start()

    df4 = df.select(
        struct(
            struct(
                col('df.event.event_name'),
                col('df.event.event_id'),
                col('time'),
            ).alias('event'), col('df.group.group_topics.topic_name'),
            col('df.group.group_city'), col('df.group.group_country'),
            col('df.group.group_id'), col('df.group.group_name'),
            col('group_state')).alias('value')).filter(
                arrays_overlap(
                    'value.topic_name',
                    array(lit("Computer programming"), lit("Big Data"),
                          lit("Machine Learning"), lit("Python"), lit("Java"),
                          lit("Web Development"))))

    stream4 = df4.select(to_json('value').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers",'104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \
        .option("topic", "Programming-meetups") \
        .option("checkpointLocation", "Programming-metups-checkpoint")

    stream4 = stream4.start()

    stream4.awaitTermination()  # blocks on stream4 only; see the sketch after main()

    spark.stop()
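
# main() above blocks only on stream4; stream2 and stream3 keep running on the
# same SparkSession until it stops. A small alternative sketch (not part of the
# original code) that waits for whichever active query terminates first:
def await_streams(spark):
    # List the active streaming queries, then block until any one terminates.
    for q in spark.streams.active:
        print(q.name, q.id, q.status)
    spark.streams.awaitAnyTermination()
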
Example #31
0
def process_log_data(spark, input_data_path):
    pl_start = time()
    print('Starting to process log data')
    # get filepath to log data file
    log_data = input_data_path

    # read log data file
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", LongType()),
        StructField("lastName", StringType()),
        StructField("length", DoubleType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", DoubleType()),
        StructField("sessionId", LongType()),
        StructField("song", StringType()),
        StructField("status", StringType()),
        StructField("ts", StringType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])

    log_df = spark.read.json(input_data_path, schema=log_schema)

    # filter by actions for song plays
    # Keep only "NextSong" page events; the filter stays distributed, so there
    # is no need to collect the rows to the driver and rebuild the DataFrame.
    log_df = log_df.filter(log_df.page == 'NextSong')

    # Convert ts from long to datetime
    convert_ts = udf(
        lambda x: datetime.datetime.fromtimestamp(float(x) / 1000.0),
        TimestampType())
    log_df = log_df.withColumn("ts_converted", convert_ts(log_df.ts))

    # Convert registration from double to long
    log_df = log_df.withColumn("registration_converted",
                               log_df.registration.cast(LongType()))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Read & Transformation', round(pl_et, 2)))

    print('Creating users table')
    temp_start = time()
    # extract columns for users table
    # creating users table with columns user_id, first_name, last_name, gender, level
    users_table = log_df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])\
            .withColumnRenamed('userId', 'user_id')\
            .withColumnRenamed('firstName', 'first_name')\
            .withColumnRenamed('lastName', 'last_name').dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating users table', round(pl_et, 2)))

    # extract columns to create time table
    # Creating time table with columns start_time, hour, day, week, month, year, weekday
    print('Creating time table')
    temp_start = time()
    time_table = log_df.select(['ts_converted'])\
                        .withColumnRenamed('ts_converted','start_time')

    time_table = time_table.withColumn('day', F.dayofmonth('start_time')) \
                          .withColumn('month', F.month('start_time')) \
                          .withColumn('year', F.year('start_time')) \
                          .withColumn('hour', F.hour('start_time')) \
                          .withColumn('minute', F.minute('start_time')) \
                          .withColumn('second', F.second('start_time')) \
                          .withColumn('week', F.weekofyear('start_time')) \
                          .withColumn('weekday', F.dayofweek('start_time')).dropDuplicates()
    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating time table', round(pl_et, 2)))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Total', round(pl_et, 2)))
    return log_df, users_table, time_table
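
# A minimal usage sketch for process_log_data; the input path and app name are
# hypothetical and do not come from the original example.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("process_log_data_demo").getOrCreate()
log_df, users_table, time_table = process_log_data(spark, "data/log_data/*.json")
users_table.show(5, truncate=False)
time_table.printSchema()
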
Example #32
0
bicimadc_ds = bicimadc_ds.withColumnRenamed("light", "occupation")

bicimad_filtered = bicimadc_ds.filter(sf.col("activate") == "1")

bicimad_coordinates = bicimad_filtered.withColumn("geometry", sf.substring("geometry", 35, 40))\
    .withColumn("longitud", sf.split("geometry", ",")[0]) \
    .withColumn("latitud", sf.split("geometry", ",")[1]) \
    .withColumn("latitud", sf.expr("substring(latitud, 2, length(latitud)-3)")) \
    .drop("geometry")

bicimad_partition = bicimad_coordinates\
    .withColumn("year", year("datetime")) \
    .withColumn("month", month("datetime")) \
    .withColumn("day", dayofmonth("datetime")) \
    .withColumn("hour", hour("datetime")) \
    .withColumn("minute", minute("datetime"))


# Send the result to Kafka in micro-batches
queryToKafka = bicimad_partition\
    .select(bicimad_partition["id"].cast('string').alias("key"),
            to_json(struct("*")).alias("value"))\
    .writeStream \
    .format("kafka") \
    .trigger(processingTime='3 minutes') \
    .option("kafka.bootstrap.servers", 'localhost:9092') \
    .option("topic", "bicimad-druid-stream") \
    .option("checkpointLocation", "/tmp/checkpoint/kafka/stream/bicimad/") \
    .outputMode("Append") \
    .start()
#old checkpoint path: /tmp/checkpoint/kafka/bicimad/
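
# A console-sink variant of the same stream for quick local debugging; this is
# a sketch, not part of the original job, and simply reuses bicimad_partition.
queryToConsole = bicimad_partition \
    .writeStream \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='3 minutes') \
    .outputMode("append") \
    .start()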