Example #1
def main():

    spark = SparkSession(sc)

    tweet_schema = types.StructType([
        types.StructField('topic', types.StringType(), True),
        types.StructField('text', types.StringType(), True),
        types.StructField('created_at', types.StringType(), True),
    ])

    tweets = spark.read.json('kafka_data_2.json', schema=tweet_schema)
    tweets = tweets.select(
        col('topic').alias('topic-sentiment'),
        col('text').alias('text_original'), col('created_at'))

    tweets = tweets.withColumn(
        'text_clean',
        regexp_replace(
            tweets['text_original'],
            r'(#\w+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' '))

    TextBlob_udf = udf(sentiment_calc)
    tweets = tweets.withColumn("sentiment_score",
                               TextBlob_udf(tweets.text_clean).cast('double'))
    tweets = tweets.withColumn("sentiment_score_rounded",
                               func.round(tweets['sentiment_score'], 1))

    datefunc = udf(
        lambda x: datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y'),
        TimestampType())
    tweets = tweets.withColumn(
        'created_at_PST_onestep',
        func.from_utc_timestamp(datefunc(tweets['created_at']), "PST"))
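The UDF above wraps a sentiment_calc helper that is not shown in this excerpt. A minimal sketch of what it might look like, assuming it returns TextBlob's polarity score (the TextBlob dependency and the None fallback are assumptions):

from textblob import TextBlob

def sentiment_calc(text):
    # Hypothetical helper: polarity in [-1.0, 1.0], or None if scoring fails.
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        return None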
 def dim_user(self, tweets:DataFrame)->DataFrame:
     dim_user = (
         tweets
         .withColumn(
             "user_created_at",
             F.from_utc_timestamp(
                 F.to_timestamp(
                     F.regexp_replace(
                         F.regexp_replace("user_created_at", "^[A-Za-z]{3} ", ""),
                         "\+0000 ", ""),
                     "MMM dd HH:mm:ss yyyy"
                 ),
                 "GMT-3"
             )
         )
         .select(
             "user_id",
             "user_name",
             "user_description",
             F.col("user_verification").cast(BooleanType()),
             F.col("user_followers_count").cast(IntegerType()),
             F.col("user_friends_count").cast(IntegerType()),
             "user_created_at",
             "user_location",
         )
     )
     return dim_user
Example #3
def adjust_timezone(df):
    """Adjust timezone to KST from UTC

    :param df: Input DataFrame
    :return: Output DataFrame
    """
    return (
        df
        .withColumn(
            'timestamp',
            from_utc_timestamp(
                unix_timestamp(df['timestamp'], "yyyy-MM-dd'T'HH:mm:ss.SSSXXX").cast('timestamp'),
                'KST')))
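An alternative sketch of the same conversion, assuming Spark 2.2+ where to_timestamp is available; it parses the ISO-8601 string directly and uses the region-based zone id Asia/Seoul instead of the three-letter 'KST' abbreviation:

from pyspark.sql import functions as F

def adjust_timezone_v2(df):
    # Parse the ISO-8601 string, then shift the UTC instant to Korean time.
    return df.withColumn(
        'timestamp',
        F.from_utc_timestamp(
            F.to_timestamp(df['timestamp'], "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"),
            'Asia/Seoul'))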
Example #4
 def convert_timestamp_to_date(self,df):
     """
     TODO: find out whether converting to a timestamp is necessary, i.e. what the server's timestamp is
     CET : Central European Time : UTC + 1
     CEST: Central European Summer Time : UTC + 2
     """
     utc_to_time_udf = udf(utc_to_time)
     df.show(10)
     UTC = df.withColumn("sessionstarttime", utc_to_time_udf("sessionstarttime"))
     #print("SHOULD BE IN LOCAL TIME")
     UTC.show(10)
     return UTC.withColumn("sessionstarttime", from_utc_timestamp("sessionstarttime", "CET")) #CEST?
    def utc_to_local(self, localTimeZone, subset=None):
        if subset is not None:
            tsCol = (
                x for x in self.inputSchema
                if str(x.dataType) == "TimestampType" and x.name in subset)
        else:
            tsCol = (x for x in self.inputSchema
                     if str(x.dataType) == "TimestampType")

        for x in tsCol:
            self.inputDf = self.inputDf.withColumn(
                x.name, from_utc_timestamp(col(x.name), localTimeZone))
        return self.inputDf
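utc_to_local above (and changeTimezone later on this page) are methods of a class that is not shown; they only need an inputDf DataFrame and its inputSchema fields. A hypothetical minimal holder, just for context:

from pyspark.sql import DataFrame

class DfTimezoneHelper:
    # Hypothetical container exposing the two attributes the methods rely on.
    def __init__(self, df: DataFrame):
        self.inputDf = df
        self.inputSchema = df.schema.fields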
Example #6
def ad_stream_translator(df):
    from pyspark.sql.types import StructType, StringType
    from pyspark.sql.functions import from_json, col, from_utc_timestamp, when

    payload_schema = (
        StructType()
        .add("clicked", StringType(), False)
        .add("auction_id", StringType(), False)
        .add("num_ads_bid", StringType(), False)
        .add("ad_id", StringType(), False)
        .add("ad_campaign_id", StringType(), False)
        .add("partner_domain_name", StringType(), False)
        .add("content_keyword", StringType(), False)
        .add("ad_content_id", StringType(), False)
        .add("ad_group_id", StringType(), False)
        .add("ad_display_placement", StringType(), False)
        .add("ad_destination_domain_id", StringType(), False)
        .add("partner_id", StringType(), False)
        .add("is_pwa", StringType(), False)
        .add("user_uuid", StringType(), False)
        .add("timestamp", StringType(), False)
        .add("datestr", StringType(), True)
    )

    return (
      df.selectExpr("cast (data as STRING) jsonData")
      .select(from_json("jsonData", payload_schema).alias("payload"))
      .select(
          col("payload.clicked").cast("long").alias("clicked"),
          col("payload.auction_id").alias("auction_id"),
          col("payload.num_ads_bid").cast("long").alias("num_ads_bid"),
          col("payload.ad_id").cast("long").alias("ad_id"),
          col("payload.ad_campaign_id").cast("long").alias("ad_campaign_id"),
          col("payload.partner_domain_name").alias("partner_domain_name"),
          col("payload.content_keyword").alias("content_keyword"),
          col("payload.ad_content_id").cast("long").alias("ad_content_id"),
          col("payload.ad_group_id").alias("ad_group_id"),
          col("payload.ad_display_placement").alias("ad_display_placement"),
          col("payload.ad_destination_domain_id").cast("long").alias("ad_destination_domain_id"),
          col("payload.partner_id").cast("long").alias("partner_id"),
          when(
            col("payload.is_pwa") == "True",
            True).when(
            col("payload.is_pwa") == "False",
            False).alias("is_pwa"),
          col("payload.user_uuid").alias("user_uuid"),
          from_utc_timestamp("payload.timestamp", "UTC").alias("timestamp"),
          col("payload.datestr").alias("datestr")
      )
    )
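A quick batch smoke test for the translator above; the sample record is made up, and any JSON fields omitted from it simply come back as NULL after from_json:

import json
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = json.dumps({"clicked": "1", "auction_id": "a-1", "is_pwa": "True",
                     "timestamp": "2021-01-01 00:00:00"})
raw = spark.createDataFrame([(sample,)], ["data"])
ad_stream_translator(raw).select("clicked", "is_pwa", "timestamp").show()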
    def changeTimezone(self, fromTimezone, toTimezone, subset=None):
        if subset is not None:
            tsCol = (
                x for x in self.inputSchema
                if str(x.dataType) == "TimestampType" and x.name in subset)
        else:
            tsCol = (x for x in self.inputSchema
                     if str(x.dataType) == "TimestampType")

        for x in tsCol:
            self.inputDf = self.inputDf.withColumn(
                x.name, to_utc_timestamp(col(x.name), fromTimezone))
            self.inputDf = self.inputDf.withColumn(
                x.name, from_utc_timestamp(col(x.name), toTimezone))
        return self.inputDf
Example #8
def read_pcap_csv(filename, spark_session):
    schema = build_csv_schema()
    df = spark_session.read.csv(
        filename, header=True, sep=";", schema=schema
    )
    df = df \
        .withColumnRenamed("frame.time", "time") \
        .withColumnRenamed("frame.time_epoch", "time_epoch") \
        .withColumnRenamed("_ws.col.Source", "Source") \
        .withColumnRenamed("_ws.col.Destination", "Destination") \
        .withColumnRenamed("_ws.col.Protocol", "Protocol") \
        .withColumnRenamed("frame.len", "frame_len")
    df = df.withColumn('parsed_time', F.from_utc_timestamp(df['time_epoch'].cast(TimestampType()), 'UTC'))
    df = df.withColumn("id", F.monotonically_increasing_id())
    return df.orderBy("parsed_time")
def process_log_data(spark, input_data, output_data):
    '''
    Read log data from JSON files, extract columns for the users, time and songplays tables, and write each table out as parquet files.

    The following steps are performed:

    1. Build the path to the log data JSON files, "s3a://udacity-dend/log_data/2018/11/*.json",
        so that every JSON file under "s3a://udacity-dend/log_data/2018/11" is read into a dataframe.

    2. Filter the dataframe from step 1 to keep only rows whose 'page' is 'NextSong',
        and drop records with an NA value in any of the columns 'ts', 'song', 'userId', 'lastName' or 'firstName'.

    3. Call process_users_table() to build the users table.

    4. Add timestamp and datetime columns to the log dataframe, then call process_time_table() to build the time table.

    5. Call process_songplays_table() to build the songplays table.
    '''

    # get filepath to log data file
    log_data = input_data + "log_data/2018/11/*.json"

    # read log data file
    df = read_log_data(spark, log_data)

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong').dropna(
        subset=['ts', 'song', 'userId', 'firstName', 'lastName'])

    # create a timestamp column from the original epoch-millisecond 'ts' column
    df = df.withColumn("timestamp", F.from_unixtime(df.ts / 1000))

    # create a PST datetime column from the new timestamp column
    df = df.withColumn("datetime",
                       F.from_utc_timestamp(df["timestamp"], 'PST'))

    # process users table
    process_users_table(df, output_data)

    # process time table
    process_time_table(df, output_data)

    # process songplays table
    process_songplays_table(df, output_data)
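A standalone sketch of the ts conversion used above, with a made-up epoch value. Note that from_unixtime renders the epoch as a string in the session time zone, so treating that string as UTC in from_utc_timestamp is only exact when spark.sql.session.timeZone is UTC:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.session.timeZone", "UTC")  # keep from_unixtime output in UTC

sample = spark.createDataFrame([(1542241826796,)], ["ts"])  # hypothetical epoch milliseconds
(sample
 .withColumn("timestamp", F.from_unixtime(sample.ts / 1000))
 .withColumn("datetime", F.from_utc_timestamp("timestamp", "PST"))
 .show(truncate=False))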
 def fact_tweet(self, tweets:DataFrame)->DataFrame:
     assert type(tweets) is DataFrame
     fact_tweet = (
         tweets
         .withColumn(
             "created_at",
             F.from_utc_timestamp(
                 F.to_timestamp(
                     F.regexp_replace(
                         F.regexp_replace("created_at", "^[A-Za-z]{3} ", ""),
                         "\+0000 ", ""),
                     "MMM dd HH:mm:ss yyyy"
                 ),
                 "GMT-3"
             )
         )
         .withColumn(
             "hashtags", 
             F.when(
                 F.col("hashtags") != '',
                 F.regexp_replace("hashtags", ",\s$", "")
             )
             .otherwise(None)
         )
         .withColumn("created_at_partition_year", F.date_format("created_at", "yyyy"))
         .withColumn("created_at_partition_month", F.date_format("created_at", "MM"))
         .withColumn("created_at_partition_day", F.date_format("created_at", "dd"))
         .withColumn("created_at_partition_hour", F.date_format("created_at", "HH"))
         .select(
             "tweet_id",    
             "created_at",
             "text",
             "hashtags",
             F.col("retweet_count").cast(IntegerType()),
             F.col("possibly_sensitive").cast(BooleanType()),
             "lang",   
             "user_id",
             "created_at_partition_year",
             "created_at_partition_month",
             "created_at_partition_day",
             "created_at_partition_hour",
         )
     )
     return fact_tweet
Example #11
def add_cyclical_features(df, source_column, target_column):
	df = df.withColumn(target_column, F.from_utc_timestamp(F.col(source_column), "UTC"))
	hour = target_column + "_hour"
	weekday = target_column + "_weekday"
	month = target_column + "_month"
	year = target_column + "_year"
	df = df.withColumn(hour, F.hour(F.col(target_column)))
	df = df.withColumn(weekday, F.dayofweek(F.col(target_column))-1)
	df = df.withColumn(month, F.month(F.col(target_column))-1)
	df = df.withColumn("DerivedTime_cos_" + hour, F.cos(F.col(hour) * (2.0 * np.pi / 24)) + 1)
	df = df.withColumn("DerivedTime_sin_" + hour, F.sin(F.col(hour) * (2.0 * np.pi / 24)) + 1)
	df = df.withColumn("DerivedTime_cos_" + weekday, F.cos(F.col(weekday) * (2.0 * np.pi / 7)) + 1)
	df = df.withColumn("DerivedTime_sin_" + weekday, F.sin(F.col(weekday) * (2.0 * np.pi / 7)) + 1)
	df = df.withColumn("DerivedTime_cos_" + month, F.cos(F.col(month) * (2.0 * np.pi / 12)) + 1)
	df = df.withColumn("DerivedTime_sin_" + month, F.sin(F.col(month) * (2.0 * np.pi / 12)) + 1)
	df = df.withColumn("DerivedTime_" + year, F.year(F.col(target_column)))
	drop_cols = [source_column, hour, weekday, month]
	df = df.drop(*drop_cols)
	return df
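Example #11 assumes numpy is imported as np and pyspark.sql.functions as F. A usage sketch with a made-up event-time column; the column names are assumptions:

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
events = (spark.createDataFrame([("2021-06-15 08:30:00",)], ["event_time"])
          .withColumn("event_time", F.to_timestamp("event_time")))
features = add_cyclical_features(events, "event_time", "DerivedTime")
features.show(truncate=False)  # sine/cosine encodings derived from event_time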
Example #12
import pyspark.sql.functions as f

reader = spark.read

# ok we can get the table
label_editors = reader.parquet(
    "/user/nathante/ores_bias_data/ores_label_editors")

# get the edit history
mw_hist = reader.table("wmf.mediawiki_history")
mw_hist = mw_hist.filter(f.col("snapshot") == "2019-03")
mw_hist = mw_hist.filter(f.col("event_entity") == "revision")
mw_hist = mw_hist.select(
    ["revision_id", "event_timestamp", "event_user_id", "wiki_db"])
mw_hist = mw_hist.withColumn(
    "timestamp", f.from_utc_timestamp(f.col("event_timestamp"), tz="utc"))

# we want to identify newcomers and anons
# anons are just the folks without a userid
label_editors = label_editors.withColumn("is_anon", (f.col('userid') == 0))

# newcomers are not anons
non_anons = label_editors.filter(f.col("is_anon") == False)

# find the edits by the editors
edit_histories = non_anons.join(mw_hist,
                                on=[
                                    non_anons.userid == mw_hist.event_user_id,
                                    non_anons.wiki == mw_hist.wiki_db,
                                    non_anons.revid >= mw_hist.revision_id
                                ])
Example #13
def split(intersections, csv_report):
    logging.info('Splitting started')

    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: split')
    sqlContext = SQLContext(sc)

    logging.info('Source file reading')
    df = sqlContext.read.json(cfg.splitting.source_file)
    df = df.withColumn("Date", F.from_utc_timestamp("eventTime", "UTC"))

    users_with_event_count = df.groupBy(F.col("entityId").alias("user")).count()


    logging.info('Filter users with small number of events')
    min_events = 10
    users_with_few_events = (users_with_event_count
                             .filter("count < %d" % (min_events))
                             .select(F.col("user").alias("user_with_few_events")))
    ndf = df.join(users_with_few_events,
                  F.col("entityId")==F.col("user_with_few_events"),
                  how="left_outer")
    df1 = ndf.filter("user_with_few_events is NULL").drop("user_with_few_events")


    logging.info('Split data into train and test')
    train_df, test_df = split_data(df)
    train_df.write.json(cfg.splitting.train_file, mode="overwrite")
    test_df.write.json(cfg.splitting.test_file, mode="overwrite")


    train_df = train_df.select("entityId", "event", "targetEntityId").cache()
    test_df = test_df.select("entityId", "event", "targetEntityId").cache()


    logging.info('Calculation of different stat metrics of datasets')
    events_by_type = (df
                      .groupBy("event")
                      .count()
                      .select(F.col("event"), F.col("count").alias("count_total"))
                      .toPandas())

    events_by_type_test = (test_df
                           .groupBy("event")
                           .count()
                           .select(F.col("event"), F.col("count").alias("count_test"))
                           .toPandas()
                           .set_index("event"))

    events_by_type_train = (train_df
                            .groupBy("event")
                            .count()
                            .select(F.col("event"), F.col("count").alias("count_train"))
                            .toPandas()
                            .set_index("event"))

    unique_users_by_event = (df
                             .select(F.col("entityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_users_total"))
                             .toPandas()
                             .set_index("event"))

    unique_users_by_event_train = (train_df
                                   .select(F.col("entityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_users_train"))
                                   .toPandas()
                                   .set_index("event"))

    unique_users_by_event_test = (test_df
                                  .select(F.col("entityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_users_test"))
                                  .toPandas()
                                  .set_index("event"))

    unique_items_by_event = (df
                             .select(F.col("targetEntityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_items_total"))
                             .toPandas()
                             .set_index("event"))

    unique_items_by_event_train = (train_df
                                   .select(F.col("targetEntityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_items_train"))
                                   .toPandas()
                                   .set_index("event"))

    unique_items_by_event_test = (test_df
                                  .select(F.col("targetEntityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_items_test"))
                                  .toPandas()
                                  .set_index("event"))

    logging.info('Calculate total counts')
    events = df.count()
    events_train = train_df.count()
    events_test = test_df.count()

    unique_users = df.select("entityId").distinct().count()
    unique_users_train = train_df.select("entityId").distinct().count()
    unique_users_test = test_df.select("entityId").distinct().count()

    unique_items = df.select(F.col("targetEntityId")).distinct().count()
    unique_items_train = train_df.select(F.col("targetEntityId")).distinct().count()
    unique_items_test = test_df.select(F.col("targetEntityId")).distinct().count()

    info_df = events_by_type
    dfs = [unique_users_by_event, unique_items_by_event,
            events_by_type_train, events_by_type_test,
            unique_users_by_event_train, unique_users_by_event_test,
            unique_items_by_event_train, unique_items_by_event_test]

    for data_frame in dfs:
        info_df = info_df.join(data_frame, on="event")

    n_rows, n_cols = info_df.shape

    # totals
    info_df.loc[n_rows] = ['ANY EVENT', events, unique_users, unique_items,
                        events_train, events_test,
                        unique_users_train, unique_users_test,
                        unique_items_train, unique_items_test]

    info_df.insert(4, 'events per user', info_df.ix[:, 1] / info_df.ix[:, 2])
    info_df.insert(5, 'events per item', info_df.ix[:, 1] / info_df.ix[:, 3])

    logging.info('Create event stat worksheet')
    reporter.start_new_sheet('Events stat')
    reporter.report(
        ['event', 'event count', 'unique users', 'unique items',
         'events per user', 'events per item',
         'event count train', 'event count test',
         'unique users train', 'unique users test',
         'unique items train', 'unique items test'],
        [column.tolist() for _, column in info_df.iteritems()],
        selected_rows=[next(info_df.iteritems())[1].tolist().index(cfg.testing.primary_event)],
        cfg=cfg)
    reporter.finish_sheet()

    if intersections:
        logging.info('Start intersections calculation')

        reporter.start_new_sheet('Intersections')

        columns_for_matrix = cfg.testing.events
        logging.info('Process train / train user intersection')
        train_train_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("entityId").alias("user"), F.col("event").alias("event_right")).distinct(),
               on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtru = mk_intersection_matrix(train_train_users, columns_for_matrix)
        reporter.report(
            [''] + list(trtru.columns.values),
            [trtru.index.tolist()] + [column for _, column in trtru.iteritems()],
            title='Train / train user intersection')

        logging.info('Process train / test user intersection')
        train_test_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("entityId").alias("user"), F.col("event").alias("event_right")).distinct(),
               on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtsu = mk_intersection_matrix(train_test_users, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsu.columns.values),
            [trtsu.index.tolist()] + [column for _, column in trtsu.iteritems()],
            title='Train / test user intersection')

        logging.info('Process train / train item intersection')
        train_train_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_right")).distinct(),
               on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtri = mk_intersection_matrix(train_train_items, columns_for_matrix)
        reporter.report(
            [''] + list(trtri.columns.values),
            [trtri.index.tolist()] + [column for _, column in trtri.iteritems()],
            title='Train / train item intersection'
        )

        logging.info('Process train / test item intersection')
        train_test_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_right")).distinct(),
               on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtsi = mk_intersection_matrix(train_test_items, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsi.columns.values),
            [trtsi.index.tolist()] + [column for _, column in trtsi.iteritems()],
            title='Train / test item intersection'
        )

        reporter.report_config(cfg)

    reporter.finish_document()
    logging.info('Splitting finished successfully')
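The split() function above (and its reformatted copy in Example #15) calls a split_data helper that is not shown. A minimal sketch, assuming a plain random split; the original project may well split by time or by user instead:

def split_data(df, test_fraction=0.2, seed=42):
    # Hypothetical: random train/test split with a fixed seed for reproducibility.
    train_df, test_df = df.randomSplit([1.0 - test_fraction, test_fraction], seed=seed)
    return train_df, test_df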
Example #14
 .show(5))

# COMMAND ----------

withDate = filtered.withColumn('date', func.date_format('timestamp', 'MM/dd/yyyy'))
withDate.printSchema()
withDate.select('title', 'timestamp', 'date').show(3)

# COMMAND ----------

# MAGIC %md
# MAGIC It seems like we want a different function for time zone manipulation and to store the object as a timestamp rather than a string.  Let's use `from_utc_timestamp` to get a timestamp object back with the correct time zone.

# COMMAND ----------

withCEST = withDate.withColumn('cest_time', func.from_utc_timestamp('timestamp', 'Europe/Amsterdam'))
withCEST.printSchema()

(withCEST
 .select('timestamp', 'cest_time')
 .show(3, False))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, let's convert the text field to lowercase.  We'll use the `lower` function for this.

# COMMAND ----------

lowered = withCEST.select('*', func.lower(col('text')).alias('lowerText'))
Example #15
def split(intersections, csv_report):
    logging.info('Splitting started')

    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: split')
    sqlContext = SQLContext(sc)

    logging.info('Source file reading')
    df = sqlContext.read.json(cfg.splitting.source_file)
    df = df.withColumn("Date", F.from_utc_timestamp("eventTime", "UTC"))

    users_with_event_count = df.groupBy(
        F.col("entityId").alias("user")).count()

    logging.info('Filter users with small number of events')
    min_events = 10
    users_with_few_events = (users_with_event_count.filter(
        "count < %d" % (min_events)).select(
            F.col("user").alias("user_with_few_events")))
    ndf = df.join(users_with_few_events,
                  F.col("entityId") == F.col("user_with_few_events"),
                  how="left_outer")
    df1 = ndf.filter("user_with_few_events is NULL").drop(
        "user_with_few_events")

    logging.info('Split data into train and test')
    train_df, test_df = split_data(df)
    train_df.write.json(cfg.splitting.train_file, mode="overwrite")
    test_df.write.json(cfg.splitting.test_file, mode="overwrite")

    train_df = train_df.select("entityId", "event", "targetEntityId").cache()
    test_df = test_df.select("entityId", "event", "targetEntityId").cache()

    logging.info('Calculation of different stat metrics of datasets')
    events_by_type = (df.groupBy("event").count().select(
        F.col("event"),
        F.col("count").alias("count_total")).toPandas())

    events_by_type_test = (test_df.groupBy("event").count().select(
        F.col("event"),
        F.col("count").alias("count_test")).toPandas().set_index("event"))

    events_by_type_train = (train_df.groupBy("event").count().select(
        F.col("event"),
        F.col("count").alias("count_train")).toPandas().set_index("event"))

    unique_users_by_event = (df.select(
        F.col("entityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_users_total")).toPandas().set_index(
                "event"))

    unique_users_by_event_train = (train_df.select(
        F.col("entityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_users_train")).toPandas().set_index(
                "event"))

    unique_users_by_event_test = (test_df.select(
        F.col("entityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_users_test")).toPandas().set_index(
                "event"))

    unique_items_by_event = (df.select(
        F.col("targetEntityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_items_total")).toPandas().set_index(
                "event"))

    unique_items_by_event_train = (train_df.select(
        F.col("targetEntityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_items_train")).toPandas().set_index(
                "event"))

    unique_items_by_event_test = (test_df.select(
        F.col("targetEntityId"),
        F.col("event")).distinct().groupBy("event").count().select(
            F.col("event"),
            F.col("count").alias("unique_items_test")).toPandas().set_index(
                "event"))

    logging.info('Calculate total counts')
    events = df.count()
    events_train = train_df.count()
    events_test = test_df.count()

    unique_users = df.select("entityId").distinct().count()
    unique_users_train = train_df.select("entityId").distinct().count()
    unique_users_test = test_df.select("entityId").distinct().count()

    unique_items = df.select(F.col("targetEntityId")).distinct().count()
    unique_items_train = train_df.select(
        F.col("targetEntityId")).distinct().count()
    unique_items_test = test_df.select(
        F.col("targetEntityId")).distinct().count()

    info_df = events_by_type
    dfs = [
        unique_users_by_event, unique_items_by_event, events_by_type_train,
        events_by_type_test, unique_users_by_event_train,
        unique_users_by_event_test, unique_items_by_event_train,
        unique_items_by_event_test
    ]

    for data_frame in dfs:
        info_df = info_df.join(data_frame, on="event")

    n_rows, n_cols = info_df.shape

    # totals
    info_df.loc[n_rows] = [
        'ANY EVENT', events, unique_users, unique_items, events_train,
        events_test, unique_users_train, unique_users_test, unique_items_train,
        unique_items_test
    ]

    info_df.insert(4, 'events per user', info_df.ix[:, 1] / info_df.ix[:, 2])
    info_df.insert(5, 'events per item', info_df.ix[:, 1] / info_df.ix[:, 3])

    logging.info('Create event stat worksheet')
    reporter.start_new_sheet('Events stat')
    reporter.report([
        'event', 'event count', 'unique users', 'unique items',
        'events per user', 'events per item', 'event count train',
        'event count test', 'unique users train', 'unique users test',
        'unique items train', 'unique items test'
    ], [column.tolist() for _, column in info_df.iteritems()],
                    selected_rows=[
                        next(info_df.iteritems())[1].tolist().index(
                            cfg.testing.primary_event)
                    ],
                    cfg=cfg)
    reporter.finish_sheet()

    if intersections:
        logging.info('Start intersections calculation')

        reporter.start_new_sheet('Intersections')

        columns_for_matrix = cfg.testing.events
        logging.info('Process train / train user intersection')
        train_train_users = (train_df.select(
            F.col("entityId").alias("user"),
            F.col("event").alias("event_left")).distinct().join(
                train_df.select(
                    F.col("entityId").alias("user"),
                    F.col("event").alias("event_right")).distinct(),
                on="user",
                how="inner").groupBy(["event_left",
                                      "event_right"]).count().collect())

        trtru = mk_intersection_matrix(train_train_users, columns_for_matrix)
        reporter.report([''] + list(trtru.columns.values),
                        [trtru.index.tolist()] +
                        [column for _, column in trtru.iteritems()],
                        title='Train / train user intersection')

        logging.info('Process train / test user intersection')
        train_test_users = (train_df.select(
            F.col("entityId").alias("user"),
            F.col("event").alias("event_left")).distinct().join(
                test_df.select(
                    F.col("entityId").alias("user"),
                    F.col("event").alias("event_right")).distinct(),
                on="user",
                how="inner").groupBy(["event_left",
                                      "event_right"]).count().collect())

        trtsu = mk_intersection_matrix(train_test_users,
                                       columns_for_matrix,
                                       horizontal_suffix=" train",
                                       vertical_suffix=" test")
        reporter.report([''] + list(trtsu.columns.values),
                        [trtsu.index.tolist()] +
                        [column for _, column in trtsu.iteritems()],
                        title='Train / test user intersection')

        logging.info('Process train / train item intersection')
        train_train_items = (train_df.select(
            F.col("targetEntityId").alias("item"),
            F.col("event").alias("event_left")).distinct().join(
                train_df.select(
                    F.col("targetEntityId").alias("item"),
                    F.col("event").alias("event_right")).distinct(),
                on="item",
                how="inner").groupBy(["event_left",
                                      "event_right"]).count().collect())

        trtri = mk_intersection_matrix(train_train_items, columns_for_matrix)
        reporter.report([''] + list(trtri.columns.values),
                        [trtri.index.tolist()] +
                        [column for _, column in trtri.iteritems()],
                        title='Train / train item intersection')

        logging.info('Process train / test item intersection')
        train_test_items = (train_df.select(
            F.col("targetEntityId").alias("item"),
            F.col("event").alias("event_left")).distinct().join(
                test_df.select(
                    F.col("targetEntityId").alias("item"),
                    F.col("event").alias("event_right")).distinct(),
                on="item",
                how="inner").groupBy(["event_left",
                                      "event_right"]).count().collect())

        trtsi = mk_intersection_matrix(train_test_items,
                                       columns_for_matrix,
                                       horizontal_suffix=" train",
                                       vertical_suffix=" test")
        reporter.report([''] + list(trtsi.columns.values),
                        [trtsi.index.tolist()] +
                        [column for _, column in trtsi.iteritems()],
                        title='Train / test item intersection')

        reporter.report_config(cfg)

    reporter.finish_document()
    logging.info('Splitting finished successfully')
Example #16
def process_sdf(sdf_drive, sdf_vehicle):
  sdf_join_drive_vehicle = sdf_drive.alias("drive").join(sdf_vehicle.alias("vehicle"), ["vehicle_id"])
  sdf_join_drive_vehicle_fillna = sdf_join_drive_vehicle.fillna(0)
  sdf_drive_start_of_week = sdf_join_drive_vehicle_fillna.withColumn("week_start_date", \
                                                                     (F.date_sub(F.next_day(
                                                                       F.from_utc_timestamp(F.col("datetime"),
                                                                                            "America/New_York"),
                                                                       'monday'), 7)))

  sdf_Active_horsepower =  sdf_drive_start_of_week.withColumn("Active_horsepower" ,  (F.col("eng_load") / 255) \
                                                              * (F.col("max_torque") * F.col("rpm"))  / 5252)

  # Horsepower utilization – Active horsepower / Max Horsepower
  sdf_Horsepower_utilization = sdf_Active_horsepower.withColumn("Horsepower_utilization", F.col("Active_horsepower") / F.col("max_horsepower"))

  # # Torque Utilization - calculated as Engine load/ 255
  sdf_Torque_Utilization = sdf_Horsepower_utilization.withColumn("Torque_Utilization", F.col("eng_load") / 255)

  # # RPM Utilization – RPM / Maximum horsepower rpm
  sdf_RPM_Utilization = sdf_Torque_Utilization.withColumn("RPM_Utilization", F.col("rpm") / F.col("max_horsepower_rpm") )

  sdf_engine_features = sdf_RPM_Utilization.withColumn("ft_torque_util_60pct_s",
                                                       F.when((F.col("Torque_Utilization") >= 0.6) \
                                                              & (F.col("Torque_Utilization") < 0.7), \
                                                              F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_torque_util_70pct_s", F.when((F.col("Torque_Utilization") >= 0.7) \
                                                 & (F.col("Torque_Utilization") < 0.8), \
                                                 F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_torque_util_80pct_s", F.when((F.col("Torque_Utilization") >= 0.8) \
                                                 & (F.col("Torque_Utilization") < 0.9), \
                                                 F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_torque_util_90pct_s", F.when((F.col("Torque_Utilization") >= 0.9) \
                                                 & (F.col("Torque_Utilization") < 1), \
                                                 F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_50pct_s", F.when((F.col("Horsepower_utilization") >= 0.5) \
                                                     & (F.col("Horsepower_utilization") < 0.6), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_60pct_s", F.when((F.col("Horsepower_utilization") >= 0.6) \
                                                     & (F.col("Horsepower_utilization") < 0.7), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_70pct_s", F.when((F.col("Horsepower_utilization") >= 0.7) \
                                                     & (F.col("Horsepower_utilization") < 0.8), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_80pct_s", F.when((F.col("Horsepower_utilization") >= 0.8) \
                                                     & (F.col("Horsepower_utilization") < 0.9), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_rpm_util_50pct_s", F.when((F.col("RPM_Utilization") >= 0.5) \
                                              & (F.col("RPM_Utilization") < 0.6), \
                                              F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_rpm_util_60pct_s", F.when((F.col("RPM_Utilization") >= 0.6) \
                                              & (F.col("RPM_Utilization") < 0.7), \
                                              F.lit(1)).otherwise(F.lit(0)))

  sdf_engine_features_total = sdf_engine_features.select("vehicle_id", "week_start_date", "datetime",
                                                          "ft_torque_util_60pct_s", "ft_torque_util_70pct_s",
                                                          "ft_torque_util_80pct_s", "ft_torque_util_90pct_s",
                                                          "ft_horsepower_util_50pct_s",
                                                          "ft_horsepower_util_60pct_s",
                                                          "ft_horsepower_util_70pct_s",
                                                          "ft_horsepower_util_80pct_s",
                                                          "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s")

  sdf_sdf_engine_features_agg = sdf_engine_features_total.groupBy("vehicle_id", "week_start_date") \
    .agg(F.sum("ft_torque_util_60pct_s").alias("ft_torque_util_60pct_s"), \
         F.sum("ft_torque_util_70pct_s").alias("ft_torque_util_70pct_s"), \
         F.sum("ft_torque_util_80pct_s").alias("ft_torque_util_80pct_s"), \
         F.sum("ft_torque_util_90pct_s").alias("ft_torque_util_90pct_s"), \
         F.sum("ft_horsepower_util_50pct_s").alias("ft_horsepower_util_50pct_s"), \
         F.min("ft_horsepower_util_60pct_s").alias("ft_horsepower_util_60pct_s"), \
         F.min("ft_horsepower_util_70pct_s").alias("ft_horsepower_util_70pct_s"), \
         F.min("ft_horsepower_util_80pct_s").alias("ft_horsepower_util_80pct_s"), \
         F.min("ft_rpm_util_50pct_s").alias("ft_rpm_util_50pct_s"), \
         F.min("ft_rpm_util_60pct_s").alias("ft_rpm_util_60pct_s"), )

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_agg.select("vehicle_id", "week_start_date", \
                                                                     "ft_torque_util_60pct_s", "ft_torque_util_70pct_s",
                                                                     "ft_torque_util_80pct_s", "ft_torque_util_90pct_s", \
                                                                     "ft_horsepower_util_50pct_s",
                                                                     "ft_horsepower_util_60pct_s",
                                                                     "ft_horsepower_util_70pct_s",
                                                                     "ft_horsepower_util_80pct_s", \
                                                                     "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s")

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.sort(F.col("vehicle_id"), F.col("week_start_date"))

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.withColumn("week_start_date",
                                                                           F.date_format(F.col("week_start_date"),
                                                                                         "yyyy-MM-dd"))

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.fillna(0)

  return sdf_sdf_engine_features_final
                      aws_secret_access_key=secret_key,
                      region_name="us-west-2")

srcfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/JSON/" + year + "/" + month + "/" + day + ""

tgtfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/Parquet/"

dfjson = sparkSession.read.format("json").option("multiline", "true").option(
    "inferSchema", "true").load(srcfilePath)

data = dfjson.withColumn("data", explode("DATA")).select("data.*")

# dfPT = data.withColumn("createdDatePT",sf.to_timestamp(udf_TZConversion(sf.regexp_replace(data.createdDate,"T"," ").cast("string"),sf.lit("UTC"),sf.lit("US/Pacific")),"yyyy-MM-dd HH:mm:ss"))
dfPT = data.withColumn(
    "createdDatePT",
    sf.from_utc_timestamp(sf.regexp_replace(data.createdDate, "T", " "),
                          "US/Pacific"))

df = dfPT.withColumn("year",sf.split("createdDate","\-")[0]) \
          .withColumn("month",sf.split("createdDate","\-")[1]) \
          .withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0])

dfbaseData = df.select(
    [col for col in df.columns if not col.startswith("xmlns")])

#dfbaseData.show(10,False)

dfrankedId = dfbaseData.withColumn("row_num", sf.row_number().over(Window.partitionBy("id").orderBy(sf.asc("updatedAt")))) \
                    .where(sf.col("row_num") == 1) \
                    .select(dfbaseData["*"])

dfrankedId.repartition(sf.col("year"),sf.col("month"),sf.col("day")) \
Example #18
park_data = spark.read.format("csv").option("inferSchema", 'true').load("dbfs:/FileStore/tables/AllFiltParkPred.txt")
park_data = park_data.selectExpr("_c0 as name", "_c1 as free","_c2 as weekday","_c3 as month","_c4 as day","_c5 as year","_c6 as hour")
park_data = park_data.withColumn("hour",substring("hour",0,2))
park_data = park_data.withColumn("hour",park_data["hour"].cast(IntegerType()))
park_data.createOrReplaceTempView("dparking")
#display(park_data)
#park_data = park_data.withColumn("")

# COMMAND ----------

import pandas as pd
from pyspark.sql.functions import from_utc_timestamp

weather = spark.read.format("csv").option("header", "true").option("multiline",'true').option("inferSchema", 'true').load("dbfs:/FileStore/tables/weather.csv")
weather = weather.withColumn('date', weather.date.cast('timestamp'))
weather = weather.withColumn('date', from_utc_timestamp(weather.date, "CET"))
#weather = weather.withColumn("date",substring("date",0,2))
#weather = weather.withColumn("date", unix_timestamp(weather["date"], "yyyy-MM-dd hh"))
display(weather)
weather.createOrReplaceTempView("weather")
weather_pd = weather.toPandas()
weather_pd["date"] = pd.to_datetime(weather_pd["date"], format = "%Y-%m-%d-%H")
weather_pd = weather_pd.set_index('date')
#weather_pd["temperature"]

# COMMAND ----------

# BE EXTREMELY CAREFUL: SOME DATA IS MISSING FROM THE WEATHER FILE!!!! For this reason, when you join with the other data, make sure that rows with no matching weather data are discarded.
check_weather = spark.sql("select * from weather where month(date) = 08")
display(check_weather)  # the data from 5 to 10 August is clearly missing, but there are gaps elsewhere too
client = boto3.client('glue', region_name='us-west-2')

######################################
####        CONNECTION BLOCK      ####
######################################

## tire connection
tire_DS = glueContext.create_dynamic_frame.from_catalog(
    database="staging_combined",
    table_name="tire",
    transformation_ctx="tire_DS")
tire_regDF = tire_DS.toDF()
tire_regDF = tire_regDF.withColumn(
    'sourcesystemtimestamp',
    from_utc_timestamp('rovertimestamp', 'America/New_York')).withColumn(
        'sourcesystemdate',
        to_date(from_utc_timestamp('rovertimestamp', 'America/New_York')))
tire_noDel = tire_regDF.filter(tire_regDF['dboperationtype'] != 'D')
tire_drop = tire_noDel.drop('dboperationtype').drop('audtdateadded')
tire_distDF = tire_drop.distinct()
tire_dynDF = DynamicFrame.fromDF(tire_distDF, glueContext, "nested")

## tiredetail connection
tiredet_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_combined",
    table_name="tiredetail",
    transformation_ctx="tiredet_ds")
tiredet_regDF = tiredet_ds.toDF()
tiredet_regDF = tiredet_regDF.withColumn(
    'sourcesystemtimestamp',
def main():
    """Main function"""

    # Get args
    args = get_args()

    # container
    container_in = args.container_in
    container_out = args.container_out

    # Azure credentials
    sas_token = args.sas
    storage_account_name = args.storage
    azure_accounts = list()
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_in
    })
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_out
    })

    oauth_login = args.oauth_login
    oauth_client_id = args.oauth_client_id
    oauth_client_secret = args.oauth_client_secret

    # requires hadoop 3.2+
    # azure_oauth = {
    #     "endpoint": oauth_login,
    #     "client-id": oauth_client_id,
    #     "client-secret": oauth_client_secret
    # }
    azure_oauth = False

    # VM
    cores = args.vm_cores
    ram = args.vm_ram
    shuffle_partitions = args.shuffle_partitions

    # Date, state
    country = args.country
    state = args.state

    # process config
    roam_dist_stops = args.roam_dist_stops
    roam_dist_events = args.roam_dist_events

    # Path in - path out
    blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/stoplocation-v8_r70-s5-a70-h6/country={country}/year=2020/"
    if azure_oauth:
        # we can leverage abfss
        blob_in = f"abfss://{container_in}@{storage_account_name}.dfs.core.windows.net/stoplocation-v8_r70-s5-a70-h6/country={country}/year=2020/"
    timezones_in = "abfss://[email protected]/utils_states_timezones/"

    path_out_distinct = f"distinct_user_clusters-v8_r70-s5-a70-h6_clustered_{roam_dist_stops}m/country={country}"
    path_out_all = f"all_user_clusters-v8_r70-s5-a70-h6_clustered_{roam_dist_stops}m/country={country}"

    # config spark
    conf = getSparkConfig(cores, ram, shuffle_partitions,
                          azure_accounts, azure_oauth)

    # set prop for handling partition columns as strings (fixes prefixes as int)
    conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false")

    # Create spark session
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = SQLContext(sc)
    spark = sqlContext.sparkSession

    # Init azure client
    blob_service_client = BlobServiceClient.from_connection_string(
        CONN_STRING.format(storage_account_name, sas_token))

    #  build keys, date is mandatory, prefix opt
    partition_key = f"state={state}"

    print("process "+partition_key)
    start_time = time.time()
    local_dir = LOCAL_PATH+partition_key
    print("write temp to "+local_dir)

    # cleanup local if exists
    if (os.path.isdir(local_dir)):
        map(os.unlink, (os.path.join(local_dir, f)
                        for f in os.listdir(local_dir)))

    # Input dataset
    print("read dataset table")
    read_time = time.time()

    dfs = spark.read.format("parquet").load(blob_in)
    dfs_timezones = spark.read.format("parquet").load(timezones_in)

    # apply partition filter
    dfs_state = dfs.where(f"state = '{state}'")

    print("processing with spark")
    spark_time = time.time()

    w = Window().partitionBy('userId').orderBy('begin')

    dfs_state = add_distance_column(dfs_state, order_column='begin')
    dfs_state = dfs_state.fillna(0, subset=['next_travelled_distance'])
    dfs_state = dfs_state.withColumn('lag_next_travelled_distance', F.lag(
        col('next_travelled_distance')).over(w))
    dfs_state = dfs_state.withColumn('lag_end', F.lag('end').over(w))
    dfs_state = dfs_state.withColumn('rn', F.when(((col('lag_next_travelled_distance') != col('prev_travelled_distance')) |
                                                   (col('prev_travelled_distance') > 0) |
                                                   (col('lag_next_travelled_distance') > 0) |
                                                   (col('distance_prev') > roam_dist_events) |
                                                   ((F.dayofyear(col('begin')) - F.dayofyear(col('lag_end')) == 1) &
                                                    (F.hour(col('begin')) < 6))
                                                   ) &
                                                  ((col('lag_end').isNull()) | (col('lag_end') < col('begin'))), 1).otherwise(0))
    # Remove prev_travelled distance when rn == 0 (it happens when lag_end and begin overlap)
    dfs_state = dfs_state.withColumn('prev_travelled_distance', F.when(
        col('rn') == 0, 0).otherwise(col('prev_travelled_distance')))

    w = Window().partitionBy('userId').orderBy(
        'begin').rangeBetween(Window.unboundedPreceding, 0)

    dfs_state = dfs_state.withColumn('group', F.sum('rn').over(w))

    dfs_state = dfs_state.groupBy('userId', 'group', 'state').agg(F.mean('latitude').alias('latitude'),
                                                                  F.mean('longitude').alias(
                                                                      'longitude'),
                                                                  F.min('begin').alias(
                                                                      'begin'),
                                                                  F.max('end').alias('end')).drop('group')

    dfs_destinations = get_destinations(dfs_state, roam_dist=roam_dist_stops)
    dfs_destinations = dfs_destinations.withColumn(
        'prefix', dfs_destinations.userId.substr(1, 2))
    dfs_destinations = dfs_destinations.withColumn(
        'dayofyear', F.dayofyear('begin'))
    dfs_destinations = dfs_destinations.withColumn('year', F.year('begin'))
    # dfs_destinations = dfs_destinations.withColumn('state', F.lit(state))

    # Local time
    dfs_destinations.createOrReplaceTempView("dfs_destinations")
    dfs_destinations = spark.sql("""
      SELECT dfs_destinations.*, geohash(clusterLatitude, clusterLongitude, 7) as geohash7
      from dfs_destinations
      """)
    dfs_destinations = dfs_destinations.withColumn('geohash5', F.substring(col('geohash7'), 1, 5))
    dfs_destinations = dfs_destinations.join(F.broadcast(dfs_timezones), on='geohash5').drop('geohash5')
    dfs_destinations = dfs_destinations.withColumn('local_begin', F.from_utc_timestamp(col('begin'), col('tzid')))
    dfs_destinations = dfs_destinations.withColumn('offset', (
                (col('local_begin').cast('long') - col('begin').cast('long')) / 3600).cast('int')).drop('local_begin')
    dfs_destinations.persist(StorageLevel.DISK_ONLY)

    # Write
    local_dir_all = local_dir + "/all/"
    dfs_destinations_all = dfs_destinations.select(
        'prefix', 'userId', 'clusterId', 'begin', 'end', 'offset', 'year', 'dayofyear')
    dfs_destinations_all.repartition(256, "prefix", "year", "dayofyear").write.partitionBy(
        "prefix", "year", "dayofyear").format('parquet').mode('overwrite').save(local_dir_all)

    local_dir_distinct = local_dir+"/distinct/"
    dfs_destinations_distinct = dfs_destinations.select(
        'prefix', 'userId', 'clusterId', 'clusterLatitude', 'clusterLongitude', 'geohash7', 'year').distinct()
    dfs_destinations_distinct.repartition(256, "prefix", "year").write.partitionBy(
        "prefix", "year").format('parquet').mode('overwrite').save(local_dir_distinct)

    dfs_destinations.unpersist()

    print("upload local data to azure")
    upload_time = time.time()

    # upload parts 1  "prefix/year"
    print(f"upload files for distinct")
    # upload with threads
    dfutures = []    
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        years = [2020]
        s_key = f"state={state}"
        for fprefix in enumerate_prefixes():
            print(f"upload files for distinct: {fprefix}")
            prefix_dir = local_dir_distinct+"prefix="+fprefix
            prefix_key = f"prefix={fprefix}"

            for fyear in years:
                f_dir = prefix_dir + "/year="+str(fyear)
                f_key = prefix_key + "/year="+str(fyear)

                # print(f"read files for distinct from {f_dir}")

                if (os.path.isdir(f_dir)):
                    files = [filename for filename in os.listdir(
                        f_dir) if filename.startswith("part-")]

                    if len(files) > 0:

                        for file_local in files:
                            file_path = f_dir+"/"+file_local
                            part_num = int(file_local.split('-')[1])
                            part_key = '{:05d}'.format(part_num)
                            # fix name as static hash to be reproducible
                            filename_hash = hashlib.sha1(
                                str.encode(f_key+s_key+part_key)).hexdigest()

                            blob_key = "{}/{}/{}/part-{}-{}.snappy.parquet".format(
                                path_out_distinct, f_key, s_key, part_key, filename_hash)

                            # print("upload " + file_path + " to " + container_out+":"+blob_key)
                            # upload_blob(blob_service_client,container_out, blob_key, file_path)
                            future = executor.submit(
                                upload_blob, blob_service_client,container_out, blob_key, file_path)
                            dfutures.append(future)                                        

                    # else:
                    #    print(f"no files to upload for {f_key}")

                # else:
                #    print(f"missing partition for {f_key}")

        # end of loop, wait for futures
        for future in dfutures:
            bkey = future.result()

    # ensure we wait all tasks
    # TODO check if all done
    ddone = concurrent.futures.wait(dfutures)

    # upload parts 2 "prefix/year/dayofyear"
    print(f"upload files for all")
    years = [2020]
    s_key = f"state={state}"
    # upload with threads
    afutures = []    
    with ThreadPoolExecutor(max_workers=THREADS) as executor:    
        for fprefix in enumerate_prefixes():
            print(f"upload files for all: {fprefix}")
            prefix_dir = local_dir_all+"prefix="+fprefix
            prefix_key = f"prefix={fprefix}"

            for fyear in years:
                f_dir = prefix_dir + "/year="+str(fyear)
                f_key = prefix_key + "/year="+str(fyear)

                # print(f"read files for all from {f_dir}")

                for fday in range(1, 367):
                    d_dir = f_dir + "/dayofyear="+str(fday)
                    d_key = f_key + "/dayofyear="+str(fday)

                    # print(f"read files for all from {d_dir}")

                    if (os.path.isdir(d_dir)):
                        files = [filename for filename in os.listdir(
                            d_dir) if filename.startswith("part-")]

                        if len(files) > 0:

                            for file_local in files:
                                file_path = d_dir+"/"+file_local
                                part_num = int(file_local.split('-')[1])
                                part_key = '{:05d}'.format(part_num)
                                # fix name as static hash to be reproducible
                                filename_hash = hashlib.sha1(
                                    str.encode(d_key+s_key+part_key)).hexdigest()

                                blob_key = "{}/{}/{}/part-{}-{}.snappy.parquet".format(
                                    path_out_all, d_key, s_key, part_key, filename_hash)

                                # print("upload " + file_path + " to " + container_out+":"+blob_key)
                                # upload_blob(blob_service_client,container_out, blob_key, file_path)
                                future = executor.submit(
                                    upload_blob, blob_service_client,container_out, blob_key, file_path)
                                afutures.append(future)                                       
                        # else:
                        #     print(f"no files to upload for {d_key}")

                    # else:
                    #     print(f"missing partition for {d_key}")
        # end of loop, wait for futures
        for future in afutures:
            bkey = future.result()

    # make sure all upload tasks have completed
    # TODO check if all done
    adone = concurrent.futures.wait(afutures)

    print("--- {} seconds elapsed ---".format(int(time.time() - start_time)))
    print()
    shutdown_time = time.time()
    spark.stop()

    end_time = time.time()
    print("Done in {} seconds (read:{} spark:{} upload:{} shutdown:{})".format(
        int(end_time - start_time),
        int(spark_time - read_time),
        int(upload_time - spark_time),
        int(shutdown_time - upload_time),
        int(end_time - shutdown_time)
    ))
    print('Done.')
def process_data(data_path, database, table):
    '''
    Purpose: Process the API results using Spark and store them in HDFS.
    '''
    try:
        logger('INFO', "LOADING the Data in Spark for Processing ")

        logger(
            'INFO',
            "In case of java.lang.OutOfMemoryError Tune Spark Parameters in conf/proprties.yml"
        )

        df = myspark.read.format("json").options(
            inferSchema=True,
            dateFormat="yyyy-MM-dd",
            timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSZZ",
            ignoreLeadingWhiteSpace=True,
            ignoreTrailingWhiteSpace=True,
            path="/tmp/tempfiles/").load()

        logger('INFO', "Changing the Data Type to Timestamp for few Columns")

        # NOTE: from_utc_timestamp() expects a time zone ID (e.g. 'UTC') as its
        # second argument, not a datetime format string
        df = (df
              .withColumn("created",
                          F.from_utc_timestamp(
                              df.created,
                              "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(T.TimestampType()))
              .withColumn("lastModified",
                          F.from_utc_timestamp(
                              df.lastModified,
                              "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(T.TimestampType()))
              .withColumn("lastAccessed",
                          F.from_utc_timestamp(
                              df.lastAccessed,
                              "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(T.TimestampType())))

        df = (df
              .withColumn("created", F.date_format(df.created, "yyyy-MM-dd HH:mm:ss"))
              .withColumn("lastModified", F.date_format(df.lastModified, "yyyy-MM-dd HH:mm:ss"))
              .withColumn("lastAccessed", F.date_format(df.lastAccessed, "yyyy-MM-dd HH:mm:ss")))

        logger('INFO', "Choosing the 11 required Columns ")

        finaldf = df.select("owner", "parentPath", "originalName", "created",
                            "lastModified", "lastAccessed", "size",
                            "sourceType", "permissions", "group", "identity")

        logger(
            'INFO', "%s Records will be LOADED into %s.%s Table " %
            (finaldf.count(), database, table))

        logger('INFO', "Create Table Command Prepared:")

        cmd = "CREATE TABLE IF NOT EXISTS %s.%s (owner STRING,parentPath STRING,originalName STRING,created timestamp,lastModified timestamp,lastAccessed timestamp,size BIGINT,sourceType STRING,permissions STRING,group STRING,identity BIGINT)STORED AS PARQUET" % (
            database, table)

        logger('INFO', cmd)

        logger('INFO',
               "Creating the Table %s.%s if NOT EXISTS" % (database, table))

        myspark.sql(cmd)

        finaldf.createOrReplaceTempView("mytable")

        logger('INFO', "INSERT INTO TABLE Table Command Prepared:")

        cmd = """INSERT INTO TABLE {database}.{table} SELECT owner,parentPath,originalName,created,lastModified,lastAccessed,size,sourceType,permissions,group,identity FROM mytable""".format(
            database=database, table=table)

        logger('INFO', cmd)

        logger('INFO', "Inserting the Data in %s.%s Table" % (database, table))

        myspark.sql(cmd)

        logger(
            'INFO', "%s Records Inserted successfully in %s.%s" %
            (finaldf.count(), database, table))

    except Exception as e:
        logger('ERROR', "Job Failed with below Details !!")
        os.system("rm -f " + data_path + "/../pids/*.pid")
        logger('ERROR', "%s" % e)
        logger('ERROR', "Exiting !!")
        sys.exit()
def process_log_data(spark, input_data, output_data, song_df):
    """
    Description: This function can be used to process log-data files from the
    given input path and transform the data from json files into users, time and songplays
    spark tables and writing these tables to the given output path as parquet tables.

    Arguments:
        spark: SparkSession object.
        input_data: Path to the input JSON files.
        output_data: Path to the output directory that stores output parquet tables.
        song_df: Song data dataframe.

    Returns:
        None.
    """
    # get filepath to log data file
    log_data = input_data + 'log-data/2018/11'

    # define schema for log data file
    log_schema = t.StructType([
        t.StructField("artist", t.StringType(), True),
        t.StructField("auth", t.StringType(), True),
        t.StructField("firstName", t.StringType(), True),
        t.StructField("gender", t.StringType(), True),
        t.StructField("itemInSession", t.IntegerType(), True),
        t.StructField("lastName", t.StringType(), True),
        t.StructField("length", t.DecimalType(12, 7), True),
        t.StructField("level", t.StringType(), True),
        t.StructField("location", t.StringType(), True),
        t.StructField("method", t.StringType(), True),
        t.StructField("page", t.StringType(), True),
        t.StructField("registration", t.DecimalType(16, 2), True),
        t.StructField("sessionId", t.IntegerType(), True),
        t.StructField("song", t.StringType(), True),
        t.StructField("status", t.IntegerType(), True),
        t.StructField("ts", t.LongType(), True),
        t.StructField("userAgent", t.StringType(), True),
        t.StructField("userId", t.StringType(), True)
    ])

    # read log data file using schema
    df = spark \
        .read \
        .format("json") \
        .schema(log_schema) \
        .load(log_data)

    # filter by actions for song plays
    df = df \
        .filter('page = "NextSong"')

    # group by userId for unique users
    users_list = df \
        .groupBy('userId') \
        .agg(f.max('ts').alias('ts'))

    # extract columns to create users table
    users_table = df \
        .join(users_list, ['userId', 'ts'], 'inner') \
        .select([df.userId.cast(t.IntegerType()).alias('user_id'), col('firstName').alias('first_name'), col('lastName').alias('last_name'), 'gender', 'level']) \
        .dropDuplicates()

    # write users table to parquet files
    users_output = output_data + 'users'

    users_table \
        .write \
        .option("path", users_output) \
        .saveAsTable('users', format='parquet')

    # create timestamp column from original timestamp column
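    # (from_utc_timestamp with 'UTC' applies a zero offset, so this line simply
    #  converts the epoch-millisecond ts column into a timestamp column)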
    df = df \
        .withColumn('timestamp', f.from_utc_timestamp((df.ts/1000.0).cast('timestamp'), 'UTC'))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0),
                       t.TimestampType())
    df = df.withColumn('datetime', get_datetime('ts'))

    # extract columns to create time table
    time_table = df \
        .select([col('datetime').alias('start_time'), dayofmonth(col('datetime')).alias('day'), weekofyear(col('datetime')).alias('week'), month(col('datetime')).alias('month'), year(col('datetime')).alias('year'), dayofweek(col('datetime')).alias('weekday')]) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_output = output_data + 'time'

    time_table \
        .write \
        .partitionBy('year', 'month') \
        .option("path", time_output) \
        .saveAsTable('time', format='parquet')

    # join and extract columns from song and log datasets to create songplays table
    cond = [
        df.artist == song_df.artist_name, df.song == song_df.title,
        df.length == song_df.duration
    ]
    songplays_df = df.join(song_df, cond, 'left')

    songplays_df = songplays_df \
        .select(df.datetime.alias('start_time'), df.userId.alias('user_id'), df.level.alias('level'), song_df.song_id.alias('song_id'), song_df.artist_id.alias('artist_id'), df.sessionId.alias('session_id'), df.location.alias('location'), df.userAgent.alias('user_agent'), year(df.datetime).alias('year'), month(df.datetime).alias('month'))
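    # generate a surrogate songplay_id with row_number over a window ordered by a
    # constant literal (the resulting order is arbitrary; it only guarantees unique ids)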
    w = Window().orderBy(f.lit('A'))
    songplays_table = songplays_df.withColumn('songplay_id',
                                              f.row_number().over(w))

    # write songplays table to parquet files partitioned by year and month
    songplays_output = output_data + 'songplays'

    songplays_table \
        .select(['songplay_id', 'start_time', 'user_id', 'level', 'song_id', 'artist_id', 'session_id', 'location', 'user_agent', 'year', 'month'])\
        .write \
        .partitionBy('year', 'month') \
        .option("path", songplays_output) \
        .saveAsTable('songplays', format='parquet')
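
A minimal driver sketch for this function; the SparkSession setup, the bucket paths, and the source of song_df below are illustrative assumptions, not part of the original script.

# hypothetical usage of process_log_data (all names and paths are assumptions)
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sparkify_etl").getOrCreate()
input_data = "s3a://my-input-bucket/"
output_data = "s3a://my-output-bucket/"
# song_df must provide artist_name, title, duration, song_id and artist_id
song_df = spark.read.json(input_data + "song-data/*/*/*/*.json")
process_log_data(spark, input_data, output_data, song_df)
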
Exemple #23
0
seed = 21  # to draw a random 1% sample of the data
fraction = 0.01

tweets_sample = tweets.sample(fraction, seed)

print("Número de tweets muestreados: {0}".format(tweets_sample.count()))

hiveContext.sql('DROP TABLE IF EXISTS tweets_sample')
hiveContext.registerDataFrameAsTable(tweets_sample, "tweets_sample")

# Examine the number of tweets at each hour of the day

tweets_timestamp = hiveContext.sql("SELECT created_at FROM tweets_sample")

tweets_timestamp = tweets_timestamp.withColumn("hour", lit(hour(from_utc_timestamp(tweets_timestamp.created_at, 'GMT+1'))))\
                                    .withColumn("day", lit(date_format(tweets_timestamp.created_at,'MM-dd-YY')))

tweets_hour_day = tweets_timestamp.groupBy("hour","day")\
                        .agg({"created_at":"count"})\
                        .orderBy("count(created_at)", ascending=False)\
                        .withColumnRenamed('count(created_at)','count')

print("Número de tweets en cada hora en cada día:\n")
tweets_hour_day.limit(20).show()

tweets_hour = tweets_hour_day.drop("day")\
                .groupBy("hour")\
                .agg({"count": "sum"})\
                .withColumnRenamed('sum(count)','tweets')\
                .orderBy('tweets', ascending=False)
# MAGIC   <img alt="Opens in new tab" src="https://files.training.databricks.com/static/images/external-link-icon-16x16.png"/>&nbsp;Watch full-screen.</a>
# MAGIC </div>

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC Take a look at the server-side errors by hour to confirm the data meets your expectations. Visualize it by selecting the bar graph icon once the table is displayed. <br><br>
# MAGIC <div><img src="https://files.training.databricks.com/images/eLearning/ETL-Part-1/visualization.png" style="height: 400px" style="margin-bottom: 20px; height: 150px; border: 1px solid #aaa; border-radius: 10px 10px 10px 10px; box-shadow: 5px 5px 5px #aaa"/></div>

# COMMAND ----------

from pyspark.sql.functions import from_utc_timestamp, hour, col

countsDF = (serverErrorDF.select(
    hour(from_utc_timestamp(
        col("time"),
        "GMT")).alias("hour")).groupBy("hour").count().orderBy("hour"))

display(countsDF)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC The distribution of errors by hour meets expectations.  There is an uptick in errors around midnight, possibly due to server maintenance at this time.

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC ### Saving Back to DBFS
# MAGIC
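# MAGIC A minimal sketch of what this step might look like (the output path below is an assumption, not the course's own value):

# COMMAND ----------

# illustrative only: persist the filtered error records back to DBFS as Parquet
(serverErrorDF
   .write
   .mode("overwrite")
   .parquet("/tmp/log20170329/serverErrorDF.parquet"))
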
Exemple #25
0
time_df = spark.createDataFrame([(1428476400, )], ['unix_time'])
# Expected:
# +-------------------+
# |                 ts|
# +-------------------+
# |2015-04-08 00:00:00|
# +-------------------+

# Answer
time_df.select(F.from_unixtime('unix_time').alias('ts')).show()
spark.conf.unset("spark.sql.session.timeZone")
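# note: the expected output above assumes spark.sql.session.timeZone was set to a US
# Pacific zone earlier in the exercise (1428476400 is 2015-04-08 07:00:00 UTC);
# unsetting it here restores the default session time zone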

# COMMAND ----------

# You have this dataframe, with one column holding a datetime (given in UTC) and a second column holding a time zone. Convert the datetime to local time, first for PST (Pacific Standard Time) and then using the time zone column tz

df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz'])
# Expected:
# +-------------------+-------------------+
# |           PST_time|         local_time|
# +-------------------+-------------------+
# |1997-02-28 02:30:00|1997-02-28 19:30:00|
# +-------------------+-------------------+

# Answer
df.select(
    F.from_utc_timestamp(df.ts, "PST").alias('PST_time'),
    F.from_utc_timestamp(df.ts, df.tz).alias('local_time')).show()

# COMMAND ----------
Exemple #26
0
                .config("spark.sql.sources.partitionOverwriteMode","dynamic")
                .getOrCreate())

client = boto3.client('s3',aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name="us-west-2")

srcfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/JSON/" + year + "/" + month + "/" + day +"/tradeline_*"

tgtfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/Stage/Parquet/tradeLine/"

dfjson = sparkSession.read.format("json").option("multiline", "true").option("inferSchema", "true").load(srcfilePath)

#data = dfjson.select(explode("DATA").alias("data"))
data = dfjson.withColumn("data", explode("DATA")).select("data.*")

# dfPT = data.withColumn("createdDatePT",sf.to_timestamp(udf_TZConversion(sf.regexp_replace(data.createdDate,"T"," ").cast("string"),sf.lit("UTC"),sf.lit("US/Pacific")),"yyyy-MM-dd HH:mm:ss"))
dfPT = data.withColumn("createdDatePT",sf.from_utc_timestamp(sf.regexp_replace(data.createdDate,"T"," "),"US/Pacific"))

df = dfPT.withColumn("year",sf.split("createdDate","-")[0]) \
          .withColumn("month",sf.split("createdDate","-")[1]) \
          .withColumn("day",sf.split((sf.split((sf.split("createdDate","-")[2]),"T")[0])," ")[0])

dfbaseData = df.select([col for col in df.columns if not col.startswith("xmlns")])

#dfbaseData.show(10,False)

# dfrankedId =  dfbaseData.withColumn("row_num", sf.row_number().over(Window.partitionBy("id").orderBy(sf.asc("updatedAt")))) \
                    # .where(sf.col("row_num") == 1) \
                    # .select(dfbaseData["*"])

dfbaseData.repartition(sf.col("year"),sf.col("month"),sf.col("day")) \
                   .write.format("parquet") \
Exemple #27
0
def get_timestamp_in_tz(unixtime_timestamp, ts_format, tz):
    return F.from_utc_timestamp(F.from_unixtime(unixtime_timestamp, ts_format),
                                tz)
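
A hypothetical usage sketch; the DataFrame, column name and time zone below are illustrative only.

# e.g. render an epoch-seconds column as a Seoul-local timestamp (names are assumptions)
df_local = df.withColumn(
    "event_time_kst",
    get_timestamp_in_tz(F.col("event_epoch"), "yyyy-MM-dd HH:mm:ss", "Asia/Seoul"))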
Exemple #28
0
 .show(5))

# COMMAND ----------

withDate = filtered.withColumn('date', func.date_format('timestamp', 'MM/dd/yyyy'))
withDate.printSchema()
withDate.select('title', 'timestamp', 'date').show(3)

# COMMAND ----------

# MAGIC %md
# MAGIC It seems like we want a different function for time zone manipulation and to store the object as a timestamp rather than a string.  Let's use `from_utc_timestamp` to get a timestamp object back with the correct time zone.

# COMMAND ----------

withCEST = withDate.withColumn('cest_time', func.from_utc_timestamp('timestamp', 'Europe/Amsterdam'))
withCEST.printSchema()

(withCEST
 .select('timestamp', 'cest_time')
 .show(3, False))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, let's convert the text field to lowercase.  We'll use the `lower` function for this.

# COMMAND ----------

lowered = withCEST.select('*', func.lower(col('text')).alias('lowerText'))
Exemple #29
0
serverErrorDF = (logDF
   .filter((col("code") >= 500) & (col("code") < 600))
   .select("date", "time", "extention", "code")
) 

# COMMAND ----------

display(serverErrorDF)
serverErrorDF.count()

# COMMAND ----------

from pyspark.sql.functions import from_utc_timestamp, hour, col

countsDF = (serverErrorDF
   .select(hour(from_utc_timestamp(col("time"), "GMT")).alias("hour"))
   .groupBy("hour")
   .count()
   .orderBy("hour"))

display(countsDF)

# COMMAND ----------

(serverErrorDF
   .write
   .mode("overwrite")
   .parquet("/tmp" + username + "/log20170329/serverErrorDF.parquet")
)

# COMMAND ----------
Exemple #30
0
# save columns of entity
columns = series_list[0].keys()

# COMMAND ----------

# step through the day
# create dataframe from data, convert to spark DF for saving and type conversion
df = pd.DataFrame(series_list, columns=columns)
dfclean = df.drop_duplicates()
sparkDF = spark.createDataFrame(dfclean)

# COMMAND ----------

sparkDF = (sparkDF
           .withColumn('updatedAt', F.from_utc_timestamp('updatedAt', 'CST'))
           .withColumn('createdAt', F.from_utc_timestamp('createdAt', 'CST')))
display(sparkDF.select("id", "updatedAt", "createdAt").orderBy(F.col("id")))
num_files = sparkDF.count()

# COMMAND ----------

print(num_files)

# COMMAND ----------

from pyspark.sql.types import *

Exemple #31
0
# reduce must be imported from functools on Python 3
from functools import reduce

@sf.udf(st.ArrayType(st.StringType()))
def flatten(hotels):
  return reduce(lambda x, y: x + y, hotels)

# COMMAND ----------

_searches = hotels_e_search \
                .filter((sf.col('day') >= startDate) & (sf.col('day') <= endDate)) \
                .filter(sf.col('vcid_key') == vcid) \
                .filter(sf.col('inputData.trackingId').isNotNull()) \
                .filter(sf.size(sf.col('outputData.rankList.ranks')) > 0) \
                .withColumn('sort', get_sort(sf.col('inputData.params'))) \
                .withColumn('filters', get_filters(sf.col('inputData.filterInput'))) \
                .withColumn('algo', get_algo(sf.col('outputData.augurConfigID'))) \
                .select(sf.col('day').alias('searchDay'),
                        sf.from_utc_timestamp(sf.col('eventTime').cast(st.TimestampType()), 'IST').alias('searchTime'),
                        sf.col('currentTime').alias('currentSearchTime'),
                        sf.col('vcid_key').alias('vcid'),
                        sf.col('inputData.flavour').alias('flavour'),
                        sf.col('inputData.checkIn').alias('checkIn'),
                        sf.col('inputData.checkOut').alias('checkOut'),
                        sf.col('inputData.pax').alias('pax'),
                        sf.col('inputData.userId').alias('userId'),
                        sf.col('inputData.email').alias('email'),
                        sf.col('inputData.trackingId').alias('trackingId'),
                        sf.col('sort'),
                        sf.col('filters'),
                        sf.col('inputData.filterInput').alias('filterInput'),
                        sf.col('algo'),
                        sf.col('pid'),
                        sf.col('outputData.rankList.ranks').alias('ranks'))
Exemple #32
0
    def ingest(self,
               src_resource=None,
               src_path=None,
               src_provider=None,
               dest_resource=None,
               dest_path=None,
               dest_provider=None,
               eventsourcing=False):

        logger = logging.getLogger()

        #### constants:
        now = datetime.now()
        reserved_cols = ['_ingested', '_date', '_state']

        #### Source metadata:
        md_src = data.metadata(src_resource, src_path, src_provider)
        if not md_src:
            logger.error("No metadata")
            return

        # filter settings from src (provider and resource)
        filter_params = utils.merge(
            md_src['provider'].get('read', {}).get('filter', {}),
            md_src['resource'].get('read', {}).get('filter', {}))

        #### Target metadata:

        # default path for destination is src path
        if (not dest_resource) and (not dest_path) and dest_provider:
            dest_path = md_src['resource']['path']

        md_dest = data.metadata(dest_resource, dest_path, dest_provider)
        if not md_dest:
            return

        if 'read' not in md_dest['resource']:
            md_dest['resource']['read'] = {}

        # match filter with the one from source resource
        md_dest['resource']['read']['filter'] = filter_params

        #### Read source resource
        try:
            df_src = self._read(md_src)
        except Exception as e:
            logger.exception(e)
            return

        #### Read destination schema info
        try:
            schema_path = '{}/schema'.format(md_dest['resource']['path'])
            md = data.metadata(path=schema_path, provider=dest_provider)
            df_schema = self._read(md)
            schema_date_str = df_schema.sort(
                desc("date")).limit(1).collect()[0]['id']
        except Exception as e:
            # logger.warning('source schema does not exist yet.')
            schema_date_str = now.strftime('%Y%m%dT%H%M%S')

        # destination path - append schema date
        dest_path = '{}/{}'.format(md_dest['resource']['path'],
                                   schema_date_str)
        md_dest['resource']['path'] = dest_path
        md_dest['url'] = data._url(md_dest)

        # if schema not present or schema change detected
        schema_changed = True

        try:
            df_dest = self._read(md_dest)

            # compare schemas
            df_src_cols = [x for x in df_src.columns if x not in reserved_cols]
            df_dest_cols = [
                x for x in df_dest.columns if x not in reserved_cols
            ]
            schema_changed = df_src[df_src_cols].schema.json(
            ) != df_dest[df_dest_cols].schema.json()
        except Exception as e:
            # logger.warning('schema does not exist yet.')
            df_dest = df_src.filter("False")

        if schema_changed:
            # Different schema, update schema table with new entry
            schema_entry = (schema_date_str, now, df_src.schema.json())
            df_schema = self.context().createDataFrame(
                [schema_entry], ['id', 'date', 'schema'])

            # write the schema to destination provider
            md = data.metadata(path=schema_path,
                               provider=md_dest['resource']['provider'])
            self._write(df_schema, md, mode='append')

        # partitions
        partition_cols = ['_ingested']

        # init df_diff to an empty destination dataframe
        df_diff = df_dest.filter("False")

        if not eventsourcing:
            if filter_params.get('policy') == 'date' and filter_params.get(
                    'column'):
                df_diff = dataframe_update(df_src,
                                           df_dest,
                                           updated_col='_ingested',
                                           eventsourcing=eventsourcing)
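                # add a '_date' partition column: the filter column rendered as a
                # GMT+7 local calendar date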
                df_diff = df_diff.withColumn(
                    '_date',
                    date_format(
                        from_utc_timestamp(filter_params['column'], 'GMT+7'),
                        'yyyy-MM-dd'))
                partition_cols += ['_date']
                ingest_mode = 'append'
                options = {'mode': ingest_mode, 'partitionBy': partition_cols}
            else:
                df_diff = dataframe_update(df_src,
                                           df_dest.filter("False"),
                                           updated_col='_ingested',
                                           eventsourcing=eventsourcing)
                ingest_mode = 'overwrite'
                options = {'mode': ingest_mode, 'partitionBy': partition_cols}
        else:
            # to do
            logger.fatal('event sourcing not implemented yet')

        records_add = df_diff.filter("_state = 0").count()
        records_del = df_diff.filter("_state = 1").count()

        if records_add or records_del or schema_changed:
            md = data.metadata(path=dest_path,
                               provider=md_dest['resource']['provider'])
            self._write(df_diff, md, **options)

        end = datetime.now()
        time_diff = end - now

        logdata = {
            'src_url': md_src['url'],
            'src_table': md_src['resource']['path'],
            'source_option': filter_params,
            'schema_change': schema_changed,
            'target': dest_path,
            'upserts': records_add,
            'deletes': records_del,
            'diff_time': time_diff.total_seconds()
        }

        logtype = {
            'dlf_type': '{}.{}'.format(self.__class__.__name__, func_name())
        }
        logger.info(logdata, extra=logtype)