from pyspark.sql import functions as F


def extract_field(df):
    # NOTE: the incoming `df` argument is immediately shadowed by a small sample
    # frame, as in the original snippet.
    text = 'bbbb'
    df = spark.createDataFrame([('A$B', 'a$b$c')], ['k', 's'])
    # sha1() expects a string or binary column, not the array produced by split(),
    # so hash the raw string; keep 's' in the projection because the when()
    # expression below still references it.
    return (df
            .select('s', F.sha1(F.col('s')).alias('k'))
            .withColumn(
                'k',
                F.when(
                    F.col('k') == f"{text}$",
                    F.concat(F.lit(f"{text}-"),
                             F.sha1(F.split(F.col('s'), r'\$').getItem(0))))
                .otherwise(F.col('k')))
            .collect())
def normal_df(self, num_rows, num_dummy_columns=1, num_partitions=200):
    _df = self._spark.range(
        num_rows, numPartitions=num_partitions).withColumnRenamed('id', 'key')
    for dummy in range(0, num_dummy_columns):
        _df = _df.withColumn('dummy_' + str(dummy + 1),
                             sha1(col('key').cast('string')))
    return _df.repartition(num_partitions)
from pyspark.sql.functions import col, concat, lit, sha1, when


def __sha1_concat(hash_columns):
    """
    Private function - concatenates the provided columns using a ':' delimiter
    and hashes the result.

    Parameters:
        hash_columns (list): A list of column names (strings) to hash.

    Returns:
        Column: a column expression yielding sha1('column1:column2:...')
    """
    column_ref_list = [
        when(col(hash_col_name).isNull(), lit(":"))
        .otherwise(concat(col(hash_col_name).cast("string"), lit(":")))
        for hash_col_name in hash_columns
    ]
    concat_string = sha1(concat(*column_ref_list))
    return concat_string
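# A minimal usage sketch (the SparkSession and the toy `people` frame below are
# assumptions, not part of the original module): the expression returned by
# __sha1_concat is attached with withColumn to get a deterministic row hash.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame([(1, 'alice'), (2, None)], ['id', 'name'])
# NULL columns contribute a bare ':' so the concatenation never collapses to NULL
hashed = people.withColumn('row_hash', __sha1_concat(['id', 'name']))
hashed.show(truncate=False)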
def skewed_df(self, num_rows, percentage_rows_skewed, num_dummy_columns=1, num_partitions=200):
    _df = self._spark.range(num_rows, numPartitions=num_partitions) \
        .withColumn('id', when(col('id') <= percentage_rows_skewed * num_rows,
                               lit(1)).otherwise(col('id'))) \
        .withColumnRenamed('id', 'key')
    for dummy in range(0, num_dummy_columns):
        _df = _df.withColumn('dummy_' + str(dummy + 1),
                             sha1(col('key').cast('string')))
    return _df.repartition(num_partitions)
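# A hedged harness for the two generators above (everything here is an
# assumption, not from the original class): a SimpleNamespace stands in for the
# owning object's self, and joining the skewed frame against the uniform one
# reproduces join skew because the skewed rows all collapse onto key == 1.
from types import SimpleNamespace

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sha1, when

spark = SparkSession.builder.getOrCreate()
gen = SimpleNamespace(_spark=spark)

uniform = normal_df(gen, 1_000_000, num_dummy_columns=2)
skewed = skewed_df(gen, 1_000_000, percentage_rows_skewed=0.3)
skewed.join(uniform, 'key').count()  # one task ends up carrying ~30% of the rows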
def benchmark2():
    print("===Benchmark 2===")
    print("Comparing JDBC writes to InnoDB and API writes to ColumnStore with larger datasets")
    print("")
    emptyDatabase()

    print("creating dataframe 1: two randomly generated doubles")
    randDF = sqlContext.range(0, 7000000).withColumn(
        'uniform', rand(seed=23)).withColumn('normal', randn(seed=42)).cache()
    randDFRows = randDF.count()
    randDFItems = randDFRows * len(randDF.columns)
    randDF.printSchema()
    print("benchmarking dataframe 1")
    rand_benchmark = benchmark2execution(
        "rand", randDF, "id BIGINT, uniform DOUBLE, normal DOUBLE")
    randDF.unpersist()

    print("creating dataframe 2: sha1, sha256, sha512 and md5 hashes of integers")
    tmpDF = sqlContext.createDataFrame(
        sc.parallelize(range(0, 3000000)).map(lambda i: Row(number=i, string=str(i))))
    hashDF = tmpDF.select(tmpDF.number,
                          sha1(tmpDF.string).alias("sha1"),
                          sha2(tmpDF.string, 256).alias("sha256"),
                          sha2(tmpDF.string, 512).alias("sha512"),
                          md5(tmpDF.string).alias("md5")).cache()
    hashDFRows = hashDF.count()
    hashDFItems = hashDFRows * len(hashDF.columns)
    hashDF.printSchema()
    print("benchmarking dataframe 2")
    hash_benchmark = benchmark2execution(
        "hash", hashDF,
        "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), sha512 VARCHAR(128), md5 VARCHAR(32)")
    hashDF.unpersist()

    print("jdbc_innodb\tapi_columnstore\t\trows\t\titems")
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (rand_benchmark[0], rand_benchmark[1], randDFRows, randDFItems))
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (hash_benchmark[0], hash_benchmark[1], hashDFRows, hashDFItems))
def make_songplay_data(d_artist_df, d_song_df, event_df):
    """
    Create the songplay fact dataframe.

    Parameters:
        d_artist_df (DataFrame): The artist dimension dataframe.
        d_song_df (DataFrame): The song dimension dataframe.
        event_df (DataFrame): The raw song play event dataframe.

    Returns:
        f_songplay_df (DataFrame): A songplay fact dataframe.
    """
    print('\nmake_songplay_data...')
    tmp_df = d_song_df.withColumnRenamed('artist_id', 'song_artist_id')
    tmp_df = tmp_df.join(d_artist_df, d_artist_df.artist_id == tmp_df.song_artist_id) \
        .select('song_id', 'title', 'duration', 'artist_id', 'artist_name')

    comparison = [
        event_df.song == tmp_df.title,
        event_df.length.cast(ShortType()) == tmp_df.duration.cast(ShortType())
    ]

    # extract columns from the joined song and log datasets to create the songplays table;
    # hash timestamp, userId and song to get a unique songplay ID;
    # year and month columns exist for partitioning parquet files
    f_songplay_df = event_df.withColumn('songplay_id', F.sha1(F.concat_ws('|', 'timestamp', 'userId', 'song'))) \
        .withColumn('year', F.year('timestamp')) \
        .withColumn('month', F.month('timestamp')) \
        .join(tmp_df, comparison, 'left') \
        .select(['songplay_id', 'start_time', 'year', 'month', 'userId', 'level', 'song_id',
                 'artist_id', 'sessionId', 'location', 'userAgent'])

    print('songplay fact record count:', f_songplay_df.count())
    not_null_count = f_songplay_df.filter(F.col('song_id').isNotNull()).count()
    print('songplay fact records with song_id value:', not_null_count)
    return f_songplay_df
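# A hedged sketch of persisting the fact table (the call site and output path
# are assumptions, not part of the original function): the year and month
# columns created above act as the parquet partition keys the comments mention.
f_songplay_df = make_songplay_data(d_artist_df, d_song_df, event_df)
f_songplay_df.write.mode('overwrite') \
    .partitionBy('year', 'month') \
    .parquet('s3a://example-output-bucket/songplays/')  # hypothetical destination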
def get_client_details(client_details, fn):
    # hash client_id using sha1
    client_details = client_details.withColumn('client_id',
                                               sha1(client_details.client_id))

    # calculate the attribution_site column:
    # CASE WHEN medium = unknown AND source = unknown AND campaign = unknown
    #      THEN 'unknown_site' ELSE 'www_mozilla_org' END
    # NOTE: it seems that calls to udf() must be made after SparkContext() is created
    attribution_site_udf = udf(
        lambda attribution: None if (attribution is None) else
        ("unknown_site" if attribution.medium == 'unknown'
         and attribution.source == 'unknown'
         and attribution.campaign == 'unknown' else 'www_mozilla_org'),
        StringType())
    client_details = client_details.withColumn(
        "attribution_site", attribution_site_udf("attribution"))

    # convert the submission date string to a date column
    client_details_with_date = client_details.withColumn(
        "date",
        unix_timestamp("submission_date_s3", "yyyyMMdd").cast("timestamp").cast("date"))

    w = Window.partitionBy(client_details_with_date["client_id"]).orderBy(
        client_details_with_date["date"].desc())
    client_details_with_date_latest = client_details_with_date.select(
        '*', F.rank().over(w).alias('rank')).filter(F.col('rank') <= 1)

    # calculate the most recent active date column (has to be done here because we are
    # not storing history in Vertica): the most recent date where the uri count
    # (pageviews) scalar_parent_browser_engagement_total_uri_count_sum >= 5
    uri_dt = client_details_with_date.filter(
        "scalar_parent_browser_engagement_total_uri_count_sum>=5").groupBy(
            "client_id").agg(F.max("date").alias("max_activity_date"))

    # left join on client_details_with_date_latest
    df = client_details_with_date_latest.join(uri_dt, ["client_id"], 'left')

    # add max_activity_date_int: days between end_date and the most recent active date
    df = df.withColumn("max_activity_date_int",
                       F.datediff(F.lit(end_date), df.max_activity_date))

    # activity_group (could also be calculated in Vertica?), based on the most recent
    # active day: < 2 days ago -> aDAU, 2-7 days ago -> aWAU, 8-31 days ago -> aMAU
    # (relative to the calculation date)
    activity_group_udf = udf(
        lambda days_passed: None if (days_passed is None) else
        ("aDAU" if days_passed < 2 else
         ("aWAU" if 2 <= days_passed < 8 else
          ("aMAU" if 8 <= days_passed <= 31 else None))),
        StringType())
    df = df.withColumn("activity_group",
                       activity_group_udf("max_activity_date_int"))

    # recast max_activity_date to a yyyyMMdd string
    df = df.withColumn("max_activity_date",
                       F.date_format(uri_dt.max_activity_date, "yyyyMMdd"))

    # select ordered columns for writing to file
    df = df.select(
        "client_id", "os", "os_version", "city", "geo_subdivision1",
        "geo_subdivision2", "country", "default_search_engine",
        "default_search_engine_data_submission_url",
        "default_search_engine_data_load_path",
        "default_search_engine_data_origin", "e10s_enabled", "channel",
        "locale", "is_default_browser", "memory_mb", "os_service_pack_major",
        "os_service_pack_minor", "sample_id", "profile_creation_date",
        "profile_age_in_days", "active_addons_count_mean", "sync_configured",
        "sync_count_desktop_sum", "sync_count_mobile_sum",
        "places_bookmarks_count_mean", "timezone_offset", "attribution_site",
        "source", "medium", "campaign", "content", "submission_date_s3",
        "max_activity_date", "activity_group", "distribution_id")

    # partition by a copy of sample_id so the original column is still stored
    # in the parquet data files
    duplicated = df.withColumn("sample_id_dupe", df["sample_id"])
    duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode(
        "overwrite").save(fn)
from pyspark.sql.types import StringType

manualSplitPythonUDF = spark.udf.register("manualSplitSQLUDF", manual_split, StringType())

# COMMAND ----------

# MAGIC %md
# MAGIC Create a DataFrame of roughly ten million values with a string to index. Do this by using a hash function, in this case `SHA-1`.

# COMMAND ----------

from pyspark.sql.functions import sha1, rand

randomDF = (spark.range(1, 10000 * 10 * 10 * 10)
            .withColumn("random_value", rand(seed=10).cast("string"))
            .withColumn("hash", sha1("random_value"))
            .drop("random_value"))

display(randomDF)

# COMMAND ----------

# MAGIC %md
# MAGIC Apply the UDF by using it just like any other Spark function.

# COMMAND ----------

randomAugmentedDF = randomDF.select("*", manualSplitPythonUDF("hash").alias("augmented_col"))

display(randomAugmentedDF)
def summary_df(df, fn):
    # drop null ad_click values
    df = df.na.drop(subset=["ad_click"])
    # remove non-search sessions
    df = df[df['ad_click'] > 0]

    # sum ad_click per client per day
    sum_search_clients_daily = df.groupBy("client_id", "country", "submission_date_s3", "activity_date") \
        .agg(F.sum("ad_click").alias("ad_click"))

    # read revenue_by_country
    rev_by_country_s3_path = "s3://net-mozaws-prod-us-west-2-pipeline-analysis/nawong/revenue_by_country.csv"
    rev_by_country = sqlContext.read.csv(rev_by_country_s3_path, header=True)
    rev_by_country = rev_by_country.withColumn("rev_per_search_float", F.col("rev_per_search").cast("double")) \
        .withColumn("yyyyMM_timestamp", F.to_timestamp(F.col("yyyymm"), "yyyyMM")) \
        .withColumn("country_code", F.upper(F.col("country_code")))

    # join in the country-level revenue table; this transform is needed to
    # calculate transaction-level monetary value
    tbl = sum_search_clients_daily.join(
        rev_by_country,
        sum_search_clients_daily.country == rev_by_country.country_code,
        how='left_outer')
    # NOTE: the partition includes country because a client may change country over time
    spec = Window.partitionBy("client_id", "country", "submission_date_s3") \
        .orderBy(F.col("yyyyMM_timestamp").desc())

    no_country = (tbl
                  .where(F.isnull(F.col("yyyymm")))
                  .withColumn("rev_per_search_float", F.lit(.005)))
    has_country = (tbl
                   .na.drop(subset=["yyyymm"])
                   .where("yyyyMM_timestamp <= activity_date")
                   .withColumn('rank', F.row_number().over(spec))
                   .where("rank = 1")
                   .drop('rank'))
    tbl2 = no_country.union(has_country)

    # drop each client's first purchase before calculating revenue
    spec2 = Window.partitionBy("client_id").orderBy(F.col("activity_date").asc())  # earliest date gets row number 1
    search_rev = (tbl2
                  .withColumn("rank", F.row_number().over(spec2))
                  .where("rank > 1")
                  ).groupBy("client_id").agg(
                      F.avg(F.col('rev_per_search_float') * F.col('ad_click')).alias("monetary_value"))

    # compute the final dataset for the BG/NBD model
    dataset = (tbl2
               .groupBy("client_id")
               .agg(F.datediff(F.max('activity_date'), F.min("activity_date")).alias("recency"),
                    (F.countDistinct('activity_date') - 1).alias("frequency"),
                    F.datediff(F.lit(end_date.strftime("%Y-%m-%d")).cast("date"), F.min("activity_date")).alias("T"),
                    F.sum("ad_click").alias("historical_searches"),
                    F.sum(F.col('rev_per_search_float') * F.col('ad_click')).alias("historical_clv"))
               .join(search_rev, "client_id", how="left")
               .where("frequency >= 0 AND recency >= 0 AND T >= 0")
               .select("client_id",
                       (F.crc32("client_id") % 100).alias("sample_id"),
                       "frequency", "recency", "T", "monetary_value",
                       "historical_searches", "historical_clv")
               ).fillna(0, subset=['monetary_value'])

    # anonymize client_id
    dataset = dataset.withColumn('client_id', sha1(dataset.client_id))

    # write the per-client recency/frequency/age/revenue table, partitioned by a
    # copy of sample_id so the original column is preserved in the data files
    duplicated = dataset.withColumn("sample_id_dupe", dataset["sample_id"])
    duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode("append").save(fn)
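# A hedged follow-on sketch: the comment above says the output feeds a BG/NBD
# model, so this shows one plausible consumer using the `lifetimes` package.
# Everything here (the single-sample slice, the toPandas() hand-off, and the
# penalizer value) is an assumption, not part of the original pipeline.
from lifetimes import BetaGeoFitter

features = sqlContext.read.parquet(fn).where("sample_id_dupe = 0").toPandas()
bgf = BetaGeoFitter(penalizer_coef=0.01)
bgf.fit(features['frequency'], features['recency'], features['T'])
print(bgf)  # fitted model summary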
def addChecksumCol(self, colName):
    # append a sha1 checksum computed over every existing column, joined with "~~"
    self.inputDf = self.inputDf.withColumn(
        colName, sha1(concat_ws("~~", *self.inputDf.columns)))
    return self.inputDf
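# A minimal usage sketch (everything here is assumed, not from the original
# class): a SimpleNamespace stands in for the owning object's self, showing
# that identical rows always receive the same checksum value.
from types import SimpleNamespace

from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, sha1

spark = SparkSession.builder.getOrCreate()
rows = spark.createDataFrame([('1', 'a'), ('2', 'b')], ['id', 'val'])
holder = SimpleNamespace(inputDf=rows)
addChecksumCol(holder, 'row_sha1').show(truncate=False)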