def test_functions_broadcast(self):
    from pyspark.sql.functions import broadcast

    df1 = self.spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value"))
    df2 = self.spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value"))

    # equijoin - should be converted into broadcast join
    plan1 = df1.join(broadcast(df2), "key")._jdf.queryExecution().executedPlan()
    self.assertEqual(1, plan1.toString().count("BroadcastHashJoin"))

    # no join key -- should not be a broadcast join
    plan2 = df1.crossJoin(broadcast(df2))._jdf.queryExecution().executedPlan()
    self.assertEqual(0, plan2.toString().count("BroadcastHashJoin"))

    # planner should not crash without a join
    broadcast(df1)._jdf.queryExecution().executedPlan()
def filter_on_case_size(df, case_id_glue="case:concept:name", min_case_size=2, max_case_size=None):
    """Filters the Spark dataframe keeping only traces with at least the specified number of events
    (and, if max_case_size is given, at most that many events)
    """
    size_df = df.groupBy(case_id_glue).count()
    if max_case_size:
        size_df = size_df.filter((size_df["count"] >= min_case_size) & (size_df["count"] <= max_case_size))
    else:
        size_df = size_df.filter(size_df["count"] >= min_case_size)
    return df.join(F.broadcast(size_df), case_id_glue).drop("count")
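# Editor's sketch (not from the original sources): a minimal usage example for
# filter_on_case_size above, assuming a local SparkSession, a toy event log with
# pm4py-style column names, and pyspark.sql.functions imported as F at module level.
def example_filter_on_case_size():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    log = spark.createDataFrame(
        [("c1", "A"), ("c1", "B"), ("c1", "C"), ("c2", "A")],
        ["case:concept:name", "concept:name"])
    # keep cases with 2 to 3 events; case "c2" (a single event) is dropped,
    # and the small per-case count table is broadcast for the join
    filter_on_case_size(log, min_case_size=2, max_case_size=3).show()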
def transform_author_dataset(self):
    logging.debug("Inside transform author dataset module")
    author_df = self._spark.read.csv(
        self._load_path + '/author.csv',
        header=True, mode='PERMISSIVE', inferSchema=True)
    author_lookup_df = author_df.groupBy('author_id') \
        .agg(fn.max('record_create_timestamp').alias('record_create_timestamp'))
    author_lookup_df.persist()
    # broadcast() returns a new, hinted DataFrame; keep the result instead of discarding it
    author_lookup_df = fn.broadcast(author_lookup_df)
    deduped_author_df = author_df \
        .join(author_lookup_df, ['author_id', 'record_create_timestamp'], how='inner') \
        .select(author_df.columns) \
        .withColumn('name', goodreads_udf.remove_extra_spaces('name'))
    logging.debug(f"Attempting to write data to {self._save_path + '/authors/'}")
    deduped_author_df \
        .repartition(10) \
        .write \
        .csv(path=self._save_path + '/authors/', sep='|', mode='overwrite', compression='gzip',
             header=True, timestampFormat='yyyy-MM-dd HH:mm:ss.SSS', quote='"', escape='"')
def transform_books_dataset(self):
    logging.debug("Inside transform books dataset module")
    books_df = self._spark.read.csv(
        self._load_path + '/book.csv',
        header=True, mode='PERMISSIVE', inferSchema=True, quote="\"", escape="\"")
    books_lookup_df = books_df \
        .groupBy('book_id') \
        .agg(fn.max('record_create_timestamp').alias('record_create_timestamp'))
    books_lookup_df.persist()
    # broadcast() returns a new, hinted DataFrame; keep the result instead of discarding it
    books_lookup_df = fn.broadcast(books_lookup_df)
    deduped_books_df = books_df \
        .join(books_lookup_df, ['book_id', 'record_create_timestamp'], how='inner') \
        .select(books_df.columns)
    logging.debug(f"Attempting to write data to {self._save_path + '/books/'}")
    deduped_books_df \
        .repartition(10) \
        .write \
        .csv(path=self._save_path + '/books/', sep='|', mode='overwrite', compression='gzip',
             header=True, timestampFormat='yyyy-MM-dd HH:mm:ss.SSS', quote='"', escape='"')
def initial_centroids(next_selected_cent, data_input, i):
    if i == k - 1:
        data_cent6 = data_input.join(broadcast(next_selected_cent))
        data_cent7 = data_cent6.withColumn(
            str(i),
            squaree_spark1(data_cent6.columns[0], data_cent6.columns[1],
                           data_cent6.columns[k + 2], data_cent6.columns[k + 3]))
        data_cent8 = data_cent7.drop('mindist').drop(data_cent7.columns[k + 2]).drop(data_cent7.columns[k + 3])
        return data_cent8
    else:
        data_cent6 = data_input.join(broadcast(next_selected_cent))
        data_cent7 = data_cent6.withColumn(
            str(i),
            squaree_spark1(data_cent6.columns[0], data_cent6.columns[1],
                           data_cent6.columns[i + 3], data_cent6.columns[i + 4]))
        data_cent8 = data_cent7.drop(data_cent7.columns[i + 3]).drop(data_cent7.columns[i + 4])
        data_cent9 = data_cent8.withColumn('mindist1', least(data_cent8.columns[i + 3], col('mindist')))
        data_cent10 = data_cent9.drop('mindist')
        data_cent12 = data_cent10.withColumnRenamed('mindist1', 'mindist')
        data_cent13 = data_cent12.repartition(2001)
        next_cent_cache = data_cent13.orderBy(desc('mindist')).limit(1).cache()
        next_cent = next_cent_cache.select(data_cent12.columns[0:2])
        return next_cent, data_cent12
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    comments = spark.read.json(in_directory, schema=schema)
    comments_cache = comments.cache()

    # TODO
    average_score = comments_cache.groupBy("subreddit").mean()
    average_score = average_score.filter(average_score["avg(score)"] > 0)
    average_score = functions.broadcast(average_score)

    join_average_score = average_score.join(
        comments_cache,
        on=(average_score['subreddit'] == comments_cache['subreddit'])).drop(
            average_score['subreddit'])
    join_average_score_cache = join_average_score.cache()

    relative_score = join_average_score_cache.select(
        join_average_score_cache['subreddit'],
        join_average_score_cache['author'],
        (join_average_score_cache['score'] /
         join_average_score_cache['avg(score)']).alias("relative_score"))
    relative_score_cache = relative_score.cache()

    max_relative = relative_score_cache.groupby('subreddit').max('relative_score')
    max_relative = functions.broadcast(max_relative)

    best_comment = max_relative.join(
        relative_score_cache,
        on=(max_relative['max(relative_score)'] ==
            relative_score_cache['relative_score'])).drop(
                max_relative['max(relative_score)']).drop(
                    max_relative['subreddit'])
    best_comment_cache = best_comment.cache()

    best_author = best_comment_cache.sort(best_comment_cache['subreddit'])
    best_author.show()
    best_author.write.json(out_directory, mode='overwrite')
def run_pagerank(graph, communities, outputs, path, maxiter=10):
    # Run PageRank
    pageRank = graph.pageRank(resetProbability=0.15, maxIter=maxiter)

    # Organize communities based on page rankings and weights
    topTenRankings = pageRank.vertices.select("id", "pagerank").orderBy(
        "pagerank", ascending=False).limit(10)
    topTenRankings = functions.broadcast(topTenRankings)

    getRankingInfo = communities.join(
        topTenRankings, communities.id == topTenRankings.id).drop(
            topTenRankings.id).orderBy("pagerank", ascending=False)
    getRankingInfo.write.csv(path + '/rankings-' + outputs, mode='overwrite')

    return pageRank
def make_genre_map():
    """
    Make mapping of genre wikidata_id value to human-readable label.
    """
    wd = spark.read.parquet(sys.argv[1])
    label_map = wd.filter(wd.label.isNotNull()).select(wd['id'].alias('wikidata_id'), wd['label'])

    genres = wd.select(functions.explode(wd['genre']).alias('wikidata_id')).distinct()
    genres = functions.broadcast(genres)  # only a few thousand values that we want to keep
    genres = genres.join(label_map, on='wikidata_id')
    genres = genres.withColumnRenamed('label', 'genre_label')

    # output is under 1 MB compressed: safe to .coalesce(1)
    genres.coalesce(1).write.json('./genres', mode='overwrite', compression='gzip')
def ar_coefficient_spark(spark, df, param):
    # We convert the params into a pandas dataframe to apply a groupBy
    # and to convert a part of the dataframe to a pyspark dataframe
    df_param = pd.DataFrame(param)
    dict_params = df_param.groupby('k')['coeff'].apply(list).to_dict()
    df_k = spark.createDataFrame(df_param[['k']].drop_duplicates())

    # Cross join the full time series with the (broadcast) set of distinct k values,
    # so that AR(k) can be computed for each k in parallel on the cluster
    df_ts_k = df.crossJoin(F.broadcast(df_k))
    df_value_ar = df_ts_k.groupBy('k').apply(computeARk_generator(dict_params))

    return df_value_ar.rdd.map(lambda x: ("coeff_{}__k_{}".format(x.coeff, x.k), x.value_ar)).collect()
def join_df(self, events_df, mentions_df):
    '''
    This function joins the mentions data with the events data using Spark's broadcast join.
    :param events_df: filtered events data frame will be provided.
    :param mentions_df: filtered mentions data frame will be provided
    :return: joined data frame including GlobalEventId, Goldstein score and avg confidence will be returned.
    '''
    # Broadcast the events data frame as it's smaller in size, then perform the join.
    final_df = mentions_df.join(broadcast(events_df), 'GLOBALEVENTID')
    return final_df
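# Editor's sketch (not from the original sources): the same broadcast-join pattern that
# join_df applies, on tiny GDELT-style frames; column names other than GLOBALEVENTID
# are illustrative assumptions.
def example_broadcast_events_mentions_join():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import broadcast

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    events_df = spark.createDataFrame(
        [(1, 2.5), (2, -1.0)], ["GLOBALEVENTID", "GoldsteinScale"])
    mentions_df = spark.createDataFrame(
        [(1, 40), (1, 70), (2, 55)], ["GLOBALEVENTID", "Confidence"])
    # the small events table is the broadcast side; every mention keeps its event attributes
    mentions_df.join(broadcast(events_df), 'GLOBALEVENTID').show()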
def normalizeFeatures(df, cols):
    """Normalize each feature named in the list `cols`."""
    allCols = df.columns
    # remove the columns to be normalized, leaving the pass-through columns of the returned dataframe
    _ = [allCols.remove(x) for x in cols]

    # calculate the avg and stddev of the features to be normalized
    stats = (df.groupBy().agg(
        *([stddev_pop(x).alias(x + '_stddev') for x in cols] +
          [avg(x).alias(x + '_avg') for x in cols])))

    # broadcast the single-row stats and join them onto the current DF
    df = df.join(broadcast(stats))

    # normalize the columns and select the required columns for the final DF
    exprs = [x for x in allCols] + [((df[x] - df[x + '_avg']) / df[x + '_stddev']).alias(x) for x in cols]
    return df.select(*exprs)
def top_k_rankingmetrics(dataset=None, k=10, ranking_metrics="precisionAt", user="******",
                         item="book_id", rating="rating", prediction="prediction"):
    '''
    Compute a ranking metric from a predictions dataframe.
    Input:
    1. k: only evaluate the performance of the top k items
    2. ranking_metrics: precisionAt, meanAveragePrecision, ndcgAt
    3. user, item, prediction: column names; string type
    refer to https://vinta.ws/code/spark-ml-cookbook-pyspark.html
    '''
    if dataset is None:
        print("Error! Please specify a dataset.")
        return

    # prediction table
    windowSpec = Window.partitionBy(user).orderBy(col(prediction).desc())
    perUserPredictedItemsDF = dataset \
        .select(user, item, prediction, F.rank().over(windowSpec).alias('rank')) \
        .where('rank <= {}'.format(k)) \
        .groupBy(user) \
        .agg(expr('collect_list({}) as items'.format(item)))

    # actual target table
    windowSpec = Window.partitionBy(user).orderBy(col(rating).desc())
    perUserActualItemsDF = dataset \
        .select(user, item, rating, F.rank().over(windowSpec).alias('rank')) \
        .where('rank <= {}'.format(k)) \
        .groupBy(user) \
        .agg(expr('collect_list({}) as items'.format(item)))

    # join the predicted and actual item lists per user (the actual side is broadcast)
    perUserItemsRDD = perUserPredictedItemsDF \
        .join(F.broadcast(perUserActualItemsDF), user, 'inner') \
        .rdd \
        .map(lambda row: (row[1], row[2]))
    ranking_metrics_evaluator = RankingMetrics(perUserItemsRDD)

    # get the result of the metric
    if ranking_metrics == "precisionAt":
        return ranking_metrics_evaluator.precisionAt(k)
    elif ranking_metrics == "meanAveragePrecision":
        # meanAveragePrecision is a property on RankingMetrics, not a method
        return ranking_metrics_evaluator.meanAveragePrecision
    elif ranking_metrics == "ndcgAt":
        return ranking_metrics_evaluator.ndcgAt(k)
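# Editor's sketch (not from the original sources): calling top_k_rankingmetrics on a toy
# predictions frame. It assumes the module-level imports the function above relies on
# (Window, col, expr, F, RankingMetrics) are in place, and passes explicit column names
# because the `user` default is masked in the source.
def example_top_k_rankingmetrics():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    preds = spark.createDataFrame(
        [(1, 100, 5.0, 4.9), (1, 101, 3.0, 4.5), (1, 102, 1.0, 0.5),
         (2, 100, 4.0, 3.9), (2, 103, 2.0, 4.1), (2, 104, 5.0, 1.0)],
        ["user_id", "book_id", "rating", "prediction"])
    p_at_2 = top_k_rankingmetrics(dataset=preds, k=2, ranking_metrics="precisionAt",
                                  user="user_id", item="book_id",
                                  rating="rating", prediction="prediction")
    print(p_at_2)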
def main():
    sc = SparkContext(conf=SparkConf().setAppName("se"))
    spark = SparkSession.builder.appName("se").getOrCreate()

    # read in the links parquet file
    links = spark.read.load("s3a://xmlparq/pr_se_links.parquet")
    # read in the posts parquet file
    posts = spark.read.load("s3a://xmlparq/posts.parquet")

    # filter the questions
    questions = posts.filter(f.col('PostTypeId') == 1).filter(f.col('AcceptedAnswerId').isNotNull())
    questions_subset = questions.select('Id', 'AcceptedAnswerId', 'Tags', 'CreationDate', 'Community')

    # filter the answers into another dataframe (filter `posts`, not an undefined df3)
    answers = posts.filter(f.col('PostTypeId') == 2)
    # rename the answer dataframe columns
    answers_subset = answers.select("Id", "CreationDate", "Community")
    new_names = ['AnsId', 'AnsCreationDate', 'AnsCommunity']
    answers_subset = answers_subset.toDF(*new_names)

    # join the questions df and the answers df on community and the accepted answer id
    qa_deets = questions_subset.join(
        answers_subset,
        (answers_subset.AnsCommunity == questions_subset.Community) &
        (questions_subset.AcceptedAnswerId == answers_subset.AnsId))

    timeFmt = "yyyy-MM-dd' 'HH:mm:ss.SSS"
    timeDiff = (f.unix_timestamp('AnsCreationDate', format=timeFmt) -
                f.unix_timestamp('CreationDate', format=timeFmt))
    # divide the duration in seconds by 60 to convert it to minutes
    qa_deets = qa_deets.withColumn("Duration", timeDiff / 60)
    qa_deets_subset = qa_deets.select("Id", "Tags", "CreationDate", "Community", "Duration")

    # questions which have no accepted answer get a null duration
    questions_null = posts.filter(f.col('PostTypeId') == 1).filter(f.col('AcceptedAnswerId').isNull())
    questions_null_subset = questions_null.select('Id', 'Tags', 'CreationDate', 'Community')
    questions_null_duration = questions_null_subset.withColumn('Duration', lit(None).cast(DoubleType()))

    # combine all questions: questions with answers and questions with no answers
    all_questions = qa_deets_subset.union(questions_null_duration)
    all_questions = all_questions.withColumn('post_create_date', all_questions['CreationDate'].cast('date'))
    all_questions = all_questions.withColumnRenamed('id', 'COMMUNITY_QUESTION_ID')
    all_questions_subset = all_questions.select('COMMUNITY_QUESTION_ID', 'Community', 'Tags', 'post_create_date')

    links = links.withColumnRenamed("community", "lcommunity")

    # join on community and question id to combine the pagerank score and response time
    # duration in one dataframe; the smaller links table is broadcast
    cred_tags = all_questions.join(
        broadcast(links),
        (links.id == all_questions.COMMUNITY_QUESTION_ID) &
        (all_questions.Community == links.lcommunity),
        "left_outer")

    # rename columns as per the postgresql schema, round off values and write to the database
    total_df = cred_tags.select("COMMUNITY_QUESTION_ID", "Community", "post_create_date", "Tags",
                                "Duration", "cred_score")
    total_df = total_df.withColumn("duration", f.round(total_df["Duration"], 2))
    total_df = total_df.withColumn("pr_score", f.round(total_df["cred_score"], 3))
    total_df = total_df.withColumnRenamed("COMMUNITY_QUESTION_ID", "qid")
    total_df = total_df.withColumnRenamed("Community", "community")
    total_df = total_df.withColumnRenamed("Tags", "tags")
    total_df = total_df.withColumnRenamed("post_create_date", "create_date")
    total_df_reqd = total_df.select("qid", "tags", "community", "duration", "create_date", "pr_score")

    total_df_reqd.write.format("jdbc").mode("append") \
        .option("url", "jdbc:postgresql://hostname/ls?user=xxx&password=xxx") \
        .option("dbtable", "questions").option("user", "postgres").option("password", "xxx").save()

    spark.catalog.clearCache()
def apply_numeric(df, int1, int2, parameters=None):
    """Filters the Spark dataframe on attribute values (filter cases)"""
    if parameters is None:
        parameters = {}
    attribute_key = parameters[
        PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY
    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    positive = parameters["positive"] if "positive" in parameters else True

    df_filtered = df.filter(df[attribute_key].between(int1, int2))
    df_filtered = df_filtered.groupBy(case_id_glue).count()
    # filtered_index = df_filtered.select(case_id_glue).rdd.map(lambda x: x[0]).collect()
    if positive:
        return df.join(F.broadcast(df_filtered), case_id_glue).drop("count")
    else:
        df_left_joined = df.join(F.broadcast(df_filtered), case_id_glue, "left")
        return df_left_joined.filter(df_left_joined["count"].isNull()).drop("count")
def filter_on_case_performance(df, case_id_glue="case:concept:name", timestamp_key="time:timestamp",
                               min_case_performance=0, max_case_performance=10000000000):
    """Filters the Spark dataframe on case performance"""
    grouped_df = df.groupby(case_id_glue)
    start_end_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key),
                                  F.max(timestamp_key).alias(timestamp_key + "_1"))
    start_end_df = start_end_df.withColumn(
        "caseDuration",
        F.unix_timestamp(start_end_df[timestamp_key + "_1"]) - F.unix_timestamp(start_end_df[timestamp_key]))
    start_end_df = start_end_df.filter(
        (start_end_df["caseDuration"] > min_case_performance) &
        (start_end_df["caseDuration"] < max_case_performance)) \
        .select(case_id_glue)
    return df.join(F.broadcast(start_end_df), case_id_glue)
def filter_on_ncases(df, case_id_glue="case:concept:name", max_no_cases=1000):
    """Filters the Spark dataframe keeping only the specified maximum number of traces"""
    # With conversion to RDD:
    # cases_to_keep = df.select(case_id_glue).distinct().rdd.map(lambda row: row[0]).collect()
    # cases_to_keep = cases_to_keep[0:min(len(cases_to_keep), max_no_cases)]
    # return df.filter(df[case_id_glue].isin(cases_to_keep))

    # Without conversion to RDD (better):
    grouped_df = df.groupBy(case_id_glue).count().limit(max_no_cases).drop("count")
    return df.join(F.broadcast(grouped_df), case_id_glue)
def filter_df_on_end_activities(df, values, timestamp_key=DEFAULT_TIMESTAMP_KEY, case_id_glue=CASE_CONCEPT_NAME,
                                activity_key=DEFAULT_NAME_KEY, grouped_df=None, positive=True):
    """Filters the Spark dataframe on end activities"""
    if grouped_df is None:
        grouped_df = df.groupby(case_id_glue)
    grouped_df = grouped_df.agg(F.last(activity_key).alias(activity_key + "_1"))
    df_end = grouped_df.filter(grouped_df[activity_key + "_1"].isin(values))
    if positive:
        return df.join(F.broadcast(df_end), grouped_df.columns[0]).drop(activity_key + "_1")
    else:
        return df.join(F.broadcast(df_end), grouped_df.columns[0], "leftanti")
def main(in_directory, out_directory):
    comments = spark.read.json(in_directory, schema=schema)
    comments.cache()

    average = comments.groupBy('subreddit').avg('score')
    average = average.withColumnRenamed('avg(score)', 'avg_score')
    average = average.filter(average['avg_score'] > 0)
    average = functions.broadcast(average)
    # average.show()

    joined_comments = comments.join(
        average, average.subreddit == comments.subreddit).drop(comments.subreddit)
    joined_comments = joined_comments.withColumn(
        'relative_score', joined_comments['score'] / joined_comments['avg_score'])
    joined_comments.cache()

    max_score_comments = joined_comments.groupBy('subreddit').max('relative_score')
    max_score_comments = max_score_comments.withColumnRenamed('max(relative_score)', 'max_relative_score')
    max_score_comments = functions.broadcast(max_score_comments)

    result_table = joined_comments.join(
        max_score_comments,
        (joined_comments.subreddit == max_score_comments.subreddit) &
        (joined_comments.relative_score == max_score_comments.max_relative_score)).drop(
            joined_comments.subreddit)
    # result_table = result_table.drop('avg_score', 'score', 'max_relative_score')
    result_table = result_table.select('subreddit', 'author', 'relative_score')
    result_table = result_table.withColumnRenamed('relative_score', 'rel_score')
    # result_table.show()

    # TODO
    result_table.write.json(out_directory, mode='overwrite')
def main():
    sparkSession = SparkSession \
        .builder \
        .appName('UDF and Broadcast') \
        .getOrCreate()
    sparkSession.sparkContext.setLogLevel('ERROR')

    # define schema
    # Date,Open,High,Low,Close,Adj Close,Volume,Name
    stockSchema = StructType([
        StructField('Date', DateType(), True),
        StructField('Open', DoubleType(), True),
        StructField('High', DoubleType(), True),
        StructField('Low', DoubleType(), True),
        StructField('Close', DoubleType(), True),
        StructField('Adj Close', DoubleType(), True),
        StructField('Volume', LongType(), True),
        StructField('Name', StringType(), True)
    ])

    stocks = sparkSession \
        .readStream.option('header', 'true') \
        .schema(stockSchema).csv('./data')

    print('Is streaming', stocks.isStreaming)
    print(stocks.printSchema())

    # User function
    def daily_price_delta(open_price, close_price):
        return close_price - open_price

    # Registering UDF
    # sparkSession.udf.register('calculated_price_delta_udf', daily_price_delta, DoubleType())
    calculated_price_delta_udf = udf(daily_price_delta, DoubleType())

    # Broadcasting
    price_delta_broadcast_df = broadcast(stocks.withColumn(
        'PriceDelta', calculated_price_delta_udf(stocks.Open, stocks.Close)))

    price_delta_df = price_delta_broadcast_df.select("Date", "Name", "PriceDelta") \
        .where("PriceDelta > 15")

    query = price_delta_df \
        .writeStream \
        .outputMode("append") \
        .format("console") \
        .option("truncate", 'false') \
        .option('numRows', 30) \
        .start().awaitTermination()
def dummy_run(spark):
    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    import pyspark.sql.functions as F
    from pyspark.sql.functions import expr

    train = spark.createDataFrame(
        [
            (82, 124, 5.0),
            (64, 123, 4.0),
            (27, 122, 3.0),
            (25, 122, 1.0),
            (12, 124, 2.0)
        ],
        ['user_id', 'book_id', 'rating']
    )
    val = spark.createDataFrame(
        [
            (82, 123, 5.0),
            (64, 122, 4.0),
            (27, 124, 3.0),
            (64, 123, 2.0),
            (12, 122, 4.0)
        ],
        ['user_id', 'book_id', 'rating']
    )

    user_id = val.select('user_id').distinct()
    true_label = val.select('user_id', 'book_id') \
        .groupBy('user_id') \
        .agg(expr('collect_list(book_id) as true_item'))

    als = ALS(rank=3, regParam=0.1, userCol="user_id", itemCol="book_id", ratingCol='rating',
              implicitPrefs=False, coldStartStrategy="drop")
    model = als.fit(train)

    recs = model.recommendForUserSubset(user_id, 2)
    pred_labels = recs.select('user_id', 'recommendations.book_id')
    pred_true_rdd = pred_labels.join(F.broadcast(true_label), 'user_id', 'inner') \
        .rdd \
        .map(lambda row: (row[1], row[2]))

    metrics = RankingMetrics(pred_true_rdd)
    mean_ap = metrics.meanAveragePrecision
    ndcg_at_k = metrics.ndcgAt(2)
    p_at_k = metrics.precisionAt(2)
    print('MAP: ', mean_ap, 'NDCG: ', ndcg_at_k, 'Precision at k: ', p_at_k)
    return
def addCategoryData(entityUserDF, attr, catFreqInfo):
    print "loading category info data for attr: %s" % attr

    # global freq table
    _freqTbl = F.broadcast(
        sqlContext.createDataFrame([(k, v) for k, v in catFreqInfo[attr].iteritems()], [attr, "p"]))

    # a table containing (entity, attr, sum(projWt))
    attrInfoTbl = entityUserDF.filter(F.length(F.col(attr)) > 0) \
        .groupby(entity, attr) \
        .agg(F.sum(weightCol).alias('wt')) \
        .cache()

    # sum of projWt for each entity
    totalsTbl = attrInfoTbl.groupby(entity).agg(F.sum('wt').alias('totWt'))
    totalsTbl = F.broadcast(totalsTbl)
    attrInfoTbl = attrInfoTbl.join(totalsTbl, entity)

    # calculate mn and sd, where sd = sqrt(mn * (1 - p))
    attrInfoTbl = attrInfoTbl.join(_freqTbl, attr) \
        .select(entity, attr, 'wt',
                (totalsTbl['totWt'] * _freqTbl["p"]).alias("mn"),
                F.sqrt((totalsTbl['totWt'] * _freqTbl["p"]) * (1 - _freqTbl["p"])).alias("sd"))

    # filter common outliers
    attrInfoTbl = attrInfoTbl.select(
        entity, attr,
        (((F.col("wt") - F.col("mn")) / F.col("sd")) > 2.0).alias('shouldInclude'))
    attrInfoTbl = attrInfoTbl.filter(attrInfoTbl['shouldInclude']).drop(F.col("shouldInclude"))

    # collect categories and concat them
    attrInfoTbl = attrInfoTbl.groupby(entity) \
        .agg(F.collect_list(F.col(attr)).alias(attr)) \
        .select(entity, F.concat_ws("|", F.col(attr)).alias("common_" + attr))

    return attrInfoTbl
def main(in_directory, out_directory):
    comments = spark.read.json(in_directory, schema=schema).cache()

    # TODO
    # get average subreddit score
    average = comments.groupby('subreddit').avg().cache()
    # filter average score > 0
    average = average.filter(average['avg(score)'] > 0)

    # merge with original table
    # average = average.join(comments, 'subreddit')
    # merge with original table (with broadcast)
    average = functions.broadcast(average)
    average = average.join(comments, 'subreddit')

    # add column 'relative_score'
    average = average.withColumn('relative_score', average['score'] / average['avg(score)'])

    # get max score
    average = average.groupby('subreddit').max().cache()

    # join with original table
    # average = average.join(comments, 'subreddit')
    # join with original table (with broadcast)
    average = functions.broadcast(average)
    average = average.join(comments, 'subreddit')

    # filter tuple with max score
    average = average.filter(average['score'] == average['max(score)'])

    best_author = average.select(
        average['subreddit'], average['author'],
        average['max(relative_score)'].alias('rel_score'))
    # best_author = max_by_subreddit.join(functions.broadcast(average), 'subreddit', 'inner')

    best_author.write.json(out_directory, mode='overwrite')
def compute_individual_score(self, aux, df_records):
    """
    Compute scoreboard of auxiliary information aux inside record df_records.
    Both must be spark dataframes. Returns a spark dataframe.
    """
    merged = broadcast(prepare_join(aux, '_1', True)).crossJoin(
        prepare_join(df_records, '_2', True))
    merged = merged.withColumn('similarity', self.similarity_func(merged))
    # merged = merged.withColumn('value', merged.wt * merged.similarity)
    merged = merged.withColumn('value', merged.similarity)
    return merged
def filter_too_sparse_IDs(self):
    """
    keeping just those tracks which have > `timestamps_per_hour` points per hour on average
    :return: filtered `self.df`
    """
    ids = self.df.groupBy(['id', F.to_date(F.col('ts'))]) \
        .count() \
        .filter(F.col('count') > timestamps_per_hour * 24) \
        .select('id') \
        .distinct()
    self.df = self.df.join(F.broadcast(ids), ['id'], how='inner')
def main(in_directory, out_directory):
    comments = spark.read.json(in_directory, schema=schema).cache()

    grouped = comments.groupby(comments['subreddit'])
    averages = grouped.agg(functions.avg(comments['score']))
    averages = averages.filter(averages['avg(score)'] > 0)

    comments = comments.join(
        functions.broadcast(averages),
        comments['subreddit'] == averages['subreddit']).drop(averages['subreddit'])
    comments = comments.withColumn("rel_score", comments['score'] / comments['avg(score)'])

    grouped = comments.groupby(comments['subreddit'])
    maxes = grouped.agg(functions.max(comments['rel_score']))
    comments = comments.join(
        functions.broadcast(maxes),
        comments['subreddit'] == maxes['subreddit']).drop(maxes['subreddit'])

    max_scores = comments.filter(comments['rel_score'] == comments['max(rel_score)'])
    best_author = max_scores.select(['subreddit', 'author', 'rel_score'])
    best_author.write.json(out_directory, mode='overwrite')
def df_sql_approach():
    print('DF SQL APPROACH ----------------')
    print('Reading in files...')

    pw_schema = StructType([
        StructField("hashedpw", StringType(), False),
        StructField("h", IntegerType(), True)
    ])
    pw_df = sqlc.read.csv(PWS_PATH, header=False, schema=pw_schema, sep=':')

    common_words_schema = StructType([
        StructField("word", StringType(), False)
    ])
    # common_words_df = sqlc.createDataFrame(sc.textFile(COMMONWORDS_PATH), schema=common_words_schema)
    common_words_df = sqlc.read.csv(COMMONWORDS_PATH, header=False, schema=common_words_schema, sep='|')

    with time_usage('DF SQL APPROACH'):
        # The hashes of the common words are only needed when comparing
        # Idea: Use a function during sql query that hashes the words on-demand
        sqlc.registerFunction("sha1hash", lambda x: sha1(x).hexdigest().upper())
        sqlc.registerDataFrameAsTable(pw_df, "pw_df")
        sqlc.registerDataFrameAsTable(common_words_df, "common_words_df")

        # example of how to apply the hash function within sql
        # print('Example of executing hash function within sql:')
        # print(sqlc.sql("SELECT *, sha1hash(word) as hash FROM common_words_df").take(2))

        # we add the hashed column to each word first
        with time_usage('hashing common words by adding an extra column'):
            common_words_df = common_words_df.rdd \
                .map(lambda x: (x['word'], sha1(x['word']).hexdigest().upper())) \
                .toDF(['word', 'hashedword'])

        # print("search for single word...")
        # with time_usage("search for love ❤"):
        #     j = pw_df.filter(pw_df.hashedpw == sha1('love').hexdigest().upper())
        #     j.show(10000)
        #     print("Count: " + str(j.count()))

        print("joining tables...")
        with time_usage('Joining tables on the hash [forced broadcast]'):
            j = broadcast(common_words_df).join(pw_df, common_words_df.hashedword == pw_df.hashedpw) \
                .select(common_words_df['word'], pw_df['h'])
            j.orderBy("h", ascending=False).show(100)

        # with time_usage('Joining tables on the hash [no broadcast]'):
        #     j = common_words_df.join(pw_df, common_words_df.hashedword == pw_df.hashedpw) \
        #         .select(common_words_df['word'], pw_df['h'])
        #     j.orderBy("h", ascending=False).show(100)

        print("Count: " + str(j.count()))
def filter_by(self, df, columns=None):
    """
    :param df: dataframe whose rows are used to filter the cached data via an inner join
    :param columns: join columns; defaults to all columns of `df`
    :return: None; the filtered result is kept in `self.__cache`
    """
    import os
    if not columns:
        columns = df.columns
    if os.path.isdir(self.persistent_cache_file):
        self.__cache = self.session_getter().read.format(self.format_) \
            .load(self.persistent_cache_file) \
            .join(F.broadcast(df), on=columns, how='inner') \
            .drop('a.ip').persist(self.storage_level)
    else:
        if self.__cache:
            self.__cache = self.__cache.join(
                F.broadcast(df), on=columns, how='inner').drop('a.ip').persist(self.storage_level)
        else:
            self.load_empty(self.schema)
def top_10_addons_on_date(data, date, topN, period=7, country_list=None):
    """ Gets the number of users in the past week who have used the top N addons,
        broken down by country.

        Parameters:
        data - The main ping server.
        date - The day for which you want to get the top N addons.
        topN - the number of addons to get.
        period - number of days to use to calculate metric
        country_list - a list of country names in string

        Returns:
        Dataframe containing the number of users using each of the addons.
        submission_date_s3, country, addon_id, name, percent_of_active_users
    """
    addon_filter = (~col('addon.is_system')) & (~col('addon.foreign_install')) & \
        (~col('addon.addon_id').isin(NON_MOZ_TP)) & (~col('addon.addon_id').like('%@mozilla%')) & \
        (~col('addon.addon_id').like('%@shield.mozilla%')) & \
        (~col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%'))

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    wau = data_all.filter((col('submission_date_s3') > begin) & (col('submission_date_s3') <= date)) \
        .groupBy('country') \
        .agg(lit(date).alias('submission_date_s3'), F.countDistinct('client_id').alias('wau'))

    counts = data_all.select('submission_date_s3', 'country', 'client_id',
                             F.explode('active_addons').alias('addon')) \
        .filter((col('submission_date_s3') > begin) & (col('submission_date_s3') <= date)) \
        .filter(addon_filter) \
        .select('country', 'client_id', 'addon.addon_id', 'addon.name') \
        .distinct() \
        .groupBy('country', 'addon_id') \
        .agg(F.count('*').alias('number_of_users'), F.last('name').alias('name')) \
        .select('*', lit(date).alias('submission_date_s3'), lit(begin).alias('start_date'),
                F.row_number().over(Window.partitionBy('country')
                                    .orderBy(desc('number_of_users'))
                                    .rowsBetween(Window.unboundedPreceding, Window.currentRow))
                .alias('rank')) \
        .filter(col('rank') <= topN)

    return counts.join(F.broadcast(wau), on=['country'], how='left') \
        .select(lit(date).alias('submission_date_s3'), 'country', 'addon_id',
                col('name').alias('addon_name'),
                (100.0 * col('number_of_users') / col('wau')).alias('pct_with_addon'))
def assign_vids_uid(df_impressions, vid_assignment_table, demo_cols):
    select_cols = ["vid", "timestamp", "h1", "p1", "p2", "user_id"]
    if demo_cols is not None:
        df = df_impressions.join(F.broadcast(vid_assignment_table), demo_cols)
        select_cols = demo_cols + select_cols
    else:
        df = df_impressions.join(F.broadcast(vid_assignment_table))

    df_vid_impressions = (
        df.withColumn("h1", F.hash(F.col("user_id").astype("string")))
        # .withColumn("h1", F.hash("user_id"))
        .withColumn("p1", F.col("h1") / (2**32) + 0.5)
        .where(F.col("p1") >= F.col("prob_>="))
        .where(F.col("p1") < F.col("prob_<"))
        .withColumn("p2", F.hash(F.col("h1").astype("string")) / (2**32) + 0.5)
        .withColumn("vid", (F.col("p2") * F.col("total_VID")).astype('int') + F.col("start_VID"))
        .select(*select_cols))
    return df_vid_impressions
def get_variants_df_with_case_duration(df, parameters=None):
    """Gets the variants dataframe from the Spark dataframe, including the case duration"""
    if parameters is None:
        parameters = {}
    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY

    ordered_df = df.orderBy(timestamp_key).select(case_id_glue, timestamp_key, activity_key)
    grouped_df = ordered_df.groupby(case_id_glue)

    df1 = grouped_df.agg(F.collect_list(activity_key).alias("variant"))
    df1 = df1.withColumn("variant", F.concat_ws(",", "variant")).orderBy(case_id_glue)

    start_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key))
    first_eve_df = ordered_df.join(F.broadcast(start_df), start_df.columns)

    end_df = grouped_df.agg(F.max(timestamp_key).alias(timestamp_key))
    last_eve_df = ordered_df.join(F.broadcast(end_df), end_df.columns)
    last_eve_df = last_eve_df.withColumnRenamed(timestamp_key, timestamp_key + "_2")
    last_eve_df = last_eve_df.withColumnRenamed(activity_key, activity_key + "_2")

    stacked_df = first_eve_df.join(last_eve_df, case_id_glue).orderBy(case_id_glue)
    stacked_df = stacked_df.withColumn(
        "caseDuration",
        F.unix_timestamp(stacked_df[timestamp_key + "_2"]) - F.unix_timestamp(stacked_df[timestamp_key]))

    new_df = df1.join(stacked_df, case_id_glue)
    return new_df
def min_rating_filter_spark(
    data,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
):
    """Filter rating DataFrame for each user with minimum rating.

    Filtering a rating data frame by a minimum number of ratings per user/item is usually
    useful for generating a new data frame with only warm users/items. The warmth is defined
    by the min_rating argument. For example, a user is called warm if he has rated at least
    4 items.

    Args:
        data (spark.DataFrame): DataFrame of user-item tuples. Columns of user and item
            should be present in the DataFrame while other columns like rating,
            timestamp, etc. can be optional.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to
            filter with min_rating.
        col_user (str): column name of user ID.
        col_item (str): column name of item ID.

    Returns:
        spark.DataFrame: DataFrame with at least columns of user and item that has been
            filtered by the given specifications.
    """
    split_by_column, split_with_column = _check_min_rating_filter(
        filter_by, min_rating, col_user, col_item
    )
    rating_temp = (
        data.groupBy(split_by_column)
        .agg({split_with_column: "count"})
        .withColumnRenamed("count(" + split_with_column + ")", "n" + split_with_column)
        .where(col("n" + split_with_column) >= min_rating)
    )

    rating_filtered = data.join(broadcast(rating_temp), split_by_column).drop(
        "n" + split_with_column
    )
    return rating_filtered
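# Editor's sketch (not from the original sources): min_rating_filter_spark on a tiny
# ratings frame. "userID"/"itemID" stand in for the DEFAULT_USER_COL/DEFAULT_ITEM_COL
# constants of the real module, and _check_min_rating_filter is assumed importable.
def example_min_rating_filter_spark():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    ratings = spark.createDataFrame(
        [(1, 10, 4.0), (1, 11, 3.0), (2, 10, 5.0)],
        ["userID", "itemID", "rating"])
    # only user 1 has rated at least two items, so user 2's row is filtered out;
    # the small per-user count table is broadcast before the join
    warm = min_rating_filter_spark(ratings, min_rating=2, filter_by="user",
                                   col_user="userID", col_item="itemID")
    warm.show()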
df1 = sqlContext.range(100)
df2 = sqlContext.range(100)

df1.join(df2, df1["id"] == df2["id"]).collect()

# COMMAND ----------

# MAGIC %md Look at the Spark UI for that job, and note the stage count and the shuffle.
# MAGIC
# MAGIC To use a broadcast join, we need at least one of the following:
# MAGIC * statistics from running Hive ANALYZE on the table, and the size less than `spark.sql.autoBroadcastJoinThreshold`
# MAGIC * statistics from caching the table in Spark, and the size less than `spark.sql.autoBroadcastJoinThreshold`
# MAGIC * a broadcast hint applied to the table

# COMMAND ----------

from pyspark.sql.functions import broadcast

df1.join(broadcast(df2), df1["id"] == df2["id"]).collect()

# COMMAND ----------

df2.cache().count()
df1.join(df2, df1["id"] == df2["id"]).collect()

# COMMAND ----------

df2.unpersist()
df1.join(df2, df1["id"] == df2["id"]).collect()
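# COMMAND ----------

# Editor's sketch (not from the original notebook): `spark.sql.autoBroadcastJoinThreshold`
# is the size cutoff behind the first two options listed above; setting it to -1 disables
# size-based auto-broadcast, so only an explicit hint still yields a BroadcastHashJoin.
# Assumes the usual `spark` session object is available alongside `sqlContext`.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
df1.join(broadcast(df2), df1["id"] == df2["id"]).explain()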
def train_val_split(spark, df, slide_nums, folder, train_frac=0.8, add_row_indices=True,
                    seed=None, debug=False):
    """
    Split a DataFrame of slide samples into training and validation sets.

    Args:
        spark: SparkSession.
        df: A Spark DataFrame in which each row contains the slide number, tumor score,
            molecular score, and the sample stretched out into a Vector.
        slide_nums: A list of slide numbers to sample from.
        folder: Directory containing a `training_ground_truth.csv` file containing the
            ground truth "tumor_score" and "molecular_score" labels for each slide.
        train_frac: Fraction of the data to assign to the training set, with `1-frac`
            assigned to the validation set.
        add_row_indices: Boolean for whether or not to prepend an index column containing
            the row index for use downstream by SystemML. The column name will be "__INDEX".

    Returns:
        A pair of Spark DataFrames (train, val) in which each row contains the slide
        number, tumor score, molecular score, and the sample stretched out into a Vector.
    """
    # Create DataFrame of labels for the given slide numbers.
    labels_df = get_labels_df(folder)
    labels_df = labels_df.loc[slide_nums]

    # Randomly split slides 80%/20% into train and validation sets.
    train_nums_df = labels_df.sample(frac=train_frac, random_state=seed)
    val_nums_df = labels_df.drop(train_nums_df.index)

    train_nums = (spark.createDataFrame(train_nums_df)
                       .selectExpr("cast(slide_num as int)")
                       .coalesce(1))
    val_nums = (spark.createDataFrame(val_nums_df)
                     .selectExpr("cast(slide_num as int)")
                     .coalesce(1))

    # Note: Explicitly mark the smaller DataFrames as able to be broadcasted
    # in order to have Catalyst choose the more efficient BroadcastHashJoin,
    # rather than the costly SortMergeJoin.
    train = df.join(F.broadcast(train_nums), on="slide_num")
    val = df.join(F.broadcast(val_nums), on="slide_num")

    if debug:
        # DEBUG: Sanity checks.
        assert len(pd.merge(train_nums_df, val_nums_df, on="slide_num")) == 0
        assert train_nums.join(val_nums, on="slide_num").count() == 0
        assert train.join(val, on="slide_num").count() == 0
        # - Check distributions.
        for pdf in train_nums_df, val_nums_df:
            print(pdf.count())
            print(pdf["tumor_score"].value_counts(sort=False))
            print(pdf["tumor_score"].value_counts(normalize=True, sort=False), "\n")
        # - Check total number of examples in each.
        print(train.count(), val.count())
        # - Check physical plans for broadcast join.
        print(train.explain(), val.explain())

    # Add row indices for use with SystemML.
    if add_row_indices:
        train = (train.rdd
                      .zipWithIndex()
                      .map(lambda r: (r[1] + 1, *r[0]))  # flatten & convert index to 1-based indexing
                      .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))
        train = train.select(train["__INDEX"].astype("int"), train.slide_num.astype("int"),
                             train.tumor_score.astype("int"), train.molecular_score, train["sample"])
        val = (val.rdd
                  .zipWithIndex()
                  .map(lambda r: (r[1] + 1, *r[0]))  # flatten & convert index to 1-based indexing
                  .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))
        val = val.select(val["__INDEX"].astype("int"), val.slide_num.astype("int"),
                         val.tumor_score.astype("int"), val.molecular_score, val["sample"])

    return train, val
def spark_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark chronological splitter.

    This function splits data in a chronological manner. That is, for each user / item, the
    split function takes proportions of ratings which is specified by the split ratio(s).
    The split is stratified.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to
            filter with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    if min_rating > 1:
        data = min_rating_filter_spark(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.partitionBy(split_by_column).orderBy(col(col_timestamp))

    rating_grouped = (
        data.groupBy(split_by_column)
        .agg({col_timestamp: "count"})
        .withColumnRenamed("count(" + col_timestamp + ")", "count")
    )
    rating_all = data.join(broadcast(rating_grouped), on=split_by_column)

    rating_rank = rating_all.withColumn(
        "rank", row_number().over(window_spec) / col("count")
    )

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i])
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            )
        splits.append(rating_split)

    return splits
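# Editor's sketch (not from the original sources): spark_chrono_split on a toy frame,
# passing explicit column names in place of the module's DEFAULT_* constants and
# assuming its module-level imports and helpers (np, Window, col, row_number, broadcast,
# process_split_ratio, min_rating_filter_spark) are available.
def example_spark_chrono_split():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    ratings = spark.createDataFrame(
        [(1, 10, 4.0, 1), (1, 11, 3.0, 2), (1, 12, 5.0, 3), (1, 13, 2.0, 4)],
        ["userID", "itemID", "rating", "timestamp"])
    train, test = spark_chrono_split(ratings, ratio=0.75,
                                     col_user="userID", col_item="itemID",
                                     col_timestamp="timestamp")
    train.show()  # the three earliest ratings of the user
    test.show()   # the single latest rating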