Example #1
    def test_functions_broadcast(self):
        from pyspark.sql.functions import broadcast

        df1 = self.spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value"))
        df2 = self.spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value"))

        # equijoin - should be converted into broadcast join
        plan1 = df1.join(broadcast(df2), "key")._jdf.queryExecution().executedPlan()
        self.assertEqual(1, plan1.toString().count("BroadcastHashJoin"))

        # no join key -- should not be a broadcast join
        plan2 = df1.crossJoin(broadcast(df2))._jdf.queryExecution().executedPlan()
        self.assertEqual(0, plan2.toString().count("BroadcastHashJoin"))

        # planner should not crash without a join
        broadcast(df1)._jdf.queryExecution().executedPlan()
Example #2
def filter_on_case_size(df, case_id_glue="case:concept:name", min_case_size=2, max_case_size=None):
    """Filters the Spark dataframe keeping only traces with at least the specified number of events
    """

    size_df = df.groupBy(case_id_glue).count()
    if max_case_size:
        size_df = size_df.filter((size_df["count"] >= min_case_size) & (size_df["count"] <= max_case_size))
    else:
        size_df = size_df.filter(size_df["count"] >= min_case_size)
    return df.join(F.broadcast(size_df), case_id_glue).drop("count")
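A minimal usage sketch of filter_on_case_size follows. It is an assumption-laden illustration, not part of the original snippet: the SparkSession, the tiny in-memory event log, and the "concept:name" column are made up for demonstration.

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# hypothetical event log: any DataFrame with a case id column works
log_df = spark.createDataFrame(
    [("c1", "register"), ("c1", "review"), ("c1", "pay"),
     ("c2", "register")],
    ["case:concept:name", "concept:name"])

# keep only cases that contain at least 2 events (case "c2" is dropped)
filtered_df = filter_on_case_size(log_df, min_case_size=2)
filtered_df.show()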
Example #3
    def transform_author_dataset(self):
        logging.debug("Inside transform author dataset module")
        author_df = \
            self._spark.read.csv(self._load_path + '/author.csv', header=True, mode='PERMISSIVE', inferSchema=True)

        author_lookup_df = author_df.groupBy('author_id')\
                            .agg(fn.max('record_create_timestamp').alias('record_create_timestamp'))
        author_lookup_df.persist()
        # fn.broadcast returns a new DataFrame; assign it back so the hint is applied in the join below
        author_lookup_df = fn.broadcast(author_lookup_df)

        deduped_author_df = author_df\
                            .join(author_lookup_df, ['author_id', 'record_create_timestamp'], how='inner')\
                            .select(author_df.columns) \
                            .withColumn('name', goodreads_udf.remove_extra_spaces('name'))

        logging.debug(f"Attempting to write data to {self._save_path + '/authors/'}")
        deduped_author_df\
            .repartition(10)\
            .write\
            .csv(path = self._save_path + '/authors/', sep = '|', mode='overwrite', compression='gzip', header=True, timestampFormat = 'yyyy-MM-dd HH:mm:ss.SSS', quote = '"', escape = '"')
Example #4
    def test_functions_broadcast(self):
        from pyspark.sql.functions import broadcast

        df1 = self.spark.createDataFrame([(1, "1"), (2, "2")],
                                         ("key", "value"))
        df2 = self.spark.createDataFrame([(1, "1"), (2, "2")],
                                         ("key", "value"))

        # equijoin - should be converted into broadcast join
        plan1 = df1.join(broadcast(df2),
                         "key")._jdf.queryExecution().executedPlan()
        self.assertEqual(1, plan1.toString().count("BroadcastHashJoin"))

        # no join key -- should not be a broadcast join
        plan2 = df1.crossJoin(
            broadcast(df2))._jdf.queryExecution().executedPlan()
        self.assertEqual(0, plan2.toString().count("BroadcastHashJoin"))

        # planner should not crash without a join
        broadcast(df1)._jdf.queryExecution().executedPlan()
Example #5
    def transform_books_dataset(self):
        logging.debug("Inside transform books dataset module")
        books_df = self._spark.read.csv(self._load_path + '/book.csv', header=True, mode='PERMISSIVE',
                                  inferSchema=True, quote="\"", escape="\"")

        books_lookup_df = books_df\
                            .groupBy('book_id')\
                            .agg(fn.max('record_create_timestamp').alias('record_create_timestamp'))
        books_lookup_df.persist()
        # fn.broadcast returns a new DataFrame; assign it back so the hint is applied in the join below
        books_lookup_df = fn.broadcast(books_lookup_df)

        deduped_books_df = books_df\
                           .join(books_lookup_df, ['book_id', 'record_create_timestamp'], how='inner')\
                           .select(books_df.columns)

        logging.debug(f"Attempting to write data to {self._save_path + '/books/'}")
        deduped_books_df\
            .repartition(10)\
            .write\
            .csv(path = self._save_path + '/books/', sep = '|', mode='overwrite', compression='gzip', header=True, timestampFormat = 'yyyy-MM-dd HH:mm:ss.SSS', quote = '"', escape = '"')
Example #6
def initial_centroids(next_selected_cent, data_input, i):
    if i == k - 1:
        # cross join the broadcast candidate centroid onto every data row
        data_cent6 = data_input.join(broadcast(next_selected_cent))

        # distance (squaree_spark1 UDF) between the point coordinates and the joined centroid coordinates
        data_cent7 = data_cent6.withColumn(str(i), squaree_spark1(data_cent6.columns[0], data_cent6.columns[1],
                                                                  data_cent6.columns[k+2], data_cent6.columns[k+3]))

        # drop the running minimum distance and the joined centroid columns
        data_cent8 = data_cent7.drop('mindist').drop(data_cent7.columns[k+2]).drop(data_cent7.columns[k+3])

        return data_cent8

    else:
        # cross join the broadcast candidate centroid onto every data row
        data_cent6 = data_input.join(broadcast(next_selected_cent))

        # distance (squaree_spark1 UDF) between the point coordinates and the joined centroid coordinates
        data_cent7 = data_cent6.withColumn(str(i), squaree_spark1(data_cent6.columns[0], data_cent6.columns[1],
                                                                  data_cent6.columns[i+3], data_cent6.columns[i+4]))

        # drop the joined centroid columns
        data_cent8 = data_cent7.drop(data_cent7.columns[i+3]).drop(data_cent7.columns[i+4])

        # keep the smaller of the new distance and the running minimum distance
        data_cent9 = data_cent8.withColumn('mindist1', least(data_cent8.columns[i+3], col('mindist')))

        data_cent10 = data_cent9.drop('mindist')
        data_cent12 = data_cent10.withColumnRenamed('mindist1', 'mindist')
        data_cent13 = data_cent12.repartition(2001)

        # the row with the largest minimum distance becomes the next centroid
        next_cent_cache = data_cent13.orderBy(desc('mindist')).limit(1).cache()
        next_cent = next_cent_cache.select(data_cent12.columns[0:2])

        return next_cent, data_cent12
Example #7
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    comments = spark.read.json(in_directory, schema=schema)
    comments_cache = comments.cache()

    # TODO
    average_score = comments_cache.groupBy("subreddit").mean()
    average_score = average_score.filter(average_score["avg(score)"] > 0)
    average_score = functions.broadcast(average_score)

    join_average_score = average_score.join(
        comments_cache,
        on=(average_score['subreddit'] == comments_cache['subreddit'])).drop(
            average_score['subreddit'])
    join_average_score_cache = join_average_score.cache()

    relative_score = join_average_score_cache.select(
        join_average_score_cache['subreddit'],
        join_average_score_cache['author'],
        (join_average_score_cache['score'] /
         join_average_score_cache['avg(score)']).alias("relative_score"))
    relative_score_cache = relative_score.cache()

    max_relative = relative_score_cache.groupby('subreddit').max(
        'relative_score')
    max_relative = functions.broadcast(max_relative)

    best_comment = max_relative.join(
        relative_score_cache,
        on=(max_relative['max(relative_score)'] ==
            relative_score_cache['relative_score'])).drop(
                max_relative['max(relative_score)']).drop(
                    max_relative['subreddit'])
    best_comment_cache = best_comment.cache()

    best_author = best_comment_cache.sort(best_comment_cache['subreddit'])
    best_author.show()

    best_author.write.json(out_directory, mode='overwrite')
Example #8
def run_pagerank(graph, communities, outputs, path, maxiter=10):
    # Run PageRank
    pageRank = graph.pageRank(resetProbability=0.15, maxIter=maxiter)
    # Organize communities based on page rankings and weights
    topTenRankings = pageRank.vertices.select("id", "pagerank").orderBy(
        "pagerank", ascending=False).limit(10)
    topTenRankings = functions.broadcast(topTenRankings)
    getRankingInfo = communities.join(
        topTenRankings, communities.id == topTenRankings.id).drop(
            topTenRankings.id).orderBy("pagerank", ascending=False)
    getRankingInfo.write.csv(path + '/rankings-' + outputs, mode='overwrite')
    return pageRank
Example #9
def make_genre_map():
    """
    Make mapping of genre wikidata_id value to human-readable label.
    """
    wd = spark.read.parquet(sys.argv[1])
    label_map = wd.filter(wd.label.isNotNull()).select(wd['id'].alias('wikidata_id'), wd['label'])
    genres = wd.select(functions.explode(wd['genre']).alias('wikidata_id')).distinct()
    genres = functions.broadcast(genres) # only a few thousand values that we want to keep
    genres = genres.join(label_map, on='wikidata_id')
    genres = genres.withColumnRenamed('label', 'genre_label')
    # output is about <1MB compressed: safe to .coalesce().
    genres.coalesce(1).write.json('./genres', mode='overwrite', compression='gzip')
Example #10
def ar_coefficient_spark(spark, df, param):
    # We convert the params into a pandas dataframe to apply a groupBy
    # and to convert a part of the dataframe to a pyspark dataframe
    df_param = pd.DataFrame(param)
    dict_params = df_param.groupby('k')['coeff'].apply(list).to_dict()
    df_k = spark.createDataFrame(df_param[['k']].drop_duplicates())
    # We will apply a pandas udf to each partition which is composed of the full time series
    # multiplied k times because for each k we will compute AR(k) in parallel in the cluster
    df_ts_k = df.crossJoin(F.broadcast(df_k))
    df_value_ar = df_ts_k.groupBy('k').apply(computeARk_generator(dict_params))
    return df_value_ar.rdd.map(lambda x: ("coeff_{}__k_{}".format(
        x.coeff, x.k), x.value_ar)).collect()
Example #11
    def join_df(self, events_df, mentions_df):
        '''
        This function joins the mentions data with the events data using Spark's broadcast join.
        :param events_df: filtered events data frame.
        :param mentions_df: filtered mentions data frame.
        :return: joined data frame including GlobalEventId, Goldstein score and average confidence.
        '''

        # Broadcast events data frame as it's smaller in size and perform join.
        final_df = mentions_df.join(broadcast(events_df), 'GLOBALEVENTID')

        return final_df
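For illustration, the same broadcast-join pattern used by join_df can be exercised standalone; the sketch below is an assumption: the SparkSession, the tiny DataFrames, and every column name other than GLOBALEVENTID are made up.

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

spark = SparkSession.builder.getOrCreate()
# hypothetical filtered inputs; both carry a GLOBALEVENTID key
events_df = spark.createDataFrame([(1, 2.5, 60.0), (2, -1.0, 40.0)],
                                  ["GLOBALEVENTID", "GoldsteinScale", "AvgConfidence"])
mentions_df = spark.createDataFrame([(1, "news-a"), (1, "news-b"), (2, "news-c")],
                                    ["GLOBALEVENTID", "MentionSource"])

# broadcast the smaller events frame, as in join_df above
final_df = mentions_df.join(broadcast(events_df), 'GLOBALEVENTID')
final_df.explain()  # the plan should typically show a BroadcastHashJoin on GLOBALEVENTID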
Example #12
def normalizeFeatures(df, cols):
    """ Normalize each feature passed in the list cols (subtract the mean, divide by the population stddev)"""
    allCols = df.columns
    # remove the cols to be normalized, keeping the columns that pass through unchanged
    _ = [allCols.remove(x) for x in cols]
    # calculate the avg and stddev of the features to normalize
    stats = (df.groupBy().agg(*([stddev_pop(x).alias(x + '_stddev') for x in cols] + [avg(x).alias(x + '_avg') for x in cols])))
    # broadcast the single-row stats and join into the current DF
    df = df.join(broadcast(stats))
    # normalize the columns and select the required columns for the final DF
    exprs = [x for x in allCols] + [((df[x] - df[x + '_avg']) / df[x + '_stddev']).alias(x) for x in cols]
    return df.select(*exprs)
Example #13
def top_k_rankingmetrics(dataset=None,
                         k=10,
                         ranking_metrics="precisionAt",
                         user="******",
                         item="book_id",
                         rating="rating",
                         prediction="prediction"):
    '''
    Compute ranking metrics from the predictions.
    Input:
    1. k: only evaluate the performance of the top k items
    2. ranking_metrics: precisionAt, meanAveragePrecision, ndcgAt
    3. user, item, rating, prediction: column names; string type

    refer to https://vinta.ws/code/spark-ml-cookbook-pyspark.html
    '''
    if dataset is None:
        print("Error! Please specify a dataset.")
        return
    # prediction table
    windowSpec = Window.partitionBy(user).orderBy(col(prediction).desc())
    perUserPredictedItemsDF = dataset \
     .select(user, item, prediction, F.rank().over(windowSpec).alias('rank')) \
     .where('rank <= {}'.format(k)) \
     .groupBy(user) \
     .agg(expr('collect_list({}) as items'.format(item)))
    # actual target table
    windowSpec = Window.partitionBy(user).orderBy(col(rating).desc())
    perUserActualItemsDF = dataset \
     .select(user, item, rating, F.rank().over(windowSpec).alias('rank')) \
     .where('rank <= {}'.format(k)) \
     .groupBy(user) \
     .agg(expr('collect_list({}) as items'.format(item)))
    # join
    perUserItemsRDD = perUserPredictedItemsDF \
     .join(F.broadcast(perUserActualItemsDF), user, 'inner') \
     .rdd \
     .map(lambda row: (row[1], row[2]))
    ranking_metrics_evaluator = RankingMetrics(perUserItemsRDD)
    # get the result of the metric
    if ranking_metrics == "precisionAt":
        precision_at_k = ranking_metrics_evaluator.precisionAt(k)
        #print("precisionAt: {}".format(round(precision_at_k, 4)))
        return precision_at_k
    elif ranking_metrics == "meanAveragePrecision":
        # meanAveragePrecision is a property of RankingMetrics, not a method
        mean_avg_precision = ranking_metrics_evaluator.meanAveragePrecision
        #print("meanAveragePrecision: {}".format(round(mean_avg_precision, 4)))
        return mean_avg_precision
    elif ranking_metrics == "ndcgAt":
        ndcg_at_k = ranking_metrics_evaluator.ndcgAt(k)
        #print("meanAveragePrecision: {}".format(round(ndcg_at_k, 4)))
        return ndcg_at_k
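A usage sketch for top_k_rankingmetrics; it assumes the module-level imports used above (Window, col, F, expr, RankingMetrics) are available, and the tiny predictions DataFrame and its user_id column name are made up (the user column is passed explicitly because the default in the signature above is redacted).

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# hypothetical prediction output (e.g. from ALSModel.transform on a validation set)
predictions = spark.createDataFrame(
    [(1, 10, 4.0, 3.9), (1, 11, 3.0, 3.1), (1, 12, 5.0, 2.5),
     (2, 10, 2.0, 2.2), (2, 11, 4.0, 3.8)],
    ["user_id", "book_id", "rating", "prediction"])

precision_at_10 = top_k_rankingmetrics(dataset=predictions, k=10,
                                       ranking_metrics="precisionAt", user="user_id")
print(precision_at_10)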
Example #14
def main():
  sc = SparkContext(conf=SparkConf().setAppName("se"))
  spark = SparkSession.builder.appName("se").getOrCreate()
  #read in the links parquet file
  links = spark.read.load("s3a://xmlparq/pr_se_links.parquet")
  #read in the posts parquet file
  posts = spark.read.load("s3a://xmlparq/posts.parquet")
  #filter the questions:
  questions = posts.filter((f.col('PostTypeId')==1)).filter((f.col('AcceptedAnswerId').isNotNull()))
  questions_subset = questions.select('Id','AcceptedAnswerId','Tags','CreationDate', 'Community')
  #filter the answers in another dataframe
  answers = posts.filter((f.col('PostTypeId')==2))
  #rename the answer dataframe columns
  answers_subset = answers.select("Id","CreationDate","Community")
  new_names = ['AnsId', 'AnsCreationDate','AnsCommunity']
  answers_subset = answers_subset.toDF(*new_names)
  #perform a join on the questions df and the answer df based on the common answer id
  qa_deets = questions_subset.join(answers_subset, (answers_subset.AnsCommunity == questions_subset.Community) & (questions_subset.AcceptedAnswerId == answers_subset.AnsId))
 
  timeFmt = "yyyy-MM-dd' 'HH:mm:ss.SSS"
  timeDiff = (f.unix_timestamp('AnsCreationDate', format=timeFmt)
            - f.unix_timestamp('CreationDate', format=timeFmt))
  #divide the duration (in seconds) by 60 to convert it to minutes
  qa_deets = qa_deets.withColumn("Duration", timeDiff/60)

  qa_deets_subset = qa_deets.select("Id",  "Tags","CreationDate","Community", "Duration")
  #filter off questions which have no answers
  questions_null = posts.filter((f.col('PostTypeId')==1)).filter((f.col('AcceptedAnswerId').isNull()))
  questions_null_subset = questions_null.select('Id','Tags','CreationDate','Community')
  questions_null_duration = questions_null_subset.withColumn('Duration', lit(None).cast(DoubleType()))
  #combine all questions; questions with answers and question with no answers
  all_questions = qa_deets_subset.union(questions_null_duration)
  all_questions = all_questions.withColumn('post_create_date',all_questions['CreationDate'].cast('date'))
  all_questions = all_questions.withColumnRenamed('id','COMMUNITY_QUESTION_ID')
  all_questions_subset = all_questions.select('COMMUNITY_QUESTION_ID','Community','Tags', 'post_create_date')

  links  = links.withColumnRenamed("community","lcommunity")
  """ perform a join based on community and the question id to combine the pagerank score and
      response time duration in one dataframe
  """
  cred_tags = all_questions.join(broadcast(links), (links.id == all_questions.COMMUNITY_QUESTION_ID) & (all_questions.Community == links.lcommunity), "left_outer")
  #rename columns as per postgresql schema, round off values and write to the database:
  total_df = cred_tags.select("COMMUNITY_QUESTION_ID","Community","post_create_date","Tags","Duration","cred_score")
  total_df  = total_df.withColumn("duration",f.round(total_df["Duration"],2))
  total_df  = total_df.withColumn("pr_score",f.round(total_df["cred_score"],3))
  total_df  = total_df.withColumnRenamed("COMMUNITY_QUESTION_ID","qid")
  total_df  = total_df.withColumnRenamed("Community","community")
  total_df  =  total_df.withColumnRenamed("Tags","tags")
  total_df  = total_df.withColumnRenamed("post_create_date","create_date")
  total_df_reqd = total_df.select("qid","tags","community","duration","create_date","pr_score")
  total_df_reqd.write.format("jdbc").mode("append") .option("url", "jdbc:postgresql://hostname/ls?user=xxx&password=xxx").option("dbtable", "questions").option("user", "postgres").option("password", "xxx").save()
  spark.catalog.clearCache()
Example #15
def apply_numeric(df, int1, int2, parameters=None):
    """Filters the Spark dataframe on attribute values (filter cases)
    """

    if parameters is None:
        parameters = {}
    attribute_key = parameters[
        PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY
    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    positive = parameters["positive"] if "positive" in parameters else True

    df_filtered = df.filter(df[attribute_key].between(int1, int2))
    df_filtered = df_filtered.groupBy(case_id_glue).count()
    #filtered_index = df_filtered.select(case_id_glue).rdd.map(lambda x: x[0]).collect()
    if positive:
        return df.join(F.broadcast(df_filtered), case_id_glue).drop("count")
    else:
        df_left_joined = df.join(F.broadcast(df_filtered), case_id_glue,
                                 "left")
        return df_left_joined.filter(
            df_left_joined["count"].isNull()).drop("count")
Example #16
def filter_on_case_performance(df, case_id_glue="case:concept:name", timestamp_key="time:timestamp",
                               min_case_performance=0, max_case_performance=10000000000):
    """Filters the Spark dataframe on case performance
    """

    grouped_df = df.groupby(case_id_glue)
    start_end_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key), F.max(timestamp_key).alias(timestamp_key+"_1"))

    start_end_df = start_end_df.withColumn("caseDuration", F.unix_timestamp(start_end_df[timestamp_key+"_1"]) - F.unix_timestamp(start_end_df[timestamp_key]))
    start_end_df = start_end_df.filter((start_end_df["caseDuration"] > min_case_performance) & (start_end_df["caseDuration"] < max_case_performance))\
                               .select(case_id_glue)

    return df.join(F.broadcast(start_end_df), case_id_glue)
Example #17
def filter_on_ncases(df, case_id_glue="case:concept:name", max_no_cases=1000):
    """Filters the Spark dataframe keeping only the specified maximum number of traces
    """

    # With conversion to RDD.
    #cases_to_keep = df.select(case_id_glue).distinct().rdd.map(lambda row : row[0]).collect()
    #cases_to_keep = cases_to_keep[0:min(len(cases_to_keep), max_no_cases)]
    #return df.filter(df[case_id_glue].isin(cases_to_keep))

    #Without conversion to RDD (better).
    grouped_df = df.groupBy(case_id_glue).count().limit(max_no_cases).drop("count")

    return df.join(F.broadcast(grouped_df), case_id_glue)
Example #18
def filter_df_on_end_activities(df,
                                values,
                                timestamp_key=DEFAULT_TIMESTAMP_KEY,
                                case_id_glue=CASE_CONCEPT_NAME,
                                activity_key=DEFAULT_NAME_KEY,
                                grouped_df=None,
                                positive=True):
    """Filters the Spark dataframe on end activities
    """

    if grouped_df is None:
        grouped_df = df.groupby(case_id_glue)

    grouped_df = grouped_df.agg(
        F.last(activity_key).alias(activity_key + "_1"))
    df_end = grouped_df.filter(grouped_df[activity_key + "_1"].isin(values))

    if positive:
        return df.join(F.broadcast(df_end),
                       grouped_df.columns[0]).drop(activity_key + "_1")
    else:
        return df.join(F.broadcast(df_end), grouped_df.columns[0], "leftanti")
Example #19
def main(in_directory, out_directory):
    comments = spark.read.json(in_directory, schema=schema)

    comments.cache()
    average = comments.groupBy('subreddit').avg('score')
    average = average.withColumnRenamed('avg(score)', 'avg_score')
    average = average.filter(average['avg_score'] > 0)
    average = functions.broadcast(average)
    # average.show()
    joined_comments = comments.join(
        average,
        average.subreddit == comments.subreddit).drop(comments.subreddit)
    joined_comments = joined_comments.withColumn(
        'relative_score',
        joined_comments['score'] / joined_comments['avg_score'])
    joined_comments.cache()

    max_score_comments = joined_comments.groupBy('subreddit').max(
        'relative_score')
    max_score_comments = max_score_comments.withColumnRenamed(
        'max(relative_score)', 'max_relative_score')
    max_score_comments = functions.broadcast(max_score_comments)

    result_table = joined_comments.join(
        max_score_comments,
        (joined_comments.subreddit == max_score_comments.subreddit) &
        (joined_comments.relative_score
         == max_score_comments.max_relative_score)).drop(
             joined_comments.subreddit)
    # result_table = result_table.drop('avg_score', 'score', 'max_relative_score')

    result_table = result_table.select('subreddit', 'author', 'relative_score')
    result_table = result_table.withColumnRenamed('relative_score',
                                                  'rel_score')
    # result_table.show()

    # TODO

    result_table.write.json(out_directory, mode='overwrite')
Example #20
def main():
    sparkSession = SparkSession\
        .builder\
        .appName('UDF and Broadcast')\
        .getOrCreate()

    sparkSession.sparkContext.setLogLevel('ERROR')

    # define schema
    # Date,Open,High,Low,Close,Adj Close,Volume,Name
    stockSchema = StructType([
        StructField('Date', DateType(), True),
        StructField('Open', DoubleType(), True),
        StructField('High', DoubleType(), True),
        StructField('Low', DoubleType(), True),
        StructField('Close', DoubleType(), True),
        StructField('Adj Close', DoubleType(), True),
        StructField('Volume', LongType(), True),
        StructField('Name', StringType(), True)
    ])

    stocks = sparkSession\
        .readStream.option('header','true')\
        .schema(stockSchema).csv('./data')

    print('Is streaming:', stocks.isStreaming)
    stocks.printSchema()

    # User Function
    def daily_price_delta(open_price, close_price):
        return close_price - open_price

    # Registering UDF
    # sparkSession.udf.register('calculated_price_delta_udf',daily_price_delta, DoubleType())
    calculated_price_delta_udf = udf(daily_price_delta, DoubleType())

    # Broadcasting
    price_delta_broadcast_df = broadcast(stocks.withColumn(
        'PriceDelta', calculated_price_delta_udf(stocks.Open, stocks.Close)))

    price_delta_df = price_delta_broadcast_df.select("Date", "Name", "PriceDelta")\
        .where("PriceDelta > 15")


    query = price_delta_df\
        .writeStream\
        .outputMode("append")\
        .format("console")\
        .option("truncate",'false')\
        .option('numRows', 30)\
        .start().awaitTermination()
Example #21
def dummy_run(spark):

    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    import pyspark.sql.functions as F
    from pyspark.sql.functions import expr

    train=spark.createDataFrame(
    [
        (82, 124, 5.0),
        (64, 123, 4.0),
        (27, 122, 3.0),
        (25, 122, 1.0),
        (12, 124, 2.0)
    ],
    ['user_id', 'book_id', 'rating'] 
    )

    val=spark.createDataFrame(
    [
        (82, 123, 5.0),
        (64, 122, 4.0),
        (27, 124, 3.0),
        (64, 123, 2.0),
        (12, 122, 4.0)
    ],
    ['user_id', 'book_id', 'rating'] 
    )

    user_id = val.select('user_id').distinct()
    true_label = val.select('user_id', 'book_id')\
                .groupBy('user_id')\
                .agg(expr('collect_list(book_id) as true_item'))

    als = ALS(rank = 3 , regParam=0.1, 
                userCol="user_id", itemCol="book_id", ratingCol='rating', 
                implicitPrefs=False, coldStartStrategy="drop")
    model = als.fit(train)

    recs = model.recommendForUserSubset(user_id, 2)
    pred_labels = recs.select('user_id','recommendations.book_id')
    pred_true_rdd = pred_labels.join(F.broadcast(true_label), 'user_id', 'inner') \
                .rdd \
                .map(lambda row: (row[1], row[2]))
    
    metrics = RankingMetrics(pred_true_rdd)
    mean_ap = metrics.meanAveragePrecision
    ndcg_at_k = metrics.ndcgAt(2)
    p_at_k= metrics.precisionAt(2)
    print('MAP: ', mean_ap , 'NDCG: ', ndcg_at_k, 'Precision at k: ', p_at_k)
    return 
Example #22
def addCategoryData(entityUserDF, attr, catFreqInfo):
    print "loading category info data for attr: %s" % attr
    #global freq table
    _freqTbl = F.broadcast(
        sqlContext.createDataFrame([(k, v)
                                    for k, v in catFreqInfo[attr].items()],
                                   [attr, "p"]))

    # a table containing (entity, attr, sum(projWt))
    attrInfoTbl = entityUserDF.filter(F.length(F.col(attr)) > 0) \
                    .groupby(entity, attr) \
                    .agg(F.sum(weightCol).alias('wt')) \
                    .cache()

    # sum of projWt for each entity
    totalsTbl = attrInfoTbl.groupby(entity).agg(F.sum('wt').alias('totWt'))
    totalsTbl = F.broadcast(totalsTbl)

    attrInfoTbl = attrInfoTbl.join(totalsTbl, entity)
    # calculate the expected mean (mn) and standard deviation (sd) of the weight
    attrInfoTbl = attrInfoTbl.join(_freqTbl, attr) \
                    .select(entity, attr, 'wt',
                            (totalsTbl['totWt'] * _freqTbl["p"]).alias("mn"),
                            F.sqrt((totalsTbl['totWt'] * _freqTbl["p"]) * (1 - _freqTbl["p"])).alias("sd"))  # sd = sqrt(mn * (1 - p))

    # filter common outliers
    attrInfoTbl = attrInfoTbl.select(
        entity, attr, (((F.col("wt") - F.col("mn")) / F.col("sd")) >
                       2.0).alias('shouldInclude'))
    attrInfoTbl = attrInfoTbl.filter(attrInfoTbl['shouldInclude']).drop(
        F.col("shouldInclude"))

    #collect categories and concat them
    attrInfoTbl = attrInfoTbl.groupby(entity) \
                    .agg(F.collect_list(F.col(attr)).alias(attr)) \
                    .select(entity, F.concat_ws("|", F.col(attr)).alias("common_" + attr))

    return attrInfoTbl
Example #23
def main(in_directory, out_directory):
    comments = spark.read.json(in_directory, schema=schema).cache()

    # TODO
    # get average subreddit score
    average = comments.groupby('subreddit').avg().cache()
    # filter average score > 0
    average = average.filter(average['avg(score)'] > 0)

    # merge with original table
    # average = average.join(comments,'subreddit')

    # merge with original table (with broadcast)
    average = functions.broadcast(average)
    average = average.join(comments, 'subreddit')

    # add column 'relative_score'
    average = average.withColumn('relative_score',
                                 average['score'] / average['avg(score)'])
    # get max score
    average = average.groupby('subreddit').max().cache()

    # join with original table
    # average = average.join(comments, 'subreddit')

    # join with original table (with broadcast)
    average = functions.broadcast(average)
    average = average.join(comments, 'subreddit')

    # filter tuple with max score
    average = average.filter(average['score'] == average['max(score)'])

    best_author = average.select(
        average['subreddit'], average['author'],
        average['max(relative_score)'].alias('rel_score'))
    # best_author = max_by_subreddit.join(functions.broadcast(average), 'subreddit', 'inner')

    best_author.write.json(out_directory, mode='overwrite')
Example #24
    def compute_individual_score(self, aux, df_records):
        """
        Compute scoreboard of auxiliary information aux inside record df_records.
        Both must be spark dataframes.
        Returns a spark dataframe.
        """

        merged = broadcast(prepare_join(aux, '_1', True)).crossJoin(
            prepare_join(df_records, '_2', True))

        merged = merged.withColumn('similarity', self.similarity_func(merged))
        #merged = merged.withColumn('value', merged.wt * merged.similarity)
        merged = merged.withColumn('value', merged.similarity)
        return merged
Example #25
    def filter_too_sparse_IDs(self):
        """
        keeping just those tracks which have > `timestamps_per_hour` points per hour on average

        :return: filtered `self.df`
        """

        ids = self.df.groupBy(['id', F.to_date(F.col('ts'))]) \
            .count() \
            .filter(F.col('count') > timestamps_per_hour * 24) \
            .select('id') \
            .distinct()

        self.df = self.df.join(F.broadcast(ids), ['id'], how='inner')
Example #26
def main(in_directory, out_directory):
    comments = spark.read.json(in_directory, schema=schema).cache()
    grouped = comments.groupby(comments['subreddit'])
    averages = grouped.agg(functions.avg(comments['score']))
    averages = averages.filter(averages['avg(score)'] > 0)

    comments = comments.join(
        functions.broadcast(averages),
        comments['subreddit'] == averages['subreddit']).drop(
            averages['subreddit'])
    comments = comments.withColumn("rel_score",
                                   comments['score'] / comments['avg(score)'])
    grouped = comments.groupby(comments['subreddit'])

    maxes = grouped.agg(functions.max(comments['rel_score']))

    comments = comments.join(functions.broadcast(maxes),
                             comments['subreddit'] == maxes['subreddit']).drop(
                                 maxes['subreddit'])
    max_scores = comments.filter(
        comments['rel_score'] == comments['max(rel_score)'])
    best_author = max_scores.select(['subreddit', 'author', 'rel_score'])
    best_author.write.json(out_directory, mode='overwrite')
Example #27
def df_sql_approach():
    print('DF SQL APPROACH ----------------')
    print('Reading in files...')
    pw_schema = StructType([
        StructField("hashedpw", StringType(), False),
        StructField("h", IntegerType(), True)
    ])

    pw_df = sqlc.read.csv(PWS_PATH, header=False, schema=pw_schema, sep=':')
    common_words_schema = StructType([
        StructField("word", StringType(), False)
    ])
    # common_words_df = sqlc.createDataFrame(sc.textFile(COMMONWORDS_PATH), schema=common_words_schema)
    common_words_df = sqlc.read.csv(COMMONWORDS_PATH, header=False, schema=common_words_schema, sep='|')
    with time_usage('DF SQL APPROACH'):

        # The hashes of the common words are only needed when comparing
        # Idea: Use a function during sql query that hashes the words on-demand

        sqlc.registerFunction("sha1hash", lambda x: sha1(x).hexdigest().upper())
        sqlc.registerDataFrameAsTable(pw_df, "pw_df")
        sqlc.registerDataFrameAsTable(common_words_df, "common_words_df")

        # example of how to apply the hash function within sql
        # print('Example of executing hash function within sql:')
        # print(sqlc.sql("SELECT *, sha1hash(word) as hash FROM common_words_df").take(2))

        # we add the hashed column to each word first
        with time_usage('hashing common words by adding an extra column'):
            common_words_df = common_words_df.rdd \
                .map(lambda x: (x['word'], sha1(x['word']).hexdigest().upper())) \
                .toDF(['word', 'hashedword'])

        # print("search for single word...")
        # with time_usage("search for love ❤"):
        #     j = pw_df.filter(pw_df.hashedpw == sha1('love').hexdigest().upper())
        #     j.show(10000)
        #     print("Count: " + str(j.count()))

        print("joining tables...")
        with time_usage('Joining tables on the hash [forced broadcast]'):
            j = broadcast(common_words_df).join(pw_df, common_words_df.hashedword == pw_df.hashedpw) \
                .select(common_words_df['word'], pw_df['h'])
            j.orderBy("h", ascending=False).show(100)

#        with time_usage('Joining tables on the hash [no broadcast]'):
#            j = common_words_df.join(pw_df, common_words_df.hashedword == pw_df.hashedpw) \
#                .select(common_words_df['word'], pw_df['h'])
#            j.orderBy("h", ascending=False).show(100)
        print("Count: " + str(j.count()))
Example #28
    def filter_by(self, df, columns=None):
        """

        :param df:
        :param columns:
        :return:
        """
        import os
        if not columns:
            columns = df.columns

        if os.path.isdir(self.persistent_cache_file):
            self.__cache = self.session_getter().read.format(
                self.format_).load(self.persistent_cache_file).join(
                    F.broadcast(df), on=columns,
                    how='inner').drop('a.ip').persist(self.storage_level)
        else:
            if self.__cache:
                self.__cache = self.__cache.join(
                    F.broadcast(df), on=columns,
                    how='inner').drop('a.ip').persist(self.storage_level)
            else:
                self.load_empty(self.schema)
Example #29
def top_10_addons_on_date(data, date, topN, period=7, country_list=None):
    """ Gets the number of users in the past week who have used the top N addons,
        broken down by country.

        Parameters:
        data - The main ping dataset.
        date - The day for which you want to get the top N addons.
        topN - the number of addons to get.
        period - number of days to use to calculate metric
        country_list - a list of country names in string

        Returns:
        Dataframe containing the number of users using each of the addons.
        submission_date_s3, country, addon_id, name, percent_of_active_users
    """
    addon_filter = (~col('addon.is_system')) & (~col('addon.foreign_install')) & \
        (~col('addon.addon_id').isin(NON_MOZ_TP)) & (~col('addon.addon_id').like('%@mozilla%')) &\
        (~col('addon.addon_id').like('%@shield.mozilla%')) &\
        (~col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%'))

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    wau = data_all.filter((col('submission_date_s3') > begin) &
                          (col('submission_date_s3') <= date))\
        .groupBy('country')\
        .agg(lit(date).alias('submission_date_s3'),
             F.countDistinct('client_id').alias('wau'))

    counts = data_all.select('submission_date_s3', 'country', 'client_id',
                             F.explode('active_addons').alias('addon'))\
        .filter((col('submission_date_s3') > begin) &
                (col('submission_date_s3') <= date))\
        .filter(addon_filter)\
        .select('country', 'client_id', 'addon.addon_id', 'addon.name')\
        .distinct()\
        .groupBy('country', 'addon_id')\
        .agg(F.count('*').alias('number_of_users'), F.last('name').alias('name'))\
        .select('*', lit(date).alias('submission_date_s3'),
                lit(begin).alias('start_date'),
                F.row_number().over(Window.partitionBy('country')
                                    .orderBy(desc('number_of_users'))
                                    .rowsBetween(Window.unboundedPreceding, Window.currentRow))
                              .alias('rank'))\
        .filter(col('rank') <= topN)

    return counts.join(F.broadcast(wau), on=['country'], how='left')\
        .select(lit(date).alias('submission_date_s3'), 'country',
                'addon_id', col('name').alias('addon_name'),
                (100.0 * col('number_of_users') / col('wau')).alias('pct_with_addon'))
Example #30
def assign_vids_uid(df_impressions, vid_assignment_table, demo_cols):
    select_cols = ["vid", "timestamp", "h1", "p1", "p2", "user_id"]
    if demo_cols is not None:
        df = df_impressions.join(F.broadcast(vid_assignment_table), demo_cols)
        select_cols = demo_cols + select_cols
    else:
        df = df_impressions.join(F.broadcast(vid_assignment_table))

    df_vid_impressions = (
        df.withColumn("h1", F.hash(F.col("user_id").astype("string")))
        #.withColumn("h1", F.hash("user_id"))
        .withColumn(
            "p1",
            F.col("h1") / (2**32) +
            0.5).where(F.col("p1") >= F.col("prob_>=")).where(
                F.col("p1") < F.col("prob_<")).withColumn(
                    "p2",
                    F.hash(F.col("h1").astype("string")) / (2**32) +
                    0.5).withColumn(
                        "vid",
                        (F.col("p2") * F.col("total_VID")).astype('int') +
                        F.col("start_VID")).select(*select_cols))
    return df_vid_impressions
Example #31
def get_variants_df_with_case_duration(df, parameters=None):
    """Gets variants dataframe from the Spark dataframe, with case duration that is included
    """
    if parameters is None:
        parameters = {}

    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY

    ordered_df = df.orderBy(timestamp_key).select(case_id_glue, timestamp_key,
                                                  activity_key)
    grouped_df = ordered_df.groupby(case_id_glue)
    df1 = grouped_df.agg(F.collect_list(activity_key).alias("variant"))
    df1 = df1.withColumn("variant",
                         F.concat_ws(",", "variant")).orderBy(case_id_glue)

    start_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key))
    first_eve_df = ordered_df.join(F.broadcast(start_df), start_df.columns)
    end_df = grouped_df.agg(F.max(timestamp_key).alias(timestamp_key))
    last_eve_df = ordered_df.join(F.broadcast(end_df), end_df.columns)
    last_eve_df = last_eve_df.withColumnRenamed(timestamp_key,
                                                timestamp_key + "_2")
    last_eve_df = last_eve_df.withColumnRenamed(activity_key,
                                                activity_key + "_2")

    stacked_df = first_eve_df.join(last_eve_df,
                                   case_id_glue).orderBy(case_id_glue)
    stacked_df = stacked_df.withColumn(
        "caseDuration",
        F.unix_timestamp(stacked_df[timestamp_key + "_2"]) -
        F.unix_timestamp(stacked_df[timestamp_key]))
    new_df = df1.join(stacked_df, case_id_glue)
    return new_df
Example #32
def min_rating_filter_spark(
    data,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
):
    """Filter rating DataFrame for each user with minimum rating.
    Filter rating data frame with minimum number of ratings for user/item is usually useful to
    generate a new data frame with warm user/item. The warmth is defined by min_rating argument. For
    example, a user is called warm if he has rated at least 4 items.

    Args:
        data (spark.DataFrame): DataFrame of user-item tuples. Columns of user and item
            should be present in the DataFrame while other columns like rating, 
            timestamp, etc. can be optional.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to 
            filter with min_rating.
        col_user (str): column name of user ID.
        col_item (str): column name of item ID.

    Returns:
        spark.DataFrame: DataFrame with at least columns of user and item that has been 
            filtered by the given specifications.
    """
    split_by_column, split_with_column = _check_min_rating_filter(
        filter_by, min_rating, col_user, col_item
    )
    rating_temp = (
        data.groupBy(split_by_column)
        .agg({split_with_column: "count"})
        .withColumnRenamed("count(" + split_with_column + ")", "n" + split_with_column)
        .where(col("n" + split_with_column) >= min_rating)
    )

    rating_filtered = data.join(broadcast(rating_temp), split_by_column).drop(
        "n" + split_with_column
    )
    return rating_filtered
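A usage sketch for min_rating_filter_spark; it assumes the module-level helpers and imports used above (_check_min_rating_filter, col, broadcast) are available, and the tiny ratings DataFrame with user_id / item_id columns is made up (passed explicitly instead of relying on DEFAULT_USER_COL / DEFAULT_ITEM_COL).

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [(1, 10, 4.0), (1, 11, 3.0), (1, 12, 5.0), (1, 13, 4.0), (2, 10, 2.0)],
    ["user_id", "item_id", "rating"])

# keep only "warm" users with at least 4 ratings
warm_users = min_rating_filter_spark(ratings, min_rating=4, filter_by="user",
                                     col_user="user_id", col_item="item_id")
warm_users.show()  # only user 1 remains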
Example #33
df1 = sqlContext.range(100)
df2 = sqlContext.range(100)

df1.join(df2, df1["id"] == df2["id"]).collect()

# COMMAND ----------

# MAGIC %md Look at the Spark UI for that job, and note the stage count and the shuffle.
# MAGIC 
# MAGIC To use a broadcast join, we need at least one of the following:
# MAGIC * statistics from running Hive ANALYZE on the table, and the size less than `spark.sql.autoBroadcastJoinThreshold`
# MAGIC * statistics from caching the table in Spark, and the size less than `spark.sql.autoBroadcastJoinThreshold`
# MAGIC * a broadcast hint applied to the table

# COMMAND ----------

from pyspark.sql.functions import broadcast

df1.join(broadcast(df2), df1["id"] == df2["id"]).collect()

# COMMAND ----------

df2.cache().count()
df1.join(df2, df1["id"] == df2["id"]).collect()

# COMMAND ----------

df2.unpersist()
df1.join(df2, df1["id"] == df2["id"]).collect()
Example #34
def train_val_split(spark, df, slide_nums, folder, train_frac=0.8, add_row_indices=True, seed=None,
                    debug=False):
  """
  Split a DataFrame of slide samples into training and validation sets.

  Args:
    spark: SparkSession.
    df: A Spark DataFrame in which each row contains the slide number,
      tumor score, molecular score, and the sample stretched out into
      a Vector.
    slide_nums: A list of slide numbers to sample from.
    folder: Directory containing a `training_ground_truth.csv` file
      containing the ground truth "tumor_score" and "molecular_score"
      labels for each slide.
    train_frac: Fraction of the data to assign to the training set, with
      `1-train_frac` assigned to the validation set.
    add_row_indices: Boolean for whether or not to prepend an index
      column containing the row index for use downstream by SystemML.
      The column name will be "__INDEX".
    seed: Random seed used when sampling the training slides.
    debug: Boolean for whether or not to run sanity checks on the split.

  Returns:
    A tuple (train, val) of Spark DataFrames in which each row contains the
    slide number, tumor score, molecular score, and the sample stretched out
    into a Vector.
  """
  # Create DataFrame of labels for the given slide numbers.
  labels_df = get_labels_df(folder)
  labels_df = labels_df.loc[slide_nums]

  # Randomly split slides 80%/20% into train and validation sets.
  train_nums_df = labels_df.sample(frac=train_frac, random_state=seed)
  val_nums_df = labels_df.drop(train_nums_df.index)

  train_nums = (spark.createDataFrame(train_nums_df)
                     .selectExpr("cast(slide_num as int)")
                     .coalesce(1))
  val_nums = (spark.createDataFrame(val_nums_df)
                   .selectExpr("cast(slide_num as int)")
                   .coalesce(1))

  # Note: Explicitly mark the smaller DataFrames as able to be broadcasted
  # in order to have Catalyst choose the more efficient BroadcastHashJoin,
  # rather than the costly SortMergeJoin.
  train = df.join(F.broadcast(train_nums), on="slide_num")
  val = df.join(F.broadcast(val_nums), on="slide_num")

  if debug:
    # DEBUG: Sanity checks.
    assert len(pd.merge(train_nums_df, val_nums_df, on="slide_num")) == 0
    assert train_nums.join(val_nums, on="slide_num").count() == 0
    assert train.join(val, on="slide_num").count() == 0
    #  - Check distributions.
    for pdf in train_nums_df, val_nums_df:
      print(pdf.count())
      print(pdf["tumor_score"].value_counts(sort=False))
      print(pdf["tumor_score"].value_counts(normalize=True, sort=False), "\n")
    #  - Check total number of examples in each.
    print(train.count(), val.count())
    #  - Check physical plans for broadcast join.
    print(train.explain(), val.explain())

  # Add row indices for use with SystemML.
  if add_row_indices:
    train = (train.rdd
                  .zipWithIndex()
                  .map(lambda r: (r[1] + 1, *r[0]))  # flatten & convert index to 1-based indexing
                  .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))
    train = train.select(train["__INDEX"].astype("int"), train.slide_num.astype("int"),
                         train.tumor_score.astype("int"), train.molecular_score, train["sample"])

    val = (val.rdd
              .zipWithIndex()
              .map(lambda r: (r[1] + 1, *r[0]))  # flatten & convert index to 1-based indexing
              .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))
    val = val.select(val["__INDEX"].astype("int"), val.slide_num.astype("int"),
                     val.tumor_score.astype("int"), val.molecular_score, val["sample"])

  return train, val
Example #35
def spark_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark chronological splitter
    This function splits data in a chronological manner. That is, for each user / item, the
    split function takes proportions of ratings which is specified by the split ratio(s).
    The split is stratified.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits 
            data into several portions corresponding to the split ratios. If a list is 
            provided and the ratios are not summed to 1, they will be normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to filter
            with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    if min_rating > 1:
        data = min_rating_filter_spark(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.partitionBy(split_by_column).orderBy(col(col_timestamp))

    rating_grouped = (
        data.groupBy(split_by_column)
        .agg({col_timestamp: "count"})
        .withColumnRenamed("count(" + col_timestamp + ")", "count")
    )
    rating_all = data.join(broadcast(rating_grouped), on=split_by_column)

    rating_rank = rating_all.withColumn(
        "rank", row_number().over(window_spec) / col("count")
    )

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i])
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            )

        splits.append(rating_split)

    return splits
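A usage sketch for spark_chrono_split; it assumes the module-level imports and helpers used above (process_split_ratio, min_rating_filter_spark, Window, col, row_number, broadcast, np) are available, and the tiny ratings DataFrame and its column names are made up.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [(1, 10, 4.0, 100), (1, 11, 3.0, 200), (1, 12, 5.0, 300), (1, 13, 2.0, 400),
     (2, 10, 4.0, 100), (2, 11, 3.0, 200), (2, 12, 5.0, 300), (2, 13, 2.0, 400)],
    ["user_id", "item_id", "rating", "timestamp"])

# 75/25 chronological split per user: the earliest 75% of each user's ratings go to train
train, test = spark_chrono_split(ratings, ratio=0.75, filter_by="user",
                                 col_user="user_id", col_item="item_id",
                                 col_timestamp="timestamp")
print(train.count(), test.count())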