def token_score(df, on, value):
    """Token-sort fuzzy score between the `on` column and a query string.

    Split both strings into tokens, compare the sorted intersection against
    each sorted combination, and keep the best pairwise ratio (the same idea
    as fuzzywuzzy's token_sort/token_set ratios).
    """
    df = df.select([on])
    df = df.withColumn('query', F.lit(value).cast(StringType()))

    # Tokenize both sides on commas.
    pattern = ','
    df = df.withColumn('tokens1', F.split(F.col(on), pattern))
    df = df.withColumn('tokens2', F.split(F.col('query'), pattern))

    # intersection = tokens1 ∩ tokens2
    # diff1to2 = tokens1 - tokens2 (tokens only in 1)
    # diff2to1 = tokens2 - tokens1 (tokens only in 2)
    df = df.withColumn('intersection', F.array_intersect('tokens1', 'tokens2'))
    df = df.withColumn('diff1to2', F.array_except('tokens1', 'tokens2'))
    df = df.withColumn('diff2to1', F.array_except('tokens2', 'tokens1'))

    # sorted_sect = " ".join(sorted(intersection))
    # sorted_1to2 = " ".join(sorted(diff1to2))
    # sorted_2to1 = " ".join(sorted(diff2to1))
    df = df.withColumn('sorted_sect',
                       F.concat_ws(' ', F.sort_array('intersection')))
    df = df.withColumn('sorted_1to2',
                       F.concat_ws(' ', F.sort_array('diff1to2')))
    df = df.withColumn('sorted_2to1',
                       F.concat_ws(' ', F.sort_array('diff2to1')))

    # combined_1to2 = sorted_sect + " " + sorted_1to2 (sorted string 1)
    # combined_2to1 = sorted_sect + " " + sorted_2to1 (sorted string 2)
    df = df.withColumn('combined_1to2',
                       F.concat_ws(' ', 'sorted_sect', 'sorted_1to2'))
    df = df.withColumn('combined_2to1',
                       F.concat_ws(' ', 'sorted_sect', 'sorted_2to1'))

    # strip
    for c in ['sorted_sect', 'combined_1to2', 'combined_2to1']:
        df = df.withColumn(c, F.trim(F.col(c)))

    # spark_ratio is a pairwise string-similarity UDF (see the sketch below)
    df = df.withColumn(
        'ratio1', spark_ratio(F.col('sorted_sect'), F.col('combined_1to2')))
    df = df.withColumn(
        'ratio2', spark_ratio(F.col('sorted_sect'), F.col('combined_2to1')))
    df = df.withColumn(
        'ratio3', spark_ratio(F.col('combined_1to2'), F.col('combined_2to1')))

    # Row-wise max of the pairwise ratios (greatest, not the max aggregate).
    df = df.withColumn('max_ratio', F.greatest('ratio1', 'ratio2', 'ratio3'))
    df = df.withColumnRenamed('max_ratio', 'token_fuzzy')
    df = df.select(['token_fuzzy'])
    return df
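# A minimal sketch of the `spark_ratio` helper assumed by token_score above:
# a plain UDF wrapping difflib.SequenceMatcher (the same ratio fuzzywuzzy uses
# under the hood). The name and the 0-100 return scale are assumptions, not a
# fixed API.
from difflib import SequenceMatcher

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType


@F.udf(returnType=IntegerType())
def spark_ratio(s1, s2):
    # Guard against nulls, then return the similarity as a 0-100 integer.
    if s1 is None or s2 is None:
        return 0
    return int(round(100 * SequenceMatcher(None, s1, s2).ratio()))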
def prepare_final_df(self):
    self.df = self.df.select("date", func.explode("message.updates"))
    selected = self.df.select("date", "col.startDatetime", "col.updateType",
                              "col.name", "col.finished").filter(
                                  func.col("name").startswith("["))
    grouped = selected.groupBy("name", "startDatetime").agg(
        func.sort_array(func.collect_list("date")).alias("date_array"),
        func.collect_list("updateType").alias("updateType_array"),
        func.reverse(
            func.collect_list("finished")).getItem(0).alias("finished"),
    )
    preprocessed = (grouped.withColumn(
        "start_datetime",
        grouped.startDatetime.cast(TimestampType())).withColumn(
            "last_update",
            self.helper.get_last_date_udf(grouped.date_array)).withColumn(
                "finished",
                func.col("finished").cast(BooleanType())).withColumn(
                    "meeting_name", func.col("name")).select(
                        "start_datetime", "last_update", "finished",
                        "meeting_name").withColumn(
                            "duration",
                            func.col("last_update").cast(LongType()) -
                            func.col("start_datetime").cast(LongType()),
                        ))
    preprocessed.printSchema()
    return self.do_post_preprocessing(preprocessed)
def processItemSequence1(spark, rawSampleDataPath):
    # refactor above code without using UDF
    # rating data
    customStruct = StructType([
        StructField("userId", StringType()),
        StructField("movieId", StringType()),
        StructField("rating", FloatType()),
        StructField("timestamp", IntegerType())
    ])
    ratingSamples = spark.read.format("csv").option(
        "header", "true").schema(customStruct).load(rawSampleDataPath)
    print("ratingSample and schema:")
    ratingSamples.show(5)
    ratingSamples.printSchema()
    userSeq = ratingSamples \
        .where(F.col("rating") >= 3.5) \
        .groupBy("userId") \
        .agg(F.sort_array(F.collect_list(F.struct("timestamp", "movieId"))).alias("sortedTimeAndMovieId")) \
        .withColumn("sortedMovieId", F.col("sortedTimeAndMovieId.movieId")) \
        .drop("sortedTimeAndMovieId")
    print("movieIdStr:")
    flatedMovieList = userSeq.select("sortedMovieId").rdd.flatMap(
        lambda x: x)  # same as rdd.map(lambda x: x[0])
    # flatedMovieList = userSeq.agg(F.collect_list("sortedMovieId")).rdd.flatMap(lambda x: x[0])  # this also works
    print(flatedMovieList.take(10))
    return flatedMovieList
def benchmark_extract_top_keywords(posts, n_keywords=10):
    """Given TF-IDF output (as "features" column), extracts the vocabulary
    indices of the n_keywords keywords with the highest TF-IDF (for each post)."""

    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector),
        ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector),
        ArrayType(DoubleType()))
    posts = posts.withColumn("extracted_keys",
                             extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("extracted_values",
                             extract_values_from_vector_udf("features"))
    posts = posts.withColumn(
        "zipped_truncated",
        slice(
            sort_array(arrays_zip("extracted_values", "extracted_keys"),
                       asc=False), 1, n_keywords))
    take_second = udf(lambda rows: [row[1] for row in rows],
                      ArrayType(IntegerType()))
    posts = posts.withColumn("top_indices", take_second("zipped_truncated"))
    return posts
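# Hypothetical end-to-end usage of benchmark_extract_top_keywords: build a
# small TF-IDF "features" column with pyspark.ml on toy data, then pull out
# the top vocabulary indices. The column names and data are illustrative only.
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("spark sorts arrays fast",), ("arrays of keywords",)], ["body"])

tokens = Tokenizer(inputCol="body", outputCol="words").transform(toy)
counts = CountVectorizer(inputCol="words",
                         outputCol="tf").fit(tokens).transform(tokens)
tfidf = IDF(inputCol="tf", outputCol="features").fit(counts).transform(counts)

benchmark_extract_top_keywords(tfidf, n_keywords=3).select("top_indices").show()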
def view(df, state_col='_state', updated_col='_updated', hash_col='_hash'):
    """
    Calculate a view from a log of events by performing the following actions:
        - squashing the events for each entry record to the last one
        - removing deleted records from the list
    """
    c = set(df.columns).difference({state_col, updated_col, hash_col})
    colnames = [x for x in df.columns if x in c]

    if updated_col not in df.columns:
        return df

    if state_col not in df.columns:
        return df

    selected_columns = colnames + ['_last.*']
    groupby_columns = colnames

    # group by hash_col first, if available
    if hash_col in df.columns:
        selected_columns = selected_columns + [hash_col]
        groupby_columns = [hash_col] + groupby_columns

    row_groups = df.groupBy(groupby_columns)
    get_sorted_array = F.sort_array(F.collect_list(
        F.struct(F.col(updated_col), F.col(state_col))),
                                    asc=False)
    df_view = row_groups.agg(
        get_sorted_array.getItem(0).alias('_last')).select(*selected_columns)
    df_view = df_view.filter("{} = 0".format(state_col))
    return df_view
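# Illustrative run of view() on a tiny event log using the default column
# names above: id=1 has two live records, while record (2, "c") receives a
# later delete event (_state = 1). The data is made up for the example.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [(1, "a", 0, 100), (1, "b", 0, 200), (2, "c", 0, 100), (2, "c", 1, 300)],
    ["id", "value", "_state", "_updated"])

# Squashes each record to its latest event and drops records whose latest
# state marks a deletion: here (2, "c") disappears, both id=1 rows remain.
view(events).show()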
def main(keyspace, outdir, orderkeys):
    result = spark.read.format("org.apache.spark.sql.cassandra") \
        .options(table='orders_parts', keyspace=keyspace).load()
    result = result[result.orderkey.isin(orderkeys)]
    result = result.select(
        result['orderkey'], result['totalprice'],
        functions.sort_array('part_names').alias('part_names')).orderBy(
            result['orderkey'])
    result.rdd.map(output_line).saveAsTextFile(outdir)
def get_word_vec(self):
    data = self.merge_df.groupBy('user_id').agg(
        func.sort_array(func.collect_list(
            func.struct(func.col('time'), func.col('ad_id'))),
                        asc=True).alias('items'))
    data = data.withColumn(
        "items",
        func.udf(lambda x: [i[1] for i in x],
                 ArrayType(StringType()))('items'))
    word2Vec = Word2Vec(vectorSize=128,
                        minCount=10,
                        inputCol="items",
                        outputCol="result")
    model = word2Vec.fit(data.repartition(1000))
    return model
def invert_index(term_doc_df):
    """
    Inverts indices as defined in the BSBI algorithm using Spark.

    :param term_doc_df: a local (e.g. pandas) DataFrame of (termID, docID) pairs
    :return: Spark DataFrame with a sorted docID posting list per termID
    """
    spark_session = Inverter.create_session('invert_index')
    spark_term_doc_df = spark_session.createDataFrame(term_doc_df)
    output = spark_term_doc_df.groupby('termID').agg(
        functions.collect_list('docID').alias('docID_temp'))
    return output.select(
        "*", functions.sort_array(output["docID_temp"]).alias('docID'))
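# Small illustration of invert_index: a handful of (termID, docID) postings
# becomes one sorted posting list per term. The pandas input mirrors what
# createDataFrame() above accepts; the data is made up for the example.
import pandas as pd

postings = pd.DataFrame({"termID": [1, 1, 2, 1], "docID": [30, 10, 20, 20]})

# termID 1 -> docID [10, 20, 30]; termID 2 -> docID [20]
invert_index(postings).select("termID", "docID").show()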
def test_solve_irls_eqn(spark):
    lvl1df = spark.read.parquet(
        f'{data_root}/bt_reduceded_1part.snappy.parquet')
    sample_blocks_df = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet') \
        .withColumn('sample_block', f.col('sample_block').cast('string')) \
        .withColumn('sample_ids', f.expr('transform(sample_ids, v -> cast(v as string))'))
    sample_blocks = {
        r.sample_block: r.sample_ids
        for r in sample_blocks_df.collect()
    }
    with open(f'{data_root}/test_solve_irls_eqn.json') as json_file:
        test_values = json.load(json_file)

    map_key_pattern = ['sample_block', 'label', 'alpha_name']
    reduce_key_pattern = ['header_block', 'header', 'label', 'alpha_name']
    model_key_pattern = ['sample_block', 'label', 'alpha_name']

    map_udf = f.pandas_udf(
        lambda key, pdf: map_irls_eqn(key, map_key_pattern, pdf, labeldf,
                                      sample_blocks, covdf, beta_cov_dict,
                                      maskdf, alphas), irls_eqn_struct,
        PandasUDFType.GROUPED_MAP)
    reduce_udf = f.pandas_udf(
        lambda key, pdf: reduce_irls_eqn(key, reduce_key_pattern, pdf),
        irls_eqn_struct, PandasUDFType.GROUPED_MAP)
    model_udf = f.pandas_udf(
        lambda key, pdf: solve_irls_eqn(key, model_key_pattern, pdf, labeldf,
                                        alphas, covdf), model_struct,
        PandasUDFType.GROUPED_MAP)

    modeldf = lvl1df \
        .withColumn('alpha_name', f.explode(f.array([f.lit(n) for n in alphas.keys()]))) \
        .groupBy(map_key_pattern) \
        .apply(map_udf) \
        .groupBy(reduce_key_pattern) \
        .apply(reduce_udf) \
        .groupBy(model_key_pattern) \
        .apply(model_udf) \
        .withColumn('alpha_label_coef', f.expr('struct(alphas[0] AS alpha, labels[0] AS label, coefficients[0] AS coefficient)')) \
        .groupBy('header_block', 'sample_block', 'header', 'sort_key', f.col('alpha_label_coef.label')) \
        .agg(f.sort_array(f.collect_list('alpha_label_coef')).alias('alphas_labels_coefs')) \
        .selectExpr('*', 'alphas_labels_coefs.alpha AS alphas', 'alphas_labels_coefs.label AS labels', 'alphas_labels_coefs.coefficient AS coefficients') \
        .drop('alphas_labels_coefs', 'label')

    outdf = modeldf.filter(f'sample_block = "{test_sample_block}"') \
        .filter(f.col('labels').getItem(0) == test_label) \
        .orderBy('sort_key', 'header') \
        .toPandas()
    betas_glow = np.row_stack(outdf['coefficients'])
    assert (np.allclose(np.array(test_values['betas']), betas_glow))
def create_ground_truth_rankings(df: DataFrame) -> DataFrame:
    assert {"userId", "rating", "movieId"}.issubset(df.columns)
    return (
        df.withColumn("is_relevant", f.expr("if(rating > 3.5, true, false)"))
        .filter(f.col("is_relevant"))
        # for each user, order the relevant movies by highest rated first
        .withColumn("rmp", f.struct(f.col("rating"), f.col("movieId")))
        .groupBy("userId")
        .agg(f.sort_array(f.collect_list("rmp"), asc=False).alias("rating_movie_pairs"))
        .rdd.map(lambda r: (r.userId, [row.movieId for row in r.rating_movie_pairs]))
        .toDF(["userId", "actual_ranking"])
    )
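# Quick check of create_ground_truth_rankings on a made-up ratings frame:
# user 7's 5.0-rated movie should come before the 4.0-rated one, and the
# 2.0 rating is filtered out as not relevant.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [(7, 10, 4.0), (7, 11, 5.0), (7, 12, 2.0)],
    ["userId", "movieId", "rating"])

# Expected: userId=7, actual_ranking=[11, 10]
create_ground_truth_rankings(ratings).show(truncate=False)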
def get_rankings(df, gt_col, est_col, query_col='query', id_col='pid'):
    """
    Does grouping, aggregation, and sorting of df to get rankings.

    df - a pyspark dataframe
    gt_col - string, the name of the column containing the ground truth relevance for a query-pid pair
    est_col - string, the name of the column containing the estimated relevance for a query-pid pair
    query_col - string (default 'query'), the name of the column containing the query or search terms
    id_col - string (default 'pid'), the name of the column containing the item id

    returns: a pyspark dataframe with a single row per unique query, with columns:
        "query" (string) the query,
        "best_order" (array), list of pids ordered by gt relevance,
        "estimated_order" (array), list of pids ordered by estimated relevance
    """
    sorted_sdf = (
        df.groupBy(query_col)
        .agg(
            F.sort_array(
                F.collect_list(
                    F.struct(F.col(gt_col), F.col(id_col))),
                asc=False).alias('best'),
            F.sort_array(
                F.collect_list(
                    F.struct(F.col(est_col), F.col(id_col))),
                asc=False).alias('estimated')
        )
    )
    return (sorted_sdf
            .select(query_col, 'best.{}'.format(id_col), 'estimated')
            .withColumnRenamed(id_col, 'best_order')
            .select(query_col, 'best_order', 'estimated.{}'.format(id_col))
            .withColumnRenamed(id_col, 'estimated_order'))
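# Toy run of get_rankings: one query with three items, where the model
# scores (est) disagree with the ground truth (gt) on the top item.
# Data and column names are illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
pairs = spark.createDataFrame(
    [("q1", "p1", 3.0, 0.2), ("q1", "p2", 1.0, 0.9), ("q1", "p3", 2.0, 0.5)],
    ["query", "pid", "gt", "est"])

# best_order = [p1, p3, p2], estimated_order = [p2, p3, p1]
get_rankings(pairs, gt_col="gt", est_col="est").show(truncate=False)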
def dataframe_eventsource_view(df, state_col='_state', updated_col='_updated'):
    # calculate a view by:
    # - squashing the events for each entry record to the last one
    # - removing deleted records from the list
    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]
    row_groups = df.groupBy(colnames)
    df_view = row_groups.agg(
        F.sort_array(F.collect_list(
            F.struct(F.col(updated_col), F.col(state_col))),
                     asc=False).getItem(0).alias('_last')).select(
                         *colnames, '_last.*')
    df = df_view.filter("{} = 0".format(state_col))
    return df
def generate_recency_feature(df, event, snapshot_date):
    # Keep only the given event type and collect each user's event dates.
    df_grouped = df.filter(F.col("event") == event).groupBy("uid").agg(
        F.collect_set("event").alias("event"),
        F.collect_list("date").alias("recent"))
    df_sorted = df_grouped.withColumn("recent",
                                      F.sort_array("recent", asc=False))
    # spark.sql('set spark.sql.caseSensitive=true')
    df_last_time = df_sorted.selectExpr("uid", "recent[0]")
    df_recency_feature = df_last_time.withColumn(
        'rec_' + event,
        F.datediff(F.to_date(F.lit(snapshot_date)), "recent[0]"))
    df_recency_feature = df_recency_feature.drop("recent[0]")
    return df_recency_feature
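# Sketch of generate_recency_feature on a made-up event log: uid 1's most
# recent "P" event is 2021-01-08, so rec_P = 2 days before the snapshot.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [(1, "P", "2021-01-05"), (1, "P", "2021-01-08"), (2, "P", "2021-01-01")],
    ["uid", "event", "date"])

generate_recency_feature(events, "P", "2021-01-10").show()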
def get_artists(table):
    """
    Get artist information (artist_name, artist_msid etc.) for every user,
    ordered by listen count.

    Args:
        table (str): name of the temporary table.

    Returns:
        iterator (iter): an iterator over result
            {
                user1: [{
                    'artist_name': str,
                    'artist_msid': str,
                    'artist_mbids': list(str),
                    'listen_count': int
                }],
                user2: [{...}],
            }
    """
    result = run_query("""
        WITH intermediate_table as (
            SELECT user_name
                 , artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
              FROM {table}
        )
        SELECT *
             , count(*) as listen_count
          FROM intermediate_table
      GROUP BY user_name
             , artist_name
             , artist_msid
             , artist_mbids
        """.format(table=table))

    iterator = result \
        .withColumn("artists", struct("listen_count", "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("artists"), asc=False).alias("artists")) \
        .toLocalIterator()

    return iterator
def calculate_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the
    "time_range" table. The time ranges are as follows:
        1) week - each day with weekday name of the past 2 weeks.
        2) month - each day of the past 2 months.
        3) year - each month of the past 2 years.
        4) all_time - each year starting from LAST_FM_FOUNDING_YEAR (2002)
    """
    # Calculate the number of listens in each time range for each user, except the time ranges which have zero listens.
    result_without_zero_days = run_query("""
        SELECT listens.user_name
             , time_range.time_range
             , count(listens.user_name) as listen_count
          FROM listens
          JOIN time_range
            ON listens.listened_at >= time_range.start
           AND listens.listened_at <= time_range.end
      GROUP BY listens.user_name
             , time_range.time_range
        """)
    result_without_zero_days.createOrReplaceTempView("result_without_zero_days")

    # Add the time ranges which have zero listens to the previous dataframe
    result = run_query("""
        SELECT dist_user_name.user_name
             , time_range.time_range
             , to_unix_timestamp(time_range.start) as from_ts
             , to_unix_timestamp(time_range.end) as to_ts
             , ifnull(result_without_zero_days.listen_count, 0) as listen_count
          FROM (SELECT DISTINCT user_name FROM listens) dist_user_name
    CROSS JOIN time_range
     LEFT JOIN result_without_zero_days
            ON result_without_zero_days.user_name = dist_user_name.user_name
           AND result_without_zero_days.time_range = time_range.time_range
        """)

    # Create a table with a list of time ranges and corresponding listen count for each user
    iterator = result \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    return iterator
def calculate_daily_activity():
    """ Calculate number of listens for each user in each hour. """

    # Generate a dataframe containing hours of all days of the week
    weekdays = [calendar.day_name[day] for day in range(0, 7)]
    hours = [hour for hour in range(0, 24)]
    time_range = itertools.product(weekdays, hours)
    time_range_df = listenbrainz_spark.session.createDataFrame(
        time_range, schema=["day", "hour"])
    time_range_df.createOrReplaceTempView("time_range")

    # Truncate listened_at to day and hour to improve matching speed
    formatted_listens = run_query("""
        SELECT user_id
             , date_format(listened_at, 'EEEE') as day
             , date_format(listened_at, 'H') as hour
          FROM listens
        """)
    formatted_listens.createOrReplaceTempView("listens")

    # Calculate the number of listens in each time range for each user, except the time ranges which have zero listens.
    result = run_query("""
        SELECT listens.user_id
             , time_range.day
             , time_range.hour
             , count(*) as listen_count
          FROM listens
          JOIN time_range
            ON listens.day == time_range.day
           AND listens.hour == time_range.hour
      GROUP BY listens.user_id
             , time_range.day
             , time_range.hour
        """)

    # Create a table with a list of time ranges and corresponding listen count for each user
    iterator = result \
        .withColumn("daily_activity", struct("hour", "day", "listen_count")) \
        .groupBy("user_id") \
        .agg(sort_array(collect_list("daily_activity")).alias("daily_activity")) \
        .toLocalIterator()

    return iterator
def view(df, state_col='_state', updated_col='_updated', merge_on=None,
         version=None):
    """
    Calculate a view from a log of events by performing the following actions:
        - squashing the events for each entry record to the last one
        - removing deleted records from the list
    """
    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    if updated_col not in df.columns:
        df = add_update_column(df, '_updated')

    if state_col not in df.columns:
        df = df.withColumn('_state', F.lit(0))

    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]
    groupby_columns = [c for c in on if c in colnames]

    # filter version
    df = filter_by_version(df, '_updated', version)

    # group by
    row_groups = df.select(*groupby_columns, '_updated').groupBy(groupby_columns)

    # sort each group by descending _updated
    get_sorted_array = F.sort_array(F.collect_list(F.col('_updated')), asc=False)

    # get the selected rows (only group columns and _updated)
    df_view = row_groups.agg(get_sorted_array.getItem(0).alias('_updated'))

    # get all original columns by join
    df_res = df.join(df_view, on=[*groupby_columns, '_updated'])

    # remove deleted records and the _updated/_state columns
    return df_res.filter('_state = 0').select(*colnames)
def build_dataset(self):
    logger.info('doing')
    path = self.path_root + '/feature-pandas.json'
    configs = demjson.decode(open(path, 'r').read())
    aggs = []
    pass_keys = []  # e.g. ['time', 'advertiser_id', 'creative_id', 'weekday', 'product_id', 'industry']
    for config in configs['feature1']['stat_cols2']:
        col = config['col']
        col_name = config['col_name']
        agg_func = config['agg_func']
        if col in pass_keys:
            continue
        if agg_func == 'count':
            aggs.append(func.count(col).alias(col_name))
        elif agg_func == 'unique':
            aggs.append(func.countDistinct(col).alias(col_name))
        elif agg_func == 'set':
            aggs.append(func.collect_set(col).alias(col_name))
        elif agg_func == 'list':
            # collect (time, value) structs sorted by time
            aggs.append(
                func.sort_array(func.collect_list(
                    func.struct(func.col('time'), func.col(col))),
                                asc=True).alias(col_name))
        else:
            print(config)
    # print(aggs)
    df = self.merge_df.groupBy(configs['feature1']['group_key']).agg(*aggs)
    for config in configs['feature1']['stat_cols2']:
        col = config['col']
        col_name = config['col_name']
        agg_func = config['agg_func']
        if col in pass_keys:
            continue
        if agg_func == 'list':
            # keep only the ordered values from the (time, value) structs
            df = df.withColumn(
                col_name,
                func.udf(lambda x: [i[1] for i in x],
                         ArrayType(StringType()))(col_name))
        if agg_func in ('list', 'set'):
            # truncate to the last 20 items and pad with '0' up to length 20
            df = df.withColumn(
                col_name,
                func.udf(
                    lambda x: list(list(x)[-20:] + ['0'] * (20 - len(list(x)))),
                    ArrayType(StringType()))(col_name))
    df = df.join(self.user_df, on='user_id')
    self.dataframe = df.cache()
    logger.info('done')
def get_variants_df(df, parameters=None):
    """Gets variants dataframe from the Spark dataframe"""
    if parameters is None:
        parameters = {}

    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY

    df = df.select(case_id_glue, activity_key)
    grouped_df = df.withColumn("@@id", F.monotonically_increasing_id())\
        .groupBy(case_id_glue)\
        .agg(F.collect_list(F.struct("@@id", activity_key)).alias("variant"))\
        .select(case_id_glue, F.sort_array("variant").getItem(activity_key).alias("variant"))
    grouped_df = grouped_df.withColumn("variant", F.concat_ws(",", "variant"))
    return grouped_df
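# Toy process-mining example for get_variants_df: two cases whose events
# arrive in order, yielding one comma-joined activity variant per case.
# The default key names ("case:concept:name", "concept:name") are assumed
# to be the values of CASE_CONCEPT_NAME and DEFAULT_NAME_KEY.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
log = spark.createDataFrame(
    [("c1", "A"), ("c1", "B"), ("c2", "A"), ("c2", "C")],
    ["case:concept:name", "concept:name"])

# c1 -> "A,B", c2 -> "A,C"
get_variants_df(log).show()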
def extract_top_keywords(posts, vocabulary, n_keywords=10):
    """Given word count (CountVectorizer) output (as "features" column),
    extracts the vocabulary indices of the n_keywords keywords with the
    highest TF-IDF (for each post)."""

    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector),
        ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector),
        ArrayType(DoubleType()))

    idf_udf = array_transform(idf_wiki)

    vocab_dict = {k: v for k, v in enumerate(vocabulary)}

    def ix_to_word(ix):
        return vocab_dict[ix]

    vocab_udf = array_transform(ix_to_word)

    posts = posts.withColumn("word_ix", extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("word_count",
                             extract_values_from_vector_udf("features"))
    posts = posts.withColumn('words', vocab_udf(col('word_ix')))
    posts = posts.withColumn("idf", idf_udf(col("words")))
    posts = posts.withColumn(
        "zipped_truncated",
        slice(sort_array(arrays_zip("idf", "words"), asc=False), 1, n_keywords))
    take_second = udf(lambda rows: [row[1] for row in rows],
                      ArrayType(StringType()))
    posts = posts.withColumn("top_keywords", take_second("zipped_truncated"))
    return posts['CreationDate', 'top_keywords', 'Tags', 'ParentId']
def get_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the 'time_range' table """
    # Calculate the number of listens in each time range for each user, except the time ranges which have zero listens.
    result_without_zero_days = run_query("""
        SELECT listens.user_name
             , time_range.time_range
             , count(listens.user_name) as listen_count
          FROM listens
          JOIN time_range
            ON listens.listened_at >= time_range.start
           AND listens.listened_at <= time_range.end
      GROUP BY listens.user_name
             , time_range.time_range
        """)
    result_without_zero_days.createOrReplaceTempView('result_without_zero_days')

    # Add the time ranges which have zero listens to the previous dataframe
    result = run_query("""
        SELECT dist_user_name.user_name
             , time_range.time_range
             , to_unix_timestamp(time_range.start) as from_ts
             , to_unix_timestamp(time_range.end) as to_ts
             , ifnull(result_without_zero_days.listen_count, 0) as listen_count
          FROM (SELECT DISTINCT user_name FROM listens) dist_user_name
    CROSS JOIN time_range
     LEFT JOIN result_without_zero_days
            ON result_without_zero_days.user_name = dist_user_name.user_name
           AND result_without_zero_days.time_range = time_range.time_range
        """)

    # Create a table with a list of time ranges and corresponding listen count for each user
    iterator = result \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    return iterator
def get_listening_activity_all_time() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for a user in each year starting from LAST_FM_FOUNDING_YEAR (2002). """
    logger.debug("Calculating listening_activity_all_time")

    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    result_without_zero_years = None
    for year in range(from_date.year, to_date.year + 1):
        year_start = datetime(year, 1, 1)
        year_end = get_year_end(year)
        try:
            _get_listens(year_start, year_end)
        except HDFSException:
            # Skip if no listens are present in the df
            continue
        year_df = run_query("""
            SELECT user_name,
                   count(user_name) as listen_count
              FROM listens
          GROUP BY user_name
            """)
        year_df = year_df.withColumn('time_range', lit(str(year))).withColumn(
            'from_ts', lit(year_start.timestamp())).withColumn(
                'to_ts', lit(year_end.timestamp()))
        result_without_zero_years = result_without_zero_years.union(year_df) \
            if result_without_zero_years else year_df

    # Create a table with a list of time ranges and corresponding listen count for each user
    data = result_without_zero_years \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    messages = create_messages(data=data, stats_range='all_time',
                               from_ts=from_date.timestamp(),
                               to_ts=to_date.timestamp())

    logger.debug("Done!")

    return messages
def get_artists(table: str, date_format: str, use_mapping: bool):
    """
    Get artist information (artist_name, artist_msid etc.) for every time range
    specified in the "time_range" table, ordered by listen count.

    Args:
        table: Name of the temporary table.
        date_format: Format in which the listened_at field should be formatted.
        use_mapping: Flag to optionally use the mapping step to improve the results

    Returns:
        iterator (iter): An iterator over result
    """
    # Format the listened_at field according to the provided date_format string
    formatted_listens = run_query("""
        SELECT artist_name
             , artist_msid
             , artist_mbids
             , date_format(listened_at, '{date_format}') as listened_at
          FROM {table}
        """.format(table=table, date_format=date_format))
    formatted_listens.createOrReplaceTempView('listens')

    # Use the MSID-MBID mapping to improve results
    if use_mapping:
        mapped_df = _create_mapped_dataframe()
        mapped_df.createOrReplaceTempView('listens')

    # If not using the MSID-MBID mapping, run this step to neglect the MSID when an MBID is present
    if not use_mapping:
        run_query("""
            SELECT artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
                 , listened_at
              FROM listens
            """).createOrReplaceTempView('listens')

    result = run_query("""
        SELECT listens.artist_name
             , listens.artist_msid
             , listens.artist_mbids
             , time_range.time_range
             , time_range.from_ts
             , time_range.to_ts
             , count(*) as listen_count
          FROM listens
          JOIN time_range
            ON listens.listened_at == time_range.time_range
      GROUP BY listens.artist_name
             , listens.artist_msid
             , listens.artist_mbids
             , time_range.time_range
             , time_range.from_ts
             , time_range.to_ts
        """)

    iterator = result \
        .withColumn("artists", struct("listen_count", "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("time_range", "from_ts", "to_ts") \
        .agg(sort_array(collect_list("artists"), asc=False).alias("artists")) \
        .toLocalIterator()

    return iterator
# (fragment: this .agg(...) continues a groupBy(...) chain assigning
#  aggFinanceDf; the DataFrame and grouping column are not shown here)
    .agg(
        avg("Amount").alias("AverageTransaction"),
        sum("Amount").alias("TotalTransaction"),
        count("Amount").alias("NumberOfTransaction"),
        max("Amount").alias("MaxTransaction"),
        min("Amount").alias("MinTransaction"),
        collect_set("Description").alias("UniqueTransactionDescriptions")
    )

aggFinanceDf.show(5, False)

aggFinanceDf \
    .select(
        "AccountNumber",
        "UniqueTransactionDescriptions",
        size("UniqueTransactionDescriptions").alias("CountOfUniqueTransactionTypes"),
        sort_array("UniqueTransactionDescriptions", False).alias("OrderedUniqueTransactionDescriptions"),
        array_contains("UniqueTransactionDescriptions", "Movies").alias("WentToMovies")
    ) \
    .show(5, False)

companiesJson = [
    """{"company":"NewCo","employees":[{"firstName":"Sidhartha","lastName":"Ray"},{"firstName":"Pratik","lastName":"Solanki"}]}""",
    """{"company":"FamilyCo","employees":[{"firstName":"Jiten","lastName":"Pupta"},{"firstName":"Pallavi","lastName":"Gupta"}]}""",
    """{"company":"OldCo","employees":[{"firstName":"Vivek","lastName":"Garg"},{"firstName":"Nitin","lastName":"Gupta"}]}""",
    """{"company":"ClosedCo","employees":[]}"""
]
companiesRDD = sparkSession.sparkContext.parallelize(companiesJson)
companiesDF = sparkSession.read.json(companiesRDD)
companiesDF.show(5, False)
vertices = sqlContext.createDataFrame([(user, ) for user in user_in_edge_list],
                                      ["id"])
edges = sqlContext.createDataFrame([tup[0] for tup in edge_list.collect()],
                                   ["src", "dst"])
g = GraphFrame(vertices, edges)

## Label Propagation Algorithm
result = g.labelPropagation(maxIter=5)

import pyspark.sql.functions as fc
from pyspark.sql.functions import size

result = result.groupBy("label").agg(
    fc.sort_array(fc.collect_list("id")).alias("collected"))
result = result.withColumn("len_collected", size("collected"))
result = result.orderBy(col("len_collected"), col("collected"))

answer = result.toPandas()["collected"]

f = open(output_file, "w")
for community in answer:
    line = ""
    for node in community:
        line += "'" + str(node) + "', "
    line = line[:-2]
    f.write(line)
    f.write("\n")
f.close()
def collect_orderby(sdf,
                    cols,
                    groupby,
                    orderby=None,
                    suffix='__collect',
                    sep='____',
                    orderby_func={},
                    dtype=StringType(),
                    ascending=True,
                    drop_null=True):
    # Null filling is not handled for now; when orderby=None, nulls are dropped.
    """
    Parameters:
    ----------
    sdf: pyspark dataframe to be processed
    cols: str/list of the sdf's cols to be processed
    groupby: str/list of the sdf's cols to group by when collecting
    orderby: str/list of the sdf's cols to order by when collecting
    suffix: str appended to the names of the converted cols (renamed to cols+suffix)
    sep: str separator used by concat_ws (don't change by default)
    dtype: pyspark.sql.types of the returned values

    Return:
    ----------
    sdf: pyspark dataframe of collect_list orderby

    Example:
    ----------
    sdf = collect_orderby(sdf, cols, groupby='user_id', orderby='time')
    """
    # cols: the columns to collect_list
    # groupby: pass [] when there is nothing to group by
    # orderby: must be string columns (int/float columns are also accepted)
    assert not orderby_func or orderby
    orderby_agg_func = []
    orderby_agg_cols = []
    # Used to unify names when a non-string orderby column is also collected.
    orderby_copy_cols_dict, orderby_recover_cols_dict = {}, {}
    if not isinstance(cols, list):
        cols = [cols]
    if not isinstance(groupby, list):
        groupby = [groupby]
    if orderby is None:
        orderby = []
        orderby_func = {}
    if not isinstance(orderby, list):
        orderby = [orderby]
    # If an orderby column is also collected and is not a string, copy it first;
    # otherwise a value like 1 would come back as '01'.
    for i, c in enumerate(orderby):
        if c in cols and dict(sdf.select(orderby).dtypes)[c] != 'string':
            c_orderby = f"{c}{sep}orderby"
            sdf = sdf.withColumn(c_orderby, F.col(c))
            orderby[i] = c_orderby
            orderby_copy_cols_dict[c_orderby] = c
    if not isinstance(orderby_func, dict):
        if not isinstance(orderby_func, list):
            orderby_func = [orderby_func]
        orderby_func = dict(zip(orderby, [orderby_func] * len(orderby)))
    if not drop_null:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1] if len(i.split(sep)) > 1 else None  # NOTE: breaks if the original values contain sep!
                for i in x
            ],
            ArrayType(dtype))
    else:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1]  # NOTE: breaks if the original values contain sep!
                for i in x if len(i.split(sep)) > 1
            ],
            ArrayType(dtype))
    # Columns that don't meet the requirements (arrays/vectors) are first cast to StringType().
    for c in [
            k for k, v in sdf.dtypes if k in cols
            and len(re.findall(re.compile(r'^(array|vector)'), v)) > 0
    ]:
        logstr(f'{c} cast to StringType')
        sdf = sdf.withColumn(c, cast2str_udf(c))
    logstr('orderby', orderby)
    if len(orderby) != 0:
        # Handle orderby_func.
        for c, f_list in orderby_func.items():
            if not isinstance(f_list, list):
                f_list = [f_list]
            for i, f in enumerate(f_list):
                if c not in orderby:
                    continue
                if isinstance(f, str):
                    f = f_list[i] = eval(f"F.{f}")
                key = f"{c}{sep}{f.__name__}"
                orderby_agg_func.append(f(c).alias(key))
                orderby_agg_cols.append(key)
                if c in orderby_copy_cols_dict:
                    orderby_recover_cols_dict[
                        key] = f"{orderby_copy_cols_dict[c]}{sep}{f.__name__}"
        # Handle non-string orderby columns.
        order_int_list = [
            k for k, v in sdf.dtypes
            if k in orderby and len(re.findall(re.compile(r'(int)'), v)) > 0
        ]
        order_float_list = [
            k for k, v in sdf.dtypes if k in orderby
            and len(re.findall(re.compile(r'(float|double)'), v)) > 0
        ]
        if order_int_list:
            logstr('order_int_list', order_int_list)
            order_int_max_df = sdf.select(order_int_list).agg(
                *[F.max(c).alias(c) for c in order_int_list]).toPandas()
            order_int_max_dict = dict(
                zip(order_int_max_df.keys(),
                    order_int_max_df.values.flatten().tolist()))
            logstr('order_int_max_dict', order_int_max_dict)
            # Left-pad integers with '0' so lexicographic order matches numeric order.
            for c in order_int_list:
                sdf = sdf.withColumn(
                    c,
                    F.lpad(
                        F.col(c).cast(StringType()),
                        len(str(order_int_max_dict[c])), '0'))
        if order_float_list:
            logstr('order_float_list', order_float_list)
            for c in order_float_list:
                sdf = sdf.withColumn(c, F.col(c).cast(StringType()))
                max_df = sdf.select(F.split(c, r"\.").alias(c)).select([
                    F.length(F.col(c)[i]).alias(c + f"__{i}") for i in range(2)
                ]).agg(*[
                    F.max(c + f"__{i}").alias(c + f"__{i}") for i in range(2)
                ]).toPandas()
                max_dict = dict(
                    zip(max_df.keys(), max_df.values.flatten().tolist()))
                logstr('max_dict', max_dict)
                # Pad both sides of the decimal point so strings sort numerically.
                sdf = sdf.withColumn(
                    c,
                    F.lpad(
                        F.col(c).cast(StringType()), max_dict[c + "__0"],
                        '0')).withColumn(
                            c,
                            F.rpad(
                                F.col(c).cast(StringType()),
                                max_dict[c + "__1"], '0'))
        # Null handling is still unfriendly here: nulls come out like ['a', , 'b'].
        agg_fun_list = [
            F.sort_array(F.collect_list(f"%s{sep}{c}" % '_'.join(orderby)),
                         asc=ascending).alias(c + '_temp') for c in cols
        ]
        sdf = sdf.select([
            F.concat_ws(sep, *orderby, c).alias(f"%s{sep}{c}" % '_'.join(orderby))
            for c in cols
        ] + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select(
            [split_udf(c + '_temp').alias(c + suffix) for c in cols] +
            orderby_agg_cols + groupby)
    else:
        agg_fun_list = [F.collect_list(c).alias(c + '_temp') for c in cols]
        sdf = sdf.select(cols + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select([
            F.col(c + '_temp').cast(ArrayType(dtype)).alias(c + suffix)
            for c in cols
        ] + orderby_agg_cols + groupby)
    for c1, c2 in orderby_recover_cols_dict.items():
        sdf = sdf.withColumnRenamed(c1, c2)
    return sdf
def compare(experiment_core_parquet, stats_input_parquet):
    conf = SparkConf().setAll([
        ("spark.sql.sources.partitionColumnTypeInference.enabled", "false"),
        ("spark.driver.memory", "5g"),
    ])
    spark = (SparkSession.builder.appName(
        "IMPC_COMPARE_STATS_LOADER_EXP_CORE").config(conf=conf).getOrCreate())
    experiment_core_df = (spark.read.parquet(experiment_core_parquet).select(
        CSV_FIELDS).withColumn(
            "metadata",
            when(size(col("metadata")) == 0,
                 lit(None)).otherwise(sort_array(col("metadata"))),
        ).withColumn(
            "allele_accession_id",
            when(col("allele_accession_id").like("%NULL%"),
                 lit(None)).otherwise(col("allele_accession_id")),
        ))
    stats_input_df = (
        spark.read.parquet(stats_input_parquet).select(CSV_FIELDS).withColumn(
            "litter_id",
            when(col("litter_id").isNull(),
                 lit("")).otherwise(col("litter_id")),
        ))
    for col_name in [
            "weight",
            "weight_date",
            "weight_days_old",
            "weight_parameter_stable_id",
            "experiment_source_id",
            "data_point",
            "strain_name",
            "genetic_background",
            "strain_accession_id",
            "datasource_name",
            "project_name",
    ]:
        experiment_core_df = experiment_core_df.drop(col_name)
        stats_input_df = stats_input_df.drop(col_name)
    experiment_core_df = experiment_core_df.where(
        col("parameter_stable_id") != "IMPC_EYE_092_001").where(
            col("age_in_days") > 0)
    stats_input_df = stats_input_df.where(
        col("parameter_stable_id") != "IMPC_EYE_092_001").where(
            col("age_in_days") > 0)
    diff_df = experiment_core_df.exceptAll(stats_input_df)
    diff_df = diff_df.alias("experiment_core")
    stats_input_df = stats_input_df.alias("etl")
    compare_df = diff_df.join(
        stats_input_df,
        (stats_input_df.external_sample_id == diff_df.external_sample_id)
        & (stats_input_df.parameter_stable_id == diff_df.parameter_stable_id)
        & (stats_input_df.date_of_experiment == diff_df.date_of_experiment),
    )
    output_cols = []
    for column in experiment_core_df.columns:
        output_cols.append("experiment_core." + column)
        output_cols.append("etl." + column)
    # compare_df.select(output_cols).show(vertical=True, truncate=False)
    compare_df.where(
        col("experiment_core.metadata_group") != col("etl.metadata_group")
    ).select(output_cols).show(vertical=True, truncate=False)
    print(experiment_core_df.count())
    print(stats_input_df.count())
    print(diff_df.count())
model = mh.fit(schemaFps)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
# model_transform = model.transform(schemaFps)
# model_transform.show()

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
print("Approximately joining dfA and dfB on distance smaller than 0.3:")
# print(model_transform)
mol_similarity = model.approxSimilarityJoin(schemaFps, schemaFps, 1 - SIMILARITY_THRESHOLD, distCol="JaccardDistance") \
    .select(f.col("datasetA.id").alias("source_id"),
            f.col("datasetB.id").alias("target_id"),
            f.col("JaccardDistance"))

# TO RE-ADD
if DEBUG:
    combinations_sim = mol_similarity.groupby(mol_similarity.source_id).agg(
        f.sort_array(f.collect_set("target_id")).alias("nbrs"))
else:
    combinations_sim = mol_similarity.groupby(mol_similarity.source_id).agg(
        f.collect_set("target_id").alias("nbrs"))
# combinations_sim = combinations_sim.withColumn("nbr_count", f.size("nbrs"))

if DEBUG:
    combinations_sim.orderBy("source_id").show(20, False)

combinations_sim.show(10)
def generate_metadata_group(
    experiment_specimen_df: DataFrame,
    impress_df: DataFrame,
    exp_type="experiment",
) -> DataFrame:
    """
    Takes in an Experiment-Specimen DataFrame and the IMPReSS dataframe,
    and generates a hash value from the parameters marked as 'isImportant' on IMPReSS.
    This hash is used to identify experiments that are comparable (i.e. share the same experimental conditions).
    """

    # Explode the experiments by procedureMetadata so each row contains data for one procedureMetadata value
    experiment_metadata = experiment_specimen_df.withColumn(
        "procedureMetadata", explode("procedureMetadata"))

    # Filter the IMPReSS to leave only those that generate a metadata split: isImportant = True
    impress_df_required = impress_df.where(
        (col("parameter.isImportant") == True)
        & (col("parameter.type") == "procedureMetadata"))

    # Join the experiment DF with the IMPReSS DF
    experiment_metadata = experiment_metadata.join(
        impress_df_required,
        ((experiment_metadata["_pipeline"] == impress_df_required["pipelineKey"])
         & (experiment_metadata["_procedureID"] == impress_df_required["procedure.procedureKey"])
         & (experiment_metadata["procedureMetadata._parameterID"] == impress_df_required["parameter.parameterKey"])),
    )

    # Create a new column by concatenating the parameter name and the parameter value
    experiment_metadata = experiment_metadata.withColumn(
        "metadataItem",
        when(
            col("procedureMetadata.value").isNotNull(),
            concat(col("parameter.name"), lit(" = "),
                   col("procedureMetadata.value")),
        ).otherwise(concat(col("parameter.name"), lit(" = "), lit("null"))),
    )

    # Select the right column name for production and phenotyping centre depending on experiment type
    if exp_type == "experiment":
        production_centre_col = "_productionCentre"
        phenotyping_centre_col = "_phenotypingCentre"
    else:
        production_centre_col = "production_centre"
        phenotyping_centre_col = "phenotyping_centre"

    # Create a window for the DataFrame over experiment id, production and phenotyping centre
    window = Window.partitionBy(
        "unique_id", production_centre_col,
        phenotyping_centre_col).orderBy("parameter.name")

    # Use the window to create, for every experiment, an array containing the set of "parameter = value" pairs.
    experiment_metadata_input = experiment_metadata.withColumn(
        "metadataItems", collect_set(col("metadataItem")).over(window))

    # Add the production centre to the metadata group when it is different from the phenotyping centre.
    # This is because in that case we would like to generate a metadata split among specimens
    # that have been produced and phenotyped at the same centre.
    experiment_metadata_input = experiment_metadata_input.withColumn(
        "metadataItems",
        when(
            (col(production_centre_col).isNotNull())
            & (col(production_centre_col) != col(phenotyping_centre_col)),
            array_union(
                col("metadataItems"),
                array(
                    concat(lit("ProductionCenter = "),
                           col(production_centre_col))),
            ),
        ).otherwise(col("metadataItems")),
    )

    # Create a string with the concatenation of the metadata items "parameter = value", separated by '::'.
    experiment_metadata = experiment_metadata_input.groupBy(
        "unique_id", production_centre_col, phenotyping_centre_col).agg(
            concat_ws("::", sort_array(max(
                col("metadataItems")))).alias("metadataGroupList"))

    # Hash the list to generate a metadata group identifier.
    experiment_metadata = experiment_metadata.withColumn(
        "metadataGroup", md5(col("metadataGroupList")))

    # Select the experiment IDs and the metadata group IDs
    experiment_metadata = experiment_metadata.select("unique_id",
                                                     "metadataGroup")

    # Join the original experiment DataFrame with the result of the metadata group generation
    experiment_specimen_df = experiment_specimen_df.join(
        experiment_metadata, "unique_id", "left_outer")

    # Add the hashed version of an empty string to those rows without a metadata group.
    experiment_specimen_df = experiment_specimen_df.withColumn(
        "metadataGroup",
        when(experiment_specimen_df["metadataGroup"].isNull(),
             md5(lit(""))).otherwise(experiment_specimen_df["metadataGroup"]),
    )
    return experiment_specimen_df
def generate_metadata(
    experiment_specimen_df: DataFrame,
    impress_df: DataFrame,
    exp_type="experiment",
) -> DataFrame:
    """
    Takes in a DataFrame with experiment and specimen data, and the IMPReSS information DataFrame.
    For every experiment, generates a list of "parameter = value" pairs containing every metadata parameter for that experiment.
    """

    # Explode the experiment DF so every row represents a procedureMetadata value
    experiment_metadata = experiment_specimen_df.withColumn(
        "procedureMetadata", explode("procedureMetadata"))

    # Filter the IMPReSS DF so it only contains metadata parameters
    impress_df_required = impress_df.where(
        (col("parameter.type") == "procedureMetadata"))

    # Join the experiment metadata values with the IMPReSS DF.
    experiment_metadata = experiment_metadata.join(
        impress_df_required,
        ((experiment_metadata["_pipeline"] == impress_df_required["pipelineKey"])
         & (experiment_metadata["_procedureID"] == impress_df_required["procedure.procedureKey"])
         & (experiment_metadata["procedureMetadata._parameterID"] == impress_df_required["parameter.parameterKey"])),
    )

    # Some experimenter IDs need to be replaced on the metadata values
    process_experimenter_id_udf = udf(_process_experimenter_id, StringType())
    experiment_metadata = experiment_metadata.withColumn(
        "experimenterIdMetadata",
        when(
            lower(col("parameter.name")).contains("experimenter"),
            process_experimenter_id_udf("procedureMetadata"),
        ).otherwise(lit(None)),
    )
    experiment_metadata = experiment_metadata.withColumn(
        "procedureMetadata.value",
        when(
            col("experimenterIdMetadata").isNotNull(),
            col("experimenterIdMetadata")).otherwise(
                col("procedureMetadata.value")),
    )

    # Create the "parameter = value" pairs; if there is no value: "parameter = null"
    experiment_metadata = experiment_metadata.withColumn(
        "metadataItem",
        concat(
            col("parameter.name"),
            lit(" = "),
            when(
                col("procedureMetadata.value").isNotNull(),
                col("procedureMetadata.value"),
            ).otherwise(lit("null")),
        ),
    )

    # Select the right column name for production and phenotyping centre depending on experiment type
    if exp_type == "experiment":
        production_centre_col = "_productionCentre"
        phenotyping_centre_col = "_phenotypingCentre"
    else:
        production_centre_col = "production_centre"
        phenotyping_centre_col = "phenotyping_centre"

    # Group by the experiment unique_id, phenotyping centre and production centre,
    # and create an array containing all the metadata pairs
    experiment_metadata = experiment_metadata.groupBy(
        "unique_id", production_centre_col, phenotyping_centre_col).agg(
            sort_array(collect_set(col("metadataItem"))).alias("metadata"))

    # Append the phenotyping centre value to all the
    # experiments where the phenotyping centre != the production centre
    experiment_metadata = experiment_metadata.withColumn(
        "metadata",
        when(
            (col(production_centre_col).isNotNull())
            & (col(production_centre_col) != col(phenotyping_centre_col)),
            udf(_append_phenotyping_centre_to_metadata,
                ArrayType(StringType()))(col("metadata"),
                                         col(production_centre_col)),
        ).otherwise(col("metadata")),
    )

    # Join the original experiment DF with the resulting experiment metadata DF
    experiment_specimen_df = experiment_specimen_df.join(
        experiment_metadata, "unique_id", "left_outer")

    return experiment_specimen_df