Example #1
def token_score(df, on, value):
    q_val = value
    df = df.select([on])
    df = df.withColumn('query', F.lit(q_val).cast('string'))
    # split the input column and the query into token arrays (comma-separated)
    pattern = ','
    df = df.withColumn('tokens1', F.split(F.col(on), pattern))
    df = df.withColumn('tokens2', F.split(F.col('query'), pattern))
    # intersection = tokens1.intersection(tokens2)
    # diff1to2 = tokens1.difference(tokens2) = tokens only in tokens1
    # diff2to1 = tokens2.difference(tokens1) = tokens only in tokens2
    df = df.withColumn('intersection', F.array_intersect('tokens1', 'tokens2'))
    df = df.withColumn('diff1to2', F.array_except('tokens1', 'tokens2'))
    df = df.withColumn('diff2to1', F.array_except('tokens2', 'tokens1'))
    # sorted_sect = " ".join(sorted(intersection))
    # sorted_1to2 = " ".join(sorted(diff1to2))
    # sorted_2to1 = " ".join(sorted(diff2to1))
    df = df.withColumn('sorted_sect',
                       F.concat_ws(' ', F.sort_array('intersection')))
    df = df.withColumn('sorted_1to2',
                       F.concat_ws(' ', F.sort_array('diff1to2')))
    df = df.withColumn('sorted_2to1',
                       F.concat_ws(' ', F.sort_array('diff2to1')))
    # combined_1to2 = sorted_sect + " " + sorted_1to2 = sorted string 1
    # combined_2to1 = sorted_sect + " " + sorted_2to1 = sorted string 2
    df = df.withColumn('combined_1to2',
                       F.concat_ws(' ', 'sorted_sect', 'sorted_1to2'))
    df = df.withColumn('combined_2to1',
                       F.concat_ws(' ', 'sorted_sect', 'sorted_2to1'))
    # strip
    # sorted_sect = sorted_sect.strip()
    # combined_1to2 = combined_1to2.strip()
    # combined_2to1 = combined_2to1.strip()
    for c in ['sorted_sect', 'combined_1to2', 'combined_2to1']:
        df = df.withColumn(c, F.trim(F.col(c)))
    # pairwise = [
    #     ratio_func(sorted_sect, combined_1to2),
    #     ratio_func(sorted_sect, combined_2to1),
    #     ratio_func(combined_1to2, combined_2to1)
    # ]
    # spark_ratio is a fuzzy-ratio UDF (see the sketch below this example)
    df = df.withColumn(
        'ratio1', spark_ratio(F.col('sorted_sect'), F.col('combined_1to2')))
    df = df.withColumn(
        'ratio2', spark_ratio(F.col('sorted_sect'), F.col('combined_2to1')))
    df = df.withColumn(
        'ratio3', spark_ratio(F.col('combined_1to2'), F.col('combined_2to1')))
    df = df.withColumn('max_ratio', F.greatest('ratio1', 'ratio2', 'ratio3'))
    df = df.withColumnRenamed('max_ratio', 'token_fuzzy')
    df = df.select(['token_fuzzy'])
    return df
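Example #1 leaves `spark_ratio` undefined. A minimal sketch of what that helper could look like, assuming the fuzzywuzzy package is available and an integer ratio in [0, 100] is acceptable:

from fuzzywuzzy import fuzz
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Hypothetical helper assumed by Example #1: a UDF wrapping fuzz.ratio.
@F.udf(returnType=IntegerType())
def spark_ratio(s1, s2):
    # guard against nulls; fuzz.ratio returns an int in [0, 100]
    if s1 is None or s2 is None:
        return None
    return fuzz.ratio(s1, s2)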
Example #2
    def prepare_final_df(self):
        self.df = self.df.select("date", func.explode("message.updates"))

        selected = self.df.select("date", "col.startDatetime",
                                  "col.updateType", "col.name",
                                  "col.finished").filter(
                                      func.col("name").startswith("["))

        grouped = selected.groupBy("name", "startDatetime").agg(
            func.sort_array(func.collect_list("date")).alias("date_array"),
            func.collect_list("updateType").alias("updateType_array"),
            func.reverse(
                func.collect_list("finished")).getItem(0).alias("finished"),
        )

        preprocessed = (
            grouped
            .withColumn("start_datetime",
                        grouped.startDatetime.cast(TimestampType()))
            .withColumn("last_update",
                        self.helper.get_last_date_udf(grouped.date_array))
            .withColumn("finished", func.col("finished").cast(BooleanType()))
            .withColumn("meeting_name", func.col("name"))
            .select("start_datetime", "last_update", "finished", "meeting_name")
            .withColumn(
                "duration",
                func.col("last_update").cast(LongType()) -
                func.col("start_datetime").cast(LongType()),
            ))

        preprocessed.printSchema()

        return self.do_post_preprocessing(preprocessed)
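One note on Example #2: casting a timestamp to LongType yields seconds since the Unix epoch, so the `duration` column above is expressed in seconds. A tiny illustration (the column names here are hypothetical):

from pyspark.sql import functions as func
from pyspark.sql.types import LongType

# difference of two timestamp columns, in whole seconds
df = df.withColumn(
    "duration_seconds",
    func.col("end_ts").cast(LongType()) - func.col("start_ts").cast(LongType()))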
Example #3
def processItemSequence1(
        spark, rawSampleDataPath):  # refactor above code without using UDF
    # rating data
    customStruct = StructType([
        StructField("userId", StringType()),
        StructField("movieId", StringType()),
        StructField("rating", FloatType()),
        StructField("timestamp", IntegerType())
    ])
    ratingSamples = spark.read.format("csv").option(
        "header", "true").schema(customStruct).load(rawSampleDataPath)
    print("ratingSample and schema:")
    ratingSamples.show(5)
    ratingSamples.printSchema()
    userSeq = ratingSamples \
        .where(F.col("rating") >= 3.5) \
        .groupBy("userId") \
        .agg(F.sort_array(F.collect_list(F.struct("timestamp", "movieId"))).alias("sortedTimeAndMovieId")) \
        .withColumn("sortedMovieId", F.col("sortedTimeAndMovieId.movieId")) \
        .drop("sortedTimeAndMovieId")
    print("movieIdStr:")
    flatedMovieList = userSeq.select("sortedMovieId").rdd.flatMap(
        lambda x: x)  # same as rdd.map(lambda x: x[0])
    # flatedMovieList = userSeq.agg(F.collect_list("sortedMovieId")).rdd.flatMap(lambda x: x[0])  # this also works
    print(flatedMovieList.take(10))
    return flatedMovieList
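The `sort_array(collect_list(struct(...)))` idiom in Example #3 sorts the collected structs field by field, so putting `timestamp` first yields the movie IDs in chronological order. A small self-contained check (toy data; a SparkSession named `spark` is assumed):

from pyspark.sql import functions as F

toy = spark.createDataFrame(
    [("u1", 3, "m3"), ("u1", 1, "m1"), ("u1", 2, "m2")],
    ["userId", "timestamp", "movieId"])
seq = toy.groupBy("userId").agg(
    F.sort_array(F.collect_list(F.struct("timestamp", "movieId"))).alias("seq"))
# selecting a field on an array of structs returns an array of that field
seq.select("userId", F.col("seq.movieId").alias("sortedMovieId")).show()
# sortedMovieId for u1 -> [m1, m2, m3]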
Example #4
def benchmark_extract_top_keywords(posts, n_keywords=10):
    """Given TF-IDF output (as "features" column) extracts out the vocabulary index of the
    10 keywords with highest TF-IDF (for each post)."""
    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector),
        ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector),
        ArrayType(DoubleType()))

    posts = posts.withColumn("extracted_keys",
                             extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("extracted_values",
                             extract_values_from_vector_udf("features"))

    posts = posts.withColumn(
        "zipped_truncated",
        slice(
            sort_array(arrays_zip("extracted_values", "extracted_keys"),
                       asc=False), 1, n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows],
                      ArrayType(IntegerType()))
    posts = posts.withColumn("top_indices", take_second("zipped_truncated"))

    return posts
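As a side note on Example #4, the `take_second` UDF could likely be replaced by plain field access on the zipped struct array, avoiding a Python UDF (a sketch, assuming Spark 2.4+ where `arrays_zip` produces an array of structs):

from pyspark.sql import functions as F

# "zipped_truncated" is an array of structs with fields "extracted_values"
# and "extracted_keys"; field access returns the array of key indices.
posts = posts.withColumn("top_indices", F.col("zipped_truncated.extracted_keys"))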
Example #5
def view(df, state_col='_state', updated_col='_updated', hash_col='_hash'):
    """
    Calculate a view from a log of events by performing the following actions:
        - squash the events for each entry record down to the last one
        - remove deleted records from the list
    """

    c = set(df.columns).difference({state_col, updated_col, hash_col})
    colnames = [x for x in df.columns if x in c]

    if updated_col not in df.columns:
        return df

    if state_col not in df.columns:
        return df

    selected_columns = colnames + ['_last.*']
    groupby_columns = colnames

    # groupby hash_col first if available
    if hash_col in df.columns:
        selected_columns = selected_columns + [hash_col]
        groupby_columns = [hash_col] + groupby_columns

    row_groups = df.groupBy(groupby_columns)
    get_sorted_array = F.sort_array(F.collect_list(
        F.struct(F.col(updated_col), F.col(state_col))),
                                    asc=False)
    df_view = row_groups.agg(
        get_sorted_array.getItem(0).alias('_last')).select(*selected_columns)
    df_view = df_view.filter("{} = 0".format(state_col))

    return df_view
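A minimal usage sketch of `view` on a toy event log (default column names; the data and the `spark` session are hypothetical): each key keeps only its most recent event, and keys whose latest `_state` is non-zero (deleted) are dropped.

events = spark.createDataFrame(
    [("a", 1, 0), ("a", 2, 0),   # key "a": updated twice, still present
     ("b", 1, 0), ("b", 2, 1)],  # key "b": the last event marks it deleted
    ["id", "_updated", "_state"])

view(events).show()
# keeps a single row for "a" (the one with _updated = 2); "b" is filtered out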
Example #6
def main(keyspace, outdir, orderkeys):
    result = spark.read.format("org.apache.spark.sql.cassandra") \
    .options(table='orders_parts', keyspace=keyspace).load()
    result = result[result.orderkey.isin(orderkeys)]
    result = result.select(
        result['orderkey'], result['totalprice'],
        functions.sort_array('part_names').alias('part_names')).orderBy(
            result['orderkey'])
    result.rdd.map(output_line).saveAsTextFile(outdir)
Example #7
    def get_word_vec(self):
        data = self.merge_df.groupBy('user_id').agg(
            func.sort_array(func.collect_list(func.struct(func.col('time'), func.col('ad_id'))), asc=True).alias(
                'items'))
        data = data.withColumn("items", func.udf(lambda x: [i[1] for i in x], ArrayType(StringType()))('items'))

        word2Vec = Word2Vec(vectorSize=128, minCount=10, inputCol="items", outputCol="result")
        model = word2Vec.fit(data.repartition(1000))
        return model
Example #8
def invert_index(term_doc_df):
    """
    Inverts indices as defined in the BSBI algorithm using Spark.
    :param term_doc_df:
    :return:
    """
    spark_session = Inverter.create_session('invert_index')
    spark_term_doc_df = spark_session.createDataFrame(term_doc_df)
    output = spark_term_doc_df.groupby('termID').agg(
        functions.collect_list('docID').alias('docID_temp'))
    return output.select(
        "*", functions.sort_array(output["docID_temp"]).alias('docID'))
Example #9
def test_solve_irls_eqn(spark):
    lvl1df = spark.read.parquet(
        f'{data_root}/bt_reduceded_1part.snappy.parquet')
    sample_blocks_df = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet') \
        .withColumn('sample_block', f.col('sample_block').cast('string')) \
        .withColumn('sample_ids', f.expr('transform(sample_ids, v -> cast(v as string))'))
    sample_blocks = {
        r.sample_block: r.sample_ids
        for r in sample_blocks_df.collect()
    }
    with open(f'{data_root}/test_solve_irls_eqn.json') as json_file:
        test_values = json.load(json_file)

    map_key_pattern = ['sample_block', 'label', 'alpha_name']
    reduce_key_pattern = ['header_block', 'header', 'label', 'alpha_name']
    model_key_pattern = ['sample_block', 'label', 'alpha_name']

    map_udf = f.pandas_udf(
        lambda key, pdf:
        map_irls_eqn(key, map_key_pattern, pdf, labeldf, sample_blocks, covdf,
                     beta_cov_dict, maskdf, alphas), irls_eqn_struct,
        PandasUDFType.GROUPED_MAP)

    reduce_udf = f.pandas_udf(
        lambda key, pdf: reduce_irls_eqn(key, reduce_key_pattern, pdf),
        irls_eqn_struct, PandasUDFType.GROUPED_MAP)

    model_udf = f.pandas_udf(
        lambda key, pdf: solve_irls_eqn(key, model_key_pattern, pdf, labeldf,
                                        alphas, covdf), model_struct,
        PandasUDFType.GROUPED_MAP)

    modeldf = lvl1df \
        .withColumn('alpha_name', f.explode(f.array([f.lit(n) for n in alphas.keys()]))) \
        .groupBy(map_key_pattern) \
        .apply(map_udf) \
        .groupBy(reduce_key_pattern) \
        .apply(reduce_udf) \
        .groupBy(model_key_pattern) \
        .apply(model_udf) \
        .withColumn('alpha_label_coef', f.expr('struct(alphas[0] AS alpha, labels[0] AS label, coefficients[0] AS coefficient)')) \
        .groupBy('header_block', 'sample_block', 'header', 'sort_key', f.col('alpha_label_coef.label')) \
        .agg(f.sort_array(f.collect_list('alpha_label_coef')).alias('alphas_labels_coefs')) \
        .selectExpr('*', 'alphas_labels_coefs.alpha AS alphas', 'alphas_labels_coefs.label AS labels', 'alphas_labels_coefs.coefficient AS coefficients') \
        .drop('alphas_labels_coefs', 'label')

    outdf = modeldf.filter(f'sample_block = "{test_sample_block}"') \
        .filter(f.col('labels').getItem(0) == test_label) \
        .orderBy('sort_key', 'header') \
        .toPandas()

    betas_glow = np.row_stack(outdf['coefficients'])

    assert (np.allclose(np.array(test_values['betas']), betas_glow))
Example #10
def create_ground_truth_rankings(df: DataFrame) -> DataFrame:
    assert {"userId", "rating", "movieId"}.issubset(df.columns)

    return (
        df.withColumn("is_relevant", f.expr("if(rating > 3.5, true, false)"))
        .filter(f.col("is_relevant"))
        # for each user, order the relevant movies by highest rated first
        .withColumn("rmp", f.struct(f.col("rating"), f.col("movieId")))
        .groupBy("userId")
        .agg(f.sort_array(f.collect_list("rmp"), asc=False).alias("rating_movie_pairs"))
        .rdd.map(lambda r: (r.userId, [row.movieId for row in r.rating_movie_pairs]))
        .toDF(["userId", "actual_ranking"])
    )
Example #11
def get_rankings(df, gt_col, est_col, query_col='query', id_col='pid'):
    """
    Does grouping, aggregation, and sorting of df to get rankings.
    df - a pyspark dataframe
    gt_col - string, the name of the column containing the ground truth relevance for query-pid pair
    est_col - string, the name of the column containing the estimated relevance for query-pid pair
    query_col - string (default 'query'), the name of the column containing the query or search terms
    id_col - string (default 'pid'), the name of the column containing item id

    returns:
    a pyspark dataframe with a single row per unique query, with columns:
      "query" (string) the query,
      "best_order" (array), list of pids ordered by gt relevance,
      "estimated_order" (array), list of pids ordered by estimated relevance
    """

    sorted_sdf = (
        df.groupBy(query_col)
          .agg(F.sort_array(F.collect_list(F.struct(F.col(gt_col), F.col(id_col))), asc=False).alias('best'),
               F.sort_array(F.collect_list(F.struct(F.col(est_col), F.col(id_col))), asc=False).alias('estimated'))
    )

    return (sorted_sdf
            .select(query_col, 'best.{}'.format(id_col), 'estimated')
            .withColumnRenamed(id_col, 'best_order')
            .select(query_col, 'best_order', 'estimated.{}'.format(id_col))
            .withColumnRenamed(id_col, 'estimated_order'))
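A small usage sketch for Example #11 (toy data; a `spark` session is assumed), showing the shape of the result: one row per query, with `best_order` and `estimated_order` arrays of pids, most relevant first.

toy = spark.createDataFrame(
    [("shoes", "p1", 3.0, 0.2),
     ("shoes", "p2", 1.0, 0.9),
     ("shoes", "p3", 2.0, 0.5)],
    ["query", "pid", "gt_rel", "est_rel"])

get_rankings(toy, gt_col="gt_rel", est_col="est_rel").show(truncate=False)
# best_order      -> [p1, p3, p2]   (sorted by gt_rel, descending)
# estimated_order -> [p2, p3, p1]   (sorted by est_rel, descending)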
Example #12
def dataframe_eventsource_view(df, state_col='_state', updated_col='_updated'):

    # calculate a view by:
    #   - squashing the events for each entry record down to the last one
    #   - removing deleted records from the list

    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    row_groups = df.groupBy(colnames)
    df_view = (row_groups
               .agg(F.sort_array(F.collect_list(F.struct(F.col(updated_col), F.col(state_col))),
                                 asc=False).getItem(0).alias('_last'))
               .select(*colnames, '_last.*'))
    df = df_view.filter("{} = 0".format(state_col))

    return df
Example #13
def genarate_recency_feature(df, event, snapshot_date):

    df_grouped = df.filter(F.col("event") == event).groupBy("uid").agg(
        F.collect_set("event").alias("event"),
        F.collect_list("date").alias("recent"))
    df_sorted = df_grouped.withColumn("recent", F.sort_array("recent",
                                                             asc=False))
    # spark.sql('set spark.sql.caseSensitive=true')
    # most recent date per uid = first element of the descending-sorted array
    df_last_time = df_sorted.selectExpr("uid", "recent[0] as last_date")

    df_recency_feature = df_last_time.withColumn(
        'rec_' + event,
        F.datediff(F.to_date(F.lit(snapshot_date)), F.col("last_date")))
    df_recency_feature = df_recency_feature.drop("last_date")

    return df_recency_feature
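Since Example #13 only needs the most recent date per uid, the collect/sort/take-first steps could arguably be collapsed into a single `F.max` aggregate; a sketch under that assumption (same `df`, `event` and `snapshot_date` as above):

from pyspark.sql import functions as F

# latest date of the given event per uid, without collecting an array
df_last_time = (df.filter(F.col("event") == event)
                  .groupBy("uid")
                  .agg(F.max("date").alias("last_date")))

df_recency_feature = df_last_time.withColumn(
    "rec_" + event,
    F.datediff(F.to_date(F.lit(snapshot_date)), F.col("last_date")))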
Example #14
def get_artists(table):
    """ Get artist information (artist_name, artist_msid etc) for every user
        ordered by listen count

        Args:
            table (str): name of the temporary table.

        Returns:
            iterator (iter): an iterator over result
                    {
                        user1: [{
                            'artist_name': str,
                            'artist_msid': str,
                            'artist_mbids': list(str),
                            'listen_count': int
                        }],
                        user2: [{...}],
                    }
    """

    result = run_query("""
              WITH intermediate_table as (
                SELECT user_name
                     , artist_name
                     , CASE
                         WHEN cardinality(artist_mbids) > 0 THEN NULL
                         ELSE nullif(artist_msid, '')
                       END as artist_msid
                     , artist_mbids
                  FROM {table}
              )
            SELECT *
                 , count(*) as listen_count
              FROM intermediate_table
          GROUP BY user_name
                 , artist_name
                 , artist_msid
                 , artist_mbids
            """.format(table=table))

    iterator = result \
        .withColumn("artists", struct("listen_count", "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("artists"), asc=False).alias("artists")) \
        .toLocalIterator()

    return iterator
Example #15
def calculate_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the "time_range" table.
    The time ranges are as follows:
        1) week - each day (with weekday name) of the past 2 weeks.
        2) month - each day of the past 2 months.
        3) year - each month of the past 2 years.
        4) all_time - each year starting from LAST_FM_FOUNDING_YEAR (2002)
    """
    # Calculate the number of listens in each time range for each user except the time ranges which have zero listens.
    result_without_zero_days = run_query("""
            SELECT listens.user_name
                 , time_range.time_range
                 , count(listens.user_name) as listen_count
              FROM listens
              JOIN time_range
                ON listens.listened_at >= time_range.start
               AND listens.listened_at <= time_range.end
          GROUP BY listens.user_name
                 , time_range.time_range
            """)
    result_without_zero_days.createOrReplaceTempView(
        "result_without_zero_days")

    # Add the time ranges which have zero listens to the previous dataframe
    result = run_query("""
            SELECT dist_user_name.user_name
                 , time_range.time_range
                 , to_unix_timestamp(time_range.start) as from_ts
                 , to_unix_timestamp(time_range.end) as to_ts
                 , ifnull(result_without_zero_days.listen_count, 0) as listen_count
              FROM (SELECT DISTINCT user_name FROM listens) dist_user_name
        CROSS JOIN time_range
         LEFT JOIN result_without_zero_days
                ON result_without_zero_days.user_name = dist_user_name.user_name
               AND result_without_zero_days.time_range = time_range.time_range
            """)

    # Create a table with a list of time ranges and corresponding listen count for each user
    iterator = result \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    return iterator
Example #16
def calculate_daily_activity():
    """ Calculate number of listens for each user in each hour. """

    # Generate a dataframe containing hours of all days of the week
    weekdays = [calendar.day_name[day] for day in range(0, 7)]
    hours = [hour for hour in range(0, 24)]
    time_range = itertools.product(weekdays, hours)
    time_range_df = listenbrainz_spark.session.createDataFrame(
        time_range, schema=["day", "hour"])
    time_range_df.createOrReplaceTempView("time_range")

    # Truncate listened_at to day and hour to improve matching speed
    formatted_listens = run_query("""
                            SELECT user_id
                                 , date_format(listened_at, 'EEEE') as day
                                 , date_format(listened_at, 'H') as hour
                              FROM listens
                              """)

    formatted_listens.createOrReplaceTempView("listens")

    # Calculate the number of listens in each time range for each user except the time ranges which have zero listens.
    result = run_query("""
                SELECT listens.user_id
                     , time_range.day
                     , time_range.hour
                     , count(*) as listen_count
                  FROM listens
                  JOIN time_range
                    ON listens.day == time_range.day
                   AND listens.hour == time_range.hour
              GROUP BY listens.user_id
                     , time_range.day
                     , time_range.hour
                  """)

    # Create a table with a list of time ranges and corresponding listen count for each user
    iterator = result \
        .withColumn("daily_activity", struct("hour", "day", "listen_count")) \
        .groupBy("user_id") \
        .agg(sort_array(collect_list("daily_activity")).alias("daily_activity")) \
        .toLocalIterator()

    return iterator
Example #17
def view(df,
         state_col='_state',
         updated_col='_updated',
         merge_on=None,
         version=None):
    """
    Calculate a view from a log of events by performing the following actions:
        - squash the events for each entry record down to the last one
        - remove deleted records from the list
    """

    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    if updated_col not in df.columns:
        df = add_update_column(df, '_updated')

    if state_col not in df.columns:
        df = df.withColumn('_state', F.lit(0))

    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]

    groupby_columns = [c for c in on if c in colnames]

    # filter by version
    df = filter_by_version(df, '_updated', version)

    # group by
    row_groups = df.select(*groupby_columns,
                           '_updated').groupBy(groupby_columns)

    # sort each group by descending _updated
    get_sorted_array = F.sort_array(F.collect_list(F.col('_updated')),
                                    asc=False)

    # get the selected rows (only group columns and _updated)
    df_view = row_groups.agg(get_sorted_array.getItem(0).alias('_updated'))

    # get all original columns by join
    df_res = df.join(df_view, on=[*groupby_columns, '_updated'])

    # remove deleted records and _updated/_state columns
    return df_res.filter('_state = 0').select(*colnames)
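An alternative to the collect-and-join-back pattern in Example #17 is a window function that keeps the most recent row of each group directly; a sketch under the same column conventions:

from pyspark.sql import Window
from pyspark.sql import functions as F

def view_with_window(df, groupby_columns, state_col='_state', updated_col='_updated'):
    # rank the rows of each group by descending _updated and keep the newest one
    w = Window.partitionBy(*groupby_columns).orderBy(F.col(updated_col).desc())
    return (df.withColumn('_rn', F.row_number().over(w))
              .filter(F.col('_rn') == 1)
              .filter(F.col(state_col) == 0)
              .drop('_rn', state_col, updated_col))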
Example #18
    def build_dataset(self):
        logger.info('doing')
        path = self.path_root + '/feature-pandas.json'
        configs = demjson.decode(open(path, 'r').read())

        aggs = []
        pass_keys = []  # ['time','advertiser_id','creative_id','weekday','product_id','industry']
        for config in configs['feature1']['stat_cols2']:
            col = config['col']
            col_name = config['col_name']
            agg_func = config['agg_func']
            if col in pass_keys:
                continue
            if agg_func == 'count':
                aggs.append(func.count(col).alias(col_name))
            elif agg_func == 'unique':
                aggs.append(func.countDistinct(col).alias(col_name))
            elif agg_func == 'set':
                aggs.append(func.collect_set(col).alias(col_name))
            elif agg_func == 'list':
                aggs.append(
                    func.sort_array(func.collect_list(func.struct(func.col('time'), func.col(col))), asc=True).alias(
                        col_name))
            else:
                print(config)
        # print(aggs)
        df = self.merge_df.groupBy(configs['feature1']['group_key']).agg(*aggs)
        for config in configs['feature1']['stat_cols2']:

            col = config['col']
            col_name = config['col_name']
            agg_func = config['agg_func']
            if col in pass_keys:
                continue
            if agg_func == 'list':
                df = df.withColumn(col_name, func.udf(lambda x: [i[1] for i in x], ArrayType(StringType()))(col_name))
            if agg_func in ('list', 'set'):
                df = df.withColumn(col_name, func.udf(lambda x: list(list(x)[-20:] + ['0'] * (20 - len(list(x)))),
                                                      ArrayType(StringType()))(col_name))
        df = df.join(self.user_df, on='user_id')
        self.dataframe = df.cache()
        logger.info('done')
Example #19
def get_variants_df(df, parameters=None):
    """Gets variants dataframe from the Spark dataframe
    """
    if parameters is None:
        parameters = {}

    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY

    df = df.select(case_id_glue, activity_key)
    grouped_df = df.withColumn("@@id", F.monotonically_increasing_id())\
        .groupBy(case_id_glue)\
        .agg(F.collect_list(F.struct("@@id", activity_key)).alias("variant"))\
        .select(case_id_glue, F.sort_array("variant").getItem(activity_key).alias("variant"))
    grouped_df = grouped_df.withColumn("variant", F.concat_ws(",", "variant"))

    return grouped_df
Example #20
def extract_top_keywords(posts, vocabulary, n_keywords=10):
    """Given word count (Count Vectorizer) output (as "features" column) -
    extracts out the vocabulary index of the 10 keywords with highest TF-IDF (for each post)."""
    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector),
        ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector),
        ArrayType(DoubleType()))

    idf_udf = array_transform(idf_wiki)
    vocab_dict = {k: v for k, v in enumerate(vocabulary)}

    def ix_to_word(ix):
        return vocab_dict[ix]

    vocab_udf = array_transform(ix_to_word)

    posts = posts.withColumn("word_ix",
                             extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("word_count",
                             extract_values_from_vector_udf("features"))
    posts = posts.withColumn('words', vocab_udf(col('word_ix')))
    posts = posts.withColumn("idf", idf_udf(col("words")))
    posts = posts.withColumn(
        "zipped_truncated",
        slice(sort_array(arrays_zip("idf", "words"), asc=False), 1,
              n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows],
                      ArrayType(StringType()))
    posts = posts.withColumn("top_keywords", take_second("zipped_truncated"))

    return posts['CreationDate', 'top_keywords', 'Tags', 'ParentId']
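Example #20 relies on an `array_transform` helper (and an `idf_wiki` lookup) that are not shown. A plausible minimal sketch of such a helper, assuming it simply wraps a Python function into a UDF mapped over every element of an array column (the element type would be DoubleType for the idf case):

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def array_transform(fn, element_type=StringType()):
    # hypothetical helper: returns a UDF applying fn to each element of an array column
    return udf(lambda arr: [fn(x) for x in arr] if arr is not None else None,
               ArrayType(element_type))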
Example #21
def get_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the 'time_range' table """
    # Calculate the number of listens in each time range for each user except the time ranges which have zero listens.
    result_without_zero_days = run_query("""
            SELECT listens.user_name
                 , time_range.time_range
                 , count(listens.user_name) as listen_count
              FROM listens
              JOIN time_range
                ON listens.listened_at >= time_range.start
               AND listens.listened_at <= time_range.end
          GROUP BY listens.user_name
                 , time_range.time_range
            """)
    result_without_zero_days.createOrReplaceTempView('result_without_zero_days')

    # Add the time ranges which have zero listens to the previous dataframe
    result = run_query("""
            SELECT dist_user_name.user_name
                 , time_range.time_range
                 , to_unix_timestamp(time_range.start) as from_ts
                 , to_unix_timestamp(time_range.end) as to_ts
                 , ifnull(result_without_zero_days.listen_count, 0) as listen_count
              FROM (SELECT DISTINCT user_name FROM listens) dist_user_name
        CROSS JOIN time_range
         LEFT JOIN result_without_zero_days
                ON result_without_zero_days.user_name = dist_user_name.user_name
               AND result_without_zero_days.time_range = time_range.time_range
            """)

    # Create a table with a list of time ranges and corresponding listen count for each user
    iterator = result \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    return iterator
Example #22
def get_listening_activity_all_time() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for an user in each year starting from LAST_FM_FOUNDING_YEAR (2002). """
    logger.debug("Calculating listening_activity_all_time")

    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    result_without_zero_years = None
    for year in range(from_date.year, to_date.year+1):
        year_start = datetime(year, 1, 1)
        year_end = get_year_end(year)
        try:
            _get_listens(year_start, year_end)
        except HDFSException:
            # Skip if no listens present in df
            continue
        year_df = run_query("""
                    SELECT user_name,
                           count(user_name) as listen_count
                      FROM listens
                  GROUP BY user_name
                  """)
        year_df = year_df.withColumn('time_range', lit(str(year))).withColumn(
            'from_ts', lit(year_start.timestamp())).withColumn('to_ts', lit(year_end.timestamp()))
        result_without_zero_years = result_without_zero_years.union(year_df) if result_without_zero_years else year_df

    # Create a table with a list of time ranges and corresponding listen count for each user
    data = result_without_zero_years \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    messages = create_messages(data=data, stats_range='all_time', from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")

    return messages
Example #23
def get_artists(table: str, date_format: str, use_mapping: bool):
    """ Get artist information (artist_name, artist_msid etc) for every time range specified
        the "time_range" table ordered by listen count

        Args:
            table: Name of the temporary table.
            date_format: Format in which the listened_at field should be formatted.
            use_mapping: Flag to optionally use the mapping step to improve the results

        Returns:
            iterator (iter): An iterator over result
    """

    # Format the listened_at field according to the provided date_format string
    formatted_listens = run_query("""
                SELECT artist_name
                     , artist_msid
                     , artist_mbids
                     , date_format(listened_at, '{date_format}') as listened_at
                  FROM {table}
            """.format(table=table, date_format=date_format))
    formatted_listens.createOrReplaceTempView('listens')

    # Use MSID-MBID mapping to improve results
    if use_mapping:
        mapped_df = _create_mapped_dataframe()
        mapped_df.createOrReplaceTempView('listens')

    # If not using MSID-MBID mapping, run this step to neglect MSID when MBID is present
    if not use_mapping:
        run_query("""
            SELECT artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
                 , listened_at
              FROM listens
                  """).createOrReplaceTempView('listens')

    result = run_query("""
                SELECT listens.artist_name
                     , listens.artist_msid
                     , listens.artist_mbids
                     , time_range.time_range
                     , time_range.from_ts
                     , time_range.to_ts
                     , count(*) as listen_count
                  FROM listens
                  JOIN time_range
                    ON listens.listened_at == time_range.time_range
              GROUP BY listens.artist_name
                     , listens.artist_msid
                     , listens.artist_mbids
                     , time_range.time_range
                     , time_range.from_ts
                     , time_range.to_ts
              """)

    iterator = result \
        .withColumn("artists", struct("listen_count", "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("time_range", "from_ts", "to_ts") \
        .agg(sort_array(collect_list("artists"), asc=False).alias("artists")) \
        .toLocalIterator()

    return iterator
Example #24
        .agg(   avg("Amount").alias("AverageTransaction"),
                sum("Amount").alias("TotalTransaction"),
                count("Amount").alias("NumberOfTransaction"),
                max("Amount").alias("MaxTransaction"),
                min("Amount").alias("MinTransaction"),
                collect_set("Description").alias("UniqueTransactionDescriptions")
        )

    aggFinanceDf.show(5, False)

    aggFinanceDf\
        .select(
                        "AccountNumber",
                        "UniqueTransactionDescriptions",
                        size("UniqueTransactionDescriptions").alias("CountOfUniqueTransactionTypes"),
                        sort_array("UniqueTransactionDescriptions", False).alias("OrderedUniqueTransactionDescriptions"),
                        array_contains("UniqueTransactionDescriptions", "Movies").alias("WentToMovies")
        )\
        .show(5,False)

    companiesJson = [
        """{"company":"NewCo","employees":[{"firstName":"Sidhartha","lastName":"Ray"},{"firstName":"Pratik","lastName":"Solanki"}]}""",
        """{"company":"FamilyCo","employees":[{"firstName":"Jiten","lastName":"Pupta"},{"firstName":"Pallavi","lastName":"Gupta"}]}""",
        """{"company":"OldCo","employees":[{"firstName":"Vivek","lastName":"Garg"},{"firstName":"Nitin","lastName":"Gupta"}]}""",
        """{"company":"ClosedCo","employees":[]}"""
    ]

    companiesRDD = sparkSession.sparkContext.parallelize(companiesJson)
    companiesDF = sparkSession.read.json(companiesRDD)

    companiesDF.show(5, False)
Example #25
    vertices = sqlContext.createDataFrame([(user, )
                                           for user in user_in_edge_list],
                                          ["id"])
    edges = sqlContext.createDataFrame([tup[0] for tup in edge_list.collect()] \
                                       , ["src", "dst"])

    g = GraphFrame(vertices, edges)

    ## Label Propagation Algorithm
    result = g.labelPropagation(maxIter=5)

    import pyspark.sql.functions as fc
    from pyspark.sql.functions import size

    result = result.groupBy("label").agg(
        fc.sort_array(fc.collect_list("id")).alias("collected"))
    result = result.withColumn("len_collected", size("collected"))
    result = result.orderBy(col("len_collected"), col("collected"))

    answer = result.toPandas()["collected"]

    f = open(output_file, "w")
    for community in answer:
        line = ""
        for node in community:
            line += "'" + str(node) + "', "
        line = line[:-2]
        f.write(line)
        f.write("\n")
    f.close()
Example #26
def collect_orderby(sdf,
                    cols,
                    groupby,
                    orderby=None,
                    suffix='__collect',
                    sep='____',
                    orderby_func={},
                    dtype=StringType(),
                    ascending=True,
                    drop_null=True):
    # For now, null filling is not handled; when orderby is None, null values are dropped
    """
    Paramter:
    ----------
    sdf: pyspark dataframe to be processed
    cols: str/list of the sdf'cols to be processed
    groupby: str/list of sdf' cols to be groupbyed when collect_orderby
    orderby: str/list of sdf' cols to be orderbyed when collect_orderby
    suffix: str of cols' names converted bycollect_orderby(renamed by cols+suffix)
    sep: str of the sep when concat_ws(don't change by default)
    dtype: pyspark.sql.types of the return values
    Return:
    ----------
    sdf: pyspark dataframe of collect_list orderby
    Example:
    ----------
    sdf=collect_orderby(sdf,cols,groupby='user_id',orderby='time')
    """
    # cols:需collect_list项
    # groupby:为空时可传入[]
    # orderby:必为string、int、float项(也可有int,float型)
    assert not orderby_func or orderby
    orderby_agg_func = []
    orderby_agg_cols = []
    orderby_copy_cols_dict, orderby_recover_cols_dict = {}, {
    }  # used to unify column names when a non-string orderby column is also collected
    if not isinstance(cols, list):
        cols = [cols]
    if not isinstance(groupby, list):
        groupby = [groupby]
    if orderby is None:
        orderby = []
        orderby_func = {}
    if not isinstance(orderby, list):
        orderby = [orderby]
    # If an orderby column is also being collected and is not a string, copy it first; otherwise 1 would become '01'
    for i, c in enumerate(orderby):
        if c in cols and dict(sdf.select(orderby).dtypes)[c] != 'string':
            c_orderby = f"{c}{sep}orderby"
            sdf = sdf.withColumn(c_orderby, F.col(c))
            orderby[i] = c_orderby
            orderby_copy_cols_dict[c_orderby] = c
    if not isinstance(orderby_func, dict):
        if not isinstance(orderby_func, list):
            orderby_func = [orderby_func]
        orderby_func = dict(zip(orderby, [orderby_func] * len(orderby)))
    if not drop_null:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1]
                if len(i.split(sep)) > 1 else None  # this breaks if the original value itself contains sep!!!!
                for i in x
            ],
            ArrayType(dtype))
    else:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1]  # this breaks if the original value itself contains sep!!!!
                for i in x if len(i.split(sep)) > 1
            ],
            ArrayType(dtype))
    for c in [
            k for k, v in sdf.dtypes if k in cols
            and len(re.findall(re.compile(r'^(array|vector)'), v)) > 0
    ]:
        logstr(f'casting {c} to StringType')
        sdf = sdf.withColumn(c, cast2str_udf(c))  # non-conforming columns are first cast to StringType()
    logstr('orderby', orderby)
    if len(orderby) != 0:
        # handle orderby_func
        for c, f_list in orderby_func.items():
            if not isinstance(f_list, list):
                f_list = [f_list]
            for i, f in enumerate(f_list):
                if c not in orderby:
                    continue
                if isinstance(f, str):
                    f = f_list[i] = eval(f"F.{f}")
                key = f"{c}{sep}{f.__name__}"
                orderby_agg_func.append(f(c).alias(key))
                orderby_agg_cols.append(key)
                if c in orderby_copy_cols_dict:
                    orderby_recover_cols_dict[
                        key] = f"{orderby_copy_cols_dict[c]}{sep}{f.__name__}"
        # handle non-string orderby columns
        order_int_list = [
            k for k, v in sdf.dtypes
            if k in orderby and len(re.findall(re.compile(r'(int)'), v)) > 0
        ]
        order_float_list = [
            k for k, v in sdf.dtypes if k in orderby
            and len(re.findall(re.compile(r'(float|double)'), v)) > 0
        ]
        if order_int_list:
            logstr('order_int_list', order_int_list)
            order_int_max_df = sdf.select(order_int_list).agg(
                *[F.max(c).alias(c) for c in order_int_list]).toPandas()
            order_int_max_dict = dict(
                zip(order_int_max_df.keys(),
                    order_int_max_df.values.flatten().tolist()))
            logstr('order_int_max_dict', order_int_max_dict)
            for c in order_int_list:
                sdf = sdf.withColumn(
                    c,
                    F.lpad(
                        F.col(c).cast(StringType()),
                        len(str(order_int_max_dict[c])), '0'))
        if order_float_list:
            logstr('order_float_list', order_float_list)
            for c in order_float_list:
                sdf = sdf.withColumn(c, F.col(c).cast(StringType()))
                max_df = sdf.select(F.split(c, r"\.").alias(c)).select([
                    F.length(F.col(c)[i]).alias(c + f"__{i}") for i in range(2)
                ]).agg(*[
                    F.max(c + f"__{i}").alias(c + f"__{i}") for i in range(2)
                ]).toPandas()
                max_dict = dict(
                    zip(max_df.keys(),
                        max_df.values.flatten().tolist()))
                logstr('max_dict', max_dict)
                sdf = sdf.withColumn(
                    c,
                    F.lpad(
                        F.col(c).cast(StringType()), max_dict[c + "__0"],
                        '0')).withColumn(
                            c,
                            F.rpad(
                                F.col(c).cast(StringType()),
                                max_dict[c + "__1"], '0'))
        agg_fun_list = [
            F.sort_array(F.collect_list(f"%s{sep}{c}" % '_'.join(orderby)),
                         asc=ascending).alias(c + '_temp') for c in cols
        ]
        # null handling here is still not ideal: null values come out like ['a', , 'b']
        sdf = sdf.select([
            F.concat_ws(sep, *orderby, c).alias(f"%s{sep}{c}" %
                                                '_'.join(orderby))
            for c in cols
        ] + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select(
            [split_udf(c + '_temp').alias(c + suffix)
             for c in cols] + orderby_agg_cols + groupby)
    else:
        agg_fun_list = [F.collect_list(c).alias(c + '_temp') for c in cols]
        sdf = sdf.select(cols + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select([
            F.col(c + '_temp').cast(ArrayType(dtype)).alias(c + suffix)
            for c in cols
        ] + orderby_agg_cols + groupby)
    for c1, c2 in orderby_recover_cols_dict.items():
        sdf = sdf.withColumnRenamed(c1, c2)
    return sdf
Example #27
def compare(experiment_core_parquet, stats_input_parquet):
    conf = SparkConf().setAll([
        ("spark.sql.sources.partitionColumnTypeInference.enabled", "false"),
        ("spark.driver.memory", "5g"),
    ])
    spark = (SparkSession.builder.appName(
        "IMPC_COMPARE_STATS_LOADER_EXP_CORE").config(conf=conf).getOrCreate())
    experiment_core_df = (spark.read.parquet(experiment_core_parquet).select(
        CSV_FIELDS).withColumn(
            "metadata",
            when(size(col("metadata")) == 0,
                 lit(None)).otherwise(sort_array(col("metadata"))),
        ).withColumn(
            "allele_accession_id",
            when(col("allele_accession_id").like("%NULL%"),
                 lit(None)).otherwise(col("allele_accession_id")),
        ))
    stats_input_df = (
        spark.read.parquet(stats_input_parquet).select(CSV_FIELDS).withColumn(
            "litter_id",
            when(col("litter_id").isNull(),
                 lit("")).otherwise(col("litter_id")),
        ))
    for col_name in [
            "weight",
            "weight_date",
            "weight_days_old",
            "weight_parameter_stable_id",
            "experiment_source_id",
            "data_point",
            "strain_name",
            "genetic_background",
            "strain_accession_id",
            "datasource_name",
            "project_name",
    ]:
        experiment_core_df = experiment_core_df.drop(col_name)
        stats_input_df = stats_input_df.drop(col_name)

    experiment_core_df = experiment_core_df.where(
        col("parameter_stable_id") != "IMPC_EYE_092_001").where(
            col("age_in_days") > 0)
    stats_input_df = stats_input_df.where(
        col("parameter_stable_id") != "IMPC_EYE_092_001").where(
            col("age_in_days") > 0)

    diff_df = experiment_core_df.exceptAll(stats_input_df)
    diff_df = diff_df.alias("experiment_core")
    stats_input_df = stats_input_df.alias("etl")
    compare_df = diff_df.join(
        stats_input_df,
        (stats_input_df.external_sample_id == diff_df.external_sample_id)
        & (stats_input_df.parameter_stable_id == diff_df.parameter_stable_id)
        & (stats_input_df.date_of_experiment == diff_df.date_of_experiment),
    )
    output_cols = []
    for column in experiment_core_df.columns:
        output_cols.append("experiment_core." + column)
        output_cols.append("etl." + column)
    # compare_df.select(output_cols).show(vertical=True, truncate=False)
    compare_df.where(
        col("experiment_core.metadata_group") != col("etl.metadata_group")
    ).select(output_cols).show(vertical=True, truncate=False)
    print(experiment_core_df.count())
    print(stats_input_df.count())
    print(diff_df.count())
Example #28
model = mh.fit(schemaFps)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
#model_transform = model.transform(schemaFps)


# model_transform.show()
# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
print("Approximately joining dfA and dfB on distance smaller than 0.3:")
# print(model_transform)
mol_similarity = model.approxSimilarityJoin(schemaFps, schemaFps, 1 - SIMILARITY_THRESHOLD, distCol="JaccardDistance") \
    .select(f.col("datasetA.id").alias("source_id"),
            f.col("datasetB.id").alias("target_id"),
            f.col("JaccardDistance"))

#TO READD
if DEBUG:
    combinations_sim = mol_similarity.groupby(mol_similarity.source_id).agg(f.sort_array(f.collect_set("target_id")).alias("nbrs"))
else:
    combinations_sim = mol_similarity.groupby(mol_similarity.source_id).agg(f.collect_set("target_id").alias("nbrs"))

#combinations_sim = combinations_sim.withColumn("nbr_count", f.size("nbrs"))

if DEBUG:
    combinations_sim.orderBy("source_id").show(20, False)
combinations_sim.show(10)
Example #29
def generate_metadata_group(
    experiment_specimen_df: DataFrame,
    impress_df: DataFrame,
    exp_type="experiment",
) -> DataFrame:
    """
    Takes in an Experiment-Specimen DataFrame and the IMPReSS dataframe,
    and generates a hash value with the parameters marked as 'isImportant' on IMPReSS.
    This hash is used to identify experiments that are comparable (i.e. share the same experimental conditions).
    """

    # Explode the experiments by procedureMetadata so each row contains data for each procedureMetadata value
    experiment_metadata = experiment_specimen_df.withColumn(
        "procedureMetadata", explode("procedureMetadata"))

    # Filter the IMPReSS to leave only those that generate a metadata split: isImportant = True
    impress_df_required = impress_df.where(
        (col("parameter.isImportant") == True)
        & (col("parameter.type") == "procedureMetadata"))

    # Join the experiment DF with the IMPReSS DF
    experiment_metadata = experiment_metadata.join(
        impress_df_required,
        ((experiment_metadata["_pipeline"]
          == impress_df_required["pipelineKey"])
         & (experiment_metadata["_procedureID"]
            == impress_df_required["procedure.procedureKey"])
         & (experiment_metadata["procedureMetadata._parameterID"]
            == impress_df_required["parameter.parameterKey"])),
    )

    # Create a new column by concatenating the parameter name and the parameter value
    experiment_metadata = experiment_metadata.withColumn(
        "metadataItem",
        when(
            col("procedureMetadata.value").isNotNull(),
            concat(col("parameter.name"), lit(" = "),
                   col("procedureMetadata.value")),
        ).otherwise(concat(col("parameter.name"), lit(" = "), lit("null"))),
    )

    # Select the right column name for production and phenotyping centre depending on experiment type
    if exp_type == "experiment":
        production_centre_col = "_productionCentre"
        phenotyping_centre_col = "_phenotypingCentre"
    else:
        production_centre_col = "production_centre"
        phenotyping_centre_col = "phenotyping_centre"

    # Create a window for the DataFrame over experiment id, production and phenotyping centre
    window = Window.partitionBy(
        "unique_id", production_centre_col,
        phenotyping_centre_col).orderBy("parameter.name")

    # Use the window to create, for every experiment, an array containing the set of "parameter = value" pairs.
    experiment_metadata_input = experiment_metadata.withColumn(
        "metadataItems",
        collect_set(col("metadataItem")).over(window))

    # Add the production centre to the metadata group when it is different from the phenotyping centre.
    # This is because, in that case, we want to generate a metadata split among specimens
    # that have been produced and phenotyped at the same centre.
    experiment_metadata_input = experiment_metadata_input.withColumn(
        "metadataItems",
        when(
            (col(production_centre_col).isNotNull())
            & (col(production_centre_col) != col(phenotyping_centre_col)),
            array_union(
                col("metadataItems"),
                array(
                    concat(lit("ProductionCenter = "),
                           col(production_centre_col))),
            ),
        ).otherwise(col("metadataItems")),
    )

    # Create a string with the concatenation of the metadata items "parameter = value" separated by '::'.
    experiment_metadata = experiment_metadata_input.groupBy(
        "unique_id", production_centre_col, phenotyping_centre_col).agg(
            concat_ws("::", sort_array(max(
                col("metadataItems")))).alias("metadataGroupList"))

    # Hash the list to generate a metadata group identifier.
    experiment_metadata = experiment_metadata.withColumn(
        "metadataGroup", md5(col("metadataGroupList")))

    # Select the experiment IDs and the metadata group IDs
    experiment_metadata = experiment_metadata.select("unique_id",
                                                     "metadataGroup")

    # Join the original experiment DataFrame with the result of the metadata group generation
    experiment_specimen_df = experiment_specimen_df.join(
        experiment_metadata, "unique_id", "left_outer")

    # Add the hashed version of an empty string to those rows without a metadata group.
    experiment_specimen_df = experiment_specimen_df.withColumn(
        "metadataGroup",
        when(experiment_specimen_df["metadataGroup"].isNull(),
             md5(lit(""))).otherwise(experiment_specimen_df["metadataGroup"]),
    )
    return experiment_specimen_df
Example #30
def generate_metadata(
    experiment_specimen_df: DataFrame,
    impress_df: DataFrame,
    exp_type="experiment",
) -> DataFrame:
    """
    Takes in a DataFrame with experiment and specimen data, and the IMPReSS information DataFrame.
    For every experiment it generates a list of "parameter = value" pairs containing
    every metadata parameter for that experiment.
    """

    # Explodes the experiment DF so every row represents a procedureMetadata value
    experiment_metadata = experiment_specimen_df.withColumn(
        "procedureMetadata", explode("procedureMetadata"))

    # Filters the IMPReSS DF so it only contains metadata parameters
    impress_df_required = impress_df.where(
        (col("parameter.type") == "procedureMetadata"))

    # Joins the experiment metadata values with the IMPReSS DF.
    experiment_metadata = experiment_metadata.join(
        impress_df_required,
        ((experiment_metadata["_pipeline"]
          == impress_df_required["pipelineKey"])
         & (experiment_metadata["_procedureID"]
            == impress_df_required["procedure.procedureKey"])
         & (experiment_metadata["procedureMetadata._parameterID"]
            == impress_df_required["parameter.parameterKey"])),
    )

    # Some experimenter IDs need to be replaced on the metadata values
    process_experimenter_id_udf = udf(_process_experimenter_id, StringType())
    experiment_metadata = experiment_metadata.withColumn(
        "experimenterIdMetadata",
        when(
            lower(col("parameter.name")).contains("experimenter"),
            process_experimenter_id_udf("procedureMetadata"),
        ).otherwise(lit(None)),
    )
    experiment_metadata = experiment_metadata.withColumn(
        "procedureMetadata.value",
        when(
            col("experimenterIdMetadata").isNotNull(),
            col("experimenterIdMetadata")).otherwise(
                col("procedureMetadata.value")),
    )

    # Create the "parameter = value" pairs, if there is not value: "parameter = null"
    experiment_metadata = experiment_metadata.withColumn(
        "metadataItem",
        concat(
            col("parameter.name"),
            lit(" = "),
            when(
                col("procedureMetadata.value").isNotNull(),
                col("procedureMetadata.value"),
            ).otherwise(lit("null")),
        ),
    )

    # Select the right column name for production and phenotyping centre depending on experiment type
    if exp_type == "experiment":
        production_centre_col = "_productionCentre"
        phenotyping_centre_col = "_phenotypingCentre"
    else:
        production_centre_col = "production_centre"
        phenotyping_centre_col = "phenotyping_centre"

    # Group by the experiment unique_id, phenotyping centre and production centre
    # And create an array containing all the metadata pairs
    experiment_metadata = experiment_metadata.groupBy(
        "unique_id", production_centre_col, phenotyping_centre_col).agg(
            sort_array(collect_set(col("metadataItem"))).alias("metadata"))

    # Append the phenotyping centre value to all the
    # experiments where the phenotyping centre !=  the production centre
    experiment_metadata = experiment_metadata.withColumn(
        "metadata",
        when(
            (col(production_centre_col).isNotNull())
            & (col(production_centre_col) != col(phenotyping_centre_col)),
            udf(_append_phenotyping_centre_to_metadata,
                ArrayType(StringType()))(col("metadata"),
                                         col(production_centre_col)),
        ).otherwise(col("metadata")),
    )

    # Join the original experiment DF with the resulting experiment metadata DF
    experiment_specimen_df = experiment_specimen_df.join(
        experiment_metadata, "unique_id", "left_outer")
    return experiment_specimen_df