Example #1
    def test_nested_higher_order_function(self):
        # SPARK-35382: lambda vars must be resolved properly in nested higher order functions
        from pyspark.sql.functions import flatten, struct, transform

        df = self.spark.sql(
            "SELECT array(1, 2, 3) as numbers, array('a', 'b', 'c') as letters"
        )

        actual = df.select(
            flatten(
                transform(
                    "numbers",
                    lambda number: transform(
                        "letters", lambda letter: struct(
                            number.alias("n"), letter.alias("l"))),
                ))).first()[0]

        expected = [
            (1, "a"),
            (1, "b"),
            (1, "c"),
            (2, "a"),
            (2, "b"),
            (2, "c"),
            (3, "a"),
            (3, "b"),
            (3, "c"),
        ]

        self.assertEqual(actual, expected)
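        # A minimal sketch (not in the original test): the same nested expression in
        # Spark SQL lambda syntax; struct fields take the lambda variable names, so
        # the rows compare equal to the tuples in `expected`.
        actual_sql = df.selectExpr(
            "flatten(transform(numbers, n -> transform(letters, l -> struct(n, l)))) AS pairs"
        ).first()[0]
        self.assertEqual(actual_sql, expected)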
Example #2
 def get_column_spec(self, source_df: Optional[DataFrame],
                     current_column: Optional[Column]) -> Column:
     return flatten(
         filter(
             self.column.get_column_spec(source_df=source_df,
                                         current_column=current_column),
             lambda x: x.isNotNull(),
         ))
Example #3
def query3(df, beg, end):

    most_common_topic = udf(lambda x: max(set(x), key=x.count))

    count_entrances = udf(lambda x: x.count(max(set(x), key=x.count)))

    return df.filter(col('time').between(beg, end)).groupBy(col('group_country').alias('country'))\
             .agg(flatten(collect_list('topic_name')).alias('list')).withColumn('topic', most_common_topic('list'))\
             .withColumn('count', count_entrances('list')).select(col('country'), col('topic'), col('count'))
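# Note (added): udf() defaults to a StringType return type when none is given; for the
# integer count it is safer to declare the type explicitly, e.g. (hypothetical, with
# IntegerType imported from pyspark.sql.types):
# count_entrances = udf(lambda x: x.count(max(set(x), key=x.count)), IntegerType())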
Example #4
def column_revalue(vcf):
    # INFO values need to be revised
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
Example #5
def create_normalization_spec_spark(df, column, num_samples: int, seed: int):
    """Returns approximately num_samples random rows from column of df."""

    df = df.select(
        explode(col(column).alias("features")).alias("feature_name",
                                                     "feature_value"))

    # calculate fractions
    counts_df = df.groupBy("feature_name").count()
    frac = {}
    for row in counts_df.collect():
        assert num_samples <= row["count"]
        frac[row["feature_name"]] = num_samples / row["count"]

    # TODO(T64843081): change to reservoir sampling, currently it approximates
    # perform sampling and collect them
    df = df.sampleBy("feature_name", fractions=frac, seed=seed)
    df = df.groupBy("feature_name").agg(
        collect_list("feature_value").alias("feature_value_list"))
    df = df.select("feature_name",
                   flatten("feature_value_list").alias("feature_values"))
    return df
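# Usage sketch (hypothetical; not part of the original module): assumes `training_df`
# has a MapType column "features" mapping feature name -> value, which explode()
# splits into key/value pairs as above.
# spec_df = create_normalization_spec_spark(training_df, "features", num_samples=1000, seed=42)
# spec_df.show()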
Example #6
        |  # these last two were specifically requested
        (col("resp_pricings_osm_ann_ele_name") == F.lit("rfisc"))).withColumn(
            "resp_pricings_osm_ann_ele_type",
            F.when((F.substring(col("resp_pricings_osm_ann_ele_name"), 15, 3)
                    == F.lit("row")),
                   F.substring(col("resp_pricings_osm_ann_ele_name"), 19,
                               3)).otherwise(
                                   col("resp_pricings_osm_ann_ele_name"))
        )  # __goog_ptable_row_??? this will be "row"
    .groupby(
        *[key[2:] for key in row_key], "os_id"
    )  # represents transaction x passenger x segment x leg rows x optionalServices
    .agg(
        F.collect_list(
            col("resp_pricings_osm_ann_ele_type")).alias("google_tables"),
        F.flatten(F.collect_list(col("resp_pricings_osm_ann_ele_val"))).alias(
            "google_table_values")))

# fcols_annotations.select("f_google_tables", "f_google_table_values").show(20, False, vertical=True)

# COMMAND ----------

annotations.count()

# COMMAND ----------

flattened = os4.select(*psl_cols, *os_cols, "os_id").join(
    annotations, [*[key[2:] for key in row_key], "os_id"]).drop("os_id")

# COMMAND ----------

flattened.select("optionalServices_taxes").show(vertical=True)
Example #7
        (col("diff_lead1") < CD) &
        ((col("diff_lead1") <= col("diff_lead2")) | col("diff_lead2").isNull()) &
        ((col("diff_lead1") <= col("diff_lag1")) | col("diff_lag1").isNull()),
        col("lead1")
    ).when(
        (col("diff_lag1") < CD) &
        ((col("diff_lag1") <= col("diff_lag2")) | col("diff_lag2").isNull()) &
        ((col("diff_lag1") <= col("diff_lead1")) | col("diff_lead1").isNull()),
        col("outage_time")
    ).otherwise(None)

    pw_df = pw_df.withColumn("merge_time", merge_time)
    pw_null_merge_time = pw_df.filter(col("merge_time").isNull())
    pw_df = pw_df.filter(col("merge_time").isNotNull())

    pw_df = pw_df.groupBy("merge_time").agg(
        F.flatten(F.collect_list("core_id")).alias("core_id"),
        F.flatten(F.collect_list("tx")).alias("tx"),
        F.flatten(F.collect_list("feeder_id")).alias("feeder_id"),
        F.flatten(F.collect_list("outage_times")).alias("outage_times"),
        F.flatten(F.collect_list("restore_time")).alias("restore_time"),
        F.flatten(F.collect_list("location")).alias("location"))

    pw_df = pw_df.select("core_id", "outage_times", "restore_time", "location",
                         "tx", "feeder_id")
    pw_null_merge_time = pw_null_merge_time.select("core_id", "outage_times",
                                                   "restore_time", "location",
                                                   "tx", "feeder_id")
    pw_df = pw_df.union(pw_null_merge_time)

    udfTimestampAverage = udf(timestamp_average, LongType())
    pw_df = pw_df.withColumn("outage_time",
Example #8
            #func.max(sort_pub_year).alias('lastyr'),

            # number of cited papers.
            func.sum(
                func.expr('IF(' + sort_pub_year + ' BETWEEN ' + minyear +
                          ' AND ' + maxyear +
                          ',IF(CitationCountNonSelf>0,1,0),0)')
            ).alias('ns_npcY1Y3'),
            func.sum(
                func.expr('IF(' + sort_pub_year + ' BETWEEN ' + minyear +
                          ' AND ' + maxyear +
                          ',IF(CitationCount>0,1,0),0)')).alias('ws_npcY1Y3'),
            func.sum('CitationCountNonSelf').alias('ns_ncY2Y3'),
            func.size(
                func.array_distinct(
                    func.flatten(func.collect_list(
                        'CitingEidsNonSelf')))).alias('ns_ncY2Y3_cp'),
            func.max(func.expr('IF(ns_r<=CitationCountNonSelf,ns_r,0)')).alias(
                'ns_hY3'),
            func.max(func.expr('IF(ns_r_eff<=CitationCountNonSelf,ns_r_eff,0)')
                     ).alias('ns_hmY3'),
            func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ns_nps'),
            func.sum(func.expr(
                'IF(n_authors=1,CitationCountNonSelf,0)')).alias('ns_ncs'),
            func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,1,0)')).alias(
                'ns_npsf'),
            func.sum(
                func.expr(
                    'IF(n_authors=1 OR Authorseq=1,CitationCountNonSelf,0)')).
            alias('ns_ncsf'),
            func.sum(
                func.expr(
Example #9
    pw_df = pw_df.withColumn("diff_lead2", col("lead2") - col("lead1"))
    pw_df = pw_df.withColumn("diff_lag1", col("outage_time") - col("lag1"))
    pw_df = pw_df.withColumn("diff_lag2", col("lag1") - col("lag2"))

    merge_time = when((col("diff_lead1") < CD) &
                      ((col("diff_lead1") <= col("diff_lead2")) | col("diff_lead2").isNull()) &
                      ((col("diff_lead1") <= col("diff_lag1")) | col("diff_lag1").isNull()), col("lead1")).when(
                              (col("diff_lag1") < CD) &
                              ((col("diff_lag1") <= col("diff_lag2")) | col("diff_lag2").isNull()) &
                              ((col("diff_lag1") <= col("diff_lead1")) | col("diff_lead1").isNull()), col("outage_time")).otherwise(None)

    pw_df = pw_df.withColumn("merge_time", merge_time)
    pw_null_merge_time = pw_df.filter(col("merge_time").isNull())
    pw_df = pw_df.filter(col("merge_time").isNotNull())

    pw_df = pw_df.groupBy("merge_time").agg(F.flatten(F.collect_list("core_id")).alias("core_id"),
                                            F.flatten(F.collect_list("outage_times")).alias("outage_times"),
                                            F.flatten(F.collect_list("restore_time")).alias("restore_time"),
                                            F.flatten(F.collect_list("location")).alias("location"))

    pw_df = pw_df.select("core_id","outage_times","restore_time","location")
    pw_null_merge_time = pw_null_merge_time.select("core_id","outage_times","restore_time","location")
    pw_df = pw_df.union(pw_null_merge_time)

    udfTimestampAverage = udf(timestamp_average, LongType())
    pw_df = pw_df.withColumn("outage_time", udfTimestampAverage("outage_times"))
    pw_df = pw_df.localCheckpoint(eager = True)
    print("Merged to:", pw_df.count())
    print()

#Okay now we have a list of outages, restore_times, locations, core_ids
Example #10
        sum("DislikeCount"),
        # calculate LikeCount
        sum("LikeCount"),
        # calculate Rating
        sum("Rating"),
        # calculate Duration
        sum("Duration"),
        # calculate ViewCount
        sum("ViewCount"),
        # calculate textwords
        flatten(f.collect_list("TextWords")),
        # count videos
        f.count("*"))
    # rename
    .withColumnRenamed("sum(DislikeCount)", "DislikeCount")
    .withColumnRenamed("sum(LikeCount)", "LikeCount")
    .withColumnRenamed("sum(Rating)", "Rating")
    .withColumnRenamed("sum(Duration)", "Duration")
    .withColumnRenamed("sum(ViewCount)", "ViewCount")
    .withColumnRenamed("flatten(collect_list(TextWords))", "TextWords")
    .withColumnRenamed("count(1)", "VideoCount"))

#cache results
df2.cache()
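# Note (added; not in the original): cache() is lazy; an action such as df2.count()
# must run before later queries actually reuse the cached data.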
Example #11
# Schema for the user-defined function
schema = T.ArrayType(
    T.StructType([
        T.StructField("word", T.StringType(), False),
        T.StructField("count", T.IntegerType(), False)
    ]))

# Sorting dictionary in the ascending order
SorterUDF = f.udf(sort_dict_f, schema)

udf_take_n_words = f.udf(lambda x: [i for i in x[:5]])

# Stopwords filter is applied
testdf = billdf.withColumn('lst', f.split(f.col('Lyrics'), ' '))
testdf2 = testdf.select(['Year','lst']).groupby('Year').agg(f.collect_list('lst'))\
                                        .withColumn("collect_list(lst)",f.flatten("collect_list(lst)"))\
                                        .withColumnRenamed("collect_list(lst)", "All words")
testdf3 = testdf2.withColumn(
    "cnt", SorterUDF(udf_flatten_counter(udf_filter_words("All words"))))
lsc = testdf3.select("All words").collect()
lsc = [i for i in lsc[0][0] if i not in stpwr]
ds = dict(Counter(lsc))
ds = sorted(ds.items(), key=operator.itemgetter(1), reverse=True)

popular_words = pd.Series(lsc).str.cat(sep=' ')

wordcloud = WordCloud(width=1600,
                      height=800,
                      max_font_size=200,
                      background_color='white').generate(popular_words)
plt.figure(figsize=(12, 10))
Example #12
        (col("diff_lead1") < CD) &
        ((col("diff_lead1") <= col("diff_lead2")) | col("diff_lead2").isNull()) &
        ((col("diff_lead1") <= col("diff_lag1")) | col("diff_lag1").isNull()),
        col("lead1")
    ).when(
        (col("diff_lag1") < CD) &
        ((col("diff_lag1") <= col("diff_lag2")) | col("diff_lag2").isNull()) &
        ((col("diff_lag1") <= col("diff_lead1")) | col("diff_lead1").isNull()),
        col("outage_time")
    ).otherwise(None)

    pw_df = pw_df.withColumn("merge_time", merge_time)
    pw_null_merge_time = pw_df.filter(col("merge_time").isNull())
    pw_df = pw_df.filter(col("merge_time").isNotNull())

    pw_df = pw_df.groupBy("merge_time").agg(
        F.flatten(F.collect_list("user_id")).alias("user_id"),
        F.flatten(F.collect_list("outage_times")).alias("outage_times"),
    )

    pw_df = pw_df.select("user_id", "outage_times")
    pw_null_merge_time = pw_null_merge_time.select("user_id", "outage_times")
    pw_df = pw_df.union(pw_null_merge_time)

    udfTimestampAverage = udf(timestamp_average, LongType())
    pw_df = pw_df.withColumn("outage_time",
                             udfTimestampAverage("outage_times"))
    pw_df = pw_df.localCheckpoint(eager=True)
    print("Merged to:", pw_df.count())
    print()

#Okay now we have a list of outages, restore_times, locations, user_ids
Example #13
def calc_features(spark_session: SparkSession) -> DataFrame:
    df: DataFrame = spark_session \
        .read \
        .option("header", True) \
        .option("inferSchema", False) \
        .csv("daily-raw") \
        .withColumn("closingPrice", F.col("closingPrice").cast(types.DoubleType()))

    def days(x: int) -> int:
        return x * 24 * 3600

    window = Window.partitionBy("ISIN").orderBy("Date")
    window2 = Window.partitionBy("ISIN", "localMin",
                                 "localMax").orderBy("Date")

    window3 = Window \
        .partitionBy("ISIN") \
        .orderBy(F.col("Date").cast("timestamp").cast("long")) \
        .rangeBetween(days(-31), days(-1))

    label_window = Window \
        .partitionBy("ISIN") \
        .orderBy(F.col("Date").cast("timestamp").cast("long")) \
        .rangeBetween(days(1), days(30))

    # TODO: show case a pandas UDF
    # https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html
    # https://docs.databricks.com/spark/latest/spark-sql/udf-python-pandas.html
    # https://spark.apache.org/docs/latest/api/python/pyspark.sql.html
    # https://stackoverflow.com/questions/40006395/applying-udfs-on-groupeddata-in-pyspark-with-functioning-python-example
    # https://intellipaat.com/community/11611/applying-udfs-on-groupeddata-in-pyspark-with-functioning-python-example

    @F.udf(
        types.StructType([
            types.StructField("Date", types.StringType(), True),
            types.StructField("closingPrice", types.DoubleType(), True)
        ]))
    def my_udf(entries: Sequence[Tuple[str, float]]):
        return min([e for e in entries if e[1] >= 65] + [entries[-1]],
                   key=lambda x: x[0])

    df2: DataFrame = df \
        .withColumn("availableDays", F.datediff("Date", F.min("Date").over(window))) \
        .withColumn("label", F.collect_list(F.struct(F.col("Date"), F.col("closingPrice"))).over(label_window)) \
        .filter(F.size("label") > 0) \
        .withColumn("label", my_udf(F.col("label"))) \
        .withColumn("sellAt", F.col("label.Date")) \
        .withColumn("sellPrice", F.col("label.closingPrice")) \
        .drop("label") \
        .withColumn("diffToPrev", F.col("closingPrice") / F.lag("closingPrice", 1).over(window) - 1) \
        .withColumn("up", F.col("diffToPrev") >= 0) \
        .withColumn("change", F.col("up") != F.lag("up", 1).over(window)) \
        .withColumn("nextChange", F.lead("change", 1, False).over(window)) \
        .withColumn("localMax", F.col("up") & F.col("nextChange")) \
        .withColumn("localMin", ~F.col("up") & F.col("nextChange")) \
        .drop("up", "change", "nextChange") \
        .withColumn("index", F.row_number().over(window)) \
        .filter(F.col("localMin") | F.col("localMax")) \
        .withColumn("higher", F.col("closingPrice") >= F.lag("closingPrice", 1).over(window2)) \
        .withColumn("daysBetween", F.col("index") - F.lag("index", 1).over(window)) \
        .drop("index") \
        .withColumn("hallo", F.concat_ws(",", "localMax", "localMin", "higher"))

    indexer = StringIndexer(inputCol="hallo", outputCol="categoryIndex")

    df3: DataFrame = indexer.fit(df2).transform(df2) \
        .withColumn("hallo", F.format_string("%.0fx", "categoryIndex")) \
        .filter(F.col("higher").isNotNull()) \
        .withColumn("events", F.flatten(F.collect_list(F.array("daysBetween", "hallo")).over(window3))) \
        .drop("hallo", "categoryIndex") \
        .filter(F.col("availableDays") >= 30) \
        .orderBy("Date")

    return df3
Example #14
    def main(self, sc: SparkContext, *args: Any):
        """
        Solr Core loader
        :param args: the argument list elements should be:
                        [0]: observations parquet path
                        [1]: pipeline core parquet path
                        [2]: omero ids csv path
                        [3]: output path
        """
        observations_parquet_path = args[0]
        pipeline_core_parquet_path = args[1]
        omero_ids_csv_path = args[2]
        output_path = args[3]

        spark = SparkSession.builder.getOrCreate()
        observations_df = spark.read.parquet(observations_parquet_path)
        pipeline_core_df = spark.read.parquet(pipeline_core_parquet_path)
        pipeline_core_df = pipeline_core_df.select(
            "fully_qualified_name",
            "mouse_anatomy_id",
            "mouse_anatomy_term",
            "embryo_anatomy_id",
            "embryo_anatomy_term",
            col("mp_id").alias("impress_mp_id"),
            col("mp_term").alias("impress_mp_term"),
            "top_level_mouse_anatomy_id",
            "top_level_mouse_anatomy_term",
            "top_level_embryo_anatomy_id",
            "top_level_embryo_anatomy_term",
            col("top_level_mp_id").alias("impress_top_level_mp_id"),
            col("top_level_mp_term").alias("impress_top_level_mp_term"),
            col("intermediate_mp_id").alias("impress_intermediate_mp_id"),
            col("intermediate_mp_term").alias("impress_intermediate_mp_term"),
        ).distinct()
        omero_ids_df = spark.read.csv(omero_ids_csv_path,
                                      header=True).dropDuplicates()
        omero_ids_df = omero_ids_df.alias("omero")
        image_observations_df = observations_df.where(
            col("observation_type") == "image_record")
        image_observations_df = image_observations_df.alias("obs")
        image_observations_df = image_observations_df.join(
            omero_ids_df,
            [
                "observation_id",
                "download_file_path",
                "phenotyping_center",
                "pipeline_stable_id",
                "procedure_stable_id",
                "parameter_stable_id",
                "datasource_name",
            ],
        )
        image_observations_df = image_observations_df.select(
            "obs.*", "omero.omero_id")
        parameter_association_fields = [
            "parameter_association_stable_id",
            "parameter_association_sequence_id",
            "parameter_association_name",
            "parameter_association_value",
        ]
        image_observations_exp_df = image_observations_df
        for parameter_association_field in parameter_association_fields:
            image_observations_exp_df = image_observations_exp_df.withColumn(
                f"{parameter_association_field}_exp",
                explode_outer(parameter_association_field),
            )
        image_observations_x_impress_df = image_observations_exp_df.withColumn(
            "fully_qualified_name",
            concat_ws(
                "_",
                "pipeline_stable_id",
                "procedure_stable_id",
                "parameter_association_stable_id_exp",
            ),
        )

        image_observations_x_impress_df = image_observations_x_impress_df.join(
            pipeline_core_df,
            (image_observations_x_impress_df["fully_qualified_name"]
             == pipeline_core_df["fully_qualified_name"]),
            "left_outer",
        )
        group_by_expressions = [
            collect_set(
                when(
                    col("mouse_anatomy_id").isNotNull(),
                    col("mouse_anatomy_id")).otherwise(col(
                        "embryo_anatomy_id"))).alias("embryo_anatomy_id_set"),
            collect_set(
                when(
                    col("mouse_anatomy_term").isNotNull(),
                    col("mouse_anatomy_term")).otherwise(
                        col("embryo_anatomy_term"))).alias(
                            "embryo_anatomy_term_set"),
            collect_set(
                when(
                    col("mouse_anatomy_id").isNotNull(),
                    col("mouse_anatomy_id")).otherwise(
                        col("embryo_anatomy_id"))).alias("anatomy_id"),
            collect_set(
                when(
                    col("mouse_anatomy_term").isNotNull(),
                    col("mouse_anatomy_term")).otherwise(
                        col("embryo_anatomy_term"))).alias("anatomy_term"),
            flatten(
                collect_set(
                    when(
                        col("mouse_anatomy_id").isNotNull(),
                        col("top_level_mouse_anatomy_id"),
                    ).otherwise(col("top_level_embryo_anatomy_id")))).alias(
                        "selected_top_level_anatomy_id"),
            flatten(
                collect_set(
                    when(
                        col("mouse_anatomy_id").isNotNull(),
                        col("top_level_mouse_anatomy_term"),
                    ).otherwise(col("top_level_embryo_anatomy_term")))).alias(
                        "selected_top_level_anatomy_term"),
            collect_set("impress_mp_id").alias("mp_id"),
            collect_set("impress_mp_term").alias("mp_term"),
            flatten(collect_set("impress_top_level_mp_id")).alias(
                "top_level_mp_id_set"),
            flatten(collect_set("impress_top_level_mp_term")).alias(
                "top_level_mp_term_set"),
            flatten(collect_set("impress_intermediate_mp_id")).alias(
                "intermediate_mp_id_set"),
            flatten(collect_set("impress_intermediate_mp_term")).alias(
                "intermediate_mp_term_set"),
        ]
        image_observations_x_impress_df = image_observations_x_impress_df.select(
            [
                "observation_id",
                "mouse_anatomy_id",
                "embryo_anatomy_id",
                "mouse_anatomy_term",
                "embryo_anatomy_term",
                "top_level_mouse_anatomy_id",
                "top_level_embryo_anatomy_id",
                "top_level_mouse_anatomy_term",
                "top_level_embryo_anatomy_term",
                "impress_mp_id",
                "impress_mp_term",
                "impress_top_level_mp_id",
                "impress_top_level_mp_term",
                "impress_intermediate_mp_id",
                "impress_intermediate_mp_term",
            ])
        image_observations_x_impress_df = image_observations_x_impress_df.groupBy(
            "observation_id").agg(*group_by_expressions)

        image_observations_df = image_observations_df.join(
            image_observations_x_impress_df, "observation_id")

        image_observations_df = image_observations_df.withColumn(
            "download_url",
            concat(
                lit("//www.ebi.ac.uk/mi/media/omero/webgateway/archived_files/download/"
                    ),
                col("omero_id"),
            ),
        )
        image_observations_df = image_observations_df.withColumn(
            "jpeg_url",
            concat(
                lit("//www.ebi.ac.uk/mi/media/omero/webgateway/render_image/"),
                col("omero_id"),
            ),
        )
        image_observations_df = image_observations_df.withColumn(
            "thumbnail_url",
            concat(
                lit("//www.ebi.ac.uk/mi/media/omero/webgateway/render_birds_eye_view/"
                    ),
                col("omero_id"),
            ),
        )
        image_observations_df.write.parquet(output_path)
Example #15
 def get_column_spec(self, source_df: Optional[DataFrame],
                     current_column: Optional[Column]) -> Column:
     return flatten(
         self.column.get_column_spec(source_df=source_df,
                                     current_column=current_column))
Example #16
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, flatten

spark = SparkSession.builder.appName('pyspark-by-examples').getOrCreate()

arrayArrayData = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]])
]

df = spark.createDataFrame(data=arrayArrayData, schema=['name', 'subjects'])
# df.printSchema()

#df.show(truncate=False)

# explode array columns to array rows

df.select(df.name, explode(df.subjects)).show()

df.select(df.name, flatten(df.subjects)).show(truncate=False)
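# Quick sanity check (a sketch, not in the original): flatten() concatenates each
# row's inner arrays, e.g. James's subjects become one five-element array.
first_flat = df.select(flatten(df.subjects).alias("flat")).first()["flat"]
assert first_flat == ["Java", "Scala", "C++", "Spark", "Java"]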
Example #17
    def main(self, sc: SparkContext, *args: Any):
        """
        Pipeline Solr Core loader
        """
        pipeline_parquet_path = args[0]
        observations_parquet_path = args[1]
        ontology_parquet_path = args[2]
        emap_emapa_tsv_path = args[3]
        emapa_metadata_csv_path = args[4]
        ma_metadata_csv_path = args[5]
        output_path = args[6]

        spark = SparkSession(sc)
        pipeline_df = spark.read.parquet(pipeline_parquet_path)
        observations_df = spark.read.parquet(observations_parquet_path)
        ontology_df = spark.read.parquet(ontology_parquet_path)
        emap_emapa_df = spark.read.csv(emap_emapa_tsv_path,
                                       header=True,
                                       sep="\t")
        for col_name in emap_emapa_df.columns:
            emap_emapa_df = emap_emapa_df.withColumnRenamed(
                col_name,
                col_name.lower().replace(" ", "_"))
        emapa_metadata_df = spark.read.csv(emapa_metadata_csv_path,
                                           header=True)
        ma_metadata_df = spark.read.csv(ma_metadata_csv_path, header=True)

        pipeline_df = pipeline_df.withColumnRenamed("increment",
                                                    "incrementStruct")
        for column, source in COLUMN_MAPPER.items():
            pipeline_df = pipeline_df.withColumn(column, col(source))

        pipeline_df = pipeline_df.withColumn(
            "unit_y",
            when(col("incrementStruct").isNotNull(),
                 col("unitName")).otherwise(lit(None)),
        )
        pipeline_df = pipeline_df.withColumn(
            "unit_x",
            when(
                col("incrementStruct").isNotNull(),
                col("incrementStruct.incrementUnit")).otherwise(
                    col("unitName")),
        )
        pipeline_df = pipeline_df.withColumn(
            "metadata",
            col("parameter.type") == "procedureMetadata")
        pipeline_df = pipeline_df.withColumn(
            "fully_qualified_name",
            concat_ws("_", "pipeline_stable_id", "procedure_stable_id",
                      "parameter_stable_id"),
        )
        observations_df = observations_df.withColumn(
            "fully_qualified_name",
            concat_ws("_", "pipeline_stable_id", "procedure_stable_id",
                      "parameter_stable_id"),
        )
        observations_df = observations_df.groupBy("fully_qualified_name").agg(
            first(col("observation_type")).alias("observation_type"))

        pipeline_df = pipeline_df.join(observations_df, "fully_qualified_name",
                                       "left_outer")

        pipeline_categories_df = pipeline_df.select(
            "fully_qualified_name",
            when(
                col("option.name").rlike(r"^\d+$")
                & col("option.description").isNotNull(),
                col("option.description"),
            ).otherwise(col("option.name")).alias("name"),
        )
        pipeline_categories_df = pipeline_categories_df.groupBy(
            "fully_qualified_name").agg(
                collect_set("name").alias("categories"))

        pipeline_df = pipeline_df.join(pipeline_categories_df,
                                       "fully_qualified_name", "left_outer")

        pipeline_mp_terms_df = pipeline_df.select(
            "fully_qualified_name", "parammpterm.selectionOutcome",
            "termAcc").where(col("termAcc").startswith("MP"))

        pipeline_mp_terms_df = pipeline_mp_terms_df.join(
            ontology_df,
            col("id") == col("termAcc"))

        uniquify = udf(self._uniquify, ArrayType(StringType()))

        pipeline_mp_terms_df = pipeline_mp_terms_df.groupBy(
            "fully_qualified_name"
        ).agg(
            collect_set("id").alias("mp_id"),
            collect_set("term").alias("mp_term"),
            uniquify(flatten(
                collect_list("top_level_ids"))).alias("top_level_mp_id"),
            uniquify(flatten(
                collect_list("top_level_terms"))).alias("top_level_mp_term"),
            uniquify(flatten(collect_list("top_level_synonyms"))).alias(
                "top_level_mp_term_synonym"),
            uniquify(flatten(
                collect_list("intermediate_ids"))).alias("intermediate_mp_id"),
            uniquify(flatten(collect_list("intermediate_terms"))).alias(
                "intermediate_mp_term"),
            collect_set(
                when(col("selectionOutcome") == "ABNORMAL",
                     col("termAcc")).otherwise(
                         lit(None))).alias("abnormal_mp_id"),
            collect_set(
                when(col("selectionOutcome") == "ABNORMAL",
                     col("term")).otherwise(
                         lit(None))).alias("abnormal_mp_term"),
            collect_set(
                when(col("selectionOutcome") == "INCREASED",
                     col("termAcc")).otherwise(
                         lit(None))).alias("increased_mp_id"),
            collect_set(
                when(col("selectionOutcome") == "INCREASED",
                     col("term")).otherwise(
                         lit(None))).alias("increased_mp_term"),
            collect_set(
                when(col("selectionOutcome") == "DECREASED",
                     col("termAcc")).otherwise(
                         lit(None))).alias("decreased_mp_id"),
            collect_set(
                when(col("selectionOutcome") == "DECREASED",
                     col("term")).otherwise(
                         lit(None))).alias("decreased_mp_term"),
        )

        pipeline_df = pipeline_df.join(pipeline_mp_terms_df,
                                       "fully_qualified_name", "left_outer")

        pipeline_df = pipeline_df.withColumn(
            "embryo_anatomy_id",
            when(col("termAcc").contains("EMAPA:"),
                 col("termAcc")).otherwise(lit(None)),
        )
        emapa_metadata_df = emapa_metadata_df.select(
            "acc",
            col("name").alias("emapaName"))
        pipeline_df = pipeline_df.join(emapa_metadata_df,
                                       col("embryo_anatomy_id") == col("acc"),
                                       "left_outer")

        pipeline_df = pipeline_df.withColumn("embryo_anatomy_term",
                                             col("emapaName"))
        pipeline_df = pipeline_df.drop(*emapa_metadata_df.columns)

        pipeline_df = pipeline_df.join(ontology_df,
                                       col("embryo_anatomy_id") == col("id"),
                                       "left_outer")
        pipeline_df = pipeline_df.withColumn("top_level_embryo_anatomy_id",
                                             col("top_level_ids"))
        pipeline_df = pipeline_df.withColumn("top_level_embryo_anatomy_term",
                                             col("top_level_terms"))
        pipeline_df = pipeline_df.drop(*ontology_df.columns)

        pipeline_df = pipeline_df.withColumn(
            "mouse_anatomy_id",
            when(col("termAcc").startswith("MA:"),
                 col("termAcc")).otherwise(lit(None)),
        )
        ma_metadata_df = ma_metadata_df.withColumnRenamed("name", "maName")
        pipeline_df = pipeline_df.join(ma_metadata_df,
                                       col("mouse_anatomy_id") == col("curie"),
                                       "left_outer")
        pipeline_df = pipeline_df.withColumn("mouse_anatomy_term",
                                             col("maName"))
        pipeline_df = pipeline_df.drop(*ma_metadata_df.columns)

        pipeline_df = pipeline_df.join(ontology_df,
                                       col("mouse_anatomy_id") == col("id"),
                                       "left_outer")
        pipeline_df = pipeline_df.withColumn("top_level_mouse_anatomy_id",
                                             col("top_level_ids"))
        pipeline_df = pipeline_df.withColumn("top_level_mouse_anatomy_term",
                                             col("top_level_terms"))
        missing_parameter_information_df = pipeline_df.where(
            col("parameter_stable_id").isNull())
        missing_parameter_rows = missing_parameter_information_df.collect()
        if len(missing_parameter_rows) > 0:
            print("MISSING PARAMETERS")
            for missing in missing_parameter_rows:
                print(missing.asDict())
        pipeline_df = pipeline_df.where(col("parameter_stable_id").isNotNull())
        pipeline_df = pipeline_df.drop(*ontology_df.columns)
        pipeline_df.write.parquet(output_path)
Example #18
arrayArrayData = [("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
                  ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
                  ("Robert", [["CSharp", "VB"], ["Spark", "Python"]])]

df = spark.createDataFrame(data=arrayArrayData, schema=['name', 'subjects'])
df.printSchema()
df.show(truncate=False)

df_explode = df.select(df.name,
                       explode(df.subjects).alias("Exploded_Subjects"))

df_explode.printSchema()
df_explode.show(truncate=False)

df_flatten = df.select(df.name,
                       flatten(df.subjects).alias("Flattened_Subjects"))

df_flatten.printSchema()
df_flatten.show(truncate=False)

df_flatten_zip = df_flatten \
    .withColumn("tmp", arrays_zip("Flattened_Subjects")) \
    .withColumn("tmp", explode("tmp")) \
    .select("name", col("tmp.Flattened_Subjects"))

df_flatten_zip.printSchema()
df_flatten_zip.show(truncate=False)
'''The above is not performant, hence the solution below when the array size is known.'''
# Length of array
n = 5
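# The snippet ends here; a minimal sketch of the fixed-size approach it alludes to
# (hypothetical continuation, assuming the known length n and df_flatten from above):
# each element is selected by index instead of exploding the array.
df_fixed = df_flatten.select(
    "name",
    *[col("Flattened_Subjects").getItem(i).alias(f"subject_{i}") for i in range(n)])
df_fixed.show(truncate=False)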
Example #19
  .withColumn('ns_r_eff',func.sum(1/func.col('n_authors')).over(wns.rangeBetween(Window.unboundedPreceding, 0)))
  .withColumn('ns_r',func.rank().over(wns))
  .withColumn('ws_r_eff',func.sum(1/func.col('n_authors')).over(wws.rangeBetween(Window.unboundedPreceding, 0)))
  .withColumn('ws_r',func.rank().over(wws))
  
  .groupBy('auid')
  .agg(
    func.sort_array(func.collect_set("subfield_tuple"),True).alias("subFields"),
    func.sort_array(func.collect_set("field_tuple"),True).alias("Fields"),
    func.sum(func.expr('IF('+sort_pub_year+' BETWEEN '+minyear+' AND '+maxyear+',1,0)')).alias('npY1Y3'),
    # no longer capture first/last here; we want to get those values from the full database and therefore collect them with the author names dataframe (where we also get the last known full preferred name)
    #func.min(sort_pub_year).alias('firstyr'),
    #func.max(sort_pub_year).alias('lastyr'),
    
    func.sum('CitationCountNonSelf').alias('ns_ncY2Y3'),
    func.size(func.array_distinct(func.flatten(func.collect_list('CitingEidsNonSelf')))).alias('ns_ncY2Y3_cp'),
    func.max(func.expr('IF(ns_r<=CitationCountNonSelf,ns_r,0)')).alias('ns_hY3'),
    func.max(func.expr('IF(ns_r_eff<=CitationCountNonSelf,ns_r_eff,0)')).alias('ns_hmY3'),
    func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ns_nps'),
    func.sum(func.expr('IF(n_authors=1,CitationCountNonSelf,0)')).alias('ns_ncs'),
    func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,1,0)')).alias('ns_npsf'),
    func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,CitationCountNonSelf,0)')).alias('ns_ncsf'),
    func.sum(func.expr('IF(n_authors=1 OR Authorseq=1 OR Authorseq=n_authors,1,0)')).alias('ns_npsfl'),
    func.sum(func.expr('IF(n_authors=1 OR Authorseq=1 OR Authorseq=n_authors,CitationCountNonSelf,0)')).alias('ns_ncsfl'),

    func.sum('CitationCount').alias('ws_ncY2Y3'),
    func.size(func.array_distinct(func.flatten(func.collect_list('CitingEids')))).alias('ws_ncY2Y3_cp'),
    func.max(func.expr('IF(ws_r<=CitationCount,ws_r,0)')).alias('ws_hY3'),
    func.max(func.expr('IF(ws_r_eff<=CitationCount,ws_r_eff,0)')).alias('ws_hmY3'),
    func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ws_nps'),
    func.sum(func.expr('IF(n_authors=1,CitationCount,0)')).alias('ws_ncs'),
Example #20
def compile_array_repeat(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    times = op.times.op().value
    return F.flatten(F.array_repeat(src_column, times))
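# Why the flatten (a sketch, not part of the compiler code above): F.array_repeat
# applied to an array column yields an array of arrays, which flatten collapses back
# into a single array. Assumes a live SparkSession named spark, as in the snippet below.
demo = spark.createDataFrame([([1, 2],)], ["xs"])
demo.select(F.flatten(F.array_repeat("xs", 2)).alias("repeated")).show()  # [1, 2, 1, 2]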
my_list = [('a', 2, 3), ('b', 5, 6), ('c', 8, 9), ('a', 2, 3), ('b', 5, 6),
           ('c', 8, 9)]
col_name = ['col1', 'col2', 'col3']
ds = spark.createDataFrame(my_list, schema=col_name)
ds.withColumn('concat', F.concat('col1', 'col2')).show()

# Group per key and join the col2 values into a "#"-separated string
df.groupBy("col1").agg(
    F.concat_ws("#", F.collect_list(F.col('col2'))).alias("col2_set"))

# Group per key and merge the list columns
ds2 = spark.createDataFrame([(1, [1, 2, 3]), (1, [4, 5, 6]), (2, [2]),
                             (2, [3])], ["store", "values"])
# Method 1 (idea: collect_list first, then flatten); keep the result in a new
# variable so the original ds2 stays usable for the other methods
ds2_flat = ds2.groupBy("store").agg(F.collect_list("values").alias("values_list"))
ds2_flat = ds2_flat.withColumn("flatten_array", F.flatten(F.col("values_list")))
ds2_flat.show()
# Method 2: RDD map + reduceByKey
ds2.rdd.map(lambda r: (r.store, r.values)).reduceByKey(
    lambda x, y: x + y).toDF(['store', 'values']).show()
# Method 3: a UDF that concatenates the collected lists
import functools


def concat_list(val):
    return functools.reduce(lambda x, y: x + y, val)


concat_list_udf = F.udf(concat_list, ArrayType(IntegerType()))
ds2_udf = ds2.groupBy("store").agg(
    concat_list_udf(F.collect_list("values")).alias("values_list"))
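# Of the three approaches, method 1 (collect_list + flatten) stays entirely in built-in
# SQL functions and avoids the Python serialization overhead of the RDD and UDF
# variants, so it is usually the one to prefer.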