def main(self, sc: SparkContext, *args):
    """
    Takes in a SparkContext and the list of arguments generated by `app_options`
    and executes the PySpark job.
    """
    spark = SparkSession(sc)

    # Parse app options
    gp_parquet_path = args[0]
    output_path = args[1]

    gp_df = spark.read.parquet(gp_parquet_path)

    # Explode array columns so each output row carries a single value per column
    explode_cols = ["procedure_stable_id", "procedure_name", "project_name"]
    for col_name in explode_cols:
        gp_df = gp_df.withColumn(col_name, explode(col_name))

    gp_df = gp_df.select(
        "marker_accession_id",
        "pipeline_stable_id",
        "procedure_stable_id",
        "procedure_name",
        "parameter_stable_id",
        "parameter_name",
        "allele_accession_id",
        "allele_name",
        "allele_symbol",
        "zygosity",
        "phenotyping_center",
        "sex",
        "project_name",
        "p_value",
        "life_stage_name",
        "effect_size",
        "mp_term_id",
        "mp_term_name",
        "top_level_mp_term_id",
        "top_level_mp_term_name",
    )

    # Nest the MP term id/name pair into a single `phenotype` struct
    gp_df = gp_df.withColumn(
        "phenotype",
        struct(col("mp_term_id").alias("id"), col("mp_term_name").alias("name")),
    )

    # Pair the top-level MP term id and name arrays element-wise into structs
    gp_df = gp_df.withColumn(
        "topLevelPhenotype",
        zip_with(
            "top_level_mp_term_id",
            "top_level_mp_term_name",
            lambda x, y: struct(x.alias("id"), y.alias("name")),
        ),
    )

    gp_df = gp_df.drop(
        "mp_term_id",
        "mp_term_name",
        "top_level_mp_term_id",
        "top_level_mp_term_name",
    )

    gp_df = gp_df.withColumnRenamed("marker_accession_id", "geneAccessionId")
    gp_df = gp_df.withColumn("id", col("geneAccessionId"))

    # Convert all remaining snake_case column names to camelCase
    for col_name in gp_df.columns:
        gp_df = gp_df.withColumnRenamed(col_name, to_camel_case(col_name))

    # Collect all significant phenotype rows for each gene into one document
    gp_df = gp_df.groupBy("id").agg(
        collect_set(
            struct(*[col_name for col_name in gp_df.columns if col_name != "id"])
        ).alias("significantPhenotypes")
    )
    gp_df.write.partitionBy("id").json(output_path)
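The rename loop above depends on a `to_camel_case` helper that is not defined in this section. A minimal sketch of what such a snake_case-to-camelCase converter could look like, for illustration only (the project's actual helper may differ):

def to_camel_case(snake_str: str) -> str:
    # "marker_accession_id" -> "markerAccessionId": keep the first token
    # lowercase and capitalize the rest.
    first, *rest = snake_str.split("_")
    return first + "".join(token.capitalize() for token in rest)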
def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, (list, tuple)):
        from pyspark.pandas.series import first_series, scol_for
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, InternalField

        len_right = len(right)
        if len(left) != len_right:
            raise ValueError("Lengths must be equal")

        sdf = left._internal.spark_frame
        structed_scol = F.struct(
            sdf[NATURAL_ORDER_COLUMN_NAME],
            *left._internal.index_spark_columns,
            left.spark.column,
        )
        # The size of the list is expected to be small.
        collected_structed_scol = F.collect_list(structed_scol)
        # Sort the array by NATURAL_ORDER_COLUMN so that we can guarantee the order.
        collected_structed_scol = F.array_sort(collected_structed_scol)
        right_values_scol = F.array(*(F.lit(x) for x in right))
        index_scol_names = left._internal.index_spark_column_names
        scol_name = left._internal.spark_column_name_for(
            left._internal.column_labels[0]
        )
        # Compare the values of left and right by using zip_with function.
        cond = F.zip_with(
            collected_structed_scol,
            right_values_scol,
            lambda x, y: F.struct(
                *[
                    x[index_scol_name].alias(index_scol_name)
                    for index_scol_name in index_scol_names
                ],
                F.when(x[scol_name].isNull() | y.isNull(), False)
                .otherwise(x[scol_name] == y)
                .alias(scol_name),
            ),
        ).alias(scol_name)
        # 1. `sdf_new` here looks like the below (the first field of each set is Index):
        # +----------------------------------------------------------+
        # |0                                                         |
        # +----------------------------------------------------------+
        # |[{0, false}, {1, true}, {2, false}, {3, true}, {4, false}]|
        # +----------------------------------------------------------+
        sdf_new = sdf.select(cond)
        # 2. `sdf_new` after the explode looks like the below:
        # +----------+
        # |       col|
        # +----------+
        # |{0, false}|
        # | {1, true}|
        # |{2, false}|
        # | {3, true}|
        # |{4, false}|
        # +----------+
        sdf_new = sdf_new.select(F.explode(scol_name))
        # 3. Here, the final `sdf_new` looks like the below:
        # +-----------------+-----+
        # |__index_level_0__|    0|
        # +-----------------+-----+
        # |                0|false|
        # |                1| true|
        # |                2|false|
        # |                3| true|
        # |                4|false|
        # +-----------------+-----+
        sdf_new = sdf_new.select("col.*")

        index_spark_columns = [
            scol_for(sdf_new, index_scol_name) for index_scol_name in index_scol_names
        ]
        data_spark_columns = [scol_for(sdf_new, scol_name)]

        internal = left._internal.copy(
            spark_frame=sdf_new,
            index_spark_columns=index_spark_columns,
            data_spark_columns=data_spark_columns,
            index_fields=[
                InternalField.from_struct_field(index_field)
                for index_field in sdf_new.select(index_spark_columns).schema.fields
            ],
            data_fields=[
                InternalField.from_struct_field(
                    sdf_new.select(data_spark_columns).schema.fields[0]
                )
            ],
        )
        return first_series(DataFrame(internal))
    else:
        from pyspark.pandas.base import column_op

        return column_op(Column.__eq__)(left, right)
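From the user-facing pandas-on-Spark API, this list branch is what backs element-wise comparison of a Series against a plain Python sequence of the same length. A minimal usage sketch, assuming a running Spark session and a pyspark.pandas version that supports list comparison:

import pyspark.pandas as ps

psser = ps.Series([1, 2, 3, 4])
# Element-wise equality against a list of equal length; a length mismatch
# raises ValueError, as in the guard at the top of `eq`.
(psser == [1, 3, 3, 5]).to_list()
# -> [True, False, True, False]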
def main(self, sc: SparkContext, *args: Any):
    # Drop statistical results from the gene bundle
    # Create an experimental data collection with the observations
    observations_parquet_path = args[0]
    genotype_phenotype_parquet_path = args[1]
    impc_images_parquet_path = args[2]
    product_parquet_path = args[3]
    stats_results_parquet_path = args[4]
    stats_results_raw_data_parquet_path = f"{stats_results_parquet_path}_raw_data"
    gene_core_parquet_path = args[5]
    output_path = args[6]

    spark = SparkSession(sc)
    observations_df = spark.read.parquet(observations_parquet_path)
    genotype_phenotype_df = spark.read.parquet(genotype_phenotype_parquet_path)
    impc_images_df = spark.read.parquet(impc_images_parquet_path)
    product_df = spark.read.parquet(product_parquet_path)
    gene_df: DataFrame = spark.read.parquet(gene_core_parquet_path)
    gene_df = gene_df.drop("datasets_raw_data")
    stats_results_df = spark.read.parquet(stats_results_parquet_path)

    # Nest the images for each gene under a single `gene_images` column
    impc_images_df = impc_images_df.withColumnRenamed(
        "gene_accession_id", "mgi_accession_id"
    )
    images_by_gene_df = impc_images_df.groupBy("mgi_accession_id").agg(
        collect_set(
            struct(
                *[
                    col_name
                    for col_name in impc_images_df.columns
                    if col_name != "mgi_accession_id"
                ]
            )
        ).alias("gene_images")
    )
    gene_df = gene_df.join(images_by_gene_df, "mgi_accession_id", "left_outer")

    # Nest the products for each gene, excluding the columns in EXCLUDE_PRODUCT_COLUMNS
    products_by_gene = product_df.groupBy("mgi_accession_id").agg(
        collect_set(
            struct(
                *[
                    col_name
                    for col_name in product_df.columns
                    if col_name not in ["mgi_accession_id"] + EXCLUDE_PRODUCT_COLUMNS
                ]
            )
        ).alias("gene_products")
    )
    gene_df = gene_df.join(products_by_gene, "mgi_accession_id", "left_outer")

    # Collect the statistical result document IDs per gene
    stats_results_by_gene = stats_results_df.groupBy("marker_accession_id").agg(
        collect_set("doc_id").alias("statistical_result_ids")
    )
    gene_df = gene_df.join(
        stats_results_by_gene,
        col("mgi_accession_id") == col("marker_accession_id"),
        "left_outer",
    )

    # Collect the distinct pipeline/procedure/parameter combinations tested per gene
    parameters_by_gene = observations_df.select(
        "gene_accession_id",
        "pipeline_stable_id",
        "pipeline_name",
        "procedure_stable_id",
        "procedure_name",
        "parameter_stable_id",
        "parameter_name",
    ).distinct()
    parameters_by_gene = parameters_by_gene.groupBy("gene_accession_id").agg(
        collect_set(
            struct(
                "pipeline_stable_id",
                "pipeline_name",
                "procedure_stable_id",
                "procedure_name",
                "parameter_stable_id",
                "parameter_name",
            )
        ).alias("tested_parameters")
    )
    parameters_by_gene = parameters_by_gene.withColumnRenamed(
        "gene_accession_id", "mgi_accession_id"
    )
    gene_df = gene_df.join(parameters_by_gene, "mgi_accession_id", "left_outer")
    gene_df = gene_df.withColumn("_id", col("mgi_accession_id"))

    # Nest the genotype-phenotype associations per gene
    genotype_phenotype_df = genotype_phenotype_df.withColumnRenamed(
        "marker_accession_id", "mgi_accession_id"
    )
    gp_by_gene_df = genotype_phenotype_df.groupBy("mgi_accession_id").agg(
        collect_set(
            struct(
                *[
                    col_name
                    for col_name in genotype_phenotype_df.columns
                    if col_name != "mgi_accession_id"
                ]
            )
        ).alias("gene_phenotype_associations")
    )
    gene_vs_phenotypes_df = gene_df.join(
        gp_by_gene_df, "mgi_accession_id", "left_outer"
    )

    # self.write_to_mongo(
    #     gene_vs_phenotypes_df,
    #     "org.mousephenotype.api.models.GeneBundle",
    #     "gene_bundles",
    # )

    # Create search_index
    gp_mp_term_structured = genotype_phenotype_df.withColumn(
        "significant_mp_term",
        struct(
            "mp_term_id",
            "mp_term_name",
            zip_with(
                "intermediate_mp_term_id",
                "intermediate_mp_term_name",
                lambda x, y: struct(x.alias("mp_term_id"), y.alias("mp_term_name")),
            ).alias("intermediate_ancestors"),
            zip_with(
                "top_level_mp_term_id",
                "top_level_mp_term_name",
                lambda x, y: struct(x.alias("mp_term_id"), y.alias("mp_term_name")),
            ).alias("top_level_ancestors"),
        ).alias("significant_mp_term"),
    )
    gp_mp_term_structured = gp_mp_term_structured.select(
        "mgi_accession_id", "significant_mp_term"
    )
    gp_mp_term_structured_gene_df = gp_mp_term_structured.groupBy(
        "mgi_accession_id"
    ).agg(collect_set("significant_mp_term").alias("significant_mp_terms"))
    gene_search_df = gene_df.join(
        gp_mp_term_structured_gene_df, "mgi_accession_id", "left_outer"
    )
    gene_search_df = gene_search_df.select(
        col("mgi_accession_id").alias("_id"),
        "mgi_accession_id",
        "marker_name",
        "human_gene_symbol",
        "marker_synonym",
        "assignment_status",
        "crispr_allele_production_status",
        "es_cell_production_status",
        "mouse_production_status",
        "phenotype_status",
        "phenotyping_data_available",
        "tested_parameters",
        col("significant_top_level_mp_terms").alias("significant_phenotype_system"),
        col("not_significant_top_level_mp_terms").alias(
            "non_significant_phenotype_system"
        ),
        "significant_mp_terms",
    )
    self.write_to_mongo(
        gene_search_df,
        "org.mousephenotype.api.models.Gene",
        "gene_search",
    )

    # self.write_to_mongo(
    #     observations_df,
    #     "org.mousephenotype.api.models.Observation",
    #     "experimental_data",
    # )

    stats_results_df = stats_results_df.withColumnRenamed(
        "doc_id", "statistical_result_id"
    )
    stats_results_df = stats_results_df.withColumn("_id", col("statistical_result_id"))
    # self.write_to_mongo(
    #     stats_results_df,
    #     "org.mousephenotype.api.models.StatisticalResult",
    #     "statistical_results",
    # )
    gene_vs_phenotypes_df.write.parquet(output_path)