def extract_gentar_tsv(spark_session: SparkSession, file_path, entity_type) -> DataFrame:
    """
    Read a GenTar TSV report into a Spark DataFrame.

    Column names are normalised to snake_case. When a Products report is
    being loaded, ``tissue_types`` is renamed to ``tissue_enquiry_types``
    and every multi-valued column is turned into an array column
    (pipe-separated values are split; scalars are wrapped; nulls stay null).

    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: report entity type, e.g. "Product"
    :return: Spark DataFrame with the extracted data
    """
    gentar_df = utils.extract_tsv(spark_session, file_path)
    snake_case_names = [c.replace(" ", "_").lower() for c in gentar_df.columns]
    gentar_df = gentar_df.toDF(*snake_case_names)

    if entity_type == "Product":
        gentar_df = gentar_df.withColumnRenamed("tissue_types", "tissue_enquiry_types")
        for multivalued_col in PRODUCT_MULTIVALUED:
            as_array = (
                when(col(multivalued_col).contains("|"), split(multivalued_col, r"\|"))
                .when(col(multivalued_col).isNull(), lit(None))
                .otherwise(array(multivalued_col))
            )
            gentar_df = gentar_df.withColumn(multivalued_col, as_array)
    return gentar_df
def main(self, sc: SparkContext, *args):
    """
    Takes in a SparkContext and the list of arguments generated by
    `app_options` and executes the PySpark job.

    args[0]: path to the GenTar colonies TSV report
    args[1]: output parquet path
    """
    spark = SparkSession(sc)

    # Parsing app options
    gentar_tsv_path = args[0]
    output_path = args[1]

    # Load the data from TSV to a Spark DataFrame
    gentar_df = utils.extract_tsv(spark, gentar_tsv_path)

    # Map GenTar column names to match the ones on the Observations Schema Report
    gentar_col_mapping = {
        "Phenotyping External Reference": "colony_name",
        "Background Strain": "colony_background_strain",
        "Mutation Symbol": "allele_symbol",
        "Gene Symbol": "marker_symbol",
        "MGI Gene Accession ID": "mgi_accession_id",
        "MGI Strain Accession ID": "mgi_strain_accession_id",
        "Phenotyping Work Unit": "phenotyping_centre",
        "Phenotyping Work Group": "phenotyping_consortium",
        "Production Work Unit": "production_centre",
        "Production Work Group": "production_consortium",
    }
    # Columns not in the explicit mapping fall back to snake_case normalisation.
    new_col_names = [
        gentar_col_mapping.get(col_name, col_name.replace(" ", "_").lower())
        for col_name in gentar_df.columns
    ]
    gentar_df = gentar_df.toDF(*new_col_names)
    # Overwrite mode for consistency with the other jobs in this module,
    # so re-running the task does not fail on an existing output path.
    gentar_df.write.mode("overwrite").parquet(output_path)
def extract_imits_tsv_allele_2(spark_session: SparkSession, file_path: str) -> DataFrame:
    """
    Read the iMits allele2 TSV report into a Spark DataFrame.

    Rows whose MGI accession ID is missing are given a synthetic
    ``NOT-RELEASED-<md5 prefix>`` identifier derived from the matching
    symbol column; a surrogate ``allele2_id`` is added and every
    multi-valued column is converted to an array (splitting on ``|``).

    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)

    # (accession column, row type it applies to, symbol column used for the hash)
    accession_fallbacks = [
        ("allele_mgi_accession_id", "Allele", "allele_symbol"),
        ("marker_mgi_accession_id", "Gene", "marker_symbol"),
    ]
    for acc_col, row_type, symbol_col in accession_fallbacks:
        synthetic_id = concat(
            lit("NOT-RELEASED-"), substring(md5(col(symbol_col)), 0, 10)
        )
        imits_df = imits_df.withColumn(
            acc_col,
            when(
                (col(acc_col).isNull()) & (col("type") == row_type), synthetic_id
            ).otherwise(col(acc_col)),
        )

    imits_df = imits_df.withColumn(
        "allele2_id", monotonically_increasing_id().astype(StringType())
    )

    for multivalued_col in ALLELE2_MULTIVALUED:
        imits_df = imits_df.withColumn(
            multivalued_col,
            when(
                col(multivalued_col).contains("|"), split(multivalued_col, r"\|")
            ).otherwise(array(multivalued_col)),
        )
    return imits_df
def extract_mgi_genes(spark: SparkSession, strain_report_path: str) -> DataFrame:
    """
    Load the MGI gene/phenotype report (headerless TSV) using GENE_PHENO_SCHEMA.

    :param spark: spark SQL session to be used in the extraction
    :param strain_report_path: path to the MGI report TSV file
    :return: Spark DataFrame with the extracted data
    """
    return extract_tsv(spark, strain_report_path, schema=GENE_PHENO_SCHEMA, header=False)
def extract_imits_tsv_by_entity_type(spark_session: SparkSession, file_path: str, entity_type: str) -> DataFrame:
    """
    Uses a Spark Session to generate a DataFrame from a TSV file and a specific entity type.
    Can extract Genes or Alleles from a Alleles report file produced by IMITS.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: 'Allele' or 'Gene'
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)
    # NOTE: local variable renamed from the original misspelled `imtis_entity_df`.
    imits_entity_df = imits_df.where(imits_df.type == entity_type)
    if entity_type == "Allele":
        # Alleles without a released MGI accession ID get a synthetic,
        # deterministic "NOT-RELEASED-<md5 prefix>" identifier.
        imits_entity_df = imits_entity_df.withColumn(
            "acc",
            when(
                col("allele_mgi_accession_id").isNull(),
                concat(lit("NOT-RELEASED-"), substring(md5(col("allele_symbol")), 0, 10)),
            ).otherwise(col("allele_mgi_accession_id")),
        )
        imits_entity_df = imits_entity_df.withColumn(
            "allele2_id", monotonically_increasing_id().astype(StringType())
        )
    # BUG FIX: the original loop applied withColumn to `imits_df` (the
    # unfiltered frame) while the function returned the filtered frame, so
    # the multi-valued columns were never actually converted to arrays.
    for col_name in ALLELE2_MULTIVALUED:
        imits_entity_df = imits_entity_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imits_entity_df
def extract_human_gene_orthologues(spark_session: SparkSession, file_path: str) -> DataFrame:
    """
    Load the HMD human–mouse orthologue report into a Spark DataFrame
    using an explicit schema (string columns plus one array column).

    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the HMD report TSV file
    :return human_gene_orthologues_df: Dataframe with the human gene to mouse gene mapping
    """
    file_string_fields = [
        "Human Marker Symbol",
        "Human Entrez Gene ID",
        "HomoloGene ID",
        "Mouse Marker Symbol",
        "MGI Marker Accession ID",
    ]
    file_array_fields = ["High-level Mammalian Phenotype ID"]
    schema_fields = [
        StructField(field_name, StringType(), True) for field_name in file_string_fields
    ]
    # BUG FIX: ArrayType requires a DataType *instance*; the original passed
    # the StringType class itself (`ArrayType(StringType)`), which fails
    # PySpark's type assertion when the schema is built.
    schema_fields.extend([
        StructField(field_name, ArrayType(StringType()), True)
        for field_name in file_array_fields
    ])
    hmd_file_schema = StructType(schema_fields)
    human_gene_orthologues_df = utils.extract_tsv(spark_session, file_path, hmd_file_schema)
    return human_gene_orthologues_df
def extract_mgi_strain_report(spark: SparkSession, strain_report_path: str) -> DataFrame:
    """
    Load the MGI strain report (headerless TSV) using STRAIN_SCHEMA.

    :param spark: spark SQL session to be used in the extraction
    :param strain_report_path: path to the MGI strain report TSV file
    :return: Spark DataFrame with the extracted data
    """
    return extract_tsv(spark, strain_report_path, schema=STRAIN_SCHEMA, header=False)
def extract_phenotyping_centres(spark_session: SparkSession, file_path: str) -> DataFrame:
    """
    Load the phenotyping centres TSV report into a Spark DataFrame.

    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the phenotyping centres TSV file
    :return: Spark DataFrame with the extracted data
    """
    return utils.extract_tsv(spark_session, file_path)
def extract_products(spark_session: SparkSession, file_path='.') -> DataFrame:
    """
    Load the products TSV report into a Spark DataFrame.

    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the products TSV file
    :return product_df: Spark DataFrame with the extracted data
    """
    # BUG FIX: the original chained `.withColumn()` with no arguments, which
    # raises a TypeError at runtime; no column transformation was intended.
    product_df = utils.extract_tsv(spark_session, file_path)
    return product_df
def extract_mgi_homologene(spark: SparkSession, homologene_report_path: str) -> DataFrame:
    """
    Load the MGI HomoloGene report into a Spark DataFrame and normalise
    its column names to snake_case.

    :param spark: spark SQL session to be used in the extraction
    :param homologene_report_path: path to the HomoloGene report file
    :return: Spark DataFrame with the extracted data
    """
    # TODO fix empty column on rpt
    homologene_df = extract_tsv(spark, homologene_report_path, header=True)
    snake_case_names = [c.lower().replace(" ", "_") for c in homologene_df.columns]
    return homologene_df.toDF(*snake_case_names)
def extract_mgi_mrk_list(spark: SparkSession, mgi_mrk_list_report_path: str) -> DataFrame:
    """
    Load the MGI MRK_List report into a Spark DataFrame, normalising column
    names to snake_case and stripping the "(pipe-separated)" suffix.

    :param spark: spark SQL session to be used in the extraction
    :param mgi_mrk_list_report_path: path to the MRK_List report file
    :return: Spark DataFrame with the extracted data
    """
    mgi_mrk_list_df = extract_tsv(spark, mgi_mrk_list_report_path, header=True)
    cleaned_names = [
        c.lower().replace(" ", "_").replace("_(pipe-separated)", "")
        for c in mgi_mrk_list_df.columns
    ]
    return mgi_mrk_list_df.toDF(*cleaned_names)
def extract_genes(spark_session: SparkSession, file_path='.') -> DataFrame:
    """
    Load the allele2 TSV report and keep only the Gene rows.

    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the allele2 TSV file
    :return genes_df: Spark DataFrame containing only rows where type == 'Gene'
    """
    allele2_df = utils.extract_tsv(spark_session, file_path)
    return allele2_df.where(allele2_df.type == 'Gene')
def extract_mgi_alleles(spark: SparkSession, strain_report_path: str) -> DataFrame:
    """
    Load the MGI phenotypic allele report (headerless TSV, parsed with
    PHENOTYPIC_ALLELE_SCHEMA) and return the de-duplicated allele columns.

    :param spark: spark SQL session to be used in the extraction
    :param strain_report_path: path to the MGI phenotypic allele report file
    :return: Spark DataFrame with one row per distinct allele
    """
    allele_columns = [
        "alleleSymbol",
        "mgiAlleleID",
        "alleleName",
        "mgiMarkerAccessionID",
        "markerSymbol",
    ]
    report_df = extract_tsv(
        spark, strain_report_path, schema=PHENOTYPIC_ALLELE_SCHEMA, header=False
    )
    return report_df.select(*allele_columns).dropDuplicates()
def main(self, sc: SparkContext, *args):
    """
    Takes in a SparkContext and the list of arguments generated by
    `app_options` and executes the PySpark job.

    args[0]: path to the MGI gene/phenotype report
    args[1]: output parquet path
    """
    # Parsing app options
    mgi_gene_pheno_report_path, output_path = args[0], args[1]

    spark = SparkSession(sc)
    # The report has no header row, so an explicit schema is required.
    mgi_gene_pheno_df: DataFrame = extract_tsv(
        spark, mgi_gene_pheno_report_path, schema=GENE_PHENO_SCHEMA, header=False
    )
    mgi_gene_pheno_df.write.mode("overwrite").parquet(output_path)
def main(self, sc: SparkContext, *args):
    """
    Takes in a SparkContext and the list of arguments generated by
    `app_options` and executes the PySpark job.

    args[0]: path to the MGI homology report
    args[1]: output parquet path
    """
    # Parsing app options
    mgi_homology_report_path, output_path = args[0], args[1]

    spark = SparkSession(sc)
    # TODO fix empty column on rpt
    mgi_homology_df = extract_tsv(spark, mgi_homology_report_path, header=True)
    # Normalise the report's column names to snake_case.
    snake_case_names = [c.lower().replace(" ", "_") for c in mgi_homology_df.columns]
    mgi_homology_df = mgi_homology_df.toDF(*snake_case_names)
    mgi_homology_df.write.mode("overwrite").parquet(output_path)
def extract_imits_tsv(spark_session: SparkSession, file_path, entity_type) -> DataFrame:
    """
    Uses a Spark session to generate a DataFrame from a TSV file.
    Can extract a Colonies file or a Products file; column names are
    normalised to snake_case, and for Product reports the multi-valued
    columns are split on ``|`` into array columns.

    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: report entity type, e.g. "Product"
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)
    snake_case_names = [c.replace(" ", "_").lower() for c in imits_df.columns]
    imits_df = imits_df.toDF(*snake_case_names)

    if entity_type == "Product":
        for multivalued_col in PRODUCT_MULTIVALUED:
            imits_df = imits_df.withColumn(multivalued_col, split(multivalued_col, r"\|"))
    return imits_df
def main(self, sc: SparkContext, *args):
    """
    Takes in a SparkContext and the list of arguments generated by
    `app_options` and executes the PySpark job.

    args[0]: path to the MGI phenotypic allele report
    args[1]: output parquet path
    """
    # Parsing app options
    mgi_phenotypic_allele_report_path, output_path = args[0], args[1]

    spark = SparkSession(sc)
    # The report has no header row, so an explicit schema is required.
    mgi_phenotypic_allele_df: DataFrame = extract_tsv(
        spark,
        mgi_phenotypic_allele_report_path,
        schema=PHENOTYPIC_ALLELE_SCHEMA,
        header=False,
    )
    allele_columns = [
        "alleleSymbol",
        "mgiAlleleID",
        "alleleName",
        "mgiMarkerAccessionID",
        "markerSymbol",
    ]
    allele_df = mgi_phenotypic_allele_df.select(*allele_columns).dropDuplicates()
    allele_df.write.mode("overwrite").parquet(output_path)