Exemple #1
0
def extract_gentar_tsv(spark_session: SparkSession, file_path,
                       entity_type) -> DataFrame:
    """
    Uses a Spark session to generate a DataFrame from a TSV file. Can extract a Colonies file or a Products file.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: type of entity in the file; when it is "Product",
                        tissue columns are renamed and the PRODUCT_MULTIVALUED
                        columns are split on "|" into arrays
    :return: Spark DataFrame with the extracted data
    """
    gentar_df = utils.extract_tsv(spark_session, file_path)
    # Normalise column names: spaces -> underscores, lower case.
    gentar_df = gentar_df.toDF(*[
        column_name.replace(" ", "_").lower()
        for column_name in gentar_df.columns
    ])
    if entity_type == "Product":
        gentar_df = gentar_df.withColumnRenamed("tissue_types",
                                                "tissue_enquiry_types")
        for col_name in PRODUCT_MULTIVALUED:
            # Pipe-separated values become arrays; NULLs stay NULL instead of
            # turning into a single-element array containing NULL.
            gentar_df = gentar_df.withColumn(
                col_name,
                when(
                    col(col_name).contains("|"),
                    split(col_name, r"\|"),
                ).when(col(col_name).isNull(),
                       lit(None)).otherwise(array(col_name)),
            )
    return gentar_df
Exemple #2
0
    def main(self, sc: SparkContext, *args):
        """
        Takes in a SparkContext and the list of arguments generated by `app_options` and executes the PySpark job.
        """
        spark = SparkSession(sc)
        # Parsing app options
        gentar_tsv_path = args[0]
        output_path = args[1]

        # Load the data from TSV to a Spark DataFrame
        gentar_df = utils.extract_tsv(spark, gentar_tsv_path)

        # Map GenTar Column names to match the ones on the Observations Schema Report
        gentar_col_mapping = {
            "Phenotyping External Reference": "colony_name",
            "Background Strain": "colony_background_strain",
            "Mutation Symbol": "allele_symbol",
            "Gene Symbol": "marker_symbol",
            "MGI Gene Accession ID": "mgi_accession_id",
            "MGI Strain Accession ID": "mgi_strain_accession_id",
            "Phenotyping Work Unit": "phenotyping_centre",
            "Phenotyping Work Group": "phenotyping_consortium",
            "Production Work Unit": "production_centre",
            "Production Work Group": "production_consortium",
        }
        # Columns missing from the explicit mapping fall back to the usual
        # normalisation: spaces -> underscores, lower case.
        new_col_names = [
            gentar_col_mapping.get(col_name,
                                   col_name.replace(" ", "_").lower())
            for col_name in gentar_df.columns
        ]
        gentar_df = gentar_df.toDF(*new_col_names)
        # "overwrite" keeps re-runs idempotent, consistent with the other jobs
        # in this file that write parquet output.
        gentar_df.write.mode("overwrite").parquet(output_path)
Exemple #3
0
def extract_imits_tsv_allele_2(spark_session: SparkSession,
                               file_path: str) -> DataFrame:
    """
    Uses a Spark session to generate a DataFrame from an IMITS allele2 TSV file.
    Missing MGI accession IDs are filled with deterministic placeholders and
    the ALLELE2_MULTIVALUED columns are split on "|" into arrays.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)
    # Alleles without an MGI accession ID get a deterministic placeholder
    # built from the first 10 hex chars of md5(allele_symbol).
    # Spark's substring is 1-based (position 0 behaves the same as 1);
    # use 1 to match the documented API.
    imits_df = imits_df.withColumn(
        "allele_mgi_accession_id",
        when(
            (col("allele_mgi_accession_id").isNull()) &
            (col("type") == "Allele"),
            concat(lit("NOT-RELEASED-"),
                   substring(md5(col("allele_symbol")), 1, 10)),
        ).otherwise(col("allele_mgi_accession_id")),
    )
    # Same placeholder scheme for genes without a marker accession ID.
    imits_df = imits_df.withColumn(
        "marker_mgi_accession_id",
        when(
            (col("marker_mgi_accession_id").isNull()) &
            (col("type") == "Gene"),
            concat(lit("NOT-RELEASED-"),
                   substring(md5(col("marker_symbol")), 1, 10)),
        ).otherwise(col("marker_mgi_accession_id")),
    )
    # Synthetic row ID, stored as a string.
    imits_df = imits_df.withColumn(
        "allele2_id",
        monotonically_increasing_id().astype(StringType()))
    for col_name in ALLELE2_MULTIVALUED:
        # NOTE(review): NULL values end up as array(NULL) here, unlike
        # extract_gentar_tsv which preserves NULL — confirm this is intended.
        imits_df = imits_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imits_df
Exemple #4
0
def extract_mgi_genes(spark: SparkSession,
                      strain_report_path: str) -> DataFrame:
    """
    Extract the MGI gene/phenotype report into a DataFrame.
    The report has no header row, so columns come from GENE_PHENO_SCHEMA.
    :param spark: spark SQL session to be used in the extraction
    :param strain_report_path: path to the MGI report file
    :return: Spark DataFrame with the extracted data
    """
    # NOTE(review): the parameter name says "strain" but GENE_PHENO_SCHEMA is
    # used — presumably a copy-paste name; kept for caller compatibility.
    return extract_tsv(spark,
                       strain_report_path,
                       schema=GENE_PHENO_SCHEMA,
                       header=False)
Exemple #5
0
def extract_imits_tsv_by_entity_type(spark_session: SparkSession,
                                     file_path: str,
                                     entity_type: str) -> DataFrame:
    """
    Uses a Spark Session to generate a DataFrame from a TSV file and a specific entity type.
    Can extract Genes or Alleles from a Alleles report file produced by IMITS.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: 'Allele' or 'Gene'
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)
    # Fixed typo: "imtis" -> "imits".
    imits_entity_df = imits_df.where(imits_df.type == entity_type)
    if entity_type == "Allele":
        # Alleles without an MGI accession ID get a deterministic placeholder
        # built from the first 10 hex chars of md5(allele_symbol).
        imits_entity_df = imits_entity_df.withColumn(
            "acc",
            when(
                col("allele_mgi_accession_id").isNull(),
                concat(lit("NOT-RELEASED-"),
                       substring(md5(col("allele_symbol")), 0, 10)),
            ).otherwise(col("allele_mgi_accession_id")),
        )
        imits_entity_df = imits_entity_df.withColumn(
            "allele2_id",
            monotonically_increasing_id().astype(StringType()))
    for col_name in ALLELE2_MULTIVALUED:
        # BUG FIX: the original mutated imits_df here but returned the
        # filtered frame, silently discarding the multivalued split.
        imits_entity_df = imits_entity_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imits_entity_df
def extract_human_gene_orthologues(spark_session: SparkSession,
                                   file_path: str) -> DataFrame:
    """
    Extract the human-to-mouse gene orthologue mapping from a HMD report file.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the HMD report TSV file
    :return human_gene_orthologues_df: Dataframe with the human gene to mouse gene mapping
    """
    file_string_fields = [
        "Human Marker Symbol",
        "Human Entrez Gene ID",
        "HomoloGene ID",
        "Mouse Marker Symbol",
        "MGI Marker Accession ID",
    ]
    file_array_fields = ["High-level Mammalian Phenotype ID"]
    schema_fields = [
        StructField(field_name, StringType(), True)
        for field_name in file_string_fields
    ]
    # BUG FIX: ArrayType requires a DataType *instance*; the original passed
    # the StringType class itself, which fails when the schema is built.
    schema_fields.extend([
        StructField(field_name, ArrayType(StringType()), True)
        for field_name in file_array_fields
    ])
    hmd_file_schema = StructType(schema_fields)
    human_gene_orthologues_df = utils.extract_tsv(spark_session, file_path,
                                                  hmd_file_schema)
    return human_gene_orthologues_df
Exemple #7
0
def extract_mgi_strain_report(spark: SparkSession,
                              strain_report_path: str) -> DataFrame:
    """
    Extract the MGI strain report into a DataFrame.
    The report has no header row, so columns come from STRAIN_SCHEMA.
    :param spark: spark SQL session to be used in the extraction
    :param strain_report_path: path to the MGI strain report file
    :return: Spark DataFrame with the extracted data
    """
    return extract_tsv(spark,
                       strain_report_path,
                       schema=STRAIN_SCHEMA,
                       header=False)
def extract_phenotyping_centres(spark_session: SparkSession,
                                file_path: str) -> DataFrame:
    """
    Extract a phenotyping centres TSV file into a DataFrame.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :return: Spark DataFrame with the extracted data
    """
    return utils.extract_tsv(spark_session, file_path)
Exemple #9
0
def extract_products(spark_session: SparkSession, file_path='.') -> DataFrame:
    """
    Extract a products TSV report into a DataFrame.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :return product_df: Spark DataFrame with the extracted data
    """
    # BUG FIX: the original chained .withColumn() with no arguments, which
    # raises a TypeError at runtime; return the plain extraction instead.
    product_df = utils.extract_tsv(spark_session, file_path)
    return product_df
Exemple #10
0
def extract_mgi_homologene(spark: SparkSession,
                           homologene_report_path: str) -> DataFrame:
    """
    Extract the MGI HomoloGene report, normalising the column names
    (lower case, spaces replaced with underscores).
    :param spark: spark SQL session to be used in the extraction
    :param homologene_report_path: path to the HomoloGene report file
    :return: Spark DataFrame with the extracted data
    """
    # TODO fix empty column on rpt
    homologene_df = extract_tsv(spark, homologene_report_path, header=True)
    normalised_names = [
        name.lower().replace(" ", "_") for name in homologene_df.columns
    ]
    return homologene_df.toDF(*normalised_names)
Exemple #11
0
def extract_mgi_mrk_list(spark: SparkSession,
                         mgi_mrk_list_report_path: str) -> DataFrame:
    """
    Extract the MGI MRK_List report, normalising the column names
    (lower case, spaces -> underscores, "_(pipe-separated)" suffix removed).
    :param spark: spark SQL session to be used in the extraction
    :param mgi_mrk_list_report_path: path to the MRK_List report file
    :return: Spark DataFrame with the extracted data
    """
    mrk_list_df = extract_tsv(spark, mgi_mrk_list_report_path, header=True)
    cleaned_names = [
        name.lower().replace(" ", "_").replace("_(pipe-separated)", "")
        for name in mrk_list_df.columns
    ]
    return mrk_list_df.toDF(*cleaned_names)
Exemple #12
0
def extract_genes(spark_session: SparkSession, file_path='.') -> DataFrame:
    """
    Extract the rows of type 'Gene' from an allele2 TSV report.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :return genes_df: Spark DataFrame containing only the 'Gene' rows
    """
    allele2_df = utils.extract_tsv(spark_session, file_path)
    return allele2_df.where(allele2_df.type == 'Gene')
Exemple #13
0
def extract_mgi_alleles(spark: SparkSession,
                        strain_report_path: str) -> DataFrame:
    """
    Extract the distinct allele records from the MGI phenotypic allele report.
    The report has no header row, so columns come from PHENOTYPIC_ALLELE_SCHEMA.
    :param spark: spark SQL session to be used in the extraction
    :param strain_report_path: path to the MGI report file
    :return: Spark DataFrame with the de-duplicated allele columns
    """
    # NOTE(review): the parameter name says "strain" but the schema used is
    # PHENOTYPIC_ALLELE_SCHEMA — presumably a copy-paste name; kept for
    # caller compatibility.
    pheno_df = extract_tsv(spark,
                           strain_report_path,
                           schema=PHENOTYPIC_ALLELE_SCHEMA,
                           header=False)
    allele_columns = [
        "alleleSymbol",
        "mgiAlleleID",
        "alleleName",
        "mgiMarkerAccessionID",
        "markerSymbol",
    ]
    return pheno_df.select(*allele_columns).dropDuplicates()
Exemple #14
0
    def main(self, sc: SparkContext, *args):
        """
        Takes in a SparkContext and the list of arguments generated by `app_options` and executes the PySpark job.
        """
        # Parse app options: input report path and parquet output path.
        report_path = args[0]
        output_path = args[1]

        spark = SparkSession(sc)

        # The report has no header row; GENE_PHENO_SCHEMA names the columns.
        gene_pheno_df: DataFrame = extract_tsv(spark,
                                               report_path,
                                               schema=GENE_PHENO_SCHEMA,
                                               header=False)
        gene_pheno_df.write.mode("overwrite").parquet(output_path)
Exemple #15
0
    def main(self, sc: SparkContext, *args):
        """
        Takes in a SparkContext and the list of arguments generated by `app_options` and executes the PySpark job.
        """
        # Parse app options: input report path and parquet output path.
        homology_report_path = args[0]
        output_path = args[1]

        spark = SparkSession(sc)
        # TODO fix empty column on rpt
        homology_df = extract_tsv(spark, homology_report_path, header=True)
        # Normalise column names: lower case, spaces -> underscores.
        normalised_names = [
            name.lower().replace(" ", "_") for name in homology_df.columns
        ]
        homology_df = homology_df.toDF(*normalised_names)
        homology_df.write.mode("overwrite").parquet(output_path)
Exemple #16
0
def extract_imits_tsv(spark_session: SparkSession, file_path,
                      entity_type) -> DataFrame:
    """
    Uses a Spark session to generate a DataFrame from a TSV file. Can extract a Colonies file or a Products file.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: type of entity in the file; when it is "Product", the
                        PRODUCT_MULTIVALUED columns are split on "|" into arrays
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)
    # Normalise column names: spaces -> underscores, lower case.
    imits_df = imits_df.toDF(*[
        column_name.replace(" ", "_").lower()
        for column_name in imits_df.columns
    ])
    if entity_type == "Product":
        for col_name in PRODUCT_MULTIVALUED:
            # split() on a value without "|" yields a single-element array.
            imits_df = imits_df.withColumn(col_name, split(col_name, r"\|"))
    return imits_df
Exemple #17
0
    def main(self, sc: SparkContext, *args):
        """
        Takes in a SparkContext and the list of arguments generated by `app_options` and executes the PySpark job.
        """
        # Parse app options: input report path and parquet output path.
        allele_report_path = args[0]
        output_path = args[1]

        spark = SparkSession(sc)

        # The report has no header row; PHENOTYPIC_ALLELE_SCHEMA names the columns.
        phenotypic_allele_df: DataFrame = extract_tsv(
            spark,
            allele_report_path,
            schema=PHENOTYPIC_ALLELE_SCHEMA,
            header=False,
        )
        allele_columns = [
            "alleleSymbol",
            "mgiAlleleID",
            "alleleName",
            "mgiMarkerAccessionID",
            "markerSymbol",
        ]
        allele_df = phenotypic_allele_df.select(*allele_columns).dropDuplicates()
        allele_df.write.mode("overwrite").parquet(output_path)