    def compute(
        self,
        biomarkers_table: str,
        source_table: str,
        disease_table: str,
        drug_index: str,
        output_file: str
    ) -> None:
        """Loads and processes inputs to generate the Cancer Biomarkers evidence strings"""

        # Import data
        biomarkers_df = self.spark.read.csv(biomarkers_table, sep='\t', header=True)
        source_df = self.spark.read.json(source_table).select(
            col('label').alias('niceName'),
            'source', 'url')
        disease_df = self.spark.read.json(disease_table).select(
            regexp_replace(col('name'), '_', '').alias('tumor_type'),
            regexp_extract(col('url'), r'[^/]+$', 0).alias('diseaseFromSourceMappedId'))
        drugs_df = self.spark.read.parquet(drug_index).select(
            col('id').alias('drugId'), col('name').alias('drug'))

        # Process inputs to generate evidence strings
        evidence = self.process_biomarkers(
            biomarkers_df, source_df, disease_df, drugs_df
        )

        # Write evidence strings
        write_evidence_strings(evidence, output_file)
        logging.info(f'{evidence.count()} evidence strings have been saved to {output_file}.')
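
The write_evidence_strings helper used throughout these examples comes from the project's shared module and is not shown here. Below is a minimal sketch of what such a helper could look like, assuming it simply writes the DataFrame out as gzipped JSON; the real implementation may instead write to a temporary directory and move the single part file to the requested path.

from pyspark.sql import DataFrame


def write_evidence_strings(evidence: DataFrame, output_file: str) -> None:
    """Hypothetical sketch: persist the evidence DataFrame as gzipped JSON."""
    (
        evidence
        # Collapse to a single partition so that one part file is produced.
        .coalesce(1)
        .write.format('json')
        .mode('overwrite')
        .option('compression', 'gzip')
        # Note: Spark writes a directory containing the part file; the real helper
        # may move/rename that part file to the requested output path.
        .save(output_file)
    )
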
def main(
    toxcast: str,
    output: str,
    adverse_events: str,
    safety_risk: str,
    log_file: Optional[str] = None,
):
    """
    This module puts together data from different sources that describe target safety liabilities.

    Args:
        adverse_events: Input TSV containing adverse events associated with targets that have been collected from relevant publications. Fetched from GitHub.
        safety_risk: Input TSV containing cardiovascular safety liabilities associated with targets that have been collected from relevant publications. Fetched from GitHub.
        toxcast: Input table containing biological processes associated with relevant targets that have been observed in toxicity assays.
        output: Output gzipped JSON file following the target safety liabilities data model.
        log_file: Destination of the logs generated by this script. Defaults to None.
    """

    # Logger initializer. If no log_file is specified, logs are written to stderr
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    if log_file:
        # basicConfig already logs to stderr; additionally write the logs to the requested file
        logging.getLogger().addHandler(logging.FileHandler(log_file))

    # Initialize spark context
    global spark
    spark = initialize_spark()
    spark.sparkContext.addFile(adverse_events)
    spark.sparkContext.addFile(safety_risk)
    logging.info('Remote files successfully added to the Spark Context.')

    # Load and process the input files into dataframes
    ae_df = process_adverse_events(
        SparkFiles.get(adverse_events.split('/')[-1]))
    sr_df = process_safety_risk(SparkFiles.get(safety_risk.split('/')[-1]))
    toxcast_df = process_toxcast(toxcast)
    logging.info('Data has been processed. Merging...')

    # Combine dfs and group evidence
    safety_df = (
        # unionByName is used instead of union to account for differences in the schemas
        ae_df.unionByName(sr_df, allowMissingColumns=True)
        .unionByName(toxcast_df, allowMissingColumns=True)
    )

    # Write output
    logging.info('Evidence strings have been processed. Saving...')
    write_evidence_strings(safety_df, output)
    logging.info(
        f'{safety_df.count()} evidence of safety liabilities have been saved to {output}. Exiting.'
    )

    return 0
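
The adverse events and safety risk tables are fetched from remote URLs via SparkContext.addFile and then resolved back to local paths with SparkFiles.get. A small standalone illustration of that pattern follows; the URL below is a placeholder, not one of the project's actual inputs.

from pyspark import SparkFiles
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()

# Placeholder remote TSV; any HTTP(S) URL reachable from the driver behaves the same way.
url = 'https://raw.githubusercontent.com/example/repo/main/adverse_events.tsv'
spark.sparkContext.addFile(url)  # downloads the file and makes it available to every node

# SparkFiles.get() resolves the basename of an added file to its local path.
local_path = SparkFiles.get(url.split('/')[-1])
ae_df = spark.read.csv(local_path, sep='\t', header=True)
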
Example #3
def main(input_file: str, output_file: str, cache_dir: str) -> None:

    # Read and process Orphanet's XML file into evidence strings

    orphanet_df = parse_orphanet_xml(input_file, spark)
    logging.info('Orphanet input file has been imported. Processing evidence strings.')

    evidence_df = process_orphanet(orphanet_df)

    evidence_df = add_efo_mapping(evidence_strings=evidence_df, spark_instance=spark, ontoma_cache_dir=cache_dir)
    logging.info('Disease mappings have been added.')

    # Save data
    write_evidence_strings(evidence_df, output_file)
    logging.info(f'{evidence_df.count()} evidence strings have been saved to {output_file}')
def main(outputFile: str) -> None:

    # Initialize spark session
    spark_mem_limit = detect_spark_memory_limit()
    spark_conf = (
        SparkConf()
        .set('spark.driver.memory', f'{spark_mem_limit}g')
        .set('spark.executor.memory', f'{spark_mem_limit}g')
        .set('spark.driver.maxResultSize', '0')
        .set('spark.debug.maxToStringFields', '2000')
        .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
    )
    spark = (
        SparkSession.builder
        .config(conf=spark_conf)
        .master('local[*]')
        .getOrCreate()
    )

    spark.sparkContext.addFile(TEPURL)

    # Fetching and processing the TEP table, which is then saved as a JSON file:
    TEP_df = (
        spark.read.csv(SparkFiles.get(TEPURL.split('/')[-1]),
                       sep='\t',
                       header=True)

        # Generating TEP url from Gene column: SLC12A4/SLC12A6 -> https://www.thesgc.org/tep/SLC12A4SLC12A6
        .withColumn(
            'url',
            concat(lit('https://www.thesgc.org/tep/'),
                   regexp_replace(lower(col('Gene')), '/', '')))

        # Exploding TEPs, where multiple genes are given:
        .withColumn('targetFromSourceId', explode(split(col('Gene'), '/')))

        # Renaming columns:
        .withColumnRenamed('Therapeutic Area', 'therapeuticArea')
        .withColumnRenamed('Description', 'description')

        # Dropping columns:
        .drop(*['Gene', 'version', 'Date']).persist())

    logging.info('TEP dataset has been downloaded and formatted.')
    logging.info(f'Number of TEPs: {TEP_df.count()}')
    logging.info(
        f'Number of unique genes: {TEP_df.select("targetFromSourceId").distinct().count()}'
    )

    # Saving data:
    write_evidence_strings(TEP_df, outputFile)
    logging.info(f'TEP dataset is written to {outputFile}.')
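
detect_spark_memory_limit is another shared helper that is not shown in these examples. A plausible sketch follows, under the assumption that it sizes the driver and executor memory from the machine's total RAM while keeping some headroom; the 80% factor and the use of psutil are assumptions.

from psutil import virtual_memory


def detect_spark_memory_limit() -> int:
    """Hypothetical sketch: return a memory limit (in GiB) for the Spark driver/executor,
    leaving some headroom for the rest of the system. The 80% factor is an assumption."""
    total_gib = virtual_memory().total / (1024 ** 3)
    return int(total_gib * 0.8)
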
def main(chembl_evidence: str, predictions: str, output_file: str) -> None:
    """
    This module adds the studyStopReasonCategories to the ChEMBL evidence as a result of the categorisation of the clinical trial reason to stop.
    Args:
        chembl_evidence: Input gzipped JSON with the evidence submitted by ChEMBL.
        predictions: Input TSV containing the categories of the clinical trial reason to stop.
            Instructions for applying the ML model here: https://github.com/ireneisdoomed/stopReasons.
        output_file: Output gzipped JSON file containing the ChEMBL evidence with the additional studyStopReasonCategories field.
    """
    logging.info(f'ChEMBL evidence JSON file: {chembl_evidence}')
    logging.info(f'Classes of reason to stop table: {predictions}')

    # Load input into dataframes
    chembl_df = spark.read.json(chembl_evidence).persist()
    predictions_df = (
        load_stop_reasons_classes(predictions)
        .withColumnRenamed('why_stopped', 'studyStopReason')
        .withColumnRenamed('subclasses', 'studyStopReasonCategories')
        .select('studyStopReason', 'studyStopReasonCategories')
        .distinct()
        .persist()
    )

    # Join datasets
    evd_df = chembl_df.join(predictions_df, on='studyStopReason', how='left').distinct()

    # We expect that ~10% of evidence strings have a reason to stop assigned
    # It is asserted that this fraction is between 8 and 11% of the total count
    total_count = evd_df.count()
    early_stopped_count = evd_df.filter(
        col('studyStopReasonCategories').isNotNull()).count()

    if not 0.08 < early_stopped_count / total_count < 0.11:
        raise AssertionError(
            f'The fraction of evidence with a CT reason to stop class is not as expected ({early_stopped_count / total_count}).'
        )

    # Write output
    logging.info('Evidence strings have been processed. Saving...')
    write_evidence_strings(evd_df, output_file)

    logging.info(
        f'{total_count} evidence strings have been saved to {output_file}. Exiting.'
    )
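
load_stop_reasons_classes is not shown here. Judging from the renames above, the predictions TSV carries why_stopped and subclasses columns; the sketch below assumes the subclasses are stored as a delimited string, so the delimiter and the raw file layout are assumptions.

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, split


def load_stop_reasons_classes(predictions: str) -> DataFrame:
    """Hypothetical sketch: load the TSV of classified stop reasons.
    Column names come from the snippet above; the comma delimiter is an assumption."""
    spark = SparkSession.builder.getOrCreate()
    return (
        spark.read.csv(predictions, sep='\t', header=True)
        # Turn the delimited category labels into an array column.
        .withColumn('subclasses', split(col('subclasses'), ','))
    )
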
def main(input_file: str, output_file: str, cache_dir: str, local: bool = False) -> None:

    # Initialize spark session
    if local:
        sparkConf = (
            SparkConf()
            .set('spark.driver.memory', '15g')
            .set('spark.executor.memory', '15g')
            .set('spark.driver.maxResultSize', '0')
            .set('spark.debug.maxToStringFields', '2000')
            .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
        )
        spark = (
            SparkSession.builder
            .config(conf=sparkConf)
            .master('local[*]')
            .getOrCreate()
        )
    else:
        sparkConf = (
            SparkConf()
            .set('spark.driver.maxResultSize', '0')
            .set('spark.debug.maxToStringFields', '2000')
            .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
        )
        spark = (
            SparkSession.builder
            .config(conf=sparkConf)
            .getOrCreate()
        )

    # Read and process Clingen's table into evidence strings

    clingen_df = read_input_file(input_file, spark_instance=spark)
    logging.info('Gene Validity Curations table has been imported. Processing evidence strings.')

    evidence_df = process_clingen(clingen_df)

    evidence_df = add_efo_mapping(evidence_strings=evidence_df, spark_instance=spark, ontoma_cache_dir=cache_dir)
    logging.info('Disease mappings have been added.')

    write_evidence_strings(evidence_df, output_file)
    logging.info(f'{evidence_df.count()} evidence strings have been saved to {output_file}')
def main(dd_file: str,
         eye_file: str,
         skin_file: str,
         cancer_file: str,
         cardiac_file: str,
         output_file: str,
         cache_dir: str,
         local: bool = False) -> None:

    # Initialize spark session
    global spark
    spark_mem_limit = detect_spark_memory_limit()
    spark_conf = (
        SparkConf()
        .set('spark.driver.memory', f'{spark_mem_limit}g')
        .set('spark.executor.memory', f'{spark_mem_limit}g')
        .set('spark.driver.maxResultSize', '0')
        .set('spark.debug.maxToStringFields', '2000')
        .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
    )
    spark = (
        SparkSession.builder
        .config(conf=spark_conf)
        .config('spark.sql.broadcastTimeout', '36000')
        .master('local[*]')
        .getOrCreate()
    )

    # Read and process G2P's tables into evidence strings
    gene2phenotype_df = read_input_file(dd_file, eye_file, skin_file,
                                        cancer_file, cardiac_file)
    logging.info(
        'Gene2Phenotype panels have been imported. Processing evidence strings.'
    )

    evidence_df = process_gene2phenotype(gene2phenotype_df)

    evidence_df = add_efo_mapping(evidence_strings=evidence_df,
                                  ontoma_cache_dir=cache_dir,
                                  spark_instance=spark)
    logging.info('Disease mappings have been added.')

    # Saving data:
    write_evidence_strings(evidence_df, output_file)
    logging.info(
        f'{evidence_df.count()} evidence strings have been saved to {output_file}'
    )
Example #8
    return parser


if __name__ == '__main__':
    args = get_parser().parse_args()

    # Logger initializer. If no log_file is specified, logs are written to stderr
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    if args.log_file:
        # basicConfig already logs to stderr; additionally write the logs to the requested file
        logging.getLogger().addHandler(logging.FileHandler(args.log_file))

    spark = initialize_sparksession()

    evd_df = main(
        az_binary_data=args.az_binary_data,
        az_quant_data=args.az_quant_data,
        spark_instance=spark,
    )

    write_evidence_strings(evd_df, args.output)
    logging.info(
        f'Evidence strings have been saved to {args.output}. Exiting.')
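
The snippet above only shows the tail of get_parser. Below is a hypothetical reconstruction: the argument names are taken from how args is used in the __main__ block, while the help texts are assumptions.

import argparse


def get_parser() -> argparse.ArgumentParser:
    """Hypothetical reconstruction of the parser whose tail is shown above."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--az_binary_data', required=True, help='Input with the binary traits.')
    parser.add_argument('--az_quant_data', required=True, help='Input with the quantitative traits.')
    parser.add_argument('--output', required=True, help='Output gzipped JSON file with the evidence strings.')
    parser.add_argument('--log_file', required=False, help='Optional destination of the logs generated by this script.')
    return parser
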
def main(cooccurrences, outputFile):

    # Log parameters:
    logging.info(f'Cooccurrence file: {cooccurrences}')
    logging.info(f'Output file: {outputFile}')
    logging.info('Generating evidence:')

    # Load/filter datasets:
    agg_cooccurrence_df = (
        # Reading file:
        read_path(cooccurrences, spark)
        .repartition(200)
        # Filter out pairs found in unwanted sections
        .filter(F.col('section').isin(SECTIONS_OF_INTEREST))
        # Casting integer pmid column to string:
        .withColumn("pmid", F.trim(F.col('pmid').cast(StringType())))
        # Dropping pmcid values that violate schema:
        .withColumn('pmcid', F.when(F.col('pmcid').rlike(r'^PMC\d+$'), F.col('pmcid')))
        # Publication identifier is a pmid if available, otherwise pmcid
        .withColumn('publicationIdentifier', F.when(F.col('pmid').isNull(), F.col('pmcid')).otherwise(F.col('pmid')))
        # Filtering for disease/target cooccurrences:
        .filter(
            (F.col('type') == 'GP-DS')  # Keep only gene/protein - disease cooccurrences
            & F.col('isMapped')  # Keep only cooccurrences with mapped identifiers
            & F.col('publicationIdentifier').isNotNull()  # Make sure at least the pmid or the pmcid is given
            & (F.length(F.col('text')) < 600)  # Exclude sentences of 600 or more characters
            & ~F.col('label1').isin(EXCLUDED_TARGET_TERMS)  # Exclude targets whose label is on the exclusion list
        )
        # Renaming columns:
        .withColumnRenamed('keywordId1', 'targetFromSourceId')
        .withColumnRenamed('keywordId2', 'diseaseFromSourceMappedId')
        # Aggregating data by publication, target and disease:
        .groupBy(['publicationIdentifier', 'targetFromSourceId', 'diseaseFromSourceMappedId'])
        .agg(
            F.collect_set(F.col('pmcid')).alias('pmcIds'),
            F.collect_set(F.col('pmid')).alias('literature'),
            F.collect_set(
                F.struct(
                    F.col('text'),
                    F.col('start1').alias('tStart'),
                    F.col('end1').alias('tEnd'),
                    F.col('start2').alias('dStart'),
                    F.col('end2').alias('dEnd'),
                    F.col('section'),
                )
            ).alias('textMiningSentences'),
            F.sum(F.col('evidence_score')).alias('resourceScore'),
        )
        # Nullify pmcIds if empty array:
        .withColumn('pmcIds', F.when(F.size('pmcIds') != 0, F.col('pmcIds')))
        # Only evidence with score above 1 is considered:
        .filter(F.col('resourceScore') > 1)
    )

    # Final formatting and saving data:
    evidence = (
        agg_cooccurrence_df
        # Adding literal columns:
        .withColumn('datasourceId', F.lit('europepmc')).withColumn('datatypeId', F.lit('literature'))
        # Reorder columns:
        .select(
            [
                'datasourceId',
                'datatypeId',
                'targetFromSourceId',
                'diseaseFromSourceMappedId',
                'resourceScore',
                'literature',
                'textMiningSentences',
                'pmcIds',
            ]
        )
    )

    write_evidence_strings(evidence, outputFile)
    logging.info('EPMC disease target evidence saved.')
    logging.info(f'Number of evidence: {agg_cooccurrence_df.count()}')
    # Report on the number of diseases, targets and associations if loglevel == "debug" to avoid cost on computation time:
    logging.debug(f"Number of publications: {agg_cooccurrence_df.select(F.col('publicationIdentifier')).count()}")
    logging.debug(
        f"Number of publications without pubmed ID: {agg_cooccurrence_df.filter(F.col('publicationIdentifier').contains('PMC')).select('publicationIdentifier').distinct().count()}"
    )
    logging.debug(f"Number of targets: {evidence.select(F.col('targetFromSourceId')).distinct().count()}")
    logging.debug(f"Number of diseases: {evidence.select(F.col('diseaseFromSourceMappedId')).distinct().count()}")
    logging.debug(
        f"Number of associations: {evidence.select(F.col('diseaseFromSourceMappedId'), F.col('targetFromSourceId')).dropDuplicates().count()}"
    )
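
To make the shape of the aggregated output concrete, here is a small self-contained example of the groupBy/collect_set(struct(...)) step on made-up cooccurrence rows (all values are invented for illustration).

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master('local[*]').getOrCreate()

# Two made-up cooccurrences sharing the same publication, target and disease:
toy = spark.createDataFrame(
    [
        ('12345', 'ENSG00000000001', 'EFO_0000001', 'sentence one', 0.9),
        ('12345', 'ENSG00000000001', 'EFO_0000001', 'sentence two', 0.8),
    ],
    ['publicationIdentifier', 'targetFromSourceId', 'diseaseFromSourceMappedId', 'text', 'evidence_score'],
)

aggregated = (
    toy.groupBy('publicationIdentifier', 'targetFromSourceId', 'diseaseFromSourceMappedId')
    .agg(
        F.collect_set(F.struct(F.col('text'))).alias('textMiningSentences'),
        F.sum('evidence_score').alias('resourceScore'),
    )
)
# One row remains, with both sentences nested under textMiningSentences and resourceScore = 1.7.
aggregated.show(truncate=False)
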
Example #10
def main(desc_file, evid_file, cell_file, out_file):
    sparkConf = (
        SparkConf()
        .set('spark.driver.memory', '15g')
        .set('spark.executor.memory', '15g')
        .set('spark.driver.maxResultSize', '0')
        .set('spark.debug.maxToStringFields', '2000')
        .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
    )
    spark = (
        SparkSession.builder
        .config(conf=sparkConf)
        .master('local[*]')
        .getOrCreate()
    )

    # Log parameters:
    logging.info(f'Evidence file: {evid_file}')
    logging.info(f'Description file: {desc_file}')
    logging.info(f'Cell type annotation: {cell_file}')
    logging.info(f'Output file: {out_file}')

    # Read files:
    evidence_df = (
        spark.read.csv(evid_file, sep='\t', header=True)
        .drop('pmid', 'gene_set_name', 'disease_name')
    )
    cell_lines_df = spark.read.csv(cell_file, sep='\t', header=True)
    description_df = spark.read.csv(desc_file, sep='\t', header=True)

    # Logging dataframe stats:
    logging.info(f'Number of evidence: {evidence_df.count()}')
    logging.info(f'Number of descriptions: {description_df.count()}')
    logging.info(f'Number of cell/tissue annotation: {cell_lines_df.count()}')

    # Tissues and cancer types are annotated together in the same column (tissue_or_cancer_type)
    # To disambiguate one from another, the column is combined with the cell lines
    # First on the tissue level:
    tissue_desc = (
        description_df
        .withColumnRenamed('tissue_or_cancer_type', 'tissue')
        .join(cell_lines_df, on='tissue', how='inner')
    )

    # And then on the disease level:
    cell_desc = (
        description_df
        .withColumnRenamed('tissue_or_cancer_type', 'diseaseFromSource')
        .join(cell_lines_df, on='diseaseFromSource', how='inner')
    )

    merged_annotation = (
        # Concatenating the above generated dataframes:
        cell_desc.union(tissue_desc)

        # Aggregating by disease and method:
        .groupBy('diseaseFromSource', 'efo_id', 'method')

        # The cell annotation is aggregated in a list of struct:
        .agg(
            collect_set(
                struct(col('name'), col('id'), col('tissue'), col('tissueId'))
            ).alias('diseaseCellLines')
        )
        .drop('method')
    )

    # Joining merged annotation with evidence:
    pooled_evidence_df = (
        evidence_df.select(
            col('target_id').alias('targetFromSourceId'),
            col('disease_id').alias('efo_id'),
            col('score').alias('resourceScore').cast(FloatType()),
        )

        # Some of the target identifiers are not Ensembl gene IDs - replace them:
        .replace(to_replace=CRISPR_SYMBOL_MAPPING, subset=['targetFromSourceId'])

        # Merging with descriptions:
        .join(merged_annotation, on='efo_id', how='outer')

        # From EFO uri, generate EFO id:
        .withColumn('diseaseFromSourceMappedId', element_at(split(col('efo_id'), '/'), -1))
        .drop('efo_id')

        # Adding constants:
        .withColumn('datasourceId', lit('crispr'))
        .withColumn('datatypeId', lit('affected_pathway'))
        .persist()
    )

    logging.info(
        f'Saving {pooled_evidence_df.count()} CRISPR evidence in JSON format, to: {out_file}'
    )

    write_evidence_strings(pooled_evidence_df, out_file)
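
The EFO identifier extraction above relies on element_at(split(col('efo_id'), '/'), -1). A tiny standalone illustration on a made-up EFO URI:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, element_at, split

spark = SparkSession.builder.master('local[*]').getOrCreate()

# A made-up EFO URI:
df = spark.createDataFrame([('http://www.ebi.ac.uk/efo/EFO_0000311',)], ['efo_id'])
(
    df.withColumn('diseaseFromSourceMappedId', element_at(split(col('efo_id'), '/'), -1))
    .show(truncate=False)
)
# diseaseFromSourceMappedId -> EFO_0000311
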
Example #11
    def generate_panelapp_evidence(self, input_file: str, output_file: str,
                                   cache_dir: str) -> None:
        logging.info('Filter and extract the necessary columns.')
        panelapp_df = self.spark.read.csv(input_file, sep=r'\t', header=True)
        # Panel version can be either a single number (e.g. 1), or two numbers separated by a dot (e.g. 3.14). We cast
        # either representation to float to ensure correct filtering below. (Note that conversion to float would not
        # work in the general case, because 3.4 > 3.14, but we only need to compare relative to 1.0.)
        panelapp_df = panelapp_df.withColumn(
            'Panel Version', panelapp_df['Panel Version'].cast('float'))
        panelapp_df = (
            panelapp_df
            .filter(
                ((col('List') == 'green') | (col('List') == 'amber'))
                & (col('Panel Version') >= 1.0)
                & (col('Panel Status') == 'PUBLIC')
            )
            .select('Symbol', 'Panel Id', 'Panel Name', 'List', 'Mode of inheritance', 'Phenotypes')
            # The full original records are not redundant; however, uniqueness is not guaranteed on this subset of
            # fields, so the selection is explicitly deduplicated.
            .distinct()
        )

        logging.info(
            'Fix typos and formatting errors which would interfere with phenotype splitting.'
        )
        panelapp_df = panelapp_df.withColumn('cleanedUpPhenotypes',
                                             col('Phenotypes'))
        for regexp, replacement in self.PHENOTYPE_BEFORE_SPLIT_RE.items():
            panelapp_df = panelapp_df.withColumn(
                'cleanedUpPhenotypes',
                regexp_replace(col('cleanedUpPhenotypes'), regexp,
                               replacement))

        logging.info('Split and explode the phenotypes.')
        panelapp_df = (
            panelapp_df
            .withColumn('cohortPhenotypes', array_distinct(split(col('cleanedUpPhenotypes'), ';')))
            .withColumn('phenotype', explode(col('cohortPhenotypes')))
        )

        logging.info(
            'Remove specific patterns and phrases which will interfere with ontology extraction and mapping.'
        )
        panelapp_df = panelapp_df.withColumn('diseaseFromSource',
                                             col('phenotype'))
        for regexp in self.PHENOTYPE_AFTER_SPLIT_RE:
            panelapp_df = panelapp_df.withColumn(
                'diseaseFromSource',
                regexp_replace(col('diseaseFromSource'), f'({regexp})', ''))

        logging.info(
            'Extract ontology information, clean up and filter the split phenotypes.'
        )
        panelapp_df = (
            panelapp_df

            # Extract Orphanet/MONDO/HP ontology identifiers and remove them from the phenotype string.
            .withColumn('ontology_namespace', regexp_extract(col('diseaseFromSource'), self.OTHER_RE, 1))
            .withColumn('ontology_namespace', regexp_replace(col('ontology_namespace'), 'OrphaNet: ORPHA', 'ORPHA'))
            .withColumn('ontology_id', regexp_extract(col('diseaseFromSource'), self.OTHER_RE, 2))
            .withColumn(
                'ontology',
                when(
                    (col('ontology_namespace') != '') & (col('ontology_id') != ''),
                    concat(col('ontology_namespace'), lit(':'), col('ontology_id'))
                )
            )
            .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({self.OTHER_RE})', ''))

            # Extract OMIM identifiers and remove them from the phenotype string.
            .withColumn('omim_id', regexp_extract(col('diseaseFromSource'), self.OMIM_RE, 2))
            .withColumn('omim', when(col('omim_id') != '', concat(lit('OMIM:'), col('omim_id'))))
            .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({self.OMIM_RE})', ''))

            # Choose one of the ontology identifiers, keeping OMIM as a priority.
            .withColumn('diseaseFromSourceId', when(col('omim').isNotNull(), col('omim')).otherwise(col('ontology')))
            .drop('ontology_namespace', 'ontology_id', 'ontology', 'omim_id', 'omim')

            # Clean up the final split phenotypes.
            .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), r'\(\)', ''))
            .withColumn('diseaseFromSource', trim(col('diseaseFromSource')))
            .withColumn('diseaseFromSource', when(col('diseaseFromSource') != '', col('diseaseFromSource')))

            # Remove low quality records, where the name of the phenotype string starts with a question mark.
            .filter(
                ~(
                    (col('diseaseFromSource').isNotNull()) & (col('diseaseFromSource').startswith('?'))
                )
            )

            # Remove duplication caused by cases where multiple phenotypes within the same record fail to generate any
            # phenotype string or ontology identifier.
            .distinct()

            # For records where we were unable to determine either a phenotype string or an ontology identifier,
            # substitute the panel name instead.
            .withColumn(
                'diseaseFromSource',
                when(
                    (col('diseaseFromSource').isNull()) & (col('diseaseFromSourceId').isNull()),
                    col('Panel Name')
                )
                .otherwise(col('diseaseFromSource'))
            )
            .persist()
        )

        logging.info('Fetch and join literature references.')
        all_panel_ids = panelapp_df.select('Panel Id').toPandas()['Panel Id'].unique()
        literature_references = self.fetch_literature_references(all_panel_ids)
        panelapp_df = panelapp_df.join(literature_references,
                                       on=['Panel Id', 'Symbol'],
                                       how='left')

        if self.debug_output_phenotypes_filename:
            logging.info('Output tables for debugging purposes, if requested.')
            (panelapp_df.select(
                'Phenotypes',  # Original, unaltered string with all phenotypes.
                'cleanedUpPhenotypes',  # String with phenotypes after pre-split cleanup.
                'phenotype',  # Individual phenotype after splitting.
                'diseaseFromSource',  # Final cleaned up disease name.
                'diseaseFromSourceId',  # Final cleaned up disease ID.
            ).distinct().toPandas().to_csv(
                self.debug_output_phenotypes_filename, sep='\t', index=False))

        logging.info(
            'Drop unnecessary fields and populate the final evidence string structure.'
        )
        evidence_df = (
            panelapp_df
            .drop('Phenotypes', 'cleanedUpPhenotypes', 'phenotype')

            # allelicRequirements requires a list, but we always only have one value from PanelApp.
            .withColumn(
                'allelicRequirements',
                when(col('Mode of inheritance').isNotNull(), array(col('Mode of inheritance')))
            )
            .drop('Mode of inheritance')

            .withColumnRenamed('List', 'confidence')
            .withColumn('datasourceId', lit('genomics_england'))
            .withColumn('datatypeId', lit('genetic_literature'))

            # diseaseFromSourceId populated above
            # literature populated above
            .withColumnRenamed('Panel Id', 'studyId')
            .withColumnRenamed('Panel Name', 'studyOverview')
            .withColumnRenamed('Symbol', 'targetFromSourceId')

            # Some residual duplication is caused by slightly different representations from `cohortPhenotypes` being
            # cleaned up to the same representation in `diseaseFromSource`, for example "Pontocerebellar hypoplasia type
            # 2D (613811)" and "Pontocerebellar hypoplasia type 2D, 613811".
            .distinct()
        )

        evidence_df = add_efo_mapping(evidence_strings=evidence_df,
                                      spark_instance=self.spark,
                                      ontoma_cache_dir=cache_dir)
        logging.info('Disease mappings have been added.')

        write_evidence_strings(evidence_df, output_file)
        logging.info(
            f'{evidence_df.count()} evidence strings have been saved to {output_file}'
        )
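
To illustrate the phenotype splitting step in isolation, here is a small self-contained example on a made-up Phenotypes string; the values are invented for demonstration only.

from pyspark.sql import SparkSession
from pyspark.sql.functions import array_distinct, col, explode, split

spark = SparkSession.builder.master('local[*]').getOrCreate()

# A made-up phenotype string with a duplicated entry and a question-marked entry:
df = spark.createDataFrame([('Epilepsy, 600131;Epilepsy, 600131;?Seizures',)], ['cleanedUpPhenotypes'])
(
    df.withColumn('cohortPhenotypes', array_distinct(split(col('cleanedUpPhenotypes'), ';')))
    .withColumn('phenotype', explode(col('cohortPhenotypes')))
    .show(truncate=False)
)
# Two rows remain: the duplicate is collapsed by array_distinct, and the '?'-prefixed
# phenotype would later be discarded by the low-quality record filter shown above.
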