def compute(
    self,
    biomarkers_table: str,
    source_table: str,
    disease_table: str,
    drug_index: str,
    output_file: str,
) -> None:
    """Loads and processes inputs to generate the Cancer Biomarkers evidence strings."""

    # Import data
    biomarkers_df = self.spark.read.csv(biomarkers_table, sep='\t', header=True)
    source_df = self.spark.read.json(source_table).select(
        col('label').alias('niceName'), 'source', 'url'
    )
    disease_df = self.spark.read.json(disease_table).select(
        regexp_replace(col('name'), '_', '').alias('tumor_type'),
        regexp_extract(col('url'), r'[^/]+$', 0).alias('diseaseFromSourceMappedId'),
    )
    drugs_df = self.spark.read.parquet(drug_index).select(
        col('id').alias('drugId'), col('name').alias('drug')
    )

    # Process inputs to generate evidence strings
    evidence = self.process_biomarkers(biomarkers_df, source_df, disease_df, drugs_df)

    # Write evidence strings
    write_evidence_strings(evidence, output_file)
    logging.info(f'{evidence.count()} evidence strings have been saved to {output_file}.')
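# NOTE: `write_evidence_strings` is a shared helper used throughout these parsers but not shown in this
# excerpt. A minimal sketch of what it could look like is given below, assuming the goal is a single
# gzipped JSON file of evidence strings; the real helper in the shared module may differ.
from pyspark.sql import DataFrame


def write_evidence_strings(evidence: DataFrame, output_file: str) -> None:
    """Writes the evidence strings to a compressed JSON file (sketch, assuming a Spark DataFrame input)."""
    (
        evidence
        # Coalesce to a single partition so that a single output file is produced.
        .coalesce(1)
        .write.format('json')
        .mode('overwrite')
        .option('compression', 'org.apache.hadoop.io.compress.GzipCodec')
        .save(output_file)
    )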
def main(
    toxcast: str,
    output: str,
    adverse_events: str,
    safety_risk: str,
    log_file: Optional[str] = None,
):
    """
    This module puts together data from different sources that describe target safety liabilities.

    Args:
        toxcast: Input table containing biological processes associated with relevant targets that have been observed in toxicity assays.
        output: Output gzipped JSON file following the target safety liabilities data model.
        adverse_events: Input TSV containing adverse events associated with targets that have been collected from relevant publications. Fetched from GitHub.
        safety_risk: Input TSV containing cardiovascular safety liabilities associated with targets that have been collected from relevant publications. Fetched from GitHub.
        log_file: Destination of the logs generated by this script. Defaults to None.
    """

    # Logger initializer. If no log_file is specified, logs are written to stderr
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    if log_file:
        logging.config.fileConfig(filename=log_file)
    else:
        logging.StreamHandler(sys.stderr)

    # Initialize spark context
    global spark
    spark = initialize_spark()
    spark.sparkContext.addFile(adverse_events)
    spark.sparkContext.addFile(safety_risk)
    logging.info('Remote files successfully added to the Spark Context.')

    # Load and process the input files into dataframes
    ae_df = process_adverse_events(SparkFiles.get(adverse_events.split('/')[-1]))
    sr_df = process_safety_risk(SparkFiles.get(safety_risk.split('/')[-1]))
    toxcast_df = process_toxcast(toxcast)
    logging.info('Data has been processed. Merging...')

    # Combine dataframes and group evidence.
    # unionByName is used instead of union to account for the differences in the schemas.
    safety_df = (
        ae_df
        .unionByName(sr_df, allowMissingColumns=True)
        .unionByName(toxcast_df, allowMissingColumns=True)
    )

    # Write output
    logging.info('Evidence strings have been processed. Saving...')
    write_evidence_strings(safety_df, output)
    logging.info(f'{safety_df.count()} evidence strings on safety liabilities have been saved to {output}. Exiting.')

    return 0
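# NOTE: A minimal, self-contained illustration of why `unionByName(..., allowMissingColumns=True)` is used
# above instead of a plain `union`: rows are aligned by column name, and any column missing from one side
# is filled with nulls. The column names below are illustrative only, not the real schemas.
from pyspark.sql import SparkSession

demo_spark = SparkSession.builder.master('local[*]').getOrCreate()
ae_toy = demo_spark.createDataFrame([('ENSG00000001', 'heart failure')], ['targetFromSourceId', 'event'])
sr_toy = demo_spark.createDataFrame([('ENSG00000002', 'cardiotoxicity')], ['targetFromSourceId', 'liability'])
ae_toy.unionByName(sr_toy, allowMissingColumns=True).show()
# The result has the columns targetFromSourceId, event and liability; values absent from either
# input frame come out as null.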
def main(input_file: str, output_file: str, cache_dir: str) -> None:

    # Read and process Orphanet's XML file into evidence strings
    orphanet_df = parse_orphanet_xml(input_file, spark)
    logging.info('Orphanet input file has been imported. Processing evidence strings.')

    evidence_df = process_orphanet(orphanet_df)
    evidence_df = add_efo_mapping(
        evidence_strings=evidence_df, spark_instance=spark, ontoma_cache_dir=cache_dir
    )
    logging.info('Disease mappings have been added.')

    # Save data
    write_evidence_strings(evidence_df, output_file)
    logging.info(f'{evidence_df.count()} evidence strings have been saved to {output_file}')
def main(outputFile: str) -> None:

    # Initialize spark session
    spark_mem_limit = detect_spark_memory_limit()
    spark_conf = (
        SparkConf()
        .set('spark.driver.memory', f'{spark_mem_limit}g')
        .set('spark.executor.memory', f'{spark_mem_limit}g')
        .set('spark.driver.maxResultSize', '0')
        .set('spark.debug.maxToStringFields', '2000')
        .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
    )
    spark = (
        SparkSession.builder
        .config(conf=spark_conf)
        .master('local[*]')
        .getOrCreate()
    )

    spark.sparkContext.addFile(TEPURL)

    # Fetching and processing the TEP table, which is then saved as a JSON file:
    TEP_df = (
        spark.read.csv(SparkFiles.get(TEPURL.split('/')[-1]), sep='\t', header=True)
        # Generating TEP url from Gene column: SLC12A4/SLC12A6 -> https://www.thesgc.org/tep/SLC12A4SLC12A6
        .withColumn('url', concat(lit('https://www.thesgc.org/tep/'), regexp_replace(lower(col('Gene')), '/', '')))
        # Exploding TEPs, where multiple genes are given:
        .withColumn('targetFromSourceId', explode(split(col('Gene'), '/')))
        # Renaming columns:
        .withColumnRenamed('Therapeutic Area', 'therapeuticArea')
        .withColumnRenamed('Description', 'description')
        # Dropping columns:
        .drop(*['Gene', 'version', 'Date'])
        .persist()
    )

    logging.info('TEP dataset has been downloaded and formatted.')
    logging.info(f'Number of TEPs: {TEP_df.count()}')
    logging.info(f'Number of unique genes: {TEP_df.select("targetFromSourceId").distinct().count()}')

    # Saving data:
    write_evidence_strings(TEP_df, outputFile)
    logging.info(f'TEP dataset is written to {outputFile}.')
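# NOTE: A small illustration of the split/explode step above, reusing the example from the comment
# (SLC12A4/SLC12A6): a single row carrying two genes becomes two rows, one per targetFromSourceId.
# `spark` here refers to the session created in main(); the dataframe name is hypothetical.
demo_df = spark.createDataFrame([('SLC12A4/SLC12A6',)], ['Gene'])
demo_df.withColumn('targetFromSourceId', explode(split(col('Gene'), '/'))).show()
# Expected output: two rows, with targetFromSourceId equal to 'SLC12A4' and 'SLC12A6' respectively.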
def main(chembl_evidence: str, predictions: str, output_file: str) -> None:
    """
    This module adds the studyStopReasonCategories to the ChEMBL evidence as a result of the categorisation
    of the clinical trial reason to stop.

    Args:
        chembl_evidence: Input gzipped JSON with the evidence submitted by ChEMBL.
        predictions: Input TSV containing the categories of the clinical trial reason to stop.
            Instructions for applying the ML model here: https://github.com/ireneisdoomed/stopReasons.
        output_file: Output gzipped JSON file containing the ChEMBL evidence with the additional
            studyStopReasonCategories field.
    """
    logging.info(f'ChEMBL evidence JSON file: {chembl_evidence}')
    logging.info(f'Classes of reason to stop table: {predictions}')

    # Load input into dataframes
    chembl_df = spark.read.json(chembl_evidence).persist()
    predictions_df = (
        load_stop_reasons_classes(predictions)
        .withColumnRenamed('why_stopped', 'studyStopReason')
        .withColumnRenamed('subclasses', 'studyStopReasonCategories')
        .select('studyStopReason', 'studyStopReasonCategories')
        .distinct()
        .persist()
    )

    # Join datasets
    evd_df = chembl_df.join(predictions_df, on='studyStopReason', how='left').distinct()

    # We expect that roughly 10% of evidence strings have a reason to stop assigned.
    # It is asserted that this fraction is between 8 and 11% of the total count.
    total_count = evd_df.count()
    early_stopped_count = evd_df.filter(col('studyStopReasonCategories').isNotNull()).count()
    if not 0.08 < early_stopped_count / total_count < 0.11:
        raise AssertionError(
            f'The fraction of evidence with a CT reason to stop class is not as expected '
            f'({early_stopped_count / total_count}).'
        )

    # Write output
    logging.info('Evidence strings have been processed. Saving...')
    write_evidence_strings(evd_df, output_file)
    logging.info(f'{total_count} evidence strings have been saved to {output_file}. Exiting.')
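# NOTE: `load_stop_reasons_classes` is not shown in this excerpt. A minimal sketch is given below, assuming
# the predictions file is a TSV with at least the `why_stopped` and `subclasses` columns referenced by the
# renames above; the real loader may perform additional parsing (e.g. turning `subclasses` into an array).
from pyspark.sql import DataFrame


def load_stop_reasons_classes(predictions_path: str) -> DataFrame:
    """Loads the table with the predicted categories for each clinical trial reason to stop (sketch)."""
    return spark.read.csv(predictions_path, sep='\t', header=True)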
def main(input_file: str, output_file: str, cache_dir: str, local: bool = False) -> None:

    # Initialize spark session
    if local:
        sparkConf = (
            SparkConf()
            .set('spark.driver.memory', '15g')
            .set('spark.executor.memory', '15g')
            .set('spark.driver.maxResultSize', '0')
            .set('spark.debug.maxToStringFields', '2000')
            .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
        )
        spark = (
            SparkSession.builder
            .config(conf=sparkConf)
            .master('local[*]')
            .getOrCreate()
        )
    else:
        sparkConf = (
            SparkConf()
            .set('spark.driver.maxResultSize', '0')
            .set('spark.debug.maxToStringFields', '2000')
            .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
        )
        spark = (
            SparkSession.builder
            .config(conf=sparkConf)
            .getOrCreate()
        )

    # Read and process Clingen's table into evidence strings
    clingen_df = read_input_file(input_file, spark_instance=spark)
    logging.info('Gene Validity Curations table has been imported. Processing evidence strings.')

    evidence_df = process_clingen(clingen_df)
    evidence_df = add_efo_mapping(
        evidence_strings=evidence_df, spark_instance=spark, ontoma_cache_dir=cache_dir
    )
    logging.info('Disease mappings have been added.')

    write_evidence_strings(evidence_df, output_file)
    logging.info(f'{evidence_df.count()} evidence strings have been saved to {output_file}')
def main(
    dd_file: str,
    eye_file: str,
    skin_file: str,
    cancer_file: str,
    cardiac_file: str,
    output_file: str,
    cache_dir: str,
    local: bool = False,
) -> None:

    # Initialize spark session
    global spark
    spark_mem_limit = detect_spark_memory_limit()
    spark_conf = (
        SparkConf()
        .set('spark.driver.memory', f'{spark_mem_limit}g')
        .set('spark.executor.memory', f'{spark_mem_limit}g')
        .set('spark.driver.maxResultSize', '0')
        .set('spark.debug.maxToStringFields', '2000')
        .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
    )
    spark = (
        SparkSession.builder
        .config(conf=spark_conf)
        .config('spark.sql.broadcastTimeout', '36000')
        .master('local[*]')
        .getOrCreate()
    )

    # Read and process G2P's tables into evidence strings
    gene2phenotype_df = read_input_file(dd_file, eye_file, skin_file, cancer_file, cardiac_file)
    logging.info('Gene2Phenotype panels have been imported. Processing evidence strings.')

    evidence_df = process_gene2phenotype(gene2phenotype_df)
    evidence_df = add_efo_mapping(
        evidence_strings=evidence_df, ontoma_cache_dir=cache_dir, spark_instance=spark
    )
    logging.info('Disease mappings have been added.')

    # Saving data:
    write_evidence_strings(evidence_df, output_file)
    logging.info(f'{evidence_df.count()} evidence strings have been saved to {output_file}')
    return parser


if __name__ == '__main__':
    args = get_parser().parse_args()

    # Logger initializer. If no log_file is specified, logs are written to stderr
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    if args.log_file:
        logging.config.fileConfig(filename=args.log_file)
    else:
        logging.StreamHandler(sys.stderr)

    spark = initialize_sparksession()

    evd_df = main(
        az_binary_data=args.az_binary_data,
        az_quant_data=args.az_quant_data,
        spark_instance=spark,
    )

    write_evidence_strings(evd_df, args.output)
    logging.info(f'Evidence strings have been saved to {args.output}. Exiting.')
def main(cooccurrences, outputFile):

    # Log parameters:
    logging.info(f'Cooccurrence file: {cooccurrences}')
    logging.info(f'Output file: {outputFile}')
    logging.info('Generating evidence:')

    # Load/filter datasets:
    agg_cooccurrence_df = (
        # Reading file:
        read_path(cooccurrences, spark)
        .repartition(200)
        # Keep only cooccurrences found in the sections of interest:
        .filter(F.col('section').isin(SECTIONS_OF_INTEREST))
        # Casting integer pmid column to string:
        .withColumn('pmid', F.trim(F.col('pmid').cast(StringType())))
        # Dropping pmcid values that violate the schema:
        .withColumn('pmcid', F.when(F.col('pmcid').rlike(r'^PMC\d+$'), F.col('pmcid')))
        # Publication identifier is a pmid if available, otherwise pmcid:
        .withColumn(
            'publicationIdentifier',
            F.when(F.col('pmid').isNull(), F.col('pmcid')).otherwise(F.col('pmid')),
        )
        # Filtering for disease/target cooccurrences:
        .filter(
            (F.col('type') == 'GP-DS')  # Keeping gene/protein - disease cooccurrences
            & F.col('isMapped')  # Keeping mapped cooccurrences only
            & F.col('publicationIdentifier').isNotNull()  # Making sure at least the pmid or the pmcid is given
            & (F.length(F.col('text')) < 600)  # Keeping only sentences shorter than 600 characters
            & ~F.col('label1').isin(EXCLUDED_TARGET_TERMS)  # Excluding target labels found in the exclusion list
        )
        # Renaming columns:
        .withColumnRenamed('keywordId1', 'targetFromSourceId')
        .withColumnRenamed('keywordId2', 'diseaseFromSourceMappedId')
        # Aggregating data by publication, target and disease:
        .groupBy(['publicationIdentifier', 'targetFromSourceId', 'diseaseFromSourceMappedId'])
        .agg(
            F.collect_set(F.col('pmcid')).alias('pmcIds'),
            F.collect_set(F.col('pmid')).alias('literature'),
            F.collect_set(
                F.struct(
                    F.col('text'),
                    F.col('start1').alias('tStart'),
                    F.col('end1').alias('tEnd'),
                    F.col('start2').alias('dStart'),
                    F.col('end2').alias('dEnd'),
                    F.col('section'),
                )
            ).alias('textMiningSentences'),
            F.sum(F.col('evidence_score')).alias('resourceScore'),
        )
        # Nullify pmcIds if the array is empty:
        .withColumn('pmcIds', F.when(F.size('pmcIds') != 0, F.col('pmcIds')))
        # Only evidence with a score above 1 is considered:
        .filter(F.col('resourceScore') > 1)
    )

    # Final formatting and saving data:
    evidence = (
        agg_cooccurrence_df
        # Adding literal columns:
        .withColumn('datasourceId', F.lit('europepmc'))
        .withColumn('datatypeId', F.lit('literature'))
        # Reorder columns:
        .select(
            [
                'datasourceId',
                'datatypeId',
                'targetFromSourceId',
                'diseaseFromSourceMappedId',
                'resourceScore',
                'literature',
                'textMiningSentences',
                'pmcIds',
            ]
        )
    )

    write_evidence_strings(evidence, outputFile)
    logging.info('EPMC disease target evidence saved.')
    logging.info(f'Number of evidence: {agg_cooccurrence_df.count()}')

    # Report on the number of publications, targets, diseases and associations only when the log level is
    # set to debug, to avoid the extra computation otherwise:
    logging.debug(f"Number of publications: {agg_cooccurrence_df.select(F.col('publicationIdentifier')).count()}")
    logging.debug(
        f"Number of publications without pubmed ID: "
        f"{agg_cooccurrence_df.filter(F.col('publicationIdentifier').contains('PMC')).select('publicationIdentifier').distinct().count()}"
    )
    logging.debug(f"Number of targets: {evidence.select(F.col('targetFromSourceId')).distinct().count()}")
    logging.debug(f"Number of diseases: {evidence.select(F.col('diseaseFromSourceMappedId')).distinct().count()}")
    logging.debug(
        f"Number of associations: "
        f"{evidence.select(F.col('diseaseFromSourceMappedId'), F.col('targetFromSourceId')).dropDuplicates().count()}"
    )
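# NOTE: A small, self-contained illustration of the pmcIds nullification step above: `F.when` without an
# `.otherwise` clause yields null for non-matching rows, so empty arrays produced by `collect_set` are
# turned into nulls rather than being serialised as `[]` in the output. `spark` and `F` refer to the
# module-level session and functions import; the dataframe name is hypothetical.
demo_df = spark.createDataFrame([(['PMC12345'],), ([],)], 'pmcIds: array<string>')
demo_df.withColumn('pmcIds', F.when(F.size('pmcIds') != 0, F.col('pmcIds'))).show()
# Expected output: the first row keeps its array, the second row becomes null.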
def main(desc_file, evid_file, cell_file, out_file):

    sparkConf = (
        SparkConf()
        .set('spark.driver.memory', '15g')
        .set('spark.executor.memory', '15g')
        .set('spark.driver.maxResultSize', '0')
        .set('spark.debug.maxToStringFields', '2000')
        .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
    )
    spark = (
        SparkSession.builder
        .config(conf=sparkConf)
        .master('local[*]')
        .getOrCreate()
    )

    # Log parameters:
    logging.info(f'Evidence file: {evid_file}')
    logging.info(f'Description file: {desc_file}')
    logging.info(f'Cell type annotation: {cell_file}')
    logging.info(f'Output file: {out_file}')

    # Read files:
    evidence_df = (
        spark.read.csv(evid_file, sep='\t', header=True)
        .drop('pmid', 'gene_set_name', 'disease_name')
    )
    cell_lines_df = spark.read.csv(cell_file, sep='\t', header=True)
    description_df = spark.read.csv(desc_file, sep='\t', header=True)

    # Logging dataframe stats:
    logging.info(f'Number of evidence: {evidence_df.count()}')
    logging.info(f'Number of descriptions: {description_df.count()}')
    logging.info(f'Number of cell/tissue annotations: {cell_lines_df.count()}')

    # Tissues and cancer types are annotated together in the same column (tissue_or_cancer_type).
    # To disambiguate one from the other, the column is joined with the cell lines annotation,
    # first on the tissue level:
    tissue_desc = (
        description_df
        .withColumnRenamed('tissue_or_cancer_type', 'tissue')
        .join(cell_lines_df, on='tissue', how='inner')
    )

    # And then on the disease level:
    cell_desc = (
        description_df
        .withColumnRenamed('tissue_or_cancer_type', 'diseaseFromSource')
        .join(cell_lines_df, on='diseaseFromSource', how='inner')
    )

    merged_annotation = (
        # Concatenating the above generated dataframes:
        cell_desc.union(tissue_desc)
        # Aggregating by disease and method:
        .groupBy('diseaseFromSource', 'efo_id', 'method')
        # The cell annotation is aggregated in a list of structs:
        .agg(
            collect_set(
                struct(col('name'), col('id'), col('tissue'), col('tissueId'))
            ).alias('diseaseCellLines')
        )
        .drop('method')
    )

    # Joining merged annotation with evidence:
    pooled_evidence_df = (
        evidence_df.select(
            col('target_id').alias('targetFromSourceId'),
            col('disease_id').alias('efo_id'),
            col('score').alias('resourceScore').cast(FloatType()),
        )
        # Some of the target identifiers are not Ensembl gene IDs - replace them:
        .replace(to_replace=CRISPR_SYMBOL_MAPPING, subset=['targetFromSourceId'])
        # Merging with descriptions:
        .join(merged_annotation, on='efo_id', how='outer')
        # From the EFO URI, generate the EFO id:
        .withColumn('diseaseFromSourceMappedId', element_at(split(col('efo_id'), '/'), -1))
        .drop('efo_id')
        # Adding constants:
        .withColumn('datasourceId', lit('crispr'))
        .withColumn('datatypeId', lit('affected_pathway'))
        .persist()
    )

    logging.info(f'Saving {pooled_evidence_df.count()} CRISPR evidence in JSON format, to: {out_file}')

    write_evidence_strings(pooled_evidence_df, out_file)
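# NOTE: A small illustration of the aggregation pattern used above to build `diseaseCellLines`: grouping by
# disease and collecting the cell line annotation as a set of structs. The values and column subset are
# illustrative only; `spark` refers to the session created in main().
demo_df = spark.createDataFrame(
    [('neuroblastoma', 'CELL_LINE_A', 'MODEL_1'), ('neuroblastoma', 'CELL_LINE_B', 'MODEL_2')],
    ['diseaseFromSource', 'name', 'id'],
)
demo_df.groupBy('diseaseFromSource').agg(
    collect_set(struct(col('name'), col('id'))).alias('diseaseCellLines')
).show(truncate=False)
# Expected output: one row for 'neuroblastoma' with an array of two {name, id} structs.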
def generate_panelapp_evidence(self, input_file: str, output_file: str, cache_dir: str) -> None:

    logging.info('Filter and extract the necessary columns.')
    panelapp_df = self.spark.read.csv(input_file, sep=r'\t', header=True)
    # Panel version can be either a single number (e.g. 1), or two numbers separated by a dot (e.g. 3.14).
    # We cast either representation to float to ensure correct filtering below. (Note that conversion to
    # float would not work in the general case, because 3.4 > 3.14, but we only need to compare relative
    # to 1.0.)
    panelapp_df = panelapp_df.withColumn(
        'Panel Version', panelapp_df['Panel Version'].cast('float').alias('Panel Version')
    )
    panelapp_df = (
        panelapp_df
        .filter(
            ((col('List') == 'green') | (col('List') == 'amber'))
            & (col('Panel Version') >= 1.0)
            & (col('Panel Status') == 'PUBLIC')
        )
        .select('Symbol', 'Panel Id', 'Panel Name', 'List', 'Mode of inheritance', 'Phenotypes')
        # The full original records are not redundant; however, uniqueness is not guaranteed on this
        # subset of fields, hence the distinct().
        .distinct()
    )

    logging.info('Fix typos and formatting errors which would interfere with phenotype splitting.')
    panelapp_df = panelapp_df.withColumn('cleanedUpPhenotypes', col('Phenotypes'))
    for regexp, replacement in self.PHENOTYPE_BEFORE_SPLIT_RE.items():
        panelapp_df = panelapp_df.withColumn(
            'cleanedUpPhenotypes', regexp_replace(col('cleanedUpPhenotypes'), regexp, replacement)
        )

    logging.info('Split and explode the phenotypes.')
    panelapp_df = (
        panelapp_df
        .withColumn('cohortPhenotypes', array_distinct(split(col('cleanedUpPhenotypes'), ';')))
        .withColumn('phenotype', explode(col('cohortPhenotypes')))
    )

    logging.info('Remove specific patterns and phrases which will interfere with ontology extraction and mapping.')
    panelapp_df = panelapp_df.withColumn('diseaseFromSource', col('phenotype'))
    for regexp in self.PHENOTYPE_AFTER_SPLIT_RE:
        panelapp_df = panelapp_df.withColumn(
            'diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({regexp})', '')
        )

    logging.info('Extract ontology information, clean up and filter the split phenotypes.')
    panelapp_df = (
        panelapp_df

        # Extract Orphanet/MONDO/HP ontology identifiers and remove them from the phenotype string.
        .withColumn('ontology_namespace', regexp_extract(col('diseaseFromSource'), self.OTHER_RE, 1))
        .withColumn('ontology_namespace', regexp_replace(col('ontology_namespace'), 'OrphaNet: ORPHA', 'ORPHA'))
        .withColumn('ontology_id', regexp_extract(col('diseaseFromSource'), self.OTHER_RE, 2))
        .withColumn(
            'ontology',
            when(
                (col('ontology_namespace') != '') & (col('ontology_id') != ''),
                concat(col('ontology_namespace'), lit(':'), col('ontology_id'))
            )
        )
        .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({self.OTHER_RE})', ''))

        # Extract OMIM identifiers and remove them from the phenotype string.
        .withColumn('omim_id', regexp_extract(col('diseaseFromSource'), self.OMIM_RE, 2))
        .withColumn('omim', when(col('omim_id') != '', concat(lit('OMIM:'), col('omim_id'))))
        .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({self.OMIM_RE})', ''))

        # Choose one of the ontology identifiers, keeping OMIM as a priority.
        .withColumn('diseaseFromSourceId', when(col('omim').isNotNull(), col('omim')).otherwise(col('ontology')))
        .drop('ontology_namespace', 'ontology_id', 'ontology', 'omim_id', 'omim')

        # Clean up the final split phenotypes.
        .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), r'\(\)', ''))
        .withColumn('diseaseFromSource', trim(col('diseaseFromSource')))
        .withColumn('diseaseFromSource', when(col('diseaseFromSource') != '', col('diseaseFromSource')))

        # Remove low quality records, where the phenotype string starts with a question mark.
        .filter(
            ~(
                (col('diseaseFromSource').isNotNull())
                & (col('diseaseFromSource').startswith('?'))
            )
        )
        # Remove duplication caused by cases where multiple phenotypes within the same record fail to
        # generate any phenotype string or ontology identifier.
        .distinct()
        # For records where we were unable to determine either a phenotype string or an ontology
        # identifier, substitute the panel name instead.
        .withColumn(
            'diseaseFromSource',
            when(
                (col('diseaseFromSource').isNull()) & (col('diseaseFromSourceId').isNull()),
                col('Panel Name')
            )
            .otherwise(col('diseaseFromSource'))
        )
        .persist()
    )

    logging.info('Fetch and join literature references.')
    all_panel_ids = panelapp_df.select('Panel Id').toPandas()['Panel Id'].unique()
    literature_references = self.fetch_literature_references(all_panel_ids)
    panelapp_df = panelapp_df.join(literature_references, on=['Panel Id', 'Symbol'], how='left')

    if self.debug_output_phenotypes_filename:
        logging.info('Output tables for debugging purposes, if requested.')
        (
            panelapp_df
            .select(
                'Phenotypes',           # Original, unaltered string with all phenotypes.
                'cleanedUpPhenotypes',  # String with phenotypes after pre-split cleanup.
                'phenotype',            # Individual phenotype after splitting.
                'diseaseFromSource',    # Final cleaned up disease name.
                'diseaseFromSourceId',  # Final cleaned up disease ID.
            )
            .distinct()
            .toPandas()
            .to_csv(self.debug_output_phenotypes_filename, sep='\t', index=False)
        )

    logging.info('Drop unnecessary fields and populate the final evidence string structure.')
    evidence_df = (
        panelapp_df
        .drop('Phenotypes', 'cleanedUpPhenotypes', 'phenotype')
        # allelicRequirements requires a list, but we always only have one value from PanelApp.
        .withColumn(
            'allelicRequirements',
            when(col('Mode of inheritance').isNotNull(), array(col('Mode of inheritance')))
        )
        .drop('Mode of inheritance')
        .withColumnRenamed('List', 'confidence')
        .withColumn('datasourceId', lit('genomics_england'))
        .withColumn('datatypeId', lit('genetic_literature'))
        # diseaseFromSourceId populated above
        # literature populated above
        .withColumnRenamed('Panel Id', 'studyId')
        .withColumnRenamed('Panel Name', 'studyOverview')
        .withColumnRenamed('Symbol', 'targetFromSourceId')
        # Some residual duplication is caused by slightly different representations from `cohortPhenotypes`
        # being cleaned up to the same representation in `diseaseFromSource`, for example
        # "Pontocerebellar hypoplasia type 2D (613811)" and "Pontocerebellar hypoplasia type 2D, 613811".
        .distinct()
    )
    evidence_df = add_efo_mapping(
        evidence_strings=evidence_df, spark_instance=self.spark, ontoma_cache_dir=cache_dir
    )
    logging.info('Disease mappings have been added.')

    write_evidence_strings(evidence_df, output_file)
    logging.info(f'{evidence_df.count()} evidence strings have been saved to {output_file}')
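# NOTE: `fetch_literature_references` is not shown in this excerpt. A minimal sketch is given below,
# assuming the public PanelApp API is queried once per panel and that publications are listed per gene in
# the response; the endpoint and the response field names (`genes`, `gene_data.gene_symbol`, `publications`)
# are assumptions, and the real implementation may additionally validate that entries are PubMed IDs.
def fetch_literature_references(self, all_panel_ids):
    """Returns a dataframe with ('Panel Id', 'Symbol', 'literature') rows collected from the PanelApp API (sketch)."""
    import requests
    from pyspark.sql.types import ArrayType, StringType, StructField, StructType

    rows = []
    for panel_id in all_panel_ids:
        response = requests.get(f'https://panelapp.genomicsengland.co.uk/api/v1/panels/{panel_id}/')
        response.raise_for_status()
        for gene in response.json().get('genes', []):
            rows.append((panel_id, gene['gene_data']['gene_symbol'], gene.get('publications') or []))
    schema = StructType([
        StructField('Panel Id', StringType()),
        StructField('Symbol', StringType()),
        StructField('literature', ArrayType(StringType())),
    ])
    return self.spark.createDataFrame(rows, schema=schema)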