def get_annotation(var_df, ds_name, input_fp=None, output_prefix='',
                   output_dir=None, output_fp=None, filter_condition=None,
                   reference_genome=None):
    """
    Add (or also run) FatHMM for the variants in the given dataframe.

    Existing input/output files are reused; delete them to force a rerun.

    :param var_df: pandas dataframe with variants
    :param ds_name: name of dataset, possibly related to given filter;
                    used for default naming of input and output files
    :param input_fp: absolute path to input file
    :param output_prefix: should be None (unused for FatHMM)
    :param output_dir: should be None; defaults to the current directory
    :param output_fp: absolute path to output file
    :param filter_condition: select only a subset of given dataframe,
                             for example a cancer type or a subject
    :param reference_genome: no reference genome for FatHMM (unused)
    :return: extended pandas dataframe with FATHMM scores
    """
    # restrict to functional variants, optionally narrowed by the caller's mask
    if filter_condition is None:
        select_indices = np.where(var_df[FUNC_COL])[0]
    else:
        select_indices = np.where(filter_condition & var_df[FUNC_COL])[0]

    if len(select_indices) == 0:
        logger.warning(
            'No variants selected for FATHMM analysis of case {}!'.format(
                ds_name))

    if output_dir is None:
        output_dir = os.path.join('.')

    # derive default file locations from the dataset name
    if input_fp is None:
        input_fp = os.path.join(
            output_dir, '{}{}'.format(ds_name, FatHMM.INPUT_SUFFIX))
    if output_fp is None:
        output_fp = os.path.join(
            output_dir, '{}{}'.format(ds_name, FatHMM.OUTPUT_SUFFIX))

    # only (re)generate the input file if it does not exist yet
    if not os.path.isfile(input_fp):
        FatHMM.generate_input_file(input_fp, var_df.iloc[select_indices])

    if not os.path.isfile(output_fp):
        # run FatHMM
        FatHMM.run(os.path.abspath(input_fp), os.path.abspath(output_fp))

    fathmm_df = FatHMM.read_results(output_fp)
    # add FATHMM results to the dataframe
    add_column(var_df, select_indices, FATHMM_KEY_COL, FatHMM.SCORE_COL,
               fathmm_df, as_value=True, sub_key=FatHMM.SCORE_COL,
               as_float=True)

    # sanity check: warn if far fewer FATHMM scores than FATHMM IDs were
    # obtained (threshold: 20% of the IDs plus up to three more)
    if (var_df.loc[select_indices][FatHMM.SCORE_COL].count()
            < 0.2 * var_df.loc[select_indices][FATHMM_KEY_COL].count()
            + min(3.0,
                  var_df.loc[select_indices][FATHMM_KEY_COL].count() / 2.0)):
        logger.warning(
            'Only {} FATHMM predictions found for case {} but {} IDs.'.format(
                var_df.loc[select_indices][FatHMM.SCORE_COL].count(), ds_name,
                var_df.loc[select_indices][FATHMM_KEY_COL].count()))

    return var_df
def get_annotation(var_df, ds_name, input_fp=None, output_prefix='',
                   output_dir=None, output_fp=None, filter_condition=None,
                   reference_genome='hg19'):
    """
    Add (or also run) CanDrA for the given dataframe.

    Existing input/output files are reused; delete them to force a rerun.

    :param var_df: pandas dataframe
    :param ds_name: name of dataset, possibly related to given filter;
                    used for default naming of input and output files
    :param input_fp: path to input file
    :param output_prefix: output naming prefix (unused here)
    :param output_dir: path to output directory; defaults to the current
                       directory
    :param output_fp: if output_fp is given then CanDrA will not be run and
                      it is assumed to exist already
    :param filter_condition: select only a subset of given dataframe,
                             for example a cancer type or a subject
    :param reference_genome: human reference genome, e.g. hg19 or hg20
    :return: extended pandas dataframe with CanDrA scores
    :raises RuntimeError: if no unique cancer type can be inferred for the
                          selected variants (CanDrA needs exactly one)
    """
    # restrict to functional variants, optionally narrowed by the caller's mask
    if filter_condition is None:
        select_indices = np.where(var_df[FUNC_COL])[0]
    else:
        select_indices = np.where(filter_condition & var_df[FUNC_COL])[0]

    if len(select_indices) == 0:
        logger.warning(
            'No variants selected for CanDrA analysis of case {}!'.format(
                ds_name))

    if output_dir is None:
        output_dir = os.path.join('.')

    # derive default file locations from the dataset name
    if input_fp is None:
        input_fp = os.path.join(
            output_dir, '{}{}'.format(ds_name, Candra.INPUT_SUFFIX))
    if output_fp is None:
        output_fp = os.path.join(
            output_dir, '{}{}'.format(ds_name, Candra.OUTPUT_SUFFIX))

    if not os.path.isfile(input_fp):
        Candra.generate_input_file(input_fp, var_df.iloc[select_indices],
                                   chromosome_col='Chromosome',
                                   position_col='StartPosition',
                                   reference_col='ReferenceAllele',
                                   alternate_col='AlternateAllele')

    if not os.path.isfile(output_fp):
        # run CanDrA; it is cancer-type specific, so exactly one cancer type
        # must be present among the selected variants
        if len(var_df.iloc[select_indices][CT_COL].unique()) != 1:
            # var_df[var_df.Subject == sub_name][CT_COL].unique()
            raise RuntimeError(
                'No cancer type could be inferred for subject {}: {} {}'.
                format(ds_name, var_df.iloc[select_indices][CT_COL].unique(),
                       len(var_df.iloc[select_indices][CT_COL].unique())))

        ct = var_df.iloc[select_indices][CT_COL].unique()[0]
        Candra.run(os.path.abspath(input_fp), os.path.abspath(output_fp), ct)

    candra_df = Candra.read_results(output_fp)
    # add CanDrA results to the dataframe
    Candra.add_amino_acid_change_column(var_df, candra_df)
    add_column(var_df, select_indices, NT_VAR_COL, Candra.LIFD_SCORE_COL,
               candra_df, as_value=True, sub_key=Candra.SCORE_COL,
               as_float=True)
    add_column(var_df, select_indices, NT_VAR_COL, Candra.LIFD_CATEGORY_COL,
               candra_df, as_value=True, sub_key=Candra.CATEGORY_COL)
    # normalize string 'nan' entries to real NaN values
    # NOTE(review): hard-coded column name — presumably the same as
    # Candra.LIFD_CATEGORY_COL added above; verify against the Candra class
    var_df['CanDrA_clf'].replace('nan', NAN, inplace=True)
    add_column(var_df, select_indices, NT_VAR_COL,
               Candra.LIFD_SIGNIFICANCE_COL, candra_df, as_value=True,
               sub_key=Candra.SIGNIFICANCE_COL, as_float=True)

    return var_df
def get_annotation(var_df, ds_name, input_fp=None, output_prefix='',
                   output_dir=None, output_fp=None, filter_condition=None,
                   reference_genome='hg19'):
    """
    Add (or also run) CRAVAT and CHASMplus for the given dataframe.

    Existing input/output files are reused; delete them to force a rerun.

    :param var_df: pandas dataframe
    :param ds_name: name of dataset, possibly related to given filter;
                    used for default naming of input and output files
    :param input_fp: path to input file
    :param output_prefix: output naming prefix
    :param output_dir: path to output directory; defaults to the current
                       directory
    :param output_fp: if output_fp is given then CRAVAT will not be run and
                      it is assumed to exist already
    :param filter_condition: select only a subset of given dataframe,
                             for example a cancer type or a subject
    :param reference_genome: human reference genome, e.g. hg19 or hg20
    :return: extended pandas dataframe with CHASMplus pancancer and cancer
             type specific scores and p-values
    :raises RuntimeError: if no unique cancer type can be inferred or the
                          CRAVAT output file is missing
    """
    # CRAVAT cannot convert genomic positions for mitochondrial DNA,
    # hence MT variants are always excluded
    if filter_condition is None:
        select_indices = np.where(
            var_df[FUNC_COL] & (var_df.Chromosome != 'MT'))[0]
    else:
        select_indices = np.where(
            filter_condition & var_df[FUNC_COL]
            & (var_df.Chromosome != 'MT'))[0]

    if len(select_indices) == 0:
        logger.warning(
            'No variants selected for CRAVAT analysis of dataset {}!'.format(
                ds_name))

    if output_dir is None:
        output_dir = os.path.join('.')

    if input_fp is None:
        input_fp = os.path.join(
            output_dir, '{}{}'.format(ds_name, Cravat.INPUT_SUFFIX))

    if not os.path.isfile(os.path.abspath(input_fp)):
        Cravat.generate_input_file(
            input_fp, var_df.iloc[select_indices], chromosome_col=CHR_COL,
            position_col=POS_START_COL, reference_col=REF_COL,
            alternate_col=ALT_COL, subject_col='Subject')
    else:
        logger.debug(
            'CRAVAT input file for dataset {} already exists.'.format(
                ds_name))

    # infer the (single) cancer type of the selected variants
    # (a second, unreachable duplicate of this check — dead behind this
    # raise, with a broken f-string in its log call — was removed)
    cts = var_df.iloc[select_indices][CT_COL].unique()
    if len(cts) != 1:
        raise RuntimeError(
            'No cancer type could be inferred for dataset {}: {}'.format(
                ds_name, cts))
    ct = cts[0]

    if output_fp is None:
        output_fp = os.path.join(output_dir,
                                 f'{ds_name}{Cravat.OUTPUT_SUFFIX}')

    if not os.path.isfile(output_fp):
        # run CRAVAT
        Cravat.run(os.path.abspath(input_fp), output_dir, ct,
                   prefix=output_prefix, sub_name=ds_name,
                   reference_genome=reference_genome)
    else:
        # fixed: the placeholder was missing ('...: '.format(output_fp)),
        # so the path never appeared in the log message
        logger.debug(
            'CRAVAT output file already exists: {}'.format(output_fp))

    if os.path.exists(output_fp) and os.path.isfile(output_fp):
        if output_fp.endswith('.xlsx'):
            cravat_df = Cravat.read_results(
                os.path.abspath(output_fp), cancer_type=ct,
                reference_genome=reference_genome)
            logger.debug('Read cravat results with {} entries: {}'.format(
                len(cravat_df), output_fp))

            # pancancer CHASMplus p-value and score
            add_column(var_df, select_indices, NT_VAR_COL, Cravat.CP_COL,
                       cravat_df, sub_key=Cravat.CP_COL, as_float=True)
            add_column(var_df, select_indices, NT_VAR_COL,
                       Cravat.CP_SCORE_COL, cravat_df,
                       sub_key=Cravat.CP_SCORE_COL, as_float=True)

            # cancer-type-specific CHASMplus columns
            if ct == 'PANCAN':
                # no specific cancer type: reuse the pancancer results
                add_column(var_df, select_indices, NT_VAR_COL,
                           Cravat.CP_CT_COL, cravat_df,
                           sub_key=Cravat.CP_COL, as_float=True)
                add_column(var_df, select_indices, NT_VAR_COL,
                           Cravat.CP_SCORE_CT_COL, cravat_df,
                           sub_key=Cravat.CP_SCORE_COL, as_float=True)
            elif ct is not None and not (isinstance(ct, float)
                                         and np.isnan(ct)):
                # substitute the concrete cancer type into the column keys
                add_column(var_df, select_indices, NT_VAR_COL,
                           Cravat.CP_CT_COL, cravat_df,
                           sub_key=Cravat.CP_CT_COL.replace('CT', ct),
                           as_float=True)
                add_column(var_df, select_indices, NT_VAR_COL,
                           Cravat.CP_SCORE_CT_COL, cravat_df,
                           sub_key=Cravat.CP_SCORE_CT_COL.replace('CT', ct),
                           as_float=True)
            else:
                logger.warning(
                    'No cancer type given for dataset {}: {}'.format(
                        ds_name, ct))

            # expected minimum fraction of annotated variants; skip the
            # check entirely for very small selections
            if len(select_indices) < 10:
                min_fraction = 0.0
            else:
                min_fraction = 1.0
            if 'MutationEffect' in var_df.columns:
                df = var_df.iloc[select_indices, :]
                # CHASMplus only scores substitutions
                min_fraction *= 0.9 * len(
                    df[df.MutationEffect == 'Substitution']) / len(
                        select_indices)
            else:
                min_fraction *= 0.3

            # check if at least a minimum number of predictions were found
            assert (var_df.iloc[select_indices, :][Cravat.CP_COL].count()
                    > min_fraction * len(select_indices)), \
                'Only {} Cravat/CHASMplus predictions found for dataset {} but {} indices.'.format(
                    var_df.iloc[select_indices, :][Cravat.CP_COL].count(),
                    ds_name, len(select_indices))

        elif output_fp.endswith('.tsv'):
            cravat_vars = Cravat.read_results(
                os.path.abspath(output_fp),
                reference_genome=reference_genome)
            # add CHASM results to the dataframe
            add_column(var_df, select_indices, NT_VAR_COL, 'Chasm',
                       cravat_vars, sub_key=0, as_float=True)
            # fixed: previously indexed with var_df[filter_condition], which
            # crashes when filter_condition is None; count over the rows that
            # were actually selected for annotation instead
            assert (var_df.iloc[select_indices, :]['Chasm'].count()
                    > 0.3 * len(select_indices)), \
                'Only {} Cravat/Chasm predictions found for dataset {} but {} indices.'.format(
                    var_df.iloc[select_indices, :]['Chasm'].count(), ds_name,
                    len(select_indices))
    else:
        logger.error(
            'Missing cravat/chasm output for dataset {}! No file {}'.format(
                ds_name, output_fp))
        raise RuntimeError(
            'Missing cravat/chasm output for dataset {}! No file {}'.format(
                ds_name, output_fp))

    return var_df
def get_annotation(var_df, ds_name, input_fp=None, output_prefix='',
                   output_dir=None, output_fp=None, filter_condition=None,
                   reference_genome='hg19'):
    """
    Add CGI analysis results for a whole dataset or a given subject's data.

    Existing input/output files are reused; delete them to force a rerun.

    :param var_df: pandas dataframe
    :param ds_name: name of dataset, possibly related to given filter;
                    used for default naming of input and output files
    :param input_fp: path to input file
    :param output_prefix: output naming prefix (unused here)
    :param output_dir: path to output directory; defaults to the current
                       directory
    :param output_fp: path to expected output file (a CGI result zip)
    :param filter_condition: select only a subset of given dataframe,
                             for example CGI could be run separately for
                             different cancer types or subjects
    :param reference_genome: human reference genome, e.g. hg19
    :return: extended pandas dataframe with columns 'cadd_phred',
             'CGI_driver', 'CGI_known_driver', 'CGI_predicted_driver',
             'CGI_driver_gene', 'CGI_driver_gene_source', 'CGI_gene_role',
             'CGI_driver_mut_prediction'
    """
    # restrict to functional variants, optionally narrowed by the caller's mask
    if filter_condition is None:
        select_indices = np.where(var_df[FUNC_COL])[0]
    else:
        select_indices = np.where(filter_condition & var_df[FUNC_COL])[0]

    if output_dir is None:
        output_dir = os.path.join('.')
    if input_fp is None:
        input_fp = os.path.join(output_dir,
                                '{}{}'.format(ds_name, Cgi.INPUT_SUFFIX))
    if output_fp is None:
        output_fp = os.path.join(output_dir,
                                 '{}{}'.format(ds_name, Cgi.OUTPUT_SUFFIX))

    if not os.path.isfile(os.path.abspath(input_fp)):
        Cgi.generate_input_file(input_fp, var_df.iloc[select_indices])

    job_id = None
    if not os.path.isfile(output_fp) and os.path.isfile(input_fp):
        ct = var_df.iloc[select_indices][CT_COL].unique()[0]
        # CGI uses 'CANCER' as its generic pancancer identifier
        if ct == 'PANCAN':
            ct = 'CANCER'
        job_id = Cgi.run(os.path.abspath(input_fp),
                         os.path.abspath(output_fp), ct, ds_name)

    # CGI delivers its results as a zip archive
    if os.path.exists(output_fp) and zipfile.is_zipfile(output_fp):
        cgi_vars_df = Cgi.read_results(os.path.abspath(output_fp))

        # add CGI results to the dataframe
        # interpreting CADD scores:
        # >30 very high, >25 high, >20 medium, >10 low, <=10 very low
        add_column(var_df, select_indices, NT_VAR_COL, 'cadd_phred',
                   cgi_vars_df, sub_key='cadd_phred', as_float=True)
        # three columns for whether a variant is a known or predicted
        # driver, or other (passenger)
        add_column(var_df, select_indices, NT_VAR_COL, Cgi.DRIVER_COL,
                   cgi_vars_df, sub_key='driver', as_float=False)
        add_column(var_df, select_indices, NT_VAR_COL, Cgi.KNOWN_DRIVER_COL,
                   cgi_vars_df[cgi_vars_df.driver == 'known'],
                   sub_key='driver', as_value=False)
        add_column(var_df, select_indices, NT_VAR_COL,
                   Cgi.PREDICTED_DRIVER_COL,
                   cgi_vars_df[cgi_vars_df.driver != 'other'],
                   sub_key='driver', as_value=False)
        add_column(var_df, select_indices, NT_VAR_COL, Cgi.DRIVER_GENE_COL,
                   cgi_vars_df, sub_key='driver_gene', as_float=False)
        add_column(var_df, select_indices, NT_VAR_COL, Cgi.SOURCE_COL,
                   cgi_vars_df, sub_key='driver_gene_source', as_float=False)
        add_column(var_df, select_indices, NT_VAR_COL, Cgi.GENE_ROLE_COL,
                   cgi_vars_df, sub_key='gene_role', as_float=False)
        add_column(var_df, select_indices, NT_VAR_COL,
                   Cgi.DRIVER_MUT_PREDICTION_COL, cgi_vars_df,
                   sub_key='driver_mut_prediction', as_float=False)

        # TODO: re-enable this sanity check
        # assert var_df['CGI_driver'].count() > 0.3 * len(select_indices), \
        #     'Only {} CGI predictions found for case {} but {} variants.'.format(
        #         var_df['CGI_driver'].count(), ds_name, len(select_indices))

        # delete the CGI request once its results have been processed
        if job_id is not None:
            r = requests.delete('{}/{}'.format(Cgi.URL, job_id),
                                headers=Cgi.HEADERS)
            r.json()
    else:
        logger.warning(
            f'Missing CGI output for case {ds_name} with input file: {input_fp}'
        )
        # try to recover the job id of the most recent submission for this
        # dataset and log its status for debugging
        # fixed: the membership test was inverted ('not in'), which raised a
        # KeyError whenever this fallback path was taken
        existing_jobs = Cgi._get_existing_jobs()
        if ds_name in existing_jobs.keys():
            most_recent_date = sorted(existing_jobs[ds_name].keys())[-1]
            job_id = existing_jobs[ds_name][most_recent_date]['id']
            job_info = Cgi.get_job_info(job_id)
            logger.warning(job_info)

    return var_df
def get_annotation(var_df, ds_name, input_fp=None, output_prefix='',
                   output_dir=None, output_fp=None, filter_condition=None,
                   reference_genome='hg19'):
    """
    Run VEP for the given subject's data and add the results to the dataframe.

    Existing input/output files are reused; delete them to force a rerun.

    :param var_df: pandas dataframe with input data
    :param ds_name: name of dataset, possibly related to given filter;
                    used for default naming of input and output files
    :param input_fp: path to VEP input file
    :param output_prefix: output naming prefix (unused here)
    :param output_dir: path to output directory; defaults to the current
                       directory
    :param output_fp: path to VEP output file
    :param filter_condition: select only a subset of given dataframe
    :param reference_genome: human reference genome, e.g. hg19 or hg20
    :return: extended pandas dataframe with columns 'PolyPhen', 'Sift',
             'impact', 'consequence', 'FATHMM_ID'
    :raises RuntimeError: if the VEP output file yields no results
    """
    # restrict to functional variants, optionally narrowed by the caller's mask
    if filter_condition is None:
        select_indices = np.where(var_df[FUNC_COL])[0]
    else:
        select_indices = np.where(filter_condition & (var_df[FUNC_COL]))[0]

    if output_dir is None:
        output_dir = os.path.join('.')
    # derive default file locations from the dataset name
    if input_fp is None:
        input_fp = os.path.join(output_dir,
                                '{}{}'.format(ds_name, Vep.INPUT_SUFFIX))
    if output_fp is None:
        output_fp = os.path.join(output_dir,
                                 '{}{}'.format(ds_name, Vep.OUTPUT_SUFFIX))

    if not os.path.isfile(input_fp):
        Vep.generate_input_file(input_fp, var_df.iloc[select_indices],
                                chromosome_col='Chromosome',
                                gene_col='GeneSymbol',
                                start_col='StartPosition',
                                end_col='EndPosition',
                                reference_col='ReferenceAllele',
                                alternate_col='AlternateAllele')

    if not os.path.isfile(output_fp) and os.path.isfile(input_fp):
        Vep.run(os.path.abspath(input_fp), os.path.abspath(output_fp),
                reference_genome=reference_genome)

    # Extract relevant information from VEP files and generate FATHMM input
    # files if necessary
    if os.path.isfile(output_fp):
        vep_df = Vep.read_results(output_fp)
        if vep_df is not None:
            # polyphen
            add_column(var_df, select_indices, NT_VAR_COL, PP_SCORE_COL,
                       vep_df, sub_key=PP_SCORE_COL, as_value=True,
                       as_float=True)
            # SIFT
            add_column(var_df, select_indices, NT_VAR_COL, SIFT_SCORE_COL,
                       vep_df, sub_key=SIFT_SCORE_COL, as_value=True,
                       as_float=True)
            # predicted variant impact
            add_column(var_df, select_indices, NT_VAR_COL, Vep.IMPACT_COL,
                       vep_df, sub_key=Vep.IMPACT_COL, as_value=True)
            # add FATHMM protein IDs only if a previous annotation step has
            # not created the column already
            if FATHMM_KEY_COL not in var_df.columns:
                add_column(var_df, select_indices, NT_VAR_COL,
                           FATHMM_KEY_COL, vep_df, sub_key=FATHMM_KEY_COL,
                           as_value=True)
                # normalize string 'nan' entries to real NaN values
                var_df.replace({FATHMM_KEY_COL: 'nan'}, np.nan, inplace=True)
            # assert var_df[filter_condition][Vep.IMPACT_COL].count() > 0.3 * len(select_indices) - 2, \
            #     'Only {} VEP impact predictions found for case {} but {} indices.'.format(
            #         var_df[filter_condition][Vep.IMPACT_COL].count(), ds_name, len(select_indices))
        else:
            logger.error('No VEP results for case {}.'.format(ds_name))
            raise RuntimeError(
                'No VEP results for case {}. '.format(ds_name))

    return var_df