def reclassify_mutations(self):
    """Score candidate SSNVs (and indels) at the TiN estimate and rescue likely somatic calls.

    Side effects on ``self.SSNVs`` (modified in place):

    * ``p_somatic_given_TiN`` -- numerator / denominator, where the numerator is
      ``p_somatic * p_TiN_given_S`` and the denominator adds
      ``(1 - p_somatic) * p_TiN_given_G``. When
      ``self.input.weighted_classification == True`` both terms are summed over
      every TiN bin whose joint posterior exceeds 0.001, weighted by that
      posterior; otherwise only the bin ``self.TiN_int`` is used.
    * ``p_outlier`` -- ``rv_normal_af.cdf`` evaluated at the expected normal
      allele fraction (``tumor_f * CN_ratio[:, TiN_int]``) plus 0.01.
    * ``judgement`` -- set to ``'KEEP'`` where ``p_somatic_given_TiN`` exceeds
      ``self.threshold`` (and, if ``self.use_outlier_threshold`` is truthy,
      ``p_outlier >= 0.01``). Nothing is rescued when ``self.TiN_int == 0``.

    When ``self.input.indel_file != 'None'`` the same single-bin scoring is
    applied to ``self.input.indel_table`` (stored as ``self.indels``), writing
    ``'PASS'`` into its ``filter`` column. If the indel table contains any null
    value it is stored unchanged and no indel scoring happens.

    Returns None.
    """
    model = self.ssnv_based_model

    # calculate p(Somatic | given joint TiN estimate)
    # NOTE(review): the comparison is kept as ``== True`` on purpose; the flag may
    # arrive as a non-bool from the command line -- confirm before simplifying.
    if self.input.weighted_classification == True:
        numerator = np.zeros(len(model.p_TiN_given_S))
        denominator = np.zeros(len(model.p_TiN_given_S))
        for idx, p in enumerate(self.joint_posterior):
            # bins with negligible posterior mass are skipped
            if p > 0.001:
                num_iter = p * model.p_somatic * model.p_TiN_given_S[:, idx]
                numerator = numerator + num_iter
                denom_iter = num_iter + p * (np.array(
                    [1 - model.p_somatic] * np.nan_to_num(model.p_TiN_given_G[:, idx])))
                denominator = denominator + denom_iter
    else:
        numerator = model.p_somatic * np.expand_dims(
            model.p_TiN_given_S[:, self.TiN_int], 1)
        denominator = numerator + np.array(
            [1 - model.p_somatic] * np.expand_dims(
                np.nan_to_num(model.p_TiN_given_G[:, self.TiN_int]), 1))

    # nan_to_num turns 0/0 sites into a posterior of 0
    self.SSNVs.loc[:, 'p_somatic_given_TiN'] = np.nan_to_num(
        np.true_divide(numerator, denominator))

    # expected normal allele fraction given TiN and tau
    af_n_given_TiN = np.multiply(model.tumor_f, model.CN_ratio[:, self.TiN_int])
    # probability of normal allele fraction less than or equal to predicted fraction
    self.SSNVs.loc[:, 'p_outlier'] = model.rv_normal_af.cdf(af_n_given_TiN + 0.01)

    if self.TiN_int == 0:
        print('Estimated 0 TiN no SSNVs will be recovered outputing deTiN statistics for each site')
    elif self.use_outlier_threshold:
        # keep only sites that are not outliers (p_outlier >= 0.01).
        # A single .loc call is used instead of chained indexing so the write
        # always lands in self.SSNVs rather than in a temporary copy.
        rescued = np.logical_and(self.SSNVs['p_somatic_given_TiN'] > self.threshold,
                                 self.SSNVs['p_outlier'] >= 0.01)
        self.SSNVs.loc[rescued, 'judgement'] = 'KEEP'
    else:
        rescued = self.SSNVs['p_somatic_given_TiN'] > self.threshold
        self.SSNVs.loc[rescued, 'judgement'] = 'KEEP'

    if self.input.indel_file != 'None':
        n_missing = self.input.indel_table.isnull().values.sum()
        if n_missing == 0:
            indel_model = dssnv.model(self.input.indel_table, self.input.mutation_prior,
                                      self.input.resolution)
            indel_model.generate_conditional_ps()
            self.indels = self.input.indel_table
            numerator = indel_model.p_somatic * np.expand_dims(
                indel_model.p_TiN_given_S[:, self.TiN_int], 1)
            denominator = numerator + np.array(
                [1 - indel_model.p_somatic] * np.expand_dims(
                    np.nan_to_num(indel_model.p_TiN_given_G[:, self.TiN_int]), 1))
            af_n_given_TiN = np.multiply(indel_model.tumor_f,
                                         indel_model.CN_ratio[:, self.TiN_int])
            self.indels.loc[:, 'p_somatic_given_TiN'] = np.nan_to_num(
                np.true_divide(numerator, denominator))
            # unlike the SSNV case, no 0.01 offset is added here
            self.indels.loc[:, 'p_outlier'] = indel_model.rv_normal_af.cdf(af_n_given_TiN)
            if self.TiN_int == 0:
                print('Estimated 0 TiN no indels will be recovered outputing deTiN statistics for each site')
            elif self.use_outlier_threshold:
                # keep only indels that are not outliers (p_outlier >= 0.01)
                rescued = np.logical_and(
                    self.indels['p_somatic_given_TiN'] > self.threshold,
                    self.indels['p_outlier'] >= 0.01)
                self.indels.loc[rescued, 'filter'] = 'PASS'
            else:
                rescued = self.indels['p_somatic_given_TiN'] > self.threshold
                self.indels.loc[rescued, 'filter'] = 'PASS'
        else:
            # the table has at least one null entry: pass it through unscored
            self.indels = self.input.indel_table
def reclassify_mutations(self):
    """Score candidate SSNVs (and indels) at the TiN estimate and rescue likely somatic calls.

    Side effects on ``self.SSNVs`` (modified in place):

    * ``p_somatic_given_TiN`` -- numerator / denominator evaluated at the TiN
      bin ``self.TiN_int``: the numerator is ``p_somatic * p_TiN_given_S`` and
      the denominator adds ``(1 - p_somatic) * p_TiN_given_G``.
    * ``p_outlier`` -- ``rv_normal_af.cdf`` evaluated at the expected normal
      allele fraction (``tumor_f * CN_ratio[:, TiN_int]``) plus 0.01.
    * ``judgement`` -- set to ``'KEEP'`` where ``p_somatic_given_TiN`` exceeds
      ``self.threshold`` (and, if ``self.use_outlier_threshold`` is truthy,
      ``p_outlier >= 0.01``).

    When ``self.input.indel_file`` is not ``'None'`` the same scoring is applied
    to ``self.input.indel_table`` (stored as ``self.indels``), writing ``'PASS'``
    into its ``filter`` column.

    Returns None.
    """
    model = self.ssnv_based_model

    # calculate p(Somatic | given joint TiN estimate)
    numerator = model.p_somatic * model.p_TiN_given_S[:, self.TiN_int]
    denominator = numerator + np.array(
        [1 - model.p_somatic] * np.nan_to_num(model.p_TiN_given_G[:, self.TiN_int]))
    # nan_to_num turns 0/0 sites into a posterior of 0
    self.SSNVs.loc[:, 'p_somatic_given_TiN'] = np.nan_to_num(
        np.true_divide(numerator, denominator))

    # expected normal allele fraction given TiN and tau
    af_n_given_TiN = np.multiply(model.tumor_f, model.CN_ratio[:, self.TiN_int])
    # probability of normal allele fraction less than or equal to predicted fraction
    self.SSNVs.loc[:, 'p_outlier'] = model.rv_normal_af.cdf(af_n_given_TiN + 0.01)

    if self.use_outlier_threshold:
        # keep only sites that are not outliers (p_outlier >= 0.01).
        # A single .loc call is used instead of chained indexing so the write
        # always lands in self.SSNVs rather than in a temporary copy.
        rescued = np.logical_and(self.SSNVs['p_somatic_given_TiN'] > self.threshold,
                                 self.SSNVs['p_outlier'] >= 0.01)
    else:
        rescued = self.SSNVs['p_somatic_given_TiN'] > self.threshold
    self.SSNVs.loc[rescued, 'judgement'] = 'KEEP'

    if not self.input.indel_file == 'None':
        print('reclassifying indels')
        indel_model = dssnv.model(self.input.indel_table, self.input.mutation_prior)
        indel_model.generate_conditional_ps()
        self.indels = self.input.indel_table
        numerator = indel_model.p_somatic * indel_model.p_TiN_given_S[:, self.TiN_int]
        denominator = numerator + np.array(
            [1 - indel_model.p_somatic] * np.nan_to_num(
                indel_model.p_TiN_given_G[:, self.TiN_int]))
        af_n_given_TiN = np.multiply(indel_model.tumor_f,
                                     indel_model.CN_ratio[:, self.TiN_int])
        self.indels.loc[:, 'p_somatic_given_TiN'] = np.nan_to_num(
            np.true_divide(numerator, denominator))
        # unlike the SSNV case, no 0.01 offset is added here
        self.indels.loc[:, 'p_outlier'] = indel_model.rv_normal_af.cdf(af_n_given_TiN)
        if self.use_outlier_threshold:
            # keep only indels that are not outliers (p_outlier >= 0.01)
            rescued = np.logical_and(self.indels['p_somatic_given_TiN'] > self.threshold,
                                     self.indels['p_outlier'] >= 0.01)
        else:
            rescued = self.indels['p_somatic_given_TiN'] > self.threshold
        self.indels.loc[rescued, 'filter'] = 'PASS'
def _fit_ascna_model(di):
    """Build the aSCNA-based TiN model for ``di``, or a placeholder with TiN = NaN.

    Inference is only run when the segment table is non-empty, balanced hets
    remain after filtering, and at least one aSCNA segment is identified.
    Intermediate tables are stored on ``di`` (``aSCNA_hets``, ``aSCNA_segs``,
    ``convergent_segs``) as they are computed.
    """
    # identify aSCNAs and filter hets
    if len(di.seg_table) > 0:
        di.aSCNA_hets = du.ensure_balanced_hets(di.seg_table, di.het_table)
        if len(di.aSCNA_hets) > 0:
            di.aSCNA_segs, di.convergent_segs = du.identify_aSCNAs(
                di.seg_table, di.aSCNA_hets, di.aSCNA_thresh,
                di.ascna_SNP_number_filter, di.aSCNA_variance_threshold)
            if len(di.aSCNA_segs) > 0:
                fitted = dascna.model(di.aSCNA_segs, di.aSCNA_hets, di.resolution)
                fitted.perform_inference()
                return fitted
    # nothing usable: return an un-fitted model flagged with a NaN estimate
    placeholder = dascna.model(di.seg_table, di.het_table, di.resolution)
    placeholder.TiN = np.nan
    return placeholder


def main():
    """ deTiN pipeline. Method operates in two stages (1) estimating tumor in normal via candidate SSNVs and SCNAS.
    (2) Performing variant re-classification using bayes rule.

    Command-line entry point. Depending on which of ``--mutation_data_path`` and
    ``--cn_data_path`` are supplied, the run uses SSNVs only, aSCNAs only, or
    both; if neither is given a message is printed and the process exits.
    Results are written under the output path as ``<output_name>.deTiN_SSNVs.txt``,
    optionally ``.deTiN_indels.txt`` and ``.deTiN_aSCNAs.txt``, plus
    ``.TiN_estimate.txt``, ``.TiN_estimate_CI.txt`` and
    ``.number_of_SSNVs_added.txt``.
    """
    parser = argparse.ArgumentParser(
        description='Estimate tumor in normal (TiN) using putative somatic'
        ' events see Taylor-Weiner & Stewart et al. 2017')

    # input files
    parser.add_argument('--mutation_data_path',
                        help='Path to mutation candidate SSNV data.'
                        'Supported formats: MuTect call-stats',
                        required=False, default='NULL')
    parser.add_argument('--cn_data_path',
                        help='Path to copy number data.'
                        'Supported format: AllelicCapseg .seg file. Generated by GATK4 AllelicCNV.',
                        required=False, default='NULL')
    parser.add_argument('--tumor_het_data_path',
                        help='Path to heterozygous site allele count data in tumor. Generated by GATK4 GetBayesianHetCoverage.'
                        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
                        required=False, default='NULL')
    parser.add_argument('--normal_het_data_path',
                        help='Path to heterozygous site allele count data in normal. Generated by GATK4 GetBayesianHetCoverage'
                        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
                        required=False, default='NULL')
    parser.add_argument('--exac_data_path',
                        help='Path to exac af > 0.01 pickle. Can be generated by downloading ExAC VCF and running build_exac_pickle',
                        required=False)
    parser.add_argument('--indel_data_path',
                        help='Path to candidate indels data.'
                        'Supported formats: Strelka / MuTect2 VCFs',
                        required=False, default='None')
    parser.add_argument('--indel_data_type',
                        help='MuTect2 or Strelka'
                        'Caller used to generate indels',
                        required=False, default='None')

    # output related arguments
    parser.add_argument('--output_name', required=True, help='sample name')
    parser.add_argument('--output_dir',
                        help='directory to put plots and TiN solution',
                        required=False, default='.')

    # model related parameters
    # NOTE(review): none of these declare ``type=``, so values given on the
    # command line arrive as strings -- presumably converted downstream; confirm.
    parser.add_argument('--mutation_prior',
                        help='prior expected ratio of somatic mutations to rare germline events',
                        required=False, default=0.05)
    parser.add_argument('--aSCNA_threshold',
                        help='minor allele fraction threshold for calling aSCNAs.',
                        required=False, default=0.1)
    parser.add_argument('--TiN_prior',
                        help='expected frequency of TiN contamination in sequencing setting used for model selection',
                        required=False, default=0.5)
    parser.add_argument('--use_outlier_removal',
                        help='remove sites from recovered SSNVs where allele fractions significantly exceed predicted fraction',
                        required=False, default=True)
    parser.add_argument('--resolution',
                        help='number of TiN bins to consider default = 101 corresponds to 0.01 TiN levels',
                        required=False, default=101)
    parser.add_argument('--weighted_classification',
                        help='integrate variant classification over all values of TiN',
                        required=False, default=False)
    parser.add_argument('--ascna_probe_number_filter',
                        help='number of probes to require for an aSCNA to be considered',
                        required=False, default=200)
    # help text previously duplicated the probe-filter description
    parser.add_argument('--ascna_SNP_number_filter',
                        help='number of SNPs to require for an aSCNA to be considered',
                        required=False, default=20)
    parser.add_argument('--coverage_threshold',
                        help='number of reads required to use a site for TiN estimation',
                        required=False, default=15)
    parser.add_argument('--SSNV_af_threshold',
                        help='fraction of alternate alleles required for site to be used '
                        'for SSNV TiN estimation',
                        required=False, default=.2)
    parser.add_argument('--aSCNA_variance_threshold',
                        help='variance of segment allele shift tolerated before removing segment '
                        'as artifact',
                        required=False, default=0.025)
    parser.add_argument('--cancer_hot_spots',
                        help='Optional BED file of cancer hot spot mutations which the user has a stronger prior on being somatic e.g. BRAF v600E mutations.'
                        'The format of this file is Chromosome\tPosition\tProbability. Note this will override the mutation prior at these locations',
                        required=False, default='NA')
    parser.add_argument('--only_ascnas',
                        help='only use ascna data for TiN estimation',
                        required=False, action='store_true')
    args = parser.parse_args()

    if args.cn_data_path == 'NULL' and args.mutation_data_path == 'NULL':
        print('One of CN data or SSNV data are required.')
        sys.exit()

    # Defaults so the final report can always be written, including aSCNA-only
    # runs (no SSNV calls to count) and runs where reclassification is skipped.
    n_calls_pre = 0
    n_calls_added = 0

    di = input(args)
    if args.cn_data_path == 'NULL':
        # SSNV-only mode
        di.read_and_preprocess_SSNVs()
        di.candidates = du.select_candidate_mutations(di.call_stats_table, di.exac_db_file)
        n_calls_pre = np.sum(di.candidates['judgement'] == "KEEP")
        ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior, di.resolution,
                                       di.SSNV_af_threshold, di.coverage_threshold,
                                       di.CancerHotSpotsBED, skew=di.skew)
        ssnv_based_model.perform_inference()
        ascna_based_model = dascna.model(di.seg_table, di.het_table, di.resolution)
        ascna_based_model.TiN = np.nan
    elif args.mutation_data_path == 'NULL':
        # aSCNA-only mode: a one-row, all-NaN candidate table stands in for SSNVs
        di.read_and_preprocess_aSCNAs()
        di.candidates = pd.DataFrame(
            index=[0],
            columns=['contig', 'position', 'ref_allele', 'alt_allele', 'tumor_name',
                     'normal_name', 't_alt_count', 't_ref_count', 'n_alt_count',
                     'n_ref_count', 'failure_reasons', 'judgement', 'genomic_coord_x',
                     'f_acs', 'tau'])
        ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior, di.resolution,
                                       di.SSNV_af_threshold, di.coverage_threshold,
                                       di.CancerHotSpotsBED)
        ssnv_based_model.TiN = np.nan
        ascna_based_model = _fit_ascna_model(di)
    else:
        di.read_and_preprocess_data()
        # identify candidate mutations based on MuTect flags.
        # kept sites are flagged as KEEP or rejected for normal lod and/or alt_allele_in_normal
        di.candidates = du.select_candidate_mutations(di.call_stats_table, di.exac_db_file)
        n_calls_pre = np.sum(di.candidates['judgement'] == "KEEP")
        # generate SSNV based model using candidate sites
        ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior, di.resolution,
                                       di.SSNV_af_threshold, di.coverage_threshold,
                                       di.CancerHotSpotsBED)
        ssnv_based_model.perform_inference()
        if di.only_ascnas == True:
            ssnv_based_model.TiN = np.nan
            print('Only using aSCNA data')
        ascna_based_model = _fit_ascna_model(di)

    # combine models and reclassify mutations
    do = output(di, ssnv_based_model, ascna_based_model)
    do.calculate_joint_estimate()
    if len(do.SSNVs) > 1:
        do.reclassify_mutations()
        do.SSNVs.drop('Chromosome', axis=1, inplace=True)
        n_calls_post = np.sum(do.SSNVs['judgement'] == "KEEP")
        n_calls_added = n_calls_post - n_calls_pre

    # make output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    out_prefix = do.input.output_path + '/' + do.input.output_name

    # write deTiN reclassified SSNVs
    do.SSNVs.to_csv(path_or_buf=out_prefix + '.deTiN_SSNVs.txt', sep='\t', index=None)
    if not di.indel_file == 'None':
        # only drop the helper column when it is actually present
        if 'Chromosome' in do.indels.columns:
            do.indels.drop('Chromosome', axis=1, inplace=True)
        do.indels.to_csv(path_or_buf=out_prefix + '.deTiN_indels.txt', sep='\t', index=None)

    # write plots
    if not np.isnan(ascna_based_model.TiN):
        do.ascna_based_model.segs['Chromosome'] = do.ascna_based_model.segs['Chromosome'] + 1
        do.ascna_based_model.segs.to_csv(path_or_buf=out_prefix + '.deTiN_aSCNAs.txt',
                                         sep='\t', index=None)
        du.plot_kmeans_info(ascna_based_model, do.input.output_path, do.input.output_name)
        du.plot_TiN_models(do)
        du.plot_aSCNA_het_data(do)
    if not np.isnan(ssnv_based_model.TiN):
        du.plot_SSNVs(do)

    # write TiN and CIs; ``with`` guarantees every handle is flushed and closed
    with open(out_prefix + '.TiN_estimate.txt', 'w') as handle:
        handle.write('%s' % (do.TiN))
    with open(out_prefix + '.TiN_estimate_CI.txt', 'w') as handle:
        handle.write('%s - %s' % (str(do.CI_tin_low), str(do.CI_tin_high)))
    with open(out_prefix + '.number_of_SSNVs_added.txt', 'w') as handle:
        handle.write('%s\n' % int(n_calls_added))
def main():
    """ deTiN pipeline. Method operates in two stages (1) estimating tumor in normal via candidate SSNVs and SCNAS.
    (2) Performing variant re-classification using bayes rule.

    Command-line entry point. Mutation, copy-number and both het-count inputs
    are required. Results are written under the output path as
    ``<output_name>.deTiN_SSNVs.txt``, optionally ``.deTiN_indels.txt``, plus
    ``.TiN_estimate.txt`` and ``.TiN_estimate_CI.txt``.
    """
    parser = argparse.ArgumentParser(
        description='Estimate tumor in normal (TiN) using putative somatic'
        ' events see Taylor-Weiner & Stewart et al. 2017')

    # input files
    parser.add_argument('--mutation_data_path',
                        help='Path to mutation candidate SSNV data.'
                        'Supported formats: MuTect call-stats',
                        required=True)
    parser.add_argument('--cn_data_path',
                        help='Path to copy number data.'
                        'Supported format: AllelicCapseg .seg file. Generated by GATK4 AllelicCNV.',
                        required=True)
    parser.add_argument('--tumor_het_data_path',
                        help='Path to heterozygous site allele count data in tumor. Generated by GATK4 GetBayesianHetCoverage.'
                        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
                        required=True)
    parser.add_argument('--normal_het_data_path',
                        help='Path to heterozygous site allele count data in normal. Generated by GATK4 GetBayesianHetCoverage'
                        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
                        required=True)
    parser.add_argument('--exac_data_path',
                        help='Path to exac af > 0.01 pickle. Can be generated by downloading ExAC VCF and running build_exac_pickle',
                        required=False)
    parser.add_argument('--indel_data_path',
                        help='Path to candidate indels data.'
                        'Supported formats: Strelka / MuTect2 VCFs',
                        required=False, default='None')
    parser.add_argument('--indel_data_type',
                        help='MuTect2 or Strelka'
                        'Caller used to generate indels',
                        required=False, default='None')

    # output related arguments
    parser.add_argument('--output_name', required=True, help='sample name')
    parser.add_argument('--output_dir',
                        help='directory to put plots and TiN solution',
                        required=False, default='.')

    # model related parameters
    # NOTE(review): none of these declare ``type=``, so values given on the
    # command line arrive as strings -- presumably converted downstream; confirm.
    parser.add_argument('--mutation_prior',
                        help='prior expected ratio of somatic mutations to rare germline events',
                        required=False, default=0.15)
    parser.add_argument('--aSCNA_threshold',
                        help='minor allele fraction threshold for calling aSCNAs.',
                        required=False, default=0.1)
    parser.add_argument('--TiN_prior',
                        help='expected frequency of TiN contamination in sequencing setting used for model selection',
                        required=False, default=0.5)
    parser.add_argument('--use_outlier_removal',
                        help='remove sites from recovered SSNVs where allele fractions significantly exceed predicted fraction',
                        required=False, default=True)
    args = parser.parse_args()

    di = input(args)
    di.read_and_preprocess_data()

    # identify candidate mutations based on MuTect flags.
    # kept sites are flagged as KEEP or rejected for normal lod and/or alt_allele_in_normal
    di.candidates = du.select_candidate_mutations(di.call_stats_table, di.exac_db_file)

    # generate SSNV based model using candidate sites
    ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior)
    ssnv_based_model.perform_inference()

    # identify aSCNAs and filter hets
    if len(di.seg_table) > 0:
        di.aSCNA_hets = du.ensure_balanced_hets(di.seg_table, di.het_table)
        di.aSCNA_segs = du.identify_aSCNAs(di.seg_table, di.aSCNA_hets, di.aSCNA_thresh)
        # generate aSCNA based model
        ascna_based_model = dascna.model(di.aSCNA_segs, di.aSCNA_hets)
        ascna_based_model.perform_inference()
    else:
        # no segments: keep an un-fitted model flagged with a NaN estimate
        ascna_based_model = dascna.model(di.seg_table, di.het_table)
        ascna_based_model.TiN = np.nan

    # combine models and reclassify mutations
    do = output(di, ssnv_based_model, ascna_based_model)
    do.calculate_joint_estimate()
    do.reclassify_mutations()
    do.SSNVs.drop('Chromosome', axis=1, inplace=True)

    # make output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    out_prefix = do.input.output_path + '/' + do.input.output_name

    # write deTiN reclassified SSNVs
    do.SSNVs.to_csv(path_or_buf=out_prefix + '.deTiN_SSNVs.txt', sep='\t', index=None)
    if not di.indel_file == 'None':
        do.indels.to_csv(path_or_buf=out_prefix + '.deTiN_indels.txt', sep='\t', index=None)

    # write plots
    if not np.isnan(ascna_based_model.TiN):
        du.plot_kmeans_info(ascna_based_model, do.input.output_path, do.input.output_name)
    du.plot_TiN_models(do)
    du.plot_SSNVs(do)

    # write TiN and CIs; ``with`` closes each handle even if the write raises,
    # and avoids rebinding the Python 2 builtin name ``file``
    with open(out_prefix + '.TiN_estimate.txt', 'w') as handle:
        handle.write('%s' % (do.TiN))
    with open(out_prefix + '.TiN_estimate_CI.txt', 'w') as handle:
        handle.write('%s - %s' % (str(do.CI_tin_low), str(do.CI_tin_high)))