def _build_detin_parser():
    """Build the command-line parser for the multi-mode deTiN entry point.

    Every input path is optional here; ``main`` decides which models to fit
    from which of ``--mutation_data_path`` / ``--cn_data_path`` were left at
    their ``'NULL'`` default. Only ``--output_name`` is required.
    """
    parser = argparse.ArgumentParser(
        description='Estimate tumor in normal (TiN) using putative somatic'
        ' events see Taylor-Weiner & Stewart et al. 2017')

    # input files
    parser.add_argument('--mutation_data_path',
                        help='Path to mutation candidate SSNV data.'
                        'Supported formats: MuTect call-stats',
                        required=False,
                        default='NULL')
    parser.add_argument(
        '--cn_data_path',
        help='Path to copy number data.'
        'Supported format: AllelicCapseg .seg file. Generated by GATK4 AllelicCNV.',
        required=False,
        default='NULL')
    parser.add_argument(
        '--tumor_het_data_path',
        help='Path to heterozygous site allele count data in tumor. Generated by GATK4 GetBayesianHetCoverage.'
        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
        required=False,
        default='NULL')
    parser.add_argument(
        '--normal_het_data_path',
        help='Path to heterozygous site allele count data in normal. Generated by GATK4 GetBayesianHetCoverage'
        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
        required=False,
        default='NULL')
    parser.add_argument(
        '--exac_data_path',
        help='Path to exac af > 0.01 pickle. Can be generated by downloading ExAC VCF and running build_exac_pickle',
        required=False)
    parser.add_argument('--indel_data_path',
                        help='Path to candidate indels data.'
                        'Supported formats: Strelka / MuTect2 VCFs',
                        required=False,
                        default='None')
    parser.add_argument('--indel_data_type',
                        help='MuTect2 or Strelka'
                        'Caller used to generate indels',
                        required=False,
                        default='None')

    # output related arguments
    parser.add_argument('--output_name', required=True, help='sample name')
    parser.add_argument('--output_dir',
                        help='directory to put plots and TiN solution',
                        required=False,
                        default='.')

    # model related parameters
    parser.add_argument(
        '--mutation_prior',
        help='prior expected ratio of somatic mutations to rare germline events',
        required=False,
        default=0.05)
    parser.add_argument(
        '--aSCNA_threshold',
        help='minor allele fraction threshold for calling aSCNAs.',
        required=False,
        default=0.1)
    parser.add_argument(
        '--TiN_prior',
        help='expected frequency of TiN contamination in sequencing setting used for model selection',
        required=False,
        default=0.5)
    parser.add_argument(
        '--use_outlier_removal',
        help='remove sites from recovered SSNVs where allele fractions significantly exceed predicted fraction',
        required=False,
        default=True)
    parser.add_argument(
        '--resolution',
        help='number of TiN bins to consider default = 101 corresponds to 0.01 TiN levels',
        required=False,
        default=101)
    parser.add_argument(
        '--weighted_classification',
        help='integrate variant classification over all values of TiN',
        required=False,
        default=False)
    parser.add_argument(
        '--ascna_probe_number_filter',
        help='number of probes to require for an aSCNA to be considered',
        required=False,
        default=200)
    # NOTE(review): this help text is identical to --ascna_probe_number_filter;
    # it looks copy-pasted and probably should describe SNPs -- confirm.
    parser.add_argument(
        '--ascna_SNP_number_filter',
        help='number of probes to require for an aSCNA to be considered',
        required=False,
        default=20)
    parser.add_argument(
        '--coverage_threshold',
        help='number of reads required to use a site for TiN estimation',
        required=False,
        default=15)
    parser.add_argument(
        '--SSNV_af_threshold',
        help='fraction of alternate alleles required for site to be used '
        'for SSNV TiN estimation',
        required=False,
        default=.2)
    parser.add_argument(
        '--aSCNA_variance_threshold',
        help='variance of segment allele shift tolerated before removing segment '
        'as artifact',
        required=False,
        default=0.025)
    parser.add_argument(
        '--cancer_hot_spots',
        help='Optional BED file of cancer hot spot mutations which the user has a stronger prior on being somatic e.g. BRAF v600E mutations.'
        'The format of this file is Chromosome\tPosition\tProbability. Note this will override the mutation prior at these locations',
        required=False,
        default='NA')
    parser.add_argument('--only_ascnas',
                        help='only use ascna data for TiN estimation',
                        required=False,
                        action='store_true')
    return parser


def _fit_ascna_model(di):
    """Identify aSCNAs on ``di`` and return an aSCNA-based TiN model.

    Sets ``di.aSCNA_hets`` and, when any balanced hets remain,
    ``di.aSCNA_segs`` / ``di.convergent_segs``. Inference is only run when
    segments, hets and aSCNA segments are all non-empty; otherwise a model is
    built from the raw segment/het tables and its ``TiN`` is set to NaN.
    """
    if len(di.seg_table) > 0:
        di.aSCNA_hets = du.ensure_balanced_hets(di.seg_table, di.het_table)
        if len(di.aSCNA_hets) > 0:
            di.aSCNA_segs, di.convergent_segs = du.identify_aSCNAs(
                di.seg_table, di.aSCNA_hets, di.aSCNA_thresh,
                di.ascna_SNP_number_filter, di.aSCNA_variance_threshold)
            if len(di.aSCNA_segs) > 0:
                fitted = dascna.model(di.aSCNA_segs, di.aSCNA_hets,
                                      di.resolution)
                fitted.perform_inference()
                return fitted
    # Nothing usable: keep a model object around, flagged as "no estimate".
    placeholder = dascna.model(di.seg_table, di.het_table, di.resolution)
    placeholder.TiN = np.nan
    return placeholder


def _write_text_file(path, text):
    """Write ``text`` to ``path``, closing the handle even if the write fails."""
    with open(path, 'w') as handle:
        handle.write(text)


# NOTE(review): this module also contains a later `def main()`; the later
# definition is the one bound to the name at import time -- confirm which
# entry point is intended.
def main():
    """ deTiN pipeline. Method operates in two stages (1) estimating tumor in normal via candidate SSNVs
    and SCNAS. (2) Performing variant re-classification using bayes rule.

    Three run modes are selected from the parsed arguments:

    * ``--cn_data_path`` left at ``'NULL'``: SSNV-only; the aSCNA model is a
      placeholder with ``TiN = NaN``.
    * ``--mutation_data_path`` left at ``'NULL'``: aSCNA-only; the SSNV model
      is a placeholder with ``TiN = NaN`` built on an empty candidate table.
    * both supplied: both models are fitted (``--only_ascnas`` then discards
      the SSNV estimate).

    If neither is supplied a message is printed and the process exits.

    Outputs are written under ``<output_path>/<output_name>`` with suffixes
    ``.deTiN_SSNVs.txt``, ``.deTiN_indels.txt`` (when an indel file was
    given), ``.deTiN_aSCNAs.txt`` (when the aSCNA model has an estimate),
    ``.TiN_estimate.txt``, ``.TiN_estimate_CI.txt`` and
    ``.number_of_SSNVs_added.txt``.
    """
    args = _build_detin_parser().parse_args()

    # Baseline for "number of SSNVs added". aSCNA-only runs never select
    # candidate mutations, so without this default the final report would
    # fail on an unassigned name.
    n_calls_pre = 0

    if args.cn_data_path == 'NULL' and args.mutation_data_path == 'NULL':
        print('One of CN data or SSNV data are required.')
        # NOTE(review): exits with status 0 although this is an error path.
        sys.exit()
    elif args.cn_data_path == 'NULL':
        # SSNV-only mode
        di = input(args)
        di.read_and_preprocess_SSNVs()
        di.candidates = du.select_candidate_mutations(di.call_stats_table,
                                                      di.exac_db_file)
        n_calls_pre = np.sum(di.candidates['judgement'] == "KEEP")
        ssnv_based_model = dssnv.model(di.candidates,
                                       di.mutation_prior,
                                       di.resolution,
                                       di.SSNV_af_threshold,
                                       di.coverage_threshold,
                                       di.CancerHotSpotsBED,
                                       skew=di.skew)
        ssnv_based_model.perform_inference()
        ascna_based_model = dascna.model(di.seg_table, di.het_table,
                                         di.resolution)
        ascna_based_model.TiN = np.nan
    elif args.mutation_data_path == 'NULL':
        # aSCNA-only mode: a single all-NaN row stands in for the candidates.
        di = input(args)
        di.read_and_preprocess_aSCNAs()
        di.candidates = pd.DataFrame(
            index=[0],
            columns=[
                'contig', 'position', 'ref_allele', 'alt_allele',
                'tumor_name', 'normal_name', 't_alt_count', 't_ref_count',
                'n_alt_count', 'n_ref_count', 'failure_reasons', 'judgement',
                'genomic_coord_x', 'f_acs', 'tau'
            ])
        ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior,
                                       di.resolution, di.SSNV_af_threshold,
                                       di.coverage_threshold,
                                       di.CancerHotSpotsBED)
        ssnv_based_model.TiN = np.nan
        # identify aSCNAs and filter hets
        ascna_based_model = _fit_ascna_model(di)
    else:
        di = input(args)
        di.read_and_preprocess_data()
        # identify candidate mutations based on MuTect flags.
        # kept sites are flagged as KEEP or rejected for normal lod and/or alt_allele_in_normal
        di.candidates = du.select_candidate_mutations(di.call_stats_table,
                                                      di.exac_db_file)
        n_calls_pre = np.sum(di.candidates['judgement'] == "KEEP")
        # generate SSNV based model using candidate sites
        ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior,
                                       di.resolution, di.SSNV_af_threshold,
                                       di.coverage_threshold,
                                       di.CancerHotSpotsBED)
        ssnv_based_model.perform_inference()
        if di.only_ascnas == True:
            ssnv_based_model.TiN = np.nan
            print('Only using aSCNA data')
        # identify aSCNAs and filter hets
        ascna_based_model = _fit_ascna_model(di)

    # combine models and reclassify mutations
    do = output(di, ssnv_based_model, ascna_based_model)
    do.calculate_joint_estimate()
    # With at most one SSNV row no reclassification happens, so nothing can
    # have been added; default to 0 so the report below is always writable.
    n_calls_added = 0
    if len(do.SSNVs) > 1:
        do.reclassify_mutations()
        do.SSNVs.drop('Chromosome', axis=1, inplace=True)
        n_calls_post = np.sum(do.SSNVs['judgement'] == "KEEP")
        n_calls_added = n_calls_post - n_calls_pre

    # make output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    out_prefix = do.input.output_path + '/' + do.input.output_name

    # write deTiN reclassified SSNVs
    do.SSNVs.to_csv(path_or_buf=out_prefix + '.deTiN_SSNVs.txt',
                    sep='\t',
                    index=None)
    if not di.indel_file == 'None':
        # NOTE(review): the drop is unconditional; a guard on
        # "'Chromosome' in do.indels.columns" was previously commented out.
        do.indels.drop('Chromosome', axis=1, inplace=True)
        do.indels.to_csv(path_or_buf=out_prefix + '.deTiN_indels.txt',
                         sep='\t',
                         index=None)

    # write plots
    if not np.isnan(ascna_based_model.TiN):
        # Chromosome is shifted by one before export (presumably converting
        # a 0-based index to chromosome numbers -- confirm).
        do.ascna_based_model.segs[
            'Chromosome'] = do.ascna_based_model.segs['Chromosome'] + 1
        do.ascna_based_model.segs.to_csv(path_or_buf=out_prefix +
                                         '.deTiN_aSCNAs.txt',
                                         sep='\t',
                                         index=None)
        du.plot_kmeans_info(ascna_based_model, do.input.output_path,
                            do.input.output_name)
        du.plot_TiN_models(do)
        du.plot_aSCNA_het_data(do)
    if not np.isnan(ssnv_based_model.TiN):
        du.plot_SSNVs(do)

    # write TiN and CIs
    _write_text_file(out_prefix + '.TiN_estimate.txt', '%s' % (do.TiN))
    _write_text_file(out_prefix + '.TiN_estimate_CI.txt',
                     '%s - %s' % (str(do.CI_tin_low), str(do.CI_tin_high)))
    _write_text_file(out_prefix + '.number_of_SSNVs_added.txt',
                     '%s\n' % int(n_calls_added))
# NOTE(review): this module also contains an earlier `def main()`; being the
# later definition, this one is what the name refers to after import --
# confirm which entry point is intended.
def main():
    """ deTiN pipeline. Method operates in two stages (1) estimating tumor in normal via candidate SSNVs
    and SCNAS. (2) Performing variant re-classification using bayes rule.

    All four data inputs (mutation, copy-number, tumor-het and normal-het
    paths) plus ``--output_name`` are required; argparse aborts otherwise.
    The SSNV model is always fitted. The aSCNA model is fitted only when the
    segment table is non-empty; otherwise its ``TiN`` is set to NaN.

    Outputs are written under ``<output_path>/<output_name>`` with suffixes
    ``.deTiN_SSNVs.txt``, ``.deTiN_indels.txt`` (when an indel file was
    given), ``.TiN_estimate.txt`` and ``.TiN_estimate_CI.txt``.
    """
    parser = argparse.ArgumentParser(
        description='Estimate tumor in normal (TiN) using putative somatic'
        ' events see Taylor-Weiner & Stewart et al. 2017')

    # input files
    parser.add_argument('--mutation_data_path',
                        help='Path to mutation candidate SSNV data.'
                        'Supported formats: MuTect call-stats',
                        required=True)
    parser.add_argument(
        '--cn_data_path',
        help='Path to copy number data.'
        'Supported format: AllelicCapseg .seg file. Generated by GATK4 AllelicCNV.',
        required=True)
    parser.add_argument(
        '--tumor_het_data_path',
        help='Path to heterozygous site allele count data in tumor. Generated by GATK4 GetBayesianHetCoverage.'
        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
        required=True)
    parser.add_argument(
        '--normal_het_data_path',
        help='Path to heterozygous site allele count data in normal. Generated by GATK4 GetBayesianHetCoverage'
        'Required columns: CONTIG,POS,REF_COUNT and ALT_COUNT',
        required=True)
    parser.add_argument(
        '--exac_data_path',
        help='Path to exac af > 0.01 pickle. Can be generated by downloading ExAC VCF and running build_exac_pickle',
        required=False)
    parser.add_argument('--indel_data_path',
                        help='Path to candidate indels data.'
                        'Supported formats: Strelka / MuTect2 VCFs',
                        required=False,
                        default='None')
    parser.add_argument('--indel_data_type',
                        help='MuTect2 or Strelka'
                        'Caller used to generate indels',
                        required=False,
                        default='None')

    # output related arguments
    parser.add_argument('--output_name', required=True, help='sample name')
    parser.add_argument('--output_dir',
                        help='directory to put plots and TiN solution',
                        required=False,
                        default='.')

    # model related parameters
    parser.add_argument(
        '--mutation_prior',
        help='prior expected ratio of somatic mutations to rare germline events',
        required=False,
        default=0.15)
    parser.add_argument(
        '--aSCNA_threshold',
        help='minor allele fraction threshold for calling aSCNAs.',
        required=False,
        default=0.1)
    parser.add_argument(
        '--TiN_prior',
        help='expected frequency of TiN contamination in sequencing setting used for model selection',
        required=False,
        default=0.5)
    parser.add_argument(
        '--use_outlier_removal',
        help='remove sites from recovered SSNVs where allele fractions significantly exceed predicted fraction',
        required=False,
        default=True)
    args = parser.parse_args()

    di = input(args)
    di.read_and_preprocess_data()

    # identify candidate mutations based on MuTect flags.
    # kept sites are flagged as KEEP or rejected for normal lod and/or alt_allele_in_normal
    di.candidates = du.select_candidate_mutations(di.call_stats_table,
                                                  di.exac_db_file)

    # generate SSNV based model using candidate sites
    ssnv_based_model = dssnv.model(di.candidates, di.mutation_prior)
    ssnv_based_model.perform_inference()

    # identify aSCNAs and filter hets
    if len(di.seg_table) > 0:
        di.aSCNA_hets = du.ensure_balanced_hets(di.seg_table, di.het_table)
        di.aSCNA_segs = du.identify_aSCNAs(di.seg_table, di.aSCNA_hets,
                                           di.aSCNA_thresh)
        # generate aSCNA based model
        ascna_based_model = dascna.model(di.aSCNA_segs, di.aSCNA_hets)
        ascna_based_model.perform_inference()
    else:
        # No segments: keep a model object, flagged as having no estimate.
        ascna_based_model = dascna.model(di.seg_table, di.het_table)
        ascna_based_model.TiN = np.nan

    # combine models and reclassify mutations
    do = output(di, ssnv_based_model, ascna_based_model)
    do.calculate_joint_estimate()
    do.reclassify_mutations()
    do.SSNVs.drop('Chromosome', axis=1, inplace=True)

    # make output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    out_prefix = do.input.output_path + '/' + do.input.output_name

    # write deTiN reclassified SSNVs
    do.SSNVs.to_csv(path_or_buf=out_prefix + '.deTiN_SSNVs.txt',
                    sep='\t',
                    index=None)
    if not di.indel_file == 'None':
        do.indels.to_csv(path_or_buf=out_prefix + '.deTiN_indels.txt',
                         sep='\t',
                         index=None)

    # write plots
    if not np.isnan(ascna_based_model.TiN):
        du.plot_kmeans_info(ascna_based_model, do.input.output_path,
                            do.input.output_name)
    # NOTE(review): confirm that the two plots below are meant to run even
    # when the aSCNA model has no TiN estimate.
    du.plot_TiN_models(do)
    du.plot_SSNVs(do)

    # write TiN and CIs; `with` closes each handle even if the write raises
    with open(out_prefix + '.TiN_estimate.txt', 'w') as tin_file:
        tin_file.write('%s' % (do.TiN))
    with open(out_prefix + '.TiN_estimate_CI.txt', 'w') as ci_file:
        ci_file.write('%s - %s' % (str(do.CI_tin_low), str(do.CI_tin_high)))