def test_random_seed(self):
    """Run downsampling with a non-default random seed (88) and compare the
    JSON output against the expected fixture for that seed."""
    config = {
        bam_qc.CONFIG_KEY_BAM: self.bam_path,
        bam_qc.CONFIG_KEY_DEBUG: self.debug,
        bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam,
        bam_qc.CONFIG_KEY_TARGET: self.target_path,
        bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
        bam_qc.CONFIG_KEY_LOG: self.log_path,
        bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
        bam_qc.CONFIG_KEY_MARK_DUPLICATES: self.markdup_path,
        bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
        bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: self.quality,
        bam_qc.CONFIG_KEY_RANDOM_SEED: 88,  # non-default seed under test
        bam_qc.CONFIG_KEY_REFERENCE: self.reference,
        bam_qc.CONFIG_KEY_SAMPLE: self.sample_level,
        bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
        bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
        bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
    }
    qc = bam_qc(config)
    out_path = os.path.join(self.tmpdir, 'out_downsampled_88.json')
    qc.write_output(out_path)
    # json.load reads straight from the file handle; no need for loads(f.read())
    with open(out_path) as f:
        output = json.load(f)
    with open(self.expected_path_rs88) as f:
        expected = json.load(f)
    # do not test the alignment reference local path
    del expected['alignment reference']
    del output['alignment reference']
    self.assertEqual(output, expected)
    qc.cleanup()
def test_downsampled_input(self):
    """Supply a pre-downsampled (non-empty) BAM and verify the JSON output
    matches the fixture expected for downsampled input."""
    # Sampling-related keys are None: the downsampled BAM is given directly,
    # so no in-process sampling is requested.
    config = {
        bam_qc.CONFIG_KEY_BAM: self.bam_path,
        bam_qc.CONFIG_KEY_DEBUG: self.debug,
        bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam_nonempty,
        bam_qc.CONFIG_KEY_TARGET: self.target_path,
        bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
        bam_qc.CONFIG_KEY_LOG: self.log_path,
        bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
        bam_qc.CONFIG_KEY_MARK_DUPLICATES: self.markdup_path,
        bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
        bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: None,
        bam_qc.CONFIG_KEY_RANDOM_SEED: None,
        bam_qc.CONFIG_KEY_REFERENCE: self.reference,
        bam_qc.CONFIG_KEY_SAMPLE: None,
        bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
        bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
        bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
    }
    qc = bam_qc(config)
    out_path = os.path.join(self.tmpdir, 'out_downsampled.json')
    qc.write_output(out_path)
    self.assertTrue(os.path.exists(out_path))
    ep = self.expected_path_from_downsampled_input
    self.assert_output_with_downsampled_input_ok(out_path, ep)
    qc.cleanup()
def test_default_analysis_picard2_multiple_libraries(self):
    """Default analysis using a Picard 2 MarkDuplicates file that contains
    multiple libraries; compare against the matching expected fixture."""
    config = {
        bam_qc.CONFIG_KEY_BAM: self.bam_path,
        bam_qc.CONFIG_KEY_DEBUG: self.debug,
        bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam,
        bam_qc.CONFIG_KEY_TARGET: self.target_path,
        bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
        bam_qc.CONFIG_KEY_LOG: self.log_path,
        bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
        # multi-library Picard 2 MarkDuplicates input is the variant under test
        bam_qc.CONFIG_KEY_MARK_DUPLICATES: self.markdup_path_picard2_multiple_libraries,
        bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
        bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: self.quality,
        bam_qc.CONFIG_KEY_RANDOM_SEED: None,
        bam_qc.CONFIG_KEY_REFERENCE: self.reference,
        bam_qc.CONFIG_KEY_SAMPLE: self.sample_default,
        bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
        bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
        bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
    }
    qc = bam_qc(config)
    out_path = os.path.join(self.tmpdir, 'out.json')
    qc.write_output(out_path)
    self.assert_default_output_ok(out_path, self.expected_picard2_multiple_libraries)
    qc.cleanup()
def test_downsampling_analysis(self):
    """Run QC with downsampling enabled; sanity-check selected metrics
    individually, then compare the full JSON output to the expected fixture."""
    config = {
        bam_qc.CONFIG_KEY_BAM: self.bam_path,
        bam_qc.CONFIG_KEY_DEBUG: self.debug,
        bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam,
        bam_qc.CONFIG_KEY_TARGET: self.target_path,
        bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
        bam_qc.CONFIG_KEY_LOG: self.log_path,
        bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
        bam_qc.CONFIG_KEY_MARK_DUPLICATES: self.markdup_path,
        bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
        bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: self.quality,
        bam_qc.CONFIG_KEY_RANDOM_SEED: None,
        bam_qc.CONFIG_KEY_REFERENCE: self.reference,
        bam_qc.CONFIG_KEY_SAMPLE: self.sample_level,
        bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
        bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
        bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
    }
    qc = bam_qc(config)
    out_path = os.path.join(self.tmpdir, 'out_downsampled.json')
    qc.write_output(out_path)
    self.assertTrue(os.path.exists(out_path))
    with open(out_path) as f:
        output = json.load(f)
    # do individual sanity checks on some variables
    # helps validate results if expected output JSON file has been changed
    expected_variables = {
        "inserted bases": 315,
        "reads per start point": 1.003,  # downsampled
        "readsMissingMDtags": 9762,  # downsampled
        "sample level": self.sample_level,
        "total reads": 80020,
        "total target size": 527189,
    }
    # iterate items() rather than keys() + lookup
    for key, expected in expected_variables.items():
        got = output[key]
        try:
            self.assertEqual(expected, got)
        except AssertionError:
            # report which metric failed before re-raising for the test runner
            print("\nFailed on metric '" + key + "': Expected", expected,
                  ", got", got, file=sys.stderr)
            raise
    # reference path output depends on local filesystem
    # make test portable by just checking the filename
    self.assertTrue(re.search('/hg19.fa$', output['alignment reference']))
    # now check all output data (aside from the reference)
    with open(self.expected_path_downsampled) as f:
        expected = json.load(f)
    del output['alignment reference']
    self.assertEqual(output, expected)
    qc.cleanup()
def test_missing_inputs(self):
    """Test handling of possible missing inputs:
    - ESTIMATED_LIBRARY_SIZE absent from mark duplicates text
    - empty FFQ/LFQ sections in samtools stats output
    """
    config = {
        bam_qc.CONFIG_KEY_BAM: self.bam_path,
        bam_qc.CONFIG_KEY_DEBUG: self.debug,
        bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam,
        bam_qc.CONFIG_KEY_TARGET: self.target_path,
        bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
        bam_qc.CONFIG_KEY_LOG: self.log_path,
        bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
        bam_qc.CONFIG_KEY_MARK_DUPLICATES: self.markdup_path,
        bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
        bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: self.quality,
        bam_qc.CONFIG_KEY_RANDOM_SEED: None,
        bam_qc.CONFIG_KEY_REFERENCE: self.reference,
        bam_qc.CONFIG_KEY_SAMPLE: self.sample_default,
        bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
        bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
        bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
    }
    qc = bam_qc(config)
    # for low-coverage runs, ESTIMATED_LIBRARY_SIZE value is missing from mark duplicates text
    # test input file also has variant '## METRICS CLASS ...' line
    metrics_found = qc.read_mark_dup(self.markdup_path_low_cover)
    with open(self.expected_metrics_low_cover) as f:
        metrics_expected = json.load(f)
    # Found/expected HISTOGRAM keys are integers and strings, respectively.
    # (Annoyingly, JSON format insists dictionary keys must be strings)
    histogram_found = metrics_found['HISTOGRAM']
    histogram_expected = metrics_expected['HISTOGRAM']
    self.assertEqual(len(histogram_found), len(histogram_expected))
    for histogram_type in histogram_found.keys():
        for key in histogram_found[histogram_type]:
            # str(key) bridges the int-vs-string key mismatch noted above
            self.assertEqual(histogram_found[histogram_type][key],
                             histogram_expected[histogram_type][str(key)])
    del metrics_found['HISTOGRAM']
    del metrics_expected['HISTOGRAM']
    self.assertEqual(metrics_found, metrics_expected)
    # test empty FFQ/LFQ result from samtools stats; may occur for small input datasets
    # requires a fast_metric_finder object
    fast_finder = fast_metric_finder(self.bam_path,
                                     self.reference,
                                     self.insert_max,
                                     self.n_as_mismatch,
                                     qc.logger)
    fq_result = fast_finder.fq_stats([])
    fq_expected = ({}, {})
    self.assertEqual(fq_expected, fq_result)
    qc.cleanup()
def main():
    """Parse command-line arguments, build a bam_qc config, and write QC output.

    Exits with status 1 if argument validation fails. With --profile, runs the
    QC under cProfile (sorted by cumulative time) instead of running directly.
    """
    parser = argparse.ArgumentParser(description='QC for BAM files.')
    parser.add_argument('-a', '--all-reads', action='store_true',
                        help='Do not apply downsampling; '+\
                        'use all reads as input to all QC metrics. Incompatible with --sample.')
    parser.add_argument('-b', '--bam', metavar='PATH', required=True,
                        help='Path to input BAM file. Required.')
    parser.add_argument('-d', '--mark-duplicates', metavar='PATH',
                        help='Path to text file output by Picard MarkDuplicates. Optional.')
    parser.add_argument('-D', '--debug', action='store_true',
                        help='Most verbose; write messages of priority DEBUG and higher to log')
    parser.add_argument('-i', '--insert-max', metavar='INT', default=DEFAULT_INSERT_MAX,
                        help='Maximum expected value for insert size; higher values will be '+\
                        'counted as abnormal. Optional; default = %i.' % DEFAULT_INSERT_MAX)
    parser.add_argument('-l', '--log-path', metavar='PATH', help='Path of file where log output '+\
                        'will be appended. Optional, defaults to STDERR.')
    parser.add_argument('-m', '--metadata', metavar='PATH',
                        help='Path to JSON file containing metadata. Optional.')
    parser.add_argument('-n', '--n-as-mismatch', action='store_true',
                        help='Record N calls as mismatches in mismatch-per-cycle counts. '+\
                        'Only relevant if a reference is given with -r.')
    parser.add_argument('-o', '--out', metavar='PATH', required=True,
                        help='Path for JSON output, or - for STDOUT. Required.')
    parser.add_argument('-p', '--profile', action='store_true', help='Write runtime profile to '+\
                        'STDOUT. For development use only. Should not be combined with writing '+\
                        'JSON to STDOUT.')
    parser.add_argument('-q', '--skip-below-mapq', metavar='QSCORE',
                        help='Threshold to skip reads with low alignment quality. Optional.')
    parser.add_argument('-r', '--reference', metavar='PATH',
                        help='Path to FASTA reference used to align the BAM file. Used to find '+\
                        'mismatches by cycle using samtools. Optional; if not supplied, '+\
                        'mismatches by cycle will be empty.')
    # help text fixed: the relevant option is --sample, not the nonexistent --sample-rate
    parser.add_argument('-R', '--random-seed', metavar='INT', help='Set sampling random seed to '+\
                        'INT. Has no effect if --sample not specified. Optional; if not '+\
                        'given, a default seed will be used.')
    parser.add_argument('-s', '--sample', metavar='INT',
                        help='Sample a total of INT reads from the BAM file, for input to slower '+\
                        'QC metrics. Defaults to 1.1 million. Incompatible with --all-reads.')
    parser.add_argument('-S', '--downsampled-bam', metavar='PATH',
                        help='Downsampled BAM file for input to slow QC metrics. Incompatible with '+\
                        '--all-reads and --sample.')
    parser.add_argument('-t', '--target', metavar='PATH',
                        help='Path to target BED file, containing targets to calculate coverage '+\
                        'against. Optional. If given, must be sorted in same order as BAM file. '+\
                        'If not given, bedtools coverage metrics will be omitted.')
    parser.add_argument('-T', '--temp-dir', metavar='PATH', help='Directory for temporary output '+\
                        'files; optional, defaults to %s (the current system tempdir).' \
                        % tempfile.gettempdir())
    parser.add_argument('-v', '--version', action='version',
                        version=read_package_version(),
                        help='Print the version number of bam-qc-metrics and exit')
    parser.add_argument('-V', '--verbose', action='store_true',
                        help='More verbose; write messages of priority INFO and higher to log')
    parser.add_argument('-w', '--workflow-version', metavar='VERSION',
                        help='Version of the workflow being used to run bam-qc-metrics. '+\
                        'Optional. If given, will be recorded in JSON output.')
    args = parser.parse_args()
    if not validate_args(args):
        print("For usage, run with -h or --help")
        exit(1)
    # argparse delivers strings; convert numeric options, preserving None.
    # 'is None' (identity) rather than '== None' per PEP 8.
    skip_below_mapq = None if args.skip_below_mapq is None else int(args.skip_below_mapq)
    insert_max = None if args.insert_max is None else int(args.insert_max)
    random_seed = None if args.random_seed is None else int(args.random_seed)
    if args.all_reads or args.downsampled_bam:
        # no in-process sampling when using all reads or a pre-downsampled BAM
        sample = None
    else:
        sample = DEFAULT_SAMPLE_LEVEL if args.sample is None else int(args.sample)
    config = {
        bam_qc.CONFIG_KEY_BAM: args.bam,
        bam_qc.CONFIG_KEY_DEBUG: args.debug,
        bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: args.downsampled_bam,
        bam_qc.CONFIG_KEY_TARGET: args.target,
        bam_qc.CONFIG_KEY_INSERT_MAX: insert_max,
        bam_qc.CONFIG_KEY_LOG: args.log_path,
        bam_qc.CONFIG_KEY_METADATA: args.metadata,
        bam_qc.CONFIG_KEY_MARK_DUPLICATES: args.mark_duplicates,
        bam_qc.CONFIG_KEY_N_AS_MISMATCH: args.n_as_mismatch,
        bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: skip_below_mapq,
        bam_qc.CONFIG_KEY_RANDOM_SEED: random_seed,
        bam_qc.CONFIG_KEY_REFERENCE: args.reference,
        bam_qc.CONFIG_KEY_SAMPLE: sample,
        bam_qc.CONFIG_KEY_TEMP_DIR: args.temp_dir,
        bam_qc.CONFIG_KEY_VERBOSE: args.verbose,
        bam_qc.CONFIG_KEY_WORKFLOW_VERSION: args.workflow_version
    }
    if args.profile:
        # sort order = 2, sorts profile by cumulative time
        cProfile.runctx('bam_qc(config).write_output(out_path)',
                        {'bam_qc': bam_qc, 'config': config, 'out_path': args.out},
                        {}, None, 2)
    else:
        qc = bam_qc(config)
        qc.write_output(args.out)