Example #1
0
 def test_random_seed(self):
     """Downsampling with a non-default random seed reproduces expected JSON.

     Runs the full QC workflow with seed 88 and compares the JSON output
     against a stored expected file, ignoring the machine-local reference path.
     """
     # test downsampling with a different random seed from the default
     config = {
         bam_qc.CONFIG_KEY_BAM: self.bam_path,
         bam_qc.CONFIG_KEY_DEBUG: self.debug,
         bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam,
         bam_qc.CONFIG_KEY_TARGET: self.target_path,
         bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
         bam_qc.CONFIG_KEY_LOG: self.log_path,
         bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
         bam_qc.CONFIG_KEY_MARK_DUPLICATES: self.markdup_path,
         bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
         bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: self.quality,
         bam_qc.CONFIG_KEY_RANDOM_SEED: 88,  # non-default seed under test
         bam_qc.CONFIG_KEY_REFERENCE: self.reference,
         bam_qc.CONFIG_KEY_SAMPLE: self.sample_level,
         bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
         bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
         bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
     }
     qc = bam_qc(config)
     out_path = os.path.join(self.tmpdir, 'out_downsampled_88.json')
     qc.write_output(out_path)
     # idiomatic file handling: no redundant parentheses, json.load on the handle
     with open(out_path) as f:
         output = json.load(f)
     with open(self.expected_path_rs88) as f:
         expected = json.load(f)
     # do not test the alignment reference local path
     del expected['alignment reference']
     del output['alignment reference']
     self.assertEqual(output, expected)
     qc.cleanup()
Example #2
0
 def test_downsampled_input(self):
     # Supply a pre-made downsampled BAM; sampling-related options are None
     # so the workflow consumes the provided file directly.
     cfg = {
         bam_qc.CONFIG_KEY_BAM: self.bam_path,
         bam_qc.CONFIG_KEY_DEBUG: self.debug,
         bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam_nonempty,
         bam_qc.CONFIG_KEY_TARGET: self.target_path,
         bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
         bam_qc.CONFIG_KEY_LOG: self.log_path,
         bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
         bam_qc.CONFIG_KEY_MARK_DUPLICATES: self.markdup_path,
         bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
         bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: None,
         bam_qc.CONFIG_KEY_RANDOM_SEED: None,
         bam_qc.CONFIG_KEY_REFERENCE: self.reference,
         bam_qc.CONFIG_KEY_SAMPLE: None,
         bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
         bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
         bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
     }
     runner = bam_qc(cfg)
     json_path = os.path.join(self.tmpdir, 'out_downsampled.json')
     runner.write_output(json_path)
     self.assertTrue(os.path.exists(json_path))
     # compare output against the stored expectations for downsampled input
     self.assert_output_with_downsampled_input_ok(
         json_path, self.expected_path_from_downsampled_input)
     runner.cleanup()
Example #3
0
 def test_default_analysis_picard2_multiple_libraries(self):
     # Default analysis using a Picard 2 MarkDuplicates text file that
     # reports more than one library.
     markdup_input = self.markdup_path_picard2_multiple_libraries
     cfg = {
         bam_qc.CONFIG_KEY_BAM: self.bam_path,
         bam_qc.CONFIG_KEY_DEBUG: self.debug,
         bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam,
         bam_qc.CONFIG_KEY_TARGET: self.target_path,
         bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
         bam_qc.CONFIG_KEY_LOG: self.log_path,
         bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
         bam_qc.CONFIG_KEY_MARK_DUPLICATES: markdup_input,
         bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
         bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: self.quality,
         bam_qc.CONFIG_KEY_RANDOM_SEED: None,
         bam_qc.CONFIG_KEY_REFERENCE: self.reference,
         bam_qc.CONFIG_KEY_SAMPLE: self.sample_default,
         bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
         bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
         bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
     }
     runner = bam_qc(cfg)
     result_path = os.path.join(self.tmpdir, 'out.json')
     runner.write_output(result_path)
     self.assert_default_output_ok(
         result_path, self.expected_picard2_multiple_libraries)
     runner.cleanup()
Example #4
0
 def test_downsampling_analysis(self):
     """Full downsampling analysis: spot-check key metrics, then compare all output.

     Individual metric checks run first so that a mismatch pinpoints the
     failing variable; the final assertion compares the complete JSON
     (minus the filesystem-dependent reference path).
     """
     config = {
         bam_qc.CONFIG_KEY_BAM: self.bam_path,
         bam_qc.CONFIG_KEY_DEBUG: self.debug,
         bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam,
         bam_qc.CONFIG_KEY_TARGET: self.target_path,
         bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
         bam_qc.CONFIG_KEY_LOG: self.log_path,
         bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
         bam_qc.CONFIG_KEY_MARK_DUPLICATES: self.markdup_path,
         bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
         bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: self.quality,
         bam_qc.CONFIG_KEY_RANDOM_SEED: None,
         bam_qc.CONFIG_KEY_REFERENCE: self.reference,
         bam_qc.CONFIG_KEY_SAMPLE: self.sample_level,
         bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
         bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
         bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
     }
     qc = bam_qc(config)
     out_path = os.path.join(self.tmpdir, 'out_downsampled.json')
     qc.write_output(out_path)
     self.assertTrue(os.path.exists(out_path))
     with open(out_path) as f:
         output = json.load(f)
     # do individual sanity checks on some variables
     # helps validate results if expected output JSON file has been changed
     expected_variables = {
         "inserted bases": 315,
         "reads per start point": 1.003,  # downsampled
         "readsMissingMDtags": 9762,  # downsampled
         "sample level": self.sample_level,
         "total reads": 80020,
         "total target size": 527189,
     }
     # iterate items() rather than keys() + lookup; report the failing
     # metric by name before re-raising so the test output is diagnostic
     for key, expected in expected_variables.items():
         got = output[key]
         try:
             self.assertEqual(expected, got)
         except AssertionError:
             print("\nFailed on metric '" + key + "': Expected",
                   expected,
                   ", got",
                   got,
                   file=sys.stderr)
             raise
     # reference path output depends on local filesystem
     # make test portable by just checking the filename
     self.assertTrue(re.search('/hg19.fa$', output['alignment reference']))
     # now check all output data (aside from the reference)
     with open(self.expected_path_downsampled) as f:
         expected = json.load(f)
     del output['alignment reference']
     self.assertEqual(output, expected)
     qc.cleanup()
Example #5
0
 def test_missing_inputs(self):
     """Exercise handling of optional/missing fields in upstream tool output.

     Covers two cases:
     - ESTIMATED_LIBRARY_SIZE absent from Picard MarkDuplicates text
       (happens for low-coverage runs)
     - empty FFQ/LFQ sections in samtools stats output
     """
     config = {
         bam_qc.CONFIG_KEY_BAM: self.bam_path,
         bam_qc.CONFIG_KEY_DEBUG: self.debug,
         bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: self.downsampled_bam,
         bam_qc.CONFIG_KEY_TARGET: self.target_path,
         bam_qc.CONFIG_KEY_INSERT_MAX: self.insert_max,
         bam_qc.CONFIG_KEY_LOG: self.log_path,
         bam_qc.CONFIG_KEY_METADATA: self.metadata_path,
         bam_qc.CONFIG_KEY_MARK_DUPLICATES: self.markdup_path,
         bam_qc.CONFIG_KEY_N_AS_MISMATCH: self.n_as_mismatch,
         bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: self.quality,
         bam_qc.CONFIG_KEY_RANDOM_SEED: None,
         bam_qc.CONFIG_KEY_REFERENCE: self.reference,
         bam_qc.CONFIG_KEY_SAMPLE: self.sample_default,
         bam_qc.CONFIG_KEY_TEMP_DIR: self.tmpdir,
         bam_qc.CONFIG_KEY_VERBOSE: self.verbose,
         bam_qc.CONFIG_KEY_WORKFLOW_VERSION: self.workflow_version
     }
     qc = bam_qc(config)
     # for low-coverage runs, ESTIMATED_LIBRARY_SIZE value is missing from mark duplicates text
     # test input file also has variant '## METRICS CLASS ...' line
     metrics_found = qc.read_mark_dup(self.markdup_path_low_cover)
     with open(self.expected_metrics_low_cover) as f:
         metrics_expected = json.load(f)
     # Found/expected HISTOGRAM keys are integers and strings, respectively.
     # (Annoyingly, JSON format insists dictionary keys must be strings)
     histogram_found = metrics_found['HISTOGRAM']
     histogram_expected = metrics_expected['HISTOGRAM']
     self.assertEqual(len(histogram_found), len(histogram_expected))
     for histogram_type in histogram_found.keys():
         for key in histogram_found[histogram_type]:
             # str(key) bridges the int-vs-string key mismatch noted above
             self.assertEqual(histogram_found[histogram_type][key],
                              histogram_expected[histogram_type][str(key)])
     # compare remaining (non-histogram) metrics directly
     del metrics_found['HISTOGRAM']
     del metrics_expected['HISTOGRAM']
     self.assertEqual(metrics_found, metrics_expected)
     # test empty FFQ/LFQ result from samtools stats; may occur for small input datasets
     # requires a fast_metric_finder object
     fast_finder = fast_metric_finder(self.bam_path, self.reference,
                                      self.insert_max, self.n_as_mismatch,
                                      qc.logger)
     fq_result = fast_finder.fq_stats([])
     fq_expected = ({}, {})
     self.assertEqual(fq_expected, fq_result)
     qc.cleanup()
Example #6
0
def main():
    """Command-line entry point: parse arguments, build config, run bam_qc.

    Validates arguments, converts numeric options from strings, resolves the
    sampling level, then either profiles the run (--profile) or writes QC
    output directly to the requested path.
    """
    parser = argparse.ArgumentParser(description='QC for BAM files.')
    parser.add_argument('-a', '--all-reads', action='store_true', help='Do not apply downsampling; '+\
                        'use all reads as input to all QC metrics. Incompatible with --sample.')
    parser.add_argument('-b', '--bam', metavar='PATH', required=True,
                        help='Path to input BAM file. Required.')
    parser.add_argument('-d', '--mark-duplicates', metavar='PATH',
                        help='Path to text file output by Picard MarkDuplicates. Optional.')
    parser.add_argument('-D', '--debug', action='store_true',
                        help='Most verbose; write messages of priority DEBUG and higher to log')
    parser.add_argument('-i', '--insert-max', metavar='INT', default=DEFAULT_INSERT_MAX,
                        help='Maximum expected value for insert size; higher values will be '+\
                        'counted as abnormal. Optional; default = %i.' % DEFAULT_INSERT_MAX)
    parser.add_argument('-l', '--log-path', metavar='PATH', help='Path of file where log output '+\
                        'will be appended. Optional, defaults to STDERR.')
    parser.add_argument('-m', '--metadata', metavar='PATH',
                        help='Path to JSON file containing metadata. Optional.')
    parser.add_argument('-n', '--n-as-mismatch', action='store_true',
                        help='Record N calls as mismatches in mismatch-per-cycle counts. '+\
                        'Only relevant if a reference is given with -r.')
    parser.add_argument('-o', '--out', metavar='PATH', required=True,
                        help='Path for JSON output, or - for STDOUT. Required.')
    parser.add_argument('-p', '--profile', action='store_true', help='Write runtime profile to '+\
                        'STDOUT. For development use only. Should not be combined with writing '+\
                        'JSON to STDOUT.')
    parser.add_argument('-q', '--skip-below-mapq', metavar='QSCORE',
                        help='Threshold to skip reads with low alignment quality. Optional.')
    parser.add_argument('-r', '--reference', metavar='PATH',
                        help='Path to FASTA reference used to align the BAM file. Used to find '+\
                        'mismatches by cycle using samtools. Optional; if not supplied, '+\
                        'mismatches by cycle will be empty.')
    parser.add_argument('-R', '--random-seed', metavar='INT', help='Set sampling random seed to '+\
                        'INT. Has no effect if --sample-rate not specified. Optional; if not '+\
                        'given, a default seed will be used.')
    parser.add_argument('-s', '--sample', metavar='INT',
                        help='Sample a total of INT reads from the BAM file, for input to slower '+\
                        'QC metrics. Defaults to 1.1 million. Incompatible with --all-reads.')
    parser.add_argument('-S', '--downsampled-bam', metavar='PATH',
                        help='Downsampled BAM file for input to slow QC metrics. Incompatible with '+\
                        '--all-reads and --sample.')
    parser.add_argument('-t', '--target', metavar='PATH',
                        help='Path to target BED file, containing targets to calculate coverage '+\
                        'against. Optional. If given, must be sorted in same order as BAM file. '+\
                        'If not given, bedtools coverage metrics will be omitted.')
    parser.add_argument('-T', '--temp-dir', metavar='PATH', help='Directory for temporary output '+\
                        'files; optional, defaults to %s (the current system tempdir).' \
                        % tempfile.gettempdir())
    parser.add_argument('-v', '--version', action='version',
                        version=read_package_version(),
                        help='Print the version number of bam-qc-metrics and exit')
    parser.add_argument('-V', '--verbose', action='store_true',
                        help='More verbose; write messages of priority INFO and higher to log')
    parser.add_argument('-w', '--workflow-version', metavar='VERSION',
                        help='Version of the workflow being used to run bam-qc-metrics. '+\
                        'Optional. If given, will be recorded in JSON output.')
    args = parser.parse_args()
    if not validate_args(args):
        print("For usage, run with -h or --help")
        exit(1)
    # argparse supplies strings; convert numeric options, preserving None.
    # Use 'is None' (identity) rather than '== None' per PEP 8.
    skip_below_mapq = None if args.skip_below_mapq is None else int(args.skip_below_mapq)
    insert_max = None if args.insert_max is None else int(args.insert_max)
    random_seed = None if args.random_seed is None else int(args.random_seed)
    # --all-reads and --downsampled-bam both disable sampling entirely
    if args.all_reads or args.downsampled_bam:
        sample = None
    else:
        sample = DEFAULT_SAMPLE_LEVEL if args.sample is None else int(args.sample)
    config = {
        bam_qc.CONFIG_KEY_BAM: args.bam,
        bam_qc.CONFIG_KEY_DEBUG: args.debug,
        bam_qc.CONFIG_KEY_DOWNSAMPLED_BAM: args.downsampled_bam,
        bam_qc.CONFIG_KEY_TARGET: args.target,
        bam_qc.CONFIG_KEY_INSERT_MAX: insert_max,
        bam_qc.CONFIG_KEY_LOG: args.log_path,
        bam_qc.CONFIG_KEY_METADATA: args.metadata,
        bam_qc.CONFIG_KEY_MARK_DUPLICATES: args.mark_duplicates,
        bam_qc.CONFIG_KEY_N_AS_MISMATCH: args.n_as_mismatch,
        bam_qc.CONFIG_KEY_SKIP_BELOW_MAPQ: skip_below_mapq,
        bam_qc.CONFIG_KEY_RANDOM_SEED: random_seed,
        bam_qc.CONFIG_KEY_REFERENCE: args.reference,
        bam_qc.CONFIG_KEY_SAMPLE: sample,
        bam_qc.CONFIG_KEY_TEMP_DIR: args.temp_dir,
        bam_qc.CONFIG_KEY_VERBOSE: args.verbose,
        bam_qc.CONFIG_KEY_WORKFLOW_VERSION: args.workflow_version
    }
    if args.profile:
        # sort order = 2, sorts profile by cumulative time
        cProfile.runctx('bam_qc(config).write_output(out_path)',
                        {'bam_qc': bam_qc, 'config': config, 'out_path': args.out},
                        {},
                        None,
                        2)
    else:
        qc = bam_qc(config)
        qc.write_output(args.out)