def pysam_bam_from_sam(sam_filename, bam_filename, index=True):
    """Copy every record of a SAM file into a new BAM file.

    Args:
        sam_filename: path of the SAM file to read.
        bam_filename: path of the BAM file to create.
        index: when true, run samtools.index on the new BAM afterwards.
    """
    infile = samtools.alignment_file(sam_filename, "r")
    try:
        # The output borrows its header from the input via `template`.
        outfile = samtools.alignment_file(bam_filename,
                                          "wb",
                                          template=infile)
        try:
            for s in infile:
                outfile.write(s)
        finally:
            # Always release the output handle, even if a write fails.
            outfile.close()
    finally:
        infile.close()
    if index:
        samtools.index(bam_filename)
def test_sort_and_index_bam(self):
    # Verifies samtools.sort_and_index_bam twice: once given an absolute
    # path and once given a path relative to the working directory. In both
    # cases the BAM at the original location must be fetchable afterwards
    # and its records must come back in ascending start-position order.
    sam_contents = (
        "@HD|VN:1.4|GO:none|SO:coordinate\n"
        "@SQ|SN:chr10|LN:135534747\n"
        "readNameB1|147|chr10|400|0|5M|=|200|100|CCCCC|>>>>>\n"
        "readNameA1|147|chr10|300|0|5M|=|100|100|AAAAA|>>>>>\n"
        "readNameA1|99|chr10|100|0|5M|=|300|200|AAAAA|>>>>>\n"
        "readNameB1|99|chr10|200|0|5M|=|400|200|CCCCC|>>>>>\n"
        "readNameA2|147|chr10|300|0|5M|=|100|100|AAAAA|>>>>>\n"
        "readNameA2|99|chr10|100|0|5M|=|300|200|AAAAA|>>>>>\n"
    ).replace("|", "\t")
    expected_aligns = [("readNameA1", 100),
                       ("readNameA2", 100),
                       ("readNameB1", 200),
                       ("readNameA1", 300),
                       ("readNameA2", 300),
                       ("readNameB1", 400)]

    def read_aligns(bam_path):
        # Return (query_name, 1-based start) pairs and always close the
        # handle so the temporary directory can be removed cleanly.
        bamfile = samtools.alignment_file(bam_path, "rb")
        try:
            return [(a.query_name, a.reference_start + 1)
                    for a in bamfile.fetch()]
        finally:
            bamfile.close()

    with TempDirectory() as tmp_dir:
        # Case 1: absolute path.
        bam = create_bam(tmp_dir.path,
                         "input.sam",
                         sam_contents,
                         index=False)
        samtools.sort_and_index_bam(bam)
        aligns = read_aligns(bam)
        self.assertEqual(6, len(aligns))
        self.assertEqual(expected_aligns, aligns)

        # Case 2: path relative to the current working directory.
        original_dir = os.getcwd()
        try:
            os.chdir(tmp_dir.path)
            os.mkdir("tmp")
            bam = create_bam(os.path.join(tmp_dir.path, "tmp"),
                             "input.sam",
                             sam_contents,
                             index=False)
            bam_filename = os.path.basename(bam)
            samtools.sort_and_index_bam(os.path.join("tmp", bam_filename))
            aligns = read_aligns(bam)
            self.assertEqual(6, len(aligns))
            self.assertEqual(expected_aligns, aligns)
        finally:
            # Restore the cwd so later tests are unaffected.
            os.chdir(original_dir)
def _dedup_alignments(args, consensus_writer, annotated_writer, log):
    """Stream the input BAM through the family-building pipeline.

    Alignments are fetched from args.input_bam, passed through progress
    logging, filtering and coordinate pairing, grouped into coordinate
    families and then tag families; each tag family is offered to every
    family handler. All handlers are ended once the input is exhausted.

    Args:
        args: parsed options; reads input_bam, umt_distance_threshold and
            consensus_freq_threshold (and is forwarded to helper builders).
        consensus_writer: forwarded to the family handlers.
        annotated_writer: forwarded to the handlers, filter and pairer.
        log: logger used for progress messages.
    """
    log.info('reading input bam [{}]', args.input_bam)
    total_aligns = samtools.total_align_count(args.input_bam)
    family_filter = _build_family_filter(args)
    handlers = familyhandler.build_family_handlers(args,
                                                   consensus_writer,
                                                   annotated_writer,
                                                   log)

    bamfile = samtools.alignment_file(args.input_bam, 'rb')
    try:
        coord_family_holder = _CoordinateFamilyHolder()
        supplemental_log = _build_supplemental_log(coord_family_holder)
        progress_gen = _progress_logger(bamfile.fetch(),
                                        total_aligns,
                                        log,
                                        supplemental_log)
        filtered_aligns_gen = samtools.filter_alignments(progress_gen,
                                                         annotated_writer)
        paired_align_gen = _build_coordinate_pairs(filtered_aligns_gen,
                                                   annotated_writer)
        coord_family_gen = coord_family_holder.build_coordinate_families(
            paired_align_gen)

        for coord_family in coord_family_gen:
            ranked_tags = _rank_tags(coord_family)
            tag_families = _build_tag_families(coord_family,
                                               ranked_tags,
                                               args.umt_distance_threshold,
                                               args.consensus_freq_threshold,
                                               family_filter)
            for handler in handlers:
                for tag_family in tag_families:
                    handler.handle(tag_family)

        for handler in handlers:
            handler.end()
    finally:
        # Release the input handle even when a pipeline stage raises.
        bamfile.close()
def test_close_sortsAndIndexes(self):
    # Alignments are written out of positional order; after close() the
    # BAM must be fetchable and return them ordered by reference start.
    with TempDirectory() as tmp_dir:
        bam_path = os.path.join(tmp_dir.path, 'destination.bam')
        header = {
            'HD': {'VN': '1.0'},
            'SQ': [{'LN': 1575, 'SN': 'chr1'},
                   {'LN': 1584, 'SN': 'chr2'}],
        }
        align1 = ConnorAlign(mock_align(query_name='align1',
                                        reference_start=100))
        align2 = ConnorAlign(mock_align(query_name='align2',
                                        reference_start=200))
        align3 = ConnorAlign(mock_align(query_name='align3',
                                        reference_start=300))
        family_tag = BamTag('X1', 'Z', 'desc',
                            get_value=lambda family, pair, align: family)
        name_tag = BamTag('X2', 'Z', 'desc',
                          get_value=lambda family, pair, align:
                          align.query_name)

        writer = samtools.AlignWriter(header, bam_path, [family_tag, name_tag])
        # Deliberately shuffled: 300, 100, 200.
        writer.write('familyC', None, align3)
        writer.write('familyA', None, align1)
        writer.write('familyB', None, align2)
        writer.close()

        bamfile = samtools.alignment_file(bam_path, 'rb')
        actual_aligns = list(bamfile.fetch())
        bamfile.close()

        self.assertEqual(3, len(actual_aligns))
        first, second, third = actual_aligns[0], actual_aligns[1], actual_aligns[2]
        self.assertEqual('align1', first.query_name)
        self.assertEqual('align2', second.query_name)
        self.assertEqual('align3', third.query_name)
def test_write_removesTagsWhenValueIsNone(self):
    # The alignment starts with an X1 tag; the writer's X1 BamTag yields
    # None, so the written alignment is expected to carry no tags at all.
    with TempDirectory() as tmp_dir:
        bam_path = os.path.join(tmp_dir.path, 'destination.bam')
        header = {
            'HD': {'VN': '1.0'},
            'SQ': [{'LN': 1575, 'SN': 'chr1'},
                   {'LN': 1584, 'SN': 'chr2'}],
        }
        align1 = ConnorAlign(mock_align(query_name='align1'))
        align1.set_tag('X1', 'No', 'Z')
        none_tag = BamTag('X1', 'Z', 'desc',
                          get_value=lambda family, pair, align: None)

        writer = samtools.AlignWriter(header, bam_path, [none_tag])
        writer.write('familyA', None, align1)
        writer.close()

        bamfile = samtools.alignment_file(bam_path, 'rb')
        actual_aligns = list(bamfile.fetch())
        bamfile.close()

        # Flatten all tags into {(query_name, tag_name): "name:type:value"}.
        align_tags = {}
        for actual_align in actual_aligns:
            tags = actual_align.get_tags(with_value_type=True)
            for t_name, t_val, raw_type in tags:
                t_type = AlignWriterTest.fix_pysam_inconsistent_tag_type(
                    raw_type)
                key = (actual_align.query_name, t_name)
                align_tags[key] = "{}:{}:{}".format(t_name, t_type, t_val)

        self.assertEqual(1, len(actual_aligns))
        self.assertEqual(0, len(align_tags))
def _check_input_bam_valid(args, log=None): #pylint: disable=unused-argument
    """Raise utils.UsageError if args.input_bam cannot be opened as a BAM.

    A ValueError from opening (or closing) the file is translated into a
    UsageError; the `log` argument is accepted but not used.
    """
    input_bam = args.input_bam
    try:
        # Open-then-close is the whole check.
        samtools.alignment_file(input_bam, 'rb').close()
    except ValueError:
        template = ("Specified input [{}] not a valid BAM. Review "
                    "inputs and try again.")
        raise utils.UsageError(template.format(input_bam))
def _check_input_bam_not_deduped(args, log=None):
    """Flag inputs whose header already lists Connor as a program.

    Collects the PN value of every PG header entry; if
    samtools.CONNOR_PG_PN is among them, the message is handed to
    _log_force_or_raise.
    """
    bamfile = samtools.alignment_file(args.input_bam, 'rb')
    header = bamfile.header
    bamfile.close()

    program_names = set()
    for pg_item in header.get('PG', []):
        # Entries without a PN contribute None, which never matches.
        program_names.add(pg_item.get('PN', None))

    if samtools.CONNOR_PG_PN not in program_names:
        return

    msg = ('Specified input [{}] has already been processed with '
           'Connor.').format(args.input_bam)
    _log_force_or_raise(args, log, msg)
def _check_input_bam_indexed(args, log=None): #pylint: disable=unused-argument
    """Raise utils.UsageError if fetching from args.input_bam fails.

    A ValueError from fetch() is reported as a missing index. The file is
    closed in every case; the `log` argument is accepted but not used.
    """
    input_bam = args.input_bam
    bamfile = samtools.alignment_file(input_bam, 'rb')
    try:
        bamfile.fetch()
    except ValueError:
        template = ("Specified input [{}] is not indexed. Review "
                    "inputs and try again.")
        raise utils.UsageError(template.format(input_bam))
    finally:
        bamfile.close()
def _sample_bamfile(input_bam, extractor_function):
    """Collect per-strand values from a sample of alignments.

    Args:
        input_bam: path of the BAM to sample.
        extractor_function: called with each sampled alignment; its result
            is appended to that alignment's strand bucket.

    Returns:
        dict with keys 'forward' and 'reverse', each a list of extracted
        values in the order the alignments were sampled.
    """
    stats = {'forward': [], 'reverse': []}
    bamfile = samtools.alignment_file(input_bam, 'rb')
    try:
        sampled_aligns = _balanced_strand_gen(bamfile.fetch(), _SAMPLE_SIZE)
        for align in sampled_aligns:
            # Resolve the bucket before extracting, as the original did.
            bucket = stats[_strand(align)]
            bucket.append(extractor_function(align))
    finally:
        bamfile.close()
    return stats
def _check_input_bam_not_empty(args, log=None): #pylint: disable=unused-argument
    """Raise utils.UsageError if args.input_bam yields no alignments.

    Only the first alignment is requested. The file is closed in every
    case; the `log` argument is accepted but not used.
    """
    bamfile = samtools.alignment_file(args.input_bam, 'rb')
    try:
        alignments = bamfile.fetch()
        # StopIteration here means there was nothing to fetch.
        next(alignments)
    except StopIteration:
        template = "Specified input [{}] is empty"
        raise utils.UsageError(template.format(args.input_bam))
    finally:
        bamfile.close()
def _check_input_bam_paired(args, log=None): #pylint: disable=unused-argument
    """Flag inputs whose leading alignments contain no paired read.

    Inspects at most _SAMPLE_SIZE alignments, stopping at the first one
    with is_paired set. If none is found the message is handed to
    _log_force_or_raise.
    """
    bamfile = samtools.alignment_file(args.input_bam, 'rb')
    try:
        sample = itertools.islice(bamfile.fetch(), _SAMPLE_SIZE)
        # any() short-circuits on the first paired alignment.
        found_paired = any(alignment.is_paired for alignment in sample)
    finally:
        bamfile.close()

    if found_paired:
        return

    msg = ('Specified input [{}] does not appear to contain paired '
           'reads.').format(args.input_bam)
    _log_force_or_raise(args, log, msg)
def test_write_addsAlignTags(self):
    # Three tags (sourced from family, pair and alignment respectively)
    # are configured on the writer; every written alignment must carry
    # all three with the values passed to write().
    with TempDirectory() as tmp_dir:
        bam_path = os.path.join(tmp_dir.path, 'destination.bam')
        header = {
            'HD': {'VN': '1.0'},
            'SQ': [{'LN': 1575, 'SN': 'chr1'},
                   {'LN': 1584, 'SN': 'chr2'}],
        }
        align1 = ConnorAlign(mock_align(query_name='align1'))
        align2 = ConnorAlign(mock_align(query_name='align2'))
        align3 = ConnorAlign(mock_align(query_name='align3'))

        family_tag = BamTag('X1', 'Z', 'desc',
                            get_value=lambda family, pair, align: family)
        pair_tag = BamTag('X2', 'Z', 'desc',
                          get_value=lambda family, pair, align: pair)
        name_tag = BamTag('X3', 'Z', 'desc',
                          get_value=lambda family, pair, align:
                          align.query_name)

        writer = samtools.AlignWriter(header,
                                      bam_path,
                                      [family_tag, pair_tag, name_tag])
        writer.write('familyA', 'pair1', align1)
        writer.write('familyB', 'pair2', align2)
        writer.write('familyC', 'pair3', align3)
        writer.close()

        bamfile = samtools.alignment_file(bam_path, 'rb')
        actual_aligns = list(bamfile.fetch())
        bamfile.close()

        # Flatten all tags into {(query_name, tag_name): "name:type:value"}.
        align_tags = {}
        for actual_align in actual_aligns:
            tags = actual_align.get_tags(with_value_type=True)
            for t_name, t_val, raw_type in tags:
                t_type = AlignWriterTest.fix_pysam_inconsistent_tag_type(
                    raw_type)
                key = (actual_align.query_name, t_name)
                align_tags[key] = "{}:{}:{}".format(t_name, t_type, t_val)

        self.assertEqual(3, len(actual_aligns))
        expectations = [
            (('align1', 'X1'), "X1:Z:familyA"),
            (('align2', 'X1'), "X1:Z:familyB"),
            (('align3', 'X1'), "X1:Z:familyC"),
            (('align1', 'X2'), "X2:Z:pair1"),
            (('align2', 'X2'), "X2:Z:pair2"),
            (('align3', 'X2'), "X2:Z:pair3"),
            (('align1', 'X3'), "X3:Z:align1"),
            (('align2', 'X3'), "X3:Z:align2"),
            (('align3', 'X3'), "X3:Z:align3"),
        ]
        for key, expected in expectations:
            self.assertEqual(expected, align_tags[key])
def test_write(self):
    # With no tags configured, the writer must emit exactly the
    # alignments it was given, readable back from the closed BAM.
    with TempDirectory() as tmp_dir:
        bam_path = os.path.join(tmp_dir.path, "destination.bam")
        header = {
            'HD': {'VN': '1.0'},
            'SQ': [{'LN': 1575, 'SN': 'chr1'},
                   {'LN': 1584, 'SN': 'chr2'}],
        }
        align1 = ConnorAlign(mock_align(query_name="align1"))
        align2 = ConnorAlign(mock_align(query_name="align2"))
        align3 = ConnorAlign(mock_align(query_name="align3"))
        family = None

        writer = samtools.AlignWriter(header, bam_path)
        for connor_align in (align1, align2, align3):
            writer.write(family, None, connor_align)
        writer.close()

        bamfile = samtools.alignment_file(bam_path, 'rb')
        actual_query_names = []
        for align in bamfile.fetch():
            actual_query_names.append(align.query_name)
        bamfile.close()

        self.assertEqual(['align1', 'align2', 'align3'],
                         actual_query_names)
def test_build_writer(self):
    # build_writer must carry the input BAM's header over to the output
    # and append a connor @PG entry holding the version and command line.
    sam_contents = (
        "@HD|VN:1.4|GO:none|SO:coordinate\n"
        "@SQ|SN:chr10|LN:135534747\n"
        "@PG|ID:bwa|VN:0.5.5\n"
        "@PG|ID:GATK|PN:foo|VN:1.0.3471\n"
        "readNameA1|99|chr10|100|20|5M|=|300|200|AAAAA|>>>>>\n"
    ).replace("|", "\t")

    with TempDirectory() as tmp_dir:
        input_bam = create_bam(tmp_dir.path, 'input.sam', sam_contents)
        annotated_output_bam = os.path.join(tmp_dir.path, 'annotated.bam')
        tags = []
        args = Namespace(original_command_line=['command-line'],
                         simplify_pg_header=False)
        actual_writer = samtools.build_writer(input_bam,
                                              annotated_output_bam,
                                              tags,
                                              args)
        actual_writer.close()

        actual_output = samtools.alignment_file(annotated_output_bam, 'rb')
        try:
            actual_header = actual_output.header
        finally:
            # Close the handle so the temp directory can be removed.
            actual_output.close()

        expected_header = {
            'HD': {'GO': 'none',
                   'SO': 'coordinate',
                   'VN': '1.4'},
            'SQ': [{'SN': 'chr10', 'LN': 135534747}],
            'PG': [{'ID': 'bwa', 'VN': '0.5.5'},
                   {'ID': 'GATK', 'PN': 'foo', 'VN': '1.0.3471'},
                   {'ID': 'connor',
                    'PN': 'connor',
                    'VN': connor.__version__,
                    'CL': 'command-line'},
                  ],
        }
        self.assertEqual(expected_header, actual_header)
def test_write_addsHeaderTags(self):
    # The writer is given tags in X2, X1 order; the output header's CO
    # lines must keep the original comments and then list one connor
    # comment per tag, X1 before X2.
    with TempDirectory() as tmp_dir:
        bam_path = os.path.join(tmp_dir.path, 'destination.bam')
        header = {
            'HD': {'VN': '1.0'},
            'SQ': [{'LN': 1575, 'SN': 'chr1'},
                   {'LN': 1584, 'SN': 'chr2'}],
            'CO': ['comment1', 'comment2'],
        }
        family_tag = BamTag('X1', 'Z', 'annotates family', get_value=None)
        align_tag = BamTag('X2', 'Z', 'annotates alignment', get_value=None)

        writer = samtools.AlignWriter(header, bam_path, [align_tag, family_tag])
        writer.close()

        bamfile = samtools.alignment_file(bam_path, 'rb')
        actual_header = dict(bamfile.header)
        bamfile.close()

        # NOTE(review): expected_header is prepared but never compared
        # against actual_header; only the CO comments are asserted.
        expected_header = deepcopy(header)
        expected_header.pop('CO')

        expected_comments = ['comment1',
                             'comment2',
                             'connor\tBAM tag\tX1: annotates family',
                             'connor\tBAM tag\tX2: annotates alignment']
        actual_comments = actual_header.pop('CO')
        self.assertEqual(expected_comments, actual_comments)
def pysam_alignments_from_bam(bam_filename):
    """Return every record of a BAM file as a list.

    Args:
        bam_filename: path of the BAM file to read.

    Returns:
        list of the records obtained by iterating the opened file.
    """
    infile = samtools.alignment_file(bam_filename, "rb")
    try:
        return list(infile)
    finally:
        # Release the handle even if iteration fails part-way.
        infile.close()