def test_does_not_readd_headers(self):
    # Read groups that already exist in a bam header must not be added again
    from ngs_mapper.tagreads import get_bam_header
    from ngs_mapper import samtools
    self.temp_copy_files()

    # Extend the fixture's header with a custom Test read group followed by
    # the platform read groups, so the bam under test already carries them
    extra_read_groups = [
        '@RG\tID:Test\tCN:cn\tSM:sm\tPL:ILLUMINA\n',
        '@RG\tID:Roche454\tSM:312\tPL:L454\n',
        '@RG\tID:IonTorrent\tSM:312\tPL:IONTORRENT\n',
        '@RG\tID:MiSeq\tSM:312\tPL:ILLUMINA\n',
        '@RG\tID:Sanger\tSM:312\tPL:CAPILLARY\n',
    ]
    new_header = get_bam_header(self.bam) + '\n' + ''.join(extra_read_groups)

    # Dump the reads of the fixture bam as sam text below the new header
    sam_pipe = samtools.view(self.bam)
    with open('t.sam', 'w') as sam_out:
        sam_out.write(new_header)
        sam_out.write(sam_pipe.read())

    # Convert the hand-built sam file back into a bam file
    bam_pipe = samtools.view('t.sam', h=True, S=True, b=True)
    with open('hasrg.bam', 'wb') as bam_out:
        bam_out.write(bam_pipe.read())

    # Both pipes are fully consumed at this point
    bam_pipe.close()
    sam_pipe.close()

    # Run the code under test against the bam that already has read groups
    result = self._C('hasrg.bam', 'sm', 'cn')

    # The Test read group has to survive and the MiSeq one must appear once
    header_lines = result.splitlines(True)
    read_groups = [line for line in header_lines if line.startswith('@RG')]
    num_miseq = sum(1 for line in read_groups if 'ID:MiSeq\t' in line)
    num_test = sum(1 for line in read_groups if 'ID:Test\t' in line)
    eq_(1, num_miseq, "Header was duplicated which is incorrect")
    eq_(1, num_test, "Existing header was removed somehow")

    # One line per platform read group (MiSeq included) plus the HD, SQ and
    # Test RG lines
    expected_line_count = len(self.read_group_ids) + 3
    eq_(expected_line_count, len(header_lines), "Incorrect number of header lines")
def test_header_correct(self):
    # The header text returned by the call under test must come back
    # unchanged when samtools parses it from a sam file
    from ngs_mapper import samtools
    hdr = self._C(self.bam)
    with open('t.sam', 'w') as fh:
        fh.write(hdr)
    h = samtools.view('t.sam', S=True, H=True)
    try:
        # samtools output ends with a newline that hdr is compared without
        parsed_header = h.read().rstrip()
    finally:
        # Release the samtools handle even if reading fails
        h.close()
    eq_(hdr, parsed_header)
def test_seqencingcenter_argument(self):
    # Passing -CN should put that sequencing center on every read group
    from ngs_mapper import samtools
    self.temp_copy_files()
    self._C([self.bam], ['-CN', 'seqcenter'])

    # Pull the header lines back out of the bam file that was tagged
    header_pipe = samtools.view(self.bam, H=True)
    header_lines = header_pipe.readlines()
    header_pipe.close()

    # Every @RG line has to carry the sequencing center given on the
    # command line
    for line in header_lines:
        if not line.startswith('@RG'):
            continue
        ok_('CN:seqcenter\t' in line, "Sequencing center did not make it into the headers")
def count_rg(self, bam):
    '''
    Count how many reads carry each unique read group id

    :param bam: bam file handed to samtools.view (presumably a path)
    :return: dict mapping each RG tag value to the number of reads with it
    :raises KeyError: if a read has no RG tag
    '''
    from ngs_mapper import samtools
    s = samtools.view(bam)
    counts = {}
    try:
        for read in s:
            tags = dict(samtools.SamRow(read).TAGS)
            # Named rg_id so the builtin id() is not shadowed
            rg_id = tags['RG']
            counts[rg_id] = counts.get(rg_id, 0) + 1
    finally:
        # Release the samtools handle even when a read is missing its RG tag
        s.close()
    return counts
def test_does_multiple_bams(self):
    # Several bam files given in one call must each be tagged, with every
    # read group's SM set to that file's name minus the .bam extension
    from ngs_mapper import samtools
    self.temp_copy_files()

    # Make a second sample by copying the fixture bam and its .bai file
    bam2 = join(self.tempdir, 'sample2.bam')
    bai2 = join(self.tempdir, 'sample2.bam.bai')
    shutil.copy(self.bam, bam2)
    shutil.copy(self.bai, bai2)

    self._C([self.bam, bam2])

    for b in [self.bam, bam2]:
        self.check_tagreadcounts(b)
        n = basename(b).replace('.bam', '')
        s = samtools.view(b, H=True)
        try:
            rg = [
                header.split('\t')
                for header in s
                if header.startswith('@RG')
            ]
        finally:
            # Release the samtools handle before asserting on its contents
            s.close()
        for rgline in rg:
            # The third tab separated field of each @RG line is expected to
            # be the SM tag
            eq_(
                'SM:' + n,
                rgline[2],
                "Did not set {0} as SM for {1}. Header: {2}".format(
                    n, b, rgline))