Beispiel #1
0
    def test_sam_record_count(self):
        '''test sam_record_count'''
        tmpfile = 'tmp.utils.sam_record_count'
        # Fixture: two lines starting with '@' followed by 42 other lines.
        with open(tmpfile, 'w') as f:
            print('@foo bar', file=f)
            print('@PG baz', file=f)
            for _ in range(42):
                print('zaphod', file=f)

        try:
            # The two '@' lines must not be included in the count.
            self.assertEqual(42, utils.sam_record_count(tmpfile))
        finally:
            # Remove the fixture even when the assertion fails, so a failing
            # run does not leave a stray file in the working directory.
            os.unlink(tmpfile)
Beispiel #2
0
    def test_sam_record_count(self):
        """test sam_record_count"""
        tmpfile = "tmp.utils.sam_record_count"
        # Fixture: two lines starting with "@" followed by 42 other lines.
        with open(tmpfile, "w") as f:
            print("@foo bar", file=f)
            print("@PG baz", file=f)
            for _ in range(42):
                print("zaphod", file=f)

        try:
            # The two "@" lines must not be included in the count.
            self.assertEqual(42, utils.sam_record_count(tmpfile))
        finally:
            # Remove the fixture even when the assertion fails, so a failing
            # run does not leave a stray file in the working directory.
            os.unlink(tmpfile)
Beispiel #3
0
 def test_map_reads_secondary_hits_removed(self):
     """test map_reads secondary hits get removed"""
     reads1 = os.path.join(data_dir, "secondary_hits_removed.reads_1.fq")
     reads2 = os.path.join(data_dir, "secondary_hits_removed.reads_2.fq")
     ref_fasta = os.path.join(data_dir, "secondary_hits_removed.ref.fa")
     tmp_sam = "tmp.test_map_reads.sam"
     # Start from a clean slate in case an earlier run left the file behind.
     if os.path.exists(tmp_sam):
         os.unlink(tmp_sam)
     try:
         read_map.map_reads(ref_fasta, reads1, reads2, tmp_sam)
         # bwa mem reports one secondary alignment, so the 1 read pair makes
         # 3 SAM records. So should have 2 records after removing secondary match.
         self.assertEqual(2, utils.sam_record_count(tmp_sam))
     finally:
         # Clean up even if map_reads raises or the assertion fails; the
         # output may not exist if mapping failed before writing anything.
         if os.path.exists(tmp_sam):
             os.unlink(tmp_sam)
Beispiel #4
0
def map_reads(
    ref_fasta, reads1, reads2, outfile, rmdup=False, markdup=False, read_group=None, threads=1
):
    """Map paired reads with BWA MEM and drop secondary alignments.

    By default, outputs a SAM file to ``outfile`` in input read order.

    Args:
        ref_fasta: reference FASTA filename passed to ``bwa mem``.
        reads1: filename of the first reads file.
        reads2: filename of the second reads file.
        outfile: name of the final output file.
        rmdup: if True, remove duplicates using ``samtools rmdup``. Final
            output is a sorted BAM. Incompatible with ``markdup=True``.
        markdup: if True, mark duplicates using picard MarkDuplicate. Final
            output is a sorted BAM. Incompatible with ``rmdup=True``.
        read_group: None, or a tuple ``(group_id, group_name)``. If given,
            these are passed to ``bwa mem`` as the ``@RG`` ID and SM fields.
        threads: value passed to the ``bwa mem -t`` option.

    Raises:
        Error: if both ``rmdup`` and ``markdup`` are True, if the input reads
            cannot be counted, if any external tool fails, or if the number
            of records in the SAM file differs from the expected read count.

    When ``rmdup`` or ``markdup`` is set, intermediate files live in a
    temporary directory created next to ``outfile``; it is always removed
    before returning or raising.
    """
    if rmdup and markdup:
        raise Error("Cannot have rmdup and markdup both True.")

    try:
        expected_read_count = 2 * fqtools.count([reads1, reads2])
    except Exception as e:
        raise Error(
            "Error counting reads in input files " + reads1 + " " + reads2
        ) from e

    dedup = rmdup or markdup
    if dedup:
        # mkdtemp() joins `dir` and `prefix`, so the prefix must be a bare
        # file name. Using the full outfile path as prefix duplicates the
        # directory part for relative paths such as "out/x.bam".
        tmpdir = tempfile.mkdtemp(
            prefix=os.path.basename(outfile) + ".tmp.map_reads.",
            dir=os.path.dirname(outfile),
        )
        sam_file = os.path.join(tmpdir, "tmp.sam")
    else:
        tmpdir = None
        sam_file = outfile

    try:
        # "LB:LIB" is needed, otherwise samtools rmdup segfaults when map_reads_set() is used
        R_option = (
            ""
            if read_group is None
            else r"""-R '@RG\tLB:LIB\tID:"""
            + read_group[0]
            + r"""\tSM:"""
            + read_group[1]
            + "'"
        )

        cmd = " ".join(
            [
                "bwa mem -M",
                f"-t {threads}",
                R_option,
                ref_fasta,
                reads1,
                reads2,
                r""" | awk '/^@/ || !and($2,256)' """,  # remove secondary alignments (but keep header)
                ">",
                sam_file,
            ]
        )

        try:
            utils.syscall(cmd)
        except Exception as e:
            raise Error("Error running BWA MEM: " + cmd) from e

        # Sanity check: every input read should have exactly one record left
        # once secondary alignments have been filtered out.
        number_in_sam = utils.sam_record_count(sam_file)
        if expected_read_count != number_in_sam:
            raise Error(
                "Error! Mismatch in read counts. Expected "
                + str(expected_read_count)
                + " but got "
                + str(number_in_sam)
            )

        if dedup:
            sorted_bam = os.path.join(tmpdir, "tmp.sorted.bam")

            cmd = " ".join(["samtools sort", "-o", sorted_bam, sam_file])
            try:
                utils.syscall(cmd)
            except Exception as e:
                raise Error("Error running samtools sort: " + cmd) from e

            if rmdup:
                cmd = "samtools rmdup " + sorted_bam + " " + outfile
                try:
                    utils.syscall(cmd)
                except Exception as e:
                    raise Error("Error running samtools rmdup: " + cmd) from e
            else:
                try:
                    picard.mark_duplicates(sorted_bam, outfile)
                except Exception as e:
                    # Report the files involved; `cmd` still holds the
                    # samtools sort command at this point.
                    raise Error(
                        "Error running picard mark_duplicates "
                        + sorted_bam
                        + " "
                        + outfile
                    ) from e
    finally:
        # Single cleanup point: covers success, every Error above, and any
        # unexpected exception (including KeyboardInterrupt).
        if tmpdir is not None:
            shutil.rmtree(tmpdir)
Beispiel #5
0
def map_reads(ref_fasta,
              reads1,
              reads2,
              outfile,
              rmdup=False,
              markdup=False,
              read_group=None,
              threads=1):
    '''Map paired reads with BWA MEM and drop secondary alignments.

    By default, outputs a SAM file to outfile in input read order.

    ref_fasta: reference FASTA filename passed to bwa mem.
    reads1, reads2: filenames of the two reads files.
    outfile: name of the final output file.
    rmdup=True => remove duplicates using samtools rmdup. Final output is sorted bam
                  Incompatible with markdup=True
    markdup=True => mark duplicates using picard MarkDuplicate. Final output is sorted bam.
                  Incompatible with rmdup=True
    read_group should be None or a tuple (group_id, group_name). If given,
    these are passed to bwa mem as the @RG ID and SM fields.
    threads: value passed to the bwa mem -t option (1 matches bwa's default).

    Raises Error if both rmdup and markdup are True, if the input reads
    cannot be counted, if any external tool fails, or if the number of
    records in the SAM file differs from the expected read count.

    When rmdup or markdup is set, intermediate files live in a temporary
    directory created next to outfile; it is always removed before
    returning or raising.'''
    if rmdup and markdup:
        raise Error('Cannot have rmdup and markdup both True.')

    try:
        expected_read_count = 2 * fqtools.count([reads1, reads2])
    except Exception as e:
        raise Error('Error counting reads in input files ' + reads1 + ' ' +
                    reads2) from e

    dedup = rmdup or markdup
    if dedup:
        # mkdtemp() joins `dir` and `prefix`, so the prefix must be a bare
        # file name. Using the full outfile path as prefix duplicates the
        # directory part for relative paths such as 'out/x.bam'.
        tmpdir = tempfile.mkdtemp(prefix=os.path.basename(outfile) +
                                  '.tmp.map_reads.',
                                  dir=os.path.dirname(outfile))
        sam_file = os.path.join(tmpdir, 'tmp.sam')
    else:
        tmpdir = None
        sam_file = outfile

    try:
        # "LB:LIB" is needed, otherwise samtools rmdup segfaults when map_reads_set() is used
        if read_group is None:
            R_option = ''
        else:
            R_option = (r'''-R '@RG\tLB:LIB\tID:''' + read_group[0] +
                        r'''\tSM:''' + read_group[1] + "'")

        cmd = ' '.join([
            'bwa mem -M',
            '-t ' + str(threads),
            R_option,
            ref_fasta,
            reads1,
            reads2,
            r''' | awk '/^@/ || !and($2,256)' ''',  # remove secondary alignments (but keep header)
            '>',
            sam_file
        ])

        try:
            utils.syscall(cmd)
        except Exception as e:
            raise Error('Error running BWA MEM: ' + cmd) from e

        # Sanity check: every input read should have exactly one record left
        # once secondary alignments have been filtered out.
        number_in_sam = utils.sam_record_count(sam_file)
        if expected_read_count != number_in_sam:
            raise Error('Error! Mismatch in read counts. Expected ' +
                        str(expected_read_count) + ' but got ' +
                        str(number_in_sam))

        if dedup:
            sorted_bam = os.path.join(tmpdir, 'tmp.sorted.bam')

            cmd = ' '.join(['samtools sort', '-o', sorted_bam, sam_file])
            try:
                utils.syscall(cmd)
            except Exception as e:
                raise Error('Error running samtools sort: ' + cmd) from e

            if rmdup:
                cmd = 'samtools rmdup ' + sorted_bam + ' ' + outfile
                try:
                    utils.syscall(cmd)
                except Exception as e:
                    raise Error('Error running samtools rmdup: ' + cmd) from e
            else:
                try:
                    picard.mark_duplicates(sorted_bam, outfile)
                except Exception as e:
                    # Report the files involved; `cmd` still holds the
                    # samtools sort command at this point.
                    raise Error('Error running picard mark_duplicates ' +
                                sorted_bam + ' ' + outfile) from e
    finally:
        # Single cleanup point: covers success, every Error above, and any
        # unexpected exception (including KeyboardInterrupt).
        if tmpdir is not None:
            shutil.rmtree(tmpdir)