def map_and_parse_sam(ref_index,
                      query_fa,
                      tags,
                      qry_or_ref,
                      ops,
                      get_unique=True):
    '''Map the sequences in query_fa to ref_index with bowtie2 and store hits.

    The SAM output is written to ops.outprefix + '.maptags.sam' and deleted
    after every record has been read.

    A record is kept only if it is mapped and its AS tag value is 0.  When
    get_unique is true, bowtie2 is run without extra reporting options and
    a record is also discarded if it has an XS tag whose value is >= 0.
    When get_unique is false, bowtie2 is run with '-a --score-min L,0,0'
    and the XS tag is ignored.

    Every kept record is added as Hit(rname, pos, strand) to
    tags[record id].qry_hits if qry_or_ref is 'qry', or to
    tags[record id].ref_hits if it is 'ref'.  Any other value of qry_or_ref
    prints an error to stderr and exits with status 1 as soon as the first
    kept record is reached.

    Every record id in the SAM file is asserted to be a key of tags.
    '''
    samfile = ops.outprefix + '.maptags.sam'

    # The two modes differ only by the options placed in front of ' -f -x '.
    extra_opts = '' if get_unique else ' -a --score-min L,0,0'
    utils.syscall(external_progs.bowtie2_align + extra_opts + ' -f -x ' +
                  ref_index + ' -U ' + query_fa + ' -S ' + samfile)

    for record in sam.file_reader(samfile):
        assert record.id in tags

        # Keep only mapped records with an alignment score (AS) of zero
        if not record.is_mapped() or record.tags['AS'][1] != 0:
            continue

        # In unique mode, drop records whose XS tag value is not negative
        if get_unique and 'XS' in record.tags and record.tags['XS'][1] >= 0:
            continue

        if qry_or_ref == 'qry':
            hit_set = tags[record.id].qry_hits
        elif qry_or_ref == 'ref':
            hit_set = tags[record.id].ref_hits
        else:
            print('Error parsing SAM', file=sys.stderr)
            sys.exit(1)

        hit_set.add(Hit(record.rname, record.pos, record.query_strand()))

    os.unlink(samfile)
 def test_file_reader_sam(self):
     '''Records read from sam_unittest.bam and printed one per line should
     reproduce sam_unittest.sam exactly'''
     expected_sam = 'sam_unittest.sam'
     actual_sam = 'tmp.sam'

     # Dump every record from the BAM file as text
     out_handle = utils.open_file_write(actual_sam)
     for record in sam.file_reader('sam_unittest.bam'):
         print(record, file=out_handle)
     utils.close(out_handle)

     files_match = filecmp.cmp(expected_sam, actual_sam)
     self.assertTrue(files_match)
     os.unlink(actual_sam)
def map_and_parse_sam(ref_index, tags_fasta, tag_counts, log_fh):
    '''Map tags_fasta to ref_index with bowtie2 and fill in tag_counts.

    Each SAM record id is split at its last ':'; the part before it is used
    as the key into tag_counts (an id without ':' raises ValueError).  The
    key is asserted not to be in tag_counts already.  Its value is set to 1
    when the record is mapped, has an AS tag value of 0 and either has no XS
    tag or an XS tag value below 0; otherwise the value is set to 2.

    The SAM file name is built from the module-level options.outprefix and
    the file is deleted after parsing.  log_fh is accepted but not used.
    '''
    samfile = options.outprefix + '.maptags.sam'
    bowtie2_cmd = (external_progs.bowtie2_align + ' -f -x ' + ref_index +
                   ' -U ' + tags_fasta + ' -S ' + samfile)
    utils.syscall(bowtie2_cmd)

    for record in sam.file_reader(samfile):
        # Only the prefix before the final ':' is needed; the suffix is
        # unpacked so that an id with no ':' still fails loudly.
        contig_name, _suffix = record.id.rsplit(':', 1)
        assert contig_name not in tag_counts

        is_unique_perfect = (
            record.is_mapped()
            and record.tags['AS'][1] == 0
            and ('XS' not in record.tags or record.tags['XS'][1] < 0))
        tag_counts[contig_name] = 1 if is_unique_perfect else 2

    os.unlink(samfile)
Beispiel #4
0
    description = 'Given a fasta/q file of reads, and a second fasta of vector sequences, trims the vectors off the reads. Made specifically for assembled capillary read pairs - uses BWA for mapping. Untested on short reads or unassembled read pairs',
    usage = '%(prog)s [options] <reads fasta/q> <vectors fasta> <outprefix>')
# Command-line interface: one optional join distance plus three positionals
# (reads file, vectors file, output prefix).
parser.add_argument('--join_distance', type=int, help='Join hits at most this many bases apart [%(default)s]', metavar='INT', default=100)
parser.add_argument('reads_in', help='Name of input fasta/q file of reads', metavar='reads fasta/q')
parser.add_argument('vectors_in', help='Name of input fasta file of vectors', metavar='vectors fasta')
parser.add_argument('outprefix', help='Prefix of names of ouput files')
options = parser.parse_args()

# Build a BWA index of the vectors, then map the reads to it with 'bwa bwasw',
# writing the alignments to <outprefix>.map_reads.sam
bwa_index = options.outprefix + '.bwa_index'
bwa_sam = options.outprefix + '.map_reads.sam'
utils.syscall(' '.join([external_progs.bwa, 'index -p', bwa_index, options.vectors_in]))
utils.syscall(' '.join([external_progs.bwa, 'bwasw -f', bwa_sam, bwa_index, options.reads_in]))

read_hit_coords = {} # id -> [(start, end), (start, end), ...]

sam_reader = sam.file_reader(bwa_sam)

for sam_record in sam_reader:
    # Unmapped reads have no vector hit to record
    if not sam_record.is_mapped():
        continue

    # Reverse the cigar for reverse-strand records
    # (presumably so its operations follow the original read orientation - TODO confirm)
    if not sam_record.is_forward_strand():
        sam_record.cigar.reverse()

    # By default the hit spans the whole read sequence
    hit_start = 1
    hit_end = len(sam_record.seq)

    # A leading 'S' operation moves the hit start to that operation's number.
    # NOTE(review): if hit_start is 1-based, the first aligned base after a
    # leading clip of n bases would be n + 1 - confirm this is intended.
    if sam_record.cigar.operations[0].operator == 'S':
        hit_start = sam_record.cigar.operations[0].number

    if sam_record.cigar.operations[-1].operator == 'S':
Beispiel #5
0
import argparse
import fastn
import sam
import utils

# Command-line interface: three positional arguments (bam, reference, output)
parser = argparse.ArgumentParser(
    description=('Report positions in the reference where any read had an '
                 'error (i.e. difference between read and reference)'),
    usage='%(prog)s [options] <in.bam> <reference.fasta> <outfile>')
for arg_name, arg_help in (('bam_in', 'Name of input bam file'),
                           ('fasta_in', 'Name of reference fasta file'),
                           ('outfile', 'Name of output file')):
    parser.add_argument(arg_name, help=arg_help)
options = parser.parse_args()

sam_reader = sam.file_reader(options.bam_in)
errors = {}  # reference name -> {difference -> number of records reporting it}
ref_seqs = {}  # filled in place by fastn.file_to_dict
fastn.file_to_dict(options.fasta_in, ref_seqs)

# Count, per reference sequence, how often each difference is reported by a
# mapped record.
for sam_record in sam_reader:
    if sam_record.is_mapped():
        new_errors = sam_record.get_differences_from_ref(
            ref_seqs[sam_record.rname])
        contig_errors = errors.setdefault(sam_record.rname, {})
        for e in new_errors:
            contig_errors[e] = contig_errors.get(e, 0) + 1
# Map the tags to the scaffolds with bowtie2, convert the SAM output to BAM
# and sort it.  samfile and tags_fa_file are defined outside this section.
bamfile = options.outprefix + '.map_tags.bam'
sorted_bamfile = options.outprefix + '.map_tags.sorted.bam'
external_progs.index_with_bowtie2(options.scaffolds_fa)
#utils.syscall(external_progs.bowtie2_align + ' -f -a --score-min L,0,0 -x ' + options.scaffolds_fa + ' -U ' + tags_fa_file + ' -S ' + samfile)
utils.syscall(external_progs.bowtie2_align + ' -f -x ' + options.scaffolds_fa +
              ' -U ' + tags_fa_file + ' -S ' + samfile)
utils.syscall('samtools view -T ' + options.scaffolds_fa + ' -bS ' + samfile +
              ' > ' + bamfile)
# The SAM file is no longer needed once the BAM file has been written
os.unlink(samfile)
# sorted_bamfile[0:-4] drops the trailing '.bam', so a prefix is passed here
# (presumably the legacy 'samtools sort <in.bam> <out.prefix>' form - confirm
# against the installed samtools version)
utils.syscall('samtools sort ' + bamfile + ' ' + sorted_bamfile[0:-4])
#os.unlink(bamfile)

# Load the hits into memory
previous_sam = None
previous_tag = None
sam_reader = sam.file_reader(sorted_bamfile)
# Counters keyed by the integers below, all starting at zero
flag_counts = {k: 0 for k in [0, 1, 2, 4, 5, 8, 12, 16]}
tags_from_bam = set()  # ids of every mapped record seen
tag_distances = []
f_log = utils.open_file_write(options.outprefix + '.log')
f_tags_and_sam = utils.open_file_write(options.outprefix + '.tags_and_sam.gz')
skipped_tags = 0

for current_sam in sam_reader:
    if current_sam.is_mapped():
        tags_from_bam.add(current_sam.id)
        # Log mapped records whose alignment score (AS) is not zero
        if current_sam.tags['AS'][1] != 0:
            print('Nonzero alignemnt score', current_sam, file=f_log)
        # Log records whose XS tag value is at least as high as their AS value
        if 'XS' in current_sam.tags and current_sam.tags['XS'][
                1] >= current_sam.tags['AS'][1]:
            print('Non-unique best hit', current_sam, file=f_log)
# Map every unique tag to the second set of sequences and record, per tag,
# how many mapped records it produced and where it mapped.
second_coords = {}  # tag id -> [reference name, sam_record.pos + 1]
tag_counts = {}  # tag id -> number of mapped records, or -1 if unmapped

if options.second_fasta:
    # Write the tags to a temporary FASTA file, named by t[0] with sequence t[3]
    tags_tmp_fa = options.outprefix + '.tags.tmp.fa'
    f = utils.open_file_write(tags_tmp_fa)
    for t in unique_tags:
        print('>' + t[0] + '\n' + t[3], file=f)
    utils.close(f)

    # Map the tags with bowtie2; the FASTA file is not needed afterwards
    samfile = options.outprefix + '.maptags.sam'
    utils.syscall(external_progs.bowtie2_align + ' -f -x ' +
                  second_seqs_index + ' -U ' + tags_tmp_fa + ' -S ' + samfile)
    os.unlink(tags_tmp_fa)

    sam_reader = sam.file_reader(samfile)
    for sam_record in sam_reader:
        if sam_record.is_mapped():
            tag_counts[sam_record.id] = tag_counts.get(sam_record.id, 0) + 1
            # pos + 1: presumably converts a 0-based position to 1-based
            # (TODO confirm against sam.py)
            second_coords[sam_record.id] = [
                sam_record.rname, sam_record.pos + 1
            ]
        else:
            # Key by this record's own id.  The previous code used
            # contig_name, which is never assigned in this section, so an
            # unmapped tag either raised NameError or marked the wrong key.
            tag_counts[sam_record.id] = -1

    # Remove the SAM file and the bowtie2 index files of the second sequences
    os.unlink(samfile)
    for ext in ['1.bt2', '2.bt2', '3.bt2', '4.bt2', 'rev.1.bt2', 'rev.2.bt2']:
        os.unlink(second_seqs_index + '.' + ext)