def map_and_parse_sam(ref_index,
                      query_fa,
                      tags,
                      qry_or_ref,
                      ops,
                      get_unique=True):
    '''Map tags with bowtie2 and store the perfect hits found for each tag.

    bowtie2 is run on query_fa against ref_index, writing a temporary SAM
    file called ops.outprefix + '.maptags.sam'. Every mapped record whose AS
    tag value is 0 is turned into Hit(rname, pos, strand) and added to
    tags[record id].qry_hits or tags[record id].ref_hits.

    Arguments:
        ref_index: name of the bowtie2 index (given to -x)
        query_fa: FASTA file of sequences to map (given to -U)
        tags: dictionary of record id -> object that has qry_hits and
            ref_hits sets. Every id in the SAM file must be a key.
        qry_or_ref: 'qry' or 'ref', chooses which set the hits are added to
        ops: object whose outprefix attribute is the prefix of the SAM file
        get_unique: if true, a hit is only kept when the record has no XS
            tag or a negative XS value. If false, bowtie2 is run with
            '-a --score-min L,0,0' and every hit with AS of 0 is kept.

    If a hit is about to be stored and qry_or_ref is neither 'qry' nor 'ref',
    an error is printed to stderr and the program exits with status 1.
    The temporary SAM file is removed however the parsing ends.
    '''
    samfile = ops.outprefix + '.maptags.sam'

    # The two modes only differ by the extra options used to report all
    # alignments that have a score of zero
    extra_opts = '' if get_unique else ' -a --score-min L,0,0'
    utils.syscall(external_progs.bowtie2_align + extra_opts + ' -f -x ' +
                  ref_index + ' -U ' + query_fa + ' -S ' + samfile)

    try:
        for sam_record in sam.file_reader(samfile):
            assert sam_record.id in tags

            # only want perfect hits. Per the bowtie2 manual, 0 is the best
            # possible AS score in end-to-end mode
            if not (sam_record.is_mapped()
                    and sam_record.tags['AS'][1] == 0):
                continue

            # XS is the score of the best other alignment, so a missing or
            # negative XS means no second perfect hit was found
            if get_unique and not ('XS' not in sam_record.tags
                                   or sam_record.tags['XS'][1] < 0):
                continue

            if qry_or_ref == 'qry':
                hits = tags[sam_record.id].qry_hits
            elif qry_or_ref == 'ref':
                hits = tags[sam_record.id].ref_hits
            else:
                print('Error parsing SAM', file=sys.stderr)
                sys.exit(1)

            hits.add(
                Hit(sam_record.rname, sam_record.pos,
                    sam_record.query_strand()))
    finally:
        # always clean up, including when an exception or sys.exit() ends
        # the loop early
        os.unlink(samfile)
def map_and_parse_sam(ref_index, tags_fasta, tag_counts, log_fh):
    '''Map the tags in tags_fasta with bowtie2 and fill in tag_counts.

    Each SAM record id is split at its last ':' and the part before it is
    used as the contig name. tag_counts[contig name] is set to 1 when the
    record is mapped, has an AS value of 0 and has no XS tag (or a negative
    XS value). Otherwise it is set to 2. A contig name must not be seen
    more than once.

    The SAM file is written to options.outprefix + '.maptags.sam' (options
    is a module-level name) and deleted after parsing. log_fh is not used.
    '''
    samfile = options.outprefix + '.maptags.sam'

    # an earlier version used smalt for this mapping step, now it is bowtie2
    utils.syscall(' '.join([
        external_progs.bowtie2_align, '-f', '-x', ref_index, '-U',
        tags_fasta, '-S', samfile
    ]))

    for record in sam.file_reader(samfile):
        # only the contig name is needed, the part after the ':' is ignored
        contig_name, _coords = record.id.rsplit(':', 1)
        assert contig_name not in tag_counts

        perfect_and_unique = (
            record.is_mapped()
            and record.tags['AS'][1] == 0
            and ('XS' not in record.tags or record.tags['XS'][1] < 0))
        tag_counts[contig_name] = 1 if perfect_and_unique else 2

    os.unlink(samfile)
Beispiel #3
0
    def test_system_call(self):
        '''syscall should run a good command and raise utils.Error on a bad one; syscall_get_stdout should return the output'''
        reference = 'system_call_test.txt'
        scratch = 'utils_unittest_syscall.tmp'

        # a command that works: the copy must be the same as the reference file
        utils.syscall(' '.join(['cat', reference, '>', scratch]))
        self.assertTrue(filecmp.cmp(scratch, reference))
        os.unlink(scratch)

        # a command that does not exist must raise an error
        with self.assertRaises(utils.Error):
            utils.syscall('thisisveryunlikelytoebarealcommandandshouldthrowerror')

        # a file with other contents must not compare equal to the reference
        utils.syscall('echo "this is not the right string" > ' + scratch)
        self.assertFalse(filecmp.cmp(scratch, reference))
        os.unlink(scratch)

        # stdout of the command is expected back as a list of lines
        self.assertListEqual(["bingo"], utils.syscall_get_stdout('echo bingo'))
# Run cap3 on each cluster of reads in turn. Clusters are numbered from 1 in
# the names of the files and of the assembled contigs.
for i in range(len(clusters)):
    # write the reads of this cluster to their own file
    reads_file = options.outfile + '.cluster.' + str(i + 1)
    f = utils.open_file_write(reads_file)
    for id in clusters[i]:
        # NOTE(review): this assignment looks redundant - both branches of the
        # if/else below assign seq again
        seq = all_seqs[id]
        if strands[id] == '-':
            # work on a copy when reverse complementing (presumably so that
            # the sequence stored in all_seqs is not changed)
            seq = copy.copy(all_seqs[id])
            seq.revcomp()
        else:
            seq = all_seqs[id]

        print(seq, file=f)
    utils.close(f)

    utils.syscall('cap3 ' + reads_file)
    singlet_count = fastn.count_sequences(reads_file + '.cap.singlets')
    contig_count = fastn.count_sequences(reads_file + '.cap.contigs')
    # keep the assembly only when cap3 made exactly one contig and left no
    # singlets
    if singlet_count == 0 and contig_count == 1:
        seq_reader = fastn.file_reader(reads_file + '.cap.contigs')
        for seq in seq_reader:
            seq.id = 'cluster.' + str(i + 1) + '.contig'
            assembled_seqs.append(copy.copy(seq))

        # remove the cap3 output files and the reads file of this cluster
        for e in [
                'ace', 'contigs.links', 'contigs.qual', 'info', 'singlets',
                'contigs'
        ]:
            os.unlink(reads_file + '.cap.' + e)
        os.unlink(reads_file)
    else:
print('Got', len(tags), 'tags', file=sys.stderr)

# Sort the tags of each chromosome and store each tag's position in the
# sorted list on the tag itself
for chr, l in tags_by_chr.items():
    l.sort()
    for i, ordered_tag in enumerate(l):
        ordered_tag.ordered_index = i

# Map the tags to the scaffolds with bowtie2 (default reporting mode, not
# '-a --score-min L,0,0'), then turn the SAM output into a sorted BAM file
samfile = options.outprefix + '.map_tags.sam'
bamfile = options.outprefix + '.map_tags.bam'
sorted_bamfile = options.outprefix + '.map_tags.sorted.bam'
external_progs.index_with_bowtie2(options.scaffolds_fa)
utils.syscall(' '.join([
    external_progs.bowtie2_align, '-f', '-x', options.scaffolds_fa, '-U',
    tags_fa_file, '-S', samfile
]))
utils.syscall(' '.join([
    'samtools view -T', options.scaffolds_fa, '-bS', samfile, '>', bamfile
]))
os.unlink(samfile)
# samtools sort is given the output name without the '.bam' on the end.
# The unsorted BAM file is not deleted.
utils.syscall(' '.join(['samtools sort', bamfile, sorted_bamfile[:-4]]))

# Set up everything needed to load the hits into memory
previous_sam = None
previous_tag = None
sam_reader = sam.file_reader(sorted_bamfile)
flag_counts = dict.fromkeys([0, 1, 2, 4, 5, 8, 12, 16], 0)
tags_from_bam = set()
tag_distances = []
f_log = utils.open_file_write(options.outprefix + '.log')
f_tags_and_sam = utils.open_file_write(options.outprefix + '.tags_and_sam.gz')
          'c(',
          ','.join(str(x) for x in y_coords),
          '), ',
          'xlab="Correct joins", ',
          'ylab="Incorrect joins", ',
          'xlim=c(0,',
          x_max,
          '), ',
          'ylim=c(0,',
          y_max,
          '), ',
          'col=',
          r_colour_vector,
          ', ',
          'pch=',
          r_symbol_vector,
          ', ',
          'bg=',
          r_colour_vector,
          ')',
          sep='',
          file=f)

    print(r_legend, file=f)

    print('dev.off()', file=f)

utils.close(f)

utils.syscall('R CMD BATCH ' + r_script)
Beispiel #7
0
'''
MIT License

Copyright (c) 2017 William Ivanski

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

import sys
from utils import syscall, syscall_bg

import shlex

# Usage: <script> <run_entry> <link_prefix>
# Creates the symbolic link cron/<link_prefix>_<run_entry>, which points at
# ../run/<run_entry>.
if len(sys.argv) < 3:
    # stop with a readable message instead of an IndexError traceback
    sys.exit('usage: {0} <run_entry> <link_prefix>'.format(sys.argv[0]))

# The command is built as a single string, so quote the user supplied parts
# to stop spaces or shell metacharacters changing what is run. Names with no
# special characters are not altered by the quoting.
out = syscall('ln -s ../run/{0} cron/{1}'.format(
    shlex.quote(sys.argv[1]),
    shlex.quote(sys.argv[2] + '_' + sys.argv[1])))
print('\n'.join(out))
utils.close(f)

# Each entry of scaffolds is a (coverage, list of contig names) pair. Make
# the scaffold by joining its contigs, write it to a FASTA file, simulate
# reads from that file, then delete the FASTA file.
for i, (coverage, contig_list) in enumerate(scaffolds):
    scaff_name = 'scaff.' + str(i + 1)
    scaff_fname = outprefix + '.' + scaff_name + '.fa'
    seq = fastn.Fasta(scaff_name,
                      ''.join(contig_seqs[c].seq for c in contig_list))
    f = utils.open_file_write(scaff_fname)
    print(seq, file=f)
    utils.close(f)

    reads_fname = scaff_fname + '.reads.fq'
    reads_fastq_files.append(reads_fname)
    # coverage goes into the command as it is, so it has to be a string
    cmd = ' '.join([
        'fastn_to_perfect_reads.py', scaff_fname, reads_fname, '500', '30',
        coverage, '76'
    ])
    utils.syscall(cmd)
    os.unlink(scaff_fname)

# Join the reads files of all the scaffolds into one file, deleting each
# one after it has been copied
reads_fastq = outprefix + '.reads.fq'
fout = utils.open_file_write(reads_fastq)
for fname in reads_fastq_files:
    with open(fname) as infile:
        for line in infile:
            fout.write(line)

    os.unlink(fname)

utils.close(fout)

# make deinterleaved fastq files
Beispiel #9
0
def run_r_script(script):
    '''Run script with R CMD BATCH, then delete the script and its output file.

    Nothing happens when options.noplots is true (options is a module-level
    name). The output file is taken to be called script + 'out', which is the
    name R CMD BATCH uses for a script ending in '.R' (foo.R -> foo.Rout).
    '''
    if options.noplots:
        return

    utils.syscall(' '.join(['R', 'CMD', 'BATCH', script]))
    for leftover in (script + 'out', script):
        os.unlink(leftover)
Beispiel #10
0
    #print(insert_pdf(outprefix + '.roc_with_chulls.pdf'), file=f_tex)
    print('', file=f_tex)
    print(r'''\noindent''', file=f_tex)
    print(insert_pdf(outprefix + '.skipped_tags_barchart.pdf'), file=f_tex)
    print(insert_pdf(outprefix + '.lost_tags_barchart.pdf'), file=f_tex)
    print('', file=f_tex)
    print(r'''\noindent''', file=f_tex)
    print(insert_pdf(outprefix + '.cpu.pdf'), file=f_tex)
    print(insert_pdf(outprefix + '.mem.pdf'), file=f_tex)
    #print(insert_pdf(outprefix + '.percent_good_joins_barchart.pdf'), file=f_tex)


# Finish off the LaTeX file and close it
print(r'''\end{document}''', file=f_tex)
utils.close(f_tex)
if not options.noplots:
    # pdflatex is run twice on the same file (presumably so that references
    # made in the first pass are resolved in the second - TODO confirm)
    utils.syscall('pdflatex ' + texfile)
    utils.syscall('pdflatex ' + texfile)


#def get_data_by_scaffolder(data_type):
#    d = {s: [] for s in scaffolders}
#
#    for test_type in test_data_types:
#        for scaff in scaffolders:
#            d[scaff].append(results[test_type].results[scaff][data_type])
#
#    return d
#
#percent_good_by_scaff = get_data_by_scaffolder('% correct joins')

    default=None)
# positional arguments: the input FASTA file and the prefix of the output files
parser.add_argument('fasta_in',
                    help='Name of input fasta file',
                    metavar='in.fasta')
parser.add_argument('outprefix', help='Prefix of output files')
options = parser.parse_args()

# load the input sequences into a dictionary
untagged_seqs = {}
fastn.file_to_dict(options.fasta_in, untagged_seqs)

second_seqs = {}
unique_tags = []

# make a bowtie2 index of the input sequences (smalt was used before, see the
# commented out call)
seqs_index = options.outprefix + '.seqs.bowtie2.index'
#utils.syscall('smalt index -k 20 -s 10 ' + seqs_smalt_index + ' ' + options.fasta_in)
utils.syscall('bowtie2-build ' + options.fasta_in + ' ' + seqs_index)

# the second FASTA file is optional. Index it as well when it was given,
# otherwise record that there is no second index
if options.second_fasta:
    #second_seqs_smalt_index = options.outprefix + '.second_seqs_smalt_index'
    second_seqs_index = options.outprefix + '.second_seqs_bowtie2_index'
    #utils.syscall('smalt index -k 20 -s 10 ' + second_seqs_smalt_index + ' ' + options.second_fasta)
    utils.syscall('bowtie2-build ' + options.second_fasta + ' ' +
                  second_seqs_index)
else:
    second_seqs_index = None

uniquely_tagged = {}
f_log = utils.open_file_write(options.outprefix + '.log')

for tag_length in range(options.min_tag_length, options.max_tag_length + 1,
                        options.tag_step):
# load the reference sequences into a dictionary
ref_seqs = {}
fastn.file_to_dict(options.ref_fa, ref_seqs)

# NOTE(review): the MUMmer programs are looked for under the home directory of
# the user 'mh12', so this only works where that directory exists
mummer_dir = os.path.join(os.path.expanduser('~mh12'), 'bin', 'MUMmer3.23')
nucmer_exe = os.path.join(mummer_dir, 'nucmer')
delta_filter = os.path.join(mummer_dir, 'delta-filter')
show_coords = os.path.join(mummer_dir, 'show-coords')

# names of the files made by the three commands run below
nucmer_out_prefix = options.outprefix + '.nucmer'
nucmer_out_delta = nucmer_out_prefix + '.delta'
nucmer_out_filter = nucmer_out_prefix + '.delta-filter'
nucmer_out_coords = nucmer_out_filter + '.coords'

# run nucmer of contigs vs ref
utils.syscall(' '.join([
    nucmer_exe, options.nucmer_options, '-p', nucmer_out_prefix,
    options.ref_fa, options.contigs_fa
]))
# filter the alignments. Per the MUMmer manual: -i is the minimum identity,
# -l the minimum alignment length and -q keeps the best hit of each query
# position
utils.syscall(' '.join([
    delta_filter, '-i 98 -l 180 -q', nucmer_out_delta, '>', nucmer_out_filter
]))
# write the filtered alignments as a coords file. Per the MUMmer manual, -T
# gives tab-delimited output and -r sorts by reference
utils.syscall(' '.join(
    [show_coords, '-dTlro', nucmer_out_filter, '>', nucmer_out_coords]))

# load hits into hash. key=ref_name, value=another hash with key=qry_name, value=list of hit positions in that ref seq
nucmer_hits = {}
contigs_to_print = {}

nucmer_reader = nucmer.file_reader(nucmer_out_coords)

for hit in nucmer_reader:
    if hit.ref_name not in nucmer_hits:
            else:
                nodes[sam1.id].add(sam2.id)
                nodes[sam2.id].add(sam1.id)

print(nodes)

# Make one piece of the graph description for each node, in sorted order of
# node name: either all of its edges joined by ';', or just the name of the
# node when it has no edges
statements = []
for node, l in sorted(nodes.items()):
    if l:
        statements.append(';'.join(node + '->' + x for x in l))
    else:
        statements.append(node)

# The pieces go inside 'digraph G {...}', which is echoed into dot to make
# the PDF file
cmd = ''.join([
    'echo "digraph G {',
    ';'.join(statements),
    '}"  | dot -Tpdf > ',
    options.outprefix,
    '.pdf',
])

# Write the command to a shell script and run that script with bash
make_graph = options.outprefix + '.make_graph.sh'
f = utils.open_file_write(make_graph)
print(cmd, file=f)
utils.close(f)
utils.syscall('bash ' + make_graph)

Beispiel #14
0
import sam
import genome_intervals
import external_progs

# Command line: an optional join distance, then the reads file, the vectors
# file and the prefix of the output files
parser = argparse.ArgumentParser(
    description=('Given a fasta/q file of reads, and a second fasta of vector '
                 'sequences, trims the vectors off the reads. Made specifically '
                 'for assembled capillary read pairs - uses BWA for mapping. '
                 'Untested on short reads or unassembled read pairs'),
    usage='%(prog)s [options] <reads fasta/q> <vectors fasta> <outprefix>')
parser.add_argument('--join_distance', type=int, help='Join hits at most this many bases apart [%(default)s]', metavar='INT', default=100)
parser.add_argument('reads_in', help='Name of input fasta/q file of reads', metavar='reads fasta/q')
parser.add_argument('vectors_in', help='Name of input fasta file of vectors', metavar='vectors fasta')
parser.add_argument('outprefix', help='Prefix of names of output files')
options = parser.parse_args()

# Index the vectors with BWA, then map the reads to them with bwasw. The
# alignments are written to a SAM file.
bwa_index = options.outprefix + '.bwa_index'
bwa_sam = options.outprefix + '.map_reads.sam'
utils.syscall(' '.join([external_progs.bwa, 'index -p', bwa_index, options.vectors_in]))
utils.syscall(' '.join([external_progs.bwa, 'bwasw -f', bwa_sam, bwa_index, options.reads_in]))

read_hit_coords = {} # id -> [(start, end), (start, end), ...]

sam_reader = sam.file_reader(bwa_sam)

for sam_record in sam_reader:
    if not sam_record.is_mapped():
        continue

    if not sam_record.is_forward_strand():
        sam_record.cigar.reverse()

    hit_start = 1
    hit_end = len(sam_record.seq)
Beispiel #15
0
'''
MIT License

Copyright (c) 2017 William Ivanski

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''


import sys
from utils import syscall, syscall_bg

import shlex

# Usage: <script> <run_entry>
# Removes every file in cron/ whose name ends with _<run_entry>.
if len(sys.argv) < 2:
    # stop with a readable message instead of an IndexError traceback
    sys.exit('usage: {0} <run_entry>'.format(sys.argv[0]))

# Only the user supplied end of the name is quoted. The 'cron/*_' part is left
# unquoted so that the '*' is still expanded. Names with no special characters
# are not altered by the quoting.
out = syscall('rm -f cron/*_' + shlex.quote(sys.argv[1]))
print('\n'.join(out))
Beispiel #16
0
def index_with_bowtie2(file):
    '''Make a bowtie2 index of file, unless is_bowtie2_indexed(file) is already true.

    The name of the file is also used as the name of the index.
    '''
    if not is_bowtie2_indexed(file):
        build_cmd = ' '.join([bowtie2_build, file, file])
        utils.syscall(build_cmd)
    for id in d:
        for interval in d[id]:
            print(id, interval.start+1, interval.end+1, sep='\t', file=f)

    utils.close(f)

# run nucmer
# The nucmer, delta-filter and show-coords commands are written to a shell
# script, one per line, and the script is then run with bash
nucmer_outprefix = options.outprefix + '.nucmer'
nucmer_script = nucmer_outprefix + '.sh'
nucmer_coords = nucmer_outprefix + '.coords'
f = utils.open_file_write(nucmer_script)
print(external_progs.nucmer, options.nucmer_ops, '-p', nucmer_outprefix, options.reference, options.assembly, file=f)
print(external_progs.delta_filter, options.df_ops, nucmer_outprefix + '.delta >', nucmer_outprefix + '.filter', file=f)
print(external_progs.show_coords, '-dTlro', nucmer_outprefix + '.filter >', nucmer_coords, file=f)
utils.close(f)
utils.syscall('bash ' + nucmer_script)

# gather the results
# the same helper is used on the reference and on the assembly
ref_lengths, ref_gaps = get_gaps_and_lengths(options.reference)
assembly_lengths, assembly_gaps = get_gaps_and_lengths(options.assembly)
ref_hits, assembly_hits = get_nucmer_hits(nucmer_coords)

# combine the gaps and the nucmer hits of each sequence
ref_hits_and_gaps = make_hits_union(ref_lengths.keys(), ref_gaps, ref_hits)
assembly_hits_and_gaps = make_hits_union(assembly_lengths.keys(), assembly_gaps, assembly_hits)

# total number of bases, and the total length of the combined hits and gaps
ref_bases = sum(ref_lengths.values())
assembly_bases = sum(assembly_lengths.values())
ref_bases_assembled = total_length_from_dict(ref_hits_and_gaps)
assembly_bases_in_ref = total_length_from_dict(assembly_hits_and_gaps)

ref_gaps_sum = total_length_from_dict(ref_gaps)