def map_and_parse_sam(ref_index, query_fa, tags, qry_or_ref, ops, get_unique=True):
    '''Map query_fa against ref_index with bowtie2 and record the hits in tags.

    ref_index   -- bowtie2 index prefix, passed to -x
    query_fa    -- fasta file of sequences to map, passed to -U
    tags        -- dict keyed by SAM record id; every record id in the SAM
                   output must already be a key (asserted). Each value needs
                   qry_hits / ref_hits attributes supporting .add()
    qry_or_ref  -- 'qry' or 'ref': which of the two hit collections to fill.
                   Any other value prints an error and exits with status 1,
                   but only once a record that qualifies as a hit is reached
    ops         -- options object; only ops.outprefix is read
    get_unique  -- if true, run bowtie2 with default reporting and discard
                   records that carry an XS tag whose value is >= 0. If
                   false, add ' -a --score-min L,0,0' to the command and do
                   not look at XS at all

    A record counts as a hit only if it is mapped and its AS tag value is 0.
    The temporary SAM file <outprefix>.maptags.sam is deleted at the end
    (not on the sys.exit path).
    '''
    samfile = ops.outprefix + '.maptags.sam'

    # Only the reporting flags differ between the two modes
    reporting_flags = '' if get_unique else ' -a --score-min L,0,0'
    utils.syscall(external_progs.bowtie2_align + reporting_flags + ' -f -x ' + ref_index + ' -U ' + query_fa + ' -S ' + samfile)

    for record in sam.file_reader(samfile):
        assert record.id in tags

        # Must be mapped with an alignment score (AS) of exactly zero
        if not (record.is_mapped() and record.tags['AS'][1] == 0):
            continue

        # In unique mode, a non-negative XS value disqualifies the record
        if get_unique and 'XS' in record.tags and not record.tags['XS'][1] < 0:
            continue

        if qry_or_ref == 'qry':
            hit_set = tags[record.id].qry_hits
        elif qry_or_ref == 'ref':
            hit_set = tags[record.id].ref_hits
        else:
            print('Error parsing SAM', file=sys.stderr)
            sys.exit(1)

        hit_set.add(Hit(record.rname, record.pos, record.query_strand()))

    os.unlink(samfile)
def map_and_parse_sam(ref_index, tags_fasta, tag_counts, log_fh):
    '''Map tags_fasta against ref_index with bowtie2 and classify each tag.

    ref_index   -- bowtie2 index prefix, passed to -x
    tags_fasta  -- fasta file of tags, passed to -U. Record ids must contain
                   at least one ':'; everything before the last ':' is used
                   as the contig name
    tag_counts  -- dict filled in place, contig name -> 1 or 2. A contig name
                   must not already be present (asserted)
    log_fh      -- not used by this function

    A contig gets 1 when its record is mapped, has an AS tag value of 0 and
    either has no XS tag or an XS value below 0. Every other record gets 2.
    The SAM file is written to <options.outprefix>.maptags.sam (options is a
    module-level name) and removed once parsed.
    '''
    samfile = options.outprefix + '.maptags.sam'
    #utils.syscall('smalt map -d -1 -y 1 -f samsoft -o ' + samfile + ' ' + ref_smalt_index + ' ' + tags_fasta)
    bowtie2_cmd = ' '.join([external_progs.bowtie2_align, '-f -x', ref_index, '-U', tags_fasta, '-S', samfile])
    utils.syscall(bowtie2_cmd)

    for record in sam.file_reader(samfile):
        # The part after the final ':' is not needed here
        contig_name, _ = record.id.rsplit(':', 1)
        assert contig_name not in tag_counts

        single_exact_hit = (
            record.is_mapped()
            and record.tags['AS'][1] == 0
            and ('XS' not in record.tags or record.tags['XS'][1] < 0)
        )
        tag_counts[contig_name] = 1 if single_exact_hit else 2

    os.unlink(samfile)
def test_system_call(self):
    '''Check that syscall runs good commands, raises on bad ones, and that syscall_get_stdout returns output lines'''
    fixture = 'system_call_test.txt'
    scratch = 'utils_unittest_syscall.tmp'

    # A working command: cat the fixture into a scratch file and compare
    utils.syscall('cat ' + fixture + ' > ' + scratch)
    copied_ok = filecmp.cmp(scratch, fixture)
    self.assertTrue(copied_ok)
    os.unlink(scratch)

    # A command that should not exist must raise utils.Error
    with self.assertRaises(utils.Error):
        utils.syscall('thisisveryunlikelytoebarealcommandandshouldthrowerror')

    # A working command that writes different content must not match the fixture
    utils.syscall('echo "this is not the right string" > ' + scratch)
    wrongly_matches = filecmp.cmp(scratch, fixture)
    self.assertFalse(wrongly_matches)
    os.unlink(scratch)

    # Captured stdout is expected as a list with one entry per line
    captured = utils.syscall_get_stdout('echo bingo')
    self.assertListEqual(["bingo"], captured)
for i in range(len(clusters)): reads_file = options.outfile + '.cluster.' + str(i + 1) f = utils.open_file_write(reads_file) for id in clusters[i]: seq = all_seqs[id] if strands[id] == '-': seq = copy.copy(all_seqs[id]) seq.revcomp() else: seq = all_seqs[id] print(seq, file=f) utils.close(f) utils.syscall('cap3 ' + reads_file) singlet_count = fastn.count_sequences(reads_file + '.cap.singlets') contig_count = fastn.count_sequences(reads_file + '.cap.contigs') if singlet_count == 0 and contig_count == 1: seq_reader = fastn.file_reader(reads_file + '.cap.contigs') for seq in seq_reader: seq.id = 'cluster.' + str(i + 1) + '.contig' assembled_seqs.append(copy.copy(seq)) for e in [ 'ace', 'contigs.links', 'contigs.qual', 'info', 'singlets', 'contigs' ]: os.unlink(reads_file + '.cap.' + e) os.unlink(reads_file) else:
print('Got', len(tags), 'tags', file=sys.stderr) # sort the tags into reference order for each chromosome for chr, l in tags_by_chr.items(): l.sort() for i in range(len(l)): l[i].ordered_index = i # map the tags to the scaffolds samfile = options.outprefix + '.map_tags.sam' bamfile = options.outprefix + '.map_tags.bam' sorted_bamfile = options.outprefix + '.map_tags.sorted.bam' external_progs.index_with_bowtie2(options.scaffolds_fa) #utils.syscall(external_progs.bowtie2_align + ' -f -a --score-min L,0,0 -x ' + options.scaffolds_fa + ' -U ' + tags_fa_file + ' -S ' + samfile) utils.syscall(external_progs.bowtie2_align + ' -f -x ' + options.scaffolds_fa + ' -U ' + tags_fa_file + ' -S ' + samfile) utils.syscall('samtools view -T ' + options.scaffolds_fa + ' -bS ' + samfile + ' > ' + bamfile) os.unlink(samfile) utils.syscall('samtools sort ' + bamfile + ' ' + sorted_bamfile[0:-4]) #os.unlink(bamfile) # Load the hits into memory previous_sam = None previous_tag = None sam_reader = sam.file_reader(sorted_bamfile) flag_counts = {k: 0 for k in [0, 1, 2, 4, 5, 8, 12, 16]} tags_from_bam = set() tag_distances = [] f_log = utils.open_file_write(options.outprefix + '.log') f_tags_and_sam = utils.open_file_write(options.outprefix + '.tags_and_sam.gz')
'c(', ','.join(str(x) for x in y_coords), '), ', 'xlab="Correct joins", ', 'ylab="Incorrect joins", ', 'xlim=c(0,', x_max, '), ', 'ylim=c(0,', y_max, '), ', 'col=', r_colour_vector, ', ', 'pch=', r_symbol_vector, ', ', 'bg=', r_colour_vector, ')', sep='', file=f) print(r_legend, file=f) print('dev.off()', file=f) utils.close(f) utils.syscall('R CMD BATCH ' + r_script)
'''
MIT License

Copyright (c) 2017 William Ivanski

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

import sys
from utils import syscall, syscall_bg

# Usage: <script> <name> <prefix>
# Creates the symlink cron/<prefix>_<name> pointing at ../run/<name>.
target_name = sys.argv[1]
link_prefix = sys.argv[2]

link_command = 'ln -s ../run/{0} cron/{1}_{0}'.format(target_name, link_prefix)
out = syscall(link_command)

# Echo whatever syscall returned, one entry per line
report = '\n'.join(out)
print(report)
utils.close(f) # make scaffolds and simulate reads from them for i in range(len(scaffolds)): coverage, contig_list = scaffolds[i] scaff_name = 'scaff.' + str(i + 1) scaff_fname = outprefix + '.' + scaff_name + '.fa' seq = fastn.Fasta(scaff_name, ('').join([contig_seqs[c].seq for c in contig_list])) f = utils.open_file_write(scaff_fname) print(seq, file=f) utils.close(f) reads_fname = scaff_fname + '.reads.fq' reads_fastq_files.append(reads_fname) cmd = 'fastn_to_perfect_reads.py ' + scaff_fname + ' ' + reads_fname + ' 500 30 ' + coverage + ' 76' utils.syscall(cmd) os.unlink(scaff_fname) # cat all the reads files together reads_fastq = outprefix + '.reads.fq' fout = utils.open_file_write(reads_fastq) for fname in reads_fastq_files: with open(fname) as infile: for line in infile: fout.write(line) os.unlink(fname) utils.close(fout) # make deinterleaved fastq files
def run_r_script(script):
    '''Run an R script with "R CMD BATCH", then delete the script and its output file.

    Does nothing when options.noplots is set (options is a module-level name).
    The file removed alongside the script is script + 'out'.
    '''
    # NOTE(review): the collapsed source does not show whether the final
    # unlink of the script was inside the noplots check; this assumes it was,
    # so the script is kept when plotting is disabled -- confirm.
    if options.noplots:
        return

    utils.syscall('R CMD BATCH ' + script)
    for leftover in (script + 'out', script):
        os.unlink(leftover)
#print(insert_pdf(outprefix + '.roc_with_chulls.pdf'), file=f_tex) print('', file=f_tex) print(r'''\noindent''', file=f_tex) print(insert_pdf(outprefix + '.skipped_tags_barchart.pdf'), file=f_tex) print(insert_pdf(outprefix + '.lost_tags_barchart.pdf'), file=f_tex) print('', file=f_tex) print(r'''\noindent''', file=f_tex) print(insert_pdf(outprefix + '.cpu.pdf'), file=f_tex) print(insert_pdf(outprefix + '.mem.pdf'), file=f_tex) #print(insert_pdf(outprefix + '.percent_good_joins_barchart.pdf'), file=f_tex) print(r'''\end{document}''', file=f_tex) utils.close(f_tex) if not options.noplots: utils.syscall('pdflatex ' + texfile) utils.syscall('pdflatex ' + texfile) #def get_data_by_scaffolder(data_type): # d = {s: [] for s in scaffolders} # # for test_type in test_data_types: # for scaff in scaffolders: # d[scaff].append(results[test_type].results[scaff][data_type]) # # return d # #percent_good_by_scaff = get_data_by_scaffolder('% correct joins')
default=None) parser.add_argument('fasta_in', help='Name of input fasta file', metavar='in.fasta') parser.add_argument('outprefix', help='Prefix of output files') options = parser.parse_args() untagged_seqs = {} fastn.file_to_dict(options.fasta_in, untagged_seqs) second_seqs = {} unique_tags = [] seqs_index = options.outprefix + '.seqs.bowtie2.index' #utils.syscall('smalt index -k 20 -s 10 ' + seqs_smalt_index + ' ' + options.fasta_in) utils.syscall('bowtie2-build ' + options.fasta_in + ' ' + seqs_index) if options.second_fasta: #second_seqs_smalt_index = options.outprefix + '.second_seqs_smalt_index' second_seqs_index = options.outprefix + '.second_seqs_bowtie2_index' #utils.syscall('smalt index -k 20 -s 10 ' + second_seqs_smalt_index + ' ' + options.second_fasta) utils.syscall('bowtie2-build ' + options.second_fasta + ' ' + second_seqs_index) else: second_seqs_index = None uniquely_tagged = {} f_log = utils.open_file_write(options.outprefix + '.log') for tag_length in range(options.min_tag_length, options.max_tag_length + 1, options.tag_step):
ref_seqs = {} fastn.file_to_dict(options.ref_fa, ref_seqs) mummer_dir = os.path.join(os.path.expanduser('~mh12'), 'bin', 'MUMmer3.23') nucmer_exe = os.path.join(mummer_dir, 'nucmer') delta_filter = os.path.join(mummer_dir, 'delta-filter') show_coords = os.path.join(mummer_dir, 'show-coords') nucmer_out_prefix = options.outprefix + '.nucmer' nucmer_out_delta = nucmer_out_prefix + '.delta' nucmer_out_filter = nucmer_out_prefix + '.delta-filter' nucmer_out_coords = nucmer_out_filter + '.coords' # run nucmer of contigs vs ref utils.syscall(' '.join([ nucmer_exe, options.nucmer_options, '-p', nucmer_out_prefix, options.ref_fa, options.contigs_fa ])) utils.syscall(' '.join([ delta_filter, '-i 98 -l 180 -q', nucmer_out_delta, '>', nucmer_out_filter ])) utils.syscall(' '.join( [show_coords, '-dTlro', nucmer_out_filter, '>', nucmer_out_coords])) # load hits into hash. key=ref_name, value=another hash with key=qry_name, value=list of hit positions in that ref seq nucmer_hits = {} contigs_to_print = {} nucmer_reader = nucmer.file_reader(nucmer_out_coords) for hit in nucmer_reader: if hit.ref_name not in nucmer_hits:
else: nodes[sam1.id].add(sam2.id) nodes[sam2.id].add(sam1.id) print(nodes) cmd = 'echo "digraph G {' first = True for node, l in sorted(nodes.items()): if first: first = False else: cmd += ';' if len(l): cmd += ';'.join([node + '->' + x for x in l]) else: cmd += node cmd += '}" | dot -Tpdf > ' + options.outprefix + '.pdf' make_graph = options.outprefix + '.make_graph.sh' f = utils.open_file_write(make_graph) print(cmd, file=f) utils.close(f) utils.syscall('bash ' + make_graph)
import sam import genome_intervals import external_progs parser = argparse.ArgumentParser( description = 'Given a fasta/q file of reads, and a second fasta of vector sequences, trims the vectors off the reads. Made specifically for assembled capillary read pairs - uses BWA for mapping. Untested on short reads or unassembled read pairs', usage = '%(prog)s [options] <reads fasta/q> <vectors fasta> <outprefix>') parser.add_argument('--join_distance', type=int, help='Join hits at most this many bases apart [%(default)s]', metavar='INT', default=100) parser.add_argument('reads_in', help='Name of input fasta/q file of reads', metavar='reads fasta/q') parser.add_argument('vectors_in', help='Name of input fasta file of vectors', metavar='vectors fasta') parser.add_argument('outprefix', help='Prefix of names of ouput files') options = parser.parse_args() bwa_index = options.outprefix + '.bwa_index' bwa_sam = options.outprefix + '.map_reads.sam' utils.syscall(' '.join([external_progs.bwa, 'index -p', bwa_index, options.vectors_in])) utils.syscall(' '.join([external_progs.bwa, 'bwasw -f', bwa_sam, bwa_index, options.reads_in])) read_hit_coords = {} # id -> [(start, end), (start, end), ...] sam_reader = sam.file_reader(bwa_sam) for sam_record in sam_reader: if not sam_record.is_mapped(): continue if not sam_record.is_forward_strand(): sam_record.cigar.reverse() hit_start = 1 hit_end = len(sam_record.seq)
'''
MIT License

Copyright (c) 2017 William Ivanski

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

import sys
from utils import syscall, syscall_bg

# Usage: <script> <name>
# Removes every entry in cron/ whose name ends in _<name>.
target_name = sys.argv[1]

remove_command = 'rm -f cron/*_{0}'.format(target_name)
out = syscall(remove_command)

# Echo whatever syscall returned, one entry per line
report = '\n'.join(out)
print(report)
def index_with_bowtie2(file):
    '''Build a bowtie2 index for file, unless is_bowtie2_indexed(file) is already true.

    The file name is passed to bowtie2_build twice, so it serves as both the
    input and the index prefix.
    '''
    if is_bowtie2_indexed(file):
        return

    build_command = ' '.join([bowtie2_build, file, file])
    utils.syscall(build_command)
for id in d: for interval in d[id]: print(id, interval.start+1, interval.end+1, sep='\t', file=f) utils.close(f) # run nucmer nucmer_outprefix = options.outprefix + '.nucmer' nucmer_script = nucmer_outprefix + '.sh' nucmer_coords = nucmer_outprefix + '.coords' f = utils.open_file_write(nucmer_script) print(external_progs.nucmer, options.nucmer_ops, '-p', nucmer_outprefix, options.reference, options.assembly, file=f) print(external_progs.delta_filter, options.df_ops, nucmer_outprefix + '.delta >', nucmer_outprefix + '.filter', file=f) print(external_progs.show_coords, '-dTlro', nucmer_outprefix + '.filter >', nucmer_coords, file=f) utils.close(f) utils.syscall('bash ' + nucmer_script) # gather the results ref_lengths, ref_gaps = get_gaps_and_lengths(options.reference) assembly_lengths, assembly_gaps = get_gaps_and_lengths(options.assembly) ref_hits, assembly_hits = get_nucmer_hits(nucmer_coords) ref_hits_and_gaps = make_hits_union(ref_lengths.keys(), ref_gaps, ref_hits) assembly_hits_and_gaps = make_hits_union(assembly_lengths.keys(), assembly_gaps, assembly_hits) ref_bases = sum(ref_lengths.values()) assembly_bases = sum(assembly_lengths.values()) ref_bases_assembled = total_length_from_dict(ref_hits_and_gaps) assembly_bases_in_ref = total_length_from_dict(assembly_hits_and_gaps) ref_gaps_sum = total_length_from_dict(ref_gaps)