def main(args):
    """Write each reference sequence name and its length (tab separated),
    sorted by name, to args.output."""
    bf = BAMFile(args.input)
    chrlens = bf.get_header().get_sequence_lengths()
    # with-block guarantees the handle is closed even if a write raises
    with open(args.output, 'w') as of_chrlens:
        for qname in sorted(chrlens.keys()):
            of_chrlens.write(qname + "\t" + str(chrlens[qname]) + "\n")
def main():
    """Sort per-sequence worker output with external `sort`, then merge and
    write the final coordinate-sorted result.

    Side effect: sets the module-global `ps` (the first sort process) so the
    `do_output` callback can stream worker results into its stdin.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="BAM file input")
    parser.add_argument('--threads', type=int, default=cpu_count(),
                        help="Thread count")
    parser.add_argument('--tempdir',
                        help="location of temporary directory to use")
    parser.add_argument('-o', '--output', help="Output file name")
    args = parser.parse_args()
    if args.tempdir:
        args.tempdir = args.tempdir.rstrip('/')
    # Dump sequence names/lengths to a temp file the workers re-read by name.
    bf = BAMFile(args.input)
    seqs = bf.get_header().get_sequence_lengths()
    f = tempfile.NamedTemporaryFile(delete=False)
    for seq in seqs:
        f.write(seq + "\t" + str(seqs[seq]) + "\n")
    f.close()
    bf.close()
    # First sort pass: workers emit into `ps` (via do_output) -> fout.
    fout = tempfile.NamedTemporaryFile(delete=False)
    cmd = 'sort -k 1,1 -k2,2n -k3,3n -S4G --parallel=' + str(args.threads)
    if args.tempdir:
        cmd += ' -T ' + args.tempdir
    global ps
    ps = Popen(cmd.split(), stdin=PIPE, stdout=fout)
    if args.threads > 1:
        poo = Pool(processes=args.threads)
    for seq in seqs:
        if args.threads > 1:
            poo.apply_async(do_seq, args=(seq, args, f.name),
                            callback=do_output)
        else:
            res = do_seq(seq, args, f.name)
            do_output(res)
    if args.threads > 1:
        poo.close()
        poo.join()
    ps.communicate()
    fout.close()
    of = sys.stdout
    if args.output:
        # endswith is clearer and safe for names shorter than 3 characters
        if args.output.endswith('.gz'):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    # Second sort pass merges the worker output into final order.
    cmd = 'sort -k 1,1 -k2,2n -k3,3n -S4G --parallel=' + str(args.threads)
    inf = open(fout.name)
    p = Popen(cmd.split(), stdout=PIPE, stdin=inf)
    for line in p.stdout:
        of.write(line)
    p.communicate()
    inf.close()
    # Bug fix: never close sys.stdout when no --output was given.
    if of is not sys.stdout:
        of.close()
    os.unlink(f.name)
    os.unlink(fout.name)
def do_chunk(coords, ecount, args):
    """Read `ecount` entries starting at the given BGZF block/inner offsets.

    Returns one [target range string, aligned base count] pair per entry,
    or ['', 0] for entries that are not aligned.
    """
    bf = BAMFile(args.input, blockStart=coords[0], innerStart=coords[1])
    out = []
    for _ in range(ecount):
        entry = bf.read_entry()
        if entry.is_aligned():
            trng = entry.get_target_range()
            out.append([trng.get_range_string(),
                        entry.get_aligned_bases_count()])
        else:
            out.append(['', 0])
    return out
def main():
    """Sort per-sequence worker output with external `sort`, then merge and
    write the final coordinate-sorted result.

    Side effect: sets the module-global `ps` (the first sort process) so the
    `do_output` callback can stream worker results into its stdin.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="BAM file input")
    parser.add_argument('--threads', type=int, default=cpu_count(),
                        help="Thread count")
    parser.add_argument('--tempdir',
                        help="location of temporary directory to use")
    parser.add_argument('-o', '--output', help="Output file name")
    args = parser.parse_args()
    if args.tempdir:
        args.tempdir = args.tempdir.rstrip('/')
    # Dump sequence names/lengths to a temp file the workers re-read by name.
    bf = BAMFile(args.input)
    seqs = bf.get_header().get_sequence_lengths()
    f = tempfile.NamedTemporaryFile(delete=False)
    for seq in seqs:
        f.write(seq + "\t" + str(seqs[seq]) + "\n")
    f.close()
    bf.close()
    # First sort pass: workers emit into `ps` (via do_output) -> fout.
    fout = tempfile.NamedTemporaryFile(delete=False)
    cmd = 'sort -k 1,1 -k2,2n -k3,3n -S4G --parallel=' + str(args.threads)
    if args.tempdir:
        cmd += ' -T ' + args.tempdir
    global ps
    ps = Popen(cmd.split(), stdin=PIPE, stdout=fout)
    if args.threads > 1:
        poo = Pool(processes=args.threads)
    for seq in seqs:
        if args.threads > 1:
            poo.apply_async(do_seq, args=(seq, args, f.name),
                            callback=do_output)
        else:
            res = do_seq(seq, args, f.name)
            do_output(res)
    if args.threads > 1:
        poo.close()
        poo.join()
    ps.communicate()
    fout.close()
    of = sys.stdout
    if args.output:
        # endswith is clearer and safe for names shorter than 3 characters
        if args.output.endswith('.gz'):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    # Second sort pass merges the worker output into final order.
    cmd = 'sort -k 1,1 -k2,2n -k3,3n -S4G --parallel=' + str(args.threads)
    inf = open(fout.name)
    p = Popen(cmd.split(), stdout=PIPE, stdin=inf)
    for line in p.stdout:
        of.write(line)
    p.communicate()
    inf.close()
    # Bug fix: never close sys.stdout when no --output was given.
    if of is not sys.stdout:
        of.close()
    os.unlink(f.name)
    os.unlink(fout.name)
def main():
    """Convert aligned SAM (stdin) or BAM entries to genePred (GPD) lines."""
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="BAM file or Use - for STDIN for SAM")
    parser.add_argument('--minimum_intron', type=int, default=68,
                        help="smallest intron")
    parser.add_argument('-o', '--output', help="Output file, gzip is okay")
    args = parser.parse_args()
    of = sys.stdout
    if args.output:
        # endswith is clearer and safe for names shorter than 3 characters
        if args.output.endswith('.gz'):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    if args.input == '-':
        sh = SamStream(sys.stdin)
    else:
        sh = BAMFile(args.input)
    for e in sh:
        if not e.is_aligned():
            continue  # unaligned reads have no transcript model
        gpd_line = e.get_target_transcript(
            min_intron=args.minimum_intron).get_gpd_line()
        of.write(gpd_line + "\n")
    sh.close()
    # Bug fix: only close a handle we opened, never sys.stdout.
    if of is not sys.stdout:
        of.close()
def main():
    """Split a BAM by ONT cell name parsed from qname, writing one
    samtools-sorted BAM per cell into the output directory."""
    parser = argparse.ArgumentParser(
        description=
        'Based on Yunhaos ONT naming convention. i.e. BOWDEN04_20160603_FNFAD11879_MN16254_sequencing_run_R9_H1cDNA_SIRV_79593_ch49_read2100_strand_pass_2D or /^(\S+)_\d+_[^_]+_[^_]+[^_]+_[^_]+_[^_]+$/',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Input bam file")
    parser.add_argument('-o', '--output', required=True,
                        help="output directory")
    parser.add_argument(
        '--suffix',
        help="string to add before .bam: cellname.XXXXXX.sorted.bam")
    args = parser.parse_args()
    args.output = args.output.rstrip('/')
    # raw string so the regex escapes are not subject to string-escape rules
    nameprog = re.compile(r'^(\S+)_\d+_[^_]+_[^_]+_[^_]+_[^_]+_[^_]+$')
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    bf = BAMFile(args.input)
    sorted_header_text = sort_header(bf.header_text)
    fhs = {}  # cell name -> [open gzip handle, temp .gz path]
    z = 0
    for e in bf:
        z += 1
        if z % 1000 == 0:
            sys.stderr.write(
                str(z) + " reads " + str(len(fhs.keys())) + " cells \r")
        m = nameprog.match(e.value('qname'))
        mol = '_nonont'  # bucket for reads not matching the convention
        if m:
            mol = m.group(1)
        ln = e.get_line()
        if mol not in fhs:
            fname = args.output + '/' + mol
            if args.suffix:
                fname += '.' + args.suffix
            fname += '.gz'
            of = gzip.open(fname, 'w')
            fhs[mol] = [of, fname]
            fhs[mol][0].write(sorted_header_text)
        fhs[mol][0].write(ln + "\n")
    sys.stderr.write("\n")
    z = 0
    for mol in fhs:
        z += 1
        fhs[mol][0].close()
        # path ends in '.gz'; [:-2] keeps the dot -> '<base>.sorted'
        ofname = fhs[mol][1][:-2] + 'sorted'
        inf = gzip.open(fhs[mol][1])
        cmd1 = 'samtools view -Sb -'
        cmd2 = 'samtools sort - ' + ofname
        p2 = Popen(cmd2.split(), stdin=PIPE)
        p1 = Popen(cmd1.split(), stdin=inf, stdout=p2.stdin)
        p2.communicate()
        p1.communicate()
        inf.close()
        # Bug fix: removed a stray of.close() here that repeatedly re-closed
        # the last handle opened in the read loop; each cell's handle is
        # already closed at the top of this loop.
        os.remove(fhs[mol][1])
        sys.stderr.write(
            str(z) + '/' + str(len(fhs.keys())) + " finished \r")
    sys.stderr.write("\n")
def main():
    """Split a BAM by PacBio SMRT-cell name parsed from qname, writing one
    samtools-sorted BAM per cell into the output directory."""
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Input bam file")
    parser.add_argument('-o', '--output', required=True,
                        help="output directory")
    parser.add_argument(
        '--suffix',
        help="string to add before .bam: smrtcellname.XXXXXX.sorted.bam")
    args = parser.parse_args()
    args.output = args.output.rstrip('/')
    # raw string so the regex escapes are not subject to string-escape rules
    nameprog = re.compile(r'^([^\/]+)\/\d+/')
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    bf = BAMFile(args.input)
    sorted_header_text = sort_header(bf.header_text)
    fhs = {}  # cell name -> [open gzip handle, temp .gz path]
    z = 0
    for e in bf:
        z += 1
        if z % 1000 == 0:
            sys.stderr.write(
                str(z) + " reads " + str(len(fhs.keys())) + " cells \r")
        m = nameprog.match(e.value('qname'))
        mol = '_nonpacbio'  # bucket for reads not matching the convention
        if m:
            mol = m.group(1)
        ln = e.get_line()
        if mol not in fhs:
            fname = args.output + '/' + mol
            if args.suffix:
                fname += '.' + args.suffix
            fname += '.gz'
            of = gzip.open(fname, 'w')
            fhs[mol] = [of, fname]
            fhs[mol][0].write(sorted_header_text)
        fhs[mol][0].write(ln + "\n")
    sys.stderr.write("\n")
    z = 0
    for mol in fhs:
        z += 1
        fhs[mol][0].close()
        # path ends in '.gz'; [:-2] keeps the dot -> '<base>.sorted'
        ofname = fhs[mol][1][:-2] + 'sorted'
        inf = gzip.open(fhs[mol][1])
        cmd1 = 'samtools view -Sb -'
        cmd2 = 'samtools sort - ' + ofname
        p2 = Popen(cmd2.split(), stdin=PIPE)
        p1 = Popen(cmd1.split(), stdin=inf, stdout=p2.stdin)
        p2.communicate()
        p1.communicate()
        inf.close()
        # Bug fix: removed a stray of.close() here that repeatedly re-closed
        # the last handle opened in the read loop; each cell's handle is
        # already closed at the top of this loop.
        os.remove(fhs[mol][1])
        sys.stderr.write(
            str(z) + '/' + str(len(fhs.keys())) + " finished \r")
    sys.stderr.write("\n")
def do_chunk(ilines, infile, args):
    """Process one chunk of index lines against a BAM region.

    ilines: raw tab-delimited index lines; fields 2-3 of the first line give
    the BGZF block/inner offsets, field 5 of each line holds the SAM flag.
    Returns one "qname<TAB>base64(zlib(pickle(value)))" string per entry.
    """
    ilines = [x.rstrip().split("\t") for x in ilines]
    coord = [int(x) for x in ilines[0][2:4]]
    bf = BAMFile(infile, blockStart=coord[0], innerStart=coord[1])
    results = []
    # One BAM entry is read per index line, in order.
    for fields in ilines:
        flag = int(fields[5])
        e = bf.read_entry()
        if e.is_aligned():
            tx = e.get_target_transcript(args.minimum_intron_size)
            value = {
                'qrng': e.get_actual_original_query_range().get_range_string(),
                'tx': tx.get_gpd_line(),
                'flag': flag,
                'qlen': e.get_original_query_length(),
                'aligned_bases': e.get_aligned_bases_count()
            }
        else:
            value = {
                'qrng': '',
                'tx': '',
                'flag': flag,
                'qlen': e.get_original_query_length(),
                'aligned_bases': 0
            }
        # single shared serialize/append removes the duplicated branch code
        results.append(e.value('qname') + "\t" +
                       base64.b64encode(zlib.compress(pickle.dumps(value))))
    return results
def main(): parser = argparse.ArgumentParser( description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('input', help="Use - for STDIN or specify a BAM file") parser.add_argument('-r', '--reference', help="Reference fasta", required=True) args = parser.parse_args() ref = None if args.reference: ref = FastaData(open(args.reference, 'rb').read()) if args.input == '-': args.input = SamStream(sys.stdin, reference=ref) else: args.input = BAMFile(args.input, reference=ref) for e in args.input: if e.is_aligned(): print e.get_PSL()
def main():
    """Emit merged exon ranges per locus as BED-style output, optionally in
    parallel; results are written by the do_output callback via global `of`."""
    # do our inputs
    args = do_inputs()
    bf = BAMFile(args.input)
    if not args.all_alignments:
        # Bug fix: the original if/else had byte-identical branches; a single
        # call covers both cases (args.index_path set, or None which matches
        # the reader's default behavior).
        bf.read_index(args.index_path)
    ls = LocusStream(bf)
    if args.output:
        args.output = open(args.output, 'w')
    else:
        args.output = sys.stdout
    global of
    of = args.output
    z = 0
    if args.threads > 1:
        p = Pool(processes=args.threads)
    for entries in ls:
        bedarray = []
        for e in entries.get_payload():
            # primary-alignment filter only applies when not using all alignments
            if not args.all_alignments and not e.indexed_as_primary_alignment():
                continue
            if not e.is_aligned():
                continue
            tx = e.get_target_transcript(min_intron=args.minimum_intron_size)
            for exon in tx.exons:
                bedarray.append(exon.rng.copy())
        if len(bedarray) == 0:
            continue
        if args.threads > 1:
            p.apply_async(get_output, args=(bedarray, z,), callback=do_output)
        else:
            r = get_output(bedarray, z)
            do_output(r)
        z += 1
    if args.threads > 1:
        p.close()
        p.join()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
    args.output.close()
def main(args):
    """Build a per-context error profile from alignments and plot it.

    Samples alignments either randomly (requires an index) or sequentially,
    stopping at args.max_alignments or once the minimum context coverage
    reaches args.stopping_point; then renders the report with Rscript.
    """
    # make our error profile report
    sys.stderr.write("Reading reference fasta\n")
    ref = FastaData(open(args.reference).read())
    sys.stderr.write("Reading alignments\n")
    epf = ErrorProfileFactory()
    if args.random:
        bf = None
        if args.input_index:
            bf = BAMFile(args.input, reference=ref,
                         index_file=args.input_index)
            bf.read_index(index_file=args.input_index)
        else:
            bf = BAMFile(args.input, reference=ref)
            bf.read_index()
        if not bf.has_index():
            sys.stderr.write("Random access requires an index be set\n")
            # Bug fix: bail out like the sibling tools do, instead of
            # falling through and crashing on bf.index below.
            sys.exit()
        z = 0
        strand = 'target'
        if args.query:
            strand = 'query'
        con = 0
        while True:
            rname = random.choice(bf.index.get_names())
            coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
            if not coord:
                continue  # this reference name has no usable alignment
            e = bf.fetch_by_coord(coord)
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    con = epf.get_min_context_count(strand)
                    sys.stderr.write(
                        str(z) + " alignments, " + str(con) +
                        " min context coverage\r")
                if args.max_alignments <= z:
                    break
                if args.stopping_point <= con:
                    break
    else:
        bf = BAMFile(args.input, reference=ref)
        z = 0
        strand = 'target'
        if args.query:
            strand = 'query'
        con = 0
        for e in bf:
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    con = epf.get_min_context_count(strand)
                    sys.stderr.write(
                        str(z) + " alignments, " + str(con) +
                        " min context coverage\r")
                if args.max_alignments <= z:
                    break
                if args.stopping_point <= con:
                    break
    sys.stderr.write("\n")
    sys.stderr.write('working with:' + "\n")
    sys.stderr.write(
        str(z) + " alignments, " + str(con) + " min context coverage" + "\n")
    epf.write_context_error_report(args.tempdir + '/err.txt', strand)
    # Render one plot per requested output file via Rscript.
    for ofile in args.output:
        cmd = args.rscript_path + ' ' + os.path.dirname(
            os.path.realpath(__file__)
        ) + '/plot_base_error_context.r ' + args.tempdir + '/err.txt ' + ofile + ' '
        if args.scale:
            cmd += ' '.join([str(x) for x in args.scale])
        sys.stderr.write(cmd + "\n")
        call(cmd.split())
    sys.stderr.write("finished\n")
    if args.output_raw:
        # Bug fix: the raw-copy handle was previously never closed.
        with open(args.output_raw, 'w') as of:
            with open(args.tempdir + "/err.txt") as inf:
                for line in inf:
                    of.write(line)
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Sample alignments to collect quality-score run-length counts (in 100
    read-position bins) plus an error profile, then emit everything as one
    base64(zlib(json)) blob to args.output."""
    sys.stderr.write("Read reference fasta\n")
    fasta = FastaData(open(args.reference_fasta).read())
    sys.stderr.write("Read alignment file\n")
    bf = BAMFile(args.bam_input,reference=fasta)
    bf.read_index()
    # 100 positional bins; do_qualities appends observations into them
    total_qualities = []
    for j in range(0,100):
        total_qualities.append([])
    ef = ErrorProfileFactory()
    mincontext = 0
    alignments = 0
    for i in range(0,args.max_alignments):
        # sample: pick a random reference name, take its longest alignment
        rname = random.choice(bf.index.get_names())
        coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
        if not coord: continue
        bam = bf.fetch_by_coord(coord)
        qual = bam.value('qual')
        do_qualities(total_qualities,qual)
        if not bam.is_aligned(): continue
        alignments += 1
        ef.add_alignment(bam)
        if i%100 == 0:
            # periodically re-check whether both stopping thresholds are met
            mincontext = ef.get_min_context_count('target')
            if mincontext:
                if mincontext >= args.min_context and alignments >= args.min_alignments:
                    break
        sys.stderr.write(str(i+1)+" lines "+str(alignments)+"/"+str(args.min_alignments)+" alignments "+str(mincontext)+"/"+str(args.min_context)+" mincontext \r")
    sys.stderr.write("\n")
    sys.stderr.write(str(mincontext)+" minimum contexts observed\n")
    target_context = ef.get_target_context_error_report()
    general_error_stats = ef.get_alignment_errors().get_stats()
    general_error_report = ef.get_alignment_errors().get_report()
    # convert the tab-delimited report text into {'head':..., 'data':...}
    general_all = [x.split("\t") for x in general_error_report.rstrip().split("\n")]
    general_head = general_all[0]
    general_data = [[y[0],y[1],int(y[2]),int(y[3])] for y in general_all[1:]]
    general_error_report = {'head':general_head,'data':general_data}
    quality_counts = []
    for vals in total_qualities:
        garr = []
        grp = {}
        for v in vals:
            # v is assumed to be (ordinal_quality, run_length) — grouped here
            if v[0] not in grp: grp[v[0]] = {}  # keyed by ordinal quality
            if v[1] not in grp[v[0]]: grp[v[0]][v[1]] = 0  # then run length
            grp[v[0]][v[1]]+=1
        for ordval in sorted(grp.keys()):
            for runlen in sorted(grp[ordval].keys()):
                garr.append([ordval,runlen,grp[ordval][runlen]])
        quality_counts.append(garr)
    # Quality counts now has 100 bins, each an ordered array of
    # [ordinal_quality, run_length, observation_count]
    # Can prepare an output
    output = {}
    output['quality_counts'] = quality_counts
    output['context_error'] = target_context
    output['alignment_error'] = general_error_report
    output['error_stats'] = general_error_stats
    of = None
    if args.output[-3:]=='.gz':
        of = gzip.open(args.output,'w')
    else:
        of = open(args.output,'w')
    of.write(base64.b64encode(zlib.compress(json.dumps(output)))+"\n")
    of.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Accumulate alignment errors (randomly sampled or sequential) until
    args.max_length bases are observed, write the report, and plot it."""
    sys.stderr.write("Reading our reference Fasta\n")
    ref = FastaData(open(args.reference, 'rb').read())
    sys.stderr.write("Finished reading our reference Fasta\n")
    bf = None
    if args.input_index:
        bf = BAMFile(args.input, reference=ref, index_file=args.input_index)
        bf.read_index(index_file=args.input_index)
    else:
        bf = BAMFile(args.input, reference=ref)
        bf.read_index()
    epf = ErrorProfileFactory()
    if args.random:
        if not bf.has_index():
            sys.stderr.write(
                "Random access requires our format of index bgi to be set\n")
            sys.exit()
        z = 0
        while True:
            # sample: pick a random reference name, take its longest alignment
            rname = random.choice(bf.index.get_names())
            coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
            if not coord:
                continue  # this reference name has no usable alignment
            e = bf.fetch_by_coord(coord)
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    # check accumulated bases every 100 alignments
                    con = epf.get_alignment_errors().alignment_length
                    if args.max_length <= con:
                        break
                    sys.stderr.write(
                        str(con) + "/" + str(args.max_length) + " bases from " +
                        str(z) + " alignments\r")
        sys.stderr.write("\n")
    else:
        z = 0
        for e in bf:
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    # check accumulated bases every 100 alignments
                    con = epf.get_alignment_errors().alignment_length
                    if args.max_length <= con:
                        break
                    sys.stderr.write(
                        str(con) + "/" + str(args.max_length) + " bases from " +
                        str(z) + " alignments\r")
        sys.stderr.write("\n")
    of = open(args.tempdir + '/report.txt', 'w')
    of.write(epf.get_alignment_errors().get_report())
    of.close()
    # Render one plot per requested output file via Rscript.
    for ofile in args.output:
        cmd = args.rscript_path + ' ' + os.path.dirname(
            os.path.realpath(__file__)
        ) + '/plot_alignment_errors.r ' + args.tempdir + '/report.txt ' + ofile + ' '
        if args.scale:
            cmd += ' '.join([str(x) for x in args.scale])
        sys.stderr.write(cmd + "\n")
        call(cmd.split())
    if args.output_raw:
        # copy the raw report text to the requested location
        of = open(args.output_raw, 'w')
        with open(args.tempdir + "/report.txt") as inf:
            for line in inf:
                of.write(line)
        of.close()
    if args.output_stats:
        of = open(args.output_stats, 'w')
        of.write(epf.get_alignment_errors().get_stats())
        of.close()
    sys.stderr.write("finished\n")
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)