def main():
    """Print the alignment count for each read of a HISAT SAM/BAM stream.

    The positional argument is a BAM file ordered by query name (decoded
    through ``samtools view -h``), or '-' to read SAM text from stdin.
    For every group returned by ``MultiEntrySamReader.read_entries()`` one
    line ``qname<TAB>count`` is written to stdout:

    * count is 0 when the first entry of the group has a CIGAR of '*';
    * otherwise count is the larger of the number of entries in the group
      and the value of the ``NH:i:`` tag found on the first entry.

    Exits with a non-zero status if an entry with a real CIGAR carries no
    ``NH:i:`` tag.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input', help="Bam file in order of query name - for stdin")
    args = parser.parse_args()

    inf = sys.stdin
    if args.input != '-':
        # Pass argv as a list: splitting a concatenated command string would
        # break any input path that contains whitespace.
        p = Popen(['samtools', 'view', '-h', args.input], stdout=PIPE)
        inf = p.stdout

    # Compiled once, outside the per-read loop; raw string keeps \d literal.
    nh_tag = re.compile(r'NH:i:(\d+)')

    mesr = MultiEntrySamReader(inf)
    while True:
        entries = mesr.read_entries()
        if not entries:  # covers both None and an empty group
            break
        sam = entries[0]
        if sam.value('cigar') == '*':
            # Single parenthesised argument: valid as a py2 print statement
            # and as a py3 print call.
            print(sam.value('qname') + "\t0")
            continue
        m = nh_tag.search(sam.entry['remainder'])
        if not m:
            sys.stderr.write("ERROR not a hisat entry\n")
            # Signal failure to the caller; a bare sys.exit() reports success.
            sys.exit(1)
        cnt = max(len(entries), int(m.group(1)))
        print(sam.value('qname') + "\t" + str(cnt))
def main():
    """Join mate pairs of a query-name-ordered SAM/BAM stream.

    Input selection:

    * ``--bam``, or a filename without ``--sam``: the input is piped
      through ``samtools view - -h``;
    * ``--sam`` with a filename: the file is read directly as SAM text;
    * '-' without ``--bam``: SAM text is read from stdin.

    Header lines are echoed to stdout and fed to the SAM-to-PSL converter.
    Read groups from ``MultiEntrySamReader.read_entries()`` are collected
    into batches of 10000 and handed to ``do_buffer``; its result goes to
    ``do_callback``.  With ``--threads`` > 1 the batches are submitted to a
    ``multiprocessing.Pool`` instead of being processed inline.
    """
    parser = argparse.ArgumentParser(
        description=
        "Take a sam file and join together mate pairs into single alignments.  Alignments must be ordered by query name."
    )
    parser.add_argument(
        'input', help="FILENAME input .sam or .bam or '-' for STDIN sam")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--sam', action='store_true')
    group.add_argument('--bam', action='store_true')
    parser.add_argument('--mates_only', action='store_true',
                        help="Only output combined mates")
    parser.add_argument('--threads', type=int, default=1,
                        help="Number of threads to use, default is 1")
    args = parser.parse_args()

    from_stdin = args.input == '-'
    if args.bam or not (args.sam or from_stdin):
        # BAM (or an undeclared file): let samtools decode it.  With
        # '--bam -' the BAM bytes come from our own stdin rather than from a
        # file literally named '-'.
        fh = sys.stdin if from_stdin else open(args.input, 'rb')
        p = Popen('samtools view - -h'.split(), stdin=fh, stdout=PIPE)
        inf = p.stdout
    elif from_stdin:
        inf = sys.stdin
    else:
        # '--sam FILE': read the named file instead of silently falling back
        # to stdin.
        inf = open(args.input)

    buffer_size = 10000
    msr = MultiEntrySamReader(inf)
    spc = SAMtoPSLconversionFactory()
    psc = PSLtoSAMconversionFactory()

    # set the headers for the spc
    for h in msr.header:
        print(h.rstrip())
        spc.read_header_line(h)

    pool = Pool(processes=args.threads) if args.threads > 1 else None

    def dispatch(chunk):
        # Hand one batch of read groups to a worker, or process it inline.
        # NOTE(review): apply_async must pickle its arguments; confirm that
        # msr (which wraps the input stream) survives that, because task
        # errors are not raised here unless the AsyncResult is fetched.
        if pool is not None:
            pool.apply_async(do_buffer,
                             args=(chunk, msr, spc, psc, args),
                             callback=do_callback)
        else:
            do_callback(do_buffer(chunk, msr, spc, psc, args))

    buffer = []
    while True:
        entries = msr.read_entries()
        if not entries:
            break
        buffer.append(entries)
        if len(buffer) >= buffer_size:
            dispatch(buffer)
            buffer = []  # fresh list; the submitted batch is left untouched
    if buffer:
        dispatch(buffer)

    if pool is not None:
        pool.close()
        pool.join()
def main():
    """Write the best alignment per read (or per mate) of a SAM/BAM stream.

    Input selection:

    * ``--bam``, or a filename without ``--sam``: the input is piped
      through ``samtools view - -h``;
    * ``--sam`` with a filename: the file is read directly as SAM text;
    * '-' without ``--bam``: SAM text is read from stdin.

    Header lines are echoed to stdout.  For each group returned by
    ``MultiEntrySamReader.read_entries()`` every entry is converted to PSL
    and ranked by ``PSL.get_coverage()``.  Entries with neither flag 64 nor
    flag 128 set compete for the "combined" slot; entries with flag 64 or
    128 compete for the mate-1 / mate-2 slots.  The best combined entry is
    printed if one exists, otherwise the best entry of each mate.
    """
    parser = argparse.ArgumentParser(
        description="Take a sam or bam file and output the best alignment for each read, it still can output the same read name twice if they happen to be mate pairs, but it will only output the best alignment for each individual mate, not necessarily the two together.  You could combine mates if that is helpful with another script."
    )
    parser.add_argument(
        "input", help="FILENAME input .sam or .bam or '-' for STDIN sam")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--bam", action="store_true")
    group.add_argument("--sam", action="store_true")
    args = parser.parse_args()

    from_stdin = args.input == "-"
    if args.bam or not (args.sam or from_stdin):
        # BAM (or an undeclared file): let samtools decode it.  With
        # '--bam -' the BAM bytes come from our own stdin rather than from a
        # file literally named '-'.
        fh = sys.stdin if from_stdin else open(args.input, "rb")
        proc = Popen("samtools view - -h".split(), stdin=fh, stdout=PIPE)
        inf = proc.stdout
    elif from_stdin:
        inf = sys.stdin
    else:
        # '--sam FILE': read the named file instead of silently falling back
        # to stdin.
        inf = open(args.input)

    msr = MultiEntrySamReader(inf)
    spc = SAMtoPSLconversionFactory()

    # set the headers for the spc
    for h in msr.header:
        print(h.rstrip())
        spc.read_header_line(h)

    while True:
        entries = msr.read_entries()
        if not entries:
            break
        # Slot 0: entries carrying no mate flag; slots 1 and 2: the mates.
        longest0 = 0
        entry0 = None
        longest1 = 0
        entry1 = None
        longest2 = 0
        entry2 = None
        for sam in entries:
            pline = spc.convert_line(sam.get_line())
            if not pline:
                continue
            side = None
            if sam.check_flag(64):
                side = 1
            if sam.check_flag(128):
                side = 2
            coverage = PSL(pline).get_coverage()  # computed once per entry
            # Only un-sided entries may claim the combined slot.  Letting
            # every entry compete for it made the per-mate branch below
            # unreachable, so paired reads lost one of their mates.
            if side is None and coverage > longest0:
                longest0 = coverage
                entry0 = sam
            if side == 1 and coverage > longest1:
                longest1 = coverage
                entry1 = sam
            if side == 2 and coverage > longest2:
                longest2 = coverage
                entry2 = sam
        if entry0:
            # output the combined if its there
            print(entry0.get_line())
        else:
            # output each of the mates if they are paired but not joined
            if entry1:
                print(entry1.get_line())
            if entry2:
                print(entry2.get_line())
def main():
    """Write the best alignment per read (or per mate) of a SAM/BAM stream.

    Input selection:

    * ``--bam``, or a filename without ``--sam``: the input is piped
      through ``samtools view - -h``;
    * ``--sam`` with a filename: the file is read directly as SAM text;
    * '-' without ``--bam``: SAM text is read from stdin.

    Header lines are echoed to stdout.  For each group returned by
    ``MultiEntrySamReader.read_entries()`` every entry is converted to PSL
    and ranked by ``PSL.get_coverage()``.  Entries with neither flag 64 nor
    flag 128 set compete for the "combined" slot; entries with flag 64 or
    128 compete for the mate-1 / mate-2 slots.  The best combined entry is
    printed if one exists, otherwise the best entry of each mate.
    """
    parser = argparse.ArgumentParser(
        description=
        "Take a sam or bam file and output the best alignment for each read, it still can output the same read name twice if they happen to be mate pairs, but it will only output the best alignment for each individual mate, not necessarily the two together.  You could combine mates if that is helpful with another script."
    )
    parser.add_argument(
        'input', help="FILENAME input .sam or .bam or '-' for STDIN sam")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--bam', action='store_true')
    group.add_argument('--sam', action='store_true')
    args = parser.parse_args()

    from_stdin = args.input == '-'
    if args.bam or not (args.sam or from_stdin):
        # BAM (or an undeclared file): let samtools decode it.  With
        # '--bam -' the BAM bytes come from our own stdin rather than from a
        # file literally named '-'.
        fh = sys.stdin if from_stdin else open(args.input, 'rb')
        proc = Popen('samtools view - -h'.split(), stdin=fh, stdout=PIPE)
        inf = proc.stdout
    elif from_stdin:
        inf = sys.stdin
    else:
        # '--sam FILE': read the named file instead of silently falling back
        # to stdin.
        inf = open(args.input)

    msr = MultiEntrySamReader(inf)
    spc = SAMtoPSLconversionFactory()

    # set the headers for the spc
    for h in msr.header:
        print(h.rstrip())
        spc.read_header_line(h)

    while True:
        entries = msr.read_entries()
        if not entries:
            break
        # Slot 0: entries carrying no mate flag; slots 1 and 2: the mates.
        longest0 = 0
        entry0 = None
        longest1 = 0
        entry1 = None
        longest2 = 0
        entry2 = None
        for sam in entries:
            pline = spc.convert_line(sam.get_line())
            if not pline:
                continue
            side = None
            if sam.check_flag(64):
                side = 1
            if sam.check_flag(128):
                side = 2
            coverage = PSL(pline).get_coverage()  # computed once per entry
            # Only un-sided entries may claim the combined slot.  Letting
            # every entry compete for it made the per-mate branch below
            # unreachable, so paired reads lost one of their mates.
            if side is None and coverage > longest0:
                longest0 = coverage
                entry0 = sam
            if side == 1 and coverage > longest1:
                longest1 = coverage
                entry1 = sam
            if side == 2 and coverage > longest2:
                longest2 = coverage
                entry2 = sam
        if entry0:
            # output the combined if its there
            print(entry0.get_line())
        else:
            # output each of the mates if they are paired but not joined
            if entry1:
                print(entry1.get_line())
            if entry2:
                print(entry2.get_line())
def main():
    """Join mate pairs of a query-name-ordered SAM/BAM stream.

    Input selection:

    * ``--bam``, or a filename without ``--sam``: the input is piped
      through ``samtools view - -h``;
    * ``--sam`` with a filename: the file is read directly as SAM text;
    * '-' without ``--bam``: SAM text is read from stdin.

    Header lines are echoed to stdout and fed to the SAM-to-PSL converter.
    Read groups from ``MultiEntrySamReader.read_entries()`` are collected
    into batches of 10000 and handed to ``do_buffer``; its result goes to
    ``do_callback``.  With ``--threads`` > 1 the batches are submitted to a
    ``multiprocessing.Pool`` instead of being processed inline.
    """
    parser = argparse.ArgumentParser(description="Take a sam file and join together mate pairs into single alignments.  Alignments must be ordered by query name.")
    parser.add_argument('input',help="FILENAME input .sam or .bam or '-' for STDIN sam")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--sam',action='store_true')
    group.add_argument('--bam',action='store_true')
    parser.add_argument('--mates_only',action='store_true',help="Only output combined mates")
    parser.add_argument('--threads',type=int,default=1,help="Number of threads to use, default is 1")
    args = parser.parse_args()

    from_stdin = args.input == '-'
    if args.bam or not (args.sam or from_stdin):
        # BAM (or an undeclared file): let samtools decode it.  With
        # '--bam -' the BAM bytes come from our own stdin rather than from a
        # file literally named '-'.
        fh = sys.stdin if from_stdin else open(args.input,'rb')
        p = Popen('samtools view - -h'.split(),stdin=fh,stdout=PIPE)
        inf = p.stdout
    elif from_stdin:
        inf = sys.stdin
    else:
        # '--sam FILE': read the named file instead of silently falling back
        # to stdin.
        inf = open(args.input)

    buffer_size = 10000
    msr = MultiEntrySamReader(inf)
    spc = SAMtoPSLconversionFactory()
    psc = PSLtoSAMconversionFactory()

    # set the headers for the spc
    for h in msr.header:
        print(h.rstrip())
        spc.read_header_line(h)

    pool = Pool(processes=args.threads) if args.threads > 1 else None

    def dispatch(chunk):
        # Hand one batch of read groups to a worker, or process it inline.
        # NOTE(review): apply_async must pickle its arguments; confirm that
        # msr (which wraps the input stream) survives that, because task
        # errors are not raised here unless the AsyncResult is fetched.
        if pool is not None:
            pool.apply_async(do_buffer,args=(chunk,msr,spc,psc,args),callback=do_callback)
        else:
            do_callback(do_buffer(chunk,msr,spc,psc,args))

    buffer = []
    while True:
        entries = msr.read_entries()
        if not entries:
            break
        buffer.append(entries)
        if len(buffer) >= buffer_size:
            dispatch(buffer)
            buffer = []  # fresh list; the submitted batch is left untouched
    if buffer:
        dispatch(buffer)

    if pool is not None:
        pool.close()
        pool.join()
def main():
    """Print the alignment count for each read of a HISAT SAM/BAM stream.

    The positional argument is a BAM file ordered by query name (decoded
    through ``samtools view -h``), or '-' to read SAM text from stdin.
    For every group returned by ``MultiEntrySamReader.read_entries()`` one
    line ``qname<TAB>count`` is written to stdout:

    * count is 0 when the first entry of the group has a CIGAR of '*';
    * otherwise count is the larger of the number of entries in the group
      and the value of the ``NH:i:`` tag found on the first entry.

    Exits with a non-zero status if an entry with a real CIGAR carries no
    ``NH:i:`` tag.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input',help="Bam file in order of query name - for stdin")
    args = parser.parse_args()

    inf = sys.stdin
    if args.input != '-':
        # Pass argv as a list: splitting a concatenated command string would
        # break any input path that contains whitespace.
        p = Popen(['samtools','view','-h',args.input],stdout=PIPE)
        inf = p.stdout

    # Compiled once, outside the per-read loop; raw string keeps \d literal.
    nh_tag = re.compile(r'NH:i:(\d+)')

    mesr = MultiEntrySamReader(inf)
    while True:
        entries = mesr.read_entries()
        if not entries:  # covers both None and an empty group
            break
        sam = entries[0]
        if sam.value('cigar') == '*':
            # Single parenthesised argument: valid as a py2 print statement
            # and as a py3 print call.
            print(sam.value('qname')+"\t0")
            continue
        m = nh_tag.search(sam.entry['remainder'])
        if not m:
            sys.stderr.write("ERROR not a hisat entry\n")
            # Signal failure to the caller; a bare sys.exit() reports success.
            sys.exit(1)
        cnt = max(len(entries),int(m.group(1)))
        print(sam.value('qname')+"\t"+str(cnt))