def process_locus(locus, args):
    """Report runs of constant coverage for one locus of SAM alignments.

    Every alignment in ``locus`` is converted SAM -> PSL -> genePred and
    smoothed with ``args.min_intron``.  The resulting exons are piled up into
    a per-position depth, and each maximal run of adjacent positions sharing
    the same depth (and reaching ``args.min_depth``) is passed to
    ``output_depth`` as one tab-separated line:
    reference name, run start, last position + 1, depth.

    Args:
        locus: non-empty sequence of SAM entries; the reference name is read
            from the first entry only.
        args: options object providing ``min_intron`` and ``min_depth``; it is
            also forwarded unchanged to ``output_depth``.
    """
    converter = SAMtoPSLconversionFactory()
    chrom = locus[0].value('rname')

    # Count how many alignments contribute each distinct (start, end) exon so
    # identical exons are expanded into positions only once.
    exon_counts = {}
    for sam in locus:
        psl = PSL(converter.convert_line(sam.get_line()))
        gpd = GenePredEntry(psl.get_genepred_line())
        gpd = gpd.get_smoothed(args.min_intron)
        starts = gpd.value('exonStarts')
        ends = gpd.value('exonEnds')
        for i in range(gpd.get_exon_count()):
            key = (int(starts[i]), int(ends[i]))
            exon_counts[key] = exon_counts.get(key, 0) + 1

    depth = {}
    for (start, end), count in exon_counts.items():
        for pos in range(start, end):
            depth[pos] = depth.get(pos, 0) + count

    def emit(run_start, run_last, run_depth):
        output_depth(chrom + "\t" + str(run_start) + "\t" + str(run_last + 1)
                     + "\t" + str(run_depth), args)

    run_start = None
    run_depth = 0
    last_pos = None
    for pos in sorted(depth):
        cov = depth[pos]
        if cov < args.min_depth:
            continue
        # A run ends when the depth changes OR when positions stop being
        # adjacent; without the adjacency test two equally covered exons (or
        # a stretch below min_depth) were reported as one interval spanning
        # the gap between them.
        if run_start is None or cov != run_depth or pos != last_pos + 1:
            # `is not None` rather than truthiness: 0 is a valid start.
            if run_start is not None:
                emit(run_start, last_pos, run_depth)
            run_start = pos
            run_depth = cov
        last_pos = pos
    if run_start is not None:
        emit(run_start, last_pos, run_depth)
def main():
    """Print a tab-separated summary row for every PSL line of the input.

    The single positional argument names a PSL file, or '-' for STDIN.
    Each row holds: 1-based line number, qName, tName, ``get_coverage()``,
    qSize and ``get_quality()`` of the parsed line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help="PSLFILE or - for STDIN")
    args = parser.parse_args()

    if args.input == '-':
        inf = sys.stdin
    else:
        inf = open(args.input)

    for number, raw in enumerate(inf, 1):
        entry = PSL(raw.rstrip())
        columns = [
            str(number),
            entry.value('qName'),
            entry.value('tName'),
            str(entry.get_coverage()),
            str(entry.value('qSize')),
            str(entry.get_quality()),
        ]
        # Single-argument print() behaves the same under Python 2 and 3.
        print("\t".join(columns))
    inf.close()
def main():
    """Reduce a SAM/BAM stream to the best alignment per read.

    Headers are echoed to STDOUT and also fed to the SAM->PSL converter.
    For each group returned by ``MultiEntrySamReader.read_entries()`` the
    alignment with the largest PSL coverage is kept:

    * if the group contains an alignment with neither mate flag (64 / 128)
      set, the best such alignment is printed;
    * otherwise the best first-mate (flag 64) and the best second-mate
      (flag 128) alignments are printed, so a read name can appear twice.

    Input handling:

    * ``-``           plain SAM from STDIN (BAM from STDIN with ``--bam``)
    * ``--sam FILE``  FILE is read directly as SAM
    * otherwise       FILE is piped through ``samtools view - -h``
    """
    parser = argparse.ArgumentParser(
        description=
        "Take a sam or bam file and output the best alignment for each read, it still can output the same read name twice if they happen to be mate pairs, but it will only output the best alignment for each individual mate, not necessarily the two together.  You could combine mates if that is helpful with another script."
    )
    parser.add_argument(
        'input', help="FILENAME input .sam or .bam or '-' for STDIN sam")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--bam', action='store_true')
    group.add_argument('--sam', action='store_true')
    args = parser.parse_args()

    from_stdin = args.input == '-'
    # Always honour the named file.  Previously `--sam FILE` silently read
    # STDIN instead, and `--bam -` tried to open a file literally named "-".
    source = sys.stdin if from_stdin else open(args.input)
    use_samtools = args.bam or (not args.sam and not from_stdin)
    proc = None
    inf = source
    if use_samtools:
        proc = Popen('samtools view - -h'.split(), stdin=source, stdout=PIPE)
        inf = proc.stdout

    msr = MultiEntrySamReader(inf)
    spc = SAMtoPSLconversionFactory()
    # The converter needs the header lines; they are also passed through.
    for h in msr.header:
        print(h.rstrip())
        spc.read_header_line(h)

    while True:
        entries = msr.read_entries()
        if not entries:
            break
        # Keyed by side: None = no mate flag, 1 = flag 64, 2 = flag 128.
        best = {None: None, 1: None, 2: None}
        longest = {None: 0, 1: 0, 2: 0}
        for sam in entries:
            pline = spc.convert_line(sam.get_line())
            if not pline:
                continue
            side = None
            if sam.check_flag(64):
                side = 1
            if sam.check_flag(128):
                side = 2
            coverage = PSL(pline).get_coverage()
            # Each side competes only with itself.  The old code let every
            # alignment compete for the unflagged slot, which made the
            # per-mate branch below unreachable and printed a single line
            # for a mate pair.
            if coverage > longest[side]:
                longest[side] = coverage
                best[side] = sam
        if best[None] is not None:
            # output the combined (unflagged) alignment if it is there
            print(best[None].get_line())
        else:
            # output each of the mates if they are paired but not joined
            for side in (1, 2):
                if best[side] is not None:
                    print(best[side].get_line())

    # Release the pipe and the input handle instead of leaking them.
    if proc is not None:
        proc.stdout.close()
        proc.wait()
    if not from_stdin:
        source.close()
def read_next(self):
    """Return the next run of consecutive PSL lines sharing one qName.

    Reads from ``self.fh``.  The first line of the following run is kept in
    ``self.previous`` between calls.

    Returns:
        A ``MultiplePSLAlignments`` holding the run, or ``None`` once the
        input is exhausted.
    """
    group = MultiplePSLAlignments()
    group_size = 0

    if self.previous:
        # The previous call already read the first line of this run.
        head_line = self.previous
    else:
        # Nothing buffered: skip forward to the first valid PSL line.
        head_line = self.fh.readline()
        while head_line and not is_valid(head_line.rstrip()):
            head_line = self.fh.readline()
        if not head_line:
            return None

    head = PSL(head_line.rstrip())
    group_name = head.value('qName')
    group.add_entry(head)
    group_size += 1

    while True:
        line = self.fh.readline()
        if not line:
            # End of input: nothing left to buffer for the next call.
            self.previous = None
            return group if group_size > 0 else None
        # NOTE(review): validity is tested on the unstripped line here but on
        # the stripped line when priming above -- kept exactly as it was.
        if not is_valid(line):
            sys.stderr.write("Warning line is not a valid psl line\n" +
                             line.rstrip() + "\n")
            continue  # skip bad lines as if they were never there
        entry = PSL(line.rstrip())
        if entry.value('qName') != group_name:
            # A new run starts here: buffer its first line and hand back
            # what has been collected so far.
            self.previous = line
            if group_size > 0:
                return group
            sys.stderr.write("ERROR: How are we here?\n")
            sys.exit()
        group.add_entry(entry)
        group_size += 1
def main():
    """Summarise each PSL line of the input as one tab-separated row.

    Reads the PSL file given on the command line ('-' selects STDIN) and
    prints, per line: running line number (from 1), qName, tName, the
    result of ``get_coverage()``, qSize and the result of ``get_quality()``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help="PSLFILE or - for STDIN")
    args = parser.parse_args()

    inf = open(args.input) if args.input != '-' else sys.stdin

    line_number = 0
    for raw_line in inf:
        line_number += 1
        psl = PSL(raw_line.rstrip())
        row = str(line_number) + "\t" + psl.value('qName')
        row = row + "\t" + psl.value('tName')
        row = row + "\t" + str(psl.get_coverage())
        row = row + "\t" + str(psl.value('qSize'))
        row = row + "\t" + str(psl.get_quality())
        # print() with one argument is equivalent in Python 2 and Python 3.
        print(row)
    inf.close()
def convert_ARS_to_genomic_psl(self, psl, maximum_intron=400000):
    """Rebuild a PSL alignment after mapping every aligned base.

    Each base of each block of ``psl`` is passed (1-based, with the PSL's
    strand) to ``self.convert_ARS_to_genomic_coordinate``.  The converted
    coordinate lists, extended with the 1-based query position and the
    strand, are handed to ``crush_coords`` and the PSL lines it returns are
    collected into a ``MultiplePSLAlignments`` whose entries get their
    statistics recalculated.

    Args:
        psl: PSL entry exposing ``value()`` for 'blockSizes', 'tStarts',
            'qStarts', 'strand' and 'qName'.
        maximum_intron: forwarded unchanged to ``crush_coords``.

    Returns:
        The populated ``MultiplePSLAlignments``, or ``None`` as soon as a
        single base cannot be converted.
    """
    # Looked up once rather than four times per aligned base.  The unused
    # `psl.copy()` the method used to make on entry has been removed.
    strand = psl.value('strand')
    block_sizes = psl.value('blockSizes')
    t_starts = psl.value('tStarts')
    q_starts = psl.value('qStarts')

    coords = []
    for i in range(len(block_sizes)):
        for j in range(block_sizes[i]):
            # +1 turns the block-relative offsets into 1-based positions
            v = self.convert_ARS_to_genomic_coordinate(t_starts[i] + j + 1,
                                                       in_strand=strand)
            if not v:
                return None
            v.append(q_starts[i] + j + 1)
            v.append(strand)
            coords.append(v)

    name = psl.value('qName')
    psl_lines = crush_coords(coords, maximum_intron, name)
    mpsl = MultiplePSLAlignments()
    for psl_line in psl_lines:
        mpsl.add_entry(PSL(psl_line))
    for i in range(mpsl.entry_count()):
        mpsl.entries[i].recalculate_stats()
    return mpsl
def read_next(self):
    """Collect and return the next block of PSL lines with the same qName.

    Lines are read from ``self.fh``.  When a line with a different qName is
    met it is stored in ``self.previous`` so the following call can start
    from it.

    Returns:
        ``MultiplePSLAlignments`` with the collected entries, or ``None``
        when there is no further input.
    """
    alignments = MultiplePSLAlignments()
    count = 0

    pending = self.previous
    if not pending:
        # First call, or the input ran out last time: find a valid line.
        while True:
            pending = self.fh.readline()
            if not pending:
                return None
            if is_valid(pending.rstrip()):
                break

    first = PSL(pending.rstrip())
    name = first.value('qName')
    alignments.add_entry(first)
    count += 1

    while True:
        nxt = self.fh.readline()
        if not nxt:
            self.previous = None
            if count > 0:
                return alignments
            return None
        # NOTE(review): unlike the priming loop, the line is validated
        # without stripping; this asymmetry is preserved from the original.
        if not is_valid(nxt):
            sys.stderr.write("Warning line is not a valid psl line\n"+nxt.rstrip()+"\n")
            continue  # bad lines are ignored entirely
        candidate = PSL(nxt.rstrip())
        if candidate.value('qName') == name:
            # Still inside the current block of entries.
            alignments.add_entry(candidate)
            count += 1
            continue
        # A different qName: keep the raw line for the next call.
        self.previous = nxt
        if count > 0:
            return alignments
        sys.stderr.write("ERROR: How are we here?\n")
        sys.exit()
def main():
    """Convert SAM input to PSL, optionally exporting the reads as well.

    Header lines are fed to the SAM->PSL converter; every other line is
    converted and written to the output (``-o`` file or STDOUT).  With
    ``--give_unique_names`` each written PSL gets the query name ``Q<n>``
    where ``n`` counts written alignments.  ``--output_fasta`` /
    ``--output_fastq`` additionally write the read of each converted primary
    line, reverse-complemented (and quality-reversed) when flag 16 is set.
    The SA:Z / XA:Z options also convert the alignments returned by
    ``SamBasics.get_secondary_alignments`` / ``get_alternative_alignments``.

    Exits via ``sys.exit()`` when read export is combined with any of the
    extra-alignment options, or when a FASTQ record's sequence and quality
    lengths differ.
    """
    parser = argparse.ArgumentParser(
        description="Convert a sam file into a psl file")
    parser.add_argument('--genome',
                        help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments',
                        action='store_true',
                        help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments',
                        action='store_true',
                        help="Report XA:Z alternative alignments as well")
    parser.add_argument(
        '--get_all_alignments',
        action='store_true',
        help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names',
                        action='store_true',
                        help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--output_fasta',
        help=
        "FILENAME to save an outgoing fasta.  Only works for primary alignments."
    )
    group.add_argument(
        '--output_fastq',
        help=
        "FILENAME to save an outgoing fastq.  Only works for primary alignments."
    )
    parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o',
                        '--output',
                        help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()

    want_reads = args.output_fasta or args.output_fastq
    want_secondary = args.get_secondary_alignments or args.get_all_alignments
    want_alternative = (args.get_alternative_alignments
                        or args.get_all_alignments)
    if want_reads and (want_secondary or want_alternative):
        sys.stderr.write(
            "ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n"
        )
        sys.exit()

    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.genome:
        spcf.set_genome(args.genome)
    off = None
    if args.output_fasta:
        off = open(args.output_fasta, 'w')
    if args.output_fastq:
        off = open(args.output_fastq, 'w')

    def write_psl(psl_line, number):
        # Write one PSL line (renamed when requested); return the PSL object.
        pobj = PSL(psl_line)
        if args.give_unique_names:
            pobj.entry['qName'] = 'Q' + str(number)
        of.write(pobj.get_line() + "\n")
        return pobj

    def write_read(sam_line, qname):
        # Export the read of a primary alignment in its original orientation.
        sam = SamBasics.SAM(sam_line)
        sequence = sam.value('seq').upper()
        quality = sam.value('qual')
        if sam.check_flag(16):
            sequence = rc(sam.value('seq').upper())
            quality = sam.value('qual')[::-1]
        if args.output_fasta:
            off.write(">" + qname + "\n" + sequence + "\n")
        elif args.output_fastq:
            if len(sequence) == len(quality):
                off.write("@" + qname + "\n" + sequence + "\n" + "+\n" +
                          quality + "\n")
            else:
                sys.stderr.write("ERROR: sequence " + sequence + " length (" +
                                 str(len(sequence)) +
                                 ") doesnt match quality " + quality +
                                 " length (" + str(len(quality)) + ")\n")
                sys.exit()

    # try/finally so every handle -- including the FASTA/FASTQ file, which
    # was never closed before -- is flushed and released even when
    # sys.exit() is raised mid-stream.
    try:
        z = 0
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                spcf.read_header_line(line)
                continue
            # We have a line to convert
            psl = spcf.convert_line(line)
            if psl:
                z += 1
                pobj = write_psl(psl, z)
                if want_reads:
                    write_read(line, pobj.value('qName'))
            # Lets look for secondary alignments to convert
            if want_secondary:
                for samline in SamBasics.get_secondary_alignments(line):
                    psl = spcf.convert_line(samline)
                    if psl:
                        z += 1
                        write_psl(psl, z)
            if want_alternative:
                for samline in SamBasics.get_alternative_alignments(line):
                    psl = spcf.convert_line(samline)
                    if psl:
                        z += 1
                        write_psl(psl, z)
    finally:
        inf.close()
        of.close()
        if off is not None:
            off.close()
def do_psl(args):
    """Print the sum of the block sizes of every PSL line in ``args.input``.

    Args:
        args: object whose ``input`` attribute is an iterable of PSL lines.
    """
    for psl_line in args.input:
        block_sizes = PSL(psl_line).value('blockSizes')
        # Single-argument print() behaves the same under Python 2 and 3.
        print(sum(block_sizes))
def main():
    """Command-line entry point: translate SAM alignments into PSL lines.

    SAM header lines are passed to the converter; each alignment line is
    converted and written to ``--output`` (STDOUT by default).  Optional
    behaviour, all driven by the parsed arguments:

    * ``--give_unique_names``: written PSLs are renamed ``Q1``, ``Q2``, ...
    * ``--output_fasta`` / ``--output_fastq``: also write the read of each
      converted primary line (reverse-complemented when flag 16 is set).
    * ``--get_secondary_alignments`` / ``--get_alternative_alignments`` /
      ``--get_all_alignments``: also convert the SA:Z / XA:Z alignments that
      ``SamBasics`` extracts from the line.

    Read export cannot be combined with the extra-alignment options; that
    combination, and a FASTQ sequence/quality length mismatch, terminate the
    program through ``sys.exit()``.
    """
    parser = argparse.ArgumentParser(description="Convert a sam file into a psl file")
    parser.add_argument('--genome',help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments',action='store_true',help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments',action='store_true',help="Report XA:Z alternative alignments as well")
    parser.add_argument('--get_all_alignments',action='store_true',help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names',action='store_true',help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--output_fasta',help="FILENAME to save an outgoing fasta.  Only works for primary alignments.")
    group.add_argument('--output_fastq',help="FILENAME to save an outgoing fastq.  Only works for primary alignments.")
    parser.add_argument('infile',help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o','--output',help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()

    if (args.output_fasta or args.output_fastq) and (args.get_secondary_alignments or args.get_alternative_alignments or args.get_all_alignments):
        sys.stderr.write("ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n")
        sys.exit()

    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    of = sys.stdout
    if args.output:
        of = open(args.output,'w')
    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.genome:
        spcf.set_genome(args.genome)
    off = None
    if args.output_fasta:
        off = open(args.output_fasta,'w')
    if args.output_fastq:
        off = open(args.output_fastq,'w')

    # (enabled?, extractor) pairs, processed in this order for every line.
    extra_sources = (
        (args.get_secondary_alignments or args.get_all_alignments,
         SamBasics.get_secondary_alignments),
        (args.get_alternative_alignments or args.get_all_alignments,
         SamBasics.get_alternative_alignments),
    )

    # The FASTA/FASTQ handle used to be left open for the interpreter to
    # clean up; close all three handles explicitly, also on sys.exit().
    try:
        z = 0
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                spcf.read_header_line(line)
                continue
            # We have a line to convert
            psl = spcf.convert_line(line)
            if psl:
                pobj = PSL(psl)
                z += 1
                if args.give_unique_names:
                    pobj.entry['qName'] = 'Q'+str(z)
                of.write(pobj.get_line()+"\n")
                if args.output_fastq or args.output_fasta:
                    sam = SamBasics.SAM(line)
                    reverse = sam.check_flag(16)
                    sequence = sam.value('seq').upper()
                    quality = sam.value('qual')
                    if reverse:
                        # report the read as it was sequenced
                        sequence = rc(sequence)
                        quality = quality[::-1]
                    if args.output_fasta:
                        off.write(">"+pobj.value('qName')+"\n"+sequence+"\n")
                    elif args.output_fastq:
                        if len(sequence) != len(quality):
                            sys.stderr.write("ERROR: sequence "+sequence+" length ("+str(len(sequence))+") doesnt match quality "+quality+" length ("+str(len(quality))+")\n")
                            sys.exit()
                        off.write("@"+pobj.value('qName')+"\n"+sequence+"\n"+"+\n"+quality+"\n")
            # Lets look for secondary / alternative alignments to convert
            for enabled, extract in extra_sources:
                if not enabled:
                    continue
                for samline in extract(line):
                    psl = spcf.convert_line(samline)
                    if not psl:
                        continue
                    z += 1
                    pobj = PSL(psl)
                    if args.give_unique_names:
                        pobj.entry['qName'] = 'Q'+str(z)
                    of.write(pobj.get_line()+"\n")
    finally:
        inf.close()
        of.close()
        if off is not None:
            off.close()
def do_buffer(buffer,msr,spc,psc,args):
    """Join uniquely paired mates in each buffered group of SAM entries.

    For every group in ``buffer``:

    * entries for which ``not_a_mate_sam`` is true are passed through
      (unless ``args.mates_only``);
    * if the group does not hold exactly one flag-64 and one flag-128 entry,
      those entries are passed through (unless ``args.mates_only``);
    * otherwise both mates are converted to PSL, joined with ``join_mated``
      and the joined alignment is converted back with ``psc.convert_line``;
      when joining fails the two mates are passed through instead
      (unless ``args.mates_only``).

    The program exits if a unique pair lies on different references or both
    mates report the same state for flag 16.

    Args:
        buffer: iterable of groups, each an iterable of SAM entries.
        msr: not used by this function.
        spc: SAM -> PSL converter (``convert_line``).
        psc: PSL -> SAM converter (``convert_line``).
        args: options object providing ``mates_only``.

    Returns:
        List of output SAM lines.
    """
    def as_psl(mate):
        # Convert one mate to PSL, attaching query/quality when the CIGAR
        # has no H or P operation.
        psl = PSL(spc.convert_line(mate.get_line()))
        if re.search('[HP]',mate.value('cigar')):
            return psl
        psl.set_query(mate.value('seq'))
        psl.set_quality_seq(mate.value('qual'))
        if mate.check_flag(16):
            # set the query to what it actually is
            psl.set_query(rc(mate.value('seq')))
            psl.set_quality_seq(mate.value('qual')[::-1])
        return psl

    outputs = []
    for entries in buffer:
        firsts = []
        seconds = []
        for sam in entries:
            if not_a_mate_sam(sam):
                # not part of a pair: emit as is
                if not args.mates_only:
                    outputs.append(sam.get_line())
                continue
            if sam.check_flag(64):
                firsts.append(sam)
            if sam.check_flag(128):
                seconds.append(sam)

        if len(firsts) != 1 or len(seconds) != 1:
            # more than just a unique pair here
            if not args.mates_only:
                outputs.extend(sam.get_line() for sam in firsts + seconds)
            continue

        left = firsts[0]
        right = seconds[0]
        # Verify pairing by reference and direction
        same_reference = left.value('rname') == right.value('rname')
        if not same_reference or left.check_flag(16) == right.check_flag(16):
            sys.stderr.write("ERROR, these are not actually properly paired as we were led to believe\n")
            sys.exit()

        p1 = as_psl(left)
        p2 = as_psl(right)
        joined = join_mated(p1,p2)
        if not joined:
            if not args.mates_only:
                outputs.append(left.get_line())
                outputs.append(right.get_line())
            continue
        outputs.append(psc.convert_line(joined.get_line(),query_sequence=joined.get_query(),quality_sequence=joined.get_quality_seq()))
    return outputs
def do_buffer(buffer, msr, spc, psc, args):
    """Merge unique mate pairs of each buffered read group into one line.

    Each element of ``buffer`` is a group of SAM entries.  Non-mate entries
    (per ``not_a_mate_sam``) and groups without exactly one flag-64 plus one
    flag-128 entry are copied to the output unless ``args.mates_only`` is
    set.  A unique pair is converted to PSL, joined by ``join_mated`` and
    converted back to a SAM line through ``psc.convert_line``; if the join
    yields nothing the two original lines are copied instead (again unless
    ``args.mates_only``).

    A unique pair on different references, or with equal flag-16 state on
    both mates, aborts the program with an error message.

    Args:
        buffer: iterable of groups of SAM entries.
        msr: accepted but unused here.
        spc: converter used for SAM -> PSL.
        psc: converter used for PSL -> SAM.
        args: options object; only ``mates_only`` is read.

    Returns:
        The list of SAM lines produced.
    """
    outputs = []
    for entries in buffer:
        mates = {64: [], 128: []}
        for sam in entries:
            # Print line if its not a pair
            if not_a_mate_sam(sam):
                if not args.mates_only:
                    outputs.append(sam.get_line())
                continue
            for flag in (64, 128):
                if sam.check_flag(flag):
                    mates[flag].append(sam)

        l = mates[64]
        r = mates[128]
        unique_pair = len(l) == 1 and len(r) == 1
        if not unique_pair:
            # more than just a unique pair here
            if not args.mates_only:
                for sam in l:
                    outputs.append(sam.get_line())
                for sam in r:
                    outputs.append(sam.get_line())
            continue

        # Verify pairing by reference and direction
        if l[0].value('rname') != r[0].value('rname') or l[0].check_flag(
                16) == r[0].check_flag(16):
            sys.stderr.write(
                "ERROR, these are not actually properly paired as we were led to believe\n"
            )
            sys.exit()

        converted = []
        for mate in (l[0], r[0]):
            psl = PSL(spc.convert_line(mate.get_line()))
            # Query and quality are attached only when the CIGAR carries no
            # H or P operation.
            if not re.search('[HP]', mate.value('cigar')):
                psl.set_query(mate.value('seq'))
                psl.set_quality_seq(mate.value('qual'))
                if mate.check_flag(16):
                    # set the query to what it actually is
                    psl.set_query(rc(mate.value('seq')))
                    psl.set_quality_seq(mate.value('qual')[::-1])
            converted.append(psl)

        p12 = join_mated(converted[0], converted[1])
        if p12:
            sline = psc.convert_line(p12.get_line(),
                                     query_sequence=p12.get_query(),
                                     quality_sequence=p12.get_quality_seq())
            outputs.append(sline)
        elif not args.mates_only:
            outputs.append(l[0].get_line())
            outputs.append(r[0].get_line())
    return outputs