Exemple #1
0
def process_locus(locus, args):
  """Report runs of constant read depth across one locus.

  Every SAM entry in ``locus`` is converted SAM -> PSL -> genepred and
  smoothed with ``args.min_intron``; its exons are tallied, per-base depth
  is accumulated from the tallied exons, and each maximal run of adjacent
  positions that share one depth (and reach ``args.min_depth``) is handed to
  ``output_depth`` as "rname<TAB>start<TAB>end<TAB>depth", where ``end`` is
  one past the last position of the run.

  Args:
    locus: indexable, iterable collection of SAM entries; the reference
      name is read from the first entry.
    args: options object providing ``min_intron`` and ``min_depth``; it is
      also passed through to ``output_depth``.
  """
  if not locus:
    return  # nothing to report, and locus[0] below would raise
  s2psl = SAMtoPSLconversionFactory()
  rname = locus[0].value('rname')

  # Tally identical exons first so each distinct range is walked only once.
  exon_counts = {}
  for sam in locus:
    p = PSL(s2psl.convert_line(sam.get_line()))
    g = GenePredEntry(p.get_genepred_line())
    g = g.get_smoothed(args.min_intron)
    starts = g.value('exonStarts')
    ends = g.value('exonEnds')
    for i in range(g.get_exon_count()):
      key = (int(starts[i]), int(ends[i]))
      exon_counts[key] = exon_counts.get(key, 0) + 1

  depth = {}
  for (start, end), count in exon_counts.items():
    for pos in range(start, end):
      depth[pos] = depth.get(pos, 0) + count  # add the number of these to the depth

  def emit(first, last, value):
    output_depth(rname+"\t"+str(first)+"\t"+str(last+1)+"\t"+str(value),args)

  # Now we can print the depth, one line per run.
  run_start = None
  run_depth = None
  last_pos = None
  for pos in sorted(depth):
    here = depth[pos]
    if here < args.min_depth: continue
    # A run ends when the depth changes OR when the positions stop being
    # adjacent (an uncovered or filtered stretch lies in between).
    if run_start is None or here != run_depth or pos != last_pos + 1:
      # Compare against None: position 0 is a legitimate run start.
      if run_start is not None:
        emit(run_start, last_pos, run_depth)
      run_start = pos
      run_depth = here
    last_pos = pos
  if run_start is not None:
    emit(run_start, last_pos, run_depth)
def main():
  """Print one tab-separated summary row per PSL line of the input.

  Columns are: running line number (from 1), qName, tName, coverage,
  qSize and quality.  The input is the file named on the command line,
  or STDIN when the argument is '-'.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('input',help="PSLFILE or - for STDIN")
  args = parser.parse_args()
  if args.input == '-':
    inf = sys.stdin
  else:
    inf = open(args.input)
  for z, line in enumerate(inf, 1):
    p = PSL(line.rstrip())
    columns = [
      str(z),
      p.value('qName'),
      p.value('tName'),
      str(p.get_coverage()),
      str(p.value('qSize')),
      str(p.get_quality()),
    ]
    print("\t".join(columns))
  inf.close()
Exemple #3
0
def main():
    """Print the best alignment(s) for each read of a SAM/BAM input.

    Header lines are echoed unchanged and also fed to the SAM->PSL
    converter.  For each group of entries returned by the reader, the
    entry with the highest PSL coverage is chosen separately for unpaired
    entries (neither flag 64 nor 128), first mates (flag 64) and second
    mates (flag 128).  The best unpaired entry is printed when one exists;
    otherwise the best entry of each mate is printed.

    Input selection: with --bam, or with a file name and no --sam, the
    file is piped through ``samtools view - -h``; with --sam and a file
    name the file is read directly as SAM; '-' reads SAM from STDIN.
    """
    parser = argparse.ArgumentParser(
        description=
        "Take a sam or bam file and output the best alignment for each read, it still can output the same read name twice if they happen to be mate pairs, but it will only output the best alignment for each individual mate, not necessarily the two together.  You could combine mates if that is helpful with another script."
    )
    parser.add_argument(
        'input', help="FILENAME input .sam or .bam or '-' for STDIN sam")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--bam', action='store_true')
    group.add_argument('--sam', action='store_true')
    args = parser.parse_args()

    inf = sys.stdin
    fh = None
    proc = None
    if args.bam or (not args.sam and not args.input == '-'):
        fh = open(args.input)
        proc = Popen('samtools view - -h'.split(), stdin=fh, stdout=PIPE)
        inf = proc.stdout
    elif args.input != '-':
        # --sam with a file name: read that file instead of silently
        # falling back to STDIN.
        fh = open(args.input)
        inf = fh

    try:
        msr = MultiEntrySamReader(inf)
        spc = SAMtoPSLconversionFactory()
        # set the headers for the spc
        for h in msr.header:
            print(h.rstrip())
            spc.read_header_line(h)
        while True:
            entries = msr.read_entries()
            if not entries: break
            # Best coverage / entry per side: None = unpaired, 1 and 2 = mates.
            best_cov = {None: 0, 1: 0, 2: 0}
            best_sam = {None: None, 1: None, 2: None}
            for sam in entries:
                pline = spc.convert_line(sam.get_line())
                if not pline: continue
                side = None
                if sam.check_flag(64): side = 1
                if sam.check_flag(128): side = 2
                coverage = PSL(pline).get_coverage()
                # Each entry competes only within its own side; ranking the
                # mates against each other would suppress one of them.
                if coverage > best_cov[side]:
                    best_cov[side] = coverage
                    best_sam[side] = sam
            if best_sam[None] is not None:  # output the combined if its there
                print(best_sam[None].get_line())
            else:
                # output each of the mates if they are paired but not joined
                for side in (1, 2):
                    if best_sam[side] is not None:
                        print(best_sam[side].get_line())
    finally:
        if proc is not None:
            proc.stdout.close()
            proc.wait()  # reap samtools so it does not linger as a zombie
        if fh is not None:
            fh.close()
 def read_next(self):
     """Return the next run of consecutive PSL lines that share a qName.

     The run is returned as a MultiplePSLAlignments; None is returned once
     the input is exhausted.  The first line of the following run is kept
     in ``self.previous`` so the next call can start from it.
     """
     group = MultiplePSLAlignments()
     count = 0
     first_line = self.previous
     if not first_line:
         # Nothing buffered: scan forward to the first valid PSL line.
         while True:
             first_line = self.fh.readline()
             if not first_line:
                 return None
             if is_valid(first_line.rstrip()):
                 break
     first = PSL(first_line.rstrip())
     current_name = first.value('qName')
     group.add_entry(first)
     count += 1
     while True:
         line = self.fh.readline()
         if not line:
             # End of input: hand back whatever was collected.
             self.previous = None
             return group if count > 0 else None
         if not is_valid(line):
             # just skip strange bad lines like we never saw them
             sys.stderr.write("Warning line is not a valid psl line\n" +
                              line.rstrip() + "\n")
             continue
         entry = PSL(line.rstrip())
         if entry.value('qName') != current_name:
             # A new query begins: buffer its line and return this run.
             self.previous = line
             if count > 0:
                 return group
             sys.stderr.write("ERROR: How are we here?\n")
             sys.exit()
         # Still working on the current set of entries.
         group.add_entry(entry)
         count += 1
Exemple #5
0
def main():
    """Summarise each PSL line of the input as one tab-separated row.

    Each row holds the running line number (starting at 1), qName, tName,
    coverage, qSize and quality.  Reads the named file, or STDIN when the
    argument is '-'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help="PSLFILE or - for STDIN")
    args = parser.parse_args()
    inf = sys.stdin if args.input == '-' else open(args.input)
    row_number = 0
    for raw in inf:
        row_number += 1
        psl = PSL(raw.rstrip())
        prefix = str(row_number) + "\t" + psl.value('qName') + "\t" + psl.value('tName')
        coverage = str(psl.get_coverage())
        query_size = str(psl.value('qSize'))
        quality = str(psl.get_quality())
        print("\t".join((prefix, coverage, query_size, quality)))
    inf.close()
Exemple #6
0
 def convert_ARS_to_genomic_psl(self, psl, maximum_intron=400000):
     """Re-express a PSL alignment in genomic coordinates.

     Every aligned base of every block is mapped through
     ``self.convert_ARS_to_genomic_coordinate``; the mapped coordinates are
     then merged by ``crush_coords`` into PSL lines, which are wrapped in a
     MultiplePSLAlignments with their stats recalculated.

     Args:
         psl: alignment providing 'blockSizes', 'tStarts', 'qStarts',
             'strand' and 'qName' through ``value()``.
         maximum_intron: passed through to ``crush_coords``.

     Returns:
         A MultiplePSLAlignments, or None as soon as any base fails to
         convert.
     """
     # These do not change per base, so read them once up front.  Reading
     # the name here also keeps it defined when there are no aligned bases.
     strand = psl.value('strand')
     name = psl.value('qName')
     block_sizes = psl.value('blockSizes')
     t_starts = psl.value('tStarts')
     q_starts = psl.value('qStarts')

     coords = []
     for i in range(0, len(block_sizes)):
         for j in range(0, block_sizes[i]):
             # +1 on both positions: presumably 0-based PSL offsets to the
             # 1-based positions the converter expects -- TODO confirm.
             v = self.convert_ARS_to_genomic_coordinate(
                 t_starts[i] + j + 1, in_strand=strand)
             if not v: return None
             v.append(q_starts[i] + j + 1)
             v.append(strand)
             coords.append(v)
     psl_lines = crush_coords(coords, maximum_intron, name)
     mpsl = MultiplePSLAlignments()
     for psl_line in psl_lines:
         mpsl.add_entry(PSL(psl_line))
     for i in range(0, mpsl.entry_count()):
         mpsl.entries[i].recalculate_stats()
     return mpsl
 def read_next(self):
   """Collect and return the next block of same-qName PSL lines.

   Returns a MultiplePSLAlignments holding consecutive entries with one
   query name, or None when no more input is available.  The line that
   opens the following block is stored in ``self.previous``.
   """
   batch = MultiplePSLAlignments()
   size = 0
   pending = self.previous
   if not pending:
     # First call (or buffer empty): advance to the first valid PSL line.
     pending = self.fh.readline()
     while pending and not is_valid(pending.rstrip()):
       pending = self.fh.readline()
     if not pending:
       return None
   head = PSL(pending.rstrip())
   current_name = head.value('qName')
   batch.add_entry(head)
   size += 1
   while True:
     raw = self.fh.readline()
     if not raw:
       self.previous = None
       if size > 0:
         return batch
       return None
     if not is_valid(raw):
       # just skip strange bad lines like we never saw them
       sys.stderr.write("Warning line is not a valid psl line\n"+raw.rstrip()+"\n")
       continue
     candidate = PSL(raw.rstrip())
     same_query = candidate.value('qName') == current_name
     if same_query:
       batch.add_entry(candidate)
       size += 1
       continue
     # A different query name: buffer the line and emit what we have.
     self.previous = raw
     if size > 0:
       return batch
     sys.stderr.write("ERROR: How are we here?\n")
     sys.exit()
def main():
    """Convert SAM records to PSL lines.

    Header lines configure the converter; every other line is converted
    and written to the output.  Optionally the SA:Z secondary and XA:Z
    alternative alignments of each record are converted as well, and the
    reads of primary alignments can be saved as FASTA or FASTQ (reverse
    complemented, with reversed qualities, when flag 16 is set).

    All opened handles, including the FASTA/FASTQ file, are closed on the
    way out, also when the run stops early.
    """
    parser = argparse.ArgumentParser(
        description="Convert a sam file into a psl file")
    parser.add_argument('--genome',
                        help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments',
                        action='store_true',
                        help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments',
                        action='store_true',
                        help="Report XA:Z alternative alignments as well")
    parser.add_argument(
        '--get_all_alignments',
        action='store_true',
        help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names',
                        action='store_true',
                        help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--output_fasta',
        help=
        "FILENAME to save an outgoing fasta.  Only works for primary alignments."
    )
    group.add_argument(
        '--output_fastq',
        help=
        "FILENAME to save an outgoing fastq.  Only works for primary alignments."
    )
    parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o',
                        '--output',
                        help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()
    if (args.output_fasta
            or args.output_fastq) and (args.get_secondary_alignments
                                       or args.get_alternative_alignments
                                       or args.get_all_alignments):
        sys.stderr.write(
            "ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n"
        )
        sys.exit()
    want_secondary = args.get_secondary_alignments or args.get_all_alignments
    want_alternative = (args.get_alternative_alignments
                        or args.get_all_alignments)

    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    off = None
    if args.output_fasta:
        off = open(args.output_fasta, 'w')
    if args.output_fastq:
        off = open(args.output_fastq, 'w')

    z = 0  # running count of PSL lines written, used for unique names
    try:
        spcf = SamBasics.SAMtoPSLconversionFactory()
        if args.genome: spcf.set_genome(args.genome)
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                spcf.read_header_line(line)
                continue
            # We have a line to convert
            psl = spcf.convert_line(line)
            if psl:
                pobj = PSL(psl)
                z += 1
                if args.give_unique_names:
                    pobj.entry['qName'] = 'Q' + str(z)
                of.write(pobj.get_line() + "\n")
                if off is not None:
                    sam = SamBasics.SAM(line)
                    sequence = sam.value('seq').upper()
                    quality = sam.value('qual')
                    if sam.check_flag(16):
                        # Flag 16: restore the read's own orientation.
                        sequence = rc(sequence)
                        quality = quality[::-1]
                    if args.output_fasta:
                        off.write(">" + pobj.value('qName') + "\n" + sequence +
                                  "\n")
                    elif len(sequence) == len(quality):
                        off.write("@" + pobj.value('qName') + "\n" + sequence +
                                  "\n" + "+\n" + quality + "\n")
                    else:
                        sys.stderr.write("ERROR: sequence " + sequence +
                                         " length (" + str(len(sequence)) +
                                         ") doesnt match quality " + quality +
                                         " length (" + str(len(quality)) +
                                         ")\n")
                        sys.exit()
            # Lets look for secondary / alternative alignments to convert;
            # both kinds are emitted the same way, secondary ones first.
            extra_lines = []
            if want_secondary:
                extra_lines.extend(SamBasics.get_secondary_alignments(line))
            if want_alternative:
                extra_lines.extend(SamBasics.get_alternative_alignments(line))
            for samline in extra_lines:
                psl = spcf.convert_line(samline)
                if not psl:
                    continue
                z += 1
                pobj = PSL(psl)
                if args.give_unique_names:
                    pobj.entry['qName'] = 'Q' + str(z)
                of.write(pobj.get_line() + "\n")
    finally:
        inf.close()
        of.close()
        if off is not None:
            off.close()  # flush the fasta/fastq output
def do_psl(args):
  """Print the sum of the block sizes of every PSL line in ``args.input``."""
  for record in args.input:
    block_sizes = PSL(record).value('blockSizes')
    print(sum(block_sizes))
def main():
  """Convert SAM records to PSL lines.

  Header lines configure the converter; every other line is converted and
  written to the output.  Optionally the SA:Z secondary and XA:Z
  alternative alignments of each record are converted too, and the reads
  of primary alignments can be saved as FASTA or FASTQ (reverse
  complemented, with reversed qualities, when flag 16 is set).

  Every opened handle, including the FASTA/FASTQ file, is closed on the
  way out, also when the run stops early.
  """
  parser = argparse.ArgumentParser(description="Convert a sam file into a psl file")
  parser.add_argument('--genome',help="FASTA input file of reference genome")
  parser.add_argument('--get_secondary_alignments',action='store_true',help="Report SA:Z secondary alignments as well")
  parser.add_argument('--get_alternative_alignments',action='store_true',help="Report XA:Z alternative alignments as well")
  parser.add_argument('--get_all_alignments',action='store_true',help="Report SA:Z and XA:Z alternative alignments as well")
  parser.add_argument('--give_unique_names',action='store_true',help="Output query names will be unique.")
  group = parser.add_mutually_exclusive_group()
  group.add_argument('--output_fasta',help="FILENAME to save an outgoing fasta.  Only works for primary alignments.")
  group.add_argument('--output_fastq',help="FILENAME to save an outgoing fastq.  Only works for primary alignments.")
  parser.add_argument('infile',help="FILENAME input file or '-' for STDIN")
  parser.add_argument('-o','--output',help="FILENAME for the output, STDOUT if not set.")
  args = parser.parse_args()
  if (args.output_fasta or args.output_fastq) and (args.get_secondary_alignments or args.get_alternative_alignments or args.get_all_alignments):
    sys.stderr.write("ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n")
    sys.exit()
  want_secondary = args.get_secondary_alignments or args.get_all_alignments
  want_alternative = args.get_alternative_alignments or args.get_all_alignments

  inf = sys.stdin
  if args.infile != '-':
    inf = open(args.infile)
  of = sys.stdout
  if args.output:
    of = open(args.output,'w')
  off = None
  if args.output_fasta:
    off = open(args.output_fasta,'w')
  if args.output_fastq:
    off = open(args.output_fastq,'w')

  z = 0  # running count of PSL lines written, used for unique names
  try:
    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.genome: spcf.set_genome(args.genome)
    for line in inf:
      line = line.rstrip()
      if SamBasics.is_header(line):
        spcf.read_header_line(line)
        continue
      # We have a line to convert
      psl = spcf.convert_line(line)
      if psl:
        pobj = PSL(psl)
        z += 1
        if args.give_unique_names:
          pobj.entry['qName'] = 'Q'+str(z)
        of.write(pobj.get_line()+"\n")
        if off is not None:
          sam = SamBasics.SAM(line)
          sequence = sam.value('seq').upper()
          quality = sam.value('qual')
          if sam.check_flag(16):
            # Flag 16: restore the read's own orientation.
            sequence = rc(sequence)
            quality = quality[::-1]
          if args.output_fasta:
            off.write(">"+pobj.value('qName')+"\n"+sequence+"\n")
          elif len(sequence) == len(quality):
            off.write("@"+pobj.value('qName')+"\n"+sequence+"\n"+"+\n"+quality+"\n")
          else:
            sys.stderr.write("ERROR: sequence "+sequence+" length ("+str(len(sequence))+") doesnt match quality "+quality+" length ("+str(len(quality))+")\n")
            sys.exit()
      # Lets look for secondary / alternative alignments to convert;
      # both kinds are emitted the same way, secondary ones first.
      extra_lines = []
      if want_secondary:
        extra_lines.extend(SamBasics.get_secondary_alignments(line))
      if want_alternative:
        extra_lines.extend(SamBasics.get_alternative_alignments(line))
      for samline in extra_lines:
        psl = spcf.convert_line(samline)
        if not psl:
          continue
        z += 1
        pobj = PSL(psl)
        if args.give_unique_names:
          pobj.entry['qName'] = 'Q'+str(z)
        of.write(pobj.get_line()+"\n")
  finally:
    inf.close()
    of.close()
    if off is not None:
      off.close()  # flush the fasta/fastq output
def do_buffer(buffer,msr,spc,psc,args):
  """Join uniquely paired mates in each buffered group into one SAM line.

  For every group of SAM entries in ``buffer``: non-mate entries are
  passed through; a group with exactly one first mate (flag 64) and one
  second mate (flag 128) is converted to PSL, joined with ``join_mated``
  and converted back with ``psc``.  Anything that cannot be joined is
  passed through unchanged.  Pass-through lines are suppressed when
  ``args.mates_only`` is set.  ``msr`` is not used.

  Returns the list of output lines.  Exits the program when a unique
  pair is on different references or both mates have the same flag 16
  state.
  """
  def mate_to_psl(sam):
    # Build the PSL for one mate, attaching query/quality when the CIGAR
    # has no H or P operation.
    psl = PSL(spc.convert_line(sam.get_line()))
    if re.search('[HP]',sam.value('cigar')):
      return psl
    seq = sam.value('seq')
    qual = sam.value('qual')
    psl.set_query(seq)
    psl.set_quality_seq(qual)
    if sam.check_flag(16):
      # set the query to what it actually is
      psl.set_query(rc(seq))
      psl.set_quality_seq(qual[::-1])
    return psl

  outputs = []
  keep_unjoined = lambda: not args.mates_only
  for entries in buffer:
    firsts = []
    seconds = []
    for sam in entries:
      if not_a_mate_sam(sam):
        # Print line if its not a pair
        if keep_unjoined():
          outputs.append(sam.get_line())
        continue
      if sam.check_flag(64): firsts.append(sam)
      if sam.check_flag(128): seconds.append(sam)
    if len(firsts) != 1 or len(seconds) != 1:
      # more than just a unique pair here
      if keep_unjoined():
        outputs.extend(sam.get_line() for sam in firsts)
        outputs.extend(sam.get_line() for sam in seconds)
      continue
    left = firsts[0]
    right = seconds[0]
    # Verify pairing by reference and direction
    if left.value('rname') != right.value('rname') or left.check_flag(16) == right.check_flag(16):
      sys.stderr.write("ERROR, these are not actually properly paired as we were led to believe\n")
      sys.exit()
    p1 = mate_to_psl(left)
    p2 = mate_to_psl(right)
    p12 = join_mated(p1,p2)
    if not p12:
      if keep_unjoined():
        outputs.append(left.get_line())
        outputs.append(right.get_line())
      continue
    joined = psc.convert_line(p12.get_line(),query_sequence=p12.get_query(),quality_sequence=p12.get_quality_seq())
    outputs.append(joined)
  return outputs
Exemple #12
0
def do_buffer(buffer, msr, spc, psc, args):
    """Merge each uniquely paired set of mates into a single SAM line.

    Each element of ``buffer`` is a group of SAM entries.  Entries that
    are not mates are forwarded as-is.  When a group holds exactly one
    flag-64 mate and one flag-128 mate, both are converted to PSL, joined
    by ``join_mated`` and turned back into a SAM line by ``psc``; groups or
    pairs that cannot be joined are forwarded as-is.  Forwarded lines are
    dropped when ``args.mates_only`` is true.  ``msr`` is not used.

    Returns the collected output lines.  Terminates the program if a
    unique pair disagrees on reference name or shares the same flag 16
    state.
    """
    outputs = []
    for entries in buffer:
        mates = {64: [], 128: []}
        for sam in entries:
            #Print line if its not a pair
            if not_a_mate_sam(sam):
                if not args.mates_only:
                    outputs.append(sam.get_line())
                continue
            for flag in (64, 128):
                if sam.check_flag(flag):
                    mates[flag].append(sam)
        unique_pair = len(mates[64]) == 1 and len(mates[128]) == 1
        if not unique_pair:
            # more than just a unique pair here
            if not args.mates_only:
                for sam in mates[64] + mates[128]:
                    outputs.append(sam.get_line())
            continue
        first, second = mates[64][0], mates[128][0]
        #Verify pairing by reference and direction
        different_ref = first.value('rname') != second.value('rname')
        if different_ref or first.check_flag(16) == second.check_flag(16):
            sys.stderr.write(
                "ERROR, these are not actually properly paired as we were led to believe\n"
            )
            sys.exit()
        psls = []
        for mate in (first, second):
            psl = PSL(spc.convert_line(mate.get_line()))
            if not re.search('[HP]', mate.value('cigar')):
                psl.set_query(mate.value('seq'))
                psl.set_quality_seq(mate.value('qual'))
                if mate.check_flag(16):
                    # set the query to what it actually is
                    psl.set_query(rc(mate.value('seq')))
                    psl.set_quality_seq(mate.value('qual')[::-1])
            psls.append(psl)
        p12 = join_mated(psls[0], psls[1])
        if p12:
            outputs.append(
                psc.convert_line(p12.get_line(),
                                 query_sequence=p12.get_query(),
                                 quality_sequence=p12.get_quality_seq()))
        elif not args.mates_only:
            outputs.append(first.get_line())
            outputs.append(second.get_line())
    return outputs
def do_psl(args):
    """Print, for each PSL line of ``args.input``, its total block size."""
    for raw in args.input:
        entry = PSL(raw)
        print(sum(entry.value('blockSizes')))