def main():
  """Re-emit PSL alignments after attaching query and reference sequences.

  Command-line arguments (parsed with argparse):
    input      PSL file path, or '-' to read PSL lines from standard input.
    reference  FASTA file loaded in full with read_fasta_into_hash().
    query      FASTA file streamed entry by entry with FastaHandleReader.
    --minimum_intron_size  integer, default 68; parsed but not used here.

  For every PSL line that passes validate(), the query entry whose name
  equals the line's qName is located, the query sequence and the reference
  dictionary are attached to the PSL object, get_line() is printed and
  pretty_print(50) is called.

  The query FASTA is only ever read forward, so the PSL lines must reference
  queries in the same order as they appear in the query FASTA.

  Exits with status 1, after a message on stderr, when the query FASTA is
  empty or is exhausted before a PSL line's query name is found.
  """
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STDIN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  try:
    last_fasta = fhr.read_entry()
    if not last_fasta:
      sys.stderr.write("ERROR: No query sequences\n")
      sys.exit(1)
    for line in inf:
      p = PSLBasics.PSL(line)
      if not p.validate():
        sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
        # Actually skip the entry, as the warning promises.
        continue
      n = p.value('qName')
      # Advance through the query FASTA until the entry for this alignment is
      # reached. Assumes read_entry() returns a falsy value at end of file
      # (the emptiness check above relies on the same convention), so test
      # the entry before subscripting it.
      while last_fasta and last_fasta['name'] != n:
        last_fasta = fhr.read_entry()
      if not last_fasta:
        sys.stderr.write("ERROR: Ran out of query sequences too soon.  Are they sorted properly\n")
        sys.exit(1)
      p.set_query(last_fasta['seq'])
      p.set_reference_dictionary(g)
      # Single-argument print(...) behaves the same on Python 2 and Python 3.
      print(p.get_line())
      p.pretty_print(50)
  finally:
    # Runs on normal completion and on the sys.exit() paths above.
    fhr.close()
    # Only close a handle this function opened; leave sys.stdin alone.
    if inf is not sys.stdin:
      inf.close()
Example #2
0
def main():
    """Re-emit PSL alignments after attaching query and reference sequences.

    Command-line arguments (parsed with argparse):
      input      PSL file path, or '-' to read PSL lines from standard input.
      reference  FASTA file loaded in full with read_fasta_into_hash().
      query      FASTA file streamed entry by entry with FastaHandleReader.
      --minimum_intron_size  integer, default 68; parsed but not used here.

    For every PSL line that passes validate(), the query entry whose name
    equals the line's qName is located, the query sequence and the reference
    dictionary are attached to the PSL object, get_line() is printed and
    pretty_print(50) is called.

    The query FASTA is only ever read forward, so the PSL lines must
    reference queries in the same order as they appear in the query FASTA.

    Exits with status 1, after a message on stderr, when the query FASTA is
    empty or is exhausted before a PSL line's query name is found.
    """
    parser = argparse.ArgumentParser(
        description="Correct the matches/mismatches and Ncount of a PSL file")
    parser.add_argument('input', help="PSLFILE or - for STDIN")
    parser.add_argument('reference', help="FASTAFILE reference genome")
    parser.add_argument('query', help="FASTAFILE query sequences")
    parser.add_argument('--minimum_intron_size',
                        type=int,
                        default=68,
                        help="INT")
    args = parser.parse_args()
    # Read in the reference genome
    sys.stderr.write("Reading in reference genome\n")
    g = read_fasta_into_hash(args.reference)
    sys.stderr.write("Finished reading " + str(len(g.keys())) +
                     " reference sequences\n")
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    fhr = FastaHandleReader(open(args.query))
    try:
        last_fasta = fhr.read_entry()
        if not last_fasta:
            sys.stderr.write("ERROR: No query sequences\n")
            sys.exit(1)
        for line in inf:
            p = PSLBasics.PSL(line)
            if not p.validate():
                sys.stderr.write(
                    "WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"
                    + line.rstrip() + "\n")
                # Actually skip the entry, as the warning promises.
                continue
            n = p.value('qName')
            # Advance through the query FASTA until the entry for this
            # alignment is reached. Assumes read_entry() returns a falsy
            # value at end of file (the emptiness check above relies on the
            # same convention), so test the entry before subscripting it.
            while last_fasta and last_fasta['name'] != n:
                last_fasta = fhr.read_entry()
            if not last_fasta:
                sys.stderr.write(
                    "ERROR: Ran out of query sequences too soon.  Are they sorted properly\n"
                )
                sys.exit(1)
            p.set_query(last_fasta['seq'])
            p.set_reference_dictionary(g)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(p.get_line())
            p.pretty_print(50)
    finally:
        # Runs on normal completion and on the sys.exit() paths above.
        fhr.close()
        # Only close a handle this function opened; leave sys.stdin alone.
        if inf is not sys.stdin:
            inf.close()
def main():
  """Print each PSL alignment after running correct_stats() on it.

  Command-line arguments (parsed with argparse):
    input      PSL file path, or '-' to read PSL lines from standard input.
    reference  FASTA file loaded in full with read_fasta_into_hash().
    query      FASTA file streamed entry by entry with FastaHandleReader.
    --minimum_intron_size  integer, default 68; parsed but not used here.

  For every PSL line that passes validate(), the query entry whose name
  equals the line's qName is located, the query sequence and the reference
  dictionary are attached to the PSL object, correct_stats() is called and
  get_line() is printed.

  The query FASTA is only ever read forward, so the PSL lines must reference
  queries in the same order as they appear in the query FASTA.

  Exits with status 1, after a message on stderr, when the query FASTA is
  empty or is exhausted before a PSL line's query name is found.
  """
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STDIN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  try:
    last_fasta = fhr.read_entry()
    if not last_fasta:
      sys.stderr.write("ERROR: No query sequences\n")
      sys.exit(1)
    for line in inf:
      p = PSLBasics.PSL(line)
      if not p.validate():
        sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
        # Actually skip the entry, as the warning promises.
        continue
      n = p.value('qName')
      # Advance through the query FASTA until the entry for this alignment is
      # reached. Assumes read_entry() returns a falsy value at end of file
      # (the emptiness check above relies on the same convention), so test
      # the entry before subscripting it.
      while last_fasta and last_fasta['name'] != n:
        last_fasta = fhr.read_entry()
      if not last_fasta:
        sys.stderr.write("ERROR: Ran out of query sequences too soon.  Are they sorted properly\n")
        sys.exit(1)
      p.set_query(last_fasta['seq'])
      p.set_reference_dictionary(g)
      # NOTE(review): correct_stats() presumably recomputes the match,
      # mismatch, N and gap counts on the PSL object -- confirm in PSLBasics.
      # A hand-rolled per-block recount that previously followed an
      # unconditional `continue` was unreachable and has been dropped.
      p.correct_stats()
      # Single-argument print(...) behaves the same on Python 2 and Python 3.
      print(p.get_line())
  finally:
    # Runs on normal completion and on the sys.exit() paths above.
    fhr.close()
    # Only close a handle this function opened; leave sys.stdin alone.
    if inf is not sys.stdin:
      inf.close()