def main():
    """Estimate the insert-size (mapping distance) distribution of paired-end
    reads from a query-ordered SAM alignment to a transcriptome.

    Reads pairs via SamBasics.MultiEntrySamReader, converts each mate to PSL,
    and periodically reports running count/mean/stddev to stderr.
    """
    parser = argparse.ArgumentParser(
        description=
        "Find mapping distance of paired end reads.  Takes an ordered (by query) alignment to a transcriptome.\nSomething that works for an input thus far is like:\nhisat --reorder -x mytranscriptome -1 my_1.fastq -2 my_2.fastq | this_script.py -"
    )
    parser.add_argument(
        'input_sam',
        help="SAMFILE ordered alignment a transcriptome or - for stdin")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input_sam != '-':
        inf = open(args.input_sam)
    msr = SamBasics.MultiEntrySamReader(inf)
    spcf = SamBasics.SAMtoPSLconversionFactory()
    data = []
    sys.stderr.write("Pairs    Mean    Stddev\n")
    while True:
        entries = msr.read_entries()
        if not entries: break
        # Only consider proper pairs: exactly two mates, both mapped.
        if len(entries) != 2: continue
        [e1, e2] = entries
        if e1.check_flag(4) or e2.check_flag(4): continue  # either mate unmapped
        # BUGFIX: original condition was `not e1.check_flag(2) and e2.check_flag(2)`,
        # which by precedence is `(not e1...) and (e2...)` and only skipped when
        # e1 was improperly paired while e2 was properly paired.  Require BOTH
        # mates to carry the properly-paired flag (0x2).
        if not (e1.check_flag(2) and e2.check_flag(2)): continue
        # Require one first-in-pair (0x40) and one second-in-pair (0x80) mate.
        if not ((e1.check_flag(64) and e2.check_flag(128)) or
                (e1.check_flag(128) and e2.check_flag(64))):
            continue
        p1 = spcf.convert_line(e1.get_line())
        p2 = spcf.convert_line(e2.get_line())
        if not p1 or not p2: continue
        p1 = PSLBasics.PSL(p1)
        p2 = PSLBasics.PSL(p2)
        # Outer distance spanned by the pair on the target sequence.
        dist = max(
            p2.value('tEnd') - p1.value('tStart'),
            p1.value('tEnd') - p2.value('tStart'))
        data.append(dist)
        if len(data) < 2: continue  # stddev needs at least two samples
        if len(data) % 1000 == 0:
            sys.stderr.write(
                str(len(data)) + "    " + str(int(mean(data))) + "    " +
                str(int(stddev(data))) + "              \r")
    # BUGFIX: final summary guarded so stddev() is not called with < 2 samples.
    if len(data) >= 2:
        sys.stderr.write(
            str(len(data)) + "    " + str(int(mean(data))) + "    " +
            str(int(stddev(data))) + "              \r")
    sys.stderr.write("\n")
Example #2
0
def main():
    parser = argparse.ArgumentParser(
        description="Correct the matches/mismatches and Ncount of a PSL file")
    parser.add_argument('input', help="PSLFILE or - for STIDN")
    parser.add_argument('reference', help="FASTAFILE reference genome")
    parser.add_argument('query', help="FASTAFILE query sequences")
    parser.add_argument('--minimum_intron_size',
                        type=int,
                        default=68,
                        help="INT")
    #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
    args = parser.parse_args()
    # Read in the reference genome
    sys.stderr.write("Reading in reference genome\n")
    g = read_fasta_into_hash(args.reference)
    sys.stderr.write("Finished reading " + str(len(g.keys())) +
                     " reference sequences\n")
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    fhr = FastaHandleReader(open(args.query))
    last_fasta = fhr.read_entry()
    if not last_fasta:
        sys.stderr.write("ERROR: No query sequences\n")
        sys.exit()
    for line in inf:
        p = PSLBasics.PSL(line)
        if not p.validate():
            sys.stderr.write(
                "WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"
                + line.rstrip() + "\n")
        n = p.value('qName')
        if not last_fasta:
            sys.stderr.write(
                "ERROR: Ran out of query sequences too soon.  Are they sorted properly\n"
            )
            sys.exit()
        while last_fasta['name'] != n:
            last_fasta = fhr.read_entry()
        p.set_query(last_fasta['seq'])
        p.set_reference_dictionary(g)
        print p.get_line()
        p.pretty_print(50)
    fhr.close()
def main():
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STIDN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
  #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  last_fasta = fhr.read_entry()
  if not last_fasta:
    sys.stderr.write("ERROR: No query sequences\n")
    sys.exit()
  for line in inf:
    p = PSLBasics.PSL(line)
    if not p.validate():
      sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
    n = p.value('qName')
    if not last_fasta:
      sys.stderr.write("ERROR: Ran out of query sequences too soon.  Are they sorted properly\n")
      sys.exit()
    while last_fasta['name'] != n:
      last_fasta = fhr.read_entry()
    p.set_query(last_fasta['seq'])
    p.set_reference_dictionary(g)
    p.correct_stats()
    print p.get_line()
    continue
    f = last_fasta
    nCount = 0
    matches = 0
    misMatches = 0
    prev_qE = 0
    prev_tE = 0
    qNumInsert = 0
    qBaseInsert = 0
    tNumInsert = 0
    tBaseInsert = 0
    for i in range(p.value('blockCount')):
      blen = p.value('blockSizes')[i]
      qS = p.value('qStarts')[i] #query start
      qE = qS + blen             #query end
      tS = p.value('tStarts')[i] #target start
      tE = tS + blen             #target end
      #Work on gaps
      if prev_qE > 0 or prev_tE > 0: #if its not our first time through
        tgap = tS-prev_tE
        if tgap < args.minimum_intron_size and tgap > 0:
          tNumInsert += 1
          tBaseInsert += tgap
        qgap = qS-prev_qE
        if qgap > 0:
          qNumInsert += 1
          qBaseInsert += qgap
      query = f['seq']
      if p.value('strand') == '-':
        query = rc(f['seq'])
      qseq = query[qS:qE].upper()
      rseq = g[p.value('tName')][tS:tE].upper()
      #print qseq+"\n"+rseq+"\n"
      for j in range(0,blen):
        if qseq[j] == 'N':
          nCount += 1
        elif qseq[j] == rseq[j]:
          matches += 1
        else:
          misMatches += 1
      prev_qE = qE
      prev_tE = tE
    p.entry['matches'] = matches
    p.entry['misMatches'] = misMatches
    p.entry['nCount'] = nCount
    p.entry['qNumInsert'] = qNumInsert
    p.entry['qBaseInsert'] = qBaseInsert
    p.entry['tNumInsert'] = tNumInsert
    p.entry['tBaseInsert'] = tBaseInsert
    p.entry['qSize'] = len(query)
    p.entry['tSize'] = len(g[p.value('tName')]) 
    print p.get_line()
    #p.pretty_print(100)
  fhr.close()
def main():
    """Analyze query-ordered PSL alignments of long reads.

    Groups consecutive lines sharing a qName into a MultiplePSLAlignments
    batch, processes each batch (optionally in a multiprocessing pool), and
    emits results through print_result() to the global output handle `of`.
    """
    parser = argparse.ArgumentParser(
        description="Analyze ORDERED psl alignments of long reads.")
    parser.add_argument(
        'psl_file',
        help="Alignment file. Must be ordered by query name. use - for stdin")
    # BUGFIX: this is an *output* option; help said "default is STDIN".
    parser.add_argument('-o',
                        '--output',
                        help="Write to output file, default is STDOUT")
    parser.add_argument('--noheader', action='store_true')
    parser.add_argument(
        '--minimum_coverage',
        type=int,
        help="Only consider alignments with at least this many bp aligned")
    parser.add_argument('--threads',
                        type=int,
                        default=multiprocessing.cpu_count(),
                        help="INT default cpu_count")
    parser.add_argument(
        '--tempbuffer',
        help=
        "DIRECTORY store the results in a temporary file until they are ready to output.  suggest using /tmp if you don't know what to use"
    )
    args = parser.parse_args()
    seen_names = set()  # every qName already finished; detects unsorted input
    last_name = ''
    # Renamed from `buffer` to avoid shadowing the builtin.
    alignments = PSLBasics.MultiplePSLAlignments()
    inf = sys.stdin
    if args.psl_file != '-':
        inf = open(args.psl_file)
    global of
    # BUGFIX: `of` was left unassigned when neither --output nor --tempbuffer
    # was given, making the later of.close() raise NameError.  Default to
    # stdout and only close handles we actually opened.
    of = sys.stdout
    tname = None
    if args.tempbuffer:
        if not args.output:
            sys.stderr.write(
                "ERROR if you want to buffer outputs in a temp file you need to specify a final output file.\n"
            )
            sys.exit()
        rnum = random.randint(1, 1000000000)
        tname = args.tempbuffer.rstrip('/') + '/weirathe.' + str(
            rnum) + '.meta'
        of = open(tname, 'w')
    if args.output and not args.tempbuffer:
        of = open(args.output, 'w')
    global lock
    if args.threads > 1:
        pool = multiprocessing.Pool(args.threads)
    for line in inf:
        e = PSLBasics.line_to_entry(line.rstrip())
        if e['qName'] != last_name:  # we have a new name
            if e['qName'] in seen_names:
                sys.stderr.write(
                    "ERROR psl entries are not ordered by query name.\n")
                sys.exit()
            seen_names.add(e['qName'])
            if alignments.get_alignment_count() > 0:
                # Flush the previous query's batch.
                if args.threads > 1:
                    pool.apply_async(process_buffer, [alignments],
                                     callback=print_result)
                else:
                    print_result(process_buffer(alignments))
            alignments = PSLBasics.MultiplePSLAlignments()
            # Explicit None-check: comparing None > 1 only "worked" under
            # Python 2's arbitrary ordering rules.
            if args.minimum_coverage and args.minimum_coverage > 1:
                alignments.set_minimum_coverage(args.minimum_coverage)
        last_name = e['qName']
        alignments.add_entry(PSLBasics.PSL(line.rstrip()))
    inf.close()
    if alignments.get_alignment_count() > 0:  # flush the final batch
        if args.threads > 1:
            pool.apply_async(process_buffer, [alignments],
                             callback=print_result)
        else:
            print_result(process_buffer(alignments))
    if args.threads > 1:
        pool.close()
        pool.join()
    if of is not sys.stdout:
        of.close()
    if args.tempbuffer:
        # Move buffered results into their final destination.
        of = open(args.output, 'w')
        with open(tname) as inf:
            for line in inf:
                of.write(line)
        of.close()
        os.remove(tname)