Example #1
def parse_pslfile(tdir,pslfile,smoothing_factor):
  # Go through the long reads and make a genepred
  if pslfile != '-':
    fr = FileBasics.GenericFileReader(pslfile)
  else:
    fr = sys.stdin
  seennames = {}
  longreadnumber = 0
  of_gpd = open(tdir+'/longreads.gpd','w')
  while True:
    line = fr.readline()
    if not line: break
    if re.match('^#',line): #skip comments
      continue
    longreadnumber += 1
    gpd_line = PSLBasics.convert_entry_to_genepred_line(PSLBasics.line_to_entry(line.rstrip()))
    if not gpd_line:
      sys.stderr.write("Warning: malformed psl for "+readname+"\n")
      continue
    entry = GenePredBasics.smooth_gaps( \
              GenePredBasics.line_to_entry(gpd_line),smoothing_factor)
    readname = entry['name']
    if readname in seennames:
      sys.stderr.write("Warning: repeat name '"+readname+"'\n")
    #replace the name with the read-number bin so output names are unique
    entry['name'] = str(longreadnumber)
    gline = GenePredBasics.entry_to_line(entry)
    of_gpd.write(gline+"\n")
  fr.close()
  of_gpd.close()
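A minimal sketch of driving parse_pslfile, assuming the FileBasics/PSLBasics/GenePredBasics imports used above and an existing output directory; the file and directory names are illustrative only.

import sys
tdir = '/tmp/workdir'                     # hypothetical scratch directory
parse_pslfile(tdir, 'longreads.psl', 68)  # smoothing_factor; 68bp matches the min intron size used elsewhere in this file
sys.stderr.write("wrote "+tdir+"/longreads.gpd\n")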
def main():
  parser = argparse.ArgumentParser(description='Use reference junctions when they are close',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT min intron size")
  parser.add_argument('--min_local_support',type=int,default=0,help="INT min number of junctions within search_size of a junction in order to count it")
  parser.add_argument('--search_size',type=int,default=10,help="INT search space for reference")
  parser.add_argument('--output_fake_psl',help="FASTAFILE reference genome to make a fake PSL output")
  parser.add_argument('psl',help="PSLFILENAME or '-' for STDIN")
  parser.add_argument('reference_genepred',help="FASTAFILENAME for reference genepred")
  args = parser.parse_args()

  cpus = multiprocessing.cpu_count()

  genome = {}
  if args.output_fake_psl:
    genome = read_fasta_into_hash(args.output_fake_psl)

  #read in the reference genepred first
  gpf = GenePredBasics.GenePredFile(args.reference_genepred)
  #index reference junctions by chromosome
  ref = {}
  for e in [x.entry for x in gpf.entries]:
    if len(e['exonStarts']) <= 1: continue
    if e['chrom'] not in ref:
      ref[e['chrom']] = {}
    for i in range(1,len(e['exonStarts'])):
      if e['exonEnds'][i-1] not in ref[e['chrom']]:
        ref[e['chrom']][e['exonEnds'][i-1]] = {}
      if e['exonStarts'][i]+1 not in ref[e['chrom']][e['exonEnds'][i-1]]:
        ref[e['chrom']][e['exonEnds'][i-1]][e['exonStarts'][i]+1] = e['strand']
  #All junctions are stored 1-based: ref[chrom][exon_end][next_exon_start] = strand

  read_info = {}
  pf = GenericFileReader(args.psl)
  fcount_total = 0
  while True:
    line = pf.readline()
    if not line: break
    if re.match('^#',line): continue
    line = line.rstrip()
    pe = PSLBasics.line_to_entry(line)
    if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
      sys.stderr.write("WARNING invalid psl\n")
      continue
    genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
    ge = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(genepred_line),args.min_intron_size)
    refjuns = {}
    if pe['tName'] in ref: refjuns = ref[pe['tName']]
    new_ge = nudge(pe,ge,refjuns,args)
    if args.output_fake_psl:
      new_psl_line = GenePredBasics.entry_to_fake_psl_line(new_ge,genome)
      print new_psl_line
    else:
      print GenePredBasics.entry_to_line(new_ge)
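nudge() is defined elsewhere in this codebase; from its call site above, a hypothetical stand-in showing the expected contract (not the real implementation):

def nudge_stub(pe, ge, refjuns, args):
  # Given a psl entry, its gap-smoothed genepred entry, and the reference
  # junctions for that chromosome, return a genepred entry whose junctions
  # have been shifted to reference junctions within args.search_size.
  return ge  # placeholder only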
def get_exons_from_seqs(seqs,d,spcf):
  sind = 0
  oline = ''
  for seq in seqs:
    sind+=1
    psec = 'P' #primary or secondary
    if sind > 1: psec = 'S'
    d1 = d.copy()
    d1['rname'] = seq[1]
    if seq[2] == '+':  d1['flag'] = 0
    else: d1['flag'] = 16
    d1['pos'] = seq[3]
    d1['cigar'] = seq[4]
    d1['cigar_array'] = SamBasics.parse_cigar(seq[4])
    skips = set(['H','D','N'])
    total_length = 0
    possible_matches = 0
    indels = 0
    qstart = 0
    if d1['cigar_array'][0]['op'] in ('S','H'): # leading soft or hard clip
      qstart = d1['cigar_array'][0]['val']
    for ce in d1['cigar_array']:
      if ce['op'] not in skips:
        total_length += ce['val']
      if ce['op'] == 'M': possible_matches += ce['val']
      elif ce['op'] == 'I':
        indels += ce['val']
      elif ce['op'] == 'D' and ce['val'] < 68: # deletions shorter than the minimum intron size count as indels
        indels += ce['val']
    fakeseq = 'N'*total_length
    d1['seq'] = fakeseq
    nline = SamBasics.entry_to_line(d1)
    pline = spcf.convert_line(nline)
    pentry = PSLBasics.line_to_entry(pline)
    #mismatch_count = -1
    #if sind == 1 and args.reference_genome: #for primary alignments we can calculate the number of matches
    #  for i in range(0,len(pentry['blockSizes'])):
    #    tseq = spcf.genome[pentry['tName']][pentry['tStarts'][i]:pentry['tStarts'][i]+pentry['blockSizes'][i]]
    #    qseq = sequence[pentry['qStarts'][i]:pentry['qStarts'][i]+pentry['blockSizes'][i]]
    #    print pentry['blockSizes'][i]
    #    print tseq
    #    print qseq
    #    for j in range(0,len(tseq)):
    #      if tseq[j].upper() != qseq[j].upper(): mismatch_count += 1
    gline = PSLBasics.convert_entry_to_genepred_line(pentry)
    gentry = GenePredBasics.line_to_entry(gline)
    gsmooth = GenePredBasics.smooth_gaps(gentry,68) # 68bp minimum intron size
    for i in range(0,len(gsmooth['exonStarts'])):
      oline += "\t".join([gsmooth['chrom'], str(gsmooth['exonStarts'][i]),
                 str(gsmooth['exonEnds'][i]), gsmooth['strand'],
                 gsmooth['name'], str(possible_matches), str(indels),
                 psec, str(qstart)])+"\n"
  return oline
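get_exons_from_seqs returns one tab-delimited row per smoothed exon. A sketch of consuming a row, with field names inferred from the columns written above:

def parse_exon_row(row):
  (chrom, start, end, strand, name,
   matches, indels, psec, qstart) = row.rstrip("\n").split("\t")
  return {'chrom': chrom, 'start': int(start), 'end': int(end),
          'strand': strand, 'name': name,
          'possible_matches': int(matches), 'indels': int(indels),
          'is_primary': psec == 'P', 'qstart': int(qstart)}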
def process_buffer(mpa):
  mpa.populate_query()
  #separate entries by locus
  loci = RangeBasics.Loci()
  loci.set_use_direction(True)
  minimum_locus_distance = 400000
  for entry in mpa.entries:
    rng = RangeBasics.Bed(entry.value('tName'),entry.value('tStart')-minimum_locus_distance,entry.value('tEnd')+minimum_locus_distance,entry.value('strand'))
    rng.set_payload(entry)
    locus = RangeBasics.Locus()
    locus.set_use_direction(True)
    locus.add_member(rng)
    loci.add_locus(locus)
  loci.update_loci()
  outputs = []
  for locus in loci.loci:
    mpsl = PSLBasics.MultiplePSLAlignments()
    mpsl.set_minimum_coverage(20)
    for member in locus.members:
      mpsl.add_entry(member.get_payload())
    mpsl.populate_query()
    bac = mpsl.best_query()
    #if len(bac.segments) > 2:
    segtrimmed = bac.get_trimmed_entries()
    stitched = PSLBasics.stitch_query_trimmed_psl_entries(segtrimmed)
    #bac.print_report()
    #for segpsl in bac.segment_trimmed_entries:
    #  print str(segpsl.value('qStart'))+"\t"+str(segpsl.value('qEnd'))+"\t"+str(segpsl.value('tStart'))+"\t"+str(segpsl.value('tEnd'))
    ##print stitched.get_line()
    if not stitched.validate():
      sys.exit("ERROR: stitched PSL entry failed validation")
    outputs.append(stitched.get_line())
  return outputs
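For context, a serial sketch of how process_buffer gets fed; the reader class is the one used in the single-best-alignment main() further down, and the mains below dispatch the same call through a multiprocessing pool. Assumes PSLBasics is imported as in the functions above.

import sys
gr = PSLBasics.GenericOrderedMultipleAlignmentPSLReader()
gr.set_handle(sys.stdin)  # PSL stream ordered by query name
while True:
  mpa = gr.read_next()
  if not mpa: break
  for out_line in process_buffer(mpa):
    print out_line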
Example #6
def do_locus_callback(cbr):
  if not cbr: return
  [r,tot,cnt] = cbr
  global combo_results
  for e in r:
    combo_results.append(PSLBasics.entry_to_line(e)+"\n")
  sys.stderr.write(str(cnt)+"/"+str(tot)+"  \r")
  return
def main():
  parser = argparse.ArgumentParser(description="Convert a psl file into a target formated genepred file.")
  parser.add_argument('--fill_gaps',type=int,default=0,help="Close gaps this size or smaller.")
  parser.add_argument('input_name',help="Input PSL file, use - to indicate STDIN.")
  args = parser.parse_args()
  
  pslfilehandle = sys.stdin
  if args.input_name != '-':
    pslfilehandle = open(args.input_name)
  with pslfilehandle as infile:
    for line in infile:
      psl_entry = PSLBasics.line_to_entry(line)
      genepred_line = PSLBasics.convert_entry_to_genepred_line(psl_entry)
      if args.fill_gaps > 0:
        genepred_entry = GenePredBasics.line_to_entry(genepred_line)
        genepred_entry2 = GenePredBasics.smooth_gaps(genepred_entry,args.fill_gaps)
        genepred_line = GenePredBasics.entry_to_line(genepred_entry2)
      print genepred_line
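A typical invocation of this converter (script name hypothetical):
  python psl_to_genepred.py --fill_gaps 68 alignments.psl > alignments.gpd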
def main():
    parser = argparse.ArgumentParser(
        description=
        "Find mapping distance of paired end reads.  Takes an ordered (by query) alignment to a transcriptome.\nSomething that works for an input thus far is like:\nhisat --reorder -x mytranscriptome -1 my_1.fastq -2 my_2.fastq | this_script.py -"
    )
    parser.add_argument(
        'input_sam',
        help="SAMFILE ordered alignment a transcriptome or - for stdin")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input_sam != '-':
        inf = open(args.input_sam)
    msr = SamBasics.MultiEntrySamReader(inf)
    spcf = SamBasics.SAMtoPSLconversionFactory()
    data = []
    sys.stderr.write("Pairs    Mean    Stddev\n")
    while True:
        entries = msr.read_entries()
        if not entries: break
        if len(entries) != 2: continue
        [e1, e2] = entries
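        # SAM flag bits: 4 = unmapped, 2 = properly paired,
        # 64 = first in pair, 128 = second in pair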
        if e1.check_flag(4) or e2.check_flag(4): continue
        if not (e1.check_flag(2) and e2.check_flag(2)): continue  # require both mates flagged properly paired
        if not ((e1.check_flag(64) and e2.check_flag(128)) or
                (e1.check_flag(128) and e2.check_flag(64))):
            continue
        p1 = spcf.convert_line(e1.get_line())
        p2 = spcf.convert_line(e2.get_line())
        if not p1 or not p2: continue
        p1 = PSLBasics.PSL(p1)
        p2 = PSLBasics.PSL(p2)
        dist = max(
            p2.value('tEnd') - p1.value('tStart'),
            p1.value('tEnd') - p2.value('tStart'))
        data.append(dist)
        if len(data) < 2: continue
        if len(data) % 1000 == 0:
            sys.stderr.write(
                str(len(data)) + "    " + str(int(mean(data))) + "    " +
                str(int(stddev(data))) + "              \r")
    sys.stderr.write(
        str(len(data)) + "    " + str(int(mean(data))) + "    " +
        str(int(stddev(data))) + "              \r")
    sys.stderr.write("\n")
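The mean and stddev helpers are assumed to be imported from elsewhere in this codebase; minimal stand-ins consistent with how they are called above (sample standard deviation, which is why the loop waits for len(data) >= 2):

import math
def mean(data):
    return float(sum(data)) / len(data)
def stddev(data):
    m = mean(data)
    return math.sqrt(sum((x - m) ** 2 for x in data) / (len(data) - 1))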
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('psl_file',help="use - for STDIN")
  parser.add_argument('--filter',action='store_true',help="Only output passing lines")
  args = parser.parse_args()
  inf = sys.stdin
  if args.psl_file != '-':
    inf = open(args.psl_file)
  z = 0
  for line in inf:
    z += 1
    if args.filter:
      if PSLBasics.is_valid(line): print line.rstrip()
      continue
    if not PSLBasics.is_valid(line):
      print "bad line "+str(z)
      print line
      return
  if not args.filter:
    print "PSL file looks good"
  inf.close()
Example #12
  def set_mapping_counts(self,psl_filename):
    self.mapping_counts_set = True
    gfr0 = GenericFileReader(psl_filename)
    qcnts = {}
    while True:
      line = gfr0.readline()
      if not line: break
      try:
        psle = PSLBasics.line_to_entry(line.rstrip())
      except Exception:  # don't swallow KeyboardInterrupt/SystemExit
        sys.stderr.write("Problem parsing line:\n"+line.rstrip()+"\n")
        continue
      if psle['qName'] not in qcnts: qcnts[psle['qName']] = 0
      qcnts[psle['qName']] += 1
    gfr0.close()
    self.mapping_counts = qcnts
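set_mapping_counts simply counts PSL lines per qName; convert_line further down consumes the result as the SAM NH:i (number of reported alignments) tag.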
Example #14
def main():
    parser = argparse.ArgumentParser(
        description="Correct the matches/mismatches and Ncount of a PSL file")
    parser.add_argument('input', help="PSLFILE or - for STIDN")
    parser.add_argument('reference', help="FASTAFILE reference genome")
    parser.add_argument('query', help="FASTAFILE query sequences")
    parser.add_argument('--minimum_intron_size',
                        type=int,
                        default=68,
                        help="INT")
    #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
    args = parser.parse_args()
    # Read in the reference genome
    sys.stderr.write("Reading in reference genome\n")
    g = read_fasta_into_hash(args.reference)
    sys.stderr.write("Finished reading " + str(len(g.keys())) +
                     " reference sequences\n")
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    fhr = FastaHandleReader(open(args.query))
    last_fasta = fhr.read_entry()
    if not last_fasta:
        sys.stderr.write("ERROR: No query sequences\n")
        sys.exit()
    for line in inf:
        p = PSLBasics.PSL(line)
        if not p.validate():
            sys.stderr.write(
                "WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts... and gap counts... it doesn't perform miracles. Problem line:\n"
                + line.rstrip() + "\n")
            continue
        n = p.value('qName')
        if not last_fasta:
            sys.stderr.write(
                "ERROR: Ran out of query sequences too soon.  Are they sorted properly\n"
            )
            sys.exit()
        while last_fasta['name'] != n:
            last_fasta = fhr.read_entry()
            if not last_fasta:
                sys.exit("ERROR: query sequence " + n +
                         " not found; are the inputs sorted the same way?")
        p.set_query(last_fasta['seq'])
        p.set_reference_dictionary(g)
        p.correct_stats()  # recompute matches/misMatches/nCount before output
        print p.get_line()
        #p.pretty_print(50)
    fhr.close()
Example #15
def main():
  parser = argparse.ArgumentParser(description="Take a PSL file and output only one alignment per query.  This doesn't do any magic to combine entries, it only seeks the best quality alignment (by most aligned bases).")
  parser.add_argument('input',help="PSLFILE or - for STDIN")
  parser.add_argument('-o','--output',help="OUTPUTFILE or STDOUT if not set.")
  parser.add_argument('--already_ordered',action='store_true',help="The PSL file has already been ordered according to query.  This speeds reading.")
  parser.add_argument('-T','--tempdir',help="DIRECTORY where we can write temporary files.")
  parser.add_argument('-S','--maxtempsize',help="Maximum size for the temporary-file chunks in a sort; passed straight through to linux sort -S, so the default units are kb.")
  args = parser.parse_args()
  of = sys.stdout
  if args.output:
    of = open(args.output,'w')
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  # inf2 is our sorted file (by query)
  inf2 = inf
  if not args.already_ordered:
    cmd = "sort -k 10,10"
    if args.tempdir:
      cmd += " -T "+args.tempdir.rstrip('/')
    if args.maxtempsize:
      cmd += " -S "+args.maxtempsize
    p1 = subprocess.Popen(cmd,shell=True,stdin=inf,stdout=subprocess.PIPE)    
    inf2 = p1.stdout
  gr = PSLBasics.GenericOrderedMultipleAlignmentPSLReader()
  gr.set_handle(inf2)
  while True:
    mpa = gr.read_next()
    if not mpa: break
    maxcov = 0
    bestpsl = None
    for psl in mpa.entries:
      cov = psl.get_coverage()
      if cov >= maxcov:
        bestpsl = psl
        maxcov = cov
    of.write(bestpsl.get_line()+"\n")
  of.close()
  if not args.already_ordered:
    p1.communicate() # make sure this is all done
  inf.close()
Example #16
def do_combine_operation(best_option, left, right, read, seq, args):
    #print "choice is "+str(best_option)
    left_target = best_option[0]
    right_target = best_option[1]
    left_query = best_option[2]
    right_query = best_option[3]
    # store for output
    q_start_array = []
    t_start_array = []
    block_size_array = []

    left_query_start = left['qStarts'][0]
    left_target_start = left['tStarts'][0]
    for i in range(0, len(left['tStarts'])):
        tstart = left['tStarts'][i]
        tend = left['tStarts'][i] + left['blockSizes'][i]
        qstart = left['qStarts'][i]
        qend = left['qStarts'][i] + left['blockSizes'][i]
        if left_query <= qstart + 1: break
        left_query_start = qstart
        left_target_start = tstart
        if left_query <= qend: break
        q_start_array.append(qstart)
        t_start_array.append(tstart)
        block_size_array.append(left['blockSizes'][i])

    #print "left things"
    #print [left_query_start+1,left_query]
    #print [left_target_start+1,left_target]

    right_query_end = right['qStarts'][0] + right['blockSizes'][0]
    right_target_end = right['tStarts'][0] + right['blockSizes'][0]
    right_outer_index = 0
    for j in range(0, len(right['tStarts'])):
        tstart = right['tStarts'][j]
        tend = right['tStarts'][j] + right['blockSizes'][j]
        qstart = right['qStarts'][j]
        qend = right['qStarts'][j] + right['blockSizes'][j]
        right_outer_index = j + 1
        if right_query <= qstart + 1: break
        right_query_end = qend
        right_target_end = tend
        if right_query < qend: break
    #print "right things"
    #print [right_query+1,right_query_end]
    #print [right_target+1,right_target_end]
    working_read = read.upper()
    if left['strand'] == '-': working_read = rc(read.upper())
    pread = working_read[left_query_start:right_query_end]
    tseq = (seq[left_target_start:left_target].upper() +
            seq[right_target - 1:right_target_end].upper())
    res = needleman_wunsch(pread, tseq)
    #print "short needleman wunsch"
    #print res[0]
    #print res[1]

    # Fun part of making the new portion of the alignment
    qindex = left_query_start
    tindex = left_target_start
    in_alignment = 0
    alignment = None
    bynumbers = None
    for i in range(0, len(res[0])):
        if res[0][i] == '-':  #insertion in target (gap in query)
            tindex += 1
            in_alignment = 0
        elif res[1][i] == '-':  #insertion in query (gap in target)
            qindex += 1
            in_alignment = 0
        else:  # we are in an alignment
            if in_alignment == 0:
                # output buffered result
                if alignment:
                    if len(alignment[0]) > 0:
                        q_start_array.append(bynumbers[0])
                        t_start_array.append(bynumbers[1])
                        block_size_array.append(len(alignment[0]))
                alignment = ['', '']
                bynumbers = [qindex, tindex, qindex, tindex]
            in_alignment = 1
            alignment[0] += res[0][i]
            alignment[1] += res[1][i]
            bynumbers[2] += 1
            bynumbers[3] += 1
            qindex += 1
            tindex += 1
        if qindex == right_query:  # switch forward
            #print "switch"
            #print str(tindex) + "\t" + str(right_target)
            #print str(qindex) + "\t" + str(right_query)
            if not tindex == right_target:
                in_alignment = 0
            tindex = right_target
    if alignment:
        if len(alignment[0]) > 0:
            q_start_array.append(bynumbers[0])
            t_start_array.append(bynumbers[1])
            block_size_array.append(len(alignment[0]))
        #print bynumbers

    for i in range(right_outer_index, len(right['blockSizes'])):
        q_start_array.append(right['qStarts'][i])
        t_start_array.append(right['tStarts'][i])
        block_size_array.append(right['blockSizes'][i])

    #now we can finally construct a psl line
    #we won't keep track of repeats for now
    matches = 0
    misMatches = 0
    repMatches = 0
    nCount = 0
    qNumInsert = 0
    qBaseInsert = 0
    tNumInsert = 0
    tBaseInsert = 0
    strand = left['strand']
    qName = left['qName']
    qSize = len(read)
    qStart = q_start_array[0]
    qEnd = q_start_array[-1] + block_size_array[-1]
    tName = left['tName']
    tSize = len(seq)
    tStart = t_start_array[0]
    tEnd = t_start_array[-1] + block_size_array[-1]
    blockCount = len(block_size_array)
    blockSizes = ','.join([str(x) for x in block_size_array]) + ','
    qStarts = ','.join([str(x) for x in q_start_array]) + ','
    tStarts = ','.join([str(x) for x in t_start_array]) + ','

    prev_q_end = None
    prev_t_end = None
    for i in range(0, len(block_size_array)):
        qseg = working_read[q_start_array[i]:q_start_array[i] +
                            block_size_array[i]]
        tseg = seq[t_start_array[i]:t_start_array[i] +
                   block_size_array[i]].upper()
        for j in range(0, len(qseg)):
            if qseg[j] == 'N': nCount += 1  # Ns are counted separately, not as mismatches
            elif qseg[j] == tseg[j]: matches += 1
            else:
                misMatches += 1
        if prev_t_end:
            t_dist = t_start_array[i] - prev_t_end
            if t_dist > 0 and t_dist < args.min_intron_size:  # a target gap too short to be an intron
                tNumInsert += 1
                tBaseInsert += t_dist
        if prev_q_end:
            q_dist = q_start_array[i] - prev_q_end
            if q_dist > 0:
                qNumInsert += 1
                qBaseInsert += q_dist
        prev_q_end = q_start_array[i] + block_size_array[i]
        prev_t_end = t_start_array[i] + block_size_array[i]

    # now we have everything to make the line
    combo_line = str(matches) + "\t" + str(misMatches) + "\t" + str(repMatches) + "\t" \
               + str(nCount) + "\t" + str(qNumInsert) + "\t" + str(qBaseInsert) + "\t" \
               + str(tNumInsert) + "\t" + str(tBaseInsert) + "\t" \
               + strand + "\t" + qName + "\t" + str(qSize) + "\t" \
               + str(qStart) + "\t" + str(qEnd) + "\t" \
               + tName + "\t" + str(tSize) + "\t" \
               + str(tStart) + "\t" + str(tEnd) + "\t" + str(blockCount) + "\t" \
               + blockSizes + "\t" + qStarts + "\t" + tStarts
    #print combo_line
    #print q_start_array
    #print t_start_array
    #print block_size_array
    #  print str(right['qStarts'][i])+"\t"+str(right['qStarts'][i]+right['blockSizes'][i])
    #  print i
    return PSLBasics.line_to_entry(combo_line)
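A cheap sanity check on the entry do_combine_operation returns: in a nucleotide PSL, matches + misMatches + repMatches + nCount should equal the sum of blockSizes. A sketch, assuming line_to_entry exposes the standard PSL fields (helper name hypothetical):

def check_psl_counts(entry):
    # Standard PSL invariant for nucleotide alignments.
    aligned = sum(entry['blockSizes'])
    counted = (entry['matches'] + entry['misMatches'] +
               entry['repMatches'] + entry['nCount'])
    return aligned == counted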
Example #17
  def convert_line(self,psl_line,query_sequence=None,quality_sequence=None):
    try:
      pe = PSLBasics.line_to_entry(psl_line)
    except Exception:
      sys.stderr.write("Problem parsing line:\n"+psl_line.rstrip()+"\n")
      return False
    if len(pe['tStarts']) != len(pe['blockSizes']):
      sys.stderr.write("Warning invalid psl entry: "+pe['qName']+"\n")
      return False
    #work on the positive strand case first
    cigar = '*'
    blocks = len(pe['blockSizes'])
    starts = pe['qStarts']
    #if pe['strand'] == '-':
    #  starts = [x for x in reversed(pe['qStarts_actual'])]
    #  print 'isrev'
    q_coord_start = starts[0]+1 # base-1 converted starting position
    q_coord_end = starts[blocks-1]+pe['blockSizes'][blocks-1] # base-1 position
    t_coord_start = pe['tStarts'][0]+1 # base-1 converted starting position
    t_coord_end = pe['tStarts'][blocks-1]+pe['blockSizes'][blocks-1] # base-1 position
    if pe['qName'] not in self.reads and self.reads_set is True:
      sys.stderr.write("Warning: qName "+pe['qName']+" was not found in reads\n")
    # we will clip the query sequence to begin and end from the aligned region
    #q_seq = ''
    #if self.reads_set:
    #  q_seq = self.reads[pe['qName']]

    # 1. Get the new query to output
    q_seq_trimmed = '*'
    if self.reads_set or query_sequence:
      q_seq_trimmed = query_sequence
      if not query_sequence: # get it from the archive we loaded if we didn't give it
        q_seq_trimmed = self.reads[pe['qName']]
      if pe['strand'] == '-':
        q_seq_trimmed = SequenceBasics.rc(q_seq_trimmed)
      q_seq_trimmed = q_seq_trimmed[q_coord_start-1:q_coord_end]

    qual_trimmed = '*'
    if self.qualities_set or quality_sequence:
      qual_trimmed = quality_sequence
      if not quality_sequence:
        qual_trimmed = self.qualities[pe['qName']]
      if pe['strand'] == '-':
        qual_trimmed = qual_trimmed[::-1]
      qual_trimmed = qual_trimmed[q_coord_start-1:q_coord_end]
    # 2. Get the cigar string to output
    prev_diff = t_coord_start-q_coord_start
    cigar = ''
    #for i in range(0,blocks):
    #  current_diff = pe['tStarts'][i]-starts[i]
    #  delta = current_diff - prev_diff
    #  #print delta
    #  if delta >= self.min_intron_size:
    #    cigar += str(abs(delta))+'N'
    #  elif delta > 0: # we have a
    #    cigar += str(abs(delta))+'D'
    #  elif delta < 0: # we have a
    #    cigar += str(abs(delta))+'I'
    #  cigar += str(pe['blockSizes'][i])+'M' # our matches
    #  #print current_diff
    #  prev_diff = current_diff
    qstarts = [x-pe['qStarts'][0] for x in pe['qStarts']]
    tstarts = [x-pe['tStarts'][0] for x in pe['tStarts']]
    query_index = 0
    target_index = 0
    junctions = []
    for i in range(0,blocks):
      qdif = qstarts[i] - query_index
      tdif = tstarts[i] - target_index
      if qdif > 0:  # we have to insert
        cigar += str(qdif) + 'I'
      if tdif > self.min_intron_size: # we have an intron
        cigar += str(tdif) + 'N'
        junctions.append(i)
      elif tdif > 0: # we have to delete
        cigar += str(tdif) + 'D'
      cigar += str(pe['blockSizes'][i]) + 'M'
      query_index = qstarts[i]+pe['blockSizes'][i]
      target_index = tstarts[i]+pe['blockSizes'][i]
    ### cigar done
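    # Worked example: qStarts=[0,10], tStarts=[1000,1110], blockSizes=[10,20]
    # normalize to qstarts=[0,10], tstarts=[0,110]; block 1 emits 10M, block 2
    # has qdif=0 and tdif=100 > min_intron_size, so the cigar is 10M100N20M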
    # inspect junctions if we have a ref_genome
    spliceflag_set = False
    if self.ref_genome_set:
      canon = 0
      revcanon = 0
      for i in junctions: #blocks following a junction
        left_num = pe['tStarts'][i-1]+pe['blockSizes'][i-1]
        left_val = self.ref_genome[pe['tName']][left_num:left_num+2].upper()
        right_num = pe['tStarts'][i]-2 # two bases before the start of block i, the acceptor side
        right_val = self.ref_genome[pe['tName']][right_num:right_num+2].upper()
        junc = left_val + '-' + right_val
        if junc in self.canonical: canon += 1
        if junc in self.revcanonical: revcanon += 1
      if canon > revcanon: 
        spliceflag_set = True
        spliceflag = '+'
      elif revcanon > canon:
        spliceflag_set = True
        spliceflag = '-'
    # if we have junctions, and we should be setting direction but 
    # we can't figure out the direction skip ambiguous direction
    if len(junctions) > 0 and self.skip_directionless_splice and spliceflag_set == False:
      return False
    samline =  pe['qName'] + "\t"        # 1. QNAME
    if pe['strand'] == '-':
      samline += '16' + "\t"             # 2. FLAG
    else:
      samline += '0' + "\t"
    samline += pe['tName'] + "\t"        # 3. RNAME
    samline += str(t_coord_start) + "\t" # 4. POS
    samline += '0' + "\t"                # 5. MAPQ
    samline += cigar + "\t"         # 6. CIGAR
    samline += '*' + "\t"           # 7. RNEXT
    samline += '0' + "\t"           # 8. PNEXT
    samline += '0' + "\t"           # 9. TLEN
    samline += q_seq_trimmed + "\t" # 10. SEQ
    samline += qual_trimmed + "\t"  # 11. QUAL
    if spliceflag_set:
      samline += 'XS:A:'+spliceflag + "\t"
    if self.mapping_counts_set:
      samline += 'NH:i:'+str(self.mapping_counts[pe['qName']]) + "\t"
    samline += 'XC:i:'+str(len(junctions)) + "\t"
    samline += 'NM:i:0'
    return samline
def main():
  parser = argparse.ArgumentParser(description="Analyze ORDERED psl alignments of long reads.")
  parser.add_argument('psl_file',help="Alignment file. Must be ordered by query name. use - for stdin")
  parser.add_argument('-o','--output',help="Write to output file, default is STDIN")
  parser.add_argument('--noheader',action='store_true')
  parser.add_argument('--minimum_coverage',type=int,help="Only consider alignments with at least this many bp aligned")
  parser.add_argument('--threads',type=int,default=multiprocessing.cpu_count(),help="INT default cpu_count")
  parser.add_argument('--tempbuffer',help="DIRECTORY to store the results in a temporary file until they are ready to output; /tmp is a reasonable choice.")
  args = parser.parse_args()
  seen_names = set()
  last_name = ''
  buffer = PSLBasics.MultiplePSLAlignments()
  inf = sys.stdin
  if args.psl_file != '-':
    inf = open(args.psl_file)
  global of
  of = sys.stdout  # default to STDOUT unless --output/--tempbuffer redirect it
  tname = None
  if args.tempbuffer:
    if not args.output:
      sys.stderr.write("ERROR if you want to buffer outputs in a temp file you need to specify a final output file.\n")
      sys.exit()
    rnum = random.randint(1,1000000000)
    tname = args.tempbuffer.rstrip('/')+'/weirathe.'+str(rnum)+'.meta'
    of = open(tname,'w')
  if args.output and not args.tempbuffer:
    of = open(args.output,'w')
  global lock
  if args.threads > 1:
    pool = multiprocessing.Pool(args.threads)
  for line in inf:
    e = PSLBasics.line_to_entry(line.rstrip())
    if e['qName'] != last_name: # we have a new name
      if e['qName'] in seen_names:
        sys.stderr.write("ERROR psl entries are not ordered by query name.\n")
        sys.exit()
      seen_names.add(e['qName'])
      if buffer.get_alignment_count() > 0:
        #process_buffer(buffer)
        if args.threads > 1:
          pool.apply_async(process_buffer,[buffer],callback=print_result)
        else:
          res = process_buffer(buffer)
          print_result(res)
      buffer = PSLBasics.MultiplePSLAlignments()
      if args.minimum_coverage > 1:
        buffer.set_minimum_coverage(args.minimum_coverage)
    last_name = e['qName']
    buffer.add_entry(PSLBasics.PSL(line.rstrip()))
  inf.close()
  if buffer.get_alignment_count() > 0:
    if args.threads > 1:
      pool.apply_async(process_buffer,[buffer],callback=print_result) # if we still have something left to do
    else:
      res = process_buffer(buffer)
      print_result(res)
  if args.threads > 1:
    pool.close()
    pool.join()
  of.close()
  if args.tempbuffer:
    of = open(args.output,'w')
    with open(tname) as inf:
      for line in inf:
        of.write(line)
    of.close()
    os.remove(tname)
def main():
  parser = argparse.ArgumentParser(description="Analyze ORDERED psl alignments of long reads.")
  parser.add_argument('psl_file',help="Alignment file. Must be ordered by query name. use - for stdin")
  parser.add_argument('--output',help="Write to output file, default is STDIN")
  parser.add_argument('--noheader',action='store_true')
  #parser.add_argument('--best',action='store_true')
  #parser.add_argument('--split',action='store_true')
  parser.add_argument('--minimum_coverage',type=int,help="Only consider alignments with at least this many bp aligned")
  parser.add_argument('--threads',type=int,default=multiprocessing.cpu_count(),help="INT default cpu_count")
  parser.add_argument('--tempbuffer',help="DIRECTORY to store the results in a temporary file until they are ready to output; /tmp is a reasonable choice.")
  args = parser.parse_args()
  seen_names = set()
  last_name = ''
  buffer = PSLBasics.MultiplePSLAlignments()
  inf = sys.stdin
  if args.psl_file != '-':
    inf = open(args.psl_file)
  global of
  of = sys.stdout  # default to STDOUT unless --output/--tempbuffer redirect it
  tname = None
  if args.tempbuffer:
    if not args.output:
      sys.stderr.write("ERROR if you want to buffer outputs in a temp file you need to specify a final output file.\n")
      sys.exit()
    rnum = random.randint(1,1000000000)
    tname = args.tempbuffer.rstrip('/')+'/weirathe.'+str(rnum)+'.meta'
    of = open(tname,'w')
  if args.output and not args.tempbuffer:
    of = open(args.output,'w')
  global lock
  if not args.noheader:
    lock.acquire()
    of.write("QueryName\tSegmentCount\tLocusCount\tHasOverlapped\tHasMultiplyMapped\n")
    lock.release()
  pool = multiprocessing.Pool(args.threads)
  for line in inf:
    e = PSLBasics.line_to_entry(line.rstrip())
    if e['qName'] != last_name: # we have a new name
      if e['qName'] in seen_names:
        sys.stderr.write("ERROR psl entries are not ordered by query name.\n")
        sys.exit()
      seen_names.add(e['qName'])
      if buffer.get_alignment_count() > 0:
        #process_buffer(buffer)
        pool.apply_async(process_buffer,[buffer],callback=print_result)
      buffer = PSLBasics.MultiplePSLAlignments()
      if args.minimum_coverage > 1:
        buffer.set_minimum_coverage(args.minimum_coverage)
    last_name = e['qName']
    buffer.add_entry(e)
  inf.close()
  if buffer.get_alignment_count() > 0:
    #process_buffer(buffer) # if we still have something left to do
    pool.apply_async(process_buffer,[buffer],callback=print_result) # if we still have something left to do
  pool.close()
  pool.join()
  of.close()
  if args.tempbuffer:
    of = open(args.output,'w')
    with open(tname) as inf:
      for line in inf:
        of.write(line)
    of.close()
    os.remove(tname)
def main():
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STIDN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
  #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  last_fasta = fhr.read_entry()
  if not last_fasta:
    sys.stderr.write("ERROR: No query sequences\n")
    sys.exit()
  for line in inf:
    p = PSLBasics.PSL(line)
    if not p.validate():
      sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts... and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
      continue
    n = p.value('qName')
    if not last_fasta:
      sys.stderr.write("ERROR: Ran out of query sequences too soon.  Are they sorted properly\n")
      sys.exit()
    while last_fasta['name'] != n:
      last_fasta = fhr.read_entry()
      if not last_fasta:
        sys.exit("ERROR: query sequence "+n+" not found; are the inputs sorted the same way?")
    p.set_query(last_fasta['seq'])
    p.set_reference_dictionary(g)
    p.correct_stats()
    print p.get_line()
    continue # the manual recalculation below is kept for reference but never runs
    f = last_fasta
    nCount = 0
    matches = 0
    misMatches = 0
    prev_qE = 0
    prev_tE = 0
    qNumInsert = 0
    qBaseInsert = 0
    tNumInsert = 0
    tBaseInsert = 0
    for i in range(p.value('blockCount')):
      blen = p.value('blockSizes')[i]
      qS = p.value('qStarts')[i] #query start
      qE = qS + blen             #query end
      tS = p.value('tStarts')[i] #target start
      tE = tS + blen             #target end
      #Work on gaps
      if prev_qE > 0 or prev_tE > 0: #if its not our first time through
        tgap = tS-prev_tE
        if tgap < args.minimum_intron_size and tgap > 0:
          tNumInsert += 1
          tBaseInsert += tgap
        qgap = qS-prev_qE
        if qgap > 0:
          qNumInsert += 1
          qBaseInsert += qgap
      query = f['seq']
      if p.value('strand') == '-':
        query = rc(f['seq'])
      qseq = query[qS:qE].upper()
      rseq = g[p.value('tName')][tS:tE].upper()
      #print qseq+"\n"+rseq+"\n"
      for j in range(0,blen):
        if qseq[j] == 'N':
          nCount += 1
        elif qseq[j] == rseq[j]:
          matches += 1
        else:
          misMatches += 1
      prev_qE = qE
      prev_tE = tE
    p.entry['matches'] = matches
    p.entry['misMatches'] = misMatches
    p.entry['nCount'] = nCount
    p.entry['qNumInsert'] = qNumInsert
    p.entry['qBaseInsert'] = qBaseInsert
    p.entry['tNumInsert'] = tNumInsert
    p.entry['tBaseInsert'] = tBaseInsert
    p.entry['qSize'] = len(query)
    p.entry['tSize'] = len(g[p.value('tName')]) 
    print p.get_line()
    #p.pretty_print(100)
  fhr.close()
def main():
  parser = argparse.ArgumentParser(description="splice together partial alignments")
  group1 = parser.add_mutually_exclusive_group(required=True)
  group1.add_argument('--fastq_reads')
  group1.add_argument('--fasta_reads')
  parser.add_argument('--genome',help="FASTA reference genome",required=True)
  parser.add_argument('--genepred',help="Transcriptome genepred")
  parser.add_argument('--max_intron_size',type=int,default=100000,help="INT maximum intron size")
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT minimum intron size")
  parser.add_argument('--max_gap_size',type=int,default=10,help="INT gap size in query to join")
  parser.add_argument('--max_search_expand',type=int,default=10,help="INT max search space to expand search for junction")
  parser.add_argument('--direction_specific',action='store_true',help="The direction of the transcript is known and properly oriented already")
  parser.add_argument('--threads',type=int,default=0,help="INT number of threads to use default cpu_count")
  parser.add_argument('-o','--output',default='-',help="FILENAME output results to here rather than STDOUT which is default")
  parser.add_argument('input_alignment',help="FILENAME input .psl file or '-' for STDIN")
  args = parser.parse_args()

  # Read our reference genome
  sys.stderr.write("Reading reference\n")
  ref = read_fasta_into_hash(args.genome)

  # Make sure our reads are unique
  sys.stderr.write("Checking for unqiuely named reads\n")
  reads = check_for_uniquely_named_reads(args) # does a hard exit and error if there are any names repeated
  sys.stderr.write("Reads are uniquely named\n")
  
  # Set number of threads to use
  cpu_count = multiprocessing.cpu_count()
  if args.threads > 0:
    cpu_count = args.threads

  #Set reference splices (if any are available)
  reference_splices = {}
  if args.genepred:
    sys.stderr.write("Reading reference splices from genepred\n")
    reference_splices = get_reference_splices(args)

  sys.stderr.write("Reading alignments into loci\n")

  # Get locus division (first stage)
  # Each read (qName) is separated
  # Then each locus will be specific to a chromosome (tName)
  # Then by (strand), but keep in mind this is based on the read
  # Each locus should be specific to a direction but we don't necessarily
  # know direction based on the data we have thus far.
  inf = sys.stdin
  if args.input_alignment != '-': inf = open(args.input_alignment,'r')
  loci = {}
  for line in inf:
    line = line.rstrip()
    if re.match('^#',line): continue
    psl = PSLBasics.line_to_entry(line)
    if psl['qName'] not in loci:
      loci[psl['qName']] = {}
    if psl['tName'] not in loci[psl['qName']]:
      loci[psl['qName']][psl['tName']] = {}
    if psl['strand'] not in loci[psl['qName']][psl['tName']]:
      loci[psl['qName']][psl['tName']][psl['strand']] = {}
    if psl['tStarts'][0] not in loci[psl['qName']][psl['tName']][psl['strand']]:
      loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'][0]] = []
    loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'][0]].append(psl)
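  # loci[qName][tName][strand][first tStart] -> list of psl entries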

  sys.stderr.write("breaking loci by genomic distance\n")
  for qname in loci:
    for chr in loci[qname]:
      for strand in loci[qname][chr]:
        #print qname + "\t" + chr + "\t" + strand
        starts = loci[qname][chr][strand].keys()
        current_set = []
        locus_sets = []
        last_end = -1*(args.max_intron_size+2)
        for start in sorted(starts):
          for e in loci[qname][chr][strand][start]:
            start = e['tStarts'][0]+1 # base-1 start of the alignment
            if start > last_end+args.max_intron_size:
              # we have the start of a new set
              if len(current_set) > 0: 
                locus_sets.append(current_set)
              current_set = []
            last_end = e['tStarts'][len(e['tStarts'])-1]+e['blockSizes'][len(e['tStarts'])-1]
            current_set.append(e)
        if len(current_set) > 0:
          locus_sets.append(current_set)
        loci[qname][chr][strand] = locus_sets # replace what was there with these ordered sets

  locus_total = 0
  for qname in loci:
    for chr in loci[qname]:
      for strand in loci[qname][chr]:
        for locus_set in loci[qname][chr][strand]:
          locus_total+=1  

  sys.stderr.write("Work on each read in each locus with "+str(cpu_count)+" CPUs\n")
  p = multiprocessing.Pool(processes=cpu_count) # pool is created, but the call below currently runs serially
  locus_count = 0
  for qname in loci:
    for chr in loci[qname]:
      for strand in loci[qname][chr]:
        #print qname + "\t" + chr + "\t" + strand
        for locus_set in loci[qname][chr][strand]:
          locus_count += 1
          onum = len(locus_set)
          # send blank reference splices unless we have some
          rsplices = {}
          if chr in reference_splices: rsplices = reference_splices[chr]
          #p.apply_async(process_locus_set,(locus_set,args,rsplices,ref[chr],reads[qname],locus_total,locus_count),callback=do_locus_callback)
          r1 = execute_locus(locus_set,args,rsplices,ref[chr],reads[qname],locus_total,locus_count)
          do_locus_callback(r1)
          #nnum = len(new_locus_set)
          #print str(onum) + " to " + str(nnum)
          #for e in new_locus_set:
          #  print PSLBasics.entry_to_line(e)
  p.close()
  p.join() 
  sys.stderr.write("\nfinished\n")

  ofh = sys.stdout
  if not args.output == '-':
    ofh = open(args.output,'w')

  for line in combo_results:
    ofh.write(line)
Example #24
0
def main():
    parser = argparse.ArgumentParser(
        description="splice together partial alignments")
    group1 = parser.add_mutually_exclusive_group(required=True)
    group1.add_argument('--fastq_reads')
    group1.add_argument('--fasta_reads')
    parser.add_argument('--genome',
                        help="FASTA reference genome",
                        required=True)
    parser.add_argument('--genepred', help="Transcriptome genepred")
    parser.add_argument('--max_intron_size',
                        type=int,
                        default=100000,
                        help="INT maximum intron size")
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT minimum intron size")
    parser.add_argument('--max_gap_size',
                        type=int,
                        default=10,
                        help="INT gap size in query to join")
    parser.add_argument(
        '--max_search_expand',
        type=int,
        default=10,
        help="INT max search space to expand search for junction")
    parser.add_argument(
        '--direction_specific',
        action='store_true',
        help=
        "The direction of the transcript is known and properly oriented already"
    )
    parser.add_argument('--threads',
                        type=int,
                        default=0,
                        help="INT number of threads to use default cpu_count")
    parser.add_argument(
        '-o',
        '--output',
        default='-',
        help=
        "FILENAME output results to here rather than STDOUT which is default")
    parser.add_argument('input_alignment',
                        help="FILENAME input .psl file or '-' for STDIN")
    args = parser.parse_args()

    # Read our reference genome
    sys.stderr.write("Reading reference\n")
    ref = read_fasta_into_hash(args.genome)

    # Make sure our reads are unique
    sys.stderr.write("Checking for unqiuely named reads\n")
    reads = check_for_uniquely_named_reads(
        args)  # does a hard exit and error if there are any names repeated
    sys.stderr.write("Reads are uniquely named\n")

    # Set number of threads to use
    cpu_count = multiprocessing.cpu_count()
    if args.threads > 0:
        cpu_count = args.threads

    #Set reference splices (if any are available)
    reference_splices = {}
    if args.genepred:
        sys.stderr.write("Reading reference splices from genepred\n")
        reference_splices = get_reference_splices(args)

    sys.stderr.write("Reading alignments into loci\n")

    # Get locus division (first stage)
    # Each read (qName) is separated,
    # then each locus is specific to a chromosome (tName),
    # then to a (strand), but keep in mind the strand is based on the read.
    # Each locus should be specific to a direction, but we don't necessarily
    # know the direction from the data we have thus far.
    inf = sys.stdin
    if args.input_alignment != '-': inf = open(args.input_alignment, 'r')
    loci = {}
    for line in inf:
        line = line.rstrip()
        if re.match('^#', line): continue
        psl = PSLBasics.line_to_entry(line)
        if psl['qName'] not in loci:
            loci[psl['qName']] = {}
        if psl['tName'] not in loci[psl['qName']]:
            loci[psl['qName']][psl['tName']] = {}
        if psl['strand'] not in loci[psl['qName']][psl['tName']]:
            loci[psl['qName']][psl['tName']][psl['strand']] = {}
        if psl['tStarts'][0] not in loci[psl['qName']][psl['tName']][psl['strand']]:
            loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'][0]] = []
        loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'][0]].append(psl)
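    # For illustration, the nested structure built above (hypothetical values):
    #   loci['read1']['chr1']['+'][10523] = [<psl entry>, <psl entry>, ...]
    # i.e. qName -> tName -> strand -> first tStart -> list of PSL entries,
    # so all alignments of one read to one chromosome and strand are grouped
    # together, keyed by where they begin on the target.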

    sys.stderr.write("breaking loci by genomic distance\n")
    for qname in loci:
        for chr in loci[qname]:
            for strand in loci[qname][chr]:
                #print qname + "\t" + chr + "\t" + strand
                starts = loci[qname][chr][strand].keys()
                current_set = []
                locus_sets = []
                last_end = -1 * (args.max_intron_size + 2)
                for start in sorted(starts):
                    for e in loci[qname][chr][strand][start]:
                        start = e['tStarts'][0] + 1  # base-1 start of the alignment
                        if start > last_end + args.max_intron_size:
                            # we have the start of a new set
                            if len(current_set) > 0:
                                locus_sets.append(current_set)
                            current_set = []
                        last_end = e['tStarts'][-1] + e['blockSizes'][-1]
                        current_set.append(e)
                if len(current_set) > 0:
                    locus_sets.append(current_set)
                loci[qname][chr][strand] = locus_sets  # replace what was there with these ordered sets
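    # Worked example (hypothetical numbers): with max_intron_size=100000 and
    # alignments spanning 1000-1200, 5000-5300, and 500000-500900 on one
    # strand, the second start (5000) is within 100000 of the previous end
    # (1200) and joins its set, while 500000 is not and opens a new one:
    # locus_sets = [[a, b], [c]].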

    locus_total = 0
    for qname in loci:
        for chr in loci[qname]:
            for strand in loci[qname][chr]:
                for locus_set in loci[qname][chr][strand]:
                    locus_total += 1

    sys.stderr.write("Work on each read in each locus with " + str(cpu_count) +
                     " CPUs\n")
    p = multiprocessing.Pool(processes=cpu_count)
    locus_count = 0
    for qname in loci:
        for chr in loci[qname]:
            for strand in loci[qname][chr]:
                #print qname + "\t" + chr + "\t" + strand
                for locus_set in loci[qname][chr][strand]:
                    locus_count += 1
                    onum = len(locus_set)
                    # send blank reference splices unless we have some
                    rsplices = {}
                    if chr in reference_splices:
                        rsplices = reference_splices[chr]
                    #p.apply_async(process_locus_set,(locus_set,args,rsplices,ref[chr],reads[qname],locus_total,locus_count),callback=do_locus_callback)
                    r1 = execute_locus(locus_set, args, rsplices, ref[chr],
                                       reads[qname], locus_total, locus_count)
                    do_locus_callback(r1)
                    #nnum = len(new_locus_set)
                    #print str(onum) + " to " + str(nnum)
                    #for e in new_locus_set:
                    #  print PSLBasics.entry_to_line(e)
    p.close()
    p.join()
    sys.stderr.write("\nfinished\n")

    ofh = sys.stdout
    if args.output != '-':
        ofh = open(args.output, 'w')

    for line in combo_results:
        ofh.write(line)
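
The distance-based locus breaking used in both examples above can be read in isolation. Below is a minimal sketch under simplified assumptions: entries are reduced to plain (start, end) tuples in target coordinates, and break_by_distance is a hypothetical helper name, not part of the example code:

def break_by_distance(entries, max_intron_size):
    # entries: list of (start, end) pairs in base-1 target coordinates
    # returns lists of entries separated by more than max_intron_size
    locus_sets = []
    current_set = []
    last_end = -1 * (max_intron_size + 2)
    for start, end in sorted(entries):
        if start > last_end + max_intron_size:
            # we have the start of a new set
            if len(current_set) > 0:
                locus_sets.append(current_set)
            current_set = []
        last_end = end
        current_set.append((start, end))
    if len(current_set) > 0:
        locus_sets.append(current_set)
    return locus_sets

# e.g. break_by_distance([(1000, 1200), (5000, 5300), (500000, 500900)], 100000)
# -> [[(1000, 1200), (5000, 5300)], [(500000, 500900)]]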
Example #25
0
    def convert_line(self,
                     psl_line,
                     query_sequence=None,
                     quality_sequence=None):
        try:
            pe = PSLBasics.line_to_entry(psl_line)
        except:
            sys.stderr.write("Problem parsing line:\n" + psl_line.rstrip() +
                             "\n")
            return False
        if len(pe['tStarts']) != len(pe['blockSizes']):
            sys.stderr.write("Warning invalid psl entry: " + pe['qName'] +
                             "\n")
            return False
        #work on the positive strand case first
        cigar = '*'
        blocks = len(pe['blockSizes'])
        starts = pe['qStarts']
        #if pe['strand'] == '-':
        #  starts = [x for x in reversed(pe['qStarts_actual'])]
        #  print 'isrev'
        q_coord_start = starts[0] + 1  # base-1 converted starting position
        q_coord_end = starts[blocks - 1] + pe['blockSizes'][blocks - 1]  # base-1 end position
        t_coord_start = pe['tStarts'][0] + 1  # base-1 converted starting position
        t_coord_end = pe['tStarts'][blocks - 1] + pe['blockSizes'][blocks - 1]  # base-1 end position
        if self.reads_set and pe['qName'] not in self.reads:
            sys.stderr.write("Warning: qName " + pe['qName'] +
                             " was not found in reads\n")
        # we will clip the query sequence to begin and end from the aligned region
        #q_seq = ''
        #if self.reads_set:
        #  q_seq = self.reads[pe['qName']]

        # 1. Get the new query to output
        q_seq_trimmed = '*'
        if self.reads_set or query_sequence:
            q_seq_trimmed = query_sequence
            if not query_sequence:  # get it from the archive we loaded if we didn't give it
                q_seq_trimmed = self.reads[pe['qName']]
            if pe['strand'] == '-':
                q_seq_trimmed = SequenceBasics.rc(q_seq_trimmed)
            q_seq_trimmed = q_seq_trimmed[q_coord_start - 1:q_coord_end]

        qual_trimmed = '*'
        if self.qualities_set or quality_sequence:
            qual_trimmed = quality_sequence
            if not quality_sequence:
                qual_trimmed = self.qualities[pe['qName']]
            if pe['strand'] == '-':
                qual_trimmed = qual_trimmed[::-1]
            qual_trimmed = qual_trimmed[q_coord_start - 1:q_coord_end]
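        # Illustration (hypothetical numbers): for a '-' strand alignment with
        # q_coord_start=3 and q_coord_end=6, the read is reverse-complemented
        # first (qualities are simply reversed), then sliced to bases 3..6 of
        # that reoriented sequence, so SEQ and QUAL land in target orientation
        # as SAM requires for reverse-strand records.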
        # 2. Get the cigar string to output
        prev_diff = t_coord_start - q_coord_start
        cigar = ''
        #for i in range(0,blocks):
        #  current_diff = pe['tStarts'][i]-starts[i]
        #  delta = current_diff - prev_diff
        #  #print delta
        #  if delta >= self.min_intron_size:
        #    cigar += str(abs(delta))+'N'
        #  elif delta > 0: # we have a deletion
        #    cigar += str(abs(delta))+'D'
        #  elif delta < 0: # we have an insertion
        #    cigar += str(abs(delta))+'I'
        #  cigar += str(pe['blockSizes'][i])+'M' # our matches
        #  #print current_diff
        #  prev_diff = current_diff
        qstarts = [x - pe['qStarts'][0] for x in pe['qStarts']]
        tstarts = [x - pe['tStarts'][0] for x in pe['tStarts']]
        query_index = 0
        target_index = 0
        junctions = []
        for i in range(0, blocks):
            qdif = qstarts[i] - query_index
            tdif = tstarts[i] - target_index
            if qdif > 0:  # we have to insert
                cigar += str(qdif) + 'I'
            if tdif > self.min_intron_size:  # we have an intron
                cigar += str(tdif) + 'N'
                junctions.append(i)
            elif tdif > 0:  # we have to delete
                cigar += str(tdif) + 'D'
            cigar += str(pe['blockSizes'][i]) + 'M'
            query_index = qstarts[i] + pe['blockSizes'][i]
            target_index = tstarts[i] + pe['blockSizes'][i]
        ### cigar done
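        # Worked example (hypothetical values): blockSizes=[10, 20],
        # qStarts=[0, 10], tStarts=[100, 210], min_intron_size=68.
        # Normalized qstarts=[0, 10] and tstarts=[0, 110].  Block 0 emits
        # '10M'; block 1 has qdif=0 and tdif=100 > 68, so it emits '100N'
        # then '20M', giving cigar='10M100N20M' with junctions=[1].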
        # inspect junctions if we have a ref_genome
        spliceflag_set = False
        if self.ref_genome_set:
            canon = 0
            revcanon = 0
            for i in junctions:  # i indexes the block following a junction
                left_num = pe['tStarts'][i - 1] + pe['blockSizes'][i - 1]
                left_val = self.ref_genome[pe['tName']][left_num:left_num + 2].upper()
                right_num = pe['tStarts'][i] - 2  # last two intron bases before block i
                right_val = self.ref_genome[pe['tName']][right_num:right_num + 2].upper()
                junc = left_val + '-' + right_val
                if junc in self.canonical: canon += 1
                if junc in self.revcanonical: revcanon += 1
            if canon > revcanon:
                spliceflag_set = True
                spliceflag = '+'
            elif revcanon > canon:
                spliceflag_set = True
                spliceflag = '-'
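        # The vote above compares each junction's dinucleotides against the
        # canonical splice motifs (classically GT-AG on the forward strand,
        # which reads as CT-AC when the transcript derives from the reverse
        # strand); the winning orientation becomes the XS:A tag attached below.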
        # if we have junctions and we should be setting a direction, but we
        # cannot determine the direction, skip this ambiguous alignment
        if len(junctions) > 0 and self.skip_directionless_splice and not spliceflag_set:
            return False
        samline = pe['qName'] + "\t"  # 1. QNAME
        if pe['strand'] == '-':
            samline += '16' + "\t"  # 2. FLAG
        else:
            samline += '0' + "\t"
        samline += pe['tName'] + "\t"  # 3. RNAME
        samline += str(t_coord_start) + "\t"  # 4. POS
        samline += '0' + "\t"  # 5. MAPQ
        samline += cigar + "\t"  # 6. CIGAR
        samline += '*' + "\t"  # 7. RNEXT
        samline += '0' + "\t"  # 8. PNEXT
        samline += '0' + "\t"  # 9. TLEN
        samline += q_seq_trimmed + "\t"  # 10. SEQ
        samline += qual_trimmed + "\t"  # 11. QUAL
        if spliceflag_set:
            samline += 'XS:A:' + spliceflag + "\t"
        if self.ref_genome_set:
            samline += 'NH:i:' + str(self.mapping_counts[pe['qName']]) + "\t"
        samline += 'XC:i:' + str(len(junctions)) + "\t"
        samline += 'NM:i:0'
        return samline
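
The return value above concatenates the eleven mandatory SAM columns by hand. As a sketch of an equivalent, less error-prone formulation, build_sam_line below is a hypothetical helper that assembles the same columns with a list and '\t'.join:

def build_sam_line(qname, flag, rname, pos, cigar, seq, qual, optional=None):
    # assembles the 11 mandatory SAM columns plus any optional tags
    fields = [qname, str(flag), rname, str(pos),
              '0',                    # MAPQ fixed at 0, as in the code above
              cigar,
              '*', '0', '0',          # RNEXT/PNEXT/TLEN unused here
              seq, qual]
    if optional:
        fields.extend(optional)
    return "\t".join(fields)

# e.g. build_sam_line('read1', 16, 'chr1', 100, '10M100N20M', 'N' * 30, '*',
#                     optional=['XS:A:-', 'XC:i:1', 'NM:i:0'])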