예제 #1
0
def parse_refgpd(tdir,geneprednames,simplenames):
  """Flatten reference genepred files into per-exon BED files plus an index.

  For the N-th name in geneprednames (counting from 1) this writes
  tdir/reference.N.bed with one line per exon:
    chrom, exon start, exon end, entry number, gene_name, name,
    exon count, strand, exon number (counting from 1)
  and adds one line per entry to tdir/entries.txt:
    N, simplenames[N-1], entry number, gene_name, name, summed exon length

  Entry numbers keep increasing across files.  Lines starting with '#'
  are skipped.

  Args:
    tdir: directory that receives the output files.
    geneprednames: iterable of file names opened with
      FileBasics.GenericFileReader.
    simplenames: sequence of labels, indexed in parallel with geneprednames.
  """
  entry_number = 0
  # Context managers guarantee the outputs are flushed and closed even when a
  # malformed line makes the parser raise.
  with open(tdir+"/entries.txt",'w') as of_entries:
    for column_number, gpd_name in enumerate(geneprednames, 1):
      with open(tdir+"/reference."+str(column_number)+".bed",'w') as of_ref:
        gfr = FileBasics.GenericFileReader(gpd_name)
        try:
          while True:
            line = gfr.readline()
            if not line: break
            if line.startswith('#'): continue
            entry_number += 1
            entry = GenePredBasics.line_to_entry(line.rstrip("\n"))
            exon_count = len(entry['exonStarts'])
            entry_length = 0
            for i in range(exon_count):
              entry_length += entry['exonEnds'][i]-entry['exonStarts'][i]
            of_entries.write("\t".join([
              str(column_number),
              simplenames[column_number-1],
              str(entry_number),
              entry['gene_name'],
              entry['name'],
              str(entry_length)])+"\n")
            for i in range(exon_count):
              of_ref.write("\t".join([
                entry['chrom'],
                str(entry['exonStarts'][i]),
                str(entry['exonEnds'][i]),
                str(entry_number),
                entry['gene_name'],
                entry['name'],
                str(exon_count),
                entry['strand'],
                str(i+1)])+"\n")
        finally:
          gfr.close()
예제 #2
0
def parse_gpdfile(tdir,gpdfile,smoothing_factor):
  """Smooth long-read genepred entries and write them to tdir/longreads.gpd.

  Each non-comment input line is parsed, passed through
  GenePredBasics.smooth_gaps with smoothing_factor, renamed to its running
  read number (counting from 1) and written out as one genepred line.  A
  warning goes to stderr whenever an original read name shows up again.

  Args:
    tdir: directory that receives longreads.gpd.
    gpdfile: input file name, or '-' to read from sys.stdin.
    smoothing_factor: second argument handed to GenePredBasics.smooth_gaps.
  """
  from_stdin = gpdfile == '-'
  if from_stdin:
    fr = sys.stdin
  else:
    fr = FileBasics.GenericFileReader(gpdfile)
  seennames = set()
  longreadnumber = 0
  try:
    with open(tdir+'/longreads.gpd','w') as of_gpd:
      while True:
        line = fr.readline()
        if not line: break
        if line.startswith('#'): #skip comments
          continue
        longreadnumber += 1
        entry = GenePredBasics.smooth_gaps(
          GenePredBasics.line_to_entry(line.rstrip()),
          smoothing_factor)
        readname = entry['name']
        if readname in seennames:
          sys.stderr.write("Warning: repeat name '"+readname+"'\n")
        # Remember the name; without this the repeat check can never trigger.
        seennames.add(readname)
        # The read number replaces the original name in the output.
        entry['name'] = str(longreadnumber)
        of_gpd.write(GenePredBasics.entry_to_line(entry)+"\n")
  finally:
    # Only close what this function opened; stdin belongs to the caller.
    if not from_stdin:
      fr.close()
def get_exons_from_seqs(seqs, d, spcf):
    """Return tab-delimited exon lines for every alignment listed in seqs.

    Each alignment is turned into a SAM-style record (a copy of d with
    rname, flag, pos, cigar, cigar_array and a placeholder all-'N' seq),
    converted to PSL through spcf.convert_line, then to genepred, and gap
    smoothed with a threshold of 68.  One line is emitted per exon:
      chrom, exon start, exon end, strand, name, M-operation total,
      indel total, 'P' for the first alignment / 'S' for later ones,
      size of a leading S or H cigar operation (0 if there is none)

    Args:
        seqs: iterable of indexable alignments; items 1-4 are read as
            reference name, strand, position and cigar string.
        d: dict used as the template for each record (copied, not modified).
        spcf: object whose convert_line turns a SAM line into a PSL line.

    Returns:
        All exon lines concatenated into one string ('' for no alignments).
    """
    non_query_ops = set(['H', 'D', 'N'])
    rows = []
    for rank, seq in enumerate(seqs, 1):
        psec = 'P' if rank == 1 else 'S'  # primary or secondary
        rec = d.copy()
        rec['rname'] = seq[1]
        rec['flag'] = 0 if seq[2] == '+' else 16
        rec['pos'] = seq[3]
        rec['cigar'] = seq[4]
        cigar = SamBasics.parse_cigar(seq[4])
        rec['cigar_array'] = cigar
        lead = cigar[0]
        qstart = lead['val'] if lead['op'] in ('S', 'H') else 0
        total_length = 0
        possible_matches = 0
        indels = 0
        for step in cigar:
            op = step['op']
            if op not in non_query_ops:
                total_length += step['val']
            if op == 'M':
                possible_matches += step['val']
            elif op == 'I' or (op == 'D' and step['val'] < 68):
                # deletions of 68 or more are not counted as indels
                indels += step['val']
        # The record carries no real bases; a run of N of the right length
        # stands in for the sequence.
        rec['seq'] = 'N' * total_length
        psl_line = spcf.convert_line(SamBasics.entry_to_line(rec))
        psl_entry = PSLBasics.line_to_entry(psl_line)
        # NOTE: mismatch counting against a reference genome used to live
        # here as commented-out code; it is not performed.
        gpd_line = PSLBasics.convert_entry_to_genepred_line(psl_entry)
        smoothed = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(gpd_line), 68)
        for i in range(len(smoothed['exonStarts'])):
            rows.append("\t".join([
                smoothed['chrom'],
                str(smoothed['exonStarts'][i]),
                str(smoothed['exonEnds'][i]),
                smoothed['strand'],
                smoothed['name'],
                str(possible_matches),
                str(indels),
                psec,
                str(qstart),
            ]) + "\n")
    return ''.join(rows)
def main():
  """Command-line entry point: move PSL junctions onto nearby reference junctions.

  Reads the reference genepred, records every junction of its multi-exon
  entries, then for each PSL line converts it to a genepred entry, smooths
  gaps using --min_intron_size, passes it through nudge() and prints the
  result: a genepred line, or a fake PSL line when --output_fake_psl names
  a genome FASTA.
  """
  parser = argparse.ArgumentParser(description='Use reference junctions when they are close',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT min intron size")
  # NOTE(review): --min_local_support is parsed but not referenced in this function.
  parser.add_argument('--min_local_support',type=int,default=0,help="INT min number of junctions within search_size of a junction in order to count it")
  parser.add_argument('--search_size',type=int,default=10,help="INT search space for reference")
  parser.add_argument('--output_fake_psl',help="FASTAFILE reference genome to make a fake PSL output")
  parser.add_argument('psl',help="PSLFILENAME or '-' for STDIN")
  parser.add_argument('reference_genepred',help="GENEPREDFILENAME for reference genepred")
  args = parser.parse_args()

  genome = {}
  if args.output_fake_psl:
    genome = read_fasta_into_hash(args.output_fake_psl)

  #read in the reference genepred first
  gpf = GenePredBasics.GenePredFile(args.reference_genepred)
  #ref[chrom][previous exon end][next exon start + 1] = strand
  ref = {}
  for e in [x.entry for x in gpf.entries]:
    if len(e['exonStarts']) <= 1: continue
    chrom_junctions = ref.setdefault(e['chrom'],{})
    for i in range(1,len(e['exonStarts'])):
      finishes = chrom_junctions.setdefault(e['exonEnds'][i-1],{})
      # setdefault keeps the strand of the first entry that supplied a junction
      finishes.setdefault(e['exonStarts'][i]+1,e['strand'])
  #Stored all junctions as 1-base

  pf = GenericFileReader(args.psl)
  try:
    while True:
      line = pf.readline()
      if not line: break
      if line.startswith('#'): continue
      pe = PSLBasics.line_to_entry(line.rstrip())
      if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
        sys.stderr.write("WARNING invalid psl\n")
        continue
      genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
      ge = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(genepred_line),args.min_intron_size)
      # alignments on a chromosome without reference junctions get an empty table
      refjuns = ref.get(pe['tName'],{})
      new_ge = nudge(pe,ge,refjuns,args)
      if args.output_fake_psl:
        print(GenePredBasics.entry_to_fake_psl_line(new_ge,genome))
      else:
        print(GenePredBasics.entry_to_line(new_ge))
  finally:
    # the reader used to be left open
    pf.close()
예제 #5
0
def main():
  """Command-line entry point: smooth small gaps in a genepred file.

  Reads genepred lines from the named file (or STDIN when the argument is
  '-'), passes each entry through GenePredBasics.smooth_gaps with
  --smoothing_size and prints the resulting genepred line.
  """
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('genepred',help="FILENAME or use - for STDIN")
  parser.add_argument('--smoothing_size',type=int,default=68,help="INT no gaps less than this size")
  args = parser.parse_args()
  use_stdin = args.genepred == '-'
  inf = sys.stdin if use_stdin else open(args.genepred)
  try:
    for line in inf:
      e = GenePredBasics.line_to_entry(line)
      e2 = GenePredBasics.smooth_gaps(e,args.smoothing_size)
      print(GenePredBasics.entry_to_line(e2))
  finally:
    # Close the file this function opened (it used to be left open);
    # STDIN is left alone.
    if not use_stdin:
      inf.close()
예제 #6
0
def get_exons_from_seqs(seqs,d,spcf):
  """Describe the exons of each alignment in seqs as tab-delimited text.

  For every alignment a copy of d is filled in as a SAM-style record
  (rname, flag, pos, cigar, cigar_array, and a seq made only of 'N'),
  run through spcf.convert_line to get PSL, converted to genepred and
  smoothed with a gap threshold of 68.  Every exon yields one line:
    chrom, start, end, strand, name, sum of M operations, indel total,
    'P' (first alignment) or 'S' (any later one), leading S/H clip size

  Args:
    seqs: iterable of indexable alignments; positions 1 to 4 supply the
      reference name, strand, position and cigar string.
    d: template dict for the record; it is copied for each alignment.
    spcf: converter object providing convert_line(sam_line).

  Returns:
    The concatenated lines as a single string; '' when seqs is empty.
  """
  skipped_ops = set(['H','D','N'])
  pieces = []
  for position, seq in enumerate(seqs):
    #primary for the first alignment, secondary afterwards
    psec = 'S' if position > 0 else 'P'
    sam = d.copy()
    sam['rname'] = seq[1]
    sam['flag'] = 0 if seq[2] == '+' else 16
    sam['pos'] = seq[3]
    sam['cigar'] = seq[4]
    sam['cigar_array'] = SamBasics.parse_cigar(seq[4])
    ops = sam['cigar_array']
    #a leading soft or hard clip gives the query start offset
    qstart = 0
    if ops[0]['op'] in ('S','H'):
      qstart = ops[0]['val']
    total_length = sum(c['val'] for c in ops if c['op'] not in skipped_ops)
    possible_matches = sum(c['val'] for c in ops if c['op'] == 'M')
    #insertions always count; deletions only when shorter than 68
    indels = sum(c['val'] for c in ops
                 if c['op'] == 'I' or (c['op'] == 'D' and c['val'] < 68))
    #placeholder sequence of the right length
    sam['seq'] = 'N'*total_length
    pline = spcf.convert_line(SamBasics.entry_to_line(sam))
    pentry = PSLBasics.line_to_entry(pline)
    #NOTE: a disabled (commented-out) mismatch count against the reference
    #genome used to sit here; no mismatches are computed.
    gentry = GenePredBasics.line_to_entry(PSLBasics.convert_entry_to_genepred_line(pentry))
    gsmooth = GenePredBasics.smooth_gaps(gentry,68)
    for idx in range(len(gsmooth['exonStarts'])):
      fields = [gsmooth['chrom'],
                str(gsmooth['exonStarts'][idx]),
                str(gsmooth['exonEnds'][idx]),
                gsmooth['strand'],
                gsmooth['name'],
                str(possible_matches),
                str(indels),
                psec,
                str(qstart)]
      pieces.append("\t".join(fields)+"\n")
  return ''.join(pieces)
예제 #7
0
def main():
    """Command-line entry point: smooth small gaps in a genepred file.

    Each line of the input (a file name, or STDIN for '-') is parsed with
    GenePredBasics.line_to_entry, smoothed with GenePredBasics.smooth_gaps
    using --smoothing_size, and printed back as a genepred line.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('genepred', help="FILENAME or use - for STDIN")
    parser.add_argument('--smoothing_size',
                        type=int,
                        default=68,
                        help="INT no gaps less than this size")
    args = parser.parse_args()
    opened_here = args.genepred != '-'
    inf = open(args.genepred) if opened_here else sys.stdin
    try:
        for line in inf:
            e = GenePredBasics.line_to_entry(line)
            e2 = GenePredBasics.smooth_gaps(e, args.smoothing_size)
            print(GenePredBasics.entry_to_line(e2))
    finally:
        # The input file was previously never closed; STDIN stays open.
        if opened_here:
            inf.close()
예제 #8
0
  def read_from_fasta_and_genepred(self,genomefastafile,genepredfile):
    """Fill self.transcripts / self.transcript_names from a genome and a genepred.

    The genepred is read first.  Entries whose name was already seen are
    either skipped (when their chrom, strand and exon coordinates also
    repeat an earlier entry) or stored under a new key "<name>.<k>";
    warnings about both situations go to stderr.  Then the genome is
    loaded with read_fasta_into_hash and, for each stored entry whose
    chrom is present, the exon slices are concatenated (and passed through
    rc() for '-' strand entries) to form the transcript sequence.

    Args:
      genomefastafile: path handed to read_fasta_into_hash.
      genepredfile: path of the genepred file; '#' lines are ignored.
    """
    issued_names = {}   # original name -> every key issued for that name
    coords_seen = {}
    entries = {}
    with open(genepredfile) as inf:
      for line in inf:
        if line.startswith('#'): continue
        e = GenePredBasics.line_to_entry(line)
        # digest of the location, used to spot entries at identical coordinates
        coord_key = hashlib.sha1(e['chrom']+"\t"+e['strand'] + "\t" + str(e['exonStarts'])+"\t" + str(e['exonEnds'])).hexdigest()
        same_coords = coord_key in coords_seen
        if same_coords:
          sys.stderr.write("Warning "+ e['name'] + " " + e['gene_name'] + " exists at identical coordinates as another entry\n")
        coords_seen[coord_key] = 1
        name = e['name']
        if name not in issued_names:
          issued_names[name] = [name]
          entries[name] = e
          continue
        if same_coords:
          sys.stderr.write("skipping perfect duplicate of "+name+"\n")
          continue
        # same name at a different location: keep it under a numbered alias
        alias = name + "."+str(len(issued_names[name])+1)
        issued_names[name].append(alias)
        sys.stderr.write("Warning "+ name + " " + e['gene_name'] + " is a duplicate name.. renaming to "+alias+ "\n")
        entries[alias] = e

    ref = read_fasta_into_hash(genomefastafile)
    for transcript, e in entries.items():
      if e['chrom'] not in ref: continue
      # transcript_names maps the (possibly aliased) key back to the original name
      self.transcript_names[transcript] = e['name']
      chrom_seq = ref[e['chrom']]
      seq = ''
      for exon in range(0,e['exonCount']):
        seq += chrom_seq[e['exonStarts'][exon]:e['exonEnds'][exon]]
      if e['strand'] == '-':
        seq = rc(seq)
      self.transcripts[transcript] = seq
def main():
  """Command-line entry point: print a genepred line for every PSL input line.

  Input comes from the named file, or from STDIN when the argument is '-'.
  With --fill_gaps above zero each converted entry is additionally passed
  through GenePredBasics.smooth_gaps using that value before printing.
  """
  parser = argparse.ArgumentParser(description="Convert a psl file into a target formated genepred file.")
  parser.add_argument('--fill_gaps',type=int,default=0,help="Close gaps this size or smaller.")
  parser.add_argument('input_name',help="Input PSL file, use - to indicate STDIN.")
  args = parser.parse_args()

  source = open(args.input_name) if args.input_name != '-' else sys.stdin
  close_gaps = args.fill_gaps > 0
  # the with-block closes the source (STDIN included) once all lines are read
  with source as infile:
    for line in infile:
      gpd_line = PSLBasics.convert_entry_to_genepred_line(PSLBasics.line_to_entry(line))
      if close_gaps:
        smoothed = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(gpd_line),args.fill_gaps)
        gpd_line = GenePredBasics.entry_to_line(smoothed)
      print(gpd_line)
예제 #10
0
def main():
    """Command-line entry point: print a 21-column PSL-style line per genepred entry.

    Input comes from the named file, or STDIN when the argument is '-'.
    Every exon becomes one block; the match count (column 1) is the summed
    exon length and columns 2-8 are written as 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', help="use - for STDIN")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input_file != '-':
        inf = open(args.input_file)
    try:
        for line in inf:
            e = GenePredBasics.line_to_entry(line.rstrip())
            block_sizes = [
                e['exonEnds'][i] - e['exonStarts'][i]
                for i in range(len(e['exonStarts']))
            ]
            matches = sum(block_sizes)
            # Query starts are the running total of the preceding block sizes.
            qstartslist = []
            offset = 0
            for size in block_sizes:
                qstartslist.append(offset)
                offset += size
            fields = [
                str(matches),  # 1
                "0",  # 2
                "0",  # 3
                "0",  # 4
                "0",  # 5
                "0",  # 6
                "0",  # 7
                "0",  # 8
                e['strand'],  # 9
                e['name'],  # 10
                str(matches),  # 11
                "0",  # 12
                str(matches),  # 13
                str(e['chrom']),  # 14
                # 15: filled with the last exon end
                # NOTE(review): presumably a stand-in for the target size - confirm
                str(e['exonEnds'][-1]),
                str(e['exonStarts'][0]),  # 16
                str(e['exonEnds'][-1]),  # 17
                str(len(block_sizes)),  # 18
                ','.join([str(x) for x in block_sizes]) + ',',  # 19
                ','.join([str(x) for x in qstartslist]) + ',',  # 20
                ','.join([str(x) for x in e['exonStarts']]) + ',',  # 21
            ]
            print("\t".join(fields))
    finally:
        # Runs even when a line fails to parse, so the input is never leaked.
        inf.close()
def main():
  """Command-line entry point: turn genepred lines into 21-column PSL-style lines.

  Reads the named file, or STDIN for '-'.  Each exon is emitted as one
  block; column 1 (matches) is the total exon length and columns 2-8 are
  always 0.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('input_file',help="use - for STDIN")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input_file != '-':
    inf = open(args.input_file)
  try:
    for line in inf:
      e = GenePredBasics.line_to_entry(line.rstrip())
      sizes = [e['exonEnds'][i]-e['exonStarts'][i] for i in range(len(e['exonStarts']))]
      matches = sum(sizes)
      # each block starts in the query where the previous blocks end
      qstartslist = [sum(sizes[:k]) for k in range(len(sizes))]
      columns = [str(matches)]            # 1
      columns.extend(["0"]*7)             # 2-8
      columns.append(e['strand'])         # 9
      columns.append(e['name'])           # 10
      columns.append(str(matches))        # 11
      columns.append("0")                 # 12
      columns.append(str(matches))        # 13
      columns.append(str(e['chrom']))     # 14
      # 15: filled with the last exon end
      # NOTE(review): presumably a stand-in for the target size - confirm
      columns.append(str(e['exonEnds'][-1]))
      columns.append(str(e['exonStarts'][0]))   # 16
      columns.append(str(e['exonEnds'][-1]))    # 17
      columns.append(str(len(sizes)))           # 18
      columns.append(','.join([str(x) for x in sizes])+',')             # 19
      columns.append(','.join([str(x) for x in qstartslist])+',')       # 20
      columns.append(','.join([str(x) for x in e['exonStarts']])+',')   # 21
      print("\t".join(columns))
  finally:
    # close the input even if a line could not be parsed
    inf.close()
예제 #12
0
def break_gpdfile(tdir,job_size):
  """Split tdir/longreads.gpd into per-chunk exon BED files.

  The file is read through BigFileBasics.BigFileChunkReader with the chunk
  size set to job_size.  Chunk k (counting from 0) is written to
  tdir/partreads.<k+1>.bed with one line per exon:
    chrom, exon start, exon end, name, gene_name, exon count, strand,
    exon number (counting from 1)

  Args:
    tdir: working directory holding longreads.gpd; also receives the output.
    job_size: value passed to set_chunk_size_bytes.

  Returns:
    The reader's chunk_count, i.e. the number of BED files written.
  """
  bfcr = BigFileBasics.BigFileChunkReader(tdir+'/longreads.gpd')
  bfcr.set_chunk_size_bytes(job_size)
  num_jobs = bfcr.chunk_count
  for chunk_index in range(0,num_jobs):
    oc = bfcr.open_chunk(chunk_index)
    try:
      with open(tdir+'/partreads.'+str(chunk_index+1)+'.bed','w') as of_bed:
        while True:
          line = oc.read_line()
          if not line: break
          entry = GenePredBasics.line_to_entry(line.rstrip("\n"))
          exon_count = len(entry['exonStarts'])
          # a separate index name avoids overwriting the chunk index above
          for exon_index in range(exon_count):
            of_bed.write("\t".join([
              entry['chrom'],
              str(entry['exonStarts'][exon_index]),
              str(entry['exonEnds'][exon_index]),
              entry['name'],
              entry['gene_name'],
              str(exon_count),
              entry['strand'],
              str(exon_index+1)])+"\n")
    finally:
      # release the chunk even if parsing or writing failed
      oc.close()
  return num_jobs
def nudge(psl_entry,gpd_entry,refjun,args):
  """Replace the junctions of gpd_entry with nearby reference junctions.

  For every junction of the entry the closest reference junction inside the
  search window is looked up (smallest summed distance of both ends).  The
  strand whose candidates score higher (sum of 1/(distance+1)) is chosen,
  and only candidates on that strand replace the original junctions.

  Args:
    psl_entry: not used; kept so existing callers keep working.
    gpd_entry: genepred entry dict; exonStarts, exonEnds, gene_name, name,
      chrom, strand, txStart, txEnd, cdsStart and cdsEnd are read.
    refjun: dict of junction start -> dict of junction finish -> strand.
    args: object with an integer search_size attribute.

  Returns:
    gpd_entry itself when it has one exon or no reference junction is
    within reach; otherwise a new entry built by
    GenePredBasics.line_to_entry from the rewritten genepred line.

  Raises:
    SystemExit: with status 1 when candidates exist but none is on the
      chosen strand (e.g. a reference strand that is neither '+' nor '-').
  """
  exon_count = len(gpd_entry['exonStarts'])
  if exon_count == 1:
    return gpd_entry
  # bound = [previous exon end, next exon start + 1, junction index]
  bounds = []
  for i in range(1,exon_count):
    bounds.append([gpd_entry['exonEnds'][i-1],gpd_entry['exonStarts'][i]+1,i-1])
  if not bounds:
    return gpd_entry
  reach = args.search_size
  bestbounds = []
  for bound in bounds:
    best_distance = [10000000,10000000]
    best_result = None
    for z1 in range(bound[0]-reach,bound[0]+reach+1):
      if z1 not in refjun: continue
      d1 = abs(z1-bound[0])
      # NOTE(review): the upper limit extends 2*search_size past the junction
      # finish while the lower limit extends only search_size; kept as found,
      # confirm whether the asymmetry is intended.
      for z2 in range(bound[1]-reach,bound[1]+reach+reach+1):
        if z2 not in refjun[z1]: continue
        d2 = abs(z2-bound[1])
        if d1+d2 < best_distance[0]+best_distance[1]:
          best_distance = [d1,d2]
          # [start, finish, strand, junction index, d1, d2]
          best_result = [z1,z2,refjun[z1][z2],bound[2]]+best_distance
    if best_result:
      bestbounds.append(best_result)
  if not bestbounds:
    #nothing fixable
    return gpd_entry
  #Now we have a list of nudgable bounds
  #Lets pick a strand
  plus_score = 0
  minus_score = 0
  for bound in bestbounds:
    weight = 1/(float(abs(bound[4]))+float(abs(bound[5]))+1)
    if bound[2] == '+':
      plus_score += weight
    else:
      minus_score += weight
  use_strand = '-' if plus_score < minus_score else '+'
  choice_bounds = [bound for bound in bestbounds if bound[2] == use_strand]
  if not choice_bounds:
    # Report on stderr and exit non-zero: stdout carries the data output and
    # a bare sys.exit() would signal success.
    sys.stderr.write("ERROR  should have choices\n")
    sys.exit(1)
  replacements = {}
  for bound in choice_bounds:
    replacements[bound[3]] = [bound[0],bound[1]]
  junctions = []
  for i in range(len(bounds)):
    val = replacements.get(i,bounds[i])
    junctions.append([val[0],val[1]])
  # Rebuild the exon lists: junction starts become exon ends, and junction
  # finishes (minus the +1 added above) become exon starts.
  exon_starts = [gpd_entry['txStart']]+[junc[1]-1 for junc in junctions]
  exon_ends = [junc[0] for junc in junctions]+[gpd_entry['txEnd']]
  new_gpd_line = "\t".join([
    gpd_entry['gene_name'],
    gpd_entry['name'],
    gpd_entry['chrom'],
    gpd_entry['strand'],
    str(gpd_entry['txStart']),
    str(gpd_entry['txEnd']),
    str(gpd_entry['cdsStart']),
    str(gpd_entry['cdsEnd']),
    str(len(junctions)+1),
    ','.join([str(x) for x in exon_starts])+',',
    ','.join([str(x) for x in exon_ends])+','])+"\t"
  return GenePredBasics.line_to_entry(new_gpd_line)
def main():
    """Command-line entry point: move PSL junctions onto nearby reference junctions.

    The reference genepred is read first and every junction of its
    multi-exon entries is recorded per chromosome.  Each PSL line is then
    converted to a genepred entry, gap-smoothed with --min_intron_size,
    passed through nudge() and printed, either as a genepred line or, when
    --output_fake_psl names a genome FASTA, as a fake PSL line.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    # NOTE(review): --min_local_support is parsed but not referenced in this
    # function.
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help=
        "INT min number of junctions within search_size of a junction in order to count it"
    )
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    #read in the reference genepred first
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    #ref[chrom][previous exon end][next exon start + 1] = strand
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        if len(e['exonStarts']) <= 1: continue
        chrom_junctions = ref.setdefault(e['chrom'], {})
        for i in range(1, len(e['exonStarts'])):
            finishes = chrom_junctions.setdefault(e['exonEnds'][i - 1], {})
            # setdefault keeps the strand of the first entry seen for a junction
            finishes.setdefault(e['exonStarts'][i] + 1, e['strand'])
    #Stored all junctions as 1-base

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line: break
            if line.startswith('#'): continue
            pe = PSLBasics.line_to_entry(line.rstrip())
            block_count = len(pe['blockSizes'])
            if len(pe['tStarts']) != block_count or len(
                    pe['qStarts']) != block_count:
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            # no reference junctions on this chromosome -> empty table
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # the reader used to be left open
        pf.close()
def nudge(psl_entry, gpd_entry, refjun, args):
    """Replace the junctions of gpd_entry with nearby reference junctions.

    Each junction of the entry is matched to the closest reference junction
    inside the search window (smallest summed distance of both ends).  The
    strand with the higher candidate score (sum of 1/(distance+1)) wins and
    only candidates on that strand replace the original junctions.

    Args:
        psl_entry: not used; kept so existing callers keep working.
        gpd_entry: genepred entry dict; exonStarts, exonEnds, gene_name,
            name, chrom, strand, txStart, txEnd, cdsStart and cdsEnd are
            read.
        refjun: dict of junction start -> dict of junction finish -> strand.
        args: object with an integer search_size attribute.

    Returns:
        gpd_entry itself when it has one exon or no reference junction is
        within reach; otherwise a new entry built by
        GenePredBasics.line_to_entry from the rewritten genepred line.

    Raises:
        SystemExit: with status 1 when candidates exist but none is on the
            chosen strand (e.g. a reference strand other than '+' or '-').
    """
    exon_count = len(gpd_entry['exonStarts'])
    if exon_count == 1:
        return gpd_entry
    # bound = [previous exon end, next exon start + 1, junction index]
    bounds = [[
        gpd_entry['exonEnds'][i - 1], gpd_entry['exonStarts'][i] + 1, i - 1
    ] for i in range(1, exon_count)]
    if not bounds:
        return gpd_entry
    reach = args.search_size
    bestbounds = []
    for bound in bounds:
        best_distance = [10000000, 10000000]
        best_result = None
        for z1 in range(bound[0] - reach, bound[0] + reach + 1):
            if z1 not in refjun:
                continue
            d1 = abs(z1 - bound[0])
            # NOTE(review): the upper limit extends 2*search_size past the
            # junction finish while the lower limit extends only
            # search_size; kept as found, confirm the asymmetry is intended.
            for z2 in range(bound[1] - reach, bound[1] + reach + reach + 1):
                if z2 not in refjun[z1]:
                    continue
                d2 = abs(z2 - bound[1])
                if d1 + d2 < best_distance[0] + best_distance[1]:
                    best_distance = [d1, d2]
                    # [start, finish, strand, junction index, d1, d2]
                    best_result = [z1, z2, refjun[z1][z2], bound[2]
                                   ] + best_distance
        if best_result:
            bestbounds.append(best_result)
    if not bestbounds:
        #nothing fixable
        return gpd_entry
    #Now we have a list of nudgable bounds
    #Lets pick a strand
    plus_score = 0
    minus_score = 0
    for bound in bestbounds:
        weight = 1 / (float(abs(bound[4])) + float(abs(bound[5])) + 1)
        if bound[2] == '+':
            plus_score += weight
        else:
            minus_score += weight
    use_strand = '-' if plus_score < minus_score else '+'
    choice_bounds = [b for b in bestbounds if b[2] == use_strand]
    if not choice_bounds:
        # Report on stderr and exit non-zero: stdout carries the data output
        # and a bare sys.exit() would signal success.
        sys.stderr.write("ERROR  should have choices\n")
        sys.exit(1)
    replacements = {}
    for bound in choice_bounds:
        replacements[bound[3]] = [bound[0], bound[1]]
    junctions = []
    for i in range(len(bounds)):
        val = replacements.get(i, bounds[i])
        junctions.append([val[0], val[1]])
    # Rebuild the exon lists: junction starts become exon ends, and junction
    # finishes (minus the +1 added above) become exon starts.
    exon_starts = [gpd_entry['txStart']] + [junc[1] - 1 for junc in junctions]
    exon_ends = [junc[0] for junc in junctions] + [gpd_entry['txEnd']]
    new_gpd_line = "\t".join([
        gpd_entry['gene_name'],
        gpd_entry['name'],
        gpd_entry['chrom'],
        gpd_entry['strand'],
        str(gpd_entry['txStart']),
        str(gpd_entry['txEnd']),
        str(gpd_entry['cdsStart']),
        str(gpd_entry['cdsEnd']),
        str(len(junctions) + 1),
        ','.join([str(x) for x in exon_starts]) + ',',
        ','.join([str(x) for x in exon_ends]) + ',',
    ]) + "\t"
    return GenePredBasics.line_to_entry(new_gpd_line)