def get_gap_or_overlap_from_psl_file(psl_filename):
  """Measure the query-coordinate gap/overlap of each pair of PSL lines.

  The file is consumed two lines at a time.  Both lines of a pair must carry
  the same fusion ID: the ``F<digits>`` token found after a ``|`` and before
  a ``+`` or ``-`` in the entry's ``qName``.

  Args:
    psl_filename: path of the PSL file to read.

  Returns:
    dict keyed by fusion ID.  Each value is a dict with the keys
    'base_overlap', 'region_overlap' and 'gap', holding the results of the
    matching psl_basics.query_coordinates_* helper for that pair.  When a
    fusion ID occurs in several pairs, the last pair wins.

  Raises:
    SystemExit: with a non-zero status when a query name has no fusion ID
      or when the two lines of a pair disagree on it.
  """
  # Compiled once instead of once per pair; the raw string keeps the regex
  # escapes away from the string-literal parser.
  fusion_id_pattern = re.compile(r'\|(F\d+)[+-]')

  def fail(message):
    # Report on stderr and exit non-zero so a calling pipeline sees the error.
    sys.stderr.write(message + "\n")
    sys.exit(1)

  def fusion_id(entry):
    found = fusion_id_pattern.search(entry['qName'])
    if not found:
      fail("problem reading query name from psl file")
    return found.group(1)

  res = {}
  with open(psl_filename) as infile:
    while True:
      # Reading stops at end of file, at a blank line, or when the final
      # line has no partner.
      line1 = infile.readline().strip()
      if not line1:
        break
      line2 = infile.readline().strip()
      if not line2:
        break
      e1 = psl_basics.read_psl_entry(line1)
      e2 = psl_basics.read_psl_entry(line2)
      e1name = fusion_id(e1)
      e2name = fusion_id(e2)
      if e1name != e2name:
        fail("problem: consecutive lines should have the same fusion ID")
      res[e1name] = {
        'base_overlap': psl_basics.query_coordinates_base_overlap_size(e1, e2),
        'region_overlap': psl_basics.query_coordinates_region_overlap_size(e1, e2),
        'gap': psl_basics.query_coordinates_gap_size(e1, e2),
      }
  return res
Ejemplo n.º 2
0
def get_gap_or_overlap_from_psl_file(psl_filename):
  """Report gap and overlap sizes for every fusion pair in a PSL file.

  Lines are read in pairs.  The fusion ID of a line is the ``F<digits>``
  token that follows a ``|`` and is followed by ``+`` or ``-`` inside the
  entry's ``qName``; the two lines of a pair have to share it.

  Args:
    psl_filename: path of the PSL file to read.

  Returns:
    dict mapping fusion ID to a dict with the keys 'base_overlap',
    'region_overlap' and 'gap' (the values returned by the corresponding
    psl_basics.query_coordinates_* helpers).  A later pair with the same
    fusion ID replaces an earlier one.

  Raises:
    SystemExit: non-zero status if a fusion ID cannot be read from a query
      name, or if the two lines of a pair have different fusion IDs.
  """
  # Hoisted out of the loop: the pattern never changes between pairs.
  prog = re.compile(r'\|(F\d+)[+-]')

  def abort(message):
    # Errors go to stderr with a failing exit status instead of exiting 0.
    sys.stderr.write(message + "\n")
    sys.exit(1)

  def read_fusion_id(entry):
    m = prog.search(entry['qName'])
    if not m:
      abort("problem reading query name from psl file")
    return m.group(1)

  res = {}
  with open(psl_filename) as infile:
    while True:
      line1 = infile.readline().strip()
      if not line1:
        break  # end of file or blank line
      line2 = infile.readline().strip()
      if not line2:
        break  # last line has no partner; it is ignored
      e1 = psl_basics.read_psl_entry(line1)
      e2 = psl_basics.read_psl_entry(line2)
      e1name = read_fusion_id(e1)
      e2name = read_fusion_id(e2)
      if e1name != e2name:
        abort("problem: consecutive lines should have the same fusion ID")
      v = {}
      v['base_overlap'] = psl_basics.query_coordinates_base_overlap_size(e1, e2)
      v['region_overlap'] = psl_basics.query_coordinates_region_overlap_size(e1, e2)
      v['gap'] = psl_basics.query_coordinates_gap_size(e1, e2)
      res[e1name] = v
  return res
def main():
  """Print the longest smoothed alignment block of every PSL entry.

  Usage: <script> <psl filename> [smoothing parameter]

  The smoothing parameter defaults to 10 and is only read when exactly two
  arguments are given.  Each PSL line is converted to a genepred entry and
  passed through genepred_basics.smooth_gaps; the longest resulting block
  (first one on ties) is printed as a tab separated line:
  tName, start, end, qName, qSize, tSize, block length.
  """
  argument_count = len(sys.argv)
  if argument_count < 2:
    print(sys.argv[0] + " <psl filename> <smoothing parameter (default 10)>")
    sys.exit()
  smoothing = int(sys.argv[2]) if argument_count == 3 else 10
  psl_filename = sys.argv[1]
  with open(psl_filename) as fh:
    for raw_line in fh:
      psl_entry = psl_basics.read_psl_entry(raw_line.rstrip())
      gpd_entry = genepred_basics.genepred_line_to_dictionary(
        psl_basics.convert_entry_to_genepred_line(psl_entry))
      smoothed = genepred_basics.smooth_gaps(gpd_entry, smoothing)
      # Keep the longest block; a later block only wins if strictly longer.
      longest_exon, best_start, best_end = 0, 0, 0
      for i in range(smoothed['exonCount']):
        block_end = smoothed['exonEnds'][i]
        block_start = smoothed['exonStarts'][i]
        block_length = block_end - block_start
        if block_length > longest_exon:
          longest_exon, best_start, best_end = block_length, block_start, block_end
      columns = [
        psl_entry['tName'],
        str(best_start),
        str(best_end),
        psl_entry['qName'],
        str(psl_entry['qSize']),
        str(psl_entry['tSize']),
        str(longest_exon),
      ]
      print("\t".join(columns))
def make_best_continuous_alignment_bed(alignmentfname,bestalignmentbedfname):
  """Write the longest smoothed alignment block per (name, chrom) pair.

  Every line of the PSL file that does not start with ``#`` is converted to
  a genepred entry and smoothed with genepred_basics.smooth_gaps using a
  size of 10.  For each combination of entry name and chromosome the longest
  block is kept (the first one wins ties) together with the target size,
  query size and match count of the PSL line it came from.

  Output lines are tab separated:
  chrom, start, end, name, query size, target size, block length, matches.

  Args:
    alignmentfname: path of the PSL file to read.
    bestalignmentbedfname: path of the file to write (truncated on open).

  Returns:
    None.
  """
  bestcont = {}
  # The output used to be opened and never closed; the with-block closes
  # (and flushes) it on every exit path.  It is still opened before the
  # input, exactly as before.
  with open(bestalignmentbedfname,'w') as of:
    with open(alignmentfname) as f:
      for line in f:
        if line.startswith('#'): continue
        pe = psl_basics.read_psl_entry(line)
        gl = psl_basics.convert_entry_to_genepred_line(pe)
        ge = genepred_basics.genepred_line_to_dictionary(gl)
        ges = genepred_basics.smooth_gaps(ge,10)
        for i in range(len(ges['exonStarts'])):
          exonlen = ges['exonEnds'][i]-ges['exonStarts'][i]
          per_chrom = bestcont.setdefault(ges['name'], {})
          if ges['chrom'] not in per_chrom:
            per_chrom[ges['chrom']] = {
              'bestlen': 0,
              'start': 0,
              'end': 0,
              'target_length': 0,
              'query_length': 0,
              'matches': 0,
            }
          best = per_chrom[ges['chrom']]
          if exonlen > best['bestlen']:
            best['bestlen'] = exonlen
            best['start'] = ges['exonStarts'][i]
            best['end'] = ges['exonEnds'][i]
            best['target_length'] = pe['tSize']
            best['query_length'] = pe['qSize']
            best['matches'] = pe['matches']
    for read in bestcont:
      for tx in bestcont[read]:
        best = bestcont[read][tx]
        of.write("\t".join([
          tx,
          str(best['start']),
          str(best['end']),
          read,
          str(best['query_length']),
          str(best['target_length']),
          str(best['bestlen']),
          str(best['matches']),
        ]) + "\n")
  return
Ejemplo n.º 5
0
def main():
  """Print every alignment block of a PSL file as a four-column line.

  Usage: <script> <psl filename> [smoothing size]

  Each PSL line is converted to a genepred entry.  When exactly two
  arguments are given and the smoothing size is positive, gaps are smoothed
  with genepred_basics.smooth_gaps first.  One tab separated line is
  printed per block: chrom, exonStart + 1, exonEnd, gene_name.
  """
  argv = sys.argv
  if len(argv) < 2:
    print(argv[0] + ' <psl filename> <smoothing size (optional)>')
    sys.exit()
  pslfilename = argv[1]
  smooth_size = int(argv[2]) if len(argv) == 3 else 0
  with open(pslfilename) as infile:
    for psl_line in infile:
      parsed = psl_basics.read_psl_entry(psl_line)
      as_genepred = psl_basics.convert_entry_to_genepred_line(parsed)
      genepred_entry = genepred_basics.genepred_line_to_dictionary(as_genepred)
      if smooth_size > 0:
        genepred_entry = genepred_basics.smooth_gaps(genepred_entry,smooth_size)
      for idx, block_start in enumerate(genepred_entry['exonStarts']):
        # start is shifted by one for output
        columns = [
          genepred_entry['chrom'],
          str(block_start+1),
          str(genepred_entry['exonEnds'][idx]),
          genepred_entry['gene_name'],
        ]
        print("\t".join(columns))
Ejemplo n.º 6
0
def make_fusion_genepred(psl_filename, min_gap_in_block_size, outfile):
    """Convert a fusion PSL file to smoothed genepred lines, grouped by fusion.

    Lines alternate between the two sides of a fusion: even-numbered lines
    (0-based) fill the left slot, odd-numbered lines the right slot.  Each
    line is converted to a genepred entry, its cds range is set to its tx
    range, its gaps are smoothed with genepred_basics.smooth_gaps, and the
    length field of its gene name is replaced by the summed block length
    after smoothing.  The resulting genepred line is written to ``outfile``.

    Args:
        psl_filename: path of the PSL file to read.
        min_gap_in_block_size: smoothing size handed to smooth_gaps.
        outfile: path of the genepred file to write (truncated on open).

    Returns:
        dict mapping fusion ID (``F<digits>``) to a two-item list
        ``[left, right]``.  A filled slot is ``[genepred dict, sign]`` where
        sign is the ``+``/``-`` that follows the fusion ID in the gene name;
        a slot that was never filled is ``0``.

    Raises:
        AttributeError: if a gene name does not match the expected layout
            (the regex match is None).
    """
    # Compiled once; raw strings keep the regex escapes intact.
    fusion_id_pattern = re.compile(r'\|(F[\d]+)([+-])')
    gene_name_pattern = re.compile(r'^([\d\.]+_)\d+(\|F\d+[+-].*)$')

    fusion_gpd = {}
    # with-blocks close the output even if a line fails to parse.
    with open(outfile, 'w') as ofile:
        with open(psl_filename) as infile:
            for i, line in enumerate(infile):
                m = i % 2  # m == 0 is left, m == 1 is right
                entry = psl_basics.read_psl_entry(line)
                gline = psl_basics.convert_entry_to_genepred_line(entry)
                e = genepred_basics.genepred_line_to_dictionary(gline)
                look = fusion_id_pattern.search(e['gene_name'])
                fid = look.group(1)
                fsidesign = look.group(2)  # which side is the fusion on

                e['cdsStart'] = e['txStart']
                e['cdsEnd'] = e['txEnd']

                f = genepred_basics.smooth_gaps(e, min_gap_in_block_size)

                # get new transcript length
                newlen = sum(f['exonEnds'][j] - f['exonStarts'][j]
                             for j in range(f['exonCount']))

                # make a new name based on the new length
                look = gene_name_pattern.match(f['gene_name'])
                f['gene_name'] = look.group(1) + str(newlen) + look.group(2)

                gpline = genepred_basics.genepred_entry_to_genepred_line(f)
                ofile.write(gpline + "\n")
                # Each fusion ID gets its own slot list.  Previously a
                # fusion ID seen again after other IDs was rebound to the
                # most recently created list, aliasing another fusion.
                slots = fusion_gpd.setdefault(fid, [0, 0])
                # send back the genepred and the side the fusion is on
                slots[m] = [f, fsidesign]
    return fusion_gpd
def make_fusion_genepred(psl_filename,min_gap_in_block_size,outfile):
  """Write smoothed genepred lines for a fusion PSL file and group them.

  The PSL lines alternate left/right: line 0, 2, 4, ... go to slot 0 (left)
  and line 1, 3, 5, ... to slot 1 (right).  For each line the genepred
  entry gets cdsStart/cdsEnd copied from txStart/txEnd, is smoothed with
  genepred_basics.smooth_gaps, and has the length field of its gene name
  replaced by the total block length after smoothing before it is written
  to ``outfile``.

  Args:
    psl_filename: path of the PSL file to read.
    min_gap_in_block_size: smoothing size handed to smooth_gaps.
    outfile: path of the genepred file to write (truncated on open).

  Returns:
    dict: fusion ID -> ``[left, right]``; a filled slot holds
    ``[genepred dict, '+' or '-']``, an unfilled slot holds ``0``.

  Raises:
    AttributeError: if a gene name does not have the expected layout.
  """
  fid_re = re.compile(r'\|(F[\d]+)([+-])')
  rename_re = re.compile(r'^([\d\.]+_)\d+(\|F\d+[+-].*)$')
  i = 0
  fusion_gpd = {}
  # The output handle is now released even when a line raises.
  with open(outfile,'w') as ofile:
    with open(psl_filename) as infile:
      for line in infile:
        m = i % 2  # m == 0 is left, m == 1 is right
        i += 1
        entry = psl_basics.read_psl_entry(line)
        gline = psl_basics.convert_entry_to_genepred_line(entry)
        e = genepred_basics.genepred_line_to_dictionary(gline)
        look = fid_re.search(e['gene_name'])
        fid = look.group(1)
        fsidesign = look.group(2) #which side is the fusion on

        e['cdsStart'] = e['txStart']
        e['cdsEnd'] = e['txEnd']

        f = genepred_basics.smooth_gaps(e,min_gap_in_block_size)

        # get new transcript length
        newlen = 0
        for j in range(f['exonCount']):
          newlen += f['exonEnds'][j] - f['exonStarts'][j]

        # make a new name based on the new length
        look = rename_re.match(f['gene_name'])
        f['gene_name'] = look.group(1) + str(newlen) + look.group(2)

        gpline = genepred_basics.genepred_entry_to_genepred_line(f)
        ofile.write(gpline+"\n")
        # Create the slot list only for a new fusion ID and otherwise reuse
        # that ID's own list; the old code reassigned whichever list had
        # been created last, which could belong to a different fusion.
        if fid not in fusion_gpd:
          fusion_gpd[fid] = [0,0]
        fusion_gpd[fid][m] = [f, fsidesign] #send back the genepred and the side the fusion is on
  return fusion_gpd
def main():
    """Dump the alignment blocks of a PSL file, one line per block.

    Usage: <script> <psl filename> [smoothing size]

    Every PSL line becomes a genepred entry; if exactly two arguments were
    given and the smoothing size is above zero, the entry is first passed
    through genepred_basics.smooth_gaps.  For each block a tab separated
    line is printed: chrom, exonStart + 1, exonEnd, gene_name.
    """
    if len(sys.argv) < 2:
        print(sys.argv[0] + ' <psl filename> <smoothing size (optional)>')
        sys.exit()
    pslfilename = sys.argv[1]
    smooth_size = 0 if len(sys.argv) != 3 else int(sys.argv[2])
    with open(pslfilename) as infile:
        for line in infile:
            genepred_entry = genepred_basics.genepred_line_to_dictionary(
                psl_basics.convert_entry_to_genepred_line(
                    psl_basics.read_psl_entry(line)))
            if smooth_size > 0:
                genepred_entry = genepred_basics.smooth_gaps(
                    genepred_entry, smooth_size)
            block_count = len(genepred_entry['exonStarts'])
            for block in range(block_count):
                shifted_start = genepred_entry['exonStarts'][block] + 1
                block_end = genepred_entry['exonEnds'][block]
                print("\t".join((genepred_entry['chrom'],
                                 str(shifted_start),
                                 str(block_end),
                                 genepred_entry['gene_name'])))
def make_fusion_genepred(psl_filename,fid_coords,min_gap_in_block_size,outfile):
  """Write genepred lines whose fusion ends are snapped to given coordinates.

  PSL lines alternate left/right (even 0-based line numbers are 'left', odd
  ones 'right').  Lines whose fusion ID is not a key of ``fid_coords`` are
  skipped but still advance the left/right counter.  For the remaining
  lines the genepred entry is extended or trimmed so that its fusion-side
  end sits on the supplied coordinate, cdsStart/cdsEnd are set to
  txStart/txEnd, gaps are smoothed with genepred_basics.smooth_gaps, and
  the gene name is rewritten with the new length and new fusion position.

  Args:
    psl_filename: path of the PSL file to read.
    fid_coords: dict; ``fid_coords[fid]['left']`` and ``['right']`` are
      three-item sequences whose second item is the junction coordinate
      (converted with ``int``).  The other two items are not used here.
    min_gap_in_block_size: smoothing size handed to smooth_gaps.
    outfile: path of the genepred file to write (truncated on open).

  Returns:
    dict: fusion ID -> ``[left, right]``; a filled slot holds
    ``[genepred dict, '+' or '-']``, an unfilled slot holds ``0``.

  Raises:
    AttributeError: if a gene name does not have the expected layout.
  """
  fid_re = re.compile(r'\|(F[\d]+)([+-])')
  rename_re = re.compile(r'^([\d\.]+_)\d+(\|F\d+[+-])\d+(\|.*)$')
  fusion_gpd = {}
  # with-blocks guarantee the output is closed if a line raises.
  with open(outfile,'w') as ofile:
    with open(psl_filename) as infile:
      for i, line in enumerate(infile):
        m = i % 2  # m == 0 is left, m == 1 is right
        entry = psl_basics.read_psl_entry(line)
        gline = psl_basics.convert_entry_to_genepred_line(entry)
        d = genepred_basics.genepred_line_to_dictionary(gline)
        look = fid_re.search(d['gene_name'])
        fid = look.group(1)
        fsidesign = look.group(2) #which side is the fusion on
        if fid not in fid_coords: continue # we only are working on the ones that met criteria for short reads

        #fix our end points based on short read defined fusions
        side = 'right' if m == 1 else 'left'
        # Only the coordinate is needed; the other two items are unpacked
        # under throwaway names so the builtins chr/dir are not shadowed.
        _chrom, coord, _direction = fid_coords[fid][side]
        coord = int(coord)

        if fsidesign == '-':
          # end point is on the right side of the alignment
          last_end = d['exonEnds'][d['exonCount']-1]
          if coord == last_end:
            e = d
          elif coord > last_end:
            e = genepred_basics.right_extend_genepred(d,coord)
          else:
            e = genepred_basics.right_trim_genepred(d,coord)
          newpos = e['txEnd']
        else:
          # end point is on the left side of the alignment
          first_start = d['exonStarts'][0]
          if coord-1 == first_start:
            e = d
          # elif (was a second `if`): without it the unchanged case fell
          # through to the trim branch, unlike the right-hand side above.
          elif coord-1 < first_start:
            e = genepred_basics.left_extend_genepred(d,coord-1)
          else:
            e = genepred_basics.left_trim_genepred(d,coord-1)
          newpos = e['txStart']
        # we treat cdsStart and cdsEnd the same as tx
        e['cdsStart'] = e['txStart']
        e['cdsEnd'] = e['txEnd']

        f = genepred_basics.smooth_gaps(e,min_gap_in_block_size)

        # get new transcript length
        newlen = 0
        for j in range(f['exonCount']):
          newlen += f['exonEnds'][j] - f['exonStarts'][j]

        # make a new name based on the new fusion site and new length
        look = rename_re.match(f['gene_name'])
        f['gene_name'] = (look.group(1) + str(newlen) + look.group(2)
                          + str(newpos) + look.group(3))

        gpline = genepred_basics.genepred_entry_to_genepred_line(f)
        ofile.write(gpline+"\n")
        # Reuse this fusion's own slot list; the old code rebound the entry
        # to the most recently created list, which could belong to another
        # fusion ID.
        slots = fusion_gpd.setdefault(fid, [0,0])
        slots[m] = [f, fsidesign] #send back the genepred and the side the fusion is on
  return fusion_gpd
Ejemplo n.º 10
0
def process_temp_list(temp_list, gname_list, gene_start_pos_list,
                      gene_end_pos_list, gene_max_end_pos_list):
    """Pick the best single alignment of a read and emit fusion candidates.

    ``temp_list`` holds the alignments of one read as lists of PSL column
    strings.  Column 0 is the match count, 13 the target name and 15/16 the
    target start/end; column 10 is used as the denominator of the score
    (qSize in the standard PSL layout -- confirm against the caller).

    The best single alignment is the row with the highest
    ``col0 / col10``.  Every pair of rows is then tested as a fusion
    candidate and written to the module-level ``fusion_psl`` handle when

    * both rows have at least ``fusion_segment_len_threshold`` matches,
    * on the same target they either map to known genes without sharing
      one, or (when either has no gene) are at least ``L_junction_limit``
      apart,
    * their query overlap is at most ``fusion_overlap_threshold`` and leaves
      both segments long enough, or -- without overlap -- their query gap is
      at most ``fusion_gap_threshold``, and
    * the combined score beats the best single score.

    Args:
        temp_list: list of PSL rows (lists of strings) for one read.
        gname_list, gene_start_pos_list, gene_end_pos_list,
        gene_max_end_pos_list: gene annotation lookups, passed through to
            ``get_gnames`` unchanged.

    Returns:
        ``""`` when at least one pair was written (the best single row is
        then written to ``fusion_psl_single``); otherwise the best single
        row joined by tabs, or ``""`` when no row scored above zero.
    """
    ref_stat = 0
    # Bound up front: an empty temp_list, or rows that all score zero, used
    # to reach the return below with ref_str undefined (UnboundLocalError).
    ref_str = ""

    # Select the best row from single alignments
    for result_ls in temp_list:
        stat = float(result_ls[0]) / float(result_ls[10])
        if stat > ref_stat:
            ref_stat = stat
            ref_str = '\t'.join(result_ls)

    fusion_flag = False
    # Select candidate pairs from two alignments
    temp_list_len = len(temp_list)
    for idx in range(temp_list_len):
        row_1 = temp_list[idx]
        # Check number of matches before doing the gene lookup, so rows
        # that are discarded anyway cost nothing.
        if int(row_1[0]) < fusion_segment_len_threshold:
            continue
        gnames_1 = get_gnames(row_1[13], int(row_1[15]), int(row_1[16]),
                              gname_list, gene_start_pos_list,
                              gene_end_pos_list, gene_max_end_pos_list)
        entry1 = psl_basics.read_psl_entry("\t".join(row_1))
        for idx_2 in range(idx + 1, temp_list_len):
            row_2 = temp_list[idx_2]
            # Check number of matches
            if int(row_2[0]) < fusion_segment_len_threshold:
                continue
            if row_1[13] == row_2[13]:
                gnames_2 = get_gnames(row_2[13], int(row_2[15]),
                                      int(row_2[16]), gname_list,
                                      gene_start_pos_list, gene_end_pos_list,
                                      gene_max_end_pos_list)
                # Check if they share same gene locus
                if (len(gnames_1) > 0) and (len(gnames_2) > 0):
                    if len(gnames_1 & gnames_2) != 0:
                        continue
                else:
                    tstart = max(int(row_1[15]), int(row_2[15]))
                    tend = min(int(row_1[16]), int(row_2[16]))
                    # Note: introns shorter than L_junction_limit are
                    # expected not to be split by blat
                    if (tstart - tend) < L_junction_limit:
                        continue
            # Note: query coordinates have special handling in neg strands
            # (see the PSL format description on the UCSC genome website)
            entry2 = psl_basics.read_psl_entry("\t".join(row_2))
            baseoverlap = psl_basics.query_coordinates_base_overlap_size(
                entry1, entry2)
            basegap = psl_basics.query_coordinates_gap_size(entry1, entry2)
            if baseoverlap:
                if baseoverlap > fusion_overlap_threshold:
                    continue
                qlen = sum(entry1['blockSizes']) - baseoverlap
                qlen_2 = sum(entry2['blockSizes']) - baseoverlap
                if ((qlen < fusion_segment_len_threshold)
                        or (qlen_2 < fusion_segment_len_threshold)):
                    continue
            elif basegap > fusion_gap_threshold:
                continue
            stat = (float(row_1[0]) + float(row_2[0])) / float(row_1[10])
            if stat > ref_stat:
                fusion_psl.write('\t'.join(row_1) + '\n')
                fusion_psl.write('\t'.join(row_2) + '\n')
                fusion_flag = True

    if fusion_flag:
        fusion_psl_single.write(ref_str + '\n')
        return ""
    return ref_str
Ejemplo n.º 11
0
def main():
    """Gather the long and short reads that support one fusion junction.

    Command line:
        <IDP temp dir> <junction (i.e. chr1:1000-/chr2:1000+)> <range> <output dir>

    Inputs, all below the IDP temp dir: ``LR.psl_fusion_pair_filtered``,
    ``uniqueness/fusion_read.map``, ``uniqueness/unmapped_shortread.fa`` and
    ``uniqueness/txn.map``.

    Outputs, all in the output dir (created when missing):
    ``fusion_coordiante.txt``, ``long_read_query_locations.txt``,
    ``long_read_left.bed``/``.gpd``, ``long_read_right.bed``/``.gpd``,
    ``short_read_left.bed`` and ``short_read_right.bed``.  Progress
    messages are printed on stdout.

    Long reads are taken pairwise from the PSL file and kept when
    ``check_coordiantes`` accepts the pair in the given or the reversed
    orientation.  Short reads are kept when their map location matches the
    junction exactly (either orientation), they matched only once, they are
    listed in the unmapped FASTA file, and they are absent from txn.map.

    Raises:
        SystemExit: on a wrong argument count (usage is printed) or, with a
            non-zero status, when the junction argument cannot be parsed.
    """
    if len(sys.argv) != 5:
        print(sys.argv[0] + ' <IDP temp dir> <junction (i.e. chr1:1000-/chr2:1000+)> <range> <output dir>')
        sys.exit()
    inputdir = sys.argv[1].rstrip('/')
    psl_filename = inputdir + '/' + 'LR.psl_fusion_pair_filtered'
    map_filename = inputdir + '/uniqueness/fusion_read.map'
    fasta_filename = inputdir + '/uniqueness/unmapped_shortread.fa'
    txn_filename = inputdir + '/uniqueness/txn.map'
    junction_abbreviation = sys.argv[2]
    myrange = int(sys.argv[3])
    outdir = sys.argv[4].rstrip('/')
    if not os.path.isdir(outdir):
        print("make directory " + outdir)
        os.mkdir(outdir)
    with open(outdir + "/fusion_coordiante.txt", 'w') as of1:
        of1.write(junction_abbreviation + "\t" + str(myrange) + "\n")
    m = re.match(r'([^:]+):(\d+)([+-])\/([^:]+):(\d+)([+-])',
                 junction_abbreviation)
    if not m:
        # Used to surface as AttributeError on m.group(); fail clearly.
        sys.stderr.write("could not parse junction " + junction_abbreviation
                         + " (expected something like chr1:1000-/chr2:1000+)\n")
        sys.exit(1)
    chr1 = m.group(1)
    coo1 = int(m.group(2))
    dir1 = m.group(3)
    chr2 = m.group(4)
    coo2 = int(m.group(5))
    dir2 = m.group(6)

    def smoothed_genepred(psl_entry):
        # PSL entry -> genepred dict with gaps smoothed at size 30.
        return genepred_basics.smooth_gaps(
            genepred_basics.genepred_line_to_dictionary(
                psl_basics.convert_entry_to_genepred_line(psl_entry)), 30)

    def write_blocks(handle, gpd, read_name):
        # One line (start shifted by one) per block of at least 30 bases.
        for i in range(gpd['exonCount']):
            if gpd["exonEnds"][i] - gpd["exonStarts"][i] >= 30:
                handle.write(gpd["chrom"] + "\t" + str(gpd["exonStarts"][i] + 1)
                             + "\t" + str(gpd["exonEnds"][i]) + "\t"
                             + read_name + "\n")

    # Work through psl file for long reads.  All handles are closed by the
    # with-statement, also when a line fails to parse.
    lrcnt = 0
    with open(outdir + "/long_read_query_locations.txt", 'w') as of, \
            open(outdir + "/long_read_left.bed", 'w') as oleft, \
            open(outdir + "/long_read_right.bed", 'w') as oright, \
            open(outdir + "/long_read_left.gpd", 'w') as oleftgpd, \
            open(outdir + "/long_read_right.gpd", 'w') as orightgpd, \
            open(psl_filename) as psl_file:
        while True:
            l1 = psl_file.readline().rstrip()
            if not l1:
                break
            l2 = psl_file.readline().rstrip()
            if not l2:
                break
            e1 = psl_basics.read_psl_entry(l1)
            e2 = psl_basics.read_psl_entry(l2)
            g1 = smoothed_genepred(e1)
            g2 = smoothed_genepred(e2)
            if check_coordiantes(e1, e2, chr1, coo1, dir1, chr2, coo2, dir2,
                                 myrange):
                left_gpd, right_gpd = g1, g2
                first_dir, second_dir = dir1, dir2
            elif check_coordiantes(e1, e2, chr2, coo2, opposite(dir2), chr1,
                                   coo1, opposite(dir1), myrange):
                # Pair is in the reversed orientation: swap the sides.
                left_gpd, right_gpd = g2, g1
                first_dir, second_dir = opposite(dir2), opposite(dir1)
            else:
                continue
            oleftgpd.write(
                genepred_basics.genepred_entry_to_genepred_line(left_gpd) + "\n")
            orightgpd.write(
                genepred_basics.genepred_entry_to_genepred_line(right_gpd) + "\n")
            lrcnt += 1
            of.write(e1["qName"] + "\t" + str(e1['qStart'] + 1) + "\t"
                     + str(e1['qEnd']) + "\t" + first_dir + "\t"
                     + str(e2['qStart'] + 1) + "\t" + str(e2['qEnd']) + "\t"
                     + second_dir + "\n")
            # Both block files are labelled with the first line's query name.
            write_blocks(oleft, left_gpd, e1["qName"])
            write_blocks(oright, right_gpd, e1["qName"])
    print(str(lrcnt) + " long reads found supporting the fusion")

    # Work through fusion read map for short reads
    chrom_pair_re = re.compile(r'^([^:]+):.*\/([^:]+):')
    coords_re = re.compile(
        r'[,:](-?\d+)-(-?\d+)([+-])\/[^:]+:(-?\d+)-(-?\d+),?.*([+-])')
    rnames = {}
    seenhit = {}
    with open(map_filename) as inf:
        for line in inf:
            fields = line.rstrip().split("\t")
            srname = fields[0]
            loc = fields[2]
            m = chrom_pair_re.search(loc)
            srchr1 = m.group(1)
            srchr2 = m.group(2)
            m = coords_re.search(loc)
            srcoo1start = int(m.group(1))
            srcoo1finish = int(m.group(2))
            srdir1 = m.group(3)
            srcoo2start = int(m.group(4))
            srcoo2finish = int(m.group(5))
            srdir2 = m.group(6)
            # Junction-side coordinate of each segment depends on its strand.
            srcooleft = srcoo1start if srdir1 == '-' else srcoo1finish
            srcooright = srcoo2finish if srdir2 == '-' else srcoo2start
            segment1 = srchr1 + "\t" + str(srcoo1start) + "\t" + str(
                srcoo1finish) + "\t" + srname
            segment2 = srchr2 + "\t" + str(srcoo2start) + "\t" + str(
                srcoo2finish) + "\t" + srname
            if srdir1 == dir1 and srchr1 == chr1 and srdir2 == dir2 and srchr2 == chr2 and srcooleft == coo1 and srcooright == coo2:
                rnames[srname] = {'left': segment1, 'right': segment2}
                seenhit[srname] = seenhit.get(srname, 0) + 1
            # Deliberately a second `if`: a line may match both orientations.
            if srdir1 == opposite(dir1) and srchr1 == chr2 and srdir2 == opposite(dir2) and srchr2 == chr1 and srcooleft == coo2 and srcooright == coo1:
                rnames[srname] = {'left': segment2, 'right': segment1}
                seenhit[srname] = seenhit.get(srname, 0) + 1
    print("found " + str(len(rnames)) + " short reads")
    for srname in seenhit:
        if seenhit[srname] > 1:
            print("removing " + srname)
            del rnames[srname]
    print("found " + str(
        len(rnames)) + " short reads with no multihits among fusions")
    validreads = {}
    with open(fasta_filename) as inf:
        for line in inf:
            m = re.match('^>(.*)$', line.rstrip())
            if m:
                validreads[m.group(1)] = 1
    # Iterate over a snapshot: entries are deleted inside the loop.
    for rname in list(rnames):
        if rname not in validreads:
            print("removing " + rname)
            del rnames[rname]
    print("found " + str(
        len(rnames)) + " unique short reads with no hits in the genome")
    with open(txn_filename) as inf:
        for line in inf:
            fields = line.rstrip().split("\t")
            if fields[0] in rnames:
                print("removing " + fields[0])
                del rnames[fields[0]]
    print("found " + str(
        len(rnames)
    ) + " unique short reads with no hits in the genome or transcriptome")
    with open(outdir + "/short_read_left.bed", 'w') as oleft, \
            open(outdir + "/short_read_right.bed", 'w') as oright:
        for rname in rnames:
            oleft.write(rnames[rname]['left'] + "\n")
            oright.write(rnames[rname]['right'] + "\n")
Ejemplo n.º 12
0
def process_temp_list(temp_list,
                      gname_list, gene_start_pos_list,
                      gene_end_pos_list, gene_max_end_pos_list):
    """Choose the best single alignment of a read and write fusion pairs.

    Each item of ``temp_list`` is one alignment of the read as a list of PSL
    column strings: column 0 is the match count, column 13 the target name,
    columns 15/16 the target start/end.  Column 10 is the denominator of
    the score (qSize in the standard PSL layout -- confirm with the caller).

    First the row with the highest ``col0 / col10`` is remembered.  Then
    every pair of rows is examined; a pair is written to the module-level
    ``fusion_psl`` handle when both rows have at least
    ``fusion_segment_len_threshold`` matches, the same-target checks pass
    (no shared gene name, or -- when either row has no gene -- a distance of
    at least ``L_junction_limit``), the query overlap/gap limits
    (``fusion_overlap_threshold`` / ``fusion_gap_threshold``) are met, and
    the combined score exceeds the best single score.

    Args:
        temp_list: list of PSL rows (lists of strings) for one read.
        gname_list, gene_start_pos_list, gene_end_pos_list,
        gene_max_end_pos_list: gene annotation lookups, forwarded to
            ``get_gnames`` unchanged.

    Returns:
        ``""`` if at least one pair was written (the best single row then
        goes to ``fusion_psl_single``); otherwise the tab-joined best single
        row, which is ``""`` when no row scored above zero.
    """
    ref_stat = 0
    # Default value so that an empty temp_list (or rows that all score
    # zero) no longer ends in UnboundLocalError at the return statement.
    ref_str = ""

    # Select best row from single alignment
    for result_ls in temp_list:
        stat = float(result_ls[0])/float(result_ls[10])
        if stat > ref_stat:
            ref_stat = stat
            ref_str = '\t'.join(result_ls)

    fusion_flag = False
    # Select best rows from two alignments
    # qstart/end index: 11/12
    # tstart/end index: 15/16
    # tname index: 13
    temp_list_len = len(temp_list)
    for idx in range(temp_list_len):
        first = temp_list[idx]
        # Check number of matches first; the gene lookup below is wasted
        # work for a row that is skipped.
        if (int(first[0]) < fusion_segment_len_threshold):
            continue
        gnames_1 = get_gnames(first[13], int(first[15]), int(first[16]),
                              gname_list, gene_start_pos_list,
                              gene_end_pos_list, gene_max_end_pos_list)
        line1 = "\t".join(first)
        entry1 = psl_basics.read_psl_entry(line1)
        for idx_2 in range(idx + 1, temp_list_len):
            second = temp_list[idx_2]
            # Check number of matches
            if (int(second[0]) < fusion_segment_len_threshold):
                continue
            if (first[13] == second[13]):
                gnames_2 = get_gnames(second[13], int(second[15]), int(second[16]),
                                      gname_list, gene_start_pos_list,
                                      gene_end_pos_list, gene_max_end_pos_list)
                # Check if they share same gene locus
                if ((len(gnames_1) > 0) and (len(gnames_2) > 0)):
                    if (len(gnames_1 & gnames_2) != 0):
                        continue
                else:
                    tstart = max(int(first[15]), int(second[15]))
                    tend = min(int(first[16]), int(second[16]))
                    # Note: introns shorter than L_junction_limit are
                    # expected not to be split by blat
                    if ((tstart - tend) < L_junction_limit):
                        continue
            # Note: query coordinates have special handling in neg strands
            # (see the PSL format description on the UCSC genome website)
            line2 = "\t".join(second)
            entry2 = psl_basics.read_psl_entry(line2)
            baseoverlap = psl_basics.query_coordinates_base_overlap_size(entry1,entry2)
            basegap = psl_basics.query_coordinates_gap_size(entry1,entry2)
            if (baseoverlap):
                if (baseoverlap <= fusion_overlap_threshold):
                    qlen = sum(entry1['blockSizes']) - baseoverlap
                    qlen_2 = sum(entry2['blockSizes']) - baseoverlap
                    if ((qlen < fusion_segment_len_threshold) or
                        (qlen_2 < fusion_segment_len_threshold)):
                        continue
                else:
                    continue
            elif (basegap > fusion_gap_threshold):
                continue
            stat = (float(first[0]) + float(second[0]))/float(first[10])
            if stat > ref_stat:
                fusion_psl.write(line1 + '\n')
                fusion_psl.write(line2 + '\n')
                fusion_flag = True

    if (fusion_flag):
        fusion_psl_single.write(ref_str + '\n')
        return ""
    else:
        return (ref_str)
def _write_exon_bed(handle, gpd, read_name, min_exon_len=30):
  """Write one line per sufficiently long exon of a genepred entry.

  Each line is: chrom, exonStart+1, exonEnd, read_name (tab separated).
  Exons shorter than min_exon_len bases are skipped.
  """
  for i in range(gpd['exonCount']):
    if gpd["exonEnds"][i] - gpd["exonStarts"][i] >= min_exon_len:
      handle.write(gpd["chrom"] + "\t" + str(gpd["exonStarts"][i]+1) + "\t" +
                   str(gpd["exonEnds"][i]) + "\t" + read_name + "\n")


def main():
  """Collect long-read and short-read support for one fusion junction.

  Command line: <IDP temp dir> <junction (i.e. chr1:1000-/chr2:1000+)> <range> <output dir>

  Reads from the IDP temp dir:
    LR.psl_fusion_pair_filtered          paired long-read alignments (two lines per read)
    uniqueness/fusion_read.map           short-read hits to fusion junctions
    uniqueness/unmapped_shortread.fa     short reads kept as valid (fasta headers are used)
    uniqueness/txn.map                   short reads to discard (first column is the read name)

  Writes into the output dir (created if missing):
    fusion_coordiante.txt, long_read_query_locations.txt,
    long_read_left/right.bed, long_read_left/right.gpd,
    short_read_left/right.bed

  Exits with an error message if the junction argument or a short-read
  location cannot be parsed.
  """
  if len(sys.argv) != 5:
    print(sys.argv[0] + ' <IDP temp dir> <junction (i.e. chr1:1000-/chr2:1000+)> <range> <output dir>')
    sys.exit()
  inputdir = sys.argv[1].rstrip('/')
  psl_filename = inputdir+'/'+'LR.psl_fusion_pair_filtered'
  map_filename = inputdir+'/uniqueness/fusion_read.map'
  fasta_filename = inputdir+'/uniqueness/unmapped_shortread.fa'
  txn_filename = inputdir+'/uniqueness/txn.map'
  junction_abbreviation = sys.argv[2]
  myrange = int(sys.argv[3])
  outdir = sys.argv[4].rstrip('/')

  # Validate the junction before touching the filesystem so a typo does not
  # leave a half-initialised output directory behind.
  m = re.match(r'([^:]+):(\d+)([+-])\/([^:]+):(\d+)([+-])',junction_abbreviation)
  if not m:
    sys.exit("problem parsing junction (expected i.e. chr1:1000-/chr2:1000+): " + junction_abbreviation)
  chr1 = m.group(1)
  coo1 = int(m.group(2))
  dir1 = m.group(3)
  chr2 = m.group(4)
  coo2 = int(m.group(5))
  dir2 = m.group(6)

  if not os.path.isdir(outdir):
    print("make directory "+outdir)
    os.mkdir(outdir)
  with open(outdir+"/fusion_coordiante.txt",'w') as of1:
    of1.write(junction_abbreviation+"\t"+str(myrange)+"\n")

  # Work through psl file for long reads
  lrcnt = 0
  with open(outdir+"/long_read_query_locations.txt",'w') as of, \
       open(outdir+"/long_read_left.bed",'w') as oleft, \
       open(outdir+"/long_read_right.bed",'w') as oright, \
       open(outdir+"/long_read_left.gpd",'w') as oleftgpd, \
       open(outdir+"/long_read_right.gpd",'w') as orightgpd, \
       open(psl_filename) as f:
    while True:
      # Alignments come in pairs; an empty or missing line ends the scan.
      l1 = f.readline().rstrip()
      if not l1:
        break
      l2 = f.readline().rstrip()
      if not l2:
        break
      e1 = psl_basics.read_psl_entry(l1)
      e2 = psl_basics.read_psl_entry(l2)
      g1 = genepred_basics.smooth_gaps(genepred_basics.genepred_line_to_dictionary(psl_basics.convert_entry_to_genepred_line(e1)),30)
      g2 = genepred_basics.smooth_gaps(genepred_basics.genepred_line_to_dictionary(psl_basics.convert_entry_to_genepred_line(e2)),30)
      if check_coordiantes(e1,e2,chr1,coo1,dir1,chr2,coo2,dir2,myrange):
        # Pair matches the junction as given: first alignment is the left side.
        left_gpd, right_gpd = g1, g2
        strand_first, strand_second = dir1, dir2
      elif check_coordiantes(e1,e2,chr2,coo2,opposite(dir2),chr1,coo1,opposite(dir1),myrange):
        # Pair matches the junction with sides swapped and directions flipped:
        # second alignment is the left side.
        left_gpd, right_gpd = g2, g1
        strand_first, strand_second = opposite(dir2), opposite(dir1)
      else:
        continue
      oleftgpd.write(genepred_basics.genepred_entry_to_genepred_line(left_gpd)+"\n")
      orightgpd.write(genepred_basics.genepred_entry_to_genepred_line(right_gpd)+"\n")
      lrcnt += 1
      of.write(e1["qName"] + "\t" + str(e1['qStart']+1) + "\t" +
               str(e1['qEnd']) + "\t" + strand_first + "\t" +
               str(e2['qStart']+1) + "\t" + str(e2['qEnd']) + "\t" + strand_second + "\n")
      # Both sides are labelled with the first alignment's query name.
      _write_exon_bed(oleft, left_gpd, e1["qName"])
      _write_exon_bed(oright, right_gpd, e1["qName"])
  print(str(lrcnt) + " long reads found supporting the fusion")

  #Work through fusion read map for short reads
  sr_chrom_re = re.compile(r'^([^:]+):.*\/([^:]+):')
  sr_coord_re = re.compile(r'[,:](-?\d+)-(-?\d+)([+-])\/[^:]+:(-?\d+)-(-?\d+),?.*([+-])')
  rnames = {}
  seenhit = {}
  with open(map_filename) as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      srname = f[0]
      loc = f[2]
      mchr = sr_chrom_re.search(loc)
      mcoo = sr_coord_re.search(loc)
      if not mchr or not mcoo:
        sys.exit("problem parsing short read location: " + loc)
      srchr1 = mchr.group(1)
      srchr2 = mchr.group(2)
      srcoo1start = int(mcoo.group(1))
      srcoo1finish = int(mcoo.group(2))
      srdir1 = mcoo.group(3)
      srcoo2start = int(mcoo.group(4))
      srcoo2finish = int(mcoo.group(5))
      srdir2 = mcoo.group(6)
      # The junction-facing coordinate depends on the direction of each side.
      srcooleft = srcoo1start if srdir1 == '-' else srcoo1finish
      srcooright = srcoo2finish if srdir2 == '-' else srcoo2start
      left1 = srchr1 + "\t" + str(srcoo1start) + "\t" + str(srcoo1finish) + "\t" + srname
      left2 = srchr2 + "\t" + str(srcoo2start) + "\t" + str(srcoo2finish) + "\t" + srname
      if srdir1 == dir1 and srchr1 == chr1 and srdir2 == dir2 and srchr2 == chr2 and srcooleft == coo1 and srcooright == coo2:
        rnames[srname] = {'left': left1, 'right': left2}
        seenhit[srname] = seenhit.get(srname, 0) + 1
      if srdir1 == opposite(dir1) and srchr1 == chr2 and srdir2 == opposite(dir2) and srchr2 == chr1 and srcooleft == coo2 and srcooright == coo1:
        rnames[srname] = {'left': left2, 'right': left1}
        seenhit[srname] = seenhit.get(srname, 0) + 1
  print("found "+str(len(rnames))+" short reads")

  # Drop reads that hit the junction more than once.
  for srname in seenhit:
    if seenhit[srname] > 1:
      print("removing " + srname)
      del rnames[srname]
  print("found "+str(len(rnames))+" short reads with no multihits among fusions")

  # Keep only reads whose name appears as a fasta header in the unmapped file.
  validreads = set()
  with open(fasta_filename) as inf:
    for line in inf:
      m = re.match('^>(.*)$',line.rstrip())
      if m:
        validreads.add(m.group(1))
  # Iterate over a snapshot of the keys because entries are deleted in the loop.
  for rname in list(rnames):
    if rname not in validreads:
      print("removing " + rname)
      del rnames[rname]
  print("found "+str(len(rnames))+" unique short reads with no hits in the genome")

  # Drop reads listed in the transcriptome map.
  with open(txn_filename) as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      if f[0] in rnames:
        print("removing " + f[0])
        del rnames[f[0]]
  print("found "+str(len(rnames))+" unique short reads with no hits in the genome or transcriptome")

  with open(outdir+"/short_read_left.bed",'w') as oleft, \
       open(outdir+"/short_read_right.bed",'w') as oright:
    for rname in rnames:
      oleft.write(rnames[rname]['left']+"\n")
      oright.write(rnames[rname]['right']+"\n")
Ejemplo n.º 14
0
def make_fusion_genepred(psl_filename,fid_coords,min_gap_in_block_size,outfile):
  """Convert fusion PSL alignments into end-corrected genepred entries.

  Lines of psl_filename are read as alternating left (even index) and right
  (odd index) sides. For every line whose fusion ID (the F<digits> token in
  the gene name) is a key of fid_coords, the alignment end facing the fusion
  is moved to the coordinate stored in fid_coords[fid]['left'|'right'][1],
  cds bounds are set to the tx bounds, gaps are smoothed with
  min_gap_in_block_size, and the gene name is rewritten with the new
  transcript length and new fusion position.

  Args:
    psl_filename: path of the paired PSL file.
    fid_coords: dict fid -> {'left': [chrom, coord, dir], 'right': [...]};
      only the coord element is used here.
    min_gap_in_block_size: passed through to genepred_basics.smooth_gaps.
    outfile: path that receives one genepred line per kept alignment.

  Returns:
    dict fid -> two-element list indexed by side (0 left, 1 right); each
    filled slot is [genepred_dict, fusion_side_sign], an unfilled slot is 0.
  """
  fid_re = re.compile(r'\|(F[\d]+)([+-])')
  name_re = re.compile(r'^([\d\.]+_)\d+(\|F\d+[+-])\d+(\|.*)$')
  fusion_gpd = {}
  with open(outfile,'w') as ofile, open(psl_filename) as infile:
    for i, line in enumerate(infile):
      # m == 0 is left, m == 1 is right (parity counts every line, kept or not)
      m = i % 2
      entry = psl_basics.read_psl_entry(line)
      gline = psl_basics.convert_entry_to_genepred_line(entry)
      d = genepred_basics.genepred_line_to_dictionary(gline)
      look = fid_re.search(d['gene_name'])
      fid = look.group(1)
      fsidesign = look.group(2) #which side is the fusion on
      if fid not in fid_coords: continue # we only are working on the ones that met criteria for short reads

      #fix our end points based on short read defined fusions
      side = 'right' if m == 1 else 'left'
      coord = int(fid_coords[fid][side][1])

      if fsidesign == '-':
        # fusion point is at the right end of the alignment
        last_end = d['exonEnds'][d['exonCount']-1]
        if coord == last_end:
          e = d
        elif coord > last_end:
          e = genepred_basics.right_extend_genepred(d,coord)
        else:
          e = genepred_basics.right_trim_genepred(d,coord)
        newpos = e['txEnd']
      else:
        # fusion point is at the left end of the alignment; coord-1 is
        # compared against the exon start, mirroring the right-end logic
        target = coord-1
        first_start = d['exonStarts'][0]
        if target == first_start:
          e = d
        elif target < first_start:
          # elif (not a second if) so an exact match is not re-trimmed
          e = genepred_basics.left_extend_genepred(d,target)
        else:
          e = genepred_basics.left_trim_genepred(d,target)
        newpos = e['txStart']
      # we treat cdsStart and cdsEnd the same as tx
      e['cdsStart'] = e['txStart']
      e['cdsEnd'] = e['txEnd']

      smoothed = genepred_basics.smooth_gaps(e,min_gap_in_block_size)

      # get new transcript length
      newlen = sum(smoothed['exonEnds'][j] - smoothed['exonStarts'][j]
                   for j in range(smoothed['exonCount']))

      # make a new name based on the new fusion site and new length
      look = name_re.match(smoothed['gene_name'])
      smoothed['gene_name'] = (look.group(1) + str(newlen) + look.group(2) +
                               str(newpos) + look.group(3))

      ofile.write(genepred_basics.genepred_entry_to_genepred_line(smoothed)+"\n")
      # Each fusion ID owns its own pair list; never reuse another ID's list.
      if fid not in fusion_gpd:
        fusion_gpd[fid] = [0,0]
      fusion_gpd[fid][m] = [smoothed, fsidesign] #send back the genepred and the side the fusion is on
  return fusion_gpd