Beispiel #1
0
def do_reduction(subset, args, nrfuzzykey, location):
    """Write out the reduced set of fuzzy gene predictions for one locus.

    Every key of ``nrfuzzykey`` that is neither a key of ``subset`` nor listed
    under one of its keys is treated as a stand-alone entry and appended to
    the families returned by ``get_subset_evidence``.

    Args:
        subset: mapping from a key to an iterable of related keys.
        args: options object. ``junction_tolerance`` and
            ``output_original_table`` are read here; the object is also
            handed to ``get_subset_evidence``.
        nrfuzzykey: mapping from key to a fuzzy gene prediction object.
        location: range accumulated so far, or a false value when there is
            none yet.

    Returns:
        A list ``[gpdlines, tablelines, locstring]``: the newline-terminated
        genepred lines, the newline-terminated "output name<TAB>source name"
        table lines (empty unless ``args.output_original_table`` is set) and
        the range string of the merged location ('' if there is none).
    """
    seen = set()
    for key in subset:
        seen.add(key)
        seen.update(subset[key])
    # Collected before get_subset_evidence runs, matching the original order
    # of operations.
    singles = [num for num in nrfuzzykey if num not in seen]
    #if len(subset.keys()) == 0 and len(compatible.keys()) == 0: return
    families = get_subset_evidence(subset, nrfuzzykey, args)
    for num in singles:
        families.append(nrfuzzykey[num])

    # Pieces are gathered in lists and joined once at the end instead of
    # growing two strings with += inside the loop.
    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to the single source entry with the most exons and
            # rebuild the fuzzy object from it with doubled tolerance.
            gpd = sorted(
                fz.gpds, key=lambda x: x.get_exon_count(),
                reverse=True)[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            # NOTE(review): this validates the fallback source entry, not an
            # entry parsed from the regenerated gpdline -- confirm intent.
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name + "\t" + g.entry['name'] + "\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)
    locstring = ''
    if location:
        locstring = location.get_range_string()
    return ["".join(gpd_out), "".join(table_out), locstring]
def do_reduction(subset,args,nrfuzzykey,location):
    """Write out the reduced set of fuzzy gene predictions for one locus.

    Keys of ``nrfuzzykey`` that do not appear in ``subset`` (neither as a key
    nor among the values stored under a key) are treated as stand-alone
    entries and appended to the families from ``get_subset_evidence``.

    Args:
        subset: mapping from a key to an iterable of related keys.
        args: options object. ``junction_tolerance`` and
            ``output_original_table`` are read here; the object is also
            handed to ``get_subset_evidence``.
        nrfuzzykey: mapping from key to a fuzzy gene prediction object.
        location: range accumulated so far, or a false value when there is
            none yet.

    Returns:
        A list ``[gpdlines, tablelines, locstring]``: the newline-terminated
        genepred lines, the newline-terminated "output name<TAB>source name"
        table lines (empty unless ``args.output_original_table`` is set) and
        the range string of the merged location ('' if there is none).
    """
    seen = set()
    for key in subset:
        seen.add(key)
        seen.update(subset[key])
    # Collected before get_subset_evidence runs, matching the original order
    # of operations.
    singles = [num for num in nrfuzzykey if num not in seen]
    #if len(subset.keys()) == 0 and len(compatible.keys()) == 0: return
    families = get_subset_evidence(subset,nrfuzzykey,args)
    for num in singles:
        families.append(nrfuzzykey[num])

    # Output pieces are collected in lists and joined once, avoiding the
    # repeated string copies caused by += in a loop.
    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n"+gpdline+"\n"+fz.get_info_string()+"\n")
            # Fall back to the single source entry with the most exons and
            # rebuild the fuzzy object from it with doubled tolerance.
            gpd = sorted(fz.gpds, key=lambda x: x.get_exon_count(), reverse=True)[0]
            fz = FuzzyGenePred(gpd,juntol=args.junction_tolerance*2)
            gpdline = fz.get_genepred_line()
            # NOTE(review): this validates the fallback source entry, not an
            # entry parsed from the regenerated gpdline -- confirm intent.
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline+"\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name+"\t"+g.entry['name']+"\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)
    locstring = ''
    if location:
        locstring = location.get_range_string()
    return ["".join(gpd_out), "".join(table_out), locstring]
def main():
  """Measure short-read depth over exonic, intronic and intergenic windows.

  Reads a genepred file, derives merged gene and exon ranges, intron ranges
  (genes minus exons) and intergenic ranges (chromosome ranges minus genes
  padded by --intergenic_buffer), breaks introns and intergenic regions into
  --window_size pieces, then streams the tab-separated output of
  ``sam_to_bed_depth.py <bam_input>`` (chromosome, start, end, depth) and
  accumulates the depth over each region. ``display`` is called each time
  an intergenic window is recorded.

  Exits with status 1 if the genepred file contains no entries.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('gpd_input')
  parser.add_argument('bam_input')
  parser.add_argument('--intergenic_buffer',default=10000,type=int)
  parser.add_argument('--window_size',default=10000,type=int)
  parser.add_argument('--bin_size',default=1000,type=int)
  parser.add_argument('--use_off_regions',action='store_true',help="Use a region even if there is no reads mapped to it.")
  parser.add_argument('--get_exons',action='store_true')
  args = parser.parse_args()
  chr_beds = {}
  gene_beds = []
  exon_beds = []
  sys.stderr.write("Reading genepred file\n")
  asum = 0
  atot = 0
  with open(args.gpd_input) as inf:
    for line in inf:
      g = GenePredEntry(line)
      asum += g.length()
      atot += 1
      grng = g.get_bed()
      grng.direction = None
      if grng.chr not in chr_beds:
        chr_beds[grng.chr] = grng.copy()
      chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
      gene_beds.append(grng)
      for i in range(0,g.get_exon_count()):
        erng = Bed(g.value('chrom'),g.value('exonStarts')[i],g.value('exonEnds')[i])
        exon_beds.append(erng)
  if atot == 0:
    # Without this the average below fails with a bare ZeroDivisionError.
    sys.stderr.write("ERROR: no genepred entries found in "+args.gpd_input+"\n")
    sys.exit(1)
  avglen = float(asum)/float(atot)
  sys.stderr.write("Sorting gene bed\n")
  gene_beds = sort_ranges(gene_beds)
  gene_beds = merge_ranges(gene_beds,already_sorted=True)
  sys.stderr.write("Sorting chromosome beds\n")
  chr_beds = sort_ranges([chr_beds[x] for x in chr_beds.keys()])
  sys.stderr.write("Sorting exon beds\n")
  exon_beds = sort_ranges(exon_beds)
  sys.stderr.write("Get padded genes\n")
  padded_gene_beds = pad_ranges(gene_beds,args.intergenic_buffer,chr_beds)
  padded_gene_beds = merge_ranges(padded_gene_beds,already_sorted=True)
  sys.stderr.write("Get intergenic regions\n")
  intergenic_beds = subtract_ranges(chr_beds,padded_gene_beds,already_sorted=True)
  intergenic_beds = merge_ranges(intergenic_beds,already_sorted=True)
  intergenic_beds = window_break(intergenic_beds,args.window_size)
  #for i in intergenic_beds: print i.get_range_string()
  sys.stderr.write("Get merged exons\n")
  exon_beds = merge_ranges(exon_beds)
  sys.stderr.write("Get introns\n")
  intron_beds = subtract_ranges(gene_beds,exon_beds,already_sorted=True)
  intron_beds = merge_ranges(intron_beds,already_sorted=True)
  intron_beds = window_break(intron_beds,args.window_size)
  sys.stderr.write("Going through short reads\n")
  # Argument list instead of splitting a command string, so a BAM path
  # containing whitespace stays a single argument.
  p = Popen(["sam_to_bed_depth.py",args.bam_input],stdout=PIPE)
  for x in intron_beds: x.set_payload([]) # payloads are read depths
  for x in intergenic_beds: x.set_payload([]) # payloads are read depths
  for x in exon_beds: x.set_payload([]) # payloads are read depths
  introndepth = []
  intergenicdepth = []
  exondepth = []
  pseudoreadcount = 0
  if not args.get_exons: exon_beds = []

  def consume(curr,depth,beds,depths,after_record=None):
    # Shared handling for one depth interval against one region queue.
    # The emptiness test comes first so beds[0] is never read on an empty
    # list (the original indexed before checking the length).
    while beds and curr.cmp(beds[0]) > 0: # we've passed the region
      v = beds.pop(0)
      if len(v.get_payload()) == 0 and not args.use_off_regions: continue
      # NOTE(review): the region object is passed here, while the
      # end-of-stream flush below passes get_payload() -- confirm which
      # form average() expects.
      depths.append(average(v))
      if after_record is not None: after_record()
    if beds and curr.cmp(beds[0]) == 0: # overlaps the front region
      size = curr.overlap_size(beds[0])
      # one depth value per overlapping base
      beds[0].get_payload().extend([depth]*size)

  section_count = 0
  while True:
    section_count += 1
    line = p.stdout.readline()
    if not line: break
    f = line.split("\t")
    depth = int(f[3])
    curr = Bed(f[0],int(f[1]),int(f[2]))
    if section_count %100==0: sys.stderr.write(curr.get_range_string()+"          \r")
    pseudoreadcount += depth
    consume(curr,depth,exon_beds,exondepth)
    consume(curr,depth,intron_beds,introndepth)
    # Progress is reported every time an intergenic window is recorded.
    consume(curr,depth,intergenic_beds,intergenicdepth,
            lambda: display(curr,introndepth,intergenicdepth,pseudoreadcount,avglen))
  if args.use_off_regions:
    # Regions never passed by the stream are still counted in this mode.
    # Exon averages go to exondepth (the original appended them to
    # introndepth).
    for x in exon_beds: exondepth.append(average(x.get_payload()))
    for x in intron_beds: introndepth.append(average(x.get_payload()))
    for x in intergenic_beds: intergenicdepth.append(average(x.get_payload()))
  p.communicate()
def do_prediction(compatible,args,nrfuzzykey,location):
    """Build predicted genepred output for one locus from fuzzy entries.

    Every entry of ``nrfuzzykey`` starts as its own family, with
    ``params['proper_set']`` set to False on the entry itself (the caller's
    objects are modified). Pairs listed in ``compatible`` are concatenated
    with ``concat_fuzzy_gpd`` and added as further families. Families that
    compare equal under ``is_equal_fuzzy`` are then merged, and each
    family's ``gpds`` is replaced by one ``GenePredEntry`` per distinct line.

    Args:
        compatible: mapping from a key to an iterable of partner keys.
        args: options object; ``junction_tolerance`` and
            ``output_original_table`` are read.
        nrfuzzykey: mapping from key to a fuzzy gene prediction object.
        location: range accumulated so far, or a false value when there is
            none yet.

    Returns:
        A list ``[gpdlines, tablelines, locstring]``: the newline-terminated
        genepred lines, the newline-terminated "output name<TAB>source name"
        table lines (empty unless ``args.output_original_table`` is set) and
        the range string of the merged location ('' if there is none).
    """
    #if len(compatible.keys()) == 0: return None
    #all reads could be standing alone version
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        nrfuzzykey[num].params['proper_set'] = False #partial overlap is enough
    #get_compatible_evidence(compatible,nrfuzzykey,args)
    for i in compatible:
        for j in compatible[i]:
            g1lines = set(g1.get_line() for g1 in nrfuzzykey[i].gpds)
            repeat = any(g2.get_line() in g1lines for g2 in nrfuzzykey[j].gpds)
            # NOTE(review): a pair is concatenated only when the two entries
            # share a gpd line; the original comment ("see if its already in
            # there") may mean the opposite -- confirm this is not inverted.
            if not repeat: continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)
    # now we need to find any duplicate entries and combine them
    newfam = []
    while families:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if not fam.is_equal_fuzzy(other):
                remaining.append(other)
                continue
            added = fam.add_fuzzy_gpd(other)
            if not added:
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                # Keep the family built so far and leave the candidate in
                # the pool. The original assigned the failed result to fam,
                # which broke the next comparison and the gpds pass below.
                remaining.append(other)
                continue
            fam = added
        families = remaining
        newfam.append(fam)
    families = newfam

    # Replace the family with a set where we haven't used the same gpd line twice
    # This may damage the fuzzy object
    for fam in families:
        unique_lines = set(g.get_line() for g in fam.gpds)
        fam.gpds = [GenePredEntry(x) for x in unique_lines]

    # Output pieces are collected in lists and joined once at the end.
    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n"+gpdline+"\n"+fz.get_info_string()+"\n")
            # Fall back to the single source entry with the most exons and
            # rebuild the fuzzy object from it with doubled tolerance.
            gpd = sorted(fz.gpds, key=lambda x: x.get_exon_count(), reverse=True)[0]
            fz = FuzzyGenePred(gpd,juntol=args.junction_tolerance*2)
            gpdline = fz.get_genepred_line()
            # NOTE(review): this validates the fallback source entry, not an
            # entry parsed from the regenerated gpdline -- confirm intent.
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline+"\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name+"\t"+g.entry['name']+"\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)
    locstring = ''
    if location:
        locstring = location.get_range_string()
    return ["".join(gpd_out), "".join(table_out), locstring]
Beispiel #5
0
def do_prediction(compatible, args, nrfuzzykey, location):
    """Build predicted genepred output for one locus from fuzzy entries.

    Every entry of ``nrfuzzykey`` starts as its own family, with
    ``params['proper_set']`` set to False on the entry itself (the caller's
    objects are modified). Pairs listed in ``compatible`` are concatenated
    with ``concat_fuzzy_gpd`` and added as further families. Families that
    compare equal under ``is_equal_fuzzy`` are then merged, and each
    family's ``gpds`` is replaced by one ``GenePredEntry`` per distinct line.

    Args:
        compatible: mapping from a key to an iterable of partner keys.
        args: options object; ``junction_tolerance`` and
            ``output_original_table`` are read.
        nrfuzzykey: mapping from key to a fuzzy gene prediction object.
        location: range accumulated so far, or a false value when there is
            none yet.

    Returns:
        A list ``[gpdlines, tablelines, locstring]``: the newline-terminated
        genepred lines, the newline-terminated "output name<TAB>source name"
        table lines (empty unless ``args.output_original_table`` is set) and
        the range string of the merged location ('' if there is none).
    """
    #if len(compatible.keys()) == 0: return None
    #all reads could be standing alone version
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        nrfuzzykey[num].params[
            'proper_set'] = False  #partial overlap is enough
    #get_compatible_evidence(compatible,nrfuzzykey,args)
    for i in compatible:
        for j in compatible[i]:
            g1lines = set(g1.get_line() for g1 in nrfuzzykey[i].gpds)
            repeat = any(
                g2.get_line() in g1lines for g2 in nrfuzzykey[j].gpds)
            # NOTE(review): a pair is concatenated only when the two entries
            # share a gpd line; the original comment ("see if its already in
            # there") may mean the opposite -- confirm this is not inverted.
            if not repeat:
                continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)
    # now we need to find any duplicate entries and combine them
    newfam = []
    while families:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if not fam.is_equal_fuzzy(other):
                remaining.append(other)
                continue
            added = fam.add_fuzzy_gpd(other)
            if not added:
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                # Keep the family built so far and leave the candidate in
                # the pool. The original assigned the failed result to fam,
                # which broke the next comparison and the gpds pass below.
                remaining.append(other)
                continue
            fam = added
        families = remaining
        newfam.append(fam)
    families = newfam

    # Replace the family with a set where we haven't used the same gpd line twice
    # This may damage the fuzzy object
    for fam in families:
        unique_lines = set(g.get_line() for g in fam.gpds)
        fam.gpds = [GenePredEntry(x) for x in unique_lines]

    # Output pieces are collected in lists and joined once at the end.
    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to the single source entry with the most exons and
            # rebuild the fuzzy object from it with doubled tolerance.
            gpd = sorted(
                fz.gpds, key=lambda x: x.get_exon_count(),
                reverse=True)[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            # NOTE(review): this validates the fallback source entry, not an
            # entry parsed from the regenerated gpdline -- confirm intent.
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name + "\t" + g.entry['name'] + "\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)
    locstring = ''
    if location:
        locstring = location.get_range_string()
    return ["".join(gpd_out), "".join(table_out), locstring]
def main():
    """Measure short-read depth over exonic, intronic and intergenic windows.

    Reads a genepred file, derives merged gene and exon ranges, intron ranges
    (genes minus exons) and intergenic ranges (chromosome ranges minus genes
    padded by --intergenic_buffer), breaks introns and intergenic regions
    into --window_size pieces, then streams the tab-separated output of
    ``sam_to_bed_depth.py <bam_input>`` (chromosome, start, end, depth) and
    accumulates the depth over each region. ``display`` is called each time
    an intergenic window is recorded.

    Exits with status 1 if the genepred file contains no entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions',
        action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()
    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            for i in range(0, g.get_exon_count()):
                erng = Bed(g.value('chrom'),
                           g.value('exonStarts')[i],
                           g.value('exonEnds')[i])
                exon_beds.append(erng)
    if atot == 0:
        # Without this the average below fails with a bare ZeroDivisionError.
        sys.stderr.write("ERROR: no genepred entries found in " +
                         args.gpd_input + "\n")
        sys.exit(1)
    avglen = float(asum) / float(atot)
    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges([chr_beds[x] for x in chr_beds.keys()])
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds,
                                      padded_gene_beds,
                                      already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    #for i in intergenic_beds: print i.get_range_string()
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)
    sys.stderr.write("Going through short reads\n")
    # Argument list instead of splitting a command string, so a BAM path
    # containing whitespace stays a single argument.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE)
    for x in intron_beds:
        x.set_payload([])  # payloads are read depths
    for x in intergenic_beds:
        x.set_payload([])  # payloads are read depths
    for x in exon_beds:
        x.set_payload([])  # payloads are read depths
    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    if not args.get_exons:
        exon_beds = []

    def consume(curr, depth, beds, depths, after_record=None):
        # Shared handling for one depth interval against one region queue.
        # The emptiness test comes first so beds[0] is never read on an
        # empty list (the original indexed before checking the length).
        while beds and curr.cmp(beds[0]) > 0:  # we've passed the region
            v = beds.pop(0)
            if len(v.get_payload()) == 0 and not args.use_off_regions:
                continue
            # NOTE(review): the region object is passed here, while the
            # end-of-stream flush below passes get_payload() -- confirm
            # which form average() expects.
            depths.append(average(v))
            if after_record is not None:
                after_record()
        if beds and curr.cmp(beds[0]) == 0:  # overlaps the front region
            size = curr.overlap_size(beds[0])
            # one depth value per overlapping base
            beds[0].get_payload().extend([depth] * size)

    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + "          \r")
        pseudoreadcount += depth
        consume(curr, depth, exon_beds, exondepth)
        consume(curr, depth, intron_beds, introndepth)
        # Progress is reported every time an intergenic window is recorded.
        consume(curr, depth, intergenic_beds, intergenicdepth,
                lambda: display(curr, introndepth, intergenicdepth,
                                pseudoreadcount, avglen))
    if args.use_off_regions:
        # Regions never passed by the stream are still counted in this mode.
        # Exon averages go to exondepth (the original appended them to
        # introndepth).
        for x in exon_beds:
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()