def get_loci(transcripts_genepred):
    """Group the transcripts of a genepred file into numbered loci.

    Every non-comment line is parsed as a genepred entry and wrapped in its
    own single-member locus spanning txStart..txEnd, with the transcript name
    as payload.  ``Loci.update_loci`` is then run (presumably merging
    overlapping loci -- see the progress messages) and the resulting loci are
    numbered from 1 in the order they appear in ``loci.loci``.

    Returns a two-element list ``[locus2name, name2locus]``:
    a dict from locus number to the set of transcript names it holds, and a
    dict from transcript name to its locus number.
    """
    collection = Loci()
    collection.verbose = True
    with open(transcripts_genepred) as handle:
        for raw in handle:
            if raw.startswith('#'):
                continue
            entry = GenePredEntry(raw.rstrip())
            span = Bed(entry.value('chrom'),
                       entry.value('txStart'),
                       entry.value('txEnd'))
            span.set_payload(entry.value('name'))
            single = Locus()
            single.add_member(span)
            collection.add_locus(single)
    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with " + str(len(collection.loci)) + " loci\n")
    collection.update_loci()
    sys.stderr.write("Ended with " + str(len(collection.loci)) + " loci\n")

    locus2name = {}
    name2locus = {}
    # Locus numbers are 1-based.
    for number, locus in enumerate(collection.loci, 1):
        for member in locus.members:
            transcript = member.get_payload()
            locus2name.setdefault(number, set()).add(transcript)
            name2locus[transcript] = number
    return [locus2name, name2locus]
Ejemplo n.º 2
0
def main():
  """Print a "<locus number>\\t<entry name>" line for every genepred entry.

  Reads genepred lines from the file named on the command line (or STDIN
  when the argument is '-'), skipping lines that start with '#'.  Each entry
  becomes a single-member, direction-less locus; after ``update_loci`` the
  loci are numbered from 1 and every member is printed with its locus number.

  Entry names must be unique: on the first repeated name an error naming the
  offending entry is written to STDERR and the program exits with status 1.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('input_gpd',help="GENEPRED input or - for STDIN")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input_gpd != '-': inf = open(args.input_gpd)
  seen = set()
  ls = RangeBasics.Loci()
  ls.verbose = True
  ls.use_direction = False
  for line in inf:
    if line[0] == '#': continue
    gpd = GenePredEntry(line)
    name = gpd.value('name')
    if name in seen:
      # 'name' was previously undefined here, so this path died with a
      # NameError instead of reporting the duplicate.
      sys.stderr.write("ERROR: need uniquely named genepred entry names\n"+name+"\n")
      sys.exit(1)
    seen.add(name)
    r = gpd.locus_range.copy()
    r.direction = None  # loci are built without regard to strand
    r.set_payload(name)
    l = RangeBasics.Locus()
    l.add_member(r)
    ls.add_locus(l)
  if inf is not sys.stdin: inf.close()
  ls.update_loci()
  # Locus numbers are 1-based.
  for z, locus in enumerate(ls.loci, 1):
    for member in locus.members:
      print(str(z) + "\t" + member.get_payload())
def get_loci(transcripts_genepred):
  """Group the transcripts of a genepred file into numbered loci.

  Every non-comment line is parsed as a genepred entry and wrapped in its
  own single-member locus spanning txStart..txEnd, with the transcript name
  as payload.  ``Loci.update_loci`` is then run (presumably merging
  overlapping loci -- see the progress messages) and the resulting loci are
  numbered from 1 in the order they appear in ``loci.loci``.

  Returns a two-element list ``[locus2name, name2locus]``:
  a dict from locus number to the set of transcript names it holds, and a
  dict from transcript name to its locus number.
  """
  collection = Loci()
  collection.verbose = True
  with open(transcripts_genepred) as handle:
    for raw in handle:
      if raw.startswith('#'):
        continue
      entry = GenePredEntry(raw.rstrip())
      span = Bed(entry.value('chrom'),
                 entry.value('txStart'),
                 entry.value('txEnd'))
      span.set_payload(entry.value('name'))
      single = Locus()
      single.add_member(span)
      collection.add_locus(single)
  sys.stderr.write("Organizing genepred data into overlapping loci\n")
  sys.stderr.write("Started with "+str(len(collection.loci))+" loci\n")
  collection.update_loci()
  sys.stderr.write("Ended with "+str(len(collection.loci))+" loci\n")

  locus2name = {}
  name2locus = {}
  # Locus numbers are 1-based.
  for number, locus in enumerate(collection.loci, 1):
    for member in locus.members:
      transcript = member.get_payload()
      locus2name.setdefault(number, set()).add(transcript)
      name2locus[transcript] = number
  return [locus2name, name2locus]
Ejemplo n.º 4
0
def main():
    """Print a "<locus number>\\t<entry name>" line for every genepred entry.

    Reads genepred lines from the file named on the command line (or STDIN
    when the argument is '-'), skipping lines that start with '#'.  Each
    entry becomes a single-member, direction-less locus; after
    ``update_loci`` the loci are numbered from 1 and every member is printed
    with its locus number.

    Entry names must be unique: on the first repeated name an error naming
    the offending entry is written to STDERR and the program exits with
    status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_gpd', help="GENEPRED input or - for STDIN")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input_gpd != '-':
        inf = open(args.input_gpd)
    seen = set()
    ls = RangeBasics.Loci()
    ls.verbose = True
    ls.use_direction = False
    for line in inf:
        if line[0] == '#':
            continue
        gpd = GenePredEntry(line)
        name = gpd.value('name')
        if name in seen:
            # 'name' was previously undefined here, so this path died with a
            # NameError instead of reporting the duplicate.
            sys.stderr.write(
                "ERROR: need uniquely named genepred entry names\n" + name +
                "\n")
            sys.exit(1)
        seen.add(name)
        r = gpd.locus_range.copy()
        r.direction = None  # loci are built without regard to strand
        r.set_payload(name)
        l = RangeBasics.Locus()
        l.add_member(r)
        ls.add_locus(l)
    if inf is not sys.stdin:
        inf.close()
    ls.update_loci()
    # Locus numbers are 1-based.
    for z, locus in enumerate(ls.loci, 1):
        for member in locus.members:
            print(str(z) + "\t" + member.get_payload())
Ejemplo n.º 5
0
def do_reduction(subset, args, nrfuzzykey, location):
    """Emit genepred lines for the reduced families of one region.

    ``subset`` maps a key to an iterable of related keys; every key that
    appears in it (on either side) is handled by ``get_subset_evidence``,
    and every key of ``nrfuzzykey`` that does not appear is added as a family
    of its own.  For each family a genepred line is generated; when that
    line is not valid, a warning is written and the family is rebuilt from
    its member entry with the most exons using twice the junction tolerance.

    Returns ``[gpdlines, tablelines, locstring]``: the concatenated genepred
    lines, the "<name>\\t<member name>" table lines (only filled when
    ``args.output_original_table`` is set), and the range string of
    ``location`` merged with every emitted entry ('' when there is none).
    """
    # Keys taking part in the subset graph, as parent or as child.
    in_graph = set()
    for parent in subset:
        in_graph.add(parent)
        in_graph.update(subset[parent])
    singles = [key for key in nrfuzzykey if key not in in_graph]
    families = get_subset_evidence(subset, nrfuzzykey, args)
    # Entries outside the graph are reported as families of their own.
    families.extend(nrfuzzykey[key] for key in singles)

    gpd_chunks = []
    table_chunks = []
    for family in families:
        # Evaluated up front as in the original flow; the value is unused.
        summary = family.get_info_string()
        gpdline = family.get_genepred_line()
        entry = GenePredEntry(gpdline)
        if not entry.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + family.get_info_string() + "\n")
            # Fall back to a member entry that has the most exons.
            by_exons = sorted(family.gpds,
                              key=lambda member: member.get_exon_count(),
                              reverse=True)
            entry = by_exons[0]
            family = FuzzyGenePred(entry, juntol=args.junction_tolerance * 2)
            gpdline = family.get_genepred_line()
            if not entry.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_chunks.append(gpdline + "\n")
        if args.output_original_table:
            family_name = entry.entry['name']
            for member in family.gpds:
                table_chunks.append(
                    family_name + "\t" + member.entry['name'] + "\n")
        span = entry.get_bed()
        span.direction = None
        # Start from this entry's range when no location is known yet.
        location = (location or span).merge(span)
    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_chunks), "".join(table_chunks), locstring]
Ejemplo n.º 6
0
 def copy(self):
     """Return a new FuzzyGenePred carrying copies of this one's contents.

     Parameters, the direction and the simple junction set are carried over
     as-is; genepred entries are re-parsed from their lines; fuzzy junctions
     are copied through their own ``copy``; the start and end ranges are
     rebuilt as fresh Bed objects with a new payload list.
     """
     def clone_range(rng):
         # Bed is given start-1 here, presumably because its constructor
         # takes a 0-based start while the attribute is 1-based -- TODO confirm.
         fresh = Bed(rng.chr, rng.start - 1, rng.end, rng.direction)
         fresh.set_payload([])
         fresh.get_payload().extend(rng.get_payload())
         return fresh

     duplicate = FuzzyGenePred()  # begin from a blank instance
     # settings
     for key in self.params:
         duplicate.params[key] = self.params[key]
     # genepred entries, rebuilt from their text lines
     duplicate.gpds.extend(
         GenePredEntry(entry.get_line()) for entry in self.gpds)
     # direction
     duplicate.dir = self.dir
     # fuzzy junctions
     duplicate.fuzzy_junctions.extend(
         junction.copy() for junction in self.fuzzy_junctions)
     # simple junction set
     for junction in self.simple_junction_set:
         duplicate.simple_junction_set.add(junction)
     # start and end ranges, when present
     if self.start:
         duplicate.start = clone_range(self.start)
     if self.end:
         duplicate.end = clone_range(self.end)
     return duplicate
Ejemplo n.º 7
0
def main():
  """Filter genepred lines by transcript length and/or name lists.

  Reads genepred lines from the input file (or STDIN when the argument is
  '-'), skipping lines that start with '#'.  A line passes when it satisfies
  every filter that was supplied:

    --min_length / --max_length  bounds on the entry's ``length()``
    --names                      file whose first tab column lists allowed names
    --gene_names                 file whose first tab column lists allowed gene names

  Passing lines are printed; with -v/--invert the failing lines are printed
  instead.
  """
  parser = argparse.ArgumentParser(description="Filter a genepred by transcript length")
  parser.add_argument('input',help="Input genepred file or '-' for STDIN")
  parser.add_argument('--min_length',type=int,help="Minimum transcript length")
  parser.add_argument('--max_length',type=int,help="Maximum transcript length")
  parser.add_argument('--names',help="filter on a name list")
  parser.add_argument('--gene_names',help="filter on a gene name list")
  parser.add_argument('-v','--invert',action='store_true',help='Invert search result')
  args = parser.parse_args()
  name_list = set()
  gene_name_list = set()
  if args.names:
    with open(args.names) as inf:
      for line in inf:
        name_list.add(line.rstrip().split("\t")[0])
  if args.gene_names:
    with open(args.gene_names) as inf:
      for line in inf:
        gene_name_list.add(line.rstrip().split("\t")[0])
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  for line in inf:
    if line.startswith('#'): continue
    g = GPD(line.rstrip())
    tot = g.length()
    is_good = True
    if args.min_length and tot < args.min_length:
      is_good = False
    if args.max_length and tot > args.max_length:
      is_good = False
    if args.names and g.value('name') not in name_list:
      is_good = False
    # The set lives in the local gene_name_list; args has no such attribute,
    # so the old lookup raised AttributeError whenever --gene_names was used.
    if args.gene_names and g.value('gene_name') not in gene_name_list:
      is_good = False
    # --invert flips which side of the filter gets printed.
    if is_good != args.invert:
      print(line.rstrip())
  if inf is not sys.stdin:
    inf.close()
def do_reduction(subset,args,nrfuzzykey,location):
    """Emit genepred lines for the reduced families of one region.

    ``subset`` maps a key to an iterable of related keys; every key that
    appears in it (on either side) is handled by ``get_subset_evidence``,
    and every key of ``nrfuzzykey`` that does not appear is added as a family
    of its own.  For each family a genepred line is generated; when that
    line is not valid, a warning is written and the family is rebuilt from
    its member entry with the most exons using twice the junction tolerance.

    Returns ``[gpdlines, tablelines, locstring]``: the concatenated genepred
    lines, the "<name>\\t<member name>" table lines (only filled when
    ``args.output_original_table`` is set), and the range string of
    ``location`` merged with every emitted entry ('' when there is none).
    """
    # Keys taking part in the subset graph, as parent or as child.
    in_graph = set()
    for parent in subset:
        in_graph.add(parent)
        in_graph.update(subset[parent])
    singles = [key for key in nrfuzzykey if key not in in_graph]
    families = get_subset_evidence(subset,nrfuzzykey,args)
    # Entries outside the graph are reported as families of their own.
    families.extend(nrfuzzykey[key] for key in singles)

    gpd_chunks = []
    table_chunks = []
    for family in families:
        # Evaluated up front as in the original flow; the value is unused.
        summary = family.get_info_string()
        gpdline = family.get_genepred_line()
        entry = GenePredEntry(gpdline)
        if not entry.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n"+gpdline+"\n"+family.get_info_string()+"\n")
            # Fall back to a member entry that has the most exons.
            by_exons = sorted(family.gpds,
                              key=lambda member: member.get_exon_count(),
                              reverse=True)
            entry = by_exons[0]
            family = FuzzyGenePred(entry,juntol=args.junction_tolerance*2)
            gpdline = family.get_genepred_line()
            if not entry.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_chunks.append(gpdline+"\n")
        if args.output_original_table:
            family_name = entry.entry['name']
            for member in family.gpds:
                table_chunks.append(family_name+"\t"+member.entry['name']+"\n")
        span = entry.get_bed()
        span.direction = None
        # Start from this entry's range when no location is known yet.
        location = (location or span).merge(span)
    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_chunks), "".join(table_chunks), locstring]
def main():
  """Stream fragments of every genepred entry through hisat for mappability.

  For each genepred entry at least ``--fragment_size`` long, every window of
  that size along its sequence is written as a FASTA record to a ``hisat``
  process.  hisat's output is piped straight into
  ``genepred_counts_to_mappability.py`` (looked up next to this script),
  which receives the threads / fragment size / perbase / output / type
  options.

  Each FASTA header is the encoded string
  "<name>\\t<gene_name>\\t<line number>\\t<sequence length>\\t<window offset>".
  """
  parser = argparse.ArgumentParser(description="For every genepred entry report its alignability",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="Genepred can be gzipped or - for STDIN")
  parser.add_argument('-r','--reference',required=True,help="Reference fasta")
  parser.add_argument('-k','--fragment_size',default=100,type=int,help="Fragment size to try to align")
  parser.add_argument('-x','--hisat_index',required=True,help="HISAT index base name")
  parser.add_argument('--threads',type=int,default=cpu_count(),help="number of threads")
  parser.add_argument('--type',choices=['mean','median'],default='mean',help="How to bring together overlapping reads")
  parser.add_argument('--perbase',action='store_true')
  parser.add_argument('--output','-o',help="output file or leave unset for STDOUT")
  args = parser.parse_args()

  if args.input == '-':
    args.input = sys.stdin
  elif args.input.endswith('.gz'):
    args.input = gzip.open(args.input)
  else:
    args.input = open(args.input)

  # Commands are built as argument lists rather than split from a string so
  # that paths containing whitespace stay a single argument.
  udir = os.path.dirname(os.path.realpath(__file__))
  cmd2 = [udir+'/genepred_counts_to_mappability.py', '-',
          '--threads', str(args.threads),
          '-k', str(args.fragment_size)]
  if args.perbase: cmd2.append('--perbase')
  if args.output: cmd2.extend(['--output', args.output])
  if args.type: cmd2.extend(['--type', args.type])
  p2 = Popen(cmd2,stdin=PIPE)
  ref = read_fasta_into_hash(args.reference)
  cmd1 = ['hisat', '-x', args.hisat_index, '-U', '-', '-f', '--reorder',
          '-p', str(args.threads)]
  # NOTE(review): 'null' is not defined in this function; presumably a
  # module-level handle on the null device -- confirm.
  p1 = Popen(cmd1,stdin=PIPE,stdout=p2.stdin,stderr=null)
  line_number = 0
  for line in args.input:
    line_number += 1
    gpd = GPD(line.rstrip())
    if gpd.length() < args.fragment_size: continue
    seq = gpd.get_sequence(ref)
    # Everything in the header except the window offset is per-entry.
    prefix = gpd.value('name')+"\t"+gpd.value('gene_name')+"\t"+str(line_number)+"\t"+str(len(seq))+"\t"
    for i in range(0,len(seq)-args.fragment_size+1):
      einfo = encode_name(prefix+str(i))
      p1.stdin.write('>'+einfo+"\n")
      p1.stdin.write(seq[i:i+args.fragment_size]+"\n")
  p1.communicate()
  p2.communicate()
def main():
    """Build random gene-pair fusions and write them as genepred and FASTA.

    Multi-exon entries of the transcript genepred are grouped by gene name,
    the gene names are shuffled and taken two at a time until
    ``--fusion_count`` pairs exist or fewer than two names remain.  For each
    pair ``get_random_gpds_from_pair`` supplies the genepred lines and a
    sequence record; the record's name is printed, its FASTA goes to
    ``--out_fasta`` and the genepred lines go to ``--out_gpd``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('reference_genome')
    parser.add_argument('transcripts_genepred')
    parser.add_argument('--out_gpd', help="fusion genepred", required=True)
    parser.add_argument('--out_fasta', help="fusion fasta", required=True)
    parser.add_argument(
        '--fusion_count',
        type=int,
        default=1000,
        help="Create this many fusions, max is number of genes/2.")
    args = parser.parse_args()
    ref = read_fasta_into_hash(args.reference_genome)
    # Context managers make sure both outputs are closed even when pair
    # generation fails part-way through.
    with open(args.out_gpd, 'w') as of_gpd, \
            open(args.out_fasta, 'w') as of_fasta:
        genes = {}
        with open(args.transcripts_genepred) as inf:
            for line in inf:
                gpd = GPD(line.rstrip())
                if gpd.value('exonCount') <= 1:
                    continue  # single-exon entries are not used
                genes.setdefault(gpd.value('gene_name'), []).append(gpd)
        # shuffle needs a real list, not a dict key view.
        gene_names = list(genes.keys())
        fusion_count = args.fusion_count
        shuffle(gene_names)
        # Consecutive shuffled names form a pair; stepping by two avoids
        # repeatedly popping from the front of the list.
        pairs = []
        for i in range(0, len(gene_names) - 1, 2):
            if len(pairs) == fusion_count:
                break
            pairs.append([gene_names[i], gene_names[i + 1]])
        for pair in pairs:
            [gpds, ars] = get_random_gpds_from_pair(pair, genes, ref)
            print(ars.name)
            of_fasta.write(ars.get_fasta())
            for gpd in gpds:
                of_gpd.write(gpd + "\n")
Ejemplo n.º 11
0
def process_locus(locus, args):
  """Report per-base depth of a locus as runs of constant depth.

  ``locus`` is a sequence of SAM entries taken to share one reference name
  (only the first entry's 'rname' is read).  Each entry is converted
  SAM -> PSL -> genepred and smoothed with ``args.min_intron``; identical
  exon intervals are counted once each time they occur, and the counts are
  summed per position.  Positions whose depth is below ``args.min_depth``
  are dropped.  Every maximal run of adjacent positions with the same depth
  is passed to ``output_depth`` as "<chr>\\t<start>\\t<end>\\t<depth>" with
  an exclusive end.
  """
  if not locus: return  # nothing to report, and locus[0] would fail
  s2psl = SAMtoPSLconversionFactory()
  chrom = locus[0].value('rname')
  # (start, end) of an exon -> number of times that exact exon was seen
  unique = {}
  for sam in locus:
    p = PSL(s2psl.convert_line(sam.get_line()))
    g = GenePredEntry(p.get_genepred_line())
    g = g.get_smoothed(args.min_intron)
    starts = g.value('exonStarts')
    ends = g.value('exonEnds')
    for i in range(0,g.get_exon_count()):
      key = (int(starts[i]), int(ends[i]))
      unique[key] = unique.get(key, 0) + 1
  depth = {}
  for (start, end), count in unique.items():
    for pos in range(start, end):
      depth[pos] = depth.get(pos, 0) + count
  # Walk positions in order and emit each run of constant depth.
  run_start = None
  run_depth = 0
  last_pos = None
  for pos in sorted(depth):
    current = depth[pos]
    if current < args.min_depth: continue
    if run_start is None:
      run_start = pos
    elif current != run_depth or pos != last_pos + 1:
      # A change in depth OR a coordinate gap (uncovered or filtered
      # positions) ends the run; otherwise separate exons with equal depth
      # would be reported as one interval spanning the gap between them.
      output_depth(chrom+"\t"+str(run_start)+"\t"+str(last_pos+1)+"\t"+str(run_depth),args)
      run_start = pos
    run_depth = current
    last_pos = pos
  # Compare against None: a run may legitimately start at coordinate 0.
  if run_start is not None:
    output_depth(chrom+"\t"+str(run_start)+"\t"+str(last_pos+1)+"\t"+str(run_depth),args)
def do_fuzzy(fz, sr, args):
    """Return the parts ``evaluate_junctions`` produces for ``fz`` as a list.

    ``fz`` is the fuzzy genepred being evaluated, ``sr`` the short-read
    junction evidence and ``args`` the parsed options; all three are passed
    through to ``evaluate_junctions`` unchanged.
    """
    # Tally of member entries; computed as before but not used further.
    cnt = len(fz.gpds)
    # The entry is still built from the generated line as in the original
    # flow; the object itself is not used afterwards.
    g = GenePredEntry(fz.get_genepred_line())
    return list(evaluate_junctions(fz, sr, args))
def main():
  """Build random gene-pair fusions and write them as genepred and FASTA.

  Multi-exon entries of the transcript genepred are grouped by gene name,
  the gene names are shuffled and taken two at a time until
  ``--fusion_count`` pairs exist or fewer than two names remain.  For each
  pair ``get_random_gpds_from_pair`` supplies the genepred lines and a
  sequence record; the record's name is printed, its FASTA goes to
  ``--out_fasta`` and the genepred lines go to ``--out_gpd``.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('reference_genome')
  parser.add_argument('transcripts_genepred')
  parser.add_argument('--out_gpd',help="fusion genepred",required=True)
  parser.add_argument('--out_fasta',help="fusion fasta",required=True)
  parser.add_argument('--fusion_count',type=int,default=1000,help="Create this many fusions, max is number of genes/2.")
  args = parser.parse_args()
  ref = read_fasta_into_hash(args.reference_genome)
  # Context managers make sure both outputs are closed even when pair
  # generation fails part-way through.
  with open(args.out_gpd,'w') as of_gpd, open(args.out_fasta,'w') as of_fasta:
    genes = {}
    with open(args.transcripts_genepred) as inf:
      for line in inf:
        gpd = GPD(line.rstrip())
        if gpd.value('exonCount') <= 1: continue  # single-exon entries are not used
        genes.setdefault(gpd.value('gene_name'), []).append(gpd)
    # shuffle needs a real list, not a dict key view.
    gene_names = list(genes.keys())
    fusion_count = args.fusion_count
    shuffle(gene_names)
    # Consecutive shuffled names form a pair; stepping by two avoids
    # repeatedly popping from the front of the list.
    pairs = []
    for i in range(0, len(gene_names)-1, 2):
      if len(pairs) == fusion_count: break
      pairs.append([gene_names[i],gene_names[i+1]])
    for pair in pairs:
      [gpds,ars] = get_random_gpds_from_pair(pair,genes,ref)
      print(ars.name)
      of_fasta.write(ars.get_fasta())
      for gpd in gpds:
        of_gpd.write(gpd+"\n")
def evaluate_junctions(fz, sr, args):
    """Split ``fz`` into genepred parts backed by short-read junction evidence.

    fz   -- fuzzy genepred; its ``gpds``, ``fuzzy_junctions``, ``start`` and
            ``end`` are read, and ``fz.copy()`` supplies a working copy that
            is modified instead of ``fz`` itself.
    sr   -- mapping whose values are dicts with a 'fzjun' junction and a
            'cnt' count.
    args -- options; ``junction_tolerance``, ``downsample`` and
            ``required_evidence`` are read.

    For every junction of ``fz`` the overlapping short-read junctions
    contribute their first 'junc' coordinate, once per supporting count but
    at most ``args.downsample`` times each.  Junctions with at least
    ``args.required_evidence`` collected coordinates count as supported;
    each maximal run of consecutive supported junctions becomes one part.

    Returns a list of ``[part, source_names]`` pairs, where ``part`` is a
    tab-separated genepred line without its two leading name columns and
    ``source_names`` lists the 'name' of every entry in ``fz.gpds``.
    Returns [] when there are no junctions.  Parts that do not validate as a
    genepred entry are reported on STDERR and skipped.
    """
    cnt = 0  # incremented below but never read
    source_names = [x.entry['name'] for x in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0: return []
    # Pass 1: reset each working junction's evidence lists and refill them
    # from the short-read junctions that overlap the original junction.
    for i in range(0, len(working.fuzzy_junctions)):
        newjun = working.fuzzy_junctions[i]
        newjun.left.get_payload()['junc'] = []
        newjun.right.get_payload()['junc'] = []
        oldjun = fz.fuzzy_junctions[i]
        for srjun in sr:
            sjun = sr[srjun]['fzjun']
            if oldjun.overlaps(sjun, args.junction_tolerance):
                # NOTE: this loop reuses the name `i`; newjun and oldjun were
                # bound before it, and the outer loop rebinds `i` on its next
                # iteration, so the outer iteration is not disturbed.
                for i in range(0, min(sr[srjun]['cnt'], args.downsample)):
                    newjun.left.get_payload()['junc'].append(
                        sjun.left.get_payload()['junc'][0])
                    newjun.right.get_payload()['junc'].append(
                        sjun.right.get_payload()['junc'][0])
                    cnt += 1
    # Pass 2: per junction, record the evidence count and -- for supported
    # junctions -- the exon start before it, the exon end after it and the
    # chosen left/right junction coordinates.  Unsupported junctions get
    # empty-list placeholders so the four lists stay index-aligned.
    juncs = []
    starts = []
    ends = []
    evidences = []
    for i in range(0, len(fz.fuzzy_junctions)):
        evidence = len(working.fuzzy_junctions[i].left.get_payload()['junc'])
        if evidence >= args.required_evidence:
            # start: transcript start for the first junction, else the
            # junction's own 'start' payload when set, else the previous
            # junction's right-side start
            if i == 0:
                starts.append(working.start.start)
            elif working.fuzzy_junctions[i].left.get_payload()['start']:
                starts.append(working.fuzzy_junctions[i].left.get_payload()
                              ['start'].start)
            else:
                starts.append(working.fuzzy_junctions[i - 1].right.start)
            # end: transcript end for the last junction, else the junction's
            # own 'end' payload when set, else the next junction's left-side end
            if i == len(fz.fuzzy_junctions) - 1:
                ends.append(working.end.end)
            elif working.fuzzy_junctions[i].right.get_payload()['end']:
                ends.append(
                    working.fuzzy_junctions[i].right.get_payload()['end'].end)
            else:
                ends.append(working.fuzzy_junctions[i + 1].left.end)
            # GenePredFuzzyBasics.mode presumably returns the most frequent
            # collected coordinate -- TODO confirm
            bestleft = GenePredFuzzyBasics.mode(
                working.fuzzy_junctions[i].left.get_payload()['junc'])
            bestright = GenePredFuzzyBasics.mode(
                working.fuzzy_junctions[i].right.get_payload()['junc'])
            juncs.append([bestleft, bestright])
            #print 'jun '+str(i)+' evid: '+str(evidence)+" "+str(bestleft)+" "+str(bestright)
        else:
            starts.append([])
            ends.append([])
            juncs.append([])
        evidences.append(evidence)
    #print juncs
    #print starts
    #print ends
    #print evidences
    # Pass 3: collect maximal runs of consecutive supported junction indices.
    runs = []
    current_run = []
    for i in range(0, len(evidences)):
        if evidences[i] < args.required_evidence:
            # an unsupported junction closes the run in progress
            if len(current_run) > 0:
                runs.append(current_run)
            current_run = []
            continue
        current_run.append(i)
    if len(current_run) > 0:
        runs.append(current_run)
    # now the runs are in runs
    #print 'runs:'
    # Pass 4: turn each run into exon start/end arrays and a genepred line.
    parts = []
    for run in runs:
        sarr = []
        sarr.append(starts[run[0]] - 1)  #put back to zero index
        earr = []
        for i in range(0, len(run)):
            # each junction ends one exon (left side) and starts the next
            # (right side, shifted by one like the first start above)
            sarr.append(juncs[run[i]][1] - 1)
            earr.append(juncs[run[i]][0])
        earr.append(ends[run[-1]])
        # ready to build a genepred!
        # Columns: chrom, strand, two identical start/end pairs, exon count,
        # exon starts, exon ends.  The strand is always written as '+'.
        part = ''
        part += str(working.start.chr) + "\t"
        part += '+' + "\t"
        part += str(sarr[0]) + "\t"
        part += str(earr[-1]) + "\t"
        part += str(sarr[0]) + "\t"
        part += str(earr[-1]) + "\t"
        part += str(len(sarr)) + "\t"
        part += ','.join([str(x) for x in sarr]) + ',' + "\t"
        part += ','.join([str(x) for x in earr]) + ','
        # Final quality check here
        # Placeholder name columns are prepended only for this validation.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" +
                             gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    #print parts
    return parts
def main():
    """Collect average read depth over exonic, intronic and intergenic windows.

    The genepred input defines gene, exon and per-chromosome ranges.
    Intergenic regions are the chromosome ranges minus genes padded by
    ``--intergenic_buffer``; introns are genes minus merged exons; both are
    broken into ``--window_size`` windows.  Depth intervals are then read
    from ``sam_to_bed_depth.py <bam_input>`` and credited to whichever
    window they overlap.  Whenever the stream moves past a window its
    average is recorded (windows with no depth are dropped unless
    ``--use_off_regions`` is given), and ``display`` is called after each
    recorded intergenic window.  Exon windows are only tracked with
    ``--get_exons``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions',
        action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    def consume(curr, depth, regions, averages, after_flush=None):
        """Retire regions ``curr`` has passed, then credit ``depth`` to the
        region it overlaps (if any)."""
        # Emptiness is tested first: retiring the last region must end the
        # loop instead of indexing an empty list.
        while regions and curr.cmp(regions[0]) > 0:
            passed = regions.pop(0)
            if len(passed.get_payload()) == 0 and not args.use_off_regions:
                continue
            averages.append(average(passed))
            if after_flush is not None:
                after_flush()
        if regions and curr.cmp(regions[0]) == 0:
            # one payload value per overlapping base
            size = curr.overlap_size(regions[0])
            regions[0].get_payload().extend([depth] * size)

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            # chr_beds grows into one merged range per chromosome
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            for i in range(0, g.get_exon_count()):
                erng = Bed(g.value('chrom'),
                           g.value('exonStarts')[i],
                           g.value('exonEnds')[i])
                exon_beds.append(erng)
    avglen = float(asum) / float(atot)
    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges(list(chr_beds.values()))
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds,
                                      padded_gene_beds,
                                      already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)
    sys.stderr.write("Going through short reads\n")
    # Argument list rather than a split string, so a BAM path containing
    # whitespace stays one argument.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE)
    # payloads are read depths
    for x in intron_beds:
        x.set_payload([])
    for x in intergenic_beds:
        x.set_payload([])
    for x in exon_beds:
        x.set_payload([])
    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    if not args.get_exons: exon_beds = []
    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line: break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + "          \r")
        pseudoreadcount += depth
        consume(curr, depth, exon_beds, exondepth)
        consume(curr, depth, intron_beds, introndepth)
        consume(curr, depth, intergenic_beds, intergenicdepth,
                after_flush=lambda: display(curr, introndepth,
                                            intergenicdepth, pseudoreadcount,
                                            avglen))
    if args.use_off_regions:
        # NOTE(review): windows retired inside the loop are averaged with
        # average(<range>), these with average(<payload>) -- confirm which
        # form average() expects.
        for x in exon_beds:
            # leftover exon windows belong with the exon depths
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()
Ejemplo n.º 16
0
def main():
  """Collect average read depth over exonic, intronic and intergenic windows.

  The genepred input defines gene, exon and per-chromosome ranges.
  Intergenic regions are the chromosome ranges minus genes padded by
  ``--intergenic_buffer``; introns are genes minus merged exons; both are
  broken into ``--window_size`` windows.  Depth intervals are then read from
  ``sam_to_bed_depth.py <bam_input>`` and credited to whichever window they
  overlap.  Whenever the stream moves past a window its average is recorded
  (windows with no depth are dropped unless ``--use_off_regions`` is given),
  and ``display`` is called after each recorded intergenic window.  Exon
  windows are only tracked with ``--get_exons``.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('gpd_input')
  parser.add_argument('bam_input')
  parser.add_argument('--intergenic_buffer',default=10000,type=int)
  parser.add_argument('--window_size',default=10000,type=int)
  parser.add_argument('--bin_size',default=1000,type=int)
  parser.add_argument('--use_off_regions',action='store_true',help="Use a region even if there is no reads mapped to it.")
  parser.add_argument('--get_exons',action='store_true')
  args = parser.parse_args()

  def consume(curr, depth, regions, averages, after_flush=None):
    """Retire regions ``curr`` has passed, then credit ``depth`` to the
    region it overlaps (if any)."""
    # Emptiness is tested first: retiring the last region must end the loop
    # instead of indexing an empty list.
    while regions and curr.cmp(regions[0]) > 0:
      passed = regions.pop(0)
      if len(passed.get_payload()) == 0 and not args.use_off_regions: continue
      averages.append(average(passed))
      if after_flush is not None: after_flush()
    if regions and curr.cmp(regions[0]) == 0:
      # one payload value per overlapping base
      size = curr.overlap_size(regions[0])
      regions[0].get_payload().extend([depth]*size)

  chr_beds = {}
  gene_beds = []
  exon_beds = []
  sys.stderr.write("Reading genepred file\n")
  asum = 0
  atot = 0
  with open(args.gpd_input) as inf:
    for line in inf:
      g = GenePredEntry(line)
      asum += g.length()
      atot += 1
      grng = g.get_bed()
      grng.direction = None
      # chr_beds grows into one merged range per chromosome
      if grng.chr not in chr_beds:
        chr_beds[grng.chr] = grng.copy()
      chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
      gene_beds.append(grng)
      for i in range(0,g.get_exon_count()):
        erng = Bed(g.value('chrom'),g.value('exonStarts')[i],g.value('exonEnds')[i])
        exon_beds.append(erng)
  avglen = float(asum)/float(atot)
  sys.stderr.write("Sorting gene bed\n")
  gene_beds = sort_ranges(gene_beds)
  gene_beds = merge_ranges(gene_beds,already_sorted=True)
  sys.stderr.write("Sorting chromosome beds\n")
  chr_beds = sort_ranges(list(chr_beds.values()))
  sys.stderr.write("Sorting exon beds\n")
  exon_beds = sort_ranges(exon_beds)
  sys.stderr.write("Get padded genes\n")
  padded_gene_beds = pad_ranges(gene_beds,args.intergenic_buffer,chr_beds)
  padded_gene_beds = merge_ranges(padded_gene_beds,already_sorted=True)
  sys.stderr.write("Get intergenic regions\n")
  intergenic_beds = subtract_ranges(chr_beds,padded_gene_beds,already_sorted=True)
  intergenic_beds = merge_ranges(intergenic_beds,already_sorted=True)
  intergenic_beds = window_break(intergenic_beds,args.window_size)
  sys.stderr.write("Get merged exons\n")
  exon_beds = merge_ranges(exon_beds)
  sys.stderr.write("Get introns\n")
  intron_beds = subtract_ranges(gene_beds,exon_beds,already_sorted=True)
  intron_beds = merge_ranges(intron_beds,already_sorted=True)
  intron_beds = window_break(intron_beds,args.window_size)
  sys.stderr.write("Going through short reads\n")
  # Argument list rather than a split string, so a BAM path containing
  # whitespace stays one argument.
  p = Popen(["sam_to_bed_depth.py",args.bam_input],stdout=PIPE)
  # payloads are read depths
  for x in intron_beds: x.set_payload([])
  for x in intergenic_beds: x.set_payload([])
  for x in exon_beds: x.set_payload([])
  introndepth = []
  intergenicdepth = []
  exondepth = []
  pseudoreadcount = 0
  if not args.get_exons: exon_beds = []
  section_count = 0
  while True:
    section_count += 1
    line = p.stdout.readline()
    if not line: break
    f = line.split("\t")
    depth = int(f[3])
    curr = Bed(f[0],int(f[1]),int(f[2]))
    if section_count %100==0: sys.stderr.write(curr.get_range_string()+"          \r")
    pseudoreadcount += depth
    consume(curr,depth,exon_beds,exondepth)
    consume(curr,depth,intron_beds,introndepth)
    consume(curr,depth,intergenic_beds,intergenicdepth,
            after_flush=lambda: display(curr,introndepth,intergenicdepth,pseudoreadcount,avglen))
  if args.use_off_regions:
    # NOTE(review): windows retired inside the loop are averaged with
    # average(<range>), these with average(<payload>) -- confirm which form
    # average() expects.
    # leftover exon windows belong with the exon depths
    for x in exon_beds: exondepth.append(average(x.get_payload()))
    for x in intron_beds: introndepth.append(average(x.get_payload()))
    for x in intergenic_beds: intergenicdepth.append(average(x.get_payload()))
  p.communicate()
Ejemplo n.º 17
0
def main():
  """Rename redundant gene and transcript entries of a GenePred file.

  Entries are read from the input file (or STDIN) and go through three
  passes, each of which can be switched off on the command line:

  1. Entries sharing chromosome, exon starts, exon ends and strand are
     collapsed to one. The survivor is the entry whose gene name has the
     most transcripts in the input (the first such entry wins a tie).
  2. Transcript names carried by more than one surviving entry are
     renamed to ``name[i/n]``.
  3. Gene names whose entries form more than one locus, as grouped by
     ``Loci`` using ``--minimum_locus_distance``, are renamed to
     ``gene[i/n]`` with one new name per locus.

  Surviving entries are written grouped by gene name, in sorted gene-name
  order.  When ``--output`` is given, one report file per enabled pass is
  written next to it (``.positional_duplicate_omissions``,
  ``.renamed_transcripts``, ``.renamed_genes``).
  """
  parser = argparse.ArgumentParser(description="Rename gene and transcript elements of GenePred file that are redundant.  Please specify an output if you would like report files generated for the filters.")
  parser.add_argument('input',help="GENEPREDFILE or '-' for STDIN")
  parser.add_argument('-o','--output',help="OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated")
  parser.add_argument('--minimum_locus_distance',type=int,default=500000,help="Genes with the same name will be renamed if this far apart")
  parser.add_argument('--keep_positional_duplicates',action='store_true',help="By default we remove one of the duplicate entries")
  parser.add_argument('--keep_transcript_names',action='store_true',help="By default we rename duplicated transcript names")
  parser.add_argument('--keep_gene_names',action='store_true',help="By default we rename genes located at different loci.")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input != '-': inf = open(args.input)
  of = sys.stdout
  if args.output: of = open(args.output,'w')

  # txdef: exact location key -> entries found at that location
  # gfams: gene name -> transcript names seen under that gene name
  txdef = {}
  gfams = {}
  for line in inf:
    if line[0] == '#': continue
    g = GenePredEntry(line)
    loc = g.value('chrom') + ':' +','.join([str(x) for x in g.value('exonStarts')]) + '-' + ','.join([str(x) for x in g.value('exonEnds')])+'/'+g.value('strand')
    txdef.setdefault(loc,[]).append(g)
    gfams.setdefault(g.value('gene_name'),[]).append(g.value('name'))

  # Pass 1: keep a single entry per exact location
  omissions = []
  keepers = []
  for loc in sorted(txdef.keys()):
    entries = txdef[loc]
    if args.keep_positional_duplicates or len(entries) == 1:
      # nothing to omit here
      keepers.extend(entries)
      continue
    sys.stderr.write("Found "+str(len(entries))+" entries at location\n")
    sys.stderr.write(loc +"\n")
    sys.stderr.write("They are:\n")
    largest = 0
    keepgene = None
    keepindex = -1
    for i, e in enumerate(entries):
      famsize = len(gfams[e.value('gene_name')])
      sys.stderr.write("     "+e.value('gene_name')+"\t"+e.value('name')+"\t"+str(famsize)+"\n")
      # strict comparison: the first entry with the largest family is kept
      if famsize > largest:
        keepgene = e
        largest = famsize
        keepindex = i
    for j, e in enumerate(entries):
      if j != keepindex: omissions.append(e)
      else: keepers.append(e)
    sys.stderr.write("     Biggest gene family is "+keepgene.value('gene_name')+" with "+str(largest)+" transcripts\n")
    sys.stderr.write("     so keep that one.\n")
  sys.stderr.write("Omitting "+str(len(omissions))+" entries for redundant positions\n")
  if args.output and not args.keep_positional_duplicates:
    with open(args.output+'.positional_duplicate_omissions','w') as of1:
      for g in omissions:
        of1.write(g.get_line()+"\n")

  # Pass 2: the keepers have unique locations; give them unique names
  tnames = {}
  renametx = {}
  for g in keepers:
    tnames.setdefault(g.value('name'),[]).append(g)
  if not args.keep_transcript_names:
    for name in tnames:
      nsize = len(tnames[name])
      if nsize <= 1: continue
      sys.stderr.write("Name: "+name+" has a family of size "+str(nsize)+"\n")
      for i, g in enumerate(tnames[name]):
        newname = name+'['+str(i+1)+'/'+str(nsize)+']'
        renametx[newname] = name
        g.entry['name'] = newname
  sys.stderr.write("Renamed: "+str(len(renametx))+" transcripts\n")
  if args.output and not args.keep_transcript_names:
    with open(args.output+'.renamed_transcripts','w') as of1:
      for name in sorted(renametx.keys()):
        of1.write(name+"\t"+renametx[name]+"\n")

  # Pass 3: arrange into gene families and split families spanning loci
  gnames = {}
  for name in tnames:
    for g in tnames[name]:
      gnames.setdefault(g.value('gene_name'),[]).append(g)
  renamegene = {}
  finished = []
  for gene in gnames:
    members = gnames[gene]
    if args.keep_gene_names or len(members) == 1:
      # renaming disabled, or a single entry cannot span several loci
      finished.extend(members)
      continue
    # Make sure these entries are really on the same locus.
    loci = Loci()
    loci.set_minimum_distance(args.minimum_locus_distance)
    for g in members:
      r = g.locus_range.copy()
      r.set_payload(g)
      locus = Locus()
      locus.add_member(r)
      loci.add_locus(locus)
    loci.update_loci()
    lcount = len(loci.loci)
    if lcount == 1:
      finished.extend(members)
      continue
    # need to rename some genes: one new name per locus
    for i in range(0,lcount):
      newname = gene+'['+str(i+1)+'/'+str(lcount)+']'
      rstr = loci.loci[i].range.get_range_string()
      renamegene[newname] = gene
      sys.stderr.write(newname+"\t"+rstr+"\n")
      for m in loci.loci[i].members:
        m.get_payload().entry['gene_name'] = newname
        finished.append(m.get_payload())
  sys.stderr.write("Renamed: "+str(len(renamegene))+" genes\n")
  # The gene report is tied to the gene-renaming switch (it used to be
  # gated on --keep_transcript_names by mistake).
  if args.output and not args.keep_gene_names:
    with open(args.output+'.renamed_genes','w') as of1:
      for name in sorted(renamegene.keys()):
        of1.write(name+"\t"+renamegene[name]+"\n")

  # Now lets resort by genes
  bygene = {}
  for g in finished:
    bygene.setdefault(g.value('gene_name'),[]).append(g)
  for gene in sorted(bygene.keys()):
    for g in bygene[gene]:
      of.write(g.get_line()+"\n")
  of.close()
  inf.close()
Ejemplo n.º 18
0
def main():
    """Rename redundant gene and transcript entries of a GenePred file.

    Entries are read from the input file (or STDIN) and go through three
    passes, each of which can be switched off on the command line:

    1. Entries sharing chromosome, exon starts, exon ends and strand are
       collapsed to one. The survivor is the entry whose gene name has
       the most transcripts in the input (the first such entry wins a
       tie).
    2. Transcript names carried by more than one surviving entry are
       renamed to ``name[i/n]``.
    3. Gene names whose entries form more than one locus, as grouped by
       ``Loci`` using ``--minimum_locus_distance``, are renamed to
       ``gene[i/n]`` with one new name per locus.

    Surviving entries are written grouped by gene name, in sorted
    gene-name order.  When ``--output`` is given, one report file per
    enabled pass is written next to it
    (``.positional_duplicate_omissions``, ``.renamed_transcripts``,
    ``.renamed_genes``).
    """
    parser = argparse.ArgumentParser(
        description=
        "Rename gene and transcript elements of GenePred file that are redundant.  Please specify an output if you would like report files generated for the filters."
    )
    parser.add_argument('input', help="GENEPREDFILE or '-' for STDIN")
    parser.add_argument(
        '-o',
        '--output',
        help=
        "OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated"
    )
    parser.add_argument(
        '--minimum_locus_distance',
        type=int,
        default=500000,
        help="Genes with the same name will be renamed if this far apart")
    parser.add_argument(
        '--keep_positional_duplicates',
        action='store_true',
        help="By default we remove one of the duplicate entries")
    parser.add_argument(
        '--keep_transcript_names',
        action='store_true',
        help="By default we rename duplicated transcript names")
    parser.add_argument(
        '--keep_gene_names',
        action='store_true',
        help="By default we rename genes located at different loci.")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input != '-': inf = open(args.input)
    of = sys.stdout
    if args.output: of = open(args.output, 'w')

    # txdef: exact location key -> entries found at that location
    # gfams: gene name -> transcript names seen under that gene name
    txdef = {}
    gfams = {}
    for line in inf:
        if line[0] == '#': continue
        g = GenePredEntry(line)
        loc = g.value('chrom') + ':' + ','.join(
            [str(x) for x in g.value('exonStarts')]) + '-' + ','.join(
                [str(x)
                 for x in g.value('exonEnds')]) + '/' + g.value('strand')
        txdef.setdefault(loc, []).append(g)
        gfams.setdefault(g.value('gene_name'), []).append(g.value('name'))

    # Pass 1: keep a single entry per exact location
    omissions = []
    keepers = []
    for loc in sorted(txdef.keys()):
        entries = txdef[loc]
        if args.keep_positional_duplicates or len(entries) == 1:
            # nothing to omit here
            keepers.extend(entries)
            continue
        sys.stderr.write("Found " + str(len(entries)) +
                         " entries at location\n")
        sys.stderr.write(loc + "\n")
        sys.stderr.write("They are:\n")
        largest = 0
        keepgene = None
        keepindex = -1
        for i, e in enumerate(entries):
            famsize = len(gfams[e.value('gene_name')])
            sys.stderr.write("     " + e.value('gene_name') + "\t" +
                             e.value('name') + "\t" + str(famsize) + "\n")
            # strict comparison: the first entry with the largest family
            # is the one that is kept
            if famsize > largest:
                keepgene = e
                largest = famsize
                keepindex = i
        for j, e in enumerate(entries):
            if j != keepindex: omissions.append(e)
            else: keepers.append(e)
        sys.stderr.write("     Biggest gene family is " +
                         keepgene.value('gene_name') + " with " +
                         str(largest) + " transcripts\n")
        sys.stderr.write("     so keep that one.\n")
    sys.stderr.write("Omitting " + str(len(omissions)) +
                     " entries for redundant positions\n")
    if args.output and not args.keep_positional_duplicates:
        with open(args.output + '.positional_duplicate_omissions',
                  'w') as of1:
            for g in omissions:
                of1.write(g.get_line() + "\n")

    # Pass 2: the keepers have unique locations; give them unique names
    tnames = {}
    renametx = {}
    for g in keepers:
        tnames.setdefault(g.value('name'), []).append(g)
    if not args.keep_transcript_names:
        for name in tnames:
            nsize = len(tnames[name])
            if nsize <= 1: continue
            sys.stderr.write("Name: " + name + " has a family of size " +
                             str(nsize) + "\n")
            for i, g in enumerate(tnames[name]):
                newname = name + '[' + str(i + 1) + '/' + str(nsize) + ']'
                renametx[newname] = name
                g.entry['name'] = newname
    sys.stderr.write("Renamed: " + str(len(renametx)) + " transcripts\n")
    if args.output and not args.keep_transcript_names:
        with open(args.output + '.renamed_transcripts', 'w') as of1:
            for name in sorted(renametx.keys()):
                of1.write(name + "\t" + renametx[name] + "\n")

    # Pass 3: arrange into gene families and split families spanning loci
    gnames = {}
    for name in tnames:
        for g in tnames[name]:
            gnames.setdefault(g.value('gene_name'), []).append(g)
    renamegene = {}
    finished = []
    for gene in gnames:
        members = gnames[gene]
        if args.keep_gene_names or len(members) == 1:
            # renaming disabled, or a single entry cannot span several loci
            finished.extend(members)
            continue
        # Make sure these entries are really on the same locus.
        loci = Loci()
        loci.set_minimum_distance(args.minimum_locus_distance)
        for g in members:
            r = g.locus_range.copy()
            r.set_payload(g)
            locus = Locus()
            locus.add_member(r)
            loci.add_locus(locus)
        loci.update_loci()
        lcount = len(loci.loci)
        if lcount == 1:
            finished.extend(members)
            continue
        # need to rename some genes: one new name per locus
        for i in range(0, lcount):
            newname = gene + '[' + str(i + 1) + '/' + str(lcount) + ']'
            rstr = loci.loci[i].range.get_range_string()
            renamegene[newname] = gene
            sys.stderr.write(newname + "\t" + rstr + "\n")
            for m in loci.loci[i].members:
                m.get_payload().entry['gene_name'] = newname
                finished.append(m.get_payload())
    sys.stderr.write("Renamed: " + str(len(renamegene)) + " genes\n")
    # The gene report is tied to the gene-renaming switch (it used to be
    # gated on --keep_transcript_names by mistake).
    if args.output and not args.keep_gene_names:
        with open(args.output + '.renamed_genes', 'w') as of1:
            for name in sorted(renamegene.keys()):
                of1.write(name + "\t" + renamegene[name] + "\n")

    # Now lets resort by genes
    bygene = {}
    for g in finished:
        bygene.setdefault(g.value('gene_name'), []).append(g)
    for gene in sorted(bygene.keys()):
        for g in bygene[gene]:
            of.write(g.get_line() + "\n")
    of.close()
    inf.close()
Ejemplo n.º 19
0
def evaluate_junctions(fz, sr, args):
    """Cut a fuzzy genepred into pieces backed by short-read junctions.

    Every junction of a working copy of ``fz`` collects the coordinates of
    the junctions in ``sr`` that overlap it within
    ``args.junction_tolerance``; each overlapping junction contributes
    ``min(count, args.downsample)`` observations.  Consecutive junctions
    with at least ``args.required_evidence`` observations form a run, and
    each run is turned into one genepred fragment.

    Args:
        fz: fuzzy genepred exposing ``gpds``, ``fuzzy_junctions``,
            ``start``, ``end`` and ``copy()``.
        sr: mapping whose values carry ``"fzjun"`` (a fuzzy junction) and
            ``"cnt"`` (its observation count).
        args: options providing ``junction_tolerance``, ``downsample`` and
            ``required_evidence``.

    Returns:
        A list of ``[part, source_names]`` pairs.  ``part`` is the
        tab-separated genepred text from the chromosome column onward
        (strand is always written as ``+``) and ``source_names`` lists the
        names of the entries in ``fz.gpds``.  Fragments rejected by
        ``GenePredEntry.is_valid()`` are reported on stderr and skipped.
        The list is empty when ``fz`` has no junctions.
    """
    source_names = [x.entry["name"] for x in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    # Gather short-read support for every junction of the working copy.
    for i in range(0, len(working.fuzzy_junctions)):
        newjun = working.fuzzy_junctions[i]
        newjun.left.get_payload()["junc"] = []
        newjun.right.get_payload()["junc"] = []
        oldjun = fz.fuzzy_junctions[i]
        for srjun in sr:
            sjun = sr[srjun]["fzjun"]
            if not oldjun.overlaps(sjun, args.junction_tolerance):
                continue
            left_pos = sjun.left.get_payload()["junc"][0]
            right_pos = sjun.right.get_payload()["junc"][0]
            # one observation per supporting read, capped by downsampling
            for _ in range(0, min(sr[srjun]["cnt"], args.downsample)):
                newjun.left.get_payload()["junc"].append(left_pos)
                newjun.right.get_payload()["junc"].append(right_pos)

    # Per junction: flanking exon start/end and the consensus junction
    # coordinates.  Unsupported junctions get [] placeholders so that the
    # list indices stay aligned with the junction indices.
    juncs = []
    starts = []
    ends = []
    evidences = []
    last = len(fz.fuzzy_junctions) - 1
    for i in range(0, len(fz.fuzzy_junctions)):
        jun = working.fuzzy_junctions[i]
        evidence = len(jun.left.get_payload()["junc"])
        if evidence >= args.required_evidence:
            if i == 0:
                starts.append(working.start.start)
            elif jun.left.get_payload()["start"]:
                starts.append(jun.left.get_payload()["start"].start)
            else:
                starts.append(working.fuzzy_junctions[i - 1].right.start)
            # now ends
            if i == last:
                ends.append(working.end.end)
            elif jun.right.get_payload()["end"]:
                ends.append(jun.right.get_payload()["end"].end)
            else:
                ends.append(working.fuzzy_junctions[i + 1].left.end)
            bestleft = GenePredFuzzyBasics.mode(jun.left.get_payload()["junc"])
            bestright = GenePredFuzzyBasics.mode(jun.right.get_payload()["junc"])
            juncs.append([bestleft, bestright])
        else:
            starts.append([])
            ends.append([])
            juncs.append([])
        evidences.append(evidence)

    # Group consecutive supported junction indices into runs.
    runs = []
    current_run = []
    for i in range(0, len(evidences)):
        if evidences[i] < args.required_evidence:
            if len(current_run) > 0:
                runs.append(current_run)
            current_run = []
            continue
        current_run.append(i)
    if len(current_run) > 0:
        runs.append(current_run)

    # Build one genepred fragment per run.
    parts = []
    for run in runs:
        sarr = [starts[run[0]] - 1]  # put back to zero index
        earr = []
        for idx in run:
            sarr.append(juncs[idx][1] - 1)
            earr.append(juncs[idx][0])
        earr.append(ends[run[-1]])
        # ready to build a genepred!
        part = "\t".join([
            str(working.start.chr),
            "+",
            str(sarr[0]),
            str(earr[-1]),
            str(sarr[0]),
            str(earr[-1]),
            str(len(sarr)),
            ",".join([str(x) for x in sarr]) + ",",
            ",".join([str(x) for x in earr]) + ",",
        ])
        # Final quality check here
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" + gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    return parts
Ejemplo n.º 20
0
def main():
    """Report the alignability of every genepred entry.

    For each entry at least ``--fragment_size`` bases long, every window
    of ``--fragment_size`` bases of its sequence is written as a FASTA
    record to a ``hisat`` process.  The record name encodes the transcript
    name, gene name, input line number, sequence length and window offset.
    The output of ``hisat`` is piped into
    ``genepred_counts_to_mappability.py``, which is expected to sit in the
    same directory as this script and receives the threads, fragment
    size, type, perbase and output options.
    """
    parser = argparse.ArgumentParser(
        description="For every genepred entry report its alignability",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Genepred can be gzipped or - for STDIN")
    parser.add_argument('-r',
                        '--reference',
                        required=True,
                        help="Reference fasta")
    parser.add_argument('-k',
                        '--fragment_size',
                        default=100,
                        type=int,
                        help="Fragment size to try to align")
    parser.add_argument('-x',
                        '--hisat_index',
                        required=True,
                        help="HISAT index base name")
    parser.add_argument('--threads',
                        type=int,
                        default=cpu_count(),
                        help="number of threads")
    parser.add_argument('--type',
                        choices=['mean', 'median'],
                        default='mean',
                        help="How to bring together overlapping reads")
    parser.add_argument('--perbase', action='store_true')
    parser.add_argument('--output',
                        '-o',
                        help="output file or leave unset for STDOUT")
    args = parser.parse_args()

    if args.input == '-':
        args.input = sys.stdin
    elif args.input.endswith('.gz'):
        args.input = gzip.open(args.input)
    else:
        args.input = open(args.input)

    # Commands are built as argument lists rather than split strings so
    # that paths containing whitespace reach the child processes intact.
    udir = os.path.dirname(os.path.realpath(__file__))
    cmd2 = [
        udir + '/genepred_counts_to_mappability.py', '-',
        '--threads', str(args.threads),
        '-k', str(args.fragment_size),
    ]
    if args.perbase: cmd2.append('--perbase')
    if args.output: cmd2 += ['--output', args.output]
    if args.type: cmd2 += ['--type', args.type]
    p2 = Popen(cmd2, stdin=PIPE)
    ref = read_fasta_into_hash(args.reference)
    cmd1 = [
        'hisat', '-x', args.hisat_index, '-U', '-', '-f', '--reorder',
        '-p', str(args.threads),
    ]
    # NOTE(review): `null` is not defined in this function; presumably a
    # module-level handle on the null device -- confirm.
    p1 = Popen(cmd1, stdin=PIPE, stdout=p2.stdin, stderr=null)
    line_number = 0
    for line in args.input:
        line_number += 1
        gpd = GPD(line.rstrip())
        # entries shorter than one fragment produce no windows
        if gpd.length() < args.fragment_size: continue
        seq = gpd.get_sequence(ref)
        for i in range(0, len(seq) - args.fragment_size + 1):
            info = "\t".join([
                gpd.value('name'),
                gpd.value('gene_name'),
                str(line_number),
                str(len(seq)),
                str(i),
            ])
            einfo = encode_name(info)
            p1.stdin.write('>' + einfo + "\n")
            p1.stdin.write(seq[i:i + args.fragment_size] + "\n")
    # Finish the aligner first, then the consumer of its output.
    p1.communicate()
    p2.communicate()
def load_from_inputs(args):
  """Build a biallelic transcriptome emitter from the input files.

  ``args.inputs`` is read positionally: ``[0]`` reference fasta, ``[1]``
  phased VCF, ``[2]`` transcripts genepred.  The steps are:

  1. Collect the phased genotype of every SNP in the VCF, per chromosome
     and position.
  2. Produce two copies of the reference by calling
     ``adjust_reference_genome`` with allele index 0 and 1 for every
     chromosome that has SNPs; other chromosomes are used unchanged.
  3. Load the genepred into one ``Transcriptome`` per allele.
  4. Group transcripts into loci, by gene name when
     ``args.locus_by_gene_name`` is set, otherwise through ``get_loci``.
  5. Load isoform expression (plain TSV or cufflinks) if provided.
  6. Assign each transcript a rho (allele 1 share): a constant
     (``args.ASE_identical``), random per transcript
     (``args.ASE_isoform_random``) or random per locus (default).

  Returns:
    A ``SimulationBasics.RandomBiallelicTranscriptomeEmitter`` with
    ``gene_names``, ``name2locus`` and the transcriptome 1 rho values set.
  """
  #Read in the VCF file
  sys.stderr.write("Reading in the VCF file\n")
  alleles = {}
  with open(args.inputs[1]) as inf:
    for line in inf:
      vcf = VCF(line)
      if not vcf.is_snp(): continue
      g = vcf.get_phased_genotype()
      if not g: continue
      chrom_alleles = alleles.setdefault(vcf.value('chrom'),{})
      if vcf.value('pos') in chrom_alleles:
        sys.stderr.write("WARNING: seeing the same position twice.\n"+line.rstrip()+"\n")
      chrom_alleles[vcf.value('pos')] = g # set our left and right

  sys.stderr.write("Reading in the reference genome\n")
  ref = read_fasta_into_hash(args.inputs[0])
  res1 = []
  res2 = []
  p = None
  sys.stderr.write("Introducing VCF changes to reference sequences\n")
  # Pretty memory intensive, so don't go with all possible threads
  if args.threads > 1: p = Pool(processes=max(1,int(args.threads/4)))
  for chrom in ref:
    if chrom not in alleles:
      # no allele information: both haplotypes keep the reference as is.
      # Results are wrapped in a Queue so that they expose .get() just
      # like the async results do.
      r1q = Queue()
      r1q.put([0,chrom,ref[chrom]])
      res1.append(r1q)
      r2q = Queue()
      r2q.put([0,chrom,ref[chrom]])
      res2.append(r2q)
    elif args.threads > 1:
      res1.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],0,chrom)))
      res2.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],1,chrom)))
    else:
      r1q = Queue()
      r1q.put(adjust_reference_genome(alleles[chrom],ref[chrom],0,chrom))
      res1.append(r1q)
      r2q = Queue()
      r2q.put(adjust_reference_genome(alleles[chrom],ref[chrom],1,chrom))
      res2.append(r2q)
  if args.threads > 1:
    p.close()
    p.join()

  # Each result is [number of changes, chromosome name, sequence].
  # now we can fill reference 1 with all our new sequences
  ref1 = {}
  c1 = 0
  for pending in res1:
    res = pending.get()
    c1 += res[0]
    ref1[res[1]]=res[2]

  # now we can fill reference 2 with all our new sequences
  ref2 = {}
  c2 = 0
  for pending in res2:
    res = pending.get()
    c2 += res[0]
    ref2[res[1]]=res[2]
  sys.stderr.write("Made "+str(c1)+"|"+str(c2)+" changes to the reference\n")

  # Now ref1 and ref2 are the diploid sources of the transcriptome
  gpdnames = {}
  txn1 = Transcriptome()
  txn2 = Transcriptome()
  txn1.set_reference_genome_dictionary(ref1)
  txn2.set_reference_genome_dictionary(ref2)
  with open(args.inputs[2]) as inf:
    for line in inf:
      if line[0]=='#': continue
      txn1.add_genepred_line(line.rstrip())
      txn2.add_genepred_line(line.rstrip())
      gpd = GenePredEntry(line.rstrip())
      gpdnames[gpd.value('name')] = gpd.value('gene_name')
  # The transcriptomes are set but we don't really need the references
  # anymore.  Empty our big memory things.  clear() empties the dicts in
  # place; deleting keys while iterating over keys() fails on Python 3.
  txn1.ref_hash = None
  txn2.ref_hash = None
  ref1.clear()
  ref2.clear()
  ref.clear()

  if not args.locus_by_gene_name:
    [locus2name,name2locus] = get_loci(args.inputs[2])
  else: # set locus by gene name
    sys.stderr.write("Organizing loci by gene name\n")
    locus2name = {}
    name2locus = {}
    numname = {}
    m = 0
    for name in sorted(gpdnames):
      gene = gpdnames[name]
      if gene not in numname:
        m+=1
        numname[gene] = m
      num = numname[gene]
      if num not in locus2name:
        locus2name[num] = set()
      locus2name[num].add(name)
      name2locus[name] = num
    sys.stderr.write("Ended with "+str(len(locus2name.keys()))+" loci\n")

  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      inf.readline() # skip the header line
      for line in inf:
        f = line.rstrip().split("\t")
        txn1.add_expression(f[0],float(f[1]))
        txn2.add_expression(f[0],float(f[1]))
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    cuffz = 0
    with open(args.cufflinks_isoform_expression) as inf:
      inf.readline() # skip the header line
      for line in inf:
        cuffz +=1
        sys.stderr.write(str(cuffz)+" cufflinks entries processed\r")
        f = line.rstrip().split("\t")
        txn1.add_expression_no_update(f[0],float(f[9]))
        txn2.add_expression_no_update(f[0],float(f[9]))
    txn1.update_expression()
    txn2.update_expression()
    sys.stderr.write("\n")
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  else:
    sys.stderr.write("Warning isoform expression not sepcified, using uniform expression model.\n")
  # Now we have the transcriptomes set
  rhos = {} # The ASE of allele 1 (the left side)
  randos = {}
  # NOTE(review): a seed of 0 is falsy and therefore treated as "no seed";
  # confirm whether that is intended.
  if args.seed:
    random.seed(args.seed)
  for z in locus2name: randos[z] = random.random()
  sys.stderr.write("Setting rho for each transcript\n")
  # Lets set rho for ASE for each transcript
  for tname in sorted(txn1.transcripts):
    if args.ASE_identical or args.ASE_identical == 0:
      rhos[tname] = float(args.ASE_identical)
    elif args.ASE_isoform_random:
      rhos[tname] = random.random()
    else: # we must be on locus random
      rhos[tname] = randos[name2locus[tname]]
  #Now our dataset is set up
  rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1,txn2)
  rbe.gene_names = gpdnames
  rbe.name2locus = name2locus
  rbe.set_transcriptome1_rho(rhos)
  return rbe
Ejemplo n.º 22
0
def do_prediction(compatible, args, nrfuzzykey, location):
    """Combine fuzzy genepreds into families and render them as text.

    Every fuzzy genepred in ``nrfuzzykey`` starts as its own family.  For
    each pair listed in ``compatible`` that shares at least one genepred
    line, the concatenation of the two (when ``concat_fuzzy_gpd`` yields
    one) is added as a further family.  Families that compare equal under
    ``is_equal_fuzzy`` are then merged, and each family's ``gpds`` list is
    reduced to unique genepred lines.

    Args:
        compatible: mapping of a key of ``nrfuzzykey`` to the keys it is
            compatible with.
        args: options providing ``junction_tolerance`` and
            ``output_original_table``.
        nrfuzzykey: mapping of key to fuzzy genepred object; each object
            has ``params['proper_set']`` set to False as a side effect.
        location: range to extend with every emitted genepred, or a falsy
            value to start from the first emitted genepred.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated
        genepred lines, the ``name<TAB>source name`` table lines (empty
        unless ``args.output_original_table``), and the range string of
        the merged location (empty when there is no location).
    """
    #all reads could be standing alone version
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        nrfuzzykey[num].params['proper_set'] = False  #partial overlap is enough
    for i in compatible:
        for j in compatible[i]:
            # only pairs sharing at least one genepred line are joined
            g1lines = set(g1.get_line() for g1 in nrfuzzykey[i].gpds)
            repeat = any(g2.get_line() in g1lines
                         for g2 in nrfuzzykey[j].gpds)
            if not repeat: continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)

    # now we need to find any duplicate entries and combine them
    newfam = []
    while len(families) > 0:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if not fam.is_equal_fuzzy(other):
                remaining.append(other)
                continue
            added = fam.add_fuzzy_gpd(other)
            if added:
                fam = added
            else:
                # Keep both families when the merge fails.  Overwriting
                # `fam` with the failed result would break the next
                # comparison or put a non-family into the output.
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                remaining.append(other)
        families = remaining
        newfam.append(fam)
    families = newfam

    # Replace the family with a set where we haven't used the same gpd line twice
    # This may damage the fuzzy object
    for fam in families:
        gset = set()
        for g in fam.gpds:
            gset.add(g.get_line())
        fam.gpds = [GenePredEntry(x) for x in gset]

    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # fall back to the member with the most exons and rebuild the
            # fuzzy object from it with a doubled junction tolerance
            gpd = sorted(
                fz.gpds, key=lambda x: x.get_exon_count(),
                reverse=True)[0]  #just grab one that has all the exons
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name + "\t" + g.entry['name'] + "\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)
    locstring = ''
    if location: locstring = location.get_range_string()
    return ["".join(gpd_out), "".join(table_out), locstring]
def load_from_inputs(args):
    """Build a biallelic transcriptome emitter from the input files.

    ``args.inputs`` is read positionally: ``[0]`` reference fasta, ``[1]``
    phased VCF, ``[2]`` transcripts genepred.  The steps are:

    1. Collect the phased genotype of every SNP in the VCF, per
       chromosome and position.
    2. Produce two copies of the reference by calling
       ``adjust_reference_genome`` with allele index 0 and 1 for every
       chromosome that has SNPs; other chromosomes are used unchanged.
    3. Load the genepred into one ``Transcriptome`` per allele.
    4. Group transcripts into loci, by gene name when
       ``args.locus_by_gene_name`` is set, otherwise through ``get_loci``.
    5. Load isoform expression (plain TSV or cufflinks) if provided.
    6. Assign each transcript a rho (allele 1 share): a constant
       (``args.ASE_identical``), random per transcript
       (``args.ASE_isoform_random``) or random per locus (default).

    Returns:
        A ``SimulationBasics.RandomBiallelicTranscriptomeEmitter`` with
        ``gene_names``, ``name2locus`` and the transcriptome 1 rho values
        set.
    """
    #Read in the VCF file
    sys.stderr.write("Reading in the VCF file\n")
    alleles = {}
    with open(args.inputs[1]) as inf:
        for line in inf:
            vcf = VCF(line)
            if not vcf.is_snp(): continue
            g = vcf.get_phased_genotype()
            if not g: continue
            chrom_alleles = alleles.setdefault(vcf.value('chrom'), {})
            if vcf.value('pos') in chrom_alleles:
                sys.stderr.write("WARNING: seeing the same position twice.\n" +
                                 line.rstrip() + "\n")
            chrom_alleles[vcf.value('pos')] = g  # set our left and right

    sys.stderr.write("Reading in the reference genome\n")
    ref = read_fasta_into_hash(args.inputs[0])
    res1 = []
    res2 = []
    p = None
    sys.stderr.write("Introducing VCF changes to reference sequences\n")
    # Pretty memory intensive, so don't go with all possible threads
    if args.threads > 1: p = Pool(processes=max(1, int(args.threads / 4)))
    for chrom in ref:
        if chrom not in alleles:
            # no allele information: both haplotypes keep the reference as
            # is.  Results are wrapped in a Queue so that they expose
            # .get() just like the async results do.
            r1q = Queue()
            r1q.put([0, chrom, ref[chrom]])
            res1.append(r1q)
            r2q = Queue()
            r2q.put([0, chrom, ref[chrom]])
            res2.append(r2q)
        elif args.threads > 1:
            res1.append(
                p.apply_async(adjust_reference_genome,
                              args=(alleles[chrom], ref[chrom], 0, chrom)))
            res2.append(
                p.apply_async(adjust_reference_genome,
                              args=(alleles[chrom], ref[chrom], 1, chrom)))
        else:
            r1q = Queue()
            r1q.put(
                adjust_reference_genome(alleles[chrom], ref[chrom], 0, chrom))
            res1.append(r1q)
            r2q = Queue()
            r2q.put(
                adjust_reference_genome(alleles[chrom], ref[chrom], 1, chrom))
            res2.append(r2q)
    if args.threads > 1:
        p.close()
        p.join()

    # Each result is [number of changes, chromosome name, sequence].
    # now we can fill reference 1 with all our new sequences
    ref1 = {}
    c1 = 0
    for pending in res1:
        res = pending.get()
        c1 += res[0]
        ref1[res[1]] = res[2]

    # now we can fill reference 2 with all our new sequences
    ref2 = {}
    c2 = 0
    for pending in res2:
        res = pending.get()
        c2 += res[0]
        ref2[res[1]] = res[2]
    sys.stderr.write("Made " + str(c1) + "|" + str(c2) +
                     " changes to the reference\n")

    # Now ref1 and ref2 are the diploid sources of the transcriptome
    gpdnames = {}
    txn1 = Transcriptome()
    txn2 = Transcriptome()
    txn1.set_reference_genome_dictionary(ref1)
    txn2.set_reference_genome_dictionary(ref2)
    with open(args.inputs[2]) as inf:
        for line in inf:
            if line[0] == '#': continue
            txn1.add_genepred_line(line.rstrip())
            txn2.add_genepred_line(line.rstrip())
            gpd = GenePredEntry(line.rstrip())
            gpdnames[gpd.value('name')] = gpd.value('gene_name')
    # The transcriptomes are set but we don't really need the references
    # anymore.  Empty our big memory things.  clear() empties the dicts in
    # place; deleting keys while iterating over keys() fails on Python 3.
    txn1.ref_hash = None
    txn2.ref_hash = None
    ref1.clear()
    ref2.clear()
    ref.clear()

    if not args.locus_by_gene_name:
        [locus2name, name2locus] = get_loci(args.inputs[2])
    else:  # set locus by gene name
        sys.stderr.write("Organizing loci by gene name\n")
        locus2name = {}
        name2locus = {}
        numname = {}
        m = 0
        for name in sorted(gpdnames):
            gene = gpdnames[name]
            if gene not in numname:
                m += 1
                numname[gene] = m
            num = numname[gene]
            if num not in locus2name:
                locus2name[num] = set()
            locus2name[num].add(name)
            name2locus[name] = num
        sys.stderr.write("Ended with " + str(len(locus2name.keys())) +
                         " loci\n")

    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            inf.readline()  # skip the header line
            for line in inf:
                f = line.rstrip().split("\t")
                txn1.add_expression(f[0], float(f[1]))
                txn2.add_expression(f[0], float(f[1]))
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        cuffz = 0
        with open(args.cufflinks_isoform_expression) as inf:
            inf.readline()  # skip the header line
            for line in inf:
                cuffz += 1
                sys.stderr.write(str(cuffz) + " cufflinks entries processed\r")
                f = line.rstrip().split("\t")
                txn1.add_expression_no_update(f[0], float(f[9]))
                txn2.add_expression_no_update(f[0], float(f[9]))
        txn1.update_expression()
        txn2.update_expression()
        sys.stderr.write("\n")
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    else:
        sys.stderr.write(
            "Warning isoform expression not sepcified, using uniform expression model.\n"
        )
    # Now we have the transcriptomes set
    rhos = {}  # The ASE of allele 1 (the left side)
    randos = {}
    # NOTE(review): a seed of 0 is falsy and therefore treated as "no
    # seed"; confirm whether that is intended.
    if args.seed:
        random.seed(args.seed)
    for z in locus2name:
        randos[z] = random.random()
    sys.stderr.write("Setting rho for each transcript\n")
    # Lets set rho for ASE for each transcript
    for tname in sorted(txn1.transcripts):
        if args.ASE_identical or args.ASE_identical == 0:
            rhos[tname] = float(args.ASE_identical)
        elif args.ASE_isoform_random:
            rhos[tname] = random.random()
        else:  # we must be on locus random
            rhos[tname] = randos[name2locus[tname]]
    #Now our dataset is set up
    rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1, txn2)
    rbe.gene_names = gpdnames
    rbe.name2locus = name2locus
    rbe.set_transcriptome1_rho(rhos)
    return rbe
def do_prediction(compatible, args, nrfuzzykey, location):
    """Combine compatible fuzzy genepreds into predictions and render them.

    Every fuzzy genepred in ``nrfuzzykey`` starts out as its own family.
    Each compatible pair that does not already share a genepred line is
    concatenated into an additional family, families that compare equal are
    folded together, and one genepred line is emitted per surviving family.

    Args:
        compatible: mapping of key ``i`` -> iterable of keys ``j``; both index
            into ``nrfuzzykey``.
        args: options object; ``junction_tolerance`` and
            ``output_original_table`` are read.
        nrfuzzykey: mapping of key -> fuzzy genepred object. These objects are
            modified in place (``params['proper_set']`` is cleared and
            ``gpds`` may be rebuilt).
        location: a range object to extend with every emitted entry, or a
            falsy value to start from the first emitted entry.

    Returns:
        A list ``[gpdlines, tablelines, locstring]``: the newline-terminated
        genepred lines, the tab-separated "prediction name / source name"
        lines (empty unless ``args.output_original_table``), and the range
        string of the merged location ('' when there is no location).
    """
    # Any read could stand alone, so seed the families with every entry.
    families = []
    for fuzzy in nrfuzzykey.values():
        fuzzy.params['proper_set'] = False  # partial overlap is enough
        families.append(fuzzy)

    for i in compatible:
        for j in compatible[i]:
            # Skip the pair when j contributes a genepred line that i
            # already holds; concatenating would count that read twice.
            lines_i = set(g.get_line() for g in nrfuzzykey[i].gpds)
            if any(g.get_line() in lines_i for g in nrfuzzykey[j].gpds):
                continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)

    # Find duplicate families and fold them into a single representative.
    merged = []
    while families:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if not fam.is_equal_fuzzy(other):
                remaining.append(other)
                continue
            added = fam.add_fuzzy_gpd(other)
            if added:
                fam = added
            else:
                # Keep the current family usable and retain the one that
                # could not be absorbed rather than replacing fam with a
                # falsy value (which would break the next comparison).
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                remaining.append(other)
        families = remaining
        merged.append(fam)
    families = merged

    # Make sure no family uses the same genepred line twice. First-seen order
    # is kept so the output does not depend on set/hash ordering.
    # This may damage the fuzzy object.
    for fam in families:
        seen = set()
        unique_lines = []
        for g in fam.gpds:
            line = g.get_line()
            if line not in seen:
                seen.add(line)
                unique_lines.append(line)
        fam.gpds = [GenePredEntry(x) for x in unique_lines]

    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n"+gpdline+"\n"+fz.get_info_string()+"\n")
            # Fall back to a single source entry: the first one with the
            # largest exon count.
            gpd = max(fz.gpds, key=lambda x: x.get_exon_count())
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance*2)
            gpdline = fz.get_genepred_line()
            # NOTE(review): this validates the chosen source entry, not the
            # regenerated gpdline -- confirm that is the intent.
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline+"\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name+"\t"+g.entry['name']+"\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = ''
    if location:
        locstring = location.get_range_string()
    return ["".join(gpd_out), "".join(table_out), locstring]