def get_loci(transcripts_genepred):
    """Cluster transcripts from a genepred file into overlapping loci.

    Each transcript's tx span becomes a single-member Locus; Loci then
    merges overlapping loci via update_loci().

    Returns [locus2name, name2locus]: locus2name maps a 1-based locus
    number to the set of transcript names it contains; name2locus maps
    each transcript name back to its locus number.
    """
    loci = Loci()
    loci.verbose = True
    with open(transcripts_genepred) as handle:
        for raw in handle:
            if raw[0] == '#':
                continue  # skip comment lines
            entry = GenePredEntry(raw.rstrip())
            span = Bed(entry.value('chrom'), entry.value('txStart'),
                       entry.value('txEnd'))
            # carry the transcript name along with its range
            span.set_payload(entry.value('name'))
            solo = Locus()
            solo.add_member(span)
            loci.add_locus(solo)
    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with " + str(len(loci.loci)) + " loci\n")
    loci.update_loci()
    sys.stderr.write("Ended with " + str(len(loci.loci)) + " loci\n")

    locus2name = {}
    name2locus = {}
    # number loci starting at 1, matching the original running counter
    for number, locus in enumerate(loci.loci, 1):
        for member in locus.members:
            transcript = member.get_payload()
            locus2name.setdefault(number, set()).add(transcript)
            name2locus[transcript] = number
    return [locus2name, name2locus]
def get_loci(transcripts_genepred):
  """Group genepred transcripts into overlapping loci.

  Builds one single-member Locus per transcript span, merges the
  overlapping ones with update_loci(), and returns
  [locus2name, name2locus]: a 1-based locus number -> set of
  transcript names, and transcript name -> locus number.
  """
  loci = Loci()
  loci.verbose = True
  with open(transcripts_genepred) as handle:
    for raw in handle:
      # ignore comment lines
      if raw[0] == '#':
        continue
      entry = GenePredEntry(raw.rstrip())
      span = Bed(entry.value('chrom'), entry.value('txStart'), entry.value('txEnd'))
      span.set_payload(entry.value('name'))  # remember the transcript name
      solo = Locus()
      solo.add_member(span)
      loci.add_locus(solo)
  sys.stderr.write("Organizing genepred data into overlapping loci\n")
  sys.stderr.write("Started with " + str(len(loci.loci)) + " loci\n")
  loci.update_loci()
  sys.stderr.write("Ended with " + str(len(loci.loci)) + " loci\n")

  locus2name = {}
  name2locus = {}
  for idx, locus in enumerate(loci.loci, 1):
    for member in locus.members:
      tname = member.get_payload()
      locus2name.setdefault(idx, set()).add(tname)
      name2locus[tname] = idx
  return [locus2name, name2locus]
def process_read(mpa, args):
    """Pick the best alignment path for one read and format a report.

    mpa is a multi-path alignment object whose .entries are the
    individual alignments of a single query; args carries the
    command-line thresholds (minimum_alignment_coverage,
    maximum_intron, fusion, maximum_query_overlap, maximum_query_gap,
    maximum_target_overlap, maximum_query_fraction_overlap,
    multipath_score_improvement).

    Returns [report, lines] where report is a tab-delimited summary
    and lines are the raw alignment lines of the chosen path, or None
    when no alignment survives the coverage filter.
    """
    # Filter entries by a minimum alignment coverage
    mpa.entries = [
        e for e in mpa.entries
        if e.get_coverage() > args.minimum_alignment_coverage
    ]

    # Find the best single alignment by coverage-weighted quality
    bestsingle = None
    bestsinglescore = -1
    for i in range(len(mpa.entries)):
        weightedcov = float(mpa.entries[i].get_coverage()) * float(
            mpa.entries[i].get_quality())
        if weightedcov > bestsinglescore:
            bestsinglescore = weightedcov
            bestsingle = i
    if bestsinglescore == -1:
        sys.stderr.write("failed to find a single path\n")
        return None
    my_max_intron = args.maximum_intron
    if args.fusion: my_max_intron = -1  # we can look any distance for a group
    mpa.compatible_graph(
        max_intron=my_max_intron,
        max_query_overlap=args.maximum_query_overlap,
        max_gap=args.maximum_query_gap,
        max_target_overlap=args.maximum_target_overlap,
        max_query_fraction_overlap=args.maximum_query_fraction_overlap)
    ps = mpa.get_root_paths()
    # default to the best single alignment in case no multi-path wins
    bestpath = [bestsingle]
    bestscore = 0
    besttotalcov = 0
    allscores = []
    allcov = []
    best_path_index = -1
    for zz, path in enumerate(ps):
        totalcov = sum(mpa.entries[i].get_coverage() for i in path)
        weightedcov = sum(
            float(mpa.entries[i].get_coverage()) *
            float(mpa.entries[i].get_quality()) for i in path)
        allscores.append(weightedcov)
        allcov.append(totalcov)
        if weightedcov > bestscore:
            bestscore = weightedcov
            bestpath = path
            besttotalcov = totalcov
            best_path_index = zz
    # For every non-best path, total query/target overlap with the best path
    otherpaths = [ps[i] for i in range(len(ps)) if i != best_path_index]
    query_target_coverages = []
    for other_path in otherpaths:
        qcov = 0
        tcov = 0
        for other_entry in [mpa.entries[i] for i in other_path]:
            for entry in [mpa.entries[j] for j in bestpath]:
                qcov += other_entry.query_overlap_size(entry)
                tcov += other_entry.target_overlap_size(entry)
        query_target_coverages.append(str(qcov) + '/' + str(tcov))

    # Query-coordinate gaps between consecutive members of the best path
    gapsizes = []
    if len(bestpath) > 1:
        gapsizes = [
            mpa.entries[bestpath[j + 1]].get_query_bed().start -
            mpa.entries[bestpath[j]].get_query_bed().end - 1
            for j in range(len(bestpath) - 1)
        ]

    # See if we should use the single path score instead.
    # BUGFIX: the original tested len(path) — the loop variable, which is
    # undefined when ps is empty and otherwise refers to the LAST path,
    # not the chosen best path.
    if len(bestpath) > 1 and bestsinglescore * (
            1 + args.multipath_score_improvement) > bestscore:
        bestpath = [bestsingle]
        besttotalcov = mpa.entries[bestsingle].get_coverage()
        bestscore = bestsinglescore
    query_span = mpa.entries[bestpath[0]].get_query_bed()
    # Count how many distinct target loci the best path spans
    loci = Loci()
    loci.set_use_direction(True)
    loci.set_minimum_distance(args.maximum_intron)
    for i in bestpath:
        r = mpa.entries[i].get_target_bed()
        locus = Locus()
        locus.set_use_direction(True)
        locus.add_member(r)
        loci.add_locus(locus)
    loci.update_loci()
    if len(bestpath) > 1:
        for i in bestpath[1:]:
            query_span = mpa.entries[i].get_query_bed().merge(query_span)
    # Tab-delimited summary of the chosen path
    report = ''
    report += mpa.entries[bestpath[0]].value('qName') + "\t"
    report += str(len(bestpath)) + "\t"
    report += str(len(loci.loci)) + "\t"
    report += query_span.get_range_string() + "\t"
    report += ','.join([mpa.entries[i].value('strand')
                        for i in bestpath]) + "\t"
    report += ','.join(
        [mpa.entries[i].get_query_bed().get_range_string()
         for i in bestpath]) + "\t"
    report += ','.join(
        [mpa.entries[i].get_target_bed().get_range_string()
         for i in bestpath]) + "\t"
    report += ','.join([str(mpa.entries[i].get_quality())
                        for i in bestpath]) + "\t"
    report += ','.join([str(mpa.entries[i].get_coverage())
                        for i in bestpath]) + "\t"
    report += ','.join([str(x) for x in gapsizes]) + "\t"
    report += str(besttotalcov) + "\t"
    report += str(bestscore) + "\t"
    report += str(bestsinglescore) + "\t"
    report += str(','.join(query_target_coverages) + "\t")
    return [report, [mpa.entries[i].get_line() for i in bestpath]]
# Example #4
# 0
def main():
    """Deduplicate and rename redundant GenePred entries.

    Pipeline: (1) drop positionally duplicate transcripts, keeping the
    one from the largest gene family; (2) rename duplicate transcript
    names as name[i/n]; (3) rename genes whose transcripts sit on
    separate loci as gene[i/n]; then write the result grouped by gene.
    Report files are only written when -o/--output is given.
    """
    parser = argparse.ArgumentParser(
        description=
        "Rename gene and transcript elements of GenePred file that are redundant.  Please specify an output if you would like report files generated for the filters."
    )
    parser.add_argument('input', help="GENEPREDFILE or '-' for STDIN")
    parser.add_argument(
        '-o',
        '--output',
        help=
        "OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated"
    )
    parser.add_argument(
        '--minimum_locus_distance',
        type=int,
        default=500000,
        help="Genes with the same name will be renamed if this far apart")
    parser.add_argument(
        '--keep_positional_duplicates',
        action='store_true',
        help="By default we remove one of the duplicate entries")
    parser.add_argument(
        '--keep_transcript_names',
        action='store_true',
        help="By default we rename duplicated transcript names")
    parser.add_argument(
        '--keep_gene_names',
        action='store_true',
        help="By default we rename genes located at different loci.")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input != '-': inf = open(args.input)
    of = sys.stdout
    if args.output: of = open(args.output, 'w')
    # txdef: exact exon-structure+strand location string -> entries there
    # gfams: gene name -> list of its transcript names (family size)
    txdef = {}
    gfams = {}
    for line in inf:
        if line[0] == '#': continue
        g = GenePredEntry(line)
        loc = g.value('chrom') + ':' + ','.join(
            [str(x) for x in g.value('exonStarts')]) + '-' + ','.join(
                [str(x)
                 for x in g.value('exonEnds')]) + '/' + g.value('strand')
        if loc not in txdef:
            txdef[loc] = []
        txdef[loc].append(g)
        if g.value('gene_name') not in gfams: gfams[g.value('gene_name')] = []
        gfams[g.value('gene_name')].append(g.value('name'))
    # now we have cataloged all transcripts by unique locations
    omissions = []
    keepers = []
    for loc in sorted(txdef.keys()):
        if args.keep_positional_duplicates:  # We don't want to omit anything here
            for g in txdef[loc]:
                keepers.append(g)
            continue  #basically skipping this part by populating keepers
        num = len(txdef[loc])
        if num > 1:
            sys.stderr.write("Found " + str(num) + " entries at location\n")
            sys.stderr.write(loc + "\n")
            sys.stderr.write("They are:\n")
            # keep the duplicate whose gene has the largest family
            largest = 0
            keepgene = None
            keepindex = -1
            i = 0
            for e in txdef[loc]:
                famsize = len(gfams[e.value('gene_name')])
                sys.stderr.write("     " + e.value('gene_name') + "\t" +
                                 e.value('name') + "\t" + str(famsize) + "\n")
                if famsize > largest:
                    keepgene = e
                    largest = famsize
                    keepindex = i
                i += 1
            for j in range(0, len(txdef[loc])):
                if j != keepindex: omissions.append(txdef[loc][j])
                else: keepers.append(txdef[loc][j])
            sys.stderr.write("     Biggest gene family is " +
                             keepgene.value('gene_name') + " with " +
                             str(largest) + " transcripts\n")
            sys.stderr.write("     so keep that one.\n")
        else:
            keepers.append(txdef[loc][0])
    sys.stderr.write("Omitting " + str(len(omissions)) +
                     " entries for redundant positions\n")
    if args.output and not args.keep_positional_duplicates:
        of1 = open(args.output + '.positional_duplicate_omissions', 'w')
        for g in omissions:
            of1.write(g.get_line() + "\n")
        of1.close()
    # Now the keepers contains transcripts with unique locations
    # Lets provide unique names to remaining transcripts
    tnames = {}
    renametx = {}
    for g in keepers:
        tx = g.value('name')
        if tx not in tnames: tnames[tx] = []
        tnames[tx].append(g)
    for name in tnames:
        if args.keep_transcript_names: continue  # We don't want to rename them
        nsize = len(tnames[name])
        if nsize > 1:
            sys.stderr.write("Name: " + name + " has a family of size " +
                             str(nsize) + "\n")
            for i in range(0, len(tnames[name])):
                newname = name + '[' + str(i + 1) + '/' + str(nsize) + ']'
                renametx[newname] = name
                tnames[name][i].entry['name'] = newname
    sys.stderr.write("Renamed: " + str(len(renametx)) + " transcripts\n")
    if args.output and not args.keep_transcript_names:
        of1 = open(args.output + '.renamed_transcripts', 'w')
        for name in sorted(renametx.keys()):
            of1.write(name + "\t" + renametx[name] + "\n")
        of1.close()
    #now we need to arrange into gene families
    gnames = {}
    for name in tnames:
        for g in tnames[name]:
            gene = g.value('gene_name')
            if gene not in gnames: gnames[gene] = []
            gnames[gene].append(g)
    renamegene = {}
    finished = []
    for gene in gnames:
        if args.keep_gene_names:
            for g in gnames[gene]:
                finished.append(g)
            continue  # We don't want to rename genes
        if len(gnames[gene]) == 1:
            finished.append(gnames[gene][0])
            continue
        # Now we need to make sure these genes are really on the same locus.
        loci = Loci()
        loci.set_minimum_distance(args.minimum_locus_distance)
        for g in gnames[gene]:
            r = g.locus_range.copy()
            r.set_payload(g)
            loc = Locus()
            loc.add_member(r)
            loci.add_locus(loc)
        loci.update_loci()
        lcount = len(loci.loci)
        if lcount == 1:
            for g in gnames[gene]:
                finished.append(g)
            continue
        # need to rename some genes
        for i in range(0, lcount):
            newname = gene + '[' + str(i + 1) + '/' + str(lcount) + ']'
            rstr = loci.loci[i].range.get_range_string()
            renamegene[newname] = gene
            sys.stderr.write(newname + "\t" + rstr + "\n")
            for m in loci.loci[i].members:
                m.get_payload().entry['gene_name'] = newname
                finished.append(m.get_payload())
    sys.stderr.write("Renamed: " + str(len(renamegene)) + " genes\n")
    # BUGFIX: gate the renamed_genes report on keep_gene_names (the flag
    # controlling gene renames), not keep_transcript_names.
    if args.output and not args.keep_gene_names:
        of1 = open(args.output + '.renamed_genes', 'w')
        for name in sorted(renamegene.keys()):
            of1.write(name + "\t" + renamegene[name] + "\n")
        of1.close()
    #Now lets resort by genes
    bygene = {}
    for g in finished:
        gene = g.value('gene_name')
        if gene not in bygene: bygene[gene] = []
        bygene[gene].append(g)
    for gene in sorted(bygene.keys()):
        for g in bygene[gene]:
            of.write(g.get_line() + "\n")
    of.close()
    inf.close()
def process_read(mpa, args):
    """Choose the best alignment path for one read and build its report.

    mpa holds the candidate alignments of a single query in .entries;
    args supplies the command-line thresholds.

    Returns [report, lines] — a tab-delimited summary and the raw
    alignment lines of the chosen path — or None when no alignment
    passes the coverage filter.
    """
    # Filter entries by a minimum alignment coverage
    mpa.entries = [
        e for e in mpa.entries
        if e.get_coverage() > args.minimum_alignment_coverage
    ]

    # Find the single alignment with the best coverage-weighted quality
    bestsingle = None
    bestsinglescore = -1
    for i in range(len(mpa.entries)):
        weightedcov = float(mpa.entries[i].get_coverage()) * float(
            mpa.entries[i].get_quality())
        if weightedcov > bestsinglescore:
            bestsinglescore = weightedcov
            bestsingle = i
    if bestsinglescore == -1:
        sys.stderr.write("failed to find a single path\n")
        return None
    my_max_intron = args.maximum_intron
    if args.fusion: my_max_intron = -1  # we can look any distance for a group
    mpa.compatible_graph(
        max_intron=my_max_intron,
        max_query_overlap=args.maximum_query_overlap,
        max_gap=args.maximum_query_gap,
        max_target_overlap=args.maximum_target_overlap,
        max_query_fraction_overlap=args.maximum_query_fraction_overlap)
    ps = mpa.get_root_paths()
    # fall back to the best single alignment if no path scores higher
    bestpath = [bestsingle]
    bestscore = 0
    besttotalcov = 0
    allscores = []
    allcov = []
    best_path_index = -1
    for zz, path in enumerate(ps):
        totalcov = sum(mpa.entries[i].get_coverage() for i in path)
        weightedcov = sum(
            float(mpa.entries[i].get_coverage()) *
            float(mpa.entries[i].get_quality()) for i in path)
        allscores.append(weightedcov)
        allcov.append(totalcov)
        if weightedcov > bestscore:
            bestscore = weightedcov
            bestpath = path
            besttotalcov = totalcov
            best_path_index = zz
    # overlap of each non-best path against the best path
    otherpaths = [ps[i] for i in range(len(ps)) if i != best_path_index]
    query_target_coverages = []
    for other_path in otherpaths:
        qcov = 0
        tcov = 0
        for other_entry in [mpa.entries[i] for i in other_path]:
            for entry in [mpa.entries[j] for j in bestpath]:
                qcov += other_entry.query_overlap_size(entry)
                tcov += other_entry.target_overlap_size(entry)
        query_target_coverages.append(str(qcov) + '/' + str(tcov))

    # query-coordinate gaps between consecutive best-path members
    gapsizes = []
    if len(bestpath) > 1:
        gapsizes = [
            mpa.entries[bestpath[j + 1]].get_query_bed().start -
            mpa.entries[bestpath[j]].get_query_bed().end - 1
            for j in range(len(bestpath) - 1)
        ]

    # See if we should use the single path score instead.
    # BUGFIX: the original checked len(path) — the loop variable, undefined
    # when ps is empty and otherwise the LAST path — instead of bestpath.
    if len(bestpath) > 1 and bestsinglescore * (
            1 + args.multipath_score_improvement) > bestscore:
        bestpath = [bestsingle]
        besttotalcov = mpa.entries[bestsingle].get_coverage()
        bestscore = bestsinglescore
    query_span = mpa.entries[bestpath[0]].get_query_bed()
    # count distinct target loci covered by the best path
    loci = Loci()
    loci.set_use_direction(True)
    loci.set_minimum_distance(args.maximum_intron)
    for i in bestpath:
        r = mpa.entries[i].get_target_bed()
        locus = Locus()
        locus.set_use_direction(True)
        locus.add_member(r)
        loci.add_locus(locus)
    loci.update_loci()
    if len(bestpath) > 1:
        for i in bestpath[1:]:
            query_span = mpa.entries[i].get_query_bed().merge(query_span)
    report = ''
    report += mpa.entries[bestpath[0]].value('qName') + "\t"
    report += str(len(bestpath)) + "\t"
    report += str(len(loci.loci)) + "\t"
    report += query_span.get_range_string() + "\t"
    report += ','.join([mpa.entries[i].value('strand') for i in bestpath]) + "\t"
    report += ','.join([mpa.entries[i].get_query_bed().get_range_string() for i in bestpath]) + "\t"
    report += ','.join([mpa.entries[i].get_target_bed().get_range_string() for i in bestpath]) + "\t"
    report += ','.join([str(mpa.entries[i].get_quality()) for i in bestpath]) + "\t"
    report += ','.join([str(mpa.entries[i].get_coverage()) for i in bestpath]) + "\t"
    report += ','.join([str(x) for x in gapsizes]) + "\t"
    report += str(besttotalcov) + "\t"
    report += str(bestscore) + "\t"
    report += str(bestsinglescore) + "\t"
    report += str(','.join(query_target_coverages) + "\t")
    return [report, [mpa.entries[i].get_line() for i in bestpath]]
def main():
  """Deduplicate and rename redundant GenePred entries.

  Steps: drop positionally duplicate transcripts (keeping the member of
  the largest gene family), rename duplicate transcript names as
  name[i/n], rename genes split across distant loci as gene[i/n], then
  emit entries grouped by gene.  Report files require -o/--output.
  """
  parser = argparse.ArgumentParser(description="Rename gene and transcript elements of GenePred file that are redundant.  Please specify an output if you would like report files generated for the filters.")
  parser.add_argument('input',help="GENEPREDFILE or '-' for STDIN")
  parser.add_argument('-o','--output',help="OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated")
  parser.add_argument('--minimum_locus_distance',type=int,default=500000,help="Genes with the same name will be renamed if this far apart")
  parser.add_argument('--keep_positional_duplicates',action='store_true',help="By default we remove one of the duplicate entries")
  parser.add_argument('--keep_transcript_names',action='store_true',help="By default we rename duplicated transcript names")
  parser.add_argument('--keep_gene_names',action='store_true',help="By default we rename genes located at different loci.")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input != '-': inf = open(args.input)
  of = sys.stdout
  if args.output: of = open(args.output,'w')
  # txdef: exon-structure+strand location string -> entries at that spot
  # gfams: gene name -> its transcript names (family size)
  txdef = {}
  gfams = {}
  for line in inf:
    if line[0] == '#': continue
    g = GenePredEntry(line)
    loc = g.value('chrom') + ':' +','.join([str(x) for x in g.value('exonStarts')]) + '-' + ','.join([str(x) for x in g.value('exonEnds')])+'/'+g.value('strand')
    if loc not in txdef:
      txdef[loc] = []
    txdef[loc].append(g)
    if g.value('gene_name') not in gfams: gfams[g.value('gene_name')] = []
    gfams[g.value('gene_name')].append(g.value('name'))
  # now we have cataloged all transcripts by unique locations
  omissions = []
  keepers = []
  for loc in sorted(txdef.keys()):
    if args.keep_positional_duplicates: # We don't want to omit anything here
      for g in txdef[loc]: keepers.append(g)
      continue #basically skipping this part by populating keepers
    num = len(txdef[loc])
    if num > 1:
      sys.stderr.write("Found "+str(num)+" entries at location\n")
      sys.stderr.write(loc +"\n")
      sys.stderr.write("They are:\n")
      # keep the duplicate belonging to the largest gene family
      largest = 0
      keepgene = None
      keepindex = -1
      i = 0
      for e in txdef[loc]:
        famsize = len(gfams[e.value('gene_name')])
        sys.stderr.write("     "+e.value('gene_name')+"\t"+e.value('name')+"\t"+str(famsize)+"\n")
        if famsize > largest:
          keepgene = e
          largest = famsize
          keepindex = i
        i+=1
      for j in range(0,len(txdef[loc])):
        if j != keepindex: omissions.append(txdef[loc][j])
        else: keepers.append(txdef[loc][j])
      sys.stderr.write("     Biggest gene family is "+keepgene.value('gene_name')+" with "+str(largest)+" transcripts\n")
      sys.stderr.write("     so keep that one.\n")
    else:
      keepers.append(txdef[loc][0])
  sys.stderr.write("Omitting "+str(len(omissions))+" entries for redundant positions\n")
  if args.output and not args.keep_positional_duplicates:
    of1 = open(args.output+'.positional_duplicate_omissions','w')
    for g in omissions:
      of1.write(g.get_line()+"\n")
    of1.close()
  # Now the keepers contains transcripts with unique locations
  # Lets provide unique names to remaining transcripts
  tnames = {}
  renametx = {}
  for g in keepers:
    tx = g.value('name')
    if tx not in tnames: tnames[tx] = []
    tnames[tx].append(g)
  for name in tnames:
    if args.keep_transcript_names: continue # We don't want to rename them
    nsize = len(tnames[name])
    if nsize > 1:
      sys.stderr.write("Name: "+name+" has a family of size "+str(nsize)+"\n")
      for i in range(0,len(tnames[name])):
        newname = name+'['+str(i+1)+'/'+str(nsize)+']'
        renametx[newname] = name
        tnames[name][i].entry['name'] = newname
  sys.stderr.write("Renamed: "+str(len(renametx))+" transcripts\n")
  if args.output and not args.keep_transcript_names:
    of1 = open(args.output+'.renamed_transcripts','w')
    for name in sorted(renametx.keys()):
      of1.write(name+"\t"+renametx[name]+"\n")
    of1.close()
  #now we need to arrange into gene families
  gnames = {}
  for name in tnames:
    for g in tnames[name]:
      gene = g.value('gene_name')
      if gene not in gnames:  gnames[gene] = []
      gnames[gene].append(g)
  renamegene = {}
  finished = []
  for gene in gnames:
    if args.keep_gene_names:
      for g in gnames[gene]: finished.append(g)
      continue # We don't want to rename genes
    if len(gnames[gene])==1:
      finished.append(gnames[gene][0])
      continue
    # Now we need to make sure these genes are really on the same locus.
    loci = Loci()
    loci.set_minimum_distance(args.minimum_locus_distance)
    for g in gnames[gene]:
      r = g.locus_range.copy()
      r.set_payload(g)
      loc = Locus()
      loc.add_member(r)
      loci.add_locus(loc)
    loci.update_loci()
    lcount = len(loci.loci)
    if lcount == 1:
      for g in gnames[gene]: finished.append(g)
      continue
    # need to rename some genes
    for i in range(0,lcount):
      newname = gene+'['+str(i+1)+'/'+str(lcount)+']'
      rstr = loci.loci[i].range.get_range_string()
      renamegene[newname] = gene
      sys.stderr.write(newname+"\t"+rstr+"\n")
      for m in loci.loci[i].members:
        m.get_payload().entry['gene_name'] = newname
        finished.append(m.get_payload())
  sys.stderr.write("Renamed: "+str(len(renamegene))+" genes\n")
  # BUGFIX: the renamed_genes report was gated on keep_transcript_names;
  # it should follow the gene-renaming flag keep_gene_names.
  if args.output and not args.keep_gene_names:
    of1 = open(args.output+'.renamed_genes','w')
    for name in sorted(renamegene.keys()):
      of1.write(name+"\t"+renamegene[name]+"\n")
    of1.close()
  #Now lets resort by genes
  bygene = {}
  for g in finished:
    gene = g.value('gene_name')
    if gene not in bygene: bygene[gene] = []
    bygene[gene].append(g)
  for gene in sorted(bygene.keys()):
    for g in bygene[gene]:
      of.write(g.get_line()+"\n")
  of.close()
  inf.close()