def main(arg):
    """
    Assemble the records of a FASTA file into one sequence and print it.

    Every ordered pair of distinct record names is passed to `get_overlap`.
    A truthy result is unpacked as (source, target, overlap) and stored as
    the outgoing edge of `source`.  The chain is then walked from the single
    record that is nobody's target; at each step the last `overlap`
    characters of the growing sequence are dropped before the next record's
    sequence is appended.
    """
    records = Fasta(arg)
    successor = {}            # source name -> (source, target, overlap)
    has_predecessor = set()   # names that occur as the target of some edge

    for left in records.keys():
        for right in records.keys():
            if left != right:
                hit = get_overlap(left, right, records)
                if hit:
                    # The names are rebound from the returned triple, so the
                    # remaining inner iterations compare against the returned
                    # source name rather than the outer loop's key.
                    left, right, olen = hit
                    successor[left] = (left, right, olen)
                    has_predecessor.add(right)

    # Linearize the graph: start at the only record without an incoming edge.
    heads = set(records.keys()).difference(has_predecessor)
    assert len(heads) == 1
    current = list(heads)[0]

    assembled = str(records[current].seq)
    while current in successor:
        _, following, olen = successor[current]
        assembled = assembled[:-olen] + str(records[following].seq)
        current = following

    print(assembled)
def main(arg):
    """
    Print each ordered pair of distinct FASTA record names for which
    `check_overlap` returns a truthy value, one space-separated pair per line.
    """
    records = Fasta(arg)
    for first in records.keys():
        for second in records.keys():
            # A record is never compared against itself.
            if first != second and check_overlap(first, second, records):
                print("%s %s" % (first, second))
def reindex(args):
    """
    %prog reindex gffile pep.fasta ref.pep.fasta

    Reindex the splice isoforms (mRNA) in input GFF file, preferably
    generated after PASA annotation update

    In the input GFF file, there can be several types of mRNA within a locus:
    * CDS matches reference, UTR extended, inherits reference mRNA ID
    * CDS (slightly) different from reference, inherits reference mRNA ID
    * Novel isoform added by PASA, have IDs like "LOCUS.1.1", "LOCUS.1.2"
    * Multiple mRNA collapsed due to shared structure, have IDs like "LOCUS.1-LOCUS.1.1"

    In the case of multiple mRNA which have inherited the same reference mRNA ID,
    break ties by comparing the new protein with the reference protein using
    EMBOSS `needle` to decide which mRNA retains ID and which is assigned a new ID.

    All mRNA identifiers should follow the AGI naming conventions.

    When reindexing the isoform identifiers, order mRNA based on:
    * decreasing transcript length
    * decreasing support from multiple input datasets used to run pasa.consolidate()
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output; implementation notes live in comments instead.
    #
    # args: command-line tokens; after option parsing exactly three
    #       positionals are required (gffile, pep, refpep), otherwise the
    #       help text is printed and the process exits.
    # Output: the re-identified GFF records are written to opts.outfile.
    #
    # Throughout, an mRNA is tracked as a 6-tuple:
    #   (original mRNA ID, reference isoform or None, new isoform label,
    #    start, summed exon length, length of the ID prefix)
    # and lists of them are ordered by (start asc, length desc, prefix desc).
    import os

    from jcvi.formats.gff import make_index
    from jcvi.formats.fasta import Fasta
    from jcvi.apps.emboss import needle
    from jcvi.formats.base import FileShredder
    from tempfile import mkstemp

    p = OptionParser(reindex.__doc__)
    p.add_option("--scores", type="str",
        help="read from existing EMBOSS `needle` scores file")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    gffile, pep, refpep, = args
    gffdb = make_index(gffile)
    reffasta = Fasta(refpep)

    if not opts.scores:
        fh, pairsfile = mkstemp(prefix='pairs', suffix=".txt", dir=".")
        # mkstemp hands back an already-open OS-level descriptor; the file is
        # reopened by name below, so release the descriptor instead of
        # leaking it.
        os.close(fh)
        fw = must_open(pairsfile, "w")
        # Collected once so the per-isoform membership test below does not
        # rebuild the key list on every lookup.
        ref_ids = set(reffasta.keys())

    # conflict[geneid][iso] -> mRNA tuples claiming reference isoform `iso`
    # novel[geneid]         -> mRNA tuples that will receive a fresh number
    conflict, novel = AutoVivification(), {}
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        geneid = atg_name(gene.id, retval='locus')
        novel[geneid] = []
        updated_mrna, hybrid_mrna = [], []
        for mrna in gffdb.children(gene, featuretype='mRNA', order_by=('seqid', 'start')):
            # Only IDs that match the naming pattern and contain no "_" are
            # considered for reindexing.
            if re.match(atg_name_pat, mrna.id) is not None and "_" not in mrna.id:
                pf, mrnaid = parse_prefix(mrna.id)
                mlen = gffdb.children_bp(mrna, child_featuretype='exon')
                # A "-" marks an ID built from several collapsed mRNA IDs.
                if "-" in mrna.id:
                    hybrid_mrna.append((mrna.id, mrna.start, mlen, len(pf)))
                else:
                    updated_mrna.append((mrna.id, mrna.start, mlen, len(pf)))

        for mrna in sorted(updated_mrna, key=lambda k:(k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            iso = atg_name(mrnaid, retval='iso')
            # Whatever is left of the ID once the pattern match is removed is
            # appended; an unchanged label means the mRNA carries a plain
            # reference-style isoform ID.
            newiso = "{0}{1}".format(iso, re.sub(atg_name_pat, "", mrnaid))
            if iso == newiso:
                if iso not in conflict[geneid]:
                    conflict[geneid][iso] = []
                conflict[geneid][iso].append((mrna[0], iso, newiso,
                    mstart, mlen, len(pf)))
            else:
                novel[geneid].append((mrna[0], None, newiso,
                    mstart, mlen, len(pf)))

        for mrna in sorted(hybrid_mrna, key=lambda k:(k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            _iso, _newiso = [], []
            for part in sorted(mrnaid.split("-")):
                a = atg_name(part, retval='iso')
                b = "{0}{1}".format(a, re.sub(atg_name_pat, "", part))
                _iso.append(a)
                _newiso.append(b)

            _novel = None
            newiso = "-".join(str(x) for x in set(_newiso))
            for iso, niso in zip(_iso, _newiso):
                if iso == niso:
                    # The first component whose reference isoform slot is
                    # still unclaimed takes that slot for the hybrid mRNA.
                    if iso not in conflict[geneid]:
                        conflict[geneid][iso] = \
                            [(mrna[0], iso, newiso, mstart, mlen, len(pf))]
                        _novel = None
                        break

                _novel = True

            # Reached without the break above: no component could claim a
            # slot, so the hybrid is treated as novel.
            if _novel is not None:
                novel[geneid].append((mrna[0], None, newiso,
                    mstart, mlen, len(pf)))

        if not opts.scores:
            # Emit "reference ID <TAB> candidate mRNA ID" pairs for every
            # claimed isoform that exists in the reference protein set.
            for isoform in sorted(conflict[geneid]):
                mrnaid = "{0}.{1}".format(geneid, isoform)
                if mrnaid in ref_ids:
                    for mrna in conflict[geneid][isoform]:
                        print >> fw, "\t".join(str(x) for x in (mrnaid, mrna[0]))

    scoresfile = None
    if not opts.scores:
        fw.close()
        needle([pairsfile, refpep, pep])
        FileShredder([pairsfile], verbose=False)
        # NOTE(review): presumably mirrors the output name chosen by
        # `needle`; keep the two in sync if either changes.
        scoresfile = "{0}.scores".format(pairsfile.rsplit(".")[0])
    else:
        scoresfile = opts.scores

    scores = read_scores(scoresfile, sort=True, trimsuffix=False)

    # primary[geneid] -> mRNA tuples that keep their reference isoform number
    primary = {}
    for geneid in conflict:
        primary[geneid] = []
        for iso in sorted(conflict[geneid]):
            conflict[geneid][iso].sort(key=lambda k:(k[3], -k[4], -k[5]))
            _iso = "{0}.{1}".format(geneid, iso)
            if _iso not in scores:
                # Nothing to compare against: every claimant becomes novel.
                novel[geneid].extend(conflict[geneid][iso])
                continue
            # Despite its name this value is compared with the mRNA ID
            # (tuple slot 0); presumably it is the ID of the best-scoring
            # candidate -- confirm against read_scores.
            top_score = scores[_iso][0][1]
            result = next((i for i, v in enumerate(conflict[geneid][iso]) if v[0] == top_score), None)
            if result is not None:
                primary[geneid].append(conflict[geneid][iso][result])
                del conflict[geneid][iso][result]
                if geneid not in novel:
                    novel[geneid] = []
                # NOTE(review): the remaining claimants are moved to novel
                # only when a winner was found -- confirm this nesting
                # against the upstream source.
                novel[geneid].extend(conflict[geneid][iso])
        novel[geneid].sort(key=lambda k:(k[3], -k[4], -k[5]))

    fw = must_open(opts.outfile, 'w')
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        geneid = gene.id
        print >> fw, gene
        seen = []
        if geneid in primary:
            # Primaries come first, followed by the novel mRNA; this extends
            # the list stored in `primary` in place.
            all_mrna = primary[geneid]
            all_mrna.extend(novel[geneid])
            for iso, mrna in enumerate(all_mrna):
                _mrna = gffdb[mrna[0]]
                _iso = mrna[1]
                if mrna not in novel[geneid]:
                    seen.append(int(mrna[1]))
                else:
                    # Continue numbering after the highest retained isoform:
                    # (iso - len(seen)) is this entry's offset past the
                    # primaries already recorded in `seen`.
                    mseen = 0 if len(seen) == 0 else max(seen)
                    _iso = (mseen + iso + 1) - len(seen)

                _mrnaid = "{0}.{1}".format(geneid, _iso)
                # The right-hand side is evaluated before either assignment.
                _mrna['ID'], _mrna['_old_ID'] = [_mrnaid], [_mrna.id]

                print >> fw, _mrna
                # ('start') is a plain string, not a 1-tuple.
                for c in gffdb.children(_mrna, order_by=('start')):
                    c['Parent'] = [_mrnaid]
                    print >> fw, c
        else:
            # Genes that never entered `primary` are written out unchanged.
            for feat in gffdb.children(gene, order_by=('seqid', 'start')):
                print >> fw, feat

    fw.close()