コード例 #1
0
def main(arg):
    """Merge the records of a FASTA file into one sequence and print it.

    Every ordered pair of distinct record keys is passed to `get_overlap`.
    Each truthy result is unpacked as a 3-tuple (source key, target key, i)
    and stored as an edge keyed by its source.  The single key that is never
    a target is taken as the starting record; the edges are then followed
    from it, each step dropping the last `i` characters of the sequence built
    so far before appending the next record's sequence.

    arg: passed unchanged to `Fasta` (presumably a FASTA file path -- confirm
         against the `Fasta` constructor).

    Raises AssertionError unless exactly one key has no incoming edge.
    """
    f = Fasta(arg)
    G = {}      # source key -> (source key, target key, i)
    iG = set()  # keys that appear as the target of some edge
    for a in f.keys():
        for b in f.keys():
            if a == b:
                continue

            ov = get_overlap(a, b, f)
            if not ov:
                continue
            # Unpack into separate names.  Reusing `a`/`b` here would
            # overwrite the loop variables, so the remaining inner iterations
            # would run with whatever key `get_overlap` happened to return.
            src, dst, olen = ov
            G[src] = (src, dst, olen)
            iG.add(dst)

    # linearize graph
    start = set(f.keys()) - iG
    assert len(start) == 1
    z = list(start)[0]
    seq = str(f[z].seq)

    # NOTE(review): only one outgoing edge is kept per key (later pairs
    # overwrite earlier ones), and a cycle reachable from the start key would
    # make this loop run forever -- confirm the input cannot produce one.
    while z in G:
        _, b, i = G[z]
        # `seq[:-i]` is the empty string when i == 0, which would discard
        # everything assembled so far; slice by an explicit end index
        # instead.  The max() keeps the original result (empty prefix) when
        # i exceeds the current length.
        seq = seq[:max(len(seq) - i, 0)] + str(f[b].seq)
        z = b
    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(seq)
コード例 #2
0
def main(arg):
    """Print each ordered pair of distinct record keys that overlap.

    Every key of the `Fasta` object built from `arg` is paired with every
    other key; a pair is printed as "first second" on its own line whenever
    `check_overlap(first, second, fasta)` returns a truthy value.
    """
    records = Fasta(arg)
    for left in records.keys():
        for right in records.keys():
            # Skip self-pairs before asking for an overlap.
            if left != right and check_overlap(left, right, records):
                print("%s %s" % (left, right))
コード例 #3
0
ファイル: reformat.py プロジェクト: zhaotao1987/jcvi
def reindex(args):
    """
    %prog reindex gffile pep.fasta ref.pep.fasta

    Reindex the splice isoforms (mRNA) in input GFF file, preferably
    generated after PASA annotation update

    In the input GFF file, there can be several types of mRNA within a locus:
    * CDS matches reference, UTR extended, inherits reference mRNA ID
    * CDS (slightly) different from reference, inherits reference mRNA ID
    * Novel isoform added by PASA, have IDs like "LOCUS.1.1", "LOCUS.1.2"
    * Multiple mRNA collapsed due to shared structure, have IDs like "LOCUS.1-LOCUS.1.1"

    In the case of multiple mRNA which have inherited the same reference mRNA ID,
    break ties by comparing the new protein with the reference protein using
    EMBOSS `needle` to decide which mRNA retains ID and which is assigned a new ID.

    All mRNA identifiers should follow the AGI naming conventions.

    When reindexing the isoform identifiers, order mRNA based on:
    * decreasing transcript length
    * decreasing support from multiple input datasets used to run pasa.consolidate()
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so editing it changes the program's help output.
    import os

    from jcvi.formats.gff import make_index
    from jcvi.formats.fasta import Fasta
    from jcvi.apps.emboss import needle
    from jcvi.formats.base import FileShredder
    from tempfile import mkstemp

    p = OptionParser(reindex.__doc__)
    p.add_option("--scores", type="str", \
        help="read from existing EMBOSS `needle` scores file")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    gffile, pep, refpep, = args
    gffdb = make_index(gffile)
    reffasta = Fasta(refpep)

    if not opts.scores:
        # No precomputed scores: collect (reference ID, candidate ID) pairs in
        # a temporary file that is later passed to `needle`.
        fh, pairsfile = mkstemp(prefix='pairs', suffix=".txt", dir=".")
        # mkstemp() returns an already-open OS-level descriptor.  The file is
        # reopened by path on the next line, so close the descriptor now
        # instead of leaking it for the rest of the run.
        os.close(fh)
        fw = must_open(pairsfile, "w")

    # conflict: geneid -> isoform -> list of candidates claiming that isoform
    # novel:    geneid -> list of mRNAs that will receive a new isoform number
    # Entries in both are 6-tuples:
    #   (original mRNA ID, isoform or None, new isoform label,
    #    mRNA start, exon bp, prefix length)
    # `conflict[geneid]` is indexed below without being created first, so
    # AutoVivification presumably creates nested levels on access -- confirm.
    conflict, novel = AutoVivification(), {}
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        geneid = atg_name(gene.id, retval='locus')
        novel[geneid] = []
        # updated_mrna: single-ID mRNAs; hybrid_mrna: IDs containing "-".
        # Items are (mRNA ID, start, exon bp, prefix length).
        updated_mrna, hybrid_mrna = [], []
        for mrna in gffdb.children(gene, featuretype='mRNA', order_by=('seqid', 'start')):
            # Only IDs that match atg_name_pat and contain no "_" are handled.
            if re.match(atg_name_pat, mrna.id) is not None and "_" not in mrna.id:
                pf, mrnaid = parse_prefix(mrna.id)
                mlen = gffdb.children_bp(mrna, child_featuretype='exon')
                if "-" in mrna.id:
                    hybrid_mrna.append((mrna.id, mrna.start, mlen, len(pf)))
                else:
                    updated_mrna.append((mrna.id, mrna.start, mlen, len(pf)))

        # Order: ascending start, then descending exon bp, then descending
        # prefix length.
        for mrna in sorted(updated_mrna, key=lambda k:(k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            iso = atg_name(mrnaid, retval='iso')
            # newiso is the isoform followed by whatever remains of the ID
            # once the atg_name_pat match is removed; it equals `iso` only
            # when nothing remains.
            newiso = "{0}{1}".format(iso, re.sub(atg_name_pat, "", mrnaid))
            if iso == newiso:
                if iso not in conflict[geneid]:
                    conflict[geneid][iso] = []
                conflict[geneid][iso].append((mrna[0], iso, newiso, \
                    mstart, mlen, len(pf)))
            else:
                novel[geneid].append((mrna[0], None, newiso, \
                    mstart, mlen, len(pf)))

        for mrna in sorted(hybrid_mrna, key=lambda k:(k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            # Compute (iso, newiso) for every "-"-separated component.
            _iso, _newiso = [], []
            for part in sorted(mrnaid.split("-")):
                a = atg_name(part, retval='iso')
                b = "{0}{1}".format(a, re.sub(atg_name_pat, "", part))
                _iso.append(a)
                _newiso.append(b)

            _novel = None
            # NOTE(review): joining a set means the component order in this
            # label is not guaranteed.
            newiso = "-".join(str(x) for x in set(_newiso))
            for iso, niso in zip(_iso, _newiso):
                if iso == niso:
                    # The first component whose isoform is still unclaimed
                    # makes this hybrid the only candidate for it.
                    if iso not in conflict[geneid]:
                        conflict[geneid][iso] = \
                            [(mrna[0], iso, newiso, mstart, mlen, len(pf))]
                        _novel = None
                        break

                # Reached for components with iso != niso and also for
                # components whose isoform is already claimed.
                _novel = True

            if _novel is not None:
                novel[geneid].append((mrna[0], None, newiso, \
                    mstart, mlen, len(pf)))

        if not opts.scores:
            # One tab-separated line per (reference ID, candidate ID), written
            # only when the reference ID is among the reference FASTA keys.
            for isoform in sorted(conflict[geneid]):
                mrnaid = "{0}.{1}".format(geneid, isoform)
                if mrnaid in reffasta.keys():
                    for mrna in conflict[geneid][isoform]:
                        print >> fw, "\t".join(str(x) for x in (mrnaid, mrna[0]))

    scoresfile = None
    if not opts.scores:
        fw.close()
        needle([pairsfile, refpep, pep])
        FileShredder([pairsfile], verbose=False)
        # NOTE(review): rsplit(".") without a maxsplit splits on every dot, so
        # this keeps only the text before the FIRST dot of the path.
        # Presumably it mirrors the name `needle` gives its output -- confirm
        # before changing it.
        scoresfile = "{0}.scores".format(pairsfile.rsplit(".")[0])
    else:
        scoresfile = opts.scores

    scores = read_scores(scoresfile, sort=True, trimsuffix=False)

    # primary: geneid -> candidates that keep their isoform number.
    primary = {}
    for geneid in conflict:
        primary[geneid] = []
        for iso in sorted(conflict[geneid]):
            conflict[geneid][iso].sort(key=lambda k:(k[3], -k[4], -k[5]))
            _iso = "{0}.{1}".format(geneid, iso)
            if _iso not in scores:
                # No score for this reference ID: every candidate gets a new
                # isoform number.
                novel[geneid].extend(conflict[geneid][iso])
                continue
            # Despite the name, this value is compared with candidate mRNA IDs
            # below; presumably it is the ID of the best-scoring hit --
            # confirm against read_scores().
            top_score = scores[_iso][0][1]
            result = next((i for i, v in enumerate(conflict[geneid][iso]) if v[0] == top_score), None)
            if result is not None:
                # The matching candidate keeps the isoform; the rest of the
                # list moves to novel.
                primary[geneid].append(conflict[geneid][iso][result])
                del conflict[geneid][iso][result]
                if geneid not in novel:
                    novel[geneid] = []
                novel[geneid].extend(conflict[geneid][iso])
            # NOTE(review): when no candidate matches, the candidates for this
            # isoform are added to neither primary nor novel -- confirm that
            # this is intended.
        novel[geneid].sort(key=lambda k:(k[3], -k[4], -k[5]))

    fw = must_open(opts.outfile, 'w')
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        # NOTE(review): `primary` is keyed by atg_name(gene.id, retval='locus')
        # while this lookup uses gene.id directly -- confirm they coincide.
        geneid = gene.id
        print >> fw, gene
        seen = []  # isoform numbers kept by primary mRNAs of this gene
        if geneid in primary:
            # Primary mRNAs first, then the novel ones (this extends the
            # primary[geneid] list in place).
            all_mrna = primary[geneid]
            all_mrna.extend(novel[geneid])
            for iso, mrna in enumerate(all_mrna):
                _mrna = gffdb[mrna[0]]
                _iso = mrna[1]
                if mrna not in novel[geneid]:
                    seen.append(int(mrna[1]))
                else:
                    # With the primaries listed first, novel mRNAs are
                    # numbered consecutively after the highest kept number.
                    mseen = 0 if len(seen) == 0 else max(seen)
                    _iso = (mseen + iso + 1) - len(seen)

                _mrnaid = "{0}.{1}".format(geneid, _iso)
                # The right-hand side is evaluated before either assignment,
                # so '_old_ID' records the ID from before the rename.
                _mrna['ID'], _mrna['_old_ID'] = [_mrnaid], [_mrna.id]

                print >> fw, _mrna
                for c in gffdb.children(_mrna, order_by=('start')):
                    c['Parent'] = [_mrnaid]
                    print >> fw, c
        else:
            # Genes with no entry in `primary` are written out unchanged.
            for feat in gffdb.children(gene, order_by=('seqid', 'start')):
                print >> fw, feat

    fw.close()