Beispiel #1
0
def main(blast_files, out_dir, qorg, sorg):
    """Collect the CNSs whose BLAST hits say they look like exons.

    Reads the raw CNS table ``<out_dir>/<qorg>_<sorg>.cns.txt`` and every
    file listed in `blast_files`, and records the e-value of each accepted
    hit, grouped by CNS key and by normalised subject name.

    A hit is accepted when either
      * its score is greater than 50, or
      * its score is at least 45 and ``hitlen * 3`` covers at least 90%
        of ``qend - qstart`` of the matching CNS.

    The result is passed to `write_exons` together with `out_dir` and a
    summary line is written to stderr.

    Returns a plain dict ``{cns_key: {subject: [eval, ...]}}``.
    Raises AssertionError when the raw CNS file does not exist, and
    KeyError when a weak hit refers to a CNS key missing from that file.
    """
    raw_cns = "%s/%s_%s.cns.txt" % (out_dir, qorg, sorg)
    assert os.path.exists(raw_cns)
    cns_by_id = dict(parse_raw_cns(raw_cns))

    exons = collections.defaultdict(dict)
    for blast_file in blast_files:
        # the context manager closes each blast file as soon as it is read.
        with open(blast_file) as blast_fh:
            for line in blast_fh:
                b = BlastLine(line)
                # chop the leading q__ / s__ marker off the query name.
                key = b.query[3:]

                # convert piped names to the short canonical name; split()
                # returns the whole string when there is no "|".
                subject = b.subject.split("|")[0]
                # chop At2g26540.1 to At2g26540. The slice (instead of
                # subject[-2]) keeps very short names from raising
                # IndexError.
                if subject[-2:-1] == ".":
                    subject = subject[:-2]
                subject = subject.replace('LOC_', '')

                if not b.score > 50:
                    # weaker hits must still reach the minimum score and
                    # cover most of the query side of the cns.
                    if b.score < 45:
                        continue
                    cns = cns_by_id[key]
                    qlen = cns['qend'] - cns['qstart']
                    coverage = (b.hitlen * 3.) / qlen
                    if coverage < 0.90:
                        continue

                exons[key].setdefault(subject, []).append(b.eval)

    exons = dict(exons)
    write_exons(exons, out_dir)
    sys.stderr.write("%i total unique cnss are exons\n" % (len(exons), ))
    return exons
Beispiel #2
0
def main(cnsfile, qfasta_file, sfasta_file, qorg, sorg, min_len):
    """Print the query and subject sequence of each CNS as FASTA records.

    For every CNS yielded by ``parse_raw_cns(cnsfile)`` two records are
    written to stdout: ``>q__<cns_id>`` with the query slice
    ``[qstart - 1:qend]`` and ``>s__<cns_id>`` with the subject slice
    between the sorted subject coordinates. The codes R, W and M are
    replaced by N and the sequence is printed upper-cased.

    A CNS is skipped when its query or subject span is shorter than
    `min_len`, or when both its query interval and its subject interval
    have already been seen on earlier rows.

    `qorg` and `sorg` are accepted for interface compatibility but are
    not used here.

    Raises AssertionError when the sorted subject coordinates are equal,
    or when a sequence still holds characters outside ``actgnx`` after
    the replacement.
    """
    qfasta = Fasta(qfasta_file)
    sfasta = Fasta(sfasta_file)

    # every (seqid, start, end) interval handled so far, query or subject.
    seen = set()

    for cns_id, cns in parse_raw_cns(cnsfile):
        qseq = qfasta[str(cns['qseqid'])]
        sseq = sfasta[str(cns['sseqid'])]

        # subject coordinates may come in either order.
        sstart, send = sorted((cns['sstart'], cns['send']))
        qkey = (cns['qseqid'], cns['qstart'], cns['qend'])
        skey = (cns['sseqid'], cns['sstart'], cns['send'])

        assert sstart < send

        if cns['qend'] - cns['qstart'] < min_len:
            continue
        if send - sstart < min_len:
            continue

        # only a pair whose two intervals were both seen before is a repeat.
        if not (qkey in seen and skey in seen):
            records = (('q', qseq, cns['qstart'], cns['qend']),
                       ('s', sseq, sstart, send))
            for side, seq, start, end in records:
                name = side + '__' + cns_id
                # single-argument print(...) prints the same text on
                # python 2 and python 3.
                print(">" + name)
                seqstr = str(seq[start - 1: end])
                for code in 'RWM':
                    seqstr = seqstr.replace(code, 'N')
                assert set(seqstr.lower()).issubset("actgnx"), \
                    (side, name, seqstr)
                print(seqstr.upper())

        seen.add(qkey)
        seen.add(skey)
def main(qbed, sbed, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from
    the cns list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the
    new-genes that _were_ cnss. the proper assignment is then handled in
    assign.py

    Steps:
      1. split the cnss of `cnsfile` into proteins (id found by
         `read_cns_to_protein_exons`), rnas (id found by
         `read_cns_to_rna`) and the remaining "real" cnss.
      2. give every protein/rna cns a new query and subject accession
         ``<seqid>_<start>_<end>_cns_<protein|rna>`` and record the strand
         of the gene it was previously assigned to.
      3. write the real cnss to ``<name>.real<ext>`` next to `cnsfile`,
         followed by one extra row per new gene returned by `get_new`
         (called with ``dist + 1000``); identical extra rows are written
         only once.
      4. merge the new genes into both beds and write them out with
         `print_bed`.

    Returns ``(qbed_new, sbed_new, qnew_pairs)``: the two values returned
    by `print_bed` and the pairs returned by `merge_bed` for the query
    side.
    """
    ortho_trees = read_orthos_to_trees(orthology_path, qbed, sbed)

    name, ext = op.splitext(cnsfile)
    outdir = op.dirname(cnsfile)

    crna = read_cns_to_rna(outdir)
    cpro = read_cns_to_protein_exons(outdir)

    proteins = collections.defaultdict(list)
    rnas = collections.defaultdict(list)
    real_cns_items = []
    for cns_id, cns in parse_raw_cns(cnsfile):
        key = (cns['qseqid'], cns['sseqid'])
        if cns_id in cpro:
            proteins[key].append((cns, cpro[cns_id]))
        elif cns_id in crna:
            rnas[key].append((cns, crna[cns_id]))
        else:
            real_cns_items.append((cns_id, cns))

    # the trees are built before the cns dicts get renamed in place below.
    p_trees = fill_tree(proteins)
    r_trees = fill_tree(rnas)

    def assign_new_names(prs, protein_or_rna):
        """Rename, in place, every cns in `prs` and attach its strands.

        Returns ``{seqid_pair: [(renamed_cns, info), ...]}``.
        """
        renamed = {}
        for seqid_pair, pairs in prs.items():
            bucket = renamed.setdefault(seqid_pair, [])
            for gnew, info in pairs:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                try:
                    # look the strand up through the cns being renamed
                    # (before its accession is overwritten), not through a
                    # variable left over from an earlier loop.
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # show the offending cns, then let the error propagate.
                    sys.stderr.write("%s\n" % (gnew,))
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                bucket.append((gnew, info))
        return renamed

    nproteins = assign_new_names(proteins, "protein")
    nrnas = assign_new_names(rnas, "rna")

    # the context manager flushes and closes the output before the beds
    # are merged and printed.
    with open("%s.real%s" % (name, ext), "w") as real_cns_fh:
        sys.stderr.write("writing to: %s\n" % (real_cns_fh.name,))
        real_cns_fh.write(
            "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval\n")

        cns_seen = set()
        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) within dist.
        for cns_id, cns in real_cns_items:
            real_cns_fh.write("%s\n" % (cns_to_str(cns),))
            key = (cns['qseqid'], cns['sseqid'])

            # proteins first, then rnas; both share the same bookkeeping.
            for trees, new_genes in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, info in get_new(cns, trees, key, new_genes,
                                          dist + 1000):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen:
                        continue
                    cns_seen.add(cns_str)
                    real_cns_fh.write("%s\n" % (cns_str,))

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, the new cnss were added to both lists, so we
    # send in both lists together. print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new, sbed_new, qnew_pairs