コード例 #1
0
ファイル: cns_to_fasta.py プロジェクト: yuzhenpeng/find_cns
def main(cnsfile, qfasta_file, sfasta_file, qorg, sorg, min_len):
    """empty docstring"""
    lens = []
    qfasta = Fasta(qfasta_file)
    sfasta = Fasta(sfasta_file)

    seen = {}

    lens_append = lens.append
    qseq, sseq = None, None
    # so we only read a new fasta file as needed.
    last_qchr, last_schr = None, None

    seen = {}
    for cns in CNS.parse_raw_line(cnsfile):
        #if is_intron(cns,qbed): continue
        qseq = qfasta[str(cns.qseqid)]
        sseq = sfasta[str(cns.sseqid)]

        sstart, send = sorted((cns.sstart, cns.sstop))
        qkey = (cns.qseqid, cns.qstart, cns.qstop)
        skey = (cns.sseqid, cns.sstart, cns.sstop)

        assert sstart < send

        if cns.qstop - cns.qstart < min_len: continue
        if send - sstart < min_len: continue

        if not (qkey in seen and skey in seen):
            print ">q__" + cns.cns_id
            seqstr = str(qseq[cns.qstart - 1:cns.qstop]).replace(
                'R', 'N').replace('W', 'N').replace('M', 'N')
            assert set(seqstr.lower()).issubset("actgnx"), ('q',
                                                            'q__' + cns.cns_id,
                                                            seqstr)
            print seqstr.upper()

            print ">s__" + cns.cns_id
            seqstr = str(sseq[sstart - 1:send]).replace('R', 'N').replace(
                'W', 'N').replace('M', 'N')
            assert set(seqstr.lower()).issubset("actgnx"), ('s',
                                                            's__' + cns.cns_id,
                                                            seqstr)
            print seqstr.upper()

        seen[qkey] = 1
        seen[skey] = 1
コード例 #2
0
ファイル: find_exons.py プロジェクト: yuzhenpeng/find_cns
def main(blast_files, out_dir,raw_cns):
    """Collect, per CNS, the blast subjects that mark it as an exon.

    blast_files -- iterable of paths; each line is parsed with ``BlastLine``.
    out_dir     -- forwarded to ``write_exons`` together with the result.
    raw_cns     -- passed to ``CNS.parse_raw_line`` to index CNSs by id.

    A hit is recorded when its score is above 50, or when its score is in
    the range 45..50 and ``hitlen * 3`` covers at least 90% of the CNS
    query span. Hits scoring below 45 are ignored.

    Returns a plain dict ``{cns key: {subject name: [evalue, ...]}}``; the
    same dict is handed to ``write_exons`` and its size is reported on
    stderr.

    Raises KeyError when a hit in the 45..50 score range refers to a CNS id
    that is not present in ``raw_cns``.
    """
    cns_by_id = dict((cns.cns_id, cns)
                     for cns in CNS.parse_raw_line(raw_cns))

    exons = collections.defaultdict(dict)
    for blast_file in blast_files:
        blast_fh = open(blast_file)
        try:
            for line in blast_fh:
                b = BlastLine(line)
                # chop the q__ and s__
                key = b.query[3:]
                # convert piped rice names to short canonical names.
                subject = b.subject.split("|")[0]
                # chop At2g26540.1 to At2g26540 (the length check keeps
                # one-character names from raising IndexError).
                if len(subject) >= 2 and subject[-2] == ".":
                    subject = subject[:-2]
                subject = subject.replace('LOC_', '')

                if b.score <= 50:
                    if b.score < 45:
                        continue
                    # Borderline score: additionally require that the hit
                    # covers most of the CNS query span.
                    cns = cns_by_id[key]
                    qlen = cns.qstop - cns.qstart
                    coverage = (b.hitlen * 3.) / qlen
                    if coverage < 0.90:
                        continue

                # exons[key] is only touched for accepted hits, so the
                # final count contains no empty entries.
                exons[key].setdefault(subject, []).append(b.eval)
        finally:
            blast_fh.close()

    exons = dict(exons)
    write_exons(exons, out_dir)
    print >>sys.stderr, "%i total unique cnss are exons" % (len(exons), )
    return exons
コード例 #3
0
ファイル: cns_to_fasta.py プロジェクト: gturco/find_cns
def main(cnsfile, qfasta_file, sfasta_file, qorg, sorg, min_len):
    """empty docstring"""
    lens = []
    qfasta = Fasta(qfasta_file)
    sfasta = Fasta(sfasta_file)

    seen = {}

    lens_append = lens.append
    qseq, sseq = None, None
    # so we only read a new fasta file as needed.
    last_qchr, last_schr = None, None

    seen = {}
    for cns in CNS.parse_raw_line(cnsfile):
        #if is_intron(cns,qbed): continue
        qseq = qfasta[str(cns.qseqid)]
        sseq = sfasta[str(cns.sseqid)]


        sstart, send = sorted((cns.sstart, cns.sstop))
        qkey = (cns.qseqid, cns.qstart, cns.qstop)
        skey = (cns.sseqid, cns.sstart, cns.sstop)

        assert sstart < send

        if cns.qstop - cns.qstart < min_len: continue
        if send - sstart < min_len: continue


        if not (qkey in seen and skey in seen):
            print ">q__" + cns.cns_id
            seqstr = str(qseq[cns.qstart - 1: cns.qstop]).replace('R', 'N').replace('W', 'N').replace('M', 'N')
            assert set(seqstr.lower()).issubset("actgnx"), ('q', 'q__' + cns.cns_id, seqstr)
            print seqstr.upper()

            print ">s__" + cns.cns_id
            seqstr = str(sseq[sstart - 1: send]).replace('R', 'N').replace('W', 'N').replace('M', 'N')
            assert set(seqstr.lower()).issubset("actgnx"), ('s', 's__' + cns.cns_id, seqstr)
            print seqstr.upper()

        seen[qkey] = 1
        seen[skey] = 1
コード例 #4
0
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from
    the cns list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the new-genes
    that _were_ cnss. the proper assignment is then handled in assign.py

    qbed_path, sbed_path -- bed files of the query and subject; may be the
                            same path.
    cnsfile              -- raw cns file, read with ``CNS.parse_raw_line``.
                            The remaining cnss are written next to it as
                            ``<name>.real<ext>``.
    dist                 -- distance handed to ``get_new`` as ``dist + 1000``.
    orthology_path       -- passed to ``read_orthos_to_trees``.

    Side effects: (re)creates ``<bed>_cns.gff`` for each distinct bed path
    containing only the gff-version header, writes the ``.real`` cns file
    and writes new bed files through ``print_bed``.

    Returns ``(qbed_new.path, sbed_new.path, qnew_pairs)``.
    """
    def start_cns_gff(bed_path):
        # Create the "_cns.gff" companion of a bed file with just the GFF
        # version header and close it right away so the handle cannot leak.
        gff_path = bed_path.replace(".bed", "_cns.gff")
        assert gff_path != bed_path
        gff = open(gff_path, 'w')
        try:
            print >>gff, "##gff-version 3"
        finally:
            gff.close()

    start_cns_gff(qbed_path)
    if sbed_path != qbed_path:
        start_cns_gff(sbed_path)

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)

    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path); qbed.fill_dict()
    sbed = Bed(sbed_path); sbed.fill_dict()

    def assign_new_names(prs, protein_or_rna):
        # Rename, in place, every cns dict in ``prs`` to
        # "<seqid>_<start>_<end>_cns_<protein_or_rna>" and attach the
        # strands of the genes it was previously assigned to.
        n = {}
        for seqid_pair, li in prs.iteritems():
            renamed = n.setdefault(seqid_pair, [])
            for gnew, info in li:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # Show the offending record, then let the error go on.
                    print >>sys.stderr, gnew
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                renamed.append((gnew, info))
        return n

    name, ext = op.splitext(cnsfile)
    real_cns_fh = open("%s.real%s" % (name, ext), "w")
    try:
        print >>sys.stderr, "writing to:", real_cns_fh.name
        outdir = op.dirname(cnsfile)
        print >>real_cns_fh, "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval"

        crna = read_cns_to_rna(outdir)
        cpro = read_cns_to_protein_exons(outdir)

        # Split the cnss into those called protein, those called rna and
        # the rest ("real" cnss); protein takes precedence over rna.
        proteins = collections.defaultdict(list)
        rnas = collections.defaultdict(list)
        real_cns_items = []
        for cnsi in CNS.parse_raw_line(cnsfile):
            cns_id = cnsi.cns_id
            cns = cnsi.to_dict()
            key = (cns['qseqid'], cns['sseqid'])
            if cns_id in cpro:
                proteins[key].append((cns, cpro[cns_id]))
            elif cns_id in crna:
                rnas[key].append((cns, crna[cns_id]))
            else:
                real_cns_items.append((cns_id, cns))

        # The trees are built before the dicts are renamed below.
        p_trees = fill_tree(proteins)
        r_trees = fill_tree(rnas)

        nproteins = assign_new_names(proteins, "protein")
        nrnas = assign_new_names(rnas, "rna")

        cns_seen = set()
        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) in within dist.
        for cns_id, cns in real_cns_items:
            # The original line is always written; only the re-assigned
            # copies below are de-duplicated.
            print >>real_cns_fh, cns_to_str(cns)
            key = (cns['qseqid'], cns['sseqid'])

            # proteins first, then rnas -- same order as before.
            for trees, renamed in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, info in get_new(cns, trees, key, renamed, dist + 1000):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen: continue
                    cns_seen.add(cns_str)
                    print >>real_cns_fh, cns_str
    finally:
        # Make sure the .real file is flushed and closed before the bed
        # files are merged and before returning to the caller.
        real_cns_fh.close()

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    print >> sys.stderr, len(qnew_pairs)
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, the new cnss are in both lists, so we send in
    # the concatenation; print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs
コード例 #5
0
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from
    the cns list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the new-genes
    that _were_ cnss. the proper assignment is then handled in assign.py

    qbed_path, sbed_path -- query and subject bed files; may be identical.
    cnsfile              -- raw cns file parsed by ``CNS.parse_raw_line``;
                            the cnss that stay cnss are written to
                            ``<name>.real<ext>`` beside it.
    dist                 -- ``dist + 1000`` is the distance given to
                            ``get_new``.
    orthology_path       -- forwarded to ``read_orthos_to_trees``.

    Side effects: each distinct bed path gets a ``_cns.gff`` file holding
    only the gff-version header; the ``.real`` cns file is written; new bed
    files are produced by ``print_bed``.

    Returns ``(qbed_new.path, sbed_new.path, qnew_pairs)``.
    """
    def write_gff_header(bed_path):
        # Write the header-only "_cns.gff" file that belongs to a bed file.
        # The handle is closed here instead of being left to the garbage
        # collector.
        gff_path = bed_path.replace(".bed", "_cns.gff")
        assert gff_path != bed_path
        gff = open(gff_path, 'w')
        try:
            print >> gff, "##gff-version 3"
        finally:
            gff.close()

    write_gff_header(qbed_path)
    if sbed_path != qbed_path:
        write_gff_header(sbed_path)

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)

    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path)
    qbed.fill_dict()
    sbed = Bed(sbed_path)
    sbed.fill_dict()

    def assign_new_names(prs, protein_or_rna):
        # Give each former cns a gene-like name of the form
        # "<seqid>_<start>_<end>_cns_<protein_or_rna>" (modifying the dicts
        # in place) and record the strands of its previous gene assignment.
        n = {}
        for seqid_pair, li in prs.iteritems():
            bucket = n.setdefault(seqid_pair, [])
            for gnew, info in li:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # Report which record failed before propagating.
                    print >> sys.stderr, gnew
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                bucket.append((gnew, info))
        return n

    name, ext = op.splitext(cnsfile)
    real_cns_fh = open("%s.real%s" % (name, ext), "w")
    try:
        print >> sys.stderr, "writing to:", real_cns_fh.name
        outdir = op.dirname(cnsfile)
        print >> real_cns_fh, "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval"

        crna = read_cns_to_rna(outdir)
        cpro = read_cns_to_protein_exons(outdir)

        # Partition the cnss: called protein (checked first), called rna,
        # or still a real cns.
        proteins = collections.defaultdict(list)
        rnas = collections.defaultdict(list)
        real_cns_items = []
        for cnsi in CNS.parse_raw_line(cnsfile):
            cns_id = cnsi.cns_id
            cns = cnsi.to_dict()
            key = (cns['qseqid'], cns['sseqid'])
            if cns_id in cpro:
                proteins[key].append((cns, cpro[cns_id]))
            elif cns_id in crna:
                rnas[key].append((cns, crna[cns_id]))
            else:
                real_cns_items.append((cns_id, cns))

        # Trees are filled before assign_new_names rewrites the dicts.
        p_trees = fill_tree(proteins)
        r_trees = fill_tree(rnas)

        nproteins = assign_new_names(proteins, "protein")
        nrnas = assign_new_names(rnas, "rna")

        cns_seen = set()
        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) in within dist.
        for cns_id, cns in real_cns_items:
            # The unmodified line is written unconditionally; duplicates
            # are only filtered among the re-assigned lines.
            print >> real_cns_fh, cns_to_str(cns)
            key = (cns['qseqid'], cns['sseqid'])

            # Same order as the original: proteins, then rnas.
            for trees, renamed in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, info in get_new(cns, trees, key, renamed, dist + 1000):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen: continue
                    cns_seen.add(cns_str)
                    print >> real_cns_fh, cns_str
    finally:
        # Close (and thereby flush) the .real file before merging the bed
        # files and returning.
        real_cns_fh.close()

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    print >> sys.stderr, len(qnew_pairs)
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, both lists carry the new cnss, so we send in
    # the concatenation; print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs