def main(cnsfile, qfasta_file, sfasta_file, qorg, sorg, min_len): """empty docstring""" lens = [] qfasta = Fasta(qfasta_file) sfasta = Fasta(sfasta_file) seen = {} lens_append = lens.append qseq, sseq = None, None # so we only read a new fasta file as needed. last_qchr, last_schr = None, None seen = {} for cns in CNS.parse_raw_line(cnsfile): #if is_intron(cns,qbed): continue qseq = qfasta[str(cns.qseqid)] sseq = sfasta[str(cns.sseqid)] sstart, send = sorted((cns.sstart, cns.sstop)) qkey = (cns.qseqid, cns.qstart, cns.qstop) skey = (cns.sseqid, cns.sstart, cns.sstop) assert sstart < send if cns.qstop - cns.qstart < min_len: continue if send - sstart < min_len: continue if not (qkey in seen and skey in seen): print ">q__" + cns.cns_id seqstr = str(qseq[cns.qstart - 1:cns.qstop]).replace( 'R', 'N').replace('W', 'N').replace('M', 'N') assert set(seqstr.lower()).issubset("actgnx"), ('q', 'q__' + cns.cns_id, seqstr) print seqstr.upper() print ">s__" + cns.cns_id seqstr = str(sseq[sstart - 1:send]).replace('R', 'N').replace( 'W', 'N').replace('M', 'N') assert set(seqstr.lower()).issubset("actgnx"), ('s', 's__' + cns.cns_id, seqstr) print seqstr.upper() seen[qkey] = 1 seen[skey] = 1
def main(blast_files, out_dir, raw_cns):
    """Collect, per CNS, the blast subjects that flag the CNS as an exon.

    blast_files: iterable of paths; every line of every file is parsed with
        ``BlastLine``.
    out_dir: forwarded to ``write_exons`` together with the result.
    raw_cns: handed to ``CNS.parse_raw_line`` to build a cns_id -> CNS map.

    A hit is recorded when its score is above 50, or when its score is at
    least 45 and ``hitlen * 3 / (qstop - qstart)`` of the matching CNS is at
    least 0.90.

    Returns a plain dict ``{cns_id: {subject: [eval, ...]}}``; the number of
    keys is also reported on stderr.
    """
    cns_by_id = dict(
        (cns.cns_id, cns) for cns in CNS.parse_raw_line(raw_cns))

    exons = collections.defaultdict(dict)
    for blast_file in blast_files:
        # ``with`` closes each blast file as soon as it has been read.
        with open(blast_file) as handle:
            for line in handle:
                b = BlastLine(line)
                # chop the q__ and s__
                key = b.query[3:]

                # convert piped rice names to short canonical names.
                subject = b.subject.split("|")[0]
                # chop At2g26540.1 to At2g26540; the slice (instead of
                # subject[-2]) keeps names shorter than two characters
                # from raising IndexError.
                if subject[-2:-1] == ".":
                    subject = subject[:-2]
                subject = subject.replace('LOC_', '')

                strong_hit = b.score > 50
                if not strong_hit:
                    if b.score < 45:
                        continue
                    cns = cns_by_id[key]
                    qlen = cns.qstop - cns.qstart
                    # NOTE(review): the factor 3 presumably converts an
                    # amino-acid hit length to nucleotides -- confirm.
                    coverage = (b.hitlen * 3.) / qlen
                    if coverage < 0.90:
                        continue

                exons[key].setdefault(subject, []).append(b.eval)

    exons = dict(exons)
    write_exons(exons, out_dir)
    print >>sys.stderr, "%i total unique cnss are exons" % (len(exons), )
    return exons
def main(cnsfile, qfasta_file, sfasta_file, qorg, sorg, min_len): """empty docstring""" lens = [] qfasta = Fasta(qfasta_file) sfasta = Fasta(sfasta_file) seen = {} lens_append = lens.append qseq, sseq = None, None # so we only read a new fasta file as needed. last_qchr, last_schr = None, None seen = {} for cns in CNS.parse_raw_line(cnsfile): #if is_intron(cns,qbed): continue qseq = qfasta[str(cns.qseqid)] sseq = sfasta[str(cns.sseqid)] sstart, send = sorted((cns.sstart, cns.sstop)) qkey = (cns.qseqid, cns.qstart, cns.qstop) skey = (cns.sseqid, cns.sstart, cns.sstop) assert sstart < send if cns.qstop - cns.qstart < min_len: continue if send - sstart < min_len: continue if not (qkey in seen and skey in seen): print ">q__" + cns.cns_id seqstr = str(qseq[cns.qstart - 1: cns.qstop]).replace('R', 'N').replace('W', 'N').replace('M', 'N') assert set(seqstr.lower()).issubset("actgnx"), ('q', 'q__' + cns.cns_id, seqstr) print seqstr.upper() print ">s__" + cns.cns_id seqstr = str(sseq[sstart - 1: send]).replace('R', 'N').replace('W', 'N').replace('M', 'N') assert set(seqstr.lower()).issubset("actgnx"), ('s', 's__' + cns.cns_id, seqstr) print seqstr.upper() seen[qkey] = 1 seen[skey] = 1
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from the cns
    list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the
    new-genes that _were_ cnss. the proper assignment is then handled in
    assign.py

    qbed_path, sbed_path: bed file paths; ".bed" is replaced by "_cns.gff"
        to name the gff files that get a version header written.
    cnsfile: raw cns file; the remaining cnss are written next to it as
        "<name>.real<ext>". Its directory is handed to ``read_cns_to_rna``
        and ``read_cns_to_protein_exons``.
    dist: ``dist + 1000`` is the distance passed to ``get_new``.
    orthology_path: handed to ``read_orthos_to_trees``.

    Returns ``(qbed_new.path, sbed_new.path, qnew_pairs)``.
    """
    # The gff files only receive their header here, so each handle is
    # closed again right away instead of being left open.
    qcns_file = qbed_path.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_path
    with open(qcns_file, 'w') as qcns_gff:
        print >>qcns_gff, "##gff-version 3"
    if sbed_path != qbed_path:
        scns_file = sbed_path.replace(".bed", "_cns.gff")
        assert scns_file != sbed_path
        with open(scns_file, 'w') as scns_gff:
            print >>scns_gff, "##gff-version 3"

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)
    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path)
    qbed.fill_dict()
    sbed = Bed(sbed_path)
    sbed.fill_dict()

    def assign_new_names(prs, protein_or_rna):
        """Rename, in place, every cns dict in prs to a new gene name."""
        n = {}
        for seqid_pair, li in prs.iteritems():
            renamed = n[seqid_pair] = []
            for gnew, info in li:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                # The strands must be looked up under the OLD accessions,
                # i.e. before qaccn/saccn are overwritten below.
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # Show the offending record, then let the error through.
                    print >>sys.stderr, gnew
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                renamed.append((gnew, info))
        return n

    name, ext = op.splitext(cnsfile)
    # ``with`` guarantees the .real file is flushed and closed before the
    # bed files are merged, also when an error occurs half-way.
    with open("%s.real%s" % (name, ext), "w") as real_cns_fh:
        print >>sys.stderr, "writing to:", real_cns_fh.name
        outdir = op.dirname(cnsfile)
        print >>real_cns_fh, \
            "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval"

        crna = read_cns_to_rna(outdir)
        cpro = read_cns_to_protein_exons(outdir)

        proteins = collections.defaultdict(list)
        rnas = collections.defaultdict(list)
        real_cns_items = []
        for cnsi in CNS.parse_raw_line(cnsfile):
            cns_id = cnsi.cns_id
            cns = cnsi.to_dict()
            key = (cns['qseqid'], cns['sseqid'])
            if cns_id in cpro:
                proteins[key].append((cns, cpro[cns_id]))
            elif cns_id in crna:
                rnas[key].append((cns, crna[cns_id]))
            else:
                real_cns_items.append((cns_id, cns))

        # The trees are built before the renaming, as in the original flow.
        p_trees = fill_tree(proteins)
        r_trees = fill_tree(rnas)

        nproteins = assign_new_names(proteins, "protein")
        nrnas = assign_new_names(rnas, "rna")

        cns_seen = set()
        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) in within dist.
        for _cns_id, cns in real_cns_items:
            print >>real_cns_fh, cns_to_str(cns)
            key = (cns['qseqid'], cns['sseqid'])
            # proteins first, then rnas; cns is re-labelled for each new
            # gene found and every distinct line is written once.
            for trees, renamed in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, info in get_new(cns, trees, key, renamed,
                                          dist + 1000):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen:
                        continue
                    cns_seen.add(cns_str)
                    print >>real_cns_fh, cns_str

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    print >>sys.stderr, len(qnew_pairs)
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, we add the new cnss again to the same we send in
    # both lists. print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from the cns
    list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the
    new-genes that _were_ cnss. the proper assignment is then handled in
    assign.py

    qbed_path, sbed_path: bed file paths; ".bed" is replaced by "_cns.gff"
        to name the gff files that get a version header written.
    cnsfile: raw cns file; the remaining cnss are written next to it as
        "<name>.real<ext>". Its directory is handed to ``read_cns_to_rna``
        and ``read_cns_to_protein_exons``.
    dist: ``dist + 1000`` is the distance passed to ``get_new``.
    orthology_path: handed to ``read_orthos_to_trees``.

    Returns ``(qbed_new.path, sbed_new.path, qnew_pairs)``.
    """
    # The gff files only receive their header here, so each handle is
    # closed again right away instead of being left open.
    qcns_file = qbed_path.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_path
    with open(qcns_file, 'w') as qcns_gff:
        print >> qcns_gff, "##gff-version 3"
    if sbed_path != qbed_path:
        scns_file = sbed_path.replace(".bed", "_cns.gff")
        assert scns_file != sbed_path
        with open(scns_file, 'w') as scns_gff:
            print >> scns_gff, "##gff-version 3"

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)
    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path)
    qbed.fill_dict()
    sbed = Bed(sbed_path)
    sbed.fill_dict()

    def assign_new_names(prs, protein_or_rna):
        """Rename, in place, every cns dict in prs to a new gene name."""
        n = {}
        for seqid_pair, li in prs.iteritems():
            renamed = n[seqid_pair] = []
            for gnew, info in li:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                # The strands must be looked up under the OLD accessions,
                # i.e. before qaccn/saccn are overwritten below.
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # Show the offending record, then let the error through.
                    print >> sys.stderr, gnew
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                renamed.append((gnew, info))
        return n

    name, ext = op.splitext(cnsfile)
    # ``with`` guarantees the .real file is flushed and closed before the
    # bed files are merged, also when an error occurs half-way.
    with open("%s.real%s" % (name, ext), "w") as real_cns_fh:
        print >> sys.stderr, "writing to:", real_cns_fh.name
        outdir = op.dirname(cnsfile)
        print >> real_cns_fh, \
            "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval"

        crna = read_cns_to_rna(outdir)
        cpro = read_cns_to_protein_exons(outdir)

        proteins = collections.defaultdict(list)
        rnas = collections.defaultdict(list)
        real_cns_items = []
        for cnsi in CNS.parse_raw_line(cnsfile):
            cns_id = cnsi.cns_id
            cns = cnsi.to_dict()
            key = (cns['qseqid'], cns['sseqid'])
            if cns_id in cpro:
                proteins[key].append((cns, cpro[cns_id]))
            elif cns_id in crna:
                rnas[key].append((cns, crna[cns_id]))
            else:
                real_cns_items.append((cns_id, cns))

        # The trees are built before the renaming, as in the original flow.
        p_trees = fill_tree(proteins)
        r_trees = fill_tree(rnas)

        nproteins = assign_new_names(proteins, "protein")
        nrnas = assign_new_names(rnas, "rna")

        cns_seen = set()
        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) in within dist.
        for _cns_id, cns in real_cns_items:
            print >> real_cns_fh, cns_to_str(cns)
            key = (cns['qseqid'], cns['sseqid'])
            # proteins first, then rnas; cns is re-labelled for each new
            # gene found and every distinct line is written once.
            for trees, renamed in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, info in get_new(cns, trees, key, renamed,
                                          dist + 1000):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen:
                        continue
                    cns_seen.add(cns_str)
                    print >> real_cns_fh, cns_str

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    print >> sys.stderr, len(qnew_pairs)
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, we add the new cnss again to the same we send in
    # both lists. print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs