def main(blast_files, out_dir, qorg, sorg):
    """Collect the CNSs whose blast hits indicate they are really exons.

    The raw CNS table is read from ``<out_dir>/<qorg>_<sorg>.cns.txt``.
    Every line of every file in `blast_files` is parsed with `BlastLine`
    and a hit is accepted when either

    * its score is above 50, or
    * its score is at least 45 and ``hitlen * 3`` covers at least 90% of
      the query span (``qend - qstart``) of the matching CNS.

    Hits scoring below 45 are ignored.

    Args:
        blast_files: iterable of paths to blast output files.
        out_dir: directory holding the raw CNS file; also passed on to
            `write_exons`.
        qorg: query organism name, used to build the raw CNS file name.
        sorg: subject organism name, used to build the raw CNS file name.

    Returns:
        A plain dict mapping CNS id -> {subject name: [e-values]}.

    Side effects:
        Calls ``write_exons(exons, out_dir)`` and reports the number of
        CNS ids collected on stderr.
    """
    raw_cns = "%s/%s_%s.cns.txt" % (out_dir, qorg, sorg)
    assert os.path.exists(raw_cns), raw_cns
    cns_by_id = dict(parse_raw_cns(raw_cns))

    exons = collections.defaultdict(dict)
    for blast_file in blast_files:
        # `with` guarantees the handle is released even if a line fails
        # to parse; the original left every blast file open.
        with open(blast_file) as blast_fh:
            for line in blast_fh:
                b = BlastLine(line)
                # chop the q__ and s__
                key = b.query[3:]

                # convert piped rice names to short canonical names.
                # (split() returns the whole name when there is no pipe.)
                subject = b.subject.split("|")[0]
                # chop At2g26540.1 to At2g26540; the slice comparison
                # cannot raise on names shorter than two characters.
                if subject[-2:-1] == ".":
                    subject = subject[:-2]
                subject = subject.replace('LOC_', '')

                strong_hit = b.score > 50
                if not strong_hit:
                    if b.score < 45:
                        continue
                    # borderline score: only keep the hit when it spans
                    # most of the CNS.
                    cns = cns_by_id[key]
                    qlen = cns['qend'] - cns['qstart']
                    # NOTE(review): the factor 3 presumably converts an
                    # amino-acid hit length to nucleotides -- confirm.
                    coverage = (b.hitlen * 3.) / qlen
                    if coverage < 0.90:
                        continue

                exons[key].setdefault(subject, []).append(b.eval)

    exons = dict(exons)
    write_exons(exons, out_dir)
    print >>sys.stderr, "%i total unique cnss are exons" % (len(exons), )
    return exons
def main(cnsfile, qfasta_file, sfasta_file, qorg, sorg, min_len):
    """Print the query and subject sequence of each CNS as FASTA on stdout.

    For every record yielded by ``parse_raw_cns(cnsfile)`` the query
    interval (``qstart``..``qend``) and the sorted subject interval are
    cut out of the respective fasta. Records are skipped when either
    interval is shorter than `min_len`, or when both the query location
    and the subject location have already been printed.

    Each emitted pair is written as ``>q__<id>`` / sequence followed by
    ``>s__<id>`` / sequence. The characters R, W and M are replaced by N,
    and the sequence is upper-cased. An AssertionError is raised when
    anything other than A, C, T, G, N or X remains.

    `qorg` and `sorg` are accepted but not used here.
    """
    query_fasta = Fasta(qfasta_file)
    subject_fasta = Fasta(sfasta_file)
    printed = set()

    for cns_id, cns in parse_raw_cns(cnsfile):
        qseq = query_fasta[str(cns['qseqid'])]
        sseq = subject_fasta[str(cns['sseqid'])]

        # subject coordinates may be given in either order.
        sstart, send = sorted((cns['sstart'], cns['send']))
        qkey = (cns['qseqid'], cns['qstart'], cns['qend'])
        skey = (cns['sseqid'], cns['sstart'], cns['send'])
        assert sstart < send

        if cns['qend'] - cns['qstart'] < min_len:
            continue
        if send - sstart < min_len:
            continue
        if qkey in printed and skey in printed:
            continue

        # (label, sequence, 0-based start, end); sliced lazily inside the
        # loop so the query record is printed before the subject is cut.
        sides = (
            ('q', qseq, cns['qstart'] - 1, cns['qend']),
            ('s', sseq, sstart - 1, send),
        )
        for label, seq, lo, hi in sides:
            header = label + '__' + cns_id
            print(">" + header)
            seqstr = str(seq[lo:hi])
            for ambiguous in ('R', 'W', 'M'):
                seqstr = seqstr.replace(ambiguous, 'N')
            assert set(seqstr.lower()).issubset("actgnx"), (label, header, seqstr)
            print(seqstr.upper())

        printed.add(qkey)
        printed.add(skey)
def main(qbed, sbed, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from the cns
    list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the
    new-genes that _were_ cnss. the proper assignment is then handled in
    assign.py

    Args:
        qbed: query bed object; `qbed.d` is indexed by accession and the
            result by 'strand', and `qbed.path` is read.
        sbed: subject bed object, used the same way as `qbed`.
        cnsfile: path of the raw cns file. The surviving cnss are written
            next to it as ``<name>.real<ext>``, and its directory is
            handed to `read_cns_to_rna` / `read_cns_to_protein_exons`.
        dist: distance used to attach remaining cnss to the new genes;
            ``dist + 1000`` is what is passed to `get_new`.
        orthology_path: passed to `read_orthos_to_trees`.

    Returns:
        ``(qbed_new, sbed_new, qnew_pairs)``: the results of `print_bed`
        for the query and subject (the same object when both beds share a
        path) and the new pairs reported by `merge_bed` for the query.
    """

    def assign_new_names(prs, protein_or_rna):
        """Rename every cns in `prs` to a location-based gene name.

        Each cns dict is updated in place with new 'qaccn'/'saccn' values
        and with the strands of the genes it was previously assigned to.
        Returns a dict with the same keys and (cns, info) pairs as `prs`.
        """
        renamed = {}
        for seqid_pair, pairs in prs.items():
            new_pairs = renamed[seqid_pair] = []
            for gnew, info in pairs:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                try:
                    # Look the strand up through the record being renamed.
                    # The original used `cns`, a leftover loop variable of
                    # the enclosing function, so every new gene received
                    # the strand of the last cns in the file.
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # show the offending record before propagating.
                    print >>sys.stderr, gnew
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                new_pairs.append((gnew, info))
        return renamed

    ortho_trees = read_orthos_to_trees(orthology_path, qbed, sbed)

    name, ext = op.splitext(cnsfile)
    outdir = op.dirname(cnsfile)

    # The context manager closes (and flushes) the output before the bed
    # files are merged; the original never closed the handle.
    with open("%s.real%s" % (name, ext), "w") as real_cns_fh:
        print >>sys.stderr, "writing to:", real_cns_fh.name
        print >>real_cns_fh, "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval"

        crna = read_cns_to_rna(outdir)
        cpro = read_cns_to_protein_exons(outdir)

        # split the cnss into protein hits, rna hits and real cnss;
        # a protein call takes precedence over an rna call.
        proteins = collections.defaultdict(list)
        rnas = collections.defaultdict(list)
        real_cns_items = []
        for cns_id, cns in parse_raw_cns(cnsfile):
            key = (cns['qseqid'], cns['sseqid'])
            if cns_id in cpro:
                proteins[key].append((cns, cpro[cns_id]))
            elif cns_id in crna:
                rnas[key].append((cns, crna[cns_id]))
            else:
                real_cns_items.append((cns_id, cns))

        p_trees = fill_tree(proteins)
        r_trees = fill_tree(rnas)

        nproteins = assign_new_names(proteins, "protein")
        nrnas = assign_new_names(rnas, "rna")

        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) within dist.
        search_dist = dist + 1000
        cns_seen = set()
        for cns_id, cns in real_cns_items:
            print >>real_cns_fh, cns_to_str(cns)
            key = (cns['qseqid'], cns['sseqid'])
            # proteins first, then rnas -- same order as before.
            for trees, new_genes in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, info in get_new(cns, trees, key, new_genes, search_dist):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen:
                        continue
                    cns_seen.add(cns_str)
                    print >>real_cns_fh, cns_str

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, we add the new cnss again to the same we send
    # in both lists. print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new, sbed_new, qnew_pairs