def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt, qdsid, sdsid,
         qpad, spad):
    """Assign CNSs to gene pairs and report them as CSV (stdout) and GFF.

    A "#"-prefixed header line and then one comma-separated row per
    (cns, qfeat, sfeat) triple yielded by ``assign`` are written to stdout.
    Every row dict is also handed to ``write_gff`` together with the query
    and subject GFF handles; each GFF file starts with "##gff-version 3".

    Args:
        cnsfile: passed to ``get_cns_dict``, which here returns the CNS dict
            and an evalue dict.
        qbed_file: query bed path.  ".nolocaldups" in it is replaced by
            "_cns.gff" to name the query GFF output.
        sbed_file: subject bed path, treated like ``qbed_file``.  When it
            equals ``qbed_file`` one GFF handle is shared by both sides.
        pairsfile, pairs_fmt: forwarded to ``make_pair_maps``.
        qdsid, sdsid, qpad, spad: forwarded unchanged to ``cns_link``.

    Raises:
        AssertionError: if a derived GFF path equals its bed path (the bed
            file would otherwise be truncated by opening it for writing).
    """
    # Validate both output paths before creating any file.  Raised explicitly
    # (instead of ``assert``) so the guard survives ``python -O``.
    qcns_file = qbed_file.replace(".nolocaldups", "_cns.gff")
    if qcns_file == qbed_file:
        raise AssertionError(
            "refusing to overwrite query bed %r" % (qbed_file,))
    same_bed = sbed_file == qbed_file
    if not same_bed:
        scns_file = sbed_file.replace(".nolocaldups", "_cns.gff")
        if scns_file == sbed_file:
            raise AssertionError(
                "refusing to overwrite subject bed %r" % (sbed_file,))

    qcns_gff = open(qcns_file, 'w')
    scns_gff = qcns_gff
    try:
        qcns_gff.write("##gff-version 3\n")
        if not same_bed:
            scns_gff = open(scns_file, 'w')
            scns_gff.write("##gff-version 3\n")

        qbed = Bed(qbed_file)
        qbed.fill_dict()
        sbed = Bed(sbed_file)
        sbed.fill_dict()

        cnsdict, evaldict = get_cns_dict(cnsfile)
        qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
        out = sys.stdout

        fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
              "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(eval)s,%(link)s"

        # The header is the format string with the %(...)s / %(...)i
        # wrappers stripped, leaving the bare column names.
        header = fmt.replace("%(", "").replace(")s", "").replace(")i", "")
        out.write("#" + header + "\n")

        for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed, qpair_map, spair_map):
            d = cns_fmt_dict(cns, qfeat, sfeat, evaldict)
            d['cns_id'] = cns_id(d)
            # Report subject coordinates as start <= stop.
            if d['sstop'] < d['sstart']:
                d['sstop'], d['sstart'] = d['sstart'], d['sstop']
            d['link'] = cns_link(d, qdsid, sdsid, qpad, spad)
            out.write(fmt % d + "\n")
            write_gff(d, qcns_gff, scns_gff)
    finally:
        # scns_gff is the very same object as qcns_gff when both beds are
        # one file; close it only once.
        if scns_gff is not qcns_gff:
            scns_gff.close()
        qcns_gff.close()
class TestMaize(unittest.TestCase):
    """Exercise ``parse_blast`` on one rice/maize gene pair.

    All fixtures are read from hard-coded absolute paths, so these tests only
    run on a machine that has those files.
    """

    def setUp(self):
        # Read the blast fixture with a context manager so the handle is
        # closed even if reading fails.
        with open('/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt') as handle:
            blast_lines = handle.readlines()
        self.blast_str = ' , '.join(blast_lines)
        self.unmasked_fasta = Fasta('/Users/gturco/find_cns/maize_v2_UM.fasta')

        self.qbed = Bed('/Users/gturco/rice_maize/rice_v6.bed')
        self.qbed.fill_dict()
        self.sbed = Bed('/Users/gturco/maize/maize_v2.bed',
                        '/Users/gturco/maize/maize_v2.fasta')
        self.sbed.fill_dict()

        self.sfeat = self.sbed.accn('GRMZM2G086714')
        self.qfeat = self.qbed.accn('Os09g27050')

    def test_get_cmd(self):
        # Placeholder: only names the fasta paths, nothing is asserted yet.
        sfasta = 'data/rice_v6_maize_v2/maize_v2_split/2.fasta'
        qfasta = 'data/rice_v6_maize_v2/rice_v6_split/4.fasta'

    def test_parse_balse(self):
        """Run parse_blast on the fixture and print the result (no asserts)."""
        orientation = -1
        cns = parse_blast(self.blast_str, orientation, self.qfeat, self.sfeat,
                          self.qbed, self.sbed, 12000, 26000,
                          self.unmasked_fasta)
        print(cns)
class TestAssign(unittest.TestCase):
    """Smoke tests for the CNS assignment helpers on rice/sorghum data."""

    def setUp(self):
        self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
        self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"

        self.qbed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
        self.sbed.fill_dict()

        parsed = get_cns_dict(self.cns_filename)
        self.cns_dict, self.evalue_dict = parsed
        pair_maps = make_pair_maps(self.pairsfile, "pair", self.qbed, self.sbed)
        self.qpair_map, self.spair_map = pair_maps

    def _assignments(self):
        # Shared call used by the assign-based tests below.
        return assign(self.cns_dict, self.qbed, self.sbed,
                      self.qpair_map, self.spair_map)

    def test_get_cns_dict(self):
        """Print the keys of the evalue dict returned by get_cns_dict."""
        evalue_keys = self.evalue_dict.keys()
        print("keys! %s" % (evalue_keys,))

    def test_assign(self):
        # NOTE(review): the return value is discarded and never iterated;
        # if assign is a generator, none of its body runs here -- confirm.
        self._assignments()

    def test_cns_fmt_dict(self):
        for cns, qfeat, sfeat in self._assignments():
            row = cns_fmt_dict(cns, qfeat, sfeat, self.evalue_dict)
            print("dddddddd %s" % (row,))

    def test_main(self):
        pass
def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt):
    """Assign CNSs to gene pairs and report them as CSV (stdout) and GFF.

    A "#"-prefixed header line and then one comma-separated row per
    (cns, qfeat, sfeat) triple yielded by ``assign`` are written to stdout.
    Every row dict is also handed to ``write_gff`` together with the query
    and subject GFF handles; each GFF file starts with "##gff-version 3".

    Args:
        cnsfile: passed to ``get_cns_dict``.
        qbed_file: query bed path.  ".bed" in it is replaced by "_cns.gff"
            to name the query GFF output.
        sbed_file: subject bed path, treated like ``qbed_file``.  When it
            equals ``qbed_file`` one GFF handle is shared by both sides.
        pairsfile, pairs_fmt: forwarded to ``make_pair_maps``.

    Raises:
        AssertionError: if a derived GFF path equals its bed path (the bed
            file would otherwise be truncated by opening it for writing).
    """
    # Validate both output paths before creating any file.  Raised explicitly
    # (instead of ``assert``) so the guard survives ``python -O``.
    qcns_file = qbed_file.replace(".bed", "_cns.gff")
    if qcns_file == qbed_file:
        raise AssertionError(
            "refusing to overwrite query bed %r" % (qbed_file,))
    same_bed = sbed_file == qbed_file
    if not same_bed:
        scns_file = sbed_file.replace(".bed", "_cns.gff")
        if scns_file == sbed_file:
            raise AssertionError(
                "refusing to overwrite subject bed %r" % (sbed_file,))

    qcns_gff = open(qcns_file, 'w')
    scns_gff = qcns_gff
    try:
        qcns_gff.write("##gff-version 3\n")
        if not same_bed:
            scns_gff = open(scns_file, 'w')
            scns_gff.write("##gff-version 3\n")

        qbed = Bed(qbed_file)
        qbed.fill_dict()
        sbed = Bed(sbed_file)
        sbed.fill_dict()

        cnsdict = get_cns_dict(cnsfile)
        qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
        out = sys.stdout

        fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
              "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s"

        # The header is the format string with the %(...)s / %(...)i
        # wrappers stripped, leaving the bare column names.
        header = fmt.replace("%(", "").replace(")s", "").replace(")i", "")
        out.write("#" + header + "\n")

        for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed, qpair_map, spair_map):
            d = cns_fmt_dict(cns, qfeat, sfeat)
            d['cns_id'] = cns_id(d)
            # Report subject coordinates as start <= stop.
            if d['sstop'] < d['sstart']:
                d['sstop'], d['sstart'] = d['sstart'], d['sstop']
            out.write(fmt % d + "\n")
            write_gff(d, qcns_gff, scns_gff)
    finally:
        # scns_gff is the very same object as qcns_gff when both beds are
        # one file; close it only once.
        if scns_gff is not qcns_gff:
            scns_gff.close()
        qcns_gff.close()
class TestAssign(unittest.TestCase):
    """Smoke tests for the CNS assignment helpers on rice/sorghum data."""

    def setUp(self):
        self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
        self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"

        self.qbed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
        self.sbed.fill_dict()

        parsed = get_cns_dict(self.cns_filename)
        self.cns_dict, self.evalue_dict = parsed
        pair_maps = make_pair_maps(self.pairsfile, "pair", self.qbed, self.sbed)
        self.qpair_map, self.spair_map = pair_maps

    def _assignments(self):
        # Shared call used by the assign-based tests below.
        return assign(self.cns_dict, self.qbed, self.sbed,
                      self.qpair_map, self.spair_map)

    def test_get_cns_dict(self):
        """Print the keys of the evalue dict returned by get_cns_dict."""
        evalue_keys = self.evalue_dict.keys()
        print("keys! %s" % (evalue_keys,))

    def test_assign(self):
        # NOTE(review): the return value is discarded and never iterated;
        # if assign is a generator, none of its body runs here -- confirm.
        self._assignments()

    def test_cns_fmt_dict(self):
        for cns, qfeat, sfeat in self._assignments():
            row = cns_fmt_dict(cns, qfeat, sfeat, self.evalue_dict)
            print("dddddddd %s" % (row,))

    def test_main(self):
        pass
class TestPseudo(unittest.TestCase):
    """Tests for the pseudogene helpers on rice_v6 / setaria64 fixtures.

    Fixture paths ("data/rice_v6_setaria64/...", "blast_res", "blast_2") are
    relative, so the tests depend on the current working directory.
    """

    def setUp(self):
        self.qallbed = Bed("data/rice_v6_setaria64/rice_v6.all.bed",
                           "data/rice_v6_setaria64/rice_v6.fasta")
        self.qallbed.fill_dict()
        self.sallbed = Bed("data/rice_v6_setaria64/setaria64.all.bed",
                           "data/rice_v6_setaria64/setaria64.fasta")
        self.sallbed.fill_dict()
        self.saccn = self.sallbed.accn("Si000834m")
        # Context manager so the fixture handle is not leaked per test.
        with open("blast_res") as blastfh:
            self.blast = blastfh.read()
        self.d, self.pseudo = group_cds(self.blast, self.saccn)

    def test_group_cds_1(self):
        """The blast_res fixture groups into 4 keys holding 38 hits total."""
        self.assertEqual(len(self.d.keys()), 4)
        total_values = [len(self.d[key]) for key in self.d.keys()]
        self.assertEqual(sum(total_values), 38)

    def test_group_cds_2(self):
        """The blast_2 fixture groups into 5 keys with one hit each."""
        with open("blast_2") as blast_2fh:
            blast_2 = blast_2fh.read()
        d, pseudo = group_cds(blast_2, self.sallbed.accn("Si002524m"))
        self.assertEqual(len(d.keys()), 5)
        for key in d.keys():
            self.assertEqual(1, len(d[key]))

    def test_append_to_included_groups(self):
        locs = [1, 2, 3, 4]
        group_dict = {(2, 5): [], (3, 6): [], (9, 8): []}
        result_dict = append_to_included_groups(locs, group_dict)
        expected = {(2, 5): [(1, 2, 3, 4)], (3, 6): [(1, 2, 3, 4)], (9, 8): []}
        # assertEqual: the assertEquals alias is deprecated.
        self.assertEqual(expected, result_dict)

    def test_remove_crossing_hit(self):
        """Smoke test only: runs the helpers per group, asserts nothing."""
        qaccn = self.qallbed.accn("Os01g01890")
        for group_key in self.d.keys():
            exon_hits = self.d[group_key]
            non_crossing = remove_crossing_hits(exon_hits, qaccn, self.saccn)
            if len(non_crossing) > 1:
                mid, start, stop = bites(non_crossing)

    def test_find_orf(self):
        qaccn = self.qallbed.accn("Os01g01295")
        orf = find_orf(self.qallbed, qaccn)
        self.assertEqual(orf + 1, 141084)

    def test_find_orf_neg(self):
        saccn = self.sallbed.accn("Si001539m")
        orf = find_orf(self.sallbed, saccn)
        self.assertEqual(orf, 7662777)
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    """Write one CSV row (with a link column) per assigned CNS to stdout.

    Both bed files are loaded, the CNS file is parsed with ``get_cns_dict``
    and every (cns, qfeat, sfeat) triple yielded by ``assign`` is formatted
    with ``cns_fmt_dict``.  The "link" column comes from ``assign_url``,
    which receives the CNS coordinates plus ``sorg``, ``qorg`` and
    ``padding``.
    """
    qbed = Bed(qbed_file)
    qbed.fill_dict()
    sbed = Bed(sbed_file)
    sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)

    out = sys.stdout
    fmt = ("%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s,"
           "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s")

    # Header = format string with the %(...)s / %(...)i wrappers removed.
    column_names = fmt.replace("%(", "").replace(")s", "").replace(")i", "")
    out.write("#" + column_names + "\n")

    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed):
        row = cns_fmt_dict(cns, qfeat, sfeat)
        row['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,
                                 sorg, qorg, padding)
        out.write(fmt % row + "\n")
def main(cnsfile, qbed_file, sbed_file, pairsfile, pck, qorg, sorg, padding):
    """Write one CSV row (with a link column) per assigned CNS to stdout.

    The pairs file is always read in 'pair' format.  ``assign`` is expected
    to yield (cns, saccn, saccn_l, saccn_r, qfeat) tuples; each is formatted
    with ``cns_fmt_dict`` and given a "link" value from ``assign_url``.
    """
    qbed = Bed(qbed_file)
    qbed.fill_dict()
    sbed = Bed(sbed_file)
    sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    qpair_map = make_pair_maps(pairsfile, 'pair', qbed, sbed)

    out = sys.stdout
    fmt = ("%(saccn)s,%(saccnL)s,%(saccnR)s,%(schr)s,%(sstart)i,%(sstop)i,"
           "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(link)s")

    # Header = format string with the %(...)s / %(...)i wrappers removed.
    column_names = fmt.replace("%(", "").replace(")s", "").replace(")i", "")
    out.write("#" + column_names + "\n")

    for assignment in assign(cnsdict, qbed, qpair_map):
        cns, saccn, saccn_l, saccn_r, qfeat = assignment
        row = cns_fmt_dict(cns, qfeat, saccn, saccn_l, saccn_r)
        row['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,
                                 qfeat, pck, sbed, qbed, sorg, qorg, padding)
        out.write(fmt % row + "\n")
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    """Write one CSV row (with a link column) per assigned CNS to stdout.

    Like the plain link report, but the row's subject start/stop are swapped
    when stop < start.  The link itself is still built from the coordinates
    on the ``cns`` object, not from the (possibly swapped) row values.
    """
    qbed = Bed(qbed_file)
    qbed.fill_dict()
    sbed = Bed(sbed_file)
    sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)

    out = sys.stdout
    fmt = ("%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s,"
           "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s")

    # Header = format string with the %(...)s / %(...)i wrappers removed.
    column_names = fmt.replace("%(", "").replace(")s", "").replace(")i", "")
    out.write("#" + column_names + "\n")

    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed):
        row = cns_fmt_dict(cns, qfeat, sfeat)
        if row['sstop'] < row['sstart']:
            lo, hi = row['sstop'], row['sstart']
            row['sstart'] = lo
            row['sstop'] = hi
        row['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,
                                 sorg, qorg, padding)
        out.write(fmt % row + "\n")
class LocalDups(object):
    """Read a local-duplicates file and classify its genes against a bed.

    ``filename`` is a path (it is passed to ``open``); ``bed`` is whatever
    ``Bed`` accepts, and the resulting Bed is filled on construction.
    Each line of the file is parsed with ``DupLine``.
    """

    def __init__(self, filename, bed):
        self.filename = filename
        self.bed = Bed(bed)
        self.bed.fill_dict()

    def get_order_dups(self):
        """Map accn -> role, using the order given by ``DupLine.get_order``.

        The first entry of each ordered dup list is marked "P", every later
        entry maps to that first entry's accn, and intervening genes not
        already present are marked "I".  Returns the dict.
        """
        d = {}
        # The original called self.filename.close() on the path string,
        # which always raised AttributeError; the handle is closed here.
        with open(self.filename) as dup_fh:
            for line in dup_fh:
                dupline = DupLine(line)
                dups = dupline.get_order(self.bed)
                parent_accn = dups[0]['accn']
                d[parent_accn] = "P"
                for dup in dups[1:]:
                    d[dup['accn']] = parent_accn
                for i in dupline.get_interving_genes(self.bed):
                    # Never overwrite a gene that already has a role.
                    d.setdefault(i, "I")
        return d

    def write_ordered(self, out_fh):
        """write localdups to outfile

        ``out_fh`` is the output *path* (it is opened here for writing).
        One tab-joined line is written per input line.
        """
        # NOTE(review): get_order_dups indexes these entries with ['accn'];
        # if they are not plain strings "\t".join(dups) will fail -- confirm.
        with open(out_fh, "w") as localdup_fh:
            with open(self.filename) as dup_fh:
                for line in dup_fh:
                    dupline = DupLine(line)
                    dups = dupline.get_order(self.bed)
                    localdup_fh.write("{0}\n".format("\t".join(dups)))

    def get_dups(self):
        """Map accn -> role using each line's parent/children attributes.

        The parent is marked 'P', each child maps to the parent, and
        intervening genes not already present are marked "I".
        """
        d = {}
        with open(self.filename) as dup_fh:
            for line in dup_fh:
                dupline = DupLine(line)
                d[dupline.parent] = 'P'
                for dup in dupline.children:
                    d[dup] = dupline.parent
                for i in dupline.get_interving_genes(self.bed):
                    d.setdefault(i, "I")
        return d
class LocalDups(object):
    """Read a local-duplicates file and classify its genes against a bed.

    ``filename`` is a path (it is passed to ``open``); ``bed`` is whatever
    ``Bed`` accepts, and the resulting Bed is filled on construction.
    Each line of the file is parsed with ``DupLine``.
    """

    def __init__(self, filename, bed):
        self.filename = filename
        self.bed = Bed(bed)
        self.bed.fill_dict()

    def get_order_dups(self):
        """Map accn -> role, using the order given by ``DupLine.get_order``.

        The first entry of each ordered dup list is marked "P", every later
        entry maps to that first entry's accn, and intervening genes not
        already present are marked "I".  Returns the dict.
        """
        d = {}
        # The original called self.filename.close() on the path string,
        # which always raised AttributeError; the handle is closed here.
        with open(self.filename) as dup_fh:
            for line in dup_fh:
                dupline = DupLine(line)
                dups = dupline.get_order(self.bed)
                parent_accn = dups[0]['accn']
                d[parent_accn] = "P"
                for dup in dups[1:]:
                    d[dup['accn']] = parent_accn
                for i in dupline.get_interving_genes(self.bed):
                    # Never overwrite a gene that already has a role.
                    d.setdefault(i, "I")
        return d

    def write_ordered(self, out_fh):
        """write localdups to outfile

        ``out_fh`` is the output *path* (it is opened here for writing).
        One tab-joined line is written per input line.
        """
        # NOTE(review): get_order_dups indexes these entries with ['accn'];
        # if they are not plain strings "\t".join(dups) will fail -- confirm.
        with open(out_fh, "w") as localdup_fh:
            with open(self.filename) as dup_fh:
                for line in dup_fh:
                    dupline = DupLine(line)
                    dups = dupline.get_order(self.bed)
                    localdup_fh.write("{0}\n".format("\t".join(dups)))

    def get_dups(self):
        """Map accn -> role using each line's parent/children attributes.

        The parent is marked 'P', each child maps to the parent, and
        intervening genes not already present are marked "I".
        """
        d = {}
        with open(self.filename) as dup_fh:
            for line in dup_fh:
                dupline = DupLine(line)
                d[dupline.parent] = 'P'
                for dup in dupline.children:
                    d[dup] = dupline.parent
                for i in dupline.get_interving_genes(self.bed):
                    d.setdefault(i, "I")
        return d
class TestMaize(unittest.TestCase):
    """Exercise ``parse_blast`` on one rice/maize gene pair.

    All fixtures are read from hard-coded absolute paths, so these tests only
    run on a machine that has those files.
    """

    def setUp(self):
        # Read the blast fixture with a context manager so the handle is
        # closed even if reading fails.
        with open("/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt") as handle:
            blast_lines = handle.readlines()
        self.blast_str = " , ".join(blast_lines)
        self.unmasked_fasta = Fasta("/Users/gturco/find_cns/maize_v2_UM.fasta")

        self.qbed = Bed("/Users/gturco/rice_maize/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("/Users/gturco/maize/maize_v2.bed",
                        "/Users/gturco/maize/maize_v2.fasta")
        self.sbed.fill_dict()

        self.sfeat = self.sbed.accn("GRMZM2G086714")
        self.qfeat = self.qbed.accn("Os09g27050")

    def test_get_cmd(self):
        # Placeholder: only names the fasta paths, nothing is asserted yet.
        sfasta = "data/rice_v6_maize_v2/maize_v2_split/2.fasta"
        qfasta = "data/rice_v6_maize_v2/rice_v6_split/4.fasta"

    def test_parse_balse(self):
        """Run parse_blast on the fixture and print the result (no asserts)."""
        orientation = -1
        cns = parse_blast(self.blast_str, orientation, self.qfeat, self.sfeat,
                          self.qbed, self.sbed, 12000, 26000,
                          self.unmasked_fasta)
        print(cns)
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from the cns
    list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the
    new-genes that _were_ cnss. the proper assignment is then handled in
    assign.py

    Side effects: creates "<bed>_cns.gff" next to each bed path (header line
    only) and writes the remaining CNSs to "<cnsfile stem>.real<ext>".

    Returns (qbed_new.path, sbed_new.path, qnew_pairs), where the new beds
    come from ``print_bed`` and the pairs from ``merge_bed``.

    Raises AssertionError if a bed path does not contain ".bed" (its GFF
    name would equal the bed path and opening it would truncate the bed).
    """
    # Raised explicitly (not ``assert``) so the guard survives ``python -O``.
    qcns_file = qbed_path.replace(".bed", "_cns.gff")
    if qcns_file == qbed_path:
        raise AssertionError(
            "refusing to overwrite query bed %r" % (qbed_path,))
    # The GFF handles are not used after the header is written, so they are
    # closed immediately instead of being left open.
    with open(qcns_file, 'w') as qcns_gff:
        qcns_gff.write("##gff-version 3\n")
    if sbed_path != qbed_path:
        scns_file = sbed_path.replace(".bed", "_cns.gff")
        if scns_file == sbed_path:
            raise AssertionError(
                "refusing to overwrite subject bed %r" % (sbed_path,))
        with open(scns_file, 'w') as scns_gff:
            scns_gff.write("##gff-version 3\n")

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)
    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path)
    qbed.fill_dict()
    sbed = Bed(sbed_path)
    sbed.fill_dict()

    def assign_new_names(prs, protein_or_rna):
        """Rename each (cns, info) entry of ``prs`` in place to a *_cns name.

        The strand of the gene the CNS was attached to is copied onto the
        renamed entry.  Returns {seqid_pair: [(cns, info), ...]}.
        """
        n = {}
        for seqid_pair, li in prs.items():
            renamed = n.setdefault(seqid_pair, [])
            for gnew, info in li[:]:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # Show the offending record, then let the error propagate.
                    sys.stderr.write("%s\n" % (gnew,))
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                renamed.append((gnew, info))
        return n

    name, ext = op.splitext(cnsfile)
    outdir = op.dirname(cnsfile)
    # ``with`` guarantees the .real file is flushed and closed, also on error.
    with open("%s.real%s" % (name, ext), "w") as real_cns_fh:
        sys.stderr.write("writing to: %s\n" % (real_cns_fh.name,))
        real_cns_fh.write(
            "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval\n")

        crna = read_cns_to_rna(outdir)
        cpro = read_cns_to_protein_exons(outdir)

        # Split the raw CNSs into protein hits, rna hits and "real" CNSs.
        proteins = collections.defaultdict(list)
        rnas = collections.defaultdict(list)
        real_cns_items = []
        for cnsi in CNS.parse_raw_line(cnsfile):
            cns_id = cnsi.cns_id
            cns = cnsi.to_dict()
            key = (cns['qseqid'], cns['sseqid'])
            if cns_id in cpro:
                proteins[key].append((cns, cpro[cns_id]))
            elif cns_id in crna:
                rnas[key].append((cns, crna[cns_id]))
            else:
                real_cns_items.append((cns_id, cns))

        # Trees are built before the entries are renamed in place.
        p_trees = fill_tree(proteins)
        r_trees = fill_tree(rnas)
        nproteins = assign_new_names(proteins, "protein")
        nrnas = assign_new_names(rnas, "rna")

        cns_seen = set()
        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) in within dist.
        for _cns_id, cns in real_cns_items:
            real_cns_fh.write("%s\n" % (cns_to_str(cns),))
            key = (cns['qseqid'], cns['sseqid'])
            # Proteins first, then rnas -- same order as the two former loops.
            for trees, renamed in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, _info in get_new(cns, trees, key, renamed, dist + 1000):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen:
                        continue
                    cns_seen.add(cns_str)
                    real_cns_fh.write("%s\n" % (cns_str,))

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    sys.stderr.write("%s\n" % (len(qnew_pairs),))
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, we add the new cnss again to the same we send in
    # both lists. print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs
help="path to query localdup_file") parser.add_option("--sdups", dest="sdups", type='string', help="path to subject localdup_file") parser.add_option("--cns_file", dest="cns_file", type='string', help="path to cns file cns.txt") parser.add_option("--UMfasta", dest="unmasked_fasta", help="path to unmasked fasta file file") (options, _) = parser.parse_args() qbed = Bed(options.qbed, options.qfasta) qbed.fill_dict() sbed = Bed(options.sbed, options.sfasta) sbed.fill_dict() unmasked_fasta = Fasta(options.unmasked_fasta) assert options.mask in 'FT' qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed" snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed" #pairs_to_qa("{0}.local".format(options.pairs),'pair',"{0}.nolocaldups.local".format(qbed.path.split(".")[0]),"{0}.nolocaldups.local".format(sbed.path.split(".")[0]),"{0}.raw.filtered.local".format(options.pairs.split(".")[0])) import logging LOG_FILENAME = path.dirname(options.qfasta) + "dup_rdups.log" logging.basicConfig(filename=LOG_FILENAME, level=logging.INFO) main(options.cns_file, options.qdups, options.sdups, options.pairs, options.pair_fmt, qbed, sbed, options.qpad, options.spad,
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from the cns
    list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the
    new-genes that _were_ cnss. the proper assignment is then handled in
    assign.py

    Side effects: creates "<bed>_cns.gff" next to each bed path (header line
    only) and writes the remaining CNSs to "<cnsfile stem>.real<ext>".

    Returns (qbed_new.path, sbed_new.path, qnew_pairs), where the new beds
    come from ``print_bed`` and the pairs from ``merge_bed``.

    Raises AssertionError if a bed path does not contain ".bed" (its GFF
    name would equal the bed path and opening it would truncate the bed).
    """
    # Raised explicitly (not ``assert``) so the guard survives ``python -O``.
    qcns_file = qbed_path.replace(".bed", "_cns.gff")
    if qcns_file == qbed_path:
        raise AssertionError(
            "refusing to overwrite query bed %r" % (qbed_path,))
    # The GFF handles are not used after the header is written, so they are
    # closed immediately instead of being left open.
    with open(qcns_file, 'w') as qcns_gff:
        qcns_gff.write("##gff-version 3\n")
    if sbed_path != qbed_path:
        scns_file = sbed_path.replace(".bed", "_cns.gff")
        if scns_file == sbed_path:
            raise AssertionError(
                "refusing to overwrite subject bed %r" % (sbed_path,))
        with open(scns_file, 'w') as scns_gff:
            scns_gff.write("##gff-version 3\n")

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)
    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path)
    qbed.fill_dict()
    sbed = Bed(sbed_path)
    sbed.fill_dict()

    def assign_new_names(prs, protein_or_rna):
        """Rename each (cns, info) entry of ``prs`` in place to a *_cns name.

        The strand of the gene the CNS was attached to is copied onto the
        renamed entry.  Returns {seqid_pair: [(cns, info), ...]}.
        """
        n = {}
        for seqid_pair, li in prs.items():
            renamed = n.setdefault(seqid_pair, [])
            for gnew, info in li[:]:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # Show the offending record, then let the error propagate.
                    sys.stderr.write("%s\n" % (gnew,))
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                renamed.append((gnew, info))
        return n

    name, ext = op.splitext(cnsfile)
    outdir = op.dirname(cnsfile)
    # ``with`` guarantees the .real file is flushed and closed, also on error.
    with open("%s.real%s" % (name, ext), "w") as real_cns_fh:
        sys.stderr.write("writing to: %s\n" % (real_cns_fh.name,))
        real_cns_fh.write(
            "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval\n")

        crna = read_cns_to_rna(outdir)
        cpro = read_cns_to_protein_exons(outdir)

        # Split the raw CNSs into protein hits, rna hits and "real" CNSs.
        proteins = collections.defaultdict(list)
        rnas = collections.defaultdict(list)
        real_cns_items = []
        for cnsi in CNS.parse_raw_line(cnsfile):
            cns_id = cnsi.cns_id
            cns = cnsi.to_dict()
            key = (cns['qseqid'], cns['sseqid'])
            if cns_id in cpro:
                proteins[key].append((cns, cpro[cns_id]))
            elif cns_id in crna:
                rnas[key].append((cns, crna[cns_id]))
            else:
                real_cns_items.append((cns_id, cns))

        # Trees are built before the entries are renamed in place.
        p_trees = fill_tree(proteins)
        r_trees = fill_tree(rnas)
        nproteins = assign_new_names(proteins, "protein")
        nrnas = assign_new_names(rnas, "rna")

        cns_seen = set()
        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) in within dist.
        for _cns_id, cns in real_cns_items:
            real_cns_fh.write("%s\n" % (cns_to_str(cns),))
            key = (cns['qseqid'], cns['sseqid'])
            # Proteins first, then rnas -- same order as the two former loops.
            for trees, renamed in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, _info in get_new(cns, trees, key, renamed, dist + 1000):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen:
                        continue
                    cns_seen.add(cns_str)
                    real_cns_fh.write("%s\n" % (cns_str,))

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    sys.stderr.write("%s\n" % (len(qnew_pairs),))
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, we add the new cnss again to the same we send in
    # both lists. print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs
# Command-line entry: parse options, load both bed/fasta pairs and the
# unmasked fasta, then hand everything to main().
import optparse

parser = optparse.OptionParser("usage: %prog [options] ")
parser.add_option("-F", dest="mask",
                  help="blast mask simple sequence [default: F]", default="F")
parser.add_option("-n", dest="ncpu", help="parallelize to this many cores",
                  type='int', default=8)
parser.add_option("-q", dest="qfasta", help="path to genomic query fasta")
parser.add_option("--qbed", dest="qbed", help="query bed file")
parser.add_option("-s", dest="sfasta", help="path to genomic subject fasta")
parser.add_option("--sbed", dest="sbed", help="subject bed file")
parser.add_option("-p", dest="pairs",
                  help="the pairs file. output from dagchainer")
choices = ("dag", "cluster", "pair", 'qa', 'raw')
parser.add_option("--pair_fmt", dest="pair_fmt", default='raw',
                  help="format of the pairs, one of: %s" % str(choices),
                  choices=choices)
parser.add_option("--qpad", dest="qpad", type='int', default=12000,
                  help="how far from the end of the query gene to look for cnss")
parser.add_option("--spad", dest="spad", type='int', default=26000,
                  help="how far from the end of the subject gene to look for cnss")
parser.add_option("--UMfasta", dest="unmasked_fasta",
                  help="path to unmasked fasta file file")
(options, _) = parser.parse_args()

if not (options.qfasta and options.sfasta and options.sbed and options.qbed):
    # print_help() returns None, so sys.exit(parser.print_help()) reported
    # success (status 0) although required arguments were missing.
    parser.print_help()
    sys.exit(1)

# ``options.mask in 'FT'`` was a substring test that also accepted '' and
# 'FT', and as an assert it vanished under ``python -O``.  Check the two
# legal values explicitly, before the expensive file loading below.
if options.mask not in ("F", "T"):
    parser.error("-F must be 'F' or 'T', got %r" % (options.mask,))

qbed = Bed(options.qbed, options.qfasta)
qbed.fill_dict()
sbed = Bed(options.sbed, options.sfasta)
sbed.fill_dict()
unmasked_fasta = Fasta(options.unmasked_fasta)

main(qbed, sbed, options.pairs, options.qpad, options.spad, unmasked_fasta,
     options.pair_fmt, options.mask, options.ncpu)
def test_main(self):
    """Run main() on the fixture beds in "pair" format and print its result."""
    query_bed = Bed(self.qbed, self.qfasta)
    query_bed.fill_dict()
    subject_bed = Bed(self.sbed, self.sfasta)
    subject_bed.fill_dict()

    # Both pads are 12000; mask "T"; 2 cpus.
    result = main(query_bed, subject_bed, self.pairs, 12000, 12000,
                  "pair", self.blast_path, "T", 2)
    print(result)
spos = sbed[raw.pos_b] key = (raw.seqid_a, raw.seqid_b) if not key in trees: trees[key] = [] qpos = (qpos['start'] + qpos['end']) / 2 spos = (spos['start'] + spos['end']) / 2 trees[key].append((int(qpos), int(spos))) for k in trees: trees[k] = cKDTree(trees[k]) return trees if __name__ == "__main__": import optparse parser = optparse.OptionParser() parser.add_option("--qbed", dest="qbed", help="query bed file") parser.add_option("--sbed", dest="sbed", help="subject bed file") parser.add_option("--cns", dest="cns", help="path to raw cns") parser.add_option("--dist", dest="dist", type='int', help="max dist from gene to cns", default=12000) parser.add_option("--paralogy", dest="paralogy", help="path to paralogy file") parser.add_option("--orthology", dest="orthology", help="path to orthology file") options, args = parser.parse_args() if not (options.sbed and options.qbed and options.cns, options.orthology): sys.exit(parser.print_help()) qbed = Bed(options.qbed); qbed.fill_dict() sbed = Bed(options.sbed); sbed.fill_dict() qbed_new, sbed_new, new_pairs = main(qbed, sbed, options.cns, options.dist, options.orthology) write_new_pairs(options.paralogy, options.orthology, qbed, qbed_new, sbed, sbed_new, new_pairs)
print >> fcnss, "%s,%s,%s,[%s,%s],%s,%s" % (qname, qfeat['seqid'], sname, sfeat['qleft_gene'], sfeat['qright_gene'], sfeat['seqid'], ",".join(map(lambda l: ",".join(map(str,l)), cnss))) return None if __name__ == "__main__": import optparse parser = optparse.OptionParser("usage: %prog [options] ") parser.add_option("-F", dest="mask", help="blast mask simple sequence [default: F]", default="F") parser.add_option("-n", dest="ncpu", help="parallelize to this many cores", type='int', default=8) parser.add_option("-q", dest="qfasta", help="path to genomic query fasta") parser.add_option("--qbed", dest="qbed", help="query bed file") parser.add_option("-s", dest="sfasta", help="path to genomic subject fasta") parser.add_option("--sbed", dest="sbed", help="subject bed file") parser.add_option("-p", dest="pairs", help="the pairs file. output from dagchainer") choices = ("dag", "cluster", "pair", 'qa', 'raw', 'pck') parser.add_option("--pair_fmt", dest="pair_fmt", default='raw', help="format of the pairs, one of: %s" % str(choices), choices=choices) (options, _) = parser.parse_args() if not (options.qfasta and options.sfasta and options.sbed and options.qbed): sys.exit(parser.print_help()) qbed = Bed(options.qbed, options.qfasta); qbed.fill_dict() sbed = Bed(options.sbed, options.sfasta); sbed.fill_dict() assert options.mask in 'FT' main(qbed, sbed, options.pairs, options.pair_fmt, options.mask, options.ncpu)