def setUp(self):
    """Load the rice v6 / sorghum v1 fixtures used by the tests.

    Stores on ``self`` the CNS and pairs file paths, both Bed objects
    (after ``fill_dict()`` has been called on them), the two values
    returned by ``get_cns_dict`` and the two maps returned by
    ``make_pair_maps``.
    """
    cns_path = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
    pairs_path = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
    self.cns_filename = cns_path
    self.pairsfile = pairs_path

    self.qbed = query_bed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
    query_bed.fill_dict()
    self.sbed = subject_bed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
    subject_bed.fill_dict()

    cns_and_evalues = get_cns_dict(cns_path)
    self.cns_dict, self.evalue_dict = cns_and_evalues

    pair_maps = make_pair_maps(pairs_path, "pair", query_bed, subject_bed)
    self.qpair_map, self.spair_map = pair_maps
def print_bed(flist, old_path):
    """Write the unique features of ``flist`` to a new bed file.

    The output path is ``old_path`` with ``.with_new`` inserted before the
    extension. Each item is indexed positionally: 0=seqid, 1=start, 2=end,
    3=accn, 5=strand, 6=locs. Items that compare equal once their locs are
    converted to a tuple are written only once.

    Returns a ``Bed`` built from the path that was written.
    """
    ipath, ext = op.splitext(old_path)
    path = "%s.with_new%s" % (ipath, ext)
    sys.stderr.write("writing to: %s\n" % path)

    seen = set()
    # ``with`` guarantees the handle is flushed and closed even when
    # formatting a row raises.
    with open(path, 'wb') as fh:
        for item in flist:
            # convert the locs to a tuple so the whole item is hashable.
            item = list(item)
            item[6] = tuple(item[6])
            item = tuple(item)
            if item in seen:
                continue
            seen.add(item)

            row = dict(accn=item[3], start=item[1], end=item[2],
                       seqid=item[0], locs=item[6], score='.',
                       strand=item[5], rgb='.', thickstart='.',
                       thickend=".")
            fh.write("%s\n" % Bed.row_string(row))
    return Bed(path)
class TestAssign(unittest.TestCase):
    """Smoke tests for CNS assignment on the rice v6 / sorghum v1 data."""

    def setUp(self):
        """Load the file paths, Bed objects, CNS dict and pair maps."""
        cns_path = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
        pairs_path = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
        self.cns_filename = cns_path
        self.pairsfile = pairs_path

        self.qbed = query_bed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
        query_bed.fill_dict()
        self.sbed = subject_bed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
        subject_bed.fill_dict()

        cns_and_evalues = get_cns_dict(cns_path)
        self.cns_dict, self.evalue_dict = cns_and_evalues
        pair_maps = make_pair_maps(pairs_path, "pair", query_bed, subject_bed)
        self.qpair_map, self.spair_map = pair_maps

    def test_get_cns_dict(self):
        """Print the keys of the evalue dict built in setUp (no assertions)."""
        evalue_keys = self.evalue_dict.keys()
        print("keys! %s" % (evalue_keys,))

    def test_assign(self):
        """Call assign() with the fixtures; the result is discarded."""
        assign(self.cns_dict, self.qbed, self.sbed,
               self.qpair_map, self.spair_map)

    def test_cns_fmt_dict(self):
        """Format every assigned CNS and print the resulting dict."""
        assigned = assign(self.cns_dict, self.qbed, self.sbed,
                          self.qpair_map, self.spair_map)
        for cns, qfeat, sfeat in assigned:
            d = cns_fmt_dict(cns, qfeat, sfeat, self.evalue_dict)
            print("dddddddd %s" % (d,))

    def test_main(self):
        """Placeholder; nothing is exercised."""
        pass
def utr_present(cns_pck, query_bed_path, UTR):
    """Flag genes whose gene boundary coincides with their outermost locs.

    For every CNS in the pickled list ``cns_pck`` the query gene is looked
    up in the bed file. When the strand-aware gene end (``UTR == 3``) or
    start (``UTR == 5``) equals the first coordinate of the smallest loc or
    the second coordinate of the largest loc, the matching ``3_UTR`` /
    ``5_UTR`` column of ``MUSA_GENE_LIST_copy`` is set to ``'ND'``.
    Any other ``UTR`` value results in no updates.

    Each executed statement is printed. Nothing is returned.
    """
    stmt_3 = "update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.3_UTR = 'ND' where MUSA_GENE_LIST_copy.Rice_MSU6_genes = '{0}'"
    stmt_5 = "update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.5_UTR = 'ND' where MUSA_GENE_LIST_copy.Rice_MSU6_genes = '{0}'"

    db = MySQLdb.connect(host="127.0.0.1", user="******", db="rice_gene_table")
    try:
        cursor = db.cursor()
        # NOTE(review): pickle.load runs arbitrary code from the file; only
        # use this on pickles produced by a trusted pipeline run.
        with open(cns_pck) as cns_handle:
            cns_pickle = pickle.load(cns_handle)
        query_bed = Bed(query_bed_path)

        for cns in cns_pickle:
            qfeat = query_bed.accn(cns['qaccn'])
            # On the minus strand the gene "end" is its smaller coordinate.
            if qfeat['strand'] == "+":
                end = qfeat['end']
                start = qfeat["start"]
            else:
                end = qfeat['start']
                start = qfeat["end"]

            if UTR == 3:
                boundary, template = end, stmt_3
            elif UTR == 5:
                boundary, template = start, stmt_5
            else:
                continue

            locs = qfeat['locs']
            if boundary == min(locs)[0] or boundary == max(locs)[1]:
                stmt = template.format(cns['qaccn'])
                print(stmt)
                cursor.execute(stmt)

        # DB-API connections do not autocommit by default; without this
        # the updates are rolled back when the connection goes away.
        db.commit()
    finally:
        db.close()
def utr_present(cns_pck, query_bed_path, UTR):
    """Flag genes whose gene boundary coincides with their outermost locs.

    For every CNS in the pickled list ``cns_pck`` the query gene is looked
    up in the bed file. When the strand-aware gene end (``UTR == 3``) or
    start (``UTR == 5``) equals the first coordinate of the smallest loc or
    the second coordinate of the largest loc, the matching ``3_UTR`` /
    ``5_UTR`` column of ``MUSA_GENE_LIST_copy`` is set to ``'ND'``.
    Any other ``UTR`` value results in no updates.

    Each executed statement is printed. Nothing is returned.
    """
    stmt_3 = "update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.3_UTR = 'ND' where MUSA_GENE_LIST_copy.Rice_MSU6_genes = '{0}'"
    stmt_5 = "update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.5_UTR = 'ND' where MUSA_GENE_LIST_copy.Rice_MSU6_genes = '{0}'"

    db = MySQLdb.connect(host="127.0.0.1", user="******", db="rice_gene_table")
    try:
        cursor = db.cursor()
        # NOTE(review): pickle.load runs arbitrary code from the file; only
        # use this on pickles produced by a trusted pipeline run.
        with open(cns_pck) as cns_handle:
            cns_pickle = pickle.load(cns_handle)
        query_bed = Bed(query_bed_path)

        for cns in cns_pickle:
            qfeat = query_bed.accn(cns['qaccn'])
            # On the minus strand the gene "end" is its smaller coordinate.
            if qfeat['strand'] == "+":
                end = qfeat['end']
                start = qfeat["start"]
            else:
                end = qfeat['start']
                start = qfeat["end"]

            if UTR == 3:
                boundary, template = end, stmt_3
            elif UTR == 5:
                boundary, template = start, stmt_5
            else:
                continue

            locs = qfeat['locs']
            if boundary == min(locs)[0] or boundary == max(locs)[1]:
                stmt = template.format(cns['qaccn'])
                print(stmt)
                cursor.execute(stmt)

        # DB-API connections do not autocommit by default; without this
        # the updates are rolled back when the connection goes away.
        db.commit()
    finally:
        db.close()
class TestAssign(unittest.TestCase):
    """Smoke tests for CNS assignment on the rice v6 / sorghum v1 data."""

    def setUp(self):
        """Load the file paths, Bed objects, CNS dict and pair maps."""
        cns_path = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
        pairs_path = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
        self.cns_filename = cns_path
        self.pairsfile = pairs_path

        self.qbed = query_bed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
        query_bed.fill_dict()
        self.sbed = subject_bed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
        subject_bed.fill_dict()

        cns_and_evalues = get_cns_dict(cns_path)
        self.cns_dict, self.evalue_dict = cns_and_evalues
        pair_maps = make_pair_maps(pairs_path, "pair", query_bed, subject_bed)
        self.qpair_map, self.spair_map = pair_maps

    def test_get_cns_dict(self):
        """Print the keys of the evalue dict built in setUp (no assertions)."""
        evalue_keys = self.evalue_dict.keys()
        print("keys! %s" % (evalue_keys,))

    def test_assign(self):
        """Call assign() with the fixtures; the result is discarded."""
        assign(self.cns_dict, self.qbed, self.sbed,
               self.qpair_map, self.spair_map)

    def test_cns_fmt_dict(self):
        """Format every assigned CNS and print the resulting dict."""
        assigned = assign(self.cns_dict, self.qbed, self.sbed,
                          self.qpair_map, self.spair_map)
        for cns, qfeat, sfeat in assigned:
            d = cns_fmt_dict(cns, qfeat, sfeat, self.evalue_dict)
            print("dddddddd %s" % (d,))

    def test_main(self):
        """Placeholder; nothing is exercised."""
        pass
def setUp(self):
    """Load the rice_t / sorghum v1 fixtures for the missed-gene tests.

    Stores the original and missed Bed objects, the matches file path,
    the result of ``parse_missed_genes`` and the two values returned by
    ``group_genes_in_bed`` on ``self``.
    """
    self.old_bed = original = Bed("data/rice_t_sorghum_v1/sorghum_v1.bed")
    self.missed_bed = missed = Bed(
        "data/rice_t_sorghum_v1/missed_sorghum_v1_from_rice_b.bed")

    matches_path = "data/rice_t_sorghum_v1/missed_sorghum_v1_from_rice_b.matches.txt"
    self.matches = matches_path
    self.missed_genes = parsed = parse_missed_genes(matches_path)

    grouped_and_dict = group_genes_in_bed(parsed, original, missed)
    self.missed_genes_grouped, self.missed_genes_dict = grouped_and_dict
def setUp(self):
    """Load the rice v6 / setaria64 fixtures and group the blast hits.

    Builds both Bed objects (with their fasta files) and calls
    ``fill_dict()`` on them, looks up the subject feature ``Si000834m``,
    reads the ``blast_res`` file from the current directory and stores the
    two values returned by ``group_cds`` as ``self.d`` and ``self.pseudo``.
    """
    self.qallbed = Bed("data/rice_v6_setaria64/rice_v6.all.bed",
                       "data/rice_v6_setaria64/rice_v6.fasta")
    self.qallbed.fill_dict()
    self.sallbed = Bed("data/rice_v6_setaria64/setaria64.all.bed",
                       "data/rice_v6_setaria64/setaria64.fasta")
    self.sallbed.fill_dict()
    self.saccn = self.sallbed.accn("Si000834m")
    # Close the handle deterministically; setUp runs once per test method.
    with open("blast_res") as blastfh:
        self.blast = blastfh.read()
    self.d, self.pseudo = group_cds(self.blast, self.saccn)
def setUp(self):
    """Load the rice v6 / sorghum v1 fixtures used by the tests.

    Stores on ``self`` the CNS and pairs file paths, both Bed objects
    (after ``fill_dict()`` has been called on them), the two values
    returned by ``get_cns_dict`` and the two maps returned by
    ``make_pair_maps``.
    """
    cns_path = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
    pairs_path = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"
    self.cns_filename = cns_path
    self.pairsfile = pairs_path

    self.qbed = query_bed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
    query_bed.fill_dict()
    self.sbed = subject_bed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
    subject_bed.fill_dict()

    cns_and_evalues = get_cns_dict(cns_path)
    self.cns_dict, self.evalue_dict = cns_and_evalues

    pair_maps = make_pair_maps(pairs_path, "pair", query_bed, subject_bed)
    self.qpair_map, self.spair_map = pair_maps
def main(bedfile, seqfile, gene_list):
    """Search the promoters of the listed genes and graph the matches.

    Prints a CSV-style header line, then for each name in ``gene_list``
    looks the gene up in the bed file, obtains two promoter sequences from
    ``get_prom`` (named forward/reverse here after the original ``promf`` /
    ``promr``), prints the gene name, runs ``find_seq`` on each promoter
    and hands both results to ``make_graph``.
    """
    print("position,gene,element")
    bed = Bed(bedfile)
    fasta = Fasta(seqfile)
    for gene_name in gene_list:
        feature = bed.accn(gene_name)
        forward_prom, reverse_prom = get_prom(fasta, feature)
        print(gene_name)
        forward_hits = find_seq(forward_prom)
        reverse_hits = find_seq(reverse_prom)
        make_graph(forward_hits, reverse_hits, gene_name)
def setUp(self):
    """Load the rice/maize blast fixture, fasta and bed annotations.

    Reads every line of the blast file and joins them with ``" , "`` into
    ``self.blast_str``, opens the unmasked maize fasta, builds and fills
    both Bed objects and looks up one subject and one query feature.
    """
    blast_path = "/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt"
    # Close the handle deterministically; setUp runs once per test method.
    with open(blast_path) as handle:
        blast_lines = handle.readlines()
    self.blast_str = " , ".join(blast_lines)
    self.unmasked_fasta = Fasta("/Users/gturco/find_cns/maize_v2_UM.fasta")
    self.qbed = Bed("/Users/gturco/rice_maize/rice_v6.bed")
    self.qbed.fill_dict()
    self.sbed = Bed("/Users/gturco/maize/maize_v2.bed",
                    "/Users/gturco/maize/maize_v2.fasta")
    self.sbed.fill_dict()
    self.sfeat = self.sbed.accn("GRMZM2G086714")
    self.qfeat = self.qbed.accn("Os09g27050")
class TestPseudo(unittest.TestCase):
    """Tests for CDS grouping, crossing-hit removal and ORF finding."""

    def setUp(self):
        """Load the rice v6 / setaria64 beds and group the blast hits."""
        self.qallbed = Bed("data/rice_v6_setaria64/rice_v6.all.bed",
                           "data/rice_v6_setaria64/rice_v6.fasta")
        self.qallbed.fill_dict()
        self.sallbed = Bed("data/rice_v6_setaria64/setaria64.all.bed",
                           "data/rice_v6_setaria64/setaria64.fasta")
        self.sallbed.fill_dict()
        self.saccn = self.sallbed.accn("Si000834m")
        # Close the handle deterministically; setUp runs for every test.
        with open("blast_res") as blastfh:
            self.blast = blastfh.read()
        self.d, self.pseudo = group_cds(self.blast, self.saccn)

    def test_group_cds_1(self):
        """The fixture yields 4 groups holding 38 hits in total."""
        self.assertEqual(len(self.d.keys()), 4)
        total_values = [len(self.d[key]) for key in self.d.keys()]
        self.assertEqual(sum(total_values), 38)

    def test_group_cds_2(self):
        """The second blast file yields 5 groups with one hit each."""
        with open("blast_2") as blast_2fh:
            blast_2 = blast_2fh.read()
        d, pseudo = group_cds(blast_2, self.sallbed.accn("Si002524m"))
        self.assertEqual(len(d.keys()), 5)
        for key in d.keys():
            self.assertEqual(1, len(d[key]))

    def test_append_to_included_groups(self):
        """The locs are appended to the first two groups but not the third."""
        locs = [1, 2, 3, 4]
        group_dict = {(2, 5): [], (3, 6): [], (9, 8): []}
        result_dict = append_to_included_groups(locs, group_dict)
        expected = {(2, 5): [(1, 2, 3, 4)],
                    (3, 6): [(1, 2, 3, 4)],
                    (9, 8): []}
        # assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(expected, result_dict)

    def test_remove_crossing_hit(self):
        """Smoke test: bites() must return a 3-tuple for multi-hit groups."""
        qaccn = self.qallbed.accn("Os01g01890")
        for group_key in self.d.keys():
            exon_hits = self.d[group_key]
            non_crossing = remove_crossing_hits(exon_hits, qaccn, self.saccn)
            if len(non_crossing) > 1:
                # Unpacking is the only check performed here.
                mid, start, stop = bites(non_crossing)

    def test_find_orf(self):
        """find_orf on Os01g01295 returns 141083 (asserted as orf + 1)."""
        qaccn = self.qallbed.accn("Os01g01295")
        orf = find_orf(self.qallbed, qaccn)
        self.assertEqual(orf + 1, 141084)

    def test_find_orf_neg(self):
        """find_orf on Si001539m returns 7662777."""
        saccn = self.sallbed.accn("Si001539m")
        orf = find_orf(self.sallbed, saccn)
        self.assertEqual(orf, 7662777)
def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt, qdsid, sdsid, qpad, spad):
    """Assign CNSs to genes, print them as CSV and write CNS GFF files.

    The GFF paths are derived from the bed paths by replacing
    ``.nolocaldups`` with ``_cns.gff``; an ``AssertionError`` is raised when
    a bed path does not contain that substring. When query and subject bed
    are the same file a single GFF handle is shared for both.

    For every ``(cns, qfeat, sfeat)`` yielded by ``assign`` the formatted
    dict gets a ``cns_id`` and a ``link``, its subject coordinates are put in
    ascending order, one CSV row is written to stdout and the dict is passed
    to ``write_gff``.
    """
    qcns_file = qbed_file.replace(".nolocaldups", "_cns.gff")
    assert qcns_file != qbed_file
    qcns_gff = open(qcns_file, 'w')
    scns_gff = qcns_gff
    try:
        qcns_gff.write("##gff-version 3\n")
        if sbed_file != qbed_file:
            scns_file = sbed_file.replace(".nolocaldups", "_cns.gff")
            assert scns_file != sbed_file
            scns_gff = open(scns_file, 'w')
            scns_gff.write("##gff-version 3\n")

        qbed = Bed(qbed_file)
        qbed.fill_dict()
        sbed = Bed(sbed_file)
        sbed.fill_dict()

        cnsdict, evaldict = get_cns_dict(cnsfile)
        qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
        out = sys.stdout

        fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
              "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(eval)s,%(link)s"
        # The header is the format string stripped of its %(...)s / %(...)i markup.
        out.write("#" + fmt.replace("%(", "").replace(")s", "").replace(")i", "") + "\n")

        for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed, qpair_map, spair_map):
            d = cns_fmt_dict(cns, qfeat, sfeat, evaldict)
            d['cns_id'] = cns_id(d)
            if d['sstop'] < d['sstart']:
                d['sstop'], d['sstart'] = d['sstart'], d['sstop']
            d['link'] = cns_link(d, qdsid, sdsid, qpad, spad)
            out.write(fmt % d + "\n")
            write_gff(d, qcns_gff, scns_gff)
    finally:
        # Flush and close the GFF output(s) even when assignment fails.
        if scns_gff is not qcns_gff:
            scns_gff.close()
        qcns_gff.close()
def write_new_bed(gene_list, old_bed, missed_genes, out_file):
    """Write a bed file combining the old genes with the new/updated ones.

    Rows of ``old_bed`` whose accn appears as the first element of a
    ``missed_genes`` pair are skipped. Every feature in the ``gene_list``
    mapping is then appended; features with more than one loc are first
    passed through ``merge_feats``.
    """
    # A set gives constant-time membership tests for every old_bed row.
    hit_accns = set(hit for hit, qaccn in missed_genes)
    with open(out_file, "wb") as merge_fh:
        for gene in old_bed:
            if gene["accn"] in hit_accns:
                continue
            merge_fh.write("{0}\n".format(Bed.row_string(gene)))
        for new_gene in gene_list:
            # merge overlapping here
            updated_feat = gene_list[new_gene]
            if len(updated_feat["locs"]) > 1:
                updated_feat = merge_feats(updated_feat)
            merge_fh.write("{0}\n".format(Bed.row_string(updated_feat)))
def write_new_bed(gene_list, old_bed, missed_genes, out_file):
    """Write a bed file combining the old genes with the new/updated ones.

    Rows of ``old_bed`` whose accn appears as the first element of a
    ``missed_genes`` pair are skipped. Every feature in the ``gene_list``
    mapping is then appended; features with more than one loc are first
    passed through ``merge_feats``.
    """
    # A set gives constant-time membership tests for every old_bed row.
    hit_accns = set(hit for hit, qaccn in missed_genes)
    with open(out_file, "wb") as merge_fh:
        for gene in old_bed:
            if gene["accn"] in hit_accns:
                continue
            merge_fh.write("{0}\n".format(Bed.row_string(gene)))
        for new_gene in gene_list:
            # merge overlapping here
            updated_feat = gene_list[new_gene]
            if len(updated_feat["locs"]) > 1:
                updated_feat = merge_feats(updated_feat)
            merge_fh.write("{0}\n".format(Bed.row_string(updated_feat)))
def setUp(self):
    """Load the rice/maize blast fixture, fasta and bed annotations.

    Reads every line of the blast file and joins them with ``' , '`` into
    ``self.blast_str``, opens the unmasked maize fasta, builds and fills
    both Bed objects and looks up one subject and one query feature.
    """
    blast_path = '/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt'
    # Close the handle deterministically; setUp runs once per test method.
    with open(blast_path) as handle:
        blast_lines = handle.readlines()
    self.blast_str = ' , '.join(blast_lines)
    self.unmasked_fasta = Fasta('/Users/gturco/find_cns/maize_v2_UM.fasta')
    self.qbed = Bed('/Users/gturco/rice_maize/rice_v6.bed')
    self.qbed.fill_dict()
    self.sbed = Bed('/Users/gturco/maize/maize_v2.bed',
                    '/Users/gturco/maize/maize_v2.fasta')
    self.sbed.fill_dict()
    self.sfeat = self.sbed.accn('GRMZM2G086714')
    self.qfeat = self.qbed.accn('Os09g27050')
def parse_dups(dups_file, flat):
    """Classify genes listed in a local-dups file.

    Each line of ``dups_file`` is a tab-separated list of accns: the first
    is the parent, the rest its duplicates. Returns a dict mapping

    * each parent accn to ``'P'``,
    * each duplicate accn (first time it is seen) to its parent accn,
    * every other gene that ``flat.get_features_in_region`` reports between
      the start of the left-most and the end of the right-most member of
      the array to ``'I'`` (intervening).

    THIS ONLY WORKS IF WE CHANGE QUOTA (note kept from the original author).
    """
    flat.fill_dict()
    dup_dic = {}
    seen = set()
    with open(dups_file) as dups_fh:
        for line in dups_fh:
            accns = line.strip().split("\t")
            parent = accns[0]
            dups = accns[1:]

            members = [Bed.row_to_dict(flat.d[f]) for f in set(accns)]
            members.sort(key=operator.itemgetter('start'))
            dup_start = members[0]
            dup_end = members[-1]

            dup_dic[parent] = 'P'
            seen.add(parent)
            for dup in dups:
                if dup in seen:
                    continue
                seen.add(dup)
                dup_dic[dup] = parent

            # so here, there are all the genes that arent part of the local
            # dup array, but we want to mark them with 'I'
            intervening = flat.get_features_in_region(
                dup_start['seqid'], dup_start['start'], dup_end['end'])
            for ii in intervening:
                # Every member of the array (parent included) is already a
                # key of dup_dic, so this single test also skips them.
                if ii['accn'] not in dup_dic:
                    dup_dic[ii['accn']] = 'I'
    return dup_dic
def main_gene(feature_file, query_list_pos, query_list_neg):
    """Count interval hits over the locs and 300 bp flanks of each feature.

    ``query_list_pos`` / ``query_list_neg`` map a seqid to an object with a
    ``find(start, end)`` method. For every feature in the bed file:

    * each position of each loc (inclusive) is queried on the tree matching
      the feature strand and the hit counts are summed into ``cds``;
    * the 300 bp before the first loc and after the last loc are queried
      and recorded as 5'/3' (swapped for "-" strand features).

    Returns ``(cds, three_all, five_all)``, three parallel lists.
    """
    cds, three_all, five_all = [], [], []
    feature_bed = Bed(feature_file)
    for feature in feature_bed:
        per_base_hits = []
        for exon in feature['locs']:
            for pos in range(exon[0], exon[1] + 1):
                if feature["strand"] == "+":
                    tree = query_list_pos[feature['seqid']]
                else:
                    tree = query_list_neg[feature['seqid']]
                per_base_hits.append(len(tree.find(pos, pos)))
        cds.append(sum(per_base_hits))

        # NOTE(review): both flank branches query query_list_pos, even for
        # "-" strand features, unlike the per-base loop above -- confirm
        # this is intended. A strand other than "+"/"-" leaves the flank
        # variables at their previous values (or unbound on the first row).
        if feature["strand"] == "+":
            upstream_edge = int(feature['locs'][0][0])
            five_prime = query_list_pos[feature['seqid']].find(
                upstream_edge - 300, upstream_edge)
            downstream_edge = int(feature['locs'][-1][1])
            three_prime = query_list_pos[feature['seqid']].find(
                downstream_edge, downstream_edge + 300)
        elif feature["strand"] == "-":
            low_edge = int(feature['locs'][0][0])
            three_prime = query_list_pos[feature['seqid']].find(
                low_edge - 300, low_edge)
            high_edge = int(feature['locs'][-1][1])
            five_prime = query_list_pos[feature['seqid']].find(
                high_edge, high_edge + 300)
        three_all.append(len(three_prime))
        five_all.append(len(five_prime))
    return cds, three_all, five_all
def main(cns_path, fmt, query_bed_path, subject_bed_path):
    """Classify every CNS by its position relative to its query gene.

    First pass: coordinates are converted to int, line geometries are built
    for the gene, its locs span and the CNS on both genomes, ``cns_type``
    is called with them and ``create_utr_list`` records the CNS in
    ``utr_dict`` for the query ("q") and subject ("s") feature.

    Second pass: CNSs typed ``5-prox_dist`` / ``3-prox_dist`` are resolved
    to ``*-proximal`` when they lie within 1000 of the min/max value stored
    in ``utr_dict`` for the query gene (strand-aware) and to ``*-distal``
    otherwise.

    Returns the list of CNS dicts, modified in place.
    """
    cns_dic = cns_to_dic(cns_path, fmt)
    query_bed = Bed(query_bed_path)
    subject_bed = Bed(subject_bed_path)
    utr_dict = {}

    for cns in cns_dic:
        for coord in ('qstop', 'qstart', 'sstop', 'sstart'):
            cns[coord] = int(cns[coord])

        qfeat = query_bed.accn(cns['qaccn'])
        sfeat = subject_bed.accn(cns['saccn'])

        qgene_space_start = min(qfeat['locs'])[0]
        qgene_space_end = max(qfeat['locs'])[1]
        qgene_space_poly = LineString([(0.0, qgene_space_start),
                                       (0.0, qgene_space_end)])
        qgene_poly = LineString([(0.0, qfeat['start']), (0.0, qfeat['end'])])
        sgene_poly = LineString([(0.0, sfeat['start']), (0.0, sfeat['end'])])

        # if intron of one dont need to check other
        qcns = LineString([(0, cns['qstart']), (0, cns['qstop'])])
        scns = LineString([(0, cns['sstart']), (0, cns['sstop'])])

        cns_type(cns, qgene_space_poly, qgene_poly, sgene_poly, scns, qcns,
                 qgene_space_start, qfeat)
        create_utr_list(utr_dict, qfeat, cns, "q")
        create_utr_list(utr_dict, sfeat, cns, "s")

    # (proximal label, distal label) for each unresolved type.
    resolved = {
        "5-prox_dist": ("5-proximal", "5-distal"),
        "3-prox_dist": ("3-proximal", "3-distal"),
    }
    for cns in cns_dic:
        kind = cns['type']
        if kind == "5-prox_dist":
            qgene_start = min(utr_dict[cns['qaccn']])
            qgene_stop = max(utr_dict[cns['qaccn']])
            plus_gap = abs(qgene_start - cns["qstop"])
            minus_gap = abs(qgene_stop - cns["qstart"])
        elif kind == "3-prox_dist":
            qgene_start = min(utr_dict[cns['qaccn']])
            qgene_stop = max(utr_dict[cns['qaccn']])
            plus_gap = abs(cns["qstart"] - qgene_stop)
            minus_gap = abs(cns["qstop"] - qgene_start)
        else:
            continue

        proximal, distal = resolved[kind]
        on_plus = cns["qstrand"] == "+"
        on_minus = cns["qstrand"] == "-"
        if (on_plus and plus_gap <= 1000) or (on_minus and minus_gap <= 1000):
            cns["type"] = proximal
        elif (on_plus and plus_gap > 1000) or (on_minus and minus_gap > 1000):
            cns["type"] = distal
    return cns_dic
def main(cns_file, bedpath, fastapath):
    """Write position-specific fasta output for every gene with CNSs.

    For each gene key returned by ``get_genespace`` that is present in the
    bed file, a single search window of the gene extended by 12000 on both
    sides (clamped at 0) is built, the gene's locs are sorted in place and
    everything is handed to ``write_to_pos_fasta`` together with the
    handles returned by ``open_files``. Genes missing from the bed file
    are skipped.
    """
    flank = 12000
    genespace = get_genespace(cns_file)
    bed = Bed(bedpath)
    f = Fasta(fastapath)
    handles = ['3_utr', '5_utr', 'intronic', '5_prox', '5_distal',
               '3_prox', '3_distal']
    fhs = open_files(handles)
    for gene in genespace.keys():
        try:
            accn = bed.accn(gene)
        except KeyError:
            # gene is not annotated in this bed file
            continue
        window = (max(0, accn['start'] - flank), accn['end'] + flank)
        cnsspace = [window]
        locs = accn['locs']
        locs.sort()
        cnsspace.sort()
        write_to_pos_fasta(bed, accn, locs, cnsspace, fhs, f)
class LocalDups(object):
    """Reader for a local-duplicates file, resolved against a bed file.

    ``filename`` is the path of the local-dups file (one ``DupLine`` per
    line); ``bed`` is the path of the bed file, loaded with ``fill_dict()``.
    """

    def __init__(self, filename, bed):
        self.filename = filename
        self.bed = Bed(bed)
        self.bed.fill_dict()

    def get_order_dups(self):
        """Return {accn: label} using the order from ``DupLine.get_order``.

        The first feature of each ordered line is labelled ``"P"``, the
        remaining ones map to that first feature's accn and intervening
        genes not already labelled get ``"I"``.
        """
        d = {}
        # The original called ``self.filename.close()`` on the path string,
        # which raised AttributeError before the dict could be returned;
        # the handle opened here is what has to be closed.
        with open(self.filename) as dups_fh:
            for line in dups_fh:
                dupline = DupLine(line)
                dups = dupline.get_order(self.bed)
                parent_accn = dups[0]['accn']
                d[parent_accn] = "P"
                for dup in dups[1:]:
                    d[dup['accn']] = parent_accn
                for i in dupline.get_interving_genes(self.bed):
                    # never overwrite a parent/duplicate label
                    d.setdefault(i, "I")
        return d

    def write_ordered(self, out_fh):
        """write localdups to outfile"""
        # NOTE(review): get_order_dups indexes the items returned by
        # get_order with ['accn'], while they are joined as plain strings
        # here -- confirm which element type get_order actually returns.
        with open(self.filename) as dups_fh:
            with open(out_fh, "w") as localdup_fh:
                for line in dups_fh:
                    dupline = DupLine(line)
                    dups = dupline.get_order(self.bed)
                    localdup_fh.write("{0}\n".format("\t".join(dups)))

    def get_dups(self):
        """Return {accn: label} using each line's own parent/children.

        The parent maps to ``'P'``, each child to the parent accn and
        intervening genes not already labelled to ``"I"``.
        """
        d = {}
        # See get_order_dups: close the opened handle, not the path string.
        with open(self.filename) as dups_fh:
            for line in dups_fh:
                dupline = DupLine(line)
                d[dupline.parent] = 'P'
                for dup in dupline.children:
                    d[dup] = dupline.parent
                for i in dupline.get_interving_genes(self.bed):
                    d.setdefault(i, "I")
        return d
def write_genelist(q_or_s, outfile, flat, pairs, orthos, mcnss, link_fmt, this_org, other_org, other_flat, dups, local_dups):
    """Write a tab-separated gene list with ortholog / CNS / dup info.

    One row is written per feature in ``flat``. Columns come from the bed
    row (accn, seqid, start, end, strand) plus: comma-joined ortholog
    accns, the ortholog CNS count, regional and local dup labels, the
    annotation of new ``_cns_protein`` / ``_cns_rna`` genes and a link
    built from ``link_fmt`` for the first ortholog.

    Annotations are read from ``<q_or_s>_protein_rna.anno`` next to
    ``flat.path`` (plus ``s_protein_rna.anno`` when both flats are the
    same file). A diagnostic is written to stderr for features that have
    ortholog CNSs but no ortholog.
    """
    def read_annos(path):
        # one "key,value" pair per line; close the handle when done
        with open(path) as anno_fh:
            return dict([kv.rstrip().split(",") for kv in anno_fh])

    # used in the link_fmt
    qorg, sorg = this_org, other_org
    fmt = "%(accn)s\t%(seqid)s\t%(start)i\t%(end)i\t%(ortholog)s\t%(ortho_cns)s\t"
    fmt += "%(regional_dup_info)s\t%(local_dup_info)s\t%(strand)s\t"
    fmt += "%(new_gene_info)s\t%(link)s"
    header = fmt.replace('%(', '').replace(')s', '').replace(')i', '')

    outdir = op.dirname(flat.path)
    annos = read_annos("%s/%s_protein_rna.anno" % (outdir, q_or_s))
    if flat.path == other_flat.path:
        annos.update(read_annos("%s/s_protein_rna.anno" % (outdir,)))

    is_subject = q_or_s == 's'
    sys.stderr.write("writing genelist to %s\n" % (outfile,))
    with open(outfile, 'w') as out:
        out.write(header.replace('ortho_', other_org + '_') + "\n")
        for feat in flat:
            accn = feat['accn']
            these_pairs = pairs.get(accn, [])
            cnss = mcnss.get(accn, [])

            orthologs, other_pairs = split_pairs(
                feat, [other_flat.d[t] for t in these_pairs], orthos,
                is_subject)
            ortho_cns, non_ortho_cns = split_cns(cnss, orthos, is_subject)

            if orthologs:
                link = link_fmt % dict(qorg=qorg, sorg=sorg,
                                       accn1=orthologs[0]['accn'],
                                       accn2=accn)
            else:
                link = ''

            new_gene_info = ""
            if accn.endswith(("_cns_protein", "_cns_rna")):
                # a missing key comes from coannotation of a previous run
                new_gene_info = annos.get(accn, "")

            ortholog = ",".join([o["accn"] for o in orthologs])
            if len(ortho_cns) > 0 and len(ortholog) == 0:
                # fell right on the edge of a syntenic block. the cns got
                # in, but not the gene.
                sys.stderr.write(
                    "\nBAD %s \n%s \nthese: %s \nother: %s \n\n\n"
                    % (feat, ortho_cns, these_pairs, other_pairs))
            other_pairs = ",".join([o["accn"] for o in other_pairs])

            # Build the row explicitly rather than mutating locals(),
            # whose contents are documented as not to be modified.
            fmt_dict = {
                'ortholog': ortholog,
                'regional_dup_info': dups.get(accn, ''),
                'local_dup_info': local_dups.get(accn, ''),
                'new_gene_info': new_gene_info,
                'link': link,
            }
            fmt_dict.update(Bed.row_to_dict(feat))
            fmt_dict.update({
                'ortho_cns': len(ortho_cns) if ortholog else "",
                'ortho_NON_cns_count': len(non_ortho_cns) if other_pairs else "",
            })
            out.write(fmt % fmt_dict + "\n")
class LocalDups(object):
    """Reader for a local-duplicates file, resolved against a bed file.

    ``filename`` is the path of the local-dups file (one ``DupLine`` per
    line); ``bed`` is the path of the bed file, loaded with ``fill_dict()``.
    """

    def __init__(self, filename, bed):
        self.filename = filename
        self.bed = Bed(bed)
        self.bed.fill_dict()

    def get_order_dups(self):
        """Return {accn: label} using the order from ``DupLine.get_order``.

        The first feature of each ordered line is labelled ``"P"``, the
        remaining ones map to that first feature's accn and intervening
        genes not already labelled get ``"I"``.
        """
        d = {}
        # The original called ``self.filename.close()`` on the path string,
        # which raised AttributeError before the dict could be returned;
        # the handle opened here is what has to be closed.
        with open(self.filename) as dups_fh:
            for line in dups_fh:
                dupline = DupLine(line)
                dups = dupline.get_order(self.bed)
                parent_accn = dups[0]['accn']
                d[parent_accn] = "P"
                for dup in dups[1:]:
                    d[dup['accn']] = parent_accn
                for i in dupline.get_interving_genes(self.bed):
                    # never overwrite a parent/duplicate label
                    d.setdefault(i, "I")
        return d

    def write_ordered(self, out_fh):
        """write localdups to outfile"""
        # NOTE(review): get_order_dups indexes the items returned by
        # get_order with ['accn'], while they are joined as plain strings
        # here -- confirm which element type get_order actually returns.
        with open(self.filename) as dups_fh:
            with open(out_fh, "w") as localdup_fh:
                for line in dups_fh:
                    dupline = DupLine(line)
                    dups = dupline.get_order(self.bed)
                    localdup_fh.write("{0}\n".format("\t".join(dups)))

    def get_dups(self):
        """Return {accn: label} using each line's own parent/children.

        The parent maps to ``'P'``, each child to the parent accn and
        intervening genes not already labelled to ``"I"``.
        """
        d = {}
        # See get_order_dups: close the opened handle, not the path string.
        with open(self.filename) as dups_fh:
            for line in dups_fh:
                dupline = DupLine(line)
                d[dupline.parent] = 'P'
                for dup in dupline.children:
                    d[dup] = dupline.parent
                for i in dupline.get_interving_genes(self.bed):
                    d.setdefault(i, "I")
        return d
def merge_flat(new_name, aflat, bflat):
    """take 2 flat files and return a new one that is the union of the 2 existing

    Rows are de-duplicated on ``(seqid, accn)`` with rows from ``aflat``
    taking precedence, sorted by ``(seqid, start)`` and written to
    ``new_name``. Returns a ``Bed`` opened on the new file.
    """
    seen = set()
    both = []
    for flat in (aflat, bflat):
        for row in flat:
            key = row['seqid'], row['accn']
            if key in seen:
                continue
            seen.add(key)
            both.append(row)

    both.sort(key=lambda a: (a['seqid'], a['start']))
    # ``with`` closes the file even if formatting one of the rows fails.
    with open(new_name, "w") as fh:
        for b in both:
            fh.write("%s\n" % Bed.row_string(b))
    return Bed(new_name)
def main(cns_file, bedpath, fastapath):
    """Write position-specific fasta output for every gene with CNSs.

    For each gene key returned by ``get_genespace`` that is present in the
    bed file, a single search window of the gene extended by 12000 on both
    sides (clamped at 0) is built, the gene's locs are sorted in place and
    everything is handed to ``write_to_pos_fasta`` together with the
    handles returned by ``open_files``. Genes missing from the bed file
    are skipped.
    """
    flank = 12000
    genespace = get_genespace(cns_file)
    bed = Bed(bedpath)
    f = Fasta(fastapath)
    handles = ['3_utr', '5_utr', 'intronic', '5_prox', '5_distal',
               '3_prox', '3_distal']
    fhs = open_files(handles)
    for gene in genespace.keys():
        try:
            accn = bed.accn(gene)
        except KeyError:
            # gene is not annotated in this bed file
            continue
        window = (max(0, accn['start'] - flank), accn['end'] + flank)
        cnsspace = [window]
        locs = accn['locs']
        locs.sort()
        cnsspace.sort()
        write_to_pos_fasta(bed, accn, locs, cnsspace, fhs, f)
def merge_same_hits(missed, fh_match, org_bed):
    """ groups genes that hit more then once

    ``fh_match`` is the path of a file with one ``qaccn<TAB>saccn`` pair per
    line (the text after the last newline is ignored). Hits from ``missed``
    are grouped under ``(seqid, saccn)``: a later hit that overlaps the
    group only contributes its locs; one lying entirely before or after it
    is merged (extending start/end) when ``get_intervening_genes`` returns
    ``False`` and is otherwise stored separately under ``(seqid, qaccn)``.
    Hits whose qaccn is unknown to ``missed`` are skipped.

    The groups are written, with sorted locs, to ``missed_from_<bed name>``
    in the directory of ``org_bed.path``.
    """
    with open(fh_match) as handle:
        matches = handle.read()
    dirc, _sep, org = org_bed.path.rpartition('/')

    d = {}
    for match in matches.split('\n')[:-1]:
        qaccn, saccn = match.split('\t')
        try:
            haccn = missed.accn(qaccn)
            seqid = haccn['seqid']
        except KeyError:
            continue

        key = (seqid, saccn)
        if key not in d:
            # first hit for this subject gene: keep the whole feature
            d[key] = haccn
            continue

        # else add locs to the existing one
        group = d[key]
        gene_start = min(group['locs'])[0]
        gene_end = max(group['locs'])[1]
        missed_start, missed_end = haccn['locs'][0][0], haccn['locs'][0][1]

        if missed_end < gene_start:
            # if no intervening genes and they are close together...
            intervening_genes = get_intervening_genes(
                missed_end, gene_start, seqid, org_bed, group['accn'])
            if intervening_genes is False:
                group['locs'] = group['locs'] + haccn['locs']
                group['start'] = missed_start
                if 'Os' in qaccn:
                    group['accn'] = qaccn
            else:
                d[(seqid, qaccn)] = haccn
        elif gene_end < missed_start:
            intervening_genes = get_intervening_genes(
                gene_end, missed_start, seqid, org_bed, group["accn"])
            if intervening_genes is False:
                group['locs'] = group['locs'] + haccn['locs']
                group['end'] = missed_end
                if 'Os' in qaccn:
                    group['accn'] = qaccn
            else:
                d[(seqid, qaccn)] = haccn
        else:
            # overlapping hit: only the locs are added
            group['locs'] = group['locs'] + haccn['locs']

    # Close the output explicitly so callers that read the file straight
    # away do not depend on garbage collection to flush it.
    with open('{0}/missed_from_{1}'.format(dirc, org), "wb") as fh:
        for key in d.keys():
            row = d[key]
            row['locs'].sort()
            fh.write("%s\n" % Bed.row_string(row))
def merge(org_bed, missed, merge_file):
    """creates blast.all file and updates everything

    Rows of ``missed`` whose accn is unknown to ``org_bed`` are kept as new
    genes. For rows whose accn already exists, each missed loc that
    intersects existing locs is fused with them into a single loc spanning
    all of them; the remaining missed locs are appended, and the gene's
    start/end are widened to cover its locs. Rows of ``org_bed`` that were
    not touched are added unchanged. All rows are sorted by seqid, then
    start, and written to ``merge_file``.
    """
    # NOTE(review): this handle is never closed explicitly -- confirm that
    # callers do not read merge_file before the interpreter releases it.
    merge_fh = open(merge_file, "w")
    #cds_missed = missed[missed['ftype'] == 'CDS']
    #count = org_bed.shape[0] + missed[missed['ftype'] !='CDS'].shape[0]
    new_rows = []
    seen_accns = {}
    # CDS added to existing gene.
    for row_missed in missed:
        if row_missed['accn'] in seen_accns: continue
        try:
            org_bed_row = org_bed.accn(row_missed['accn'])
            # it's a CDS
        except KeyError:
            #its a new gene
            new_rows.append(row_missed)
            seen_accns[row_missed['accn']] = True
            continue
        # Index the existing locs of the gene for overlap queries.
        locs_interval = Intersecter()
        [locs_interval.add_interval(Feature(start,stop)) for start,stop in org_bed_row['locs']]
        # NOTE(review): row_missed['locs'] is modified (remove) while it is
        # being iterated, which makes the loop skip the element following
        # each removed one; the interval index is also not refreshed after a
        # fusion -- confirm both are acceptable.
        for missed_start,missed_end in row_missed['locs']:
            if len(locs_interval.find(missed_start,missed_end)) > 0:
                # print >>sys.stderr, org_bed_row['accn']
                locs_intersects = [(l.start,l.stop) for l in locs_interval.find(missed_start,missed_end)]
                # drop the overlapped locs; they are replaced by the fused span
                [org_bed_row['locs'].remove(locs_intersect) for locs_intersect in locs_intersects]
                locs_intersects = set(locs_intersects)
                locs_intersects.add((missed_start,missed_end))
                locs_start = min([start for start,end in locs_intersects])
                locs_end = max([end for start,end in locs_intersects])
                org_bed_row['locs'] = org_bed_row['locs'] + [(locs_start,locs_end)]
                # already represented by the fused span, so do not append it again below
                row_missed['locs'].remove((missed_start,missed_end))
        org_bed_row['locs'] = org_bed_row['locs'] + row_missed['locs']
        #print >>sys.stderr, "{0},{1}".format(row_missed['accn'], locs)
        org_bed_row['locs'].sort()
        # the gene boundaries may only grow, never shrink
        org_bed_row['start'] = min(min([start for start,end in org_bed_row['locs']]), org_bed_row['start'])
        org_bed_row['end'] = max(max([end for start,end in org_bed_row['locs']]), org_bed_row['end'])
        new_rows.append(org_bed_row)
        seen_accns[org_bed_row['accn']] =True
    # carry over every original gene that was not updated above
    for org_bed_rw in org_bed:
        if org_bed_rw['accn'] not in seen_accns:
            new_rows.append(org_bed_rw)
            seen_accns[org_bed_rw['accn']] =True
    # order by seqid first, then by start (Python 2 cmp-style comparator)
    def row_cmp(a,b):
        return cmp(a['seqid'], b['seqid']) or cmp(a['start'], b['start'])
    new_rows.sort(cmp=row_cmp)
    #print >>merge_fh, "\t".join(Bed.names)
    for i, row in enumerate(new_rows):
        print >>merge_fh, Bed.row_string(row)
def freq(feature_file, window_size, interval, meth_data):
    """Slide a window over each feature and report windows with >= 15 hits.

    Windows start every ``interval`` positions between the feature start
    and end (inclusive). The query sent to ``meth_data[seqid].find`` is
    clamped to the last position of the feature, but ``kw`` always
    receives the unclamped window end. Windows with fewer than 15 matches
    are skipped.
    """
    for feature in Bed(feature_file):
        region = range(int(feature["start"]), int(feature["end"]) + 1)
        for window_start in region[::interval]:
            window_end = window_start + window_size
            # do not query past the last base of the feature
            if window_end > region[-1]:
                query_end = region[-1]
            else:
                query_end = window_end
            matches = meth_data[feature['seqid']].find(window_start, query_end)
            if len(matches) >= 15:
                kw(matches, feature['seqid'], window_start, window_end)
class TestMaize(unittest.TestCase):
    """Tests for blast parsing on a rice v6 / maize v2 gene pair."""

    def setUp(self):
        """Load the blast fixture, unmasked fasta and both bed files."""
        blast_path = '/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt'
        # Close the handle deterministically; setUp runs for every test.
        with open(blast_path) as handle:
            blast_lines = handle.readlines()
        self.blast_str = ' , '.join(blast_lines)
        self.unmasked_fasta = Fasta('/Users/gturco/find_cns/maize_v2_UM.fasta')
        self.qbed = Bed('/Users/gturco/rice_maize/rice_v6.bed')
        self.qbed.fill_dict()
        self.sbed = Bed('/Users/gturco/maize/maize_v2.bed',
                        '/Users/gturco/maize/maize_v2.fasta')
        self.sbed.fill_dict()
        self.sfeat = self.sbed.accn('GRMZM2G086714')
        self.qfeat = self.qbed.accn('Os09g27050')

    def test_get_cmd(self):
        """Placeholder: only defines the fasta paths, asserts nothing."""
        sfasta = 'data/rice_v6_maize_v2/maize_v2_split/2.fasta'
        qfasta = 'data/rice_v6_maize_v2/rice_v6_split/4.fasta'

    def test_parse_balse(self):
        """Run parse_blast on the fixture and print the result."""
        orientaion = -1
        cns = parse_blast(self.blast_str, orientaion, self.qfeat, self.sfeat,
                          self.qbed, self.sbed, 12000, 26000,
                          self.unmasked_fasta)
        print(cns)
def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt):
    """Assign CNSs to genes, print them as CSV and write CNS GFF files.

    The GFF paths are derived from the bed paths by replacing ``.bed`` with
    ``_cns.gff``; an ``AssertionError`` is raised when a bed path does not
    contain that substring. When query and subject bed are the same file a
    single GFF handle is shared for both.

    For every ``(cns, qfeat, sfeat)`` yielded by ``assign`` the formatted
    dict gets a ``cns_id``, its subject coordinates are put in ascending
    order, one CSV row is written to stdout and the dict is passed to
    ``write_gff``.
    """
    qcns_file = qbed_file.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_file
    qcns_gff = open(qcns_file, 'w')
    scns_gff = qcns_gff
    try:
        qcns_gff.write("##gff-version 3\n")
        if sbed_file != qbed_file:
            scns_file = sbed_file.replace(".bed", "_cns.gff")
            assert scns_file != sbed_file
            scns_gff = open(scns_file, 'w')
            scns_gff.write("##gff-version 3\n")

        qbed = Bed(qbed_file)
        qbed.fill_dict()
        sbed = Bed(sbed_file)
        sbed.fill_dict()

        cnsdict = get_cns_dict(cnsfile)
        qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
        out = sys.stdout

        fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
              "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s"
        # The header is the format string stripped of its %(...)s / %(...)i markup.
        out.write("#" + fmt.replace("%(", "").replace(")s", "").replace(")i", "") + "\n")

        for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed, qpair_map, spair_map):
            d = cns_fmt_dict(cns, qfeat, sfeat)
            d['cns_id'] = cns_id(d)
            if d['sstop'] < d['sstart']:
                d['sstop'], d['sstart'] = d['sstart'], d['sstop']
            out.write(fmt % d + "\n")
            write_gff(d, qcns_gff, scns_gff)
    finally:
        # Flush and close the GFF output(s) even when assignment fails.
        if scns_gff is not qcns_gff:
            scns_gff.close()
        qcns_gff.close()
def main(missed, fh_match, org_bed):
    """Merge hits to the same gene, then rebuild the whole bed file.

    Calls ``merge_same_hits`` first, then ``merge`` with a Bed opened on
    ``missed_from_<bed name>`` in the directory of ``org_bed.path``
    (the path of that file is printed).

    output: all_<bed name> in the same directory
    """
    merge_same_hits(missed, fh_match, org_bed)
    # split the bed path into its directory and file name
    dirc, _sep, org = org_bed.path.rpartition('/')
    missed2 = '{0}/missed_from_{1}'.format(dirc, org)
    merge_fh = "{0}/all_{1}".format(dirc, org)
    print(missed2)
    merge(org_bed, Bed(missed2), merge_fh)
def main(cnsfile, qbed_file, sbed_file, pairsfile, pck, qorg, sorg, padding):
    """Print one CSV row per assigned CNS, including a link column.

    Loads both bed files, the CNS dict and the pair map, writes a header
    derived from the row format to stdout and then one row for each tuple
    yielded by ``assign``; the ``link`` value comes from ``assign_url``.
    """
    qbed = Bed(qbed_file)
    qbed.fill_dict()
    sbed = Bed(sbed_file)
    sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    qpair_map = make_pair_maps(pairsfile, 'pair', qbed, sbed)
    out = sys.stdout

    fmt = ("%(saccn)s,%(saccnL)s,%(saccnR)s,%(schr)s,%(sstart)i,%(sstop)i,"
           "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(link)s")
    # header: the format string without its %(...)s / %(...)i markup
    header = fmt
    for token in ("%(", ")s", ")i"):
        header = header.replace(token, "")
    out.write("#" + header + "\n")

    for cns, saccn, saccn_l, saccn_r, qfeat in assign(cnsdict, qbed, qpair_map):
        d = cns_fmt_dict(cns, qfeat, saccn, saccn_l, saccn_r)
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,
                               qfeat, pck, sbed, qbed, sorg, qorg, padding)
        out.write(fmt % d + "\n")
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    """Print one CSV row per assigned CNS, including a link column.

    Loads both bed files and the CNS dict, writes a header derived from
    the row format to stdout and then one row for each tuple yielded by
    ``assign``. Subject coordinates are put in ascending order before the
    ``link`` value is obtained from ``assign_url``.
    """
    qbed = Bed(qbed_file)
    qbed.fill_dict()
    sbed = Bed(sbed_file)
    sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    out = sys.stdout

    fmt = ("%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s,"
           "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s")
    # header: the format string without its %(...)s / %(...)i markup
    header = fmt
    for token in ("%(", ")s", ")i"):
        header = header.replace(token, "")
    out.write("#" + header + "\n")

    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed):
        d = cns_fmt_dict(cns, qfeat, sfeat)
        if d['sstop'] < d['sstart']:
            d['sstop'], d['sstart'] = d['sstart'], d['sstop']
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,
                               sorg, qorg, padding)
        out.write(fmt % d + "\n")
def main(cns_path, fmt, query_bed_path, subject_bed_path):
    """Classify every CNS by its position relative to its query gene.

    First pass: coordinates are converted to int, line geometries are built
    for the gene, its locs span and the CNS on both genomes, ``cns_type``
    is called with them and ``create_utr_list`` records the CNS in
    ``utr_dict`` for the query ("q") and subject ("s") feature.

    Second pass: CNSs typed ``5-prox_dist`` / ``3-prox_dist`` are resolved
    to ``*-proximal`` when they lie within 1000 of the min/max value stored
    in ``utr_dict`` for the query gene (strand-aware) and to ``*-distal``
    otherwise.

    Returns the list of CNS dicts, modified in place.
    """
    cns_dic = cns_to_dic(cns_path, fmt)
    query_bed = Bed(query_bed_path)
    subject_bed = Bed(subject_bed_path)
    utr_dict = {}

    for cns in cns_dic:
        for coord in ('qstop', 'qstart', 'sstop', 'sstart'):
            cns[coord] = int(cns[coord])

        qfeat = query_bed.accn(cns['qaccn'])
        sfeat = subject_bed.accn(cns['saccn'])

        qgene_space_start = min(qfeat['locs'])[0]
        qgene_space_end = max(qfeat['locs'])[1]
        qgene_space_poly = LineString([(0.0, qgene_space_start),
                                       (0.0, qgene_space_end)])
        qgene_poly = LineString([(0.0, qfeat['start']), (0.0, qfeat['end'])])
        sgene_poly = LineString([(0.0, sfeat['start']), (0.0, sfeat['end'])])

        # if intron of one dont need to check other
        qcns = LineString([(0, cns['qstart']), (0, cns['qstop'])])
        scns = LineString([(0, cns['sstart']), (0, cns['sstop'])])

        cns_type(cns, qgene_space_poly, qgene_poly, sgene_poly, scns, qcns,
                 qgene_space_start, qfeat)
        create_utr_list(utr_dict, qfeat, cns, "q")
        create_utr_list(utr_dict, sfeat, cns, "s")

    # (proximal label, distal label) for each unresolved type.
    resolved = {
        "5-prox_dist": ("5-proximal", "5-distal"),
        "3-prox_dist": ("3-proximal", "3-distal"),
    }
    for cns in cns_dic:
        kind = cns['type']
        if kind == "5-prox_dist":
            qgene_start = min(utr_dict[cns['qaccn']])
            qgene_stop = max(utr_dict[cns['qaccn']])
            plus_gap = abs(qgene_start - cns["qstop"])
            minus_gap = abs(qgene_stop - cns["qstart"])
        elif kind == "3-prox_dist":
            qgene_start = min(utr_dict[cns['qaccn']])
            qgene_stop = max(utr_dict[cns['qaccn']])
            plus_gap = abs(cns["qstart"] - qgene_stop)
            minus_gap = abs(cns["qstop"] - qgene_start)
        else:
            continue

        proximal, distal = resolved[kind]
        on_plus = cns["qstrand"] == "+"
        on_minus = cns["qstrand"] == "-"
        if (on_plus and plus_gap <= 1000) or (on_minus and minus_gap <= 1000):
            cns["type"] = proximal
        elif (on_plus and plus_gap > 1000) or (on_minus and minus_gap > 1000):
            cns["type"] = distal
    return cns_dic
def merge_flat(new_name, aflat, bflat):
    """take 2 flat files and return a new one that is the union of the 2 existing

    Rows are de-duplicated on ``(seqid, accn)`` with rows from ``aflat``
    taking precedence, sorted by ``(seqid, start)`` and written to
    ``new_name``. Returns a ``Bed`` opened on the new file.
    """
    seen = set()
    both = []
    for flat in (aflat, bflat):
        for row in flat:
            key = row['seqid'], row['accn']
            if key in seen:
                continue
            seen.add(key)
            both.append(row)

    both.sort(key=lambda a: (a['seqid'], a['start']))
    # ``with`` closes the file even if formatting one of the rows fails.
    with open(new_name, "w") as fh:
        for b in both:
            fh.write("%s\n" % Bed.row_string(b))
    return Bed(new_name)
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    """Print every assigned CNS as one CSV line on stdout.

    Both bed files are loaded and indexed, the CNS file is parsed with
    ``get_cns_dict`` and each ``(cns, qfeat, sfeat)`` triple yielded by
    ``assign`` is formatted with ``cns_fmt_dict``; a ``link`` column built by
    ``assign_url`` is appended.  A ``#``-prefixed header naming the columns is
    written first.
    """
    qbed = Bed(qbed_file)
    qbed.fill_dict()
    sbed = Bed(sbed_file)
    sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)

    out = sys.stdout
    fmt = ("%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s,"
           "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s")

    # The header is the format string with the %-markup stripped out.
    header = fmt
    for markup in ("%(", ")s", ")i"):
        header = header.replace(markup, "")
    out.write("#" + header + "\n")

    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed):
        fields = cns_fmt_dict(cns, qfeat, sfeat)
        fields['link'] = assign_url(cns.sstart, cns.schr, cns.qstart,
                                    cns.qchr, sorg, qorg, padding)
        out.write(fmt % fields + "\n")
def loadintointersect(bed_file):
    """Index the features of ``bed_file`` by strand and sequence id.

    For every feature on the "+" or "-" strand a zero-width ``Feature`` at
    ``start - 1`` (named after the strand) is added to the ``Intersecter`` of
    its ``seqid``.  Features with any other strand value are ignored.

    Returns:
        ``(query_list_pos, query_list_neg)``: two dicts mapping ``seqid`` to
        the ``Intersecter`` holding that strand's features.
    """
    query_list_pos = {}
    query_list_neg = {}
    by_strand = {"+": query_list_pos, "-": query_list_neg}

    for feature in Bed(bed_file):
        ## if float(feature['accn']) < .4: continue
        strand = feature["strand"]
        trees = by_strand.get(strand)
        if trees is None:
            continue

        seqid = feature['seqid']
        # Direct dict membership; no need to materialise the keys per feature.
        if seqid not in trees:
            trees[seqid] = Intersecter()

        # Shift the bed start back by one before indexing.
        position = int(feature['start'] - 1)
        trees[seqid].add_interval(Feature(position, position, name=strand))

    return query_list_pos, query_list_neg
def print_bed(flist, old_path):
    """Write the unique items of ``flist`` to ``<old_path stem>.with_new<ext>``.

    Each item is an indexable record read as ``seqid`` (0), ``start`` (1),
    ``end`` (2), ``accn`` (3), ``strand`` (5) and ``locs`` (6).  ``locs`` is
    converted to a tuple so the whole record can be used for de-duplication;
    exact repeats are written only once.

    Args:
        flist: iterable of records as described above.
        old_path: path the output name is derived from.

    Returns:
        A ``Bed`` opened on the newly written file.
    """
    ipath, ext = op.splitext(old_path)
    path = "%s.with_new%s" % (ipath, ext)
    sys.stderr.write("writing to: %s\n" % (path,))

    seen = set()
    # ``with`` closes the handle even if formatting a row raises.
    with open(path, 'wb') as fh:
        for item in flist:
            # convert the locs to a tuple so the record is hashable.
            item = list(item)
            item[6] = tuple(item[6])
            item = tuple(item)
            if item in seen:
                continue
            seen.add(item)

            row = dict(accn=item[3], start=item[1], end=item[2],
                       seqid=item[0], locs=item[6], score='.',
                       strand=item[5], rgb='.', thickstart='.', thickend=".")
            fh.write("%s\n" % (Bed.row_string(row),))
    return Bed(path)
class TestPerfectTargetRegion(unittest.TestCase): def setUp(self): self.gene_name = "Os01g02110" self.bed = Bed("ricetest.bed") self.fasta = Fasta("ricetest.fasta") self.gene = self.bed.accn(self.gene_name) self.exons = self.gene['locs'] def test_rel_pos(self): self.assertEqual((376,486),rel_pos(self.gene,self.exons[0])) self.assertEqual((1289,1789),rel_pos(self.gene,self.exons[-1])) def test_fasta(self): exon = self.exons[-1] seq = self.fasta[self.gene_name][:] self.assertTrue(1789 <= len(seq)) def test_pattern(self): e = exons[-1] start, stop = rel_pos(self.gene,e) for exon in self.exons:
class TestMaize(unittest.TestCase):
    """Runs ``parse_blast`` on a rice/maize gene pair from local fixture files."""

    def setUp(self):
        # Read the blast fixture inside ``with`` so the handle is closed
        # instead of lingering for the lifetime of the test run.
        with open("/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt") as handle:
            blast_lines = handle.readlines()
        self.blast_str = " , ".join(blast_lines)
        self.unmasked_fasta = Fasta("/Users/gturco/find_cns/maize_v2_UM.fasta")

        self.qbed = Bed("/Users/gturco/rice_maize/rice_v6.bed")
        self.qbed.fill_dict()
        self.sbed = Bed("/Users/gturco/maize/maize_v2.bed",
                        "/Users/gturco/maize/maize_v2.fasta")
        self.sbed.fill_dict()

        self.sfeat = self.sbed.accn("GRMZM2G086714")
        self.qfeat = self.qbed.accn("Os09g27050")

    def test_get_cmd(self):
        # NOTE(review): only sets up two paths; nothing is asserted yet.
        sfasta = "data/rice_v6_maize_v2/maize_v2_split/2.fasta"
        qfasta = "data/rice_v6_maize_v2/rice_v6_split/4.fasta"

    def test_parse_balse(self):
        """Smoke test: parse the fixture blast output and print the result."""
        orientation = -1
        cns = parse_blast(self.blast_str, orientation, self.qfeat, self.sfeat,
                          self.qbed, self.sbed, 12000, 26000,
                          self.unmasked_fasta)
        print(cns)
def __init__(self, filename, bed):
    """Remember ``filename`` and load ``bed`` as an indexed ``Bed``."""
    self.filename = filename
    annotation = Bed(bed)
    self.bed = annotation
    annotation.fill_dict()
import optparse

# Command-line front end: parse the options, load both annotated genomes and
# the unmasked fasta, then hand everything to ``main``.
parser = optparse.OptionParser("usage: %prog [options] ")
parser.add_option("-F", dest="mask", default="F",
                  help="blast mask simple sequence [default: F]")
parser.add_option("-n", dest="ncpu", type='int', default=8,
                  help="parallelize to this many cores")
parser.add_option("-q", dest="qfasta", help="path to genomic query fasta")
parser.add_option("--qbed", dest="qbed", help="query bed file")
parser.add_option("-s", dest="sfasta", help="path to genomic subject fasta")
parser.add_option("--sbed", dest="sbed", help="subject bed file")
parser.add_option("-p", dest="pairs",
                  help="the pairs file. output from dagchainer")
choices = ("dag", "cluster", "pair", 'qa', 'raw')
parser.add_option("--pair_fmt", dest="pair_fmt", default='raw',
                  help="format of the pairs, one of: %s" % str(choices),
                  choices=choices)
parser.add_option("--qpad", dest="qpad", type='int', default=12000,
                  help="how far from the end of the query gene to look for cnss")
parser.add_option("--spad", dest="spad", type='int', default=26000,
                  help="how far from the end of the subject gene to look for cnss")
parser.add_option("--UMfasta", dest="unmasked_fasta",
                  help="path to unmasked fasta file file")
(options, _) = parser.parse_args()

if not (options.qfasta and options.sfasta and options.sbed and options.qbed):
    sys.exit(parser.print_help())

# Exact membership rather than a substring test: "" and "FT" must be
# rejected.  Checked before the (slow) bed loading and with an explicit
# error so the validation survives ``python -O``.
if options.mask not in ("F", "T"):
    parser.error("-F must be either 'F' or 'T', got %r" % (options.mask,))

qbed = Bed(options.qbed, options.qfasta)
qbed.fill_dict()
sbed = Bed(options.sbed, options.sfasta)
sbed.fill_dict()
unmasked_fasta = Fasta(options.unmasked_fasta)

main(qbed, sbed, options.pairs, options.qpad, options.spad, unmasked_fasta,
     options.pair_fmt, options.mask, options.ncpu)
spos = sbed[raw.pos_b] key = (raw.seqid_a, raw.seqid_b) if not key in trees: trees[key] = [] qpos = (qpos['start'] + qpos['end']) / 2 spos = (spos['start'] + spos['end']) / 2 trees[key].append((int(qpos), int(spos))) for k in trees: trees[k] = cKDTree(trees[k]) return trees if __name__ == "__main__": import optparse parser = optparse.OptionParser() parser.add_option("--qbed", dest="qbed", help="query bed file") parser.add_option("--sbed", dest="sbed", help="subject bed file") parser.add_option("--cns", dest="cns", help="path to raw cns") parser.add_option("--dist", dest="dist", type='int', help="max dist from gene to cns", default=12000) parser.add_option("--paralogy", dest="paralogy", help="path to paralogy file") parser.add_option("--orthology", dest="orthology", help="path to orthology file") options, args = parser.parse_args() if not (options.sbed and options.qbed and options.cns, options.orthology): sys.exit(parser.print_help()) qbed = Bed(options.qbed); qbed.fill_dict() sbed = Bed(options.sbed); sbed.fill_dict() qbed_new, sbed_new, new_pairs = main(qbed, sbed, options.cns, options.dist, options.orthology) write_new_pairs(options.paralogy, options.orthology, qbed, qbed_new, sbed, sbed_new, new_pairs)
parser.add_option("--paralogy", dest="paralogy", help="paralogy file") parser.add_option("--orthology", dest="orthology", help="orthology file") opts, _ = parser.parse_args() if not (opts.qflat_all and opts.sflat_all and opts.datasheet): print "A" sys.exit(parser.print_help()) if not (opts.qdsgid and opts.qorg and opts.sorg): print "B" sys.exit(parser.print_help()) if not (opts. qdups and opts.sdups and opts.paralogy and opts.orthology): print "C" sys.exit(parser.print_help()) qflat_new = Bed(opts.qflat_new) sflat_new = qflat_new if opts.qflat_new == opts.sflat_new else Bed(opts.sflat_new) qflat_all = Bed(opts.qflat_all) sflat_all = qflat_all if opts.qflat_all == opts.sflat_all else Bed(opts.sflat_all) qfpath = "%s.all%s" % op.splitext(qflat_new.path) sfpath = "%s.all%s" % op.splitext(sflat_new.path) qflat = merge_flat(qfpath, qflat_all, qflat_new) sflat = merge_flat(sfpath, sflat_all, sflat_new) qdups = parse_dups(opts.qdups, qflat) sdups = parse_dups(opts.sdups, sflat) qlocaldups = parse_dups(opts.qlocaldups,qflat)
def setUp(self):
    """Load the rice test bed/fasta and cache gene Os01g02110 and its locs."""
    self.gene_name = "Os01g02110"
    self.bed = Bed("ricetest.bed")
    self.fasta = Fasta("ricetest.fasta")
    gene = self.bed.accn(self.gene_name)
    self.gene = gene
    self.exons = gene['locs']
# print interval_list[0].find(0,100000000) # print interval_list[0].find(3577840,3577841) # print three_prom # print gene_body # print five_prom #three_prom = [i for i in three_prom if i.name == gene['strand']] #five_prom = [i for i in five_prom if i.name == gene['strand']] #gene_body = [i for i in gene_body if i.name == gene['strand']] if len(three_prom) > 0: l = "{0}\t3_prom\t{1}\t{2}\t{3}\n".format( gene_name, three_prom_p, three_prom_p * len(three_prom), sum(int(sig.name) for sig in three_prom)) out.write(l) if len(five_prom) > 0: l = "{0}\t5_prom\t{1}\t{2}\t{3}\n".format( gene_name, five_prom_p, five_prom_p * len(five_prom), sum(int(sig.name) for sig in five_prom)) out.write(l) if len(gene_body) > 0: l = "{0}\tgene_body\t{1}\t{2}\t{3}\n".format( gene_name, gene_body_p, gene_body_p * len(gene_body), sum(int(sig.name) for sig in gene_body)) out.write(l) genelist = Bed("sorg.bed") interval_list = insert_queries("DMR_NONVAS_CG_HYPO") find_intersections(3000, interval_list, genelist, "DMR_nonvas.genes")
def merge_same_hits(missed, fh_match, org_bed):
    """Group genes that hit more than once and write them out.

    ``fh_match`` is a tab-separated file of ``qaccn<TAB>saccn`` lines.  Each
    query accession is looked up in ``missed``; unknown accessions are
    skipped.  Hits are keyed by ``(seqid, saccn)``:

    * the first hit for a key is stored as is;
    * a later hit lying entirely before or after the stored gene is merged
      into it (locs concatenated, ``start``/``end`` extended, ``accn``
      replaced when the query accession contains "Os") only when
      ``get_intervening_genes`` returns ``False``; otherwise it is stored
      separately under ``(seqid, qaccn)``;
    * a later hit that overlaps the stored gene just contributes its locs.

    The rows are written, locs sorted, to ``missed_from_<name>`` in the
    directory of ``org_bed.path``.

    Args:
        missed: object exposing ``accn(name)`` and raising ``KeyError`` for
            unknown names.
        fh_match: path of the match file.
        org_bed: object exposing ``path``; also passed through to
            ``get_intervening_genes``.
    """
    import os.path

    with open(fh_match) as handle:
        matches = handle.read()

    # os.path keeps a bare file name relative to the working directory
    # instead of producing "/missed_from_..." in the filesystem root.
    dirc, org = os.path.split(org_bed.path)
    out_path = os.path.join(dirc, 'missed_from_{0}'.format(org))

    d = {}
    for match in matches.split('\n')[:-1]:
        qaccn, saccn = match.split('\t')
        try:
            hit = missed.accn(qaccn)
            seqid = hit['seqid']
        except KeyError:
            continue
        #if near_gene(haccn,org_bed)==True: continue

        key = (seqid, saccn)
        if key not in d:
            # first hit for this subject: keep the whole row
            d[key] = hit
            continue

        # otherwise try to add the locs to the existing gene
        gene = d[key]
        gene_start = min(gene['locs'])[0]
        gene_end = max(gene['locs'])[1]
        missed_start = hit['locs'][0][0]
        missed_end = hit['locs'][0][1]

        if missed_end < gene_start:
            gap = (missed_end, gene_start)
            edge, new_edge = 'start', missed_start
        elif gene_end < missed_start:
            gap = (gene_end, missed_start)
            edge, new_edge = 'end', missed_end
        else:
            # overlapping hit: only the locs are added
            gene['locs'] = gene['locs'] + hit['locs']
            continue

        # merge only if no other gene sits between the two pieces
        intervening_genes = get_intervening_genes(
            gap[0], gap[1], seqid, org_bed, gene['accn'])
        if intervening_genes is False:
            gene['locs'] = gene['locs'] + hit['locs']
            gene[edge] = new_edge
            if 'Os' in qaccn:
                gene['accn'] = qaccn
        else:
            d[(seqid, qaccn)] = hit

    # ``with`` makes sure the rows are flushed before the function returns.
    with open(out_path, "wb") as fh:
        for key in d:
            row = d[key]
            row['locs'].sort()
            fh.write("%s\n" % (Bed.row_string(row),))
def test_main(self):
    """Run ``main`` on the fixture beds with 12000 padding and print the result."""
    query = Bed(self.qbed, self.qfasta)
    query.fill_dict()
    subject = Bed(self.sbed, self.sfasta)
    subject.fill_dict()
    result = main(query, subject, self.pairs, 12000, 12000, "pair",
                  self.blast_path, "T", 2)
    print(result)
# write a genomic fasta file with all sequences covered by features # in the specified Bed file masked to N. from flatfeature import Bed import sys # b = Bed(sys.argv[1], sys.argv[2]) b = Bed("/Users/gturco/data/rice_v6.bed", "/Users/gturco/data/rice_v6.fasta") for seqid, seq in b.mask_cds(): seqids = [] seq.tostring()
def main(feature_bed, query_list_pos, query_list_neg, fasta_file, mtype, rand):
    """Collect motif matches around and inside every stranded feature.

    For each feature on "+" or "-" the 2000 positions before the first loc and
    the 2000 positions after the last loc are scanned with ``get_matchs``
    (which of the two counts as the TSS side depends on the strand), and the
    gene body is scanned with ``get_genebody``.  The strand also selects the
    query list (``query_list_pos`` / ``query_list_neg``).  Features with any
    other strand value are skipped.  ``mtype`` is accepted but not used here.

    Returns:
        ``(All_sites, r, cgene)``: per-accession ``(region, freq)`` pairs, the
        second value returned by ``get_genebody`` and the first value returned
        by ``get_genebody``, each keyed by feature ``accn``.
    """
    features = Bed(feature_bed)
    fasta = Fasta(fasta_file)
    All_sites = defaultdict(list)
    r = {}
    cgene = {}

    for feature in features:
        strand = feature["strand"]
        rc = strand == "-"
        if strand == "+":
            queries, te_offset = query_list_pos, 1000
        elif strand == "-":
            queries, te_offset = query_list_neg, 1100
        else:
            continue

        first_start = int(feature['locs'][0][0])
        last_end = int(feature['locs'][-1][1])
        before_gene = range(first_start - 2000, first_start)
        after_gene = range(last_end, last_end + 2000)
        # On the minus strand the TSS side is the window after the last loc.
        if rc:
            TSS_region, TTS_region = after_gene, before_gene
        else:
            TSS_region, TTS_region = before_gene, after_gene

        chromosome = fasta["chromosome_" + feature["seqid"]]
        TSS_sites = get_matchs(queries, feature['seqid'], TSS_region,
                               chromosome, -2000, rc)
        TE_sites = get_matchs(queries, feature['seqid'], TTS_region,
                              chromosome, te_offset, rc)
        gene_body, rebin = get_genebody(queries, feature, chromosome, rc, rand)

        accn = feature["accn"]
        r[accn] = rebin
        cgene[accn] = gene_body
        for sites in (TSS_sites, TE_sites):
            for region, freq in sites:
                All_sites[accn].append((region, freq))

    return All_sites, r, cgene
if strand == '-': my_seq = fasta fasta = str(Seq(my_seq).reverse_complement()) if len(fasta) == 0: #print start,stop,accn['accn'] continue seq_w = "{0}\n".format(fasta) new_fasta.write(w) new_fasta.write(seq_w) ####### tair ########## #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/thaliana_v8.fasta","/Users/gt/thaliana_v8_control_SB.fasta") x = random_noncoding( '/Users/gt/Desktop/tmp.csv', Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_golden/thaliana_v8.with_new_cns_mask.bed' ), "/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta", "/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta" ) ######### rice,sorg,set ##### ##### took out strand info used N to mask bed also ######## #x = #random_noncoding('/Users/gt/Desktop/paper/G-box-seq/rice_rice/tmp.csv',Bed('/Users/gt/Desktop/paper/G-box-seq/rice.with_new_cns_mask.bed'),"/Users/gt/Desktop/paper/G-box-seq/rice_rice/rice_j.fasta","/Users/gt/Desktop/paper/G-box-seq/rice_rice/rice_rice_control_fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 
8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #
seq = f[seqid][start:end] if "X" in seq: print accn, seqid, start, end if len(seq) < 15 and len(seq) > 0: print "OH NO!!!!!!" w = ">cns{0}\n".format(n) seq_w = "{0}\n".format(seq) new_fasta.write(w) new_fasta.write(seq_w) dict_size = gene_size_dict('/Users/gturco/Desktop/rice_sorg_size.tsv') #dict_size = gene_size_dict("test_file") x = random_noncoding( dict_size, Bed('/Users/gturco/data/paper3/rice_b_sorghum_v1.nolocaldups.with_new_cns_mask.bed' )) print len(x) #####print x get_seq(x, "/Users/gturco/data/paper3/rice_b.fasta", "/Users/gturco/test.fasta") ## ##### seq for cns #handle = open("/Users/gturco/data/paper3/rice_b_sorghum_v1.cns.assigned_real.csv") #fh = handle.read() #cns_list = [] #for line in fh.split("\n")[:-1]: # if line[0] == "#": continue # cns_id,accn,seqid,start,end,strand = line.split(",")[:6] # cns_list.append((seqid,int(start),int(end))) # #len(cns_list)
def write_bed(gene, merge_fh):
    """Append ``gene`` to ``merge_fh`` as one ``Bed.row_string`` line."""
    merge_fh.write("{0}\n".format(Bed.row_string(gene)))
type='string', help="path to query localdup_file") parser.add_option("--sdups", dest="sdups", type='string', help="path to subject localdup_file") parser.add_option("--cns_file", dest="cns_file", type='string', help="path to cns file cns.txt") parser.add_option("--UMfasta", dest="unmasked_fasta", help="path to unmasked fasta file file") (options, _) = parser.parse_args() qbed = Bed(options.qbed, options.qfasta) qbed.fill_dict() sbed = Bed(options.sbed, options.sfasta) sbed.fill_dict() unmasked_fasta = Fasta(options.unmasked_fasta) assert options.mask in 'FT' qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed" snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed" #pairs_to_qa("{0}.local".format(options.pairs),'pair',"{0}.nolocaldups.local".format(qbed.path.split(".")[0]),"{0}.nolocaldups.local".format(sbed.path.split(".")[0]),"{0}.raw.filtered.local".format(options.pairs.split(".")[0])) import logging LOG_FILENAME = path.dirname(options.qfasta) + "dup_rdups.log" logging.basicConfig(filename=LOG_FILENAME, level=logging.INFO) main(options.cns_file, options.qdups, options.sdups, options.pairs,
def merge(org_bed, missed, merge_file):
    """Create the blast.all file: ``org_bed`` updated with the ``missed`` rows.

    A missed row whose accession is unknown to ``org_bed`` is a new gene and
    is kept as is.  Otherwise it is a CDS of an existing gene: every missed
    loc that intersects existing locs replaces them by their common span, and
    the remaining missed locs are appended.  The gene's ``start``/``end`` are
    widened to cover all locs.  Genes of ``org_bed`` that were not touched are
    carried over unchanged.  Rows are written sorted by ``(seqid, start)``.

    Args:
        org_bed: iterable of rows that also exposes ``accn(name)`` (raising
            ``KeyError`` for unknown names).
        missed: iterable of rows with ``accn`` and ``locs``.
        merge_file: path of the file to write.
    """
    new_rows = []
    seen_accns = set()

    for row_missed in missed:
        accn = row_missed['accn']
        if accn in seen_accns:
            continue
        try:
            org_bed_row = org_bed.accn(accn)
        except KeyError:
            # its a new gene
            new_rows.append(row_missed)
            seen_accns.add(accn)
            continue

        # it's a CDS added to an existing gene.
        locs_interval = Intersecter()
        for start, stop in org_bed_row['locs']:
            locs_interval.add_interval(Feature(start, stop))

        # Walk a snapshot: merged locs are removed from row_missed['locs']
        # below, and removing from the list being iterated skips elements.
        for missed_loc in list(row_missed['locs']):
            missed_start, missed_end = missed_loc
            overlaps = [(l.start, l.stop)
                        for l in locs_interval.find(missed_start, missed_end)]
            if not overlaps:
                continue

            for overlap in overlaps:
                # The interval tree is built once, so it can return a loc an
                # earlier merge already removed; skip those, do not raise.
                if overlap in org_bed_row['locs']:
                    org_bed_row['locs'].remove(overlap)

            span = set(overlaps)
            span.add((missed_start, missed_end))
            locs_start = min(start for start, end in span)
            locs_end = max(end for start, end in span)
            org_bed_row['locs'] = org_bed_row['locs'] + [(locs_start, locs_end)]
            row_missed['locs'].remove(missed_loc)

        org_bed_row['locs'] = org_bed_row['locs'] + row_missed['locs']
        org_bed_row['locs'].sort()
        org_bed_row['start'] = min(
            min(start for start, end in org_bed_row['locs']),
            org_bed_row['start'])
        org_bed_row['end'] = max(
            max(end for start, end in org_bed_row['locs']),
            org_bed_row['end'])
        new_rows.append(org_bed_row)
        seen_accns.add(org_bed_row['accn'])

    for org_bed_rw in org_bed:
        if org_bed_rw['accn'] not in seen_accns:
            new_rows.append(org_bed_rw)
            seen_accns.add(org_bed_rw['accn'])

    # Same order as comparing seqid first and start second.
    new_rows.sort(key=lambda row: (row['seqid'], row['start']))

    # ``with`` closes (and flushes) the output instead of leaking the handle.
    with open(merge_file, "w") as merge_fh:
        for row in new_rows:
            merge_fh.write("%s\n" % (Bed.row_string(row),))
print "OH NO!!!!!!" w = ">cns{0}\n".format(n) seq_w = "{0}\n".format(seq) new_fasta.write(w) new_fasta.write(seq_w) ######## rice_set ########## #dict_size = gene_size_dict('/Users/gt/tmp.tsv') #x = random_noncoding(dict_size,Bed('/Users/gt/data/paper4/rice_j_setaria_n/rice_j_set.nolocaldups.with_new_cns_mask.bed')) #get_seq(x,"/Users/gt/data/paper4/rice_j.fasta","/Users/gt/data/paper4/rice_j_setaria_n/testing.fasta") ####### rice_sorg ######### dict_size = gene_size_dict('/Users/gt/tmp.tsv') x = random_noncoding( dict_size, Bed('/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorg.nolocaldups.with_new_cns_mask.bed' )) get_seq(x, "/Users/gt/data/paper4/rice_j.fasta", "/Users/gt/data/paper4/rice_j_sorghum_n/testing.fasta") ##### seq for cns #handle = open("/Users/gturco/data/paper3/rice_b_sorghum_v1.cns.assigned_real.csv") #fh = handle.read() #cns_list = [] #for line in fh.split("\n")[:-1]: # if line[0] == "#": continue # cns_id,accn,seqid,start,end,strand = line.split(",")[:6] # cns_list.append((seqid,int(start),int(end))) # #len(cns_list) #get_seq(cns_list,"/Users/gturco/data/paper3/rice_b.fasta","/Users/gturco/test_cns.fasta") ##
def _write_cns_gff_header(bed_path):
    """Create the ``*_cns.gff`` sibling of ``bed_path`` holding only the GFF3 header."""
    gff_path = bed_path.replace(".bed", "_cns.gff")
    # Explicit raise (not ``assert``) so the guard against overwriting the
    # bed file itself is still active under ``python -O``.
    if gff_path == bed_path:
        raise AssertionError(
            "%r does not contain '.bed'; refusing to overwrite it" % (bed_path,))
    with open(gff_path, 'w') as gff:
        gff.write("##gff-version 3\n")


def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    here, we remove cnss that have been called proteins/rnas from the cns
    list, and add them to the bed files.
    AND have to do the preliminary assignment of cnss that remain to the
    new-genes that _were_ cnss. the proper assignment is then handled in
    assign.py

    Args:
        qbed_path, sbed_path: paths of the query and subject bed files; they
            may be the same file.
        cnsfile: path of the raw cns file; the surviving cnss are written to
            ``<stem>.real<ext>`` next to it.
        dist: search distance passed (plus 1000) to ``get_new``.
        orthology_path: path handed to ``read_orthos_to_trees``.

    Returns:
        ``(qbed_new_path, sbed_new_path, qnew_pairs)``.

    Raises:
        AssertionError: if a bed path does not contain ".bed".
    """
    _write_cns_gff_header(qbed_path)
    if sbed_path != qbed_path:
        _write_cns_gff_header(sbed_path)

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)
    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path)
    qbed.fill_dict()
    sbed = Bed(sbed_path)
    sbed.fill_dict()

    name, ext = op.splitext(cnsfile)
    real_cns_path = "%s.real%s" % (name, ext)
    outdir = op.dirname(cnsfile)

    crna = read_cns_to_rna(outdir)
    cpro = read_cns_to_protein_exons(outdir)

    # Split the raw cnss into protein hits, rna hits and real cnss.
    proteins = collections.defaultdict(list)
    rnas = collections.defaultdict(list)
    real_cns_items = []
    for cnsi in CNS.parse_raw_line(cnsfile):
        cns_id = cnsi.cns_id
        cns = cnsi.to_dict()
        key = (cns['qseqid'], cns['sseqid'])
        if cns_id in cpro:
            proteins[key].append((cns, cpro[cns_id]))
        elif cns_id in crna:
            rnas[key].append((cns, crna[cns_id]))
        else:
            real_cns_items.append((cns_id, cns))

    # The trees must be filled before the entries are renamed below.
    p_trees = fill_tree(proteins)
    r_trees = fill_tree(rnas)

    def assign_new_names(prs, protein_or_rna):
        """Rename each former cns to ``<seqid>_<start>_<end>_cns_<kind>``."""
        renamed = {}
        for seqid_pair, items in prs.items():
            bucket = renamed.setdefault(seqid_pair, [])
            for gnew, info in list(items):
                # both names share the suffix so we know they were a pair.
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # show the offending record, then let the error propagate
                    sys.stderr.write("%s\n" % (gnew,))
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                bucket.append((gnew, info))
        return renamed

    nproteins = assign_new_names(proteins, "protein")
    nrnas = assign_new_names(rnas, "rna")

    # go through the remaining cnss, print and assign them to the new
    # genes (previously cnss) within dist.
    cns_seen = set()
    with open(real_cns_path, "w") as real_cns_fh:
        sys.stderr.write("writing to: %s\n" % (real_cns_path,))
        real_cns_fh.write(
            "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval\n")
        for _cns_id, cns in real_cns_items:
            real_cns_fh.write("%s\n" % (cns_to_str(cns),))
            key = (cns['qseqid'], cns['sseqid'])
            # proteins first, then rnas
            for trees, renamed in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, info in get_new(cns, trees, key, renamed,
                                          dist + 1000):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen:
                        continue
                    cns_seen.add(cns_str)
                    real_cns_fh.write("%s\n" % (cns_str,))

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    sys.stderr.write("%s\n" % (len(qnew_pairs),))
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, the new cnss are in both lists; send in both,
    # print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs