def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt, qdsid, sdsid,
         qpad, spad):
    """Assign CNSs to features, print them as CSV and write them as GFF3.

    For every (cns, qfeat, sfeat) triple yielded by ``assign`` one CSV row is
    written to stdout (after a ``#``-prefixed header derived from the row
    format) and the same record is handed to ``write_gff`` together with the
    query and subject GFF handles.

    The GFF paths are the BED paths with ".nolocaldups" replaced by
    "_cns.gff".  When the query and subject BED paths are equal a single GFF
    file is shared by both sides.

    Args:
        cnsfile: passed to ``get_cns_dict``.
        qbed_file: query BED path; must contain ".nolocaldups".
        sbed_file: subject BED path; must contain ".nolocaldups".
        pairsfile: passed to ``make_pair_maps``.
        pairs_fmt: passed to ``make_pair_maps``.
        qdsid, sdsid, qpad, spad: forwarded unchanged to ``cns_link``.

    Raises:
        AssertionError: if a derived GFF path equals its BED path (the BED
            would otherwise be truncated by opening it for writing).
    """
    qcns_file = qbed_file.replace(".nolocaldups", "_cns.gff")
    assert qcns_file != qbed_file
    qcns_gff = open(qcns_file, 'w')
    # Until a separate subject file is opened both names refer to one handle.
    scns_gff = qcns_gff
    try:
        qcns_gff.write("##gff-version 3\n")
        if sbed_file != qbed_file:
            scns_file = sbed_file.replace(".nolocaldups", "_cns.gff")
            assert scns_file != sbed_file
            scns_gff = open(scns_file, 'w')
            scns_gff.write("##gff-version 3\n")

        qbed = Bed(qbed_file)
        qbed.fill_dict()
        sbed = Bed(sbed_file)
        sbed.fill_dict()

        cnsdict, evaldict = get_cns_dict(cnsfile)
        qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
        out = sys.stdout

        fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
              "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(eval)s,%(link)s"
        # Strip the %-format syntax so only the column names remain.
        header = fmt.replace("%(", "").replace(")s", "").replace(")i", "")
        out.write("#" + header + "\n")

        for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed, qpair_map,
                                        spair_map):
            d = cns_fmt_dict(cns, qfeat, sfeat, evaldict)
            # The id is computed before the subject coordinates are reordered.
            d['cns_id'] = cns_id(d)
            if d['sstop'] < d['sstart']:
                d['sstop'], d['sstart'] = d['sstart'], d['sstop']
            d['link'] = cns_link(d, qdsid, sdsid, qpad, spad)
            out.write(fmt % d + "\n")
            write_gff(d, qcns_gff, scns_gff)
    finally:
        # Close both handles even on error so buffered GFF lines reach disk.
        qcns_gff.close()
        if scns_gff is not qcns_gff:
            scns_gff.close()
def setUp(self):
    """Load the sorghum BED fixtures and group the missed genes.

    Sets ``old_bed``, ``missed_bed``, ``matches``, ``missed_genes``,
    ``missed_genes_grouped`` and ``missed_genes_dict`` on the test case.
    """
    self.old_bed = Bed("data/rice_t_sorghum_v1/sorghum_v1.bed")
    self.missed_bed = Bed(
        "data/rice_t_sorghum_v1/missed_sorghum_v1_from_rice_b.bed")
    self.matches = (
        "data/rice_t_sorghum_v1/missed_sorghum_v1_from_rice_b.matches.txt")
    self.missed_genes = parse_missed_genes(self.matches)

    # group_genes_in_bed returns a pair: (grouped genes, gene dict).
    grouped_pair = group_genes_in_bed(self.missed_genes, self.old_bed,
                                      self.missed_bed)
    self.missed_genes_grouped, self.missed_genes_dict = grouped_pair
def main(cns_path, fmt, query_bed_path, subject_bed_path):
    """Classify every CNS and resolve the provisional prox/dist labels.

    First pass: coordinates are converted to int, ``cns_type`` is called for
    each CNS and ``create_utr_list`` records the query and subject features
    in ``utr_dict``.

    Second pass: a CNS typed "5-prox_dist" or "3-prox_dist" becomes
    "<n>-proximal" when its distance to the gene boundary taken from
    ``utr_dict`` is <= 1000, otherwise "<n>-distal".  The distance used
    depends on ``cns["qstrand"]``; a strand other than "+" or "-" leaves the
    provisional type untouched.

    Returns:
        The object returned by ``cns_to_dic``, with its entries updated in
        place.
    """
    cns_dic = cns_to_dic(cns_path, fmt)
    query_bed = Bed(query_bed_path)
    subject_bed = Bed(subject_bed_path)
    utr_dict = {}

    for cns in cns_dic:
        for coord in ('qstop', 'qstart', 'sstop', 'sstart'):
            cns[coord] = int(cns[coord])

        qfeat = query_bed.accn(cns['qaccn'])
        sfeat = subject_bed.accn(cns['saccn'])

        qgene_space_start = min(qfeat['locs'])[0]
        qgene_space_end = max(qfeat['locs'])[1]

        # Intervals are modelled as vertical line segments at x == 0.
        qgene_space_poly = LineString([(0.0, qgene_space_start),
                                       (0.0, qgene_space_end)])
        qgene_poly = LineString([(0.0, qfeat['start']), (0.0, qfeat['end'])])
        sgene_poly = LineString([(0.0, sfeat['start']), (0.0, sfeat['end'])])
        # if intron of one dont need to check other
        qcns = LineString([(0, cns['qstart']), (0, cns['qstop'])])
        scns = LineString([(0, cns['sstart']), (0, cns['sstop'])])

        cns_type(cns, qgene_space_poly, qgene_poly, sgene_poly, scns, qcns,
                 qgene_space_start, qfeat)
        create_utr_list(utr_dict, qfeat, cns, "q")
        create_utr_list(utr_dict, sfeat, cns, "s")

    # provisional type -> (label when within 1000, label when farther)
    resolved = {
        "5-prox_dist": ("5-proximal", "5-distal"),
        "3-prox_dist": ("3-proximal", "3-distal"),
    }

    for cns in cns_dic:
        provisional = cns['type']
        if provisional == "5-prox_dist":
            qgene_start = min(utr_dict[cns['qaccn']])
            qgene_stop = max(utr_dict[cns['qaccn']])
            plus_gap = abs(qgene_start - cns["qstop"])
            minus_gap = abs(qgene_stop - cns["qstart"])
        elif provisional == "3-prox_dist":
            qgene_start = min(utr_dict[cns['qaccn']])
            qgene_stop = max(utr_dict[cns['qaccn']])
            plus_gap = abs(cns["qstart"] - qgene_stop)
            minus_gap = abs(cns["qstop"] - qgene_start)
        else:
            continue

        strand = cns["qstrand"]
        if strand == "+":
            gap = plus_gap
        elif strand == "-":
            gap = minus_gap
        else:
            # Unknown strand: the provisional label is kept as is.
            continue

        near_label, far_label = resolved[provisional]
        cns["type"] = near_label if gap <= 1000 else far_label

    return cns_dic
def setUp(self):
    """Load the rice/sorghum CNS, pairs and BED fixtures for the tests.

    Sets ``cns_filename``, ``pairsfile``, ``qbed``, ``sbed``, ``cns_dict``,
    ``evalue_dict``, ``qpair_map`` and ``spair_map`` on the test case.
    """
    self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
    self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"

    self.qbed = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
    self.qbed.fill_dict()
    self.sbed = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
    self.sbed.fill_dict()

    cns_and_evalues = get_cns_dict(self.cns_filename)
    self.cns_dict, self.evalue_dict = cns_and_evalues

    pair_maps = make_pair_maps(self.pairsfile, "pair", self.qbed, self.sbed)
    self.qpair_map, self.spair_map = pair_maps
def setUp(self):
    """Load the BLAST text, unmasked FASTA and BED fixtures for the tests.

    Sets ``blast_str`` (the lines of the BLAST file joined with ' , '),
    ``unmasked_fasta``, ``qbed``, ``sbed``, ``sfeat`` and ``qfeat``.

    NOTE(review): every path below is absolute and points into a specific
    user's home directory, so these tests only run where those files exist.
    """
    # The context manager closes the handle; it used to be left open.
    with open(
            '/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt'
    ) as handle:
        self.blast_str = ' , '.join(handle.readlines())

    self.unmasked_fasta = Fasta('/Users/gturco/find_cns/maize_v2_UM.fasta')

    self.qbed = Bed('/Users/gturco/rice_maize/rice_v6.bed')
    self.qbed.fill_dict()
    self.sbed = Bed('/Users/gturco/maize/maize_v2.bed',
                    '/Users/gturco/maize/maize_v2.fasta')
    self.sbed.fill_dict()

    self.sfeat = self.sbed.accn('GRMZM2G086714')
    self.qfeat = self.qbed.accn('Os09g27050')
def main(cnsfile, qbed_file, sbed_file, pairsfile, pck, qorg, sorg, padding):
    """Print one CSV row per assigned CNS to stdout.

    A ``#``-prefixed header naming the columns is written first.  Each tuple
    yielded by ``assign`` is turned into a row dict by ``cns_fmt_dict``; its
    'link' entry comes from ``assign_url``.

    Args:
        cnsfile: passed to ``get_cns_dict``.
        qbed_file, sbed_file: paths used to build the query/subject ``Bed``.
        pairsfile: passed to ``make_pair_maps`` with the 'pair' format.
        pck, qorg, sorg, padding: forwarded unchanged to ``assign_url``.
    """
    qbed = Bed(qbed_file)
    qbed.fill_dict()
    sbed = Bed(sbed_file)
    sbed.fill_dict()

    cnsdict = get_cns_dict(cnsfile)
    qpair_map = make_pair_maps(pairsfile, 'pair', qbed, sbed)
    out = sys.stdout

    fmt = "%(saccn)s,%(saccnL)s,%(saccnR)s,%(schr)s,%(sstart)i,%(sstop)i," + \
          "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(link)s"

    # Remove the %-format syntax so only the column names are left.
    column_names = fmt
    for token in ("%(", ")s", ")i"):
        column_names = column_names.replace(token, "")
    out.write("#" + column_names + "\n")

    for cns, saccn, saccn_l, saccn_r, qfeat in assign(cnsdict, qbed,
                                                      qpair_map):
        row = cns_fmt_dict(cns, qfeat, saccn, saccn_l, saccn_r)
        row['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,
                                 qfeat, pck, sbed, qbed, sorg, qorg, padding)
        out.write(fmt % row + "\n")
def main_gene(feature_file, query_list_pos, query_list_neg):
    """Count interval hits in each feature's exons and 300 bp flanks.

    For every feature in the BED file the strand-matching lookup
    (``query_list_pos`` for "+", ``query_list_neg`` otherwise) is queried:

    * once per base of every ``locs`` interval (inclusive ends); the hit
      counts are summed into one number per feature,
    * once for the 300 positions before the first ``locs`` start and once
      for the 300 positions after the last ``locs`` end.

    For "+" features the leading flank is reported as 5' and the trailing
    flank as 3'; for all other strands the two are swapped.

    Fix: the flanks of minus-strand features used to be looked up in
    ``query_list_pos`` although their exons were counted against
    ``query_list_neg``.  Features whose strand was neither "+" nor "-" got no
    flank lookup at all and silently reused the previous feature's values
    (or raised NameError on the first feature); they now follow the same
    "anything but + is minus" rule the exon counting already applied.

    Args:
        feature_file: path handed to ``Bed``.
        query_list_pos: mapping seqid -> object with ``find(start, end)``.
        query_list_neg: same, for the minus strand.

    Returns:
        (cds, three_all, five_all): three lists with one count per feature.
    """
    flank = 300
    cds = []
    three_all = []
    five_all = []

    for feature in Bed(feature_file):
        on_plus = feature["strand"] == "+"
        by_seqid = query_list_pos if on_plus else query_list_neg
        tree = by_seqid[feature['seqid']]

        # One point query per exon base, end position included.
        cds.append(sum(len(tree.find(pos, pos))
                       for exon in feature['locs']
                       for pos in range(exon[0], exon[1] + 1)))

        first_start = int(feature['locs'][0][0])
        last_end = int(feature['locs'][-1][1])
        leading = tree.find(first_start - flank, first_start)
        trailing = tree.find(last_end, last_end + flank)

        if on_plus:
            five_prime, three_prime = leading, trailing
        else:
            five_prime, three_prime = trailing, leading

        three_all.append(len(three_prime))
        five_all.append(len(five_prime))

    return cds, three_all, five_all
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    """Print one CSV row per assigned CNS to stdout.

    A ``#``-prefixed header naming the columns is written first.  For each
    (cns, qfeat, sfeat) yielded by ``assign`` the row dict from
    ``cns_fmt_dict`` has its subject coordinates put in ascending order and
    gets a 'link' entry from ``assign_url``.

    Args:
        cnsfile: passed to ``get_cns_dict``.
        qbed_file, sbed_file: paths used to build the query/subject ``Bed``.
        qorg, sorg, padding: forwarded unchanged to ``assign_url``.
    """
    qbed = Bed(qbed_file)
    qbed.fill_dict()
    sbed = Bed(sbed_file)
    sbed.fill_dict()

    cnsdict = get_cns_dict(cnsfile)
    out = sys.stdout

    fmt = "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
          "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s"

    # Remove the %-format syntax so only the column names are left.
    column_names = fmt
    for token in ("%(", ")s", ")i"):
        column_names = column_names.replace(token, "")
    out.write("#" + column_names + "\n")

    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed):
        row = cns_fmt_dict(cns, qfeat, sfeat)
        if row['sstop'] < row['sstart']:
            row['sstart'], row['sstop'] = row['sstop'], row['sstart']
        # The link is built from the cns attributes, not the reordered row.
        row['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,
                                 sorg, qorg, padding)
        out.write(fmt % row + "\n")
def utr_present(cns_pck, query_bed_path, UTR):
    """Flag query genes that have no UTR as 'ND' in MUSA_GENE_LIST_copy.

    For every CNS in the pickle, the query feature is looked up and its
    strand-aware boundary (the feature end for ``UTR == 3``, the feature
    start for ``UTR == 5``; start and end are swapped for non-"+" strands)
    is compared with the outermost ``locs`` coordinates.  When the boundary
    coincides with either of them, the matching ``3_UTR`` / ``5_UTR`` column
    of the gene's row is set to 'ND'.  Each statement is printed before it
    is executed.

    Args:
        cns_pck: path of a pickle holding an iterable of dicts with a
            'qaccn' key.
        query_bed_path: path handed to ``Bed``.
        UTR: 3 or 5; any other value results in no updates.

    NOTE(review): no ``commit()`` is issued here; whether the updates persist
    depends on the connection/engine autocommit behaviour — confirm.
    """
    db = MySQLdb.connect(host="127.0.0.1", user="******", db="rice_gene_table")
    cursor = db.cursor()

    # SECURITY: pickle.load can execute arbitrary code; only use pickles
    # produced by this pipeline.  Binary mode is the portable way to read
    # a pickle, and the context manager closes the handle.
    with open(cns_pck, 'rb') as cns_handle:
        cns_pickle = pickle.load(cns_handle)

    query_bed = Bed(query_bed_path)

    for cns in cns_pickle:
        qfeat = query_bed.accn(cns['qaccn'])
        if qfeat['strand'] == "+":
            end, start = qfeat['end'], qfeat['start']
        else:
            end, start = qfeat['start'], qfeat['end']

        # Column names cannot be bound as parameters, so they come from a
        # fixed choice rather than from input.
        if UTR == 3:
            boundary, column = end, "3_UTR"
        elif UTR == 5:
            boundary, column = start, "5_UTR"
        else:
            continue

        locs = qfeat['locs']
        if boundary == min(locs)[0] or boundary == max(locs)[1]:
            prefix = ("update MUSA_GENE_LIST_copy set MUSA_GENE_LIST_copy.%s"
                      " = 'ND' where MUSA_GENE_LIST_copy.Rice_MSU6_genes = "
                      % column)
            # Log the same text as before ...
            print(prefix + "'{0}'".format(cns['qaccn']))
            # ... but let the driver quote the accession value.
            cursor.execute(prefix + "%s", (cns['qaccn'],))
def print_bed(flist, old_path):
    """Write the unique records of ``flist`` to "<old_path stem>.with_new<ext>".

    Each item is indexed positionally: 0 seqid, 1 start, 2 end, 3 accn,
    5 strand, 6 locs.  Items are de-duplicated after their locs have been
    converted to a tuple; the first occurrence wins and input order is kept.
    The target path is announced on stderr.

    Returns:
        A ``Bed`` built from the path that was written.
    """
    ipath, ext = op.splitext(old_path)
    path = "%s.with_new%s" % (ipath, ext)
    sys.stderr.write("writing to: %s.with_new%s" % (ipath, ext) + "\n")

    fh = open(path, 'wb')
    written = set()
    for raw in flist:
        # locs must be hashable for the membership test below.
        fields = list(raw)
        fields[6] = tuple(fields[6])
        record = tuple(fields)
        if record in written:
            continue
        written.add(record)

        row = dict(accn=record[3], start=record[1], end=record[2],
                   seqid=record[0], locs=record[6], score='.',
                   strand=record[5], rgb='.', thickstart='.', thickend=".")
        fh.write(str(Bed.row_string(row)) + "\n")
    fh.close()
    return Bed(path)
def freq(feature_file, window_size, interval, meth_data):
    """Slide a window across every feature and pass dense windows to ``kw``.

    Windows start at every ``interval``-th position of the feature's
    start..end range (both inclusive).  The lookup for a window is clipped
    at the feature end, but ``kw`` always receives the unclipped window end.
    Windows with fewer than 15 matches are skipped.

    Args:
        feature_file: path handed to ``Bed``.
        window_size: window length added to each window start.
        interval: step between consecutive window starts.
        meth_data: mapping seqid -> object with ``find(start, end)``.
    """
    for feature in Bed(feature_file):
        positions = range(int(feature["start"]), int(feature["end"]) + 1)
        for window_start in positions[::interval]:
            window_end = window_start + window_size
            last = positions[-1]
            query_end = last if window_end > last else window_end
            matches = meth_data[feature['seqid']].find(window_start,
                                                       query_end)
            if len(matches) >= 15:
                kw(matches, feature['seqid'], window_start, window_end)
def main(missed, fh_match, org_bed):
    """Merge all hits to the same gene, then update the entire BED file.

    ``merge_same_hits`` runs first; ``merge`` is then called with the
    original BED, the ``missed_from_<name>`` BED located next to it and the
    output path ``all_<name>`` in the same directory, where ``<name>`` is
    the file name of ``org_bed.path``.  The ``missed_from_<name>`` path is
    printed.

    Fix: the directory used to be derived by splitting on "/" by hand, so a
    bare file name (no directory part) produced "/missed_from_<name>" and
    "/all_<name>" at the filesystem root.  ``os.path`` now keeps such paths
    relative; paths that do contain a directory are unchanged.

    Args:
        missed: passed to ``merge_same_hits``.
        fh_match: passed to ``merge_same_hits``.
        org_bed: object with a ``path`` attribute; also passed to
            ``merge_same_hits`` and ``merge``.
    """
    import os.path

    merge_same_hits(missed, fh_match, org_bed)

    dirc, org = os.path.split(org_bed.path)
    missed2 = os.path.join(dirc, 'missed_from_{0}'.format(org))
    merge_fh = os.path.join(dirc, "all_{0}".format(org))
    print(missed2)
    merge(org_bed, Bed(missed2), merge_fh)
def merge_flat(new_name, aflat, bflat):
    """Write the union of two flat files to ``new_name`` and return it as a Bed.

    Rows are identified by (seqid, accn); when both inputs contain the same
    key, the first one seen wins (``aflat`` is read before ``bflat``).  The
    result is ordered by (seqid, start).
    """
    taken = set()
    union = []
    for source in (aflat, bflat):
        for row in source:
            ident = (row['seqid'], row['accn'])
            if ident not in taken:
                taken.add(ident)
                union.append(row)

    ordered = sorted(union, key=lambda rec: (rec['seqid'], rec['start']))

    out = open(new_name, "w")
    for rec in ordered:
        out.write(str(Bed.row_string(rec)) + "\n")
    out.close()
    return Bed(out.name)
def main(cns_file, bedpath, fastapath):
    """Call ``write_to_pos_fasta`` for every gene found in the BED file.

    Genes come from ``get_genespace``; those the BED lookup rejects with
    KeyError are skipped.  The region handed over as CNS space is the
    feature extended by 12000 on both sides, with the start clamped at 0.
    The feature's ``locs`` list is sorted in place before the call.
    """
    genespace = get_genespace(cns_file)
    bed = Bed(bedpath)
    f = Fasta(fastapath)
    fhs = open_files([
        '3_utr', '5_utr', 'intronic', '5_prox', '5_distal', '3_prox',
        '3_distal'
    ])

    for gene in genespace.keys():
        try:
            accn = bed.accn(gene)
        except KeyError:
            continue

        window_start = max(0, accn['start'] - 12000)
        window_end = accn['end'] + 12000
        # A single window needs no sorting.
        cnsspace = [(window_start, window_end)]

        locs = accn['locs']
        locs.sort()
        write_to_pos_fasta(bed, accn, locs, cnsspace, fhs, f)
def loadintointersect(bed_file):
    """Index the features of a BED file as single-position intervals.

    Features are split by strand; within each strand one ``Intersecter`` is
    created per seqid on first use.  Every feature is added as a ``Feature``
    whose start and end are both ``int(feature['start'] - 1)`` and whose
    name is the strand.  Features whose strand is neither "+" nor "-" are
    ignored.

    Changes: the seqid membership test used ``list(dict)``, rebuilding a
    list and scanning it for every feature; it is now a direct dict lookup.
    The two identical per-strand branches were folded into one.  Results
    are unchanged.

    NOTE(review): the original comment said "ADD one because bed adds one",
    but the code subtracts one from the start — confirm which is intended.

    Returns:
        (query_list_pos, query_list_neg): two dicts, seqid -> Intersecter.
    """
    query_list_pos = {}
    query_list_neg = {}
    by_strand = {"+": query_list_pos, "-": query_list_neg}

    for feature in Bed(bed_file):
        trees = by_strand.get(feature["strand"])
        if trees is None:
            continue

        seqid = feature['seqid']
        if seqid not in trees:
            trees[seqid] = Intersecter()

        trees[seqid].add_interval(
            Feature(int(feature['start'] - 1), int(feature['start'] - 1),
                    name=feature['strand']))

    return query_list_pos, query_list_neg
def main(feature_bed, query_list_pos, query_list_neg, fasta_file, mtype, rand):
    """Collect flank and gene-body results for every stranded feature.

    For each feature two 2000-position ranges are built: one ending at the
    first ``locs`` start and one beginning at the last ``locs`` end.  On the
    "+" strand the first is used as TSS region and the second as TTS region;
    on the "-" strand the roles are swapped.  ``get_matchs`` is called for
    both regions and ``get_genebody`` for the feature itself, always with
    the lookup matching the feature's strand.  Features with any other
    strand are skipped.

    The fifth argument of the second ``get_matchs`` call is 1000 for "+" and
    1100 for "-" features, exactly as in the code this replaces.
    ``mtype`` is accepted but not used.

    Returns:
        (All_sites, r, cgene): ``All_sites`` maps accn -> list of
        (region, freq) pairs, ``r`` maps accn -> the second value returned
        by ``get_genebody`` and ``cgene`` maps accn -> the first.
    """
    features = Bed(feature_bed)
    fasta = Fasta(fasta_file)
    All_sites = defaultdict(list)
    r = {}
    cgene = {}

    for feature in features:
        strand = feature["strand"]
        rc = strand == "-"
        if strand != "+" and strand != "-":
            continue

        first_start = int(feature['locs'][0][0])
        last_end = int(feature['locs'][-1][1])
        before_gene = range(first_start - 2000, first_start)
        after_gene = range(last_end, last_end + 2000)

        if rc:
            lookup = query_list_neg
            TSS_region, TTS_region = after_gene, before_gene
            te_offset = 1100
        else:
            lookup = query_list_pos
            TSS_region, TTS_region = before_gene, after_gene
            te_offset = 1000

        TSS_sites = get_matchs(lookup, feature['seqid'], TSS_region,
                               fasta["chromosome_" + feature["seqid"]],
                               -2000, rc)
        TE_sites = get_matchs(lookup, feature['seqid'], TTS_region,
                              fasta["chromosome_" + feature['seqid']],
                              te_offset, rc)
        gene_body, rebin = get_genebody(
            lookup, feature, fasta["chromosome_" + feature["seqid"]], rc,
            rand)

        r[feature["accn"]] = rebin
        cgene[feature["accn"]] = gene_body

        # Index inside the loops: an accn with no sites must not get a key.
        for region, count in TSS_sites:
            All_sites[feature["accn"]].append((region, count))
        for region, count in TE_sites:
            All_sites[feature["accn"]].append((region, count))

    return All_sites, r, cgene
if strand == '-': my_seq = fasta fasta = str(Seq(my_seq).reverse_complement()) if len(fasta) == 0: #print start,stop,accn['accn'] continue seq_w = "{0}\n".format(fasta) new_fasta.write(w) new_fasta.write(seq_w) ####### tair ########## #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/thaliana_v8.fasta","/Users/gt/thaliana_v8_control_SB.fasta") x = random_noncoding( '/Users/gt/Desktop/tmp.csv', Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_golden/thaliana_v8.with_new_cns_mask.bed' ), "/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta", "/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta" ) ######### rice,sorg,set ##### ##### took out strand info used N to mask bed also ######## #x = #random_noncoding('/Users/gt/Desktop/paper/G-box-seq/rice_rice/tmp.csv',Bed('/Users/gt/Desktop/paper/G-box-seq/rice.with_new_cns_mask.bed'),"/Users/gt/Desktop/paper/G-box-seq/rice_rice/rice_j.fasta","/Users/gt/Desktop/paper/G-box-seq/rice_rice/rice_rice_control_fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 
8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta") #
seq = f[seqid][start:end] if "X" in seq: print accn, seqid, start, end if len(seq) < 15 and len(seq) > 0: print "OH NO!!!!!!" w = ">cns{0}\n".format(n) seq_w = "{0}\n".format(seq) new_fasta.write(w) new_fasta.write(seq_w) dict_size = gene_size_dict('/Users/gturco/Desktop/rice_sorg_size.tsv') #dict_size = gene_size_dict("test_file") x = random_noncoding( dict_size, Bed('/Users/gturco/data/paper3/rice_b_sorghum_v1.nolocaldups.with_new_cns_mask.bed' )) print len(x) #####print x get_seq(x, "/Users/gturco/data/paper3/rice_b.fasta", "/Users/gturco/test.fasta") ## ##### seq for cns #handle = open("/Users/gturco/data/paper3/rice_b_sorghum_v1.cns.assigned_real.csv") #fh = handle.read() #cns_list = [] #for line in fh.split("\n")[:-1]: # if line[0] == "#": continue # cns_id,accn,seqid,start,end,strand = line.split(",")[:6] # cns_list.append((seqid,int(start),int(end))) # #len(cns_list)
print "OH NO!!!!!!" w = ">cns{0}\n".format(n) seq_w = "{0}\n".format(seq) new_fasta.write(w) new_fasta.write(seq_w) ######## rice_set ########## #dict_size = gene_size_dict('/Users/gt/tmp.tsv') #x = random_noncoding(dict_size,Bed('/Users/gt/data/paper4/rice_j_setaria_n/rice_j_set.nolocaldups.with_new_cns_mask.bed')) #get_seq(x,"/Users/gt/data/paper4/rice_j.fasta","/Users/gt/data/paper4/rice_j_setaria_n/testing.fasta") ####### rice_sorg ######### dict_size = gene_size_dict('/Users/gt/tmp.tsv') x = random_noncoding( dict_size, Bed('/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorg.nolocaldups.with_new_cns_mask.bed' )) get_seq(x, "/Users/gt/data/paper4/rice_j.fasta", "/Users/gt/data/paper4/rice_j_sorghum_n/testing.fasta") ##### seq for cns #handle = open("/Users/gturco/data/paper3/rice_b_sorghum_v1.cns.assigned_real.csv") #fh = handle.read() #cns_list = [] #for line in fh.split("\n")[:-1]: # if line[0] == "#": continue # cns_id,accn,seqid,start,end,strand = line.split(",")[:6] # cns_list.append((seqid,int(start),int(end))) # #len(cns_list) #get_seq(cns_list,"/Users/gturco/data/paper3/rice_b.fasta","/Users/gturco/test_cns.fasta") ##
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """Move CNSs that were called proteins/RNAs from the CNS list to the BEDs.

    CNSs whose id appears in the protein or RNA tables are removed from the
    CNS list, renamed and merged into the BED files.  The remaining CNSs are
    written to "<cnsfile stem>.real<ext>", each followed by a preliminary
    assignment to every new gene returned by ``get_new`` (queried with
    ``dist + 1000``).  The proper assignment is handled later in assign.py.

    Side effects: "<bed stem>_cns.gff" is created next to each BED path
    with only the GFF3 version header, and the new BEDs are written through
    ``print_bed``.

    Fix: the GFF handles and the ".real" handle were never closed, so their
    contents could still be buffered when this function returned its paths.
    They are now closed deterministically.

    Args:
        qbed_path, sbed_path: query/subject BED paths (must contain ".bed").
        cnsfile: raw CNS file; its directory is also where the protein/RNA
            tables are read from.
        dist: distance passed (plus 1000) to ``get_new``.
        orthology_path: passed to ``read_orthos_to_trees``.

    Returns:
        (new query BED path, new subject BED path, qnew_pairs).

    Raises:
        AssertionError: if a derived GFF path equals its BED path.
    """

    def write_gff_header(gff_path):
        # Create/truncate the GFF file with just its version line.
        with open(gff_path, 'w') as gff:
            gff.write("##gff-version 3\n")

    qcns_file = qbed_path.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_path
    write_gff_header(qcns_file)
    if sbed_path != qbed_path:
        scns_file = sbed_path.replace(".bed", "_cns.gff")
        assert scns_file != sbed_path
        write_gff_header(scns_file)

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)
    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path)
    qbed.fill_dict()
    sbed = Bed(sbed_path)
    sbed.fill_dict()

    name, ext = op.splitext(cnsfile)
    real_cns_fh = open("%s.real%s" % (name, ext), "w")
    try:
        sys.stderr.write("writing to: %s\n" % real_cns_fh.name)
        outdir = op.dirname(cnsfile)
        real_cns_fh.write(
            "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval\n")

        crna = read_cns_to_rna(outdir)
        cpro = read_cns_to_protein_exons(outdir)

        # Split the raw CNSs into protein hits, RNA hits and real CNSs.
        proteins = collections.defaultdict(list)
        rnas = collections.defaultdict(list)
        real_cns_items = []
        for cnsi in CNS.parse_raw_line(cnsfile):
            cns_id = cnsi.cns_id
            cns = cnsi.to_dict()
            key = (cns['qseqid'], cns['sseqid'])
            if cns_id in cpro:
                proteins[key].append((cns, cpro[cns_id]))
            elif cns_id in crna:
                rnas[key].append((cns, crna[cns_id]))
            else:
                real_cns_items.append((cns_id, cns))

        p_trees = fill_tree(proteins)
        r_trees = fill_tree(rnas)

        def assign_new_names(prs, protein_or_rna):
            # Rename each hit after its coordinates and copy over the strand
            # of the gene it was originally attached to.
            n = {}
            for seqid_pair, li in prs.items():
                if seqid_pair not in n:
                    n[seqid_pair] = []
                for gnew, info in li[:]:
                    new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                    new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                    # and give them both an id so we know they were a pair.
                    new_qname += "_%s" % (protein_or_rna)
                    new_sname += "_%s" % (protein_or_rna)
                    try:
                        qstrand = qbed.d[gnew['qaccn']]['strand']
                        sstrand = sbed.d[gnew['saccn']]['strand']
                    except:
                        # Log the offending record, then re-raise unchanged.
                        sys.stderr.write(str(gnew) + "\n")
                        raise
                    gnew['qaccn'] = new_qname
                    gnew['saccn'] = new_sname
                    gnew['qstrand'] = qstrand
                    gnew['sstrand'] = sstrand
                    n[seqid_pair].append((gnew, info))
            return n

        nproteins = assign_new_names(proteins, "protein")
        nrnas = assign_new_names(rnas, "rna")

        cns_seen = {}
        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) in within dist.
        for cns_id, cns in real_cns_items:
            real_cns_fh.write(str(cns_to_str(cns)) + "\n")
            key = (cns['qseqid'], cns['sseqid'])
            # Proteins first, then RNAs.  ``cns`` is re-labelled in place for
            # every new gene, so later rows build on the previous labels.
            for trees, renamed in ((p_trees, nproteins), (r_trees, nrnas)):
                for gnew, info in get_new(cns, trees, key, renamed,
                                          dist + 1000):
                    cns['qaccn'] = gnew['qaccn']
                    cns['saccn'] = gnew['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen:
                        continue
                    cns_seen[cns_str] = 1
                    real_cns_fh.write(str(cns_str) + "\n")
    finally:
        # Flush the ".real" file before the BEDs are built and paths returned.
        real_cns_fh.close()

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    sys.stderr.write("%d\n" % len(qnew_pairs))
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, we add the new cnss again to the same we send in
    # both lists. print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs
def main(old_bed, new_bed1, new_bed2, merge_path):
    """Merge the new genes of two BED files into one output file.

    New genes are those ``new_genes`` reports for each new BED relative to
    ``old_bed``.  Every new gene from the first BED is checked against the
    second with ``overlapping``: without overlap it is written unchanged,
    otherwise its locs are updated from every overlapping gene before it is
    written.  New genes from the second BED that overlapped nothing are
    written afterwards, and ``write_old_bed`` finishes the file.  The two
    new-gene counts are printed first.

    Fixes:
      * the no-overlap branch wrote the undefined name ``gene`` (NameError);
        it now writes ``new_gene``;
      * the output handle was never closed; it is now closed in ``finally``.

    NOTE(review): ``overlapping`` is compared with 0 to detect "no overlap";
    confirm it returns 0 rather than an empty list in that case.
    NOTE(review): the result of ``merge_feats`` was bound but never used and
    the pre-merge ``new_gene`` is what gets written.  The call is kept in
    case it works in place — confirm whether its return value should be
    written instead.

    Args:
        old_bed: the original BED.
        new_bed1, new_bed2: the two BEDs contributing new genes.
        merge_path: path of the merged output file.
    """
    merge_fh = open(merge_path, 'w')
    try:
        new_genes_1 = new_genes(old_bed, new_bed1)
        new_genes_2 = new_genes(old_bed, new_bed2)
        print("%s %s" % (len(new_genes_1), len(new_genes_2)))

        all_overlap = []
        for new_gene in new_genes_1:
            ### does it overlapp with any of the other new genes....
            overlapping_genes = overlapping(new_gene['start'],
                                            new_gene['end'],
                                            new_gene['strand'],
                                            new_gene['seqid'], new_genes_2)
            if overlapping_genes == 0:
                write_bed(new_gene, merge_fh)
                continue
            ### append all overlaping accns
            all_overlap.extend(overlapping_genes)
            for overlapping_gene in overlapping_genes:
                new_gene = update_locs(new_gene, overlapping_gene)
            merge_feats(new_gene)
            write_bed(new_gene, merge_fh)

        #### genes of the second set that overlapped nothing
        for new_gene2 in new_genes_2:
            if new_gene2 not in all_overlap:
                write_bed(new_gene2, merge_fh)

        write_old_bed(new_genes_1, new_genes_2, old_bed, merge_fh)
    finally:
        merge_fh.close()


main(Bed("data/rice_v6_setaria64/rice_v6.bed"),
     Bed("data/rice_v6_setaria64/rice_v6.all2.bed"),
     Bed("data/rice_v6_sorghum_v1/rice_v6.all2.bed"), "test")
print >> fcnss, "%s,%s,%s,[%s,%s],%s,%s" % (qname, qfeat['seqid'], sname, sfeat['qleft_gene'], sfeat['qright_gene'], sfeat['seqid'], ",".join(map(lambda l: ",".join(map(str,l)), cnss))) return None if __name__ == "__main__": import optparse parser = optparse.OptionParser("usage: %prog [options] ") parser.add_option("-F", dest="mask", help="blast mask simple sequence [default: F]", default="F") parser.add_option("-n", dest="ncpu", help="parallelize to this many cores", type='int', default=8) parser.add_option("-q", dest="qfasta", help="path to genomic query fasta") parser.add_option("--qbed", dest="qbed", help="query bed file") parser.add_option("-s", dest="sfasta", help="path to genomic subject fasta") parser.add_option("--sbed", dest="sbed", help="subject bed file") parser.add_option("-p", dest="pairs", help="the pairs file. output from dagchainer") choices = ("dag", "cluster", "pair", 'qa', 'raw', 'pck') parser.add_option("--pair_fmt", dest="pair_fmt", default='raw', help="format of the pairs, one of: %s" % str(choices), choices=choices) (options, _) = parser.parse_args() if not (options.qfasta and options.sfasta and options.sbed and options.qbed): sys.exit(parser.print_help()) qbed = Bed(options.qbed, options.qfasta); qbed.fill_dict() sbed = Bed(options.sbed, options.sfasta); sbed.fill_dict() assert options.mask in 'FT' main(qbed, sbed, options.pairs, options.pair_fmt, options.mask, options.ncpu)
write_new_bed(new_genes_final, old_bed, missed_genes, out_file) sort_file = "sort -n -k 1 -k 2 {0} -o {0}".format(out_file) commands.getstatusoutput(sort_file) if __name__ == "__main__": import optparse parser = optparse.OptionParser("usage: %prog [options] ") parser.add_option("--missed_bed", dest="new_bed", help="missed ORGA from ORGB bed file from coanno ") parser.add_option( "--missed_matches", dest="missed_genes", help="missed ORGA from ORGB matches.txt file from coanno") parser.add_option("--old_bed", dest="old_bed", help="orginal bed file for ORG") parser.add_option("--out", dest="out_fh", help="out_file: where the new merged bed should go") (options, _) = parser.parse_args() new_bed = Bed(options.new_bed) old_bed = Bed(options.old_bed) main(options.missed_genes, old_bed, new_bed, options.out_fh) #merge_same_hits(Bed('data/athaliana_lyrata2/missed_lyrata_from_athaliana.bed'),'data/athaliana_lyrata2/missed_lyrata_from_athaliana.matches.txt',Bed('data/athaliana_lyrata2/lyrata.bed')) #merge(Bed('data/athaliana_lyrata2/lyrata.bed'),Bed('data/athaliana_lyrata2/missed_from_lyrata.bed'),'data/athaliana_lyrata2/lyrata.all.bed')
def __init__(self, filename, bed):
    """Remember ``filename`` and load the BED given by ``bed``.

    ``bed`` is handed to ``Bed``; the resulting object is stored on
    ``self.bed`` and has ``fill_dict()`` called on it.
    """
    self.filename = filename
    self.bed = loaded = Bed(bed)
    loaded.fill_dict()
type='string', help="path to query localdup_file") parser.add_option("--sdups", dest="sdups", type='string', help="path to subject localdup_file") parser.add_option("--cns_file", dest="cns_file", type='string', help="path to cns file cns.txt") parser.add_option("--UMfasta", dest="unmasked_fasta", help="path to unmasked fasta file file") (options, _) = parser.parse_args() qbed = Bed(options.qbed, options.qfasta) qbed.fill_dict() sbed = Bed(options.sbed, options.sfasta) sbed.fill_dict() unmasked_fasta = Fasta(options.unmasked_fasta) assert options.mask in 'FT' qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed" snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed" #pairs_to_qa("{0}.local".format(options.pairs),'pair',"{0}.nolocaldups.local".format(qbed.path.split(".")[0]),"{0}.nolocaldups.local".format(sbed.path.split(".")[0]),"{0}.raw.filtered.local".format(options.pairs.split(".")[0])) import logging LOG_FILENAME = path.dirname(options.qfasta) + "dup_rdups.log" logging.basicConfig(filename=LOG_FILENAME, level=logging.INFO) main(options.cns_file, options.qdups, options.sdups, options.pairs,
# print interval_list[0].find(0,100000000) # print interval_list[0].find(3577840,3577841) # print three_prom # print gene_body # print five_prom #three_prom = [i for i in three_prom if i.name == gene['strand']] #five_prom = [i for i in five_prom if i.name == gene['strand']] #gene_body = [i for i in gene_body if i.name == gene['strand']] if len(three_prom) > 0: l = "{0}\t3_prom\t{1}\t{2}\t{3}\n".format( gene_name, three_prom_p, three_prom_p * len(three_prom), sum(int(sig.name) for sig in three_prom)) out.write(l) if len(five_prom) > 0: l = "{0}\t5_prom\t{1}\t{2}\t{3}\n".format( gene_name, five_prom_p, five_prom_p * len(five_prom), sum(int(sig.name) for sig in five_prom)) out.write(l) if len(gene_body) > 0: l = "{0}\tgene_body\t{1}\t{2}\t{3}\n".format( gene_name, gene_body_p, gene_body_p * len(gene_body), sum(int(sig.name) for sig in gene_body)) out.write(l) genelist = Bed("sorg.bed") interval_list = insert_queries("DMR_NONVAS_CG_HYPO") find_intersections(3000, interval_list, genelist, "DMR_nonvas.genes")
parser.add_option("--paralogy", dest="paralogy", help="paralogy file") parser.add_option("--orthology", dest="orthology", help="orthology file") opts, _ = parser.parse_args() if not (opts.qflat_all and opts.sflat_all and opts.datasheet): print "A" sys.exit(parser.print_help()) if not (opts.qdsgid and opts.qorg and opts.sorg): print "B" sys.exit(parser.print_help()) if not (opts. qdups and opts.sdups and opts.paralogy and opts.orthology): print "C" sys.exit(parser.print_help()) qflat_new = Bed(opts.qflat_new) sflat_new = qflat_new if opts.qflat_new == opts.sflat_new else Bed(opts.sflat_new) qflat_all = Bed(opts.qflat_all) sflat_all = qflat_all if opts.qflat_all == opts.sflat_all else Bed(opts.sflat_all) qfpath = "%s.all%s" % op.splitext(qflat_new.path) sfpath = "%s.all%s" % op.splitext(sflat_new.path) qflat = merge_flat(qfpath, qflat_all, qflat_new) sflat = merge_flat(sfpath, sflat_all, sflat_new) qdups = parse_dups(opts.qdups, qflat) sdups = parse_dups(opts.sdups, sflat) qlocaldups = parse_dups(opts.qlocaldups,qflat)
#print line.split(",")[:5] cns_id, accn, seqid, start, end, strand = line.split(",")[:6] w = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t.\t.\t.\t1\t{6}\t0\n'.format( seqid, start, end, cns_id, (int(end) - int(start)), strand, (int(end) - int(start) + 1)) cns_bed.write(w) #cns_to_bed("/Users/gt/data/paper4/rice_j_setaria_n/rice_j_setaria_n.cns.assigned_real.csv","/Users/gt/data/paper4/rice_j_setaria_n/rice_j_setaria_n_cns.bed") #merge_flat("/Users/gt/data/paper4/rice_j_setaria_n/rice_j_set.nolocaldups.with_new_cns.bed",Bed("/Users/gt/data/paper4/rice_j_setaria_n/rice_j.nolocaldups.with_new.all.local"),Bed("/Users/gt/data/paper4/rice_j_setaria_n/rice_j_setaria_n_cns.bed")) cns_to_bed( "/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorghum_n.cns.assigned_real.csv", "/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorghum_n_cns.bed") merge_flat( "/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorg.nolocaldups.with_new_cns.bed", Bed("/Users/gt/data/paper4/rice_j_sorghum_n/rice_j.nolocaldups.with_new.all.local" ), Bed("/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorghum_n_cns.bed")) def mask_to_bed(fasta_file, mask_bed_name): "creates a bed file of the start and stops of masked seqs" mask_bed = open(mask_bed_name, "wb") f = Fasta(fasta_file) mask_id = 1 for seqid in f.keys(): seq = f[seqid][:] for m in re.finditer("X+", seq): mask_id = mask_id + 1 w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format( seqid, m.start(), m.end(), "mask_id {0}".format(mask_id), (m.end() - m.start()), (m.end() - m.start() + 1))
org_bed_path = org_bed.path path = org_bed_path.split('/') dirc = '/'.join(path[:-1]) org = path[-1] missed2 = '{0}/missed_from_{1}'.format(dirc, org) merge_fh = "{0}/all_{1}".format(dirc, org) print missed2 merge(org_bed, Bed(missed2), merge_fh) if __name__ == "__main__": import optparse parser = optparse.OptionParser("usage: %prog [options] ") parser.add_option("--missed", dest="missed", help="missed ORGA from ORGB bed file from coanno ") parser.add_option( "--match", dest="fh_match", help="missed ORGA from ORGB matches.txt file from coanno") parser.add_option("--org", dest="org_bed", help="orginal bed file for ORG") (options, _) = parser.parse_args() missed_bed = Bed(options.missed) org_bed = Bed(options.org_bed) main(missed_bed, options.fh_match, org_bed) #merge_same_hits(Bed('data/athaliana_lyrata2/missed_lyrata_from_athaliana.bed'),'data/athaliana_lyrata2/missed_lyrata_from_athaliana.matches.txt',Bed('data/athaliana_lyrata2/lyrata.bed')) #merge(Bed('data/athaliana_lyrata2/lyrata.bed'),Bed('data/athaliana_lyrata2/missed_from_lyrata.bed'),'data/athaliana_lyrata2/lyrata.all.bed')
def test_main(self):
    """Run ``main`` on the fixture BEDs and print whatever it returns."""
    qbed = Bed(self.qbed, self.qfasta)
    qbed.fill_dict()
    sbed = Bed(self.sbed, self.sfasta)
    sbed.fill_dict()

    result = main(qbed, sbed, self.pairs, 12000, 12000, "pair",
                  self.blast_path, "T", 2)
    print(result)