def bed_seq_data(gap_chr_data, chr_size_file, out_bed_folder, seq_len=524288,
                 shift=524288 // 2, cutting_chr_edges=100000, test_fraction=0.2,
                 chr_list="all"):
    """Tile chromosomes with fixed-length windows and split them train/test.

    Windows of ``seq_len`` bp are placed every ``shift`` bp in the regions
    between assembly gaps (``gap_chr_data``), staying ``cutting_chr_edges``
    bp away from both chromosome ends.  Per chromosome, one random
    contiguous slice holding ``test_fraction`` of the windows is labelled
    "test"; the rest stay "train".  Results are dumped as BED files under
    ``out_bed_folder`` and the combined dump is reused on later calls.

    Args:
        gap_chr_data: dict mapping chromosome name -> DataFrame-like with
            "start"/"end" gap coordinates.
        chr_size_file: two-column TSV of chromosome name and length.
        out_bed_folder: output directory prefix (should end with a path
            separator -- it is concatenated directly with file names).
        seq_len: window length in bp.
        shift: step between window starts in bp.
        cutting_chr_edges: bp excluded at each chromosome end.
        test_fraction: fraction of each chromosome's windows labelled "test".
        chr_list: iterable of chromosome names, or "all" for every
            chromosome listed in chr_size_file.

    Returns:
        dict mapping chromosome name -> DataFrame with columns
        chr / start / end / train_test.
    """
    # BUGFIX: freeze the dump-file suffix BEFORE chr_list may be reassigned.
    # Previously the file names were rebuilt after chr_list=="all" was
    # replaced by a pandas Series, so the written dump was named
    # str(<Series>) and the dump-lookup above could never find it again.
    suffix = str(seq_len // shift) + "_" + str(chr_list) + ".bed"
    dump_path = out_bed_folder + "train_test_shift" + suffix
    if os.path.exists(dump_path):
        logging.info("Found dump for seq bed file " + dump_path)
        bed_reader = BedReader(dump_path)
        bed_reader.read_file(renamer={"0": "chr", "1": "start",
                                      "2": "end", "3": "train_test"})
        return bed_reader.chr_data

    chr_size_data = pd.read_csv(chr_size_file, header=None,
                                names=["chr", "size"], sep="\t")
    if chr_list == "all":
        chr_list = chr_size_data["chr"]
    chr_data = dict()
    for chrom in chr_list:
        chr_size = chr_size_data[chr_size_data["chr"] == chrom]
        assert len(chr_size) == 1
        # Label-based access instead of deprecated positional Series[1].
        chr_len = chr_size.iloc[0]["size"]
        if chrom not in gap_chr_data.keys():
            logging.getLogger(__name__).warning(
                "There are no gaps on " + chrom + " chromosome")
            # Synthesize a zero-length "gap" at the right edge so the
            # end-of-chromosome branch below still runs once.
            gaps = [(chr_len - cutting_chr_edges, chr_len - cutting_chr_edges)]
        else:
            gaps = list(zip(gap_chr_data[chrom]["start"],
                            gap_chr_data[chrom]["end"]))
        chrs, starts, ends = [], [], []
        start_seq = cutting_chr_edges
        for count, gap in enumerate(sorted(gaps)):
            end_seq = gap[0] - seq_len  # start of last window before this gap
            if end_seq - start_seq < seq_len:
                # Inter-gap region too small to tile; skip past the gap.
                start_seq = gap[1]
                continue
            for start in range(start_seq, end_seq + 1, shift):
                chrs.append(chrom)
                starts.append(start)
                ends.append(start + seq_len)
            start_seq = gap[1]
            if count == len(gaps) - 1:
                # Tile the tail between the last gap and the chromosome end.
                start_seq = gap[1]
                end_seq = chr_len - cutting_chr_edges - seq_len
                if end_seq - start_seq > seq_len:
                    for start in range(start_seq, end_seq, shift):
                        chrs.append(chrom)
                        starts.append(start)
                        ends.append(start + seq_len)
        data = pd.DataFrame({"chr": chrs, "start": starts, "end": ends})
        data["train_test"] = ["train"] * len(data)
        # Reserve one random contiguous block of windows as the test set so
        # overlapping shifted windows do not straddle the train/test split.
        random.seed()
        n_test = round(len(data) * test_fraction)
        rand_int = random.randint(0, len(data) - n_test)
        data.iloc[rand_int:rand_int + n_test, 3] = "test"
        chr_data[chrom] = data

    conc_data = pd.concat([chr_data[c] for c in chr_data.keys()])
    conc_data.to_csv(dump_path, sep="\t", header=False, index=False)
    conc_data[conc_data["train_test"] == "train"][["chr", "start", "end"]].to_csv(
        out_bed_folder + "train_shift" + suffix,
        header=False, index=False, sep="\t")
    conc_data[conc_data["train_test"] == "test"][["chr", "start", "end"]].to_csv(
        out_bed_folder + "test_shift" + suffix,
        header=False, index=False, sep="\t")
    return chr_data
def addUTR(self, filename, genes):
    """Read BED6 UTR records from *filename* and attach them to *genes*.

    Each record's interval is added as a UTR to the gene whose name matches
    the record's name field; unseen names create new BedGene objects that
    are appended to *genes* in place.  Every gene is coalesce()d afterwards.

    Raises:
        Exception: if any record has fewer than 6 BED fields.
    """
    byName = self.hashGenes(genes)
    reader = BedReader(filename)
    while (True):
        record = reader.nextRecord()
        if (not record): break
        if (not record.isBed6()): raise Exception("BED file has too few fields")
        name = record.name
        gene = byName.get(name, None)
        if (not gene):
            gene = BedGene(name, record.chr, record.strand)
            genes.append(gene)
            byName[name] = gene
        gene.addUTR(record.interval)
    # Close the reader when done: readCDS() does this, but addUTR()
    # previously leaked the file handle.
    reader.close()
    for gene in genes: gene.coalesce()
def addUTR(self, filename, genes):
    """Attach every BED6 interval in *filename* as a UTR to the gene of the
    same name, creating (and appending to *genes*) a new BedGene for names
    not seen before; coalesce all genes at the end.  Raises Exception when
    a record has fewer than 6 BED fields."""
    byName = self.hashGenes(genes)
    reader = BedReader(filename)
    record = reader.nextRecord()
    while record:
        if not record.isBed6():
            raise Exception("BED file has too few fields")
        gene = byName.get(record.name)
        if not gene:
            gene = BedGene(record.name, record.chr, record.strand)
            genes.append(gene)
            byName[record.name] = gene
        gene.addUTR(record.interval)
        record = reader.nextRecord()
    for g in genes:
        g.coalesce()
def readCDS(self, filename):
    """Read a BED6 file of CDS intervals and group them into BedGene objects.

    Records sharing the same name field are merged into one gene; genes are
    returned in the order their names first appear in the file.

    Raises:
        Exception: if any record has fewer than 6 BED fields.
    """
    reader = BedReader(filename)
    genes = []
    genesByName = {}
    while (True):
        record = reader.nextRecord()
        # nextRecord() presumably returns None at end of file -- confirm.
        if (not record): break
        if (not record.isBed6()): raise Exception("BED file has too few fields")
        id = record.name
        gene = genesByName.get(id, None)
        if (not gene):
            # First occurrence of this gene name: create and remember it.
            gene = BedGene(id, record.chr, record.strand)
            genesByName[id] = gene
            genes.append(gene)
        gene.addCDS(record.interval)
    reader.close()
    return genes
def readCDS(self, filename):
    """Parse a BED6 file into a list of BedGene objects, grouping CDS
    intervals by record name; genes come back in first-seen order.
    Raises Exception when a record has fewer than 6 BED fields."""
    reader = BedReader(filename)
    genes = []
    byName = {}
    record = reader.nextRecord()
    while record:
        if not record.isBed6():
            raise Exception("BED file has too few fields")
        name = record.name
        gene = byName.get(name)
        if not gene:
            gene = BedGene(name, record.chr, record.strand)
            byName[name] = gene
            genes.append(gene)
        gene.addCDS(record.interval)
        record = reader.nextRecord()
    reader.close()
    return genes
def generate_gaps(chr_list, cool_file, output_gap_folder, zero_proc_in_line=97, bins_min_gap_between=3, bins_min_gap=3):
    """Detect assembly gaps per chromosome from a Hi-C contact matrix.

    A matrix row (bin) whose fraction of zero entries is at least
    ``zero_proc_in_line`` percent is flagged as a gap bin; flagged bins are
    then merged by merge_gaps() using the minimum gap length and minimum
    distance between gaps (both expressed in bins).  Results are written to
    a BED dump under ``output_gap_folder`` and reused on subsequent calls.

    Returns:
        dict mapping chromosome name -> gap table (as produced by
        merge_gaps / BedReader.read_file).
    """
    output_gap_file = output_gap_folder + "gaps+chr" + str(
        chr_list) + "_proc" + str(zero_proc_in_line) + ".bed"
    if os.path.exists(output_gap_file):
        # Reuse a previously computed gap dump instead of re-scanning.
        logging.info("Found dump for gap file " + output_gap_file)
        bed_reader = BedReader(output_gap_file)
        bed_reader.read_file()
        return bed_reader.chr_data
    else:
        c = cooler.Cooler(cool_file)
        chr_data = dict()
        for chr in chr_list:
            data = c.matrix(balance=False).fetch(chr)
            bin_size = c.binsize
            minimum_gap_between_length = bin_size * bins_min_gap_between
            min_gap_length = bin_size * bins_min_gap
            zero_counts = []
            proc_zero_counts = []
            gaps = []
            for n, row in enumerate(data):
                # calculate procent of zeroes in every column
                zero_count = len(row) - np.count_nonzero(row)
                proc_zero = zero_count / len(row) * 100
                zero_counts.append(zero_count)
                proc_zero_counts.append(proc_zero)
                if proc_zero >= zero_proc_in_line:
                    # NOTE(review): bin n normally spans
                    # [n*bin_size, (n+1)*bin_size); using (n+1)*bin_size as
                    # the gap start looks shifted by one bin -- confirm the
                    # intended coordinate convention.
                    gaps.append((chr, (n + 1) * bin_size, (n + 1) * bin_size + bin_size))
            # print(gaps)
            data = merge_gaps(gaps, minimum_gap_between_length, min_gap_length, output_gap_file)
            # convert to chr-dict
            chr_data[chr] = data
            logging.info("Found " + str(len(chr_data[chr])) + " gaps on chr " + chr)
        conc_data = pd.concat([chr_data[chr] for chr in chr_data.keys()])
        conc_data.to_csv(output_gap_file, sep="\t", header=False, index=False)
        return chr_data
def addPeaks(bedDir, timepoint, direction, expression):
    """Load <timepoint>_<direction>.bed from *bedDir*, collapse each record
    to a 1-bp interval at its center (tagged type="peak", dir=direction),
    and append it to expression[chr], creating the list when missing."""
    records = BedReader.readAll(bedDir + "/" + timepoint + "_" + direction + ".bed")
    for rec in records:
        bucket = expression.get(rec.chr)
        if bucket is None:
            bucket = expression[rec.chr] = []
        iv = rec.interval
        iv.begin = iv.intCenter()
        iv.end = iv.begin + 1
        iv.type = "peak"
        iv.dir = direction
        bucket.append(iv)
#!/usr/bin/env python #========================================================================= # This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2016 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) from BedReader import BedReader BASE = "/Users/bmajoros/python/test/data" filename = BASE + "/DEGs_downreg.FDR_0.1.TSS.protein_coding.bed" reader = BedReader(filename) while (True): rec = reader.nextRecord() if (rec is None): break begin = rec.getBegin() end = rec.getEnd() print(begin, "-", end, sep="", end="") print("\t" + rec.name + "\t" + str(rec.score) + "\t" + rec.strand) reader.close()
#!/usr/bin/env python
#=========================================================================
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
# License (GPL) version 3, as described at www.opensource.org.
# Copyright (C)2016 William H. Majoros ([email protected]).
#=========================================================================
from __future__ import (absolute_import, division, print_function,
    unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
    chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
from BedReader import BedReader

# Smoke test for BedReader: dump every record of a fixed BED file as
# "begin-end<TAB>name<TAB>score<TAB>strand".
BASE = "/Users/bmajoros/python/test/data"
filename = BASE + "/DEGs_downreg.FDR_0.1.TSS.protein_coding.bed"

reader = BedReader(filename)
rec = reader.nextRecord()
while rec is not None:
    begin = rec.getBegin()
    end = rec.getEnd()
    print(begin, "-", end, sep="", end="")
    print("\t" + rec.name + "\t" + str(rec.score) + "\t" + rec.strand)
    rec = reader.nextRecord()
reader.close()