def bed_seq_data(gap_chr_data, chr_size_file, out_bed_folder, seq_len=524288,
                 shift=524288 // 2, cutting_chr_edges=100000,
                 test_fraction=0.2, chr_list="all"):
    """Tile each chromosome with fixed-length sequence windows between gaps
    and split them into train/test sets, caching the result as BED files.

    Parameters
    ----------
    gap_chr_data : dict
        Maps chromosome name -> table (DataFrame-like) with "start"/"end"
        columns giving gap coordinates.
    chr_size_file : str
        Tab-separated file, no header: chromosome name, chromosome length.
    out_bed_folder : str
        Folder (with trailing separator) where the combined, train-only and
        test-only BED files are written and looked up as a cache.
    seq_len : int
        Length of every window.
    shift : int
        Step between consecutive window starts.
    cutting_chr_edges : int
        Margin skipped at both chromosome ends.
    test_fraction : float
        Fraction of windows per chromosome marked "test" — one contiguous,
        randomly placed stretch.
    chr_list : "all" or iterable of str
        Chromosomes to process; "all" means every chromosome listed in
        chr_size_file.

    Returns
    -------
    dict
        Chromosome name -> DataFrame with columns
        ["chr", "start", "end", "train_test"].
    """
    # Build all output file names once, from the chr_list argument as passed
    # by the caller.  (Previously the cache check used the original argument
    # but the writes used chr_list AFTER it was reassigned to a pandas
    # Series in the "all" case, so str(chr_list) was a multi-line Series
    # repr: the written file name was broken and the cache never hit.)
    suffix = "shift" + str(seq_len // shift) + "_" + str(chr_list) + ".bed"
    combined_path = out_bed_folder + "train_test_" + suffix
    train_path = out_bed_folder + "train_" + suffix
    test_path = out_bed_folder + "test_" + suffix

    if os.path.exists(combined_path):
        logging.info("Found dump for seq bed file " + combined_path)
        bed_reader = BedReader(combined_path)
        bed_reader.read_file(renamer={"0": "chr", "1": "start",
                                      "2": "end", "3": "train_test"})
        return bed_reader.chr_data

    chr_size_data = pd.read_csv(chr_size_file, header=None,
                                names=["chr", "size"], sep="\t")
    if chr_list == "all":
        chr_list = chr_size_data["chr"]
    chr_data = dict()
    for chrom in chr_list:  # renamed from `chr` — don't shadow the builtin
        chr_size = chr_size_data[chr_size_data["chr"] == chrom]
        assert len(chr_size) == 1  # chromosome must appear exactly once
        # Label-based lookup; positional Series indexing (iloc[0][1]) is
        # deprecated in modern pandas.
        chr_len = chr_size.iloc[0]["size"]
        if chrom not in gap_chr_data.keys():
            logging.getLogger(__name__).warning(
                "There are no gaps on " + chrom + " chromosome")
            # Sentinel zero-length "gap" at the usable chromosome end so the
            # loop below still tiles the whole [edge, len - edge] interval.
            gaps = [(chr_len - cutting_chr_edges, chr_len - cutting_chr_edges)]
        else:
            gaps = list(zip(gap_chr_data[chrom]["start"],
                            gap_chr_data[chrom]["end"]))
        chrs, starts, ends = [], [], []
        start_seq = cutting_chr_edges
        for count, gap in enumerate(sorted(gaps)):
            end_seq = gap[0] - seq_len  # start of last window fitting before this gap
            if end_seq - start_seq < seq_len:
                # Region between previous gap and this one is too short.
                start_seq = gap[1]
                # NOTE(review): this `continue` also skips the tail-tiling
                # branch below when the short region precedes the LAST gap,
                # leaving everything after the last gap untiled — confirm
                # this is intended.
                continue
            for start in range(start_seq, end_seq + 1, shift):
                chrs.append(chrom)
                starts.append(start)
                ends.append(start + seq_len)
            start_seq = gap[1]
            if count == len(gaps) - 1:
                # Tail region after the last gap, up to the trimmed end.
                start_seq = gap[1]
                end_seq = chr_len - cutting_chr_edges - seq_len
                if end_seq - start_seq > seq_len:
                    for start in range(start_seq, end_seq, shift):
                        chrs.append(chrom)
                        starts.append(start)
                        ends.append(start + seq_len)
        data = pd.DataFrame({"chr": chrs, "start": starts, "end": ends})
        data["train_test"] = ["train"] * len(data)
        # Mark one contiguous, randomly placed stretch of windows as "test".
        random.seed()
        test_size = round(len(data) * test_fraction)
        rand_int = random.randint(0, len(data) - test_size)
        data.iloc[rand_int:rand_int + test_size, 3] = "test"
        chr_data[chrom] = data

    conc_data = pd.concat([chr_data[name] for name in chr_data.keys()])
    conc_data.to_csv(combined_path, sep="\t", header=False, index=False)
    conc_data[conc_data["train_test"] == "train"][["chr", "start", "end"]].to_csv(
        train_path, header=False, index=False, sep="\t")
    conc_data[conc_data["train_test"] == "test"][["chr", "start", "end"]].to_csv(
        test_path, header=False, index=False, sep="\t")
    return chr_data
def generate_gaps(chr_list, cool_file, output_gap_folder, zero_proc_in_line=97,
                  bins_min_gap_between=3, bins_min_gap=3):
    """Detect assembly gaps per chromosome from a Hi-C cooler file and cache
    them as a BED file.

    A bin is treated as gapped when at least ``zero_proc_in_line`` percent of
    its contact-matrix row is zero; adjacent gapped bins are then merged by
    ``merge_gaps`` using the two ``bins_min_*`` thresholds (expressed in
    bins, converted to base pairs here).

    Parameters
    ----------
    chr_list : iterable of str
        Chromosomes to scan.
    cool_file : str
        Path to the .cool file.
    output_gap_folder : str
        Folder where the gap BED file is written and looked up as a cache.
    zero_proc_in_line : int
        Minimum percentage of zeros in a bin's row for it to count as a gap.
    bins_min_gap_between : int
        Minimum distance (in bins) between gaps for them to stay separate.
    bins_min_gap : int
        Minimum gap length (in bins) to keep.

    Returns
    -------
    dict
        Chromosome name -> gap table as produced by ``merge_gaps``.
    """
    output_gap_file = (output_gap_folder + "gaps+chr" + str(chr_list)
                       + "_proc" + str(zero_proc_in_line) + ".bed")
    if os.path.exists(output_gap_file):
        logging.info("Found dump for gap file " + output_gap_file)
        bed_reader = BedReader(output_gap_file)
        bed_reader.read_file()
        return bed_reader.chr_data

    c = cooler.Cooler(cool_file)
    # Loop-invariant: bin size and the derived bp thresholds do not change
    # per chromosome, so compute them once (previously recomputed per chr).
    bin_size = c.binsize
    minimum_gap_between_length = bin_size * bins_min_gap_between
    min_gap_length = bin_size * bins_min_gap
    chr_data = dict()
    for chrom in chr_list:
        matrix = c.matrix(balance=False).fetch(chrom)
        gaps = []
        for n, row in enumerate(matrix):
            # Percentage of zero entries in this bin's contact row.
            # (Dead per-row accumulator lists removed — they were never read.)
            proc_zero = (len(row) - np.count_nonzero(row)) / len(row) * 100
            if proc_zero >= zero_proc_in_line:
                # NOTE(review): bin n covers [n*bin_size, (n+1)*bin_size);
                # recording (n+1)*bin_size as the start shifts every gap one
                # bin to the right — confirm this off-by-one is intended.
                gaps.append((chrom, (n + 1) * bin_size,
                             (n + 1) * bin_size + bin_size))
        chr_data[chrom] = merge_gaps(gaps, minimum_gap_between_length,
                                     min_gap_length, output_gap_file)
        logging.info("Found " + str(len(chr_data[chrom]))
                     + " gaps on chr " + chrom)
    conc_data = pd.concat([chr_data[name] for name in chr_data.keys()])
    conc_data.to_csv(output_gap_file, sep="\t", header=False, index=False)
    return chr_data