Ejemplo n.º 1
0
def bed_seq_data(gap_chr_data, chr_size_file, out_bed_folder, seq_len=524288, shift=524288//2, cutting_chr_edges = 100000, test_fraction=0.2, chr_list="all"):
    """Tile chromosomes with fixed-length windows that avoid gaps and mark a train/test split.

    If the combined BED dump for these parameters already exists under
    ``out_bed_folder`` it is loaded through ``BedReader`` and returned.
    Otherwise windows are generated per chromosome, one contiguous run of
    windows per chromosome is labelled ``"test"`` (the rest ``"train"``), and
    three tab-separated, header-less files are written: ``train_test_*``,
    ``train_*`` and ``test_*``.

    Args:
        gap_chr_data: Mapping of chromosome name to a table with ``"start"``
            and ``"end"`` columns holding gap coordinates. Chromosomes that
            are missing from the mapping are treated as gap-free.
        chr_size_file: Tab-separated file without a header whose two columns
            are chromosome name and chromosome size.
        out_bed_folder: Output prefix. File names are appended by plain string
            concatenation, so it should end with a path separator.
        seq_len: Length of every window.
        shift: Step between the starts of consecutive windows.
        cutting_chr_edges: Amount skipped at both ends of each chromosome.
        test_fraction: Fraction of a chromosome's windows labelled ``"test"``.
        chr_list: ``"all"`` to use every chromosome listed in
            ``chr_size_file``, or an iterable of chromosome names.

    Returns:
        On a cache hit, ``BedReader.chr_data`` for the existing dump. Otherwise
        a dict mapping chromosome name to a DataFrame with the columns
        ``chr``, ``start``, ``end`` and ``train_test``.
    """
    # Build the file names from the caller's original ``chr_list`` value, before
    # "all" is expanded to the actual chromosome names below. This keeps the
    # name that is written identical to the name the cache check looks for.
    bed_suffix = "shift" + str(seq_len // shift) + "_" + str(chr_list) + ".bed"
    train_test_bed = out_bed_folder + "train_test_" + bed_suffix

    if os.path.exists(train_test_bed):
        logging.info("Found dump for seq bed file " + train_test_bed)
        bed_reader = BedReader(train_test_bed)
        bed_reader.read_file(renamer = {"0":"chr","1":"start","2":"end", "3":"train_test"})
        return bed_reader.chr_data

    chr_size_data = pd.read_csv(chr_size_file, header=None, names = ["chr", "size"], sep="\t")
    # The isinstance guard avoids an ambiguous element-wise comparison when the
    # caller passes an array-like of chromosome names.
    if isinstance(chr_list, str) and chr_list == "all":
        chromosomes = chr_size_data["chr"]
    else:
        chromosomes = chr_list

    chr_data = dict()
    for chrom in chromosomes:
        chr_size = chr_size_data[chr_size_data["chr"] == chrom]
        assert len(chr_size) == 1, "expected exactly one size entry for chromosome " + str(chrom)
        # Look the size up by column label; positional [] on a labelled row is deprecated.
        chr_len = int(chr_size["size"].iloc[0])

        if chrom not in gap_chr_data:
            logging.getLogger(__name__).warning("There are no gaps on %s chromosome", chrom)
            # A zero-length pseudo gap at the trimmed chromosome end lets the
            # loop below tile the whole chromosome as one inter-gap region.
            gaps = [(chr_len - cutting_chr_edges, chr_len - cutting_chr_edges)]
        else:
            gaps = list(zip(gap_chr_data[chrom]["start"], gap_chr_data[chrom]["end"]))

        starts = []
        start_seq = cutting_chr_edges
        for gap_start, gap_end in sorted(gaps):
            end_seq = gap_start - seq_len  # start of the last window that still ends before the gap
            if end_seq - start_seq >= seq_len:
                starts.extend(range(start_seq, end_seq + 1, shift))
            # The next region begins right after this gap, whether or not the
            # region before it was long enough to be tiled.
            start_seq = gap_end

        # Region between the last gap and the trimmed chromosome end. It is
        # handled after the loop so that it is not skipped when the region in
        # front of the last gap was too short, or when there are no gaps at all.
        # NOTE: unlike the inter-gap regions this uses a strict comparison and an
        # exclusive range end; kept that way for backward-compatible output.
        end_seq = chr_len - cutting_chr_edges - seq_len
        if end_seq - start_seq > seq_len:
            starts.extend(range(start_seq, end_seq, shift))

        data = pd.DataFrame({
            "chr": [chrom] * len(starts),
            "start": starts,
            "end": [start + seq_len for start in starts],
        })
        data["train_test"] = ["train"] * len(data)

        # Mark one randomly placed contiguous run of windows as the test set.
        # random.seed() without arguments reseeds from a system entropy source,
        # so the position of the run differs between invocations.
        n_test = round(len(data) * test_fraction)
        random.seed()
        rand_int = random.randint(0, len(data) - n_test)
        data.iloc[rand_int:rand_int + n_test, 3] = "test"
        chr_data[chrom] = data

    conc_data = pd.concat(list(chr_data.values()))
    conc_data.to_csv(train_test_bed, sep="\t", header=False, index=False)
    coords = ["chr", "start", "end"]
    conc_data.loc[conc_data["train_test"] == "train", coords].to_csv(
        out_bed_folder + "train_" + bed_suffix, header=False, index=False, sep="\t")
    conc_data.loc[conc_data["train_test"] == "test", coords].to_csv(
        out_bed_folder + "test_" + bed_suffix, header=False, index=False, sep="\t")
    return chr_data
Ejemplo n.º 2
0
def generate_gaps(chr_list,
                  cool_file,
                  output_gap_folder,
                  zero_proc_in_line=97,
                  bins_min_gap_between=3,
                  bins_min_gap=3):
    """Find mostly-empty rows of a cooler contact matrix and store them as gaps.

    When the gap file for this ``chr_list`` / ``zero_proc_in_line`` combination
    already exists in ``output_gap_folder`` it is read back through
    ``BedReader`` and returned. Otherwise every chromosome in ``chr_list`` is
    fetched from the unbalanced matrix of ``cool_file``; each row whose
    percentage of zero entries is at least ``zero_proc_in_line`` becomes a
    candidate interval one bin wide. The candidates are handed to
    ``merge_gaps`` and the per-chromosome results are concatenated and written
    as a tab-separated file without header.

    Args:
        chr_list: Iterable of chromosome names to process.
        cool_file: Path passed to ``cooler.Cooler``.
        output_gap_folder: Prefix the gap file name is appended to by plain
            string concatenation.
        zero_proc_in_line: Minimum percentage of zeros in a row for it to be
            reported as a candidate gap.
        bins_min_gap_between: Multiplied by the bin size and passed to
            ``merge_gaps`` as its second argument.
        bins_min_gap: Multiplied by the bin size and passed to ``merge_gaps``
            as its third argument.

    Returns:
        ``BedReader.chr_data`` on a cache hit, otherwise a dict mapping each
        chromosome name to the value returned by ``merge_gaps`` for it.
    """
    output_gap_file = "".join([
        output_gap_folder,
        "gaps+chr",
        str(chr_list),
        "_proc",
        str(zero_proc_in_line),
        ".bed",
    ])

    # Reuse an earlier result for the same parameters if one is on disk.
    if os.path.exists(output_gap_file):
        logging.info("Found dump for gap file " + output_gap_file)
        reader = BedReader(output_gap_file)
        reader.read_file()
        return reader.chr_data

    cool = cooler.Cooler(cool_file)
    chr_data = {}
    for chrom in chr_list:
        contacts = cool.matrix(balance=False).fetch(chrom)
        bin_size = cool.binsize
        min_between_length = bin_size * bins_min_gap_between
        min_length = bin_size * bins_min_gap

        candidates = []
        for idx, row in enumerate(contacts):
            # Percentage of zero entries in this row of the matrix.
            n_cells = len(row)
            zero_share = (n_cells - np.count_nonzero(row)) / n_cells * 100
            if zero_share >= zero_proc_in_line:
                # Interval is reported one bin to the right of idx * bin_size.
                left = (idx + 1) * bin_size
                candidates.append((chrom, left, left + bin_size))

        merged = merge_gaps(candidates, min_between_length, min_length,
                            output_gap_file)
        chr_data[chrom] = merged
        logging.info("Found " + str(len(merged)) + " gaps on chr " + chrom)

    all_gaps = pd.concat(list(chr_data.values()))
    all_gaps.to_csv(output_gap_file, sep="\t", header=False, index=False)
    return chr_data