Ejemplo n.º 1
0
    def get_predictors(self, contact):
        """Flattened ChIP-seq peak features around a contact.

        For each of the four anchor sides (nearest peaks outside the left and
        right window boundaries, and the first N peaks just inside each
        boundary) the predictors are: plus orientations, minus orientations,
        signal values, and distances from the anchor, concatenated in that
        order.
        """
        assert contact.contact_st < contact.contact_en

        def _peak_features(df, distances):
            # flatten orientation, signal and distance columns into one list
            return (df["plus_orientation"].values.tolist()
                    + df["minus_orientation"].values.tolist()
                    + df["sigVal"].values.tolist()
                    + distances.values.tolist())

        # peaks outside the window
        left_out = self.chipSeq_reader.get_nearest_peaks(
            Interval(contact.chr, contact.contact_st, contact.contact_st),
            N=self.N_closest, side="left")
        left_out_feats = _peak_features(
            left_out, contact.contact_st - left_out["mids"])

        right_out = self.chipSeq_reader.get_nearest_peaks(
            Interval(contact.chr, contact.contact_en, contact.contact_en),
            side="right", N=self.N_closest)
        right_out_feats = _peak_features(
            right_out, right_out["mids"] - contact.contact_en)

        # returns a list of 2 dataframes:
        # [0] first N peaks right of the left interval boundary
        # [1] first N peaks left of the right interval boundary
        window_peaks = self.chipSeq_reader.get_N_peaks_near_interval_boundaries(
            Interval(contact.chr, contact.contact_st, contact.contact_en),
            N=self.N_closest)
        right_in_feats = _peak_features(
            window_peaks[0], window_peaks[0]["mids"] - contact.contact_st)
        left_in_feats = _peak_features(
            window_peaks[1], contact.contact_en - window_peaks[1]["mids"])

        return left_out_feats + right_in_feats + left_in_feats + right_out_feats
Ejemplo n.º 2
0
    def get_predictors(self, contact):
        """Anchor-window signal sums plus nearest-peak features.

        Returns [sig_L, sig_R] for the "CTCF_SmallChip" generator, otherwise
        [sig_L, sig_mid, sig_R] followed by sigVal/distance features of the
        N closest peaks beyond each anchor window.
        """
        #print(self.name)
        assert contact.contact_st < contact.contact_en
        # chromosomes absent from the ChIP-seq data yield all-zero predictors
        if contact.chr not in self.chipSeq_reader.chr_data:
            return [0] * len(self.header)

        intL, intM, intR = self.intevals_around_ancor(contact)
        sig_L = self.chipSeq_reader.get_interval(intL).sigVal.sum()
        sig_R = self.chipSeq_reader.get_interval(intR).sigVal.sum()
        sig_mid = self.chipSeq_reader.get_interval(intM).sigVal.sum()

        half = self.window_size // 2
        left_anchor = contact.contact_st - half
        left_df = self.chipSeq_reader.get_nearest_peaks(
            Interval(contact.chr, left_anchor, left_anchor),
            N=self.N_closest, side="left")
        left_feats = (left_df["sigVal"].values.tolist()
                      + (contact.contact_st - left_df["mids"]).values.tolist())

        right_anchor = contact.contact_en + half
        right_df = self.chipSeq_reader.get_nearest_peaks(
            Interval(contact.chr, right_anchor, right_anchor),
            N=self.N_closest, side="right")
        right_feats = (right_df["sigVal"].values.tolist()
                       + (right_df["mids"] - contact.contact_en).values.tolist())

        if self.name == "CTCF_SmallChip":
            return [sig_L, sig_R]
        return [sig_L, sig_mid, sig_R] + left_feats + right_feats
Ejemplo n.º 3
0
 def get_predictors(self, contact):
     """Distances from the contact start/end to the nearest TSS on each side.

     Returns four one-element distance lists concatenated in the order:
     start-left, start-right, end-left, end-right.
     """
     assert contact.contact_st < contact.contact_en

     def _tss_distance(position, side):
         # distance from the anchor to the single closest TSS on `side`;
         # signed so that it is positive when the TSS is on that side
         peaks = self.TSS_reader.get_nearest_peaks(
             Interval(contact.chr, position, position), N=1, side=side)
         if side == "left":
             return (position - peaks["TSS"]).values.tolist()
         return (peaks["TSS"] - position).values.tolist()

     return (_tss_distance(contact.contact_st, "left")
             + _tss_distance(contact.contact_st, "right")
             + _tss_distance(contact.contact_en, "left")
             + _tss_distance(contact.contact_en, "right"))
Ejemplo n.º 4
0
 def get_predictors(self, contact):
     """Return [1] if the contact anchors carry a convergent CTCF pair, else [0].

     Convention (from the original comment): minus orientation means the CTCF
     site points rightwards, plus means leftwards. A convergent pair is
     therefore a minus-oriented site near the contact start and a
     plus-oriented site near the contact end. A peak only counts when its mid
     lies within two bins of the anchor.
     """
     assert contact.contact_st < contact.contact_en

     def _oriented_peak(position, side, column):
         # orientation signal of the single nearest peak on `side` of the
         # anchor, or 0 when that peak is farther than two bins away
         peaks = self.chipSeq_reader.get_nearest_peaks(
             Interval(contact.chr, position, position), N=1, side=side)
         mid = peaks["mids"].values.tolist()[0]
         if abs(mid - position) <= self.binsize * 2:
             return peaks[column].values.tolist()[0]
         return 0

     start_minus_orientation = [
         _oriented_peak(contact.contact_st, "left", "minus_orientation"),
         _oriented_peak(contact.contact_st, "right", "minus_orientation"),
     ]
     # BUGFIX: the end anchor must be tested for *plus* orientation (CTCF
     # pointing back towards the start). The original read minus_orientation
     # here, contradicting both the variable name below and the stated
     # orientation convention.
     end_plus_orientation = [
         _oriented_peak(contact.contact_en, "left", "plus_orientation"),
         _oriented_peak(contact.contact_en, "right", "plus_orientation"),
     ]
     # 1 if CTCF sites at the start and end of the contact are convergent
     if len(np.nonzero(start_minus_orientation)[0]) != 0 and \
             len(np.nonzero(end_plus_orientation)[0]) != 0:
         return [1]
     return [0]
Ejemplo n.º 5
0
 def get_predictors(self, contact):
     """Concatenated genomic sequence around both contact anchors."""
     pad = self.dist_from_anchor
     left_seq = self.fastaReader.get_interval(
         Interval(contact.chr, contact.contact_st - pad,
                  contact.contact_st + self.binsize + pad))
     right_seq = self.fastaReader.get_interval(
         Interval(contact.chr, contact.contact_en - pad,
                  contact.contact_en + self.binsize + pad))
     # both anchor windows must have the expected fixed length
     assert len(left_seq) == self.dist_of_interval
     assert len(right_seq) == self.dist_of_interval
     return left_seq + right_seq
Ejemplo n.º 6
0
 def intevals_around_ancor(self, contact):
     """Left-anchor, middle, and right-anchor intervals of a contact."""
     half = self.window_size // 2
     # the middle interval must be non-empty
     assert contact.contact_en - contact.contact_st > half
     left = Interval(contact.chr, contact.contact_st - half,
                     contact.contact_st + half)
     middle = Interval(contact.chr, contact.contact_st + half,
                       contact.contact_en - half)
     right = Interval(contact.chr, contact.contact_en - half,
                      contact.contact_en + half)
     return left, middle, right
Ejemplo n.º 7
0
 def get_predictors(self, contact):
     """Window coordinates plus E1 values across the symmetric window."""
     assert contact.contact_st < contact.contact_en
     win_st, win_en, rel_st, rel_en = \
         self.symmetric_window_around_contact(contact)
     e1_values = self.eig_reader.get_E1inInterval(
         Interval(contact.chr, win_st, win_en))["E1"].tolist()
     return [win_st, win_en, rel_st, rel_en] + e1_values
Ejemplo n.º 8
0
    def get_predictors(self, contact):
        """Count CTCF "blocks" inside the contact window and score whether the
        anchors hold a convergent CTCF pair.

        A block is counted whenever a plus-oriented peak is immediately
        followed by a minus-oriented peak in the window's peak list.
        """
        assert contact.contact_st < contact.contact_en

        # peaks strictly inside the window (half a window trimmed at each end)
        window_peaks = self.chipSeq_reader.get_interval(
            Interval(contact.chr,
                     contact.contact_st + self.window_size // 2,
                     contact.contact_en - self.window_size // 2))
        plus_col = window_peaks.columns.get_loc('plus_orientation')
        minus_col = window_peaks.columns.get_loc('minus_orientation')
        n_blocks = 0
        for idx in range(len(window_peaks) - 1):
            plus_here = window_peaks.iloc[idx, plus_col] != 0
            minus_next = window_peaks.iloc[idx + 1, minus_col] != 0
            if plus_here and minus_next:
                n_blocks += 1

        # convergence score: plus signal at the left anchor times minus signal
        # at the right anchor; 0 when either anchor has no peaks
        intL, intM, intR = self.intevals_around_ancor(contact)
        left_peaks = self.chipSeq_reader.get_interval(intL)
        right_peaks = self.chipSeq_reader.get_interval(intR)
        has_convergent_peak = 0
        if len(left_peaks) > 0 and len(right_peaks) > 0:
            has_convergent_peak = (left_peaks.plus_orientation.sum()
                                   * right_peaks.minus_orientation.sum())
        return [n_blocks, has_convergent_peak]
Ejemplo n.º 9
0
def run_timing(func, data, N):
    """Run `func` on N random intervals over `data` and print the wall time."""
    started = datetime.datetime.now()
    data_dict = {"chr1": data}
    for _ in range(N):
        begin = random.randint(0, maxPos)
        length = random.randint(minLen, maxLen * 5)
        func(data_dict, Interval("chr1", begin, begin + length))
    elapsed = datetime.datetime.now() - started
    print(str(func.__name__), " : ", elapsed)
Ejemplo n.º 10
0
def test_hicReader():
    """Smoke-test hicReader: load a genome and a .hic file, query contacts."""
    genome = fastaReader("../input/hg38/test.fa", name="hg38")
    faReader = genome.read_data()
    started = datetime.datetime.now()
    hic = hicReader(fname="../input/4DNFI2TK7L2F.hic", genome=genome, resolution=100000)
    hic = hic.read_data()
    print(hic.norms)
    # single float value or NaN
    result = hic.get_contact(Interval("chr1", 0, 120000000))
    print(result)
    # sparse matrix of the whole chromosome as a pandas dataframe
    result = hic.get_chr_contact("chr1")

    print(datetime.datetime.now() - started)
Ejemplo n.º 11
0
def test_bigWig(inMem):
    """Benchmark bigWigReader load time and per-interval extraction time."""
    print("Loading data")
    t0 = datetime.datetime.now()
    reader = bigWigReader("../input/ENCFF966IHQ.bigWig", name="Test", genome=genome, inMemory=inMem)
    reader = reader.readData()
    print("Time:", datetime.datetime.now() - t0)

    print("Extracting data, inMem=", str(inMem))
    t0 = datetime.datetime.now()
    start, stop, step = 10000000, 101000000, 1000000
    # pull consecutive fixed-size windows and time the whole sweep
    for pos in range(start, stop, step):
        reader.get_interval(Interval("chr1", pos, pos + step))
    print("Time:", datetime.datetime.now() - t0)
    print(str(len(list(range(start, stop, step)))) + " extractions of length " + str(step))
Ejemplo n.º 12
0
def generate_train_dataset(seq_chr_data, fasta_genome, chr_norm_hic_data, out_file, train_test = "train", chrms = "all",
                           target_crop_bp=0, diagonal_offset=2):
    """Build (interval, one-hot sequence, upper-triangular Hi-C target)
    examples and pickle them to `out_file`.

    seq_chr_data maps chromosome name -> dataframe with "start", "end" and
    "train_test" columns; only rows whose "train_test" equals `train_test`
    are used. Note the dataframes are filtered in place in the caller's dict
    (preserved from the original behavior).
    """
    intervals, inputs, targets = [], [], []
    print("train_test", train_test)
    for chrom in seq_chr_data.keys():  # renamed from `chr` (shadowed builtin)
        print(chrom)
        if chrms == "all" or chrom in chrms:
            seq_chr_data[chrom] = seq_chr_data[chrom][seq_chr_data[chrom]["train_test"] == train_test]
            print(seq_chr_data[chrom])
            for seq in list(zip(seq_chr_data[chrom]["start"], seq_chr_data[chrom]["end"])):
                seq_region = fasta_genome.get_interval(Interval(chrom, seq[0], seq[1]))
                encoded_seq = tf.constant(tf.one_hot(seq_region, depth=4))
                inputs.append(encoded_seq)
                binsize = 4096  # TODO create class hic_data and its method binsize
                # compute dimensions
                seq_len_nt = seq[1] - seq[0]
                seq_len_pool = seq_len_nt // binsize
                if target_crop_bp == 0:
                    seq_len_crop = seq_len_pool
                else:
                    seq_len_crop = seq_len_pool - 2 * (target_crop_bp // binsize)

                # unroll the upper triangular of the contact sub-matrix
                # NOTE(review): when target_crop_bp > 0 the cropped size is
                # used for the triu indices but the matrix itself is never
                # cropped, so the "cropped" indices address the top-left of
                # the full matrix — confirm this is intended.
                target = chr_norm_hic_data[chrom][seq[0]//binsize:seq[1]//binsize, seq[0]//binsize:seq[1]//binsize]
                assert target.shape[0] == target.shape[1]
                assert target.shape[0] * binsize == len(seq_region)
                triu_tup = np.triu_indices(seq_len_crop, diagonal_offset)
                targets.append(target[triu_tup])
                intervals.append((chrom, seq[0], seq[1]))

    data = dict()
    print(len(intervals), len(inputs), len(targets))
    data["intervals"] = intervals
    data["inputs"] = inputs
    data["targets"] = targets
    with open(out_file, 'wb') as f:
        pickle.dump(data, f)
Ejemplo n.º 13
0
    def get_predictors(self, contact):
        """CTCF peak features for a contact: nearest peaks outside both window
        boundaries, first N peaks inside both boundaries, the total in-window
        signal, and the counts of plus/minus oriented peaks in the window.
        """
        #print(self.name)
        assert contact.contact_st < contact.contact_en

        def _peak_features(df, distances):
            # orientations + signal values + distances of the N closest
            # peaks, flattened into a single list
            return (df["plus_orientation"].values.tolist()
                    + df["minus_orientation"].values.tolist()
                    + df["sigVal"].values.tolist()
                    + distances.values.tolist())

        # peaks outside of the window
        left_df = self.chipSeq_reader.get_nearest_peaks(
            Interval(contact.chr, contact.contact_st, contact.contact_st),
            N=self.N_closest, side="left")
        Left_peaks = _peak_features(left_df, contact.contact_st - left_df["mids"])

        right_df = self.chipSeq_reader.get_nearest_peaks(
            Interval(contact.chr, contact.contact_en, contact.contact_en),
            N=self.N_closest, side="right")
        Right_peaks = _peak_features(right_df, right_df["mids"] - contact.contact_en)

        # returns a list of 2 dataframes:
        # [0] first N peaks right of the left interval boundary
        # [1] first N peaks left of the right interval boundary
        Window_peaks = self.chipSeq_reader.get_N_peaks_near_interval_boundaries(
            Interval(contact.chr, contact.contact_st, contact.contact_en),
            N=self.N_closest)
        Window_peaks_left = _peak_features(
            Window_peaks[0], Window_peaks[0]["mids"] - contact.contact_st)
        Window_peaks_right = _peak_features(
            Window_peaks[1], contact.contact_en - Window_peaks[1]["mids"])

        # PERF: the original queried the reader four times for the same
        # interval; fetch the in-window peaks once and reuse the dataframe
        window_df = self.chipSeq_reader.get_interval(
            Interval(contact.chr, contact.contact_st, contact.contact_en))
        # if there are no peaks in window, set sigVal and other params to 0
        # TODO add if/else for onlyOrient
        if len(window_df) == 0:
            Window_sigVal = 0
            N_plus_orient_W = 0
            N_minus_orient_W = 0
        else:
            Window_sigVal = window_df.sigVal.sum()
            N_plus_orient_W = len(window_df.query("plus_orientation!='0'"))
            N_minus_orient_W = len(window_df.query("minus_orientation!='0'"))

        return (Left_peaks + Window_peaks_left + Window_peaks_right
                + Right_peaks + [Window_sigVal]
                + [N_plus_orient_W, N_minus_orient_W])
Ejemplo n.º 14
0
def compare_intervalfuncs(tree, df, df2, df3):
    """Cross-check four interval-intersection implementations against an
    interval tree on random queries and print how often each one matches."""
    data1 = {"chr1": df}
    data2 = {"chr1": df2}
    data3 = {"chr1": df3}

    n_tests = 100
    match_counts = [0, 0, 0, 0]
    for _ in range(n_tests):
        begin = random.randint(0, maxPos)
        length = random.randint(minLen, maxLen * 5)
        # begin = 2200
        # length = 2550 - 2200
        interval = Interval("chr1", begin, begin + length)

        res_v1 = intersect_with_interval(data1, interval)
        res_v2 = intersect_with_interval_v2(data2, interval)
        if len(res_v2) > 0:
            # v2 returns row ids; map them back to the original dataframe
            res_v2 = df.loc[np.unique(res_v2.ids.values)]
        res_v3 = intersect_with_interval_v3(data1, interval)
        res_v4 = intersect_with_interval_v4(data3, interval)
        # ground truth from the interval tree
        res_intTree = np.array([q.data for q in tree[begin:begin + length + 1]])

        flags = [compare_results(r, res_intTree)
                 for r in (res_v1, res_v2, res_v3, res_v4)]
        for pos, flag in enumerate(flags):
            match_counts[pos] += flag
        if not flags[2]:
            # dump details whenever v3 disagrees with the tree
            print("---------------")
            print(interval)
            print(res_v3)
            print(res_intTree)
    for count in match_counts:
        print(count, " of ", n_tests)
Ejemplo n.º 15
0
    def plot_matrix(self, validation_data, predicted, out_dir, **kwargs):
        """Render a contact matrix built from the observed (validation) and
        predicted counts, annotate it with the validation parameters, and save
        it as a .matrix.png in `out_dir`.

        validation_data: dataframe with "chr", "contact_st", "contact_en" and
            "contact_count" columns (observed contacts).
        predicted: predicted contact counts aligned row-wise with
            validation_data.
        out_dir: directory for the output image.
        kwargs: pass show_plot=False to suppress the interactive plt.show().
        """
        # copy the observed data and substitute predicted counts so the
        # MatrixPlotter can combine observed vs predicted in one matrix
        predicted_data = validation_data.copy(deep=True)
        predicted_data["contact_count"] = predicted
        mp = MatrixPlotter()
        mp.set_data(validation_data)
        mp.set_control(predicted_data)
        # build the matrix over the full span of the validation contacts
        matrix = mp.getMatrix4plot(
            Interval(validation_data["chr"].iloc[0],
                     min(validation_data["contact_st"].values),
                     max(validation_data["contact_en"].values)))
        #if not self.apply_log:
        matrix = np.log(matrix)  # log scale for display

        tick_pos, tick_labels = mp.get_bins_strart_labels(maxTicksNumber=15)
        plt.xticks(tick_pos, tick_labels, rotation=45)
        plt.imshow(matrix, cmap="OrRd")
        plt.title(self.__represent_validation__())
        # these are matplotlib.patch.Patch properties
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        # place a text box in upper left in axes coords listing the
        # validation parameters (every XML field except the predictor list)
        xml = self.toXMLDict()
        textstr = "\n".join(key + " " + val for key, val in xml.items()
                            if key != "predictors")
        plt.gca().text(0.05,
                       0.95,
                       textstr,
                       transform=plt.gca().transAxes,
                       fontsize=6,
                       verticalalignment='top',
                       bbox=props)

        # also save the raw matrix directly as an image file
        plt.imsave(os.path.join(out_dir, self.__represent_validation__()) +
                   ".matrix.png",
                   matrix,
                   cmap="OrRd",
                   dpi=600)
        if not ("show_plot" in kwargs) or kwargs["show_plot"]:
            plt.show()
        plt.clf()
Ejemplo n.º 16
0
def calc_insulation_around_CTCF(chr, resolution=5000, window_size=20):
    """Write a BED-like file of 1 kb bins whose total CTCF ChIP-seq signal is
    above the 90th percentile for the given chromosome.

    chr: chromosome name (note: parameter shadows the `chr` builtin).
    resolution: Hi-C resolution passed to hicReader.
    window_size: not used anywhere in the visible body — TODO confirm.
    """
    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    faReader = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr])
    faReader = faReader.read_data()

    # load chipSeq1
    bwReader1 = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()

    #load contacts
    # NOTE(review): the Hi-C data is loaded but never used below — confirm
    # whether this is leftover from the timing/correlation experiment
    hic = hicReader("../input/4DNFI2TK7L2F.hic",
                    genome=faReader,
                    resolution=resolution)
    hic = hic.read_data()

    ### run simple check that contact count correlate with ChipSeq signal ###

    ### generate some random samples ####
    # get size of the chr1
    total_length = faReader.get_chr_sizes()[chr]

    # whole-chromosome CTCF signal with NaNs replaced by zeros
    all_CTCF = bwReader1.get_interval(Interval(chr, 0, total_length))
    all_CTCF = np.nan_to_num(all_CTCF)
    binsize = 1000
    bins = np.arange(0, total_length - 1, binsize)
    # total signal per 1 kb bin
    sums = [np.sum(all_CTCF[a:a + binsize]) for a in bins]
    # keep bins above the 90th percentile (the list > scalar comparison works
    # through NumPy's reflected elementwise operator)
    peaks = bins[sums > np.percentile(sums, 90)]
    with open("../out/test.bed", "w") as fout:
        for i in peaks:
            fout.write(chr + "\t" + str(i) + "\t" + str(i + binsize) + "\n")
Ejemplo n.º 17
0
        # for interval in [# Interval("chr10", 59000000, 62000000)]:
        #                  Interval("chr10", 65000000, 70000000),
        #                  Interval("chr20", 37000000, 40000000),
        #                  Interval("chr10", 10000000, 60000000)]:
        #                  # Interval("chr10",36000000,41000000),
        #                  # Interval("chr1", 100000000, 110000000)]:
        # params.interval = interval
        validate_chrs = ["chr19", "chrX"]
        for validateChrName in validate_chrs:
            params.sample_size = len(
                params.contacts_reader.data[validateChrName])
            #print(params.sample_size)
            validation_file_name = "validatingOrient." + str(params) + ".txt"
            params.interval = Interval(
                validateChrName,
                params.contacts_reader.get_min_contact_position(
                    validateChrName),
                params.contacts_reader.get_max_contact_position(
                    validateChrName))
            logging.getLogger(
                __name__).info("Generating validation dataset for interval " +
                               str(params.interval))
            params.out_file = output_folder + params.interval.toFileName(
            ) + validation_file_name
            generate_data(params)
            del (params.out_file)
            del (params.sample_size)

        # for object in [params.contacts_reader]+params.pgs:
        #     lostInterval = Interval("chr1",103842568,104979840)
        #     object.delete_region(lostInterval)
        #     params.interval = Interval("chr1",100000000,109000000)
Ejemplo n.º 18
0
# location of the trained model and its hyper-parameter file
model_dir = '/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/dataset_like_Akita/data/Aalb_2048bp_repeat/train_out_test5_fix_random3/'
params_file = model_dir+'params.json'
# model_file  = model_dir+'model_check.h5'
model_file  = model_dir+'model_best.h5'
with open(params_file) as params_open:
    params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']
# build the network from the saved hyper-parameters and restore its weights
seqnn_model = seqnn.SeqNN(params_model)
### restore model ###
seqnn_model.restore(model_file)
print('successfully loaded')

# read data parameters
data_dir ='/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/dataset_like_Akita/data/Aalb_2048'
data_stats_file = '%s/statistics.json' % data_dir
with open(data_stats_file) as data_stats_open:
    data_stats = json.load(data_stats_open)
seq_length = data_stats['seq_length']
target_length = data_stats['target_length']
hic_diags =  data_stats['diagonal_offset']
# number of bins cropped from each side of the target matrix
target_crop = data_stats['crop_bp'] // data_stats['pool_width']
target_length1 = data_stats['seq_length'] // data_stats['pool_width']
target_length1_cropped = target_length1 - 2*target_crop

# NOTE(review): `chr`, `start` and `end` are not defined in this snippet —
# presumably assigned earlier in the full script (a bare `chr` would
# otherwise resolve to the builtin). Verify before running.
predict_big_region_from_seq(Interval(chr, start, end), binsize=data_stats['pool_width'], seq_len=seq_length, stride = 300*data_stats['pool_width'],
                            fasta_file="/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/input/genomes/AalbS2_V4.fa", seqnn_model = seqnn_model,
                            crop_bp = data_stats['crop_bp'],target_length_cropped=target_length1_cropped, hic_diags = hic_diags,
                            prediction_folder=model_dir,
                            genome_hic_expected_file='/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/input/coolers/Aalb_2048.expected',
                            use_control=True, genome_cool_file = '/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/input/coolers/Aalb_2048.cool')
Ejemplo n.º 19
0
        'yellow'))
print(
    colored(
        'Third parameter is the number of last nucleotide of the interval.',
        'yellow'))
print(colored('Example of input: 2L 10000000 13000000 n', 'yellow'))
choice = 'y'
interval_list = []
# collect intervals from stdin until the user answers something other
# than 'y' in the fourth field
while choice == 'y':  # check that it is y or n? unnecessary right
    print(colored('Type parameters of one interval.', 'yellow'))
    # expected input: "<chr> <start> <end> <y/n>", whitespace-separated
    input_list = list(map(str, input().split()))
    chr = input_list[0]
    start = int(input_list[1])
    end = int(input_list[2])
    choice = input_list[3]
    interval = Interval(chr, start, end)
    interval_list.append(interval)

# create folders
print(
    colored('Type path for directory to be created for sending output to.',
            'yellow'))
print(
    colored('If such folder already exists its contents will be overwritten.',
            'yellow'))
print(
    colored(
        'Example: path to directory named "pleasework". It is shown below.',
        'yellow'))  # add example for Windows too
print(
    colored('/home/konstantin/konstantin/2/nn_anopheles/pleasework/',
Ejemplo n.º 20
0
def simple_test():
    """Sanity check: Hi-C contact counts should correlate with CTCF ChIP-seq
    signal over random fixed-distance region pairs on chr1.

    Loads genome, ChIP-seq and Hi-C data from a hard-coded input folder,
    samples random bin-aligned windows, then prints a Spearman correlation
    and shows a scatter plot. Demo code; returns nothing.
    """
    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    input_folder = "/home/minja/PycharmProjects/3Dpredictor/nn/input/"
    faReader = fastaReader(input_folder + "hg38/hg38.fa",
                           useOnlyChromosomes=["chr1"])
    faReader = faReader.read_data()
    # load chipSeq
    bwReader1 = bigWigReader(input_folder + "ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()

    # load second chipSeq track
    # NOTE(review): bwReader2 is loaded but unused below (it served a
    # commented-out timing comparison in the original) — confirm it is needed
    bwReader2 = bigWigReader(input_folder + "ENCFF966IHQ.bigWig",
                             genome=faReader,
                             inMemory=False)
    bwReader2 = bwReader2.readData()

    # load contacts
    resolution = 5000
    hic = hicReader(input_folder + "4DNFI2TK7L2F.hic",
                    genome=faReader,
                    binsize=resolution,
                    indexedData=True)
    hic = hic.read_data()

    ### generate some random samples ###
    total_length = faReader.get_chr_sizes()["chr1"]

    # distance between interacting regions in this test
    window_size = 20 * resolution

    sample_size = 100000

    # select random points on chr1, snapped to the Hi-C bin grid
    # NOTE(review): np.random.random_integers is deprecated (removed in recent
    # NumPy releases); kept to preserve the exact sampling behaviour —
    # consider np.random.randint(0, total_length - window_size + 1, ...)
    random_points_starts = np.random.random_integers(
        0, total_length - window_size, sample_size)
    random_points_starts = np.array(
        (random_points_starts // resolution) * resolution, dtype=np.uint64)
    random_points_ends = random_points_starts + window_size

    # for each pair (point, point + window_size) collect the contact count
    # and the summed ChIP-seq / sequence signal over the interval
    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer

    logging.info("Starting data generation")
    for start, end in zip(random_points_starts, random_points_ends):
        interval = Interval("chr1", start, end)
        contact = hic.get_contact(interval)
        # FIX (idiom): was `contact == None`; identity comparison is correct
        if contact is None:
            continue
        chipSignal = np.nansum(bwReader1.get_interval(interval))
        if np.isfinite(chipSignal):
            chipSignals.append(chipSignal)
            seqSignal = np.sum(faReader.get_interval(interval))
            seqSignals.append(seqSignal)
            contacts.append(contact)

    logging.info("Time for data generation1: " +
                 str(datetime.datetime.now() - now))
    from scipy.stats import spearmanr
    import matplotlib.pyplot as plt

    print(contacts)
    print(chipSignals)

    print(spearmanr(np.array(contacts), np.array(chipSignals)))
    print(np.all(np.isfinite(contacts)))
    print(np.all(np.isfinite(chipSignals)))

    plt.scatter(contacts, chipSignals)
    plt.show()
Ejemplo n.º 21
0
 def get_predictors(self, contact):
     """Window coordinates plus binned ChIP-seq signal across the window."""
     win_st, win_en, rel_st, rel_en = \
         self.symmetric_window_around_contact(contact)
     binned = self.chipSeq_reader.get_binned_interval(
         Interval(contact.chr, win_st, win_en), binsize=self.binsize)
     return [win_st, win_en, rel_st, rel_en] + binned
Ejemplo n.º 22
0
    fig2_inds.append(test_ind)
# print(fig2_inds)

    target_index = 0
    for test_index in fig2_inds:
        chrm, seq_start, seq_end = sequences_test.iloc[test_index][0:3]
        myseq_str = chrm + ':' + str(seq_start) + '-' + str(seq_end)
        print(' ')
    #     print(myseq_str)
        test_target = test_targets[test_index:test_index + 1, :, :]
        # plot target
        # plt.subplot(122)
        mat = from_upper_triu(test_target[:, :, target_index], target_length1_cropped, hic_diags)
        print(mat)
        #draw matrix before returning from oe to contacts
        im = plt.matshow(mat, fignum=False, cmap='RdBu_r')#, vmax=vmax, vmin=vmin)
        plt.colorbar(im, fraction=.04, pad=0.05)#, ticks=[-2, -1, 0, 1, 2])
        plt.title('target-' + str(hic_num_to_name_dict[target_index]+myseq_str), y=1.15)
        plt.tight_layout()
        plt.savefig(data_dir+"/test/test_before_"+str(chrm)+"_"+str(seq_start)+"_"+str(seq_end)+".png")
        plt.clf()
        #draw_after
        returned_mat = from_oe_to_contacts(seq_hic_obsexp=mat, genome_hic_expected_file='/mnt/scratch/ws/psbelokopytova/202103211631polina/nn_anopheles/input/coolers/Aalb_2048.expected',
                                           interval=Interval('2R', 32083968,33132544), seq_len_pool=target_length1_cropped)
        im = plt.matshow(returned_mat, fignum=False, cmap='OrRd')  # , vmax=vmax, vmin=vmin)
        plt.colorbar(im, fraction=.04, pad=0.05)  # , ticks=[-2, -1, 0, 1, 2])
        plt.title('target-' + str(hic_num_to_name_dict[target_index] + myseq_str), y=1.15)
        plt.tight_layout()
        plt.savefig(data_dir + "/test/test_after_" + str(chrm) + "_" + str(seq_start) + "_" + str(
            seq_end) + ".png")
        plt.clf()
        #     if not write_all_chrms_in_file:
        #         del(params.out_file)
        #     del (params.sample_size)

        # Generate test
        validate_chrs = []
        [
            validate_chrs.append("chr" + chr) for chr in chr_nums
        ]  #,"chr16", "chr17"]#, "chr18"]#, "chr18", "chr19", "chr20"]#,"chr14", "chr15"]
        if write_all_chrms_in_file:
            validation_file_name = "validatingOrient." + str(params) + ".txt"
            params.out_file = output_folder + "_".join(
                validate_chrs) + validation_file_name
        for validateChrName in validate_chrs:
            print("chromosome", validateChrName)
            interval = Interval("chr5", 75000000, 76400000)
            params.sample_size = len(
                params.contacts_reader.data[validateChrName])

            # params.interval = Interval(validateChrName,
            #                            params.contacts_reader.get_min_contact_position(validateChrName),
            #                            params.contacts_reader.get_max_contact_position(validateChrName))
            params.interval = interval
            logging.getLogger(
                __name__).info("Generating validation dataset for interval " +
                               str(params.interval))
            if not write_all_chrms_in_file:
                validation_file_name = "validatingOrient." + str(
                    params) + ".txt"
                params.out_file = output_folder + params.interval.toFileName(
                ) + validation_file_name
Ejemplo n.º 24
0
# read E1 (eigenvector) tracks; the bin size of each track is inferred
# from its file name via fileName2binsize
params.eig_reader.read_files(
    [input_folder + "chr1.Hepat.E1.50k", input_folder + "chr2.Hepat.E1.50k"],
    #input_folder + "chr10.Hepat.E1.50k"],
    #input_folder + "chr6.Hepat.E1.50k"],
    binSizeFromName=fileName2binsize
)  #infer size of E1 bins from file name using this function

e1pg = SmallE1PredictorGenerator(params.eig_reader, params.window_size)

# predictor generators used for dataset generation
params.pgs = [e1pg, OrientCtcfpg, NotOrientCTCFpg,
              RNAseqPG]  #,onlyOrientCtcfpg]

#Generate train
# training interval spans all contacts on chr1
trainChrName = "chr1"
params.interval = Interval(
    trainChrName,
    params.contacts_reader.get_min_contact_position(trainChrName),
    params.contacts_reader.get_max_contact_position(trainChrName))
params.out_file = output_folder + training_file_name
generate_data(params, saveFileDescription=True)

#Generate test
for interval in [  # Interval("chr10", 59000000, 62000000)]:
        Interval("chr2", 47900000, 53900000),
        Interval("chr2", 85000000, 92500000),
        Interval("chr2", 36000000, 41000000)
]:
    # Interval("chr1", 100000000, 110000000)]:
    logging.getLogger(__name__).info(
        "Generating validation dataset for interval " + str(interval))
    params.interval = interval
    params.out_file = output_folder + params.interval.toFileName(
Ejemplo n.º 25
0
    #     if not write_all_chrms_in_file:
    #         del(params.out_file)
    #     del (params.sample_size)

    # Generate test
    validate_chrs = []  #no need to set chr for validation here!!!!
    # Build "chrN" names from the configured chromosome numbers.
    # NOTE(review): list comprehension used purely for its side effect.
    [
        validate_chrs.append("chr" + chr) for chr in chr_nums
    ]  #,"chr16", "chr17"]#, "chr18"]#, "chr18", "chr19", "chr20"]#,"chr14", "chr15"]
    if write_all_chrms_in_file:
        validation_file_name = "validatingOrient." + str(params) + ".txt"
        params.out_file = output_folder + "_".join(
            validate_chrs) + validation_file_name
    for validateChrName in validate_chrs:
        print("chromosome", validateChrName)
        # NOTE(review): `chromosome`, `start` and `end` must be defined
        # elsewhere; the interval does not depend on validateChrName, so the
        # same region is validated on every iteration — confirm intended.
        interval = Interval(chromosome, start, end)
        #params.sample_size = len(params.contacts_reader.data[validateChrName])

        # params.interval = Interval(validateChrName,
        #                            params.contacts_reader.get_min_contact_position(validateChrName),
        #                            params.contacts_reader.get_max_contact_position(validateChrName))
        params.interval = interval
        logging.getLogger(
            __name__).info("Generating validation dataset for interval " +
                           str(params.interval))
        # Per-interval output file, prefixed with the cell type.
        if not write_all_chrms_in_file:
            validation_file_name = "validatingOrient." + str(params) + ".txt"
            params.out_file = output_folder + "/" + cell_type + params.interval.toFileName(
            ) + validation_file_name
        generate_data(params)
        # NOTE(review): the body of this condition is truncated in the snippet.
        if not write_all_chrms_in_file:
Ejemplo n.º 26
0
def predict_big_region_from_seq(interval_list,
                                binsize,
                                seq_len,
                                stride,
                                fasta_file,
                                seqnn_model,
                                crop_bp,
                                target_length_cropped,
                                hic_diags,
                                prediction_folder,
                                returned_to_contacts=True,
                                save_as_hic=True,
                                use_control=False,
                                minimal_length=3000000,
                                **kwargs):
    """Predict big regions by stacking predicted small region units.

    Each interval in ``interval_list`` is split into chunks of at most
    ``minimal_length`` bp. Every chunk is covered by overlapping windows of
    ``seq_len`` bp (shifted by ``stride``), each predicted with
    ``seqnn_model`` and averaged into one matrix. Optionally the
    observed/expected prediction is converted back to contact counts and
    written in .hic format.

    Parameters
    ----------
    interval_list : list of Interval
        Regions to predict; each must be at least ``seq_len`` long.
    binsize : int
        Hi-C bin size in bp.
    seq_len : int
        Length (bp) of one predicted unit fed to the model.
    stride : int
        Shift (bp) between consecutive predicted units; assumed to be a
        multiple of ``binsize``.
    fasta_file : str
        Path to the genome fasta file.
    seqnn_model : object
        Trained model exposing ``.model.predict`` (akita/basenji style).
    crop_bp : int
        Number of bp cropped from each side of the model output.
    target_length_cropped : int
        Side length (bins) of the cropped predicted matrix.
    hic_diags : int
        Number of diagonals removed from the target matrix.
    prediction_folder : str
        Output directory for pickles, .hic output and predictions.bed.
    returned_to_contacts : bool
        Convert obs/exp values back to contact counts; requires the
        ``genome_hic_expected_file`` kwarg.
    save_as_hic : bool
        Save the prediction in .hic format; requires the
        ``genome_cool_file`` kwarg.
    use_control : bool
        Passed through to plot_juicebox_from_predicted_array.
    minimal_length : int
        Largest chunk (bp) processed in one predictor call; choose it so a
        chunk fits your time/memory budget.

    Raises
    ------
    ValueError
        If ``returned_to_contacts`` is set but no expected file is given.
    """

    # Contents of the future chrom.sizes file: chrom name -> max end seen.
    chrsizes_dict = {}

    def predictor(dictionary, subinterval):
        """Predict one chunk (``subinterval``) and write its outputs.

        Fix vs. original: the original body read the closure variable
        ``interval`` (the outer loop variable) instead of ``subinterval``,
        so chunked calls re-predicted the whole interval.
        """
        # Shape of the matrix covering the whole chunk, in bins.
        n_end = math.ceil(subinterval.end / binsize)
        n_start = math.floor(subinterval.start / binsize)
        n = n_end - n_start
        len_predicted_mat = (seq_len - 2 * crop_bp) // binsize
        m = n
        print("Stride is", stride, ",", stride // binsize, "bins")
        mat_stride = stride // binsize
        # Number of overlapping units needed to tile the chunk.
        k = (n - (len_predicted_mat - mat_stride)) // mat_stride
        print(datetime.datetime.now())
        print("...allocating array...", k, m, n)
        # One layer per unit; untouched cells stay NaN and are ignored
        # by nanmean below.
        arr = np.full((k, m, n), np.nan)
        print(datetime.datetime.now(), "DONE")
        start = subinterval.start
        arr_stride = crop_bp // binsize
        fasta_open = pysam.Fastafile(fasta_file)
        try:
            # predict k units
            print("going to predict", k, "matrix units")
            for k_matrix in range(0, k):
                if k_matrix % 5 == 0:
                    print("predict", k_matrix, "matrix unit")
                chrm, seq_start, seq_end = subinterval.chr, int(start), int(
                    start + seq_len)
                seq = fasta_open.fetch(chrm, seq_start, seq_end).upper()
                seq_1hot = dna_io.dna_1hot(seq)
                test_pred_from_seq = seqnn_model.model.predict(
                    np.expand_dims(seq_1hot, 0))

                # Rebuild the symmetric matrix from the flattened upper
                # triangle predicted by the model.
                predicted_mat = from_upper_triu(test_pred_from_seq[:, :, 0],
                                                target_length_cropped,
                                                hic_diags)
                with open(
                        prediction_folder + "prred_mat" + str(seq_start) +
                        "-" + str(seq_end) + ".pickle", 'wb') as f:
                    pickle.dump(predicted_mat, f)
                assert predicted_mat.shape[0] == predicted_mat.shape[1]
                # Place the unit at its offset inside the big array.
                for i in range(len(predicted_mat)):
                    arr[k_matrix][i + arr_stride][
                        0 + arr_stride:len(predicted_mat) +
                        arr_stride] = predicted_mat[i]
                arr_stride += stride // binsize
                start += stride
        finally:
            # The original leaked the fasta handle on every call.
            fasta_open.close()

        # Average over all overlapping unit predictions (NaNs ignored).
        mat = np.nanmean(arr, axis=0)

        # Return predicted values from obs/exp to contact counts.
        if returned_to_contacts:
            if 'genome_hic_expected_file' not in kwargs:
                # Fail fast with a clear message; the original printed a
                # warning and then crashed with KeyError below anyway.
                raise ValueError("Please add path to expected file "
                                 "(genome_hic_expected_file kwarg)")
            mat = from_oe_to_contacts(
                seq_hic_obsexp=mat,
                genome_hic_expected_file=kwargs['genome_hic_expected_file'],
                interval=subinterval,
                seq_len_pool=n)
        if save_as_hic:
            print("going to save in hic format")
            plot_juicebox_from_predicted_array(
                mat=mat,
                binsize=binsize,
                interval=subinterval,
                out_dir=prediction_folder,
                diagonal_offset=hic_diags,
                use_control=use_control,
                genome_cool_file=kwargs["genome_cool_file"],
                ghc=cooler.Cooler(kwargs['genome_cool_file']),
                chr_dict=dictionary)

        # Record the predicted region in a bed-like file.
        # NOTE(review): opened in append mode so every chunk is kept; the
        # original used "w" (only the last chunk survived) and never
        # closed the handle.
        with open(prediction_folder + "predictions.bed", "a") as bed_file:
            bed_file.write(
                str(0) + "\t" + subinterval.chr + "\t" +
                str(subinterval.start) + "\t" + str(subinterval.end) + "\n")

    # Dispatch each interval, splitting it into chunks if it is too long.
    for interval in interval_list:
        # Track the largest end coordinate per chromosome for chrom.sizes.
        if chrsizes_dict.setdefault(str(interval.chr)) is None:
            chrsizes_dict[str(interval.chr)] = interval.end
        else:
            if chrsizes_dict[str(interval.chr)] < interval.end:
                chrsizes_dict[str(interval.chr)] = interval.end

        assert minimal_length >= seq_len
        assert interval.len >= seq_len
        if interval.len <= minimal_length:
            predictor(chrsizes_dict, interval)
        else:
            if interval.len // minimal_length == 1:
                predictor(chrsizes_dict, interval)
            else:  # interval.len // minimal_length > 1
                if interval.len % minimal_length == 0:
                    # Exact multiple: equal chunks of minimal_length.
                    i = 0  # how many bps we've predicted
                    while i != interval.len:
                        predictor(chrsizes_dict,
                                  subinterval=Interval(
                                      interval.chr, interval.start + i,
                                      interval.start + i + minimal_length))
                        i += minimal_length
                else:
                    # Not a multiple: full chunks first, then one longer
                    # residual chunk covering the remainder.
                    residual_interval = Interval(
                        interval.chr, interval.end -
                        (minimal_length + interval.len % minimal_length),
                        interval.end)
                    without_residue_interval = Interval(
                        interval.chr, interval.start, interval.end -
                        (minimal_length + interval.len % minimal_length))
                    i = 0  # how many bps we've predicted
                    while i != without_residue_interval.len:
                        predictor(chrsizes_dict,
                                  subinterval=Interval(
                                      interval.chr, interval.start + i,
                                      interval.start + i + minimal_length))
                        i += minimal_length
                    predictor(chrsizes_dict, subinterval=residual_interval)
        ]
        # TSSPG] + chipPG  # +cagePG+metPG+chipPG

        # Generate train
        # Build "chrN" names from the configured chromosome numbers.
        # NOTE(review): list comprehension used purely for its side effect.
        train_chrs = []
        [train_chrs.append("chr" + chr) for chr in chr_nums]
        # When all chromosomes go into one file, name it after the whole list.
        if write_all_chrms_in_file:
            train_file_name = "training.RandOn" + str(params)
            params.out_file = output_folder + "_".join(
                train_chrs) + train_file_name
        for trainChrName in train_chrs:
            training_file_name = "training.RandOn" + trainChrName + str(
                params) + ".txt"
            # set it if you want to use all contacts of chromosome for training:
            # params.sample_size = len(params.contacts_reader.data[trainChrName])

            # if you want to use only an interval of chromosome, set its coordinates:
            params.interval = Interval(
                trainChrName,
                params.contacts_reader.get_min_contact_position(trainChrName),
                params.contacts_reader.get_max_contact_position(trainChrName))

            # Per-chromosome output file when not writing everything together.
            if not write_all_chrms_in_file:
                train_file_name = "training.RandOn" + str(params) + ".txt"
                params.out_file = output_folder + params.interval.toFileName(
                ) + train_file_name
            generate_data(params, saveFileDescription=True)
            if not write_all_chrms_in_file:
                del (params.out_file)
            # NOTE(review): sample_size is only set in the commented-out line
            # above, so this delete may raise AttributeError — confirm.
            del (params.sample_size)
Ejemplo n.º 28
0
        #     if not write_all_chrms_in_file:
        #         del(params.out_file)
        #     del (params.sample_size)

        # Generate test
        # Build "chrN" names from the configured chromosome numbers.
        # NOTE(review): list comprehension used purely for its side effect.
        validate_chrs = []
        [
            validate_chrs.append("chr" + chr) for chr in chr_nums
        ]  #,"chr16", "chr17"]#, "chr18"]#, "chr18", "chr19", "chr20"]#,"chr14", "chr15"]
        # When all chromosomes go into one file, name it after the whole list.
        if write_all_chrms_in_file:
            validation_file_name = "validatingOrient." + str(params) + ".txt"
            params.out_file = output_folder + "_".join(
                validate_chrs) + validation_file_name
        for validateChrName in validate_chrs:
            print("chromosome", validateChrName)
            # NOTE(review): the interval is hard-coded to chr7 and does not
            # depend on validateChrName — every iteration validates the same
            # region; confirm this is intended.
            interval = Interval("chr7", 86600000, 87000000)
            # params.sample_size = len(params.contacts_reader.data[validateChrName])

            # params.interval = Interval(validateChrName,
            #                            params.contacts_reader.get_min_contact_position(validateChrName),
            #                            params.contacts_reader.get_max_contact_position(validateChrName))
            params.interval = interval
            logging.getLogger(
                __name__).info("Generating validation dataset for interval " +
                               str(params.interval))
            # Per-interval output file when not writing everything together.
            if not write_all_chrms_in_file:
                validation_file_name = "validatingOrient." + str(
                    params) + ".txt"
                params.out_file = output_folder + params.interval.toFileName(
                ) + validation_file_name
            generate_data(params)
Ejemplo n.º 29
0
def calc_corr(chr, resolution=5000, window_size=20):
    """Correlate Hi-C contact counts with CTCF ChipSeq and sequence signal.

    Samples random windows on chromosome ``chr``, reads the Hi-C contact
    for each window, sums the CTCF bigWig signal and the genome sequence
    signal over it, and returns Spearman/Pearson correlations.

    Parameters
    ----------
    chr : str
        Chromosome name, e.g. "chr1".
    resolution : int
        Hi-C resolution in bp.
    window_size : int
        Window length in bp; must be at least 5 * resolution (asserted in
        the loop below).
        NOTE(review): the default (20) fails that assertion with the
        default resolution — callers apparently always pass explicit
        values; confirm.

    Returns
    -------
    str
        Tab-separated values: spearman(contacts, chip),
        pearson(contacts, chip), spearman(contacts, seq),
        pearson(contacts, seq).
    """
    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    faReader = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr])
    faReader = faReader.read_data()

    # load CTCF ChipSeq coverage
    bwReader1 = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()

    # load contacts
    hic = hicReader("../input/4DNFI2TK7L2F.hic",
                    genome=faReader,
                    resolution=resolution)
    hic = hic.read_data()

    ### run simple check that contact count correlate with ChipSeq signal ###

    # number of random windows to sample
    sample_size = 5000

    # chromosome length, used to bound the random window starts
    total_length = faReader.get_chr_sizes()[chr]

    # Select random window starts snapped to the resolution grid.
    # np.random.randint's upper bound is exclusive, hence the +1 to match
    # the inclusive bound of the removed np.random.random_integers
    # (deprecated since NumPy 1.11, removed in 1.25).
    random_points_starts = np.random.randint(
        0, total_length - window_size + 1, sample_size)
    random_points_starts = np.array(
        (random_points_starts // resolution) * resolution, dtype=np.uint64)
    random_points_ends = random_points_starts + window_size

    # for each selected point get the contact between the point and
    # (point + window_size)
    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer

    logging.info("Starting data generation")
    for start, end in zip(random_points_starts, random_points_ends):
        interval = Interval(chr, start, end)
        assert window_size >= 5 * resolution
        # window starts one bin after the anchor — presumably to exclude
        # the anchor bin from the chip signal; confirm.
        window = Interval(chr, start + resolution, end)
        contact = hic.get_contact(interval)
        if contact is None:
            # missing contact is treated as zero
            contact = 0
        if np.isfinite(contact):
            chipSignal = bwReader1.get_interval(window)
            chipSignal = np.nan_to_num(chipSignal)
            chipSignal = np.sum(chipSignal)
            if np.isfinite(chipSignal):
                chipSignals.append(chipSignal)
                seqSignal = np.sum(faReader.get_interval(interval))
                seqSignals.append(seqSignal)
                contacts.append(contact)

    logging.info("Time for data generation: " +
                 str(datetime.datetime.now() - now))
    from scipy.stats import spearmanr, pearsonr

    res = []
    res.append(spearmanr(np.array(contacts), np.array(chipSignals))[0])
    res.append(pearsonr(np.array(contacts), np.array(chipSignals))[0])
    res.append(spearmanr(np.array(contacts), np.array(seqSignals))[0])
    res.append(pearsonr(np.array(contacts), np.array(seqSignals))[0])

    return ("\t".join(list(map(str, res))))
Ejemplo n.º 30
0
        # Read contacts data
        params.contacts_reader = ContactsReader()
        contacts_files = [input_folder + "19.contacts.gz"]
        coeff_fname = input_folder + "coefficient.NPC.5000.txt"
        # set path to the coefficient file and to contacts files
        # contacts file format: bin_start--bin_end--contact_count
        params.contacts_reader.read_files(
            contacts_files,
            coeff_fname,
            max_cpus=params.max_cpus,
            fill_empty_contacts=fill_empty_contacts,
            maxdist=params.maxdist)
        # Genome sequence for chr19; chromosome names are stripped of the
        # "chr" prefix to match the contacts data.
        params.fastaReader = fastaReader(
            input_folder + "chr19.fa", chrm_names_renamer=rm_chr_from_chrName)
        params.fastaReader.read_data()
        print(params.fastaReader.data)
        # Sequence-based predictor generator, binned at the contacts bin size.
        SequencePG = SequencePredictorGenerator(
            fastaReader=params.fastaReader,
            binsize=params.contacts_reader.binsize)
        params.pgs = [SequencePG]
        params.out_file = output_folder + "NPC_5000"

        # Sample only 100 contacts from the full chr19 contact range.
        params.sample_size = 100
        params.interval = Interval(
            "19", params.contacts_reader.get_min_contact_position("19"),
            params.contacts_reader.get_max_contact_position("19"))
        logging.getLogger(__name__).info("Generating dataset for interval " +
                                         str(params.interval))
        generate_data(params)