def get_predictors(self,contact): assert contact.contact_st < contact.contact_en #Peaks outside the window Left_start_peaks = self.chipSeq_reader.get_nearest_peaks( Interval(contact.chr, contact.contact_st, contact.contact_st), N=self.N_closest, side="left") Left_start_peaks = Left_start_peaks["plus_orientation"].values.tolist() + \ Left_start_peaks["minus_orientation"].values.tolist() + Left_start_peaks["sigVal"].values.tolist() + \ (contact.contact_st - Left_start_peaks["mids"]).values.tolist() Right_end_peaks = self.chipSeq_reader.get_nearest_peaks( Interval(contact.chr, contact.contact_en, contact.contact_en), side="right", N=self.N_closest) Right_end_peaks = Right_end_peaks["plus_orientation"].values.tolist() + \ Right_end_peaks["minus_orientation"].values.tolist() + Right_end_peaks[ "sigVal"].values.tolist() + \ (Right_end_peaks["mids"] - contact.contact_en).values.tolist() # Next statmetn will return list of 2 dataframes # 1st DF with first N peaks on the right side of left interval boundary # 2nd DF with first N peaks on the left side of right interval boundary Window_peaks = self.chipSeq_reader.get_N_peaks_near_interval_boundaries( Interval(contact.chr, contact.contact_st, contact.contact_en), N=self.N_closest) Right_start_peaks = Window_peaks[0]["plus_orientation"].values.tolist() + Window_peaks[0][ "minus_orientation"].values.tolist() + \ Window_peaks[0]["sigVal"].values.tolist() + ( Window_peaks[0]["mids"] - contact.contact_st).values.tolist() Left_end_peaks = Window_peaks[1]["plus_orientation"].values.tolist() + Window_peaks[1][ "minus_orientation"].values.tolist() + \ Window_peaks[1]["sigVal"].values.tolist() + ( contact.contact_en - Window_peaks[1]["mids"]).values.tolist() predictors = Left_start_peaks + Right_start_peaks + Left_end_peaks + Right_end_peaks return predictors
def get_predictors(self, contact):
    """ChIP-seq signal sums around and between the contact anchors, plus the
    N_closest peaks flanking the window.

    Returns all-zero predictors when the contact chromosome is absent from
    the ChIP-seq data. For the "CTCF_SmallChip" generator only the two
    anchor sums [sig_L, sig_R] are returned; otherwise the full
    [sig_L, sig_mid, sig_R] + flanking-peak sigVals and distances.
    """
    #print(self.name)
    assert contact.contact_st < contact.contact_en
    if contact.chr not in set(self.chipSeq_reader.chr_data.keys()):
        # no ChIP-seq data for this chromosome: degenerate to zeros,
        # one per declared header column
        return [0] * len(self.header)
    else:
        intL, intM, intR = self.intevals_around_ancor(contact)
        sig_L = self.chipSeq_reader.get_interval(intL).sigVal.sum()
        sig_R = self.chipSeq_reader.get_interval(intR).sigVal.sum()
        sig_mid = self.chipSeq_reader.get_interval(intM).sigVal.sum()
        # N closest peaks beyond the half-window on each side.
        # NOTE(review): the searches start half a window away from the
        # anchors, but the distances below are measured from the anchors
        # themselves — confirm this offset is intended.
        Left_top = self.chipSeq_reader.get_nearest_peaks(Interval(
            contact.chr, contact.contact_st - (self.window_size // 2),
            contact.contact_st - (self.window_size // 2)),
            N=self.N_closest, side="left")
        Left_top = Left_top["sigVal"].values.tolist() + \
            (contact.contact_st - Left_top["mids"]).values.tolist()
        Right_top = self.chipSeq_reader.get_nearest_peaks(Interval(
            contact.chr, contact.contact_en + (self.window_size // 2),
            contact.contact_en + (self.window_size // 2)),
            N=self.N_closest, side="right")
        Right_top = Right_top["sigVal"].values.tolist() + \
            (Right_top["mids"] - contact.contact_en).values.tolist()
        if self.name == "CTCF_SmallChip":
            return [sig_L, sig_R]
        else:
            return [sig_L, sig_mid, sig_R] + Left_top + Right_top
def get_predictors(self, contact):
    """Distances from each contact anchor to its nearest TSS on each side.

    Returns a flat list of four distances, in order: left-of-start,
    right-of-start, left-of-end, right-of-end. Each distance is measured
    towards the anchor, so all values are non-negative for well-formed data.
    """
    assert contact.contact_st < contact.contact_en
    distances = []
    # same lookup for both anchors, nearest TSS on either side of each
    for anchor in (contact.contact_st, contact.contact_en):
        point = Interval(contact.chr, anchor, anchor)
        for side in ("left", "right"):
            nearest = self.TSS_reader.get_nearest_peaks(point, N=1, side=side)
            if side == "left":
                dist = (anchor - nearest["TSS"]).values.tolist()
            else:
                dist = (nearest["TSS"] - anchor).values.tolist()
            distances += dist
    return distances
def get_predictors(self, contact):
    """Binary convergent-CTCF predictor for one contact.

    Returns [1] when the start anchor has a minus-oriented CTCF site nearby
    (within 2 bins of the anchor) and the end anchor has a plus-oriented
    site nearby, otherwise [0].

    Per the project's convention stated in the original code: minus
    orientation means the CTCF motif points right (towards the contact end),
    plus orientation means it points left. A convergent pair therefore needs
    minus at the start anchor and plus at the end anchor.
    """
    assert contact.contact_st < contact.contact_en

    def _oriented_peak(anchor, side, orientation_column):
        # Orientation value of the single nearest peak on the given side of
        # the anchor, or 0 if that peak lies further than 2 bins away.
        peaks = self.chipSeq_reader.get_nearest_peaks(
            Interval(contact.chr, anchor, anchor), N=1, side=side)
        if abs(peaks["mids"].values.tolist()[0] - anchor) <= self.binsize * 2:
            return peaks[orientation_column].values.tolist()[0]
        return 0

    start_minus_orientation = [
        _oriented_peak(contact.contact_st, "left", "minus_orientation"),
        _oriented_peak(contact.contact_st, "right", "minus_orientation"),
    ]
    # BUG FIX: the end anchor previously read "minus_orientation" as well,
    # contradicting both the variable name below and the convergence rule
    # documented in the original comment; the end anchor must be checked
    # for plus-oriented (left-pointing) sites.
    end_plus_orientation = [
        _oriented_peak(contact.contact_en, "left", "plus_orientation"),
        _oriented_peak(contact.contact_en, "right", "plus_orientation"),
    ]
    # 1 if CTCF sites in the end and start of contact have convergent
    # orientation, else 0
    if len(np.nonzero(start_minus_orientation)[0]) != 0 and \
            len(np.nonzero(end_plus_orientation)[0]) != 0:
        return [1]
    return [0]
def get_predictors(self, contact):
    """Concatenated genomic sequence around both contact anchors.

    Each anchor window spans [anchor - dist_from_anchor,
    anchor + binsize + dist_from_anchor); both extracted sequences must have
    the expected fixed length `dist_of_interval`.
    """
    sequences = []
    for anchor in (contact.contact_st, contact.contact_en):
        window = Interval(contact.chr,
                          anchor - self.dist_from_anchor,
                          anchor + self.binsize + self.dist_from_anchor)
        seq = self.fastaReader.get_interval(window)
        # guard against truncated extraction at chromosome edges
        assert len(seq) == self.dist_of_interval
        sequences.append(seq)
    return sequences[0] + sequences[1]
def intevals_around_ancor(self, contact):
    """Split a contact into three windows: around the start anchor, between
    the anchors, and around the end anchor.

    Returns a tuple (left, middle, right) of Intervals, each anchor window
    spanning half the configured window size to either side of its anchor.
    The contact must be longer than half the window, otherwise the middle
    interval would be inverted.
    """
    half_window = self.window_size // 2
    start, end = contact.contact_st, contact.contact_en
    assert end - start > half_window
    left = Interval(contact.chr, start - half_window, start + half_window)
    middle = Interval(contact.chr, start + half_window, end - half_window)
    right = Interval(contact.chr, end - half_window, end + half_window)
    return left, middle, right
def get_predictors(self, contact):
    """Window coordinates followed by the E1 eigenvector profile over the
    symmetric window around the contact.

    Returns [window_start, window_end, relative_start, relative_end] + the
    list of E1 values inside the window.
    """
    assert contact.contact_st < contact.contact_en
    win_start, win_end, rel_start, rel_end = \
        self.symmetric_window_around_contact(contact)
    e1_profile = self.eig_reader.get_E1inInterval(
        Interval(contact.chr, win_start, win_end))["E1"].tolist()
    return [win_start, win_end, rel_start, rel_end] + e1_profile
def get_predictors(self, contact):
    """Count CTCF 'blocks' inside the contact window and test for a
    convergent anchor pair.

    A block boundary is an adjacent peak pair where the first peak carries a
    nonzero plus orientation and the next a nonzero minus orientation
    (divergent sites, e.g. --> <--). The second predictor is the product of
    the summed plus orientation at the left anchor and summed minus
    orientation at the right anchor, or 0 if either anchor has no peaks.

    Returns [n_blocks, convergence_score].
    """
    assert contact.contact_st < contact.contact_en
    half = self.window_size // 2
    window = Interval(contact.chr,
                      contact.contact_st + half,
                      contact.contact_en - half)
    peaks_in_window = self.chipSeq_reader.get_interval(window)
    plus_col = peaks_in_window["plus_orientation"].values
    minus_col = peaks_in_window["minus_orientation"].values
    # pair each peak with its successor and count divergent junctions
    n_blocks = sum(
        1 for cur_plus, nxt_minus in zip(plus_col[:-1], minus_col[1:])
        if cur_plus != 0 and nxt_minus != 0)
    anchor_left, _, anchor_right = self.intevals_around_ancor(contact)
    left_peaks = self.chipSeq_reader.get_interval(anchor_left)
    right_peaks = self.chipSeq_reader.get_interval(anchor_right)
    convergence_score = 0
    if len(left_peaks) > 0 and len(right_peaks) > 0:
        convergence_score = left_peaks.plus_orientation.sum() * \
            right_peaks.minus_orientation.sum()
    return [n_blocks, convergence_score]
def run_timing(func, data, N):
    """Run N random-interval queries against *func* and print the total
    elapsed wall-clock time.

    func : callable taking ({chr: data}, Interval); its result is discarded.
    data : per-chromosome payload, exposed to *func* under key "chr1".
    N    : number of random queries to issue.

    Relies on module-level maxPos / minLen / maxLen bounds for the random
    query positions and lengths.
    """
    started_at = datetime.datetime.now()
    chr_data = {"chr1": data}
    for _ in range(N):
        query_start = random.randint(0, maxPos)
        query_len = random.randint(minLen, maxLen * 5)
        func(chr_data, Interval("chr1", query_start, query_start + query_len))
    elapsed = datetime.datetime.now() - started_at
    print(str(func.__name__), " : ", elapsed)
def test_hicReader():
    """Smoke-test hicReader: load a .hic file, query one contact and one
    whole-chromosome matrix, printing results and timing to stdout."""
    genome = fastaReader("../input/hg38/test.fa", name="hg38")
    faReader = genome.read_data()
    now = datetime.datetime.now()
    hic = hicReader(fname="../input/4DNFI2TK7L2F.hic", genome=genome,
                    resolution=100000)
    hic = hic.read_data()
    print(hic.norms)
    result = hic.get_contact(Interval("chr1", 0, 120000000))  # single float value or NaN
    print(result)
    result = hic.get_chr_contact("chr1")  # returns sparse matrix of the whole chrm as pandas dataframe
    print(datetime.datetime.now() - now)
def test_bigWig(inMem):
    """Benchmark bigWigReader extraction speed.

    inMem: forwarded to bigWigReader's inMemory flag so the in-memory and
    on-disk paths can be compared; prints load time and the time to extract
    a series of fixed-size windows.

    NOTE(review): relies on a module-level `genome` object — confirm it is
    defined before this test runs.
    """
    print("Loading data")
    now = datetime.datetime.now()
    bwReader = bigWigReader("../input/ENCFF966IHQ.bigWig", name="Test",
                            genome=genome, inMemory=inMem)
    bwReader = bwReader.readData()
    print("Time:", datetime.datetime.now() - now)
    print("Extracting data, inMem=", str(inMem))
    now = datetime.datetime.now()
    start = 10000000
    stop = 101000000
    step = 1000000
    # walk the region in fixed-size windows, timing all extractions together
    for i in range(start, stop, step):
        res = bwReader.get_interval(Interval("chr1", i, i + step))
    print("Time:", datetime.datetime.now() - now)
    print(str(len(list(range(start, stop, step)))) + " extractions of length " + str(step))
def generate_train_dataset(seq_chr_data, fasta_genome, chr_norm_hic_data,
                           out_file, train_test="train", chrms="all",
                           target_crop_bp=0, diagonal_offset=2):
    """Build and pickle an Akita-style training dataset.

    For every sequence region of the requested split ("train"/"test") this
    one-hot-encodes the genomic sequence and extracts the matching Hi-C
    submatrix, unrolled as the upper triangle above `diagonal_offset`.

    seq_chr_data      : dict chrom -> DataFrame with start/end/train_test columns
    fasta_genome      : genome reader with get_interval()
    chr_norm_hic_data : dict chrom -> normalized Hi-C matrix (bin-indexed)
    out_file          : path for the pickled {"intervals","inputs","targets"} dict
    target_crop_bp    : bp cropped from each side when sizing the target
    diagonal_offset   : diagonals excluded from the unrolled target
    """
    intervals, inputs, targets = [], [], []
    print("train_test", train_test)
    binsize = 4096  # TODO create class hic_data and its method binsize
    for chrom in seq_chr_data.keys():
        print(chrom)
        if chrms == "all" or chrom in chrms:
            # keep only rows belonging to the requested split
            seq_chr_data[chrom] = \
                seq_chr_data[chrom][seq_chr_data[chrom]["train_test"] == train_test]
            print(seq_chr_data[chrom])
            for seq in list(zip(seq_chr_data[chrom]["start"],
                                seq_chr_data[chrom]["end"])):
                seq_region = fasta_genome.get_interval(Interval(chrom, seq[0], seq[1]))
                encoded_seq = tf.constant(tf.one_hot(seq_region, depth=4))
                inputs.append(encoded_seq)
                # compute target dimensions in bins
                seq_len_nt = seq[1] - seq[0]
                seq_len_pool = seq_len_nt // binsize
                if target_crop_bp == 0:
                    seq_len_crop = seq_len_pool
                else:
                    crop_start = target_crop_bp // binsize
                    seq_len_crop = seq_len_pool - 2 * crop_start
                    # NOTE(review): the target matrix below is sliced from the
                    # full (uncropped) region, yet the triu indices are sized
                    # for the cropped matrix — the selected entries are NOT
                    # the cropped region's upper triangle. Confirm intent.
                target = chr_norm_hic_data[chrom][
                    seq[0] // binsize:seq[1] // binsize,
                    seq[0] // binsize:seq[1] // binsize]
                assert target.shape[0] == target.shape[1]
                assert target.shape[0] * binsize == len(seq_region)
                # unroll the upper triangle above the diagonal offset
                triu_tup = np.triu_indices(seq_len_crop, diagonal_offset)
                target = target[triu_tup]
                targets.append(target)
                intervals.append((chrom, seq[0], seq[1]))
    data = dict()
    print(len(intervals), len(inputs), len(targets))
    data["intervals"] = intervals
    data["inputs"] = inputs
    data["targets"] = targets
    with open(out_file, 'wb') as f:
        pickle.dump(data, f)
def get_predictors(self, contact):
    """Oriented-peak predictors plus aggregate window statistics for one
    contact.

    Returns, as one flat list: the N_closest peaks left of the start anchor,
    the N_closest peaks just inside each window boundary, the N_closest
    peaks right of the end anchor (each group: plus orientations, minus
    orientations, sigVals, distances to its anchor), followed by the total
    sigVal inside the window and the counts of plus- and minus-oriented
    peaks inside the window.
    """
    #print(self.name)
    assert contact.contact_st < contact.contact_en
    window = Interval(contact.chr, contact.contact_st, contact.contact_en)
    # Peaks outside of the window
    Left_peaks = self.chipSeq_reader.get_nearest_peaks(
        Interval(contact.chr, contact.contact_st, contact.contact_st),
        N=self.N_closest, side="left")
    Left_peaks = Left_peaks["plus_orientation"].values.tolist() + \
        Left_peaks["minus_orientation"].values.tolist() + \
        Left_peaks["sigVal"].values.tolist() + \
        (contact.contact_st - Left_peaks["mids"]).values.tolist()
    Right_peaks = self.chipSeq_reader.get_nearest_peaks(
        Interval(contact.chr, contact.contact_en, contact.contact_en),
        N=self.N_closest, side="right")
    Right_peaks = Right_peaks["plus_orientation"].values.tolist() + \
        Right_peaks["minus_orientation"].values.tolist() + \
        Right_peaks["sigVal"].values.tolist() + \
        (Right_peaks["mids"] - contact.contact_en).values.tolist()
    # Next statement will return list of 2 dataframes:
    # 1st DF with first N peaks on the right side of left interval boundary
    # 2nd DF with first N peaks on the left side of right interval boundary
    Window_peaks = self.chipSeq_reader.get_N_peaks_near_interval_boundaries(
        window, N=self.N_closest)
    # Get properties of peaks
    Window_peaks_left = Window_peaks[0]["plus_orientation"].values.tolist() + \
        Window_peaks[0]["minus_orientation"].values.tolist() + \
        Window_peaks[0]["sigVal"].values.tolist() + \
        (Window_peaks[0]["mids"] - contact.contact_st).values.tolist()
    Window_peaks_right = Window_peaks[1]["plus_orientation"].values.tolist() + \
        Window_peaks[1]["minus_orientation"].values.tolist() + \
        Window_peaks[1]["sigVal"].values.tolist() + \
        (contact.contact_en - Window_peaks[1]["mids"]).values.tolist()
    # PERF: the original fetched the identical window interval up to four
    # times; fetch it once and reuse the dataframe.
    window_data = self.chipSeq_reader.get_interval(window)
    # if there are no peaks in window, set sigVal and other params to 0
    # TODO add if/else for onlyOrient
    if len(window_data) == 0:
        Window_sigVal = 0
        N_plus_orient_W = 0
        N_minus_orient_W = 0
    else:
        Window_sigVal = window_data.sigVal.sum()
        N_plus_orient_W = len(window_data.query("plus_orientation!='0'"))
        N_minus_orient_W = len(window_data.query("minus_orientation!='0'"))
    predictors = Left_peaks + Window_peaks_left + Window_peaks_right + \
        Right_peaks + [Window_sigVal] + [N_plus_orient_W, N_minus_orient_W]
    return predictors
def compare_intervalfuncs(tree, df, df2, df3):
    """Cross-check four intersect_with_interval implementations against an
    interval-tree reference on random queries and print per-version match
    counts.

    tree : intervaltree built over the same data, used as ground truth
    df, df2, df3 : the same intervals in the per-version dataframe layouts
    """
    data1 = {"chr1": df}
    data2 = {"chr1": df2}
    data3 = {"chr1": df3}
    count_match_v1 = 0
    count_match_v2 = 0
    count_match_v3 = 0
    count_match_v4 = 0
    N_tests = 100
    for i in range(0, N_tests):
        start = random.randint(0, maxPos)
        l = random.randint(minLen, maxLen * 5)
        # start = 2200
        # l = 2550 - 2200
        interval = Interval("chr1", start, start + l)
        res_v1 = intersect_with_interval(data1, interval)
        res_v2 = intersect_with_interval_v2(data2, interval)
        if len(res_v2) > 0:
            # v2 returns row ids; map them back to rows of the original df
            ids = np.unique(res_v2.ids.values)
            res_v2 = df.loc[ids]
            #print (res_v2)
            #break
        res_v3 = intersect_with_interval_v3(data1, interval)
        res_v4 = intersect_with_interval_v4(data3, interval)
        # reference answer: interval tree queried over [start, start+l+1)
        res_intTree = np.array([q.data for q in tree[start:start + l + 1]])
        match_v1 = compare_results(res_v1, res_intTree)
        match_v2 = compare_results(res_v2, res_intTree)
        match_v3 = compare_results(res_v3, res_intTree)
        match_v4 = compare_results(res_v4, res_intTree)
        count_match_v1 += match_v1
        count_match_v2 += match_v2
        count_match_v3 += match_v3
        count_match_v4 += match_v4
        if not match_v3:
            # dump the mismatching query for debugging
            print("---------------")
            print(interval)
            print(res_v3)
            print(res_intTree)
    print(count_match_v1, " of ", N_tests)
    print(count_match_v2, " of ", N_tests)
    print(count_match_v3, " of ", N_tests)
    print(count_match_v4, " of ", N_tests)
def plot_matrix(self, validation_data, predicted, out_dir, **kwargs):
    """Render validation vs predicted contact matrices (log scale) and save
    the image under out_dir.

    validation_data : dataframe with chr / contact_st / contact_en /
                      contact_count columns for one chromosome
    predicted       : predicted contact counts aligned with validation_data rows
    out_dir         : output directory for the .matrix.png file
    kwargs          : show_plot (bool, default True) — also display the figure
    """
    predicted_data = validation_data.copy(deep=True)
    predicted_data["contact_count"] = predicted
    mp = MatrixPlotter()
    mp.set_data(validation_data)
    mp.set_control(predicted_data)
    matrix = mp.getMatrix4plot(
        Interval(validation_data["chr"].iloc[0],
                 min(validation_data["contact_st"].values),
                 max(validation_data["contact_en"].values)))
    #if not self.apply_log:
    matrix = np.log(matrix)
    tick_pos, tick_labels = mp.get_bins_strart_labels(maxTicksNumber=15)
    plt.xticks(tick_pos, tick_labels, rotation=45)
    plt.imshow(matrix, cmap="OrRd")
    plt.title(self.__represent_validation__())
    # these are matplotlib.patch.Patch properties
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    # place a text box in upper left in axes coords with the run parameters
    xml = self.toXMLDict()
    textstr = "\n".join(key + " " + val for key, val in xml.items()
                        if key != "predictors")
    plt.gca().text(0.05, 0.95, textstr, transform=plt.gca().transAxes,
                   fontsize=6, verticalalignment='top', bbox=props)
    plt.imsave(os.path.join(out_dir, self.__represent_validation__()) + ".matrix.png",
               matrix, cmap="OrRd", dpi=600)
    if not ("show_plot" in kwargs) or kwargs["show_plot"]:
        plt.show()
    plt.clf()
def calc_insulation_around_CTCF(chr, resolution=5000, window_size=20): logging.basicConfig( level=logging.DEBUG) # set to INFO for less detailed output ### load data ### # load genome faReader = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr]) faReader = faReader.read_data() # load chipSeq1 bwReader1 = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig", genome=faReader, inMemory=True) bwReader1 = bwReader1.readData() #load contacts hic = hicReader("../input/4DNFI2TK7L2F.hic", genome=faReader, resolution=resolution) hic = hic.read_data() ### run simple check that contact count correlate with ChipSeq signal ### ### generate some random samples #### # get size of the chr1 total_length = faReader.get_chr_sizes()[chr] all_CTCF = bwReader1.get_interval(Interval(chr, 0, total_length)) all_CTCF = np.nan_to_num(all_CTCF) binsize = 1000 bins = np.arange(0, total_length - 1, binsize) sums = [np.sum(all_CTCF[a:a + binsize]) for a in bins] peaks = bins[sums > np.percentile(sums, 90)] with open("../out/test.bed", "w") as fout: for i in peaks: fout.write(chr + "\t" + str(i) + "\t" + str(i + binsize) + "\n")
# for interval in [# Interval("chr10", 59000000, 62000000)]: # Interval("chr10", 65000000, 70000000), # Interval("chr20", 37000000, 40000000), # Interval("chr10", 10000000, 60000000)]: # # Interval("chr10",36000000,41000000), # # Interval("chr1", 100000000, 110000000)]: # params.interval = interval validate_chrs = ["chr19", "chrX"] for validateChrName in validate_chrs: params.sample_size = len( params.contacts_reader.data[validateChrName]) #print(params.sample_size) validation_file_name = "validatingOrient." + str(params) + ".txt" params.interval = Interval( validateChrName, params.contacts_reader.get_min_contact_position( validateChrName), params.contacts_reader.get_max_contact_position( validateChrName)) logging.getLogger( __name__).info("Generating validation dataset for interval " + str(params.interval)) params.out_file = output_folder + params.interval.toFileName( ) + validation_file_name generate_data(params) del (params.out_file) del (params.sample_size) # for object in [params.contacts_reader]+params.pgs: # lostInterval = Interval("chr1",103842568,104979840) # object.delete_region(lostInterval) # params.interval = Interval("chr1",100000000,109000000)
# --- load a trained Akita-style model and predict a big genomic region ---
model_dir = '/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/dataset_like_Akita/data/Aalb_2048bp_repeat/train_out_test5_fix_random3/'
params_file = model_dir + 'params.json'
# model_file = model_dir+'model_check.h5'
model_file = model_dir + 'model_best.h5'
with open(params_file) as params_open:
    params = json.load(params_open)
params_model = params['model']
params_train = params['train']
seqnn_model = seqnn.SeqNN(params_model)
### restore model ###
seqnn_model.restore(model_file)
print('successfully loaded')
# read data parameters (bin size, crop, diagonal offset) from the dataset stats
data_dir = '/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/dataset_like_Akita/data/Aalb_2048'
data_stats_file = '%s/statistics.json' % data_dir
with open(data_stats_file) as data_stats_open:
    data_stats = json.load(data_stats_open)
seq_length = data_stats['seq_length']
target_length = data_stats['target_length']
hic_diags = data_stats['diagonal_offset']
target_crop = data_stats['crop_bp'] // data_stats['pool_width']
target_length1 = data_stats['seq_length'] // data_stats['pool_width']
target_length1_cropped = target_length1 - 2 * target_crop
# NOTE(review): chr/start/end are not defined in this chunk — confirm they
# are assigned upstream before this call.
predict_big_region_from_seq(
    Interval(chr, start, end),
    binsize=data_stats['pool_width'],
    seq_len=seq_length,
    stride=300 * data_stats['pool_width'],
    fasta_file="/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/input/genomes/AalbS2_V4.fa",
    seqnn_model=seqnn_model,
    crop_bp=data_stats['crop_bp'],
    target_length_cropped=target_length1_cropped,
    hic_diags=hic_diags,
    prediction_folder=model_dir,
    genome_hic_expected_file='/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/input/coolers/Aalb_2048.expected',
    use_control=True,
    genome_cool_file='/mnt/scratch/ws/psbelokopytova/202105171236data_Polina/nn_anopheles/input/coolers/Aalb_2048.cool')
'yellow')) print( colored( 'Third parameter is the number of last nucleotide of the interval.', 'yellow')) print(colored('Example of input: 2L 10000000 13000000 n', 'yellow')) choice = 'y' interval_list = [] while choice == 'y': # check that it is y or n? unnecessary right print(colored('Type parameters of one interval.', 'yellow')) input_list = list(map(str, input().split())) chr = input_list[0] start = int(input_list[1]) end = int(input_list[2]) choice = input_list[3] interval = Interval(chr, start, end) interval_list.append(interval) # create folders print( colored('Type path for directory to be created for sending output to.', 'yellow')) print( colored('If such folder already exists its contents will be overwritten.', 'yellow')) print( colored( 'Example: path to directory named "pleasework". It is shown below.', 'yellow')) # add example for Windows too print( colored('/home/konstantin/konstantin/2/nn_anopheles/pleasework/',
def simple_test(): logging.basicConfig( level=logging.DEBUG) # set to INFO for less detailed output ### load data ### # load genome input_folder = "/home/minja/PycharmProjects/3Dpredictor/nn/input/" faReader = fastaReader(input_folder + "hg38/hg38.fa", useOnlyChromosomes=["chr1"]) faReader = faReader.read_data() # load chipSeq bwReader1 = bigWigReader(input_folder + "ENCFF473IZV_H1_CTCF.bigWig", genome=faReader, inMemory=True) bwReader1 = bwReader1.readData() # load chipSeq bwReader2 = bigWigReader(input_folder + "ENCFF966IHQ.bigWig", genome=faReader, inMemory=False) bwReader2 = bwReader2.readData() #load contacts resolution = 5000 hic = hicReader(input_folder + "4DNFI2TK7L2F.hic", genome=faReader, binsize=resolution, indexedData=True) hic = hic.read_data() ### run simple check that contact count correlate with ChipSeq signal ### ### generate some random samples #### # get size of the chr1 total_length = faReader.get_chr_sizes()["chr1"] window_size = 20 * resolution # distance between intercting regions in this particular test, in units of resolution sample_size = 100000 # select random points on chr1 random_points_starts = np.random.random_integers( 0, total_length - window_size, sample_size) random_points_starts = np.array( (random_points_starts // resolution) * resolution, dtype=np.uint64) random_points_ends = random_points_starts + window_size # for each of selected points get contact between this point and (point + window_size*resolution) contacts = [] chipSignals = [] seqSignals = [] now = datetime.datetime.now() # start timer logging.info("Starting data generation") for start, end in zip(random_points_starts, random_points_ends): interval = Interval("chr1", start, end) contact = hic.get_contact(interval) if contact == None: continue else: chipSignal = np.nansum(bwReader1.get_interval(interval)) if np.isfinite(chipSignal): chipSignals.append(chipSignal) seqSignal = np.sum(faReader.get_interval(interval)) seqSignals.append(seqSignal) contacts.append(contact) 
logging.info("Time for data generation1: " + str(datetime.datetime.now() - now)) # now = datetime.datetime.now() # chipSignals = [] # seqSignals = [] # contacts = [] # for start,end in zip(random_points_starts,random_points_ends): # interval = Interval("chr1",start,end) # contact = hic.get_contact(interval) # if contact == None: # continue # else: # chipSignal = np.nansum(bwReader2.get_interval(interval)) # if np.isfinite(chipSignal): # chipSignals.append(chipSignal) # seqSignal = np.sum(faReader.get_interval(interval)) # seqSignals.append(seqSignal) # contacts.append(contact) # # logging.info("Time for data generation2: " + str(datetime.datetime.now() - now)) from scipy.stats import spearmanr import matplotlib.pyplot as plt print(contacts) print(chipSignals) print(spearmanr(np.array(contacts), np.array(chipSignals))) print(np.all(np.isfinite(contacts))) print(np.all(np.isfinite(chipSignals))) plt.scatter(contacts, chipSignals) plt.show()
def get_predictors(self, contact):
    """Window coordinates followed by the binned ChIP-seq profile over the
    symmetric window around the contact.

    Returns [window_start, window_end, relative_start, relative_end] + the
    per-bin signal list from the ChIP-seq reader.
    """
    coords = self.symmetric_window_around_contact(contact)
    win_start, win_end, rel_start, rel_end = coords
    binned_signal = self.chipSeq_reader.get_binned_interval(
        Interval(contact.chr, win_start, win_end), binsize=self.binsize)
    return [win_start, win_end, rel_start, rel_end] + binned_signal
fig2_inds.append(test_ind) # print(fig2_inds) target_index = 0 for test_index in fig2_inds: chrm, seq_start, seq_end = sequences_test.iloc[test_index][0:3] myseq_str = chrm + ':' + str(seq_start) + '-' + str(seq_end) print(' ') # print(myseq_str) test_target = test_targets[test_index:test_index + 1, :, :] # plot target # plt.subplot(122) mat = from_upper_triu(test_target[:, :, target_index], target_length1_cropped, hic_diags) print(mat) #draw matrix before returning from oe to contacts im = plt.matshow(mat, fignum=False, cmap='RdBu_r')#, vmax=vmax, vmin=vmin) plt.colorbar(im, fraction=.04, pad=0.05)#, ticks=[-2, -1, 0, 1, 2]) plt.title('target-' + str(hic_num_to_name_dict[target_index]+myseq_str), y=1.15) plt.tight_layout() plt.savefig(data_dir+"/test/test_before_"+str(chrm)+"_"+str(seq_start)+"_"+str(seq_end)+".png") plt.clf() #draw_after returned_mat = from_oe_to_contacts(seq_hic_obsexp=mat, genome_hic_expected_file='/mnt/scratch/ws/psbelokopytova/202103211631polina/nn_anopheles/input/coolers/Aalb_2048.expected', interval=Interval('2R', 32083968,33132544), seq_len_pool=target_length1_cropped) im = plt.matshow(returned_mat, fignum=False, cmap='OrRd') # , vmax=vmax, vmin=vmin) plt.colorbar(im, fraction=.04, pad=0.05) # , ticks=[-2, -1, 0, 1, 2]) plt.title('target-' + str(hic_num_to_name_dict[target_index] + myseq_str), y=1.15) plt.tight_layout() plt.savefig(data_dir + "/test/test_after_" + str(chrm) + "_" + str(seq_start) + "_" + str( seq_end) + ".png") plt.clf()
# if not write_all_chrms_in_file: # del(params.out_file) # del (params.sample_size) # Generate test validate_chrs = [] [ validate_chrs.append("chr" + chr) for chr in chr_nums ] #,"chr16", "chr17"]#, "chr18"]#, "chr18", "chr19", "chr20"]#,"chr14", "chr15"] if write_all_chrms_in_file: validation_file_name = "validatingOrient." + str(params) + ".txt" params.out_file = output_folder + "_".join( validate_chrs) + validation_file_name for validateChrName in validate_chrs: print("chromosome", validateChrName) interval = Interval("chr5", 75000000, 76400000) params.sample_size = len( params.contacts_reader.data[validateChrName]) # params.interval = Interval(validateChrName, # params.contacts_reader.get_min_contact_position(validateChrName), # params.contacts_reader.get_max_contact_position(validateChrName)) params.interval = interval logging.getLogger( __name__).info("Generating validation dataset for interval " + str(params.interval)) if not write_all_chrms_in_file: validation_file_name = "validatingOrient." + str( params) + ".txt" params.out_file = output_folder + params.interval.toFileName( ) + validation_file_name
params.eig_reader.read_files( [input_folder + "chr1.Hepat.E1.50k", input_folder + "chr2.Hepat.E1.50k"], #input_folder + "chr10.Hepat.E1.50k"], #input_folder + "chr6.Hepat.E1.50k"], binSizeFromName=fileName2binsize ) #infer size of E1 bins from file name using this function e1pg = SmallE1PredictorGenerator(params.eig_reader, params.window_size) params.pgs = [e1pg, OrientCtcfpg, NotOrientCTCFpg, RNAseqPG] #,onlyOrientCtcfpg] #Generate train trainChrName = "chr1" params.interval = Interval( trainChrName, params.contacts_reader.get_min_contact_position(trainChrName), params.contacts_reader.get_max_contact_position(trainChrName)) params.out_file = output_folder + training_file_name generate_data(params, saveFileDescription=True) #Generate test for interval in [ # Interval("chr10", 59000000, 62000000)]: Interval("chr2", 47900000, 53900000), Interval("chr2", 85000000, 92500000), Interval("chr2", 36000000, 41000000) ]: # Interval("chr1", 100000000, 110000000)]: logging.getLogger(__name__).info( "Generating validation dataset for interval " + str(interval)) params.interval = interval params.out_file = output_folder + params.interval.toFileName(
# if not write_all_chrms_in_file: # del(params.out_file) # del (params.sample_size) # Generate test validate_chrs = [] #no need to set chr for validation here!!!! [ validate_chrs.append("chr" + chr) for chr in chr_nums ] #,"chr16", "chr17"]#, "chr18"]#, "chr18", "chr19", "chr20"]#,"chr14", "chr15"] if write_all_chrms_in_file: validation_file_name = "validatingOrient." + str(params) + ".txt" params.out_file = output_folder + "_".join( validate_chrs) + validation_file_name for validateChrName in validate_chrs: print("chromosome", validateChrName) interval = Interval(chromosome, start, end) #params.sample_size = len(params.contacts_reader.data[validateChrName]) # params.interval = Interval(validateChrName, # params.contacts_reader.get_min_contact_position(validateChrName), # params.contacts_reader.get_max_contact_position(validateChrName)) params.interval = interval logging.getLogger( __name__).info("Generating validation dataset for interval " + str(params.interval)) if not write_all_chrms_in_file: validation_file_name = "validatingOrient." + str(params) + ".txt" params.out_file = output_folder + "/" + cell_type + params.interval.toFileName( ) + validation_file_name generate_data(params) if not write_all_chrms_in_file:
def predict_big_region_from_seq(interval_list,
                                binsize,
                                seq_len,
                                stride,
                                fasta_file,
                                seqnn_model,
                                crop_bp,
                                target_length_cropped,
                                hic_diags,
                                prediction_folder,
                                returned_to_contacts=True,
                                save_as_hic=True,
                                use_control=False,
                                minimal_length=3000000,
                                **kwargs):
    """
    Predict a big region by stacking predicted small region units and,
    optionally, write the prediction to a .hic file.

    Parameters
    ----------
    interval_list : list of 3DPredictor.shared.Interval
        regions to predict; regions longer than ``minimal_length`` are split
        into chunks of ``minimal_length`` bp before prediction
    binsize : int
        Hi-C bin size, bp
    seq_len : int
        length of one predicted unit, bp
    stride : int
        stride between consecutive predicted units, bp
    fasta_file : str
        path to fasta file with the genome
    seqnn_model :
        model whose ``.model.predict`` returns the upper triangle of the
        contact map for a one-hot encoded sequence
    crop_bp : int
        bp cropped from each side of a predicted unit
    target_length_cropped, hic_diags :
        passed through to ``from_upper_triu``
    prediction_folder : str
        output directory for pickles, bed and hic files
    minimal_length : int
        interval length that is processed within one ``predictor`` call
        (choose so it fits your time/memory budget)
    kwargs :
        must contain 'genome_hic_expected_file' when ``returned_to_contacts``
        and 'genome_cool_file' when ``save_as_hic``

    Returns
    -------
    None; results are written into ``prediction_folder``.
    """
    # contents of the future chrom.sizes file: chr name -> largest end seen
    chrsizes_dict = {}

    # predicts one (sub-)interval, averaging overlapping sliding units
    def predictor(dictionary, subinterval):
        # BUGFIX: operate on the sub-interval passed by the caller. The
        # original body read the enclosing loop variable `interval`, so
        # chunked calls (subinterval=...) all re-predicted the full interval.
        interval = subinterval
        # shape of the predicted array, in bins
        n_end = math.ceil(interval.end / binsize)
        n_start = math.floor(interval.start / binsize)
        n = n_end - n_start
        len_predicted_mat = (seq_len - 2 * crop_bp) // binsize
        m = n
        print("Stride is", stride, ",", stride // binsize, "bins")
        mat_stride = stride // binsize
        # number of sliding units needed to cover the interval
        k = (n - (len_predicted_mat - mat_stride)) // mat_stride
        print(datetime.datetime.now())
        print("...allocating array...", k, m, n)
        # one (m, n) layer per unit; NaN where a unit makes no prediction
        arr = np.empty((k, m, n))
        arr[:] = np.nan
        print(datetime.datetime.now(), "DONE")
        start = interval.start
        arr_stride = crop_bp // binsize
        fasta_open = pysam.Fastafile(fasta_file)
        # predict k units
        print("going to predict", k, "matrix units")
        for k_matrix in range(0, k):
            if k_matrix % 5 == 0:
                print("predict", k_matrix, "matrix unit")
            chrm, seq_start, seq_end = interval.chr, int(start), int(start +
                                                                     seq_len)
            seq = fasta_open.fetch(chrm, seq_start, seq_end).upper()
            seq_1hot = dna_io.dna_1hot(seq)
            test_pred_from_seq = seqnn_model.model.predict(
                np.expand_dims(seq_1hot, 0))
            predicted_mat = from_upper_triu(test_pred_from_seq[:, :, 0],
                                            target_length_cropped, hic_diags)
            with open(
                    prediction_folder + "prred_mat" + str(seq_start) + "-" +
                    str(seq_end) + ".pickle", 'wb') as f:
                pickle.dump(predicted_mat, f)
            assert predicted_mat.shape[0] == predicted_mat.shape[1]
            # write the predicted unit into its layer at the current offset
            for i in range(len(predicted_mat)):
                arr[k_matrix][i + arr_stride][0 + arr_stride:len(predicted_mat)
                                              + arr_stride] = predicted_mat[i]
            arr_stride += stride // binsize
            start += stride
        # average overlapping unit predictions, ignoring NaN
        mat = np.nanmean(arr, axis=0)
        # return predicted values from obs/exp back to contact counts
        if returned_to_contacts:
            if 'genome_hic_expected_file' not in kwargs:
                # BUGFIX: the original only printed a warning here and then
                # crashed with a bare KeyError on the next line.
                raise KeyError("Please add path to expected file "
                               "(kwargs['genome_hic_expected_file'])")
            mat = from_oe_to_contacts(
                seq_hic_obsexp=mat,
                genome_hic_expected_file=kwargs['genome_hic_expected_file'],
                interval=interval,
                seq_len_pool=n)
        if save_as_hic:
            print("going to save in hic format")
            plot_juicebox_from_predicted_array(
                mat=mat,
                binsize=binsize,
                interval=interval,
                out_dir=prediction_folder,
                diagonal_offset=hic_diags,
                use_control=use_control,
                genome_cool_file=kwargs["genome_cool_file"],
                ghc=cooler.Cooler(kwargs['genome_cool_file']),
                chr_dict=dictionary)
        # Write the predicted region to a bed file (with-block fixes the
        # leaked file handle of the original).
        # NOTE(review): mode "w" rewrites the file on every call, so only the
        # last predicted chunk survives -- confirm whether "a" was intended.
        with open(prediction_folder + "predictions.bed", "w") as bed_file:
            bed_file.write(
                str(0) + "\t" + interval.chr + "\t" + str(interval.start) +
                "\t" + str(interval.end) + "\n")

    # dispatch intervals, splitting long ones into minimal_length chunks
    for interval in interval_list:
        # track the largest end coordinate per chromosome for chrom.sizes
        if chrsizes_dict.setdefault(str(interval.chr)) is None:
            chrsizes_dict[str(interval.chr)] = interval.end
        elif chrsizes_dict[str(interval.chr)] < interval.end:
            chrsizes_dict[str(interval.chr)] = interval.end
        assert minimal_length >= seq_len
        assert interval.len >= seq_len
        if interval.len <= minimal_length or interval.len // minimal_length == 1:
            # short enough to predict in one call
            predictor(chrsizes_dict, interval)
        elif interval.len % minimal_length == 0:
            i = 0  # how many bps we've predicted
            while i != interval.len:
                predictor(chrsizes_dict,
                          subinterval=Interval(
                              interval.chr, interval.start + i,
                              interval.start + i + minimal_length))
                i += minimal_length
        else:
            # leave a final chunk of minimal_length + remainder so every
            # chunk handed to predictor is at least minimal_length long
            residual_interval = Interval(
                interval.chr,
                interval.end - (minimal_length + interval.len % minimal_length),
                interval.end)
            without_residue_interval = Interval(
                interval.chr, interval.start,
                interval.end - (minimal_length + interval.len % minimal_length))
            i = 0  # how many bps we've predicted
            while i != without_residue_interval.len:
                predictor(chrsizes_dict,
                          subinterval=Interval(
                              interval.chr, interval.start + i,
                              interval.start + i + minimal_length))
                i += minimal_length
            predictor(chrsizes_dict, subinterval=residual_interval)
] # TSSPG] + chipPG # +cagePG+metPG+chipPG # Generate train train_chrs = [] [train_chrs.append("chr" + chr) for chr in chr_nums] if write_all_chrms_in_file: train_file_name = "training.RandOn" + str(params) params.out_file = output_folder + "_".join( train_chrs) + train_file_name for trainChrName in train_chrs: training_file_name = "training.RandOn" + trainChrName + str( params) + ".txt" # set it if you want to use all contacts of chromosome for training: # params.sample_size = len(params.contacts_reader.data[trainChrName]) # if you want to use only an interval of chromosome, set its coordinates: params.interval = Interval( trainChrName, params.contacts_reader.get_min_contact_position(trainChrName), params.contacts_reader.get_max_contact_position(trainChrName)) if not write_all_chrms_in_file: train_file_name = "training.RandOn" + str(params) + ".txt" params.out_file = output_folder + params.interval.toFileName( ) + train_file_name generate_data(params, saveFileDescription=True) if not write_all_chrms_in_file: del (params.out_file) del (params.sample_size)
# --- Generate validation dataset ---
# one entry per chromosome listed in chr_nums
validate_chrs = ["chr" + chr for chr in chr_nums]
# Single shared output file when all chromosomes go into one file.
if write_all_chrms_in_file:
    validation_file_name = "validatingOrient." + str(params) + ".txt"
    params.out_file = (output_folder + "_".join(validate_chrs) +
                       validation_file_name)
for validateChrName in validate_chrs:
    print("chromosome", validateChrName)
    # Every chromosome is validated on the same fixed chr7 region
    # (debug-style setup kept as-is; per-chromosome ranges stay disabled).
    params.interval = Interval("chr7", 86600000, 87000000)
    logging.getLogger(__name__).info(
        "Generating validation dataset for interval " + str(params.interval))
    # Per-interval output file when each chromosome gets its own file;
    # str(params) is evaluated after params.interval was assigned above.
    if not write_all_chrms_in_file:
        validation_file_name = "validatingOrient." + str(params) + ".txt"
        params.out_file = (output_folder + params.interval.toFileName() +
                           validation_file_name)
    generate_data(params)
def calc_corr(chr, resolution=5000, window_size=20):
    """Correlate Hi-C contact counts with ChipSeq and raw sequence signal.

    Samples random windows on chromosome ``chr``, reads the Hi-C contact for
    each window, the summed bigWig (CTCF ChipSeq) signal and the summed
    sequence signal, and returns correlations between them.

    Parameters
    ----------
    chr : str
        chromosome name as present in the fasta/hic files
    resolution : int
        Hi-C resolution, bp
    window_size : int
        window length added to each sampled start position
        # NOTE(review): the assert below requires window_size >= 5*resolution,
        # which the default (20 vs 25000) violates -- confirm intended units.

    Returns
    -------
    str
        tab-separated values: spearman and pearson of contacts vs ChipSeq,
        then spearman and pearson of contacts vs sequence signal.
    """
    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    faReader = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr])
    faReader = faReader.read_data()
    # load chipSeq signal
    bwReader1 = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()
    # load contacts
    hic = hicReader("../input/4DNFI2TK7L2F.hic",
                    genome=faReader,
                    resolution=resolution)
    hic = hic.read_data()

    ### run simple check that contact count correlates with ChipSeq signal ###
    ### generate some random samples ###
    total_length = faReader.get_chr_sizes()[chr]
    # number of random windows sampled on the chromosome
    sample_size = 5000
    # BUGFIX: np.random.random_integers is deprecated and removed from modern
    # NumPy; randint with high+1 preserves the inclusive upper bound.
    random_points_starts = np.random.randint(0,
                                             total_length - window_size + 1,
                                             sample_size)
    # snap starts to the Hi-C bin grid
    random_points_starts = np.array(
        (random_points_starts // resolution) * resolution, dtype=np.uint64)
    random_points_ends = random_points_starts + window_size

    # hoisted out of the loop: the condition is loop-invariant
    assert window_size >= 5 * resolution

    # for each sampled window collect contact, ChipSeq and sequence signals
    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer
    logging.info("Starting data generation")
    for start, end in zip(random_points_starts, random_points_ends):
        interval = Interval(chr, start, end)
        window = Interval(chr, start + resolution, end)
        contact = hic.get_contact(interval)
        if contact is None:  # was `== None`; identity check is correct here
            contact = 0
        if np.isfinite(contact):
            chipSignal = bwReader1.get_interval(window)
            chipSignal = np.nan_to_num(chipSignal)
            chipSignal = np.sum(chipSignal)
            if np.isfinite(chipSignal):
                chipSignals.append(chipSignal)
                seqSignal = np.sum(faReader.get_interval(interval))
                seqSignals.append(seqSignal)
                contacts.append(contact)
    logging.info("Time for data generation: " +
                 str(datetime.datetime.now() - now))

    from scipy.stats import spearmanr, pearsonr
    contacts_arr = np.array(contacts)
    chip_arr = np.array(chipSignals)
    seq_arr = np.array(seqSignals)
    res = [
        spearmanr(contacts_arr, chip_arr)[0],
        pearsonr(contacts_arr, chip_arr)[0],
        spearmanr(contacts_arr, seq_arr)[0],
        pearsonr(contacts_arr, seq_arr)[0],
    ]
    return ("\t".join(list(map(str, res))))
# --- Read contacts data and generate a sequence-predictor dataset ---
params.contacts_reader = ContactsReader()
# contacts file format: bin_start--bin_end--contact_count
contacts_files = [input_folder + "19.contacts.gz"]
# path to the normalization-coefficient file
coeff_fname = input_folder + "coefficient.NPC.5000.txt"
params.contacts_reader.read_files(contacts_files,
                                  coeff_fname,
                                  max_cpus=params.max_cpus,
                                  fill_empty_contacts=fill_empty_contacts,
                                  maxdist=params.maxdist)

# genome for chr19, with "chr" stripped from chromosome names
params.fastaReader = fastaReader(input_folder + "chr19.fa",
                                 chrm_names_renamer=rm_chr_from_chrName)
params.fastaReader.read_data()
print(params.fastaReader.data)

# single predictor generator based on raw sequence
SequencePG = SequencePredictorGenerator(
    fastaReader=params.fastaReader,
    binsize=params.contacts_reader.binsize)
params.pgs = [SequencePG]

params.out_file = output_folder + "NPC_5000"
params.sample_size = 100
# cover the full contact range of chr19
params.interval = Interval(
    "19", params.contacts_reader.get_min_contact_position("19"),
    params.contacts_reader.get_max_contact_position("19"))
logging.getLogger(__name__).info("Generating dataset for interval " +
                                 str(params.interval))
generate_data(params)