def test_hicReader():
    """Smoke-test ``hicReader`` on the H1 ``.hic`` file at 100 kb resolution.

    Builds a ``fastaReader`` for the test FASTA, reads the Hi-C data, prints
    the reader's ``norms`` attribute and the ``get_contact`` result for
    chr1:0-120000000, queries ``get_chr_contact("chr1")`` and finally prints
    the wall-clock time elapsed since just before the Hi-C reader was created.
    Nothing is asserted and nothing is returned.
    """
    fasta_source = fastaReader("../input/hg38/test.fa", name="hg38")
    # The return value of read_data() is kept but not used below; the Hi-C
    # reader receives the original fastaReader object.
    loaded_fasta = fasta_source.read_data()

    started = datetime.datetime.now()

    contacts_reader = hicReader(
        fname="../input/4DNFI2TK7L2F.hic",
        genome=fasta_source,
        resolution=100000,
    )
    contacts_reader = contacts_reader.read_data()
    print(contacts_reader.norms)

    # Per the original author's note: a single float value or NaN.
    probe = Interval("chr1", 0, 120000000)
    result = contacts_reader.get_contact(probe)
    print(result)

    # Per the original author's note: sparse matrix of the whole chromosome
    # as a pandas dataframe. The value is fetched but not printed.
    result = contacts_reader.get_chr_contact("chr1")

    elapsed = datetime.datetime.now() - started
    print(elapsed)
def calc_insulation_around_CTCF(chr, resolution=5000, window_size=20):
    """Write the strongest CTCF ChIP-seq bins of one chromosome to a BED file.

    The CTCF bigWig signal of the whole chromosome is split into 1000-long
    bins, each bin is summed (NaNs counted as 0), and every bin whose sum is
    strictly above the 90th percentile of all bin sums is written as a
    ``chrom<TAB>start<TAB>end`` line to ``../out/test.bed``.

    Args:
        chr: chromosome name used for the genome, the bigWig query and the
            first BED column.
        resolution: passed to ``hicReader``; the Hi-C data is loaded but not
            otherwise used in this function.
        window_size: accepted but not used in this function.

    Returns:
        None. The only outputs are the BED file and log messages.
    """
    logging.basicConfig(level=logging.DEBUG)  # set to INFO for less detailed output

    # --- load data ---
    genome = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr])
    genome = genome.read_data()

    ctcf_track = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                              genome=genome, inMemory=True)
    ctcf_track = ctcf_track.readData()

    # Hi-C contacts are read here although nothing below consumes them.
    hic = hicReader("../input/4DNFI2TK7L2F.hic", genome=genome,
                    resolution=resolution)
    hic = hic.read_data()

    # --- bin the CTCF signal over the whole chromosome ---
    chrom_length = genome.get_chr_sizes()[chr]
    whole_chrom = Interval(chr, 0, chrom_length)
    signal = np.nan_to_num(ctcf_track.get_interval(whole_chrom))

    bin_width = 1000
    bin_starts = np.arange(0, chrom_length - 1, bin_width)
    bin_totals = np.array(
        [np.sum(signal[lo:lo + bin_width]) for lo in bin_starts]
    )

    # --- keep bins strictly above the 90th percentile ---
    cutoff = np.percentile(bin_totals, 90)
    strong_bin_starts = bin_starts[bin_totals > cutoff]

    bed_lines = [
        chr + "\t" + str(lo) + "\t" + str(lo + bin_width) + "\n"
        for lo in strong_bin_starts
    ]
    with open("../out/test.bed", "w") as fout:
        fout.writelines(bed_lines)
def calc_corr(chr, resolution=5000, window_size=20):
    """Correlate Hi-C contacts with CTCF ChIP-seq and genome-reader signal.

    Draws 5000 random start positions on *chr*, snaps them down to a multiple
    of *resolution*, and for each start builds the interval
    ``(start, start + window_size)``. For every interval whose Hi-C contact
    is finite (a missing contact, ``None``, counts as 0) it collects:

    * the contact value,
    * the sum of the CTCF bigWig signal over ``(start + resolution, end)``
      with NaNs replaced by 0,
    * the sum of ``faReader.get_interval`` over the full interval.

    Args:
        chr: chromosome name to load and sample from.
        resolution: Hi-C resolution handed to ``hicReader``; also the grid the
            random starts are snapped to.
        window_size: offset added directly to each start coordinate to get the
            interval end, i.e. it is in coordinate units (the original inline
            comment called it "units of resolution", which is not how the
            code uses it). Must be at least ``5 * resolution``; note that the
            default of 20 does NOT satisfy this with the default resolution,
            so callers have to pass a suitable value explicitly.

    Returns:
        A tab-separated string with four coefficients, in this order:
        Spearman(contacts, ChIP), Pearson(contacts, ChIP),
        Spearman(contacts, sequence), Pearson(contacts, sequence).

    Raises:
        AssertionError: if ``window_size < 5 * resolution``.
    """
    logging.basicConfig(level=logging.DEBUG)  # set to INFO for less detailed output

    # This precondition does not depend on the sampled positions, so it is
    # checked once, before the expensive data loading, instead of on every
    # loop iteration.
    assert window_size >= 5 * resolution, (
        "window_size (%s) must be at least 5 * resolution (%s)"
        % (window_size, 5 * resolution)
    )

    ### load data ###
    # load genome
    faReader = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr])
    faReader = faReader.read_data()
    # load chipSeq1
    bwReader1 = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader, inMemory=True)
    bwReader1 = bwReader1.readData()
    # load contacts
    hic = hicReader("../input/4DNFI2TK7L2F.hic", genome=faReader,
                    resolution=resolution)
    hic = hic.read_data()

    ### generate random samples ###
    total_length = faReader.get_chr_sizes()[chr]
    sample_size = 5000

    # np.random.random_integers is deprecated; randint has an exclusive upper
    # bound, hence the +1 to keep the same inclusive range.
    random_points_starts = np.random.randint(
        0, total_length - window_size + 1, sample_size, dtype=np.int64)
    random_points_starts = (random_points_starts // resolution) * resolution
    random_points_ends = random_points_starts + window_size

    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer
    logging.info("Starting data generation")

    # .tolist() yields plain Python ints. With the former uint64 scalars,
    # ``start + resolution`` is promoted to float64 under the legacy
    # (pre-NEP 50) NumPy rules, which would put a float into the Interval.
    for start, end in zip(random_points_starts.tolist(),
                          random_points_ends.tolist()):
        interval = Interval(chr, start, end)
        # ChIP-seq window skips the first bin of the interval.
        window = Interval(chr, start + resolution, end)

        contact = hic.get_contact(interval)
        if contact is None:
            contact = 0
        if not np.isfinite(contact):
            continue

        chipSignal = np.sum(np.nan_to_num(bwReader1.get_interval(window)))
        if not np.isfinite(chipSignal):
            continue

        # All three lists are appended together so they stay index-aligned.
        chipSignals.append(chipSignal)
        seqSignals.append(np.sum(faReader.get_interval(interval)))
        contacts.append(contact)

    logging.info("Time for data generation: %s", datetime.datetime.now() - now)

    from scipy.stats import spearmanr, pearsonr

    contacts_arr = np.array(contacts)
    chip_arr = np.array(chipSignals)
    seq_arr = np.array(seqSignals)
    res = [
        spearmanr(contacts_arr, chip_arr)[0],
        pearsonr(contacts_arr, chip_arr)[0],
        spearmanr(contacts_arr, seq_arr)[0],
        pearsonr(contacts_arr, seq_arr)[0],
    ]
    return "\t".join(map(str, res))
def simple_test(input_folder="/home/minja/PycharmProjects/3Dpredictor/nn/input/"):
    """Check by eye that Hi-C contacts on chr1 track the CTCF ChIP-seq signal.

    Loads chr1 of hg38, two bigWig tracks and the Hi-C data at 5000 resolution
    from *input_folder*, samples 100000 random bin-aligned intervals of length
    ``20 * resolution`` and, for every interval that has a contact value,
    collects the contact, the NaN-ignoring sum of the first bigWig track and
    the sum of the genome reader over the interval. It then prints the
    collected contacts and ChIP sums, their Spearman correlation, whether all
    values are finite, and shows a scatter plot.

    Args:
        input_folder: directory prefix (with trailing separator) holding
            ``hg38/hg38.fa``, the two bigWig files and the ``.hic`` file.
            Defaults to the path that used to be hard-coded here.

    Returns:
        None.
    """
    logging.basicConfig(level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    faReader = fastaReader(input_folder + "hg38/hg38.fa",
                           useOnlyChromosomes=["chr1"])
    faReader = faReader.read_data()

    # load chipSeq
    bwReader1 = bigWigReader(input_folder + "ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader, inMemory=True)
    bwReader1 = bwReader1.readData()

    # load second chipSeq track; it is opened but not queried below (it was
    # only used by a second timing pass that has been disabled).
    bwReader2 = bigWigReader(input_folder + "ENCFF966IHQ.bigWig",
                             genome=faReader, inMemory=False)
    bwReader2 = bwReader2.readData()

    # load contacts
    resolution = 5000
    hic = hicReader(input_folder + "4DNFI2TK7L2F.hic", genome=faReader,
                    binsize=resolution, indexedData=True)
    hic = hic.read_data()

    ### generate random samples ###
    total_length = faReader.get_chr_sizes()["chr1"]
    window_size = 20 * resolution  # distance between interacting regions
    sample_size = 100000

    # np.random.random_integers is deprecated; randint has an exclusive upper
    # bound, hence the +1 to keep the same inclusive range.
    random_points_starts = np.random.randint(
        0, total_length - window_size + 1, sample_size, dtype=np.int64)
    random_points_starts = (random_points_starts // resolution) * resolution
    random_points_ends = random_points_starts + window_size

    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer
    logging.info("Starting data generation")

    # .tolist() hands plain Python ints to Interval instead of NumPy scalars.
    for start, end in zip(random_points_starts.tolist(),
                          random_points_ends.tolist()):
        interval = Interval("chr1", start, end)
        contact = hic.get_contact(interval)
        if contact is None:
            continue

        chipSignal = np.nansum(bwReader1.get_interval(interval))
        if not np.isfinite(chipSignal):
            continue

        # Appended together so the lists stay index-aligned. seqSignals is
        # not reported below, but fetching it is part of the timed work.
        chipSignals.append(chipSignal)
        seqSignals.append(np.sum(faReader.get_interval(interval)))
        contacts.append(contact)

    logging.info("Time for data generation1: " + str(datetime.datetime.now() - now))

    from scipy.stats import spearmanr
    import matplotlib.pyplot as plt

    print(contacts)
    print(chipSignals)
    print(spearmanr(np.array(contacts), np.array(chipSignals)))
    print(np.all(np.isfinite(contacts)))
    print(np.all(np.isfinite(chipSignals)))
    plt.scatter(contacts, chipSignals)
    plt.show()
rearrangement = False # deletion = Interval("chr" + chr_num, start, end) write_all_chrms_in_file = False #set True if you want write training file consisting several chromosomes fill_empty_contacts = False #set True if you want use all contacts in region, without empty contacts logging.getLogger(__name__).debug("Using input folder " + input_folder) # Read contacts data genome = fastaReader(args['path_to_genome'], useOnlyChromosomes=[chromosome]) #str(chr_num)]) genome = genome.read_data() print(genome.data) now = datetime.datetime.now() params.contacts_reader = hicReader(fname=input_folder + "/" + cell_type + "/" + hic_name, genome=genome, binsize=params.binsize) params.contacts_reader = params.contacts_reader.read_data( fill_empty_contacts=fill_empty_contacts, noDump=False) if params.use_only_contacts_with_CTCF == "cont_with_CTCF": params.proportion = 1 params.contacts_reader.use_contacts_with_CTCF(CTCFfile=input_folder+"/" + cell_type+"/CTCF/"+CTCF_file_name, maxdist=params.maxdist, proportion=params.proportion, keep_only_orient=params.keep_only_orient, CTCForientfile=input_folder + "/" + cell_type + \ "/CTCF/"+CTCF_file_name+"-orient.bed") params.use_only_contacts_with_CTCF += str( params.contacts_reader.conts_with_ctcf) #make deletion
write_all_chrms_in_file = False #set True if you want write training file consisting several chromosomes fill_empty_contacts = True #set True if you want use all contacts in region, without empty contacts logging.getLogger(__name__).debug("Using input folder " + input_folder) # Read contacts data genome = fastaReader(input_folder + "sequence/hg38/hg38.fa", name="hg38", useOnlyChromosomes=["chr3"]) genome = genome.read_data() # print(genome) # print(genome.data.keys()) now = datetime.datetime.now() params.contacts_reader = hicReader(fname=input_folder + "H1/4DNFI2TK7L2F.hic", genome=genome, binsize=1000) # params.contacts_reader = hicReader(fname=input_folder + "H1/control.chr4.50KBhic", genome=genome, binsize=1000) params.contacts_reader = params.contacts_reader.read_data() if params.use_only_contacts_with_CTCF == "cont_with_CTCF": params.proportion = 1 params.contacts_reader.use_contacts_with_CTCF( CTCFfile=input_folder + "H1/CTCF/CTCF_H1_conservative_peaks.bed.gz", maxdist=params.maxdist, proportion=params.proportion, keep_only_orient=params.keep_only_orient, CTCForientfile=input_folder + "H1/CTCF/CTCF_H1_conservative_peaks_orient.bed") params.use_only_contacts_with_CTCF += str(