def main():
    """Write chromosome metadata and run the per-chromosome binning helpers.

    Command-line arguments (taken from ``sys.argv[1:4]``):
        1. path of the HiFive HiC project file,
        2. path of the HDF5 file to append results to,
        3. bin size (converted with ``int``).

    Chromosomes are processed in a fixed order: the numbers 1-23 followed by
    X, 2L, 2R, 3L and 3R, keeping only names present in the project's
    ``fends['chromosomes']``. Rank 0 opens the HDF5 file in append mode and
    stores the ``chromosomes`` and ``chrom_sizes`` datasets and the
    ``binsize`` attribute; all other ranks hand ``None`` to the helpers.
    """
    hic_fname, hdf5_fname, binsize = sys.argv[1:4]
    binsize = int(binsize)
    hic = hifive.HiC(hic_fname)
    chromosomes = list(hic.fends['chromosomes'][...])
    candidates = [str(i) for i in range(1, 24)] + ['X', '2L', '2R', '3L', '3R']
    chroms = [chrom for chrom in candidates if chrom in chromosomes]
    infile = None
    try:
        if rank == 0:
            infile = h5py.File(hdf5_fname, 'a')
            lengths = numpy.zeros(len(chroms), dtype=numpy.int32)
            for i, chrom in enumerate(chroms):
                lengths[i] = hic.fends['chrom_sizes'][hic.chr2int[chrom]]
            infile.create_dataset(name='chromosomes', data=numpy.array(chroms))
            infile.create_dataset(name='chrom_sizes', data=lengths)
            infile.attrs['binsize'] = binsize
        for chrom in chroms:
            find_bin_probabilities(chrom, hic, infile, binsize, 'fend')
            find_bin_probabilities(chrom, hic, infile, binsize, 'enrichment')
            find_bin_counts(chrom, hic, infile, binsize)
    finally:
        # Close the HDF5 file even when a helper raises, so that whatever was
        # already written is flushed and the handle is not leaked.
        if infile is not None:
            infile.close()
    if rank == 0:
        # Blank out the 80-column status line; write() behaves the same on
        # Python 2 and Python 3, unlike the ``print >>`` statement.
        sys.stderr.write("\r%s\r" % (" " * 80))
def generate_intrachromosomal_observed_data(a_chr, bin_size, input_file, species='hg38', save_file=False):
    """
    Generate an observed intrachromosomal contact matrix from HiC_project_object.hdf5.

    Arguments:
        a_chr (str): chromosome number (example for chromosome 1: '1').
        bin_size (int): bin size in bp of the contact matrix.
        input_file (str): object containing learned correction parameters in
            .hdf5 format obtained with HiCtool_hifive.py
            (default: 'HiC_project_object.hdf5').
        species (str): 'hg38' or 'mm10' or any other species label in string
            format; selects ``<species>.chrom.sizes`` under
            ``parameters['chromSizes_path']``.
        save_file (bool): if true, save the observed contact data.
    Return:
        observed intrachromosomal contact matrix in numpy array format.
    Output:
        observed intrachromosomal contact matrix in HiCtool compressed format
        if "save_file=True".
    Raises:
        KeyError: if ``a_chr`` is not listed in the chrom.sizes file.
    """
    import hifive

    chromosome = 'chr' + a_chr
    # Floor division keeps the label an integer ('1mb', not '1.0mb') regardless
    # of whether true division is in effect.
    if bin_size >= 1000000:
        resolution_label = str(bin_size // 1000000) + 'mb_'
    else:
        resolution_label = str(bin_size // 1000) + 'kb_'
    output_filename = 'HiCtool_' + chromosome + '_' + resolution_label

    # Number of whole bins per chromosome, keyed by the first column of the
    # chrom.sizes table.
    d_chr_dim = {}
    sizes_path = parameters['chromSizes_path'] + species + '.chrom.sizes'
    with open(sizes_path, 'r') as chromosomes:
        for line in chromosomes:
            line2list = line.split('\n')[0].split('\t')
            if len(line2list) < 2:
                # Blank or malformed line (e.g. a trailing newline): skip it.
                continue
            d_chr_dim[line2list[0]] = int(line2list[1]) // bin_size

    # Last whole-bin boundary of the chromosome.
    end_pos = d_chr_dim[a_chr] * bin_size

    hic = hifive.HiC(input_file)
    heatmap_raw = hic.cis_heatmap(chrom=chromosome,
                                  start=0,
                                  stop=end_pos,
                                  binsize=bin_size,
                                  arraytype='full',
                                  datatype='raw')
    # First layer of the last axis is taken as the observed matrix.
    observed = heatmap_raw[:, :, 0]

    if save_file:
        save_matrix(observed, output_filename + 'observed.txt')
    return observed
def main():
    """Record how many fends survive increasingly strict interaction filters.

    Command-line arguments (``sys.argv[1:3]``): an input prefix and an output
    file name. ``<prefix>.hcp`` is loaded if it exists; otherwise it is
    created from ``<prefix>.hcd`` and saved. The first value written is the
    length of the filter array; each following value is the sum of the filter
    after ``filter_fends`` with ``mininteractions`` = 1, 2, 3, ... The sweep
    stops once the sum reaches zero, and all values are written with
    ``numpy.savetxt`` as int32.
    """
    in_prefix, out_fname = sys.argv[1:3]
    data_fname = "%s.hcd" % in_prefix
    project_fname = "%s.hcp" % in_prefix

    if os.path.exists(project_fname):
        hic = hifive.HiC(project_fname)
    else:
        hic = hifive.HiC(project_fname, 'w')
        hic.load_data(data_fname)
        hic.save()

    # Start every sweep from an all-valid filter.
    hic.filter.fill(1)
    results = [hic.filter.shape[0]]

    threshold = 1
    while True:
        hic.filter_fends(mininteractions=threshold, mindistance=0, maxdistance=0)
        remaining = numpy.sum(hic.filter)
        results.append(remaining)
        threshold += 1
        if remaining == 0:
            break

    numpy.savetxt(out_fname, numpy.array(results, dtype=numpy.int32))
def HicSweep(prefix):
    """Run ``filter_fends`` over a grid of interaction/distance thresholds.

    For every combination of ``ixn`` (``low_ixn`` .. ``high_ixn`` in steps of
    ``step_ixn``) and ``dist`` (``low_dist`` .. ``high_dist`` in steps of
    ``step_dist``), the files matching ``../hifive/unprocessed_hic/<prefix>*``
    are copied into a scratch directory ``tmp_parameter_sweep``, the project
    ``<prefix>_hic.hdf5`` is opened from there and filtered, and the scratch
    directory is removed again. The parameters of each run are printed to
    stdout.

    Arguments:
        prefix (str): file-name prefix of the unprocessed HiC files.
    """
    # Local imports: these modules replace the shell commands previously run
    # through os.system, so the prefix is never interpreted by a shell.
    import glob
    import shutil

    workdir = 'tmp_parameter_sweep'
    source_pattern = '../hifive/unprocessed_hic/%s*' % (prefix)

    for ixn in range(low_ixn, high_ixn + step_ixn, step_ixn):
        for dist in range(low_dist, high_dist + step_dist, step_dist):
            if not os.path.isdir(workdir):
                os.mkdir(workdir)
            try:
                # Like plain ``cp`` (no -r): regular files only.
                for path in glob.glob(source_pattern):
                    if os.path.isfile(path):
                        shutil.copy(path, workdir)
                hic = hifive.HiC(os.path.join(workdir, '%s_hic.hdf5' % (prefix)))
                print('mininteraction=%s, mindistance=%s' % (ixn, dist))
                hic.filter_fends(mininteractions=ixn, mindistance=dist)
            finally:
                # Always discard the scratch copy, even if loading or
                # filtering failed, so the next iteration starts clean.
                shutil.rmtree(workdir, ignore_errors=True)
def worker():
    """Non-root MPI process: compute heatmaps, then pairwise correlations.

    Protocol with the root process (rank 0), in order:
      1. receive ``args`` and ``bounds`` by broadcast, and the chromosome
         index pairs assigned to this process;
      2. for each pair, compute the trans heatmap and send it flattened,
         tagged with ``X * len(args.CHROMS) + Y``;
      3. receive ``N``, the (N, N, 2) data array, the number of bin pairs
         ``M`` and the two index arrays of length ``M``;
      4. for each bin pair, correlate the two rows over the columns where the
         second layer is positive in both, and send back an (M, 2) array
         holding the correlation and a validity flag.

    Returns:
        None
    """
    args = comm.bcast(None, root=0)
    hic = hifive.HiC(args.HIC)
    bounds = comm.bcast(None, root=0)
    indices0 = comm.recv(source=0)
    indices1 = comm.recv(source=0)
    for i in range(indices0.shape[0]):
        X = indices0[i]
        Y = indices1[i]
        chrom = args.CHROMS[X]
        chrom2 = args.CHROMS[Y]
        data = hic.trans_heatmap(chrom,
                                 chrom2,
                                 binsize=args.BINSIZE,
                                 start1=bounds[chrom][0, 0],
                                 stop1=bounds[chrom][-1, 1],
                                 start2=bounds[chrom2][0, 0],
                                 stop2=bounds[chrom2][-1, 1],
                                 datatype=args.DATATYPE)
        comm.Send(data.flatten(), dest=0, tag=(X * len(args.CHROMS) + Y))

    N = comm.bcast(None, root=0)
    data = numpy.zeros((N, N, 2), dtype=numpy.float64)
    comm.Bcast(data, root=0)
    M = comm.recv(source=0)
    indices0 = numpy.zeros(M, dtype=numpy.int32)
    indices1 = numpy.zeros(M, dtype=numpy.int32)
    comm.Recv(indices0, source=0)
    comm.Recv(indices1, source=0)
    data2 = numpy.zeros((M, 2), dtype=numpy.float64)
    for i in range(indices0.shape[0]):
        X = indices0[i]
        Y = indices1[i]
        try:
            shared = numpy.where((data[X, :, 1] > 0) & (data[Y, :, 1] > 0))[0]
            # NOTE(review): the root process in main() uses N / 10. for its
            # own share of the pairs; confirm which threshold is intended.
            if shared.shape[0] < N / 2.:
                continue
            corr = numpy.corrcoef(data[X, shared, 0], data[Y, shared, 0])[0, 1]
        except Exception:
            # Best effort: a pair that cannot be correlated stays flagged as
            # invalid. Unlike a bare ``except``, this lets KeyboardInterrupt
            # and SystemExit stop the worker.
            continue
        # ``corr != numpy.nan`` is always True, so test finiteness explicitly;
        # this rejects both NaN and +/-inf.
        if numpy.isfinite(corr):
            data2[i, 0] = corr
            data2[i, 1] = 1
    comm.Send(data2, dest=0)
    del data
    del data2
    return None
'--chroms', dest="chroms", type=str, action='store', default='', help= "A Comma-separated list of chromosomes to use. Defaults to Numbered chromosomes up to 22 (fewer if appropriate) and X." ) return parser if __name__ == "__main__": parser = generate_parser() args = parser.parse_args() quasar = QuasarNoise(args.outfile, mode='w') hic = hifive.HiC(args.hic) resolutions = args.resolution.split(',') for i in range(len(resolutions)): resolutions[i] = int(resolutions[i]) noises = args.noise.split(',') for i in range(len(noises)): noises[i] = float(noises[i]) if args.chroms == '': chroms = [] else: chroms = args.chroms.split(',') quasar.find_transformation(hic, args.model, chroms, resolutions, noises, args.coverage) quasar.save() quasar.close()
Bin = []
Enriched = []

# Data input: collect the midpoint of every interval on chr17 that lies
# inside the 15.0-17.5 Mb window. The file is closed as soon as it is read.
with open(sys.argv[1]) as File:
    for line in File:
        fields = line.strip("\r\n").split("\t")
        if (fields[0] == "chr17" and int(fields[1]) >= 15000000
                and int(fields[2]) <= 17500000):
            # Each coordinate is halved separately with floor division, so
            # the midpoint stays an integer under Python 2 and Python 3.
            Value = (int(fields[1]) // 2) + (int(fields[2]) // 2)
            Mid.append(Value)

# HiC: fend-corrected heatmap of the same window in 10 kb bins.
hic = hifive.HiC('hifive_output.hcp')
data = hic.cis_heatmap(chrom="chr17",
                       start=15000000,
                       stop=17500000,
                       binsize=10000,
                       datatype='fend',
                       arraytype='full')
# Rescale layer 1 so that its total matches the total of layer 0, then divide
# layer 0 by layer 1 wherever layer 1 is positive.
data[:, :, 1] *= np.sum(data[:, :, 0]) / np.sum(data[:, :, 1])
where = np.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
data = data[:, :, 0]

# Processing: convert each midpoint to its 10 kb bin number within the
# window. Floor division keeps the bin numbers integral.
for m in Mid:
    Bin.append((m - 15000000) // 10000)
if i == 0: continue fields = line.rstrip("\r\n").split("\t") chrom = fields[0] start = int(fields[1]) end = int(fields[2]) if chrom == "chr17" and start >= 15000000 and end <= 17500000: midpoint = ((end - start)/2) + start midpoint_list.append(midpoint) # print(len(midpoint_list)) hic = hifive.HiC('week13.hcp') data = hic.cis_heatmap(chrom='chr17', start=15000000, stop=17500000, binsize=10000, datatype='fend', arraytype='full') data[:, :, 1] *= numpy.sum(data[:, :, 0]) / numpy.sum(data[:, :, 1]) where = numpy.where(data[:, :, 1] > 0) data[where[0], where[1], 0] /= data[where[0], where[1], 1] data = data[:, :, 0] # print(data) bin_list = [] for j in midpoint_list: i = (j - 15000000)/10000 bin_list.append(i) CTCF= numpy.unique(bin_list)
def run_hifive(self, parameters):
    """Create the HiFive objects needed by the selected normalization model.

    ``parameters`` must provide the keys 'fend_file', 'bam_file_1',
    'bam_file_2', 'model' and 'restriction_enzyme' (a comma-separated list,
    optionally wrapped in square brackets).

    Both supported models ('Yaffe-Tanay' and 'Hi-Corrector') first write
    'fend_object.hdf5', 'HiC_data_object.hdf5' and 'HiC_project_object.hdf5'
    in the current directory. 'Yaffe-Tanay' then filters fends, fits the
    distance function and learns binning corrections, saving the result to
    'HiC_norm_binning.hdf5'. Any other model value does nothing.
    """
    fend_file = parameters['fend_file']
    bam_file_1 = parameters['bam_file_1']
    bam_file_2 = parameters['bam_file_2']
    model = parameters['model']
    # A list (not a lazy ``map`` object) so this also works on Python 3.
    # Joining a single name yields that name unchanged, so no special case is
    # needed for one enzyme.
    restriction_enzymes = [
        str(name)
        for name in parameters['restriction_enzyme'].strip('[]').split(',')
    ]
    restriction_enzyme = ','.join(restriction_enzymes)

    def build_project():
        """Write the Fend, HiCData and HiC project files shared by both models."""
        # Creating a Fend object
        fend = hifive.Fend('fend_object.hdf5', mode='w')
        fend.load_fends(fend_file, re_name=restriction_enzyme, format='bed')
        fend.save()

        # Creating a HiCData object
        data = hifive.HiCData('HiC_data_object.hdf5', mode='w')
        data.load_data_from_bam('fend_object.hdf5', [bam_file_1, bam_file_2],
                                maxinsert=500,
                                skip_duplicate_filtering=False)
        data.save()

        # Creating a HiC Project object
        hic = hifive.HiC('HiC_project_object.hdf5', 'w')
        hic.load_data('HiC_data_object.hdf5')
        hic.save()

    if model == 'Yaffe-Tanay':
        build_project()

        # Filtering HiC fends
        hic = hifive.HiC('HiC_project_object.hdf5')
        hic.filter_fends(mininteractions=1, mindistance=0, maxdistance=0)
        hic.save()

        # Finding HiC distance function
        hic = hifive.HiC('HiC_project_object.hdf5')
        hic.find_distance_parameters(numbins=90, minsize=200, maxsize=0)
        hic.save()

        # Learning correction parameters using the binning algorithm
        hic = hifive.HiC('HiC_project_object.hdf5')
        hic.find_binning_fend_corrections(
            max_iterations=1000,
            mindistance=500000,
            maxdistance=0,
            num_bins=[20, 20, 20, 20],
            model=['len', 'distance', 'gc', 'mappability'],
            parameters=['even', 'even', 'even', 'even'],
            usereads='cis',
            learning_threshold=1.0)
        hic.save('HiC_norm_binning.hdf5')

    elif model == 'Hi-Corrector':
        build_project()
import hifive
import numpy as np
import matplotlib.pyplot as plt

# Open the HiFive project read-only and request the full fend heatmap for
# chr13 with the diagonal included.
# NOTE(review): 1000000 is passed as the second positional argument of
# cis_heatmap; confirm it lands on the intended parameter.
project_path = '/Users/xiangning/qbb2020-answers/HW9/project_step3'
hic = hifive.HiC(project_path, 'r')
data = hic.cis_heatmap('chr13',
                       1000000,
                       datatype='fend',
                       arraytype='full',
                       diagonalincluded=True)

# Ratio of the first layer to the second; undefined ratios (NaN) are set to 1.
first_layer = data[:, :, 0]
second_layer = data[:, :, 1]
enrichment = first_layer / second_layer
inds = np.where(np.isnan(enrichment))
enrichment[inds] = 1
print("The shape of the data is ", data.shape)

# Plot the log2 ratio as a square image.
plt.rcParams["figure.figsize"] = (10, 10)
log2_enrichment = np.log2(enrichment)
plt.imshow(log2_enrichment)
plt.title('enrichment_heatmap')
plt.savefig('enrichment_heatmap.png')

# Compartment analysis, part I: score chr13 with a bin size argument of
# 100000 and write the scores to a BED file.
plt.rcParams["figure.figsize"] = (20, 3)
Comp = hifive.hic_domains.Compartment(hic,
                                      100000,
                                      chroms=['chr13'],
                                      out_fname='tmp.hdf5')
Comp.write_eigen_scores('hic_comp.bed')
print(Comp)
X = Comp.positions['chr13']
def main():
    """Merge binning corrections from several HiC projects and bin each chromosome.

    Command-line arguments (``sys.argv[1:4]``): a glob pattern selecting HiC
    project files, the HDF5 output file name, and the bin size. Returns
    ``None`` immediately when the pattern matches no files.

    Rank 0 creates the output file, takes the fend layout from the first
    project, accumulates per-bin read counts over all projects, stores the
    per-row median of the projects' binning corrections, and broadcasts the
    shared arrays. Other ranks receive the shapes and arrays by broadcast and
    use ``None`` for the output file and for every chromosome's counts.
    Finally every rank calls ``find_bin_probabilities`` per chromosome.
    """
    pattern, hdf5_fname, binsize = sys.argv[1:4]
    fnames = glob.glob(pattern)
    if len(fnames) == 0:
        return None
    binsize = int(binsize)
    if rank == 0:
        outfile = h5py.File(hdf5_fname, 'w')
        for i in range(len(fnames)):
            hic = hifive.HiC(fnames[i])
            if i == 0:
                # The fend layout is read from the first project only.
                # NOTE(review): presumably all projects share the same fend
                # file -- confirm.
                fends = hic.fends['fends'][...]
                chr_indices = hic.fends['chr_indices'][...]
                chromosomes = hic.fends['chromosomes'][...]
                chrom_sizes = hic.fends['chrom_sizes'][...]
                mappings = []
                counts = {}
                for j in range(chr_indices.shape[0] - 1):
                    # Chromosomes without fends get placeholder entries.
                    if chr_indices[j + 1] - chr_indices[j] == 0:
                        mappings.append(None)
                        counts[chromosomes[j]] = None
                        continue
                    # Bin number of every fend midpoint, relative to the bin
                    # containing the chromosome's first fend.
                    # NOTE(review): relies on '/' being integer division
                    # (Python 2 semantics).
                    start = (fends['mid'][chr_indices[j]] / binsize) * binsize
                    mappings.append(
                        (fends['mid'][chr_indices[j]:chr_indices[j + 1]] -
                         start) / binsize)
                    N = mappings[-1][-1] + 1
                    counts[chromosomes[j]] = numpy.zeros(N, dtype=numpy.int64)
                outfile.create_dataset(name='binning_fend_indices',
                                       data=hic.binning_fend_indices)
                outfile.create_dataset(name='binning_num_bins',
                                       data=hic.binning_num_bins)
                # One column of corrections per input project.
                corrections = numpy.zeros(
                    (hic.binning_corrections.shape[0], len(fnames)),
                    dtype=hic.binning_corrections.dtype)
            corrections[:, i] = hic.binning_corrections
            reads = hic.data['cis_data']
            for j in range(chr_indices.shape[0] - 1):
                if mappings[j] is None:
                    continue
                start = hic.data['cis_indices'][chr_indices[j]]
                stop = hic.data['cis_indices'][chr_indices[j + 1]]
                chrom = chromosomes[j]
                # Tally cis reads per bin, using the first column of each read.
                counts[chrom] += numpy.bincount(
                    mappings[j][reads[start:stop, 0] - chr_indices[j]],
                    minlength=counts[chrom].shape[0])
        # Store the median correction across projects for each row.
        outfile.create_dataset(name='binning_corrections',
                               data=numpy.median(corrections, axis=1))
        chr2int = {}
        for i, chrom in enumerate(chromosomes):
            chr2int[chrom] = i
        # Fixed processing order: 1-23, then X, 2L, 2R, 3L, 3R, keeping only
        # names present in the project.
        chroms = []
        for i in range(1, 24):
            if str(i) in chromosomes:
                chroms.append(str(i))
        for chrom in ['X', '2L', '2R', '3L', '3R']:
            if chrom in chromosomes:
                chroms.append(chrom)
        lengths = numpy.zeros(len(chroms), dtype=numpy.int32)
        for i, chrom in enumerate(chroms):
            chrint = chr2int[chrom]
            lengths[i] = chrom_sizes[chrint]
            # Bin-aligned bounds of the chromosome's first and last fend
            # midpoints, stored as '<chrom>.start' / '<chrom>.stop' attributes.
            start = (fends['mid'][chr_indices[chrint]] / binsize) * binsize
            stop = ((fends['mid'][chr_indices[chrint + 1] - 1] - 1) / binsize +
                    1) * binsize
            outfile.attrs['%s.start' % chrom] = start
            outfile.attrs['%s.stop' % chrom] = stop
        outfile.create_dataset(name='chromosomes', data=numpy.array(chroms))
        outfile.create_dataset(name='chrom_sizes', data=lengths)
        outfile.attrs['binsize'] = binsize
        binning_corrections = outfile['binning_corrections'][...]
        binning_num_bins = outfile['binning_num_bins'][...]
        fend_indices = outfile['binning_fend_indices'][...]
        # Send array shapes first so other ranks can allocate receive buffers.
        # NOTE(review): these bcast calls are not guarded by
        # ``comm is not None``, unlike the Bcast calls below -- confirm that
        # running without MPI is supported.
        S1, S2, S3 = comm.bcast((binning_corrections.shape,
                                 binning_num_bins.shape, fend_indices.shape),
                                root=0)
        chroms = comm.bcast(chroms, root=0)
        chr2int = comm.bcast(chr2int, root=0)
        fends = comm.bcast(fends, root=0)
        chr_indices = comm.bcast(chr_indices, root=0)
    else:
        outfile = None
        # Receive in the same order as rank 0 broadcasts.
        S1, S2, S3 = comm.bcast(None, root=0)
        chroms = comm.bcast(None, root=0)
        chr2int = comm.bcast(None, root=0)
        fends = comm.bcast(None, root=0)
        chr_indices = comm.bcast(None, root=0)
        binning_corrections = numpy.zeros(S1, dtype=numpy.float32)
        binning_num_bins = numpy.zeros(S2, dtype=numpy.int32)
        fend_indices = numpy.zeros(S3, dtype=numpy.int32)
        # Only rank 0 holds read counts.
        counts = {}
        for chrom in chroms:
            counts[chrom] = None
    if comm is not None:
        # NOTE(review): the receive buffers above are allocated as
        # float32/int32; presumably this matches the dtypes stored by rank 0
        # -- confirm.
        comm.Bcast(binning_corrections, root=0)
        comm.Bcast(binning_num_bins, root=0)
        comm.Bcast(fend_indices, root=0)
    for chrom in chroms:
        find_bin_probabilities(chrom, outfile, fends, chr_indices, binsize,
                               chr2int, binning_corrections, binning_num_bins,
                               fend_indices, counts[chrom])
    if rank == 0:
        outfile.close()
        # Blank out the 80-column status line on stderr.
        print >> sys.stderr, ("\r%s\r") % (" " * 80),
# '17':94987271, # '18':90702639, # '19':61431566, # 'X':171031299, # 'Y':91744698} chromosome = 'chr' + ch start_pos = 0 end_pos = (chromosomes[ch] / 1000000) * 1000000 start_part = str(float(start_pos) / float(1000000)) end_part = str(float(end_pos) / float(1000000)) binsize_str = str(float(bin_size) / float(1000000)) # Enrichment data hic = hifive.HiC(HiC_norm_binning_hdf5_file) heatmap_enrich = hic.cis_heatmap(chrom=chromosome, start=start_pos, stop=end_pos, binsize=bin_size, arraytype='full', datatype='enrichment') # Observed data observed = heatmap_enrich[:, :, 0] # observed contact data extracted from the heatmap object n = len(observed) save_matrix( n, observed, outdir + '/HiCtool_observed_contact_matrix_' + chromosome + '_' + binsize_str + 'mb_' + start_part + 'mb_' + end_part + 'mb.txt')
import hifive peaks=[] bins =[] #bins= np.unique(bins) bins1l = [] bins2l = [] enrichedl = [] for line in open(sys.argv[1]): fields = line.strip("\r\n").split("\t") if fields[0] == "chr17": if int(fields[1]) >= 15000000 and int(fields[2]) <= 17500000: peaks.append((int(fields[1])+int(fields[2]))/2) hic = hifive.HiC("hifive_output.hcp") data = hic.cis_heatmap(chrom="chr17", start=15000000, stop=17500000, binsize=10000, datatype='fend', arraytype='full') data[:, :, 1] *= np.sum(data[:, :, 0]) / np.sum(data[:, :, 1]) where = np.where(data[:, :, 1] > 0) data[where[0], where[1], 0] /= data[where[0], where[1], 1] data = data[:, :, 0] for value in peaks: i=(value - 15000000)/10000 bins.append(i) for i in range(len(bins)): for j in range(i,len(bins)): enrichment = float(data[bins[i],bins[j]]) if enrichment >= 1: enrichedl.append(
<CTCF file> - A table with the start and end positions of CTCF binding sites This script takes hifive data and incorporates it with ChIP-seq data to find regions of the genome where your protein of interest binds and creates interactions between DNA. In this case, it looks at a specific region of mouse chr17 to finds all of the hifive interaction enrichments greater than 1 that have at least one CTCF peak at both ends. """ import sys import numpy as np import hifive import pandas as pd # This is a file output from hifive with interactions between regions of a genome hic = hifive.HiC('hifive.hcp') """ PART 1 Make a 2D enrichment matrix from the hifive file. All of this code was written by Mike Sauria. """ # Get data into numpy 3D array data = hic.cis_heatmap(chrom='chr17', start=15000000, stop=17500000, binsize=10000, \ datatype='fend', arraytype='full') # Make square enrichment matrix data[:, :, 1] *= np.sum(data[:, :, 0]) / np.sum(data[:, :, 1]) # Finds bins where expected value is > 0 and only preserve those bins in the data matrix where = np.where(data[:, :, 1] > 0)
def generate_interchromosomal_observed_data(chr_row, chr_col, bin_size, input_file, species='hg38', save_file=False):
    """
    Generate an observed interchromosomal contact matrix from HiC_project_object.hdf5

    Arguments:
        chr_row (str): chromosome number for the rows (example for chromosome 1: '1').
        chr_col (str): chromosome number for the columns (example for chromosome 1: '1').
        bin_size (int): bin size in bp of the contact matrix.
        input_file (str): object containing learned correction parameters in
            hdf5 format obtained with HiCtool_hifive.py
            (default: 'HiC_project_object.hdf5').
        species (str): 'hg38' or 'mm10' or any other species label in string
            format; selects ``<species>.chrom.sizes`` under
            ``parameters['chromSizes_path']``.
        save_file (bool): if True, save the observed contact data.
    Return:
        observed interchromosomal contact matrix in numpy array format.
    Output:
        observed interchromosomal contact matrix in HiCtool compressed format
        if "save_file=True".
    Raises:
        KeyError: if ``chr_row`` or ``chr_col`` is not listed in the
            chrom.sizes file.
    """
    import hifive

    chromosome_row = 'chr' + chr_row
    chromosome_col = 'chr' + chr_col

    # Floor division keeps the label an integer ('1mb', not '1.0mb') regardless
    # of whether true division is in effect.
    if bin_size >= 1000000:
        resolution_label = str(bin_size // 1000000) + 'mb_'
    else:
        resolution_label = str(bin_size // 1000) + 'kb_'
    output_filename = ('HiCtool_' + chromosome_row + '_' + chromosome_col +
                       '_' + resolution_label)

    # Number of whole bins per chromosome, keyed by the first column of the
    # chrom.sizes table.
    d_chr_dim = {}
    sizes_path = parameters['chromSizes_path'] + species + '.chrom.sizes'
    with open(sizes_path, 'r') as chromosomes:
        for line in chromosomes:
            line2list = line.split('\n')[0].split('\t')
            if len(line2list) < 2:
                # Blank or malformed line (e.g. a trailing newline): skip it.
                continue
            d_chr_dim[line2list[0]] = int(line2list[1]) // bin_size

    # Last whole-bin boundary of each chromosome.
    end_pos_row = d_chr_dim[chr_row] * bin_size
    end_pos_col = d_chr_dim[chr_col] * bin_size

    hic = hifive.HiC(input_file)
    heatmap_raw = hic.trans_heatmap(chromosome_row,
                                    chromosome_col,
                                    start1=0,
                                    stop1=end_pos_row,
                                    start2=0,
                                    stop2=end_pos_col,
                                    binsize=bin_size,
                                    datatype='raw')
    # First layer of the last axis is taken as the observed matrix.
    observed = heatmap_raw[:, :, 0]
    row = observed.shape[0]
    col = observed.shape[1]

    if save_file:
        # The matrix is rectangular, so its dimensions go into the file name.
        output_filename = output_filename + str(row) + 'x' + str(col) + '_'
        save_matrix_rectangular(observed, output_filename + 'observed.txt')
    return observed
if int(col[1]) >= 5000000 and int(col[2]) <= 40000000: index = (int(col[2]) - 5000000) / 5000 rnal[index] = float(col[4]) for i, line in enumerate(activity): if i == 0: continue col = line.rstrip("\n").split("\t") if int(col[1]) >= 5000000 and int(col[2]) <= 40000000: index = (int(col[2]) - 5000000) / 5000 activityl[index] = float(col[4]) rnaa = numpy.array(rnal) activitya = numpy.array(activityl) hic = hifive.HiC('PROJECT', 'r') data = hic.cis_heatmap(chrom='chr10', start=5000000, stop=40000000, binsize=5000, datatype='fend', arraytype='full') where = numpy.where(data[:, :, 1] > 0) data[where[0], where[1], 0] /= data[where[0], where[1], 1] data = numpy.log(data[:, :, 0] + 0.1) data -= numpy.amin(data) data_subset = data[numpy.where(rnaa > 0), :] sum_data_subset = numpy.sum(data_subset, axis=1) R = numpy.corrcoef(sum_data_subset, rnaa)[0, 1] print(R)
def main(args):
    """Root-process driver: trans heatmaps -> bin correlations -> eigenvector.

    Steps, in order:
      1. Determine, per chromosome, the bin-aligned range spanned by its valid
         fends (chromosomes with fewer than two valid fends are dropped with
         a message on stderr).
      2. Fill a genome-wide (bins x bins x 2) array with trans heatmaps for
         every chromosome pair, sharing the pairs with the worker processes
         when MPI is available.
      3. Zero out sparse rows, log-transform the layer-0/layer-1 ratio, clip
         it to the 5th-95th percentile and mean-centre it.
      4. Correlate every pair of bins (again shared with the workers), clip
         and rescale the correlations, and take the leading eigenvector of
         the matrix restricted to well-covered rows.
      5. Write ``<OUTPUT>.bg`` (one score per bin) and ``<OUTPUT>.bed`` (runs
         of bins sharing a chromosome and eigenvector sign) and, if
         ``args.PLOT`` is set, ``<OUTPUT>_enr.png`` and ``<OUTPUT>_comp.png``.

    Arguments:
        args: parsed options providing HIC, CHROMS, BINSIZE, DATATYPE, PLOT
            and OUTPUT. ``args.CHROMS`` is replaced by the sorted list of
            chromosomes that were actually used.
    """
    hic = hifive.HiC(args.HIC)
    if args.CHROMS == '':
        args.CHROMS = hic.fends['chromosomes'][...]
    else:
        args.CHROMS = args.CHROMS.split(',')
    bounds = {}
    args.CHROMS.sort()
    new_chr_indices = [0]
    if 'binned' in hic.__dict__ and hic.binned is not None:
        fends = hic.fends['bins'][...]
        chr_indices = hic.fends['bin_indices'][...]
    else:
        fends = hic.fends['fends'][...]
        chr_indices = hic.fends['chr_indices'][...]
    for chrom in args.CHROMS:
        chrint = hic.chr2int[chrom]
        sfend = chr_indices[chrint]
        efend = chr_indices[chrint + 1]
        valid = numpy.where(hic.filter[sfend:efend])[0] + sfend
        if valid.shape[0] < 2:
            sys.stderr.write("Insufficient information for %s\n" % (chrom))
            continue
        # Bin-aligned bounds around the first and last valid midpoints. Floor
        # division keeps the coordinates integral on Python 2 and 3.
        sbin = (fends['mid'][valid[0]] // args.BINSIZE) * args.BINSIZE
        ebin = (fends['mid'][valid[-1]] // args.BINSIZE + 1) * args.BINSIZE
        N = (ebin - sbin) // args.BINSIZE
        bounds[chrom] = numpy.zeros((N, 2), dtype=numpy.int32)
        bounds[chrom][:, 0] = numpy.arange(N) * args.BINSIZE + sbin
        bounds[chrom][:, 1] = bounds[chrom][:, 0] + args.BINSIZE
        new_chr_indices.append(new_chr_indices[-1] + N)
    # sorted() returns a list on both Python 2 and 3 (dict views cannot be
    # sorted in place).
    args.CHROMS = sorted(bounds.keys())
    if comm is not None:
        # Guarded like every other MPI call below so that the script can also
        # run without MPI.
        args = comm.bcast(args, root=0)
    chr_indices = new_chr_indices
    data = numpy.zeros((chr_indices[-1], chr_indices[-1], 2),
                       dtype=numpy.float64)
    # mapping[row] = (chromosome index, bin start, bin end)
    mapping = numpy.zeros((chr_indices[-1], 3), dtype=numpy.int32)
    for i, chrom in enumerate(args.CHROMS):
        mapping[chr_indices[i]:chr_indices[i + 1], 0] = i
        mapping[chr_indices[i]:chr_indices[i + 1], 1:] = bounds[chrom]

    # ---- Phase 1: trans heatmaps for every chromosome pair ----
    indices = list(numpy.triu_indices(len(args.CHROMS), 1))
    if comm is not None:
        bounds = comm.bcast(bounds, root=0)
        node_ranges = numpy.round(
            numpy.linspace(0, indices[0].shape[0],
                           num_procs + 1)).astype(numpy.int32)
        for i in range(1, num_procs):
            comm.send(indices[0][node_ranges[i]:node_ranges[i + 1]], dest=i)
            comm.send(indices[1][node_ranges[i]:node_ranges[i + 1]], dest=i)
        indices0 = indices[0][:node_ranges[1]]
        indices1 = indices[1][:node_ranges[1]]
    else:
        indices0 = indices[0]
        indices1 = indices[1]
    for i in range(indices0.shape[0]):
        X = indices0[i]
        Y = indices1[i]
        chrom = args.CHROMS[X]
        chrom2 = args.CHROMS[Y]
        data[chr_indices[X]:chr_indices[X + 1],
             chr_indices[Y]:chr_indices[Y + 1], :] = hic.trans_heatmap(
                 chrom,
                 chrom2,
                 binsize=args.BINSIZE,
                 start1=mapping[chr_indices[X], 1],
                 stop1=mapping[chr_indices[X + 1] - 1, 2],
                 start2=mapping[chr_indices[Y], 1],
                 stop2=mapping[chr_indices[Y + 1] - 1, 2],
                 datatype=args.DATATYPE)
    if comm is not None:
        # Collect the blocks computed by the workers; the message tag
        # identifies the chromosome pair.
        for i in range(1, num_procs):
            for j in range(node_ranges[i], node_ranges[i + 1]):
                X = indices[0][j]
                Y = indices[1][j]
                temp = numpy.zeros((chr_indices[X + 1] - chr_indices[X]) *
                                   (chr_indices[Y + 1] - chr_indices[Y]) * 2,
                                   dtype=numpy.float32)
                comm.Recv(temp, source=i, tag=(X * len(args.CHROMS) + Y))
                data[chr_indices[X]:chr_indices[X + 1],
                     chr_indices[Y]:chr_indices[Y + 1], :] = temp.reshape(
                         chr_indices[X + 1] - chr_indices[X], -1, 2)

    # ---- Phase 2: symmetrise, filter and transform the enrichment ----
    N = data.shape[0]
    indices = list(numpy.triu_indices(N, 1))
    data[indices[1], indices[0], :] = data[indices[0], indices[1], :]
    valid = numpy.sum(data[:, :, 0], axis=1) > mapping.shape[0] / 2.
    ivrows = numpy.where(numpy.logical_not(valid))[0]
    data[ivrows, :, :] = 0
    data[:, ivrows, :] = 0
    if args.PLOT:
        img = hifive.plotting.plot_full_array(data, symmetricscaling=False)
        img.save("%s_enr.png" % args.OUTPUT)
    where = numpy.where((data[:, :, 0] > 0) & (data[:, :, 1] > 0))
    data[where[0], where[1], 0] /= data[where[0], where[1], 1]
    data[where[0], where[1], 1] = 1
    data[where[0], where[1], 0] = numpy.log(data[where[0], where[1], 0])
    scores = data[where[0], where[1], 0]
    scores.sort()
    # Clip to the 5th-95th percentile, then mean-centre.
    data[where[0], where[1], 0] = numpy.maximum(
        scores[int(scores.shape[0] * 0.05)],
        numpy.minimum(scores[int(scores.shape[0] * 0.95)],
                      data[where[0], where[1], 0]))
    data[where[0], where[1], 0] -= numpy.mean(data[where[0], where[1], 0])

    # ---- Phase 3: pairwise bin correlations ----
    data2 = numpy.zeros(data.shape, dtype=data.dtype)
    indices[0] = indices[0].astype(numpy.int32)
    indices[1] = indices[1].astype(numpy.int32)
    if comm is not None:
        N = comm.bcast(N, root=0)
        comm.Bcast(data, root=0)
        node_ranges = numpy.round(
            numpy.linspace(0, indices[0].shape[0],
                           num_procs + 1)).astype(numpy.int32)
        for i in range(1, num_procs):
            comm.send(node_ranges[i + 1] - node_ranges[i], dest=i)
            comm.Send(indices[0][node_ranges[i]:node_ranges[i + 1]], dest=i)
            comm.Send(indices[1][node_ranges[i]:node_ranges[i + 1]], dest=i)
        indices0 = indices[0][:node_ranges[1]]
        indices1 = indices[1][:node_ranges[1]]
    else:
        indices0, indices1 = indices
    for i in range(indices0.shape[0]):
        sys.stderr.write("\r%s\rCorrelating %i of %i bins" %
                         (' ' * 50, i, indices0.shape[0]))
        X = indices0[i]
        Y = indices1[i]
        try:
            shared = numpy.where((data[X, :, 1] > 0) &
                                 (data[Y, :, 1] > 0))[0]
            if shared.shape[0] < N / 10.:
                continue
            corr = numpy.corrcoef(data[X, shared, 0], data[Y, shared, 0])[0, 1]
        except Exception:
            # Best effort: a pair that cannot be correlated stays flagged as
            # invalid. Unlike a bare ``except``, this lets KeyboardInterrupt
            # and SystemExit through.
            continue
        # ``corr != numpy.nan`` is always True, so test finiteness explicitly;
        # this rejects both NaN and +/-inf.
        if numpy.isfinite(corr):
            data2[X, Y, 0] = corr
            data2[X, Y, 1] = 1
    if comm is not None:
        for i in range(1, num_procs):
            temp = numpy.zeros((node_ranges[i + 1] - node_ranges[i], 2),
                               dtype=numpy.float64)
            comm.Recv(temp, source=i)
            data2[indices[0][node_ranges[i]:node_ranges[i + 1]],
                  indices[1][node_ranges[i]:node_ranges[i + 1]], :] = temp
    data2[indices[1], indices[0], :] = data2[indices[0], indices[1], :]
    where = numpy.where(data2[:, :, 1])
    scores = data2[where[0], where[1], 0]
    scores.sort()
    # Clip to the 5th-95th percentile, centre on the median and scale to
    # a maximum absolute value of 1.
    data2[where[0], where[1], 0] = numpy.maximum(
        scores[int(scores.shape[0] * 0.05)],
        numpy.minimum(scores[int(scores.shape[0] * 0.95)],
                      data2[where[0], where[1], 0])) - scores[int(
                          scores.shape[0] / 2)]
    data2[where[0], where[1], 0] /= numpy.amax(
        numpy.abs(data2[where[0], where[1], 0]))

    # ---- Phase 4: leading eigenvector over well-covered rows ----
    valid = numpy.sum(data2[:, :, 1], axis=1) >= data2.shape[0] // 2
    vrows = numpy.where(valid)[0]
    ivrows = numpy.where(numpy.logical_not(valid))[0]
    eigen = numpy.real(
        scipy.sparse.linalg.eigs(data2[vrows, :, 0][:, vrows], k=1)[1][:, 0])

    # ---- Phase 5: write bedGraph and BED output ----
    # The first interval starts at the first valid bin's start coordinate
    # (column 1 of mapping); column 0 holds the chromosome index.
    start = mapping[vrows[0], 1]
    with open("%s.bg" % args.OUTPUT, 'w') as output:
        with open("%s.bed" % args.OUTPUT, 'w') as output1:
            for i, X in enumerate(vrows):
                chrom = args.CHROMS[mapping[X, 0]]
                # bedGraph rows span the whole bin (start, end); a row with
                # start == end would describe an empty interval.
                output.write("%s\t%i\t%i\t%f\n" %
                             (chrom, mapping[X, 1], mapping[X, 2], eigen[i]))
                is_last = i == vrows.shape[0] - 1
                if (not is_last and
                        mapping[X, 0] == mapping[vrows[i + 1], 0] and
                        numpy.sign(eigen[i]) == numpy.sign(eigen[i + 1])):
                    # Same chromosome and same sign: the run continues.
                    continue
                if eigen[i] >= 0:
                    score = 1
                    sign = '+'
                else:
                    score = -1
                    sign = '-'
                output1.write("%s\t%i\t%i\t.\t%i\t%s\n" %
                              (chrom, start, mapping[X, 2], score, sign))
                if not is_last:
                    start = mapping[vrows[i + 1], 1]

    if args.PLOT:
        # Prepend a 42-column strip showing the eigenvector as a bar chart
        # centred on column 20.
        data3 = numpy.zeros((data2.shape[0], data2.shape[0] + 42, 2),
                            dtype=data2.dtype)
        data3[:, 42:, :] = data2
        eigen /= numpy.amax(numpy.abs(eigen)) / 20.5
        for i, X in enumerate(vrows):
            data3[X, :40, 1] = 1
            if eigen[i] >= 0:
                data3[X, 20:(20 + int(round(eigen[i]))), 0] = 1
            else:
                data3[X, (20 - int(round(-eigen[i]))):20, 0] = -1
        img = hifive.plotting.plot_full_array(data3,
                                              logged=False,
                                              symmetricscaling=True)
        img.save("%s_comp.png" % args.OUTPUT)
if i == 0: continue col = line.rstrip("\n").split("\t") if int(col[1]) >= 5000000 and int(col[2])<=40000000: index = ((int(col[1]) - 5000000) / 5000) act[index]= col[4] for i, line in enumerate(bed2): if i == 0: continue col = line.rstrip("\n").split("\t") if int(col[1]) >= 5000000 and int(col[2])<=40000000: index = (int(col[1]) - 5000000) / 5000 rna[index]= col[4] hic = hifive.HiC('PROJECT_NAME', 'r') data = hic.cis_heatmap(chrom='chr10', start=5000000, stop=40000000, binsize=5000, datatype='fend', arraytype='full') where = numpy.where(data[:, :, 1] > 0) data[where[0], where[1], 0] /= data[where[0], where[1], 1] data = numpy.log(data[:, :, 0] + 0.1) data -= numpy.amin(data) int_act = {} for key1 in rna: total_act = 0 for key2 in act: total_act+=float(act[key2])*data[key1][key2] int_act[key1] = total_act rna_list = []
if not os.path.exists(outdir): os.mkdir(outdir) # Creating a Fend object fend = hifive.Fend(outdir + '/fend_object.hdf5', mode='w') fend.load_fends(RE_bed, re_name='RE', format='bed') fend.save() # Creating a HiCData object data = hifive.HiCData(outdir + '/HiC_data_object.hdf5', mode='w') data.load_data_from_bam(outdir + '/fend_object.hdf5', [bam1, bam2], maxinsert=500) data.save() # Creating a HiC Project object hic = hifive.HiC(outdir + '/HiC_project_object.hdf5', 'w') hic.load_data(outdir + '/HiC_data_object.hdf5') hic.save() # Filtering HiC fends hic = hifive.HiC(outdir + '/HiC_project_object.hdf5') hic.filter_fends(mininteractions=1, mindistance=500000, maxdistance=0) hic.save() # Finding HiC distance function hic = hifive.HiC(outdir + '/HiC_project_object.hdf5') hic.find_distance_parameters(numbins=90, minsize=200, maxsize=0) hic.save(outdir + '/HiC_distance_function.hdf5') # Learning correction parameters using the binning algorithm
def normalize_chromosome_fend_data(a_chr):
    """
    Normalize the contact data by calculating the corrected reads count for each
    bin. Observed data and expected fend data (correction data) can be saved to
    txt file.

    Settings are read from the module-level ``parameters`` mapping:
    'bin_size', 'input_file', 'save_obs', 'save_expect', 'chromSizes_path' and
    'species'.

    Arguments:
        a_chr (str): chromosome number (example for chromosome 1: '1').
    Return:
        Normalized fend contact matrix (observed / expected fend, with 0
        wherever the expected value is 0).
    Outputs:
        Txt file with the normalized enrichment contact matrix saved in the
        HiCtool compressed format.
        Txt file with the observed contact matrix saved in the HiCtool
        compressed format if "save_obs=True".
        Txt file with the expected contact matrix saved in the HiCtool
        compressed format if "save_expect=True".
    """
    import hifive
    import numpy as np

    bin_size = parameters['bin_size']
    input_file = parameters['input_file']
    save_obs = bool(parameters['save_obs'])
    save_expect = bool(parameters['save_expect'])

    chromosome = 'chr' + a_chr
    print("Normalizing fend data " + chromosome + " ...")

    # Floor division keeps the label an integer ('1mb', not '1.0mb') regardless
    # of whether true division is in effect.
    if bin_size >= 1000000:
        resolution_label = str(bin_size // 1000000) + 'mb_'
    else:
        resolution_label = str(bin_size // 1000) + 'kb_'
    output_filename = 'HiCtool_' + chromosome + '_' + resolution_label

    # Number of whole bins per chromosome, keyed by the first column of the
    # chrom.sizes table.
    d_chr_dim = {}
    sizes_path = (parameters['chromSizes_path'] + parameters['species'] +
                  '.chrom.sizes')
    with open(sizes_path, 'r') as chromosomes:
        for line in chromosomes:
            line2list = line.split('\n')[0].split('\t')
            if len(line2list) < 2:
                # Blank or malformed line (e.g. a trailing newline): skip it.
                continue
            d_chr_dim[line2list[0]] = int(line2list[1]) // bin_size

    start_pos = 0
    end_pos = d_chr_dim[a_chr] * bin_size

    # The project is loaded once and queried twice (raw and fend datatypes).
    hic = hifive.HiC(input_file)

    # Expected raw (number of possible fend interactions).
    # These are needed to scale the fend expected data by the mean fend pairs
    # in each bin.
    heatmap_raw = hic.cis_heatmap(chrom=chromosome,
                                  start=start_pos,
                                  stop=end_pos,
                                  binsize=bin_size,
                                  arraytype='full',
                                  datatype='raw')
    expected_raw = heatmap_raw[:, :, 1]
    n = len(expected_raw)
    # Mean fend pairs in each bin: half the matrix total over the number of
    # bin pairs above the diagonal.
    scaling_factor = float(np.sum(expected_raw) / 2.0) / float(n * (n - 1) / 2.0)

    # Fend data. With arraytype='full' the result is a square N x N x 2 array:
    # the observed counts are in the first index of the last dimension (the
    # same for every datatype) and the expected counts in the second.
    heatmap_fend = hic.cis_heatmap(chrom=chromosome,
                                   start=start_pos,
                                   stop=end_pos,
                                   binsize=bin_size,
                                   arraytype='full',
                                   datatype='fend')
    observed = heatmap_fend[:, :, 0]
    if save_obs:
        save_matrix(observed, output_filename + 'observed.txt')

    # Expected fend (fend corrections)
    expected_fend = heatmap_fend[:, :, 1] / scaling_factor
    if save_expect:
        save_matrix(expected_fend, output_filename + 'expected_fend.txt')

    # Normalized fend contact matrix: observed / expected where expected is
    # non-zero, 0 elsewhere. Computed as one masked array division in double
    # precision instead of a Python loop over every cell.
    n = len(expected_fend)
    normalized_fend = np.zeros((n, n))
    nonzero = expected_fend != 0
    normalized_fend[nonzero] = (observed[nonzero].astype(np.float64) /
                                expected_fend[nonzero].astype(np.float64))

    save_matrix(normalized_fend, output_filename + 'normalized_fend.txt')

    print("Done!")
    return normalized_fend
#!/usr/bin/env python2 from __future__ import division import hifive import numpy as np import matplotlib.pyplot as plt import pyBigWig # Get data hic = hifive.HiC("filtered_1.dat", 'r') chr13 = hic.cis_heatmap('chr13', 1000000, datatype='fend', arraytype='full', diagonalincluded=True) enrichment = (chr13[:, :, 0] + 1) / (chr13[:, :, 1] + 1) log_enrichment = np.log(enrichment) # Create heatmap of the log of enrichment scores fig, ax = plt.subplots(figsize=(14, 10)) ax.set_title("Heatmap of Enrichment Scores for Chr13", fontsize=20) ax = sns.heatmap(log_enrichment) plt.savefig("chr13_heatmap.png") # Compartment Analysis Comp = hifive.hic_domains.Compartment(hic, 100000, chroms=['chr13'], out_fname='tmp.hdf5') Comp.write_eigen_scores('hic_comp.bed') X = Comp.positions['chr13'] Y = Comp.eigenv['chr13']
def normalize_chromosome_enrich_data(a_chr):
    """
    Calculate the enrichment data as "observed/expected" where the expected
    reads count is for each bin considering the linear distance between read
    pairs and the learned correction parameters. Observed and expected contact
    data can be saved to txt files.

    Settings are read from the module-level ``parameters`` mapping
    ('bin_size', 'input_file', 'save_obs', 'save_expect', 'chromSizes_path',
    'species').

    Arguments:
        a_chr (str): chromosome number (example for chromosome 1: '1').
    Return:
        Normalized enrichment contact matrix (numpy array). Bins whose
        expected value is 0 are set to -1.
    Outputs:
        Txt file with the normalized enrichment contact matrix saved in the
        HiCtool compressed format.
        Txt file with the observed contact matrix saved in the HiCtool
        compressed format if "save_obs=True".
        Txt file with the expected contact matrix saved in the HiCtool
        compressed format if "save_expect=True".
    """
    import hifive
    import numpy as np

    bin_size = parameters['bin_size']
    input_file = parameters['input_file']
    save_obs = bool(parameters['save_obs'])
    save_expect = bool(parameters['save_expect'])

    print("Normalizing enrichment data...")
    chromosome = 'chr' + a_chr

    # Floor division keeps the file-name label an integer on Python 2 and 3.
    if bin_size >= 1000000:
        bin_size_str = str(bin_size // 1000000)
        output_filename = 'HiCtool_' + chromosome + '_' + bin_size_str + 'mb_'
    else:
        bin_size_str = str(bin_size // 1000)
        output_filename = 'HiCtool_' + chromosome + '_' + bin_size_str + 'kb_'

    # Number of whole bins per chromosome, read from the chrom sizes table.
    start_pos = 0
    sizes_path = (parameters['chromSizes_path'] + parameters['species']
                  + '.chrom.sizes')
    d_chr_dim = {}
    with open(sizes_path, 'r') as chromosomes:
        for line in chromosomes:
            line2list = line.split('\n')[0].split('\t')
            if len(line2list) < 2:
                continue  # skip blank or malformed lines
            d_chr_dim[line2list[0]] = int(line2list[1]) // bin_size
    end_pos = d_chr_dim[a_chr] * bin_size

    # Enrichment data
    hic = hifive.HiC(input_file)
    heatmap_enrich = hic.cis_heatmap(chrom=chromosome,
                                     start=start_pos,
                                     stop=end_pos,
                                     binsize=bin_size,
                                     arraytype='full',
                                     datatype='enrichment')

    # Observed data: first index of the last dimension of the heatmap
    observed = heatmap_enrich[:, :, 0]
    if save_obs:
        save_matrix(observed, output_filename + 'observed.txt')

    # Expected enrichment data (fend corrections and distance property):
    # second index of the last dimension of the heatmap
    expected_enrich = heatmap_enrich[:, :, 1]
    if save_expect:
        save_matrix(expected_enrich, output_filename + 'expected_enrich.txt')

    # Normalized enrichment contact matrix: observed / expected, with -1
    # marking bins that have no expected signal.
    n = len(expected_enrich)
    normalized_enrich = np.full((n, n), -1.0)
    has_expected = expected_enrich != 0
    normalized_enrich[has_expected] = (
        observed[has_expected].astype(np.float64)
        / expected_enrich[has_expected].astype(np.float64))

    save_matrix(normalized_enrich, output_filename + 'normalized_enrich.txt')
    print("Done!")
    return normalized_enrich
# Collect RNA values for records inside the 5 Mb - 40 Mb window, keyed by
# their 5 kb bin index relative to the window start.
for a, lines in enumerate(rna):
    if a == 0:
        continue  # skip the header line
    # The original stripped "/n" (the characters '/' and 'n'), a typo for the
    # newline escape.
    positionRNA = lines.rstrip("\n").split()
    if int(positionRNA[1]) >= 5000000 and int(positionRNA[2]) <= 40000000:
        # Floor division keeps the bin index an integer on Python 2 and 3.
        posRNA = (int(positionRNA[1]) - 5000000) // 5000
        rnadic[posRNA] = float(positionRNA[4])

enharray = numpy.array(enh)
rnarray = numpy.array(rnadic)

import hifive
hi = hifive.HiC('PROJECT_FNAME', 'r')
data1 = hi.cis_heatmap(chrom='chr10', start=5000000, stop=40000000,
                       binsize=5000, datatype='fend', arraytype='full')
# Divide observed (index 0) by expected (index 1) wherever expected > 0.
where = numpy.where(data1[:, :, 1] > 0)
data1[where[0], where[1], 0] /= data1[where[0], where[1], 1]
data1 = numpy.log(data1[:, :, 0] + 0.1)
# Shift so the smallest value is zero. The original subtracted
# numpy.amin(data), but no `data` is defined in this part of the script; the
# array being normalized is `data1`.
data1 -= numpy.amin(data1)
# NOTE(review): `rnaa` is not defined in the visible part of this script;
# confirm it exists upstream (it may have been meant to be `rnarray`).
data_subset = data1[numpy.where(rnaa > 0), :]
sum_data_subset = numpy.sum(data_subset, axis=1)
R = numpy.corrcoef(sum_data_subset, rnaa)[0, 1]
print(R)
#!/usr/bin/env python2
import hifive
import numpy as np
import pandas as pd
import sys

# Fend-corrected heatmap for chr17:15,000,000-17,500,000 at 10 kb bins.
hic = hifive.HiC('./normalized/normalizing.hcp')
data = hic.cis_heatmap(
    chrom='chr17',
    start=15000000,
    stop=17500000,
    binsize=10000,
    datatype='fend',
    arraytype='full',
)

# Rescale the expected channel (index 1) so that its total equals the total
# of the observed channel (index 0).
data[:, :, 1] *= np.sum(data[:, :, 0]) / np.sum(data[:, :, 1])

# Turn observed into observed/expected wherever expected is positive, then
# keep only that channel.
where = np.where(data[:, :, 1] > 0)
rows, cols = where[0], where[1]
data[rows, cols, 0] /= data[rows, cols, 1]
data = data[:, :, 0]

# Tab-separated table named on the command line; columns 1 and 2 are used as
# interval start and end.
f = pd.read_csv(sys.argv[1], sep='\t')
f_df = pd.DataFrame(f)
start = f_df.iloc[:, 1]
end = f_df.iloc[:, 2]

# Interval midpoint, expressed relative to the heatmap start coordinate.
mid = np.subtract(np.divide(np.add(end, start), 2), 15000000)
#!/usr/bin/env python2.7
"""
Build HiFive fend, data and project files from a raw alignment file.

Example:
    $ python hifive_processing.py alignments.raw name
"""
import hifive
import sys

# Fail with a usage message instead of a bare IndexError when arguments are
# missing.
if len(sys.argv) < 3:
    sys.exit("usage: python hifive_processing.py <alignments> <name>")

rawAlign, name = sys.argv[1], sys.argv[2]  # Name will be the prefix of output files

# Output file names, all derived from the prefix.
fend_fname = '%s_fend.hdf5' % (name)
data_fname = '%s_data.hdf5' % (name)
hic_fname = '%s_hic.hdf5' % (name)

## Load in the restriction enzyme digested fend coordinates
fend = hifive.Fend(fend_fname, mode='w')
fend.load_fends('../ce10nm2.bed', genome_name='ce10', re_name='DpnII',
                format='bed')
fend.save()

## Load in the read data
data = hifive.HiCData(data_fname, mode='w')
data.load_data_from_bam(fend_fname, rawAlign, maxinsert=500)
data.save()

## Create a HiC object
hic = hifive.HiC(hic_fname, 'w')
hic.load_data(data_fname)
hic.save()
def run_hifive(self, parameters):
    """
    Build the HiFive project files and, for the 'Yaffe-Tanay' model, learn
    the binning fend corrections.

    Each stage is skipped when its output file already exists in the current
    working directory ('HiC_project_object.hdf5' for the project stage,
    'HiC_norm_binning.hdf5' for the normalization stage).

    Arguments:
        parameters (dict): must contain 'fend_file', 'bam_file_1',
            'bam_file_2', 'model', 'add_gc', 'add_mappability' and
            'restriction_enzyme' (a string such as '[A,B]' or 'A').
    Return:
        None. Results are written to .hdf5 files in the working directory.
    """
    fend_file = parameters['fend_file']
    bam_file_1 = parameters['bam_file_1']
    bam_file_2 = parameters['bam_file_2']
    model = parameters['model']
    add_gc = bool(parameters['add_gc'])
    add_mappability = bool(parameters['add_mappability'])

    # Drop the surrounding brackets and re-join the names with commas.
    # A list comprehension is used instead of map() so the result is a real
    # list on Python 3 as well; joining a single name returns it unchanged,
    # so no special case is needed.
    restriction_enzymes = [
        str(item)
        for item in parameters['restriction_enzyme'].strip('[]').split(',')
    ]
    restriction_enzyme = ','.join(restriction_enzymes)

    # Run for both models
    if not os.path.isfile('HiC_project_object.hdf5'):
        fend = hifive.Fend('fend_object.hdf5', mode='w')
        fend.load_fends(fend_file, re_name=restriction_enzyme, format='bed')
        fend.save()

        # Creating a HiCData object
        data = hifive.HiCData('HiC_data_object.hdf5', mode='w')
        data.load_data_from_bam('fend_object.hdf5',
                                [bam_file_1, bam_file_2],
                                maxinsert=500,
                                skip_duplicate_filtering=False)
        data.save()

        # Creating a HiC Project object
        hic = hifive.HiC('HiC_project_object.hdf5', 'w')
        hic.load_data('HiC_data_object.hdf5')
        hic.save()

    if model == 'Yaffe-Tanay' and not os.path.isfile('HiC_norm_binning.hdf5'):
        # Filtering HiC fends
        hic = hifive.HiC('HiC_project_object.hdf5')
        hic.filter_fends(mininteractions=1, mindistance=0, maxdistance=0)

        # Finding HiC distance function
        hic.find_distance_parameters(numbins=90, minsize=200, maxsize=0)
        hic.save('HiC_project_object_with_distance_parameters.hdf5')

        # Learning correction parameters using the binning algorithm
        my_model = ['len', 'distance']
        if add_gc:
            my_model.append('gc')
        if add_mappability:
            my_model.append('mappability')
        # One bin count and one partitioning scheme per model term.
        my_num_bins = [20] * len(my_model)
        my_parameters = ['even'] * len(my_model)

        hic.find_binning_fend_corrections(max_iterations=1000,
                                          mindistance=500000,
                                          maxdistance=0,
                                          num_bins=my_num_bins,
                                          model=my_model,
                                          parameters=my_parameters,
                                          usereads='cis',
                                          learning_threshold=1.0)
        hic.save('HiC_norm_binning.hdf5')
#!/usr/bin/env python2
import hifive
import matplotlib.pyplot as plt
import numpy as np

# read in data
hic = hifive.HiC('./project.fend', 'r')
# NOTE(review): 1000000 is passed positionally; confirm against the hifive
# `cis_heatmap` signature that it binds to the intended parameter (bin size
# rather than a start coordinate).
data = hic.cis_heatmap('chr13', 1000000, datatype='fend', arraytype='full',
                       diagonalincluded=True)

# Calculate corrected enrichment (observed / expected). Only bins where both
# channels are positive are used, so the log is always finite; the original
# selected bins where either channel was positive, which let log(0) = -inf
# into the matrix.
observed = data[:, :, 0]
expected = data[:, :, 1]
valid = (observed > 0) & (expected > 0)
enrichment = observed[valid] / expected[valid]

# create 2D array to input data to for plotting; sized from the heatmap
# itself instead of a hard-coded 1193 x 1193
enrich_matrix = np.zeros(data.shape[:2])
enrich_matrix[valid] = np.log(enrichment)

# plot heatmap
fig, ax = plt.subplots()
im = ax.imshow(enrich_matrix, cmap="Reds")
cbar = ax.figure.colorbar(im, ax=ax)
cbar.set_label("Log corrected enrichment scores")
ax.set_xlabel("Chromosome 13 1Mb bin", fontsize=16)
ax.set_ylabel("Chromosome 13 1Mb bin", fontsize=16)
ax.set_title("C13 log of corrected enrichment scores", fontsize=16)
plt.savefig('Chrom13_heatmap.png')
#!/usr/bin/env python2 ''' Predict gene activity based on interactions with enhancers/TSSs Usage: activity_by_contact_expression.py <bed_file1_activity> <bed_file2_RNA> ''' import hifive import numpy import sys hic = hifive.HiC('class13_project', 'r') data = hic.cis_heatmap(chrom='chr10', start=5000000, stop=40000000, binsize=5000, datatype='fend', arraytype='full') where = numpy.where(data[:, :, 1] > 0) data[where[0], where[1], 0] /= data[where[0], where[1], 1] data = numpy.log(data[:, :, 0] + 0.1) data -= numpy.amin(data) activity_dictionary = {} RNA_dictionary = {} my_file = sys.argv[1] for line in open(my_file): if line.startswith('track'): continue fields = line.rstrip('\n').split() if int(fields[1]) > 5000000 and int(fields[2]) < 50000000: # print(fields[2])
rna_expression[index1] = float(fields[-2]) # v1[index1] = f activity_index = [] activity_value= {} for i, line in enumerate(f2): if i == 0: continue fields = line.rstrip('\n').split('\t') if int(fields[1]) >= 5000000 and int(fields[1]) <= 40000000: index2 = (int(fields[1]) - 5000000) / 5000 activity_index.append(index2) activity_value[index2] = float(fields[-2]) #v2[index2] = import hifive import numpy hic = hifive.HiC('project_file', 'r') data = hic.cis_heatmap(chrom='chr10', start=5000000, stop=40000000, binsize=5000, datatype='fend', arraytype='full') where = numpy.where(data[:, :, 1] > 0) data[where[0], where[1], 0] /= data[where[0], where[1], 1] data = numpy.log(data[:, :, 0] + 0.1) data -= numpy.amin(data) #print(data) interaction_activity = {} for index1 in rna_index: int_act = 0 for index2 in activity_index: int_act += float(activity_value[index2])* data[index1][index2] interaction_activity[index1] = int_act # data_subset = data[np.where(v2 > 0), :] # sum_data_subset = np.sum(data_subset, axis=1) # R = np.corrcoef(sum_data_subset, v2)[0, 1]
# Midpoints of the intervals in f1 that lie on chr17 inside the
# 15,000,000-17,500,000 window; the first line of f1 is treated as a header.
midpoint = []
for i, line in enumerate(f1):
    if i == 0:
        continue
    fields = line.rstrip("\r\n").split("\t")
    chrom, start, end = fields[0], int(fields[1]), int(fields[2])
    if chrom != "chr17" or start < 15000000 or end > 17500000:
        continue
    mid = ((end - start) / 2) + start
    midpoint.append(mid)

# Fend-corrected heatmap over the same window at 10 kb bins.
hic = hifive.HiC('hic_ex.hcp')
data = hic.cis_heatmap(
    chrom='chr17',
    start=15000000,
    stop=17500000,
    binsize=10000,
    datatype='fend',
    arraytype='full',
)

# Rescale the expected channel (index 1) so its total matches the observed
# channel (index 0).
data[:, :, 1] *= numpy.sum(data[:, :, 0]) / numpy.sum(data[:, :, 1])

# `where` holds the matrix positions with a positive expected value; only
# those bins are divided, then the observed/expected channel is kept.
where = numpy.where(data[:, :, 1] > 0)
rows, cols = where[0], where[1]
data[rows, cols, 0] /= data[rows, cols, 1]
data = data[:, :, 0]