def process_TAD(hic_data, perc_zero, reso, cpus, outdir, bins):
    """
    Filter and normalize a Hi-C matrix, store the intermediate results and
    search for TAD borders chromosome by chromosome.

    Files written (all names are built as ``outdir + <name>``):
      - ``bad_rows_<reso>_<perc_zero>.tsv``: filtered columns (via compress)
      - ``bias_<reso>.tsv``: one bias value per bin (via compress)
      - ``cis_trans_ratio_<reso>.tsv``: four cis/trans ratios
      - the matrices written by ``write_matrices``
      - ``tads_<chromosome>_<reso>.tsv``: one TAD table per chromosome

    :param hic_data: Hi-C data object; must provide ``filter_columns``,
       ``normalize_hic``, ``cis_trans_ratio``, ``get_matrix`` and the
       attributes ``bads``, ``bias``, ``chromosomes`` and ``section_pos``.
       Its ``expected`` attribute is overwritten here.
    :param perc_zero: threshold passed to ``hic_data.filter_columns``; if that
       call raises ValueError the filtering is retried with 100
    :param reso: resolution; multiplies the bin coordinate in the output
       tables and is used (through ``nice``) in the file names
    :param cpus: number of CPUs passed to ``tadbit`` as ``n_cpus``
    :param outdir: prefix of every output path. It is concatenated as is, so
       it has to end with a path separator to designate a directory
    :param bins: dictionary whose values are bin indexes of the matrix. Keys
       are indexed with [0] and [1]; presumably (chromosome name, position in
       bins) -- TODO confirm against caller
    """
    # Get poor bins
    print('Get poor bins...')
    try:
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)
    except ValueError:
        # requested threshold was rejected: retry with perc_zero=100
        perc_zero = 100
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)
    # reverse lookup: bin index -> key of `bins`
    binsrev = {y: x for x, y in bins.iteritems()}
    bad_file = outdir + 'bad_rows_%s_%d.tsv' % (nice(reso), perc_zero)
    bads = [binsrev[i][0] + "\t" + str(binsrev[i][1] * reso) + "\t" + str(i)
            for i in hic_data.bads]
    compress(bads, bad_file)

    # Identify biases
    print('Get biases using ICE...')
    # factor=1: cells of the matrix have a mean of 1
    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0, factor=1)
    bias_file = outdir + 'bias_%s.tsv' % nice(reso)
    bias = [binsrev[i][0] + "\t" + str(binsrev[i][1] * reso) + "\t" +
            '%d\t%f' % (i, hic_data.bias[i])
            for i in hic_data.bias]
    compress(bias, bias_file)

    # percentage of cis interactions
    print('Getting percentage of cis interactions...')
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)
    cistrans_file = outdir + 'cis_trans_ratio_%s.tsv' % nice(reso)
    # `with` guarantees the file is closed even if a write fails
    with open(cistrans_file, "w") as out_cistrans:
        out_cistrans.write("Cis/trans_ratio\tnormalized\twith_diagonal\t" +
                           str(cis_trans_N_D) + "\n")
        out_cistrans.write("Cis/trans_ratio\tnormalized\twithout_diagonal\t" +
                           str(cis_trans_N_d) + "\n")
        out_cistrans.write("Cis/trans_ratio\traw\twith_diagonal\t" +
                           str(cis_trans_n_D) + "\n")
        out_cistrans.write("Cis/trans_ratio\traw\twithout_diagonal\t" +
                           str(cis_trans_n_d) + "\n")

    # Compute expected
    print('Get expected counts ...')
    hic_data.expected = expected(hic_data, bads=hic_data.bads)

    # store matrices
    print('Store matrices')
    write_matrices(hic_data, outdir, reso)

    # getting TAD borders
    print('Searching TADs')
    row_format = '%s\t%s\t%s\t%s\t%s\n'
    for crm in hic_data.chromosomes:
        print(' - %s' % crm)
        matrix = hic_data.get_matrix(focus=crm)
        beg, end = hic_data.section_pos[crm]
        size = len(matrix)
        if size < 10:
            print(" Chromosome too short (%d bins), skipping..." % size)
            continue
        # transform bad column in chromosome referential
        remove = tuple([1 if i in hic_data.bads else 0
                        for i in xrange(beg, end)])
        # maximum size of a TAD: the whole chromosome
        result = tadbit([matrix], remove=remove, n_cpus=cpus, verbose=False,
                        max_tad_size=size, no_heuristic=0)
        tads = load_tad_height(result, size, beg, end, hic_data)
        # header gets the same five tab-separated fields as the data rows
        # (it used to be written as 'score' and 'density' glued together)
        rows = [row_format % ('#', 'start', 'end', 'score', 'density')]
        for tad in tads:
            rows.append(row_format % (
                tad,
                int(tads[tad]['start'] + 1),  # +1: presumably 1-based output
                int(tads[tad]['end'] + 1),
                abs(tads[tad]['score']),
                round(float(tads[tad]['height']), 3)))
        out_tad = outdir + 'tads_%s_%s.tsv' % (crm, nice(reso))
        with open(out_tad, 'w') as out:
            out.write(''.join(rows))
def find_compartments(self, crms=None, savefig=None, savedata=None,
                      show=False, **kwargs):
    """
    Search for A/B compartments in each chromosome of the Hi-C matrix.

    Hi-C matrix is normalized by the number of interactions expected at a
    given distance, and by visibility (``normalize_hic(iterations=0)``). A
    correlation matrix is then calculated from this normalized matrix, and
    its first eigenvector is used to identify compartments. Changes in sign
    mark the boundaries between compartments.

    Result is stored in ``self.compartments`` as a dictionary of compartment
    boundaries, keys being chromosome names. Nothing is returned.

    :param 99 perc_zero: to filter bad columns (only used when ``self.bads``
       is empty)
    :param 0.05 signal_to_noise: to calculate expected interaction counts,
       if not enough reads are observed at a given distance the observations
       of the distance+1 are summed. A signal to noise ratio of < 0.05
       corresponds to > 400 reads. Like every other keyword argument it is
       forwarded to ``expected`` when ``self.expected`` is empty.
    :param None crms: only runs these given list of chromosomes
    :param None savefig: path to a directory to store matrices with
       compartment predictions, two files per chromosome, stored under
       'chr<chromosome-name>.pdf' and 'chr<chromosome-name>_summ.pdf'.
    :param False show: show the plot
    :param None savedata: path to a new file to store compartment
       predictions, one file only.
    :param -1 vmin: for the color scale of the plotted map
    :param 1 vmax: for the color scale of the plotted map. If both vmin and
       vmax are 'auto' a symmetric scale is computed from the matrix.
    :param verbose: print progress messages (defaults to True for the
       filtering/normalization steps, to False for the per-chromosome
       message)

    TODO: this is really slow...

    Notes: building the distance matrix using the amount of interactions
           instead of the mean correlation, gives generally worse results.
    """
    if not self.bads:
        perc_zero = kwargs.get('perc_zero', 99)
        if kwargs.get('verbose', True):
            # report the threshold actually used, not a hard-coded 99
            print('Filtering bad columns %d' % perc_zero)
        self.filter_columns(perc_zero=perc_zero, by_mean=False, silent=True)
    if not self.expected:
        if kwargs.get('verbose', True):
            print('Normalizing by expected values')
        self.expected = expected(self, bads=self.bads, **kwargs)
    if not self.bias:
        if kwargs.get('verbose', True):
            print('Normalizing by ICE (1 round)')
        self.normalize_hic(iterations=0)
    if savefig:
        mkdir(savefig)

    nan = float('nan')
    cmprts = {}
    for sec in self.section_pos:
        if crms and sec not in crms:
            continue
        if kwargs.get('verbose', False):
            print('Processing chromosome %s' % sec)
            warn('Processing chromosome %s' % (sec))
        # observed / expected / biases, restricted to the good columns
        matrix = [[(float(self[i, j]) / self.expected[abs(j - i)]
                    / self.bias[i] / self.bias[j])
                   for i in xrange(*self.section_pos[sec])
                   if not i in self.bads]
                  for j in xrange(*self.section_pos[sec])
                  if not j in self.bads]
        if not matrix:  # MT chromosome will fall there
            warn('Chromosome %s is probably MT :)' % (sec))
            cmprts[sec] = []
            continue
        # force symmetry: copy the lower triangle onto the upper one
        for i in xrange(len(matrix)):
            for j in xrange(i + 1, len(matrix)):
                matrix[i][j] = matrix[j][i]
        matrix = [list(m) for m in corrcoef(matrix)]
        try:
            # This eigsh is very very fast, only ask for one eigenvector
            _, evect = eigsh(array(matrix), k=1)
        except LinAlgError:
            warn('Chromosome %s too small to compute PC1' % (sec))
            cmprts[sec] = []  # Y chromosome, or so...
            continue
        first = list(evect[:, -1])
        beg, end = self.section_pos[sec]
        # Offsets of the bad columns inside this chromosome. `end` is
        # exclusive (it is used as an xrange bound above), and the offsets
        # must be ascending for the positional inserts below to put every
        # placeholder at its real index.
        bads = sorted(k - beg for k in self.bads if beg <= k < end)
        # put bad columns back: 0 in the eigenvector, NaN in the matrix
        for b in bads:
            first.insert(b, 0)
        n_good = len(matrix[0])
        for b in bads:
            matrix.insert(b, [nan] * n_good)
        for b in bads:
            for row in matrix:
                row.insert(b, nan)
        # a compartment border is called at each sign change of the
        # eigenvector
        breaks = [0] + [i for i, (a, b) in
                        enumerate(zip(first[1:], first[:-1]))
                        if a * b < 0] + [len(first)]
        breaks = [{'start': b, 'end': breaks[i + 1]}
                  for i, b in enumerate(breaks[:-1])]
        cmprts[sec] = breaks

        # calculate compartment internal density
        for cmprt in breaks:
            beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
            sec_matrix = [(self[i, j] / self.expected[abs(j - i)]
                           / self.bias[i] / self.bias[j])
                          for i in xrange(beg1, end1) if not i in self.bads
                          for j in xrange(i, end1) if not j in self.bads]
            try:
                cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
            except ZeroDivisionError:
                # compartment made only of bad columns
                cmprt['dens'] = 0.
        # scale densities by their mean over the chromosome
        try:
            meanh = (sum([cmprt['dens'] for cmprt in breaks])
                     / len(breaks))
        except ZeroDivisionError:
            meanh = 1.
        for cmprt in breaks:
            try:
                cmprt['dens'] /= meanh
            except ZeroDivisionError:
                cmprt['dens'] = 1.
        # scan gamma from 0 to 1 and keep the value giving the lowest first
        # item of the _find_ab_compartments result
        gammas = {}
        for gamma in range(101):
            gammas[gamma] = _find_ab_compartments(float(gamma) / 100, matrix,
                                                  breaks, cmprts[sec],
                                                  save=False)
        gamma = min(gammas.keys(), key=lambda k: gammas[k][0])
        _ = _find_ab_compartments(float(gamma) / 100, matrix, breaks,
                                  cmprts[sec], save=True)
        if savefig or show:
            vmin = kwargs.get('vmin', -1)
            vmax = kwargs.get('vmax', 1)
            if vmin == 'auto' == vmax:
                # NOTE(review): matrix holds NaN wherever a bad column was
                # re-inserted, so these extremes are NaN in that case
                vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                vmin = -vmax
            # With show=True and no savefig there is no directory to build a
            # path from (concatenating None used to raise TypeError).
            # NOTE(review): assumes the plotting helpers treat a None path as
            # "do not save" -- confirm
            if savefig:
                fig_path = savefig + '/chr' + sec + '.pdf'
                summ_path = savefig + '/chr' + sec + '_summ.pdf'
            else:
                fig_path = summ_path = None
            plot_compartments(sec, first, cmprts, matrix, show, fig_path,
                              vmin=vmin, vmax=vmax)
            plot_compartments_summary(sec, cmprts, show, summ_path)

    self.compartments = cmprts
    if savedata:
        self.write_compartments(savedata)
def process_AB(hic_data, perc_zero, reso, outdir, bins):
    """
    Filter and normalize a Hi-C matrix, store the intermediate results and
    search for A/B compartments.

    Files written (all names are built as ``outdir + <name>``):
      - ``bad_rows_<reso>_<perc_zero>.tsv``: filtered columns (via compress)
      - ``bias_<reso>.tsv``: one bias value per bin (via compress)
      - ``cis_trans_ratio_<reso>.tsv``: four cis/trans ratios
      - the matrices written by ``write_matrices``
      - ``ev_<reso>.tsv``: values returned by ``find_compartments``
        (via compress)
      - ``compartments_<reso>.tsv``: written by
        ``hic_data.write_compartments``

    :param hic_data: Hi-C data object; must provide ``filter_columns``,
       ``normalize_hic``, ``cis_trans_ratio``, ``find_compartments``,
       ``write_compartments`` and the attributes ``bads`` and ``bias``.
       Its ``expected`` attribute is overwritten here.
    :param perc_zero: threshold passed to ``hic_data.filter_columns``; if that
       call raises ValueError the filtering is retried with 100
    :param reso: resolution; multiplies the bin coordinate in the output
       tables and is used (through ``nice``) in the file names
    :param outdir: prefix of every output path. It is concatenated as is, so
       it has to end with a path separator to designate a directory
    :param bins: dictionary whose values are bin indexes of the matrix. Keys
       are indexed with [0] and [1]; presumably (chromosome name, position in
       bins) -- TODO confirm against caller
    """
    # Get poor bins
    print('Get poor bins...')
    try:
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)
    except ValueError:
        # requested threshold was rejected: retry with perc_zero=100
        perc_zero = 100
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)
    # reverse lookup: bin index -> key of `bins`
    binsrev = {y: x for x, y in bins.iteritems()}
    bad_file = outdir + 'bad_rows_%s_%d.tsv' % (nice(reso), perc_zero)
    bads = [binsrev[i][0] + "\t" + str(binsrev[i][1] * reso) + "\t" + str(i)
            for i in hic_data.bads]
    compress(bads, bad_file)

    # Identify biases
    print('Get biases using ICE...')
    # factor=1: cells of the matrix have a mean of 1
    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0, factor=1)
    bias_file = outdir + 'bias_%s.tsv' % nice(reso)
    bias = [binsrev[i][0] + "\t" + str(binsrev[i][1] * reso) + "\t" +
            '%d\t%f' % (i, hic_data.bias[i])
            for i in hic_data.bias]
    compress(bias, bias_file)

    # percentage of cis interactions
    print('Getting percentage of cis interactions...')
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)
    cistrans_file = outdir + 'cis_trans_ratio_%s.tsv' % nice(reso)
    # `with` guarantees the file is closed even if a write fails
    with open(cistrans_file, "w") as out_cistrans:
        out_cistrans.write("Cis/trans_ratio\tnormalized\twith_diagonal\t" +
                           str(cis_trans_N_D) + "\n")
        out_cistrans.write("Cis/trans_ratio\tnormalized\twithout_diagonal\t" +
                           str(cis_trans_N_d) + "\n")
        out_cistrans.write("Cis/trans_ratio\traw\twith_diagonal\t" +
                           str(cis_trans_n_D) + "\n")
        out_cistrans.write("Cis/trans_ratio\traw\twithout_diagonal\t" +
                           str(cis_trans_n_d) + "\n")

    # Compute expected
    print('Get expected counts ...')
    hic_data.expected = expected(hic_data, bads=hic_data.bads)

    # store matrices
    print('Store matrices')
    write_matrices(hic_data, outdir, reso)

    # getting compartments
    print('Searching compartments')
    # NOTE(review): the result is used as a dictionary, keyed by chromosome,
    # of two indexable per-bin sequences. The find_compartments definition
    # visible in this file has no return statement -- confirm that the
    # hic_data class in use returns such a dictionary.
    ev = hic_data.find_compartments()
    ev_file = outdir + 'ev_%s.tsv' % nice(reso)
    out = []
    for ch in sorted(ev.keys()):
        first_values, second_values = ev[ch][0], ev[ch][1]
        # NOTE(review): the `- 1` leaves the last bin of each chromosome out
        # of the file; kept as is -- confirm it is intended
        for i in xrange(len(first_values) - 1):
            out.append("\t".join((ch, str(i * reso),
                                  str(first_values[i]),
                                  str(second_values[i]))))
    compress(out, ev_file)
    cmprt_file = outdir + 'compartments_%s.tsv' % nice(reso)
    hic_data.write_compartments(cmprt_file)
def find_compartments(self, crms=None, savefig=None, savedata=None,
                      show=False, **kwargs):
    """
    Search for A/B compartments in each chromosome of the Hi-C matrix.

    Hi-C matrix is normalized by the number of interactions expected at a
    given distance, and by visibility (``normalize_hic(iterations=0)``). A
    correlation matrix is then calculated from this normalized matrix, and
    its first eigenvector is used to identify compartments. Changes in sign
    mark the boundaries between compartments.

    Result is stored in ``self.compartments`` as a dictionary of compartment
    boundaries, keys being chromosome names. Nothing is returned.

    :param 99 perc_zero: to filter bad columns (only used when ``self.bads``
       is empty)
    :param 0.05 signal_to_noise: to calculate expected interaction counts,
       if not enough reads are observed at a given distance the observations
       of the distance+1 are summed. A signal to noise ratio of < 0.05
       corresponds to > 400 reads. Like every other keyword argument it is
       forwarded to ``expected`` when ``self.expected`` is empty.
    :param None crms: only runs these given list of chromosomes
    :param None savefig: path to a directory to store matrices with
       compartment predictions, two files per chromosome, stored under
       'chr<chromosome-name>.pdf' and 'chr<chromosome-name>_summ.pdf'.
    :param False show: show the plot
    :param None savedata: path to a new file to store compartment
       predictions, one file only.
    :param -1 vmin: for the color scale of the plotted map
    :param 1 vmax: for the color scale of the plotted map. If both vmin and
       vmax are 'auto' a symmetric scale is computed from the matrix.
    :param verbose: print progress messages (defaults to True for the
       filtering/normalization steps, to False for the per-chromosome
       message)

    TODO: this is really slow...

    Notes: building the distance matrix using the amount of interactions
           instead of the mean correlation, gives generally worse results.
    """
    if not self.bads:
        perc_zero = kwargs.get('perc_zero', 99)
        if kwargs.get('verbose', True):
            # report the threshold actually used, not a hard-coded 99
            print('Filtering bad columns %d' % perc_zero)
        self.filter_columns(perc_zero=perc_zero, by_mean=False, silent=True)
    if not self.expected:
        if kwargs.get('verbose', True):
            print('Normalizing by expected values')
        self.expected = expected(self, bads=self.bads, **kwargs)
    if not self.bias:
        if kwargs.get('verbose', True):
            print('Normalizing by ICE (1 round)')
        self.normalize_hic(iterations=0)
    if savefig:
        mkdir(savefig)

    nan = float('nan')
    cmprts = {}
    for sec in self.section_pos:
        if crms and sec not in crms:
            continue
        if kwargs.get('verbose', False):
            print('Processing chromosome %s' % sec)
            warn('Processing chromosome %s' % (sec))
        # observed / expected / biases, restricted to the good columns
        matrix = [[(float(self[i, j]) / self.expected[abs(j - i)]
                    / self.bias[i] / self.bias[j])
                   for i in xrange(*self.section_pos[sec])
                   if not i in self.bads]
                  for j in xrange(*self.section_pos[sec])
                  if not j in self.bads]
        if not matrix:  # MT chromosome will fall there
            warn('Chromosome %s is probably MT :)' % (sec))
            cmprts[sec] = []
            continue
        # force symmetry: copy the lower triangle onto the upper one
        for i in xrange(len(matrix)):
            for j in xrange(i + 1, len(matrix)):
                matrix[i][j] = matrix[j][i]
        matrix = [list(m) for m in corrcoef(matrix)]
        try:
            # This eigsh is very very fast, only ask for one eigenvector
            _, evect = eigsh(array(matrix), k=1)
        except LinAlgError:
            warn('Chromosome %s too small to compute PC1' % (sec))
            cmprts[sec] = []  # Y chromosome, or so...
            continue
        first = list(evect[:, -1])
        beg, end = self.section_pos[sec]
        # Offsets of the bad columns inside this chromosome. `end` is
        # exclusive (it is used as an xrange bound above), and the offsets
        # must be ascending for the positional inserts below to put every
        # placeholder at its real index.
        bads = sorted(k - beg for k in self.bads if beg <= k < end)
        # put bad columns back: 0 in the eigenvector, NaN in the matrix
        for b in bads:
            first.insert(b, 0)
        n_good = len(matrix[0])
        for b in bads:
            matrix.insert(b, [nan] * n_good)
        for b in bads:
            for row in matrix:
                row.insert(b, nan)
        # a compartment border is called at each sign change of the
        # eigenvector
        breaks = [0] + [i for i, (a, b) in
                        enumerate(zip(first[1:], first[:-1]))
                        if a * b < 0] + [len(first)]
        breaks = [{'start': b, 'end': breaks[i + 1]}
                  for i, b in enumerate(breaks[:-1])]
        cmprts[sec] = breaks

        # calculate compartment internal density
        for cmprt in breaks:
            beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
            sec_matrix = [(self[i, j] / self.expected[abs(j - i)]
                           / self.bias[i] / self.bias[j])
                          for i in xrange(beg1, end1) if not i in self.bads
                          for j in xrange(i, end1) if not j in self.bads]
            try:
                cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
            except ZeroDivisionError:
                # compartment made only of bad columns
                cmprt['dens'] = 0.
        # scale densities by their mean over the chromosome
        try:
            meanh = (sum([cmprt['dens'] for cmprt in breaks])
                     / len(breaks))
        except ZeroDivisionError:
            meanh = 1.
        for cmprt in breaks:
            try:
                cmprt['dens'] /= meanh
            except ZeroDivisionError:
                cmprt['dens'] = 1.
        # scan gamma from 0 to 1 and keep the value giving the lowest first
        # item of the _find_ab_compartments result
        gammas = {}
        for gamma in range(101):
            gammas[gamma] = _find_ab_compartments(float(gamma) / 100, matrix,
                                                  breaks, cmprts[sec],
                                                  save=False)
        gamma = min(gammas.keys(), key=lambda k: gammas[k][0])
        _ = _find_ab_compartments(float(gamma) / 100, matrix, breaks,
                                  cmprts[sec], save=True)
        if savefig or show:
            vmin = kwargs.get('vmin', -1)
            vmax = kwargs.get('vmax', 1)
            if vmin == 'auto' == vmax:
                # NOTE(review): matrix holds NaN wherever a bad column was
                # re-inserted, so these extremes are NaN in that case
                vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                vmin = -vmax
            # With show=True and no savefig there is no directory to build a
            # path from (concatenating None used to raise TypeError).
            # NOTE(review): assumes the plotting helpers treat a None path as
            # "do not save" -- confirm
            if savefig:
                fig_path = savefig + '/chr' + sec + '.pdf'
                summ_path = savefig + '/chr' + sec + '_summ.pdf'
            else:
                fig_path = summ_path = None
            plot_compartments(sec, first, cmprts, matrix, show, fig_path,
                              vmin=vmin, vmax=vmax)
            plot_compartments_summary(sec, cmprts, show, summ_path)

    self.compartments = cmprts
    if savedata:
        self.write_compartments(savedata)