def load_tad_height(tad_def, size, beg, end, hic_data):
    """
    Compute the relative interaction density ("height") of each TAD.

    The height of a TAD is the sum of the bias-corrected interactions found
    in its upper triangle, divided by the sum expected from the average
    interaction count observed at each genomic distance (diagonal).

    :param tad_def: TAD definition, passed as is to parse_tads
    :param size: diagonals 1 to size - 1 are averaged
    :param beg: first bin (in hic_data coordinates) of the region; TAD
       coordinates are shifted by this value
    :param end: bin following the last bin of the region
    :param hic_data: object indexable as hic_data[i, j], with a `bias`
       mapping (bin -> bias) and a `bads` container of bins to skip

    :returns: the TADs returned by parse_tads, each one with an extra
       'height' key
    """
    bias, zeros = hic_data.bias, hic_data.bads
    tads, _ = parse_tads(tad_def)
    # average normalized interaction count per diagonal (distance k)
    diags = []
    for k in range(1, size):
        try:
            values = [hic_data[i, i + k] / bias[i + k] / bias[i]
                      for i in range(beg, end - k)
                      if i not in zeros and i + k not in zeros]
            diags.append(sum(values) / float(len(values)))
        except ZeroDivisionError:
            # no usable cell in this diagonal (or a null bias)
            diags.append(0.)
    for tad in tads:
        start, final = (int(tads[tad]['start']) + 1,
                        int(tads[tad]['end']) + 1)
        # observed: upper triangle of the TAD, skipping filtered bins
        matrix = sum(hic_data[i, j] / bias[j] / bias[i]
                     for i in range(beg + start - 1, beg + final - 1)
                     if i not in zeros
                     for j in range(i + 1, beg + final - 1)
                     if j not in zeros)
        try:
            # expected: diagonal i contributes (final - start - i) cells
            height = float(matrix) / sum(
                diags[i - 1] * (final - start - i)
                for i in range(1, final - start))
        except ZeroDivisionError:
            height = 0.
        tads[tad]['height'] = height
    return tads
def load_tad_height(tad_def, size, beg, end, hic_data):
    """
    Calculate TAD densities.

    For each TAD, the bias-corrected interactions of its upper triangle are
    summed and divided by the amount expected from the mean interaction count
    of each diagonal; the ratio is stored under the 'height' key.

    :param tad_def: TAD definition, passed as is to parse_tads
    :param size: diagonals 1 to size - 1 are averaged
    :param beg: first bin (in hic_data coordinates) of the region; TAD
       coordinates are shifted by this value
    :param end: bin following the last bin of the region
    :param hic_data: object indexable as hic_data[i, j], with a `bias`
       mapping (bin -> bias) and a `bads` container of bins to skip

    :returns: the TADs returned by parse_tads, each one with an extra
       'height' key
    """
    bias, zeros = hic_data.bias, hic_data.bads
    tads, _ = parse_tads(tad_def)

    # mean normalized interaction count at each genomic distance
    diags = []
    for dist in range(1, size):
        try:
            cells = [hic_data[i, i + dist] / bias[i + dist] / bias[i]
                     for i in range(beg, end - dist)
                     if i not in zeros and i + dist not in zeros]
            diags.append(sum(cells) / float(len(cells)))
        except ZeroDivisionError:
            # diagonal without any usable cell (or with a null bias)
            diags.append(0.)

    for tad in tads:
        start = int(tads[tad]['start']) + 1
        final = int(tads[tad]['end']) + 1
        first_bin, last_bin = beg + start - 1, beg + final - 1
        observed = sum(hic_data[i, j] / bias[j] / bias[i]
                       for i in range(first_bin, last_bin)
                       if i not in zeros
                       for j in range(i + 1, last_bin)
                       if j not in zeros)
        try:
            # a TAD of length (final - start) holds (final - start - i) cells
            # at distance i
            expected = sum(diags[i - 1] * (final - start - i)
                           for i in range(1, final - start))
            height = float(observed) / expected
        except ZeroDivisionError:
            height = 0.
        tads[tad]['height'] = height
    return tads
def load_tad_height(tad_def, size, beg, end, hic_data):
    """
    Compute the relative interaction density ("height") of each TAD.

    Interactions are divided by the product of the biases of both bins when
    hic_data carries biases; otherwise raw counts are used.

    :param tad_def: TAD definition, passed as is to parse_tads
    :param size: diagonals 1 to size - 1 are averaged
    :param beg: first bin (in hic_data coordinates) of the region; TAD
       coordinates are shifted by this value
    :param end: bin following the last bin of the region
    :param hic_data: object indexable as hic_data[i, j], with a `bias`
       mapping (bin -> bias, may be empty/None) and a `bads` container of
       bins to skip

    :returns: the TADs returned by parse_tads, each one with an extra
       'height' key
    """
    bias, zeros = hic_data.bias, hic_data.bads
    if bias:
        def norm(i, j):
            return bias[i] * bias[j]
    else:
        # Non-normalized height, keep in mind!
        def norm(i, j):
            return 1

    tads, _ = parse_tads(tad_def)
    # average interaction count per diagonal (distance k)
    diags = []
    for k in range(1, size):
        try:
            values = [hic_data[i, i + k] / norm(i + k, i)
                      for i in range(beg, end - k)
                      if i not in zeros and i + k not in zeros]
            diags.append(sum(values) / float(len(values)))
        except ZeroDivisionError:
            # no usable cell in this diagonal (or a null bias)
            diags.append(0.)
    for tad in tads:
        start, final = (int(tads[tad]['start']) + 1,
                        int(tads[tad]['end']) + 1)
        # observed: upper triangle of the TAD, skipping filtered bins
        matrix = sum(hic_data[i, j] / norm(j, i)
                     for i in range(beg + start - 1, beg + final - 1)
                     if i not in zeros
                     for j in range(i + 1, beg + final - 1)
                     if j not in zeros)
        try:
            # expected: diagonal i contributes (final - start - i) cells
            height = float(matrix) / sum(
                diags[i - 1] * (final - start - i)
                for i in range(1, final - start))
        except ZeroDivisionError:
            height = 0.
        tads[tad]['height'] = height
    return tads
def load_tads_fromdb(opts):
    """
    Load a TAD definition from a previous job of the trace database or,
    when opts.tad_def is not a job number, directly through parse_tads.

    :param opts: options object; the attributes read are `tad_def` (job
       number or anything parse_tads accepts), `tmpdb` (database file used
       instead of <workdir>/trace.db when set), `workdir` and `reso`

    :returns: the TADs as returned by parse_tads, with 'start', 'end' and
       'brk' rescaled to opts.reso when the job used another resolution;
       None if the job is not found in the database

    :raises IOError: if the job is found but its TAD file is missing
    """
    try:
        tad_job_id = int(opts.tad_def)
    except (TypeError, ValueError):
        # not a job number (int() raises ValueError on a path, TypeError on
        # None): let parse_tads deal with it
        tads, _ = parse_tads(opts.tad_def)
        return tads

    if opts.tmpdb:
        dbfile = opts.tmpdb
    else:
        dbfile = path.join(opts.workdir, 'trace.db')

    tads = None
    con = lite.connect(dbfile)
    try:
        with con:
            cur = con.cursor()
            try:
                # tad_job_id is guaranteed to be an int at this point
                cur.execute((' select distinct paths.path,'
                             'SEGMENT_OUTPUTs.resolution from paths'
                             ' inner join SEGMENT_OUTPUTs'
                             ' on SEGMENT_OUTPUTs.JOBid = paths.JOBid'
                             ' where SEGMENT_OUTPUTs.TADs is not null'
                             ' and paths.jobid = %s ') % (tad_job_id))
                tads_res = cur.fetchall()
                if not tads_res:
                    warn("""WARNING: tad definition job not found""")
                    return None
                tads_path = path.join(opts.workdir, tads_res[0][0])
                if not path.exists(tads_path):
                    raise IOError(
                        'ERROR: tad definition file_handling does not exist')
                tads_reso = int(tads_res[0][1])
                tads, _ = parse_tads(tads_path)
                if tads_reso != opts.reso:
                    # explicit float: no floor division under Python 2
                    ratio = float(tads_reso) / opts.reso
                    for tad in tads.values():
                        for key in ('start', 'end', 'brk'):
                            # an undefined (None) coordinate stays undefined
                            if tad[key] is not None:
                                tad[key] = int(ratio * tad[key])
            except IndexError:
                warn("""WARNING: tad definition job not found""")
    finally:
        # "with con" only handles the transaction, not the connection
        con.close()
    return tads
def save_new_genome(genome, trace, check=False, rootdir='./'):
    """
    Save the TAD borders of each chromosome, remapped or checked, into a
    folder (one '<chromosome name>.tsv' file written per experiment).

    :param genome: a dict containing all chromosomes computed
    :param trace: dictionary containing a trace of all mapped TAD boundaries
    :param False check: if no remapping have to be done (only check TAD
       borders); TADs whose end is not syntenic, or not found in the trace,
       are discarded
    :param './' rootdir: path where to write the files for remapped/checked
       chromosomes (created if needed).
    """
    for crm in genome:
        new_crm = genome[crm]
        for exp in new_crm.experiments:
            # reorder the TADs in increasing order of their end position
            end, _, score = zip(*sorted(zip(
                *[exp._tads[k] for k in ['end', 'start', 'score']])))
            exp._tads['start'] = [0.] + [v - 1 for v in end[:-1]]
            exp._tads['end'] = list(end)
            exp._tads['score'] = list(score)
            if check:
                # check TADs that have synteny
                tadcnt = 0
                new_tads = {}
                for tad in exp.tads:
                    tad_end = exp.tads[tad]['end']
                    try:
                        if trace[crm][tad_end]['syntenic at']['crm'] is None:
                            continue
                    except KeyError:
                        # the border itself may be absent from the trace:
                        # do not index it again blindly when reporting
                        try:
                            found = trace[crm][tad_end]
                        except KeyError:
                            found = None
                        print('Not found:', crm, tad_end, found)
                        continue
                    new_tads[tadcnt] = exp.tads[tad]
                    tadcnt += 1
                exp.tads = new_tads
            else:
                # create new genome on which are mapped the old coordinates
                tads, norm = parse_tads(exp._tads)
                last = max(tads.keys())
                if not exp.size:
                    exp.size = tads[last]['end']
                exp.norm = norm
                exp.tads = tads
            if not os.path.exists(rootdir):
                # makedirs: rootdir may be several levels deep
                os.makedirs(rootdir)
            exp.write_tad_borders(density=False,
                                  savedata=os.path.join(
                                      rootdir, new_crm.name + '.tsv'))
def load_tad_def(self, handler, weights=None):
    """
    Add Topologically Associated Domains definition detection to Slice

    :param handler: TAD definition, handed over as is to parse_tads
    :param None weights: Store information about the weights, corresponding
       to the normalization of the Hi-C data (see tadbit function
       documentation); when not given (or empty) the second value returned
       by parse_tads is stored instead
    """
    parsed_tads, parsed_wght = parse_tads(handler)
    self.tads = parsed_tads
    if weights:
        self.wght = weights
    else:
        self.wght = parsed_wght
def save_new_genome(genome, trace, check=False, rootdir='./'):
    """
    Save the TAD borders of each chromosome, remapped or checked, into a
    folder (one '<chromosome name>.tsv' file written per experiment).

    :param genome: a dict containing all chromosomes computed
    :param trace: dictionary containing a trace of all mapped TAD boundaries
    :param False check: if no remapping have to be done (only check TAD
       borders); TADs whose end is not syntenic, or not found in the trace,
       are discarded
    :param './' rootdir: path where to write the files for remapped/checked
       chromosomes (created if needed).
    """
    for crm in genome:
        new_crm = genome[crm]
        for exp in new_crm.experiments:
            # reorder the TADs in increasing order of their end position
            ordered = sorted(zip(exp._tads['end'],
                                 exp._tads['start'],
                                 exp._tads['score']))
            end, _, score = zip(*ordered)
            exp._tads['start'] = [0.] + [v - 1 for v in end[:-1]]
            exp._tads['end'] = list(end)
            exp._tads['score'] = list(score)
            if check:
                # check TADs that have synteny
                new_tads = {}
                for tad in exp.tads:
                    border = exp.tads[tad]['end']
                    try:
                        if trace[crm][border]['syntenic at']['crm'] is None:
                            continue
                    except KeyError:
                        # the border itself may be absent from the trace:
                        # do not index it again blindly when reporting
                        try:
                            found = trace[crm][border]
                        except KeyError:
                            found = None
                        print('Not found:', crm, border, found)
                        continue
                    # kept TADs are renumbered from 0
                    new_tads[len(new_tads)] = exp.tads[tad]
                exp.tads = new_tads
            else:
                # create new genome on which are mapped the old coordinates
                tads, norm = parse_tads(exp._tads)
                last = max(tads.keys())
                if not exp.size:
                    exp.size = tads[last]['end']
                exp.norm = norm
                exp.tads = tads
            if not os.path.exists(rootdir):
                # makedirs: rootdir may be several levels deep
                os.makedirs(rootdir)
            exp.write_tad_borders(density=False,
                                  savedata=os.path.join(
                                      rootdir, new_crm.name + '.tsv'))
def load_tad_def(self, tad_def, weights=None):
    """
    Add the Topologically Associated Domains definition detection to Slice

    :param tad_def: a file or a dict with precomputed TADs for this
       experiment (handed over as is to parse_tads)
    :param None weights: Store information about the weights, corresponding
       to the normalization of the Hi-C data (see tadbit function
       documentation); when not given (or empty) the second value returned
       by parse_tads is stored instead
    """
    parsed_tads, parsed_norm = parse_tads(tad_def)
    self.tads = parsed_tads
    if weights:
        self.norm = weights
    else:
        self.norm = parsed_norm
def save_new_genome(genome, trace, check=False, target_species=None,
                    rootdir='./'):
    """
    Save new chromosomes with remapped or check TAD borders into a new
    folder ('<rootdir>/<chromosome>/<chromosome>.tdb').

    :param genome: a dict containing all chromosomes computed
    :param trace: dictionary containing a trace of all mapped TAD boundaries
    :param False check: if no remapping have to be done (only check TAD
       borders); TADs whose end has no counterpart, or is not found in the
       trace, are discarded
    :param None target_species: name of the target species, if None, it is
       assumed, that only a remapping has been done (the 'mapped to' entry of
       the trace is read instead of the 'syntenic at' one).
    :param './' rootdir: path where to write directories for remapped/checked
       chromosomes (created if needed).
    """
    # which entry of the trace tells whether a border was conserved
    cond = 'syntenic at' if target_species else 'mapped to'
    for crm in genome:
        new_crm = genome[crm]
        for exp in new_crm.experiments:
            if check:
                tadcnt = 0
                new_tads = {}
                for tad in exp.tads:
                    tad_end = exp.tads[tad]['end']
                    try:
                        if trace[crm][tad_end][cond]['chr'] is None:
                            continue
                    except KeyError:
                        # the border itself may be absent from the trace:
                        # do not index it again blindly when reporting
                        try:
                            found = trace[crm][tad_end]
                        except KeyError:
                            found = None
                        print('Not found:', crm, tad_end, found)
                        continue
                    new_tads[tadcnt] = exp.tads[tad]
                    tadcnt += 1
                exp.tads = new_tads
            else:
                tads, norm = parse_tads(exp._tads)
                last = max(tads.keys())
                if not exp.size:
                    exp.size = tads[last]['end']
                exp.norm = norm
                exp.tads = tads
        crmdir = os.path.join(rootdir, crm)
        if not os.path.exists(crmdir):
            # makedirs: rootdir itself may not exist yet
            os.makedirs(crmdir)
        new_crm.save_chromosome(os.path.join(crmdir, crm + '.tdb'))
def save_new_genome(genome, trace, check=False, target_species=None,
                    rootdir='./'):
    """
    Save new chromosomes with remapped or check TAD borders into a new
    folder ('<rootdir>/<chromosome>/<chromosome>.tdb').

    :param genome: a dict containing all chromosomes computed
    :param trace: dictionary containing a trace of all mapped TAD boundaries
    :param False check: if no remapping have to be done (only check TAD
       borders); TADs whose end has no counterpart, or is not found in the
       trace, are discarded
    :param None target_species: name of the target species, if None, it is
       assumed, that only a remapping has been done (the 'mapped to' entry of
       the trace is read instead of the 'syntenic at' one).
    :param './' rootdir: path where to write directories for remapped/checked
       chromosomes (created if needed).
    """
    # which entry of the trace tells whether a border was conserved
    cond = 'syntenic at' if target_species else 'mapped to'
    for crm in genome:
        new_crm = genome[crm]
        for exp in new_crm.experiments:
            if check:
                new_tads = {}
                for tad in exp.tads:
                    border = exp.tads[tad]['end']
                    try:
                        if trace[crm][border][cond]['chr'] is None:
                            continue
                    except KeyError:
                        # the border itself may be absent from the trace:
                        # do not index it again blindly when reporting
                        try:
                            found = trace[crm][border]
                        except KeyError:
                            found = None
                        print('Not found:', crm, border, found)
                        continue
                    # kept TADs are renumbered from 0
                    new_tads[len(new_tads)] = exp.tads[tad]
                exp.tads = new_tads
            else:
                tads, norm = parse_tads(exp._tads)
                last = max(tads.keys())
                if not exp.size:
                    exp.size = tads[last]['end']
                exp.norm = norm
                exp.tads = tads
        crmdir = os.path.join(rootdir, crm)
        if not os.path.exists(crmdir):
            # makedirs: rootdir itself may not exist yet
            os.makedirs(crmdir)
        new_crm.save_chromosome(os.path.join(crmdir, crm + '.tdb'))
def add_TAD_def(self, f_name, name=None, weights=None):
    """
    Add Topologically Associated Domains definition detection to chromosome

    :argument f_name: path to file
    :argument None name: name of the experiment, if None f_name will be used
    :argument None weights: stored as the 'wght' of the experiment when given
    """
    if not name:
        name = f_name
    tads, forbidden = parse_tads(f_name, max_size=self.max_tad_size,
                                 bin_size=self.resolution)
    # only TADs with a (non-null) breakpoint contribute to the list
    brks = [tad["brk"] for tad in tads.values() if tad["brk"]]
    if name not in self.experiments:
        self.experiments[name] = dict.fromkeys(
            ("hi-c", "size", "tads", "brks", "wght"))
    experiment = self.experiments[name]
    experiment["tads"] = tads
    experiment["brks"] = brks
    if weights:
        experiment["wght"] = weights
    if self.forbidden:
        # keep only what is forbidden in every definition loaded so far
        self.forbidden = dict.fromkeys(
            forbidden.intersection(self.forbidden))
    else:
        self.forbidden = dict.fromkeys(forbidden)
    if not self.size:
        last_tad = max(tads)
        self.size = tads[last_tad]["end"] * self.resolution
    self.r_size = self.size - len(self.forbidden) * self.resolution
def _parse_coord(coord):
    """
    Split a 'chromosome:start-end' string into (chromosome, start, end);
    anything else is returned as (coord, None, None).
    """
    try:
        crm, pos = coord.split(':')
        start, end = pos.split('-')
        return crm, int(start), int(end)
    except ValueError:
        return coord, None, None


def run(opts):
    """
    Extract (sub-)matrices from a BAM file, write and/or plot them, and
    register the results in the database.

    Steps: check options, locate the BAM and biases files (from the options
    or from the database), parse the requested coordinates, then for each
    normalization get the matrix and write it (opts.matrix) and/or plot it
    (opts.plot). When neither opts.matrix nor opts.only_plot is set, the
    matrices are written through write_matrix instead. The temporary folder
    is finally removed and, unless opts.interactive, the job is saved to DB.

    :param opts: options object (as checked by check_options)

    :raises Exception: if an external BAM is given without biases while no
       'raw' normalization is asked, or if none of the chromosomes in
       opts.chr_name is found in the BAM file
    """
    from shutil import rmtree  # only needed for the final clean-up

    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    if opts.figsize:
        opts.figsize = list(map(float, opts.figsize.split(',')))

    clean = True  # change for debug

    # always bound: an external BAM used only with 'raw' normalization comes
    # without any biases file
    biases = None
    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    region1 = start1 = end1 = None
    region2 = start2 = end2 = None
    if coord1:
        region1, start1, end1 = _parse_coord(coord1)
        if coord2:
            region2, start2, end2 = _parse_coord(coord2)

    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 3500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            # "is not None": a region starting at position 0 is still a
            # region with coordinates
            if start1 is not None:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2 is not None:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of %s genome\n' %
                         ('partial' if opts.chr_name else 'full'))

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        try:
            bam_refs = bamfile.references
            bam_lengths = bamfile.lengths
        finally:
            bamfile.close()
        if opts.chr_name:
            bam_refs_idx = [bam_refs.index(chr_ord)
                            for chr_ord in opts.chr_name
                            if chr_ord in bam_refs]
            if not bam_refs_idx:
                raise Exception(
                    'ERROR: Wrong number of chromosomes in chr_order. '
                    'Found %s in bam file \n' % (' '.join(bam_refs)))
            bam_refs = [bam_refs[idx] for idx in bam_refs_idx]
            bam_lengths = [bam_lengths[idx] for idx in bam_refs_idx]
        sections = OrderedDict(zip(bam_refs, bam_lengths))
        # cumulative (begin, end) position of each chromosome
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]

        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            if biases and norm != 'raw':
                # NOTE(review): `load` is presumably pickle.load -- the
                # biases file must come from a trusted source
                with open(biases, 'rb') as bias_handler:
                    norm_biases = load(bias_handler)
            else:
                norm_biases = None
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso, norm_biases,
                    normalization=norm, filter_exclude=opts.filter,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size, chr_order=opts.chr_name)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            # work with coordinates relative to the extracted matrix
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = (
                    (reg, p + 1, p + opts.reso)
                    for r, reg in enumerate(regions)
                    for p in range(
                        starts[r] if r < len(starts) and starts[r] else 0,
                        ends[r] if r < len(ends) and ends[r] else sections[reg],
                        opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                with open(path.join(outdir, fnam), 'w') as out:
                    for reg in regions:
                        out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                    if region2:
                        out.write('# BADROWS %s\n' %
                                  (','.join([str(b) for b in bads1])))
                        out.write('# BADCOLS %s\n' %
                                  (','.join([str(b) for b in bads2])))
                    else:
                        out.write('# MASKED %s\n' %
                                  (','.join([str(b) for b in bads1])))
                    if opts.row_names:
                        out.write('\n'.join(
                            '%s\t%d\t%d\t' % (next(row_names)) +
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in range(b1, e1))
                            for j in range(b2, e2)) + '\n')
                    else:
                        out.write('\n'.join(
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in range(b1, e1))
                            for j in range(b2, e2)) + '\n')
            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in range(b1, e1)])
                                for j in range(b2, e2)])
                # mask filtered columns and rows
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    'nrm' if norm == 'norm' else norm[:3], name,
                    nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                section_pos = OrderedDict((k, section_pos[k])
                                          for k in section_pos
                                          if k in regions)
                transform = (log2 if opts.transform == 'log2' else
                             log if opts.transform == 'log' else
                             lambda x: x)
                tads = None
                if opts.tad_def and not region2:
                    tads, _ = parse_tads(opts.tad_def)
                    # "is not None": a region starting at 0 must still be
                    # cut at end1
                    if start1 is not None:
                        first_bin = start1 // opts.reso
                        last_bin = end1 // opts.reso
                        tads = dict(
                            (t, tads[t]) for t in tads
                            if (int(tads[t]['start']) >= first_bin and
                                int(tads[t]['end']) <= last_bin))
                        # coordinates relative to the plotted region
                        for tad in tads:
                            tads[tad]['start'] -= first_bin
                            tads[tad]['end'] -= first_bin
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize, transform=transform,
                    bad_color=opts.bad_color if norm != 'raw' else None,
                    tad_def=tads)
                ax1.set_title(
                    'Region: %s, normalization: %s, resolution: %s' % (
                        name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))

    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        if biases:
            # NOTE(review): `load` is presumably pickle.load -- the biases
            # file must come from a trusted source
            with open(biases, 'rb') as bias_handler:
                all_biases = load(bias_handler)
        else:
            all_biases = None
        out_files.update(write_matrix(
            mreads, opts.reso, all_biases, outdir,
            filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, cooler=opts.cooler, clean=clean,
            chr_order=opts.chr_name))

    if clean:
        printime('Cleaning')
        # no shell involved: safe whatever characters the path contains
        rmtree(tmpdir, ignore_errors=True)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)