def tb_generate_tads(self, expt_name, adj_list, chrom, resolution,
                     normalized, tad_file, n_cpus=15):
    """
    Predict TAD sites for a given chromosome and resolution from a Hi-C
    adjacency list.

    Parameters
    ----------
    expt_name : str
        Name under which the experiment is registered with TADbit
    adj_list : str
        Location of the adjacency list
    chrom : str
        Name of the chromosome to call TADs on
    resolution : int
        Resolution to read the Hi-C adjacency list at
    normalized : bool
        Whether the adjacency list has already been normalized; if False
        the matrix is ICE-normalized before TAD calling
    tad_file : str
        Location of the output TAD file
    n_cpus : int, optional
        Number of CPUs passed to TADbit's ``find_tad`` (default 15, the
        value that was previously hard-coded)

    Returns
    -------
    bool
        True on completion; TAD borders are written to ``tad_file``.
    """
    print("TB TAD GENERATOR:", expt_name, adj_list, chrom, resolution,
          normalized, tad_file)
    hic_data = load_hic_data_from_reads(adj_list, resolution=int(resolution))

    # ICE-normalize unless the caller states the data already is.
    if normalized is False:
        hic_data.normalize_hic(iterations=9, max_dev=0.1)

    # Dump the single-chromosome matrix to a temporary text file so it can
    # be re-loaded below as a TADbit experiment.
    save_matrix_file = adj_list + "_" + str(chrom) + "_tmp.txt"
    hic_data.write_matrix(save_matrix_file, (chrom, chrom), normalized=True)

    chr_hic_data = hic_data.get_matrix((chrom, chrom))
    print("TB - chr_hic_data:", chr_hic_data)

    my_chrom = Chromosome(name=chrom, centromere_search=True)
    my_chrom.add_experiment(expt_name, hic_data=save_matrix_file,
                            resolution=int(resolution))

    # Run core TADbit function to find TADs on each expt.
    my_chrom.find_tad(expt_name, n_cpus=n_cpus)

    exp = my_chrom.experiments[expt_name]
    exp.write_tad_borders(savedata=tad_file + ".tmp")

    # Copy the temporary borders file to its final location.
    with open(tad_file, "wb") as f_out, open(tad_file + ".tmp", "rb") as f_in:
        f_out.write(f_in.read())

    return True
def tb_matrix_hdf5(self, adjlist_file, adj_hdf5, normalized, resolution,
                   chromosomes):
    """
    Save the Hi-C matrix at a given resolution into an HDF5 file.

    This has to be run sequentially as it is not possible for multiple
    streams to write to the same HDF5 file. This is a run once and leave
    operation. There also needs to be a check that no other process is
    writing to the HDF5 file at the same time. This should be done at the
    staging and unstaging level to prevent the file getting written to by
    multiple processes and generating conflicts.

    The dataset carries a ``chromosomes`` attribute (name/size pairs in
    index order) so that no secondary storage is needed outside of the
    HDF5 file - see the mg-rest-adjacency hdf5_reader for further details
    about the requirement.

    Parameters
    ----------
    adjlist_file : str
        Location of the Hi-C adjacency list
    adj_hdf5 : str
        Location of the HDF5 output matrix file
    normalized : bool
        Whether the adjacency list has already been normalized
    resolution : int
        Resolution to read the Hi-C adjacency list at
    chromosomes : list
        List of lists of the chromosome names and their size in the
        order that they are presented for indexing

    Returns
    -------
    bool
        True on completion; matrix stored under dataset ``str(resolution)``.
    """
    hic_data = load_hic_data_from_reads(adjlist_file,
                                        resolution=int(resolution))

    if normalized is False:
        hic_data.normalize_hic(iterations=9, max_dev=0.1)

    d_size = len(hic_data)
    d_tmp = np.zeros([d_size, d_size], dtype='int32')
    d_tmp += hic_data.get_matrix()

    # BUGFIX: the handle was previously leaked if create_dataset raised
    # (e.g. a dataset for this resolution already exists in the file);
    # try/finally guarantees the file is closed either way.
    hdf5_handle = h5py.File(adj_hdf5, "a")
    try:
        dset = hdf5_handle.create_dataset(
            str(resolution), (d_size, d_size), dtype='int32',
            chunks=True, compression="gzip")
        dset.attrs['chromosomes'] = chromosomes
        dset[0:d_size, 0:d_size] += d_tmp
    finally:
        hdf5_handle.close()

    return True
def tb_hic_chr(self, adj_list, resolution):  # pylint: disable=no-self-use
    """Get the list of chromosomes in the adjacency list"""
    print("TB LOADED HIC MATRIX")
    bin_size = int(resolution)
    loaded = load_hic_data_from_reads(adj_list, resolution=bin_size)
    print("TB LOADED HIC MATRIX")
    # Chromosome names come straight from the loaded matrix's index.
    return loaded.chromosomes.keys()
def run(opts):
    """Model a genomic region in 3D from a Hi-C matrix.

    Loads the experiment (from a matrix file or from filtered reads plus
    bad-column/bias files stored in the database), optionally writes a
    cluster job list, then either runs the parameter optimization or the
    full modeling run.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, crm, beg, end, matrix,
        job_list, optimize, ...).
    """
    check_options(opts)
    launch_time = time.localtime()

    # prepare output folders
    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(opts.workdir, '06_model',
                       'chr%s_%s-%s' % (opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        # bad-column file: one filtered bin index per line
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        # bias file: "<bin index>\t<bias>" per line
        hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                             for l in open(biases))
    # NOTE(review): 'crm' is only bound in the opts.matrix branch above;
    # when loading from reads this line raises NameError — confirm intent.
    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(path.join(outdir, 'job_list.q'), 'w')
    else:
        job_file_handler = None

    # optimization
    if opts.optimize:
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        return

    # correlate all optimizations and get best set of parameters
    optpar, dcutoff = correlate_models(opts, outdir, exp)

    # run good models
    big_run(exp, opts, job_file_handler, outdir, optpar)

    finish_time = time.localtime()
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts) if opts.bed: mreads = path.realpath(opts.bed) else: mreads = path.join(opts.workdir, load_parameters_fromdb(opts)) print 'loading', mreads hic_data = load_hic_data_from_reads(mreads, opts.reso) mkdir(path.join(opts.workdir, '04_normalization')) print 'Get poor bins...' try: hic_data.filter_columns( perc_zero=opts.perc_zeros, draw_hist=True, by_mean=not opts.fast_filter, savefig=path.join( opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' % (opts.reso, opts.perc_zeros, param_hash)) if not opts.fast_filter else None) except ValueError: hic_data.filter_columns( perc_zero=100, draw_hist=True, by_mean=not opts.fast_filter, savefig=path.join( opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' % (opts.reso, opts.perc_zeros, param_hash)) if not opts.fast_filter else None) # bad columns bad_columns_file = path.join( opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.tsv' % (opts.reso, opts.perc_zeros, param_hash)) out_bad = open(bad_columns_file, 'w') out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()])) out_bad.close() # Identify biases print 'Get biases using ICE...' hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0, factor=opts.factor) print 'Getting cis/trans...' 
cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True) cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True) cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False) cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False) print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d # Plot genomic distance vs interactions print 'Plot genomic distance vs interactions...' inter_vs_gcoord = path.join( opts.workdir, '04_normalization', 'interactions_vs_genomic-coords.pdf_%s_%s.pdf' % (opts.reso, param_hash)) (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions( hic_data, max_diff=10000, resolution=opts.reso, normalized=True, savefig=inter_vs_gcoord) print 'Decay slope 0.7-10 Mb\t%s' % a2 # write biases bias_file = path.join(opts.workdir, '04_normalization', 'bias_%s_%s.tsv' % (opts.reso, param_hash)) out_bias = open(bias_file, 'w') out_bias.write( '\n'.join(['%d\t%f' % (i, hic_data.bias[i]) for i in hic_data.bias]) + '\n') out_bias.close() # to feed the save_to_db funciton intra_dir_nrm_fig = intra_dir_nrm_txt = None inter_dir_nrm_fig = inter_dir_nrm_txt = None genom_map_nrm_fig = genom_map_nrm_txt = None intra_dir_raw_fig = intra_dir_raw_txt = None inter_dir_raw_fig = inter_dir_raw_txt = None genom_map_raw_fig = genom_map_raw_txt = None if "intra" in opts.keep: print " Saving intra chromosomal raw and normalized matrices..." 
if opts.only_txt: intra_dir_nrm_fig = None intra_dir_raw_fig = None else: intra_dir_nrm_fig = path.join( opts.workdir, '04_normalization', 'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash)) intra_dir_raw_fig = path.join( opts.workdir, '04_normalization', 'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash)) intra_dir_nrm_txt = path.join( opts.workdir, '04_normalization', 'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash)) intra_dir_raw_txt = path.join( opts.workdir, '04_normalization', 'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash)) hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet', name=path.split(opts.workdir)[-1], savefig=intra_dir_nrm_fig, savedata=intra_dir_nrm_txt) hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet', name=path.split(opts.workdir)[-1], savefig=intra_dir_raw_fig, savedata=intra_dir_raw_txt) if "inter" in opts.keep: print " Saving inter chromosomal raw and normalized matrices..." if opts.only_txt: inter_dir_nrm_fig = None inter_dir_raw_fig = None else: inter_dir_nrm_fig = path.join( opts.workdir, '04_normalization', 'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash)) inter_dir_raw_fig = path.join( opts.workdir, '04_normalization', 'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash)) inter_dir_nrm_txt = path.join( opts.workdir, '04_normalization', 'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash)) inter_dir_raw_txt = path.join( opts.workdir, '04_normalization', 'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash)) hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet', name=path.split(opts.workdir)[-1], savefig=inter_dir_nrm_fig, savedata=inter_dir_nrm_txt) hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet', name=path.split(opts.workdir)[-1], savefig=inter_dir_raw_fig, savedata=inter_dir_raw_txt) if "genome" in opts.keep: print " Saving normalized genomic matrix..." 
if opts.only_txt: genom_map_nrm_fig = path.join( opts.workdir, '04_normalization', 'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash)) genom_map_raw_fig = path.join( opts.workdir, '04_normalization', 'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash)) else: genom_map_nrm_fig = None genom_map_raw_fig = None genom_map_nrm_txt = path.join( opts.workdir, '04_normalization', 'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash)) genom_map_raw_txt = path.join( opts.workdir, '04_normalization', 'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash)) hic_map(hic_data, normalized=True, cmap='jet', name=path.split(opts.workdir)[-1], savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt) hic_map(hic_data, normalized=False, cmap='jet', name=path.split(opts.workdir)[-1], savefig=genom_map_raw_fig, savedata=genom_map_raw_txt) finish_time = time.localtime() save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig, inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt, intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig, inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt, launch_time, finish_time)
def run(opts):
    """Merge two filtered-read datasets, optionally comparing them first.

    Resolves both input datasets (from BED paths or from the database),
    checks the resolutions match, optionally correlates the two matrices
    (distance-decay and eigenvector correlations), merges the reads, and
    records everything in the database.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, workdir1/2, bed1/2, reso,
        norm, skip_comparison, ...).
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)

    # both datasets must have been binned at the same resolution
    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        print 'Comparison'
        print ' - loading first sample', mreads1
        hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)
        print ' - loading second sample', mreads2
        hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)
        if opts.norm and biases1:
            bad_co1 = path.join(opts.workdir1, bad_co1)
            print ' - loading bad columns from first sample', bad_co1
            hic_data1.bads = dict((int(l.strip()), True)
                                  for l in open(bad_co1))
            biases1 = path.join(opts.workdir1, biases1)
            print ' - loading biases from first sample', biases1
            hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases1))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        if opts.norm and biases2:
            bad_co2 = path.join(opts.workdir2, bad_co2)
            print ' - loading bad columns from second sample', bad_co2
            hic_data2.bads = dict((int(l.strip()), True)
                                  for l in open(bad_co2))
            biases2 = path.join(opts.workdir2, biases2)
            print ' - loading biases from second sample', biases2
            hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases2))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        decay_corr_dat = path.join(
            opts.workdir, '00_merge',
            'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(
            opts.workdir, '00_merge',
            'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(
            opts.workdir, '00_merge',
            'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(
            opts.workdir, '00_merge',
            'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))
    else:
        # placeholders so save_to_db below still receives values
        hic_data1 = {}
        hic_data2 = {}
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

    # if opts.norm:
    # has bias file

    if not opts.skip_comparison:
        print ' => correlation between equidistant loci'
        corr, _, bads = correlate_matrices(hic_data1, hic_data2,
                                           normalized=opts.norm,
                                           remove_bad_columns=True,
                                           savefig=decay_corr_fig,
                                           savedata=decay_corr_dat,
                                           get_bads=True)
        print ' => correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True,
                                          nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
    else:
        corr = eig_corr = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % (param_hash))

    print '\nMergeing...'
    nreads = merge_2d_beds(mreads1, mreads2, outbed)

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads.keys()), len(hic_data1), nreads,
               eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
               biases1, bad_co1, biases2, bad_co2,
               launch_time, finish_time)
    print '\n\nDone.'
def run(opts):
    """Segment a Hi-C matrix into compartments and/or TADs.

    Loads the normalized matrix from the database (or from explicit
    paths when --nosql), then runs compartment detection and per-chromosome
    TAD calling, writing one TSV per chromosome.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, nosql, crms, only_tads,
        only_compartments, max_tad_size, cpus, ...).
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if not opts.nosql:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id
    else:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso = opts.reso

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print 'loading %s at resolution %s' % (mreads, nice(reso))
    hic_data = load_hic_data_from_reads(mreads, reso)
    # bad-column file: one filtered bin index per line
    hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
    # bias file: "<bin index>\t<bias>" per line
    hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                         for l in open(biases))

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print 'Searching compartments'
        hic_data.find_compartments(crms=opts.crms)

        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file,
                                        chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '05_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print ' - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print " Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0
                           for i in xrange(beg, end)])
            # maximum size of a TAD
            max_tad_size = size if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=True,
                            max_tad_size=max_tad_size,
                            no_heuristic=True)
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s%s\n' % ('#', 'start', 'end', 'score',
                                             'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1),
                    int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    '\t%s' % (round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs,
                   launch_time, finish_time)
def run(opts):
    """Merge two filtered-read datasets (older variant).

    Unlike the sibling merge implementation, this version always loads
    both matrices (even when the comparison is skipped) and returns None
    correlations when --skip-comparison is set.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, workdir1/2, bed1/2, reso,
        norm, skip_comparison, ...).
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)

    # both datasets must have been binned at the same resolution
    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    print 'loading first sample', mreads1
    hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)

    print 'loading second sample', mreads2
    hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)

    if opts.norm and biases1:
        bad_co1 = path.join(opts.workdir1, bad_co1)
        print 'loading bad columns from first sample', bad_co1
        hic_data1.bads = dict((int(l.strip()), True) for l in open(bad_co1))
        biases1 = path.join(opts.workdir1, biases1)
        print 'loading biases from first sample', biases1
        hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases1))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')
    if opts.norm and biases2:
        bad_co2 = path.join(opts.workdir2, bad_co2)
        print 'loading bad columns from second sample', bad_co2
        hic_data2.bads = dict((int(l.strip()), True) for l in open(bad_co2))
        biases2 = path.join(opts.workdir2, biases2)
        print 'loading biases from second sample', biases2
        hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases2))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso,
                                                                 param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso,
                                                                 param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso,
                                                                 param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso,
                                                                 param_hash))
    else:
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

    # if opts.norm:
    # has bias file

    if not opts.skip_comparison:
        print 'correlation between equidistant loci'
        corr, _, bads = correlate_matrices(hic_data1, hic_data2,
                                           normalized=opts.norm,
                                           remove_bad_columns=True,
                                           savefig=decay_corr_fig,
                                           savedata=decay_corr_dat,
                                           get_bads=True)
        print 'correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True,
                                          nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
    else:
        corr = eig_corr = None
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % (
                           param_hash))

    nreads = merge_2d_beds(mreads1, mreads2, outbed)

    finish_time = time.localtime()
    save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(bads.keys()), len(hic_data1), nreads,
                eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
                biases1, bad_co1, biases2, bad_co2,
                launch_time, finish_time)
def run(opts):
    """Model a genomic region in 3D (newer variant with batch-job hashing).

    Prints a banner, loads the experiment, derives a per-batch hash for
    the output folder, then either writes job lists, optimizes modeling
    parameters, or runs the full modeling, and saves the job to sqlite.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, crm, beg/end, ori_beg/ori_end,
        reso, matrix, job_list, optimize, ...).
    """
    check_options(opts)
    launch_time = time.localtime()

    print(
        '''
%s%s
 - Region: Chromosome %s from %d to %d at resolution %s (%d particles)
''' % ('Preparing ' if opts.job_list else '',
       ('Optimization\n' + '*' * (21 if opts.job_list else 11))
       if opts.optimize else
       ('Modeling\n' + '*' * (18 if opts.job_list else 8)),
       opts.crm, opts.ori_beg, opts.ori_end, nicer(opts.reso),
       opts.end - opts.beg))

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        # FIXME: copied from somewhere else
        # NOTE(review): as in the older variant, 'crm' is unbound on this
        # branch, so 'crm.experiments[0]' below would raise NameError.
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                             for l in open(biases))
    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # prepare output folders
    # hash over the modeling-relevant options only, so reruns with the
    # same parameters land in the same output directory
    batch_job_hash = digest_parameters(
        opts, get_md5=True, extra=[
            'maxdist', 'upfreq', 'lowfreq', 'scale', 'dcutoff',
            'nmodels_run', 'job_list', 'rand', 'nmodels', 'nkeep',
            'optimize', 'optimization_id', 'cpus', 'workdir', 'matrix',
            'ori_beg', 'ori_end'])

    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(
        opts.workdir, '06_model',
        '%s_chr%s_%s-%s' % (batch_job_hash, opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(
            path.join(outdir,
                      'job_list_%s.q' % ('optimization' if opts.optimize
                                         else 'modeling')), 'w')
    else:
        job_file_handler = None

    ###############
    # Optimization
    print ' o Optimizing parameters'
    if opts.optimize:
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        print('\n optimization done')

    # correlate all optimization and get best set of parameters
    if not (opts.optimize and opts.job_list):
        optpar, results = correlate_models(opts, outdir, exp)
    else:
        results = []

    ###########
    # Modeling
    if not opts.optimize:
        big_run(exp, opts, job_file_handler, outdir, optpar)

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, outdir, results, batch_job_hash,
               launch_time, finish_time)
def run(opts):
    """Filter poor bins and ICE-normalize a Hi-C matrix (newer variant).

    Adds --min-count filtering and a --filter-only mode that skips
    normalization (normalized quantities are NaN / skipped in that case).
    The HiC-data object is pickled and all outputs recorded in the DB.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, bed, reso, perc_zeros,
        min_count, fast_filter, filter_only, factor, keep, only_txt, ...).
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(perc_zero=opts.perc_zeros,
                                min_count=opts.min_count, draw_hist=True,
                                by_mean=not opts.fast_filter,
                                savefig=path.join(
                                    opts.workdir, '04_normalization',
                                    'bad_columns_%s_%d_%d_%s.pdf' % (
                                        opts.reso, opts.perc_zeros,
                                        opts.min_count, param_hash))
                                if not opts.fast_filter else None)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')

    # bad columns
    bad_columns_file = path.join(opts.workdir, '04_normalization',
                                 'bad_columns_%s_%d_%d_%s.tsv' % (
                                     opts.reso, opts.perc_zeros,
                                     opts.min_count, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    if not opts.filter_only:
        print 'Get biases using ICE...'
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    print 'Getting cis/trans...'
    # normalized ratios stay NaN when only filtering was requested
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if not opts.filter_only:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True , diagonal=True )
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True , diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True )
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    if not opts.filter_only:
        print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
        print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.pdf_%s_%s.pdf' % (
                                    opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso,
        normalized=not opts.filter_only,
        savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = 'NA'
    if not opts.filter_only:
        out_bias = open(bias_file, 'w')
        out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                  for i in hic_data.bias]) + '\n')
        out_bias.close()

    # pickle the HiC-data object
    print 'Saving genomic matrix'
    pickle_path = path.join(opts.workdir, '04_normalization',
                            'hic-data_%s_%s.pickle' % (nice(opts.reso),
                                                       param_hash))
    out = open(pickle_path, 'w')
    dump(hic_data, out)
    out.close()

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print " Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=intra_dir_nrm_fig,
                    savedata=intra_dir_nrm_txt)
            hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=intra_dir_raw_fig,
                    savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print " Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            if not opts.filter_only:
                inter_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            inter_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=inter_dir_nrm_fig,
                    savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print " Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            if not opts.filter_only:
                genom_map_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        if not opts.filter_only:
            genom_map_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=genom_map_nrm_fig,
                    savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig,
                savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db (opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
                cis_trans_n_d, a2, bad_columns_file, bias_file,
                inter_vs_gcoord, mreads, len(hic_data.bads.keys()),
                len(hic_data),
                intra_dir_nrm_fig, intra_dir_nrm_txt,
                inter_dir_nrm_fig, inter_dir_nrm_txt,
                genom_map_nrm_fig, genom_map_nrm_txt,
                intra_dir_raw_fig, intra_dir_raw_txt,
                inter_dir_raw_fig, inter_dir_raw_txt,
                genom_map_raw_fig, genom_map_raw_txt,
                pickle_path, launch_time, finish_time)
def run(opts):
    """Segment a Hi-C matrix into compartments and/or TADs (newer variant).

    Compared with the older segmentation tool this version labels
    compartments by clustering, writes per-chromosome eigenvector files,
    tolerates a missing bias file when only TADs are requested, and runs
    the TAD caller with the heuristic enabled.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, nosql, crms, rich_in_A,
        only_tads, only_compartments, max_tad_size, cpus, ...).
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.nosql:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso = opts.reso
        inputs = []
    else:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print 'loading %s \n at resolution %s' % (mreads, nice(reso))
    hic_data = load_hic_data_from_reads(mreads, reso)
    # bad-column file: one filtered bin index per line
    hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
    print 'loading filtered columns %s' % (bad_co)
    print ' with %d of %d filtered out columns' % (len(hic_data.bads),
                                                   len(hic_data))
    try:
        # bias file: "<bin index>\t<bias>" per line
        hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                             for l in open(biases))
    except IOError:
        # missing biases are only fatal if compartments were requested
        if not opts.only_tads:
            raise Exception('ERROR: data should be normalized to get compartments')

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print 'Searching compartments'
        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        firsts = hic_data.find_compartments(crms=opts.crms,
                                            label_compartments='cluster',
                                            savefig=cmprt_dir,
                                            suffix=param_hash, log=cmprt_dir,
                                            rich_in_A=opts.rich_in_A)
        # write first/second eigenvectors per chromosome
        for crm in opts.crms or hic_data.chromosomes:
            if not crm in firsts:
                continue
            ev_file = open(path.join(
                cmprt_dir, '%s_EigVect_%s.tsv' % (crm, param_hash)), 'w')
            ev_file.write('# first EV\tsecond EV\n')
            ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                     for vs in zip(*firsts[crm])]))
            ev_file.close()
        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file,
                                        chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '05_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print ' - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print " Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0
                           for i in xrange(beg, end)])
            # maximum size of a TAD
            max_tad_size = size if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=False,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s%s\n' % ('#', 'start', 'end', 'score',
                                             'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1),
                    int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    '\t%s' % (round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs,
                   launch_time, finish_time)