def modeling_step(model, cfg):
    """Optimize every structure in parallel and merge the per-structure outputs.

    The input population is read from ``cfg['mstep']['input_hss']``; one
    ``modeling_task`` per structure is dispatched through the configured
    parallel controller; the resulting coordinate and info files are merged
    into ``cfg['mstep']['output_hss']`` and ``cfg['mstep']['info_out']`` and
    the per-structure files are then removed.

    Parameters
    ----------
    model :
        Not used by this function.
    cfg :
        Configuration object; ``cfg['mstep']['templates']`` is set here and the
        configuration is saved to ``<workdir>/<run_name>.config``.
    """
    mstep = cfg['mstep']

    with HssFile(mstep['input_hss'], 'r') as source:
        radii, index, genome = source.radii, source.index, source.genome

    n_struct = mstep['n_struct']
    n_beads = mstep['n_beads']
    basepath = os.path.join(mstep['workdir'], mstep['run_name'])
    config_path = basepath + '.config'

    # per-structure output file name templates, resolved with the structure id
    mstep['templates'] = {
        'crd_out': basepath + '.outcrd.{}.npy',
        'info_out': basepath + '.info.{}.txt',
    }
    cfg.save(config_path)

    # workers re-read the configuration from disk, so only the path is bound
    worker = partial(modeling_task, cfg_file=config_path)
    controller_type = cfg['parallel_controller']
    controller_options = cfg['parallel_controller_options']
    controller = controller_class[controller_type](**controller_options)
    controller.map(worker, list(range(n_struct)))

    # merge coordinates into the output population file
    with HssFile(mstep['output_hss'], 'w') as hss:
        hss.index = index
        hss.genome = genome
        hss.radii = radii
        all_crd = hss.create_dataset('coordinates',
                                     shape=(n_beads, n_struct, 3),
                                     dtype=COORD_DTYPE)
        for struct in range(n_struct):
            paths = resolve_templates(mstep['templates'], [struct])
            # note that we discard all the positions of added atoms
            all_crd[:, struct, :] = np.load(paths['crd_out'])

    # merge the info files: one header line, then one block per structure
    kernel = kernel_class[mstep['kernel']]
    with open(mstep['info_out'], 'w') as merged:
        merged.write('#')
        merged.write(''.join(key + '\t' for key in kernel.INFO_KEYS))
        merged.write('\n')
        for struct in range(n_struct):
            paths = resolve_templates(mstep['templates'], [struct])
            with open(paths['info_out']) as part:
                merged.write(part.read() + '\n')

    # cleanup of the per-structure files
    for struct in range(n_struct):
        paths = resolve_templates(mstep['templates'], [struct])
        for key in ('crd_out', 'info_out'):
            os.remove(paths[key])
def setup(self):
    """Create the SPRITE temporary directory and split the clusters into batches.

    Sets ``tmp_extensions``, ``tmp_dir``, ``keep_temporary_files``,
    ``n_struct``, ``n_batches``, ``n_clusters`` and ``argument_list`` on the
    instance.
    """
    # value is not used; the lookup fails early if the section is missing
    _sprite_options = self.cfg['restraints']['sprite']

    self.tmp_extensions = [".npy", ".npz"]

    relative_tmp = self.cfg.get('restraints/sprite/tmp_dir', 'sprite')
    base_tmp = self.cfg.get('parameters/tmp_dir')
    self.tmp_dir = make_absolute_path(relative_tmp, base_tmp)

    self.keep_temporary_files = self.cfg.get(
        'restraints/sprite/keep_temporary_files', False)

    if not os.path.isdir(self.tmp_dir):
        os.makedirs(self.tmp_dir)

    # 'indptr' holds one more entry than there are clusters
    clusters_file = self.cfg.get('restraints/sprite/clusters')
    with h5py.File(clusters_file, 'r') as h5:
        n_clusters = len(h5['indptr']) - 1

    with HssFile(self.cfg.get("optimization/structure_output"), 'r') as hss:
        self.n_struct = hss.nstruct

    # ceiling division: a partial last batch still counts as a batch
    batch_size = self.cfg.get('restraints/sprite/batch_size', 10)
    n_batches, leftover = divmod(n_clusters, batch_size)
    if leftover != 0:
        n_batches += 1

    self.n_batches = n_batches
    self.n_clusters = n_clusters
    self.argument_list = range(n_batches)
def prepareHss(fname,
               nbead,
               nstruct,
               genome,
               index,
               radii,
               nucleus_shape='sphere',
               nucleus_parameters=5000.0,
               nucleus_volume=0,
               coord_chunks=None):
    """Create a new HSS population file and fill in its basic content.

    Writes bead/structure counts, genome, index and radii, creates the
    ``coordinates`` dataset of shape ``(nbead, nstruct, 3)`` and an
    ``envelope`` group with ``shape``, ``volume`` and ``params`` datasets.

    When ``coord_chunks`` is None the coordinates are written as zeros through
    ``set_coordinates``; otherwise an empty chunked dataset with the requested
    chunk shape is created.
    """
    crd_shape = (nbead, nstruct, 3)
    envelope_items = (
        ('shape', nucleus_shape),
        ('volume', nucleus_volume),
        ('params', nucleus_parameters),
    )

    with HssFile(fname, 'w') as hss:
        hss.set_nbead(nbead)
        hss.set_nstruct(nstruct)
        hss.set_genome(genome)
        hss.set_index(index)
        hss.set_radii(radii)

        if coord_chunks is not None:
            hss.create_dataset('coordinates',
                               shape=crd_shape,
                               dtype=COORD_DTYPE,
                               chunks=coord_chunks)
        else:
            hss.set_coordinates(np.zeros(crd_shape))

        env = hss.create_group('envelope')
        for dataset_name, value in envelope_items:
            env.create_dataset(dataset_name, data=value)
def task(batch_id, cfg, tmp_dir):
    """Compute Hi-C activation distances for the batch identified by ``batch_id``.

    Reads the batch parameters from ``<tmp_dir>/<batch_id>.in.npy`` (rows of
    ``i, j, pwish, plast``), calls ``get_actdist`` for every row against the
    population file given by ``optimization/structure_output`` and writes the
    collected records, formatted with ``actdist_fmt_str``, to
    ``<tmp_dir>/<batch_id>.out.tmp``.

    Parameters
    ----------
    batch_id : int
        Identifier of the batch; used to build the input and output file names.
    cfg :
        Configuration object.
    tmp_dir : str
        Directory holding the batch input file and receiving the output file.
    """
    # loop-invariant, so read it once
    contact_range = cfg['restraints']['Hi-C'].get('contact_range', 2.0)

    # read params
    params = np.load(os.path.join(tmp_dir, '%d.in.npy' % batch_id))

    results = []
    # the context manager guarantees the population file is closed even when
    # get_actdist raises
    with HssFile(cfg.get("optimization/structure_output"), 'r') as hss:
        for i, j, pwish, plast in params:
            # each call yields records of the form (i, j, actdist, p)
            results.extend(
                get_actdist(int(i), int(j), pwish, plast, hss,
                            contactRange=contact_range))

    # one output file per batch
    fname = os.path.join(tmp_dir, '%d.out.tmp' % batch_id)
    with open(fname, 'w') as f:
        f.write('\n'.join(actdist_fmt_str % x for x in results))
def teardown_poller(self):
    """Write the accumulated results to the HSS population file.

    Called after all coordinates and statistics have been updated for the
    whole population. Stores the violation score, the full coordinate array
    (``self._hss_crd``) and the JSON-serialized ``self._summary_data`` (dataset
    ``summary``) into ``self.hssfilename``.
    """
    total_violations = self._summary_data['n_violations']
    total_restraints = self._summary_data['n_imposed']

    # fraction of imposed restraints that are violated; 0 when nothing was imposed
    if total_restraints == 0:
        violation_score = 0
    else:
        violation_score = total_violations / total_restraints

    # open HSS population file
    self._hss = HssFile(self.hssfilename, 'r+')
    try:
        self._hss.set_violation(violation_score)
        self._hss.set_coordinates(self._hss_crd)
        # `default` converts values json cannot encode by calling .tolist() on them
        h5_create_or_replace_dataset(
            self._hss,
            'summary',
            data=json.dumps(self._summary_data,
                            default=lambda a: a.tolist()))
    finally:
        # close the file even if one of the writes above fails
        self._hss.close()
def reduce(self):
    """Collect all structure coordinates and assemble the final HSS file.

    Reads every per-structure ``<tmp_file_prefix>_<i>.hms`` file from
    ``self.tmp_dir``, copies its coordinates into the ``.T`` population file,
    stores the overall violation score (NaN when no restraints were imposed),
    then repacks the file with ``h5repack`` into the final
    ``optimization/structure_output`` file.

    Raises
    ------
    RuntimeError
        If ``h5repack`` exits with a non-zero status.
    """
    # local import: only this method needs it
    import subprocess

    hssfilename = self.cfg["optimization"]["structure_output"] + '.T'
    swapfilename = hssfilename + '.swap'

    total_restraints = 0.0
    total_violations = 0.0

    with HssFile(hssfilename, 'r+') as hss:
        n_struct = hss.nstruct
        n_beads = hss.nbead

        # full coordinate matrix, edited in memory and written back in one go
        master = hss.coordinates

        print('Collecting all the coordinates from all configurations....')
        for i in tqdm(range(n_struct), desc='(REDUCE)'):
            fname = "{}_{}.hms".format(self.tmp_file_prefix, i)
            # close each per-structure file as soon as it has been read
            with HmsFile(os.path.join(self.tmp_dir, fname)) as hms:
                master[:, i, :] = hms.get_coordinates()
                total_restraints += hms.get_total_restraints()
                total_violations += hms.get_total_violations()

        hss.set_coordinates(master)

        # no restraints imposed: the score is undefined, and this also avoids
        # a division by zero
        if total_restraints == 0:
            hss.set_violation(np.nan)
        else:
            hss.set_violation(total_violations / total_restraints)

    # repack
    PACK_SIZE = 1e6
    pack_beads = max(1, int(PACK_SIZE / n_struct / 3))
    pack_beads = min(pack_beads, n_beads)

    logger.info('repacking...')
    # argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed through unchanged
    cmd = [
        'h5repack', '-l',
        'coordinates:CHUNK={:d}x{:d}x3'.format(pack_beads, n_struct),
        hssfilename, swapfilename
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        print(' '.join(cmd))
        print('O:', proc.stdout.decode('utf-8'))
        print('E:', proc.stderr.decode('utf-8'))
        raise RuntimeError('repacking failed. error code: %d' %
                           proc.returncode)
    logger.info('done.')

    os.rename(swapfilename, self.cfg.get("optimization/structure_output"))

    if self.keep_intermediate_structures:
        copyfile(self.cfg["optimization"]["structure_output"],
                 self.intermediate_name())
def reduce(self):
    """Finalize the step: wait for the poller, log statistics, repack the HSS file.

    After the file poller has been drained, the violation score returned by
    ``log_stats`` is stored in ``cfg['runtime']['violation_score']`` and the
    configuration is saved as JSON in the ``config_data`` dataset. The file is
    then repacked with ``h5repack`` into a ``.swap`` file, which is optionally
    copied to ``intermediate_name() + '.hss'`` and finally moved to
    ``optimization/structure_output``.

    Raises
    ------
    RuntimeError
        If ``h5repack`` exits with a non-zero status.
    """
    # wait for poller to finish
    progress = tqdm(self.file_poller.enumerate(), desc='(REDUCE)')
    for _ in progress:
        pass

    # read and log details, and save the runtime variables
    with HssFile(self.hssfilename, 'r+') as hss:
        n_struct, n_beads = hss.nstruct, hss.nbead
        self.cfg['runtime']['violation_score'] = log_stats(hss, self.cfg)
        serialized_cfg = json.dumps(self.cfg, default=lambda a: a.tolist())
        h5_create_or_replace_dataset(hss, 'config_data', data=serialized_cfg)

    # chunk layout: about PACK_SIZE values per chunk, between 1 and n_beads beads
    PACK_SIZE = 1e6
    pack_beads = min(max(1, int(PACK_SIZE / n_struct / 3)), n_beads)
    chunk_spec = 'coordinates:CHUNK={:d}x{:d}x3'.format(pack_beads, n_struct)

    swap_file = self.hssfilename + '.swap'
    cmd = [
        'h5repack', '-i', self.hssfilename, '-o', swap_file, '-l', chunk_spec,
        '-v'
    ]

    sp = Popen(cmd, stderr=PIPE, stdout=PIPE)
    logger.info('repacking...')
    stdout, stderr = sp.communicate()
    failed = sp.returncode != 0
    if failed:
        print(' '.join(cmd))
        print('O:', stdout.decode('utf-8'))
        print('E:', stderr.decode('utf-8'))
        raise RuntimeError('repacking failed. error code: %d' % sp.returncode)
    logger.info('done.')

    # keep a uniquely named copy of this step's result if requested
    if self.keep_intermediate_structures:
        copyfile(swap_file, self.intermediate_name() + '.hss')

    # finally replace output file
    shutil.move(swap_file, self.cfg.get("optimization/structure_output"))
def reduce(self):
    """Wait for the poller to update all coordinates, then repack the HSS file.

    The repacked ``.swap`` file is optionally copied to
    ``intermediate_name() + '.hss'`` and then renamed to
    ``optimization/structure_output``.

    Raises
    ------
    RuntimeError
        If ``h5repack`` exits with a non-zero status.
    """
    # drain the poller; it updates the structure coordinates as it goes
    progress = tqdm(self.file_poller.enumerate(), desc='(REDUCE)')
    for _ in progress:
        pass

    with HssFile(self.hssfilename, 'r+') as hss:
        n_struct, n_beads = hss.nstruct, hss.nbead

    logger.info(
        'Coordinates in master file updated for ALL structures; repacking starts...'
    )

    # chunk layout: about PACK_SIZE values per chunk, between 1 and n_beads beads
    PACK_SIZE = 1e6
    pack_beads = min(max(1, int(PACK_SIZE / n_struct / 3)), n_beads)
    chunk_spec = 'coordinates:CHUNK={:d}x{:d}x3'.format(pack_beads, n_struct)

    swap_file = self.hssfilename + '.swap'
    cmd = [
        'h5repack', '-i', self.hssfilename, '-o', swap_file, '-l', chunk_spec,
        '-v'
    ]

    sp = Popen(cmd, stderr=PIPE, stdout=PIPE)
    logger.info('repacking...')
    stdout, stderr = sp.communicate()
    failed = sp.returncode != 0
    if failed:
        print(' '.join(cmd))
        print('O:', stdout.decode('utf-8'))
        print('E:', stderr.decode('utf-8'))
        raise RuntimeError('repacking failed. error code: %d' % sp.returncode)
    logger.info('repacking done.')

    # keep a uniquely named copy of this step's result if requested
    if self.keep_intermediate_structures:
        copyfile(swap_file, self.intermediate_name() + '.hss')

    # the temporary .swap file becomes the final output file
    os.rename(swap_file, self.cfg.get("optimization/structure_output"))
def checkViolations(cfg):
    """Decide whether to continue and advance the Hi-C sigma when appropriate.

    Reads the violation score from ``cfg["structure_output"]``.

    * If no ``sigma`` is set yet, the first entry of ``sigma_list`` is popped
      and becomes the current sigma.
    * If a sigma is set and the violation score is below 0.01, the next entry
      of ``sigma_list`` becomes the current sigma; when the list is empty the
      function returns False.
    * Otherwise the configuration is left unchanged.

    Returns
    -------
    bool
        False when the score is below 0.01 and no sigma values are left,
        True in every other case.
    """
    # only the violation score is needed; close the file right after reading
    with HssFile(cfg["structure_output"]) as hss:
        vio = hss.get_violation()

    hic = cfg["restraints"]["Hi-C"]

    if "sigma" not in hic:
        print("Start", vio)
        hic["sigma"] = hic["sigma_list"].pop(0)
        return True

    print(hic["sigma"], vio)
    if vio < 0.01:
        if len(hic["sigma_list"]) == 0:
            return False
        hic["sigma"] = hic["sigma_list"].pop(0)
    return True
def task(struct_id, cfg, tmp_dir):
    """Generate one random structure with territories and save it.

    The coordinates produced by ``generate_territories`` are written to
    ``<tmp_dir>/random_<struct_id>.hms``.

    The numpy global random generator is re-seeded with
    ``(k * struct_id) mod 2**32``, where ``k`` is drawn from the generator's
    current state; for ``struct_id == 0`` the seed is therefore always 0.
    """
    modulus = 2**32
    base = np.random.randint(0, modulus)
    np.random.seed((base * struct_id) % modulus)

    population_file = cfg["optimization"]["structure_output"]
    nucleus_radius = cfg.get("model/init_radius")

    # only the index is needed from the population file
    with HssFile(population_file, 'r') as population:
        index = population.index

    coordinates = generate_territories(index, nucleus_radius)

    out_path = os.path.join(tmp_dir, 'random_%d.hms' % struct_id)
    with HmsFile(out_path, 'w') as out:
        out.saveCoordinates(struct_id, coordinates)
def setup_poller(self):
    """Set up the polling function state.

    Loads the coordinates of the whole population from ``self.hssfilename``
    into ``self._hss_crd`` and initializes ``self._summary_data``, the
    dictionary that accumulates the run statistics (global counters, a
    violation histogram, per-structure arrays and per-restraint entries).
    """
    # the context manager closes the file even if reading the coordinates fails
    with HssFile(self.hssfilename, 'r') as hss:
        self._hss_crd = hss.coordinates

    population_size = self.cfg["model"]["population_size"]

    def per_structure():
        # one float32 accumulator slot per structure in the population
        return np.zeros(population_size, dtype=np.float32)

    # DEFAULT_HIST_BINS regular bins on [0, DEFAULT_HIST_MAX], plus one
    # overflow bin reaching to infinity
    bin_width = DEFAULT_HIST_MAX / DEFAULT_HIST_BINS
    edges = np.arange(0, DEFAULT_HIST_MAX, bin_width).tolist()
    edges += [DEFAULT_HIST_MAX, np.inf]

    self._summary_data = {
        'n_imposed': 0.0,
        'n_violations': 0.0,
        'histogram': {
            'counts': np.zeros(DEFAULT_HIST_BINS + 1),
            'edges': edges
        },
        'bystructure': {
            'n_imposed': per_structure(),
            'n_violations': per_structure(),
            'total_energies': per_structure(),
            'pair_energies': per_structure(),
            'bond_energies': per_structure(),
            'thermo': {}
        },
        'byrestraint': {}
    }
def task(batch_id, cfg, tmp_dir):
    """Compute DamID activation distances for the batch ``batch_id``.

    Reads rows of ``i, pwish, plast`` from
    ``<tmp_dir>/<batch_id>.damid.in.npy``, calls ``get_damid_actdist`` for each
    row and writes the collected records, formatted with
    ``damid_actdist_fmt_str``, to ``<tmp_dir>/<batch_id>.out.tmp``.

    Raises
    ------
    NotImplementedError
        If the configured nucleus shape is neither 'sphere' nor 'ellipsoid'.
    """
    shape = cfg.get('model/restraints/envelope/nucleus_shape')

    # the geometric parameter handed to get_damid_actdist depends on the shape
    if shape == 'sphere':
        parameter_key = 'model/restraints/envelope/nucleus_radius'
    elif shape == 'ellipsoid':
        parameter_key = 'model/restraints/envelope/nucleus_semiaxes'
    else:
        raise NotImplementedError(
            'shape %s has not been implemented yet.' % shape)
    nucleus_parameters = cfg.get(parameter_key)

    contact_range = cfg.get('restraints/DamID/contact_range', 0.05)
    in_path = os.path.join(tmp_dir, '%d.damid.in.npy' % batch_id)
    out_path = os.path.join(tmp_dir, '%d.out.tmp' % batch_id)

    records = []
    with HssFile(cfg.get("optimization/structure_output"), 'r') as hss:
        for i, pwish, plast in np.load(in_path):
            # each call contributes records of the form (i, damid_actdist, p)
            records.extend(
                get_damid_actdist(int(i),
                                  pwish,
                                  plast,
                                  hss,
                                  contact_range=contact_range,
                                  shape=shape,
                                  nucleus_param=nucleus_parameters))

    lines = [damid_actdist_fmt_str % record for record in records]
    with open(out_path, 'w') as out:
        out.write('\n'.join(lines))
def get_structure(path, n, folder='.'):
    """Load structure ``n`` from an HSS file as plain Python lists.

    Returns
    -------
    dict
        ``crd`` and ``rad``: coordinates and radii split at the offsets in
        ``cstarts``; ``idx``: the chromosome name of every bead; ``n``: number
        of structures in the file; ``cstarts``: the index offsets; ``chroms``:
        the chromosome names as strings, with consecutive repeats collapsed.
    """
    full_path = os.path.join(folder, path)

    with HssFile(full_path, 'r') as hss:
        crd = hss.get_struct_crd(n).tolist()
        chrom = hss.genome.chroms[hss.index.chrom].tolist()
        radius = hss.radii.tolist()
        nstruct = hss.nstruct
        cstarts = hss.index.offset.tolist()

    # consecutive offset pairs delimit the segments
    spans = list(zip(cstarts, cstarts[1:]))

    # keep a name only where it differs from the one of the previous bead
    chrom_names = []
    for position, name in enumerate(chrom):
        if position == 0 or name != chrom[position - 1]:
            chrom_names.append(str(name))

    return {
        'crd': [crd[begin:end] for begin, end in spans],
        'idx': chrom,
        'rad': [radius[begin:end] for begin, end in spans],
        'n': int(nstruct),
        'cstarts': cstarts,
        'chroms': chrom_names,
    }
def task(batch_id, cfg, tmp_dir):
    """Select, for each SPRITE cluster of a batch, the structures with the
    smallest squared radius of gyration.

    The clusters of batch ``batch_id`` are read from the clusters file; for
    each cluster the ``keep_best`` structures with the smallest value returned
    by ``compute_gyration_radius`` are kept. Clusters spanning more than
    ``restraints/sprite/max_chrom_in_cluster`` distinct chromosomes are skipped
    and represented by arrays filled with -1.

    Three files are written to ``tmp_dir``:

    * ``tmp.<batch_id>.selected.npz``: one array per cluster with the selected
      bead indexes, one row per kept structure;
    * ``tmp.<batch_id>.idx.npy``: indexes of the kept structures;
    * ``tmp.<batch_id>.values.npy``: the corresponding values, sorted in
      ascending order.
    """
    clusters_file = cfg.get('restraints/sprite/clusters')
    batch_size = cfg.get('restraints/sprite/batch_size', 10)
    keep_best = cfg.get('restraints/sprite/keep_best', 50)
    max_chrom = cfg.get('restraints/sprite/max_chrom_in_cluster', 6)

    # read the clusters of this batch
    with h5py.File(clusters_file, 'r') as h5:
        start = batch_id * batch_size
        stop = (batch_id + 1) * batch_size + 1
        ii = h5['indptr'][start:stop][()]
        data = h5['data'][ii[0]:ii[-1]]  # load everything for performance
    ii -= ii[0]  # make the pointers relative to the loaded slice
    # one array of bead indexes per cluster
    clusters = [data[ii[i - 1]:ii[i]] for i in range(1, len(ii))]
    del data

    indexes, values, selected_beads = [], [], []

    # the context manager guarantees the structure file is closed on every path
    with HssFile(cfg.get("optimization/structure_output"), 'r') as hss:
        index = hss.index

        for cluster in clusters:
            # number of distinct chromosomes involved in the current cluster
            n_chrom = len(np.unique(index.chrom[cluster]))

            # too many chromosomes: store placeholders full of -1 entries
            if n_chrom > max_chrom:
                selected_beads.append(
                    np.zeros((keep_best, len(cluster)), dtype='i4') - 1)
                indexes.append(np.array([-1] * keep_best))
                values.append(np.array([-1] * keep_best))
                continue

            # rg2s: one value per structure; current_selected_beads: one row
            # of bead indexes per structure
            rg2s, _, current_selected_beads = compute_gyration_radius(
                hss['coordinates'], cluster, index, index.copy_index)

            # Partition around position keep_best - 1, so that the first
            # keep_best entries are the keep_best smallest values. Using
            # keep_best itself as pivot requires one more structure than is
            # kept and fails when the population size equals keep_best.
            ind = np.argpartition(rg2s, keep_best - 1)[:keep_best]
            # sort only the kept entries, from smallest to largest
            ind = ind[np.argsort(rg2s[ind])]

            selected_beads.append(current_selected_beads[ind])
            indexes.append(ind)
            values.append(rg2s[ind])

    # save the results into a batch-dependent set of files
    sel_file = os.path.join(tmp_dir, 'tmp.%d.selected.npz' % batch_id)
    idx_file = os.path.join(tmp_dir, 'tmp.%d.idx.npy' % batch_id)
    val_file = os.path.join(tmp_dir, 'tmp.%d.values.npy' % batch_id)

    # the per-cluster arrays have different widths, hence one npz entry each
    np.savez(sel_file, *selected_beads)
    np.save(idx_file, np.array(indexes, dtype=np.int32))
    np.save(val_file, values)

    # verify the files can be opened again, sometimes weird io problems happen
    with np.load(sel_file):
        pass  # the npz handle is closed on exit
    np.load(idx_file)
    np.load(val_file)
def task(struct_id, cfg, tmp_dir):
    """Do single structure modeling with restraint assignment from the A-step.

    Builds a ``Model`` for structure ``struct_id``, adds the restraints
    selected by the configuration, optimizes it and writes the result together
    with violation statistics to ``<tmp_dir>/mstep_<struct_id>.hms``. An empty
    ``<step_hash>.<struct_id>.ready`` file is created last, to signal that the
    structure is complete.

    Nothing is done if the ready-file already exists, unless
    ``optimization/clean_restart`` is set.

    Raises
    ------
    NotImplementedError
        If the nucleus shape is not 'sphere', 'ellipsoid' or 'exp_map'.
    RuntimeError
        If the coordinates read back from the output file differ from the
        model coordinates.
    """
    # modifications to cfg must stay local to this task: work on a copy
    cfg = deepcopy(cfg)

    step_id = cfg.get('runtime/step_hash', 'xxxx')
    readyfile = os.path.join(tmp_dir, '%s.%d.ready' % (step_id, struct_id))

    # if the ready file exists it does nothing, unless it is a clean run
    if not cfg.get('optimization/clean_restart', False):
        if os.path.isfile(readyfile):
            return

    # read index, radii, coordinates
    hssfilename = cfg["optimization"]["structure_output"]
    with HssFile(hssfilename, 'r') as hss:
        index = hss.index
        radii = hss.radii
        if cfg.get('optimization/random_shuffling', False):
            crd = generate_random_in_sphere(
                radii, cfg.get('model/restraints/envelope/nucleus_radius'))
        else:
            crd = hss.get_struct_crd(struct_id)

    model = Model(uid=struct_id)

    # chain id of every bead: chromosome number repeated once per bead
    chain_ids = np.concatenate([[i] * s
                                for i, s in enumerate(index.chrom_sizes)])

    # add particles into model
    n_particles = len(crd)
    for i in range(n_particles):
        model.addParticle(crd[i],
                          radii[i],
                          Particle.NORMAL,
                          chainID=chain_ids[i])

    # restraints whose violations are reported in the output file
    monitored_restraints = []

    # ---- POLYMER STRUCTURAL INTEGRITY INTRINSIC restraints ----- #

    # add excluded volume restraint
    ex = Steric(cfg.get("model/restraints/excluded/evfactor"))
    model.addRestraint(ex)

    # add nucleus envelope restraint
    shape = cfg.get('model/restraints/envelope/nucleus_shape')
    envelope_k = cfg.get('model/restraints/envelope/nucleus_kspring')
    # defaults handed to the DamID restraint when they do not apply to the shape
    radius = 0
    semiaxes = (0, 0, 0)
    if shape == 'sphere':
        radius = cfg.get('model/restraints/envelope/nucleus_radius')
        ev = Envelope(shape, radius, envelope_k)
    elif shape == 'ellipsoid':
        semiaxes = cfg.get('model/restraints/envelope/nucleus_semiaxes')
        ev = Envelope(shape, semiaxes, envelope_k)
    elif shape == 'exp_map':
        volume_file = cfg.get('model/restraints/envelope/input_map')
        ev = GenEnvelope(shape, volume_file, envelope_k)
    else:
        # fail with a clear message instead of a NameError on `ev` below
        raise NotImplementedError(
            'shape %s has not been implemented yet.' % shape)
    model.addRestraint(ev)
    monitored_restraints.append(ev)

    # add consecutive bead polymer restraint to ensure chain connectivity
    if cfg.get('model/restraints/polymer/polymer_bonds_style') != 'none':
        contact_probabilities = cfg['runtime'].get(
            'consecutive_contact_probabilities', None)
        pp = Polymer(
            index,
            cfg['model']['restraints']['polymer']['contact_range'],
            cfg['model']['restraints']['polymer']['polymer_kspring'],
            contact_probabilities=contact_probabilities)
        model.addRestraint(pp)
        monitored_restraints.append(pp)

    # add nuclear body excluded volume restraints, one per input map
    if "nucleolus" in cfg['restraints']:
        for mappa in cfg['restraints']['nucleolus']['input_map']:
            nucl = GenEnvelope(cfg['restraints']['nucleolus']['shape'], mappa,
                               cfg['restraints']['nucleolus']['k_spring'])
            model.addRestraint(nucl)
            monitored_restraints.append(nucl)

    # ---- IGM MODELING RESTRAINTS FROM EXPERIMENTAL DATA ---- #

    # add Hi-C restraint
    if "Hi-C" in cfg['restraints']:
        actdist_file = cfg.get('runtime/Hi-C/actdist_file')
        contact_range = cfg.get('restraints/Hi-C/contact_range', 2.0)
        k = cfg.get('restraints/Hi-C/contact_kspring', 0.05)
        hic = HiC(actdist_file, contact_range, k)
        model.addRestraint(hic)
        monitored_restraints.append(hic)

    # add DamID restraint
    if "DamID" in cfg['restraints']:
        actdist_file = cfg.get('runtime/DamID/damid_actdist_file')
        contact_range = cfg.get('restraints/DamID/contact_range', 2.0)
        k = cfg.get('restraints/DamID/contact_kspring', 0.05)
        damid = Damid(damid_file=actdist_file,
                      contact_range=contact_range,
                      nuclear_radius=radius,
                      k=k,
                      shape=shape,
                      semiaxes=semiaxes)
        model.addRestraint(damid)
        monitored_restraints.append(damid)

    # add SPRITE restraint
    if "sprite" in cfg['restraints']:
        sprite_tmp = make_absolute_path(
            cfg.get('restraints/sprite/tmp_dir', 'sprite'),
            cfg.get('parameters/tmp_dir'))
        assignment_filename = make_absolute_path(
            cfg.get('restraints/sprite/assignment_file', 'assignment.h5'),
            sprite_tmp)
        sprite = Sprite(assignment_filename,
                        cfg.get('restraints/sprite/volume_fraction', 0.05),
                        struct_id, cfg.get('restraints/sprite/kspring', 1.0))
        model.addRestraint(sprite)
        monitored_restraints.append(sprite)

    # ======== Optimization
    cfg['runtime']['run_name'] = cfg.get('runtime/step_hash') + '_' + str(
        struct_id)
    optinfo = model.optimize(cfg)

    # a restraint counts as violated when its violation ratio exceeds `tol`;
    # the same tolerance is used for every kind of restraint
    tol = cfg.get('optimization/violation_tolerance', 0.01)

    # save optimization results to .hms file
    ofname = os.path.join(tmp_dir, 'mstep_%d.hms' % struct_id)
    with HmsFile(ofname, 'w') as hms:
        hms.saveModel(struct_id, model)

        # violation statistics, one entry per monitored restraint
        vstat = {}
        for r in monitored_restraints:
            vs = []
            n_imposed = 0
            for fid in r.forceID:
                f = model.forces[fid]
                n_imposed += f.rnum
                if f.rnum > 1:
                    # forces holding several restraints report an array
                    vs += f.getViolationRatios(model.particles).tolist()
                else:
                    vs.append(f.getViolationRatio(model.particles))
            vs = np.array(vs)
            H, edges = get_violation_histogram(vs)
            num_violations = np.count_nonzero(vs > tol)
            vstat[repr(r)] = {
                'histogram': {
                    'edges': edges.tolist(),
                    'counts': H.tolist()
                },
                'n_violations': num_violations,
                'n_imposed': n_imposed
            }

        # add violation dictionary to hms file
        h5_create_or_replace_dataset(hms, 'violation_stats',
                                     json.dumps(vstat))

        if isinstance(optinfo, dict):
            # non-dict entries become datasets; the whole dict is also kept as JSON
            grp = h5_create_group_if_not_exist(hms, 'opt_info')
            for k, v in optinfo.items():
                if not isinstance(v, dict):
                    h5_create_or_replace_dataset(grp, k, data=v)
            h5_create_or_replace_dataset(hms,
                                         'opt_info_dict',
                                         data=json.dumps(optinfo))

    # double check it has been written correctly
    with HmsFile(ofname, 'r') as hms:
        if not np.all(hms.get_coordinates() == model.getCoordinates()):
            raise RuntimeError('error writing the file %s' % ofname)

    # touch the ready-file, which signals to the poller that optimization for
    # this structure has been completed
    open(readyfile, 'w').close()
def task(struct_id, cfg, tmp_dir):
    """Relax one structure using only polymer, envelope and nuclear body restraints.

    The optimized model and the polymer restraint violations are saved to
    ``<tmp_dir>/relax_<struct_id>.hms``; an empty
    ``relax_<struct_id>.hms.ready`` file is created last, to signal
    completion.

    Nothing is done if the ready-file already exists, unless
    ``optimization/clean_restart`` is set.

    Raises
    ------
    NotImplementedError
        If the nucleus shape is not 'sphere', 'ellipsoid' or 'exp_map'.
    RuntimeError
        If the coordinates read back from the output file differ from the
        model coordinates.
    """
    # modifications to cfg must stay local to this task: work on a copy
    cfg = deepcopy(cfg)

    ofname = os.path.join(tmp_dir, 'relax_%d.hms' % struct_id)
    readyfile = ofname + '.ready'

    # if the ready file exists it does nothing, unless it is a clean run
    if not cfg.get('optimization/clean_restart', False):
        if os.path.isfile(readyfile):
            return

    # read index, radii, coordinates
    hssfilename = cfg["optimization"]["structure_output"]
    with HssFile(hssfilename, 'r') as hss:
        index = hss.index
        radii = hss.radii
        crd = hss.get_struct_crd(struct_id)

    model = Model(uid=struct_id)

    # add particles into model
    n_particles = len(crd)
    for i in range(n_particles):
        model.addParticle(crd[i], radii[i], Particle.NORMAL)

    # ======== Add polymer/nucleoli restraints

    # add excluded volume restraint
    ex = Steric(cfg.get("model/restraints/excluded/evfactor"))
    model.addRestraint(ex)

    # add nucleus envelope restraint (spherical, ellipsoidal OR from data)
    envelope_cfg = cfg['model']['restraints']['envelope']
    shape = envelope_cfg['nucleus_shape']
    if shape == 'sphere':
        ev = Envelope(shape, envelope_cfg['nucleus_radius'],
                      envelope_cfg['nucleus_kspring'])
    elif shape == 'ellipsoid':
        ev = Envelope(shape, envelope_cfg['nucleus_semiaxes'],
                      envelope_cfg['nucleus_kspring'])
    elif shape == 'exp_map':
        ev = GenEnvelope(shape, envelope_cfg['input_map'],
                         envelope_cfg['nucleus_kspring'])
    else:
        # fail with a clear message instead of a NameError on `ev` below
        raise NotImplementedError(
            'shape %s has not been implemented yet.' % shape)
    model.addRestraint(ev)

    # add consecutive polymer restraint
    contact_probabilities = cfg['runtime'].get(
        'consecutive_contact_probabilities', None)
    pp = Polymer(index,
                 cfg['model']['restraints']['polymer']['contact_range'],
                 cfg['model']['restraints']['polymer']['polymer_kspring'],
                 contact_probabilities=contact_probabilities)
    model.addRestraint(pp)

    # add nuclear body "excluded volume" restraints, one per input map
    if 'nucleolus' in cfg['restraints']:
        for mappa in cfg['restraints']['nucleolus']['input_map']:
            nucl = GenEnvelope(cfg['restraints']['nucleolus']['shape'], mappa,
                               cfg['restraints']['nucleolus']['k_spring'])
            model.addRestraint(nucl)
            logger.info(nucl)

    # ======== Optimization
    cfg['runtime']['run_name'] = cfg['runtime']['step_hash'] + '_' + str(
        struct_id)
    model.optimize(cfg)

    # save optimized coordinates and polymer violations into the .hms file
    with HmsFile(ofname, 'w') as hms:
        hms.saveModel(struct_id, model)
        hms.saveViolations(pp)

    # make sure write was successful
    with HmsFile(ofname, 'r') as hms:
        if not np.all(hms.get_coordinates() == model.getCoordinates()):
            raise RuntimeError('error writing the file %s' % ofname)

    # touch the ready-file, which signals to the poller that optimization
    # went to completion
    open(readyfile, 'w').close()
def teardown_poller(self):
    """Reopen the HSS file and overwrite all coordinates with ``self._hss_crd``."""
    # the context manager closes the file even if the write fails
    with HssFile(self.hssfilename, 'r+') as hss:
        hss.set_coordinates(self._hss_crd)
def modeling_task(struct_id, cfg_file):
    '''
    Optimize a single structure; serial function to be mapped in parallel.

    Intended to be used only internally by the parallel map function, as a
    partial with ``cfg_file`` already bound. It resolves the output file
    templates, reads the input data, runs the minimization and writes the
    results to disk.

    Parameters
    ----------
    struct_id : int
        number of the structure
    cfg_file : str
        configuration filename for the task

    Returns
    -------
    None
    '''

    def render(value):
        # fixed-width formatting for numbers, plain str() for anything else
        if isinstance(value, float):
            return '{:9.2f}'.format(value)
        if isinstance(value, int):
            return '{:7d}'.format(value)
        return str(value)

    # the configuration is loaded here so that it happens on the parallel workers
    cfg = Config(cfg_file)
    outputs = resolve_templates(cfg['mstep']['templates'], [struct_id])

    model = Model()
    with HssFile(cfg['mstep']['input_hss'], 'r') as population:
        radii = population.radii
        index = population.index
        crd = population['coordinates'][:, struct_id, :][()]

    for bead, position in enumerate(crd):
        model.addParticle(position, radii[bead], Particle.NORMAL)

    model_cfg = cfg['model']
    envelope = Envelope(model_cfg['nucleus_geometry'])
    model.addRestraint(envelope)
    steric = Steric(model_cfg['evfactor'])
    model.addRestraint(steric)
    polymer = Polymer(index, model_cfg['contact_range'],
                      model_cfg['contact_kspring'])
    model.addRestraint(polymer)

    kernel = kernel_class[cfg['mstep']['kernel']]
    info = kernel.optimize(model, cfg['optimization'])

    new_crd = np.array([p.pos for p in model.particles], dtype=COORD_DTYPE)
    np.save(outputs['crd_out'], new_crd)
    # make sure that is readable
    np.load(outputs['crd_out'])

    # one tab-terminated field per info key, all on a single line
    with open(outputs['info_out'], 'w') as info_file:
        for key in kernel.INFO_KEYS:
            info_file.write(render(info[key]) + '\t')
def skip(self):
    """Restore the violation score from the intermediate HSS file, if present.

    When ``intermediate_name() + '.hss'`` exists, the value returned by
    ``log_stats`` for it is stored in ``cfg['runtime']['violation_score']``;
    otherwise nothing happens.
    """
    intermediate_file = self.intermediate_name() + '.hss'
    if not os.path.isfile(intermediate_file):
        return

    with HssFile(intermediate_file, 'r') as hss:
        self.cfg['runtime']['violation_score'] = log_stats(hss, self.cfg)
def reduce(self):
    """Compare the contact map of the population with the input Hi-C matrix.

    Builds the contact map of ``optimization/structure_output``, saves it
    (``full_matrix.hcs``, and ``out_matrix.hcs`` after summing copies and
    clipping to [0, 1]), writes comparison plots, log2-ratio maps and
    difference histograms to ``self.out_dir`` and stores the mean absolute
    relative difference in ``self.score``. Only off-diagonal input entries
    with probability >= ``runtime/Hi-C/sigma`` enter the differences.
    ``stats.txt`` receives the score and the average absolute and relative
    differences.
    """
    out_dir = self.out_dir
    sigma = self.cfg.get('runtime/Hi-C/sigma')

    input_matrix = Contactmatrix(
        self.cfg.get('restraints/Hi-C/input_matrix'))

    with HssFile(self.cfg.get(
            'optimization/structure_output')) as structure_output:
        # give some tolerance. only in one direction though.
        output_matrix = structure_output.buildContactMap(
            contactRange=self.cfg.get('restraints/Hi-C/contact_range') *
            (1 + eps))

    output_matrix.save(os.path.join(out_dir, 'full_matrix.hcs'))
    output_matrix = output_matrix.sumCopies()
    output_matrix.matrix.data[:] = output_matrix.matrix.data.clip(0, 1)
    output_matrix.save(os.path.join(out_dir, 'out_matrix.hcs'))

    # side by side comparison, whole genome and per chromosome
    plot_comparison(input_matrix,
                    output_matrix,
                    labels=['input', 'output'],
                    file=os.path.join(out_dir, 'matrix_comparison.pdf'),
                    vmax=0.2)
    for c in input_matrix.index.get_chrom_names():
        plot_comparison(input_matrix[c],
                        output_matrix[c],
                        labels=['input', 'output'],
                        file=os.path.join(out_dir,
                                          'matrix_comparison_%s.pdf' % c),
                        title=c,
                        vmax=0.2)

    # log2 ratio map; divisions by zero give non-finite entries, which are
    # excluded from the color scale
    with np.errstate(divide='ignore', invalid='ignore'):
        diffmat = np.log2(output_matrix.matrix.toarray() /
                          input_matrix.matrix.toarray())
    maxv = np.percentile(np.abs(diffmat[np.isfinite(diffmat)]), 99)

    plt.figure()
    plt.imshow(diffmat, vmax=maxv, vmin=-maxv, cmap='RdBu_r')
    plt.title('difference_matrix')
    plt.colorbar()
    plt.savefig(os.path.join(out_dir, 'diffmat.pdf'))
    plt.close()

    for c in input_matrix.genome.chroms:
        ii = input_matrix.index.chrom == input_matrix.genome.getchrnum(c)
        plt.figure()
        xmat = diffmat[ii][:, ii]
        plt.imshow(xmat, vmax=maxv, vmin=-maxv, cmap='RdBu_r')
        plt.colorbar()
        plt.savefig(os.path.join(out_dir, 'diffmap_' + c + '.pdf'))
        plt.close()

    # from here on only the input entries above sigma are needed
    input_matrix = {(i, j): pwish
                    for i, j, pwish in input_matrix.matrix.coo_generator()
                    if pwish >= sigma and i != j}

    diffs = []
    reldiffs = []
    for i, j, pout in output_matrix.matrix.coo_generator():
        p = input_matrix.get((i, j))
        if p is not None:
            diffs.append(pout - p)
            reldiffs.append((pout - p) / p)

    # release the matrices before building the histograms
    del output_matrix
    del input_matrix

    diffs = np.array(diffs)
    reldiffs = np.array(reldiffs)

    fig, ax = plt.subplots(2, 2)
    ax[0, 0].set_title('Absolute matrix differences')
    ax[0, 0].hist(diffs, bins=100, range=(-1, 1))
    ax[0, 1].set_title('Relative matrix differences')
    ax[0, 1].hist(reldiffs, bins=100, range=(-1, 1))
    ax[1, 0].set_title('Absolute matrix differences (log)')
    ax[1, 0].hist(diffs, bins=100, log=True, range=(-1, 1))
    ax[1, 1].set_title('Relative matrix differences (log)')
    ax[1, 1].hist(reldiffs, bins=100, log=True, range=(-1, 1))
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, 'difference_histograms.pdf'))
    # close this figure like the others, so it does not stay registered in pyplot
    plt.close(fig)

    self.score = np.abs(reldiffs).mean()

    with open(os.path.join(out_dir, 'stats.txt'), 'w') as stats_file:
        print("#score ave_differences ave_relative_differences",
              file=stats_file)
        print(self.score,
              np.average(diffs),
              np.average(reldiffs),
              file=stats_file)

    logger.info('>>> Average relative difference: {:6.3f}% <<<'.format(
        self.score * 100))
def setup_poller(self):
    """Load all coordinates of the HSS population file into ``self._hss_crd``."""
    # the context manager closes the file even if reading the coordinates fails
    with HssFile(self.hssfilename, 'r') as hss:
        self._hss_crd = hss.coordinates
def debug_minimization(cfg, struct_id, rname, **kwargs):
    """Re-run the minimization of a single structure for debugging purposes.

    Only the excluded volume, envelope and polymer restraints are added; Hi-C
    and SPRITE restraints are not. Temporary optimizer files are kept. The
    result is written to ``./mstep_<struct_id>.hms`` and a
    ``./<rname>.<struct_id>.ready`` file is touched.

    Parameters
    ----------
    cfg : dict or anything accepted by ``Config``
        Configuration; non-dict values are passed to ``Config``.
    struct_id : int
        Index of the structure to minimize.
    rname : str
        Run name, also used as step identifier in the ready-file name.
    **kwargs
        Merged into ``cfg['optimization']['optimizer_options']``.

    Raises
    ------
    NotImplementedError
        If the nucleus shape is neither 'sphere' nor 'ellipsoid'.
    RuntimeError
        If the coordinates read back from the output file differ from the
        model coordinates.
    """
    if not isinstance(cfg, dict):
        cfg = Config(cfg)

    # NOTE(review): the nesting of this block could not be recovered from the
    # original formatting; it is assumed to apply to every kind of input.
    # Update the configuration from the last entry of the step history.
    if os.path.isfile(cfg['step_db']):
        db = StepDB(cfg)
        h = db.get_history()
        cfg.update(h[-1])

    cfg['optimization']['optimizer_options'].update(kwargs)
    cfg['optimization']['keep_temporary_files'] = True

    step_id = rname

    # read index, radii, coordinates
    hssfilename = cfg['structure_output']
    with HssFile(hssfilename, 'r') as hss:
        index = hss.index
        radii = hss.radii
        if cfg.get('random_shuffling', False):
            crd = generate_random_in_sphere(radii,
                                            cfg['model']['nucleus_radius'])
        else:
            crd = hss.get_struct_crd(struct_id)

    model = Model(uid=struct_id)

    # chain id of every bead: chromosome number repeated once per bead
    chain_ids = np.concatenate([[i] * s
                                for i, s in enumerate(index.chrom_sizes)])

    # add particles into model
    n_particles = len(crd)
    for i in range(n_particles):
        model.addParticle(crd[i],
                          radii[i],
                          Particle.NORMAL,
                          chainID=chain_ids[i])

    # ======== Add restraints
    monitored_restraints = []

    # add excluded volume restraint
    ex = Steric(cfg['model']['evfactor'])
    model.addRestraint(ex)

    # add nucleus envelope restraint
    shape = cfg['model']['nucleus_shape']
    if shape == 'sphere':
        ev = Envelope(shape, cfg['model']['nucleus_radius'],
                      cfg['model']['contact_kspring'])
    elif shape == 'ellipsoid':
        ev = Envelope(shape, cfg['model']['nucleus_semiaxes'],
                      cfg['model']['contact_kspring'])
    else:
        raise NotImplementedError('Invalid nucleus shape')
    model.addRestraint(ev)

    # add consecutive polymer restraint
    pp = Polymer(index, cfg['model']['contact_range'],
                 cfg['model']['contact_kspring'])
    model.addRestraint(pp)
    monitored_restraints.append(pp)

    # ======== Optimization
    cfg['runtime']['run_name'] = rname
    model.optimize(cfg)

    tol = cfg.get('violation_tolerance', 0.01)

    lockfile = os.path.join('.', '%s.%d.ready' % (step_id, struct_id))
    with FileLock(lockfile):
        open(lockfile, 'w').close()  # touch the ready-file

    ofname = os.path.join('.', 'mstep_%d.hms' % struct_id)
    with HmsFile(ofname, 'w') as hms:
        hms.saveModel(struct_id, model)
        for r in monitored_restraints:
            hms.saveViolations(r, tolerance=tol)

    # double check it has been written correctly: the error must be raised
    # when the stored coordinates do NOT match the model
    with HmsFile(ofname, 'r') as hms:
        if not np.all(hms.get_coordinates() == model.getCoordinates()):
            raise RuntimeError('error writing the file %s' % ofname)