def load_samples_from_cfg(config_file, burnin=35000):
    """Loads results of a simulation using a config file

    This returns posterior samples from a simulation given a config file.

    :param config_file: path to config file
    :type config_file: str

    :param burnin: number of MCMC samples to be discarded as burnin
    :type burnin: int

    :returns: posterior samples
    :rtype: :class:`numpy.ndarray`
    """
    from .setup_functions import parse_config_file

    cfg = parse_config_file(config_file)
    # NOTE: output_folder is assumed to end with a slash, as in the other
    # scripts in this file.
    output_folder = cfg['general']['output_folder']
    n_samples = int(cfg['replica']['n_samples'])

    # Dropped the unused n_beads / n_structures lookups present in an
    # earlier version; only the replica settings are needed here.
    samples = load_sr_samples(output_folder + 'samples/',
                              int(cfg['replica']['n_replicas']),
                              n_samples,
                              int(cfg['replica']['samples_dump_interval']),
                              burnin)

    return samples
def calculate_data(config_file, fish_data_file):
    """Compare simulated probe-probe distances with FISH distance data.

    Reads FISH distance measurements from an Excel workbook, loads posterior
    structure samples for the simulation described by ``config_file``, and
    returns per-probe-pair distance distributions from both sources.

    NOTE(review): this function reads the module-level globals ``probes`` and
    ``combinations`` (not defined in this block) — verify they are set before
    calling.

    :param config_file: path to simulation config file
    :type config_file: str
    :param fish_data_file: path to Excel (.xls) file with FISH distances
    :type fish_data_file: str
    :returns: (simulated distance distributions, FISH distance histograms)
    """
    from xlrd import open_workbook
    from ensemble_hic.setup_functions import parse_config_file

    wb = open_workbook(fish_data_file)
    sheet = wb.sheets()[0]
    # Rows 2, 3 and 7..end hold the measurements; columns 1-12 the values.
    # ``range(...)`` concatenation implies Python 2.
    table = np.array([
        np.array(sheet.row_values(j))[1:13]
        for j in [2, 3] + range(7, sheet.nrows)
    ])
    # Keyed by "probe1:probe2"; empty cells are skipped before float conversion.
    data = {
        '{}:{}'.format(x[0], x[1]): np.array([float(y) for y in x[2:]
                                              if len(y) > 0])
        for x in table.T
    }

    # Genomic-to-bead mapping constants (bead size in bp, region start in bp).
    bead_size = 3000
    region_start = 100378306
    n_beads = 308

    settings = parse_config_file(config_file)
    output_folder = settings['general']['output_folder']
    n_structures = int(settings['general']['n_structures'])
    samples = load_sr_samples(output_folder + 'samples/',
                              int(settings['replica']['n_replicas']),
                              50001, 1000, 30000)
    X = np.array([
        s.variables['structures'].reshape(n_structures, -1, 3)
        for s in samples
    ])
    # Flatten (sample, structure) axes; factor 53 rescales coordinates
    # (same factor as in the other analysis scripts — presumably a unit
    # conversion; TODO confirm).
    Xflat = X.reshape(-1, n_beads, 3) * 53

    # Map a probe (expects genomic start/end in p[1:3]) to its bead index.
    get_bead = lambda p: int((np.mean(p[1:3]) - region_start) / bead_size)

    mapping = (data['pEN2:pLG1'], data['pEN2:X4'], data['pEN2:X3'],
               data['X4:X3'], data['pLG1:pEN2'], data['Dxpas34:pEN1'],
               data['pEN2:pLG11'])
    # Euclidean distances between the two probes' beads, over all structures.
    isd_distance_dists = [
        np.linalg.norm(Xflat[:, get_bead(probes[l1])] -
                       Xflat[:, get_bead(probes[l2])],
                       axis=1)
        for (l1, l2) in combinations
    ]
    # NOTE(review): for i == 0 this picks mapping[-1] (the last entry);
    # possibly an off-by-one — confirm intended pairing with ``combinations``.
    fish_distance_hists = [mapping[i - 1] for i in range(len(combinations))]

    return isd_distance_dists, fish_distance_hists
def load_samples_from_cfg_auto(config_file, burnin=35000):
    """Loads results of a simulation using a config file

    This returns posterior samples from a simulation given a config file
    and automatically determines the number of actually drawn samples,
    i.e., it ignores the n_samples setting in the config file.

    :param config_file: path to config file
    :type config_file: str

    :param burnin: number of MCMC samples to be discarded as burnin
    :type burnin: int

    :returns: posterior samples
    :rtype: :class:`numpy.ndarray`
    """
    import os
    from .setup_functions import parse_config_file

    cfg = parse_config_file(config_file)
    output_folder = cfg['general']['output_folder']
    n_replicas = int(cfg['replica']['n_replicas'])
    dump_interval = int(cfg['replica']['samples_dump_interval'])

    # Samples are dumped in chunks of dump_interval; probe consecutive dump
    # files of the target (highest-index) replica until one is missing to
    # find how many samples were actually drawn.
    # (Dropped the unused n_structures lookup present in an earlier version.)
    n_drawn_samples = 0
    fname = output_folder + 'samples/samples_replica' \
            + str(n_replicas) + '_{}-{}.pickle'
    while os.path.exists(fname.format(n_drawn_samples,
                                      n_drawn_samples + dump_interval)):
        n_drawn_samples += dump_interval

    samples = load_sr_samples(output_folder + 'samples/', n_replicas,
                              n_drawn_samples + 1, dump_interval,
                              burnin)

    return samples
def load_samples(samples_folder, n_replicas, n_samples, dump_interval,
                 burnin, interval=1):
    """Loads full results of a Replica Exchange simulation.

    Collects, for every replica, the samples returned by
    :func:`load_sr_samples` and stacks them into a single array.

    :param samples_folder: directory in which samples are stored
    :type samples_folder: str ending with a slash ("/")
    :param n_replicas: number of replicas
    :type n_replicas: int
    :param n_samples: number of samples
    :type n_samples: int
    :param dump_interval: number of MCMC steps after which samples are written
    :type dump_interval: int
    :param burnin: number of MCMC samples discarded as burnin
    :type burnin: int (multiple of dump_interval)
    :param interval: return only every interval-th sample
    :type interval: int
    :returns: two-dimensional array of MCMC samples; first axis indexes
              replicas, second axis the samples of that replica
    :rtype: :class:`numpy.ndarray`
    """
    # Replica indices are 1-based in the dump file names.
    per_replica = [load_sr_samples(samples_folder, replica, n_samples,
                                   dump_interval, burnin, interval)
                   for replica in xrange(1, n_replicas + 1)]

    return np.array(per_replica)
import sys

from cPickle import dump

from ensemble_hic.setup_functions import parse_config_file
from ensemble_hic.analysis_functions import load_sr_samples

# Locate the simulation output via the user-supplied config file.
settings = parse_config_file(sys.argv[1])
output_folder = settings['general']['output_folder']
# just to be sure that we ship the settings used for the
# simulation which yielded the samples
settings = parse_config_file(output_folder + 'config.cfg')

max_samples = 50001
burnin = 30000
n_replicas = int(settings['replica']['n_replicas'])
samples = load_sr_samples(output_folder + 'samples/', n_replicas,
                          max_samples, 1000, burnin)

# Pickle the (post-burnin) samples to the output path given on the
# command line.
with open(sys.argv[2], "w") as opf:
    dump(samples, opf)
if __name__ == "__main__":
    import sys
    from cPickle import dump
    from ensemble_hic.setup_functions import parse_config_file

    # Two simulations ("before" / "after") are processed identically.
    before_cfg_file, after_cfg_file = sys.argv[1], sys.argv[2]
    before_out_file, after_out_file = sys.argv[3], sys.argv[4]
    # Coordinate scaling factor used throughout these analysis scripts
    # (presumably a unit conversion — TODO confirm).
    scale_factor = 53

    cfg_files = (before_cfg_file, after_cfg_file)
    out_files = (before_out_file, after_out_file)
    for cfg_file, out_file in zip(cfg_files, out_files):
        settings = parse_config_file(cfg_file)
        n_replicas = int(settings['replica']['n_replicas'])
        n_structures = int(settings['general']['n_structures'])
        samples_folder = settings['general']['output_folder'] + 'samples/'
        samples = load_sr_samples(samples_folder, n_replicas,
                                  45001, 1000, 25000)
        # Stack all structure ensembles (308 beads each) and rescale.
        ensembles = [x.variables['structures'].reshape(n_structures, 308, 3)
                     for x in samples]
        X = np.array(ensembles) * scale_factor
        # Radii of gyration are pickled to the matching output file.
        rgs = calculate_rgs(X)
        with open(out_file, "w") as opf:
            dump(rgs, opf)
# NOTE(review): ``ax``, ``handles`` and ``labels`` come from plotting code
# above this chunk — not visible here.
ax.legend(handles, labels, frameon=False, title='linear distance [beads]:')

if __name__ == "__main__":
    import sys
    from cPickle import dump
    from ensemble_hic.setup_functions import parse_config_file, make_posterior
    from ensemble_hic.analysis_functions import load_sr_samples

    cfg_file = sys.argv[1]
    out_file = sys.argv[2]
    settings = parse_config_file(cfg_file)
    samples = load_sr_samples(
        settings['general']['output_folder'] + 'samples/',
        int(settings['replica']['n_replicas']), 50001, 1000, 30000)
    p = make_posterior(settings)
    fwm = p.likelihoods['ensemble_contacts'].forward_model
    # Negative log-posterior per sample; the minimum is the MAP sample.
    energies = np.array(map(lambda x: -p.log_prob(**x.variables), samples))
    map_sample = samples[np.argmin(energies)]
    n_structures = fwm.n_structures
    # Indices of data points with bead separation exactly 2 and 3.
    sels = [
        np.where(fwm.data_points[:, 1] - fwm.data_points[:, 0] == sep)[0]
        for sep in (2, 3)
    ]
    # All remaining data-point indices (complement of the selections above).
    antisel = np.array([
        i for i in np.arange(len(fwm.data_points))
        if not i in [x for y in sels for x in y]
    ])
    # Third column of the data points (count values) for the complement set.
    all_but_sel_x = fwm.data_points[antisel, 2]
# NOTE(review): ``data_file``, ``data_dir``, ``config``, ``n_replicas``,
# ``n_samples``, ``burnin``, ``StructureParser``, ``rmsd`` are defined above
# this chunk — not visible here.
# data_file is expected to look like "XXXX_YYYY..." with two 4-character
# PDB codes; the second may be the literal "none".
fnames = [data_file[:4], data_file[5:9]]
# Reference (known) structures: C-alpha coordinates from PDB files.
knowns = [
    StructureParser(data_dir + fnames[0] + '.pdb').parse().get_coordinates(
        ['CA'])
]
if fnames[1] != 'none':
    knowns.append(
        StructureParser(data_dir + fnames[1] + '.pdb').parse().get_coordinates(
            ['CA']))
# Divide by 3.8 — presumably the C-alpha/C-alpha distance in Angstrom used
# as the bead unit; TODO confirm.
knowns = np.array(knowns) / 3.8
output_folder = config['general']['output_folder']
samples = load_sr_samples(output_folder + 'samples/', n_replicas, n_samples,
                          int(config['replica']['samples_dump_interval']),
                          burnin=burnin)
# One entry per sample, holding that sample's structure ensemble.
ens = np.array([
    sample.variables['structures'].reshape(-1, len(knowns[0]), 3)
    for sample in samples
])
# Flatten (sample, structure) axes into one axis of structures.
ens_flat = ens.reshape(ens.shape[0] * ens.shape[1], -1, 3)
figures_folder = output_folder + 'analysis/compare_to_known/'
if not os.path.exists(figures_folder):
    os.makedirs(figures_folder)
if True:
    ## plot histograms of RMSDs to known structures
    rmsds = [map(lambda x: rmsd(known, x), ens_flat) for known in knowns]
def calculate_DOS(config_file, n_samples, subsamples_fraction, burnin,
                  n_iter=100000, tol=1e-10, save_output=True,
                  output_suffix=''):
    """Calculates the density of states (DOS) using non-parametric histogram
    reweighting (WHAM).

    :param config_file: Configuration file
    :type config_file: str

    :param n_samples: number of samples the simulation ran
    :type n_samples: int

    :param subsamples_fraction: fraction of samples (after burnin) to be
                                analyzed. Set this to, e.g., 10 to use one
                                tenth of n_samples to decrease computation time
    :type subsamples_fraction: int

    :param burnin: number of samples to be thrown away as part of the
                   burn-in period
    :type burnin: int

    :param n_iter: number of WHAM iterations
    :type n_iter: int

    :param tol: threshold up to which the negative log-likelihood being
                minimized in WHAM can change before iteration stops
    :type tol: float

    :param save_output: save resulting DOS object, parameters used during
                        calculation and indices of randomly chosen samples
                        in simulation output folder
    :type save_output: bool

    :returns: DOS object
    :rtype: DOS
    """
    from ensemble_hic.wham import PyWHAM as WHAM, DOS
    from ensemble_hic.setup_functions import parse_config_file, make_posterior
    from ensemble_hic.analysis_functions import load_sr_samples

    settings = parse_config_file(config_file)
    n_replicas = int(settings['replica']['n_replicas'])
    target_replica = n_replicas
    # Keep the original call arguments; they are pickled below when
    # save_output is set.
    params = {'n_samples': n_samples,
              'burnin': burnin,
              'subsamples_fraction': subsamples_fraction,
              'niter': n_iter,
              'tol': tol}
    # Never read more samples than the simulation actually produced.
    n_samples = min(params['n_samples'], int(settings['replica']['n_samples']))
    dump_interval = int(settings['replica']['samples_dump_interval'])

    output_folder = settings['general']['output_folder']
    if output_folder[-1] != '/':
        output_folder += '/'
    n_beads = int(settings['general']['n_beads'])
    n_structures = int(settings['general']['n_structures'])
    # Replica schedule; may contain 'lammda' (sic) and/or 'beta' entries.
    schedule = np.load(output_folder + 'schedule.pickle')

    posterior = make_posterior(settings)
    p = posterior
    variables = p.variables

    energies = []
    L = p.likelihoods['ensemble_contacts']
    data = L.forward_model.data_points
    P = p.priors['nonbonded_prior']
    sels = []
    for i in range(n_replicas):
        samples = load_sr_samples(output_folder + 'samples/', i + 1,
                                  n_samples + 1,
                                  dump_interval,
                                  burnin=params['burnin'])
        # Random subsample (without replacement) to cut computation time;
        # indices are recorded in sels and optionally pickled below.
        sel = np.random.choice(len(samples),
                               int(len(samples) / float(subsamples_fraction)),
                               replace=False)
        samples = samples[sel]
        sels.append(sel)
        # Per sample: [likelihood energy, prior energy]; a term is zero when
        # the schedule does not temper that component.
        energies.append([[-L.log_prob(**x.variables)
                          if 'lammda' in schedule else 0,
                          -P.log_prob(structures=x.variables['structures'])
                          if 'beta' in schedule else 0]
                         for x in samples])
        # NOTE(review): prints i, not i + 1 — shows "0/n" for the first
        # replica; cosmetic off-by-one.
        print "Calculated energies for {}/{} replicas...".format(i, n_replicas)
    energies = np.array(energies)
    energies_flat = energies.reshape(np.prod(energies.shape[:2]), 2)
    # One column per replica: its (lammda, beta) interpolation parameters.
    sched = np.array([schedule['lammda'], schedule['beta']])
    # q[r, s]: energy of flattened sample s evaluated at replica r's
    # schedule parameters.
    q = np.array([[(energy * replica_params).sum()
                   for energy in energies_flat]
                  for replica_params in sched.T])
    wham = WHAM(len(energies_flat), n_replicas)
    # Assumes every replica contributed the same number of subsamples.
    wham.N[:] = len(energies_flat) / n_replicas
    wham.run(q, niter=params['niter'], tol=params['tol'], verbose=100)

    dos = DOS(energies_flat, wham.s, sort_energies=False)

    if save_output:
        import os
        import sys
        from cPickle import dump

        ana_path = output_folder + 'analysis/'
        if not os.path.exists(ana_path):
            os.makedirs(ana_path)
        with open(ana_path + 'dos{}.pickle'.format(output_suffix),
                  'w') as opf:
            dump(dos, opf)
        with open(ana_path + 'wham_params{}.pickle'.format(output_suffix),
                  'w') as opf:
            dump(params, opf)
        with open(ana_path + 'wham_sels{}.pickle'.format(output_suffix),
                  'w') as opf:
            dump(np.array(sels), opf)

    return dos
# NOTE(review): ``parser``, ``data_dir``, ``what``, ``settings``, ``burnin``,
# ``n_structures``, ``cdistance``, ``EnsembleContactsFWM`` come from code
# above this chunk — not visible here.
true1 = parser.parse().get_coordinates(['CA'])
parser = StructureParser(data_dir + what + '/1shf.pdb')
true2 = parser.parse().get_coordinates(['CA'])
# NOTE(review): the PDB-code labels are immediately overwritten by the
# domain-name labels below — the first two assignments are dead.
label1 = '1PGA'
label2 = '1SHF'
label1 = 'GB1 domain'
label2 = 'SH3 domain'
n_beads = len(true1)
samples_folder = settings['general']['output_folder'] + 'samples/'
n_replicas = int(settings['replica']['n_replicas'])
n_samples = int(settings['replica']['n_samples'])
dump_interval = int(settings['replica']['samples_dump_interval'])
samples = load_sr_samples(samples_folder, n_replicas, n_samples,
                          dump_interval, burnin)
structures = np.array([
    sample.variables['structures'].reshape(n_structures, -1, 3)
    for sample in samples
])
# Flatten (sample, structure) axes into one axis of conformations.
structures = structures.reshape(len(samples) * n_structures, -1, 3)
# All bead pairs (i < j) with a dummy count of 0; used only to drive the
# forward model over every pair.
fake_data_points = np.array([[i, j, 0] for i in range(n_beads)
                             for j in range(i + 1, n_beads)])
ana_fwm = EnsembleContactsFWM('petitprince', 1,
                              np.ones(len(fake_data_points)) * cdistance,
                              fake_data_points)
if True:
    # Boolean contact maps (condensed form) of the two reference structures.
    true1_contacts = pdist(true1) < cdistance
    true2_contacts = pdist(true2) < cdistance
    normalization1 = float(n_beads * (n_beads - 1) / 2.0)
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from csb.bio.utils import rmsd, radius_of_gyration as rog
from ensemble_hic.analysis_functions import load_sr_samples

n_beads = 308

# Full model (with inter-TAD contacts); hard-coded cluster path.
sim_path = '/scratch/scarste/ensemble_hic/nora2012/bothdomains_fixed_it3_rep3_20structures_309replicas/'
s = load_sr_samples(sim_path + 'samples/', 309, 50001, 1000, 30000)
# 20 structures x 308 beads; factor 53 rescales coordinates (same factor as
# in the other scripts — presumably a unit conversion; TODO confirm).
X = np.array([x.variables['structures'].reshape(20, 308, 3)
              for x in s]) * 53

# Control simulation without inter-TAD contacts ("nointer").
sim_path = '/scratch/scarste/ensemble_hic/nora2012/bothdomains_nointer_it3_rep3_20structures_309replicas/'
s = load_sr_samples(sim_path + 'samples/', 309, 50001, 1000, 30000)
X_nointer = np.array([x.variables['structures'].reshape(20, 308, 3)
                      for x in s]) * 53

# Genomic start coordinate of the modeled region (bp).
pos_start = 100378306

# Disabled plotting branch; NOTE(review): references t1flat / t2flat / axes,
# which are not defined in this chunk — would fail if re-enabled as-is.
if False:
    ## gyration radius histograms
    rogs_t1 = np.array(map(rog, t1flat))
    rogs_t2 = np.array(map(rog, t2flat))
    axes[0,0].hist(rogs_t1, bins=100, label='Tsix TAD', alpha=0.6,
                   color='red')
from ensemble_hic.setup_functions import make_marginalized_posterior

# NOTE(review): ``settings``, ``n_replicas``, ``output_folder``,
# ``n_samples``, ``dump_interval``, ``params``, ``schedule`` come from code
# above this chunk — not visible here.
posterior = make_marginalized_posterior(settings)
p = posterior
variables = p.variables

from ensemble_hic.analysis_functions import load_sr_samples

energies = []
L = p.likelihoods['ensemble_contacts']
data = L.forward_model.data_points
P = p.priors['nonbonded_prior']
sels = []
for i in range(n_replicas):
    print i
    samples = load_sr_samples(output_folder + 'samples/', i+1, n_samples+1,
                              dump_interval,
                              burnin=params['burnin'],
                              )#interval=params['samples_step'])
    # Random subsample (without replacement) to cut computation time;
    # chosen indices are kept in sels.
    sel = np.random.choice(len(samples),
                           int(len(samples) / float(params['samples_step'])),
                           replace=False)
    samples = samples[sel]
    sels.append(sel)
    # Per sample: [likelihood energy, prior energy]; a term is zero when the
    # schedule does not temper that component ('lammda' is sic throughout).
    energies.append([[-L.log_prob(**x.variables)
                      if 'lammda' in schedule else 0,
                      -P.log_prob(structures=x.variables['structures'])
                      if 'beta' in schedule else 0]
                     for x in samples])
energies = np.array(energies)
energies_flat = energies.reshape(np.prod(energies.shape[:2]), 2)
# One column per replica: its (lammda, beta) interpolation parameters.
sched = np.array([schedule['lammda'], schedule['beta']])
# q[r, s]: energy of flattened sample s at replica r's schedule parameters.
q = np.array([[(energy * replica_params).sum()
               for energy in energies_flat]
              for replica_params in sched.T])
# NOTE(review): ``common_path``, ``simulations``, ``log_sum_exp``,
# ``gammaln``, ``n_structures``, ``out_file`` come from code above this
# chunk — not visible here.
output_dirs = [common_path + x[1] for x in simulations]
logZs = []
data_terms = []
for x in output_dirs:
    dos = np.load(x + '/analysis/dos.pickle')
    # Log-evidence estimate from the density of states: log-sum-exp over
    # total energies minus the prior-only normalization.
    logZs.append(log_sum_exp(-dos.E.sum(1) + dos.s) - \
                 log_sum_exp(-dos.E[:,1] + dos.s))
    # Recover the replica count by parsing the "<N>replicas" part of the
    # output directory name.
    a = x.find('replicas')
    b = x[a-4:].find('_')
    n_replicas = int(x[a-4+b+1:a])
    p = np.load(x + '/analysis/wham_params.pickle')
    c = parse_config_file(x + '/config.cfg')
    s = load_sr_samples(x + '/samples/', n_replicas, p['n_samples']+1,
                        int(c['replica']['samples_dump_interval']),
                        p['burnin'])
    # Restrict to the subsample used for the WHAM run (last replica's
    # selection indices).
    sels = np.load(x + '/analysis/wham_sels.pickle')
    s = s[sels[-1]]
    p = make_posterior(parse_config_file(x + '/config.cfg'))
    L = p.likelihoods['ensemble_contacts']
    d = L.forward_model.data_points[:,2]
    # Sum of log-factorials of the counts — the data-dependent constant of
    # the Poisson likelihood.
    f = gammaln(d+1).sum()
    print "mean log-posterior:", np.mean(map(lambda x: p.log_prob(**x.variables), s))
    # NOTE(review): the len(d) correction is skipped for the 1pga runs —
    # intent unclear from this chunk alone; confirm.
    logZs[-1] -= f + np.log(len(d)) * (not '1pga' in x)
    data_terms.append(np.array(map(lambda x: -L.log_prob(**x.variables),
                                   s)).mean() + f)
    print "evidence:", logZs[-1]
data_terms = np.array(data_terms)

with open(out_file, "w") as opf:
    dump((n_structures, logZs, data_terms), opf)