def detect_from_summaries(cfg, chrom=1, detect_fmt=("%.1f", "%d")):
    '''
    Detect local concentrations based on existing summaries.

    Parameters
    ----------
    - cfg : dictionary
        Dictionary of parameters containing at least the relevant MCMC draw
        and summary output paths and the parameters for summarisation.
    - chrom : int
        Index of chromosome to analyze
    - detect_fmt : tuple of str
        Format strings for the position and length columns of the detection
        output.

    Returns
    -------
    - status : int
        Integer status for detection. 0 for success, > 0 for failure.
    '''
    # Reference useful information in local namespace
    p_detect = cfg['mcmc_summaries']['p_detect']

    # Extract window size information (+/-) from config
    concentration_pm = cfg['mcmc_summaries']['concentration_pm']
    if isinstance(concentration_pm, str):
        pm_list = [int(s) for s in concentration_pm.split(',')]
    else:
        pm_list = [concentration_pm]

    # Get path to posterior summaries
    pattern_summaries = cfg['mcmc_output']['summary_pattern']
    pattern_summaries = pattern_summaries.strip()
    path_summaries = pattern_summaries.format(**cfg) % chrom

    # Run detection
    if p_detect is not None:
        # Load summaries
        summaries = np.genfromtxt(path_summaries, names=True)

        # Iterate over +/- settings
        for pm in pm_list:
            # Find detected positions
            key = 'p_local_concentration_pm%d' % pm
            detected = np.where(summaries[key] > p_detect)[0]

            # Condense regions
            detected, n = condense_detections(detected)

            # Write detections to text file
            pattern_detections = cfg['mcmc_output']['detections_pattern']
            pattern_detections = pattern_detections.strip()
            path_detections = pattern_detections.format(**cfg) % (chrom, pm)

            detections = np.rec.fromarrays([detected, n], names=('pos', 'n'))
            libio.write_recarray_to_file(fname=path_detections,
                                         data=detections,
                                         header=True,
                                         sep=' ',
                                         fmt=detect_fmt)
    return 0
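
# For reference, a minimal sketch of the cfg entries detect_from_summaries
# reads. The literal paths and values below are illustrative assumptions,
# not defaults of this module; real configs also carry the keys the other
# summarisation entry points need.
_EXAMPLE_DETECT_CFG = {
    'mcmc_summaries': {
        'p_detect': 0.99,              # posterior-probability detection cutoff
        'concentration_pm': '73,147',  # comma-separated +/- window sizes
    },
    'mcmc_output': {
        # Patterns are str.format-ed with cfg, then %-substituted with chrom
        # (and, for detections, pm)
        'summary_pattern': 'summaries/summaries_chrom%02d.txt',
        'detections_pattern': 'detections/detections_chrom%02d_pm%d.txt',
    },
}
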
def summarise_clusters(cfg, chrom=1, null=False):
    '''
    Coordinate summarisation of MCMC results by cluster.

    Clusters are defined via Parzen window smoothing with a cfg-specified
    bandwidth and minimum separation. Following clustering, all cluster-level
    summaries are computed within each iteration (localization, structure,
    occupancy, etc.). The reported outputs are posterior summaries of these
    cluster-level summaries (mean, SD, etc.).

    Parameters
    ----------
    - cfg : dictionary
        Dictionary of parameters containing at least the relevant MCMC draw
        and summary output paths and the parameters for summarisation.
    - chrom : int
        Index of chromosome to analyze
    - null : bool
        Summarise null results?

    Returns
    -------
    - status : int
        Integer status for summarisation. 0 for success, > 0 for failure.
    '''
    # Reference useful information in local namespace
    n_burnin = cfg['mcmc_params']['n_burnin']
    scratch = cfg['mcmc_summaries']['path_scratch']

    # Cluster-level summary information
    cluster_min_spacing = cfg['mcmc_summaries']['cluster_min_spacing']
    cluster_bw = cfg['mcmc_summaries']['cluster_bw']
    cluster_width = cfg['mcmc_summaries']['cluster_width']
    h = cluster_width / 2

    # Extract q_sparsity information for sparsity summaries from config
    q_sparsity = cfg['mcmc_summaries']['q_sparsity']
    if isinstance(q_sparsity, str):
        q_sparsity = [float(s) for s in q_sparsity.split(',')]
    else:
        q_sparsity = [q_sparsity]

    # Extract p_threshold information for n_large summaries from config
    p_threshold = cfg['mcmc_summaries']['p_threshold']
    if isinstance(p_threshold, str):
        p_threshold = [float(s) for s in p_threshold.split(',')]
    else:
        p_threshold = [p_threshold]

    # Check for existence and writeability of scratch directory
    if os.access(scratch, os.F_OK):
        # It exists; check for read-write access
        if not os.access(scratch, os.R_OK | os.W_OK):
            print >> sys.stderr, ("Error --- Cannot read and write to %s" %
                                  scratch)
            return 1
    else:
        # Otherwise, try to make the directory
        os.makedirs(scratch)

    # Extract results to scratch directory
    if null:
        pattern_results = cfg['mcmc_output']['null_out_pattern']
    else:
        pattern_results = cfg['mcmc_output']['out_pattern']
    pattern_results = pattern_results.strip()
    path_results = pattern_results.format(**cfg) % chrom

    archive = tarfile.open(name=path_results, mode='r:*')
    archive.extractall(path=scratch)
    names_npy = archive.getnames()
    archive.close()

    # Load results of interest
    theta = np.load(scratch + '/theta.npy', mmap_mode='r')
    mu = np.load(scratch + '/mu.npy', mmap_mode='r')

    # Remove burnin
    if n_burnin > 0:
        mu = mu[n_burnin:]
        theta = theta[n_burnin:]

    # Compute posterior mean of coefficients.
    # This looks inefficient, but it saves memory --- a lot of memory.
    b_postmean = np.array([np.mean(np.exp(theta_k)) for theta_k in theta.T])

    # Setup window for clustering
    cluster_window = gaussian_window(h=h, sigma=cluster_bw)

    # Get cluster centers
    cluster_centers = get_cluster_centers(x=b_postmean,
                                          window=cluster_window,
                                          min_spacing=cluster_min_spacing,
                                          edge_correction=True)
    n_clusters = cluster_centers.size

    # Create slices by cluster for efficient access
    cluster_slices = [
        slice(max(0, c - h), min(c + h + 1, theta.shape[1]), 1)
        for c in cluster_centers
    ]

    # Extract cluster sizes
    cluster_sizes = np.array([s.stop - s.start for s in cluster_slices],
                             dtype=np.int)

    # Create names for sparsity and n_large variables
    names_sparsity = ["sparsityq%02.0f" % (q * 100) for q in q_sparsity]
    names_sparsity_se = ["sparsityq%02.0f_se" % (q * 100) for q in q_sparsity]
    names_nlarge = ["nlargep%02.0f" % (p * 100) for p in p_threshold]
    names_nlarge_se = ["nlargep%02.0f_se" % (p * 100) for p in p_threshold]

    # Allocate arrays for cluster-level summaries
    cluster_summaries = collections.OrderedDict()
    cluster_summaries['center'] = cluster_centers
    cluster_summaries['cluster_length'] = cluster_sizes
    cluster_summaries['occupancy'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['occupancy_se'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['localization'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['localization_se'] = np.empty(n_clusters,
                                                    dtype=np.float)
    cluster_summaries['structure'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['structure_se'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['sparsity'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['sparsity_se'] = np.empty(n_clusters, dtype=np.float)
    for var in itertools.chain(
        [item for items in itertools.izip(names_sparsity, names_sparsity_se)
         for item in items],
        [item for items in itertools.izip(names_nlarge, names_nlarge_se)
         for item in items]):
        cluster_summaries[var] = np.empty(n_clusters, dtype=np.float)

    # Compute cluster-level summaries, iterating over clusters
    for i, center, cluster in itertools.izip(xrange(n_clusters),
                                             cluster_centers, cluster_slices):
        # Extract cluster coefficient draws
        b_draws = np.exp(theta[:, cluster])
        p_draws = (b_draws.T / np.sum(b_draws, 1)).T

        # Compute posterior mean occupancy and its SD
        cluster_summaries['occupancy'][i] = (np.mean(b_draws) *
                                             cluster_sizes[i])
        cluster_summaries['occupancy_se'][i] = np.std(np.sum(b_draws, axis=1))

        # Compute localization index by draw
        x = np.arange(cluster_sizes[i])[np.newaxis, :]
        localization = localization_index(x=x, p=p_draws, axis=1)
        cluster_summaries['localization'][i] = np.mean(localization)
        cluster_summaries['localization_se'][i] = np.std(localization)

        # Compute structure index by draw
        structure = structure_index(x=b_draws, axis=1)
        cluster_summaries['structure'][i] = np.mean(structure)
        cluster_summaries['structure_se'][i] = np.std(structure)

        # Compute sparsity index by draw
        sparsity = sparsity_index(x=b_draws, q=q_sparsity, axis=1)
        for i_q in xrange(len(q_sparsity)):
            cluster_summaries[names_sparsity[i_q]][i] = np.mean(sparsity[i_q])
            cluster_summaries[names_sparsity_se[i_q]][i] = np.std(
                sparsity[i_q])

        # Compute n_large by draw
        n_large = compute_n_large(x=b_draws, p_threshold=p_threshold, axis=1)
        for i_p in xrange(len(p_threshold)):
            cluster_summaries[names_nlarge[i_p]][i] = np.mean(n_large[i_p])
            cluster_summaries[names_nlarge_se[i_p]][i] = np.std(n_large[i_p])

    # Provide nicely-formatted delimited output for analyses and plotting
    if null:
        pattern_summaries = cfg['mcmc_output']['null_cluster_pattern']
    else:
        pattern_summaries = cfg['mcmc_output']['cluster_pattern']
    pattern_summaries = pattern_summaries.strip()
    path_summaries = pattern_summaries.format(**cfg) % chrom

    # Build recarray of cluster-level summaries
    summaries = np.rec.fromarrays(cluster_summaries.values(),
                                  names=cluster_summaries.keys())

    # Write summaries to delimited text file
    libio.write_recarray_to_file(fname=path_summaries,
                                 data=summaries,
                                 header=True,
                                 sep=' ')

    # Clean up scratch directory
    for name in names_npy:
        os.remove(scratch + '/' + name)

    return 0
def summarise(cfg, chrom=1, null=False, mmap=False,
              detect_fmt=("%.1f", "%d")):
    '''
    Coordinate summarisation of MCMC results.

    Parameters
    ----------
    - cfg : dictionary
        Dictionary of parameters containing at least the relevant MCMC draw
        and summary output paths and the parameters for summarisation.
    - chrom : int
        Index of chromosome to analyze
    - null : bool
        Summarise null results?
    - mmap : bool
        Memory-map the theta draws instead of loading them into memory?
    - detect_fmt : tuple of str
        Format strings for the position and length columns of the detection
        output.

    Returns
    -------
    - status : int
        Integer status for summarisation. 0 for success, > 0 for failure.
    '''
    # Reference useful information in local namespace
    n_burnin = cfg['mcmc_params']['n_burnin']
    scratch = cfg['mcmc_summaries']['path_scratch']
    width_local = cfg['mcmc_summaries']['width_local']
    p_detect = cfg['mcmc_summaries']['p_detect']
    bp_per_nucleosome = cfg['mcmc_summaries']['bp_per_nucleosome']

    # Extract window size information (+/-) from config
    concentration_pm = cfg['mcmc_summaries']['concentration_pm']
    if isinstance(concentration_pm, str):
        pm_list = [int(s) for s in concentration_pm.split(',')]
    else:
        pm_list = [concentration_pm]

    # Check for existence and writeability of scratch directory
    if os.access(scratch, os.F_OK):
        # It exists; check for read-write access
        if not os.access(scratch, os.R_OK | os.W_OK):
            print >> sys.stderr, ("Error --- Cannot read and write to %s" %
                                  scratch)
            return 1
    else:
        # Otherwise, try to make the directory
        os.makedirs(scratch)

    # Extract results to scratch directory
    if null:
        pattern_results = cfg['mcmc_output']['null_out_pattern']
    else:
        pattern_results = cfg['mcmc_output']['out_pattern']
    pattern_results = pattern_results.strip()
    path_results = pattern_results.format(**cfg) % chrom

    archive = tarfile.open(name=path_results, mode='r:*')
    archive.extractall(path=scratch)
    names_npy = archive.getnames()
    archive.close()

    # Load results of interest
    if mmap:
        mmap_mode = 'r+'
    else:
        mmap_mode = None
    theta = np.load(scratch + '/theta.npy', mmap_mode=mmap_mode)
    mu = np.load(scratch + '/mu.npy')

    # Remove burnin
    if n_burnin > 0:
        mu = mu[n_burnin:]
        theta = theta[n_burnin:]

    # Compute effective sample sizes
    n_eff = np.array([ess1d(theta_k) for theta_k in theta.T])
    gc.collect()

    # Compute concentration summaries
    local_concentrations = collections.OrderedDict()
    global_concentrations = collections.OrderedDict()

    # Iterate over concentration window sizes (+/-)
    for pm in pm_list:
        # Estimate probability of +/-(pm) local concentrations
        window_local = np.ones(width_local)
        window_pm = np.ones(1 + 2 * pm)
        baseline = (np.convolve(np.ones_like(theta[0]), window_pm, 'same') /
                    np.convolve(np.ones_like(theta[0]), window_local, 'same'))

        # Setup array for estimates by basepair
        p_local_concentration = np.zeros(theta.shape[1], dtype=np.float)

        # Iterate over draws, accumulating the mean and SE of the local
        # relative occupancy online
        mean_lro = np.zeros(theta.shape[1], dtype=np.float)
        se_lro = np.zeros(theta.shape[1], dtype=np.float)
        for t in xrange(theta.shape[0]):
            bt = np.exp(theta[t])
            local_occupancy_smoothed = local_relative_occupancy(
                bt, window_pm, window_local)
            delta = local_occupancy_smoothed - mean_lro
            mean_lro += delta / (t + 1.)
            se_lro += delta * (local_occupancy_smoothed - mean_lro)
            p_local_concentration *= t / (t + 1.)
            p_local_concentration += ((local_occupancy_smoothed > baseline) /
                                      (t + 1.))
        se_lro = np.sqrt(se_lro / (theta.shape[0] - 1))

        # Store results in dictionary
        key = 'p_local_concentration_pm%d' % pm
        local_concentrations[key] = p_local_concentration

        key = 'mean_local_concentration_pm%d' % pm
        local_concentrations[key] = mean_lro

        key = 'se_local_concentration_pm%d' % pm
        local_concentrations[key] = se_lro

        key = 'z_local_concentration_pm%d' % pm
        local_concentrations[key] = mean_lro / se_lro

        # Clean up
        del local_occupancy_smoothed
        gc.collect()

        # Posterior quantiles for global concentrations
        baseline_global = (
            np.array([np.sum(np.exp(theta_t)) for theta_t in theta]) /
            theta.shape[1] * bp_per_nucleosome)

        # Setup arrays for means and quantiles by basepair
        q_global_concentration = np.zeros(theta.shape[1], dtype=np.float)
        mean_global_concentration = np.zeros(theta.shape[1], dtype=np.float)

        # Iterate over basepairs
        for bp in xrange(theta.shape[1]):
            w = slice(max(0, bp - pm), min(bp + pm + 1, theta.shape[1]))
            prop = (np.sum(np.exp(theta[:, w]), 1) / baseline_global /
                    (w.stop - w.start))
            mean_global_concentration[bp] = np.mean(prop)
            q_global_concentration[bp] = mstats.mquantiles(prop,
                                                           1. - p_detect)

        # Store results in dictionaries
        key = 'q_global_concentration_pm%d' % pm
        global_concentrations[key] = q_global_concentration

        key = 'mean_global_concentration_pm%d' % pm
        global_concentrations[key] = mean_global_concentration

    # Compute posterior means
    theta_postmean = np.mean(theta, 0)
    b_postmean = np.array([np.mean(np.exp(theta_k)) for theta_k in theta.T])

    # Compute standard errors
    theta_se = np.array([np.std(theta_k) for theta_k in theta.T])
    b_se = np.array([np.std(np.exp(theta_k)) for theta_k in theta.T])

    # Compute posterior medians
    theta_postmed = np.array([np.median(theta_k) for theta_k in theta.T])
    b_postmed = np.exp(theta_postmed)

    # Provide nicely-formatted delimited output for analyses and plotting
    if null:
        pattern_summaries = cfg['mcmc_output']['null_summary_pattern']
    else:
        pattern_summaries = cfg['mcmc_output']['summary_pattern']
    pattern_summaries = pattern_summaries.strip()
    path_summaries = pattern_summaries.format(**cfg) % chrom

    # Build recarray of summaries, starting with coefficients and diagnostics
    summaries = np.rec.fromarrays(
        [theta_postmean, theta_postmed, theta_se, b_postmean, b_postmed,
         b_se, n_eff],
        names=('theta', 'theta_med', 'se_theta', 'b', 'b_med', 'se_b',
               'n_eff'))

    # Append local concentration information
    summaries = nprf.append_fields(base=summaries,
                                   names=local_concentrations.keys(),
                                   data=local_concentrations.values())

    # Append global concentration information
    summaries = nprf.append_fields(base=summaries,
                                   names=global_concentrations.keys(),
                                   data=global_concentrations.values())

    # Write summaries to delimited text file
    libio.write_recarray_to_file(fname=path_summaries,
                                 data=summaries,
                                 header=True,
                                 sep=' ')

    # Run detection, if requested
    if p_detect is not None and not null:
        for pm in pm_list:
            # Find detected positions
            key = 'p_local_concentration_pm%d' % pm
            detected = np.where(local_concentrations[key] > p_detect)[0]

            # Condense regions
            detected, n = condense_detections(detected)

            # Write detections to text file
            pattern_detections = cfg['mcmc_output']['detections_pattern']
            pattern_detections = pattern_detections.strip()
            path_detections = pattern_detections.format(**cfg) % (chrom, pm)

            detections = np.rec.fromarrays([detected, n], names=('pos', 'n'))
            libio.write_recarray_to_file(fname=path_detections,
                                         data=detections,
                                         header=True,
                                         sep=' ',
                                         fmt=detect_fmt)

    # Clean up scratch directory
    for name in names_npy:
        os.remove(scratch + '/' + name)

    return 0
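
# The local-concentration loop in summarise accumulates posterior means and
# SEs with Welford's online update, so only one draw of exp(theta) is held
# in memory at a time. A minimal self-contained sketch of that update (the
# function name and 2-D input layout are assumptions for illustration):
def _online_mean_sd(draws):
    # draws: (n_draws, n_bp) array; returns per-column mean and sample SD
    mean = np.zeros(draws.shape[1])
    m2 = np.zeros(draws.shape[1])
    for t in xrange(draws.shape[0]):
        delta = draws[t] - mean
        mean += delta / (t + 1.)
        m2 += delta * (draws[t] - mean)
    return mean, np.sqrt(m2 / (draws.shape[0] - 1))
# This matches np.mean(draws, 0) and np.std(draws, 0, ddof=1) up to
# floating-point error, without materializing all draws at once.
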
def summarise_params(cfg, chrom=1, null=False):
    '''
    Coordinate summarisation of MCMC parameter draws.

    Parameters
    ----------
    - cfg : dictionary
        Dictionary of parameters containing at least the relevant MCMC draw
        and summary output paths and the parameters for summarisation.
    - chrom : int
        Index of chromosome to analyze
    - null : bool
        Summarise null results?

    Returns
    -------
    - status : int
        Integer status for summarisation. 0 for success, > 0 for failure.
    '''
    # Reference useful information in local namespace
    n_burnin = cfg['mcmc_params']['n_burnin']
    scratch = cfg['mcmc_summaries']['path_scratch']

    # Check for existence and writeability of scratch directory
    if os.access(scratch, os.F_OK):
        # It exists; check for read-write access
        if not os.access(scratch, os.R_OK | os.W_OK):
            print >> sys.stderr, ("Error --- Cannot read and write to %s" %
                                  scratch)
            return 1
    else:
        # Otherwise, try to make the directory
        os.makedirs(scratch)

    # Extract results to scratch directory
    if null:
        pattern_results = cfg['mcmc_output']['null_out_pattern']
    else:
        pattern_results = cfg['mcmc_output']['out_pattern']
    pattern_results = pattern_results.strip()
    path_results = pattern_results.format(**cfg) % chrom

    archive = tarfile.open(name=path_results, mode='r:*')
    archive.extractall(path=scratch)
    names_npy = archive.getnames()
    archive.close()

    # Load results of interest
    mu = np.load(scratch + '/mu.npy')
    sigmasq = np.load(scratch + '/sigmasq.npy')
    region_ids = np.load(scratch + '/region_ids.npy')

    # Remove burnin
    if n_burnin > 0:
        mu = mu[n_burnin:]
        sigmasq = sigmasq[n_burnin:]

    # Compute posterior means
    mu_postmean = np.mean(mu, 0)
    sigmasq_postmean = np.mean(sigmasq, 0)
    sigma_postmean = np.mean(np.sqrt(sigmasq), 0)

    # Compute posterior medians
    mu_postmed = np.median(mu, 0)
    sigmasq_postmed = np.median(sigmasq, 0)
    sigma_postmed = np.median(np.sqrt(sigmasq), 0)

    # Compute standard errors
    mu_se = np.std(mu, 0)
    sigmasq_se = np.std(sigmasq, 0)
    sigma_se = np.std(np.sqrt(sigmasq), 0)

    # Provide nicely-formatted delimited output for analyses and plotting
    if null:
        pattern_summaries = cfg['mcmc_output']['null_param_pattern']
    else:
        pattern_summaries = cfg['mcmc_output']['param_pattern']
    pattern_summaries = pattern_summaries.strip()
    path_summaries = pattern_summaries.format(**cfg) % chrom

    # Build recarray of parameter summaries by region
    summaries = np.rec.fromarrays(
        [region_ids, mu_postmean, mu_postmed, mu_se, sigmasq_postmean,
         sigmasq_postmed, sigmasq_se, sigma_postmean, sigma_postmed,
         sigma_se],
        names=('region_id', 'mu_postmean', 'mu_postmed', 'mu_se',
               'sigmasq_postmean', 'sigmasq_postmed', 'sigmasq_se',
               'sigma_postmean', 'sigma_postmed', 'sigma_se'))

    # Write summaries to delimited text file
    libio.write_recarray_to_file(fname=path_summaries,
                                 data=summaries,
                                 header=True,
                                 sep=' ')

    # Clean up scratch directory
    for name in names_npy:
        os.remove(scratch + '/' + name)

    return 0
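
# A hedged sketch of driving the three entry points above for a single
# chromosome; the ordering and the early exit on a nonzero status are
# illustrative conventions, not requirements of this module.
def _example_summarise_chromosome(cfg, chrom=1, null=False):
    for fn in (summarise, summarise_params, summarise_clusters):
        status = fn(cfg, chrom=chrom, null=null)
        if status:
            return status
    return 0
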