Example 1
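These snippets are excerpts from a larger Python 2 module (note the print >>, xrange, and itertools.izip idioms). A plausible set of module-level imports they assume is sketched below; the package-local helpers (libio, ess1d, condense_detections, local_relative_occupancy, gaussian_window, get_cluster_centers, localization_index, structure_index, sparsity_index, compute_n_large) belong to the original package and are not reproduced here.

# Plausible module-level imports assumed by the snippets below
import collections
import gc
import itertools
import os
import sys
import tarfile

import numpy as np
import numpy.lib.recfunctions as nprf
from scipy.stats import mstats
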
def detect_from_summaries(cfg, chrom=1, detect_fmt=("%.1f", "%d")):
    '''
    Detect local concentrations based on existing summaries.

    Parameters
    ----------
    - cfg : dictionary
        Dictionary of parameters containing at least the relevant MCMC
        draw and summary output paths and the summarisation parameters.
    - chrom : int
        Index of chromosome to analyze
    - detect_fmt : tuple of str
        Format strings for the position and length columns of the
        detection output

    Returns
    -------
    - status : int
        Integer status for detection. 0 for success, > 0 for failure.
    '''
    # Reference useful information in local namespace
    p_detect = cfg['mcmc_summaries']['p_detect']

    # Extract window size information (+/-) from config
    concentration_pm = cfg['mcmc_summaries']['concentration_pm']
    if isinstance(concentration_pm, str):
        pm_list = [int(s) for s in concentration_pm.split(',')]
    else:
        pm_list = [concentration_pm]

    # Get path to posterior summaries
    pattern_summaries = cfg['mcmc_output']['summary_pattern']
    pattern_summaries = pattern_summaries.strip()
    path_summaries = pattern_summaries.format(**cfg) % chrom

    # Run detection
    if p_detect is not None:
        # Load summaries
        summaries = np.genfromtxt(path_summaries, names=True)

        # Iterate over +/- settings
        for pm in pm_list:
            # Find detected positions
            key = 'p_local_concentration_pm%d' % pm
            detected = np.where(summaries[key] > p_detect)[0]

            # Condense regions
            detected, n = condense_detections(detected)

            # Write detections to text file
            pattern_detections = cfg['mcmc_output']['detections_pattern']
            pattern_detections = pattern_detections.strip()
            path_detections = pattern_detections.format(**cfg) % (chrom, pm)

            detections = np.rec.fromarrays([detected, n], names=('pos', 'n'))
            libio.write_recarray_to_file(fname=path_detections,
                                         data=detections,
                                         header=True,
                                         sep=' ',
                                         fmt=detect_fmt)

    return 0
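
The helper condense_detections is not shown in these snippets. Below is a minimal sketch consistent with its use here, assuming it collapses each run of consecutive detected indices into a center position and a run length (which would also explain the "%.1f" position format, since the center of an even-length run falls on a half-integer). The original package's version may differ.

def condense_detections(detected):
    '''Collapse runs of consecutive indices into (center, length) pairs.

    Hypothetical reimplementation for illustration only.
    '''
    positions, lengths = [], []
    if len(detected) > 0:
        start = prev = detected[0]
        for idx in detected[1:]:
            if idx == prev + 1:
                prev = idx
                continue
            # Close the current run and start a new one
            positions.append((start + prev) / 2.)
            lengths.append(prev - start + 1)
            start = prev = idx
        # Close the final run
        positions.append((start + prev) / 2.)
        lengths.append(prev - start + 1)
    return np.array(positions), np.array(lengths)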
Example 2
def summarise_clusters(cfg, chrom=1, null=False):
    '''
    Coordinate summarisation of MCMC results by cluster.

    Clusters are defined via Parzen window smoothing with a cfg-specified
    bandwidth and minimum separation. After clustering, the cluster-level
    summaries (localization, structure, occupancy, etc.) are computed within
    each MCMC iteration; the reported outputs are posterior summaries
    (mean, SD, etc.) of these per-iteration quantities.

    Parameters
    ----------
    - cfg : dictionary
        Dictionary of parameters containing at least the relevant MCMC
        draw and summary output paths and the summarisation parameters.
    - chrom : int
        Index of chromosome to analyze
    - null : bool
        Summarise null results?

    Returns
    -------
    - status : int
        Integer status for summarisation. 0 for success, > 0 for failure.
    '''
    # Reference useful information in local namespace
    n_burnin = cfg['mcmc_params']['n_burnin']
    scratch = cfg['mcmc_summaries']['path_scratch']
    # Cluster-level summary information
    cluster_min_spacing = cfg['mcmc_summaries']['cluster_min_spacing']
    cluster_bw = cfg['mcmc_summaries']['cluster_bw']
    cluster_width = cfg['mcmc_summaries']['cluster_width']
    h = cluster_width / 2

    # Extract q_sparsity information for sparsity summaries from config
    q_sparsity = cfg['mcmc_summaries']['q_sparsity']
    if isinstance(q_sparsity, str):
        q_sparsity = [float(s) for s in q_sparsity.split(',')]
    else:
        q_sparsity = [q_sparsity]

    # Extract p_threshold information for n_large summaries from config
    p_threshold = cfg['mcmc_summaries']['p_threshold']
    if isinstance(p_threshold, str):
        p_threshold = [float(s) for s in p_threshold.split(',')]
    else:
        p_threshold = [p_threshold]

    # Check for existence and writeability of scratch directory
    if os.access(scratch, os.F_OK):
        # It exists, check for read-write
        if not os.access(scratch, os.R_OK | os.W_OK):
            print >> sys.stderr, ("Error --- Cannot read and write to %s" %
                                  scratch)
            return 1
    else:
        # Otherwise, try to make the directory
        os.makedirs(scratch)

    # Extract results to scratch directory
    if null:
        pattern_results = cfg['mcmc_output']['null_out_pattern']
    else:
        pattern_results = cfg['mcmc_output']['out_pattern']
    pattern_results = pattern_results.strip()
    path_results = pattern_results.format(**cfg) % chrom

    archive = tarfile.open(name=path_results, mode='r:*')
    archive.extractall(path=scratch)
    names_npy = archive.getnames()
    archive.close()

    # Load results of interest
    theta = np.load(scratch + '/theta.npy', mmap_mode='r')
    mu = np.load(scratch + '/mu.npy', mmap_mode='r')

    # Remove burnin
    if n_burnin > 0:
        mu = mu[n_burnin:]
        theta = theta[n_burnin:]

    # Compute posterior mean of coefficients
    # This looks inefficient, but it saves memory --- a lot of memory
    b_postmean = np.array([np.mean(np.exp(theta_k)) for theta_k in theta.T])

    # Setup window for clustering
    cluster_window = gaussian_window(h=h, sigma=cluster_bw)

    # Get cluster centers
    cluster_centers = get_cluster_centers(x=b_postmean,
                                          window=cluster_window,
                                          min_spacing=cluster_min_spacing,
                                          edge_correction=True)
    n_clusters = cluster_centers.size

    # Create slices by cluster for efficient access
    cluster_slices = [
        slice(max(0, c - h), min(c + h + 1, theta.shape[1]), 1)
        for c in cluster_centers
    ]

    # Extract cluster sizes
    cluster_sizes = np.array([s.stop - s.start for s in cluster_slices],
                             dtype=np.int)

    # Create names for sparsity and n_large variables
    names_sparsity = ["sparsityq%02.0f" % (q * 100) for q in q_sparsity]
    names_sparsity_se = ["sparsityq%02.0f_se" % (q * 100) for q in q_sparsity]
    names_nlarge = ["nlargep%02.0f" % (p * 100) for p in p_threshold]
    names_nlarge_se = ["nlargep%02.0f_se" % (p * 100) for p in p_threshold]

    # Allocate arrays for cluster-level summaries
    cluster_summaries = collections.OrderedDict()
    cluster_summaries['center'] = cluster_centers
    cluster_summaries['cluster_length'] = cluster_sizes
    cluster_summaries['occupancy'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['occupancy_se'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['localization'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['localization_se'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['structure'] = np.empty(n_clusters, dtype=np.float)
    cluster_summaries['structure_se'] = np.empty(n_clusters, dtype=np.float)
    # Interleave each point estimate with its standard error
    for var in itertools.chain(
            itertools.chain.from_iterable(
                itertools.izip(names_sparsity, names_sparsity_se)),
            itertools.chain.from_iterable(
                itertools.izip(names_nlarge, names_nlarge_se))):
        cluster_summaries[var] = np.empty(n_clusters, dtype=np.float)

    # Compute cluster-level summaries, iterating over clusters
    for i, center, cluster in itertools.izip(xrange(n_clusters),
                                             cluster_centers, cluster_slices):
        # Extract cluster coefficient draws
        b_draws = np.exp(theta[:, cluster])
        p_draws = (b_draws.T / np.sum(b_draws, 1)).T

        # Compute posterior mean occupancy and its SD
        cluster_summaries['occupancy'][i] = np.mean(b_draws) * cluster_sizes[i]
        cluster_summaries['occupancy_se'][i] = np.std(np.sum(b_draws, axis=1))

        # Compute localization index by draw
        x = np.arange(cluster_sizes[i])[np.newaxis, :]
        localization = localization_index(x=x, p=p_draws, axis=1)
        cluster_summaries['localization'][i] = np.mean(localization)
        cluster_summaries['localization_se'][i] = np.std(localization)

        # Compute structure index by draw
        structure = structure_index(x=b_draws, axis=1)
        cluster_summaries['structure'][i] = np.mean(structure)
        cluster_summaries['structure_se'][i] = np.std(structure)

        # Compute sparsity index by draw
        sparsity = sparsity_index(x=b_draws, q=q_sparsity, axis=1)
        for i_q in xrange(len(q_sparsity)):
            cluster_summaries[names_sparsity[i_q]][i] = np.mean(sparsity[i_q])
            cluster_summaries[names_sparsity_se[i_q]][i] = np.std(
                sparsity[i_q])

        # Compute n_large by draw
        n_large = compute_n_large(x=b_draws, p_threshold=p_threshold, axis=1)
        for i_p in xrange(len(p_threshold)):
            cluster_summaries[names_nlarge[i_p]][i] = np.mean(n_large[i_p])
            cluster_summaries[names_nlarge_se[i_p]][i] = np.std(n_large[i_p])

    # Provide nicely-formatted delimited output for analyses and plotting
    if null:
        pattern_summaries = cfg['mcmc_output']['null_cluster_pattern']
    else:
        pattern_summaries = cfg['mcmc_output']['cluster_pattern']
    pattern_summaries = pattern_summaries.strip()
    path_summaries = pattern_summaries.format(**cfg) % chrom

    # Build recarray of summaries, starting with coefficients and diagnostics
    summaries = np.rec.fromarrays(cluster_summaries.values(),
                                  names=cluster_summaries.keys())

    # Write summaries to delimited text file
    libio.write_recarray_to_file(fname=path_summaries,
                                 data=summaries,
                                 header=True,
                                 sep=' ')

    # Clean up scratch directory
    for name in names_npy:
        os.remove(scratch + '/' + name)

    return 0
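
gaussian_window and get_cluster_centers are also package-local helpers. Below is a rough sketch of what the docstring describes (Parzen window smoothing of the posterior-mean coefficients, followed by peak-picking with a minimum separation); the actual implementation in the source package may differ.

def gaussian_window(h, sigma):
    # Truncated Gaussian kernel on [-h, h], normalised to unit mass
    x = np.arange(-h, h + 1)
    w = np.exp(-0.5 * (x / float(sigma)) ** 2)
    return w / np.sum(w)

def get_cluster_centers(x, window, min_spacing, edge_correction=True):
    # Parzen-smooth the coefficient profile
    smoothed = np.convolve(x, window, 'same')
    if edge_correction:
        # Rescale near the edges, where part of the kernel falls off
        # the ends of the array
        smoothed /= np.convolve(np.ones_like(x), window, 'same')
    # Candidate centers are strict local maxima of the smoothed profile
    candidates = np.where((smoothed[1:-1] > smoothed[:-2]) &
                          (smoothed[1:-1] > smoothed[2:]))[0] + 1
    # Enforce the minimum spacing, keeping stronger peaks first
    order = candidates[np.argsort(smoothed[candidates])[::-1]]
    centers = []
    for c in order:
        if all(abs(c - kept) >= min_spacing for kept in centers):
            centers.append(c)
    return np.sort(np.array(centers, dtype=int))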
Example 3
def summarise(cfg, chrom=1, null=False, mmap=False, detect_fmt=("%.1f", "%d")):
    '''
    Coordinate summarisation of MCMC results.

    Parameters
    ----------
    - cfg : dictionary
        Dictionary of parameters containing at least the relevant MCMC
        draw and summary output paths and the summarisation parameters.
    - chrom : int
        Index of chromosome to analyze
    - null : bool
        Summarise null results?
    - mmap : bool
        Memory-map the theta draws instead of loading them into memory?
    - detect_fmt : tuple of str
        Format strings for the position and length columns of the
        detection output

    Returns
    -------
    - status : int
        Integer status for summarisation. 0 for success, > 0 for failure.
    '''
    # Reference useful information in local namespace
    n_burnin = cfg['mcmc_params']['n_burnin']
    scratch = cfg['mcmc_summaries']['path_scratch']
    width_local = cfg['mcmc_summaries']['width_local']
    p_detect = cfg['mcmc_summaries']['p_detect']
    bp_per_nucleosome = cfg['mcmc_summaries']['bp_per_nucleosome']

    # Extract window size information (+/-) from config
    concentration_pm = cfg['mcmc_summaries']['concentration_pm']
    if isinstance(concentration_pm, str):
        pm_list = [int(s) for s in concentration_pm.split(',')]
    else:
        pm_list = [concentration_pm]

    # Check for existence and writeability of scratch directory
    if os.access(scratch, os.F_OK):
        # It exists, check for read-write
        if not os.access(scratch, os.R_OK | os.W_OK):
            print >> sys.stderr, ("Error --- Cannot read and write to %s" %
                                  scratch)
            return 1
    else:
        # Otherwise, try to make the directory
        os.makedirs(scratch)

    # Extract results to scratch directory
    if null:
        pattern_results = cfg['mcmc_output']['null_out_pattern']
    else:
        pattern_results = cfg['mcmc_output']['out_pattern']
    pattern_results = pattern_results.strip()
    path_results = pattern_results.format(**cfg) % chrom

    archive = tarfile.open(name=path_results, mode='r:*')
    archive.extractall(path=scratch)
    names_npy = archive.getnames()
    archive.close()

    # Load results of interest
    if mmap:
        mmap_mode = 'r+'
    else:
        mmap_mode = None

    theta = np.load(scratch + '/theta.npy', mmap_mode=mmap_mode)
    mu = np.load(scratch + '/mu.npy')

    # Remove burnin
    if n_burnin > 0:
        mu = mu[n_burnin:]
        theta = theta[n_burnin:]

    # Compute effective sample sizes
    n_eff = np.array([ess1d(theta_k) for theta_k in theta.T])
    gc.collect()

    # Compute concentration summaries
    local_concentrations = collections.OrderedDict()
    global_concentrations = collections.OrderedDict()

    # Iterate over concentration window sizes (+/-)
    for pm in pm_list:
        # Estimate probability of +/-(pm) local concentrations
        window_local = np.ones(width_local)
        window_pm = np.ones(1 + 2 * pm)
        baseline = (np.convolve(np.ones_like(theta[0]), window_pm, 'same') /
                    np.convolve(np.ones_like(theta[0]), window_local, 'same'))

        # Setup array for estimates by basepair
        p_local_concentration = np.zeros(theta.shape[1], dtype=np.float)

        # Iterate over draws
        mean_lro = np.zeros(theta.shape[1], dtype=np.float)
        se_lro = np.zeros(theta.shape[1], dtype=np.float)
        for t in xrange(theta.shape[0]):
            bt = np.exp(theta[t])
            local_occupancy_smoothed = local_relative_occupancy(
                bt, window_pm, window_local)
            delta = local_occupancy_smoothed - mean_lro
            mean_lro += delta / (t + 1.)
            se_lro += delta * (local_occupancy_smoothed - mean_lro)
            p_local_concentration *= t / (t + 1.)
            p_local_concentration += ((local_occupancy_smoothed > baseline) /
                                      (t + 1.))
        se_lro = np.sqrt(se_lro / (theta.shape[0] - 1))

        # Store results in dictionary
        key = 'p_local_concentration_pm%d' % pm
        local_concentrations[key] = p_local_concentration
        key = 'mean_local_concentration_pm%d' % pm
        local_concentrations[key] = mean_lro
        key = 'se_local_concentration_pm%d' % pm
        local_concentrations[key] = se_lro
        key = 'z_local_concentration_pm%d' % pm
        local_concentrations[key] = mean_lro / se_lro

        # Clean up
        del local_occupancy_smoothed
        gc.collect()

        # Posterior quantiles for global concentrations
        baseline_global = (
            np.array([np.sum(np.exp(theta_t)) for theta_t in theta]) /
            theta.shape[1] * bp_per_nucleosome)

        # Setup arrays for means and quantiles by basepair
        q_global_concentration = np.zeros(theta.shape[1], dtype=np.float)
        mean_global_concentration = np.zeros(theta.shape[1], dtype=np.float)

        # Iterate over basepairs
        for bp in xrange(theta.shape[1]):
            w = slice(max(0, bp - pm), min(bp + pm + 1, theta.shape[1]))
            prop = (np.sum(np.exp(theta[:, w]), 1) / baseline_global /
                    (w.stop - w.start))
            mean_global_concentration[bp] = np.mean(prop)
            q_global_concentration[bp] = mstats.mquantiles(prop, 1. - p_detect)

        # Store results in dictionaries
        key = 'q_global_concentration_pm%d' % pm
        global_concentrations[key] = q_global_concentration
        key = 'mean_global_concentration_pm%d' % pm
        global_concentrations[key] = mean_global_concentration

    # Compute posterior means
    theta_postmean = np.mean(theta, 0)
    b_postmean = np.array([np.mean(np.exp(theta_k)) for theta_k in theta.T])

    # Compute standard errors
    theta_se = np.array([np.std(theta_k) for theta_k in theta.T])
    b_se = np.array([np.std(np.exp(theta_k)) for theta_k in theta.T])

    # Compute posterior medians
    theta_postmed = np.array([np.median(theta_k) for theta_k in theta.T])
    b_postmed = np.exp(theta_postmed)

    # Provide nicely-formatted delimited output for analyses and plotting
    if null:
        pattern_summaries = cfg['mcmc_output']['null_summary_pattern']
    else:
        pattern_summaries = cfg['mcmc_output']['summary_pattern']
    pattern_summaries = pattern_summaries.strip()
    path_summaries = pattern_summaries.format(**cfg) % chrom

    # Build recarray of summaries, starting with coefficients and diagnostics
    summaries = np.rec.fromarrays(
        [theta_postmean, theta_postmed, theta_se,
         b_postmean, b_postmed, b_se, n_eff],
        names=('theta', 'theta_med', 'se_theta',
               'b', 'b_med', 'se_b', 'n_eff'))

    # Append local concentration information
    summaries = nprf.append_fields(base=summaries,
                                   names=local_concentrations.keys(),
                                   data=local_concentrations.values())

    # Append global concentration information
    summaries = nprf.append_fields(base=summaries,
                                   names=global_concentrations.keys(),
                                   data=global_concentrations.values())

    # Write summaries to delimited text file
    libio.write_recarray_to_file(fname=path_summaries,
                                 data=summaries,
                                 header=True,
                                 sep=' ')

    # Run detection, if requested
    if p_detect is not None and not null:
        for pm in pm_list:
            # Find detected positions
            key = 'p_local_concentration_pm%d' % pm
            detected = np.where(local_concentrations[key] > p_detect)[0]

            # Condense regions
            detected, n = condense_detections(detected)

            # Write detections to text file
            pattern_detections = cfg['mcmc_output']['detections_pattern']
            pattern_detections = pattern_detections.strip()
            path_detections = pattern_detections.format(**cfg) % (chrom, pm)

            detections = np.rec.fromarrays([detected, n], names=('pos', 'n'))
            libio.write_recarray_to_file(fname=path_detections,
                                         data=detections,
                                         header=True,
                                         sep=' ',
                                         fmt=detect_fmt)

    # Clean up scratch directory
    for name in names_npy:
        os.remove(scratch + '/' + name)

    return 0
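
Two notes on the draw loop above. First, the running mean_lro / se_lro updates are Welford's one-pass algorithm, so only a single draw needs to be in memory at a time; the check below verifies the update rule against the batch NumPy equivalents. Second, local_relative_occupancy is a package-local helper not shown here; the sketched form is an assumption, chosen to be consistent with how baseline is built from the same two windows.

def local_relative_occupancy(b, window_num, window_denom):
    # Assumed form: ratio of two moving sums of the occupancy profile.
    # Comparing it to baseline (the same ratio for a flat profile)
    # amounts to comparing mean occupancy in the two windows.
    return (np.convolve(b, window_num, 'same') /
            np.convolve(b, window_denom, 'same'))

# Verify the one-pass mean/SD updates (Welford's algorithm) against the
# batch NumPy equivalents on synthetic draws
draws = np.random.RandomState(0).lognormal(size=(200, 50))
mean = np.zeros(draws.shape[1])
m2 = np.zeros(draws.shape[1])
for t in xrange(draws.shape[0]):
    delta = draws[t] - mean
    mean += delta / (t + 1.)
    m2 += delta * (draws[t] - mean)
sd = np.sqrt(m2 / (draws.shape[0] - 1))
assert np.allclose(mean, draws.mean(axis=0))
assert np.allclose(sd, draws.std(axis=0, ddof=1))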
Example 4
def summarise_params(cfg, chrom=1, null=False):
    '''
    Coordinate summarisation of MCMC parameter draws.

    Parameters
    ----------
    - cfg : dictionary
        Dictionary of parameters containing at least the relevant MCMC
        draw and summary output paths and the summarisation parameters.
    - chrom : int
        Index of chromosome to analyze
    - null : bool
        Summarise null results?

    Returns
    -------
    - status : int
        Integer status for summarisation. 0 for success, > 0 for failure.
    '''
    # Reference useful information in local namespace
    n_burnin = cfg['mcmc_params']['n_burnin']
    scratch = cfg['mcmc_summaries']['path_scratch']

    # Check for existence and writeability of scratch directory
    if os.access(scratch, os.F_OK):
        # It exists, check for read-write
        if not os.access(scratch, os.R_OK | os.W_OK):
            print >> sys.stderr, ("Error --- Cannot read and write to %s" %
                                  scratch)
            return 1
    else:
        # Otherwise, try to make the directory
        os.makedirs(scratch)

    # Extract results to scratch directory
    if null:
        pattern_results = cfg['mcmc_output']['null_out_pattern']
    else:
        pattern_results = cfg['mcmc_output']['out_pattern']
    pattern_results = pattern_results.strip()
    path_results = pattern_results.format(**cfg) % chrom

    archive = tarfile.open(name=path_results, mode='r:*')
    archive.extractall(path=scratch)
    names_npy = archive.getnames()
    archive.close()

    # Load results of interest
    mu = np.load(scratch + '/mu.npy')
    sigmasq = np.load(scratch + '/sigmasq.npy')
    region_ids = np.load(scratch + '/region_ids.npy')

    # Remove burnin
    if n_burnin > 0:
        mu = mu[n_burnin:]
        sigmasq = sigmasq[n_burnin:]

    # Compute posterior means
    mu_postmean = np.mean(mu, 0)
    sigmasq_postmean = np.mean(sigmasq, 0)
    sigma_postmean = np.mean(np.sqrt(sigmasq), 0)

    # Compute posterior medians
    mu_postmed = np.median(mu, 0)
    sigmasq_postmed = np.median(sigmasq, 0)
    sigma_postmed = np.median(np.sqrt(sigmasq), 0)

    # Compute standard errors
    mu_se = np.std(mu, 0)
    sigmasq_se = np.std(sigmasq, 0)
    sigma_se = np.std(np.sqrt(sigmasq), 0)

    # Provide nicely-formatted delimited output for analyses and plotting
    if null:
        pattern_summaries = cfg['mcmc_output']['null_param_pattern']
    else:
        pattern_summaries = cfg['mcmc_output']['param_pattern']
    pattern_summaries = pattern_summaries.strip()
    path_summaries = pattern_summaries.format(**cfg) % chrom

    # Build recarray of summaries, starting with coefficients and diagnostics
    summaries = np.rec.fromarrays([
        region_ids, mu_postmean, mu_postmed, mu_se, sigmasq_postmean,
        sigmasq_postmed, sigmasq_se, sigma_postmean, sigma_postmed, sigma_se
    ],
                                  names=('region_id', 'mu_postmean',
                                         'mu_postmed', 'mu_se',
                                         'sigmasq_postmean', 'sigmasq_postmed',
                                         'sigmasq_se', 'sigma_postmean',
                                         'sigma_postmed', 'sigma_se'))

    # Write summaries to delimited text file
    libio.write_recarray_to_file(fname=path_summaries,
                                 data=summaries,
                                 header=True,
                                 sep=' ')

    # Clean up scratch directory
    for name in names_npy:
        os.remove(scratch + '/' + name)

    return 0
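
For reference, here is a hypothetical minimal cfg covering the keys these four functions read. All names, paths, and values below are illustrative placeholders, not taken from the original package; the null_* pattern keys are only read when null=True and are omitted. Pattern strings mix str.format fields (filled from the top level of cfg) with %-style placeholders (filled with the chromosome index and, for detections, the +/- window size).

cfg = {
    'id': 'experiment01',
    'mcmc_params': {'n_burnin': 100},
    'mcmc_summaries': {
        'path_scratch': '/tmp/scratch',
        'width_local': 147,            # bp in the local comparison window
        'p_detect': 0.99,              # posterior probability threshold
        'bp_per_nucleosome': 200,
        'concentration_pm': '73,147',  # comma-separated +/- window sizes
        'cluster_min_spacing': 147,
        'cluster_bw': 20,
        'cluster_width': 300,
        'q_sparsity': '0.5,0.9',
        'p_threshold': '0.5,0.8',
    },
    'mcmc_output': {
        'out_pattern': 'draws/{id}_mcmc_chrom%02d.tar.bz2',
        'summary_pattern': 'summaries/{id}_summaries_chrom%02d.txt',
        'param_pattern': 'summaries/{id}_params_chrom%02d.txt',
        'cluster_pattern': 'summaries/{id}_clusters_chrom%02d.txt',
        'detections_pattern': 'detections/{id}_chrom%02d_pm%d.txt',
    },
}

status = summarise(cfg, chrom=1)              # writes the summary table
status = detect_from_summaries(cfg, chrom=1)  # detects from that table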