Exemple #1
0
def write_modeled_regions(modeled_clusters, p_cutoff, out_fh):
    """
    Emit a per-site region bed file that can be fed to
    :func:`~evaluate_modeled_regions`.

    A header derived from the line format is written first, then one line
    for every site of every cluster.

    Parameters
    ----------

    modeled_clusters : list
        output from :func:`~model_clusters`; each item is read through its
        'cluster' and 'p' keys.

    p_cutoff : float
        clusters whose p is strictly below this are labelled 'true',
        every other cluster is labelled 'false'

    out_fh : filehandle
        where to write the data
    """
    line_template = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(line_template))
    for modeled in modeled_clusters:
        sites = modeled['cluster']
        # the label is decided once per cluster and shared by all its sites
        label = 'true' if modeled['p'] < p_cutoff else 'false'
        for site in sites:
            row = line_template.format(
                chrom=site.chrom,
                start=site.position - 1,
                end=site.position,
                truth=label,
                size=len(sites),
            )
            out_fh.write(row)
Exemple #2
0
def write_modeled_regions(modeled_clusters, p_cutoff, out_fh):
    """
    Emit a per-site region bed file that can be fed to
    :func:`~evaluate_modeled_regions`.

    A header derived from the line format is written first, then one line
    for every site of every cluster.

    Parameters
    ----------

    modeled_clusters : list
        output from :func:`~model_clusters`; each item is read through its
        'cluster' and 'p' keys.

    p_cutoff : float
        clusters whose p is strictly below this are labelled 'true',
        every other cluster is labelled 'false'

    out_fh : filehandle
        where to write the data
    """
    line_template = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(line_template))
    for modeled in modeled_clusters:
        sites = modeled['cluster']
        # the label is decided once per cluster and shared by all its sites
        label = 'true' if modeled['p'] < p_cutoff else 'false'
        for site in sites:
            row = line_template.format(
                chrom=site.chrom,
                start=site.position - 1,
                end=site.position,
                truth=label,
                size=len(sites),
            )
            out_fh.write(row)
Exemple #3
0
def write_region_bed(feature_iter, true_regions, out_fh):
    """
    Label every feature as inside/outside a set of known true regions and
    write the result as a region bed file suitable for :func:`~evaluate`.

    Use this when the true regions come from an external program;
    otherwise see :func:`~write_modeled_regions`.

    Parameters
    ----------

    feature_iter : iterable of Features
        each feature is read through its `chrom` and `position` attributes

    true_regions : file
        BED file containing true regions. A first row whose start/end
        columns are not purely digits is treated as a header and skipped.

    out_fh : filehandle
        where to write the data; flushed once all features are written
    """
    template = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(template))

    # one interval container per chromosome
    by_chrom = defaultdict(InterLap)

    for row_num, fields in enumerate(ts.reader(true_regions, header=False)):
        if row_num == 0:
            # header sniffing: only the very first row is inspected
            if not (fields[1] + fields[2]).isdigit():
                continue
        chrom = fields[0]
        interval = (int(fields[1]), int(fields[2]))
        by_chrom[chrom].add(interval)

    for feat in feature_iter:
        point = (feat.position, feat.position)
        # NOTE(review): membership is delegated to InterLap's `in`; whether
        # the region start is treated as inclusive for a 1-based position
        # should be confirmed against InterLap's interval semantics.
        if point in by_chrom[feat.chrom]:
            label = 'true'
        else:
            label = 'false'
        out_fh.write(template.format(chrom=feat.chrom,
                                     start=feat.position - 1,
                                     end=feat.position,
                                     truth=label,
                                     size=1))
    out_fh.flush()
Exemple #4
0
def write_region_bed(feature_iter, true_regions, out_fh):
    """
    Label every feature as inside/outside a set of known true regions and
    write the result as a region bed file suitable for :func:`~evaluate`.

    Use this when the true regions come from an external program;
    otherwise see :func:`~write_modeled_regions`.

    Parameters
    ----------

    feature_iter : iterable of Features
        each feature is read through its `chrom` and `position` attributes

    true_regions : file
        BED file containing true regions. A first row whose start/end
        columns are not purely digits is treated as a header and skipped.

    out_fh : filehandle
        where to write the data; flushed once all features are written
    """
    template = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(template))

    # one interval container per chromosome
    by_chrom = defaultdict(InterLap)

    for row_num, fields in enumerate(ts.reader(true_regions, header=False)):
        if row_num == 0:
            # header sniffing: only the very first row is inspected
            if not (fields[1] + fields[2]).isdigit():
                continue
        chrom = fields[0]
        interval = (int(fields[1]), int(fields[2]))
        by_chrom[chrom].add(interval)

    for feat in feature_iter:
        point = (feat.position, feat.position)
        # NOTE(review): membership is delegated to InterLap's `in`; whether
        # the region start is treated as inclusive for a 1-based position
        # should be confirmed against InterLap's interval semantics.
        if point in by_chrom[feat.chrom]:
            label = 'true'
        else:
            label = 'false'
        out_fh.write(template.format(chrom=feat.chrom,
                                     start=feat.position - 1,
                                     end=feat.position,
                                     truth=label,
                                     size=1))
    out_fh.flush()
Exemple #5
0
# Model formula string handed to crystal.model_clusters further down.
formula = 'methylation ~ age + gender'
# Name passed as the `coef` argument of crystal.model_clusters further down.
coef = 'gender'

# NOTE(review): `covariates_file` is not defined in the visible part of this
# snippet; it has to be set before this line runs.
covs = pd.read_csv(covariates_file)


def feature_gen(fname):
    """Yield one crystal.Feature per data row of `fname`.

    The first row is skipped as a header. For every other row the first
    column is split on ':' into chromosome and position, and the remaining
    columns are converted to float and passed through crystal.logit.

    Parameters
    ----------

    fname : str
        path handed to ts.reader
    """
    for i, d in enumerate(ts.reader(fname, header=False)):
        if i == 0:
            continue
        chrom, pos = d[0].split(":")
        position = int(pos)
        # Build a concrete list: on Python 3 `map` is lazy and
        # np.array(map(...)) would wrap the iterator itself in a 0-d object
        # array instead of producing a float vector.
        values = np.array([float(v) for v in d[1:]])
        yield crystal.Feature(chrom, position, crystal.logit(values))


# Cluster the streamed features (max_dist=100, max_skip=0).
cluster_iter = mclust(feature_gen(methylation_file), max_dist=100, max_skip=0)

fmt = "{chrom}\t{start}\t{end}\t{p:.4g}\t{coef:.3f}\t{n_sites:d}"
print(ts.fmt2header(fmt))

# Model every cluster on a single CPU with the z-score model.
modeled = crystal.model_clusters(cluster_iter, covs, formula, coef,
                                 model_fn=crystal.zscore_cluster, n_cpu=1)
for i, c in enumerate(modeled):
    print(fmt.format(**c))
    # Keep printing until a cluster passes all three thresholds; that first
    # hit is plotted, saved, and ends the run.
    if not (c['p'] < 1e-3):
        continue
    if not (abs(c['coef']) > 0.2):
        continue
    if not (c['n_sites'] > 3):
        continue
    crystal.plot.spaghetti_plot(c, covs)
    plt.savefig('/tmp/figure-1.eps')
    break
Exemple #6
0
def simulate_regions(clust_list, region_fh, sizes=SIZES, class_order=None,
        seed=42, get_reduced_residuals=None, get_reduced_residuals_args=()):
    """Simulate regions and randomize others.

    This is a generator: nothing is seeded, written or simulated until it
    is iterated. The region file is flushed and a summary is written to
    stderr only once every cluster has been consumed.

    Parameters
    ----------

    clust_list : list of clusters
        should include clusters of length 1. Must be a real list because
        it is iterated more than once.

    region_fh : filehandle
        a BED file of all positions will be written to this
        file. The 4th column will be true/false indicating
        if it was simulated to have a difference. The fifth
        column will indicate the size of the cluster it was in.

    sizes : dict
        keys of the clust_size and values of how many clusters
        to create of that size. Default is to create 100 of each
        size from 3 to 8 and 200 clusters of size one and 2. All
        others are randomized. If fewer clusters of a size exist than
        requested, all of them are used; a size that never occurs in
        `clust_list` is ignored.

    class_order : np.array
        same length as cluster[i].values indicating
        which group each sample belongs to. Must contain exactly 2 unique
        values; they are recoded to 0 and 1 (in np.unique order) before
        being handed to simulate_cluster.

    seed : int
        passed to np.random.seed, i.e. this reseeds numpy's global RNG.

    get_reduced_residuals : function
        If this parameter is None, then the values are shuffled as they
        are received.
        A function that accepts a cluster and returns residuals of the reduced
        model. e.g. if the full model of interest is:
            methylation ~ disease + age + gender
        the reduced model would be:
            methylation ~ age + gender
        so that only the residuals of the reduced model are shuffled and the
        other effects should remain. This will implement the bootstrap for
        linear models from Efron and Tibshirani.
        An Example function would be: :func:`~rr_cluster`

    get_reduced_residuals_args : tuple
        passed through unchanged to simulate_cluster.

    Returns
    -------

    generator of clusters in the same order as clust_list.

    """
    np.random.seed(seed)
    from math import log
    assert isinstance(clust_list, list), (
        "need a list due to multiple iterations")

    if class_order is not None:
        class_order = np.array(class_order)
        classes = np.unique(class_order)
        assert len(classes) == 2, (classes, "should have 2 unique")
        classes = {classes[0]: 0, classes[1]: 1}
        class_order = np.array([classes[c] for c in class_order])

    # only the number of clusters per size is needed to draw indexes
    n_by_size = defaultdict(int)
    for clust in clust_list:
        n_by_size[len(clust)] += 1

    sim_idxs = {}
    # for each size of clust, choose n random indices based on how
    # many of that cluster we saw.
    for size, n in sizes.items():
        n_avail = n_by_size.get(size, 0)
        if n_avail == 0:
            # requested size is absent from the input: nothing to pick and
            # no random numbers are drawn for it.
            sim_idxs[size] = frozenset()
            continue
        idxs = np.arange(n_avail)
        # get the indexes of the clusters we want
        sim_idxs[size] = frozenset(
            np.random.choice(idxs, size=min(n, n_avail), replace=False))

    fmt = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    region_fh.write(ts.fmt2header(fmt))

    seen = defaultdict(int)
    changed = defaultdict(int)
    for c in clust_list:
        l = len(c)
        w = 0
        truth = False
        # need this if block in case we get a cluster longer
        # than we have in sizes
        if l in sim_idxs:
            # position of this cluster among the clusters of its size
            s = seen[l]
            truth = s in sim_idxs[l]
            # denominator sets larger DMRs to have a smaller per-probe effect.
            w = 2 * int(truth) / log(l * 2)

            seen[l] += 1
            if w > 0:
                changed[l] += 1

        for f in c:
            region_fh.write(fmt.format(chrom=f.chrom,
                                       start=f.position - 1,
                                       end=f.position,
                                       truth="true" if truth else "false",
                                       size=len(c)))
        yield simulate_cluster(c, w, class_order, get_reduced_residuals,
                get_reduced_residuals_args)
    sys.stderr.write("changed:" + str(dict(changed)) + "\n")
    sys.stderr.write("total:" + str(dict(seen)) + "\n")
    region_fh.flush()
Exemple #7
0
# Input paths, relative to the current working directory.
covariates_file = "../../../crystal/tests/covs.csv"
methylation_file = "../../../crystal/tests/meth.txt.gz"
# Model formula string handed to crystal.model_clusters further down.
formula = "methylation ~ age + gender"
# Name passed as the `coef` argument of crystal.model_clusters further down.
coef = "gender"

# Covariates table, read from the CSV path set above.
covs = pd.read_csv(covariates_file)


def feature_gen(fname):
    """Yield one crystal.Feature per data row of `fname`.

    The first row is skipped as a header. For every other row the first
    column is split on ':' into chromosome and position, and the remaining
    columns are converted to float and passed through crystal.logit.

    Parameters
    ----------

    fname : str
        path handed to ts.reader
    """
    for i, d in enumerate(ts.reader(fname, header=False)):
        if i == 0:
            continue
        chrom, pos = d[0].split(":")
        position = int(pos)
        # Build a concrete list: on Python 3 `map` is lazy and
        # np.array(map(...)) would wrap the iterator itself in a 0-d object
        # array instead of producing a float vector.
        values = np.array([float(v) for v in d[1:]])
        yield crystal.Feature(chrom, position, crystal.logit(values))


# Cluster the streamed features (max_dist=100, max_skip=0).
cluster_iter = mclust(feature_gen(methylation_file), max_dist=100, max_skip=0)


fmt = "{chrom}\t{start}\t{end}\t{p:.4g}\t{coef:.3f}\t{n_sites:d}"
print(ts.fmt2header(fmt))

# Model every cluster on a single CPU with the z-score model.
modeled = crystal.model_clusters(
    cluster_iter, covs, formula, coef, model_fn=crystal.zscore_cluster, n_cpu=1
)
for i, c in enumerate(modeled):
    print(fmt.format(**c))
    # Keep printing until a cluster passes all three thresholds; that first
    # hit is plotted, saved, and ends the run.
    if not (c["p"] < 1e-3):
        continue
    if not (abs(c["coef"]) > 0.2):
        continue
    if not (c["n_sites"] > 3):
        continue
    crystal.plot.spaghetti_plot(c, covs)
    plt.savefig("/tmp/figure-1.eps")
    break