def write_modeled_regions(modeled_clusters, p_cutoff, out_fh):
    """
    Write a region bed file suitable for use in
    :func:`~evaluate_modeled_regions`.

    A header line derived from the row format is written first, followed
    by one row per feature of every cluster. All features of a cluster
    share the cluster's truth label and size.

    Parameters
    ----------
    modeled_clusters : list
        output from :func:`~model_clusters`; each entry is indexed by
        'cluster' (the features) and 'p' (compared against `p_cutoff`)

    p_cutoff : float
        clusters with p strictly below this are written as "true",
        all others as "false"

    out_fh : filehandle
        where to write the data
    """
    row = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(row))

    for modeled in modeled_clusters:
        cluster = modeled['cluster']
        label = 'true' if modeled['p'] < p_cutoff else 'false'
        for feature in cluster:
            # each feature becomes a 1-base-wide interval ending at its
            # position
            line = row.format(chrom=feature.chrom,
                              start=feature.position - 1,
                              end=feature.position,
                              truth=label,
                              size=len(cluster))
            out_fh.write(line)
def write_modeled_regions(modeled_clusters, p_cutoff, out_fh):
    """
    Write a region bed file suitable for use in
    :func:`~evaluate_modeled_regions`.

    Output is a header line (built from the row format) and then one
    tab-delimited row per feature: chrom, start, end, truth, size.

    Parameters
    ----------
    modeled_clusters : list
        output from :func:`~model_clusters`; the 'cluster' and 'p' keys
        of each entry are read

    p_cutoff : float
        values < this are set as true

    out_fh : filehandle
        where to write the data
    """
    template = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    emit = out_fh.write
    emit(ts.fmt2header(template))

    for entry in modeled_clusters:
        members = entry['cluster']
        if entry['p'] < p_cutoff:
            is_true = 'true'
        else:
            is_true = 'false'
        for member in members:
            emit(template.format(
                chrom=member.chrom,
                start=member.position - 1,
                end=member.position,
                truth=is_true,
                size=len(members),
            ))
def write_region_bed(feature_iter, true_regions, out_fh):
    """
    Write a region bed file suitable for use in :func:`~evaluate`,
    given true regions (likely from an external program, otherwise use
    :func:`~write_modeled_regions`).

    Every feature is written as its own row with size 1. Its truth
    column is "true" when the feature's position is found in the true
    regions loaded for its chromosome, and "false" otherwise.

    Parameters
    ----------
    feature_iter : iterable of Features

    true_regions : file
        BED file containing true regions; the first three columns are
        read as chrom, start, end

    out_fh : filehandle
        where to write the data
    """
    row = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(row))

    truth_by_chrom = defaultdict(InterLap)
    for line_no, fields in enumerate(ts.reader(true_regions, header=False)):
        # a first line whose start/end columns are not all digits is
        # taken to be a header and skipped.
        if line_no == 0 and not (fields[1] + fields[2]).isdigit():
            continue
        interval = (int(fields[1]), int(fields[2]))
        truth_by_chrom[fields[0]].add(interval)

    for feature in feature_iter:
        point = (feature.position, feature.position)
        if point in truth_by_chrom[feature.chrom]:
            label = 'true'
        else:
            label = 'false'
        out_fh.write(row.format(chrom=feature.chrom,
                                start=feature.position - 1,
                                end=feature.position,
                                truth=label,
                                size=1))
    out_fh.flush()
def write_region_bed(feature_iter, true_regions, out_fh):
    """
    Write a region bed file suitable for use in :func:`~evaluate`,
    given true regions (likely from an external program, otherwise use
    :func:`~write_modeled_regions`).

    The output has a header line followed by one row per feature
    (chrom, start, end, truth, size) where size is always 1.

    Parameters
    ----------
    feature_iter : iterable of Features

    true_regions : file
        BED file containing true regions

    out_fh : filehandle
        where to write the data; it is flushed once all features have
        been written
    """
    template = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(template))

    known = defaultdict(InterLap)
    rows = ts.reader(true_regions, header=False)
    for idx, cols in enumerate(rows):
        looks_numeric = (cols[1] + cols[2]).isdigit()
        # only the very first line may be a header.
        if idx == 0 and not looks_numeric:
            continue
        name, lo, hi = cols[0], int(cols[1]), int(cols[2])
        known[name].add((lo, hi))

    for feat in feature_iter:
        hit = (feat.position, feat.position) in known[feat.chrom]
        out_fh.write(template.format(
            chrom=feat.chrom,
            start=feat.position - 1,
            end=feat.position,
            truth='true' if hit else 'false',
            size=1,
        ))
    out_fh.flush()
# Model specification: `coef` names the term of `formula` that is reported.
formula = 'methylation ~ age + gender'
coef = 'gender'

covs = pd.read_csv(covariates_file)


def feature_gen(fname):
    """Yield one crystal.Feature per data row of `fname`.

    The first row is skipped. In every other row the first column is
    split on ":" into chrom and position, and the remaining columns are
    converted to floats and passed through crystal.logit.
    """
    for i, d in enumerate(ts.reader(fname, header=False)):
        if i == 0:
            continue
        chrom, pos = d[0].split(":")
        # Build a real list before handing it to numpy: on Python 3 `map`
        # is lazy and np.array(map(...)) yields a 0-d object array instead
        # of a float vector.
        values = np.array([float(v) for v in d[1:]])
        yield crystal.Feature(chrom, int(pos), crystal.logit(values))


cluster_iter = mclust(feature_gen(methylation_file),
                      max_dist=100, max_skip=0)

fmt = "{chrom}\t{start}\t{end}\t{p:.4g}\t{coef:.3f}\t{n_sites:d}"
print(ts.fmt2header(fmt))

for c in crystal.model_clusters(cluster_iter, covs, formula, coef,
                                model_fn=crystal.zscore_cluster, n_cpu=1):
    print(fmt.format(**c))
    # Plot only the first cluster that passes all three thresholds,
    # then stop.
    if c['p'] < 1e-3 and abs(c['coef']) > 0.2 and c['n_sites'] > 3:
        crystal.plot.spaghetti_plot(c, covs)
        plt.savefig('/tmp/figure-1.eps')
        break
def simulate_regions(clust_list, region_fh, sizes=SIZES, class_order=None,
                     seed=42, get_reduced_residuals=None,
                     get_reduced_residuals_args=()):
    """Simulate regions and randomize others.

    Parameters
    ----------

    clust_list : list of clusters
        should include clusters of length 1. Must be a real list because
        it is iterated more than once.

    region_fh : filehandle
        a BED file of all positions will be written to this file. The 4th
        column will be true/false indicating if the position was simulated
        to have a difference. The fifth column will indicate the size of
        the cluster it was in.

    sizes : dict
        keys of the cluster size and values of how many clusters to
        create of that size. Default is to create 100 of each size from
        3 to 8 and 200 clusters of size one and 2. All others are
        randomized. If fewer clusters of a size exist than requested, all
        of them are used; sizes with no clusters at all are ignored.

    class_order : np.array
        same length as cluster[i].values, indicating which group each
        sample belongs to. Must contain exactly 2 unique values; they are
        recoded to 0 and 1 in the order returned by np.unique.

    seed : int
        passed to np.random.seed (this seeds numpy's global generator).

    get_reduced_residuals : function
        If this parameter is None, then the values are shuffled as they
        are received. Otherwise, a function that accepts a cluster and
        returns residuals of the reduced model. e.g. if the full model of
        interest is:

            methylation ~ disease + age + gender

        the reduced model would be:

            methylation ~ age + gender

        so that only the residuals of the reduced model are shuffled and
        the other effects should remain. This will implement the
        bootstrap for linear models from Efron and Tibshirani. An example
        function would be: :func:`~rr_cluster`

    get_reduced_residuals_args : tuple
        forwarded unchanged to :func:`simulate_cluster` together with
        `get_reduced_residuals`.

    Returns
    -------

    generator of clusters in the same order as clust_list. Once it is
    exhausted, per-size counts are written to stderr and `region_fh` is
    flushed.
    """
    np.random.seed(seed)
    from math import log

    assert isinstance(clust_list, list), \
        "need a list due to multiple iterations"

    if class_order is not None:
        class_order = np.array(class_order)
        classes = np.unique(class_order)
        assert len(classes) == 2, (classes, "should have 2 unique")
        classes = {classes[0]: 0, classes[1]: 1}
        class_order = np.array([classes[c] for c in class_order])

    # group the clusters by length so we know how many of each size exist.
    clusts = defaultdict(list)
    for clust in clust_list:
        clusts[len(clust)].append(clust)
    clusts = dict(clusts)

    sim_idxs = {}
    # for each size of clust, choose n random indices based on how
    # many of that cluster we saw.
    for size, n in sizes.items():
        if size not in clusts:
            # nothing of this size in the data, so nothing to simulate;
            # previously this raised KeyError.
            continue
        idxs = np.arange(len(clusts[size]))
        # get the indexes of the clusters we want
        sim_idxs[size] = frozenset(np.random.choice(idxs,
                                                    size=min(n, len(idxs)),
                                                    replace=False))
    del clusts

    fmt = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    region_fh.write(ts.fmt2header(fmt))

    seen = defaultdict(int)     # clusters encountered so far, per size
    changed = defaultdict(int)  # clusters given a simulated effect, per size
    for c in clust_list:
        size = len(c)
        w = 0
        simulated = False
        # need this if block in case we get a cluster longer
        # than we have in sizes
        if size in sim_idxs:
            # the n-th cluster of this size is simulated when n was drawn
            # above.
            simulated = seen[size] in sim_idxs[size]
            # denominator sets larger DMRs to have a smaller per-probe
            # effect.
            w = 2 * int(simulated) / log(size * 2)
            seen[size] += 1
        if simulated:
            changed[size] += 1

        truth = "true" if simulated else "false"
        for f in c:
            region_fh.write(fmt.format(chrom=f.chrom,
                                       start=f.position - 1,
                                       end=f.position,
                                       truth=truth,
                                       size=size))

        yield simulate_cluster(c, w, class_order, get_reduced_residuals,
                               get_reduced_residuals_args)

    sys.stderr.write("changed:" + str(dict(changed)) + "\n")
    sys.stderr.write("total:" + str(dict(seen)) + "\n")
    region_fh.flush()
# Input files, relative to the directory this script is run from.
covariates_file = "../../../crystal/tests/covs.csv"
methylation_file = "../../../crystal/tests/meth.txt.gz"

# Model specification: `coef` names the term of `formula` that is reported.
formula = "methylation ~ age + gender"
coef = "gender"

covs = pd.read_csv(covariates_file)


def feature_gen(fname):
    """Yield one crystal.Feature per data row of `fname`.

    The first row is skipped. In every other row the first column is
    split on ":" into chrom and position, and the remaining columns are
    converted to floats and passed through crystal.logit.
    """
    for i, d in enumerate(ts.reader(fname, header=False)):
        if i == 0:
            continue
        chrom, pos = d[0].split(":")
        # Build a real list before handing it to numpy: on Python 3 `map`
        # is lazy and np.array(map(...)) yields a 0-d object array instead
        # of a float vector.
        values = np.array([float(v) for v in d[1:]])
        yield crystal.Feature(chrom, int(pos), crystal.logit(values))


cluster_iter = mclust(feature_gen(methylation_file),
                      max_dist=100, max_skip=0)

fmt = "{chrom}\t{start}\t{end}\t{p:.4g}\t{coef:.3f}\t{n_sites:d}"
print(ts.fmt2header(fmt))

for c in crystal.model_clusters(cluster_iter, covs, formula, coef,
                                model_fn=crystal.zscore_cluster, n_cpu=1):
    print(fmt.format(**c))
    # Plot only the first cluster that passes all three thresholds,
    # then stop.
    if c["p"] < 1e-3 and abs(c["coef"]) > 0.2 and c["n_sites"] > 3:
        crystal.plot.spaghetti_plot(c, covs)
        plt.savefig("/tmp/figure-1.eps")
        break