def _get_lengths(lengths):
    """Load chromosome lengths from file, or reformat lengths object."""
    if isinstance(lengths, str) and os.path.exists(lengths):
        lengths = load_lengths(lengths)
    elif lengths is not None and (isinstance(lengths, list)
                                  or isinstance(lengths, np.ndarray)):
        if len(lengths) == 1 and isinstance(
                lengths[0], str) and os.path.exists(lengths[0]):
            lengths = load_lengths(lengths[0])
    lengths = np.array(lengths).astype(int)
    return lengths
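# A minimal usage sketch for _get_lengths (file name hypothetical): it accepts
# a path to a BED file, a one-element list holding such a path, or an
# array-like of chromosome lengths, and always returns an integer ndarray.
# lengths = _get_lengths("data/sample.bed")
# lengths = _get_lengths([60, 40])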
def run_nmds(directory):
    if os.path.exists(os.path.join(directory, "config.ini")):
        config_file = os.path.join(directory, "config.ini")
    else:
        config_file = None

    options = parse(config_file)
    random_state = np.random.RandomState(seed=options["seed"])

    # First, load the lengths and counts
    if options["lengths"].endswith(".bed"):
        lengths = load_lengths(os.path.join(directory, options["lengths"]))
    else:
        lengths = None

    if options["counts"].endswith("npy"):
        counts = np.load(os.path.join(directory, options["counts"]))
    elif options["counts"].endswith(".matrix"):
        counts = load_counts(os.path.join(directory, options["counts"]),
                             lengths=lengths)

    if options["normalize"]:
        counts = iced.filter.filter_low_counts(counts, sparsity=False,
                                               percentage=0.04)
        counts = iced.normalization.ICE_normalization(counts, max_iter=300)

    if not sparse.issparse(counts):
        counts = sparse.coo_matrix(counts)
    else:
        counts = counts.tocsr()
        counts.eliminate_zeros()
        counts = counts.tocoo()

    torm = np.array((counts.sum(axis=0) == 0)).flatten()

    nmds = NMDS(alpha=options["alpha"], beta=options["beta"],
                random_state=random_state, max_iter=options["max_iter"],
                verbose=options["verbose"])
    X = nmds.fit(counts)
    X[torm] = np.nan

    np.savetxt(os.path.join(directory, "NMDS." + options["output_name"]), X)

    # PDB file
    pdbfilename = os.path.join(directory,
                               "NMDS." + options["output_name"] + ".pdb")
    writePDB(X, pdbfilename)
    return True
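# A minimal usage sketch, assuming `directory` contains a config.ini plus the
# counts and lengths files it references (the path below is hypothetical):
# run_nmds("results/yeast_10000")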
def load_sample_yeast():
    """Load and return a sample S. cerevisiae contact count matrix.

    The data are from Duan et al., Nature, 2009.

    Returns
    -------
    counts, lengths : tuple of two elements, the first a contact count
        matrix, the second an ndarray containing the lengths of the
        chromosomes.
    """
    module_path = dirname(__file__)
    lengths = io.load_lengths(
        os.path.join(module_path, "data/duan2009/duan.SC.10000.raw_sub.bed"))
    counts = io.load_counts(
        os.path.join(module_path,
                     "data/duan2009/duan.SC.10000.raw_sub.matrix"),
        lengths=lengths)
    counts = counts.toarray()
    counts = counts.T + counts
    return counts, lengths
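# Usage sketch: the returned count matrix is dense and symmetric, with one
# row/column per 10 kb bin.
# counts, lengths = load_sample_yeast()
# assert counts.shape == (lengths.sum(), lengths.sum())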
from __future__ import print_function

from glob import glob

import numpy as np

from iced import io
from iced import utils


filenames = glob("data/ay2013/*10000_raw.matrix") + \
    glob("data/lemieux2013/25kb/*.matrix")
filenames.sort()

for filename in filenames:
    lengths = io.load_lengths(filename.replace(".matrix", ".bed"))
    counts = io.load_counts(filename, lengths=lengths)
    counts = counts.toarray()
    counts = counts.T + counts
    mask = utils.get_intra_mask(lengths)

    # Just making sure there is no interaction counted in the diagonal
    counts[np.diag_indices_from(counts)] = 0

    print(filename)
    print("Total number of counts", counts.sum())
    print("% of intra", counts[mask].sum() / counts.sum() * 100)
    print("% of inter", counts[np.invert(mask)].sum() / counts.sum() * 100)
    print()
def run_pm2(directory):
    if os.path.exists(os.path.join(directory, "config.ini")):
        config_file = os.path.join(directory, "config.ini")
    else:
        config_file = None

    options = parse(config_file)
    random_state = np.random.RandomState(seed=options["seed"])

    if options["lengths"].endswith(".bed"):
        lengths = load_lengths(os.path.join(directory, options["lengths"]))
    else:
        lengths = None

    if options["counts"].endswith("npy"):
        counts = np.load(os.path.join(directory, options["counts"]))
        counts[np.arange(len(counts)), np.arange(len(counts))] = 0
    elif options["counts"].endswith(".matrix"):
        counts = load_counts(os.path.join(directory, options["counts"]),
                             lengths=lengths)

    if options["normalize"]:
        counts = iced.filter.filter_low_counts(counts, sparsity=False,
                                               percentage=0.04)
        _, bias = iced.normalization.ICE_normalization(
            counts, max_iter=300, output_bias=True)
    else:
        bias = None

    if not sparse.issparse(counts):
        counts[np.isnan(counts)] = 0
        counts = sparse.coo_matrix(counts)
    else:
        counts = counts.tocsr()
        counts.eliminate_zeros()
        counts = counts.tocoo()

    pm2 = PM2(alpha=options["alpha"], beta=options["beta"],
              random_state=random_state, max_iter=options["max_iter"],
              bias=bias, verbose=options["verbose"])
    X = pm2.fit(counts)

    torm = np.array(
        ((counts + counts.transpose()).sum(axis=0) == 0)).flatten()
    X[torm] = np.nan

    np.savetxt(os.path.join(directory, "PM2." + options["output_name"]), X)

    # PDB file
    pdbfilename = os.path.join(directory,
                               "PM2." + options["output_name"] + ".pdb")
    writePDB(X, pdbfilename)
    return True
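# A minimal usage sketch (directory hypothetical). Unlike run_nmds above,
# run_pm2 keeps the raw counts and passes the ICE bias vector into the model
# rather than normalizing the counts themselves:
# run_pm2("results/yeast_10000")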
import argparse
import os
import sys
from glob import glob

import numpy as np
from sklearn.metrics import euclidean_distances

from iced.utils import downsample_resolution
from iced.io import load_lengths


parser = argparse.ArgumentParser()
parser.add_argument("directory")
parser.add_argument("--lengths", "-l")
parser.add_argument("--factor", type=int, default=10)
args = parser.parse_args()

factor = args.factor

if args.lengths is not None:
    lengths = load_lengths(args.lengths)
else:
    lengths = load_lengths(
        os.path.join(args.directory.replace("results", "data"), "raw.bed"))

distances = []
filenames = glob(args.directory + "*_structure.txt")
filenames.sort()

for i, filename in enumerate(filenames):
    sys.stdout.write("\rAnalysing %0.2f %% of files"
                     % (100. * (i + 1) / len(filenames)))
    sys.stdout.flush()
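# Example invocation (script and file names hypothetical); the script expects
# *_structure.txt files inside the given results directory:
#   python analyse_structures.py results/ay2013/ \
#       --lengths data/ay2013/raw.bed --factor 10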
import argparse

import numpy as np
from scipy import sparse

from iced import io
from iced import utils


parser = argparse.ArgumentParser()
parser.add_argument("filename")
parser.add_argument("--outname", "-o")
args = parser.parse_args()

filename = args.filename
lengths = io.load_lengths("data/ay2013/rings_10000_raw.bed")
counts = io.load_counts(filename, lengths=lengths)
counts = counts.toarray()
counts = counts.T + counts

new_counts, new_lengths = utils.downsample_resolution(counts, lengths)
new_counts = sparse.coo_matrix(np.triu(new_counts))

io.write_counts(args.outname, new_counts)
io.write_lengths(args.outname.replace(".matrix", ".bed"), new_lengths)
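# Example invocation (file names hypothetical); the output .bed name is
# derived from --outname by the script itself:
#   python downsample.py data/ay2013/rings_10000_raw.matrix \
#       -o results/rings_20000_raw.matrix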
import argparse

import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.externals.joblib import Memory

from iced import io
import utils
from statistics import compute_mepd, compute_witten_and_noble


mem = Memory(".joblib")

fig, axes = plt.subplots(nrows=4)

filename = "structures/lemieux2013/25kb/B15C2_combined_raw_PO_01_structure.txt"
lengths = io.load_lengths("data/lemieux2013/25kb/B15C2_combined_raw.bed")
# filename = "structures/ay2013/trophozoites_10000_raw_PO_01_structure.txt"
# lengths = io.load_lengths("data/ay2013/trophozoites_10000_raw.bed")

X = np.loadtxt(filename)
centromeres = np.loadtxt("files/pf.cent")
chr_ = np.arange(len(centromeres))

# Infer the resolution from the file name
if "25kb" in filename:
    resolution = 25000
elif "20000" in filename:
    resolution = 20000
else:
    resolution = 10000

# Convert centromere positions (in bp) to bead indices
centromeres = np.round(centromeres.mean(axis=1) / resolution)
centromeres = centromeres.astype(int)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

from iced import io
from iced import filter
from iced import normalization
from iced import utils


lengths = io.load_lengths("data/trophozoites_10000_raw.bed")
counts = io.load_counts("data/trophozoites_10000_raw.matrix", lengths=lengths)
counts = utils.from_sparse_to_dense(counts)

normed = filter.filter_low_counts(counts, remove_all_zeros_loci=True,
                                  sparsity=False)
normed = normalization.ICE_normalization(normed)
normed, l = utils.extract_sub_contact_map(normed, lengths, [6, 7])

# Mask rows and columns that contain no counts
to_rm = normed.sum(axis=0) == 0
normed[to_rm] = np.nan
normed[:, to_rm] = np.nan

fig, ax = plt.subplots()
m = ax.matshow(np.log(normed + 1), cmap="RdYlBu_r", vmax=5)
ax.set_xticks([])
ax.set_yticks([])

# Draw the chromosome boundaries
l = np.concatenate([[0], l])
[ax.axhline(i, color="0", linestyle="--") for i in l.cumsum()]
[ax.axvline(i, color="0", linestyle="--") for i in l.cumsum()]
import sys

import numpy as np
from scipy import sparse

from iced import io

# `args` and `filename` come from the argparse setup earlier in this script.

# Handle the deprecated --filtering_perc option; use the print() function so
# the notice also works on recent versions of Python.
if args.filtering_perc is not None:
    print("--filtering_perc is deprecated. Please use filter_low_counts_perc")
    print("instead. This option will be removed in ice 0.3")
    filter_low_counts_perc = args.filtering_perc

if "--filter_low_counts_perc" in sys.argv and "--filtering_perc" in sys.argv:
    raise Warning("These two options are incompatible")
if args.filtering_perc is None and "--filter_low_counts_perc" not in sys.argv:
    filter_low_counts_perc = 0.02
elif args.filter_low_counts_perc is not None:
    filter_low_counts_perc = args.filter_low_counts_perc

if args.verbose:
    print("Loading files...")

if args.bed_file:
    lengths = io.load_lengths(args.bed_file)
else:
    lengths = None

# Loads file as columns of i, j, counts
if lengths is None:
    i, j, data = np.loadtxt(filename).T
    # np.loadtxt returns floats; cast the indices before building the matrix
    i, j = i.astype(int), j.astype(int)
    N = max(i.max(), j.max()) + 1
    counts = sparse.coo_matrix((data, (i, j)), shape=(N, N), dtype=float)
else:
    counts = io.load_counts(filename, lengths=lengths)

if args.dense:
    counts = np.array(counts.todense())
else:
    counts = sparse.csr_matrix(counts)
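# Example invocation of the ice command-line script this excerpt belongs to
# (file name hypothetical; the flags are those parsed above):
#   ice data/counts.matrix --filter_low_counts_perc 0.02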
def increase_struct_res(struct, multiscale_factor, lengths, mask=None):
    """Linearly interpolate structure to increase resolution.

    Increase resolution of structure via linear interpolation between beads.

    Parameters
    ----------
    struct : array of float
        3D chromatin structure at low resolution.
    multiscale_factor : int
        Factor by which to increase the resolution. A value of 2 doubles the
        resolution. A value of 1 does not change the resolution.
    lengths : array_like of int
        Number of beads per homolog of each chromosome at high resolution
        (the desired resolution of the output structure).
    mask : array_like of bool, optional
        Mask indicating which high-resolution beads are known.

    Returns
    -------
    struct_highres : array of float
        3D chromatin structure that has been linearly interpolated to the
        specified high resolution.
    """
    if int(multiscale_factor) != multiscale_factor:
        raise ValueError('The multiscale_factor must be an integer')
    multiscale_factor = int(multiscale_factor)
    if multiscale_factor == 1:
        return struct

    if isinstance(struct, str):
        struct = np.loadtxt(struct)
    struct = struct.reshape(-1, 3)
    if isinstance(lengths, str):
        lengths = load_lengths(lengths)
    lengths = np.array(lengths).astype(int)
    lengths_lowres = decrease_lengths_res(lengths, multiscale_factor)

    ploidy = struct.shape[0] / lengths_lowres.sum()
    if ploidy != 1 and ploidy != 2:
        raise ValueError(
            "Not consistent with haploid or diploid... struct is"
            " %d beads (and 3 cols), sum of lengths is %d"
            % (struct.reshape(-1, 3).shape[0], lengths_lowres.sum()))
    ploidy = int(ploidy)

    indices = _get_struct_indices(
        ploidy, multiscale_factor, lengths).reshape(multiscale_factor, -1)
    if mask is not None:
        indices[~mask.reshape(multiscale_factor, -1)] = np.nan

    struct_highres = np.full((lengths.sum() * ploidy, 3), np.nan)
    begin_lowres, end_lowres = 0, 0
    for i in range(lengths.shape[0] * ploidy):
        end_lowres += np.tile(lengths_lowres, ploidy)[i]

        # Beads of struct that are NaN
        struct_nan = np.isnan(struct[begin_lowres:end_lowres, 0])

        # Get indices for this chrom at low & high res
        chrom_indices = indices[:, begin_lowres:end_lowres]
        chrom_indices[:, struct_nan] = np.nan
        chrom_indices_lowres = np.nanmean(chrom_indices, axis=0)
        chrom_indices_highres = chrom_indices.T.flatten()

        # Note which beads are unknown
        highres_mask = ~np.isnan(chrom_indices_highres)
        highres_mask[highres_mask] = (
            chrom_indices_highres[highres_mask]
            >= np.nanmin(chrom_indices_lowres)) & (
            chrom_indices_highres[highres_mask]
            <= np.nanmax(chrom_indices_lowres))
        unknown_beads = np.where(
            ~highres_mask)[0] + np.tile(lengths, ploidy)[:i].sum()
        unknown_beads = unknown_beads[
            unknown_beads < np.tile(lengths, ploidy)[:i + 1].sum()]
        unknown_beads_at_begin = [
            unknown_beads[k] for k in range(len(unknown_beads))
            if unknown_beads[k] == unknown_beads.min() or all([
                unknown_beads[k] - j == unknown_beads[k - j]
                for j in range(k + 1)])]
        if len(unknown_beads) - len(unknown_beads_at_begin) > 0:
            unknown_beads_at_end = [
                unknown_beads[k] for k in range(len(unknown_beads))
                if unknown_beads[k] == unknown_beads.max() or all([
                    unknown_beads[k] + j == unknown_beads[k + j]
                    for j in range(len(unknown_beads) - k)])]
            chrom_indices_highres = np.arange(
                max(unknown_beads_at_begin) + 1, min(unknown_beads_at_end))
        else:
            unknown_beads_at_end = []
            chrom_indices_highres = np.arange(
                max(unknown_beads_at_begin) + 1,
                int(np.nanmax(chrom_indices_highres)) + 1)

        # Interpolate each coordinate between the known low-res beads
        for col in range(3):
            struct_highres[chrom_indices_highres, col] = interp1d(
                chrom_indices_lowres[~struct_nan],
                struct[begin_lowres:end_lowres, col][~struct_nan],
                kind="linear")(chrom_indices_highres)

        # Fill in beads at start (extrapolate from the first two known beads)
        diff_beads_at_chr_start = struct_highres[
            chrom_indices_highres[1], :] - struct_highres[
            chrom_indices_highres[0], :]
        how_far = 1
        for j in reversed(unknown_beads_at_begin):
            struct_highres[j, :] = struct_highres[
                chrom_indices_highres[0], :] \
                - diff_beads_at_chr_start * how_far
            how_far += 1

        # Fill in beads at end (extrapolate from the last two known beads)
        diff_beads_at_chr_end = struct_highres[
            chrom_indices_highres[-2], :] - struct_highres[
            chrom_indices_highres[-1], :]
        how_far = 1
        for j in unknown_beads_at_end:
            struct_highres[j, :] = struct_highres[
                chrom_indices_highres[-1], :] \
                - diff_beads_at_chr_end * how_far
            how_far += 1

        begin_lowres = end_lowres

    return struct_highres
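# A minimal usage sketch (values hypothetical): upsample a haploid structure
# that was estimated at half resolution back to full resolution.
# lengths = np.array([60, 40])            # beads per chromosome at high res
# struct_lowres = np.random.rand(50, 3)   # 50 low-res beads for factor 2
# struct_highres = increase_struct_res(struct_lowres, 2, lengths)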