Esempio n. 1
0
def _get_lengths(lengths):
    """Return chromosome lengths as an integer array.

    ``lengths`` may be the path of an existing file, a one-element list or
    array holding such a path, or the lengths themselves. A path is read
    with ``load_lengths``; any other value is converted as given.
    """
    path = None
    if isinstance(lengths, str):
        if os.path.exists(lengths):
            path = lengths
    elif isinstance(lengths, (list, np.ndarray)) and len(lengths) == 1:
        candidate = lengths[0]
        if isinstance(candidate, str) and os.path.exists(candidate):
            path = candidate

    if path is not None:
        lengths = load_lengths(path)
    return np.array(lengths).astype(int)
Esempio n. 2
0
def run_nmds(directory):
    """Run NMDS on the dataset described by ``directory``.

    Options are obtained from ``parse``, which receives the path of
    ``<directory>/config.ini`` when that file exists and ``None`` otherwise.
    The inferred coordinates are written to ``NMDS.<output_name>`` (text)
    and ``NMDS.<output_name>.pdb`` inside ``directory``.

    Parameters
    ----------
    directory : str
        Directory holding the (optional) configuration file and the input
        files named by the ``counts`` and ``lengths`` options.

    Returns
    -------
    bool
        Always ``True``.

    Raises
    ------
    ValueError
        If the ``counts`` option names a file that is neither a ``npy`` nor
        a ``.matrix`` file.
    """
    config_file = os.path.join(directory, "config.ini")
    if not os.path.exists(config_file):
        config_file = None

    options = parse(config_file)

    random_state = np.random.RandomState(seed=options["seed"])

    # Chromosome lengths are only available when a .bed file is configured.
    if options["lengths"].endswith(".bed"):
        lengths = load_lengths(os.path.join(directory, options["lengths"]))
    else:
        lengths = None

    counts_path = os.path.join(directory, options["counts"])
    if options["counts"].endswith("npy"):
        counts = np.load(counts_path)
    elif options["counts"].endswith(".matrix"):
        counts = load_counts(counts_path, lengths=lengths)
    else:
        # Without this branch ``counts`` would be unbound and the failure
        # would only surface later as an UnboundLocalError.
        raise ValueError(
            "Unsupported counts file %r: expected a 'npy' or '.matrix' file"
            % (options["counts"],))

    if options["normalize"]:
        counts = iced.filter.filter_low_counts(counts,
                                               sparsity=False,
                                               percentage=0.04)
        counts = iced.normalization.ICE_normalization(counts, max_iter=300)

    # Hand the solver a COO matrix without explicitly stored zeros.
    if not sparse.issparse(counts):
        counts = sparse.coo_matrix(counts)
    else:
        counts = counts.tocsr()
        counts.eliminate_zeros()
        counts = counts.tocoo()

    # Columns whose counts sum to zero: the matching rows of the result are
    # blanked out below.
    torm = np.array((counts.sum(axis=0) == 0)).flatten()
    nmds = NMDS(alpha=options["alpha"],
                beta=options["beta"],
                random_state=random_state,
                max_iter=options["max_iter"],
                verbose=options["verbose"])
    X = nmds.fit(counts)

    X[torm] = np.nan
    np.savetxt(os.path.join(directory, "NMDS." + options["output_name"]), X)

    # PDB file
    pdbfilename = os.path.join(directory,
                               "NMDS." + options["output_name"] + ".pdb")
    writePDB(X, pdbfilename)

    return True
Esempio n. 3
0
File: base.py Progetto: NelleV/iced
def load_sample_yeast():
    """
    Load the bundled sample of the S. cerevisiae contact count matrix from
    Duan et al, Nature, 2009.

    The counts are read next to this module, converted to a dense array and
    added to their own transpose before being returned.

    Returns
    -------
        counts, lengths:
            tuple of two elements, the first a contact count matrix, the
            second the chromosome lengths as returned by ``io.load_lengths``.
    """
    here = dirname(__file__)
    bed_path = os.path.join(here, "data/duan2009/duan.SC.10000.raw_sub.bed")
    matrix_path = os.path.join(
        here, "data/duan2009/duan.SC.10000.raw_sub.matrix")

    lengths = io.load_lengths(bed_path)
    dense = io.load_counts(matrix_path, lengths=lengths).toarray()
    return dense.T + dense, lengths
Esempio n. 4
0
from __future__ import print_function
import numpy as np
from glob import glob
from iced import io
from iced import utils

# Report, for every raw contact map, the share of intra- versus
# inter-chromosomal counts.
filenames = sorted(glob("data/ay2013/*10000_raw.matrix") +
                   glob("data/lemieux2013/25kb/*.matrix"))

for filename in filenames:
    # The .bed file with the chromosome lengths sits next to each .matrix.
    lengths = io.load_lengths(filename.replace(".matrix", ".bed"))
    counts = io.load_counts(filename, lengths=lengths)

    # Work on a dense matrix to which its transpose has been added.
    counts = counts.toarray()
    counts = counts.T + counts

    mask = utils.get_intra_mask(lengths)

    # Just making sure there is no interaction counted in the diagonal
    counts[np.diag_indices_from(counts)] = 0

    total = counts.sum()
    print(filename)
    print("Total number of counts", total)
    # print() performs no %-formatting, so a single "%" is what is wanted.
    print("% of intra", counts[mask].sum() / total * 100)
    print("% of inter", counts[np.invert(mask)].sum() / total * 100)
    print()
Esempio n. 5
0
def run_pm2(directory):
    """Run PM2 on the dataset described by ``directory``.

    Options are obtained from ``parse``, which receives the path of
    ``<directory>/config.ini`` when that file exists and ``None`` otherwise.
    The inferred coordinates are written to ``PM2.<output_name>`` (text)
    and ``PM2.<output_name>.pdb`` inside ``directory``.

    Parameters
    ----------
    directory : str
        Directory holding the (optional) configuration file and the input
        files named by the ``counts`` and ``lengths`` options.

    Returns
    -------
    bool
        Always ``True``.

    Raises
    ------
    ValueError
        If the ``counts`` option names a file that is neither a ``npy`` nor
        a ``.matrix`` file.
    """
    config_file = os.path.join(directory, "config.ini")
    if not os.path.exists(config_file):
        config_file = None

    # The configuration only needs to be parsed once.
    options = parse(config_file)

    random_state = np.random.RandomState(seed=options["seed"])

    # Chromosome lengths are only available when a .bed file is configured.
    if options["lengths"].endswith(".bed"):
        lengths = load_lengths(os.path.join(directory, options["lengths"]))
    else:
        lengths = None

    counts_path = os.path.join(directory, options["counts"])
    if options["counts"].endswith("npy"):
        counts = np.load(counts_path)
        # Zero the diagonal of the dense matrix.
        diagonal = np.arange(len(counts))
        counts[diagonal, diagonal] = 0
    elif options["counts"].endswith(".matrix"):
        counts = load_counts(counts_path, lengths=lengths)
    else:
        # Without this branch ``counts`` would be unbound and the failure
        # would only surface later as an UnboundLocalError.
        raise ValueError(
            "Unsupported counts file %r: expected a 'npy' or '.matrix' file"
            % (options["counts"],))

    if options["normalize"]:
        counts = iced.filter.filter_low_counts(counts,
                                               sparsity=False,
                                               percentage=0.04)

        # Only the bias vector is kept; the normalized matrix is discarded
        # and the filtered counts are passed on together with the bias.
        _, bias = iced.normalization.ICE_normalization(counts,
                                                       max_iter=300,
                                                       output_bias=True)
    else:
        bias = None

    # Hand the solver a COO matrix without NaNs or explicitly stored zeros.
    if not sparse.issparse(counts):
        counts[np.isnan(counts)] = 0
        counts = sparse.coo_matrix(counts)
    else:
        counts = counts.tocsr()
        counts.eliminate_zeros()
        counts = counts.tocoo()

    pm2 = PM2(alpha=options["alpha"],
              beta=options["beta"],
              random_state=random_state,
              max_iter=options["max_iter"],
              bias=bias,
              verbose=options["verbose"])
    X = pm2.fit(counts)

    # Blank out the rows whose column of ``counts + counts.T`` sums to zero.
    torm = np.array(((counts + counts.transpose()).sum(axis=0) == 0)).flatten()

    X[torm] = np.nan

    np.savetxt(os.path.join(directory, "PM2." + options["output_name"]), X)
    # PDB file
    pdbfilename = os.path.join(directory,
                               "PM2." + options["output_name"] + ".pdb")
    writePDB(X, pdbfilename)

    return True
Esempio n. 6
0
import argparse
import os
import sys
from glob import glob

import numpy as np
from sklearn.metrics import euclidean_distances
from iced.utils import downsample_resolution
from iced.io import load_lengths

# Command line: a results directory, an optional lengths file and a factor.
parser = argparse.ArgumentParser()
parser.add_argument("directory")
parser.add_argument("--lengths", "-l")
parser.add_argument("--factor", type=int, default=10)
args = parser.parse_args()

factor = args.factor
if args.lengths is not None:
    lengths = load_lengths(args.lengths)
else:
    # Default to the raw.bed found in the matching "data" directory.
    lengths = load_lengths(
        os.path.join(args.directory.replace("results", "data"), "raw.bed"))

# NOTE(review): the pattern is appended to the directory string as-is, so
# the result depends on whether a trailing separator was passed -- confirm
# the intended calling convention before switching to os.path.join.
filenames = glob(args.directory + "*_structure.txt")
filenames.sort()

distances = []

for i, filename in enumerate(filenames):
    # Single-line progress indicator, rewritten in place via "\r".
    sys.stdout.write("\rAnalysing %0.2f %% files" % (100. *
                                                     (i + 1) / len(filenames)))
    sys.stdout.flush()
Esempio n. 7
0
import argparse
import numpy as np
from scipy import sparse

from iced import io
from iced import utils

parser = argparse.ArgumentParser()
parser.add_argument("filename")
# The output name is used unconditionally below, so reject a missing value
# up front instead of failing on ``None`` after the matrix was processed.
parser.add_argument("--outname", "-o", required=True)
args = parser.parse_args()

filename = args.filename
# NOTE(review): the lengths file is hard-coded and not derived from
# ``filename`` -- confirm it matches every matrix this script is run on.
lengths = io.load_lengths("data/ay2013/rings_10000_raw.bed")

# Dense matrix to which its transpose has been added.
counts = io.load_counts(filename, lengths=lengths)
counts = counts.toarray()
counts = counts.T + counts

new_counts, new_lengths = utils.downsample_resolution(counts, lengths)

# Keep only the upper triangle before writing the matrix back out.
new_counts = sparse.coo_matrix(np.triu(new_counts))

io.write_counts(args.outname, new_counts)
io.write_lengths(args.outname.replace(".matrix", ".bed"), new_lengths)
Esempio n. 8
0
import numpy as np
import argparse
from iced import io
import utils
from scipy import spatial
from statistics import compute_mepd, compute_witten_and_noble
import matplotlib.pyplot as plt
from sklearn.externals.joblib import Memory

# joblib Memory object created with ".joblib" as its location.
mem = Memory(".joblib")

fig, axes = plt.subplots(nrows=4)

filename = "structures/lemieux2013/25kb/B15C2_combined_raw_PO_01_structure.txt"
lengths = io.load_lengths("data/lemieux2013/25kb/B15C2_combined_raw.bed")
# Alternative dataset (both lines have to be swapped in together):
#filename = "structures/ay2013/trophozoites_10000_raw_PO_01_structure.txt"
#lengths = io.load_lengths("data/ay2013/trophozoites_10000_raw.bed")
X = np.loadtxt(filename)
centromeres = np.loadtxt("files/pf.cent")
chr_ = np.arange(len(centromeres))

# The resolution is inferred from the structure's file name.
resolution = (25000 if "25kb" in filename
              else 20000 if "20000" in filename
              else 10000)

# Row-wise mean of the centromere coordinates, expressed as an integer
# number of bins at the chosen resolution.
centromeres = np.round(centromeres.mean(axis=1) / resolution).astype(int)
Esempio n. 9
0
import numpy as np

from iced import io
from iced import filter
from iced import normalization
from iced import utils

from matplotlib.colors import LogNorm
import matplotlib.pyplot as plt


# Load the raw contact counts and turn them into a dense matrix.
lengths = io.load_lengths("data/trophozoites_10000_raw.bed")
counts = utils.from_sparse_to_dense(
    io.load_counts("data/trophozoites_10000_raw.matrix", lengths=lengths))

# Filter low-count loci, then ICE-normalize.
normed = filter.filter_low_counts(counts, sparsity=False,
                                  remove_all_zeros_loci=True)
normed = normalization.ICE_normalization(normed)

# Restrict to the sub-map selected by [6, 7] and blank out empty loci.
normed, l = utils.extract_sub_contact_map(normed, lengths, [6, 7])
to_rm = normed.sum(axis=0) == 0
normed[to_rm] = np.nan
normed[:, to_rm] = np.nan

fig, ax = plt.subplots()
m = ax.matshow(np.log(normed+1), cmap="RdYlBu_r", vmax=5)
ax.set_xticks([])
ax.set_yticks([])

# Dashed lines at the cumulative lengths, starting from 0.
l = np.concatenate([[0], l])
for boundary in l.cumsum():
    ax.axhline(boundary, color="0", linestyle="--")
for boundary in l.cumsum():
    ax.axvline(boundary, color="0", linestyle="--")
Esempio n. 10
0
    # recent versions of python
    print "--filtering_perc is deprecated. Please use filter_low_counts_perc"
    print "instead. This option will be removed in ice 0.3"
    filter_low_counts = args.filtering_perc
if "--filter_low_counts_perc" in sys.argv and "--filtering_perc" in sys.argv:
    raise Warning("This two options are incompatible")
# Fall back to the default only when neither option was supplied. The check
# has to look at the parsed value: comparing the literal option name with
# None is always False and the default was never applied.
if args.filtering_perc is None and "--filter_low_counts_perc" not in sys.argv:
    filter_low_counts_perc = 0.02
elif args.filter_low_counts_perc is not None:
    filter_low_counts_perc = args.filter_low_counts_perc

if args.verbose:
    # Single-argument call form: valid on both Python 2 and Python 3.
    print("Loading files...")

if args.bed_file:
    lengths = io.load_lengths(args.bed_file)
else:
    lengths = None

# Loads file as i, j, counts
if lengths is None:
    i, j, data = loadtxt(filename).T
    # loadtxt yields floats; row/column indices and the matrix shape must
    # be integers.
    i = i.astype(int)
    j = j.astype(int)
    N = int(max(i.max(), j.max())) + 1
    counts = sparse.coo_matrix((data, (i, j)), shape=(N, N), dtype=float)
else:
    counts = io.load_counts(filename, lengths=lengths)

if args.dense:
    counts = np.array(counts.todense())
else:
    counts = sparse.csr_matrix(counts)
Esempio n. 11
0
def increase_struct_res(struct, multiscale_factor, lengths, mask=None):
    """Linearly interpolate structure to increase resolution.

    Increase resolution of structure via linear interpolation between beads.
    High-resolution beads lying before the first or after the last
    interpolated bead of a chromosome are filled in by linear extrapolation,
    reusing the step between the two nearest interpolated beads.

    Parameters
    ----------
    struct : array of float or str
        3D chromatin structure at low resolution. A str is treated as a
        path and read with ``np.loadtxt``. The array is reshaped to
        ``(-1, 3)``.
    multiscale_factor : int
        Factor by which to increase the resolution. A value of 2 doubles the
        resolution. A value of 1 does not change the resolution: ``struct``
        is returned exactly as passed in, before any loading or reshaping.
    lengths : array_like of int or str
        Number of beads per homolog of each chromosome at high resolution (the
        desired resolution of the output structure). A str is treated as a
        path and read with ``load_lengths``.
    mask : array of bool, optional
        If given, it is reshaped to ``(multiscale_factor, -1)`` and the
        high-resolution indices at its False entries are set to NaN, so the
        corresponding beads are treated as unknown.
        NOTE(review): the expected shape and origin of this mask are not
        visible here -- confirm against callers.

    Returns
    -------
    struct_highres : array of float
        3D chromatin structure that has been linearly interpolated to the
        specified high resolution, of shape ``(lengths.sum() * ploidy, 3)``.
        Beads that are never filled in remain NaN.

    Raises
    ------
    ValueError
        If ``multiscale_factor`` is not integer-valued, or if the number of
        beads in ``struct`` is neither one nor two times the total number of
        low-resolution beads.
    """

    if int(multiscale_factor) != multiscale_factor:
        raise ValueError('The multiscale_factor must be an integer')
    multiscale_factor = int(multiscale_factor)
    if multiscale_factor == 1:
        return struct
    if isinstance(struct, str):
        struct = np.loadtxt(struct)
    struct = struct.reshape(-1, 3)
    if isinstance(lengths, str):
        lengths = load_lengths(lengths)
    lengths = np.array(lengths).astype(int)
    lengths_lowres = decrease_lengths_res(lengths, multiscale_factor)
    # Ratio of beads in struct to the total number of low-res beads; only
    # the values 1 and 2 are accepted.
    ploidy = struct.shape[0] / lengths_lowres.sum()
    if ploidy != 1 and ploidy != 2:
        raise ValueError(
            "Not consistent with haploid or diploid... struct is"
            " %d beads (and 3 cols), sum of lengths is %d" %
            (struct.reshape(-1, 3).shape[0], lengths_lowres.sum()))
    ploidy = int(ploidy)

    # NOTE(review): _get_struct_indices is defined elsewhere; presumably it
    # yields, for every low-res bead (columns), the indices of the high-res
    # beads it covers (rows) -- confirm.
    indices = _get_struct_indices(ploidy, multiscale_factor,
                                  lengths).reshape(multiscale_factor, -1)
    if mask is not None:
        indices[~mask.reshape(multiscale_factor, -1)] = np.nan

    # Output starts out all-NaN and is filled in one chromosome at a time.
    struct_highres = np.full((lengths.sum() * ploidy, 3), np.nan)
    begin_lowres, end_lowres = 0, 0
    for i in range(lengths.shape[0] * ploidy):
        # Advance the low-res window [begin_lowres, end_lowres) to the
        # i-th entry of the lengths tiled ``ploidy`` times.
        end_lowres += np.tile(lengths_lowres, ploidy)[i]

        # Beads of struct that are NaN
        struct_nan = np.isnan(struct[begin_lowres:end_lowres, 0])

        # Get indices for this chrom at low & high res
        # (chrom_indices is a basic slice of ``indices``, so the NaN
        # assignment below also writes into ``indices``.)
        chrom_indices = indices[:, begin_lowres:end_lowres]
        chrom_indices[:, struct_nan] = np.nan
        # Position of each low-res bead: NaN-ignoring mean of the high-res
        # indices in its column.
        chrom_indices_lowres = np.nanmean(chrom_indices, axis=0)
        chrom_indices_highres = chrom_indices.T.flatten()

        # Note which beads are unknown
        # A high-res bead is "known" when its index is not NaN and lies
        # within [min, max] of the low-res positions.
        highres_mask = ~np.isnan(chrom_indices_highres)
        highres_mask[highres_mask] = (chrom_indices_highres[highres_mask] >=
                                      np.nanmin(chrom_indices_lowres)) & (
                                          chrom_indices_highres[highres_mask]
                                          <= np.nanmax(chrom_indices_lowres))
        # Shift the unknown positions by the number of high-res beads in all
        # preceding chromosomes, then drop anything at or beyond the end of
        # the current one.
        unknown_beads = np.where(~highres_mask)[0] + np.tile(lengths,
                                                             ploidy)[:i].sum()
        unknown_beads = unknown_beads[
            unknown_beads < np.tile(lengths, ploidy)[:i + 1].sum()]
        # Leading run: the smallest unknown bead plus every unknown bead
        # that is consecutive with all unknown beads before it.
        unknown_beads_at_begin = [
            unknown_beads[k] for k in range(len(unknown_beads))
            if unknown_beads[k] == unknown_beads.min() or all([
                unknown_beads[k] - j == unknown_beads[k - j]
                for j in range(k + 1)
            ])
        ]
        # NOTE(review): max(unknown_beads_at_begin) below raises ValueError
        # when that list is empty -- confirm this cannot happen in practice.
        if len(unknown_beads) - len(unknown_beads_at_begin) > 0:
            # Trailing run: the largest unknown bead plus every unknown bead
            # that is consecutive with all unknown beads after it.
            unknown_beads_at_end = [
                unknown_beads[k] for k in range(len(unknown_beads))
                if unknown_beads[k] == unknown_beads.max() or all([
                    unknown_beads[k] + j == unknown_beads[k + j]
                    for j in range(len(unknown_beads) - k)
                ])
            ]
            # Beads to interpolate: everything strictly between both runs.
            chrom_indices_highres = np.arange(
                max(unknown_beads_at_begin) + 1, min(unknown_beads_at_end))
        else:
            # No unknown beads beyond the leading run: interpolate up to and
            # including the largest non-NaN high-res index.
            unknown_beads_at_end = []
            chrom_indices_highres = np.arange(
                max(unknown_beads_at_begin) + 1,
                int(np.nanmax(chrom_indices_highres)) + 1)

        # Interpolate each of the three coordinates separately, using only
        # the low-res beads that are not NaN as support points.
        struct_highres[chrom_indices_highres,
                       0] = interp1d(chrom_indices_lowres[~struct_nan],
                                     struct[begin_lowres:end_lowres,
                                            0][~struct_nan],
                                     kind="linear")(chrom_indices_highres)
        struct_highres[chrom_indices_highres,
                       1] = interp1d(chrom_indices_lowres[~struct_nan],
                                     struct[begin_lowres:end_lowres,
                                            1][~struct_nan],
                                     kind="linear")(chrom_indices_highres)
        struct_highres[chrom_indices_highres,
                       2] = interp1d(chrom_indices_lowres[~struct_nan],
                                     struct[begin_lowres:end_lowres,
                                            2][~struct_nan],
                                     kind="linear")(chrom_indices_highres)

        # Fill in beads at start
        # Extrapolate backwards from the first interpolated bead, using the
        # step between the first two interpolated beads.
        diff_beads_at_chr_start = struct_highres[
            chrom_indices_highres[1], :] - struct_highres[
                chrom_indices_highres[0], :]
        how_far = 1
        for j in reversed(unknown_beads_at_begin):
            struct_highres[j, :] = struct_highres[
                chrom_indices_highres[0], :] - diff_beads_at_chr_start * how_far
            how_far += 1
        # Fill in beads at end
        # Extrapolate forwards from the last interpolated bead, using the
        # step between the last two interpolated beads.
        diff_beads_at_chr_end = struct_highres[
            chrom_indices_highres[-2], :] - struct_highres[
                chrom_indices_highres[-1], :]
        how_far = 1
        for j in unknown_beads_at_end:
            struct_highres[j, :] = struct_highres[
                chrom_indices_highres[-1], :] - diff_beads_at_chr_end * how_far
            how_far += 1

        begin_lowres = end_lowres

    return struct_highres