Example #1
def test_sparse_filter_low_counts_real_data():
    counts, lengths = load_sample_yeast()
    counts_sparse = sparse.csr_matrix(counts)
    counts_dense = filter_low_counts(counts, sparsity=False, percentage=0.1)
    counts_sparse = filter_low_counts(counts_sparse, sparsity=False,
                                      percentage=0.1)
    counts_dense[np.isnan(counts_dense)] = 0
    assert_array_equal(counts_dense, counts_sparse.toarray())

    triu_counts_sparse = sparse.csr_matrix(np.triu(counts))
    triu_counts_sparse = filter_low_counts(triu_counts_sparse, sparsity=False,
                                           percentage=0.1)
    assert_array_equal(np.triu(counts_dense), triu_counts_sparse.toarray())
Example #2
def test_filter_low_counts():
    X = np.ones((100, 100))
    X[0, :] = 0
    X[:, 0] = 0
    X_filtered_true = X.copy()
    X_filtered_true[X == 0] = np.nan
    X_filtered = filter_low_counts(X)
    assert_array_equal(X_filtered, X_filtered_true)

    lengths = np.array([40, 60])
    X_filtered = filter_low_counts(X, lengths=lengths)
    assert_array_equal(X_filtered, X_filtered_true)

    X_filtered = filter_low_counts(X, sparsity=False)
    assert_array_equal(X_filtered, X_filtered_true)
Example #3
def test_sparse_filter_low_counts():
    X = 10 * np.ones((100, 100))
    X[0, :] = 1
    X[:, 0] = 1
    X_filtered_dense = X.copy()
    X_filtered_dense[0] = 0
    X_filtered_dense[:, 0] = 0
    # Sparse filtering below is not implemented yet, so the test returns
    # early; the remaining assertions are currently unreachable.
    return
    X_filtered_sparse_csr = filter_low_counts(sparse.csr_matrix(X),
                                              sparsity=False)
    X_filtered_sparse_coo = filter_low_counts(sparse.coo_matrix(X))

    assert_array_equal(X_filtered_dense,
                       np.array(X_filtered_sparse_csr.todense()))
    assert_array_equal(X_filtered_dense,
                       np.array(X_filtered_sparse_coo.todense()))
Example #4
def filter_matrix(contact_matrix):
    filtered_matrix = filter.filter_low_counts(contact_matrix,
                                               remove_all_zeros_loci=True,
                                               sparsity=False)
    rows, cols = filtered_matrix.shape
    to_keep = []
    for i in range(rows):
        # Keep the row unless every entry was filtered out (all NaN)
        if np.isnan(filtered_matrix[i, :]).sum() < cols:
            to_keep.append(i)
    row_filtered = filtered_matrix[to_keep, :]
    return (row_filtered[:, to_keep], to_keep)  # finally remove cols
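
A minimal usage sketch for the helper above; the toy matrix and the expected outputs are illustrative, not from the original source:

import numpy as np
from iced import filter

np.random.seed(0)

# Symmetric toy contact map with one all-zero locus at index 2.
contact_matrix = np.random.poisson(10, size=(5, 5)).astype(float)
contact_matrix = contact_matrix + contact_matrix.T
contact_matrix[2, :] = 0
contact_matrix[:, 2] = 0

row_filtered, to_keep = filter_matrix(contact_matrix)
print(row_filtered.shape)  # (4, 4): the empty locus was dropped
print(to_keep)             # [0, 1, 3, 4]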
Example #5
def test_filter_low_counts_with_zeros():
    X = 10 * np.ones((100, 100))
    X[0, :] = 0
    X[:, 0] = 0
    X[1, :] = 1
    X[:, 1] = 1

    X_filtered_true = X.copy()
    X_filtered_true[X != 10] = np.nan
    X_filtered = filter_low_counts(X, remove_all_zeros_loci=True,
                                   sparsity=False)
    assert_array_equal(X_filtered, X_filtered_true)
Example #6
"""
Normalizing a contact count matrix.
"""
import matplotlib.pyplot as plt
from matplotlib import colors

from iced import datasets
from iced import filter
from iced import normalization

# Loading a sample dataset
counts, lengths = datasets.load_sample_yeast()

# Filtering and normalizing contact count data
normed = filter.filter_low_counts(counts, lengths=lengths, percentage=0.04)
normed = normalization.ICE_normalization(normed)

# Plotting the results using matplotlib
chromosomes = ["I", "II", "III", "IV", "V", "VI"]

fig, axes = plt.subplots(ncols=2, figsize=(12, 4))

axes[0].imshow(counts, cmap="Blues", norm=colors.SymLogNorm(1),
               origin="bottom",
               extent=(0, len(counts), 0, len(counts)))

[axes[0].axhline(i, linewidth=1, color="#000000") for i in lengths.cumsum()]
[axes[0].axvline(i, linewidth=1, color="#000000") for i in lengths.cumsum()]
axes[0].set_title("Raw contact counts")
Example #7
from iced import filter, normalization

import numpy as np

# Directory containing the per-chromosome 50 kb count matrices
filePtr = './GSM1173492_Th1_ensemble/50kb/'

for i in range(1, 11):
    counts = np.genfromtxt(filePtr + 'chr' + str(i) + '_50kb.txt',
                           delimiter=' ')

    counts = filter.filter_low_counts(counts, percentage=0.04)
    normed = normalization.ICE_normalization(counts)

    np.savetxt(filePtr + 'Iced_chr' + str(i) + '_50kb.txt',
               normed,
               delimiter=',')
Example #8
import argparse
import os

from iced import io
from iced import filter
from iced import normalization
from utils import get_mapping, get_expected

parser = argparse.ArgumentParser()
parser.add_argument("filename")
parser.add_argument("--normalize", "-n", action="store_true", default=False)
parser.add_argument("--bed-file", "-b")
parser.add_argument("--outfile", "-o")
args = parser.parse_args()

lengths, base = io.load_lengths(args.bed_file, return_base=True)
counts = io.load_counts(args.filename, lengths=lengths, base=base)

if args.normalize:
    counts = filter.filter_low_counts(counts, percentage=0.03, sparsity=False)
    counts = normalization.ICE_normalization(counts)

print("1. Compute count vs genomic distance relationship")
mapping = get_mapping(counts, lengths, verbose=True)

print("2. Estimating expected...")
c_expected = get_expected(counts, lengths, mapping=mapping)

print("3. Estimating observed over expected...")
counts.data /= c_expected

if args.outfile is not None:
    try:
        os.makedirs(os.path.dirname(args.outfile))
    except OSError:
        pass  # the output directory may already exist
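
The listing is truncated at this point in the source. A purely illustrative continuation (assuming iced's io.write_counts, not part of the original script) might save the observed-over-expected counts:

    io.write_counts(args.outfile, counts)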
Example #9
import numpy as np

from iced import io
from iced import filter
from iced import normalization
from iced import utils

from matplotlib.colors import LogNorm
import matplotlib.pyplot as plt


lengths = io.load_lengths("data/trophozoites_10000_raw.bed")
counts = io.load_counts("data/trophozoites_10000_raw.matrix", lengths=lengths)
counts = utils.from_sparse_to_dense(counts)

normed = filter.filter_low_counts(counts, remove_all_zeros_loci=True,
                                  sparsity=False)
normed = normalization.ICE_normalization(normed)

normed, l = utils.extract_sub_contact_map(normed, lengths, [6, 7])
to_rm = normed.sum(axis=0) == 0
normed[to_rm] = np.nan
normed[:, to_rm] = np.nan

fig, ax = plt.subplots()
m = ax.matshow(np.log(normed+1), cmap="RdYlBu_r", vmax=5)
ax.set_xticks([])
ax.set_yticks([])
l = np.concatenate([[0], l])
[ax.axhline(i, color="0", linestyle="--") for i in l.cumsum()]
[ax.axvline(i, color="0", linestyle="--") for i in l.cumsum()]
ax.set_xlim(-20, l.sum()+20)
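
Since the mappable returned by matshow is kept in m, an illustrative follow-up (not part of the original script) could attach a colorbar:

fig.colorbar(m)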
Example #10
def _prep_counts(counts_list,
                 lengths,
                 ploidy=1,
                 multiscale_factor=1,
                 normalize=True,
                 filter_threshold=0.04,
                 exclude_zeros=True,
                 verbose=True):
    """Copy counts, check matrix, reduce resolution, filter, and compute bias.
    """

    if not isinstance(counts_list, list):
        counts_list = [counts_list]

    # Copy counts
    counts_list = [c.copy() for c in counts_list]

    # Check counts
    counts_list = check_counts(counts_list,
                               lengths=lengths,
                               ploidy=ploidy,
                               exclude_zeros=True)

    # Determine ambiguity
    nbeads = lengths.sum() * ploidy
    counts_dict = [('haploid' if ploidy == 1 else {
        1: 'ambig',
        1.5: 'pa',
        2: 'ua'
    }[sum(c.shape) / nbeads], c) for c in counts_list]
    if len(counts_dict) != len(dict(counts_dict)):
        raise ValueError(
            "Can't input multiple counts matrices of the same"
            " type. Inputs (%d) = %s" %
            (len(counts_dict), ', '.join([x[0] for x in counts_dict])))
    counts_dict = dict(counts_dict)

    # Reduce resolution
    lengths_lowres = lengths
    for counts_type, counts in counts_dict.items():
        if multiscale_factor != 1:
            lengths_lowres = decrease_lengths_res(
                lengths, multiscale_factor=multiscale_factor)
            counts = decrease_counts_res(counts,
                                         multiscale_factor=multiscale_factor,
                                         lengths=lengths,
                                         ploidy=ploidy)
            counts_dict[counts_type] = counts

    # Optionally filter counts
    if filter_threshold is None:
        filter_threshold = 0
    if filter_threshold and len(counts_list) > 1:
        # If there are multiple counts matrices, filter them together.
        # Counts will be ambiguated for deciding which beads to remove.
        # For diploid, any beads that are filtered out will be removed from both
        # homologs.
        if verbose:
            print(
                "FILTERING LOW COUNTS: manually filtering all counts together"
                " by %g" % filter_threshold,
                flush=True)
        all_counts_ambiguated = ambiguate_counts(list(counts_dict.values()),
                                                 lengths=lengths_lowres,
                                                 ploidy=ploidy,
                                                 exclude_zeros=True)
        initial_zero_beads = find_beads_to_remove(all_counts_ambiguated,
                                                  lengths_lowres.sum()).sum()
        all_counts_filtered = filter_low_counts(
            sparse.coo_matrix(all_counts_ambiguated),
            sparsity=False,
            percentage=filter_threshold +
            _percent_nan_beads(all_counts_ambiguated)).tocoo()
        torm = find_beads_to_remove(all_counts_filtered, lengths_lowres.sum())
        if verbose:
            print('                      removing %d beads' %
                  (torm.sum() - initial_zero_beads),
                  flush=True)
        for counts_type, counts in counts_dict.items():
            if sparse.issparse(counts):
                counts = counts.toarray()
            counts[np.tile(torm, int(counts.shape[0] / torm.shape[0])), :] = 0.
            counts[:, np.tile(torm, int(counts.shape[1] / torm.shape[0]))] = 0.
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts
    elif filter_threshold:
        # If there is just one counts matrix, filter the full, non-ambiguated
        # counts matrix.
        # For diploid unambiguous or partially ambiguous counts, it is possible
        # that a bead will be filtered out on one homolog but not another.
        individual_counts_torms = np.full((lengths_lowres.sum(), ), False)
        for counts_type, counts in counts_dict.items():
            if verbose:
                print(
                    'FILTERING LOW COUNTS: manually filtering %s counts by %g'
                    % (counts_type.upper(), filter_threshold),
                    flush=True)
            initial_zero_beads = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum()).sum()
            if counts_type == 'pa':
                if sparse.issparse(counts):
                    counts = counts.toarray()
                counts_filtered = np.zeros_like(counts)
                homo1_upper = np.triu(counts[:min(counts.shape), :], 1)
                homo1_lower = np.triu(counts[:min(counts.shape), :].T, 1)
                homo2_upper = np.triu(counts[min(counts.shape):, :], 1)
                homo2_lower = np.triu(counts[min(counts.shape):, :].T, 1)
                counts_filtered[:min(counts.shape), :] += filter_low_counts(
                    sparse.coo_matrix(homo1_upper),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo1_upper)).toarray()
                counts_filtered[:min(counts.shape), :] += filter_low_counts(
                    sparse.coo_matrix(homo1_lower),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo1_lower)).toarray().T
                counts_filtered[min(counts.shape):, :] += filter_low_counts(
                    sparse.coo_matrix(homo2_upper),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo2_upper)).toarray()
                counts_filtered[min(counts.shape):, :] += filter_low_counts(
                    sparse.coo_matrix(homo2_lower),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo2_lower)).toarray().T
                counts = counts_filtered
            else:
                counts = filter_low_counts(sparse.coo_matrix(counts),
                                           sparsity=False,
                                           percentage=filter_threshold +
                                           _percent_nan_beads(counts)).tocoo()
            torm = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum())
            if verbose:
                print('                      removing %d beads' %
                      (torm.sum() - initial_zero_beads),
                      flush=True)
            individual_counts_torms = individual_counts_torms | torm
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts

    # Optionally normalize counts
    bias = None
    if normalize:
        if verbose:
            print('COMPUTING BIAS: all counts together', flush=True)
        bias = ICE_normalization(ambiguate_counts(list(counts_dict.values()),
                                                  lengths=lengths_lowres,
                                                  ploidy=ploidy,
                                                  exclude_zeros=True),
                                 max_iter=300,
                                 output_bias=True)[1].flatten()
        # In each counts matrix, zero out counts for which bias is NaN
        for counts_type, counts in counts_dict.items():
            initial_zero_beads = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum()).sum()
            if sparse.issparse(counts):
                counts = counts.toarray()
            counts[np.tile(np.isnan(bias), int(counts.shape[0] /
                                               bias.shape[0])), :] = 0.
            counts[:,
                   np.tile(np.isnan(bias), int(counts.shape[1] /
                                               bias.shape[0]))] = 0.
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts
            torm = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum())
            if verbose and torm.sum() - initial_zero_beads > 0:
                print('                removing %d additional beads from %s' %
                      (torm.sum() - initial_zero_beads, counts_type),
                      flush=True)

    output_counts = check_counts(list(counts_dict.values()),
                                 lengths=lengths_lowres,
                                 ploidy=ploidy,
                                 exclude_zeros=exclude_zeros)
    return output_counts, bias
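
A hedged call sketch for the function above, with toy haploid inputs; it assumes the helpers the function relies on (check_counts, ambiguate_counts, find_beads_to_remove, ...) are importable from the same package:

import numpy as np
from scipy import sparse

lengths = np.array([30, 20])  # two chromosomes, 50 beads in total
n = lengths.sum()
rng = np.random.RandomState(0)
# Upper-triangular raw contact counts as a sparse COO matrix
raw = sparse.coo_matrix(np.triu(rng.poisson(5, size=(n, n)), 1))

counts_list, bias = _prep_counts([raw], lengths=lengths, ploidy=1,
                                 multiscale_factor=1, normalize=True,
                                 filter_threshold=0.04)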
Example #11
"""
Comparing strategies for filtering contact counts, including:

    - filtering of the smallest x% **interacting** rows and columns

"""
import matplotlib.pyplot as plt
from matplotlib import colors

from iced import datasets
from iced import filter


# Loading a sample dataset
counts, lengths = datasets.load_sample_yeast()


fig, axes = plt.subplots(ncols=3, figsize=(12, 4))
counts_1 = filter.filter_low_counts(counts, lengths=lengths, percentage=0.04)
counts_2 = filter.filter_low_counts(counts, lengths=lengths, percentage=0.04,
                                    sparsity=False)
counts_3 = filter.filter_low_counts(counts, lengths=lengths, percentage=0.04,
                                    sparsity=False, remove_all_zeros_loci=True)


# Plotting the results using matplotlib
chromosomes = ["I", "II", "III", "IV", "V", "VI"]


for ax, c in zip(axes, [counts_1, counts_2, counts_3]):
    ax.imshow(c, cmap="Blues", norm=colors.SymLogNorm(1),
              origin="bottom",
              extent=(0, len(counts), 0, len(counts)))
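
The listing ends here in the source. An illustrative follow-up (the panel titles are assumptions summarizing the three calls above, not part of the original script):

titles = ["sparsity=True (default)", "sparsity=False",
          "sparsity=False, remove_all_zeros_loci=True"]
for ax, title in zip(axes, titles):
    ax.set_title(title)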