def test_sparse_filter_low_counts_real_data():
    """Dense and sparse sum-based filtering agree on the sample yeast data.

    Runs ``filter_low_counts`` with ``sparsity=False`` and ``percentage=0.1``
    on the dense counts, on a CSR copy of them, and on a CSR copy of their
    upper triangle. The dense result, with its NaNs replaced by zeros, is
    the reference for both sparse results.
    """
    options = {"sparsity": False, "percentage": 0.1}
    raw, _ = load_sample_yeast()

    # The CSR copy is taken before the dense call, so it does not depend on
    # whether the dense call modifies ``raw``.
    as_csr = sparse.csr_matrix(raw)
    dense_result = filter_low_counts(raw, **options)
    sparse_result = filter_low_counts(as_csr, **options)

    # Filtered entries are NaN in the dense result; zero them in place so the
    # result can be compared with the densified sparse output.
    nan_mask = np.isnan(dense_result)
    dense_result[nan_mask] = 0
    assert_array_equal(dense_result, sparse_result.toarray())

    # Same comparison, restricted to the upper triangle.
    upper_csr = sparse.csr_matrix(np.triu(raw))
    upper_result = filter_low_counts(upper_csr, **options)
    assert_array_equal(np.triu(dense_result), upper_result.toarray())
def test_sparse_filter_low_counts_real_data():
    """Dense and sparse sum-based filtering agree on the sample yeast data.

    Runs ``filter_low_counts`` with ``sparsity=False`` and ``percentage=0.1``
    on the dense counts, on a CSR copy of them, and on a CSR copy of their
    upper triangle. The dense result, with its NaNs replaced by zeros, is
    the reference for both sparse results.
    """
    counts, lengths = load_sample_yeast()

    # The CSR copy is taken before the dense call, so it does not depend on
    # whether the dense call modifies ``counts``.
    counts_sparse = sparse.csr_matrix(counts)
    counts_dense = filter_low_counts(counts, sparsity=False, percentage=0.1)
    counts_sparse = filter_low_counts(counts_sparse, sparsity=False,
                                      percentage=0.1)

    # Filtered entries are NaN in the dense result; zero them so the result
    # can be compared with the densified sparse output.
    counts_dense[np.isnan(counts_dense)] = 0
    assert_array_equal(counts_dense, counts_sparse.toarray())

    triu_counts_sparse = sparse.csr_matrix(np.triu(counts))
    triu_counts_sparse = filter_low_counts(triu_counts_sparse,
                                           sparsity=False, percentage=0.1)
    # Compare against the *filtered* dense reference. Using the input
    # ``counts`` here would only be right if the dense call had overwritten
    # ``counts`` in place.
    # NOTE(review): confirm against filter_low_counts' copy semantics.
    assert_array_equal(np.triu(counts_dense), triu_counts_sparse.toarray())
def test_filter_low_counts():
    """Zero rows/columns come back as NaN under each filtering mode.

    The input is a 100x100 matrix of ones whose first row and first column
    are zero. The default call, the call with ``lengths``, and the call
    with ``sparsity=False`` are all expected to return the same matrix with
    those zero entries replaced by NaN.
    """
    X = np.ones((100, 100))
    X[0, :] = 0
    X[:, 0] = 0

    # Expected output: NaN wherever the input is zero, unchanged elsewhere.
    expected = np.where(X == 0, np.nan, X)

    call_variants = (
        {},
        {"lengths": np.array([40, 60])},
        {"sparsity": False},
    )
    for extra_kwargs in call_variants:
        assert_array_equal(filter_low_counts(X, **extra_kwargs), expected)
def test_sparse_filter_low_counts():
    """Placeholder for the sparse counterpart of the low-counts filter test.

    Builds a 100x100 matrix of tens whose first row and column are ones,
    plus the expected result with that row and column zeroed, then returns
    early: the comparison below the ``return`` is intentionally disabled.
    """
    matrix = np.full((100, 100), 10.0)
    matrix[0, :] = 1
    matrix[:, 0] = 1

    expected = matrix.copy()
    expected[0] = 0
    expected[:, 0] = 0

    # this is not implemented yet
    return

    from_csr = filter_low_counts(sparse.csr_matrix(matrix), sparsity=False)
    from_coo = filter_low_counts(sparse.coo_matrix(matrix))

    assert_array_equal(expected, np.array(from_csr.todense()))
    assert_array_equal(expected, np.array(from_coo.todense()))
def filter_matrix(contact_matrix):
    """Filter a contact matrix and drop the loci that were filtered out.

    ``filter.filter_low_counts`` is called with ``remove_all_zeros_loci=True``
    and ``sparsity=False``. Every row of the result that is entirely NaN is
    treated as a removed locus; the remaining indices are used to select
    both rows and columns.

    Parameters
    ----------
    contact_matrix : array
        Matrix passed to ``filter.filter_low_counts``. The result must work
        with ``np.isnan`` and integer-list indexing, so a dense array is
        assumed here. NOTE(review): confirm no caller passes a sparse matrix.

    Returns
    -------
    tuple
        ``(sub_matrix, to_keep)``: ``sub_matrix`` is the filtered matrix
        restricted to the kept rows and columns, and ``to_keep`` is the list
        of kept row indices (Python ints, ascending).
    """
    filtered_matrix = filter.filter_low_counts(
        contact_matrix, remove_all_zeros_loci=True, sparsity=False)

    # Keep a locus when at least one entry of its row is not NaN.
    all_nan_rows = np.isnan(filtered_matrix).all(axis=1)
    to_keep = np.flatnonzero(~all_nan_rows).tolist()

    row_filtered = filtered_matrix[to_keep, :]
    return (row_filtered[:, to_keep], to_keep)  # finally remove cols
def test_filter_low_counts_with_zeros():
    """Sum-based filtering with ``remove_all_zeros_loci=True``.

    The input is a 100x100 matrix of tens where row/column 0 is all zeros
    and row/column 1 is all ones. Every entry that is not 10 is expected to
    be NaN in the filtered output.
    """
    X = np.full((100, 100), 10.0)
    # Locus 0 is filled with zeros, then locus 1 with ones (row first, then
    # column, so the later assignments overwrite the shared corner entries).
    for locus, fill_value in enumerate((0, 1)):
        X[locus, :] = fill_value
        X[:, locus] = fill_value

    expected = np.where(X != 10, np.nan, X)

    result = filter_low_counts(X, remove_all_zeros_loci=True, sparsity=False)
    assert_array_equal(result, expected)
"""
Normalizing a contact count matrix.
"""
import matplotlib.pyplot as plt
from matplotlib import colors

from iced import datasets
from iced import filter
from iced import normalization

# Loading a sample dataset
counts, lengths = datasets.load_sample_yeast()

# Filtering and normalizing contact count data
normed = filter.filter_low_counts(counts, lengths=lengths, percentage=0.04)
normed = normalization.ICE_normalization(normed)

# Plotting the results using matplotlib
chromosomes = ["I", "II", "III", "IV", "V", "VI"]

fig, axes = plt.subplots(ncols=2, figsize=(12, 4))

# ``origin`` only accepts "upper" or "lower"; "lower" puts index [0, 0] in
# the bottom-left corner.
axes[0].imshow(counts, cmap="Blues", norm=colors.SymLogNorm(1),
               origin="lower",
               extent=(0, len(counts), 0, len(counts)))

# Draw one horizontal and one vertical line at each cumulative length.
boundaries = lengths.cumsum()
for boundary in boundaries:
    axes[0].axhline(boundary, linewidth=1, color="#000000")
for boundary in boundaries:
    axes[0].axvline(boundary, linewidth=1, color="#000000")

axes[0].set_title("Raw contact counts")
from iced import filter, normalization
import numpy as np

# Directory prefix for both the raw inputs and the normalized outputs.
filePtr = './GSM1173492_Th1_ensemble/50kb/'

# Process the files for chr1 through chr10.
for chrom in range(1, 11):
    raw_path = '{}chr{}_50kb.txt'.format(filePtr, chrom)
    # The output name keeps the double underscore before "50kb" that the
    # original concatenation produced.
    normed_path = '{}Iced_chr{}__50kb.txt'.format(filePtr, chrom)

    # Input is read space-delimited; output is written comma-delimited.
    counts = np.genfromtxt(raw_path, delimiter=' ')
    counts = filter.filter_low_counts(counts, percentage=0.04)
    normed = normalization.ICE_normalization(counts)
    np.savetxt(normed_path, normed, delimiter=',')
from iced import filter from iced import normalization from utils import get_mapping, get_expected parser = argparse.ArgumentParser() parser.add_argument("filename") parser.add_argument("--normalize", "-n", action="store_true", default=False) parser.add_argument("--bed-file", "-b") parser.add_argument("--outfile", "-o") args = parser.parse_args() lengths, base = io.load_lengths(args.bed_file, return_base=True) counts = io.load_counts(args.filename, lengths=lengths, base=base) if args.normalize: counts = filter.filter_low_counts(counts, percentage=0.03, sparsity=False) counts = normalization.ICE_normalization(counts) print("1. Compute count vs genomic distance relationship") mapping = get_mapping(counts, lengths, verbose=True) print("2. Estimating expected...") c_expected = get_expected(counts, lengths, mapping=mapping) print("3. Estimating observed over expected...") counts.data /= c_expected if args.outfile is not None: try: os.makedirs(os.path.dirname(args.outfile)) except OSError:
import numpy as np
from iced import io
from iced import filter
from iced import normalization
from iced import utils
from matplotlib.colors import LogNorm
import matplotlib.pyplot as plt

# Load the raw counts and convert them to a dense matrix.
lengths = io.load_lengths("data/trophozoites_10000_raw.bed")
counts = io.load_counts("data/trophozoites_10000_raw.matrix", lengths=lengths)
counts = utils.from_sparse_to_dense(counts)

# Filter, normalize, then extract the sub-map for entries [6, 7].
normed = filter.filter_low_counts(counts, remove_all_zeros_loci=True,
                                  sparsity=False)
normed = normalization.ICE_normalization(normed)
normed, l = utils.extract_sub_contact_map(normed, lengths, [6, 7])

# Blank out (NaN) the rows and columns whose column sum is zero.
to_rm = normed.sum(axis=0) == 0
normed[to_rm] = np.nan
normed[:, to_rm] = np.nan

fig, ax = plt.subplots()
m = ax.matshow(np.log(normed+1), cmap="RdYlBu_r", vmax=5)
ax.set_xticks([])
ax.set_yticks([])

# Prepend 0 so the cumulative sums include the origin, then draw a dashed
# line at every cumulative boundary. ``l`` and ``m`` keep their original
# names because they are module-level names of this script.
l = np.concatenate([[0], l])
boundaries = l.cumsum()
for boundary in boundaries:
    ax.axhline(boundary, color="0", linestyle="--")
for boundary in boundaries:
    ax.axvline(boundary, color="0", linestyle="--")

ax.set_xlim(-20, l.sum()+20)
def _prep_counts(counts_list, lengths, ploidy=1, multiscale_factor=1,
                 normalize=True, filter_threshold=0.04, exclude_zeros=True,
                 verbose=True):
    """Copy counts, check matrix, reduce resolution, filter, and compute bias.

    Parameters
    ----------
    counts_list : list of matrices, or a single matrix
        Counts data. A non-list input is wrapped in a list. Every element is
        copied via its ``.copy()`` method before any processing.
    lengths : array
        Passed to the checking, resolution-reduction and ambiguation helpers.
        ``lengths.sum() * ploidy`` is used as the total number of beads.
        (Presumably the number of beads per chromosome - TODO confirm.)
    ploidy : int, optional
        When equal to 1 every matrix is labelled ``'haploid'``. Otherwise the
        label is looked up from ``sum(matrix.shape) / nbeads``:
        1 -> ``'ambig'``, 1.5 -> ``'pa'``, 2 -> ``'ua'``.
    multiscale_factor : int, optional
        When different from 1, lengths and counts are reduced with
        ``decrease_lengths_res`` / ``decrease_counts_res``.
    normalize : bool, optional
        When true, a bias vector is computed with ``ICE_normalization``
        (``max_iter=300``) on the ambiguated counts, and rows/columns whose
        bias is NaN are zeroed in every counts matrix.
    filter_threshold : float or None, optional
        ``None`` is treated as 0. A falsy value disables filtering. Otherwise
        it is added to ``_percent_nan_beads(...)`` and passed as
        ``percentage`` to ``filter_low_counts`` (with ``sparsity=False``).
    exclude_zeros : bool, optional
        Forwarded only to the final ``check_counts`` call; the initial
        ``check_counts`` call always uses ``exclude_zeros=True``.
    verbose : bool, optional
        When true, progress messages are printed.

    Returns
    -------
    output_counts
        Result of the final ``check_counts`` call on the processed matrices.
    bias : array or None
        Flattened bias from ``ICE_normalization``; ``None`` when
        ``normalize`` is false.

    Raises
    ------
    ValueError
        If two input matrices receive the same type label.
    KeyError
        If ``ploidy != 1`` and ``sum(matrix.shape) / nbeads`` is not one of
        1, 1.5 or 2 (the lookup below has no other keys).
    """
    if not isinstance(counts_list, list):
        counts_list = [counts_list]

    # Copy counts
    counts_list = [c.copy() for c in counts_list]

    # Check counts
    # (exclude_zeros is hard-coded to True here; the caller's value is only
    # applied by the final check_counts call at the end of the function.)
    counts_list = check_counts(counts_list,
                               lengths=lengths,
                               ploidy=ploidy,
                               exclude_zeros=True)

    # Determine ambiguity
    # Built as a list of (label, matrix) pairs first, so that duplicate labels
    # can be detected before the pairs are collapsed into a dict.
    nbeads = lengths.sum() * ploidy
    counts_dict = [('haploid' if ploidy == 1 else {
        1: 'ambig',
        1.5: 'pa',
        2: 'ua'
    }[sum(c.shape) / nbeads], c) for c in counts_list]
    if len(counts_dict) != len(dict(counts_dict)):
        raise ValueError(
            "Can't input multiple counts matrices of the same"
            " type. Inputs (%d) = %s" %
            (len(counts_dict), ', '.join([x[0] for x in counts_dict])))
    counts_dict = dict(counts_dict)

    # Reduce resolution
    lengths_lowres = lengths
    for counts_type, counts in counts_dict.items():
        if multiscale_factor != 1:
            lengths_lowres = decrease_lengths_res(
                lengths, multiscale_factor=multiscale_factor)
            counts = decrease_counts_res(counts,
                                         multiscale_factor=multiscale_factor,
                                         lengths=lengths,
                                         ploidy=ploidy)
            counts_dict[counts_type] = counts

    # Optionally filter counts
    if filter_threshold is None:
        filter_threshold = 0
    if filter_threshold and len(counts_list) > 1:
        # If there are multiple counts matrices, filter them together.
        # Counts will be ambiguated for deciding which beads to remove.
        # For diploid, any beads that are filtered out will be removed from both
        # homologs.
        if verbose:
            print(
                "FILTERING LOW COUNTS: manually filtering all counts together"
                " by %g" % filter_threshold,
                flush=True)
        all_counts_ambiguated = ambiguate_counts(list(counts_dict.values()),
                                                 lengths=lengths_lowres,
                                                 ploidy=ploidy,
                                                 exclude_zeros=True)
        # Number of beads already flagged before filtering; subtracted below
        # so that only newly removed beads are reported.
        initial_zero_beads = find_beads_to_remove(all_counts_ambiguated,
                                                  lengths_lowres.sum()).sum()
        all_counts_filtered = filter_low_counts(
            sparse.coo_matrix(all_counts_ambiguated),
            sparsity=False,
            percentage=filter_threshold +
            _percent_nan_beads(all_counts_ambiguated)).tocoo()
        torm = find_beads_to_remove(all_counts_filtered, lengths_lowres.sum())
        if verbose:
            print(' removing %d beads' % (torm.sum() - initial_zero_beads),
                  flush=True)
        for counts_type, counts in counts_dict.items():
            if sparse.issparse(counts):
                counts = counts.toarray()
            # The mask is tiled so it spans an axis whose length is a multiple
            # of the mask length; flagged rows and columns are then zeroed.
            counts[np.tile(torm, int(counts.shape[0] / torm.shape[0])), :] = 0.
            counts[:, np.tile(torm, int(counts.shape[1] / torm.shape[0]))] = 0.
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts
    elif filter_threshold:
        # If there is just one counts matrix, filter the full, non-ambiguated
        # counts matrix.
        # For diploid unambiguous or partially ambiguous counts, it is possible
        # that a bead will be filtered out on one homolog but not another.
        # NOTE: individual_counts_torms accumulates the union of removal masks
        # but is not read again within this function.
        individual_counts_torms = np.full((lengths_lowres.sum(), ), False)
        for counts_type, counts in counts_dict.items():
            if verbose:
                print(
                    'FILTERING LOW COUNTS: manually filtering %s counts by %g'
                    % (counts_type.upper(), filter_threshold),
                    flush=True)
            initial_zero_beads = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum()).sum()
            if counts_type == 'pa':
                if sparse.issparse(counts):
                    counts = counts.toarray()
                counts_filtered = np.zeros_like(counts)
                # Split the matrix into its first min(shape) rows and the
                # remaining rows, and take the strict upper triangle of each
                # part and of its transpose. (homo1/homo2 presumably stand
                # for the two homologs - TODO confirm.)
                homo1_upper = np.triu(counts[:min(counts.shape), :], 1)
                homo1_lower = np.triu(counts[:min(counts.shape), :].T, 1)
                homo2_upper = np.triu(counts[min(counts.shape):, :], 1)
                homo2_lower = np.triu(counts[min(counts.shape):, :].T, 1)
                # Each triangle is filtered on its own, and the results are
                # summed back into place (the "lower" results transposed back).
                counts_filtered[:min(counts.shape), :] += filter_low_counts(
                    sparse.coo_matrix(homo1_upper),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo1_upper)).toarray()
                counts_filtered[:min(counts.shape), :] += filter_low_counts(
                    sparse.coo_matrix(homo1_lower),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo1_lower)).toarray().T
                counts_filtered[min(counts.shape):, :] += filter_low_counts(
                    sparse.coo_matrix(homo2_upper),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo2_upper)).toarray()
                counts_filtered[min(counts.shape):, :] += filter_low_counts(
                    sparse.coo_matrix(homo2_lower),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo2_lower)).toarray().T
                counts = counts_filtered
            else:
                counts = filter_low_counts(sparse.coo_matrix(counts),
                                           sparsity=False,
                                           percentage=filter_threshold +
                                           _percent_nan_beads(counts)).tocoo()
            torm = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum())
            if verbose:
                print(' removing %d beads' % (torm.sum() - initial_zero_beads),
                      flush=True)
            individual_counts_torms = individual_counts_torms | torm
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts

    # Optionally normalize counts
    bias = None
    if normalize:
        if verbose:
            print('COMPUTING BIAS: all counts together', flush=True)
        # With output_bias=True the result is indexed with [1] to take the
        # bias, which is then flattened to one dimension.
        bias = ICE_normalization(ambiguate_counts(list(counts_dict.values()),
                                                  lengths=lengths_lowres,
                                                  ploidy=ploidy,
                                                  exclude_zeros=True),
                                 max_iter=300,
                                 output_bias=True)[1].flatten()
        # In each counts matrix, zero out counts for which bias is NaN
        for counts_type, counts in counts_dict.items():
            initial_zero_beads = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum()).sum()
            if sparse.issparse(counts):
                counts = counts.toarray()
            # The NaN-bias mask is tiled to span each axis, as for the
            # filtering mask above.
            counts[np.tile(np.isnan(bias),
                           int(counts.shape[0] / bias.shape[0])), :] = 0.
            counts[:,
                   np.tile(np.isnan(bias),
                           int(counts.shape[1] / bias.shape[0]))] = 0.
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts
            torm = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum())
            if verbose and torm.sum() - initial_zero_beads > 0:
                print(' removing %d additional beads from %s' %
                      (torm.sum() - initial_zero_beads, counts_type),
                      flush=True)

    # Final validation; this is the only place the caller's exclude_zeros
    # value is used.
    output_counts = check_counts(list(counts_dict.values()),
                                 lengths=lengths_lowres,
                                 ploidy=ploidy,
                                 exclude_zeros=exclude_zeros)
    return output_counts, bias
- filtering of the smallest x% **interacting** rows and columns """ import matplotlib.pyplot as plt from matplotlib import colors from iced import datasets from iced import filter # Loading a sample dataset counts, lengths = datasets.load_sample_yeast() fig, axes = plt.subplots(ncols=3, figsize=(12, 4)) counts_1 = filter.filter_low_counts(counts, lengths=lengths, percentage=0.04) counts_2 = filter.filter_low_counts(counts, lengths=lengths, percentage=0.04, sparsity=False) counts_3 = filter.filter_low_counts(counts, lengths=lengths, percentage=0.04, sparsity=False, remove_all_zeros_loci=True) # Plotting the results using matplotlib chromosomes = ["I", "II", "III", "IV", "V", "VI"] for ax, c in zip(axes, [counts_1, counts_2, counts_3]): ax.imshow(c, cmap="Blues", norm=colors.SymLogNorm(1), origin="bottom", extent=(0, len(counts), 0, len(counts)))