def test_ICE_normalization_cancer():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))
    X = X + X.T

    # counts_profile: per-bin profile (here 0, 1, and 2) used by the
    # cancer-aware normalization.
    profile = np.ones(n)
    profile[:10] = 0
    profile[50:] = 2
    normed_X, bias = ICE_normalization(
        X, eps=1e-10, counts_profile=profile, output_bias=True)
    assert not np.all(np.isnan(normed_X))

    normed_X[np.isnan(normed_X)] = 0
    mask = np.isnan(bias).flatten()
    bias[np.isnan(bias)] = 1
    normed_from_bias_X = X / (bias.T * bias)
    normed_from_bias_X[mask] = 0
    normed_from_bias_X[:, mask] = 0
    assert_array_almost_equal(normed_X, normed_from_bias_X, 6)

    inferred_profile = normed_X.sum(axis=0)
    inferred_profile /= inferred_profile.max()
    assert_array_almost_equal(inferred_profile, profile / profile.max())

    # Do the same for sparse matrices
    normed_X = ICE_normalization(
        sparse.coo_matrix(X), eps=1e-10, counts_profile=profile)
def test_ICE_normalization():
    n = 100
    X = np.random.random((n, n))
    X = X + X.T
    normed_X = ICE_normalization(X, eps=1e-10, max_iter=1000000)
    normed = normed_X.sum(axis=1)
    assert_array_almost_equal(normed / normed.mean(),
                              np.ones((len(X), )),
                              decimal=0)
def test_ICE_normalization():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))
    X = X + X.T
    normed_X = ICE_normalization(X, eps=1e-10, max_iter=1000000)
    normed = normed_X.sum(axis=1)
    assert_array_almost_equal(normed / normed.mean(),
                              np.ones((len(X), )),
                              decimal=0)
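# Hedged usage sketch (not one of the original tests): the property the two
# tests above assert is that ICE equalizes per-bin coverage. The helper name
# below is hypothetical; it assumes numpy is imported as np and that
# ICE_normalization comes from iced.normalization, as in the tests.
def _ice_row_sums_demo():
    rng = np.random.RandomState(0)
    demo = rng.randint(1, 50, size=(20, 20)).astype(float)
    demo = demo + demo.T                      # Hi-C matrices are symmetric
    demo_normed = ICE_normalization(demo, eps=1e-10)
    # After ICE, every row sums to (approximately) the same value.
    return demo_normed.sum(axis=1)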
def gather_high_low_cool(
        cooler_file='Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool',
        path='./data/raw/',
        chromosome='22',
        scale=4,
        output_path='./experiment/evaluation/'):
    file = os.path.join(path, cooler_file)
    cool_hic = cooler.Cooler(file)
    resolution = cool_hic.binsize
    mat = cool_hic.matrix(balance=True).fetch('chr' + chromosome)
    # idx: boolean mask over bins; its length is not changed (not shrunk)
    high_hic, idx = remove_zeros(mat)
    bool_idx = np.array(idx).flatten()
    num_idx = np.array(np.where(idx)).flatten()
    low_hic = sampling_hic(high_hic, scale**2, fix_seed=True)
    print('high hic shape: {}.'.format(high_hic.shape), end=' ')
    print('low hic shape: {}.'.format(low_hic.shape))

    b = {'chrom': ['chr{}'.format(chromosome)] * len(bool_idx),
         'start': resolution * np.arange(len(bool_idx)),
         'end': resolution * np.arange(1, len(bool_idx) + 1),
         'weight': 1.0 * bool_idx}
    bins = pd.DataFrame(data=b)

    high_hic = ICE_normalization(high_hic)
    low_hic = ICE_normalization(low_hic)
    high_hic = triu(high_hic, format='coo')
    low_hic = triu(low_hic, format='coo')

    output_path = os.path.join(output_path, 'chr{}'.format(chromosome))
    os.makedirs(output_path, exist_ok=True)

    outfile = 'high_chr{}.cool'.format(chromosome)
    print('saving file {}'.format(os.path.join(output_path, outfile)))
    uri = os.path.join(output_path, outfile)
    p = {'bin1_id': num_idx[high_hic.row],
         'bin2_id': num_idx[high_hic.col],
         'count': high_hic.data}
    pixels = pd.DataFrame(data=p)
    cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)

    outfile = 'low_chr{}.cool'.format(chromosome)
    print('saving file {}'.format(os.path.join(output_path, outfile)))
    uri = os.path.join(output_path, outfile)
    p = {'bin1_id': num_idx[low_hic.row],
         'bin2_id': num_idx[low_hic.col],
         'count': low_hic.data}
    pixels = pd.DataFrame(data=p)
    cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)
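# Hypothetical invocation sketch for gather_high_low_cool. The helper name is
# an assumption, and the file name and directory layout are just the defaults
# above; the input .cool file must already exist on disk for this to run.
def _gather_example():
    # Writes high_chr22.cool and the downsampled low_chr22.cool (scale**2 is
    # passed to sampling_hic) under ./experiment/evaluation/chr22/.
    gather_high_low_cool(
        cooler_file='Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool',
        path='./data/raw/',
        chromosome='22',
        scale=4,
        output_path='./experiment/evaluation/')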
def test_sparse_ICE_normalization():
    n = 100
    X = np.random.random((n, n))
    thres = (np.random.random((n, n)) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.csr_matrix(X)
    true_normed_X = ICE_normalization(X, eps=1e-10, max_iter=10)
    normed_X = ICE_normalization(sparse_X, eps=1e-10, max_iter=10)
    assert_array_almost_equal(X, sparse_X.todense())
    assert_array_almost_equal(true_normed_X, np.array(normed_X.todense()))
def test_sparse_ICE_normalization():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))
    thres = (random_state.rand(n, n) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.csr_matrix(X)
    true_normed_X = ICE_normalization(X, eps=1e-10, max_iter=10)
    normed_X = ICE_normalization(sparse_X, eps=1e-10, max_iter=10)
    assert_array_almost_equal(X, sparse_X.todense())
    assert_array_almost_equal(true_normed_X, np.array(normed_X.todense()))
def test_sparse_ICE_normalization_triu():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))
    thres = (random_state.rand(n, n) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.triu(X)
    true_normed_X = ICE_normalization(X, eps=1e-10, max_iter=10)
    true_normed_X = np.triu(true_normed_X)
    X = np.triu(X)
    normed_X = ICE_normalization(sparse_X, eps=1e-10, max_iter=10)
    assert_array_almost_equal(X, sparse_X.todense())

    # The sparse and dense versions are going to be equal up to a constant
    # factor
    normed_X *= true_normed_X.mean() / normed_X.mean()
    assert_array_almost_equal(true_normed_X, np.array(normed_X.todense()))
def test_sparse_ICE_normalization_triu():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))
    thres = (random_state.rand(n, n) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.triu(X)
    true_normed_X, true_biases = ICE_normalization(
        X, eps=1e-10, max_iter=10, output_bias=True)
    true_normed_X = np.triu(true_normed_X)

    normed_X_sparse, biases_sparse = ICE_normalization(
        sparse_X, eps=1e-10, max_iter=100, output_bias=True)
    normed_X_dense, biases_dense = ICE_normalization(
        np.triu(X), eps=1e-10, max_iter=100, output_bias=True)

    # The sparse and dense versions are going to be equal up to a constant
    # factor
    assert_array_almost_equal(normed_X_dense,
                              np.array(normed_X_sparse.toarray()))

    normed_X_sparse *= true_normed_X.mean() / normed_X_sparse.mean()
    normed_X_dense *= true_normed_X.mean() / normed_X_dense.mean()
    assert_array_almost_equal(true_normed_X,
                              np.array(normed_X_sparse.todense()))
    assert_array_almost_equal(true_normed_X, normed_X_dense)

    total_counts = 5000
    normed_X = ICE_normalization(sparse_X, eps=1e-10,
                                 total_counts=total_counts)
    assert normed_X.sum() == pytest.approx(total_counts)
def test_sparse_ICE_normalization_triu():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))
    thres = (random_state.rand(n, n) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.triu(X)
    true_normed_X, true_biases = ICE_normalization(
        X, eps=1e-10, max_iter=10, output_bias=True)
    true_normed_X = np.triu(true_normed_X)

    normed_X_sparse, biases_sparse = ICE_normalization(
        sparse_X, eps=1e-10, max_iter=100, output_bias=True)
    normed_X_dense, biases_dense = ICE_normalization(
        np.triu(X), eps=1e-10, max_iter=100, output_bias=True)

    # The sparse and dense versions are going to be equal up to a constant
    # factor
    assert_array_almost_equal(normed_X_dense,
                              np.array(normed_X_sparse.toarray()))

    normed_X_sparse *= true_normed_X.mean() / normed_X_sparse.mean()
    normed_X_dense *= true_normed_X.mean() / normed_X_dense.mean()
    assert_array_almost_equal(true_normed_X,
                              np.array(normed_X_sparse.todense()))
    assert_array_almost_equal(true_normed_X, normed_X_dense)

    total_counts = 5000
    normed_X = ICE_normalization(sparse_X, eps=1e-10,
                                 total_counts=total_counts)
    assert_almost_equal(normed_X.sum(), total_counts)
def _prep_counts(counts_list, lengths, ploidy=1, multiscale_factor=1,
                 normalize=True, filter_threshold=0.04, exclude_zeros=True,
                 verbose=True):
    """Copy counts, check matrix, reduce resolution, filter, and compute bias.
    """

    if not isinstance(counts_list, list):
        counts_list = [counts_list]

    # Copy counts
    counts_list = [c.copy() for c in counts_list]

    # Check counts
    counts_list = check_counts(
        counts_list, lengths=lengths, ploidy=ploidy, exclude_zeros=True)

    # Determine ambiguity
    nbeads = lengths.sum() * ploidy
    counts_dict = [('haploid' if ploidy == 1 else
                    {1: 'ambig', 1.5: 'pa', 2: 'ua'}[sum(c.shape) / nbeads],
                    c) for c in counts_list]
    if len(counts_dict) != len(dict(counts_dict)):
        raise ValueError(
            "Can't input multiple counts matrices of the same"
            " type. Inputs (%d) = %s" % (
                len(counts_dict), ', '.join([x[0] for x in counts_dict])))
    counts_dict = dict(counts_dict)

    # Reduce resolution
    lengths_lowres = lengths
    for counts_type, counts in counts_dict.items():
        if multiscale_factor != 1:
            lengths_lowres = decrease_lengths_res(
                lengths, multiscale_factor=multiscale_factor)
            counts = decrease_counts_res(
                counts, multiscale_factor=multiscale_factor, lengths=lengths,
                ploidy=ploidy)
            counts_dict[counts_type] = counts

    # Optionally filter counts
    if filter_threshold is None:
        filter_threshold = 0
    if filter_threshold and len(counts_list) > 1:
        # If there are multiple counts matrices, filter them together.
        # Counts will be ambiguated for deciding which beads to remove.
        # For diploid, any beads that are filtered out will be removed from
        # both homologs.
        if verbose:
            print("FILTERING LOW COUNTS: manually filtering all counts"
                  " together by %g" % filter_threshold, flush=True)
        all_counts_ambiguated = ambiguate_counts(
            list(counts_dict.values()), lengths=lengths_lowres, ploidy=ploidy,
            exclude_zeros=True)
        initial_zero_beads = find_beads_to_remove(
            all_counts_ambiguated, lengths_lowres.sum()).sum()
        all_counts_filtered = filter_low_counts(
            sparse.coo_matrix(all_counts_ambiguated), sparsity=False,
            percentage=filter_threshold + _percent_nan_beads(
                all_counts_ambiguated)).tocoo()
        torm = find_beads_to_remove(all_counts_filtered, lengths_lowres.sum())
        if verbose:
            print(' removing %d beads' % (torm.sum() - initial_zero_beads),
                  flush=True)
        for counts_type, counts in counts_dict.items():
            if sparse.issparse(counts):
                counts = counts.toarray()
            counts[np.tile(torm, int(counts.shape[0] / torm.shape[0])), :] = 0.
            counts[:, np.tile(torm, int(counts.shape[1] / torm.shape[0]))] = 0.
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts
    elif filter_threshold:
        # If there is just one counts matrix, filter the full, non-ambiguated
        # counts matrix.
        # For diploid unambiguous or partially ambiguous counts, it is
        # possible that a bead will be filtered out on one homolog but not
        # another.
        individual_counts_torms = np.full((lengths_lowres.sum(), ), False)
        for counts_type, counts in counts_dict.items():
            if verbose:
                print('FILTERING LOW COUNTS: manually filtering %s counts'
                      ' by %g' % (counts_type.upper(), filter_threshold),
                      flush=True)
            initial_zero_beads = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum()).sum()
            if counts_type == 'pa':
                if sparse.issparse(counts):
                    counts = counts.toarray()
                counts_filtered = np.zeros_like(counts)
                homo1_upper = np.triu(counts[:min(counts.shape), :], 1)
                homo1_lower = np.triu(counts[:min(counts.shape), :].T, 1)
                homo2_upper = np.triu(counts[min(counts.shape):, :], 1)
                homo2_lower = np.triu(counts[min(counts.shape):, :].T, 1)
                counts_filtered[:min(counts.shape), :] += filter_low_counts(
                    sparse.coo_matrix(homo1_upper), sparsity=False,
                    percentage=filter_threshold + _percent_nan_beads(
                        homo1_upper)).toarray()
                counts_filtered[:min(counts.shape), :] += filter_low_counts(
                    sparse.coo_matrix(homo1_lower), sparsity=False,
                    percentage=filter_threshold + _percent_nan_beads(
                        homo1_lower)).toarray().T
                counts_filtered[min(counts.shape):, :] += filter_low_counts(
                    sparse.coo_matrix(homo2_upper), sparsity=False,
                    percentage=filter_threshold + _percent_nan_beads(
                        homo2_upper)).toarray()
                counts_filtered[min(counts.shape):, :] += filter_low_counts(
                    sparse.coo_matrix(homo2_lower), sparsity=False,
                    percentage=filter_threshold + _percent_nan_beads(
                        homo2_lower)).toarray().T
                counts = counts_filtered
            else:
                counts = filter_low_counts(
                    sparse.coo_matrix(counts), sparsity=False,
                    percentage=filter_threshold + _percent_nan_beads(
                        counts)).tocoo()
            torm = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum())
            if verbose:
                print(' removing %d beads' % (
                    torm.sum() - initial_zero_beads), flush=True)
            individual_counts_torms = individual_counts_torms | torm
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts

    # Optionally normalize counts
    bias = None
    if normalize:
        if verbose:
            print('COMPUTING BIAS: all counts together', flush=True)
        bias = ICE_normalization(
            ambiguate_counts(list(counts_dict.values()),
                             lengths=lengths_lowres, ploidy=ploidy,
                             exclude_zeros=True),
            max_iter=300, output_bias=True)[1].flatten()
        # In each counts matrix, zero out counts for which bias is NaN
        for counts_type, counts in counts_dict.items():
            initial_zero_beads = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum()).sum()
            if sparse.issparse(counts):
                counts = counts.toarray()
            counts[np.tile(np.isnan(bias),
                           int(counts.shape[0] / bias.shape[0])), :] = 0.
            counts[:, np.tile(np.isnan(bias),
                              int(counts.shape[1] / bias.shape[0]))] = 0.
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts
            torm = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum())
            if verbose and torm.sum() - initial_zero_beads > 0:
                print(' removing %d additional beads from %s' % (
                    torm.sum() - initial_zero_beads, counts_type), flush=True)

    output_counts = check_counts(
        list(counts_dict.values()), lengths=lengths_lowres, ploidy=ploidy,
        exclude_zeros=exclude_zeros)
    return output_counts, bias
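# Minimal usage sketch for _prep_counts, under the assumption that the helper
# functions it relies on (check_counts, ambiguate_counts, filter_low_counts,
# find_beads_to_remove, ...) are importable from the surrounding package. The
# helper name and toy sizes below are hypothetical, not a definitive call.
def _prep_counts_example():
    lengths = np.array([30, 20])          # two chromosomes, 50 beads total
    rng = np.random.RandomState(0)
    raw = rng.randint(0, 20, size=(50, 50))
    raw = np.triu(raw + raw.T, 1)         # symmetric counts, upper triangle
    counts, bias = _prep_counts(
        [sparse.coo_matrix(raw)], lengths=lengths, ploidy=1,
        multiscale_factor=1, normalize=True, filter_threshold=0.04)
    return counts, bias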
def generate_cool(input_path='./experiment/tad_boundary',
                  chromosomes=['22', '21', '20', '19', 'X'],
                  resolution=10000,
                  genomic_distance=2000000):
    k = np.ceil(genomic_distance / resolution).astype(int)
    for chro in chromosomes:
        path = os.path.join(input_path, 'chr{}'.format(chro))
        hicfile = 'high_chr{}.cool'.format(chro)
        cool_hic = cooler.Cooler(os.path.join(path, hicfile))
        mat = cool_hic.matrix(balance=True).fetch('chr' + chro)
        bins = cool_hic.bins().fetch('chr' + chro)
        num_idx = np.array(np.where(np.array(bins['weight']))).flatten()

        high_mat = mat[num_idx, :]
        high_mat = high_mat[:, num_idx]
        high_mat = filter_diag_boundary(high_mat, diag_k=0, boundary_k=k)

        files = [f for f in os.listdir(path) if '.npz' in f]
        for file in files:
            if 'high' in file or 'low' in file:
                continue
            print(file)
            data = np.load(os.path.join(path, file), allow_pickle=True)
            mat = data['hic']
            namelist = file.split('_')
            if len(namelist) == 3:
                name = namelist[0]
            else:
                model = namelist[1]
                win_len = namelist[3]
                if model == 'hicgan':
                    # true_hic = np.log1p(true_hic)
                    mat = np.expm1(mat)
                elif model == 'deephic':
                    minv = high_mat.min()
                    maxv = high_mat.max()
                    # true_hic = np.divide((true_hic - minv), (maxv - minv),
                    #                      dtype=float,
                    #                      out=np.zeros_like(true_hic),
                    #                      where=(maxv - minv) != 0)
                    mat = mat * (maxv - minv) + minv
                    mat = (mat + np.transpose(mat)) / 2
                elif model == 'hicsr':
                    log_mat = np.log2(high_mat + 1)
                    # true_hic = 2 * (log_mat / np.max(log_mat)) - 1
                    maxv = np.max(log_mat)
                    log_predict_hic = (mat + 1) / 2 * maxv
                    mat = np.expm1(log_predict_hic)
                '''elif model == 'ours':
                    scn, dh = scn_normalization(high_mat, max_iter=3000)
                    mat = scn_recover(mat, dh)'''
                name = '_'.join([model, win_len])

            mat = filter_diag_boundary(mat, diag_k=0, boundary_k=k)
            # mat = mat[600:900, 600:900]
            # print(mat)
            mat = ICE_normalization(mat)
            print('mat shape: {}'.format(mat.shape))
            uri = os.path.join(path, '{}_chr{}.cool'.format(name, chro))
            mat = triu(mat, format='coo')
            # p = {'bin1_id': mat.row, 'bin2_id': mat.col, 'count': mat.data}
            p = {'bin1_id': num_idx[mat.row],
                 'bin2_id': num_idx[mat.col],
                 'count': mat.data}
            pixels = pd.DataFrame(data=p)
            cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)

        # 'track' is assumed to be a module-level template string defined
        # elsewhere in the project.
        with open(os.path.join(path, 'track.ini'), 'w') as f:
            f.writelines(track)
            f.close()
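# Hedged verification sketch: once generate_cool has written the per-model
# .cool files, they can be read back with cooler to confirm what was stored.
# The helper name is hypothetical and it assumes the same imports (cooler, os)
# as the functions above; the URI passed in must point at an existing file,
# e.g. one produced under ./experiment/tad_boundary/chr22/.
def _inspect_generated_cool(uri, chromosome='22'):
    c = cooler.Cooler(uri)
    mat = c.matrix(balance=False).fetch('chr' + chromosome)
    print('loaded {} with shape {}'.format(uri, mat.shape))
    return mat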