Example #1
def test_ICE_normalization_cancer():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))
    X = X + X.T
    profile = np.ones(n)
    profile[:10] = 0
    profile[50:] = 2
    normed_X, bias = ICE_normalization(X,
                                       eps=1e-10,
                                       counts_profile=profile,
                                       output_bias=True)
    assert not np.all(np.isnan(normed_X))

    normed_X[np.isnan(normed_X)] = 0
    mask = np.isnan(bias).flatten()
    bias[np.isnan(bias)] = 1
    normed_from_bias_X = X / (bias.T * bias)
    normed_from_bias_X[mask] = 0
    normed_from_bias_X[:, mask] = 0
    assert_array_almost_equal(normed_X, normed_from_bias_X, 6)
    inferred_profile = normed_X.sum(axis=0)
    inferred_profile /= inferred_profile.max()
    assert_array_almost_equal(inferred_profile, profile / profile.max())

    # Do the same for sparse matrices
    normed_X = ICE_normalization(sparse.coo_matrix(X),
                                 eps=1e-10,
                                 counts_profile=profile)
Example #2
def test_ICE_normalization():
    n = 100
    X = np.random.random((n, n))
    X = X + X.T
    normed_X = ICE_normalization(X, eps=1e-10, max_iter=1000000)
    normed = normed_X.sum(axis=1)
    assert_array_almost_equal(normed / normed.mean(), np.ones((len(X), )),
                              decimal=0)
Example #3
def test_ICE_normalization():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))
    X = X + X.T
    normed_X = ICE_normalization(X, eps=1e-10, max_iter=1000000)
    normed = normed_X.sum(axis=1)
    assert_array_almost_equal(normed / normed.mean(), np.ones((len(X), )),
                              decimal=0)
Example #4
def gather_high_low_cool(
        cooler_file='Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool',
        path='./data/raw/',
        chromosome='22',
        scale=4,
        output_path='./experiment/evaluation/'):
    file = os.path.join(path, cooler_file)
    cool_hic = cooler.Cooler(file)
    resolution = cool_hic.binsize
    mat = cool_hic.matrix(balance=True).fetch('chr' + chromosome)
    high_hic, idx = remove_zeros(
        mat)  # idx: boolean mask; its length is not changed/shrunk
    bool_idx = np.array(idx).flatten()
    num_idx = np.array(np.where(idx)).flatten()
    low_hic = sampling_hic(high_hic, scale**2, fix_seed=True)
    print('high hic shape: {}.'.format(high_hic.shape), end=' ')
    print('low hic shape: {}.'.format(low_hic.shape))

    b = {
        'chrom': ['chr{}'.format(chromosome)] * len(bool_idx),
        'start': resolution * np.arange(len(bool_idx)),
        'end': resolution * (np.arange(1, (len(bool_idx) + 1))),
        'weight': 1.0 * bool_idx
    }
    bins = pd.DataFrame(data=b)

    high_hic = ICE_normalization(high_hic)
    low_hic = ICE_normalization(low_hic)

    high_hic = triu(high_hic, format='coo')
    low_hic = triu(low_hic, format='coo')

    output_path = os.path.join(output_path, 'chr{}'.format(chromosome))
    os.makedirs(output_path, exist_ok=True)

    outfile = 'high_chr{}.cool'.format(chromosome)
    print('saving file {}'.format(os.path.join(output_path, outfile)))
    uri = os.path.join(output_path, outfile)
    p = {
        'bin1_id': num_idx[high_hic.row],
        'bin2_id': num_idx[high_hic.col],
        'count': high_hic.data
    }
    pixels = pd.DataFrame(data=p)
    cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)

    outfile = 'low_chr{}.cool'.format(chromosome)
    print('saving file {}'.format(os.path.join(output_path, outfile)))
    uri = os.path.join(output_path, outfile)
    p = {
        'bin1_id': num_idx[low_hic.row],
        'bin2_id': num_idx[low_hic.col],
        'count': low_hic.data
    }
    pixels = pd.DataFrame(data=p)
    cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)
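A hedged usage sketch for the function above: the file name and paths are simply the defaults from the signature and must exist locally for the call to work.

# Writes high_chr22.cool and low_chr22.cool under ./experiment/evaluation/chr22/
gather_high_low_cool(
    cooler_file='Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool',
    path='./data/raw/',
    chromosome='22',
    scale=4,
    output_path='./experiment/evaluation/')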
Example #5
def test_sparse_ICE_normalization():
    n = 100
    X = np.random.random((n, n))
    thres = (np.random.random((n, n)) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.csr_matrix(X)
    true_normed_X = ICE_normalization(X, eps=1e-10, max_iter=10)
    normed_X = ICE_normalization(sparse_X, eps=1e-10, max_iter=10)
    assert_array_almost_equal(X, sparse_X.todense())
    assert_array_almost_equal(true_normed_X, np.array(normed_X.todense()))
Example #6
def test_sparse_ICE_normalization():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))

    thres = (random_state.rand(n, n) > 0.5).astype(bool)

    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.csr_matrix(X)
    true_normed_X = ICE_normalization(X, eps=1e-10, max_iter=10)
    normed_X = ICE_normalization(sparse_X, eps=1e-10, max_iter=10)
    assert_array_almost_equal(X, sparse_X.todense())
    assert_array_almost_equal(true_normed_X, np.array(normed_X.todense()))
Example #7
def test_sparse_ICE_normalization_triu():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))

    thres = (random_state.rand(n, n) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.triu(X)
    true_normed_X = ICE_normalization(X, eps=1e-10, max_iter=10)
    true_normed_X = np.triu(true_normed_X)
    X = np.triu(X)
    normed_X = ICE_normalization(sparse_X, eps=1e-10, max_iter=10)
    assert_array_almost_equal(X, sparse_X.todense())
    # The sparse and dense versions are equal up to a constant factor.

    normed_X *= true_normed_X.mean() / normed_X.mean()
    assert_array_almost_equal(true_normed_X, np.array(normed_X.todense()))
Example #8
def test_sparse_ICE_normalization_triu():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))

    thres = (random_state.rand(n, n) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.triu(X)
    true_normed_X, true_biases = ICE_normalization(X,
                                                   eps=1e-10,
                                                   max_iter=10,
                                                   output_bias=True)
    true_normed_X = np.triu(true_normed_X)

    normed_X_sparse, biases_sparse = ICE_normalization(sparse_X,
                                                       eps=1e-10,
                                                       max_iter=100,
                                                       output_bias=True)
    normed_X_dense, biases_dense = ICE_normalization(np.triu(X),
                                                     eps=1e-10,
                                                     max_iter=100,
                                                     output_bias=True)

    # The sparse and dense versions are equal up to a constant factor.
    assert_array_almost_equal(normed_X_dense,
                              np.array(normed_X_sparse.toarray()))

    normed_X_sparse *= true_normed_X.mean() / normed_X_sparse.mean()
    normed_X_dense *= true_normed_X.mean() / normed_X_dense.mean()

    assert_array_almost_equal(true_normed_X,
                              np.array(normed_X_sparse.todense()))
    assert_array_almost_equal(true_normed_X, normed_X_dense)

    total_counts = 5000
    normed_X = ICE_normalization(sparse_X,
                                 eps=1e-10,
                                 total_counts=total_counts)
    # Compare with ==; a bare assert on a pytest.approx object is always truthy.
    assert normed_X.sum() == pytest.approx(total_counts)
Example #9
def test_sparse_ICE_normalization_triu():
    n = 100
    random_state = np.random.RandomState(seed=42)
    X = random_state.randint(0, 100, size=(n, n))

    thres = (random_state.rand(n, n) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.triu(X)
    true_normed_X, true_biases = ICE_normalization(
        X, eps=1e-10, max_iter=10, output_bias=True)
    true_normed_X = np.triu(true_normed_X)

    normed_X_sparse, biases_sparse = ICE_normalization(
        sparse_X, eps=1e-10, max_iter=100,
        output_bias=True)
    normed_X_dense, biases_dense = ICE_normalization(
        np.triu(X), eps=1e-10, max_iter=100,
        output_bias=True)

    # The sparse and dense versions are equal up to a constant factor.
    assert_array_almost_equal(normed_X_dense,
                              np.array(normed_X_sparse.toarray()))

    normed_X_sparse *= true_normed_X.mean() / normed_X_sparse.mean()
    normed_X_dense *= true_normed_X.mean() / normed_X_dense.mean()

    assert_array_almost_equal(true_normed_X,
                              np.array(normed_X_sparse.todense()))
    assert_array_almost_equal(true_normed_X, normed_X_dense)

    total_counts = 5000
    normed_X = ICE_normalization(sparse_X, eps=1e-10,
                                 total_counts=total_counts)
    assert_almost_equal(normed_X.sum(), total_counts)
Example #10
def _prep_counts(counts_list,
                 lengths,
                 ploidy=1,
                 multiscale_factor=1,
                 normalize=True,
                 filter_threshold=0.04,
                 exclude_zeros=True,
                 verbose=True):
    """Copy counts, check matrix, reduce resolution, filter, and compute bias.
    """

    if not isinstance(counts_list, list):
        counts_list = [counts_list]

    # Copy counts
    counts_list = [c.copy() for c in counts_list]

    # Check counts
    counts_list = check_counts(counts_list,
                               lengths=lengths,
                               ploidy=ploidy,
                               exclude_zeros=True)

    # Determine ambiguity
    nbeads = lengths.sum() * ploidy
    counts_dict = [('haploid' if ploidy == 1 else {
        1: 'ambig',
        1.5: 'pa',
        2: 'ua'
    }[sum(c.shape) / nbeads], c) for c in counts_list]
    if len(counts_dict) != len(dict(counts_dict)):
        raise ValueError(
            "Can't input multiple counts matrices of the same"
            " type. Inputs (%d) = %s" %
            (len(counts_dict), ', '.join([x[0] for x in counts_dict])))
    counts_dict = dict(counts_dict)

    # Reduce resolution
    lengths_lowres = lengths
    for counts_type, counts in counts_dict.items():
        if multiscale_factor != 1:
            lengths_lowres = decrease_lengths_res(
                lengths, multiscale_factor=multiscale_factor)
            counts = decrease_counts_res(counts,
                                         multiscale_factor=multiscale_factor,
                                         lengths=lengths,
                                         ploidy=ploidy)
            counts_dict[counts_type] = counts

    # Optionally filter counts
    if filter_threshold is None:
        filter_threshold = 0
    if filter_threshold and len(counts_list) > 1:
        # If there are multiple counts matrices, filter them together.
        # Counts will be ambiguated for deciding which beads to remove.
        # For diploid, any beads that are filtered out will be removed from both
        # homologs.
        if verbose:
            print(
                "FILTERING LOW COUNTS: manually filtering all counts together"
                " by %g" % filter_threshold,
                flush=True)
        all_counts_ambiguated = ambiguate_counts(list(counts_dict.values()),
                                                 lengths=lengths_lowres,
                                                 ploidy=ploidy,
                                                 exclude_zeros=True)
        initial_zero_beads = find_beads_to_remove(all_counts_ambiguated,
                                                  lengths_lowres.sum()).sum()
        all_counts_filtered = filter_low_counts(
            sparse.coo_matrix(all_counts_ambiguated),
            sparsity=False,
            percentage=filter_threshold +
            _percent_nan_beads(all_counts_ambiguated)).tocoo()
        torm = find_beads_to_remove(all_counts_filtered, lengths_lowres.sum())
        if verbose:
            print('                      removing %d beads' %
                  (torm.sum() - initial_zero_beads),
                  flush=True)
        for counts_type, counts in counts_dict.items():
            if sparse.issparse(counts):
                counts = counts.toarray()
            counts[np.tile(torm, int(counts.shape[0] / torm.shape[0])), :] = 0.
            counts[:, np.tile(torm, int(counts.shape[1] / torm.shape[0]))] = 0.
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts
    elif filter_threshold:
        # If there is just one counts matrix, filter the full, non-ambiguated
        # counts matrix.
        # For diploid unambiguous or partially ambiguous counts, it is possible
        # that a bead will be filtered out on one homolog but not another.
        individual_counts_torms = np.full((lengths_lowres.sum(), ), False)
        for counts_type, counts in counts_dict.items():
            if verbose:
                print(
                    'FILTERING LOW COUNTS: manually filtering %s counts by %g'
                    % (counts_type.upper(), filter_threshold),
                    flush=True)
            initial_zero_beads = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum()).sum()
            if counts_type == 'pa':
                if sparse.issparse(counts):
                    counts = counts.toarray()
                counts_filtered = np.zeros_like(counts)
                homo1_upper = np.triu(counts[:min(counts.shape), :], 1)
                homo1_lower = np.triu(counts[:min(counts.shape), :].T, 1)
                homo2_upper = np.triu(counts[min(counts.shape):, :], 1)
                homo2_lower = np.triu(counts[min(counts.shape):, :].T, 1)
                counts_filtered[:min(counts.shape), :] += filter_low_counts(
                    sparse.coo_matrix(homo1_upper),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo1_upper)).toarray()
                counts_filtered[:min(counts.shape), :] += filter_low_counts(
                    sparse.coo_matrix(homo1_lower),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo1_lower)).toarray().T
                counts_filtered[min(counts.shape):, :] += filter_low_counts(
                    sparse.coo_matrix(homo2_upper),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo2_upper)).toarray()
                counts_filtered[min(counts.shape):, :] += filter_low_counts(
                    sparse.coo_matrix(homo2_lower),
                    sparsity=False,
                    percentage=filter_threshold +
                    _percent_nan_beads(homo2_lower)).toarray().T
                counts = counts_filtered
            else:
                counts = filter_low_counts(sparse.coo_matrix(counts),
                                           sparsity=False,
                                           percentage=filter_threshold +
                                           _percent_nan_beads(counts)).tocoo()
            torm = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum())
            if verbose:
                print('                      removing %d beads' %
                      (torm.sum() - initial_zero_beads),
                      flush=True)
            individual_counts_torms = individual_counts_torms | torm
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts

    # Optionally normalize counts
    bias = None
    if normalize:
        if verbose:
            print('COMPUTING BIAS: all counts together', flush=True)
        bias = ICE_normalization(ambiguate_counts(list(counts_dict.values()),
                                                  lengths=lengths_lowres,
                                                  ploidy=ploidy,
                                                  exclude_zeros=True),
                                 max_iter=300,
                                 output_bias=True)[1].flatten()
        # In each counts matrix, zero out counts for which bias is NaN
        for counts_type, counts in counts_dict.items():
            initial_zero_beads = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum()).sum()
            if sparse.issparse(counts):
                counts = counts.toarray()
            counts[np.tile(np.isnan(bias), int(counts.shape[0] /
                                               bias.shape[0])), :] = 0.
            counts[:,
                   np.tile(np.isnan(bias), int(counts.shape[1] /
                                               bias.shape[0]))] = 0.
            counts = sparse.coo_matrix(counts)
            counts_dict[counts_type] = counts
            torm = find_beads_to_remove(
                ambiguate_counts(counts, lengths=lengths_lowres,
                                 ploidy=ploidy), lengths_lowres.sum())
            if verbose and torm.sum() - initial_zero_beads > 0:
                print('                removing %d additional beads from %s' %
                      (torm.sum() - initial_zero_beads, counts_type),
                      flush=True)

    output_counts = check_counts(list(counts_dict.values()),
                                 lengths=lengths_lowres,
                                 ploidy=ploidy,
                                 exclude_zeros=exclude_zeros)
    return output_counts, bias
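A hedged usage sketch for _prep_counts: `counts` and `lengths` below are placeholders for your own data (a scipy COO counts matrix, or a list of them, and an array of beads per chromosome), and the helpers the function calls (check_counts, ambiguate_counts, filter_low_counts, ICE_normalization, ...) must be importable from the surrounding module.

# counts: scipy.sparse.coo_matrix (or a list of them); lengths: np.ndarray of
# beads per chromosome. Both are placeholder names, not defined here.
output_counts, bias = _prep_counts(
    [counts], lengths=lengths, ploidy=1, multiscale_factor=1,
    normalize=True, filter_threshold=0.04, exclude_zeros=True, verbose=True)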
Example #11
def generate_cool(input_path='./experiment/tad_boundary',
                  chromosomes=['22', '21', '20', '19', 'X'],
                  resolution=10000,
                  genomic_distance=2000000):
    k = np.ceil(genomic_distance / resolution).astype(int)
    for chro in chromosomes:
        path = os.path.join(input_path, 'chr{}'.format(chro))
        hicfile = 'high_chr{}.cool'.format(chro)
        cool_hic = cooler.Cooler(os.path.join(path, hicfile))
        mat = cool_hic.matrix(balance=True).fetch('chr' + chro)
        bins = cool_hic.bins().fetch('chr' + chro)
        num_idx = np.array(np.where(np.array(bins['weight']))).flatten()

        high_mat = mat[num_idx, :]
        high_mat = high_mat[:, num_idx]
        high_mat = filter_diag_boundary(high_mat, diag_k=0, boundary_k=k)

        files = [f for f in os.listdir(path) if '.npz' in f]
        for file in files:
            if 'high' in file or 'low' in file:
                continue
            print(file)
            data = np.load(os.path.join(path, file), allow_pickle=True)
            mat = data['hic']
            namelist = file.split('_')
            if len(namelist) == 3:
                name = namelist[0]
            else:
                model = namelist[1]
                win_len = namelist[3]
                if model == 'hicgan':
                    # true_hic = np.log1p(true_hic)
                    mat = np.expm1(mat)
                elif model == 'deephic':
                    minv = high_mat.min()
                    maxv = high_mat.max()
                    # true_hic = np.divide((true_hic-minv), (maxv-minv), dtype=float,out=np.zeros_like(true_hic), where=(maxv-minv) != 0)
                    mat = mat * (maxv - minv) + minv
                    mat = (mat + np.transpose(mat)) / 2
                elif model == 'hicsr':
                    log_mat = np.log2(high_mat + 1)
                    # true_hic = 2*(log_mat/np.max(log_mat)) - 1
                    maxv = np.max(log_mat)
                    log_predict_hic = (mat + 1) / 2 * maxv
                    mat = np.expm1(log_predict_hic)
                # elif model == 'ours':
                #     scn, dh = scn_normalization(high_mat, max_iter=3000)
                #     mat = scn_recover(mat, dh)
                name = '_'.join([model, win_len])
            mat = filter_diag_boundary(mat, diag_k=0, boundary_k=k)
            # mat = mat[600:900, 600:900]
            # print(mat)
            mat = ICE_normalization(mat)
            print('mat shape: {}'.format(mat.shape))
            uri = os.path.join(path, '{}_chr{}.cool'.format(name, chro))
            mat = triu(mat, format='coo')
            # p = {'bin1_id': mat.row, 'bin2_id': mat.col, 'count': mat.data}
            p = {
                'bin1_id': num_idx[mat.row],
                'bin2_id': num_idx[mat.col],
                'count': mat.data
            }
            pixels = pd.DataFrame(data=p)
            cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)
        with open(os.path.join(path, 'track.ini'), 'w') as f:
            f.writelines(track)
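A hedged usage sketch for generate_cool: it assumes per-chromosome directories produced by the earlier preprocessing step (e.g. ./experiment/tad_boundary/chr22/high_chr22.cool), the model-output .npz files the loop expects, and a module-level `track` string for the track.ini file; none of these are created by this call itself.

generate_cool(input_path='./experiment/tad_boundary',
              chromosomes=['22'],
              resolution=10000,
              genomic_distance=2000000)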