Esempio n. 1
0
    def fit_cut_intervals(cut_intervals):
        # check that the matrix has bins of same size
        # otherwise try to adjust the bins to
        # to match a regular binning
        if len(cut_intervals) <= 1:
            # do nothing if there is only one interval
            return cut_intervals
        chrom, start, end, extra = zip(*cut_intervals)

        median = int(np.median(np.diff(start)))
        diff = np.array(end) - np.array(start)
        # check if the bin size is homogeneous
        if len(np.flatnonzero(diff != median)) > (len(diff) * 0.01):
            # set the start position of a bin to the closest multiple
            # of the median
            def snap_nearest_multiple(start_x, m):
                resi = [-1 * (start_x % m), -start_x % m]
                return start_x + resi[np.argmin(np.abs(resi))]

            start = [snap_nearest_multiple(x, median) for x in start]
            end = [snap_nearest_multiple(x, median) for x in end]
            cut_intervals = zip(chrom, start, end, extra)
            log.info('[getCountsByDistance] Bin size is not '
                     'homogeneous, setting \n'
                     'the bin distance to the median: {}\n'.format(median))
        return cut_intervals
Esempio n. 2
0
def getRegion(args, ma):
    chrom = region_start = region_end = idx1 = start_pos1 = chrom2 = region_start2 = region_end2 = idx2 = start_pos2 = None
    chrom, region_start, region_end = translate_region(args.region)

    chrom = check_chrom_str_bytes(ma.interval_trees, chrom)
    # if type(next(iter(ma.interval_trees))) in [np.bytes_, bytes]:
    #     chrom = toBytes(chrom)

    if chrom not in list(ma.interval_trees):

        chrom = change_chrom_names(chrom)

        chrom = check_chrom_str_bytes(ma.interval_trees, chrom)

        # if type(next(iter(ma.interval_trees))) in [np.bytes_, bytes]:
        #     chrom = toBytes(chrom)

        if chrom not in list(ma.interval_trees):
            exit("Chromosome name {} in --region not in matrix".format(change_chrom_names(chrom)))

    args.region = [chrom, region_start, region_end]
    is_cooler = check_cooler(args.matrix)
    if is_cooler:
        idx1, start_pos1 = zip(*[(idx, x[1]) for idx, x in enumerate(ma.cut_intervals) if x[0] == chrom and
                                 ((x[1] >= region_start and x[2] < region_end) or
                                  (x[1] < region_end and x[2] < region_end and x[2] > region_start) or
                                  (x[1] > region_start and x[1] < region_end))])
    else:
        idx1, start_pos1 = zip(*[(idx, x[1]) for idx, x in enumerate(ma.cut_intervals) if x[0] == chrom and
                                 x[1] >= region_start and x[2] < region_end])
    if args.region2:
        chrom2, region_start2, region_end2 = translate_region(args.region2)
        chrom2 = check_chrom_str_bytes(ma.interval_trees, chrom2)

        # if type(next(iter(ma.interval_trees))) in [np.bytes_, bytes]:
        #     chrom2 = toBytes(chrom)
        if chrom2 not in list(ma.interval_trees):
            chrom2 = change_chrom_names(chrom2)
            chrom2 = check_chrom_str_bytes(ma.interval_trees, chrom2)

            # if type(next(iter(ma.interval_trees))) in [np.bytes_, bytes]:
            #     chrom2 = toBytes(chrom)
            if chrom2 not in list(ma.interval_trees):
                exit("Chromosome name {} in --region2 not in matrix".format(change_chrom_names(chrom2)))
        if is_cooler:
            idx2, start_pos2 = zip(*[(idx, x[1]) for idx, x in enumerate(ma.cut_intervals) if x[0] == chrom2 and
                                     ((x[1] >= region_start2 and x[2] < region_end2) or
                                      (x[1] < region_end2 and x[2] < region_end2 and x[2] > region_start2) or
                                      (x[1] > region_start2 and x[1] < region_end2))])
        else:
            idx2, start_pos2 = zip(*[(idx, x[1]) for idx, x in enumerate(ma.cut_intervals) if x[0] == chrom2 and
                                     x[1] >= region_start2 and x[2] < region_end2])
    else:
        idx2 = idx1
        chrom2 = chrom
        start_pos2 = start_pos1

    return chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2
Esempio n. 3
0
    def load(self):
        """
        Loads a matrix stored in h5 format
        :param matrix_filename:
        :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
        """
        log.debug('Load in h5 format')

        with tables.open_file(self.matrixFileName) as f:
            parts = {}
            for matrix_part in ('data', 'indices', 'indptr', 'shape'):
                parts[matrix_part] = getattr(f.root.matrix, matrix_part).read()

            matrix = csr_matrix(tuple([parts['data'], parts['indices'], parts['indptr']]),
                                shape=parts['shape'])
            # matrix = hiCMatrix.fillLowerTriangle(matrix)
            # get intervals
            intvals = {}
            for interval_part in ('chr_list', 'start_list', 'end_list', 'extra_list'):
                if toString(interval_part) == toString('chr_list'):
                    chrom_list = getattr(f.root.intervals, interval_part).read()
                    intvals[interval_part] = toString(chrom_list)
                else:
                    intvals[interval_part] = getattr(f.root.intervals, interval_part).read()

            cut_intervals = zip(intvals['chr_list'], intvals['start_list'], intvals['end_list'], intvals['extra_list'])
            assert len(cut_intervals) == matrix.shape[0], \
                "Error loading matrix. Length of bin intervals ({}) is different than the " \
                "size of the matrix ({})".format(len(cut_intervals), matrix.shape[0])

            # get nan_bins
            try:
                if hasattr(f.root, 'nan_bins'):
                    nan_bins = f.root.nan_bins.read()
                else:
                    nan_bins = np.array([])
            except Exception:
                nan_bins = np.array([])

            # get correction factors
            try:
                if hasattr(f.root, 'correction_factors'):
                    correction_factors = f.root.correction_factors.read()
                    assert len(correction_factors) == matrix.shape[0], \
                        "Error loading matrix. Length of correction factors does not" \
                        "match size of matrix"
                else:
                    correction_factors = None
            except Exception:
                correction_factors = None

            try:
                # get correction factors
                if hasattr(f.root, 'distance_counts'):
                    distance_counts = f.root.correction_factors.read()
                else:
                    distance_counts = None
            except Exception:
                distance_counts = None
            return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
def getRegion(args, ma):
    chrom = region_start = region_end = idx1 = start_pos1 = chrom2 = region_start2 = region_end2 = idx2 = start_pos2 = None
    chrom, region_start, region_end = translate_region(args.region)

    if type(next(iter(ma.interval_trees))) in [np.bytes_, bytes]:
        chrom = toBytes(chrom)

    if chrom not in list(ma.interval_trees):

        if type(next(iter(ma.interval_trees))) in [np.bytes_, bytes]:
            chrom = toBytes(chrom)

        if chrom not in list(ma.interval_trees):
            exit(
                "The contig/scaffold name '{}' given in --region is not part of the Hi-C matrix. "
                "Check spelling".format(chrom))

    args.region = [chrom, region_start, region_end]

    idx1, start_pos1 = zip(
        *[(idx, x[1]) for idx, x in enumerate(ma.cut_intervals)
          if x[0] == chrom and x[1] >= region_start and x[2] < region_end])
    idx2 = idx1
    chrom2 = chrom
    start_pos2 = start_pos1

    return chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2
Esempio n. 5
0
def test_load_cool_save_and_load_h5():
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool')

    outfile = NamedTemporaryFile(suffix='.h5', prefix='hicexplorer_test')
    hic.matrixFileHandler = None
    hic.save(pMatrixName=outfile.name)

    hic_cool = hm.hiCMatrix(outfile.name)

    nt.assert_equal(hic_cool.matrix.data, hic.matrix.data)
    chrom_cool, start_cool, end_cool, _ = zip(*hic_cool.cut_intervals)
    chrom, start, end, _ = zip(*hic_cool.cut_intervals)

    nt.assert_equal(chrom_cool, chrom)
    nt.assert_equal(start_cool, start)
    nt.assert_equal(end_cool, end)
Esempio n. 6
0
def test_removeBySequencedCount():
    # get matrix
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0], [0, 4, 15, 5, 1], [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)

    # function returns directly if last entry of cut_intervals not float64
    _, _, _, coverage = zip(*hic.cut_intervals)
    assert type(coverage[0]) != np.float64

    # define expected outcome
    to_remove_expected = None

    # and test outcome
    to_remove = hic.removeBySequencedCount()

    nt.assert_equal(to_remove, to_remove_expected)
Esempio n. 7
0
 def _build_commands(self):
     commands = []
     for opts, label in zip(self.optionstrings, self.labels):
         if self.command == 'stat':
             commands.append(self._build_perf_stat_command(opts, self.events, label))
         else:
             commands.append(self._build_perf_record_command(opts, label))
     return commands
Esempio n. 8
0
def test_convert_to_zscore_matrix_2():

    # load test matrix
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')
    hic.maskBins(hic.nan_bins)

    mat = hic.matrix.todense()
    max_depth = 10000
    bin_size = hic.getBinSize()
    max_depth_in_bins = int(float(max_depth) / bin_size)

    m_size = mat.shape[0]
    # compute matrix values per distance
    chrom, start, end, extra = zip(
        *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals))
    dist_values = {}
    sys.stderr.write("Computing values per distance for each matrix entry\n")

    for _i in range(mat.shape[0]):
        for _j in range(mat.shape[0]):
            if _j >= _i:
                # dist is translated to bins
                dist = int(float(start[_j] - start[_i]) / bin_size)
                if dist <= max_depth_in_bins:
                    if dist not in dist_values:
                        dist_values[dist] = []
                    dist_values[dist].append(mat[_i, _j])

    mu = {}
    std = {}
    for dist, values in iteritems(dist_values):
        mu[dist] = np.mean(values)
        std[dist] = np.std(values)

    # compute z-score for test matrix
    sys.stderr.write("Computing zscore for each matrix entry\n")
    zscore_mat = np.full((m_size, m_size), np.nan)
    for _i in range(mat.shape[0]):
        for _j in range(mat.shape[0]):
            if _j >= _i:
                dist = int(float(start[_j] - start[_i]) / bin_size)
                if dist <= max_depth_in_bins:
                    zscore = (mat[_i, _j] - mu[dist]) / std[dist]
                    zscore_mat[_i, _j] = zscore

    # compare with zscore from class
    hic.convert_to_zscore_matrix(maxdepth=max_depth)

    # from numpy.testing import assert_almost_equal
    # only the main diagonal is check. Other diagonals show minimal differences
    nt.assert_almost_equal(hic.matrix.todense().diagonal(0).A1,
                           zscore_mat.diagonal(0))
Esempio n. 9
0
def merge_tad_bins(hic, boundary_id_list, filename):
    """
    Reduces the HiCMatrix by merging the counts of tad bins.
    :param hic: HiCMatrix object
    :param boundary_id_list list of tad boundary bin ids
    :param filename Name to save the resulting matrix
    :return: HiCMatrix object
    """

    from hicexplorer.reduceMatrix import reduce_matrix
    hic.restoreMaskedBins()
    ref_name_list, start_list, end_list, coverage_list = zip(
        *hic.cut_intervals)
    new_bins = []
    bins_to_merge = []
    prev_ref = ref_name_list[0]

    # prepare new intervals
    idx_start = 0
    new_start = start_list[0]
    count = 0
    for idx, ref in enumerate(ref_name_list):
        if (count > 0 and idx in boundary_id_list) or ref != prev_ref:
            coverage = np.mean(coverage_list[idx_start:idx])
            new_bins.append((ref_name_list[idx_start], new_start,
                             end_list[idx - 1], coverage))
            bins_to_merge.append(list(range(idx_start, idx)))
            idx_start = idx
            new_start = start_list[idx]
            count = 0

        prev_ref = ref
        count += 1
    # check that the previous for loop ran, otherwise
    # some variables may not be set
    if len(bins_to_merge) > 0:
        coverage = np.mean(coverage_list[idx_start:])
        new_bins.append((ref, new_start, end_list[idx], coverage))
        bins_to_merge.append(list(range(idx_start, idx + 1)))
        # remove correction factors otherwise they are
        # saved but they no longer correspond to the
        # size of the matrix.
        hic.correction_factors = None

        hic.update_matrix(
            reduce_matrix(hic.matrix, bins_to_merge, diagonal=True), new_bins)

        hic.save(filename)
    else:
        log.info("Nothing to merge.")
Esempio n. 10
0
    def test_noniterators_produce_lists(self):
        l = range(10)
        self.assertTrue(isinstance(l, list))

        l2 = zip(l, list('ABCDE')*2)
        self.assertTrue(isinstance(l2, list))

        double = lambda x: x*2
        l3 = map(double, l)
        self.assertTrue(isinstance(l3, list))

        is_odd = lambda x: x % 2 == 1
        l4 = filter(is_odd, range(10))
        self.assertEqual(l4, [1, 3, 5, 7, 9])
        self.assertTrue(isinstance(l4, list))
Esempio n. 11
0
    def getDistList(rows, cols, cut_intervals):
        """
            Given a list of rows and cols
            an array is returned containing
            the genomic distance between
            each element of the row array
            with each element of the col array.
            -1 is returned for inter-chromosomal
            interactions.

            A matching list containing the chromosome name
            is also returned
        """
        chrnamelist, startlist, endlist, extralist = zip(*cut_intervals)
        # now the distance between any two points
        # is computed and arranged such that for each
        # element of the data array, a corespondent distance is stored
        start_row = np.take(startlist, rows)
        start_col = np.take(startlist, cols)
        dist_list = start_col - start_row

        # now  all distances that are between chromosomes are removed
        # to do this I convert the array of chromosomes to
        # a array of indices. Then, when subtracting the
        # values that correspond to matrix.row and matrix.col
        # using the array of indices, any value other
        # than 0 means inter-chromosomal row,col combination.

        # chr_id_list is based on a trick using np.unique
        # to get from a list of strings
        # a list of integers
        chr_id_list = np.unique(chrnamelist, return_inverse=True)[1]

        chr_row = np.take(chr_id_list, rows)
        chr_col = np.take(chr_id_list, cols)
        chr_diff = chr_row - chr_col
        # set in dist_list array '-1' for all interchromosomal values
        dist_list[chr_diff != 0] = -1

        # make a corresponding chromosome name list
        # if filtering per chromosome is required
        chrom_list = np.take(chrnamelist, rows)
        chrom_list[chr_diff != 0] = ''

        return dist_list, chrom_list
Esempio n. 12
0
def consistency_merge(list_of_tuples,
                      message='values differ:',
                      error='raise',
                      merge=mostfrequent):

    assert error in ('raise', 'warn', 'ignore')

    if len(list_of_tuples) == 0:
        raise Exception('cannot merge empty sequence')

    try:
        consistency_check(list_of_tuples, message)
        return list_of_tuples[0][1:]
    except Inconsistency as e:
        if error == 'raise':
            raise

        elif error == 'warn':
            logger.warning(str(e))

        return tuple([merge(x) for x in zip(*list_of_tuples)[1:]])
Esempio n. 13
0
    def getBinSize(self):
        """
        estimates the bin size. In case the bin size
        is not equal for all bins (maybe except for the
        bin at the en of the chromosomes) a warning is issued.
        In case of uneven bins, the median is returned.
        """
        if self.bin_size is None:
            chrom, start, end, extra = zip(*self.cut_intervals)
            median = int(np.median(np.diff(start)))
            diff = np.array(end) - np.array(start)

            # check if the bin size is
            # homogeneous
            if len(np.flatnonzero(diff != median)) > (len(diff) * 0.01):
                self.bin_size_homogeneous = False
                if self.non_homogeneous_warning_already_printed is False:
                    log.warning('Bin size is not homogeneous. \
                                      Median {}\n'.format(median))
                    self.non_homogeneous_warning_already_printed = True
            self.bin_size = median
        return self.bin_size
Esempio n. 14
0
def main(args=None):

    args = parse_arguments().parse_args(args)

    if args.BED:
        count = 0
        for line in args.BED.readlines():
            count += 1
            try:
                chrom, start, end = line.strip().split('\t')[0:3]
            except ValueError:
                continue
            try:
                start, end = map(int, [start, end])
            except ValueError as detail:
                sys.stderr.write(
                    "Invalid value found at line\t{}\t. {}\n".format(
                        line, detail))
            file_name = "{}_{}:{}-{}".format(args.outFileName, chrom, start,
                                             end)
            if end - start < 200000:
                start -= 100000
                end += 100000
            sys.stderr.write("saving {}'\n".format(file_name))
            region = zip(chrom, start, end)
    else:
        region = get_region(args.region)

    trp = pygenometracks.tracksClass.PlotTracks(
        args.tracks.name,
        args.width,
        fig_height=args.height,
        fontsize=args.fontSize,
        dpi=args.dpi,
        track_label_width=args.trackLabelFraction,
        pRegion=region)
    trp.plot(args.outFileName, *region, title=args.title)
Esempio n. 15
0
def merge_bins(hic, num_bins):
    """
    Merge the bins using the specified number of bins. This
    functions takes care to make new intervals

    Parameters
    ----------

    hic : HiCMatrix object

    num_bins : number of consecutive bins to merge.

    Returns
    -------

    A sparse matrix.

    Set up a Hi-C test matrix
    >>> from scipy.sparse import csr_matrix
    >>> row, col = np.triu_indices(5)
    >>> cut_intervals = [('a', 0, 10, 0.5), ('a', 10, 20, 1),
    ... ('a', 20, 30, 1), ('a', 30, 40, 0.1), ('b', 40, 50, 1)]
    >>> hic = hm.hiCMatrix()
    >>> hic.nan_bins = []
    >>> matrix = np.array([
    ... [ 50, 10,  5,  3,   0],
    ... [  0, 60, 15,  5,   1],
    ... [  0,  0, 80,  7,   3],
    ... [  0,  0,  0, 90,   1],
    ... [  0,  0,  0,  0, 100]], dtype=np.int32)

    make the matrix symmetric:
    >>> from scipy.sparse import dia_matrix

    >>> dia = dia_matrix(([matrix.diagonal()], [0]), shape=matrix.shape)
    >>> hic.matrix = csr_matrix(matrix + matrix.T - dia)
    >>> hic.setMatrix(hic.matrix, cut_intervals)

    run merge_matrix
    >>> merge_matrix = merge_bins(hic, 2)
    >>> merge_matrix.cut_intervals
    [('a', 0, 20, 0.75), ('a', 20, 40, 0.55000000000000004), ('b', 40, 50, 1.0)]
    >>> merge_matrix.matrix.todense()
    matrix([[120,  28,   1],
            [ 28, 177,   4],
            [  1,   4, 100]], dtype=int32)
    """

    hic = remove_nans_if_needed(hic)
    # get the bins to merge
    ref_name_list, start_list, end_list, coverage_list = zip(
        *hic.cut_intervals)
    new_bins = []
    bins_to_merge = []
    prev_ref = ref_name_list[0]

    # prepare new intervals
    idx_start = 0
    new_start = start_list[0]
    count = 0
    for idx, ref in enumerate(ref_name_list):
        if (count > 0 and count % num_bins == 0) or ref != prev_ref:
            if count < num_bins / 2:
                log.debug("{} has few bins ({}). Skipping it\n".format(
                    prev_ref, count))
            else:
                coverage = np.mean(coverage_list[idx_start:idx])
                new_bins.append((ref_name_list[idx_start], new_start,
                                 end_list[idx - 1], coverage))
                bins_to_merge.append(list(range(idx_start, idx)))
            idx_start = idx
            new_start = start_list[idx]
            count = 0

        prev_ref = ref
        count += 1
    coverage = np.mean(coverage_list[idx_start:])
    new_bins.append((ref, new_start, end_list[idx], coverage))
    bins_to_merge.append(list(range(idx_start, idx + 1)))

    hic.matrix = reduce_matrix(hic.matrix, bins_to_merge, diagonal=True)
    hic.matrix.eliminate_zeros()
    hic.setCutIntervals(new_bins)
    hic.nan_bins = np.flatnonzero(hic.matrix.sum(0).A == 0)

    return hic
Esempio n. 16
0
def main(args=None):
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    if 'correctionMethod' in args:
        if args.correctionMethod == 'ICE':
            row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
            log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
            ma.maskBins(np.flatnonzero(row_sum == 0))
            matrix_shape = ma.matrix.shape
    if 'plotName' in args:
        row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
        log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
        ma.maskBins(np.flatnonzero(row_sum == 0))
        matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)
    ma.matrix = ma.matrix.astype(np.float64, copy=True)

    log.debug('ma.matrix.indices {}'.format(ma.matrix.indices.dtype))
    log.debug('ma.matrix.data {}'.format(ma.matrix.data.dtype))
    log.debug('ma.matrix.indptr {}'.format(ma.matrix.indptr.dtype))

    # log.debug('ma.matrix.indices {}'.format(np.max(ma.matrix.indices)))
    # log.debug('ma.matrix.data {}'.format(np.max(ma.matrix.data)))
    # log.debug('ma.matrix.indptr {}'.format(np.max(ma.matrix.indptr)))

    # ma.matrix.indptr = ma.matrix.indptr.astype(np.int32, copy=False)
    # ma.matrix.indices = ma.matrix.indices.astype(np.int32, copy=False)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    total_filtered_out = set()
    if args.correctionMethod == 'ICE':
        if not args.filterThreshold:
            log.error('min and max filtering thresholds should be set')
            sys.exit(1)
        outlier_regions = filter_by_zscore(ma,
                                           args.filterThreshold[0],
                                           args.filterThreshold[1],
                                           perchr=args.perchr)
        # compute and print some statistics
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        ma.printchrtoremove(outlier_regions,
                            label="Bins that are MAD outliers ({:.2f}%) "
                            "out of".format(pct_outlier, ma.matrix.shape[0]),
                            restore_masked_bins=False)

        assert matrix_shape == ma.matrix.shape
        # mask filtered regions
        ma.maskBins(outlier_regions)
        total_filtered_out = set(outlier_regions)

        if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
            chrom, _, _, coverage = zip(*ma.cut_intervals)

            assert type(coverage[0]) == np.float64

            failed_bins = np.flatnonzero(
                np.array(coverage) < args.sequencedCountCutoff)

            ma.printchrtoremove(failed_bins,
                                label="Bins with low coverage",
                                restore_masked_bins=False)
            ma.maskBins(failed_bins)
            total_filtered_out = set(failed_bins)
            """
            ma.matrix, to_remove = fill_gaps(ma, failed_bins)
            log.warning("From {} failed bins, {} could "
                         "not be filled\n".format(len(failed_bins),
                                                  len(to_remove)))
            ma.maskBins(to_remove)
            """

        if args.transCutoff and 0 < args.transCutoff < 100:
            cutoff = float(args.transCutoff) / 100
            # a usual cutoff is 0.05
            ma.truncTrans(high=cutoff)
            pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()

    correction_factors = []
    corrected_matrix = lil_matrix(ma.matrix.shape)
    if args.perchr:
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            if args.correctionMethod == 'ICE':
                _matrix, _corr_factors = iterative_correction(
                    chr_submatrix, args)
                corrected_matrix[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = _matrix
                correction_factors.append(_corr_factors)
            else:
                # Set the kr matrix along with its correction factors vector
                assert (args.correctionMethod == 'KR')
                log.debug("Loading a float sparse matrix for KR balancing")
                kr = kr_balancing(
                    chr_submatrix.shape[0], chr_submatrix.shape[1],
                    chr_submatrix.count_nonzero(),
                    chr_submatrix.indptr.astype(np.int64, copy=False),
                    chr_submatrix.indices.astype(np.int64, copy=False),
                    chr_submatrix.data.astype(np.float64, copy=False))
                kr.computeKR()
                if args.outFileName.endswith('.h5'):
                    corrected_matrix[
                        chr_range[0]:chr_range[1],
                        chr_range[0]:chr_range[1]] = kr.get_normalised_matrix(
                            True)
                # correction_factors.append(np.true_divide(1,
                #                                          kr.get_normalisation_vector(False).todense()))
                correction_factors.append(
                    kr.get_normalisation_vector(False).todense())

        correction_factors = np.concatenate(correction_factors)

    else:
        if args.correctionMethod == 'ICE':
            corrected_matrix, correction_factors = iterative_correction(
                ma.matrix, args)
            ma.setMatrixValues(corrected_matrix)
        else:
            assert (args.correctionMethod == 'KR')
            log.debug("Loading a float sparse matrix for KR balancing")
            kr = kr_balancing(ma.matrix.shape[0], ma.matrix.shape[1],
                              ma.matrix.count_nonzero(),
                              ma.matrix.indptr.astype(np.int64, copy=False),
                              ma.matrix.indices.astype(np.int64, copy=False),
                              ma.matrix.data.astype(np.float64, copy=False))
            log.debug('passed pointers')
            kr.computeKR()
            log.debug('computation done')

            # set it to False since the vector is already normalised
            # with the previous True
            # correction_factors = np.true_divide(1, kr.get_normalisation_vector(False).todense())
            correction_factors = kr.get_normalisation_vector(False).todense()

            if args.outFileName.endswith('.h5'):
                corrected_matrix = kr.get_normalised_matrix(True)

    if args.outFileName.endswith('.h5'):
        ma.setMatrixValues(corrected_matrix)
    # if
    ma.setCorrectionFactors(correction_factors)

    log.debug("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0 and args.correctionMethod == 'ICE':

        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)
    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
Esempio n. 17
0
    def getDistList(rows, cols, cut_intervals):
        """
            Given a list of rows and cols
            an array is returned containing
            the genomic distance between
            each element of the row array
            with each element of the col array.
            -1 is returned for inter-chromosomal
            interactions.

            A matching list containing the chromosome name
            is also returned

        >>> from scipy.sparse import coo_matrix
        >>> import numpy as np
        >>> row, col = np.triu_indices(5)
        >>> cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
        ... ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
        >>> dist_list, chrom_list = hiCMatrix.getDistList(row, col,
        ... cut_intervals)
        >>> coo_matrix((dist_list, (row, col)), shape=(5,5), dtype=np.int32).todense()
        matrix([[ 0, 10, 20, 30, -1],
                [ 0,  0, 10, 20, -1],
                [ 0,  0,  0, 10, -1],
                [ 0,  0,  0,  0, -1],
                [ 0,  0,  0,  0,  0]], dtype=int32)
        >>> chrom_list.tolist()
        ['a', 'a', 'a', 'a', '', 'a', 'a', 'a', '', 'a', 'a', '', 'a', '', 'b']
        """
        chrnamelist, startlist, endlist, extralist = zip(*cut_intervals)
        # now the distance between any two points
        # is computed and arranged such that for each
        # element of the data array, a corespondent distance is stored
        start_row = np.take(startlist, rows)
        start_col = np.take(startlist, cols)
        dist_list = start_col - start_row

        # now  all distances that are between chromosomes are removed
        # to do this I convert the array of chromosomes to
        # a array of indices. Then, when subtracting the
        # values that correspond to matrix.row and matrix.col
        # using the array of indices, any value other
        # than 0 means inter-chromosomal row,col combination.

        # chr_id_list is based on a trick using np.unique
        # to get from a list of strings
        # a list of integers
        chr_id_list = np.unique(chrnamelist, return_inverse=True)[1]

        chr_row = np.take(chr_id_list, rows)
        chr_col = np.take(chr_id_list, cols)
        chr_diff = chr_row - chr_col
        # set in dist_list array '-1' for all interchromosomal values
        dist_list[chr_diff != 0] = -1

        # make a corresponding chromosome name list
        # if filtering per chromosome is required
        chrom_list = np.take(chrnamelist, rows)
        chrom_list[chr_diff != 0] = ''

        return dist_list, chrom_list
Esempio n. 18
0
def plotPerChr(hic_matrix, cmap, args, pBigwig):
    """
    plots each chromosome individually, one after the other
    in one row. scale bar is added at the end
    """
    from math import ceil
    chromosomes = hic_matrix.getChrNames()
    chrom_per_row = 5
    num_rows = int(ceil(float(len(chromosomes)) / chrom_per_row))
    num_cols = min(chrom_per_row, len(chromosomes))
    width_ratios = [1.0] * num_cols + [0.05]
    grids = gridspec.GridSpec(num_rows, num_cols + 1,
                              width_ratios=width_ratios,
                              height_ratios=[1] * num_rows)

    fig_height = 6 * num_rows
    fig_width = sum((np.array(width_ratios) + 0.05) * 6)

    fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

    chrom, start, end, _ = zip(*hic_matrix.cut_intervals)
    for idx, chrname in enumerate(chromosomes):
        log.debug('chrom: {}'.format(chrname))

        row = idx // chrom_per_row
        col = idx % chrom_per_row
        if pBigwig:
            inner_grid = gridspec.GridSpecFromSubplotSpec(2, 2, height_ratios=[0.85, 0.15], width_ratios=[0.93, 0.07],
                                                          subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
            axis = plt.subplot(inner_grid[0, 0])
            axis_eigenvector = plt.subplot(inner_grid[1, 0])
            axis_scale = plt.subplot(inner_grid[0, 1])

        else:
            axis = plt.subplot(grids[row, col])
            axis.set_title(toString(chrname))
        chrom_range = hic_matrix.getChrBinRange(chrname)
        matrix = np.asarray(hic_matrix.matrix[chrom_range[0]:chrom_range[1],
                                              chrom_range[0]:chrom_range[1]].todense().astype(float))

        norm = None
        if args.log or args.log1p:
            mask = matrix == 0
            mask_nan = np.isnan(matrix)
            mask_inf = np.isinf(matrix)
            log.debug("any nan {}".format(np.isnan(matrix).any()))
            log.debug("any inf {}".format(np.isinf(matrix).any()))

            try:
                matrix[mask] = np.nanmin(matrix[mask == False])
                matrix[mask_nan] = np.nanmin(matrix[mask_nan == False])
                matrix[mask_inf] = np.nanmin(matrix[mask_inf == False])

            except Exception:
                log.debug("Clearing of matrix failed.")
            log.debug("any nanafter remove of nan: {}".format(np.isnan(matrix).any()))
            log.debug("any inf after remove of inf: {}".format(np.isinf(matrix).any()))
        if args.log1p:
            matrix += 1
            norm = LogNorm()

        elif args.log:
            norm = LogNorm()

        bigwig_info = None
        if pBigwig:
            bigwig_info = {'args': args, 'axis': None, 'axis_colorbar': None, 'nan_bins': hic_matrix.nan_bins}
            bigwig_info['axis'] = axis_eigenvector
            bigwig_info['axis_colorbar'] = axis_scale

        chr_bin_boundary = OrderedDict()
        chr_bin_boundary[chrname] = hic_matrix.get_chromosome_sizes()[chrname]

        args.region = toString(chrname)
        chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(args, hic_matrix)
        plotHeatmap(matrix, chr_bin_boundary, fig, None,
                    args, cmap, xlabel=chrname, ylabel=chrname,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=axis, pBigwig=bigwig_info)
    return fig
Esempio n. 19
0
def main(args=None):
    """
    for each distance, compare the
    distribution of two samples,
    report number of cases were they differ
    """

    args = parse_arguments().parse_args(args)
    mean_dict = OrderedDict()
    matrix_sum = {}
    if args.labels is None:
        labels = OrderedDict([(x, os.path.basename(x)) for x in args.matrices])
    else:
        labels = OrderedDict(zip(args.matrices, args.labels))

    chroms = set()
    for matrix_file in args.matrices:
        hic_ma = HiCMatrix.hiCMatrix(matrix_file)
        matrix_sum[matrix_file] = hic_ma.matrix.sum()
        if args.chromosomeExclude is None:
            args.chromosomeExclude = []

        chrtokeep = [x for x in list(hic_ma.interval_trees) if x not in args.chromosomeExclude]
        hic_ma.keepOnlyTheseChr(chrtokeep)

        mean_dict[matrix_file] = compute_distance_mean(hic_ma, maxdepth=args.maxdepth, perchr=args.perchr)
        chroms = chroms.union([k for k in list(mean_dict[matrix_file]) if len(mean_dict[matrix_file][k]) > 1])

    # compute scale factors such that values are comparable
    min_sum = min(matrix_sum.values())
    scale_factor = dict([(matrix_file, float(min_sum) / mat_sum) for matrix_file, mat_sum in iteritems(matrix_sum)])
    log.info("The scale factors used are: {}".format(scale_factor))
    if len(args.matrices) > 1 and args.perchr:
        # in this case, for each chromosome a plot is made that combines the data from the
        # hic matrices
        max_cols = 4
        num_rows = int(np.ceil(float(len(chroms)) / max_cols))
        num_cols = min(len(chroms), max_cols)

    else:
        num_cols = num_rows = 1

    if args.plotsize is None:
        width = 6
        height = 4
    else:
        width, height = args.plotsize
    fig = plt.figure(figsize=(width * num_cols, height * num_rows))

    axs = np.empty((num_rows, num_cols), dtype='object')
    for matrix_file in args.matrices:
        idx = 0
        for chrom, mean_values in iteritems(mean_dict[matrix_file]):
            if len(mean_values) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            x, y = zip(*[(k, v) for k, v in iteritems(mean_values) if v > 0])
            if len(x) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            if args.perchr and len(args.matrices) == 1:
                col = 0
                row = 0
            else:
                col = idx % num_cols
                row = idx // num_cols
            if axs[row, col] is None:
                ax = plt.subplot2grid((num_rows, num_cols), (row, col))
                ax.set_xlabel('genomic distance')
                ax.set_ylabel('corrected Hi-C counts')
                try:
                    ax.set_yscale('log')
                    ax.set_xscale('log')
                except ValueError:
                    continue
            else:
                ax = axs[row, col]
            y = np.array(y) * scale_factor[matrix_file]
            if args.perchr and len(args.matrices) > 1:
                label = labels[matrix_file]
                ax.set_title(chrom)
            elif args.perchr:
                label = chrom
            else:
                label = labels[matrix_file]

            ax.plot(x, y, label=label)
            axs[row, col] = ax
            idx += 1
            if args.outFileData is not None:
                if args.perchr and len(args.matrices) > 1:
                    label = labels[matrix_file]
                    args.outFileData.write("#{}\n".format(chrom))

                elif args.perchr:
                    label = chrom
                else:
                    label = labels[matrix_file]
                args.outFileData.write("#{}\n".format(label))
                args.outFileData.write("\t".join(map(str, x)) + "\n")
                args.outFileData.write("\t".join(map(str, y)) + "\n")

    for ax in axs.reshape(-1):
        if ax is None:
            continue
        ax.legend(prop={'size': 'small'})
        ax.set_xlim(0, args.maxdepth)
        handles, labels = ax.get_legend_handles_labels()
        lgd = ax.legend(handles, labels, loc='center left', bbox_to_anchor=(1, 0.5))

    plt.tight_layout()
    plt.savefig(args.plotFile.name, bbox_inches='tight', bbox_extra_artists=(lgd,))
    plt.close(fig)
Esempio n. 20
0
def plotPerChr(hic_matrix, cmap, args, pBigwig, pResolution):
    """
    plots each chromosome individually, one after the other
    in one row. scale bar is added at the end
    """
    from math import ceil
    chromosomes = hic_matrix.getChrNames()
    chrom_per_row = 5
    num_rows = int(ceil(float(len(chromosomes)) / chrom_per_row))
    num_cols = min(chrom_per_row, len(chromosomes))
    width_ratios = [1.0] * num_cols + [0.05]
    grids = gridspec.GridSpec(num_rows, num_cols + 1,
                              width_ratios=width_ratios,
                              height_ratios=[1] * num_rows)

    fig_height = 6 * num_rows
    fig_width = sum((np.array(width_ratios) + 0.05) * 6)
    if pBigwig:
        for i in range(len(args.bigwig)):
            fig_height += args.increaseFigureHeight
            # if args.bigwigAdditionalVerticalAxis:
            fig_width += args.increaseFigureWidth

    fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

    chrom, start, end, _ = zip(*hic_matrix.cut_intervals)
    for idx, chrname in enumerate(chromosomes):
        log.debug('chrom: {}'.format(chrname))
        bigwig_info = None
        # if pBigwig:
        # bigwig_info['axis'] = axis_eigenvector
        # bigwig_info['axis_colorbar'] = axis_scale
        row = idx // chrom_per_row
        col = idx % chrom_per_row
        if pBigwig:
            bigwig_info = {'args': args, 'axis': None,
                           'axis_colorbar': None, 'nan_bins': hic_matrix.nan_bins}
            # bigwig_info, axis = bigwig_axes_config(args, bigwig_info)
            # bigwig_info['nan_bins'] = hic_matrix.nan_bins
            # bigwig_info['args'] = args

            # inner_grid = gridspec.GridSpecFromSubplotSpec(2, 2, height_ratios=[0.85, 0.15], width_ratios=[0.93, 0.07],
            #                                               subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
            # axis = plt.subplot(inner_grid[0, 0])
            # axis_eigenvector = plt.subplot(inner_grid[1, 0])
            # axis_scale = plt.subplot(inner_grid[0, 1])
            number_of_rows_plot = len(args.bigwig)
            bigwig_heights = [0.07] * number_of_rows_plot
            bigwig_height_ratio = 0.95 - (0.07 * number_of_rows_plot)
            if bigwig_height_ratio < 0.4:
                bigwig_height_ratio = 0.4
                _ratio = 0.6 / len(number_of_rows_plot)
                bigwig_heights = [_ratio] * number_of_rows_plot

            if args.bigwigAdditionalVerticalAxis:
                # gs = gridspec.GridSpecFromSubplotSpec(1 + len(args.bigwig), 3, height_ratios=[0.90, 0.1], width_ratios=[0.15, 0.82, 0.03],
                #                                       subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
                # # gs = gridspec.GridSpec(1 + len(args.bigwig), 3, height_ratios=[0.90, 0.1], width_ratios=[0.15, 0.82, 0.03])
                # # gs.update(hspace=0.05, wspace=0.05)
                # bigwig_vertical_axis = plt.subplot(gs[0, 0])
                # axis = plt.subplot(gs[0, 1])
                # ax2 = plt.subplot(gs[1, 1])
                # ax3 = plt.subplot(gs[0, 2])

                # bigwig_info['axis'] = ax2
                # bigwig_info['axis_colorbar'] = ax3
                # bigwig_info['axis_vertical'] = bigwig_vertical_axis

                gs = gridspec.GridSpecFromSubplotSpec(1 + len(args.bigwig), 2 + len(args.bigwig), height_ratios=[0.95 - (0.07 * number_of_rows_plot), *bigwig_heights], width_ratios=[*bigwig_heights, 0.97 - (0.07 * number_of_rows_plot), 0.03],
                                                      subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
                # gs.update(hspace=0.05, wspace=0.05)
                # gs.update(hspace=0.05, wspace=0.05)
                axis = plt.subplot(gs[0, len(args.bigwig)])
                ax2_list = []
                for i in range(len(args.bigwig)):
                    ax2_list.append(plt.subplot(gs[1 + i, len(args.bigwig)]))

                bigwig_vertical_axis_list = []
                for i in range(len(args.bigwig)):
                    bigwig_vertical_axis_list.append(plt.subplot(gs[0, i]))
                # ax2 = plt.subplot(gs[1, 0])
                ax3 = plt.subplot(gs[0, len(args.bigwig) + 1])
                bigwig_info['axis'] = ax2_list
                bigwig_info['axis_colorbar'] = ax3
                bigwig_info['axis_vertical'] = bigwig_vertical_axis_list

            else:
                # [0.95 - (0.07 * number_of_rows_plot), *z_score_heights], width_ratios=[0.75, 0.25])
                gs = gridspec.GridSpecFromSubplotSpec(1 + len(args.bigwig), 2, height_ratios=[0.95 - (0.07 * number_of_rows_plot), *bigwig_heights], width_ratios=[0.97, 0.03],
                                                      subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
                # gs.update(hspace=0.05, wspace=0.05)
                axis = plt.subplot(gs[0, 0])
                ax2_list = []
                for i in range(len(args.bigwig)):
                    ax2_list.append(plt.subplot(gs[1 + i, 0]))
                # ax2 = plt.subplot(gs[1, 0])
                ax3 = plt.subplot(gs[0, 1])
                bigwig_info['axis'] = ax2_list
                bigwig_info['axis_colorbar'] = ax3
        else:
            axis = plt.subplot(grids[row, col])
            axis.set_title(toString(chrname))
        chrom_range = hic_matrix.getChrBinRange(chrname)
        matrix = np.asarray(hic_matrix.matrix[chrom_range[0]:chrom_range[1],
                                              chrom_range[0]:chrom_range[1]].todense().astype(float))

        norm = None
        if args.log or args.log1p:
            mask = matrix == 0
            mask_nan = np.isnan(matrix)
            mask_inf = np.isinf(matrix)
            log.debug("any nan {}".format(np.isnan(matrix).any()))
            log.debug("any inf {}".format(np.isinf(matrix).any()))

            try:
                matrix[mask] = np.nanmin(matrix[mask == False])
                matrix[mask_nan] = np.nanmin(matrix[mask_nan == False])
                matrix[mask_inf] = np.nanmin(matrix[mask_inf == False])

            except Exception:
                log.debug("Clearing of matrix failed.")
            log.debug("any nanafter remove of nan: {}".format(
                np.isnan(matrix).any()))
            log.debug("any inf after remove of inf: {}".format(
                np.isinf(matrix).any()))
        if args.log1p:
            matrix += 1
            norm = LogNorm()

        elif args.log:
            norm = LogNorm()

        chr_bin_boundary = OrderedDict()
        chr_bin_boundary[chrname] = hic_matrix.get_chromosome_sizes()[chrname]

        args.region = toString(chrname)
        chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(
            args, hic_matrix)
        plotHeatmap(matrix, chr_bin_boundary, fig, None,
                    args, cmap, xlabel=chrname, ylabel=chrname,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=axis, pBigwig=bigwig_info,
                    pChromsomeStartEndDict=chromosome_start_end(hic_matrix), pResolution=pResolution)
    return fig
Esempio n. 21
0
def main(args=None):
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    if check_cooler(args.matrix) and args.chromosomes is not None and len(args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
    ma.maskBins(np.flatnonzero(row_sum == 0))
    matrix_shape = ma.matrix.shape
    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0] ** 2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    outlier_regions = filter_by_zscore(ma, args.filterThreshold[0], args.filterThreshold[1], perchr=args.perchr)
    # compute and print some statistics
    pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
    ma.printchrtoremove(outlier_regions, label="Bins that are MAD outliers ({:.2f}%) "
                                               "out of".format(pct_outlier, ma.matrix.shape[0]),
                        restore_masked_bins=False)

    assert matrix_shape == ma.matrix.shape
    # mask filtered regions
    ma.maskBins(outlier_regions)
    total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        chrom, _, _, coverage = zip(*ma.cut_intervals)

        assert type(coverage[0]) == np.float64

        failed_bins = np.flatnonzero(
            np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins, label="Bins with low coverage", restore_masked_bins=False)
        ma.maskBins(failed_bins)
        total_filtered_out = set(failed_bins)
        """
        ma.matrix, to_remove = fill_gaps(ma, failed_bins)
        log.warning("From {} failed bins, {} could "
                         "not be filled\n".format(len(failed_bins),
                                                  len(to_remove)))
        ma.maskBins(to_remove)
        """

    if args.transCutoff and 0 < args.transCutoff < 100:
        cutoff = float(args.transCutoff) / 100
        # a usual cutoff is 0.05
        ma.truncTrans(high=cutoff)

    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    if args.perchr:
        corrected_matrix = lil_matrix(ma.matrix.shape)
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]
            _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
            corrected_matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]] = _matrix
            correction_factors.append(_corr_factors)
        correction_factors = np.concatenate(correction_factors)

    else:
        corrected_matrix, correction_factors = iterative_correction(ma.matrix, args)

    ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)
    log.info("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0:
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff), restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)

        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed", restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
Esempio n. 22
0
 def _build_commands(self):
     commands = []
     for opts, label in zip(self.optionstrings, self.labels):
         commands.append(self._build_perf_command(opts, self.events, label))
     return commands
Esempio n. 23
0
    def plot(self, ax, chrom_region, region_start, region_end):

        chrom_sizes = self.hic_ma.get_chromosome_sizes()
        if chrom_region not in chrom_sizes:
            chrom_region = self.change_chrom_names(chrom_region)

        chrom_region = self.check_chrom_str_bytes(chrom_sizes, chrom_region)
        if region_end > chrom_sizes[chrom_region]:
            self.log.error("*Error*\nThe region to plot extends beyond the chromosome size. Please check.\n")
            self.log.error("{} size: {}. Region to plot {}-{}\n".format(chrom_region, chrom_sizes[chrom_region],
                                                                        region_start, region_end))

        if self.properties['file'].endswith('.cool'):
            # load now the region to be plotted
            pass

        # expand region to plus depth on both sides
        # to avoid a 45 degree 'cut' on the edges

        # get bin id of start and end of region in given chromosome
        chr_start_id, chr_end_id = self.hic_ma.getChrBinRange(chrom_region)
        chr_start = self.hic_ma.cut_intervals[chr_start_id][1]
        chr_end = self.hic_ma.cut_intervals[chr_end_id - 1][1]
        start_bp = max(chr_start, region_start - self.properties['depth'])
        end_bp = min(chr_end, region_end + self.properties['depth'])

        idx, start_pos = zip(*[(idx, x[1]) for idx, x in
                               enumerate(self.hic_ma.cut_intervals)
                               if x[0] == chrom_region and x[1] >= start_bp and x[2] <= end_bp])

        idx = idx[0:-1]
        # select only relevant matrix part
        matrix = self.hic_ma.matrix[idx, :][:, idx]
        # limit the 'depth' based on the length of the region being viewed

        region_len = region_end - region_start
        depth = min(self.properties['depth'], int(region_len * 1.25))
        depth_in_bins = int(1.5 * region_len / self.hic_ma.getBinSize())

        if depth < self.properties['depth']:
            # remove from matrix all data points that are not visible.
            matrix = matrix - scipy.sparse.triu(matrix, k=depth_in_bins, format='csr')
        matrix = np.asarray(matrix.todense().astype(float))
        if 'scale factor' in self.properties:
            matrix = matrix * self.properties['scale factor']

        if 'transform' in self.properties:
            if self.properties['transform'] == 'log1p':
                matrix += 1
                self.norm = colors.LogNorm()

            elif self.properties['transform'] == '-log':
                mask = matrix == 0
                matrix[mask] = matrix[mask is False].min()
                matrix = -1 * np.log(matrix)

            elif self.properties['transform'] == 'log':
                mask = matrix == 0
                matrix[mask] = matrix[mask is False].min()
                matrix = np.log(matrix)

        if 'max_value' in self.properties and self.properties['max_value'] != 'auto':
            vmax = self.properties['max_value']

        else:
            # try to use a 'aesthetically pleasant' max value
            vmax = np.percentile(matrix.diagonal(1), 80)

        if 'min_value' in self.properties and self.properties['min_value'] != 'auto':
            vmin = self.properties['min_value']
        else:
            if depth_in_bins > matrix.shape[0]:
                depth_in_bins = matrix.shape[0] - 5

            # if the region length is large with respect to the chromosome length, the diagonal may have
            # very few values or none. Thus, the following lines reduce the number of bins until the
            # diagonal is at least length 5
            num_bins_from_diagonal = int(region_len / self.hic_ma.getBinSize())
            for num_bins in range(0, num_bins_from_diagonal)[::-1]:
                distant_diagonal_values = matrix.diagonal(num_bins)
                if len(distant_diagonal_values) > 5:
                    break

            vmin = np.median(distant_diagonal_values)

        self.log.info("setting min, max values for track {} to: {}, {}\n".
                      format(self.properties['section_name'], vmin, vmax))
        self.img = self.pcolormesh_45deg(ax, matrix, start_pos, vmax=vmax, vmin=vmin)
        self.img.set_rasterized(True)
        if self.plot_inverted:
            ax.set_ylim(depth, 0)
        else:
            ax.set_ylim(0, depth)
Esempio n. 24
0
File: h5.py Progetto: qbss/HiCMatrix
    def save(self, filename, pSymmetric=True, pApplyCorrection=None):
        """
        Saves a matrix using hdf5 format
        :param filename:
        :return: None
        """
        log.debug('Save in h5 format')

        # self.restoreMaskedBins()
        if not filename.endswith(".h5"):
            filename += ".h5"

        # if the file name already exists
        # try to find a new suitable name
        if os.path.isfile(filename):
            log.warning("*WARNING* File already exists {}\n "
                        "Overwriting ...\n".format(filename))

            unlink(filename)
        if self.nan_bins is None:
            self.nan_bins = np.array([])
        elif not isinstance(self.nan_bins, np.ndarray):
            self.nan_bins = np.array(self.nan_bins)

        # save only the upper triangle of the
        if pSymmetric:
            # symmetric matrix
            matrix = triu(self.matrix, k=0, format='csr')
        else:
            matrix = self.matrix
        matrix.eliminate_zeros()

        filters = tables.Filters(complevel=5, complib='blosc')
        with tables.open_file(filename, mode="w",
                              title="HiCExplorer matrix") as h5file:
            matrix_group = h5file.create_group(
                "/",
                "matrix",
            )
            # save the parts of the csr matrix
            for matrix_part in ('data', 'indices', 'indptr', 'shape'):
                arr = np.array(getattr(matrix, matrix_part))
                atom = tables.Atom.from_dtype(arr.dtype)
                ds = h5file.create_carray(matrix_group,
                                          matrix_part,
                                          atom,
                                          shape=arr.shape,
                                          filters=filters)
                ds[:] = arr

            # save the matrix intervals
            intervals_group = h5file.create_group(
                "/",
                "intervals",
            )
            chr_list, start_list, end_list, extra_list = zip(
                *self.cut_intervals)
            for interval_part in ('chr_list', 'start_list', 'end_list',
                                  'extra_list'):
                arr = np.array(eval(interval_part))
                atom = tables.Atom.from_dtype(arr.dtype)
                ds = h5file.create_carray(intervals_group,
                                          interval_part,
                                          atom,
                                          shape=arr.shape,
                                          filters=filters)
                ds[:] = arr

            # save nan bins
            if len(self.nan_bins):
                atom = tables.Atom.from_dtype(self.nan_bins.dtype)
                ds = h5file.create_carray(h5file.root,
                                          'nan_bins',
                                          atom,
                                          shape=self.nan_bins.shape,
                                          filters=filters)
                ds[:] = self.nan_bins

            # save corrections factors
            if self.correction_factors is not None and len(
                    self.correction_factors):
                self.correction_factors = np.array(self.correction_factors)
                mask = np.isnan(self.correction_factors)
                self.correction_factors[mask] = 0
                atom = tables.Atom.from_dtype(self.correction_factors.dtype)
                ds = h5file.create_carray(h5file.root,
                                          'correction_factors',
                                          atom,
                                          shape=self.correction_factors.shape,
                                          filters=filters)
                ds[:] = np.array(self.correction_factors)

            # save distance counts
            if self.distance_counts is not None and len(self.distance_counts):
                atom = tables.Atom.from_dtype(self.distance_counts.dtype)
                ds = h5file.create_carray(h5file.root,
                                          'distance_counts',
                                          atom,
                                          shape=self.distance_counts.shape,
                                          filters=filters)
                ds[:] = np.array(self.distance_counts)