Ejemplo n.º 1
0
    def load(self):
        """
        Loads a matrix stored in h5 format
        :param matrix_filename:
        :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
        """
        log.debug('Load in h5 format')

        with tables.open_file(self.matrixFileName) as f:
            parts = {}
            for matrix_part in ('data', 'indices', 'indptr', 'shape'):
                parts[matrix_part] = getattr(f.root.matrix, matrix_part).read()

            matrix = csr_matrix(tuple([parts['data'], parts['indices'], parts['indptr']]),
                                shape=parts['shape'])
            # matrix = hiCMatrix.fillLowerTriangle(matrix)
            # get intervals
            intvals = {}
            for interval_part in ('chr_list', 'start_list', 'end_list', 'extra_list'):
                if toString(interval_part) == toString('chr_list'):
                    chrom_list = getattr(f.root.intervals, interval_part).read()
                    intvals[interval_part] = toString(chrom_list)
                else:
                    intvals[interval_part] = getattr(f.root.intervals, interval_part).read()

            cut_intervals = zip(intvals['chr_list'], intvals['start_list'], intvals['end_list'], intvals['extra_list'])
            assert len(cut_intervals) == matrix.shape[0], \
                "Error loading matrix. Length of bin intervals ({}) is different than the " \
                "size of the matrix ({})".format(len(cut_intervals), matrix.shape[0])

            # get nan_bins
            try:
                if hasattr(f.root, 'nan_bins'):
                    nan_bins = f.root.nan_bins.read()
                else:
                    nan_bins = np.array([])
            except Exception:
                nan_bins = np.array([])

            # get correction factors
            try:
                if hasattr(f.root, 'correction_factors'):
                    correction_factors = f.root.correction_factors.read()
                    assert len(correction_factors) == matrix.shape[0], \
                        "Error loading matrix. Length of correction factors does not" \
                        "match size of matrix"
                else:
                    correction_factors = None
            except Exception:
                correction_factors = None

            try:
                # get correction factors
                if hasattr(f.root, 'distance_counts'):
                    distance_counts = f.root.correction_factors.read()
                else:
                    distance_counts = None
            except Exception:
                distance_counts = None
            return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
Ejemplo n.º 2
0
    def load(self):
        log.debug('Load in cool format')
        self.minValue = None
        self.maxValue = None
        if self.matrixFileName is None:
            log.warning('No matrix is initialized')
        try:
            cooler_file = cooler.Cooler(self.matrixFileName)
            if 'metadata' in cooler_file.info:
                self.hic_metadata = cooler_file.info['metadata']
            else:
                self.hic_metadata = None
            self.cool_info = deepcopy(cooler_file.info)
        except Exception as e:
            log.warning("Could not open cooler file. Maybe the path is wrong or the given node is not available.")
            log.warning('The following file was tried to open: {}'.format(self.matrixFileName))
            log.warning("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0])))
            return None, e
        if self.chrnameList is None:
            matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True)
            used_dtype = np.int32
            if np.iinfo(np.int32).max < cooler_file.info['nbins']:
                used_dtype = np.int64
            count_dtype = matrixDataFrame[0]['count'].dtype
            data = np.empty(cooler_file.info['nnz'], dtype=count_dtype)
            instances = np.empty(cooler_file.info['nnz'], dtype=used_dtype)
            features = np.empty(cooler_file.info['nnz'], dtype=used_dtype)
            i = 0
            size = cooler_file.info['nbins'] // 32
            if size == 0:
                size = 1
            start_pos = 0
            while i < cooler_file.info['nbins']:
                matrixDataFrameChunk = matrixDataFrame[i:i + size]
                _data = matrixDataFrameChunk['count'].values.astype(count_dtype)
                _instances = matrixDataFrameChunk['bin1_id'].values.astype(used_dtype)
                _features = matrixDataFrameChunk['bin2_id'].values.astype(used_dtype)

                data[start_pos:start_pos + len(_data)] = _data
                instances[start_pos:start_pos + len(_instances)] = _instances
                features[start_pos:start_pos + len(_features)] = _features
                start_pos += len(_features)
                i += size
                del _data
                del _instances
                del _features
            matrix = csr_matrix((data, (instances, features)), shape=(np.int(cooler_file.info['nbins']), np.int(cooler_file.info['nbins'])), dtype=count_dtype)

            del data
            del instances
            del features
            gc.collect()

        else:
            if len(self.chrnameList) == 1:
                try:
                    if self.distance is None or cooler_file.binsize is None:
                        # load the full chromosome
                        matrix = cooler_file.matrix(balance=False, sparse=True, as_pixels=False).fetch(self.chrnameList[0]).tocsr()
                    else:
                        # load only the values up to a specific distance
                        lo, hi = cooler_file.extent(self.chrnameList[0])
                        dist = self.distance // cooler_file.binsize
                        step = (hi - lo) // 32
                        if step < 1:
                            step = 1
                        mat = lil_matrix((hi - lo, hi - lo), dtype=np.float32)

                        for i0, i1 in cooler.util.partition(lo, hi, step):
                            # fetch stripe
                            pixels = cooler_file.matrix(balance=False, as_pixels=True)[i0:i1, lo:hi]
                            # filter
                            pixels = pixels[(pixels['bin2_id'] - pixels['bin1_id']) < dist]
                            # insert into sparse matrix
                            mat[pixels['bin1_id'] - lo, pixels['bin2_id'] - lo] = pixels['count'].astype(np.float32)
                            del pixels

                        matrix = mat.tocsr()
                        del mat
                        gc.collect()

                except ValueError as ve:
                    log.exception("Wrong chromosome format. Please check UCSC / ensembl notation.")
                    log.exception('Error: {}'.format(str(ve)))
            else:
                raise Exception("Operation to load more as one region is not supported.")

        cut_intervals_data_frame = None
        correction_factors_data_frame = None

        if self.chrnameList is not None:
            if len(self.chrnameList) == 1:
                cut_intervals_data_frame = cooler_file.bins().fetch(self.chrnameList[0])
                log.debug('cut_intervals_data_frame {}'.format(list(cut_intervals_data_frame.columns)))
                if self.correctionFactorTable in cut_intervals_data_frame:
                    correction_factors_data_frame = cut_intervals_data_frame[self.correctionFactorTable]
            else:
                raise Exception("Operation to load more than one chr from bins is not supported.")
        else:
            if self.applyCorrectionLoad and self.correctionFactorTable in cooler_file.bins():
                correction_factors_data_frame = cooler_file.bins()[[self.correctionFactorTable]][:]

            cut_intervals_data_frame = cooler_file.bins()[['chrom', 'start', 'end']][:]

        correction_factors = None
        if correction_factors_data_frame is not None and self.applyCorrectionLoad:
            # apply correction factors to matrix
            # a_i,j = a_i,j * c_i *c_j
            matrix.eliminate_zeros()
            if len(matrix.data) > 1:

                matrix.data = matrix.data.astype(float)

                correction_factors = np.array(correction_factors_data_frame.values).flatten()
                # Don't apply correction if weight were just 'nans'
                if np.sum(np.isnan(correction_factors)) != len(correction_factors):
                    # correction_factors = convertNansToZeros(correction_factors)
                    matrix.sort_indices()

                    instances, features = matrix.nonzero()
                    instances_factors = correction_factors[instances]
                    features_factors = correction_factors[features]

                    if self.correctionOperator is None:
                        if self.correctionFactorTable in ['KR', 'VC', 'SQRT_VC']:
                            self.correctionOperator = '/'
                        else:
                            self.correctionOperator = '*'
                        if 'generated-by' in cooler_file.info:
                            log.debug('cooler_file.info[\'generated-by\'] {} {}'.format(cooler_file.info['generated-by'], type(cooler_file.info['generated-by'])))
                            generated_by = toString(cooler_file.info['generated-by'])
                            if 'hic2cool' in generated_by:
                                self.hic2cool_version = generated_by.split('-')[1]
                            elif 'hicmatrix' in generated_by:
                                self.hicmatrix_version = generated_by.split('-')[1]

                    instances_factors *= features_factors
                    log.debug('hic2cool: {}'.format(self.hic2cool_version))
                    log.debug('self.correctionOperator: {}'.format(self.correctionOperator))
                    if self.correctionOperator == '*':
                        matrix.data *= instances_factors
                    elif self.correctionOperator == '/':
                        matrix.data /= instances_factors

        cut_intervals = []
        for values in cut_intervals_data_frame.values:
            cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0]))
        del cut_intervals_data_frame
        del correction_factors_data_frame
        # try to restore nan_bins.
        try:
            # remove possible nan bins introduced by the correction factors
            # to have them part of the nan_bins vector
            mask = np.isnan(matrix.data)
            matrix.data[mask] = 0
            matrix.eliminate_zeros()
            shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
            nan_bins_indices = np.arange(shape)
            nan_bins_indices = np.setdiff1d(nan_bins_indices, matrix.indices)

            nan_bins = []
            for bin_id in nan_bins_indices:
                if len(matrix[bin_id, :].data) == 0:
                    nan_bins.append(bin_id)
            nan_bins = np.array(nan_bins)
        except Exception:
            nan_bins = None

        distance_counts = None

        return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
Ejemplo n.º 3
0
    def load(self):
        log.debug('Load in cool format')
        self.minValue = None
        self.maxValue = None
        if self.matrixFileName is None:
            log.info('No matrix is initialized')

        try:
            cooler_file = cooler.Cooler(self.matrixFileName)
            if 'metadata' in cooler_file.info:
                self.hic_metadata = cooler_file.info['metadata']
            else:
                self.hic_metadata = None
            self.cool_info = deepcopy(cooler_file.info)
            # log.debug("cooler_file.info {}".format(cooler_file.info))
        except Exception:
            log.info("Could not open cooler file. Maybe the path is wrong or the given node is not available.")
            log.info('The following file was tried to open: {}'.format(self.matrixFileName))
            log.info("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0])))
            exit()
        log.debug('self.chrnameList {}'.format(self.chrnameList))
        if self.chrnameList is None:
            log.debug('muh 69')

            matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True)
            used_dtype = np.int32
            if np.iinfo(np.int32).max < cooler_file.info['nbins']:
                used_dtype = np.int64
            count_dtype = matrixDataFrame[0]['count'].dtype
            data = np.empty(cooler_file.info['nnz'], dtype=count_dtype)
            instances = np.empty(cooler_file.info['nnz'], dtype=used_dtype)
            features = np.empty(cooler_file.info['nnz'], dtype=used_dtype)
            i = 0
            size = cooler_file.info['nbins'] // 32
            if size == 0:
                size = 1
            start_pos = 0
            while i < cooler_file.info['nbins']:
                matrixDataFrameChunk = matrixDataFrame[i:i + size]
                _data = matrixDataFrameChunk['count'].values.astype(count_dtype)
                _instances = matrixDataFrameChunk['bin1_id'].values.astype(used_dtype)
                _features = matrixDataFrameChunk['bin2_id'].values.astype(used_dtype)

                data[start_pos:start_pos + len(_data)] = _data
                instances[start_pos:start_pos + len(_instances)] = _instances
                features[start_pos:start_pos + len(_features)] = _features
                start_pos += len(_features)
                i += size
                del _data
                del _instances
                del _features

            matrix = csr_matrix((data, (instances, features)), shape=(np.int(cooler_file.info['nbins']), np.int(cooler_file.info['nbins'])), dtype=count_dtype)
            self.minValue = data.min()
            self.maxValue = data.max()

            del data
            del instances
            del features
        else:
            if len(self.chrnameList) == 1:
                try:
                    log.debug('Load data')
                    matrix = cooler_file.matrix(balance=False, sparse=True).fetch(self.chrnameList[0]).tocsr()
                    # handle the case of an empty csr matrix
                    if len(matrix.data) == 0:
                        self.minValue = 0
                        self.maxValue = 0
                    else:
                        self.minValue = matrix.data.min()
                        self.maxValue = matrix.data.max()
                except ValueError:
                    exit("Wrong chromosome format. Please check UCSC / ensembl notation.")
            else:
                exit("Operation to load more as one region is not supported.")

        cut_intervals_data_frame = None
        correction_factors_data_frame = None

        if self.chrnameList is not None:
            if len(self.chrnameList) == 1:
                cut_intervals_data_frame = cooler_file.bins().fetch(self.chrnameList[0])

                if self.correctionFactorTable in cut_intervals_data_frame:
                    correction_factors_data_frame = cut_intervals_data_frame[self.correctionFactorTable]
            else:
                exit("Operation to load more than one chr from bins is not supported.")
        else:
            if self.applyCorrectionLoad and self.correctionFactorTable in cooler_file.bins():
                correction_factors_data_frame = cooler_file.bins()[[self.correctionFactorTable]][:]

            cut_intervals_data_frame = cooler_file.bins()[['chrom', 'start', 'end']][:]

        correction_factors = None
        if correction_factors_data_frame is not None and self.applyCorrectionLoad:
            # apply correction factors to matrix
            # a_i,j = a_i,j * c_i *c_j
            matrix.eliminate_zeros()
            if len(matrix.data) > 1:

                matrix.data = matrix.data.astype(float)

                correction_factors = convertNansToOnes(np.array(correction_factors_data_frame.values).flatten())
                # apply only if there are not only 1's
                if np.sum(correction_factors) != len(correction_factors):
                    matrix.sort_indices()

                    instances, features = matrix.nonzero()
                    instances_factors = correction_factors[instances]
                    features_factors = correction_factors[features]

                    if self.correctionOperator is None:
                        if 'generated-by' in cooler_file.info:
                            log.debug('cooler_file.info[\'generated-by\'] {} {}'.format(cooler_file.info['generated-by'], type(cooler_file.info['generated-by'])))
                            generated_by = toString(cooler_file.info['generated-by'])
                            if 'hic2cool' in generated_by:

                                self.hic2cool_version = generated_by.split('-')[1]
                                if self.hic2cool_version >= '0.5':
                                    log.debug('0.5')
                                    self.correctionOperator = '/'
                                else:
                                    log.debug('0.4')

                                    self.correctionOperator = '*'
                            else:
                                self.correctionOperator = '*'

                            log.debug('hic2cool: {}'.format(self.hic2cool_version))
                            log.debug('self.correctionOperator : {}'.format(self.correctionOperator))

                            # elif 'hicmatrix' in generated_by:

                            #     self.hicmatrix_version = generated_by.split('-')[1]
                            #     if self.hicmatrix_version >= '8':
                            #         self.correctionOperator = '/'
                            #     else:
                            #         self.correctionOperator = '*'
                        else:
                            self.correctionOperator = '*'

                    instances_factors *= features_factors
                    log.debug('hic2cool: {}'.format(self.hic2cool_version))
                    log.debug('self.correctionOperator: {}'.format(self.correctionOperator))
                    if self.correctionOperator == '*':
                        matrix.data *= instances_factors
                    elif self.correctionOperator == '/':
                        matrix.data /= instances_factors

                    # if self.scaleToOriginalRange is not None:
                    min_value = matrix.data.min()
                    max_value = matrix.data.max()
                    # check if max smaller one or if not same mangnitude
                    if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1):
                        desired_range_difference = self.maxValue - self.minValue

                        min_value = matrix.data.min()
                        max_value = matrix.data.max()

                        matrix.data = (matrix.data - min_value)
                        matrix.data /= (max_value - min_value)
                        matrix.data *= desired_range_difference
                        matrix.data += self.minValue
                        self.scaleToOriginalRange = True
                        # diff_scale_factor = matrix.data.max() / max_value
                        # if self.correctionOperator == '*':
                        #     correction_factors *= diff_scale_factor
                        # if self.correctionOperator == '/':
                        #     correction_factors /= diff_scale_factor

        cut_intervals = []
        time_start = time.time()
        log.debug('Creating cut_intervals {}'.format(time_start))
        for values in cut_intervals_data_frame.values:
            cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0]))
        log.debug('Creating cut_intervals {} DONE'.format(time.time() - time_start))
        del cut_intervals_data_frame
        del correction_factors_data_frame
        # try to restore nan_bins.
        try:
            shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
            nan_bins = np.arange(shape)
            nan_bins = np.setdiff1d(nan_bins, matrix.indices[:-1])

        except Exception:
            nan_bins = None

        distance_counts = None

        return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
Ejemplo n.º 4
0
    def load(self, pApplyCorrection=None, pMatrixOnly=None):
        log.debug('Load in cool format')
        log.debug('self.chrnameList {}'.format(self.chrnameList))
        if self.matrixFileName is None:
            log.info('No matrix is initalized')
        if pApplyCorrection is None:
            pApplyCorrection = True
        try:
            cooler_file = cooler.Cooler(self.matrixFileName)
        except Exception:
            log.info(
                "Could not open cooler file. Maybe the path is wrong or the given node is not available."
            )
            log.info('The following file was tried to open: {}'.format(
                self.matrixFileName))
            log.info("The following nodes are available: {}".format(
                cooler.io.ls(self.matrixFileName.split("::")[0])))
            exit()

        if self.chrnameList is None:
            matrixDataFrame = cooler_file.matrix(balance=False,
                                                 sparse=True,
                                                 as_pixels=True)
            used_dtype = np.int32
            if np.iinfo(np.int32).max < cooler_file.info['nbins']:
                used_dtype = np.int64
            data = np.empty(cooler_file.info['nnz'], dtype=used_dtype)
            instances = np.empty(cooler_file.info['nnz'], dtype=used_dtype)
            features = np.empty(cooler_file.info['nnz'], dtype=used_dtype)
            i = 0
            size = cooler_file.info['nbins'] // 32
            if size == 0:
                size = 1
            start_pos = 0
            while i < cooler_file.info['nbins']:
                csr_data = matrixDataFrame[i:i +
                                           size].values.astype(used_dtype).T
                lenght_data = len(csr_data[0])
                data[start_pos:start_pos + lenght_data] = csr_data[2]
                instances[start_pos:start_pos + lenght_data] = csr_data[0]
                features[start_pos:start_pos + lenght_data] = csr_data[1]
                start_pos += lenght_data
                del csr_data
                i += size
            matrix = csr_matrix(
                (data, (instances, features)),
                shape=(cooler_file.info['nbins'], cooler_file.info['nbins']),
                dtype=used_dtype)
            del data
            del instances
            del features
        else:
            if len(self.chrnameList) == 1:
                try:
                    matrix = cooler_file.matrix(
                        balance=False,
                        sparse=True).fetch(self.chrnameList[0]).tocsr()
                except ValueError:
                    exit(
                        "Wrong chromosome format. Please check UCSC / ensembl notation."
                    )
            else:
                exit("Operation to load more as one region is not supported.")

        cut_intervals_data_frame = None
        correction_factors_data_frame = None

        if self.chrnameList is not None:
            if len(self.chrnameList) == 1:
                cut_intervals_data_frame = cooler_file.bins().fetch(
                    self.chrnameList[0])

                if self.correctionFactorTable in cut_intervals_data_frame:
                    correction_factors_data_frame = cut_intervals_data_frame[
                        self.correctionFactorTable]
            else:
                exit(
                    "Operation to load more than one chr from bins is not supported."
                )
        else:
            if pApplyCorrection and self.correctionFactorTable in cooler_file.bins(
            ):
                correction_factors_data_frame = cooler_file.bins()[[
                    self.correctionFactorTable
                ]][:]

            cut_intervals_data_frame = cooler_file.bins()[[
                'chrom', 'start', 'end'
            ]][:]

        correction_factors = None
        # log.debug("{} {}".format(correction_factors_data_frame, pApplyCorrection))

        if correction_factors_data_frame is not None and pApplyCorrection:
            log.debug("Apply correction factors")
            # apply correction factors to matrix
            # a_i,j = a_i,j * c_i *c_j
            matrix.eliminate_zeros()
            matrix.data = matrix.data.astype(float)

            correction_factors = convertNansToOnes(
                np.array(correction_factors_data_frame.values).flatten())
            # apply only if there are not only 1's
            if np.sum(correction_factors) != len(correction_factors):
                instances, features = matrix.nonzero()
                instances_factors = correction_factors[instances]
                features_factors = correction_factors[features]
                instances_factors *= features_factors

                if self.correctionOperator == '*':
                    matrix.data *= instances_factors
                elif self.correctionOperator == '/':
                    matrix.data /= instances_factors

        cut_intervals = []

        for values in cut_intervals_data_frame.values:
            cut_intervals.append(
                tuple([toString(values[0]), values[1], values[2], 1.0]))

        # try to restore nan_bins.
        try:
            shape = matrix.shape[
                0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
            nan_bins = np.array(range(shape))
            nan_bins = np.setxor1d(nan_bins, matrix.indices)

            i = 0
            while i < len(nan_bins):
                if nan_bins[i] >= shape:
                    break
                i += 1
            nan_bins = nan_bins[:i]

        except Exception:
            nan_bins = None

        distance_counts = None

        # matrix = hiCMatrix.fillLowerTriangle(matrix)

        return matrix, cut_intervals, nan_bins, distance_counts, correction_factors