def train_hmm(clr: cooler.Cooler, mix_num: int = 3, discore_fn=di_score):
    """
    Fit the HMM built by ``ghmm_model`` on per-chromosome score segments.

    For every chromosome in ``clr.chromnames`` the sparse contact matrix is
    fetched and converted to CSR, scored with ``discore_fn``, and split by
    ``split_diarray`` using a gap mask derived from the bins whose ``weight``
    value is NaN. The segments of all chromosomes are pooled into a single
    training set and passed to ``model.fit`` with the Baum-Welch algorithm.

    :param clr: Cooler to train on. Its bin table must provide a ``weight``
        column.
    :param mix_num: Forwarded to ``init_mean_fn`` and ``init_var_fn``
        (presumably the number of mixture components -- confirm against
        those helpers).
    :param discore_fn: Callable that receives one chromosome's CSR contact
        matrix and returns the score array handed to ``split_diarray``.
    :return: The model object returned by ``ghmm_model`` after fitting.
    """
    model = ghmm_model(
        STATES,
        INIT_TRANSITION,
        INIT_PROB,
        END_PROB,
        init_mean_fn(mix_num),
        init_var_fn(mix_num),
    )

    train_data = []
    for chrom in clr.chromnames:
        matrix = clr.matrix(sparse=True).fetch(chrom).tocsr()
        di_array = discore_fn(matrix)
        gap_mask = remove_small_gap(
            np.isnan(clr.bins().fetch(chrom)['weight'].values))
        # split_diarray is used as a mapping here; only its values (the
        # individual segments) are training sequences.
        train_data.extend(split_diarray(di_array, gap_mask).values())

    # Five cores are left free for other work, but on hosts with five or
    # fewer cores that subtraction yields zero or a negative worker count,
    # so always request at least one worker.
    n_jobs = max(1, CPU_CORE - 5)

    model.fit(
        train_data,
        algorithm='baum-welch',
        max_iterations=10000,
        stop_threshold=1e-5,
        n_jobs=n_jobs,
        verbose=False,
    )
    return model
def preprocess_hic(
    clr: cooler.Cooler,
    min_contacts: Optional[int] = None,
    region: Optional[str] = None,
) -> sp.csr_matrix:
    """
    Return the preprocessed Hi-C matrix of a cooler, optionally for a region.

    The steps are applied in this order: region subsetting, contact
    subsampling, normalisation with the stored balancing weights, and
    detrending (observed / expected). NaNs in the detrended matrix are
    replaced by zeros and explicit zeros are then removed.

    Balancing weights must be pre-computed in the referenced cool file.

    :param clr: Input cooler; its bin table must contain a ``weight`` column.
    :param min_contacts: When given and the matrix is non-empty, forwarded to
        ``cup.subsample_contacts`` to bring the matrix to that coverage.
    :param region: Region in UCSC format to restrict the matrix to, or
        ``None`` to use the whole matrix.
    :return: The cleaned matrix produced by ``cup.detrend``.
    :raises KeyError: If the bin table has no ``weight`` column; a message is
        written to stderr before the error is re-raised.
    """
    # Raw (unbalanced) counts, restricted to the region when one is given
    raw_selector = clr.matrix(sparse=True, balance=False)
    bin_selector = clr.bins()
    if region is not None:
        contacts = raw_selector.fetch(region)
        bin_table = bin_selector.fetch(region)
    else:
        contacts = raw_selector[:]
        bin_table = bin_selector[:]

    try:
        biases = bin_table["weight"].values
    except KeyError as err:
        sys.stderr.write("Error: Input cooler must be balanced.\n")
        raise err

    # Equalise coverage only when asked to and when there is something to
    # subsample from
    if contacts.sum() and (min_contacts is not None):
        subsampled = cup.subsample_contacts(contacts, min_contacts)
        contacts = subsampled.tocoo()

    # Detectable bins are computed before the weights are applied
    detectable = cup.get_detectable_bins(contacts, n_mads=5)

    # Apply the weights that were precomputed on the whole matrix
    row_bias = biases[contacts.row]
    col_bias = biases[contacts.col]
    contacts.data = contacts.data * row_bias * col_bias

    # Remove the distance-dependent trend, P(s)
    detrended = cup.detrend(
        contacts.tocsr(),
        smooth=False,
        detectable_bins=detectable[0],
    )

    # NaNs become zeros, which are then dropped from the sparse structure
    detrended.data = np.nan_to_num(detrended.data)
    detrended.eliminate_zeros()
    return detrended
def _single_clr_edge_and_node_info_from_sites(c: cooler.Cooler,
                                              sites: Dict[str, np.ndarray],
                                              balance: Optional[bool] = True,
                                              join: Optional[bool] = False):
    '''
    Given a cooler and a dictionary of sites (with chromosomes as keys),
    return the Hi-C submatrices restricted to those sites as edge lists.

    For each chromosome the matrix is fetched, reduced to the rows and
    columns selected by ``sites`` and converted to sparse COO format. Pairs
    of different chromosomes are only processed when ``join`` is true; for
    those pairs every edge is emitted in both directions. NaN edge values are
    set to 0 and edges are sorted by target bin, then source bin.

    This function does not merge the per-pair results; they are returned
    separately in nested dictionaries.

    :param c: Cooler file object
    :type c: cooler.Cooler
    :param sites: Dictionary with chromosomes as keys and, as values, the
                  index arrays used to select rows/columns of that
                  chromosome's matrix and of its bin table index.
    :type sites: Dict[str,np.ndarray]
    :param balance: Passed to ``c.matrix``; whether to use the balanced Hi-C
                    matrix when retrieving contacts.
    :type balance: Optional[bool]
    :param join: Whether to also retrieve the contacts between sites on
                 different chromosomes.
    :type join: Optional[bool]
    :return: ``(edge_idxs, edge_attrs, sub_graph_nodes)``, each a dictionary
             keyed by first chromosome, then second chromosome, holding a
             one-element list: the ``(2, n_edges)`` bin-index array, the
             ``(n_edges, 1)`` value array and the array of node bin indices
             respectively.
    '''
    edge_idxs = {}
    edge_attrs = {}
    sub_graph_nodes = {}

    chrom_names = list(sites.keys())
    for pos, chrom1 in enumerate(chrom_names):
        edge_idxs[chrom1] = {}
        edge_attrs[chrom1] = {}
        sub_graph_nodes[chrom1] = {}

        # Only pair chrom1 with itself and with later chromosomes so each
        # unordered chromosome pair is visited once
        for chrom2 in chrom_names[pos:]:
            cross_chrom = chrom1 != chrom2
            if cross_chrom and not join:
                continue

            block = c.matrix(balance=balance).fetch(chrom1, chrom2)
            block = block[sites[chrom1], :]
            block = block[:, sites[chrom2]]
            block = coo(block)

            # Bin-table index values of the selected sites
            b1 = c.bins().fetch(chrom1).index.values[sites[chrom1]]
            b2 = c.bins().fetch(chrom2).index.values[sites[chrom2]]

            sources = b1[block.row]
            targets = b2[block.col]
            edge_index = np.concatenate(
                (sources[None, :], targets[None, :]),
                axis=0,
            )
            edge_data = block.data[:, None]

            if cross_chrom:
                # Add the reversed copy of every inter-chromosomal edge
                edge_index = np.append(edge_index,
                                       edge_index[::-1, :],
                                       axis=1)
                edge_data = np.append(edge_data, edge_data, axis=0)

            edge_data[np.isnan(edge_data)] = 0

            # Primary sort key is the second row, ties broken by the first
            order = np.lexsort((edge_index[0, :], edge_index[1, :]))
            edge_idxs[chrom1][chrom2] = [edge_index[:, order]]
            edge_attrs[chrom1][chrom2] = [edge_data[order, :]]

            nodes = np.append(b1, b2) if cross_chrom else b1
            sub_graph_nodes[chrom1][chrom2] = [nodes]

    return edge_idxs, edge_attrs, sub_graph_nodes
def _single_clr_edge_and_node_info_from_slices(c: cooler.Cooler,
                                               slices: Dict[str, List[np.ndarray]],
                                               balance: Optional[bool] = True,
                                               join: Optional[bool] = False):
    '''
    Given a cooler and a dictionary of slices (with chromosomes as keys),
    return the Hi-C submatrices spanned by these slices as edge lists.

    Each slice is an array of bin indices; the submatrix between two slices
    is read from the sparse matrix using the first and last bin of each
    slice. A slice is always paired with itself. Pairs of different slices
    (on the same or on different chromosomes) are only retrieved when
    ``join`` is true, and their edges are made bidirectional with
    ``make_edges_bidirectional``. Edges are sorted by target bin, then
    source bin.

    Two slices are treated as the same slice when their first bins are
    equal. This function does not actually join the regions, since the
    passed slices may not be disjoint; results are returned per slice pair.

    :param c: Cooler file object
    :type c: cooler.Cooler
    :param slices: Dictionary with chromosomes as keys and lists of slices as
                   values. Multiple slices are allowed per chromosome.
    :type slices: Dict[str,List[np.ndarray]]
    :param balance: Passed to ``c.matrix``; whether to use the balanced Hi-C
                    matrix when retrieving individual slices.
    :type balance: Optional[bool]
    :param join: Whether to retrieve the Hi-C matrix information
                 corresponding to the interface between slices. This is only
                 recommended if slices are disjoint, since the interface
                 isn't well defined otherwise.
    :type join: Optional[bool]
    :return: ``(edge_idxs, edge_attrs, sub_graph_nodes)``, each a dictionary
             keyed by first chromosome, then second chromosome, holding one
             list entry per retrieved slice pair: the ``(2, n_edges)``
             bin-index array, the ``(n_edges, 1)`` value array and the array
             of node bin indices respectively.
    '''
    edge_idxs = {}
    edge_attrs = {}
    sub_graph_nodes = {}

    # The selector does not depend on the slice pair, so build it once
    # instead of once per pair.
    selector = c.matrix(balance=balance, sparse=True)

    chroms = list(slices.keys())
    for cidx1, chrom1 in enumerate(chroms):
        edge_idxs[chrom1] = {}
        edge_attrs[chrom1] = {}
        sub_graph_nodes[chrom1] = {}
        for chrom2 in chroms[cidx1:]:
            same_chrom = chrom1 == chrom2
            if not same_chrom and not join:
                continue

            pair_idxs = edge_idxs[chrom1][chrom2] = []
            pair_attrs = edge_attrs[chrom1][chrom2] = []
            pair_nodes = sub_graph_nodes[chrom1][chrom2] = []

            for idx, s1 in enumerate(slices[chrom1]):
                # Within one chromosome start at idx so that region pairings
                # are not repeated
                slist = slices[chrom1][idx:] if same_chrom else slices[chrom2]

                for jdx, s2 in enumerate(slist):
                    same_slice = s1[0] == s2[0]
                    if same_slice and jdx != 0:
                        continue
                    if not same_slice and not join:
                        continue

                    mat = selector[s1[0]:s1[-1] + 1, s2[0]:s2[-1] + 1]

                    edge_index = np.concatenate(
                        [s1[mat.row][None, :], s2[mat.col][None, :]],
                        axis=0,
                    )
                    edge_data = mat.data[:, None]

                    # Use the same identity test as the rest of the loop.
                    # Subtracting the slices fails to broadcast when they
                    # differ in length, and the differences of two distinct
                    # slices can sum to zero.
                    if not same_slice:
                        edge_index, edge_data = make_edges_bidirectional(
                            edge_index, edge_data)

                    # Primary sort key is the second row, ties broken by
                    # the first
                    ind = np.lexsort((edge_index[0, :], edge_index[1, :]))
                    pair_idxs.append(edge_index[:, ind])
                    pair_attrs.append(edge_data[ind, :])

                    if same_slice:
                        pair_nodes.append(s1)
                    else:
                        pair_nodes.append(np.append(s1, s2))

    return edge_idxs, edge_attrs, sub_graph_nodes