def make_slices(
    clr: Cooler,
    regions: Dict[str, np.ndarray],
    names: Optional[dict] = None,
    force_disjoint: Optional[bool] = False,
) -> "Tuple[Dict[str, List[np.ndarray]], Dict[str, List[str]]]":
    """Construct per-chromosome bin-id slices for the given genomic regions.

    :param clr: Cooler object to fetch bin ids from.
    :param regions: Dict keyed by chromosome; each value is an array of
        (start, end) coordinate pairs.
    :param names: Optional dict of per-region names keyed by chromosome.
        Defaults to None; unnamed regions fall back to "chrom:start-end"
        strings. (The previous default was a shared mutable dict, a classic
        Python pitfall; None is the safe equivalent.)
    :param force_disjoint: If True, overlapping regions are merged into
        disjoint slices and region names are regenerated from the merged
        slice extents.
    :return: Tuple (slices, n_ids) of dicts keyed by chromosome: bin-id
        arrays per region, and the matching region-name lists.
    """
    # Fetch relevant bin_ids from the cooler file
    b_ids, n_ids = fetch_bins_from_cooler(cooler=clr, regions=regions, names=names)
    if force_disjoint:
        # Identify unique bin_ids and isolate disjoint regions
        slices = {chrom: get_unique_bins(b_ids=b_ids[chrom]) for chrom in b_ids}
        # Names from fetch_bins_from_cooler no longer line up with the merged
        # slices, so rebuild them from the actual slice extents.
        n_ids = {}
        for chrom in slices:
            n_ids[chrom] = []
            for sl in slices[chrom]:
                # start, end, bins and node names for region
                stl = clr.bins()[sl[0]]["start"].values[0]
                el = clr.bins()[sl[-1] + 1]["end"].values[0]
                sl_id = f"{chrom}:{stl}-{el}"
                n_ids[chrom].append(sl_id)
    else:
        slices = {
            chrom: [np.array(item) for item in b_ids[chrom]] for chrom in b_ids
        }
    return slices, n_ids
def coords_to_bins(clr: cooler.Cooler, coords: pd.DataFrame) -> np.ndarray:
    """
    Converts genomic coordinates to a list of bin ids based on the whole
    genome contact map.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object providing the bin table and bin size.
    coords : pandas.DataFrame
        Table of genomic coordinates, with columns chrom, pos.

    Returns
    -------
    numpy.array of ints :
        Indices in the whole genome matrix contact map. Coordinates that do
        not match a bin yield NaN positions in the merge and are dropped by
        the index extraction.
    """
    # Work on a copy: the original implementation overwrote coords.pos in
    # place, silently mutating the caller's dataframe.
    coords = coords.copy()
    # Snap each position down to the start coordinate of its containing bin.
    coords.pos = (coords.pos // clr.binsize) * clr.binsize
    # Coordinates are merged with bins, both indices are kept in memory so that
    # the indices of matching bins can be returned in the order of the input
    # coordinates
    idx = (clr.bins()[:].reset_index().rename(columns={
        "index": "bin_idx"
    }).merge(
        coords.reset_index().rename(columns={"index": "coord_idx"}),
        left_on=["chrom", "start"],
        right_on=["chrom", "pos"],
        how="right",
    ).set_index("bin_idx").sort_values("coord_idx").index.values)
    return idx
def train_hmm(clr: cooler.Cooler, mix_num: int = 3, discore_fn=di_score):
    """Fit a Gaussian-mixture HMM to directionality-index (DI) tracks.

    :param clr: cooler object whose per-chromosome contact matrices are scored.
    :param mix_num: number of mixture components used to initialise the
        emission means/variances.
    :param discore_fn: function mapping a sparse contact matrix to a DI array.
    :return: the model after Baum-Welch fitting.
    """
    model = ghmm_model(
        STATES,
        INIT_TRANSITION,
        INIT_PROB,
        END_PROB,
        init_mean_fn(mix_num),
        init_var_fn(mix_num),
    )
    # For every chromosome: score the contact matrix, then split the DI track
    # at unmappable bins (NaN balancing weight, small gaps removed).
    di_dict = {}
    for chrom in clr.chromnames:
        contact_matrix = clr.matrix(sparse=True).fetch(chrom).tocsr()
        di_track = discore_fn(contact_matrix)
        weight_is_nan = np.isnan(clr.bins().fetch(chrom)['weight'].values)
        di_dict[chrom] = split_diarray(di_track, remove_small_gap(weight_is_nan))
    # Flatten the per-chromosome segment dicts into a single training list.
    training_segments = [
        segment
        for segments_by_chrom in di_dict.values()
        for segment in segments_by_chrom.values()
    ]
    model.fit(
        training_segments,
        algorithm='baum-welch',
        max_iterations=10000,
        stop_threshold=1e-5,
        n_jobs=CPU_CORE - 5,
        verbose=False,
    )
    return model
def make_bins(
    clr: Cooler,
    sites: Dict[str, np.ndarray],
    names: Optional[Dict[str, str]] = None,
) -> Dict[str, np.ndarray]:
    """Map genomic site positions to chromosome-local bin indices.

    :param clr: Cooler object providing the per-chromosome bin table.
    :param sites: dict of chromosome -> array of genomic positions (bp).
    :param names: optional dict of chromosome -> per-site names; when given,
        names are filtered in lockstep with the sites.
    :return: (bins, outnames, bad_sites) when names is given, otherwise
        (bins, bad_sites). bad_sites maps each chromosome to the positional
        indices of sites that fell outside the chromosome's bin range.
    """
    bins = {}
    outnames = {}
    bad_sites = {}
    for chrom in sites:
        cbins = clr.bins().fetch(chrom)
        start = cbins['start'].values[0]
        site_locs = ((sites[chrom] - start) / clr.binsize).astype('int')
        # A site is usable only if it lands inside [0, n_bins). The original
        # check only tested the upper bound, so a site upstream of the first
        # bin produced a negative index that numpy silently wrapped to the
        # end of the chromosome.
        good_sites = (site_locs >= 0) & (site_locs < cbins.shape[0])
        bad_sites[chrom] = np.where(~good_sites)[0]
        bins[chrom] = site_locs[good_sites]
        if names is not None:
            outnames[chrom] = np.array(names[chrom])[good_sites]
    if names is not None:
        return bins, outnames, bad_sites
    else:
        return bins, bad_sites
def fetch_bins_from_cooler(
    cooler: Cooler,
    regions: Dict[str, np.ndarray],
    names: Optional[dict] = None,
) -> "Tuple[Dict[str, List[List[np.int64]]], Dict[str, List[str]]]":
    """Fetch the bin ids covering each requested genomic region.

    :param cooler: Cooler object to query.
    :param regions: dict of chromosome -> iterable of (start, end) pairs.
    :param names: optional dict of chromosome -> list of region names; any
        region without a name gets the UCSC-style string "chrom:start-end".
        Defaults to None (previously a shared mutable {} default).
    :return: (b_ids, n_ids) dicts keyed by chromosome: per-region bin-id
        lists, and the matching region names.
    """
    # Fetch relevant bin_ids from the cooler file
    b_ids = {}
    n_ids = {}
    for chrom in regions:
        b_ids[chrom] = []
        n_ids[chrom] = []
        for idx, row in enumerate(regions[chrom]):
            region_str = "{}:{}-{}".format(chrom, row[0], row[1])
            b_add = list(
                cooler.bins()
                .fetch(region_str)
                .index.values
            )
            # Narrowed from a bare `except:` which also masked unrelated
            # errors (and even KeyboardInterrupt).
            try:
                n_ids[chrom].append(names[chrom][idx])
            except (TypeError, KeyError, IndexError):
                # No name supplied for this region: fall back to coordinates.
                n_ids[chrom].append(region_str)
            b_ids[chrom].append(b_add)
    return b_ids, n_ids
def preprocess_hic( clr: cooler.Cooler, min_contacts: Optional[int] = None, region: Optional[str] = None, ) -> sp.csr_matrix: """ Given an input cooler object, returns the preprocessed Hi-C matrix. Preprocessing involves (in that order): subsetting region, subsampling contacts, normalisation, detrending (obs / exp). Balancing weights must be pre-computer in the referenced cool file. Region must be in UCSC format. """ # Load raw matrix and subset region if requested mat = clr.matrix(sparse=True, balance=False) bins = clr.bins() if region is None: mat = mat[:] bins = bins[:] else: mat = mat.fetch(region) bins = bins.fetch(region) try: biases = bins["weight"].values except KeyError as err: sys.stderr.write("Error: Input cooler must be balanced.\n") raise err # get to same coverage if requested and matrix is not empty if mat.sum() and (min_contacts is not None): mat = cup.subsample_contacts(mat, min_contacts).tocoo() valid = cup.get_detectable_bins(mat, n_mads=5) # balance region with weights precomputed on the whole matrix mat.data = mat.data * biases[mat.row] * biases[mat.col] # Detrend for P(s) mat = cup.detrend(mat.tocsr(), smooth=False, detectable_bins=valid[0]) # Replace NaNs by 0s mat.data = np.nan_to_num(mat.data) mat.eliminate_zeros() return mat
def _single_clr_edge_and_node_info_from_sites(c: cooler.Cooler, sites: Dict[str, np.ndarray], balance: Optional[bool] = True, join: Optional[bool] = False): ''' Given some cooler and a dictionary of sites (with chromosomes as keys), return the submatrices retrieved from these slices within the Hi-C map. Submatrices are returned in sparse COO format with an edge_idxs dictionary, an edge_attrs dictionary and a node_info dictionary. Optionally users can balance the Hi-C matrix before retrieval of matrix information. SInce multiple chromosomes and slices per chromosome can be supplied, user may optionally join regions into one larger region consisting of the given slices concatenated together. This function does not actually do the joining procedure since the passed slices may not be disjoint. :param cooler: Cooler file object :type edge_index: cooler.Cooler :param slices: Dictionary with chromosomes as keys and lists of sites as values. Multiple sites are allowed per chromosome. :type slices: Dict[str,List[np.ndarray]] :param balance: Whether to perform matrix balancing on the Hi-C matrix before retrieving individual slices. :type balance: Optional[bool] :param join: Boolean determining whether to retrieve Hi-C martrix information corresponding to the interface between slices. This is only recommended if slices are disjoint since the interface isn't well defined if slices aren't disjoint. 
:type join: Optional[bool] ''' # Iterate through slices, adding in edge indexes and edge attributes edge_idxs = {} edge_attrs = {} sub_graph_nodes = {} chroms = list(sites.keys()) for idx, chrom1 in enumerate(chroms): edge_idxs[chrom1] = {} edge_attrs[chrom1] = {} sub_graph_nodes[chrom1] = {} for chrom2 in chroms[idx:]: if chrom1 != chrom2 and not join: continue mat = c.matrix(balance=balance).fetch(chrom1, chrom2) mat = mat[sites[chrom1], :] mat = mat[:, sites[chrom2]] mat = coo(mat) b1 = c.bins().fetch(chrom1).index.values[sites[chrom1]] b2 = c.bins().fetch(chrom2).index.values[sites[chrom2]] edge_index = np.concatenate( [b1[mat.row][None, :], b2[mat.col][None, :]], axis=0, ) edge_data = mat.data[:, None] if chrom1 != chrom2: edge_index = np.append(edge_index, edge_index[::-1, :], axis=1) edge_data = np.append(edge_data, edge_data, axis=0) edge_data[np.isnan(edge_data)] = 0 ind = np.lexsort((edge_index[0, :], edge_index[1, :])) edge_index = edge_index[:, ind] edge_data = edge_data[ind, :] edge_idxs[chrom1][chrom2] = [edge_index] edge_attrs[chrom1][chrom2] = [edge_data] if chrom1 == chrom2: sub_graph_nodes[chrom1][chrom2] = [b1] else: sub_graph_nodes[chrom1][chrom2] = [np.append(b1, b2)] return edge_idxs, edge_attrs, sub_graph_nodes
def get_pairing_score_obs_exp(
    clr: cooler.Cooler,
    expected: pd.DataFrame,
    windowsize: int = 4 * 10**4,
    func: Callable = np.mean,
    regions: Optional[pd.DataFrame] = None,
    norm: bool = True,
    arms: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Takes a cooler file (clr), an expected dataframe (expected; maybe
    generated by getExpected), a windowsize (windowsize), a summary function
    (func) and a set of genomic regions to calculate the pairing score as
    follows: A square with side-length windowsize is created for each of the
    entries in the supplied genomics regions and the summary function applied
    to the Hi-C pixels (obs/exp values) at the location in the supplied cooler
    file. The results are returned as a dataframe. If no regions are supplied,
    regions are constructed for each bin in the cooler file to construct a
    genome-wide pairing score.

    NOTE: regions and arms now default to None instead of pd.DataFrame()
    (a mutable default argument); passing an empty DataFrame still behaves
    exactly as before.
    """
    # Check whether genomic regions were supplied
    if regions is None or len(regions) == 0:
        # If no regions are supplied, pregenerate all bins; drop bins with nan weights
        regions = clr.bins()[:].dropna()
        # find midpoint of each bin to assign windows to each midpoint
        regions.loc[:, "mid"] = (regions["start"] + regions["end"]) // 2
    # check that norm is only set if genomewide pairingScore is calculated
    elif norm:
        raise ValueError(
            "Norm flag can only be set with genomeWide pairingScore!")
    # drop nan rows from regions
    regions = regions.dropna()
    # fix indices
    regions.index = range(len(regions))
    regions.loc[:, "binID"] = range(len(regions))
    # Chromosomal arms are needed so each process only extracts a subset from the file
    if arms is None or len(arms) == 0:
        arms = get_arms_hg19()
    # extract all windows
    windows = assign_regions(windowsize, clr.binsize, regions["chrom"],
                             regions["mid"], arms)
    # add binID to later merge piles
    windows.loc[:, "binID"] = regions["binID"]
    windows = windows.dropna()
    # generate pileup
    pile = do_pileup_obs_exp(clr, expected, windows, collapse=False)
    # convert to dataframe
    pile_frame = pile_to_frame(pile)
    # replace inf with nan
    pile_frame = pile_frame.replace([np.inf, -np.inf], np.nan)
    # apply function to each row (row = individual window)
    summarized = pile_frame.apply(func, axis=1)
    # subset regions with regions that were assigned windows
    output = pd.merge(regions, windows, on="binID",
                      suffixes=("", "_w")).dropna()
    # add results
    output.loc[:, "PairingScore"] = summarized
    # normalize by median
    if norm:
        output.loc[:, "PairingScore"] = output["PairingScore"] - np.median(
            output.dropna()["PairingScore"])
    return output[["chrom", "start", "end", "PairingScore"]]