def significance(
    TTM: sp.csc_matrix,
    metric: Union[Callable, KeynessMetric],
    normalize: bool = False,
    n_contexts=None,
    n_words=None,
) -> "tuple[np.ndarray, tuple[np.ndarray, np.ndarray]]":
    """Compute the statistical significance of co-occurrences using `metric`.

    Only the non-zero cells of `TTM` are scored. Cells whose score is zero,
    NaN or infinite are dropped from the result.

    Args:
        TTM: Sparse co-occurrence matrix.
        metric: Either a callable, or a key that is looked up in
            `METRIC_FUNCTION` (falling back to `_undefined` when the key is
            missing). The resolved function is called with the keyword
            arguments `Cij`, `Z`, `Zr`, `ii`, `jj`, `K`, `N` and `normalize`.
        normalize: Forwarded unchanged to the metric function.
        n_contexts: Forwarded to the metric function as `K`.
        n_words: Forwarded to the metric function as `N`.

    Returns:
        A `(weights, (rows, cols))` triple holding the non-zero weights and
        their row/column indices, i.e. the same layout as the
        `(data, (row, col))` argument of the SciPy sparse constructors.
        Note that this is *not* a sparse matrix; callers build one if needed.
    """
    metric_fn = metric if callable(metric) else METRIC_FUNCTION.get(metric, _undefined)

    K: float = n_contexts
    N: float = n_words

    # Total number of observations (sum of all counts).
    Z: float = float(TTM.sum())

    # Number of observations per row (row sums), flattened to 1-D.
    Zr: np.ndarray = np.array(TTM.sum(axis=1), dtype=np.float64).flatten()

    # Row and column indices of the non-zero elements, and their counts.
    ii, jj = TTM.nonzero()
    Cij: np.ndarray = np.array(TTM[ii, jj], dtype=np.float64).flatten()

    # Compute weights (normalization, if any, is up to the metric function).
    weights: np.ndarray = metric_fn(
        Cij=Cij, Z=Z, Zr=Zr, ii=ii, jj=jj, K=K, N=N, normalize=normalize
    )

    # Replace NaN/±inf by 0 so that they are filtered out below. The result is
    # bound explicitly: with `copy=False` an ndarray is cleaned in place, but
    # any other array-like would otherwise be left untouched.
    weights = np.nan_to_num(weights, copy=False, posinf=0.0, neginf=0.0, nan=0.0)

    keep = weights.nonzero()
    return (weights[keep], (ii[keep], jj[keep]))
def get_distances(node: Tuple[int, int], nodes: sparse.csc_matrix) -> List[float]:
    """Return the Manhattan distance from `node` to every non-zero cell.

    Args:
        node: `(row, col)` coordinate to measure from.
        nodes: Sparse matrix whose non-zero cells are the target coordinates.

    Returns:
        One city-block distance per non-zero cell of `nodes`, in the order
        produced by `nodes.nonzero()`. The values are floats, as returned by
        `cdist`. An empty list is returned when `nodes` has no non-zero cells.
    """
    rows, cols = nodes.nonzero()
    if len(rows) == 0:
        # `cdist` rejects an empty coordinate list, so short-circuit here.
        return []

    targets = list(zip(rows, cols))
    pairwise = spatial.distance.cdist([node], targets, metric='cityblock')
    return pairwise.flatten().tolist()