def correlation_reduce_cpu(v: ArrayLike, out: ArrayLike) -> None:  # pragma: no cover
    """Corresponding "reduce" function for pearson correlation.

    Parameters
    ----------
    v
        The correlation array on which the pearson correlation "map" step
        has been applied per chunk.
    out
        An ndarray into which the result is written; the assembled result
        across all pairs is a symmetric matrix of pearson correlation.

    Returns
    -------
    An ndarray containing the result of applying the pearson correlation
    distance across all the chunks.
    """
    v = v.sum(axis=0)
    n = v[5]
    num = n * v[4] - v[0] * v[1]
    denom1 = np.sqrt(n * v[2] - v[0] ** 2)
    denom2 = np.sqrt(n * v[3] - v[1] ** 2)
    denom = denom1 * denom2
    value = np.nan
    if denom > 0:
        value = 1 - (num / denom)
    out[0] = value

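# Hedged usage sketch (not part of the library; the helper name and single-chunk
# setup are hypothetical). The reduce step above implies a "map" step that emits,
# per chunk, the six partial sums [sum_x, sum_y, sum_x**2, sum_y**2, sum_xy, n]
# for a pair of vectors; folding one such chunk through the reducer yields the
# 1 - r correlation distance.
def _example_correlation_reduce() -> None:  # pragma: no cover
    x = np.array([1.0, 2.0, 3.0, 4.0])
    y = np.array([2.0, 4.0, 6.0, 8.0])
    # one chunk of map output, shape (n_chunks, 6)
    partials = np.array(
        [[x.sum(), y.sum(), (x**2).sum(), (y**2).sum(), (x * y).sum(), x.size]]
    )
    out = np.empty(1)
    correlation_reduce_cpu(partials, out)
    # perfectly correlated vectors give a distance close to 0.0
    assert np.isclose(out[0], 0.0)
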
def _aggregate(x_chunk: ArrayLike, **_: typing.Any) -> ArrayLike:
    """Last function to be executed when resolving the dask graph,
    producing the final output. It is always invoked, even when the reduced
    Array counts a single chunk along the reduced axes."""
    x_chunk = x_chunk.reshape(x_chunk.shape[:-2] + (-1, n_map_param))
    result: ArrayLike = metric_reduce_func(x_chunk)
    return result

def _Garud_h_cohorts(
    gt: ArrayLike, sample_cohort: ArrayLike, n_cohorts: int, ct: ArrayLike
) -> ArrayLike:
    # transpose to hash columns (haplotypes)
    haplotypes = hash_array(gt.transpose()).transpose().flatten()
    arr = np.full((n_cohorts, N_GARUD_H_STATS), np.nan)
    for c in np.nditer(ct):
        arr[c, :] = _Garud_h(haplotypes[sample_cohort == c])
    return arr

def _combine(x_chunk: ArrayLike, **_: typing.Any) -> ArrayLike:
    """Function used for intermediate recursive aggregation (see split_every
    argument to ``da.reduction`` below). If the reduction can be performed in
    less than 3 steps, it will not be invoked at all."""
    # reduce chunks by summing along the -2 axis
    x_chunk_reshaped = x_chunk.reshape(x_chunk.shape[:-2] + (-1, n_map_param))
    return x_chunk_reshaped.sum(axis=-2)[..., np.newaxis]

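# Hedged, standalone illustration (hypothetical names; it does not reproduce the
# library's actual call site). ``_combine`` and ``_aggregate`` above follow the
# chunk/combine/aggregate protocol of ``dask.array.reduction``, with
# ``n_map_param`` and ``metric_reduce_func`` assumed to come from the enclosing
# scope. The toy reduction below shows the same three roles on a plain sum.
def _example_da_reduction() -> None:  # pragma: no cover
    import dask.array as da

    def _sum_blocks(b, axis=None, keepdims=False, **_):
        # each role just sums the blocks it is handed
        return b.sum(axis=axis, keepdims=keepdims)

    x = da.arange(16, chunks=4)
    total = da.reduction(
        x,
        chunk=_sum_blocks,      # applied to every input chunk
        combine=_sum_blocks,    # intermediate tree-reduction steps
        aggregate=_sum_blocks,  # final reduction to a single value
        dtype=x.dtype,
        split_every=2,
    )
    assert int(total.compute()) == 120
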
def _divergence(ac: ArrayLike, out: ArrayLike) -> None:  # pragma: no cover
    """Generalized U-function for computing divergence.

    Parameters
    ----------
    ac
        Allele counts of shape (cohorts, alleles) containing per-cohort allele counts.
    out
        Pairwise divergence stats with shape (cohorts, cohorts), where the entry at
        (i, j) is the divergence between cohort i and cohort j.
    """
    an = ac.sum(axis=-1)
    out[:, :] = np.nan  # (cohorts, cohorts)
    n_cohorts = ac.shape[0]
    n_alleles = ac.shape[1]
    # calculate the divergence for each cohort pair
    for i in range(n_cohorts):
        for j in range(i + 1, n_cohorts):
            n_pairs = an[i] * an[j]
            if n_pairs != 0.0:
                n_same = 0
                for k in range(n_alleles):
                    n_same += ac[i, k] * ac[j, k]
                n_diff = n_pairs - n_same
                div = n_diff / n_pairs
                out[i, j] = div
                out[j, i] = div

    # calculate the diversity for each cohort
    for i in range(n_cohorts):
        n_pairs = an[i] * (an[i] - 1)
        n_same = 0
        for k in range(n_alleles):
            n_same += ac[i, k] * (ac[i, k] - 1)
        n_diff = n_pairs - n_same
        if n_pairs != 0.0:
            div = n_diff / n_pairs
            out[i, i] = div

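# Hedged usage sketch (the helper name is hypothetical, and it assumes the kernel
# above can be called directly as a plain function with a preallocated ``out``
# array). Per-cohort allele counts go in; pairwise divergence fills the
# off-diagonal entries and per-cohort diversity fills the diagonal.
def _example_divergence() -> None:  # pragma: no cover
    ac = np.array(
        [
            [4.0, 0.0],  # cohort 0: 4 reference alleles, 0 alternate
            [2.0, 2.0],  # cohort 1: 2 reference alleles, 2 alternate
        ]
    )
    out = np.empty((2, 2))
    _divergence(ac, out)
    # cohorts 0 vs 1: n_pairs = 4 * 4 = 16, n_same = 4*2 + 0*2 = 8 -> 8/16
    assert np.isclose(out[0, 1], 0.5)
    # diversity of cohort 1: n_pairs = 4 * 3 = 12, n_same = 2*1 + 2*1 = 4 -> 8/12
    assert np.isclose(out[1, 1], 8 / 12)
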
def _Garud_h(haplotypes: ArrayLike) -> ArrayLike:
    # find haplotype counts (sorted in descending order)
    counts = sorted(collections.Counter(haplotypes.tolist()).values(), reverse=True)
    counts = np.array(counts)

    # find haplotype frequencies
    n = haplotypes.shape[0]
    f = counts / n

    # compute H1
    h1 = np.sum(f**2)

    # compute H12
    h12 = np.sum(f[:2]) ** 2 + np.sum(f[2:] ** 2)

    # compute H123
    h123 = np.sum(f[:3]) ** 2 + np.sum(f[3:] ** 2)

    # compute H2/H1
    h2 = h1 - f[0] ** 2
    h2_h1 = h2 / h1

    return np.array([h1, h12, h123, h2_h1])

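# Hedged usage sketch (hypothetical helper name). ``_Garud_h`` expects one hashed
# haplotype identifier per sample, so any integer labels serve for a toy check of
# the H1, H12, H123 and H2/H1 statistics.
def _example_garud_h() -> None:  # pragma: no cover
    # four samples carry haplotype 0, two carry haplotype 1
    haplotypes = np.array([0, 0, 0, 0, 1, 1])
    h1, h12, h123, h2_h1 = _Garud_h(haplotypes)
    # frequencies are [2/3, 1/3]: H1 = (2/3)**2 + (1/3)**2 = 5/9
    assert np.isclose(h1, 5 / 9)
    # H12 collapses the two most frequent haplotypes: (2/3 + 1/3)**2 = 1.0
    assert np.isclose(h12, 1.0)
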
def to_fixlen_str_array(arr: ArrayLike, kind: Literal["S", "U"] = "S") -> ArrayLike:
    # cast to a fixed-length string dtype sized by the longest element
    length = int(max_str_len(arr))
    return arr.astype(f"{kind}{length}")

def max_str_len(arr: ArrayLike) -> Any:
    # compute per-block string lengths lazily, then take the global maximum
    return arr.map_blocks(
        lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
    ).max()

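# Hedged usage sketch (hypothetical helper name; assumes the dask-backed code
# path shown above, i.e. ``arr`` supports ``map_blocks``). A variable-length
# string array is converted to a fixed-length dtype sized to its longest element.
def _example_to_fixlen_str_array() -> None:  # pragma: no cover
    import dask.array as da

    arr = da.from_array(np.array(["A", "CT", "GATC"], dtype=object), chunks=2)
    fixed = to_fixlen_str_array(arr, kind="S")
    assert fixed.dtype == np.dtype("S4")  # longest element has 4 characters
    assert fixed[2].compute() == b"GATC"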