def __load_distance_matrix(self, data):
    """Build a neighbor-joining tree from ``data`` and cache derived state.

    ``data`` is wrapped in a ``DistanceMatrix`` and joined with ``nj``. The
    tip-to-tip distances of the resulting tree are stored on
    ``self.dist_matrix`` as a plain array whose rows and columns are sorted
    by integer tip label. The tree is then bifurcated and handed to the
    post-order and genotype-building helpers.

    Parameters
    ----------
    data : array_like
        Square distance data accepted by ``DistanceMatrix``.
    """
    dm = DistanceMatrix(data)
    nj_tree = nj(dm)

    df = nj_tree.tip_tip_distances().to_data_frame()
    # Labels are cast to integers so that the sort below is numeric rather
    # than lexicographic.
    # NOTE(review): assumes every tip label is an integer string -- confirm.
    df.index = df.index.astype(int)
    df.sort_index(inplace=True)
    df.columns = df.columns.values.astype(np.int32)
    df = df[sorted(df.columns)]

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
    # returns the same ndarray on both old and new pandas.
    self.dist_matrix = df.values

    nj_tree.bifurcate()
    self.__post_order(nj_tree)
    self.__build_genotype(nj_tree)
def test_id_pairs_as_iterable(self):
    # ``id_pairs`` is handed over as a one-shot iterator rather than a list.
    pairs = iter([('B', 'C')])
    observed = partial_beta_diversity(
        'unweighted_unifrac', self.table1, self.sids1,
        otu_ids=self.oids1, tree=self.tree1, id_pairs=pairs)
    self.assertEqual(observed.shape, (3, 3))

    # Only the requested (B, C) pair carries a non-zero distance.
    expected = DistanceMatrix(
        [[0.0, 0.0, 0.0],
         [0.0, 0.0, 0.25],
         [0.0, 0.25, 0.0]],
        ids=self.sids1)
    for row_id in self.sids1:
        for col_id in self.sids1:
            npt.assert_almost_equal(
                observed[row_id, col_id], expected[row_id, col_id], 6)
def test_unweighted_unifrac_partial(self):
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    # expected values calculated by hand
    requested_pairs = [('B', 'C')]
    observed = partial_beta_diversity(
        'unweighted_unifrac', self.table1, self.sids1,
        otu_ids=self.oids1, tree=self.tree1, id_pairs=requested_pairs)
    self.assertEqual(observed.shape, (3, 3))

    expected = DistanceMatrix(
        [[0.0, 0.0, 0.0],
         [0.0, 0.0, 0.25],
         [0.0, 0.25, 0.0]],
        ids=self.sids1)
    for row_id in self.sids1:
        for col_id in self.sids1:
            npt.assert_almost_equal(
                observed[row_id, col_id], expected[row_id, col_id], 6)
def create_tree_of_trees(trees: dict, out, metric='score', outgroup=''):
    """Join a collection of trees into a neighbor-joining "tree of trees".

    Pairwise distances between the trees (keyed and ordered by sorted name)
    are obtained from ``get_dist_trees``, assembled into a symmetric matrix
    and joined with ``nj``; the Newick result is loaded into an
    ``ete3.Tree``.

    Parameters
    ----------
    trees : dict
        Mapping of name -> tree object understood by ``get_dist_trees``.
    out
        Output path. Passed to ``parse_path``; when truthy the resulting
        tree is also written there.
    metric : str
        Forwarded to ``get_dist_trees``.
    outgroup
        Unless ``None``, ``outgroup_tree`` is applied to the result.
        NOTE(review): the value itself is not forwarded to
        ``outgroup_tree`` -- confirm this is intended.

    Returns
    -------
    ete3.Tree
    """
    output_dir, output_filename = parse_path(out)

    labels = sorted(trees.keys())
    size = len(labels)
    pairwise = np.zeros((size, size))

    # Fill the strict lower triangle one column at a time: column ``col``
    # holds the distances from ``labels[col]`` to every later label.
    for col, label in enumerate(labels[:-1]):
        later_trees = [trees[other] for other in labels[col + 1:]]
        pairwise[col + 1:size, col] = get_dist_trees(
            trees[label], later_trees, metric)

    # Mirror the lower triangle to obtain the full symmetric matrix.
    pairwise += pairwise.T

    newick = nj(DistanceMatrix(pairwise, labels), result_constructor=str)
    joined = ete3.Tree(newick)

    if outgroup is not None:
        outgroup_tree(joined)
    if out:
        joined.write(outfile=out)
    return joined
def _temporal_distance(corr, id_set, dist_method="fro"):
    '''Build a DistanceMatrix from per-individual temporal correlations.

    The distance between two individuals is the norm (``scipy.linalg.norm``
    with ``ord=dist_method``) of the difference between their entries in
    ``corr``.

    corr: pd.DataFrame
        table grouped by individual ids, this is the output from
        _temporal_corr
    id_set: pd.Series
        unique subject ids from individual_id with index attached; the
        series index supplies the ids of the returned matrix
    dist_method: str
        method supported by scipy.linalg.norm parameter ord
    '''
    n_subjects = len(id_set)
    pairwise = np.zeros((n_subjects, n_subjects))

    # Only the lower triangle is computed; each value is mirrored.
    for row, row_id in enumerate(id_set):
        for col, col_id in enumerate(id_set[:row]):
            delta = corr.loc[row_id] - corr.loc[col_id]
            magnitude = linalg.norm(delta, ord=dist_method)
            pairwise[row, col] = magnitude
            pairwise[col, row] = magnitude

    return DistanceMatrix(pairwise, ids=id_set.index)
def test_extensive(self):
    labels = ['PC%d' % axis for axis in range(1, 7)]
    ids = [str(number) for number in range(6)]

    exp_eigvals = pd.Series(
        [0.3984635, 0.36405689, 0.28804535, 0.27479983, 0.19165361, 0.0],
        index=labels)
    exp_proportions = pd.Series(
        [0.2626621381, 0.2399817314, 0.1898758748, 0.1811445992,
         0.1263356565, 0.0],
        index=labels)
    exp_samples = pd.DataFrame(
        [[-0.028597, 0.22903853, 0.07055272, 0.26163576, 0.28398669, 0.0],
         [0.37494056, 0.22334055, -0.20892914, 0.05057395, -0.18710366,
          0.0],
         [-0.33517593, -0.23855979, -0.3099887, 0.11521787, -0.05021553,
          0.0],
         [0.25412394, -0.4123464, 0.23343642, 0.06403168, -0.00482608,
          0.0],
         [-0.28256844, 0.18606911, 0.28875631, -0.06455635, -0.21141632,
          0.0],
         [0.01727687, 0.012458, -0.07382761, -0.42690292, 0.1695749, 0.0]],
        index=ids, columns=labels)

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=exp_eigvals,
        samples=exp_samples,
        proportion_explained=exp_proportions)

    raw = np.loadtxt(get_data_path('PCoA_sample_data_2'))

    # test passing a numpy.ndarray and a DistanceMatrix to pcoa
    # gives same results
    for candidate in (raw, DistanceMatrix(raw)):
        observed = pcoa(candidate)
        assert_ordination_results_equal(
            observed, expected_results, ignore_directionality=True)
def setUp(self):
    # Eight samples named ``subject_<Subject>_<Order>``; the mapping rows
    # below are listed in the same order as the ids.
    sample_ids = [
        'subject_1_1', 'subject_2_1', 'subject_2_0', 'subject_3_1',
        'subject_3_4', 'subject_1_9', 'subject_1_3', 'subject_1_5'
    ]

    distances = [
        [0, 1, 2, 3, 5, 8, 13, 21],
        [1, 0, 3, 5, 12, 44, 3, 4],
        [2, 3, 0, 22, 3, 11, 1, 6],
        [3, 5, 22, 0, 6, 33, 6, 7],
        [5, 12, 3, 6, 0, 12, 3, 8],
        [8, 44, 11, 33, 12, 0, 6, 6],
        [13, 3, 1, 6, 3, 6, 0, 9],
        [21, 4, 6, 7, 8, 6, 9, 0],
    ]
    self.dm = DistanceMatrix(distances, sample_ids)

    mapping_rows = [
        ['22', '1', '1'],
        ['2', '2', '1'],
        ['2', '2', '0'],
        ['1', '3', '1'],
        ['2', '3', '4'],
        ['34', '1', '9'],
        ['NA', '1', '3'],
        ['11111', '1', '5'],
    ]
    self.mf = pd.DataFrame(
        mapping_rows, index=sample_ids,
        columns=['LOL', 'Subject', 'Order'])
def __init__(self, args, current_wd, suppress, silence):
    """Load cluster assignments from a parameter file or an input file.

    Exactly one of ``args.p`` (parameter file) and ``args.i`` (input
    coordinate file) is expected:

    * ``args.p`` only: the CSV at ``current_wd/args.p`` is read and its
      ``cluster`` column is stored on ``self.cluster``.
    * ``args.p`` and ``args.i`` together: ``WarningCode13`` is raised and
      caught locally; ``self.warning_code`` is set to ``'13'`` and nothing
      else is loaded.
    * ``args.i`` only: coordinates are imported, a pairwise distance matrix
      is built from them, and the group -> cluster mapping is obtained
      through ``InputCode2``.
    * neither: ``Error(code='8')`` is raised and re-raised as
      ``ErrorCode8``.

    Parameters
    ----------
    args
        Parsed arguments; the attributes ``p`` and ``i`` are read.
    current_wd
        Directory that ``args.p`` / ``args.i`` are joined onto.
    suppress
        Forwarded to ``InputCode2`` and ``ErrorCode8``.
    silence
        Forwarded to ``WarningCode13``.

    Raises
    ------
    ErrorCode8
        When an ``Error`` is raised while handling the ``args.i`` branch
        (including the case where ``args.i`` is not given).
    """
    # expect only args.p or args.i, not both
    if args.p:
        try:
            if args.i:
                raise WarningCode13(silence=silence)
            parameter_df = pandas.read_csv(
                os.path.join(current_wd, args.p), index_col=0, header=0
            )  # TODO: FileNotFoundError
            ## TODO use 'RetrospectDataImport'
            self.cluster = list(parameter_df['cluster'])
        except WarningCode13:
            # Both options were supplied: record the warning code only.
            self.warning_code = '13'
    else:
        try:
            if args.i:
                print(f'Input: {args.i}\n')
                pca_coordinate = RetrospectDataImport(
                    file_name=os.path.join(current_wd, args.i),
                    type='coordinate')  # TODO: FileNotFoundError
                self.individual = list(
                    pca_coordinate.coordinate_w_info['individual'])
                # Pairwise distances of the selected coordinates against
                # themselves, wrapped as a DistanceMatrix.
                self.dist_matrix = DistanceMatrix(
                    distance_matrix(pca_coordinate.coordinate_select_np,
                                    pca_coordinate.coordinate_select_np))
                self.coordinate_w_info = pca_coordinate.coordinate_w_info
                self.group_set = pca_coordinate.group_set
                print(self.group_set)
                # set parameter by interacting with users
                cluster_map = InputCode2(set=self.group_set,
                                         suppress=suppress)
                # Translate each row's group into its cluster.
                self.coordinate_w_info['cluster'] = self.coordinate_w_info[
                    'group'].map(cluster_map.cluster_dict)
                self.cluster = self.coordinate_w_info['cluster']
            else:
                raise Error(code='8')
        except Error as e:
            raise ErrorCode8(suppress=suppress) from e
def main_vec():
    """Vectorized SNP-distance pipeline: count SNPs, then build an NJ tree.

    Reads the alignment named by the parsed arguments, writes one
    tab-separated line per genome pair (``id1, id2, raw count, normalized
    count``) to ``<out_dir>/snp_dist.tsv`` and the neighbor-joining tree of
    the normalized distances to ``<out_dir>/snp_dist.tree``.
    """
    args = parse_args()
    genomes = parse_msa(args['msa'], args['max_samples'])
    try:
        os.makedirs(args['out_dir'])
    except OSError:
        # Best effort, typically because the directory already exists. A
        # genuinely unusable path surfaces when the output file is opened.
        pass

    print("Count SNPs")
    dist_path = os.path.join(args['out_dir'], 'snp_dist.tsv')
    print(" path: %s" % dist_path)

    # One boolean row per genome: True where the site is not a gap.
    occurs = np.array([genomes[gid][0] != '-' for gid in genomes])

    matrix = []
    # The context manager guarantees the table is flushed and closed even
    # if a later step fails.
    with open(dist_path, 'w') as dist_file:
        for i, gid in enumerate(genomes.keys()):
            # Sites present in both this genome and each other genome.
            cooccs = occurs[i] & occurs
            diffs = np.array(
                [genomes[gid][0] != genomes[sid][0] for sid in genomes])
            raw_counts = np.sum(diffs & cooccs, axis=1)
            # NOTE(review): a pair with no co-occurring sites divides by
            # zero here (yielding nan); main() maps that case to 0.
            norm_counts = raw_counts / np.sum(cooccs, axis=1)
            for j, sid in enumerate(genomes.keys()):
                dist_file.write('\t'.join(
                    [gid, sid, str(raw_counts[j]), str(norm_counts[j])])
                    + '\n')
            matrix.append(norm_counts)

    print("Build SNP tree")
    tree_path = os.path.join(args['out_dir'], 'snp_dist.tree')
    print(" path: %s" % tree_path)
    dm = DistanceMatrix(matrix, genomes.keys())
    tree = nj(dm, result_constructor=str)
    with open(tree_path, 'w') as tree_file:
        tree_file.write(tree)

    print("\nDone!")
def njWithRoot(dis_matrix, muestraPmid):
    """Build a midpoint-rooted neighbor-joining tree.

    No distances are computed here; the given matrix is only paired with
    its ids (converted to strings) so that ``nj`` can consume it.

    Parameters
    ----------
    dis_matrix
        Square distance array exposing ``tolist()``.
    muestraPmid
        Iterable of ids, one per row of ``dis_matrix``.

    Returns
    -------
    TreeEte
        The rooted tree, re-parsed from its ``format=3`` Newick string.
    """
    labels = list(map(str, muestraPmid))
    distances = DistanceMatrix(dis_matrix.tolist(), labels)
    newick = nj(distances, result_constructor=str)

    # Root the tree on its midpoint outgroup.
    rooted = TreeEte(newick)
    rooted.set_outgroup(rooted.get_midpoint_outgroup())

    # Round-trip through Newick (written as format 3, read as format 1).
    return TreeEte(rooted.write(format=3), format=1)
def export_tree_for_all(all_patterns, matrixoutput, treeoutput,
                        build_tree=False):
    """Write the pairwise pattern distance matrix and, optionally, an NJ tree.

    Parameters
    ----------
    all_patterns : iterable of (samplename, pattern, count, pcnt)
        ``pattern`` is an iterable of hashable ``(pos, na, ref)`` tuples. A
        position whose ``na`` is ``'.'`` is treated as a wildcard: it is
        dropped from the pattern and excluded from comparisons against
        every other pattern.
    matrixoutput : str
        Path of the CSV file receiving the distance matrix. The header row
        starts with ``##`` followed by the pattern labels.
    treeoutput : str
        Path of the Newick output. With fewer than three patterns an empty
        tree ``();`` is written and no matrix file is produced.
    build_tree : bool, optional
        When true, and there are at most 10000 patterns, a midpoint-rooted
        neighbor-joining tree is written to ``treeoutput``. Defaults to
        ``False``, which matches the previous behaviour where tree building
        was unconditionally disabled.
    """
    wildcard_calls = ['A', 'C', 'G', 'T', 'ins', 'del', '.']

    patterns = []
    for idx, (samplename, pattern, count, pcnt) in enumerate(all_patterns):
        removes = set()
        for pos, na, ref in pattern:
            if na == '.':
                # A wildcard masks every possible call at this position.
                removes |= {(pos, ntmp, ref) for ntmp in wildcard_calls}
        patterns.append([
            idx,
            set(pattern) - removes,
            removes,
            '{}_{}_{:.1f}%'.format(samplename, idx + 1, pcnt * 100),
            count])

    num_patterns = len(patterns)
    if num_patterns < 3:
        # Neighbor joining needs at least three taxa.
        with open(treeoutput, 'w') as fp:
            fp.write('();')
        return

    dist_matrix = np.zeros((num_patterns, num_patterns), dtype=float)
    patternstrs = [ptnstr for _, _, _, ptnstr, _ in patterns]
    for (idx1, ptn1, rm1, _, _), (idx2, ptn2, rm2, _, _) in \
            combinations(patterns, 2):
        # Symmetric difference, ignoring positions masked on either side.
        distance = len((ptn1 - rm2) ^ (ptn2 - rm1))
        dist_matrix[idx1, idx2] = distance
        dist_matrix[idx2, idx1] = distance

    # newline='' lets the csv module control line endings; without it,
    # platforms that translate '\n' emit a blank line after every row.
    with open(matrixoutput, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['##', *patternstrs])
        writer.writerows(dist_matrix)

    if not build_tree or num_patterns > 10000:
        # Tree building is opt-in, and skipped when there are too many
        # patterns for neighbor joining to be practical.
        return

    tree = nj(DistanceMatrix(dist_matrix, patternstrs))
    with open(treeoutput, 'w') as fp:
        fp.write(str(tree.root_at_midpoint()))
def test_weighted_unifrac(self):
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    # expected values calculated by hand
    shared = dict(otu_ids=self.oids1, tree=self.tree1)
    # The metric can be given by name or as the callable itself.
    by_name = beta_diversity(
        'weighted_unifrac', self.table1, self.sids1, **shared)
    by_callable = beta_diversity(
        weighted_unifrac, self.table1, self.sids1, **shared)
    self.assertEqual(by_name.shape, (3, 3))
    self.assertEqual(by_name, by_callable)

    expected = DistanceMatrix(
        [[0.0, 0.1750000, 0.12499999],
         [0.1750000, 0.0, 0.3000000],
         [0.12499999, 0.3000000, 0.0]],
        ids=self.sids1)
    for row_id in self.sids1:
        for col_id in self.sids1:
            npt.assert_almost_equal(
                by_name[row_id, col_id], expected[row_id, col_id], 6)
def test_from_iterable_skbio_hamming_metric_with_metadata(self):
    # test for #1254
    labels = ['a', 'b', 'c', 'd']
    # The sequences differ in the metadata they carry.
    sequences = [
        Sequence('ACGT'),
        Sequence('ACGA', metadata={'id': 'seq1'}),
        Sequence('AAAA', metadata={'id': 'seq2'}),
        Sequence('AAAA', positional_metadata={'qual': range(4)}),
    ]

    expected = DistanceMatrix(
        [[0, 0.25, 0.75, 0.75],
         [0.25, 0.0, 0.5, 0.5],
         [0.75, 0.5, 0.0, 0.0],
         [0.75, 0.5, 0.0, 0.0]],
        list(labels))

    observed = DistanceMatrix.from_iterable(
        sequences, metric=skbio.sequence.distance.hamming,
        keys=list(labels))
    self.assertEqual(observed, expected)
def PCoA_group_from_matrix(distance_matrix, biom_file, groups, plot=False):
    """Run PCoA on ``distance_matrix`` and colour samples by ``groups``.

    Samples are labelled ``'0'``, ``'1'``, ... by position and
    ``groups[i]`` is used as the ``body_site`` of sample ``i``.
    ``biom_file`` is accepted but not used by this function.

    Returns the matplotlib figure, or ``None`` after calling ``plt.show()``
    when ``plot`` is true.
    """
    positions = range(len(groups))
    sample_labels = [str(pos) for pos in positions]
    sk_distance_matrix = DistanceMatrix(distance_matrix, sample_labels)

    site_by_sample = {str(pos): {'body_site': groups[pos]}
                      for pos in positions}
    pd_metadata = pd.DataFrame.from_dict(site_by_sample, orient='index')

    result = pcoa(sk_distance_matrix)
    # Each axis label carries the percentage of variance it explains.
    explained = result.proportion_explained
    axis_labels = tuple(
        'PC ' + str(axis + 1) + ' ('
        + str(round(explained.iloc[axis]*100, 2)) + '%)'
        for axis in range(3))

    fig = result.plot(df=pd_metadata, column='body_site',
                      axis_labels=axis_labels,
                      title='Samples colored by body site',
                      cmap='Set1', s=50)
    fig.set_size_inches(18.5, 10.5)

    if plot:
        plt.show()
        return None
    return fig
def PCoA_total_from_matrix(distance_matrix, biom_file, metadata_file,
                           plot=False):
    """Run PCoA on ``distance_matrix`` and colour samples by body site.

    Sample ids come from ``BW.extract_samples(biom_file)`` and the
    per-sample metadata (which must supply a ``body_site`` column) from
    ``meta.extract_metadata(metadata_file)``.

    Returns the matplotlib figure, or ``None`` after calling ``plt.show()``
    when ``plot`` is true.
    """
    sample_ids = BW.extract_samples(biom_file)
    sk_distance_matrix = DistanceMatrix(distance_matrix, sample_ids)

    pd_metadata = pd.DataFrame.from_dict(
        meta.extract_metadata(metadata_file), orient='index')

    result = pcoa(sk_distance_matrix)
    # Each axis label carries the percentage of variance it explains.
    explained = result.proportion_explained
    axis_labels = tuple(
        'PC ' + str(axis + 1) + ' ('
        + str(round(explained.iloc[axis]*100, 2)) + '%)'
        for axis in range(3))

    fig = result.plot(df=pd_metadata, column='body_site',
                      axis_labels=axis_labels,
                      title='Samples colored by body site',
                      cmap='Set1', s=50)
    fig.set_size_inches(18.5, 10.5)

    if plot:
        plt.show()
        return None
    return fig
def setUp(self):
    super().setUp()

    # A single 4x4 'unifrac' matrix registered under dataset1's beta
    # resources.
    unifrac_dm = DistanceMatrix(
        [[0, 1, 2, 3],
         [1, 0, 3, 4],
         [2, 3, 0, 5],
         [3, 4, 5, 0]],
        ids=['s1', 's2', 's3', 's4'],
    )
    beta = BetaElement({'unifrac': unifrac_dm})
    dataset = DictElement({'__beta__': beta})
    datasets = DictElement({'dataset1': dataset})
    self.resources = DictElement({'datasets': datasets})
    self.resources.accept(TrivialVisitor())

    # Make the beta API resolve its resources to the fixture above.
    self.res_patcher = patch(
        'microsetta_public_api.api.diversity.beta.get_resources')
    self.mock_resources = self.res_patcher.start()
    self.mock_resources.return_value = self.resources
def test_default_usage(self):
    # A 1x1 matrix can only ever be the single zero.
    expected = DistanceMatrix(np.asarray([[0.0]]), ['1'])
    self.assertEqual(randdm(1), expected)

    two_by_two = randdm(2)
    self.assertEqual(two_by_two.shape, (2, 2))
    self.assertEqual(two_by_two.ids, ('1', '2'))

    # Repeated draws should not all be identical; stop at the first draw
    # that differs from the reference.
    reference = randdm(5)
    num_trials = 10
    found_diff = any(reference != randdm(5) for _ in range(num_trials))
    self.assertTrue(found_diff)
def euclidean_distance(metadata: qiime2.Metadata, x: str, y: str,
                       z: str = None,
                       missing_data: str = 'error') -> DistanceMatrix:
    """Compute pairwise Euclidean distances between sample coordinates.

    The metadata columns named by ``x``, ``y`` and (when given) ``z`` are
    loaded and validated through ``_load_and_validate``; the condensed
    ``pdist`` output is wrapped in a ``DistanceMatrix`` keyed by the sample
    index.
    """
    axes = [(x, 'x'), (y, 'y')]
    if z is not None:
        axes.append((z, 'z'))
    cols = [column for column, _ in axes]
    names = [axis_name for _, axis_name in axes]

    sample_md = _load_and_validate(metadata, cols, names, missing_data)

    # Compute pairwise distances between all points
    condensed = scipy.spatial.distance.pdist(
        sample_md.values, metric='euclidean')
    return DistanceMatrix(condensed, ids=sample_md.index)
def main():
    """SNP-distance pipeline: count pairwise SNPs, then build an NJ tree.

    Reads the alignment named by the parsed arguments, writes one
    tab-separated line per genome pair (``id1, id2, raw count, normalized
    count``) to ``<out_dir>/snp_dist.tsv`` and the neighbor-joining tree of
    the normalized distances to ``<out_dir>/snp_dist.tree``.
    """
    args = parse_args()
    genomes = parse_msa(args['msa'], args['max_samples'])
    try:
        os.makedirs(args['out_dir'])
    except OSError:
        # Best effort, typically because the directory already exists. A
        # genuinely unusable path surfaces when the output file is opened.
        pass

    print("Count SNPs")
    dist_path = os.path.join(args['out_dir'], 'snp_dist.tsv')
    print(" path: %s" % dist_path)

    matrix = []
    # The context manager guarantees the table is flushed and closed even
    # if a later step fails.
    with open(dist_path, 'w') as dist_file:
        for id1 in genomes:
            array = []
            is_present1 = genomes[id1] != '-'
            for id2 in genomes:
                is_present2 = genomes[id2] != '-'
                is_diff = genomes[id1] != genomes[id2]
                # Only sites present in both genomes are compared.
                co_occur = is_present1 & is_present2
                raw_count = (is_diff & co_occur).sum()
                co_sum = co_occur.sum()
                # Identical pairs, and pairs sharing no site, get 0.
                norm_count = 0
                if raw_count != 0 and co_sum != 0:
                    norm_count = float(raw_count) / co_sum
                array.append(norm_count)
                dist_file.write('\t'.join(
                    [id1, id2, str(raw_count), str(norm_count)]) + '\n')
            matrix.append(array)

    print("Build SNP tree")
    tree_path = os.path.join(args['out_dir'], 'snp_dist.tree')
    print(" path: %s" % tree_path)
    dm = DistanceMatrix(matrix, genomes.keys())
    tree = nj(dm, result_constructor=str)
    with open(tree_path, 'w') as tree_file:
        tree_file.write(tree)

    print("\nDone!")
def pw_distances(counts, ids=None, metric="braycurtis"):
    """Compute distances between all pairs of rows in a counts matrix

    Parameters
    ----------
    counts : 2D array_like of ints or floats
        Matrix containing count/abundance data where each row contains
        counts of observations in a given sample.
    ids : iterable of strs, optional
        Identifiers for each sample in ``counts``.
    metric : str, optional
        The name of the pairwise distance function to use when generating
        pairwise distances. See the scipy ``pdist`` docs, linked under *See
        Also*, for available metrics.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between all pairs of samples (i.e., rows). The number of
        row and columns will be equal to the number of rows in ``counts``.

    Raises
    ------
    ValueError
        If ``len(ids) != len(counts)``.

    See Also
    --------
    scipy.spatial.distance.pdist
    pw_distances_from_table

    """
    num_samples = len(counts)
    ids_given = ids is not None
    if ids_given and len(ids) != num_samples:
        raise ValueError(
            "Number of rows in counts must be equal to number of provided "
            "ids.")

    # pdist yields the condensed form; expand it to a square matrix.
    condensed = pdist(counts, metric)
    square = squareform(condensed, force='tomatrix', checks=False)
    return DistanceMatrix(square, ids)
def pw_distances_from_table(table, metric="braycurtis"):
    """Compute distances between all pairs of samples in table

    Parameters
    ----------
    table : biom.table.Table
        ``Table`` containing count/abundance data of observations across
        samples.
    metric : str, optional
        The name of the pairwise distance function to use when generating
        pairwise distances. See the scipy ``pdist`` docs, linked under *See
        Also*, for available metrics.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between all pairs of samples. The number of row and
        columns will be equal to the number of samples in ``table``.

    See Also
    --------
    scipy.spatial.distance.pdist
    biom.table.Table
    pw_distances

    """
    warn("pw_distances_from_table is deprecated. In the future (tentatively "
         "scikit-bio 0.2.0), pw_distance will take a biom.table.Table object "
         "and this function will be removed. You will need to update your "
         "code to call pw_distances at that time.")
    sample_ids = table.sample_ids
    num_samples = len(sample_ids)

    # initialize the result object
    dm = np.zeros((num_samples, num_samples))
    for i, sid1 in enumerate(sample_ids):
        v1 = table.data(sid1)
        for j, sid2 in enumerate(sample_ids[:i]):
            v2 = table.data(sid2)
            # pdist on two vectors returns a length-1 array. Take the scalar
            # explicitly: assigning a size-1 array into a single cell relies
            # on an array-to-scalar conversion that NumPy >= 1.25 deprecates.
            dm[i, j] = dm[j, i] = pdist([v1, v2], metric)[0]
    return DistanceMatrix(dm, sample_ids)
def test_pw_distances_unweighted_unifrac(self):
    # expected values calculated by hand
    shared = dict(otu_ids=self.otu_ids1, tree=self.tree1)
    # The metric can be given by name or as the callable itself.
    by_name = pw_distances(
        'unweighted_unifrac', self.t1, self.ids1, **shared)
    by_callable = pw_distances(
        unweighted_unifrac, self.t1, self.ids1, **shared)
    self.assertEqual(by_name.shape, (3, 3))
    self.assertEqual(by_name, by_callable)

    expected = DistanceMatrix(
        [[0.0, 0.0, 0.25 / 1.0],
         [0.0, 0.0, 0.25 / 1.0],
         [0.25 / 1.0, 0.25 / 1.0, 0.0]],
        ids=self.ids1)
    for row_id in self.ids1:
        for col_id in self.ids1:
            npt.assert_almost_equal(
                by_name[row_id, col_id], expected[row_id, col_id], 6)
def _compute_collapsed_dm(dm, i, j, disallow_negative_branch_length,
                          new_node_id):
    """Return the distance matrix resulting from joining ids i and j in a node.

    If the input distance matrix has shape ``(n, n)``, the result will have
    shape ``(n-1, n-1)`` as the ids `i` and `j` are collapsed to a single new
    ids.

    The new node occupies row/column 0 of the result; the remaining ids
    keep their relative order from ``dm``.

    """
    out_n = dm.shape[0] - 1
    survivors = [e for e in dm.ids if e not in (i, j)]
    out_ids = [new_node_id] + survivors

    result = np.zeros((out_n, out_n))
    for row, row_id in enumerate(survivors, start=1):
        # Distance from the freshly created node to this surviving id.
        to_new_node = _otu_to_new_node(
            dm, i, j, row_id, disallow_negative_branch_length)
        result[0, row] = to_new_node
        result[row, 0] = to_new_node
        # Distances among surviving ids are copied over unchanged.
        for col in range(1, row):
            carried = dm[row_id, out_ids[col]]
            result[row, col] = carried
            result[col, row] = carried
    return DistanceMatrix(result, out_ids)
def partition_weighted_distance(nwkfile):
    """Average tip-to-tip distances over partitions, weighted by length.

    Every ``(partlen, tree)`` pair yielded by ``iter_newick_partitoned``
    contributes its tip-to-tip distance matrix with weight
    ``partlen / sum of all partlen``. Tip order is fixed by the first tree:
    names are sorted numerically when they all parse as integers, otherwise
    as plain strings.

    Returns a ``DistanceMatrix`` keyed by those tip names.
    """
    weighted_parts = []
    length_total = 0
    totaldist = None
    tipnames = None

    for partlen, tree in iter_newick_partitoned(nwkfile):
        length_total += partlen
        if tipnames is None:
            try:
                numeric = sorted(int(tip.name) for tip in tree.tips())
                tipnames = [str(value) for value in numeric]
            except ValueError:
                # At least one name is not an integer: sort as strings.
                tipnames = sorted(tip.name for tip in tree.tips())
        distances = tree.tip_tip_distances(tipnames).data
        if totaldist is None:
            totaldist = np.zeros_like(distances)
        weighted_parts.append((partlen, distances))

    # The weights need the grand total, hence the second pass.
    for partlen, distances in weighted_parts:
        weight = partlen / length_total
        totaldist += distances * weight

    return DistanceMatrix(totaldist, ids=tipnames)
def PCoA_total_from_matrix_clustering(distance_matrix, biom_file,
                                      assignments, plot=False):
    """Run PCoA on ``distance_matrix`` and colour samples by cluster.

    Sample ids come from ``BW.extract_samples(biom_file)``; sample ``i`` is
    labelled ``'Group <assignments[i] + 1>'`` in the ``body_site`` column
    used for colouring.

    Returns the matplotlib figure, or ``None`` after calling ``plt.show()``
    when ``plot`` is true.
    """
    samples = BW.extract_samples(biom_file)
    # NOTE(review): the sample ids are extracted a second time here, as in
    # the original; reusing ``samples`` looks equivalent -- confirm.
    sk_distance_matrix = DistanceMatrix(distance_matrix,
                                        BW.extract_samples(biom_file))

    group_by_sample = {
        samples[pos]: {'body_site': 'Group ' + str(assignments[pos]+1)}
        for pos in range(len(assignments))}
    pd_metadata = pd.DataFrame.from_dict(group_by_sample, orient='index')

    result = pcoa(sk_distance_matrix)
    # Each axis label carries the percentage of variance it explains.
    explained = result.proportion_explained
    axis_labels = tuple(
        'PC ' + str(axis + 1) + ' ('
        + str(round(explained.iloc[axis]*100, 2)) + '%)'
        for axis in range(3))

    fig = result.plot(df=pd_metadata, column='body_site',
                      axis_labels=axis_labels,
                      title='Samples colored by body site',
                      cmap='Set1', s=50)
    fig.set_size_inches(18.5, 10.5)

    if plot:
        plt.show()
        return None
    return fig
def rig_pairwise(G, X, n_jobs=1):
    """ Compute the RIG metric on all pairs of samples

    Parameters
    ----------
    G : nx.Graph
        A connected graph of weighted edges.
    X : pd.DataFrame
        Contingency table of samples where rows are samples and
        columns are features (i.e. metabolites).
    n_jobs : int, optional
        Number of parallel jobs forwarded to ``pairwise_distances``.
        Defaults to 1.

    Returns
    -------
    skbio.DistanceMatrix
        Distance matrix of distances.
    """
    labs = X.columns.values
    rig_func = partial(rig, G=G, labs=labs)
    # Note cannot work with pandas, so the raw array is passed instead.
    # ``n_jobs`` used to be hard-coded to 1 here, silently ignoring the
    # caller's argument; it is now honoured.
    dm = pairwise_distances(X.values, metric=rig_func, n_jobs=n_jobs)
    return DistanceMatrix(dm, ids=X.index.values)
def get_guide_tree(seqs, random=False):
    """
    Get a guide tree representing distances between sequences

    :param seqs: Sequences to create a tree for
    :param random: Use ``calc_random_distances`` instead of
        ``calc_distances`` to obtain the pairwise distances
    :return: Guide tree (neighbour-joining, mid-point rooted, with labelled
        internal nodes)
    """
    # Get distances and ids
    distance_source = calc_random_distances if random else calc_distances
    distances = distance_source(seqs)
    names = [seq.name for seq in seqs]

    # Make a distance matrix and Neighbour-Joining tree
    joined = nj(DistanceMatrix(distances, names))

    # Mid-point root and then label the internal nodes
    rooted = joined.root_at_midpoint()
    label_internal_nodes(rooted)
    return rooted
def nj_tree(feature_matrix):
    """Build a neighbor-joining tree from pairwise Hamming distances.

    Parameters
    ----------
    feature_matrix : pd.DataFrame
        One row per item; distances are computed on ``.values``.

    Returns
    -------
    The tree returned by ``skbio.tree.nj``.

    The elapsed time of each stage is printed as a crude progress report.
    """
    from skbio import DistanceMatrix
    from skbio.tree import nj
    # ``import sklearn`` alone does not guarantee that the ``metrics``
    # submodule is loaded, so the function is imported explicitly.
    from sklearn.metrics import pairwise_distances
    import time

    t = time.time()
    data = pairwise_distances(feature_matrix.values, metric='hamming')
    print(time.time() - t)

    t = time.time()
    dm = DistanceMatrix(data)
    print('distance matrix', time.time() - t)

    t = time.time()
    tree = nj(dm)
    print('tree build', time.time() - t)
    return tree
def variation_matrix(X):
    r""" Calculate Aitchison variation matrix.

    This calculates the Aitchison variation matrix. Given a compositional
    matrix :math:`X`, and columns :math:`i` and :math:`j`, the :math:`ij`
    entry in the variation matrix of :math:`X` is given by

    .. math::

        V_{ij} = \frac{1}{2} var(\ln \frac{x_i}{x_j})

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where there are n rows corresponding to samples
        and p features corresponding to columns.

    Returns
    -------
    skbio.DistanceMatrix
        Total variation matrix of size p x p, indexed by the columns
        (features) of ``X``.

    References
    ----------
    .. [1] V. Pawlowsky-Glahn, J. J. Egozcue, R. Tolosana-Delgado (2015),
       Modeling and Analysis of Compositional Data, Wiley, Chichester, UK

    .. [2] J. J. Egozcue, V. Pawlowsky-Glahn (2004), Groups of Parts and
       Their Balances in Compositional Data Analysis, Mathematical Geology
    """
    n_features = X.shape[1]
    v = np.zeros((n_features, n_features))
    # Take the logarithm once up front instead of once per column pair.
    log_x = np.log(closure(X))
    # Only the strict lower triangle is filled; the diagonal stays zero.
    for i in range(n_features):
        for j in range(i):
            v[i, j] = np.var(log_x[:, i] - log_x[:, j])
    # Making matrix symmetry since V(ln (x/y) ) = V(ln (y/x) )
    # Also dividing by 2, to ensure unit norm for balances.
    # See Eqn 4 in [2]
    return DistanceMatrix((v + v.T) / 2, ids=X.columns)
def test_block_compute(self):
    def mock_metric(u, v):
        return (u + v).sum()

    abundance = np.array([[0, 1, 2, 3, 4, 5],
                          [1, 2, 3, 4, 5, 0],
                          [2, 3, 4, 5, 0, 1],
                          [10, 2, 3, 6, 8, 2],
                          [9, 9, 2, 2, 3, 4]])

    # Only the (2, 4) and (3, 4) pairs are requested, so only those cells
    # (and their mirrors) are expected to be non-zero.
    expected = DistanceMatrix(
        np.array([[0, 0, 44],
                  [0, 0, 60],
                  [44, 60, 0]]),
        (2, 3, 4))

    observed = _block_compute(
        metric=mock_metric,
        counts=abundance,
        row_ids=np.array((2, 3)),
        col_ids=np.array((4, )),
        id_pairs=[(2, 4), (3, 4)],
        ids=[1, 2, 3, 4, 5])

    npt.assert_equal(observed.data, expected.data)
    self.assertEqual(observed.ids, expected.ids)