Esempio n. 1
0
    def __load_distance_matrix(self, data):
        """Build a neighbor-joining tree from ``data`` and cache its distances.

        Parameters
        ----------
        data : array_like
            Square distance data accepted by ``DistanceMatrix``.

        Notes
        -----
        Stores the tip-to-tip distances of the NJ tree, ordered by integer
        tip label on both axes, in ``self.dist_matrix``. The tree is then
        bifurcated and handed to the post-order and genotype-building
        helpers.
        """
        dm = DistanceMatrix(data)
        nj_tree = nj(dm)

        df = nj_tree.tip_tip_distances().to_data_frame()

        # Labels are cast to integers so that rows and columns are sorted
        # numerically rather than lexicographically ('10' < '2').
        df.index = df.index.astype(int)
        df.sort_index(inplace=True)
        df.columns = df.columns.values.astype(np.int32)
        df = df[sorted(df.columns)]

        # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
        # returns the same ndarray on both old and new pandas versions.
        self.dist_matrix = df.values

        nj_tree.bifurcate()
        self.__post_order(nj_tree)
        self.__build_genotype(nj_tree)
Esempio n. 2
0
 def test_id_pairs_as_iterable(self):
     """``id_pairs`` may be supplied as a one-shot iterator."""
     pairs = iter([('B', 'C')])
     observed = partial_beta_diversity(
         'unweighted_unifrac', self.table1, self.sids1,
         otu_ids=self.oids1, tree=self.tree1, id_pairs=pairs)
     self.assertEqual(observed.shape, (3, 3))

     # Only the requested (B, C) pair carries a non-zero distance.
     expected = DistanceMatrix(
         [[0.0, 0.0, 0.0],
          [0.0, 0.0, 0.25],
          [0.0, 0.25, 0.0]],
         ids=self.sids1)
     for row_id in self.sids1:
         for col_id in self.sids1:
             npt.assert_almost_equal(
                 observed[row_id, col_id], expected[row_id, col_id], 6)
Esempio n. 3
0
 def test_unweighted_unifrac_partial(self):
     """Partial unweighted UniFrac fills in only the requested pair."""
     # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
     # near-equality testing when that support is available
     requested_pairs = [('B', 'C')]
     observed = partial_beta_diversity(
         'unweighted_unifrac', self.table1, self.sids1,
         otu_ids=self.oids1, tree=self.tree1, id_pairs=requested_pairs)
     self.assertEqual(observed.shape, (3, 3))

     # expected values calculated by hand
     expected = DistanceMatrix(
         [[0.0, 0.0, 0.0], [0.0, 0.0, 0.25], [0.0, 0.25, 0.0]],
         ids=self.sids1)
     for row_id in self.sids1:
         for col_id in self.sids1:
             npt.assert_almost_equal(
                 observed[row_id, col_id], expected[row_id, col_id], 6)
Esempio n. 4
0
def create_tree_of_trees(trees: dict, out, metric='score', outgroup=''):
    """Build a neighbor-joining "tree of trees" from pairwise tree distances.

    Parameters
    ----------
    trees : dict
        Mapping of name -> tree; names are processed in sorted order.
    out
        Output path. It is always passed to ``parse_path``; the resulting
        tree is written there only when ``out`` is truthy.
    metric : str
        Forwarded to ``get_dist_trees``.
    outgroup
        When not ``None``, ``outgroup_tree`` is applied to the result.
        NOTE(review): the value itself is never forwarded, and the default
        ``''`` is not ``None``, so the call happens by default - confirm
        this is intended.

    Returns
    -------
    ete3.Tree
        The neighbor-joining tree built over the tree names.
    """
    _out_dir, _out_name = parse_path(out)
    labels = sorted(trees.keys())
    count = len(labels)

    # Fill the strict lower triangle one column at a time.
    lower = np.zeros((count, count))
    for col, ref_name in enumerate(labels[:-1]):
        others = [trees[name] for name in labels[col + 1:]]
        lower[col + 1:count, col] = get_dist_trees(
            trees[ref_name], others, metric)

    # Mirror the lower triangle to obtain a symmetric matrix.
    symmetric = lower + lower.T
    newick = nj(DistanceMatrix(symmetric, labels), result_constructor=str)
    tree = ete3.Tree(newick)

    if outgroup is not None:
        outgroup_tree(tree)
    if out:
        tree.write(outfile=out)
    return tree
Esempio n. 5
0
def _temporal_distance(corr, id_set, dist_method="fro"):
    """Calculate a distance matrix from temporal correlation data.

    corr: pd.DataFrame
        table grouped by individual ids, this is the output from
        _temporal_corr
    id_set: pd.Series
        unique subject ids from individual_id with index attached
    dist_method: str
        passed as the ``ord`` argument of ``linalg.norm``
    """
    size = len(id_set)
    dist = np.zeros((size, size))

    # Only the lower triangle is computed; each value is mirrored.
    for row, row_id in enumerate(id_set):
        for col, col_id in enumerate(id_set[:row]):
            delta = corr.loc[row_id] - corr.loc[col_id]
            value = linalg.norm(delta, ord=dist_method)
            dist[row, col] = value
            dist[col, row] = value

    return DistanceMatrix(dist, ids=id_set.index)
    def test_extensive(self):
        """PCoA reproduces the reference ordination for sample data set 2."""
        axes = ['PC%d' % axis for axis in range(1, 7)]
        ids = list(map(str, range(6)))

        expected_eigvals = pd.Series(
            [0.3984635, 0.36405689, 0.28804535, 0.27479983, 0.19165361, 0.0],
            index=axes)
        expected_proportions = pd.Series(
            [0.2626621381, 0.2399817314, 0.1898758748, 0.1811445992,
             0.1263356565, 0.0],
            index=axes)
        expected_coords = pd.DataFrame(
            [[-0.028597, 0.22903853, 0.07055272,
              0.26163576, 0.28398669, 0.0],
             [0.37494056, 0.22334055, -0.20892914,
              0.05057395, -0.18710366, 0.0],
             [-0.33517593, -0.23855979, -0.3099887,
              0.11521787, -0.05021553, 0.0],
             [0.25412394, -0.4123464, 0.23343642,
              0.06403168, -0.00482608, 0.0],
             [-0.28256844, 0.18606911, 0.28875631,
              -0.06455635, -0.21141632, 0.0],
             [0.01727687, 0.012458, -0.07382761,
              -0.42690292, 0.1695749, 0.0]],
            index=ids,
            columns=axes)

        expected = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=expected_eigvals,
            samples=expected_coords,
            proportion_explained=expected_proportions)

        raw = np.loadtxt(get_data_path('PCoA_sample_data_2'))
        # A plain ndarray and a DistanceMatrix wrapping it must give the
        # same ordination.
        for candidate in (raw, DistanceMatrix(raw)):
            assert_ordination_results_equal(
                pcoa(candidate), expected, ignore_directionality=True)
Esempio n. 7
0
    def setUp(self):
        """Create an 8-sample distance matrix and a matching mapping frame."""
        sample_ids = [
            'subject_1_1', 'subject_2_1', 'subject_2_0', 'subject_3_1',
            'subject_3_4', 'subject_1_9', 'subject_1_3', 'subject_1_5'
        ]

        distances = [
            [0, 1, 2, 3, 5, 8, 13, 21],
            [1, 0, 3, 5, 12, 44, 3, 4],
            [2, 3, 0, 22, 3, 11, 1, 6],
            [3, 5, 22, 0, 6, 33, 6, 7],
            [5, 12, 3, 6, 0, 12, 3, 8],
            [8, 44, 11, 33, 12, 0, 6, 6],
            [13, 3, 1, 6, 3, 6, 0, 9],
            [21, 4, 6, 7, 8, 6, 9, 0],
        ]

        columns = ['LOL', 'Subject', 'Order']
        rows = [
            ['22', '1', '1'],
            ['2', '2', '1'],
            ['2', '2', '0'],
            ['1', '3', '1'],
            ['2', '3', '4'],
            ['34', '1', '9'],
            ['NA', '1', '3'],
            ['11111', '1', '5'],
        ]

        self.dm = DistanceMatrix(distances, sample_ids)
        self.mf = pd.DataFrame(rows, sample_ids, columns)
Esempio n. 8
0
    def __init__(self, args, current_wd, suppress, silence):
        """Initialise from either a parameter file or a coordinate file.

        Parameters
        ----------
        args
            Object exposing ``p`` (parameter CSV path) and ``i`` (coordinate
            input path); both are resolved relative to ``current_wd``.
        current_wd
            Directory that ``args.p`` / ``args.i`` are joined onto.
        suppress
            Forwarded to ``InputCode2`` and ``ErrorCode8``.
        silence
            Forwarded to ``WarningCode13``.

        Raises
        ------
        ErrorCode8
            When neither ``args.p`` nor ``args.i`` is given, or when any
            ``Error`` is raised while handling ``args.i``.
        """
        # expect only args.p or args.i, not both
        if args.p:
            try:
                # Supplying both inputs raises WarningCode13, which is caught
                # below and recorded instead of propagating.
                if args.i:
                    raise WarningCode13(silence=silence)

                parameter_df = pandas.read_csv(
                    os.path.join(current_wd, args.p), index_col=0, header=0
                )  # TODO: FileNotFoundError  ## TODO use 'RetrospectDataImport'
                self.cluster = list(parameter_df['cluster'])

            except WarningCode13:
                # NOTE(review): self.cluster is left unset on this path -
                # confirm callers check warning_code first.
                self.warning_code = '13'

        else:
            try:
                if args.i:
                    print(f'Input: {args.i}\n')

                    pca_coordinate = RetrospectDataImport(
                        file_name=os.path.join(current_wd, args.i),
                        type='coordinate')  # TODO: FileNotFoundError
                    self.individual = list(
                        pca_coordinate.coordinate_w_info['individual'])
                    # Pairwise distances of the selected coordinates against
                    # themselves, wrapped as a DistanceMatrix.
                    self.dist_matrix = DistanceMatrix(
                        distance_matrix(pca_coordinate.coordinate_select_np,
                                        pca_coordinate.coordinate_select_np))
                    self.coordinate_w_info = pca_coordinate.coordinate_w_info
                    self.group_set = pca_coordinate.group_set
                    print(self.group_set)

                    # set parameter by interacting with users
                    cluster_map = InputCode2(set=self.group_set,
                                             suppress=suppress)
                    # Translate each row's 'group' into a cluster label via
                    # the mapping held by cluster_map.
                    self.coordinate_w_info['cluster'] = self.coordinate_w_info[
                        'group'].map(cluster_map.cluster_dict)
                    self.cluster = self.coordinate_w_info['cluster']

                else:
                    raise Error(code='8')

            except Error as e:
                # Every Error raised inside the try block above is re-raised
                # as ErrorCode8, chained to the original.
                raise ErrorCode8(suppress=suppress) from e
Esempio n. 9
0
def main_vec():
	"""Compute pairwise SNP distances (vectorised per row) and build an NJ tree.

	Reads the alignment named by the parsed arguments, writes one
	tab-separated line per genome pair (id, id, raw count, normalised count)
	to ``snp_dist.tsv`` and the neighbor-joining tree to ``snp_dist.tree``,
	both inside ``args['out_dir']``.
	"""
	args = parse_args()

	genomes = parse_msa(args['msa'], args['max_samples'])

	# Create the output directory. An already-existing directory is fine;
	# any other failure (e.g. permissions) is re-raised, not swallowed.
	try:
		os.makedirs(args['out_dir'])
	except OSError:
		if not os.path.isdir(args['out_dir']):
			raise

	print("Count SNPs")
	dist_path = os.path.join(args['out_dir'], 'snp_dist.tsv')
	print("   path: %s" % dist_path)
	matrix = []

	# Freeze the id order once so every loop sees the same sequence.
	genome_ids = list(genomes)

	# occurs[k] marks the non-gap positions of genome k.
	occurs = np.array([genomes[gid][0] != '-' for gid in genome_ids])

	# The context manager guarantees the table is flushed and closed.
	with open(dist_path, 'w') as dist_file:
		for i, gid in enumerate(genome_ids):
			# Positions present in both this genome and each other genome.
			cooccs = occurs[i] & occurs

			diffs = np.array(
				[genomes[gid][0] != genomes[sid][0] for sid in genome_ids])

			raw_counts = np.sum(diffs & cooccs, axis=1)
			norm_counts = raw_counts / np.sum(cooccs, axis=1)

			for j, sid in enumerate(genome_ids):
				dist_file.write('\t'.join([gid, sid, str(raw_counts[j]), str(norm_counts[j])])+'\n')

			matrix.append(norm_counts)

	print("Build SNP tree")
	tree_path = os.path.join(args['out_dir'], 'snp_dist.tree')
	print("   path: %s" % tree_path)
	dm = DistanceMatrix(matrix, genome_ids)
	tree = nj(dm, result_constructor=str)
	with open(tree_path, 'w') as tree_file:
		tree_file.write(tree)

	print("\nDone!")
Esempio n. 10
0
def njWithRoot(dis_matrix, muestraPmid):
    """Build a midpoint-rooted neighbor-joining tree from a distance matrix.

    No distances are computed here; the existing ones are only paired with
    their ids in the format that ``nj`` expects.

    Parameters
    ----------
    dis_matrix
        Object with a ``tolist()`` method yielding the square distances.
    muestraPmid
        Iterable of ids; each one is converted with ``str``.

    Returns
    -------
    TreeEte
        The rooted tree, re-parsed from its ``format=3`` newick string with
        ``format=1``.
    """
    ids = list(map(str, muestraPmid))
    dm = DistanceMatrix(dis_matrix.tolist(), ids)
    newick = nj(dm, result_constructor=str)

    # Root the tree at its midpoint outgroup.
    rooted = TreeEte(newick)
    rooted.set_outgroup(rooted.get_midpoint_outgroup())

    # Round-trip through newick to get the final tree object.
    return TreeEte(rooted.write(format=3), format=1)
Esempio n. 11
0
def export_tree_for_all(all_patterns, matrixoutput, treeoutput,
                        build_tree=False, max_tree_patterns=10000):
    """Write the pairwise pattern distance matrix and, optionally, an NJ tree.

    Parameters
    ----------
    all_patterns : iterable of (samplename, pattern, count, pcnt)
        ``pattern`` is an iterable of hashable ``(pos, na, ref)`` tuples.
        A ``na`` of ``'.'`` is a wildcard at that position: the position is
        ignored when the pattern is compared with another one.
    matrixoutput : str
        Path of the CSV file receiving the distance matrix. The header row
        starts with ``'##'`` followed by the pattern labels.
    treeoutput : str
        Path of the newick output. With fewer than three patterns the empty
        tree ``'();'`` is written and nothing else is produced.
    build_tree : bool, optional
        Build and write the midpoint-rooted neighbor-joining tree. Defaults
        to ``False``, matching the previous behaviour where tree building
        was unconditionally disabled.
    max_tree_patterns : int, optional
        Upper bound on the number of patterns for which a tree is built,
        even when ``build_tree`` is set.
    """
    wildcard_alleles = ('A', 'C', 'G', 'T', 'ins', 'del', '.')

    patterns = []
    for idx, (samplename, pattern, count, pcnt) in enumerate(all_patterns):
        # Every allele at a wildcard position is excluded from comparison.
        removes = {
            (pos, allele, ref)
            for pos, na, ref in pattern if na == '.'
            for allele in wildcard_alleles
        }
        label = '{}_{}_{:.1f}%'.format(samplename, idx + 1, pcnt * 100)
        patterns.append([idx, set(pattern) - removes, removes, label, count])

    num_patterns = len(patterns)
    if num_patterns < 3:
        # Neighbor joining needs at least three taxa.
        with open(treeoutput, 'w') as fp:
            fp.write('();')
        return

    dist_matrix = np.zeros((num_patterns, num_patterns), dtype=float)
    patternstrs = [ptnstr for _, _, _, ptnstr, _ in patterns]
    for (idx1, ptn1, rm1, _, _), (idx2, ptn2, rm2, _, _) in \
            combinations(patterns, 2):
        # Symmetric difference after dropping the positions that the other
        # pattern treats as wildcards.
        distance = len((ptn1 - rm2) ^ (ptn2 - rm1))
        dist_matrix[idx1, idx2] = distance
        dist_matrix[idx2, idx1] = distance

    # newline='' lets the csv module control line endings; without it,
    # platforms that translate '\n' emit spurious blank rows.
    with open(matrixoutput, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['##', *patternstrs])
        writer.writerows(dist_matrix)

    if not build_tree or num_patterns > max_tree_patterns:
        # Tree building is opt-in and skipped above the pattern limit.
        return

    tree = nj(DistanceMatrix(dist_matrix, patternstrs))
    with open(treeoutput, 'w') as fp:
        fp.write(str(tree.root_at_midpoint()))
Esempio n. 12
0
 def test_weighted_unifrac(self):
     """Weighted UniFrac gives the same matrix by name and by callable."""
     # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
     # near-equality testing when that support is available
     shared = dict(otu_ids=self.oids1, tree=self.tree1)
     by_name = beta_diversity(
         'weighted_unifrac', self.table1, self.sids1, **shared)
     by_callable = beta_diversity(
         weighted_unifrac, self.table1, self.sids1, **shared)
     self.assertEqual(by_name.shape, (3, 3))
     self.assertEqual(by_name, by_callable)

     # expected values calculated by hand
     expected = DistanceMatrix(
         [[0.0, 0.1750000, 0.12499999],
          [0.1750000, 0.0, 0.3000000],
          [0.12499999, 0.3000000, 0.0]],
         ids=self.sids1)
     for row_id in self.sids1:
         for col_id in self.sids1:
             npt.assert_almost_equal(
                 by_name[row_id, col_id], expected[row_id, col_id], 6)
Esempio n. 13
0
    def test_from_iterable_skbio_hamming_metric_with_metadata(self):
        """Hamming distances are computed for sequences carrying metadata."""
        # test for #1254
        keys = ['a', 'b', 'c', 'd']
        sequences = [
            Sequence('ACGT'),
            Sequence('ACGA', metadata={'id': 'seq1'}),
            Sequence('AAAA', metadata={'id': 'seq2'}),
            Sequence('AAAA', positional_metadata={'qual': range(4)}),
        ]

        expected = DistanceMatrix(
            [[0, 0.25, 0.75, 0.75],
             [0.25, 0.0, 0.5, 0.5],
             [0.75, 0.5, 0.0, 0.0],
             [0.75, 0.5, 0.0, 0.0]],
            keys)

        observed = DistanceMatrix.from_iterable(
            sequences, metric=skbio.sequence.distance.hamming, keys=keys)

        self.assertEqual(observed, expected)
Esempio n. 14
0
def PCoA_group_from_matrix(distance_matrix, biom_file, groups, plot=False):
	"""Run PCoA on ``distance_matrix`` and colour the samples by ``groups``.

	Samples are labelled ``'0'``, ``'1'``, ... by position and ``groups[i]``
	becomes the ``body_site`` value of sample ``i``. ``biom_file`` is
	accepted but not used by this function.

	Returns the figure when ``plot`` is false; otherwise the plot is shown
	and ``None`` is returned.
	"""
	group_count = len(groups)
	labels = [str(i) for i in range(group_count)]
	sk_distance_matrix = DistanceMatrix(distance_matrix, labels)

	metadata = {str(i): {'body_site': groups[i]} for i in range(group_count)}
	pd_metadata = pd.DataFrame.from_dict(metadata, orient='index')

	result = pcoa(sk_distance_matrix)

	# One label per plotted axis: "PC k (xx.xx%)".
	axis_labels = tuple(
		'PC ' + str(axis + 1) + ' (' + str(round(result.proportion_explained.iloc[axis]*100, 2)) + '%)'
		for axis in range(3))

	fig = result.plot(
		df=pd_metadata,
		column='body_site',
		axis_labels=axis_labels,
		title='Samples colored by body site',
		cmap='Set1',
		s=50)
	fig.set_size_inches(18.5, 10.5)

	if not plot:
		return fig
	plt.show()
Esempio n. 15
0
def PCoA_total_from_matrix(distance_matrix, biom_file, metadata_file, plot=False):
	"""Run PCoA on ``distance_matrix`` and colour the samples by body site.

	Sample ids come from ``BW.extract_samples(biom_file)`` and the per-sample
	metadata from ``meta.extract_metadata(metadata_file)``; the plot is
	coloured by its ``body_site`` column.

	Returns the figure when ``plot`` is false; otherwise the plot is shown
	and ``None`` is returned.
	"""
	sample_ids = BW.extract_samples(biom_file)
	sk_distance_matrix = DistanceMatrix(distance_matrix, sample_ids)

	pd_metadata = pd.DataFrame.from_dict(
		meta.extract_metadata(metadata_file), orient='index')

	result = pcoa(sk_distance_matrix)

	# One label per plotted axis: "PC k (xx.xx%)".
	axis_labels = tuple(
		'PC ' + str(axis + 1) + ' (' + str(round(result.proportion_explained.iloc[axis]*100, 2)) + '%)'
		for axis in range(3))

	fig = result.plot(
		df=pd_metadata,
		column='body_site',
		axis_labels=axis_labels,
		title='Samples colored by body site',
		cmap='Set1',
		s=50)
	fig.set_size_inches(18.5, 10.5)

	if not plot:
		return fig
	plt.show()
Esempio n. 16
0
 def setUp(self):
     """Install a resource tree with one dataset holding a UniFrac matrix."""
     super().setUp()
     sample_ids = ['s1', 's2', 's3', 's4']
     unifrac_dm = DistanceMatrix(
         [[0, 1, 2, 3], [1, 0, 3, 4], [2, 3, 0, 5], [3, 4, 5, 0]],
         ids=sample_ids,
     )

     # Build the nested resource tree from the inside out.
     beta = BetaElement({'unifrac': unifrac_dm})
     dataset = DictElement({'__beta__': beta})
     datasets = DictElement({'dataset1': dataset})
     self.resources = DictElement({'datasets': datasets})
     self.resources.accept(TrivialVisitor())

     # Make the API module's resource lookup return the tree built above.
     self.res_patcher = patch(
         'microsetta_public_api.api.diversity.beta.get_resources')
     self.mock_resources = self.res_patcher.start()
     self.mock_resources.return_value = self.resources
Esempio n. 17
0
    def test_default_usage(self):
        """``randdm`` yields correctly shaped, labelled, varying matrices."""
        # A single-sample matrix is fully determined.
        trivial = DistanceMatrix(np.asarray([[0.0]]), ['1'])
        self.assertEqual(randdm(1), trivial)

        pair = randdm(2)
        self.assertEqual(pair.shape, (2, 2))
        self.assertEqual(pair.ids, ('1', '2'))

        # Within a handful of draws, at least one 5x5 matrix must differ
        # from the reference draw.
        reference = randdm(5)
        num_trials = 10
        found_diff = any(
            reference != randdm(5) for _ in range(num_trials))

        self.assertTrue(found_diff)
Esempio n. 18
0
def euclidean_distance(metadata: qiime2.Metadata,
                       x: str,
                       y: str,
                       z: str = None,
                       missing_data: str = 'error') -> DistanceMatrix:
    """Compute pairwise Euclidean distances between samples.

    ``x``, ``y`` and the optional ``z`` name the metadata columns used as
    coordinates. Column loading and validation, including the
    ``missing_data`` policy, is delegated to ``_load_and_validate``.
    """
    coordinate_columns = {'x': x, 'y': y}
    if z is not None:
        coordinate_columns['z'] = z

    sample_md = _load_and_validate(
        metadata,
        list(coordinate_columns.values()),
        list(coordinate_columns.keys()),
        missing_data)

    # Condensed vector of distances between all pairs of points.
    condensed = scipy.spatial.distance.pdist(
        sample_md.values, metric='euclidean')

    return DistanceMatrix(condensed, ids=sample_md.index)
Esempio n. 19
0
def main():
	"""Compute pairwise SNP distances and build a neighbor-joining tree.

	Reads the alignment named by the parsed arguments, writes one
	tab-separated line per genome pair (id, id, raw count, normalised count)
	to ``snp_dist.tsv`` and the neighbor-joining tree to ``snp_dist.tree``,
	both inside ``args['out_dir']``.
	"""
	args = parse_args()

	genomes = parse_msa(args['msa'], args['max_samples'])

	# Create the output directory. An already-existing directory is fine;
	# any other failure (e.g. permissions) is re-raised, not swallowed.
	try:
		os.makedirs(args['out_dir'])
	except OSError:
		if not os.path.isdir(args['out_dir']):
			raise

	print("Count SNPs")
	dist_path = os.path.join(args['out_dir'], 'snp_dist.tsv')
	print("   path: %s" % dist_path)
	matrix = []

	# Freeze the id order once so rows, columns and labels always agree.
	genome_ids = list(genomes)

	# The context manager guarantees the table is flushed and closed.
	with open(dist_path, 'w') as dist_file:
		for id1 in genome_ids:
			row = []
			is_present1 = genomes[id1] != '-'
			for id2 in genome_ids:
				is_present2 = genomes[id2] != '-'
				is_diff = genomes[id1] != genomes[id2]
				# Only positions that are non-gap in both genomes count.
				co_occur = is_present1 & is_present2
				raw_count = (is_diff & co_occur).sum()

				norm_count = 0
				co_sum = co_occur.sum()

				# Guard against dividing by zero when nothing co-occurs.
				if raw_count != 0 and co_sum != 0:
					norm_count = float(raw_count) / co_sum

				row.append(norm_count)
				dist_file.write('\t'.join([id1, id2, str(raw_count), str(norm_count)])+'\n')
			matrix.append(row)

	print("Build SNP tree")
	tree_path = os.path.join(args['out_dir'], 'snp_dist.tree')
	print("   path: %s" % tree_path)
	dm = DistanceMatrix(matrix, genome_ids)
	tree = nj(dm, result_constructor=str)
	with open(tree_path, 'w') as tree_file:
		tree_file.write(tree)

	print("\nDone!")
Esempio n. 20
0
def pw_distances(counts, ids=None, metric="braycurtis"):
    """Compute distances between all pairs of rows in a counts matrix

    Parameters
    ----------
    counts : 2D array_like of ints or floats
        Matrix containing count/abundance data where each row contains counts
        of observations in a given sample.
    ids : iterable of strs, optional
        Identifiers for each sample in ``counts``.
    metric : str, optional
        The name of the pairwise distance function to use when generating
        pairwise distances. See the scipy ``pdist`` docs, linked under *See
        Also*, for available metrics.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between all pairs of samples (i.e., rows). The number of
        row and columns will be equal to the number of rows in ``counts``.

    Raises
    ------
    ValueError
        If ``len(ids) != len(counts)``.

    See Also
    --------
    scipy.spatial.distance.pdist
    pw_distances_from_table

    """
    row_count = len(counts)
    ids_mismatch = ids is not None and row_count != len(ids)
    if ids_mismatch:
        raise ValueError(
            "Number of rows in counts must be equal to number of provided "
            "ids.")

    # pdist yields the condensed form; expand it to a square matrix.
    condensed = pdist(counts, metric)
    square = squareform(condensed, force='tomatrix', checks=False)
    return DistanceMatrix(square, ids)
Esempio n. 21
0
def pw_distances_from_table(table, metric="braycurtis"):
    """Compute distances between all pairs of samples in table

    Parameters
    ----------
    table : biom.table.Table
        ``Table`` containing count/abundance data of observations across
        samples.
    metric : str, optional
        The name of the pairwise distance function to use when generating
        pairwise distances. See the scipy ``pdist`` docs, linked under *See
        Also*, for available metrics.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between all pairs of samples. The number of row and columns
        will be equal to the number of samples in ``table``.

    See Also
    --------
    scipy.spatial.distance.pdist
    biom.table.Table
    pw_distances

    """
    warn("pw_distances_from_table is deprecated. In the future (tentatively "
         "scikit-bio 0.2.0), pw_distance will take a biom.table.Table object "
         "and this function will be removed. You will need to update your "
         "code to call pw_distances at that time.")
    sample_ids = table.sample_ids
    num_samples = len(sample_ids)

    # initialize the result object
    dm = np.zeros((num_samples, num_samples))
    for i, sid1 in enumerate(sample_ids):
        v1 = table.data(sid1)
        for j, sid2 in enumerate(sample_ids[:i]):
            v2 = table.data(sid2)
            # pdist on two vectors returns a length-1 condensed array.
            # Take its only element explicitly: assigning a size-1 array
            # into a scalar cell is deprecated since NumPy 1.25.
            dm[i, j] = dm[j, i] = pdist([v1, v2], metric)[0]
    return DistanceMatrix(dm, sample_ids)
Esempio n. 22
0
 def test_pw_distances_unweighted_unifrac(self):
     """Unweighted UniFrac gives the same matrix by name and by callable."""
     shared = dict(otu_ids=self.otu_ids1, tree=self.tree1)
     by_name = pw_distances(
         'unweighted_unifrac', self.t1, self.ids1, **shared)
     by_callable = pw_distances(
         unweighted_unifrac, self.t1, self.ids1, **shared)
     self.assertEqual(by_name.shape, (3, 3))
     self.assertEqual(by_name, by_callable)

     # expected values calculated by hand
     expected = DistanceMatrix(
         [[0.0, 0.0, 0.25],
          [0.0, 0.0, 0.25],
          [0.25, 0.25, 0.0]],
         ids=self.ids1)
     for row_id in self.ids1:
         for col_id in self.ids1:
             npt.assert_almost_equal(
                 by_name[row_id, col_id], expected[row_id, col_id], 6)
Esempio n. 23
0
def _compute_collapsed_dm(dm, i, j, disallow_negative_branch_length,
                          new_node_id):
    """Return the distance matrix obtained by joining ids i and j in a node.

    For an input of shape ``(n, n)`` the result has shape ``(n-1, n-1)``:
    ids `i` and `j` are replaced by the single id `new_node_id`, which takes
    the first row/column. The remaining ids keep their relative order.

    """
    out_n = dm.shape[0] - 1
    kept_ids = [e for e in dm.ids if e not in (i, j)]
    out_ids = [new_node_id] + kept_ids

    result = np.zeros((out_n, out_n))
    # Row/column 0 belongs to the new node, so kept ids start at index 1.
    for row, kept_a in enumerate(kept_ids, start=1):
        to_new_node = _otu_to_new_node(
            dm, i, j, kept_a, disallow_negative_branch_length)
        result[0, row] = to_new_node
        result[row, 0] = to_new_node
        # Copy the untouched distances to every earlier kept id.
        for col, kept_b in enumerate(kept_ids[:row - 1], start=1):
            carried = dm[kept_a, kept_b]
            result[row, col] = carried
            result[col, row] = carried
    return DistanceMatrix(result, out_ids)
Esempio n. 24
0
def partition_weighted_distance(nwkfile):
    """Average the tip-to-tip distances of partitioned trees, by length.

    Every ``(partlen, tree)`` pair yielded by ``iter_newick_partitoned``
    contributes its tip-to-tip distance matrix with weight
    ``partlen / sum(all partlen)``. The tip order is fixed by the first
    tree: sorted numerically when all names parse as integers, sorted as
    strings otherwise.
    """
    weighted_parts = []
    length_total = 0
    accumulated = None
    tipnames = None

    for partlen, tree in iter_newick_partitoned(nwkfile):
        length_total += partlen
        if tipnames is None:
            try:
                numeric = sorted(int(tip.name) for tip in tree.tips())
                tipnames = [str(number) for number in numeric]
            except ValueError:
                # Non-numeric names: fall back to plain string order.
                tipnames = sorted(tip.name for tip in tree.tips())
        dist = tree.tip_tip_distances(tipnames).data
        if accumulated is None:
            accumulated = np.zeros_like(dist)
        weighted_parts.append((partlen, dist))

    # The total length is only known once every partition has been read.
    for partlen, dist in weighted_parts:
        accumulated += dist * (partlen / length_total)

    return DistanceMatrix(accumulated, ids=tipnames)
Esempio n. 25
0
def PCoA_total_from_matrix_clustering(distance_matrix, biom_file, assignments, plot=False):
	"""Run PCoA on ``distance_matrix`` and colour samples by cluster.

	Sample ids come from ``BW.extract_samples(biom_file)``. Sample ``i`` is
	labelled ``'Group <assignments[i] + 1>'`` in the ``body_site`` column
	used for colouring.

	Returns the figure when ``plot`` is false; otherwise the plot is shown
	and ``None`` is returned.
	"""
	# Extract the sample ids once and reuse them for both the distance
	# matrix and the metadata, instead of reading the biom file twice.
	samples = BW.extract_samples(biom_file)
	sk_distance_matrix = DistanceMatrix(distance_matrix, samples)

	metadata = {samples[i]: {'body_site': 'Group ' + str(assignments[i]+1)} for i in range(len(assignments))}

	pd_metadata = pd.DataFrame.from_dict(metadata, orient='index')

	result = pcoa(sk_distance_matrix)

	# One label per plotted axis: "PC k (xx.xx%)".
	explained = result.proportion_explained
	axis_labels = tuple(
		'PC ' + str(axis + 1) + ' (' + str(round(explained.iloc[axis]*100, 2)) + '%)'
		for axis in range(3))

	fig = result.plot(df=pd_metadata, column='body_site',
							axis_labels=axis_labels,
							title='Samples colored by body site',
							cmap='Set1', s=50)

	fig.set_size_inches(18.5, 10.5)

	if plot:
		plt.show()
	else:
		return fig
Esempio n. 26
0
def rig_pairwise(G, X, n_jobs=1):
    """ Compute the RIG metric on all pairs of samples

    Parameters
    ----------
    G : nx.Graph
      A connected graph of weighted edges.
    X : pd.DataFrame
      Contingency table of samples where rows are samples
      and columns are features (i.e. metabolites).
    n_jobs : int, optional
      Number of parallel jobs, forwarded to ``pairwise_distances``.
      Defaults to 1.

    Returns
    -------
    skbio.DistanceMatrix
      Distance matrix of distances.

    """
    labs = X.columns.values
    rig_func = partial(rig, G=G, labs=labs)
    # pairwise_distances is given the raw ndarray rather than the DataFrame.
    # The caller's n_jobs is honoured; it used to be ignored in favour of a
    # hard-coded 1.
    dm = pairwise_distances(X.values, metric=rig_func, n_jobs=n_jobs)
    return DistanceMatrix(dm, ids=X.index.values)
Esempio n. 27
0
def get_guide_tree(seqs, random=False):
    """
    Get a guide tree representing distances between sequences
    :param seqs: Sequences to create a tree for
    :param random: Use ``calc_random_distances`` instead of
        ``calc_distances`` to obtain the distances
    :return: Midpoint-rooted guide tree with labelled internal nodes
    """
    # Pick the distance source, then collect the matching ids.
    compute_distances = calc_random_distances if random else calc_distances
    distances = compute_distances(seqs)
    ids = [seq.name for seq in seqs]

    # Neighbour-Joining tree over the distance matrix, rooted at midpoint.
    tree = nj(DistanceMatrix(distances, ids)).root_at_midpoint()

    # Internal nodes are labelled in place.
    label_internal_nodes(tree)
    return tree
Esempio n. 28
0
def nj_tree(feature_matrix):
    """Build a neighbor-joining tree from Hamming distances between rows.

    Parameters
    ----------
    feature_matrix
        Object whose ``.values`` attribute is a 2-D array with one row per
        item (e.g. a ``pandas.DataFrame``).

    Returns
    -------
    The tree returned by ``skbio.tree.nj``. Elapsed times of the three
    stages are printed to stdout.
    """
    from skbio import DistanceMatrix
    from skbio.tree import nj
    # Import the function explicitly: a bare ``import sklearn`` does not
    # guarantee that the ``sklearn.metrics`` submodule has been loaded.
    from sklearn.metrics import pairwise_distances
    import time
    t = time.time()

    data = pairwise_distances(feature_matrix.values, metric='hamming')
    print(time.time() - t)
    t = time.time()

    dm = DistanceMatrix(data)

    print('distance matrix', time.time() - t)
    t = time.time()

    tree = nj(dm)

    print('tree build', time.time() - t)

    return tree
Esempio n. 29
0
def variation_matrix(X):
    r""" Calculate Aitchison variation matrix.

    This calculates the Aitchison variation matrix.  Given a compositional
    matrix :math:`X`, and columns :math:`i` and :math:`j`, the :math:`ij` entry
    in the variation matrix of :math:`X` is given by

    .. math::
        V_{ij} = \frac{1}{2} var(\ln \frac{x_i}{x_j})

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where there are n rows corresponding to samples
        and p features corresponding to columns.

    Returns
    -------
    skbio.DistanceMatrix
        Total variation matrix of size p x p, labelled by the columns of
        ``X``.

    References
    ----------
    .. [1] V. Pawlowsky-Glahn, J. J. Egozcue, R. Tolosana-Delgado (2015),
       Modeling and Analysis of Compositional Data, Wiley, Chichester, UK

    .. [2] J. J. Egozcue, V. Pawlowsky-Glahn (2004), Groups of Parts and
       Their Balances in Compositional Data Analysis, Mathematical Geology
    """
    num_features = X.shape[1]
    v = np.zeros((num_features, num_features))
    # Take the logarithm once instead of once per column pair.
    log_x = np.log(closure(X))
    for i in range(num_features):
        for j in range(i):
            v[i, j] = np.var(log_x[:, i] - log_x[:, j])
    # Only the lower triangle is filled above. Adding the transpose makes
    # the matrix symmetric, since V(ln (x/y) ) = V(ln (y/x) ).
    # Also dividing by 2, to ensure unit norm for balances.
    # See Eqn 4 in [2]
    return DistanceMatrix((v + v.T) / 2, ids=X.columns)
Esempio n. 30
0
    def test_block_compute(self):
        """``_block_compute`` fills only the requested row/column block."""
        def mock_metric(u, v):
            # Trivial stand-in metric: the sum of both vectors' entries.
            return (u + v).sum()

        counts = np.array([
            [0, 1, 2, 3, 4, 5],
            [1, 2, 3, 4, 5, 0],
            [2, 3, 4, 5, 0, 1],
            [10, 2, 3, 6, 8, 2],
            [9, 9, 2, 2, 3, 4],
        ])

        observed = _block_compute(
            metric=mock_metric,
            counts=counts,
            row_ids=np.array((2, 3)),
            col_ids=np.array((4, )),
            id_pairs=[(2, 4), (3, 4)],
            ids=[1, 2, 3, 4, 5])

        expected = DistanceMatrix(
            np.array([[0, 0, 44], [0, 0, 60], [44, 60, 0]]), (2, 3, 4))

        npt.assert_equal(observed.data, expected.data)
        self.assertEqual(observed.ids, expected.ids)