def test_permutted(self):
        """Check that FSVD-based ``pcoa`` accepts a permuted distance matrix.

        The test passes as long as neither call raises; no results are
        compared.
        """
        fsvd_options = dict(method="fsvd", number_of_dimensions=3,
                            inplace=False)
        original = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))

        # Baseline: the matrix exactly as loaded must be accepted.
        pcoa(original, **fsvd_options)

        # Operations such as permute() change the memory structure of the
        # matrix; pcoa must cope with that as well.
        shuffled = original.permute()
        pcoa(shuffled, **fsvd_options)
Exemple #2
0
    def test_fsvd_inplace(self):
        """FSVD and eigh must agree when both are run with ``inplace=True``.

        Each method gets its own freshly loaded copy of the matrix.
        Axis direction and method names are ignored in the comparison.
        """
        common_options = {'number_of_dimensions': 3, 'inplace': True}
        eigh_input = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        fsvd_input = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))

        reference = pcoa(eigh_input, method="eigh", **common_options)
        observed = pcoa(fsvd_input, method="fsvd", **common_options)

        assert_ordination_results_equal(observed, reference,
                                        ignore_directionality=True,
                                        ignore_method_names=True)
def get_pcoa(args, dm1):
    """Run PCoA on *dm1* and collect the first three axes for JSON output.

    Parameters
    ----------
    args : object
        Parsed options. Only ``args.function`` is read here; the object is
        also forwarded to ``create_emperor_pc_file``.
    dm1 : object
        Distance matrix handed to ``skbio.stats.ordination.pcoa``.

    Returns
    -------
    dict
        ``{"P1": [...], "P2": [...], "P3": [...], "names": [...]}`` where
        ``P1``-``P3`` hold the sample coordinates on the first three axes
        and ``names`` holds the sample labels.

    Notes
    -----
    Terminates the process through ``sys.exit`` when the ordination has
    fewer than three axes.
    """
    from skbio.stats.ordination import pcoa
    PCoA_result = pcoa(dm1)

    # ``samples`` is a DataFrame attribute of the scikit-bio result (rows are
    # samples, columns are axes); it is not callable and the result object
    # cannot be converted with ``np.array`` the way the old cogent table was.
    coords = PCoA_result.samples
    print(coords)

    json_array = {}
    # The cogent table carried two trailing non-coordinate rows that had to
    # be sliced off; the scikit-bio DataFrame holds coordinates only.
    json_array["P1"] = coords.iloc[:, 0].tolist()
    json_array["P2"] = coords.iloc[:, 1].tolist()
    try:
        json_array["P3"] = coords.iloc[:, 2].tolist()
    except IndexError:
        sys.exit('IndexError - try selecting more data or deeper taxonomy')

    json_array["names"] = coords.index.tolist()

    if args.function == 'pcoa_3d':
        # NOTE(review): arguments are passed exactly as before; confirm they
        # match the signature of create_emperor_pc_file in this project.
        create_emperor_pc_file(args, json_array, PCoA_result)
    return json_array
    def fit(self, X, y=None):
        """
        Fit a PCoA ordination to ``X``.

        Parameters
        ----------
        X : array-like
            Feature table, or a distance matrix when ``self.metric`` is
            ``'precomputed'``
        y : None
            ignored

        Returns
        -------
        self
            fitted pcoa, with ``ordination_`` and ``embedding_`` set

        """
        if self.metric == 'precomputed':
            # X is used as the distance matrix as-is
            distances = X
        else:
            # pairwise distances between the rows of X
            distances = cdist(X, X, metric=self.metric)

        self.ordination_ = pcoa(distances)
        self.embedding_ = self.ordination_.samples

        return self
    def setUp(self):
        """Build the ordination and descriptor fixtures used by the tests."""
        # Crawford dataset for unweighted UniFrac
        crawford_dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        self.ordination = pcoa(crawford_dm)

        # Descriptors are read with 'Taxon' as the index, then transposed.
        descriptor_table = pd.read_table(
            get_data_path('PCoA_biplot_descriptors'), index_col='Taxon')
        self.descriptors = descriptor_table.T
Exemple #6
0
def test_compare_to_rcode():
    """Compare the Python lostruct pipeline against stored R results.

    Three checks are made: total variance plus eigenvalues of the first
    window, the squared eigenvectors of the first window, and the first
    PCoA axis of the PC distance matrix against the stored MDS coordinates.
    """
    windows, _ = ls.parse_vcf(vcf_file, "chr1", 95)
    covmat, total_variance, eigenvals, eigenvecs = ls.cov_pca(
        windows[0].todense(), 10, 1)

    results = np.loadtxt("lostruct-results/chr1.filtered.pca.csv",
                         delimiter=",",
                         skiprows=1)

    totalandvalsR = results[0][0:11]
    # A stray trailing comma used to wrap this array in a 1-tuple; the
    # comparison only worked because np.corrcoef re-coerces its input.
    totalandvalsPy = np.concatenate(([total_variance], eigenvals))
    # Comes out as 0.9999921929150888
    assert np.corrcoef(totalandvalsR, totalandvalsPy)[0][1] >= 0.99999

    # Squared here, because signs are often opposite between the two analyses.
    eigenvecsR = np.square(results[0][11:61])
    eigenvecsPy = np.square(eigenvecs[0])
    # Comes out as 0.9999921929150888
    assert np.corrcoef(eigenvecsR, eigenvecsPy)[0][1] >= 0.99999
    assert covmat.shape == (50, 50)

    mds_coords = np.loadtxt("lostruct-results/mds_coords.csv",
                            delimiter=",", skiprows=1, usecols=[2])

    # One eigen_windows result per window, stacked into a single array.
    result = np.vstack([ls.eigen_windows(x, 10, 1) for x in windows])
    pc_dists = ls.get_pc_dists(result)
    mds = pcoa(pc_dists)
    # Comes out as 0.9971509982243156
    assert np.corrcoef(mds.samples['PC1'], mds_coords)[0][1] >= 0.995
    def setUp(self):
        """Prepare ``self.ordination`` and ``self.descriptors`` fixtures."""
        # Crawford dataset for unweighted UniFrac
        distance_fp = get_data_path('PCoA_sample_data_3')
        distances = DistanceMatrix.read(distance_fp)
        self.ordination = pcoa(distances)

        # Loaded with 'Taxon' as the index and stored transposed.
        descriptor_fp = get_data_path('PCoA_biplot_descriptors')
        taxa_table = pd.read_table(descriptor_fp, index_col='Taxon')
        self.descriptors = taxa_table.T
    def generate_aitchison_pcoa(self, df):
        """
        Run PCoA on the Aitchison distance matrix computed from a count table.

        Parameters
        ------------
        df: pandas dataframe object
            passed unchanged to ``self.aitchison_distance_matrix``; expected
            layout is
                rows = samples
                columns = OTU
                and each datapoint is either read count or relative abundance.

        Returns
        ------------
        samples = pandas dataframe object
            the ``samples`` attribute of the PCoA result, i.e. the values to
            be visualised

        dist_matrix = pandas dataframe object
            the distance scores the PCoA was computed from

        """
        distances = self.aitchison_distance_matrix(df)
        # sample labels are taken from the index of the distance frame
        ordination_result = ordination.pcoa(
            DistanceMatrix(distances, distances.index))

        return ordination_result.samples, distances
    def test_simple(self):
        """Default ``pcoa`` on sample data 3 must match the stored results."""
        axis_labels = ['PC%d' % (k + 1) for k in range(9)]
        sample_ids = [
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
            'PC.355', 'PC.607', 'PC.634',
        ]

        expected_eigvals = pd.Series(
            [0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
             0.16054235, 0.15017696, 0.12245775, 0.0],
            index=axis_labels)
        expected_proportions = pd.Series(
            [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
             0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0],
            index=axis_labels)
        # Expected sample coordinates live in a data file.
        expected_coords = pd.DataFrame(
            np.loadtxt(get_data_path('exp_PCoAEigenResults_site')),
            index=sample_ids,
            columns=axis_labels)

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=expected_eigvals,
            samples=expected_coords,
            proportion_explained=expected_proportions)

        observed = pcoa(
            DistanceMatrix.read(get_data_path('PCoA_sample_data_3')))

        assert_ordination_results_equal(observed,
                                        expected_results,
                                        ignore_directionality=True)
    def test_simple(self):
        """Compare ``pcoa`` output for sample data 3 with known values."""
        sample_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                      'PC.593', 'PC.355', 'PC.607', 'PC.634']
        axis_labels = ['PC%d' % number for number in range(1, 10)]

        known_eigvals = [0.51236726, 0.30071909, 0.26791207, 0.20898868,
                         0.19169895, 0.16054235, 0.15017696, 0.12245775,
                         0.0]
        known_proportions = [0.2675738328, 0.157044696, 0.1399118638,
                             0.1091402725, 0.1001110485,
                             0.0838401162, 0.0784269939,
                             0.0639511764, 0.0]
        # site coordinates are stored on disk rather than inline
        known_sites = np.loadtxt(get_data_path('exp_PCoAEigenResults_site'))

        expected_fields = dict(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(known_eigvals, index=axis_labels),
            samples=pd.DataFrame(known_sites, index=sample_ids,
                                 columns=axis_labels),
            proportion_explained=pd.Series(known_proportions,
                                           index=axis_labels))
        expected_results = OrdinationResults(**expected_fields)

        matrix = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        computed = pcoa(matrix)

        assert_ordination_results_equal(computed, expected_results,
                                        ignore_directionality=True)
Exemple #11
0
def dm_to_pcoa(dm, sample_md, category):
    """Ordinate *dm* with PCoA and plot the samples colored by *category*.

    Nothing is returned; the object produced by ``plot`` is discarded.
    """
    plot_options = dict(
        df=sample_md,
        column=category,
        axis_labels=['PC 1', 'PC 2', 'PC 3'],
        title="Samples colored by %s." % category,
        s=35,
    )
    pcoa(dm).plot(**plot_options)
Exemple #12
0
def dm_to_pcoa(dm, sample_md, category):
    """Plot a PCoA of *dm*, coloring samples by a metadata column.

    ``sample_md`` is passed to ``plot`` as ``df`` and ``category`` as the
    column to color by. The function returns ``None``.
    """
    caption = "Samples colored by %s." % category
    ordination = pcoa(dm)
    ordination.plot(df=sample_md, column=category,
                    axis_labels=['PC 1', 'PC 2', 'PC 3'],
                    title=caption, s=35)
Exemple #13
0
 def on_update(category, metadata, metric):
     """Redraw the PCoA plot for the selected metric and category.

     The distance matrix is looked up in ``dms`` by ``metric``, filtered
     against ``metadata`` with ``filter_dm_and_map``, ordinated, and plotted
     colored by ``category``; the resulting figure is resized to 12 x 9
     inches.
     """
     filtered_dm, _ = filter_dm_and_map(dms[metric], metadata)
     figure = pcoa(filtered_dm).plot(df=metadata,
                                     column=category,
                                     axis_labels=['PC 1', 'PC 2', 'PC 3'],
                                     s=35)
     figure.set_size_inches(12, 9)
Exemple #14
0
 def on_update(category, metadata, metric):
     """Rebuild the PCoA figure after a change of category or metric.

     Uses the distance matrix stored under ``dms[metric]``; only the first
     value returned by ``filter_dm_and_map`` is used. Returns ``None``.
     """
     selected_dm = dms[metric]
     filtered_dm = filter_dm_and_map(selected_dm, metadata)[0]
     ordination = pcoa(filtered_dm)
     plot_kwargs = {'df': metadata,
                    'column': category,
                    'axis_labels': ['PC 1', 'PC 2', 'PC 3'],
                    's': 35}
     # the plot result is resized in place to 12 x 9 inches
     ordination.plot(**plot_kwargs).set_size_inches(12, 9)
def diversity_analysis(wu_dm_list,bc_dm_list):
    """Correlate the first UniFrac and Bray-Curtis matrices, then run PCoA.

    Prints the Mantel correlation coefficient and p-value for the first
    element of each list, then ordinates the first weighted UniFrac matrix.
    Nothing is returned.
    """
    from skbio.stats.distance import mantel
    # do the UniFrac and Bray-Curtis distances correlate?
    correlation, significance, _sample_count = mantel(wu_dm_list[0],
                                                      bc_dm_list[0])
    print("Mantel Correlation COEF=",correlation)
    print("At significance of 0.05, the p-value for the correlation is = ",significance)
    # next: principal coordinate analysis (PCoA) on the weighted UniFrac
    # distance matrix (the result is not used further in this function)
    from skbio.stats.ordination import pcoa
    weighted_unifrac_pcoa = pcoa(wu_dm_list[0])
 def __eapply__(self, experiment):
     """Return a copy of *experiment* holding its PCoA sample coordinates.

     The experiment's data frame is ordinated with ``pcoa``; the sample
     coordinates are relabelled with the data frame's index, transposed and
     stored through ``with_data_df``. The full ordination result is kept
     under ``metadata['pcoa']`` of the returned experiment.
     """
     source_df = experiment.data_df
     ordination = pcoa(source_df)
     coordinates = ordination.samples
     coordinates.index = source_df.index  # sample names
     transformed = experiment.with_data_df(coordinates.transpose())
     transformed.metadata['pcoa'] = ordination
     return transformed
def create_emperor_pc_file(args, dist, ds_list):
    """Run PCoA on *dist* and write the result to ``<prefix>_pc.txt``.

    The sample rows of the result are relabelled with ``ds_list`` before the
    file is written to ``<args.basedir>/tmp/``. Returns the output path.
    """
    from skbio.stats.ordination import pcoa

    ordination = pcoa(dist)
    ordination.samples.index = ds_list

    output_path = os.path.join(args.basedir, 'tmp', args.prefix + '_pc.txt')
    ordination.write(output_path)
    return output_path
Exemple #18
0
def do_pcoa(infile):
    """Run PCoA on the distance matrix in *infile* and print the result.

    ``parse_distmat`` supplies the sample ids and the raw distances; the
    ordination result is written to standard output. Returns ``None``.
    """
    samples, distmtx = parse_distmat(infile)
    distmtx = DistanceMatrix(distmtx, ids=samples)
    ord_res = pcoa(distmtx)

    # Write results to output. The coordinates, eigenvalues and proportions
    # explained were previously copied into locals that were never used.
    ord_res.write(sys.stdout)
def create_emperor_pc_file(args, dist, ds_list):
    """Run PCoA on *dist* and write the result to ``<prefix>_pc.txt``.

    Parameters
    ----------
    args : object
        Must provide ``basedir`` and ``prefix``; the file is written to
        ``<basedir>/<prefix>_pc.txt``.
    dist : object
        Distance matrix handed to ``skbio.stats.ordination.pcoa``.
    ds_list : sequence
        Labels assigned to the sample rows of the result.

    Returns
    -------
    str
        Path of the written file.
    """
    from skbio.stats.ordination import pcoa
    PCoA_result = pcoa(dist)
    PCoA_result.samples.index = ds_list
    pcfile = os.path.join(args.basedir, args.prefix + '_pc.txt')
    PCoA_result.write(pcfile)
    try:
        os.chmod(pcfile, 0o664)
    except OSError:
        # Making the file group-writable is best effort; only filesystem
        # errors are ignored so that e.g. KeyboardInterrupt still propagates.
        pass
    return pcfile
Exemple #20
0
    def test_centroids_eq_groups(self):
        """Centroid statistic for equal-sized groups matches ``f_oneway``.

        Also checks that a relabelled grouping gives the same statistic.
        """
        exp = [[1.2886811963240687, 1.890538910062923, 1.490527658097728],
               [2.17349240061718, 2.3192679626679946, 2.028338553903792]]
        exp_stat = f_oneway(*exp)[0]

        coords = pcoa(self.eq_mat).samples

        obs = _compute_groups(coords, 'centroid', self.grouping_eq)
        self.assertAlmostEqual(obs, exp_stat, places=6)

        obs_relab = _compute_groups(coords, 'centroid',
                                    self.grouping_eq_relab)
        self.assertAlmostEqual(obs_relab, obs, places=6)
Exemple #21
0
    def test_centroids_mixedgroups(self):
        """Centroid statistic for the mixed grouping matches ``f_oneway``."""
        exp = [
            [2.5847022428144935, 2.285624595858895, 1.7022431146340287],
            [1.724817266046108, 1.724817266046108],
            [2.4333280644972795, 2.389000390879655,
             2.8547180589306036, 3.218568759338847],
        ]
        coords = pcoa(self.uneq_mat).samples

        exp_stat = f_oneway(*exp)[0]

        obs_mixed = _compute_groups(coords, 'centroid',
                                    self.grouping_un_mixed)
        self.assertAlmostEqual(exp_stat, obs_mixed, places=6)
Exemple #22
0
    def test_centroids_mixedgroups(self):
        """Compare ``_compute_groups`` (centroid) with a one-way F statistic.

        Uses the unequal-size matrix with the mixed grouping.
        """
        group_one = [2.5847022428144935, 2.285624595858895,
                     1.7022431146340287]
        group_two = [1.724817266046108, 1.724817266046108]
        group_three = [2.4333280644972795, 2.389000390879655,
                       2.8547180589306036, 3.218568759338847]

        ordination = pcoa(self.uneq_mat)
        sample_coords = ordination.samples

        exp_stat, _ = f_oneway(group_one, group_two, group_three)

        obs_mixed = _compute_groups(sample_coords, 'centroid',
                                    self.grouping_un_mixed)
        self.assertAlmostEqual(exp_stat, obs_mixed, places=6)
Exemple #23
0
    def test_centroids_eq_groups(self):
        """Compare ``_compute_groups`` (centroid) with a one-way F statistic.

        Uses the equal-size matrix; the relabelled grouping must produce the
        same value as the original grouping.
        """
        first_group = [1.2886811963240687, 1.890538910062923,
                       1.490527658097728]
        second_group = [2.17349240061718, 2.3192679626679946,
                        2.028338553903792]
        exp_stat, _ = f_oneway(first_group, second_group)

        ordination = pcoa(self.eq_mat)
        sample_coords = ordination.samples

        obs = _compute_groups(sample_coords, 'centroid', self.grouping_eq)
        self.assertAlmostEqual(obs, exp_stat, places=6)

        obs_relab = _compute_groups(sample_coords, 'centroid',
                                    self.grouping_eq_relab)
        self.assertAlmostEqual(obs_relab, obs, places=6)
Exemple #24
0
def plot_pcoas(metric):
    """Plot a PCoA of the *metric* distance matrix for every metadata column.

    The distance matrix is the first file matching
    ``diversity_core_metrics/<ref_db>/rpt_<metric>_dist/*/data/distance-matrix.tsv``.
    One plot is produced per column of the module-level frame ``m``, with the
    percentage of variance explained by the first three axes in the axis
    labels. Also sets the matplotlib figure dpi and size globally.

    Raises ``IndexError`` when no distance-matrix file matches.
    """
    mpl.rcParams['figure.dpi'] = 100
    mpl.rcParams['figure.figsize'] = 9, 6
    df = pd.read_csv(glob.glob('diversity_core_metrics/' + ref_db + '/rpt_' + metric + '_dist/*/data/distance-matrix.tsv')[0],sep='\t',index_col=0)
    sample_ids = df.index.values
    dist = df.to_numpy()
    dm = DistanceMatrix(dist, sample_ids)
    pc = pcoa(dm)
    # proportion_explained is labelled 'PC1', 'PC2', ...; use .iloc for
    # positional access instead of relying on the deprecated integer fallback.
    var1 = str(round(pc.proportion_explained.iloc[0]*100, 2))
    var2 = str(round(pc.proportion_explained.iloc[1]*100, 2))
    var3 = str(round(pc.proportion_explained.iloc[2]*100, 2))
    axis_labels = ('PC1, '+var1+'%', 'PC2, '+var2+'%', 'PC3, '+var3+'%')
    for i in m.columns:
        pc.plot(m, i, cmap='Accent', axis_labels=axis_labels, title= metric + " PCoA colored by " + i)
Exemple #25
0
def beta_diversity_pcoa(biom_fp, method="braycurtis", permutations=99, dim=2,
                        col='method', colormap={'expected': 'red',
                                                'rdp': 'seagreen',
                                                'sortmerna': 'gray',
                                                'uclust': 'blue',
                                                'blast': 'purple'}):

    '''From biom table, compute a distance matrix; generate PCoA plot;
    and run an anosim test between groups.

    biom_fp: path
        Path to biom.Table containing sample metadata.
    method: str
        skbio.Diversity method to use for ordination.
    permutations: int
        Number of permutations to perform for anosim tests.
    dim: int
        Number of dimensions to plot. Currently supports only 2-3 dimensions.
    col: str
        metadata name to use for distinguishing groups for anosim tests and
        pcoa plots.
    colormap: dict
        map groups names (must be group names in col) to colors used for plots.
        The default dict is only read, never modified.

    Returns the tuple (sample metadata, anosim results, pcoa results,
    distance matrix).
    '''

    dm, s_md = make_distance_matrix(biom_fp, method=method)

    # pcoa
    pc = pcoa(dm)

    # anosim tests
    results = anosim(dm, s_md, column=col, permutations=permutations)
    print('R = ', results['test statistic'], '; P = ', results['p-value'])

    if dim == 2:
        # bokeh pcoa plots
        # .ix no longer exists in pandas; these are label lookups, so .loc
        pc123 = pc.samples.loc[:, ["PC1", "PC2", "PC3"]]
        smd_merge = s_md.merge(pc123, left_index=True, right_index=True)
        # color by the requested grouping column rather than a fixed 'method'
        smd_merge['Color'] = [colormap[x] for x in smd_merge[col]]
        # first row, by position
        title = smd_merge['reference'].iloc[0]
        labels = ['PC {0} ({1:.2f})'.format(d + 1,
                                            pc.proportion_explained.iloc[d])
                  for d in range(0, 2)]
        circle_plot_from_dataframe(smd_merge, "PC1", "PC2", title,
                                   columns=["method", "sample_id", "params"],
                                   color="Color", labels=labels)

    else:
        # skbio pcoa plots
        pcoa_plot_skbio(pc, s_md, col=col)

    return s_md, results, pc, dm
def pcoa(args, dist, ds_list):
    """Run scikit-bio PCoA on *dist*, write the result to disk and exit.

    The sample rows of the result are relabelled with ``ds_list``, the result
    is printed and written to ``<args.outdir>/<args.prefix>.pc``, and the
    process is then terminated with ``sys.exit()``.

    NOTE(review): because of the unconditional ``sys.exit()`` nothing below
    it ever runs, so this function never returns ``json_array``. Confirm
    whether the exit is a debugging leftover.
    """
    #from cogent3.cluster.metric_scaling import PCoA
    # This local import rebinds the name ``pcoa`` inside the function, so the
    # call below reaches scikit-bio's function, not this wrapper.
    from skbio.stats.ordination import pcoa, OrdinationResults
    PCoA_result = pcoa(dist)
    #,index=['a','b','c','d','e','f','g','h','i','j','k','l','m','n']
    # <class 'skbio.stats.ordination._base.OrdinationResults'>
    sc_2 = PCoA_result
    #print(dist)
    #print(type(sc_2))
    # df2 is the result's own samples frame, so this relabels the result
    df2 = PCoA_result.samples
    df2.index = ds_list
    print(PCoA_result)

    #dt = np.dtype(float)
    # print('sc_2.proportion_explained')
    #     print(sc_2.proportion_explained)
    #     print('eigvals')
    #     print(sc_2.eigvals)
    #     print('species')
    #     print(sc_2.species)
    #     print('site')
    #     print(sc_2.site)
    ordfile = os.path.join(args.outdir, args.prefix + '.pc')
    sc_2.write(ordfile)

    # Terminates the process here; everything below is unreachable.
    sys.exit()

    #  print('end pcoa result')
    # NOTE(review): the remaining code indexes the result like a 2-D table
    # (apparently the older cogent PCoA output); it presumably would not work
    # on an OrdinationResults object even if it were reached - verify.
    a = np.array(
        PCoA_result)  #[0:,0:5]   # capture only the first three vectors
    #print a
    json_array = {}
    json_array["P1"] = a[:, 2].tolist(
    )[:-2]  # [:-2] is to remove the last two which are not eigen vectors

    json_array["P2"] = a[:, 3].tolist()[:-2]
    try:
        json_array["P3"] = a[:, 4].tolist()[:-2]
    except IndexError:
        sys.exit('IndexError - try selecting more data or deeper taxonomy')

    json_array["names"] = a[:, 1].tolist()[:-2]

    #json['v2'] = [x[0] for x in np.array(PCoA_result[:,3])[:-2]]
    #json['v3'] = [x[0] for x in np.array(PCoA_result[:,4])[:-2]]
    #json['v3'] = [x[0] for x in np.array(PCoA_result[:,4])[:-2]]
    # sprint json_array
    if args.function == 'pcoa_3d':
        create_emperor_pc_file(args, json_array, PCoA_result)
    return json_array
Exemple #27
0
    def fit_transform(self, data):
        """Project *data* with PCoA and return the selected components.

        Unless the metric is named ``"precomputed"``, the checked data is
        first turned into a square pairwise-distance matrix. In both cases
        the matrix is passed through ``self.metric.fit_transform`` before the
        ordination. The full result is kept on ``self.pcoa``.
        """
        # PCoA uses the implementation shipped with scikit-bio.
        from skbio.stats.ordination import pcoa

        data = self._check_data(data)
        metric_name = self.metric.name
        if metric_name != "precomputed":
            data = squareform(pdist(data, metric=metric_name))
        data = self.metric.fit_transform(data)

        self.pcoa = pcoa(data)
        return self.pcoa.samples.values[:, self.components]
def pcoa(args, dist, ds_list):
    """Run scikit-bio PCoA on *dist*, write the result to disk and exit.

    The sample rows of the result are relabelled with ``ds_list``, the result
    is printed and written to ``<args.outdir>/<args.prefix>.pc``, and the
    process is then terminated with ``sys.exit()``.

    NOTE(review): because of the unconditional ``sys.exit()`` nothing below
    it ever runs, so this function never returns ``json_array``. Confirm
    whether the exit is a debugging leftover.
    """
    #from cogent3.cluster.metric_scaling import PCoA
    # This local import rebinds the name ``pcoa`` inside the function, so the
    # call below reaches scikit-bio's function, not this wrapper.
    from skbio.stats.ordination import pcoa, OrdinationResults
    PCoA_result = pcoa(dist)
    #,index=['a','b','c','d','e','f','g','h','i','j','k','l','m','n']
    # <class 'skbio.stats.ordination._base.OrdinationResults'>
    sc_2 = PCoA_result
    #print(dist)
    #print(type(sc_2))
    # df2 is the result's own samples frame, so this relabels the result
    df2 = PCoA_result.samples
    df2.index = ds_list
    print(PCoA_result)

    #dt = np.dtype(float)
    # print('sc_2.proportion_explained')
#     print(sc_2.proportion_explained)
#     print('eigvals')
#     print(sc_2.eigvals)
#     print('species')
#     print(sc_2.species)
#     print('site')
#     print(sc_2.site)
    ordfile = os.path.join(args.outdir,args.prefix+'.pc')
    sc_2.write(ordfile)
    
    # Terminates the process here; everything below is unreachable.
    sys.exit()
  
  #  print('end pcoa result')
    # NOTE(review): the remaining code indexes the result like a 2-D table
    # (apparently the older cogent PCoA output); it presumably would not work
    # on an OrdinationResults object even if it were reached - verify.
    a = np.array(PCoA_result)  #[0:,0:5]   # capture only the first three vectors
    #print a
    json_array = {}
    json_array["P1"] = a[:,2].tolist()[:-2]  # [:-2] is to remove the last two which are not eigen vectors

    json_array["P2"] = a[:,3].tolist()[:-2]
    try:
        json_array["P3"] = a[:,4].tolist()[:-2]
    except IndexError:
        sys.exit('IndexError - try selecting more data or deeper taxonomy')

    json_array["names"] = a[:,1].tolist()[:-2]

    #json['v2'] = [x[0] for x in np.array(PCoA_result[:,3])[:-2]]
    #json['v3'] = [x[0] for x in np.array(PCoA_result[:,4])[:-2]]
    #json['v3'] = [x[0] for x in np.array(PCoA_result[:,4])[:-2]]
    # sprint json_array
    if args.function == 'pcoa_3d':
        create_emperor_pc_file(args, json_array, PCoA_result)
    return json_array
Exemple #29
0
def plot_pcoa(bs_iter = 10000):
    """Ordinate the population table with PCoA, test it and save a figure.

    The table ``<mt.get_path()>/data/mult_by_pop.txt`` is row-normalised,
    converted to Bray-Curtis distances and ordinated on three axes. An F
    statistic is computed with ``mt.get_F_2`` and compared against
    ``bs_iter`` statistics obtained after shuffling the rows of the
    ordination; F and the resulting p-value are printed. The first two axes
    are plotted (rows 0-3 as 'Wildtype', remaining rows as 'Minimal cell')
    and saved to ``<mt.get_path()>/figures/pcoa.png``.

    Parameters
    ----------
    bs_iter : int
        Number of row shuffles used for the null distribution.
    """
    # get F stat and p value
    df = pd.read_csv(mt.get_path() + '/data/mult_by_pop.txt', sep = '\t', index_col=0)
    #mt.get_F_2(df,4,4)
    # Divide every row by its sum; DataFrame.div replaces the old
    # ``Series[:, None]`` trick, which current pandas no longer supports.
    df = df.div(df.sum(axis=1), axis=0)
    df_bc = pairwise_distances(df, metric='braycurtis')

    df_pcoa = pcoa(df_bc , number_of_dimensions=3)
    ord_matrix = df_pcoa.samples

    F = mt.get_F_2(ord_matrix, 4,4)
    # Null distribution: the same statistic on row-shuffled coordinates.
    F_nulls = [mt.get_F_2(ord_matrix.sample(frac=1), 4,4)[0]
               for _ in range(bs_iter)]
    p_value = sum(1 for F_null in F_nulls if F_null > F[0]) / bs_iter
    print("F = " + str(round(F[0], 4)))
    print("p = " + str(round(p_value, 4)))

    # .ix was removed from pandas; all lookups below are positional (.iloc).
    first_x, first_y = ord_matrix.iloc[0:4, 0], ord_matrix.iloc[0:4, 1]
    second_x, second_y = ord_matrix.iloc[4:, 0], ord_matrix.iloc[4:, 1]

    #fig = plt.figure()
    fig, ax = plt.subplots(figsize=(6, 6))
    # Scatterplot on main ax
    ax.axhline(y=0, color='k', linestyle=':', alpha = 0.8, zorder=1)
    ax.axvline(x=0, color='k', linestyle=':', alpha = 0.8, zorder=2)
    ax.scatter(0, 0, marker = "o", edgecolors='none', c = 'darkgray', s = 120, zorder=3)
    ax.scatter(first_x, first_y, marker = "o",
        edgecolors='#244162', c = 'blue', alpha = 0.8, s = 120, zorder=4, label='Wildtype')

    ax.scatter(second_x, second_y, marker = "o",
        edgecolors='#244162', c = 'r', alpha = 0.8, s = 120, zorder=4, label='Minimal cell')

    confidence_ellipse(first_x, first_y, ax,
        n_std=2, edgecolor='blue', linestyle='--', lw=3)
    confidence_ellipse(second_x, second_y, ax,
        n_std=2, edgecolor='red', linestyle='--', lw=3)
    #ax1.xlim([-0.7,0.7])
    #ax1.set_ylim([-0.7,0.7])

    # Scale to percent before rounding so the label does not show float
    # artifacts such as 56.699999999999996.
    explained = df_pcoa.proportion_explained
    ax.set_xlabel('PCo 1 (' + str(round(explained.iloc[0] * 100, 1)) + '%)' , fontsize = 14)
    ax.set_ylabel('PCo 2 (' + str(round(explained.iloc[1] * 100, 1)) + '%)' , fontsize = 14)

    plt.legend(loc="upper right")

    fig_name = mt.get_path() + '/figures/pcoa.png'
    fig.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
    def test_extensive(self):
        """``pcoa`` on sample data 2 must reproduce the known results.

        The check is run twice: once with the raw ``numpy.ndarray`` and once
        with a ``DistanceMatrix`` built from it.
        """
        sample_ids = [str(i) for i in range(6)]
        axis_labels = ['PC%d' % (k + 1) for k in range(6)]

        expected_eigvals = pd.Series(
            [0.3984635, 0.36405689, 0.28804535, 0.27479983, 0.19165361, 0.0],
            index=axis_labels)
        expected_proportions = pd.Series(
            [0.2626621381, 0.2399817314, 0.1898758748, 0.1811445992,
             0.1263356565, 0.0],
            index=axis_labels)
        expected_coords = pd.DataFrame(
            [
                [-0.028597, 0.22903853, 0.07055272, 0.26163576, 0.28398669,
                 0.0],
                [0.37494056, 0.22334055, -0.20892914, 0.05057395,
                 -0.18710366, 0.0],
                [-0.33517593, -0.23855979, -0.3099887, 0.11521787,
                 -0.05021553, 0.0],
                [0.25412394, -0.4123464, 0.23343642, 0.06403168,
                 -0.00482608, 0.0],
                [-0.28256844, 0.18606911, 0.28875631, -0.06455635,
                 -0.21141632, 0.0],
                [0.01727687, 0.012458, -0.07382761, -0.42690292, 0.1695749,
                 0.0],
            ],
            index=sample_ids,
            columns=axis_labels)

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=expected_eigvals,
            samples=expected_coords,
            proportion_explained=expected_proportions)

        data = np.loadtxt(get_data_path('PCoA_sample_data_2'))
        # a numpy.ndarray and a DistanceMatrix must give the same results
        for candidate in (data, DistanceMatrix(data)):
            observed = pcoa(candidate)
            assert_ordination_results_equal(observed,
                                            expected_results,
                                            ignore_directionality=True)
Exemple #31
0
def js_PCoA(distributions):
    """Dimension reduction via Jensen-Shannon Divergence & Principal Coordinates

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distributions probabilities.

    Returns
    -------
    pcoa : array, shape (`n_dists`, 2)
    """
    condensed = dist.pdist(distributions.values, _jensen_shannon)
    dist_matrix = DistanceMatrix(dist.squareform(condensed))
    if not skbio_old:
        # current scikit-bio: function API, coordinates under ``samples``
        return pcoa(dist_matrix).samples.values[:, 0:2]
    # older scikit-bio: class API, coordinates under ``site``
    return PCoA(dist_matrix).scores().site[:, 0:2]
Exemple #32
0
def js_PCoA(distributions):
    """Reduce distributions to 2-D with Jensen-Shannon distances and PCoA.

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distributions probabilities.

    Returns
    -------
    pcoa : array, shape (`n_dists`, 2)
        The first two ordination axes.
    """
    pairwise = dist.squareform(
        dist.pdist(distributions.values, _jensen_shannon))
    dist_matrix = DistanceMatrix(pairwise)
    if skbio_old:
        # legacy scikit-bio exposes a PCoA class with ``scores().site``
        coordinates = PCoA(dist_matrix).scores().site
    else:
        coordinates = pcoa(dist_matrix).samples.values
    return coordinates[:, 0:2]
Exemple #33
0
    def test_centroids_uneq_groups(self):
        """
        Centroid statistic for unequal group sizes; the expected result
        here was calculated by hand
        """
        exp = [
            [2.5847022428144935, 2.285624595858895, 1.7022431146340287],
            [1.724817266046108, 1.724817266046108],
            [2.4333280644972795, 2.389000390879655,
             2.8547180589306036, 3.218568759338847],
        ]
        exp_stat = f_oneway(*exp)[0]

        coords = pcoa(self.uneq_mat).samples

        obs = _compute_groups(coords, 'centroid', self.grouping_uneq)
        self.assertAlmostEqual(obs, exp_stat, places=6)

        # relabelling the groups must not change the statistic
        obs_relab = _compute_groups(coords, 'centroid',
                                    self.grouping_uneq_relab)
        self.assertAlmostEqual(obs, obs_relab, places=6)
Exemple #34
0
    def test_centroids_uneq_groups(self):
        """
        Compare the centroid statistic for unequal groups with a one-way F
        statistic; the expected values were calculated by hand
        """
        small_group = [2.5847022428144935, 2.285624595858895,
                       1.7022431146340287]
        pair_group = [1.724817266046108, 1.724817266046108]
        large_group = [2.4333280644972795, 2.389000390879655,
                       2.8547180589306036, 3.218568759338847]
        exp_stat, _ = f_oneway(small_group, pair_group, large_group)

        ordination = pcoa(self.uneq_mat)
        sample_coords = ordination.samples

        obs = _compute_groups(sample_coords, 'centroid', self.grouping_uneq)
        self.assertAlmostEqual(obs, exp_stat, places=6)

        obs_relab = _compute_groups(sample_coords, 'centroid',
                                    self.grouping_uneq_relab)
        self.assertAlmostEqual(obs, obs_relab, places=6)
def PCoA_total_from_matrix(distance_matrix, biom_file, metadata_file, plot=False):
	"""Plot a PCoA of *distance_matrix* with samples colored by body site.

	Sample ids come from ``BW.extract_samples(biom_file)`` and the metadata
	from ``meta.extract_metadata(metadata_file)``. The axis labels carry the
	percentage explained by each of the first three axes.

	Shows the figure and returns ``None`` when ``plot`` is true; otherwise
	returns the figure.
	"""
	sk_distance_matrix = DistanceMatrix(distance_matrix, BW.extract_samples(biom_file))
	pd_metadata = pd.DataFrame.from_dict(meta.extract_metadata(metadata_file), orient='index')

	result = pcoa(sk_distance_matrix)

	explained = result.proportion_explained
	axis_labels = tuple(
		'PC ' + str(axis + 1) + ' (' + str(round(explained.iloc[axis]*100, 2)) + '%)'
		for axis in range(3)
	)
	fig = result.plot(df=pd_metadata, column='body_site',
					axis_labels=axis_labels,
					title='Samples colored by body site',
					cmap='Set1', s=50)
	fig.set_size_inches(18.5, 10.5)

	if not plot:
		return fig
	plt.show()
Exemple #36
0
    def test_median_normal(self):
        """``permdisp`` (median) gives the same result for a distance matrix
        and for a precomputed ordination of that matrix."""
        expected_values = ['PERMDISP', 'F-value', 9, 2, 0.139475441876,
                           0.61, 99]
        exp = pd.Series(expected_values, index=self.exp_index,
                        name='PERMDISP results')
        shared_kwargs = dict(test='median', permutations=99)

        # from the distance matrix directly
        np.random.seed(0)
        obs = permdisp(self.unifrac_dm, self.unif_grouping, **shared_kwargs)
        self.assert_series_equal(obs, exp)

        # from an ordination computed beforehand, with the same seed
        np.random.seed(0)
        ordination = pcoa(self.unifrac_dm)
        obs2 = permdisp(ordination, self.unif_grouping, **shared_kwargs)
        self.assert_series_equal(obs2, exp)
Exemple #37
0
    def test_median_fsvd(self):
        """``permdisp`` (median) with FSVD on three dimensions gives the same
        result for a distance matrix and for a precomputed ordination."""
        expected_values = ['PERMDISP', 'F-value', 9, 2, 0.04078077215673714,
                           0.8, 99]
        exp = pd.Series(expected_values, index=self.exp_index,
                        name='PERMDISP results')
        fsvd_kwargs = dict(method='fsvd', number_of_dimensions=3)

        # ordination performed inside permdisp
        np.random.seed(0)
        obs = permdisp(self.unifrac_dm, self.unif_grouping, test='median',
                       permutations=99, **fsvd_kwargs)
        self.assert_series_equal(obs, exp)

        # ordination performed up front, with the same seed
        np.random.seed(0)
        ordination = pcoa(self.unifrac_dm, **fsvd_kwargs)
        obs = permdisp(ordination, self.unif_grouping, test='median',
                       permutations=99)
        self.assert_series_equal(obs, exp)
def PCoA_group_from_matrix(distance_matrix, biom_file, groups, plot=False):
	"""Plot a PCoA of *distance_matrix* colored by the entries of *groups*.

	Samples are labelled '0', '1', ... in the order of ``groups``, and each
	group value is stored in the 'body_site' metadata column used for
	coloring. ``biom_file`` is not used by this function.

	Shows the figure and returns ``None`` when ``plot`` is true; otherwise
	returns the figure.
	"""
	group_count = len(groups)
	sample_labels = [str(position) for position in range(group_count)]
	sk_distance_matrix = DistanceMatrix(distance_matrix, sample_labels)

	metadata = {sample_labels[position]: {'body_site': groups[position]} for position in range(group_count)}
	pd_metadata = pd.DataFrame.from_dict(metadata, orient='index')

	result = pcoa(sk_distance_matrix)

	explained = result.proportion_explained
	axis_labels = tuple(
		'PC ' + str(axis + 1) + ' (' + str(round(explained.iloc[axis]*100, 2)) + '%)'
		for axis in range(3)
	)
	fig = result.plot(df=pd_metadata, column='body_site',
					axis_labels=axis_labels,
					title='Samples colored by body site',
					cmap='Set1', s=50)
	fig.set_size_inches(18.5, 10.5)

	if not plot:
		return fig
	plt.show()
def PCoA_total_from_matrix_clustering(distance_matrix, biom_file, assignments, plot=False):
	"""Plot a PCoA of *distance_matrix* colored by cluster assignment.

	Sample ids come from ``BW.extract_samples(biom_file)``. Sample ``i`` is
	labelled 'Group <assignments[i] + 1>' in the 'body_site' metadata column
	used for coloring.

	Shows the figure and returns ``None`` when ``plot`` is true; otherwise
	returns the figure.
	"""
	samples = BW.extract_samples(biom_file)
	# Reuse the ids extracted above instead of reading the biom file again.
	sk_distance_matrix = DistanceMatrix(distance_matrix, samples)

	metadata = {samples[i]: {'body_site': 'Group ' + str(assignments[i]+1)} for i in range(len(assignments))}

	pd_metadata = pd.DataFrame.from_dict(metadata, orient='index')

	result = pcoa(sk_distance_matrix)

	fig = result.plot(df=pd_metadata, column='body_site',
							axis_labels=('PC 1 (' + str(round(result.proportion_explained.iloc[0]*100, 2)) + '%)', 'PC 2 (' + str(round(result.proportion_explained.iloc[1]*100, 2)) + '%)', 'PC 3 (' + str(round(result.proportion_explained.iloc[2]*100, 2)) + '%)'),
							title='Samples colored by body site',
							cmap='Set1', s=50)

	fig.set_size_inches(18.5, 10.5)

	if plot:
		plt.show()
	else:
		return fig
Exemple #40
0
    def fit(self, X, y=None):
        """
        Fit a PCoA embedding to a precomputed distance matrix.

        Parameters
        ----------
        X : array-like
            Feature table or distance matrix
        y : None
            ignored

        Returns
        -------
        self
            fitted pcoa, with ``embedding_`` set

        Raises
        ------
        NotImplementedError
            if ``self.metric`` is anything other than ``'precomputed'``

        """
        if self.metric != 'precomputed':
            raise NotImplementedError()
        self.embedding_ = pcoa(X).samples
        return self
    def test_extensive(self):
        """Check ``pcoa`` on sample data 2 for array and matrix input."""
        dimension_count = 6
        sample_ids = [str(i) for i in range(dimension_count)]
        axis_labels = ['PC%d' % i for i in range(1, dimension_count + 1)]

        known_eigvals = [0.3984635, 0.36405689, 0.28804535, 0.27479983,
                         0.19165361, 0.0]
        known_proportions = [0.2626621381, 0.2399817314,
                             0.1898758748, 0.1811445992,
                             0.1263356565, 0.0]
        known_coords = [
            [-0.028597, 0.22903853, 0.07055272, 0.26163576,
             0.28398669, 0.0],
            [0.37494056, 0.22334055, -0.20892914, 0.05057395,
             -0.18710366, 0.0],
            [-0.33517593, -0.23855979, -0.3099887, 0.11521787,
             -0.05021553, 0.0],
            [0.25412394, -0.4123464, 0.23343642, 0.06403168,
             -0.00482608, 0.0],
            [-0.28256844, 0.18606911, 0.28875631, -0.06455635,
             -0.21141632, 0.0],
            [0.01727687, 0.012458, -0.07382761, -0.42690292,
             0.1695749, 0.0],
        ]

        expected_fields = dict(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(known_eigvals, index=axis_labels),
            samples=pd.DataFrame(known_coords, index=sample_ids,
                                 columns=axis_labels),
            proportion_explained=pd.Series(known_proportions,
                                           index=axis_labels))
        expected_results = OrdinationResults(**expected_fields)

        data = np.loadtxt(get_data_path('PCoA_sample_data_2'))
        # passing a numpy.ndarray or a DistanceMatrix to pcoa must give
        # the same results
        inputs = (data, DistanceMatrix(data))
        for matrix in inputs:
            computed = pcoa(matrix)
            assert_ordination_results_equal(computed, expected_results,
                                            ignore_directionality=True)
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import pairwise_distances
from skbio.stats.ordination import pcoa

# Compute Jaccard distances
# bcto_matrix = bcto_cover.as_matrix()
# bcto_distances = pairwise_distances(bcto_matrix,metric='jaccard')
# pcoa_of_distance = pcoa(bcto_distances)
# pcoa_of_distance.plot()
# plt.show()
# print pcoa_of_distance

# firm_matrix = firm_cover.as_matrix()
# firm_distances = pairwise_distances(firm_matrix,metric='jaccard')
# pcoa_of_distance = pcoa(firm_distances)
# pcoa_of_distance.plot()
# plt.show()
# print pcoa_of_distance

# Load the tab-separated metagenome coverage table.
asf_bcto_firm_cover = pd.read_csv(
    '../results/metagenome_coverage/asf_bcto_firm_mg_coverage.tsv', sep='\t')

# ``DataFrame.as_matrix()`` was deprecated and then removed from pandas;
# ``.values`` yields the same ndarray and is available in every release.
asf_bcto_firm_matrix = asf_bcto_firm_cover.values

# Pairwise Jaccard distances between the rows of the coverage table, followed
# by a principal coordinate analysis of that distance matrix.
asf_bcto_firm_distances = pairwise_distances(asf_bcto_firm_matrix,
                                             metric='jaccard')
pcoa_of_distance = pcoa(asf_bcto_firm_distances)
pcoa_of_distance.plot()
plt.show()
Exemple #43
0
    def test_centroids_null(self):
        """The centroid statistic computed on ``self.null_mat`` must be NaN."""
        ordination = pcoa(self.null_mat)
        samples = ordination.samples

        obs_null = _compute_groups(samples, 'centroid', self.grouping_eq)
        # Previously ``np.isnan(obs_null)`` was evaluated and thrown away, so
        # this test could never fail. Assert on the result instead.
        assert np.all(np.isnan(obs_null))
Exemple #44
0
xls_file = "../data/Coral_ChemiFRAC_test.xlsx"

# Sheet 1 holds the feature table (transposed after loading); sheet 0 holds
# the network edges.
# NOTE(review): ``sheetname`` was renamed to ``sheet_name`` in pandas 0.21 --
# switch the keyword when moving to a newer pandas.
table = pd.read_excel(xls_file, sheetname=1, index_col=0).T
edges = pd.read_excel(xls_file, sheetname=0)

# Build a sparse adjacency matrix weighted by the "Cosine" column, sized to
# hold the largest cluster ID, and turn it into a networkx graph.
maxID = max([edges["CLUSTERID1"].max(), edges["CLUSTERID2"].max()]) + 1
spm = coo_matrix(
    (edges["Cosine"].values, (edges["CLUSTERID1"].values, edges["CLUSTERID2"].values)), shape=(maxID, maxID)
)
coral_nwk = nx.from_scipy_sparse_matrix(spm)
meta_map = pd.read_table("../data/%s" % meta_file)

small_table = table

# The sample ID vector never changes, so look it up once instead of on every
# inner-loop iteration.
sampIDs = meta_map["#SampleID"].values

# Fill the lower triangle with the pairwise ``rig`` value of every sample pair.
dm = pd.DataFrame(columns=meta_map.index, index=meta_map.index)
for i, _x in enumerate(sampIDs):
    for _y in sampIDs[:i]:
        x = small_table.loc[_x, :]
        y = small_table.loc[_y, :]
        dm.loc[_x, _y] = rig(coral_nwk, x, y)

dm.to_csv("../results/rig.txt", sep="\t")
# The file is tab-separated, so it has to be read back with the same
# separator; the default comma separator collapses every row into one field.
dm = pd.read_csv("../results/rig.txt", sep="\t", index_col=0)
dm = dm.loc[sampIDs, sampIDs]
dm = dm.fillna(0)
# Only the lower triangle was filled; adding the transpose symmetrizes it.
dmpc = pcoa(dm + dm.T)
dmpc.samples.index = sampIDs
dmpc.write("../results/rig_pc.txt")
    def test_fsvd(self):
        """Exercise the ``fsvd`` method of pcoa against ``eigh``.

        Covers agreement between the two methods, the ``inplace`` flag,
        ``number_of_dimensions`` edge cases (0, too large, negative) and the
        warning emitted for a larger matrix when no dimension count is given.
        """
        dm1, dm2, dm3 = (
            DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
            for _ in range(3)
        )
        compare_opts = dict(ignore_directionality=True,
                            ignore_method_names=True)

        # eigh vs. fsvd, and fsvd with vs. without in-place centering
        expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
                                inplace=False)
        results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                       inplace=False)
        results_inplace = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                               inplace=True)

        assert_ordination_results_equal(results, expected_results,
                                        **compare_opts)
        assert_ordination_results_equal(results, results_inplace,
                                        **compare_opts)

        # number_of_dimensions=0 must match asking for every dimension
        results2 = pcoa(dm3, method="fsvd", number_of_dimensions=0,
                        inplace=False)
        expected_results2 = pcoa(dm3, method="fsvd",
                                 number_of_dimensions=dm3.data.shape[0],
                                 inplace=False)
        assert_ordination_results_equal(results2, expected_results2,
                                        **compare_opts)

        # out-of-range dimension counts are rejected by both methods
        dim_too_large = dm1.data.shape[0] + 10
        invalid_cases = (
            ("fsvd", dim_too_large),
            ("fsvd", -1),
            ("eigh", dim_too_large),
            ("eigh", -1),
        )
        for method_name, bad_dims in invalid_cases:
            with self.assertRaises(ValueError):
                pcoa(dm2, method=method_name, number_of_dimensions=bad_dims)

        dm_big = DistanceMatrix.read(get_data_path('PCoA_sample_data_12dim'))
        with self.assertWarnsRegex(RuntimeWarning,
                                   "no value for number_of_dimensions"):
            pcoa(dm_big, method="fsvd", number_of_dimensions=0)
Exemple #46
0
    def plot_mds(
        self,
        rank="auto",
        metric="braycurtis",
        method="pcoa",
        title=None,
        xlabel=None,
        ylabel=None,
        color=None,
        size=None,
        tooltip=None,
        return_chart=False,
        label=None,
    ):
        """Plot beta diversity distance matrix using multidimensional scaling (MDS).

        Parameters
        ----------
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.
        metric : {'braycurtis', 'manhattan', 'jaccard', 'unifrac', 'unweighted_unifrac'}, optional
            Function to use when calculating the distance between two samples.
        method : {'pcoa', 'smacof'}, optional
            Algorithm to use for ordination. PCoA uses eigenvalue decomposition and is not well
            suited to non-euclidean distance functions. SMACOF is an iterative optimization strategy
            that can be used as an alternative.
        title : `string`, optional
            Text label at the top of the plot.
        xlabel : `string`, optional
            Text label along the horizontal axis.
        ylabel : `string`, optional
            Text label along the vertical axis.
        size : `string` or `tuple`, optional
            A string or a tuple containing strings representing metadata fields. The size of points
            in the resulting plot will change based on the metadata associated with each sample.
        color : `string` or `tuple`, optional
            A string or a tuple containing strings representing metadata fields. The color of points
            in the resulting plot will change based on the metadata associated with each sample.
        tooltip : `string` or `list`, optional
            A string or list containing strings representing metadata fields. When a point in the
            plot is hovered over, the value of the metadata associated with that sample will be
            displayed in a modal. A list passed here is copied, never modified.
        return_chart : `bool`, optional
            If True, return the chart object instead of displaying it.
        label : `string` or `callable`, optional
            A metadata field (or function) used to label each analysis. If passing a function, a
            dict containing the metadata for each analysis is passed as the first and only
            positional argument. The callable function must return a string.

        Returns
        -------
        The chart object when `return_chart` is True; otherwise the chart is displayed and None
        is returned.

        Raises
        ------
        OneCodexException
            If fewer than two results are available, or `method` is not 'smacof' or 'pcoa'.

        Examples
        --------
        Scatter plot of weighted UniFrac distance between all our samples, using counts at the genus
        level.

        >>> plot_mds(rank='genus', metric='unifrac')

        Notes
        -----
        **For `smacof`**: The values reported on the axis labels are Pearson's correlations between
        the distances between points on each axis alone, and the corresponding distances in the
        distance matrix calculated using the user-specified metric. These values are related to the
        effectiveness of the MDS algorithm in placing points on the scatter plot in such a way that
        they truly represent the calculated distances. They do not reflect how well the distance
        metric captures similarities between the underlying data (in this case, an OTU table).
        """
        if len(self._results) < 2:
            raise OneCodexException("`plot_mds` requires 2 or more valid classification results.")

        dists = self._compute_distance(rank, metric).to_data_frame()

        # here we figure out what to put in the tooltips and get the appropriate data
        if tooltip:
            # copy a caller-supplied list: the inserts below must not leak back into the
            # caller's object (repeated calls would otherwise keep growing it)
            tooltip = list(tooltip) if isinstance(tooltip, list) else [tooltip]
        else:
            tooltip = []

        tooltip.insert(0, "Label")

        if color and color not in tooltip:
            tooltip.insert(1, color)

        if size and size not in tooltip:
            tooltip.insert(2, size)

        magic_metadata, magic_fields = self._metadata_fetch(tooltip, label=label)

        if method == "smacof":
            # adapted from https://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html
            x_field = "MDS1"
            y_field = "MDS2"

            seed = np.random.RandomState(seed=3)
            mds = manifold.MDS(
                max_iter=3000, eps=1e-12, random_state=seed, dissimilarity="precomputed", n_jobs=1
            )
            pos = mds.fit(dists).embedding_
            plot_data = pd.DataFrame(pos, columns=[x_field, y_field], index=dists.index)
            # scale each axis by its largest absolute value, i.e. into [-1, 1]
            plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1)

            # determine how much of the original distance is captured by each of the axes after MDS.
            # this implementation of MDS does not use eigen decomposition and so there's no simple
            # way of returning a 'percent of variance explained' value
            r_squared = []

            for axis in [0, 1]:
                mds_dist = pos.copy()
                mds_dist[::, axis] = 0
                mds_dist = squareform(euclidean_distances(mds_dist).round(6))
                r_squared.append(pearsonr(mds_dist, squareform(dists))[0])

            # label the axes
            x_extra_label = "r² = %.02f" % (r_squared[0],)
            y_extra_label = "r² = %.02f" % (r_squared[1],)
        elif method == "pcoa":
            # suppress eigenvalue warning from skbio--not because it's an invalid warning, but
            # because lots of folks in the field run pcoa on these distances functions, even if
            # statistically inappropriate. perhaps this will change if we ever become more
            # opinionated about the analyses that we allow our users to do (roo)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                ord_result = ordination.pcoa(
                    dists.round(6)
                )  # round to avoid float precision errors

            plot_data = ord_result.samples.iloc[:, [0, 1]]  # get first two components
            # scale each axis by its largest absolute value, i.e. into [-1, 1]
            plot_data = plot_data.div(plot_data.abs().max(axis=0), axis=1)
            plot_data.index = dists.index
            x_field, y_field = plot_data.columns.tolist()  # name of first two components

            # proportion_explained is taken by position here; use .iloc so the lookup does
            # not depend on how the Series is labelled
            explained = ord_result.proportion_explained
            x_extra_label = "%0.02f%%" % (explained.iloc[0] * 100,)
            y_extra_label = "%0.02f%%" % (explained.iloc[1] * 100,)
        else:
            raise OneCodexException("MDS method must be one of: smacof, pcoa")

        # label the axes
        if xlabel is None:
            xlabel = "{} ({})".format(x_field, x_extra_label)
        if ylabel is None:
            ylabel = "{} ({})".format(y_field, y_extra_label)

        plot_data = pd.concat([plot_data, magic_metadata], axis=1).reset_index()

        alt_kwargs = dict(
            x=alt.X(x_field, axis=alt.Axis(title=xlabel)),
            y=alt.Y(y_field, axis=alt.Axis(title=ylabel)),
            tooltip=[magic_fields[t] for t in tooltip],
            href="url:N",
            url="https://app.onecodex.com/classification/" + alt.datum.classification_id,
        )

        # only add these parameters if they are in use
        if color:
            alt_kwargs["color"] = magic_fields[color]
        if size:
            alt_kwargs["size"] = magic_fields[size]

        # "url" is a calculated field, not an encoding channel, so it is popped before encode()
        chart = (
            alt.Chart(plot_data)
            .transform_calculate(url=alt_kwargs.pop("url"))
            .mark_circle()
            .encode(**alt_kwargs)
        )

        if title:
            chart = chart.properties(title=title)

        if return_chart:
            return chart
        else:
            chart.interactive().display()
 def test_invalid_input(self):
     """pcoa must raise DissimilarityMatrixError for this 2x2 input."""
     bad_matrix = [[1, 2], [3, 4]]
     with npt.assert_raises(DissimilarityMatrixError):
         pcoa(bad_matrix)
        #svgfile = os.path.join('/Users/avoorhis/programming/jupyter/VAMPS_API',args.prefix+'_dendrogram.svg')
        svgfile = os.path.join(args.outdir,args.prefix+'_dendrogram.svg')
        print(os.getcwd())
        #print svgfile
        print('rendering0')
        rooted_tree.render(svgfile, tree_style=ts)  # writes file to tmp



    if args.function == 'pcoa_3d':
        #print('starting pcoa_3d')
        from skbio import DistanceMatrix
        dm = DistanceMatrix(dm1)
        #print(dm1)
        #print('end pcoa_3d')
        pcoa_data = pcoa(args, dm, datasets)
        
        #test_PCoA()

    if args.function == 'pcoa_2d':
        # if not args.metadata:
        #   print "ERROR: In PCoA and no metadata recieved"
        #   sys.exit()

        pcoa_data = pcoa(args, dm3)
        #print json.dumps(pcoa_data)

        #metadata = json.loads( args.metadata.strip("'") )
        pcoa_pdf(args, pcoa_data)
        #print pcoa_data
Exemple #49
0
def permdisp(distance_matrix, grouping, column=None, test='median',
             permutations=999):
    """Test for Homogeneity of Multivariate Groups Dispersions using Marti
    Anderson's PERMDISP2 procedure.

    PERMDISP is a multivariate analogue of Levene's test for homogeneity of
    multivariate variances. Distances are handled by reducing the
    original distances to principal coordinates. PERMDISP calculates an
    F-statistic to assess whether the dispersions between groups is significant


    Parameters
    ----------
    distance_matrix : DistanceMatrix
        Distance matrix containing distances between objects (e.g., distances
        between samples of microbial communities).
    grouping : 1-D array_like or pandas.DataFrame
        Vector indicating the assignment of objects to groups. For example,
        these could be strings or integers denoting which group an object
        belongs to. If `grouping` is 1-D ``array_like``, it must be the same
        length and in the same order as the objects in `distance_matrix`. If
        `grouping` is a ``DataFrame``, the column specified by `column` will be
        used as the grouping vector. The ``DataFrame`` must be indexed by the
        IDs in `distance_matrix` (i.e., the row labels must be distance matrix
        IDs), but the order of IDs between `distance_matrix` and the
        ``DataFrame`` need not be the same. All IDs in the distance matrix must
        be present in the ``DataFrame``. Extra IDs in the ``DataFrame`` are
        allowed (they are ignored in the calculations).
    column : str, optional
        Column name to use as the grouping vector if `grouping` is a
        ``DataFrame``. Must be provided if `grouping` is a ``DataFrame``.
        Cannot be provided if `grouping` is 1-D ``array_like``.
    test : {'centroid', 'median'}
        determines whether the analysis is done using centroid or spatial
        median.
    permutations : int, optional
        Number of permutations to use when assessing statistical
        significance. Must be greater than or equal to zero. If zero,
        statistical significance calculations will be skipped and the p-value
        will be ``np.nan``.

    Returns
    -------
    pandas.Series
        Results of the statistical test, including ``test statistic`` and
        ``p-value``.

    Raises
    ------
    TypeError
        If, when using the spatial median test, the pcoa ordination is not of
        type np.float32 or np.float64, the spatial median function will fail
        and the centroid test should be used instead
    ValueError
        If the test is not centroid or median.
    TypeError
        If the distance matrix is not an instance of a
        ``skbio.DistanceMatrix``.
    ValueError
        If there is only one group
    ValueError
        If a list and a column name are both provided
    ValueError
        If a list is provided for `grouping` and its length does not match
        the number of ids in distance_matrix
    ValueError
        If all of the values in the grouping vector are unique
    KeyError
        If there are ids in grouping that are not in distance_matrix

    See Also
    --------
    permanova
    anosim

    Notes
    -----
    The significance of the results from this function will be the same as the
    results found in vegan's betadisper, however due to floating point
    variability the F-statistic results may vary slightly.

    See [1]_ for the original method reference, as well as
    ``vegan::betadisper``, available in R's vegan package [2]_.

    References
    ----------
    .. [1] Anderson, Marti J. "Distance-Based Tests for Homogeneity of
        Multivariate Dispersions." Biometrics 62 (2006):245-253

    .. [2] http://cran.r-project.org/web/packages/vegan/index.html

    Examples
    --------
    Load a 6x6 distance matrix and grouping vector denoting 2 groups of
    objects:

    >>> from skbio import DistanceMatrix
    >>> dm = DistanceMatrix([[0,    0.5,  0.75, 1, 0.66, 0.33],
    ...                       [0.5,  0,    0.25, 0.33, 0.77, 0.61],
    ...                       [0.75, 0.25, 0,    0.1, 0.44, 0.55],
    ...                       [1,    0.33, 0.1,  0, 0.75, 0.88],
    ...                       [0.66, 0.77, 0.44, 0.75, 0, 0.77],
    ...                       [0.33, 0.61, 0.55, 0.88, 0.77, 0]],
    ...                       ['s1', 's2', 's3', 's4', 's5', 's6'])
    >>> grouping = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

    Run PERMDISP using 99 permutations to calculate the p-value:

    >>> from skbio.stats.distance import permdisp
    >>> import numpy as np
    >>> #make output deterministic, should not be included during normal use
    >>> np.random.seed(0)
    >>> permdisp(dm, grouping, permutations=99)
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             1.03296
    p-value                       0.35
    number of permutations          99
    Name: PERMDISP results, dtype: object

    The return value is a ``pandas.Series`` object containing the results of
    the statistical test.

    To suppress calculation of the p-value and only obtain the F statistic,
    specify zero permutations:

    >>> permdisp(dm, grouping, permutations=0)
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             1.03296
    p-value                        NaN
    number of permutations           0
    Name: PERMDISP results, dtype: object

    PERMDISP computes variances based on two types of tests, using either
    centroids or spatial medians, also commonly referred to as a geometric
    median. The spatial median is thought to yield a more robust test
    statistic, and this test is used by default. Spatial medians are computed
    using an iterative algorithm to find the optimally minimum point from all
    other points in a group while centroids are computed using a deterministic
    formula. As such the two different tests yield slightly different F
    statistics.

    >>> np.random.seed(0)
    >>> permdisp(dm, grouping, test='centroid', permutations=6)
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             3.67082
    p-value                   0.428571
    number of permutations           6
    Name: PERMDISP results, dtype: object

    You can also provide a ``pandas.DataFrame`` and a column denoting the
    grouping instead of a grouping vector. The following DataFrame's
    Grouping column specifies the same grouping as the vector we used in the
    previous examples:

    >>> import pandas as pd
    >>> df = pd.DataFrame.from_dict(
    ...      {'Grouping': {'s1': 'G1', 's2': 'G1', 's3': 'G1', 's4': 'G2',
    ...                    's5': 'G2', 's6': 'G2'}})
    >>> permdisp(dm, df, 'Grouping', permutations=6, test='centroid')
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             3.67082
    p-value                   0.428571
    number of permutations           6
    Name: PERMDISP results, dtype: object

    Note that when providing a ``DataFrame``, the ordering of rows and/or
    columns does not affect the grouping vector that is extracted. The
    ``DataFrame`` must be indexed by the distance matrix IDs (i.e., the row
    labels must be distance matrix IDs).

    If IDs (rows) are present in the ``DataFrame`` but not in the distance
    matrix, they are ignored. Thus, the ``DataFrame`` can be a superset of the
    distance matrix IDs. Note that the reverse is not true: IDs in the distance
    matrix *must* be present in the ``DataFrame`` or an error will be raised.

    PERMDISP should be used to determine whether the dispersions between the
    groups in your distance matrix are significantly separated.
    A non-significant test result indicates that group dispersions are similar
    to each other. PERMANOVA or ANOSIM should then be used in conjunction to
    determine whether clustering within groups is significant.

    """
    if test not in ('centroid', 'median'):
        raise ValueError('Test must be centroid or median')

    # Validate the inputs before running the ordination: bad input then fails
    # fast with the documented TypeError/ValueError, instead of after (or
    # masked by) the comparatively expensive pcoa call.
    sample_size, num_groups, grouping, _, _ = _preprocess_input(
        distance_matrix, grouping, column)

    # Reduce the distances to principal coordinates; the group statistic is
    # computed on the sample coordinates.
    samples = pcoa(distance_matrix).samples

    test_stat_function = partial(_compute_groups, samples, test)

    stat, p_value = _run_monte_carlo_stats(test_stat_function, grouping,
                                           permutations)

    return _build_results('PERMDISP', 'F-value', sample_size, num_groups,
                          stat, p_value, permutations)
Exemple #50
0
                        index=samples,
                        columns=samples)
# Pass the separator by keyword: positional arguments to ``to_csv`` other
# than the path are deprecated in recent pandas releases.
graph_dm.to_csv('../results/aitchison.txt', sep='\t')

# Read in graph_dm
graph_dm = pd.read_csv('../results/unconnected_aitchison.txt',
                       sep='\t', index_col=0)
# table = pd.read_table('../data/skinmap_chemiFrac_test.txt',
#                        sep='\t', index_col=0)
graph_dm.index = table.columns
graph_dm.columns = table.columns
# _dm = pw_distances('braycurtis', table.values, table.index.values)
# _dm.write('../results/braycurtis.txt')
# Symmetrize by adding the transpose before building the DistanceMatrix.
_dm = DistanceMatrix(graph_dm.values + graph_dm.values.T)
_dm.ids = graph_dm.index
pcoa_v = pcoa(_dm)

# Scatter plot of the first two principal coordinates.
fig = plt.figure(3)
plt.plot(pcoa_v.samples['PC1'],
         pcoa_v.samples['PC2'], 'ob')
# plt.plot(pcoa_v.eigvecs[not_stressed, 0],
#          pcoa_v.eigvecs[not_stressed, 1],
#          'o', c='#FFFFFF', label='Before stress')
# plt.plot(pcoa_v.eigvecs[stressed, 0],
#          pcoa_v.eigvecs[stressed, 1],
#          'o', c='#999999', label='After stress')
# plt.legend(loc=3)
#plt.title('Weighted Aitchison on Coral data')
#fig.savefig('../results/coral_chemifrac.png')

pcoa_v.write('../results/coral_unconnected_aitchison_pcoa.txt')