Ejemplo n.º 1
0
    def test_call_unequal_group_sizes(self):
        """Check the PERMANOVA result for the unequal-size fixtures.

        The same expected series must be returned for
        ``self.grouping_unequal`` and ``self.grouping_unequal_relabeled``,
        each run after reseeding NumPy's global RNG with 0.
        """
        expected = pd.Series(
            ['PERMANOVA', 'pseudo-F', 6, 3, 0.578848, 0.645, 999],
            index=self.exp_index)

        grouping_attrs = ('grouping_unequal', 'grouping_unequal_relabeled')
        for attr in grouping_attrs:
            np.random.seed(0)
            observed = permanova(self.dm_unequal, getattr(self, attr))
            self.assert_series_equal(observed, expected)
Ejemplo n.º 2
0
    def test_call_unequal_group_sizes(self):
        """Compare PERMANOVA output against one expected series, twice.

        The first run uses ``self.grouping_unequal`` and the second uses
        ``self.grouping_unequal_relabeled``; NumPy's global RNG is reseeded
        with 0 before each call.
        """
        expected_values = ['PERMANOVA', 'pseudo-F', 6, 3, 0.578848, 0.645,
                           999]
        expected = pd.Series(index=self.exp_index, data=expected_values)

        def run_seeded(grouping):
            # Reseed immediately before the call under test.
            np.random.seed(0)
            return permanova(self.dm_unequal, grouping)

        self.assert_series_equal(run_seeded(self.grouping_unequal), expected)
        self.assert_series_equal(
            run_seeded(self.grouping_unequal_relabeled), expected)
Ejemplo n.º 3
0
 def test_call_no_ties(self):
     """Check the named PERMANOVA result for the ``dm_no_ties`` fixture."""
     expected = pd.Series(
         ["PERMANOVA", "pseudo-F", 4, 2, 4.4, 0.332, 999],
         index=self.exp_index,
         name="PERMANOVA results",
     )
     # Reseed NumPy's global RNG right before the call under test.
     np.random.seed(0)
     self.assert_series_equal(
         permanova(self.dm_no_ties, self.grouping_equal), expected)
Ejemplo n.º 4
0
    def test_call_ties(self):
        """Check that PERMANOVA is repeatable across runs and input forms.

        Each call form is executed twice with the same seed and must return
        the same expected series. One form passes ``self.grouping_equal``
        directly; the other passes ``self.df`` with ``column='Group'``.
        """
        expected = pd.Series(
            ['PERMANOVA', 'pseudo-F', 4, 2, 2.0, 0.671, 999],
            index=self.exp_index)

        invocations = (
            lambda: permanova(self.dm_ties, self.grouping_equal),
            lambda: permanova(self.dm_ties, self.df, column='Group'),
        )
        for invoke in invocations:
            for _ in range(2):
                # Reseed NumPy's global RNG before every run.
                np.random.seed(0)
                self.assert_series_equal(invoke(), expected)
Ejemplo n.º 5
0
    def test_call_ties(self):
        """Rerun PERMANOVA with identical inputs and equivalent groupings.

        Both the grouping-vector form and the data-frame form
        (``self.df`` with ``column='Group'``) are run twice under seed 0 and
        compared with one expected series.
        """
        expected_values = ['PERMANOVA', 'pseudo-F', 4, 2, 2.0, 0.671, 999]
        expected = pd.Series(index=self.exp_index, data=expected_values)

        call_specs = [
            ((self.dm_ties, self.grouping_equal), {}),
            ((self.dm_ties, self.df), {'column': 'Group'}),
        ]
        for positional, keywords in call_specs:
            for _ in range(2):
                np.random.seed(0)
                observed = permanova(*positional, **keywords)
                self.assert_series_equal(observed, expected)
Ejemplo n.º 6
0
 def test_call_no_ties(self):
     """Compare PERMANOVA on ``dm_no_ties`` with the expected named series."""
     expected_values = ['PERMANOVA', 'pseudo-F', 4, 2, 4.4, 0.332, 999]
     expected = pd.Series(expected_values,
                          index=self.exp_index,
                          name='PERMANOVA results')
     # Seed NumPy's global RNG with 0 before running the test.
     np.random.seed(0)
     observed = permanova(self.dm_no_ties, self.grouping_equal)
     self.assert_series_equal(observed, expected)
Ejemplo n.º 7
0
def permanovaResult(args, current_wd, retrospect_dir, output_file_tag,
                    notebook_name, suppress, silence, neglect,
                    permutations=999, seed=0):
    """Run PERMANOVA, print the result, log it and save the parameters.

    Example invocation kept from the original source:
    python3 -m emmer.bake -m 'Permanova' -i emmer/data/bake_data_dir_6/filtered_infoRich__PCA_coordinates.csv

    Parameters
    ----------
    args, current_wd, suppress, silence
        Forwarded unchanged to ``PermanovaArgs``.
    retrospect_dir : str
        Directory that receives the parameter CSV.
    output_file_tag : str
        Prefix of the parameter CSV file name.
    notebook_name, neglect
        Forwarded unchanged to ``UpdateNoteBook``.
    permutations : int, optional
        Number of permutations passed to ``permanova``. Defaults to 999,
        the value that used to be hard-coded.
    seed : int, optional
        Value passed to ``numpy.random.seed`` before the test. Defaults to
        0, the value that used to be hard-coded. Its string form is what
        gets recorded in the notebook.
    """
    permanova_args = PermanovaArgs(args=args,
                                   current_wd=current_wd,
                                   suppress=suppress,
                                   silence=silence)

    ## conduct PERMANOVA
    # Seed NumPy's global RNG immediately before the test.
    numpy.random.seed(seed)

    result = permanova(permanova_args.dist_matrix,
                       permanova_args.cluster,
                       permutations=permutations)
    print(result)

    # The return value was never used, so the call is kept for its effect only.
    UpdateNoteBook(notebook_name=notebook_name,
                   neglect=neglect).updatePermanovaResult(
                       set_seed=str(seed),
                       set_cluster=permanova_args.cluster,
                       test_result=result)

    parameter_df = pandas.DataFrame({
        'individual': permanova_args.individual,
        'cluster': permanova_args.cluster
    })
    output_file_name = os.path.join(
        retrospect_dir,
        output_file_tag + '_retrospect_permanova_parameter.csv')
    parameter_df.to_csv(output_file_name)
Ejemplo n.º 8
0
 def permanova_permdisp(self):
     """Print PERMDISP and then PERMANOVA results for ``self.dist_df``.

     The group label of each sample is the text before the first underscore
     in the corresponding label yielded by iterating ``self.dist_df``.
     PERMDISP runs with 999 permutations and PERMANOVA with 9999.
     """
     print('running permdisp\n\n')
     # Build the shared inputs once; both tests take the same matrix and
     # the same grouping.
     distance_matrix = DistanceMatrix(self.dist_df)
     grouping = [label.split('_')[0] for label in list(self.dist_df)]
     print(permdisp(distance_matrix=distance_matrix,
                    grouping=grouping, permutations=999))
     print('running permanova\n\n')
     print(permanova(distance_matrix=distance_matrix,
                     grouping=grouping, permutations=9999))
Ejemplo n.º 9
0
 def testPer(self, dist, group):
     """Print this object's PERMANOVA output next to the module-level one.

     Entries 0 and 2 of ``self.permanova(dist, group)`` are printed first,
     followed by the result of the module-level ``permanova`` run with 999
     permutations on a ``DistanceMatrix`` whose ids are ``0..len(group)-1``.
     """
     own_result = self.permanova(dist, group)
     for position in (0, 2):
         print(own_result[position])

     sample_ids = range(len(group))
     reference = permanova(DistanceMatrix(dist, sample_ids),
                           group,
                           column=None,
                           permutations=999)
     print(reference)
Ejemplo n.º 10
0
def _beta(permutations, data, xvalues, yvalues):
    """Run PERMANOVA on the samples of two groups and collect distances.

    ``data`` is filtered down to the index labels of ``xvalues`` followed
    by those of ``yvalues``. The concatenation of both series, as a frame,
    serves as grouping metadata with ``xvalues.name`` as the column.

    Returns
    -------
    tuple
        ``(p-value, test statistic, xvals, yvals)``, where ``xvals`` and
        ``yvals`` are the non-NaN values of the filtered matrix restricted
        to each group and converted with ``to_series()``.
    """
    combined_ids = list(xvalues.index.values) + list(yvalues.index.values)
    data_test = data.filter(combined_ids)

    grouping_frame = pd.concat([xvalues, yvalues]).to_frame()
    # Either series name works for the column because they are the same.
    outcome = permanova(distance_matrix=data_test,
                        column=xvalues.name,
                        grouping=grouping_frame,
                        permutations=permutations).to_dict()

    def _group_distances(member_ids):
        # Distances among the given members only, with NaNs dropped.
        return list(data_test.filter(member_ids).to_series().dropna().values)

    xvals = _group_distances(xvalues.index.values)
    yvals = _group_distances(yvalues.index.values)
    return (outcome['p-value'], outcome['test statistic'], xvals, yvals)
Ejemplo n.º 11
0
def _beta(permutations, data, xvalues, yvalues):
    """Compare two sample groups with PERMANOVA.

    The matrix ``data`` is restricted to the ids of both groups, PERMANOVA
    is run on it using ``xvalues.name`` as the grouping column, and the
    non-NaN values of each single-group sub-matrix are gathered.

    Returns
    -------
    tuple
        ``(p-value, test statistic, xvals, yvals)``.
    """
    ids = [*xvalues.index.values, *yvalues.index.values]
    data_test = data.filter(ids)

    # x and y carry the same name, so either can supply the column.
    metadata = pd.concat([xvalues, yvalues]).to_frame()
    stats = permanova(
        distance_matrix=data_test,
        column=xvalues.name,
        grouping=metadata,
        permutations=permutations,
    ).to_dict()

    x_only = data_test.filter(xvalues.index.values)
    xvals = list(x_only.to_series().dropna().values)
    y_only = data_test.filter(yvalues.index.values)
    yvals = list(y_only.to_series().dropna().values)

    p_value = stats['p-value']
    statistic = stats['test statistic']
    return (p_value, statistic, xvals, yvals)
Ejemplo n.º 12
0
def get_permanova_ranked_list(x, y, feature_list, label_set):
    """Compute a per-feature PERMANOVA p-value table with FDR adjustment.

    For each position in ``feature_list`` the matching column of the
    transposed ``x`` is turned into a cityblock distance matrix and tested
    against ``y``. The p-values are then adjusted with ``fdr_bh``.

    Parameters
    ----------
    x
        Object exposing ``.transpose().values``; presumably a DataFrame
        with features as rows (TODO confirm against callers).
    y
        Grouping passed to ``permanova``.
    feature_list
        Labels used as the index of the returned frame.
    label_set
        Not used by this function.

    Returns
    -------
    pandas.DataFrame
        Columns ``"p-value"`` and ``"Adj p-value"``, indexed by
        ``feature_list``.
    """
    samples_by_feature = x.transpose().values

    p_values = []
    for position, _feature in enumerate(feature_list):
        feature_column = samples_by_feature[:, position]
        pairwise = pairwise_distances(feature_column.reshape(-1, 1),
                                      feature_column.reshape(-1, 1),
                                      metric="cityblock")
        test_result = permanova(DistanceMatrix(data=pairwise), y)
        p_values.append(test_result.loc["p-value"])

    # Element 1 of the multipletests result holds the adjusted p-values.
    adjusted = multipletests(p_values, method="fdr_bh")[1]
    table = {
        "p-value": np.array(p_values).reshape(-1),
        "Adj p-value": np.array(adjusted).reshape(-1),
    }
    return pd.DataFrame(index=feature_list, data=table)
Ejemplo n.º 13
0
def pseudoF_permanova(points, labels):
    """Run a PERMANOVA pseudo-F permutation test, print it and return it.

    Statistical significance is assessed via a permutation test: the
    assignment of objects to groups is randomly permuted a number of times,
    a pseudo-F statistic is computed for each permutation, and the p-value
    is the proportion of permuted pseudo-F statistics that are equal to or
    greater than the original (unpermuted) one.

    NOTE(review): no distances are computed here. ``points`` is handed
    straight to ``skbio.DistanceMatrix``, so it presumably must already be
    a square distance matrix; confirm against callers.

    Parameters
    ----------
    points : pandas.DataFrame or array-like
        Matrix converted with ``np.asarray`` and wrapped in a
        ``skbio.DistanceMatrix``.
    labels : array-like
        Group label of each object, passed to ``permanova`` as grouping.

    Returns
    -------
    The object returned by ``permanova``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0. np.asarray yields the
    # same array for a DataFrame and also accepts a plain ndarray.
    distances = skbio.DistanceMatrix(np.asarray(points))

    pseudo_f = permanova(distances, labels)
    print(pseudo_f)
    return pseudo_f
Ejemplo n.º 14
0
# Scatter plot of the `tsne` table, coloured by its "taxa" column.
sns.set(font_scale=1.5, style="ticks")
g = sns.FacetGrid(tsne, hue="taxa", height=10, aspect=16 / 10)
gm = g.map(plt.scatter, "x", "y", alpha=0.25)

# One text label per group, placed at the group's median x/y.
means = tsne.groupby(taxa).agg("median").reset_index()
texts = means.apply(
    lambda row: plt.text(row.x, row.y, row.taxa, alpha=0.65),
    axis=1,
)
# Let adjust_text reposition the labels, drawing arrows as configured.
texts = adjust_text(
    texts,
    force_text=(0.02, 0.5),
    arrowprops={"arrowstyle": "-|>", "alpha": 0.5, "color": "k"},
)
plt.savefig("figures/individual_media.png", dpi=200)
plt.close()

# Some statistics about metabolite usage
# indicator matrix 0 = metabolite not consumed, 1 = metabolite consumed
binary = mat.where(mat < -1e-6, 0).where(mat > -1e-6, 1)

# Jaccard distances = 1 - percent overlap
J = pdist(binary, "jaccard")
print("Jaccard distances:", pd.Series(J).describe(), sep="\n")

# euclidean distances
E = pdist(mat, "euclidean")

# Test whether genus explains a good amount of that variation
p = permanova(DistanceMatrix(E), taxa)
# Read the entries by label: integer positions on a string-labelled Series
# are deprecated in pandas and hide which value is which.
n_samples = p["sample size"]
n_groups = p["number of groups"]
pseudo_f = p["test statistic"]
# R2 = SS_between / SS_total. Since
# pseudo-F = (SS_between / (k - 1)) / (SS_within / (N - k)),
# SS_between / SS_within = F * (k - 1) / (N - k), which gives the line below.
r2 = 1 - 1 / (1 + pseudo_f * (n_groups - 1) / (n_samples - n_groups))
p["R2"] = r2
print("PERMANOVA on euclidean distances:", p, sep="\n")
Ejemplo n.º 15
0
 def test_call_no_permutations(self):
     exp = pd.Series(index=self.exp_index,
                     data=['PERMANOVA', 'pseudo-F', 4, 2, 4.4, np.nan, 0])
     obs = permanova(self.dm_no_ties, self.grouping_equal, permutations=0)
     self.assert_series_equal(obs, exp)
Ejemplo n.º 16
0
 def test_call_no_ties(self):
     exp = pd.Series(index=self.exp_index,
                     data=['PERMANOVA', 'pseudo-F', 4, 2, 4.4, 0.332, 999])
     np.random.seed(0)
     obs = permanova(self.dm_no_ties, self.grouping_equal)
     self.assert_series_equal(obs, exp)
Ejemplo n.º 17
0
            clr_res = clr_inv(np.dot(np.dot(U, s), V.T))
            # use just kl_div here because already closed
            kl_clr = entropy(closure(basetmp_sub).T, clr_res.T).mean()
            results[(rank_, power_, depth_, 'rclr', 'KL-Div')] = [kl_clr]

            # test KL without rclr
            X_spn = np.array(subtmp_sub.copy()).astype(float)
            X_spn[X_spn == 0] = np.nan
            U_, s_, V_ = OptSpace(iteration=1000).fit_transform(X_spn)
            res_raw = np.dot(np.dot(U_, s_), V_.T)
            res_raw[res_raw <= 0] = 1
            kl_raw = entropy(closure(basetmp_sub).T, closure(res_raw).T).mean()
            results[(rank_, power_, depth_, 'Raw Counts', 'KL-Div')] = [kl_raw]

            # f-stat
            resfclr = permanova(DistanceMatrix(distance.cdist(U, U)),
                                meta['group'])['test statistic']
            rawfres = permanova(DistanceMatrix(distance.cdist(U_, U_)),
                                meta['group'])['test statistic']
            results[(rank_, power_, depth_, 'rclr', 'F-Statistic')] = [resfclr]
            results[(rank_, power_, depth_, 'Raw Counts',
                     'F-Statistic')] = [rawfres]

            # KNN
            for U_tmp, method in zip([U, U_], ['rclr', 'Raw Counts']):
                pcoa_tmp = pcoa(DistanceMatrix(distance.cdist(U_tmp,
                                                              U_tmp))).samples
                pcoa_tmp.index = subtmp_sub.index
                # split
                X_train, X_test, y_train, y_test = train_test_split(
                    pcoa_tmp,
                    meta['group'].ravel(),
Ejemplo n.º 18
0
# PERMANOVA test statistic for every dataset, (fold, sample count) split
# and method. perm_res holds nested dicts; perm_res_tmp holds one DataFrame
# per split, which is concatenated per dataset into both_perm_res.
perm_res = {}
perm_res_tmp = {}
for dataset_, subs in distances.items():
    perm_res[dataset_] = {}
    perm_res_tmp[dataset_] = {}
    for (fold_, Nsamp_), methods_ in subs.items():
        meta_ = meta[dataset_][(fold_, Nsamp_)]['metadata']
        # Skip splits whose metadata has fewer rows than Nsamp_.
        if len(meta_.index) < Nsamp_:
            continue
        split_stats = perm_res[dataset_][(fold_, Nsamp_)] = {}
        perm_res_tmp[dataset_][(fold_, Nsamp_)] = {}
        for method, dist_tmp in methods_.items():
            method_stats = split_stats[method] = {}
            dist_tmp = DistanceMatrix(dist_tmp)
            factor_values = meta_[case_study[dataset_]['factor']].values
            perm_tmp = permanova(dist_tmp, factor_values)
            method_stats['test statistic'] = perm_tmp['test statistic']
            # Rebuilt after every method so it always reflects split_stats.
            perm_res_tmp[dataset_][(fold_, Nsamp_)] = pd.DataFrame(
                split_stats)

    both_perm_res[dataset_] = pd.concat(perm_res_tmp[dataset_])

# run classification
import warnings
warnings.simplefilter('ignore')  #for PCoA warnings
from skbio.stats.ordination import pcoa
from sklearn import metrics
from sklearn.cluster import KMeans
Ejemplo n.º 19
0
 def test_call_no_permutations(self):
     exp = pd.Series(index=self.exp_index,
                     data=['PERMANOVA', 'pseudo-F', 4, 2, 4.4, np.nan, 0])
     obs = permanova(self.dm_no_ties, self.grouping_equal, permutations=0)
     self.assert_series_equal(obs, exp)
Ejemplo n.º 20
0
        sample_id = each_sample_split[0]
        sample_group = each_sample_split[1]
        sample_id_list.append(sample_id)
        sample_group_list.append(sample_group)

# Load the tab-separated data table.
df = pd.read_csv(infile_data, sep='\t')

# One list of values per sample column, in sample_id_list order.
lol_data_in = [df[col_id].values.tolist() for col_id in sample_id_list]

# Pairwise distances between samples using the configured metric.
dist_arrary = pairwise_distances(
    lol_data_in, lol_data_in, metric=distance_metric)

# Attach the sample ids to the distance matrix.
dist_matrix = DistanceMatrix(dist_arrary, sample_id_list)

# ANOSIM test, followed by a blank line.
anosim_test = anosim(dist_matrix, sample_group_list, permutations=999)
print(anosim_test)
print()

# PERMANOVA test.
permanova_test = permanova(
    dist_matrix, sample_group_list, permutations=999)
print(permanova_test)
Ejemplo n.º 21
0
        text=
        "ATTENTION: At least 1 of your eigenvalues is negative, potentially leading to problems! You may want to choose another metric for distance calculation or apply data transformation on the distance matrix (e.g. square root) to get rid of this problem."
    )

# Eigenvalue table: raw eigenvalue, explained proportion and its running sum.
explained = pc.proportion_explained
eig_dm = pd.DataFrame(pc.eigvals, columns=["Eigenvalue"])
eig_dm["Explained"] = explained
eig_dm["Summed_explanation"] = explained.cumsum()

# Minkowski runs also carry the exponent p in their output file names.
file_tag = (mname + "_p" + str(p)) if metric == "minkowski" else mname
eig_dm.to_csv("eigenvalues_" + file_tag + ".txt", sep="\t")

# Statistics

anos = anosim(div, map_DF, column=var, permutations=999)
perm = permanova(div, map_DF, column=var, permutations=999)

stat_file = "statistics_" + file_tag + "_" + var + ".txt"

with open(stat_file, "w") as st:
    st.writelines([
        "ANOSIM\tPermutations: 999\n\n",
        "R\t" + str(anos["test statistic"]) + "\n",
        "p-value\t" + str(anos["p-value"]) + "\n\n",
        "PERMANOVA\tPermutations: 999\n\n",
        "F\t" + str(perm["test statistic"]) + "\n",
        "p-value\t" + str(perm["p-value"]) + "\n\n",
    ])

end = time.time()
Ejemplo n.º 22
0
 def test_call_no_permutations(self):
     exp = pd.Series(
         index=self.exp_index, data=["PERMANOVA", "pseudo-F", 4, 2, 4.4, np.nan, 0], name="PERMANOVA results"
     )
     obs = permanova(self.dm_no_ties, self.grouping_equal, permutations=0)
     self.assert_series_equal(obs, exp)
Ejemplo n.º 23
0
# Turn every column of `rows` except column 0 into one sample, skipping
# row 0 (presumably header row and label column; confirm against the
# code that fills `rows`).
for a in range(1, len(rows[0])):
    this_sample = [float(rows[b][a]) for b in range(1, len(rows))]
    samples.append(this_sample)
"""
only_samples = ['LR', 'SR']
new_samples, new_names = [], []
for a in range(len(sample_names)):
    for b in range(len(only_samples)):
        if sample_names[a] == only_samples[b]:
            new_samples.append(samples[a])
            new_names.append(sample_names[a])
samples = new_samples
sample_names = new_names
print(len(samples), len(sample_names))
"""

# Bray-Curtis distance matrix over the samples, then three group tests
# that each use sample_names as grouping and 999 permutations.
sam_dm = dm.from_iterable(samples, metric=braycurtis)

pdisp = permdisp(
    sam_dm, sample_names, column=None, test='median', permutations=999)
print(pdisp)

asim = anosim(sam_dm, sample_names, column=None, permutations=999)
print(asim)

perm = permanova(sam_dm, sample_names, column=None, permutations=999)
print(perm)