Example 1
    def test_spearmanr_full(self):
        """
        Compare the optimized version of spearman mantel
        with the naive loop implementation
        """
        x = DistanceMatrix.read(get_data_path('dm2.txt'))
        y = DistanceMatrix.read(get_data_path('dm3.txt'))

        num_perms = 12

        np.random.seed(0)
        orig_stat_fast, permuted_stats_fast = _mantel_stats_spearman(
            x, y, num_perms)

        # compute the traditional way
        np.random.seed(0)
        x_flat = x.condensed_form()
        y_flat = y.condensed_form()

        orig_stat = spearmanr(x_flat, y_flat)[0]

        perm_gen = (spearmanr(x.permute(condensed=True), y_flat)[0]
                    for _ in range(num_perms))
        permuted_stats = np.fromiter(perm_gen, float, count=num_perms)

        self.assertAlmostEqual(orig_stat_fast, orig_stat)
        for i in range(num_perms):
            self.assertAlmostEqual(permuted_stats_fast[i], permuted_stats[i])
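For reference, the permutation p-value that accompanies these statistics is
conventionally the fraction of permuted statistics at least as extreme as the
observed one; a minimal sketch of that standard computation (the helper name
is illustrative, not part of the tested code):

import numpy as np

def mantel_p_value(orig_stat, permuted_stats):
    # The +1 in numerator and denominator counts the unpermuted
    # arrangement itself, so the p-value can never be exactly zero.
    permuted_stats = np.asarray(permuted_stats)
    k = np.sum(permuted_stats >= orig_stat)
    return (k + 1) / (permuted_stats.size + 1)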
Example 2
    def test_fsvd(self):
        dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        dm2 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        dm3 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))

        # Test eigh vs. fsvd pcoa and inplace parameter
        expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
                                inplace=False)

        results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                       inplace=False)

        results_inplace = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                               inplace=True)

        assert_ordination_results_equal(results, expected_results,
                                        ignore_directionality=True,
                                        ignore_method_names=True)

        assert_ordination_results_equal(results, results_inplace,
                                        ignore_directionality=True,
                                        ignore_method_names=True)

        # Test number_of_dimensions edge cases
        results2 = pcoa(dm3, method="fsvd", number_of_dimensions=0,
                        inplace=False)
        expected_results2 = pcoa(dm3, method="fsvd",
                                 number_of_dimensions=dm3.data.shape[0],
                                 inplace=False)

        assert_ordination_results_equal(results2, expected_results2,
                                        ignore_directionality=True,
                                        ignore_method_names=True)

        with self.assertRaises(ValueError):
            dim_too_large = dm1.data.shape[0] + 10
            pcoa(dm2, method="fsvd", number_of_dimensions=dim_too_large)

        with self.assertRaises(ValueError):
            pcoa(dm2, method="fsvd", number_of_dimensions=-1)

        with self.assertRaises(ValueError):
            dim_too_large = dm1.data.shape[0] + 10
            pcoa(dm2, method="eigh", number_of_dimensions=dim_too_large)

        with self.assertRaises(ValueError):
            pcoa(dm2, method="eigh", number_of_dimensions=-1)

        dm_big = DistanceMatrix.read(get_data_path('PCoA_sample_data_12dim'))
        with self.assertWarnsRegex(RuntimeWarning,
                                   "no value for number_of_dimensions"):
            pcoa(dm_big, method="fsvd", number_of_dimensions=0)
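Outside the tests, the same pcoa API applies to any DistanceMatrix; a minimal
usage sketch (the toy matrix and ids below are made up for illustration):

import numpy as np
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa

data = np.array([[0.0, 0.5, 1.0],
                 [0.5, 0.0, 0.75],
                 [1.0, 0.75, 0.0]])
dm = DistanceMatrix(data, ids=['a', 'b', 'c'])

# fsvd is the approximate, faster solver exercised above; requesting fewer
# dimensions than samples is where it pays off.
ordination = pcoa(dm, method='fsvd', number_of_dimensions=2, inplace=False)
print(ordination.proportion_explained)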
Example 3
    def test_fsvd_inplace(self):
        dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        dm2 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))

        expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
                                inplace=True)

        results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                       inplace=True)

        assert_ordination_results_equal(results, expected_results,
                                        ignore_directionality=True,
                                        ignore_method_names=True)
Example 4
def distmat_corr(truthfile, distfile, reps=3, corrstat=spearman):
    '''Returns correlation between condensed distance matrices, using corrstat'''
    distmat = DistanceMatrix.read(distfile)
    truthmat = DistanceMatrix.read(truthfile)
    truthmat = sample_matrix_to_runs(truthmat, reps)

    ids = list(sorted(distmat.ids))
    t_ids = list(sorted(truthmat.ids))
    assert ids == t_ids, (ids, t_ids)

    dist = distmat.filter(ids).condensed_form()
    truth = truthmat.filter(ids).condensed_form()
    return corrstat(truth, dist)
Example 6
    def test_confirm_betadispr_results(self):
        mp_dm = DistanceMatrix.read(get_data_path('moving_pictures_dm.tsv'))
        mp_mf = pd.read_csv(get_data_path('moving_pictures_mf.tsv'), sep='\t')
        mp_mf.set_index('#SampleID', inplace=True)

        obs_med_mp = permdisp(mp_dm, mp_mf,
                              column='BodySite')
        obs_cen_mp = permdisp(mp_dm, mp_mf, column='BodySite',
                              test='centroid')

        exp_data_m = ['PERMDISP', 'F-value', 33, 4, 10.1956, 0.001, 999]
        exp_data_c = ['PERMDISP', 'F-value', 33, 4, 17.4242, 0.001, 999]
        exp_ind = ['method name', 'test statistic name', 'sample size',
                   'number of groups', 'test statistic', 'p-value',
                   'number of permutations']

        exp_med_mp = pd.Series(data=exp_data_m, index=exp_ind, dtype='object',
                               name='PERMDISP results')

        exp_cen_mp = pd.Series(data=exp_data_c, index=exp_ind, dtype='object',
                               name='PERMDISP results')

        self.assert_series_equal(exp_med_mp, obs_med_mp)

        self.assert_series_equal(exp_cen_mp, obs_cen_mp)
Example 7
    def test_simple(self):
        eigvals = [
            0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
            0.16054235, 0.15017696, 0.12245775, 0.0
        ]
        proportion_explained = [
            0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
            0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0
        ]
        sample_ids = [
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
            'PC.355', 'PC.607', 'PC.634'
        ]
        axis_labels = ['PC%d' % i for i in range(1, 10)]

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigvals, index=axis_labels),
            samples=pd.DataFrame(np.loadtxt(
                get_data_path('exp_PCoAEigenResults_site')),
                                 index=sample_ids,
                                 columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))

        dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        results = pcoa(dm)

        assert_ordination_results_equal(results,
                                        expected_results,
                                        ignore_directionality=True)
Example 8
    def test_simple(self):
        eigvals = [0.51236726, 0.30071909, 0.26791207, 0.20898868,
                   0.19169895, 0.16054235,  0.15017696,  0.12245775,
                   0.0]
        proportion_explained = [0.2675738328, 0.157044696, 0.1399118638,
                                0.1091402725, 0.1001110485,
                                0.0838401162, 0.0784269939,
                                0.0639511764, 0.0]
        sample_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                      'PC.593', 'PC.355', 'PC.607', 'PC.634']
        axis_labels = ['PC%d' % i for i in range(1, 10)]

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigvals, index=axis_labels),
            samples=pd.DataFrame(
                np.loadtxt(get_data_path('exp_PCoAEigenResults_site')),
                index=sample_ids, columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))

        dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        results = pcoa(dm)

        assert_ordination_results_equal(results, expected_results,
                                        ignore_directionality=True)
Example 9
    def setUp(self):
        # Crawford dataset for unweighted UniFrac
        fp = get_data_path('PCoA_sample_data_3')
        self.ordination = pcoa(DistanceMatrix.read(fp))

        fp = get_data_path('PCoA_biplot_descriptors')
        self.descriptors = pd.read_table(fp, index_col='Taxon').T
Example 10
def effect_size(mappings, alphas, betas, output, jobs, permutations,
                overwrite, na_values):
    # Since we can have multiple mapping, alpha, or beta files, we construct
    # an mfs dictionary with all the dataframes. Additionally, we load the
    # data_dictionary.csv file so we can use it to process the data.
    mappings = {f: pd.read_csv(f, sep='\t', dtype=str, na_values=na_values)
                for f in mappings}
    for m, mf in mappings.items():
        mappings[m].set_index('#SampleID', inplace=True)
    if betas:
        betas = {f: DistanceMatrix.read(f) for f in betas}

        with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
            par(joblib.delayed(
                _process_column)(bf, c, fname, finfo, alphas, betas,
                                 permutations)
                for bf, c, fname, finfo in _generate_betas(
                betas, mappings, permutations, output, overwrite))
    else:
        alphas = {f: pd.read_csv(f, sep='\t', dtype=str, na_values=na_values)
                  for f in alphas}
        for a, af in alphas.items():
            alphas[a].set_index('#SampleID', inplace=True)

        for af, c, fname, finfo in _generate_alphas(alphas, mappings,
                                                    output, overwrite):
            _process_column(af, c, fname, finfo, alphas, betas, permutations)
Example 11
def effect_size(mappings, alphas, betas, output, jobs, permutations, overwrite,
                na_values):
    # Since we can have multiple mapping, alpha, or beta files, we construct
    # an mfs dictionary with all the dataframes. Additionally, we load the
    # data_dictionary.csv file so we can use it to process the data.
    mappings = {
        f: pd.read_csv(f, sep='\t', dtype=str, na_values=na_values)
        for f in mappings
    }
    for m, mf in mappings.items():
        mappings[m].set_index('#SampleID', inplace=True)
    if betas:
        betas = {f: DistanceMatrix.read(f) for f in betas}

        with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
            par(
                joblib.delayed(_process_column)(bf, c, fname, finfo, alphas,
                                                betas, permutations)
                for bf, c, fname, finfo in _generate_betas(
                    betas, mappings, permutations, output, overwrite))
    else:
        alphas = {
            f: pd.read_csv(f, sep='\t', dtype=str, na_values=na_values)
            for f in alphas
        }
        for a, af in alphas.items():
            alphas[a].set_index('#SampleID', inplace=True)

        for af, c, fname, finfo in _generate_alphas(alphas, mappings, output,
                                                    overwrite):
            _process_column(af, c, fname, finfo, alphas, betas, permutations)
Example 14
def get_spearmans(distfile, truth):
    distmat = DistanceMatrix.read(distfile)
    ids = list(sorted(distmat.ids))
    distmat = distmat.filter(ids)
    dist = distmat.condensed_form()
    truth = truth.condensed_form()
    sp = stats.spearmanr(truth, dist)
    return sp.correlation
Example 16
    def setup(self):
        dist_matrix = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        self.ordination = PCoA(dist_matrix)

        self.ids = [
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
            'PC.355', 'PC.607', 'PC.634'
        ]
Example 17
    def test_unweighted_unifrac_qiime_tiny_test(self):
        dm_fp = get_data_path(
            os.path.join('qiime-191-tt', 'unweighted_unifrac_dm.txt'), 'data')
        expected = DistanceMatrix.read(dm_fp)
        for sid1 in self.q_table.columns:
            for sid2 in self.q_table.columns:
                actual = unweighted_unifrac(
                    self.q_table[sid1], self.q_table[sid2],
                    otu_ids=self.q_table.index, tree=self.q_tree)
                self.assertAlmostEqual(actual, expected[sid1, sid2])
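The pairwise loop above can also be done in a single call with skbio's
beta_diversity; a minimal sketch with toy stand-ins for the test fixtures:

from io import StringIO
from skbio import TreeNode
from skbio.diversity import beta_diversity

# Toy rooted tree and counts table (rows are samples, columns are OTUs).
tree = TreeNode.read(StringIO('((O1:0.25,O2:0.50):0.25,O3:0.75)root;'))
counts = [[1, 0, 2],   # S1
          [0, 3, 1]]   # S2
dm = beta_diversity('unweighted_unifrac', counts, ids=['S1', 'S2'],
                    tree=tree, otu_ids=['O1', 'O2', 'O3'])
print(dm['S1', 'S2'])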
Example 18
    def test_permutted(self):
        dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        # this should not throw
        pcoa(dm1, method="fsvd", number_of_dimensions=3, inplace=False)

        # some operations, like permute, will change memory structure
        # we want to test that this does not break pcoa
        dm2 = dm1.permute()
        # we just want to ensure it does not throw
        pcoa(dm2, method="fsvd", number_of_dimensions=3, inplace=False)
Example 19
    def test_io(self):
        # Very basic check that read/write public API is present and appears
        # to be functioning. Roundtrip from memory -> disk -> memory and
        # ensure results match.
        fh = StringIO()
        self.dm_3x3.write(fh)
        fh.seek(0)
        deserialized = DistanceMatrix.read(fh)
        self.assertEqual(deserialized, self.dm_3x3)
        self.assertTrue(type(deserialized) == DistanceMatrix)
Example 20
    def test_varmat_larg(self):
        np.random.seed(123)
        D = 50
        N = 100
        mean = np.ones(D) * 10
        cov = np.eye(D)
        n__ = np.random.multivariate_normal(mean, cov, size=N)
        X = pd.DataFrame(np.abs(n__), columns=np.arange(D).astype(str))
        res = variation_matrix(X)

        exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
        self.assertEqual(str(res), str(exp))
Example 21
    def test_perm_pearsonr_full(self):
        x = DistanceMatrix.read(get_data_path('dm2.txt'))
        y = DistanceMatrix.read(get_data_path('dm3.txt'))
        x_data = x._data
        y_data = y._data
        x_flat = squareform(x_data, force='tovector', checks=False)
        y_flat = squareform(y_data, force='tovector', checks=False)

        xmean = x_flat.mean()
        ymean = y_flat.mean()

        xm = x_flat - xmean
        ym = y_flat - ymean

        normxm_la = scipy.linalg.norm(xm)
        normym_la = scipy.linalg.norm(ym)

        normxm = np.linalg.norm(xm)
        normym = np.linalg.norm(ym)

        self.assertAlmostEqual(normxm, normxm_la)
        self.assertAlmostEqual(normym, normym_la)

        perm_order = np.asarray(
            [[0, 2, 3, 4, 1, 5], [4, 3, 2, 5, 0, 1], [2, 5, 3, 1, 0, 4],
             [3, 5, 4, 1, 2, 0], [4, 3, 5, 2, 0, 1], [4, 5, 1, 2, 0, 3],
             [3, 5, 1, 0, 4, 2], [4, 5, 3, 1, 2, 0], [2, 1, 5, 4, 0, 3],
             [4, 1, 0, 5, 2, 3], [1, 2, 5, 4, 0, 3], [5, 4, 0, 1, 3, 2],
             [3, 0, 1, 5, 4, 2], [5, 0, 2, 3, 1, 4]],
            dtype=int)

        ym_normalized = ym / normym

        permuted_stats = np.empty(len(perm_order), dtype=x_data.dtype)
        mantel_perm_pearsonr_cy(x_data, perm_order, xmean, normxm,
                                ym_normalized, permuted_stats)
        for i in range(len(perm_order)):
            exp_res = self._compute_perf_one(x_data, perm_order[i, :], xmean,
                                             normxm, ym_normalized)
            self.assertAlmostEqual(permuted_stats[i], exp_res)
Example 22
    def test_varmat_larg(self):
        np.random.seed(123)
        D = 50
        N = 100
        mean = np.ones(D) * 10
        cov = np.eye(D)
        X = pd.DataFrame(np.abs(np.random.multivariate_normal(mean, cov,
                                                               size=N)),
                         columns=np.arange(D).astype(str))
        res = variation_matrix(X)

        exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
        self.assertEqual(str(res), str(exp))
Example 23
def effect_size(mappings, alphas, betas, output, jobs, permutations,
                alpha_method):
    if not mappings:
        raise ValueError("You need to pass a mappings")
    if not alphas and not betas:
        raise ValueError("You need to pass either alphas or betas")
    if alphas and betas:
        raise ValueError("You can't pass both alphas and betas")
    if output is None:
        raise ValueError("You need to pass a output")

    if not isdir(output):
        mkdir(output)

    # Since we can have multiple mapping, alpha, or beta files, we construct
    # an mfs dictionary with all the dataframes. Additionally, we load the
    # data_dictionary.csv file so we can use it to process the data.
    mappings = {
        f: pd.read_csv(f, sep='\t', dtype=str, na_values=NA_VALUES)
        for f in mappings
    }
    for m, mf in mappings.items():
        mappings[m].set_index('#SampleID', inplace=True)
    if betas:
        betas = {f: DistanceMatrix.read(f) for f in betas}
        print('maps: %d, betas: %d, cols: %s' %
              (len(mappings), len(betas),
               [len(m.columns.values) for _, m in mappings.items()]))

        with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
            par(
                joblib.delayed(_process_column)(bf, c, fname, method)
                for bf, c, fname, method in _generate_betas(
                    betas, mappings, permutations, output))
    else:
        alphas = {
            f: pd.read_csv(f, sep='\t', dtype=str, na_values=NA_VALUES)
            for f in alphas
        }
        for a, af in alphas.items():
            alphas[a].set_index('#SampleID', inplace=True)

        for af, c, fname, method in _generate_alphas(alphas, mappings, output,
                                                     alpha_method):
            _process_column(af, c, fname, method)
Example 24
def _validate_distance_matrix(files, metadata, out_dir):
    """Validates a distance matrix artifact"""
    # Magic number [0] -> there is only one plain text file which is
    # the distance matrix
    dm_fp = files['plain_text'][0]
    dm = DistanceMatrix.read(dm_fp)

    # Get the ids of the distance matrix and the metadata
    dm_ids = set(dm.ids)
    metadata_ids = set(metadata)

    if not metadata_ids.issuperset(dm_ids):
        return (False, None, "The distance matrix contain samples not "
                "present in the metadata")

    filepaths = [(dm_fp, 'plain_text')]

    return True, [ArtifactInfo(None, 'distance_matrix', filepaths)], ""
Example 25
    def compute_beta_diversity(self):
        """Compute and cache beta diversity values

        This method calculates a beta diversity distance matrix and saves it
        to a folder for re-use. The matrices are calculated based on the full
        dataset so that any subsample drawn from the full dataset can be
        fetched from these precomputed matrices.

        See Also
        --------
        Sculptor.compute_alpha_diversity
        """
        dir_fp = 'roc-curves/%s/cached-matrices/' % self.name
        os.makedirs(dir_fp, exist_ok=True)

        X = self._original_bt.matrix_data.toarray().astype(int).T

        self._beta_diversity_matrices = {}

        for metric in self._beta_metrics:
            fp = os.path.join(dir_fp, metric + '.full.txt')

            if os.path.exists(fp):
                distance_matrix = DistanceMatrix.read(fp)
            else:
                if metric in {'unweighted_unifrac', 'weighted_unifrac'}:
                    kws = {
                        'tree': self.tree,
                        'otu_ids': self._original_bt.ids('observation')
                    }
                else:
                    kws = {}

                distance_matrix = beta_diversity(metric, X,
                                                 self._original_bt.ids(),
                                                 **kws)

                distance_matrix.write(fp)

            self._beta_diversity_matrices[metric] = distance_matrix
Example 26
def _generate_distance_matrix_summary(files, metadata, out_dir):
    # Magic number [0] -> there is only one plain text file which is
    # the distance matrix
    dm = DistanceMatrix.read(files['plain_text'][0])
    data = dm.condensed_form()

    # Generate a heatmap with the distance matrix
    # The sorting in the heatmap is going to be based on hierarchical
    # clustering.
    tree = TreeNode.from_linkage_matrix(linkage(data, method='average'),
                                        id_list=dm.ids)
    ids = list(dm.ids)
    order = [ids.index(n.name) for n in tree.tips()]

    # Plotting code adapted from skbio's DistanceMatrix.plot()
    fig, ax = plt.subplots()
    heatmap = ax.pcolormesh(dm.data[order][:, order])
    fig.colorbar(heatmap)
    ax.invert_yaxis()
    ax.set_title('Distance Matrix - hierarchical clustering')
    ax.tick_params(axis='both',
                   which='both',
                   bottom=False,
                   top=False,
                   left=False,
                   right=False,
                   labelbottom=False,
                   labelleft=False)

    sc_plot = BytesIO()
    fig.savefig(sc_plot, format='png')
    sc_plot.seek(0)
    uri = 'data:image/png;base64,' + quote(b64encode(sc_plot.getbuffer()))

    html_summary_fp = join(out_dir, 'index.html')
    with open(html_summary_fp, 'w') as f:
        f.write(DM_HTML % (dm.shape[0], data.min(), data.max(), data.mean(),
                           np.median(data), uri))

    return html_summary_fp, None
Example 27
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 20 14:18:58 2016

@author: virginiasaulnier
"""
from io import StringIO
from skbio import DistanceMatrix

dm_fh = StringIO("\ta\tb\tc\n"
                 "a\t0.0\t0.5\t1.0\n"
                 "b\t0.5\t0.0\t0.75\n"
                 "c\t1.0\t0.75\t0.0\n")

dm = DistanceMatrix.read(dm_fh)
print(dm)

# from_iterable needs a two-argument distance function; fisher_alpha (an
# alpha diversity index) is not one, so count mismatched positions instead.
def mismatches(s1, s2):
    return sum(a != b for a, b in zip(s1, s2))

my_pairs = ['ac', 'gt', 'cg', 'gc', 'at', 'ta', 'gc', 'ta', 'tg']
dm2 = DistanceMatrix.from_iterable(my_pairs, metric=mismatches)
print(dm2)
Example 28
    def setup(self):
        dist_matrix = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        self.ordination = PCoA(dist_matrix)

        self.ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
                    'PC.355', 'PC.607', 'PC.634']
Example 29
    def test_hamming_distance_matrix(self):
        msa = parse_msa_file(self.input_a3m_fp)
        obs = hamming_distance_matrix(msa)
        exp = DistanceMatrix.read(self.hamming_dm_fp)
        self.assertEqual(obs, exp)
Example 30
        all_param_grids = [defaults["LinearSVR"]]


# ## Main benchmarking loop

for reg_idx, (reg, name,
              grid) in enumerate(zip(regressors, names, all_param_grids)):

    if run_defaults:
        print("Running default parameters for " + name)

    # Boolean switch for distance-matrix specific code blocks
    is_distmatrix = name in dm_set
    if is_distmatrix:
        # Use specific X and y for distance matrix benchmarking, not the
        # amplicon experiment object.
        if name == "jensenshannon":
            md = exp.sample_metadata
            existing_dm = DistanceMatrix.read(
                dir_prefix + "/beta-q2/aitchison.txt")
            print("Computing Jensen-Shannon Distance Matrix")
            dm = DistanceMatrix(
                data=distance.pdist(exp.data.todense(),
                                    metric="jensenshannon"),
                ids=existing_dm.ids)
        else:
            dm = DistanceMatrix.read(dir_prefix + "/beta-q2/" + name + ".txt")

        md = exp.sample_metadata
        md = md.filter(dm.ids, axis='index')
        dm = dm.filter(md.index, strict=True)

        X_dist = dm.data
        y_dist = md[target]

    # Make directory for this regressor if it does not yet exist
    dir_name = dir_prefix + '/' + dir_prefix + '-' + name
    print(dir_name)
Example 31
degapped_alignment_fn = "onekp_only_angios_degapped/{}.onlyangios.noshort.fasta".format(
    gene)

# Read in the sequence alignment and trim it three ways: remove
# non-angiosperms, remove gappy sites, remove gappy sequences.

if os.path.isfile(degapped_alignment_fn):
    angio_msa_nogap_noshort = TabularMSA.read(degapped_alignment_fn,
                                              constructor=DNA)
    sys.stderr.write("Read in degapped alignment: {}\n".format(
        angio_msa_nogap_noshort.shape))
else:
    angio_msa_nogap_noshort = get_reduced_alignment(
        "genes/{}/FNA2AA-upp-masked.fasta".format(gene), angio_1kp_ids)

if os.path.isfile(distance_matrix_fn):
    p_dm = DistanceMatrix.read(distance_matrix_fn)
    p_dm_df = p_dm.to_data_frame()
    sys.stderr.write("Read in pre-determined distance matrix!\n")
else:
    p_dm = DistanceMatrix.from_iterable(angio_msa_nogap_noshort,
                                        metric=p_distance,
                                        key="id")
    p_dm_df = p_dm.to_data_frame()
    p_dm_df.to_csv(
        "onekp_only_angios_pdistance/{}_angio_p_dm.csv".format(gene))

# Cluster sequences

divergent_seqs_medoids = []
runs = {}
best_run = len(p_dm_df)
Example 32
    def setUp(self):
        # The test dataset used here is a subset of the Lauber et al. 2009
        # "88 Soils" dataset. It has been altered to exercise various aspects
        # of the code, including (but not limited to):
        #
        # - order of distance matrix IDs and IDs in data frame (metadata) are
        #   not exactly the same
        # - data frame has an extra sample that is not in the distance matrix
        # - this extra sample has non-numeric and missing values in some of its
        #   cells
        #
        # Additional variations of the distance matrix and data frame are used
        # to test different orderings of rows/columns, extra non-numeric data
        # frame columns, etc.
        #
        # This dataset is also useful because it is non-trivial in size (6
        # samples, 11 environment variables) and it includes positive/negative
        # floats and integers in the data frame.
        self.dm = DistanceMatrix.read(get_data_path('dm.txt'))

        # Reordered rows and columns (i.e., different ID order). Still
        # conceptually the same distance matrix.
        self.dm_reordered = DistanceMatrix.read(
            get_data_path('dm_reordered.txt'))

        self.df = pd.read_csv(get_data_path('df.txt'), sep='\t', index_col=0)

        # Similar to the above data frame, except that it has an extra
        # non-numeric column, and some of the other rows and columns have been
        # reordered.
        self.df_extra_column = pd.read_csv(
            get_data_path('df_extra_column.txt'), sep='\t', index_col=0)

        # All columns in the original data frame (these are all numeric
        # columns).
        self.cols = self.df.columns.tolist()

        # This second dataset is derived from vegan::bioenv's example dataset
        # (varespec and varechem). The original dataset includes a site x
        # species table (e.g., OTU table) and a data frame of environmental
        # variables. Since the bioenv function defined here accepts a distance
        # matrix, we use a Bray-Curtis distance matrix that is derived from the
        # site x species table (this matches what is done by vegan::bioenv when
        # provided an OTU table, using their default distance measure). The
        # data frame only includes the numeric environmental variables we're
        # interested in for these tests: log(N), P, K, Ca, pH, Al
        self.dm_vegan = DistanceMatrix.read(
            get_data_path('bioenv_dm_vegan.txt'))
        self.df_vegan = pd.read_csv(get_data_path('bioenv_df_vegan.txt'),
                                    sep='\t',
                                    converters={0: str})
        self.df_vegan.set_index('#SampleID', inplace=True)

        # Load expected results.
        self.exp_results = pd.read_csv(get_data_path('exp_results.txt'),
                                       sep='\t',
                                       index_col=0)
        self.exp_results_single_column = pd.read_csv(
            get_data_path('exp_results_single_column.txt'),
            sep='\t',
            index_col=0)
        self.exp_results_different_column_order = pd.read_csv(
            get_data_path('exp_results_different_column_order.txt'),
            sep='\t',
            index_col=0)
        self.exp_results_vegan = pd.read_csv(
            get_data_path('bioenv_exp_results_vegan.txt'),
            sep='\t',
            index_col=0)
Example 33
# In[19]:

# ENSURE METADATA TARGET IS TYPE INT
exp.sample_metadata[target] = pd.to_numeric(exp.sample_metadata[target])

# In[ ]:

for reg_idx, (reg, name,
              grid) in enumerate(zip(regressors, names, all_param_grids)):

    # Boolean switch for distance-matrix specific code blocks
    is_distmatrix = name in dm_set

    if is_distmatrix:
        # Use specific X and y for distance matrix benchmarking, not the
        # amplicon experiment object.
        md = exp.sample_metadata
        dm = DistanceMatrix.read(distmatrix_fp[dataset] + name + '.txt')
        md = md.filter(dm.ids, axis='index')
        dm = dm.filter(md.index, strict=True)

        X_dist = dm.data
        y_dist = md[target]

    if (name == "PLSRegressor"):
        md = exp.sample_metadata
        X_dist = exp.data.toarray()
        y_dist = md[target]

    # Make directory for this regressor if it does not yet exist
    dir_name = dir_prefix + '/' + dir_prefix + '-' + name
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name, mode=0o755)
Example 34
    def setUp(self):
        # The test dataset used here is a subset of the Lauber et al. 2009
        # "88 Soils" dataset. It has been altered to exercise various aspects
        # of the code, including (but not limited to):
        #
        # - order of distance matrix IDs and IDs in data frame (metadata) are
        #   not exactly the same
        # - data frame has an extra sample that is not in the distance matrix
        # - this extra sample has non-numeric and missing values in some of its
        #   cells
        #
        # Additional variations of the distance matrix and data frame are used
        # to test different orderings of rows/columns, extra non-numeric data
        # frame columns, etc.
        #
        # This dataset is also useful because it is non-trivial in size (6
        # samples, 11 environment variables) and it includes positive/negative
        # floats and integers in the data frame.
        self.dm = DistanceMatrix.read(get_data_path('dm.txt'))

        # Reordered rows and columns (i.e., different ID order). Still
        # conceptually the same distance matrix.
        self.dm_reordered = DistanceMatrix.read(
            get_data_path('dm_reordered.txt'))

        self.df = pd.read_csv(get_data_path('df.txt'), sep='\t', index_col=0)

        # Similar to the above data frame, except that it has an extra
        # non-numeric column, and some of the other rows and columns have been
        # reordered.
        self.df_extra_column = pd.read_csv(
            get_data_path('df_extra_column.txt'), sep='\t', index_col=0)

        # All columns in the original data frame (these are all numeric
        # columns).
        self.cols = self.df.columns.tolist()

        # This second dataset is derived from vegan::bioenv's example dataset
        # (varespec and varechem). The original dataset includes a site x
        # species table (e.g., OTU table) and a data frame of environmental
        # variables. Since the bioenv function defined here accepts a distance
        # matrix, we use a Bray-Curtis distance matrix that is derived from the
        # site x species table (this matches what is done by vegan::bioenv when
        # provided an OTU table, using their default distance measure). The
        # data frame only includes the numeric environmental variables we're
        # interested in for these tests: log(N), P, K, Ca, pH, Al
        self.dm_vegan = DistanceMatrix.read(
            get_data_path('bioenv_dm_vegan.txt'))
        self.df_vegan = pd.read_csv(
            get_data_path('bioenv_df_vegan.txt'), sep='\t',
            converters={0: str})
        self.df_vegan.set_index('#SampleID', inplace=True)

        # Load expected results.
        self.exp_results = pd.read_csv(get_data_path('exp_results.txt'),
                                       sep='\t', index_col=0)
        self.exp_results_single_column = pd.read_csv(
            get_data_path('exp_results_single_column.txt'), sep='\t',
            index_col=0)
        self.exp_results_different_column_order = pd.read_csv(
            get_data_path('exp_results_different_column_order.txt'), sep='\t',
            index_col=0)
        self.exp_results_vegan = pd.read_csv(
            get_data_path('bioenv_exp_results_vegan.txt'), sep='\t',
            index_col=0)
Example 35
                  index=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                         '10', '11'])

md_dup = pd.DataFrame([(1, 'a', 0.11, 1), (1, 'a', 0.12, 2), (1, 'a', 0.13, 2),
                       (2, 'a', 0.19, 1), (2, 'a', 0.18, 2), (2, 'a', 0.21, 3),
                       (1, 'b', 0.14, 4), (1, 'b', 0.13, 5), (1, 'b', 0.14, 6),
                       (2, 'b', 0.26, 4), (2, 'b', 0.27, 5), (2, 'b', 0.29, 6)
                       ],
                      columns=['Time', 'Group', 'Value', 'ind'],
                      index=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                             '10', '11'])

dm = DistanceMatrix.read(StringIO(
    "\t0\t1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\n"
    "0\t0.0\t0.3\t1.0\t0.1\t0.1\t0.3\t0.4\t0.5\t0.6\t0.1\t0.2\t0.3\n"
    "1\t0.3\t0.0\t0.9\t0.2\t0.1\t0.4\t0.2\t0.6\t0.5\t0.2\t0.3\t0.4\n"
    "2\t1.0\t0.9\t0.0\t0.3\t0.1\t0.3\t0.3\t0.6\t0.6\t0.3\t0.3\t0.4\n"
    "3\t0.1\t0.2\t0.3\t0.0\t0.2\t0.3\t0.2\t0.5\t0.4\t0.4\t0.2\t0.3\n"
    "4\t0.1\t0.1\t0.1\t0.2\t0.0\t0.4\t0.3\t0.4\t0.7\t0.1\t0.5\t0.3\n"
    "5\t0.3\t0.4\t0.3\t0.3\t0.4\t0.0\t0.4\t0.3\t0.6\t0.2\t0.4\t0.2\n"
    "6\t0.4\t0.2\t0.3\t0.2\t0.3\t0.4\t0.0\t0.5\t0.9\t0.1\t0.3\t0.1\n"
    "7\t0.5\t0.6\t0.6\t0.5\t0.4\t0.3\t0.5\t0.0\t0.8\t0.1\t0.2\t0.3\n"
    "8\t0.6\t0.5\t0.6\t0.4\t0.7\t0.6\t0.9\t0.8\t0.0\t0.3\t0.5\t0.4\n"
    "9\t0.1\t0.2\t0.3\t0.4\t0.1\t0.2\t0.1\t0.1\t0.3\t0.0\t0.4\t0.5\n"
    "10\t0.2\t0.3\t0.3\t0.2\t0.5\t0.4\t0.3\t0.2\t0.5\t0.4\t0.0\t0.6\n"
    "11\t0.3\t0.4\t0.4\t0.3\t0.3\t0.2\t0.1\t0.3\t0.4\t0.5\t0.6\t0.0\n"
    ))

groups = {'a': [1, 2, 3, 2, 3, 1.5, 2.5, 2.7, 3, 2, 1, 1.5],
          'b': [3, 4, 5, 4.3, 3.4, 3.2, 3, 4.3, 4.9, 5, 3.2, 3.6]}
Example 37
    def setup(self):
        dist_matrix = DistanceMatrix.read(get_data_path("PCoA_sample_data_3"))
        self.ordination = PCoA(dist_matrix)

        self.ids = ["PC.636", "PC.635", "PC.356", "PC.481", "PC.354", "PC.593", "PC.355", "PC.607", "PC.634"]
Example 38
def load_sample_matrix_to_runs(samplematfile, reps=3):
    '''Loads a truth distance matrix between samples and expands to runs'''
    samples = DistanceMatrix.read(samplematfile)
    return sample_matrix_to_runs(samples, reps)