def test_from_file_error(self):
        """Check that malformed ordination files are rejected by ``from_file``.

        Every path in ``self.fferror_test_paths`` must make
        ``OrdinationResults.from_file`` raise ``FileFormatError``; every path
        in ``self.verror_test_paths`` must make it raise ``ValueError``.
        """
        expectations = (
            (self.fferror_test_paths, FileFormatError),
            (self.verror_test_paths, ValueError),
        )
        for test_paths, expected_error in expectations:
            for test_path in test_paths:
                # The legacy 'U' mode flag is rejected by open() on Python
                # 3.11+ (it raises ValueError, which here would escape the
                # assert_raises block). Plain text mode already performs
                # universal-newline translation.
                with open(get_data_path(test_path)) as f:
                    with npt.assert_raises(expected_error):
                        OrdinationResults.from_file(f)
Example #2
0
    def test_from_file_error(self):
        """Check that malformed ordination files are rejected by ``from_file``.

        Every path in ``self.fferror_test_paths`` must make
        ``OrdinationResults.from_file`` raise ``FileFormatError``; every path
        in ``self.verror_test_paths`` must make it raise ``ValueError``.
        """
        expectations = (
            (self.fferror_test_paths, FileFormatError),
            (self.verror_test_paths, ValueError),
        )
        for test_paths, expected_error in expectations:
            for test_path in test_paths:
                # The legacy 'U' mode flag is rejected by open() on Python
                # 3.11+ (it raises ValueError, which here would escape the
                # assert_raises block). Plain text mode already performs
                # universal-newline translation.
                with open(get_data_path(test_path)) as f:
                    with npt.assert_raises(expected_error):
                        OrdinationResults.from_file(f)
    def test_from_file(self):
        """Yield equality checks for ``from_file`` given a handle and a path.

        For each (expected result, data file) pair, the file is parsed twice:
        once from an open file handle and once from its file name. Each
        parse yields ``(self.check_OrdinationResults_equal, observed,
        expected)``.
        """
        for expected, rel_path in zip(self.scores, self.test_paths):
            # Input kind 1: an already-open file-like object.
            path = get_data_path(rel_path)
            with open(path) as handle:
                observed = OrdinationResults.from_file(handle)
            yield self.check_OrdinationResults_equal, observed, expected

            # Input kind 2: the file name itself.
            path = get_data_path(rel_path)
            observed = OrdinationResults.from_file(path)
            yield self.check_OrdinationResults_equal, observed, expected
Example #4
0
    def test_from_file(self):
        """Yield equality checks for ``from_file`` given a handle and a path.

        For each (expected result, data file) pair, the file is parsed twice:
        once from an open file handle and once from its file name. Each
        parse yields ``(self.check_OrdinationResults_equal, observed,
        expected)``.
        """
        for expected, rel_path in zip(self.scores, self.test_paths):
            # Input kind 1: an already-open file-like object.
            path = get_data_path(rel_path)
            with open(path) as handle:
                observed = OrdinationResults.from_file(handle)
            yield self.check_OrdinationResults_equal, observed, expected

            # Input kind 2: the file name itself.
            path = get_data_path(rel_path)
            observed = OrdinationResults.from_file(path)
            yield self.check_OrdinationResults_equal, observed, expected
Example #5
0
    def test_assert_ordination_results_equal(self):
        """Exercise ``assert_ordination_results_equal`` on equal/unequal pairs.

        Covers identity, a type mismatch, nearly-equal eigenvalues, ID lists
        present on only one side, and each optional numeric attribute being
        one-sided, clearly different, or nearly equal.
        """
        base = OrdinationResults([1, 2])

        # an object always compares equal to itself
        assert_ordination_results_equal(base, base)

        # comparing against something that is not OrdinationResults fails
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(base, 'foo')

        # tiny numeric differences are tolerated
        near = OrdinationResults([1.0000001, 1.9999999])
        assert_ordination_results_equal(base, near)

        # an ID list set on one side only must be reported as a mismatch
        for id_attr in ('species_ids', 'site_ids'):
            setattr(near, id_attr, ['abc', 'def'])
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(base, near)
            setattr(near, id_attr, None)

        numeric_attrs = ('species', 'site', 'biplot', 'site_constraints',
                         'proportion_explained')
        for name in numeric_attrs:
            # attribute set on one side only
            setattr(near, name, [[1, 2], [3, 4]])
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(base, near)
            setattr(near, name, None)

            # attribute set on both sides: first with a difference that must
            # be rejected, then with one small enough to be accepted
            for perturbed, should_match in ((3.00002, False),
                                            (3.00000002, True)):
                setattr(base, name, [[1, 2], [3, 4]])
                setattr(near, name, [[1, 2], [perturbed, 4]])
                if should_match:
                    assert_ordination_results_equal(base, near)
                else:
                    with npt.assert_raises(AssertionError):
                        assert_ordination_results_equal(base, near)
                setattr(base, name, None)
                setattr(near, name, None)
Example #6
0
def parse_coords(lines):
    """Parse an ordination results file into labels, coords and eigen data.

    The input is first handed to ``OrdinationResults.read`` (see
    skbio.stats.ordination.OrdinationResults.read for the file format). If
    that raises ``FileFormatError`` the input is rewound when it supports
    ``seek`` and parsed with ``qiime_parse_coords`` instead.

    Returns a tuple of:
    - the ``site_ids`` of the parsed results (sample labels)
    - the ``site`` coordinates
    - the ``eigvals``
    - the ``proportion_explained``
    or, on fallback, whatever ``qiime_parse_coords`` returns.
    """
    try:
        ordination = OrdinationResults.read(lines)
        labels = ordination.site_ids
        coordinates = ordination.site
        eigenvalues = ordination.eigvals
        explained = ordination.proportion_explained
        return (labels, coordinates, eigenvalues, explained)
    except FileFormatError:
        # not in skbio's format; fall through to the QIIME parser below
        pass

    try:
        lines.seek(0)
    except AttributeError:
        # a plain list of lines has no seek(); nothing to rewind
        pass
    return qiime_parse_coords(lines)
Example #7
0
    def setUpClass(cls):
        """Create the shared PCoA result and metadata fixtures on the class."""
        axes = ['PC1', 'PC2', 'PC3']

        # Two samples described on three axes, indexed by sample ID.
        samples = pd.DataFrame.from_dict(
            {'s1': [0.1, 0.2, 7], 's2': [0.9, 0.2, 7]},
            orient='index',
            columns=axes,
        )
        samples.index.name = 'Sample ID'
        cls.test_df1 = samples

        eigenvalues = pd.Series([7, 2, 1], index=axes)
        proportions = pd.Series([0.7, 0.2, 0.1], index=axes)
        cls.pcoa1 = OrdinationResults(
            'pcoa1',
            'pcoa1',
            eigvals=eigenvalues,
            samples=samples,
            proportion_explained=proportions,
        )

        # Metadata covers s1/s2 plus three IDs absent from the ordination;
        # the last row is entirely missing values.
        metadata_columns = {
            'age_cat': ['30s', '40s', '50s', '30s', None],
            'num_cat': [7.24, 7.24, 8.25, 7.24, None],
            'other': [1, 2, 3, 4, None],
        }
        metadata_index = pd.Series(['s1', 's2', 'c', 'd', 'e'],
                                   name='#SampleID')
        cls.test_metadata = pd.DataFrame(metadata_columns,
                                         index=metadata_index)
Example #8
0
def community_plot(
    tree: str,
    table: str,
    sample_metadata: str,
    output_dir: str,
    pcoa: str,
    feature_metadata: str,
    ignore_missing_samples: bool,
    filter_extra_samples: bool,
    filter_missing_features: bool,
    number_of_pcoa_features: int,
    shear_to_table: bool,
) -> None:
    """Build an Empress visualization from files and save it to a directory.

    The tree and feature metadata are processed by
    ``check_and_process_files``, the table is loaded with ``load_table`` and
    the sample metadata is read as a tab-separated file whose first column
    is the index. When ``pcoa`` is not None it is read as
    ``OrdinationResults`` and passed through ``prepare_pcoa`` together with
    ``number_of_pcoa_features``. ``output_dir`` is created only after the
    visualization object has been built, and the result is written with
    ``save_viz(..., q2=False)``.
    """
    tree_newick, fm = check_and_process_files(output_dir, tree,
                                              feature_metadata)
    feature_table = load_table(table)
    metadata_frame = pd.read_csv(sample_metadata, sep="\t", index_col=0)

    # The ordination is optional; None is forwarded to Empress unchanged.
    ordination = pcoa
    if ordination is not None:
        ordination = prepare_pcoa(OrdinationResults.read(ordination),
                                  number_of_pcoa_features)

    empress_options = dict(
        table=feature_table,
        sample_metadata=metadata_frame,
        feature_metadata=fm,
        ordination=ordination,
        ignore_missing_samples=ignore_missing_samples,
        filter_extra_samples=filter_extra_samples,
        filter_missing_features=filter_missing_features,
        shear_to_table=shear_to_table,
    )
    viz = Empress(tree_newick, **empress_options)

    os.makedirs(output_dir)
    save_viz(viz, output_dir, q2=False)
Example #9
0
    def setUp(self):
        """Create a 5x5 distance matrix and a seeded 100-sample ordination."""
        distances = np.array([
            [0, 1, 2, 3, 4],
            [1, 0, 4, 5, 6],
            [2, 4, 0, 6, 7],
            [3, 5, 6, 0, 8],
            [4, 6, 7, 8, 0],
        ])
        dm_ids = [f'S{i}' for i in range(5)]
        self.test_dm = DistanceMatrix(distances, ids=dm_ids)

        n_samples = 100
        # Fixed seed so the random embedding is identical on every run.
        np.random.seed(825)
        coords = np.random.normal(size=(n_samples, 3)) + 2
        # Stretch the second and third axes so the columns differ in scale.
        for column, factor in ((1, 3), (2, 6)):
            coords[:, column] *= factor

        row_labels = [f'S{i}' for i in range(n_samples)]
        column_labels = [f'C{i}' for i in range(3)]
        sample_df = pd.DataFrame(coords, index=row_labels,
                                 columns=column_labels)

        eigenvalues = pd.Series(np.arange(n_samples))
        self.test_ord_results = OrdinationResults(
            'foo',
            'bar',
            eigvals=eigenvalues,
            samples=sample_df,
        )
Example #10
0
    def test_str(self):
        """Verify the ``str()`` summary for a populated and a minimal result."""
        # fixture with species/site coordinates and IDs populated
        populated_summary = ("Ordination results:\n"
                             "\tEigvals: 2\n"
                             "\tProportion explained: N/A\n"
                             "\tSpecies: 3x2\n"
                             "\tSite: 3x2\n"
                             "\tBiplot: N/A\n"
                             "\tSite constraints: N/A\n"
                             "\tSpecies IDs: 'Species1', 'Species2', 'Species3'\n"
                             "\tSite IDs: 'Site1', 'Site2', 'Site3'")
        self.assertEqual(str(self.ordination_results), populated_summary)

        # only eigenvalues given: every optional field is reported as N/A
        minimal_summary = ("Ordination results:\n"
                           "\tEigvals: 1\n"
                           "\tProportion explained: N/A\n"
                           "\tSpecies: N/A\n"
                           "\tSite: N/A\n"
                           "\tBiplot: N/A\n"
                           "\tSite constraints: N/A\n"
                           "\tSpecies IDs: N/A\n"
                           "\tSite IDs: N/A")
        minimal_results = OrdinationResults(np.array([4.2]))
        self.assertEqual(str(minimal_results), minimal_summary)
Example #11
0
    def test_get_procrustes_results(self):
        """Regression-check ``get_procrustes_results`` on identical inputs.

        The same coordinates are supplied as both inputs; both returned
        ordinations are compared with previously recorded values and the
        third returned element must be below 6e-30.
        """
        id_map = {
            'CP3A1': 'S1',
            'CC1A1': 'S2',
            'CC2A1': 'S3',
            'CP1A1': 'S4'
        }
        actual = get_procrustes_results(StringIO(pcoa1_f),
                                        StringIO(pcoa1_f),
                                        sample_id_map=id_map,
                                        randomize=None,
                                        max_dimensions=None)

        # Sanity values only: the individual components are tested
        # elsewhere. These numbers were taken from the output of a run and
        # guard against unintended changes.
        expected = OrdinationResults(
            eigvals=array([
                8976580.24393, 6044862.67619, 4372581.39431,
                3161360.10319, 2583594.45275, 2407555.39787,
            ]),
            proportion_explained=array([
                23.1764657118, 15.6071186064, 11.2894866423,
                8.16225689998, 6.67053450426, 6.21602253997,
            ]),
            site=array([
                [-0.199225958574, -0.250846540029, -0.119813087305,
                 -0.155652031006, 0.18495315824, -0.160875399364],
                [-0.238263544222, -0.37724227779, -0.169458651217,
                 0.0305157004776, 0.112181007345, 0.0677415967093],
                [0.116737988534, 0.414627960015, 0.201315243115,
                 0.113769076804, -0.283025353088, -0.144278863311],
                [0.320751514262, 0.213460857804, 0.0879564954067,
                 0.0113672537238, -0.0141088124974, 0.237412665966],
            ]),
            site_ids=['S3', 'S2', 'S1', 'S4'])

        # Both transformed ordinations must match the recorded values.
        for position in (0, 1):
            observed = actual[position]
            assert_almost_equal(observed.eigvals, expected.eigvals)
            assert_almost_equal(observed.proportion_explained,
                                expected.proportion_explained)
            self.assertEqual(observed.site_ids, expected.site_ids)
            assert_almost_equal(observed.site, expected.site)

        self.assertTrue(actual[2] < 6e-30)
Example #12
0
 def setUpClass(cls):
     """Create PCoA/metadata fixtures and patch ``get_resources`` with them.

     A resource tree with two datasets is built: ``dataset1`` carries
     metadata and a PCoA entry, ``dataset2`` carries metadata only. The
     tree is returned by the patched
     ``microsetta_public_api.api.emperor.get_resources``.
     """
     axes = ['PC1', 'PC2', 'PC3']

     # Two samples described on three axes, indexed by sample ID.
     samples = pd.DataFrame.from_dict(
         {'s1': [0.1, 0.2, 7], 's2': [0.9, 0.2, 7]},
         orient='index',
         columns=axes,
     )
     samples.index.name = 'Sample ID'
     cls.test_df1 = samples

     eigenvalues = pd.Series([7, 2, 1], index=axes)
     proportions = pd.Series([0.7, 0.2, 0.1], index=axes)
     cls.pcoa1 = OrdinationResults(
         'pcoa1',
         'pcoa1',
         eigvals=eigenvalues,
         samples=samples,
         proportion_explained=proportions,
     )

     metadata_columns = {
         'age_cat': ['30s', '40s', '50s', '30s', None],
         'num_cat': [7.24, 7.24, 8.25, 7.24, None],
         'other': [1, 2, 3, 4, None],
     }
     metadata_index = pd.Series(['s1', 's2', 'c', 'd', 'e'],
                                name='#SampleID')
     cls.test_metadata = pd.DataFrame(metadata_columns,
                                      index=metadata_index)

     # Assemble the resource tree from the leaves upward.
     dataset1 = DictElement({
         '__metadata__': MockMetadataElement(cls.test_metadata),
         '__pcoa__': PCOAElement({
             'sample_set': DictElement({'beta_metric': cls.pcoa1}),
         }),
     })
     dataset2 = DictElement({
         '__metadata__': MockMetadataElement(cls.test_metadata),
     })
     datasets = DictElement({'dataset1': dataset1, 'dataset2': dataset2})
     cls.resources = DictElement({'datasets': datasets})
     cls.resources.accept(TrivialVisitor())

     # Route the API's resource lookup to the tree built above.
     patcher = patch('microsetta_public_api.api.emperor.get_resources')
     cls.res_patcher = patcher
     cls.mock_resources = patcher.start()
     cls.mock_resources.return_value = cls.resources
 def test_io(self):
     """Round-trip the fixture through ``write``/``read`` and compare."""
     # Smoke test of the public read/write API: serialize into an in-memory
     # text buffer, parse it back, and require an equal result of exactly
     # the OrdinationResults type.
     buffer = StringIO()
     self.ordination_results.write(buffer)
     buffer.seek(0)
     restored = OrdinationResults.read(buffer)
     assert_ordination_results_equal(restored, self.ordination_results)
     self.assertIs(type(restored), OrdinationResults)
Example #14
0
def body_site(coords, mapping_file):
    """Write one PDF scatter plot per sample in the coordinates file.

    Every figure shows all samples coloured by their ``TITLE_BODY_SITE``
    group (first two ordination axes) with the figure's own sample drawn on
    top as a large dot. Each figure is saved as ``<sample>.pdf`` relative to
    the current working directory.

    Parameters
    ----------
    coords : str or file handle
        Ordination results, read with ``OrdinationResults.from_file``.
    mapping_file : str or file handle
        Tab-separated metadata with a ``#SampleID`` column (used as index)
        and a ``TITLE_BODY_SITE`` column.
    """
    o = OrdinationResults.from_file(coords)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file; the separator is passed by keyword because recent
    # pandas versions only accept the path positionally
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     index_col='#SampleID')
    mf = mf.loc[o.site_ids]

    # build the palette once instead of once per colour
    paired = sns.color_palette('Paired', 12)
    color_hmp_fecal = paired[10]  # light brown
    color_agp_fecal = paired[11]  # dark brown
    color_hmp_oral = paired[0]    # light blue
    color_agp_oral = paired[1]    # dark blue
    color_hmp_skin = paired[2]    # light green
    color_agp_skin = paired[3]    # dark green

    grp_colors = {'AGP-FECAL': color_agp_fecal,
                  'AGP-ORAL':  color_agp_oral,
                  'AGP-SKIN':  color_agp_skin,
                  'HMP-FECAL': color_hmp_fecal,
                  'GG-FECAL':  color_hmp_fecal,
                  'PGP-FECAL': color_hmp_fecal,
                  'HMP-ORAL':  color_hmp_oral,
                  'PGP-ORAL':  color_hmp_oral,
                  'HMP-SKIN':  color_hmp_skin,
                  'PGP-SKIN':  color_hmp_skin}

    # The per-group coordinate slices do not depend on the sample being
    # highlighted, so compute them once. dict.items() replaces the
    # Python 2-only iteritems().
    background = []
    for grp, color in grp_colors.items():
        sub_coords = c_df[mf.TITLE_BODY_SITE == grp].values
        background.append((color, np.asarray(color)*0.6,
                           np.array_split(sub_coords, 50)))

    my_dpi = 72
    for sample in mf.index:

        # plot categories as 50 slices with random zorder
        for color, edge_color, slices in background:
            for i in slices:
                plt.scatter(i[:, 0], i[:, 1], color=color,
                            edgecolor=edge_color, lw=LINE_WIDTH,
                            alpha=ALPHA, zorder=np.random.rand())

        # plot participant's dot: white halo first, group-coloured dot on top
        sample_xy = c_df.loc[sample]
        sample_color = grp_colors[mf.loc[sample]['TITLE_BODY_SITE']]
        plt.scatter(sample_xy[0], sample_xy[1],
                    color=sample_color, s=270, edgecolor='w', zorder=1)
        plt.scatter(sample_xy[0], sample_xy[1],
                    color=sample_color, s=250,
                    edgecolor=np.asarray(sample_color)*0.6, zorder=2)

        plt.axis('off')
        # NOTE(review): the original passed figsize=(1000/dpi, 1000/dpi) to
        # savefig, which is not a savefig parameter (ignored or rejected
        # depending on the Matplotlib version). It is dropped here so the
        # figure keeps its default size; create the figure with
        # plt.figure(figsize=...) if a 1000x1000 px canvas is wanted.
        plt.savefig(sample+'.pdf', dpi=my_dpi)
        plt.close()
Example #15
0
    def setUp(self):
        """Build the ordination fixtures and the metadata frame for plotting."""
        # In-memory CA results used for serialization round trips. Only
        # eigenvalues, species/site coordinates and their IDs are set; the
        # other optional attributes are explicitly None.
        self.ordination_results = OrdinationResults(
            eigvals=np.array([0.0961330159181, 0.0409418140138]),
            species=np.array([[0.408869425742, 0.0695518116298],
                              [-0.1153860437, -0.299767683538],
                              [-0.309967102571, 0.187391917117]]),
            site=np.array([[-0.848956053187, 0.882764759014],
                           [-0.220458650578, -1.34482000302],
                           [1.66697179591, 0.470324389808]]),
            biplot=None,
            site_constraints=None,
            proportion_explained=None,
            species_ids=['Species1', 'Species2', 'Species3'],
            site_ids=['Site1', 'Site2', 'Site3'])

        # Metadata for the plot method:
        #   categorical - mix of numbers and strings
        #   numeric     - ints, floats and float-like strings
        #   nancolumn   - numeric with a missing value (np.nan)
        metadata_rows = [
            ['foo', '42', 10],
            [22, 0, 8],
            [22, -4.2, np.nan],
            ['foo', '42.19', 11],
        ]
        self.df = pd.DataFrame(
            metadata_rows,
            index=['A', 'B', 'C', 'D'],
            columns=['categorical', 'numeric', 'nancolumn'])

        # Small ordination whose site IDs line up with the rows of self.df.
        self.min_ord_results = OrdinationResults(
            eigvals=np.array([0.50, 0.25, 0.25]),
            site=np.array([[0.1, 0.2, 0.3],
                           [0.2, 0.3, 0.4],
                           [0.3, 0.4, 0.5],
                           [0.4, 0.5, 0.6]]),
            site_ids=['A', 'B', 'C', 'D'])
Example #16
0
def gradient(coords, mapping_file, color):
    """Write one PDF scatter plot per sample, coloured by a numeric column.

    Samples whose ``color`` value parses as a number are drawn with the
    ``RdBu`` colormap; the rest are drawn in gray. In each figure the
    figure's own sample is drawn on top as a large dot. Each figure is
    saved as ``<sample>.pdf`` relative to the current working directory.

    Parameters
    ----------
    coords : str or file handle
        Ordination results, read with ``OrdinationResults.from_file``.
    mapping_file : str or file handle
        Tab-separated metadata with a ``#SampleID`` column (used as index).
    color : str
        Name of the metadata column that drives the colour gradient.
    """
    o = OrdinationResults.from_file(coords)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file; the separator is passed by keyword because recent
    # pandas versions only accept the path positionally
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     index_col='#SampleID')
    mf = mf.loc[o.site_ids]
    # Series.convert_objects no longer exists in pandas; to_numeric with
    # errors='coerce' likewise turns unparsable values into NaN.
    mf[color] = pd.to_numeric(mf[color], errors='coerce')

    numeric = mf[~pd.isnull(mf[color])]
    non_numeric = mf[pd.isnull(mf[color])]

    color_array = plt.cm.RdBu(numeric[color]/max(numeric[color]))

    # Everything below is the same for every figure, so look it up once.
    numeric_x, numeric_y = c_df.loc[numeric.index][0], \
        c_df.loc[numeric.index][1]
    other_x, other_y = c_df.loc[non_numeric.index][0], \
        c_df.loc[non_numeric.index][1]
    # First position of each numeric sample, replacing a per-sample
    # list.index() search.
    color_index_by_id = {}
    for position, sample_id in enumerate(numeric.index):
        color_index_by_id.setdefault(sample_id, position)

    my_dpi = 72
    for sample in mf.index:

        # plot numeric metadata as colored gradient
        plt.scatter(numeric_x, numeric_y, c=numeric[color],
                    cmap=plt.get_cmap('RdBu'), alpha=ALPHA, lw=LINE_WIDTH,
                    edgecolor=color_array*0.6)

        # plot non-numeric metadata as gray
        plt.scatter(other_x, other_y, c='0.5', alpha=ALPHA, lw=LINE_WIDTH,
                    edgecolor='0.3')

        # plot individual's dot: gray when the sample has no numeric value
        color_index = color_index_by_id.get(sample)
        if color_index is None:
            _color = (0.5, 0.5, 0.5)
        else:
            _color = color_array[color_index]

        sample_xy = c_df.loc[sample]
        plt.scatter(sample_xy[0], sample_xy[1],
                    color=_color, s=270, edgecolor='w')
        plt.scatter(sample_xy[0], sample_xy[1],
                    color=_color, s=250, edgecolor=np.asarray(_color)*0.6)

        plt.axis('off')
        # NOTE(review): the original passed figsize=(1000/dpi, 1000/dpi) to
        # savefig, which is not a savefig parameter (ignored or rejected
        # depending on the Matplotlib version). It is dropped here so the
        # figure keeps its default size; create the figure with
        # plt.figure(figsize=...) if a 1000x1000 px canvas is wanted.
        plt.savefig(sample+'.pdf', dpi=my_dpi)
        plt.close()
Example #17
0
    def setUp(self):
        """Build the ordination fixtures and the metadata frame for plotting."""
        # In-memory CA results used for serialization round trips. Only
        # eigenvalues, species/site coordinates and their IDs are set; the
        # other optional attributes are explicitly None.
        ca_fields = dict(
            eigvals=np.array([0.0961330159181, 0.0409418140138]),
            species=np.array([[0.408869425742, 0.0695518116298],
                              [-0.1153860437, -0.299767683538],
                              [-0.309967102571, 0.187391917117]]),
            site=np.array([[-0.848956053187, 0.882764759014],
                           [-0.220458650578, -1.34482000302],
                           [1.66697179591, 0.470324389808]]),
            biplot=None,
            site_constraints=None,
            proportion_explained=None,
            species_ids=['Species1', 'Species2', 'Species3'],
            site_ids=['Site1', 'Site2', 'Site3'],
        )
        self.ordination_results = OrdinationResults(**ca_fields)

        # Metadata for the plot method:
        #   categorical - mix of numbers and strings
        #   numeric     - ints, floats and float-like strings
        #   nancolumn   - numeric with a missing value (np.nan)
        self.df = pd.DataFrame(
            [['foo', '42', 10],
             [22, 0, 8],
             [22, -4.2, np.nan],
             ['foo', '42.19', 11]],
            index=['A', 'B', 'C', 'D'],
            columns=['categorical', 'numeric', 'nancolumn'])

        # Small ordination whose site IDs line up with the rows of self.df.
        minimal_fields = dict(
            eigvals=np.array([0.50, 0.25, 0.25]),
            site=np.array([[0.1, 0.2, 0.3],
                           [0.2, 0.3, 0.4],
                           [0.3, 0.4, 0.5],
                           [0.4, 0.5, 0.6]]),
            site_ids=['A', 'B', 'C', 'D'],
        )
        self.min_ord_results = OrdinationResults(**minimal_fields)
class TestOrdinationResults(unittest.TestCase):
    """Round-trip I/O tests for OrdinationResults (current and legacy API)."""

    def setUp(self):
        """Create in-memory CA results to serialize and deserialize."""
        # Only eigenvalues, species/site coordinates and their IDs are set;
        # the other optional attributes are explicitly None.
        self.ordination_results = OrdinationResults(
            eigvals=np.array([0.0961330159181, 0.0409418140138]),
            species=np.array([[0.408869425742, 0.0695518116298],
                              [-0.1153860437, -0.299767683538],
                              [-0.309967102571, 0.187391917117]]),
            site=np.array([[-0.848956053187, 0.882764759014],
                           [-0.220458650578, -1.34482000302],
                           [1.66697179591, 0.470324389808]]),
            biplot=None,
            site_constraints=None,
            proportion_explained=None,
            species_ids=['Species1', 'Species2', 'Species3'],
            site_ids=['Site1', 'Site2', 'Site3'])

    def test_io(self):
        """Round-trip the fixture through ``write``/``read`` and compare."""
        # Smoke test of the public read/write API: serialize into an
        # in-memory text buffer, parse it back, and require an equal result
        # of exactly the OrdinationResults type.
        buffer = StringIO()
        self.ordination_results.write(buffer)
        buffer.seek(0)
        restored = OrdinationResults.read(buffer)
        assert_ordination_results_equal(restored, self.ordination_results)
        self.assertIs(type(restored), OrdinationResults)

    def test_deprecated_io(self):
        """Round-trip through ``to_file``/``from_file``, expecting warnings."""
        buffer = StringIO()
        # Both legacy entry points must emit a UserWarning when called.
        npt.assert_warns(UserWarning, self.ordination_results.to_file,
                         buffer)
        buffer.seek(0)
        restored = npt.assert_warns(UserWarning,
                                    OrdinationResults.from_file, buffer)
        assert_ordination_results_equal(restored, self.ordination_results)
        self.assertIs(type(restored), OrdinationResults)
Example #19
0
    def setUp(self):
        """Parse the PCoA fixture and define tabular sample data with headers."""
        self.ord_res = OrdinationResults.read(StringIO(PCOA_STRING))

        # Column names for the rows in self.data.
        self.headers = ['SampleID', 'Treatment', 'DOB', 'Description']
        self.data = [
            ['PC.354', 'Control', '20061218', 'Ctrol_mouse_I.D._354'],
            ['PC.355', 'Control', '20061218', 'Control_mouse_I.D._355'],
            ['PC.356', 'Control', '20061126', 'Control_mouse_I.D._356'],
            ['PC.481', 'Control', '20070314', 'Control_mouse_I.D._481'],
            ['PC.593', 'Control', '20071210', 'Control_mouse_I.D._593'],
            ['PC.607', 'Fast', '20071112', 'Fasting_mouse_I.D._607'],
            ['PC.634', 'Fast', '20080116', 'Fasting_mouse_I.D._634'],
            ['PC.635', 'Fast', '20080116', 'Fasting_mouse_I.D._635'],
            ['PC.636', 'Fast', '20080116', 'Fasting_mouse_I.D._636'],
        ]
Example #20
0
def embed(
    distance_matrix: DistanceMatrix,
    n_neighbors: int,
    min_dist: float = 1,
    number_of_dimensions: int = 2,
    random_state: int = 724,
) -> OrdinationResults:
    """Embed a distance matrix with UMAP and wrap it as OrdinationResults.

    Parameters
    ----------
    distance_matrix : DistanceMatrix
        Pairwise distances; passed to UMAP with ``metric='precomputed'``.
    n_neighbors : int
        Forwarded to ``UMAP(n_neighbors=...)``.
    min_dist : float
        Forwarded to ``UMAP(min_dist=...)``.
    number_of_dimensions : int
        Forwarded to ``UMAP(n_components=...)``.
    random_state : int
        Forwarded to ``UMAP(random_state=...)``.

    Returns
    -------
    OrdinationResults
        The result of ``center(...)`` applied to an ordination whose sample
        coordinates are the embedding (zero-padded to at least three
        columns) and whose eigenvalues and proportion explained are all
        zeros.

    Raises
    ------
    ValueError
        If ``number_of_dimensions`` exceeds the number of samples.
    """
    n_samples = len(distance_matrix.ids)
    # NOTE(review): the message mentions "number of samples - 2" while the
    # check compares against the number of samples itself; confirm which
    # bound is intended.
    if number_of_dimensions > n_samples:
        # A space was missing between "than" and "number" in the message.
        raise ValueError(
            f'number_of_dimensions ({number_of_dimensions}) must be fewer '
            f'than number of samples ({n_samples}) - 2'
        )

    transformer = UMAP(
        n_neighbors=n_neighbors,
        n_components=number_of_dimensions,
        min_dist=min_dist,
        random_state=random_state,
        metric='precomputed',
    )

    embedding = transformer.fit_transform(distance_matrix[:, :])

    # Pad with zero columns so the result always has at least three axes.
    if embedding.shape[1] < 3:
        difference = 3 - embedding.shape[1]
        embedding = np.hstack((embedding, np.zeros((len(embedding),
                                                    difference))))

    number_of_dimensions = embedding.shape[1]

    embedding_df = pd.DataFrame(
        embedding,
        index=distance_matrix.ids,
        columns=[f'UMAP-{i}' for i in range(number_of_dimensions)],
    )

    # The code supplies all-zero placeholders for both eigenvalue fields.
    null_eigvals = pd.Series(np.zeros(number_of_dimensions))
    ord_results = OrdinationResults(
        'umap',
        'Uniform Manifold Approximation and Projection',
        eigvals=null_eigvals,
        samples=embedding_df,
        proportion_explained=null_eigvals,
    )

    return center(ord_results)
Example #21
0
    def test_assert_ordination_results_equal(self):
        """Exercise ``assert_ordination_results_equal`` on equal/unequal pairs.

        Covers identity, a type mismatch, nearly-equal eigenvalues, ID lists
        present on only one side, and each optional numeric attribute being
        one-sided, clearly different, or nearly equal.
        """
        base = OrdinationResults([1, 2])

        # an object always compares equal to itself
        assert_ordination_results_equal(base, base)

        # comparing against something that is not OrdinationResults fails
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(base, 'foo')

        # tiny numeric differences are tolerated
        near = OrdinationResults([1.0000001, 1.9999999])
        assert_ordination_results_equal(base, near)

        # an ID list set on one side only must be reported as a mismatch
        for id_attr in ('species_ids', 'site_ids'):
            setattr(near, id_attr, ['abc', 'def'])
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(base, near)
            setattr(near, id_attr, None)

        numeric_attrs = ('species', 'site', 'biplot', 'site_constraints',
                         'proportion_explained')
        for name in numeric_attrs:
            # attribute set on one side only
            setattr(near, name, [[1, 2], [3, 4]])
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(base, near)
            setattr(near, name, None)

            # attribute set on both sides: first with a difference that must
            # be rejected, then with one small enough to be accepted
            for perturbed, should_match in ((3.00002, False),
                                            (3.00000002, True)):
                setattr(base, name, [[1, 2], [3, 4]])
                setattr(near, name, [[1, 2], [perturbed, 4]])
                if should_match:
                    assert_ordination_results_equal(base, near)
                else:
                    with npt.assert_raises(AssertionError):
                        assert_ordination_results_equal(base, near)
                setattr(base, name, None)
                setattr(near, name, None)
Example #22
0
def parse_coords(lines):
    """Parse an skbio ordination results file into its main components.

    The input is read with ``OrdinationResults.read`` (see
    skbio.stats.ordination.OrdinationResults.read for the file format).

    Returns a tuple of:
    - the ``site_ids`` of the parsed results (sample labels)
    - the ``site`` coordinates
    - the ``eigvals``
    - the ``proportion_explained``
    """
    ordination = OrdinationResults.read(lines)
    labels = ordination.site_ids
    coordinates = ordination.site
    eigenvalues = ordination.eigvals
    explained = ordination.proportion_explained
    return (labels, coordinates, eigenvalues, explained)
Example #23
0
def parse_coords(lines):
    """Parse an skbio ordination results file into its main components.

    The input is read with ``OrdinationResults.read`` (see
    skbio.stats.ordination.OrdinationResults.read for the file format).

    Returns a tuple of:
    - the ``site_ids`` of the parsed results (sample labels)
    - the ``site`` coordinates
    - the ``eigvals``
    - the ``proportion_explained``
    """
    ordination = OrdinationResults.read(lines)
    labels = ordination.site_ids
    coordinates = ordination.site
    eigenvalues = ordination.eigvals
    explained = ordination.proportion_explained
    return (labels, coordinates, eigenvalues, explained)
 def setUp(self):
     """Create in-memory CA results to serialize and deserialize."""
     # Only eigenvalues, species/site coordinates and their IDs are set; the
     # other optional attributes are explicitly None.
     ca_fields = dict(
         eigvals=np.array([0.0961330159181, 0.0409418140138]),
         species=np.array([[0.408869425742, 0.0695518116298],
                           [-0.1153860437, -0.299767683538],
                           [-0.309967102571, 0.187391917117]]),
         site=np.array([[-0.848956053187, 0.882764759014],
                        [-0.220458650578, -1.34482000302],
                        [1.66697179591, 0.470324389808]]),
         biplot=None,
         site_constraints=None,
         proportion_explained=None,
         species_ids=['Species1', 'Species2', 'Species3'],
         site_ids=['Site1', 'Site2', 'Site3'],
     )
     self.ordination_results = OrdinationResults(**ca_fields)
Example #25
0
def emperor_output(sklearn_output, full_file_list, eigenvalues, percent_variance, output_file, new_files = None):
    """Write a standalone Emperor plot for an ordination into ``output_file``.

    Parameters
    ----------
    sklearn_output
        Ordination coordinates; used as the data of the samples DataFrame.
    full_file_list
        Sample identifiers; used as the index of the samples DataFrame.
    eigenvalues
        Values wrapped into the ``eigvals`` Series.
    percent_variance
        Values wrapped into the ``proportion_explained`` Series.
    output_file
        Directory (created if missing) that receives ``index.html`` and the
        Emperor support files.
    new_files
        Optional collection of user-uploaded sample identifiers. When given,
        metadata rows labelled "Your Data" are appended for them.
    """
    print("Made it to Emperor Function!")

    # wrap the raw ordination output in the containers Emperor expects
    eigvals = pd.Series(data=eigenvalues)
    samples = pd.DataFrame(data=sklearn_output, index=full_file_list)
    p_explained = pd.Series(data=percent_variance)
    ores = OrdinationResults(long_method_name="principal component analysis",
                             short_method_name="pcoa",
                             eigvals=eigvals,
                             samples=samples,
                             proportion_explained=p_explained)

    # global metadata file
    global_metadata = pd.read_csv(config.PATH_TO_ORIGINAL_MAPPING_FILE, sep="\t")
    # NOTE(review): the header list is captured before 'filename' is renamed,
    # so uploaded rows below also receive a 'filename' column - confirm this
    # is intended.
    global_metadata_headers = global_metadata.columns.tolist()
    global_metadata.rename(columns={'filename': 'SampleID'}, inplace=True)
    global_metadata["type"] = "Global Data"
    global_metadata.set_index("SampleID", inplace=True)

    common = global_metadata

    # user uploaded samples: identity test, because `!= None` is evaluated
    # element-wise (and is ambiguous in a boolean context) for array-likes
    if new_files is not None:
        placeholder = ["Your Data"] * len(new_files)
        metadata_uploaded = pd.DataFrame({"SampleID": new_files, "type": placeholder})
        for item in global_metadata_headers:
            metadata_uploaded[item] = ["Your Data"] * len(new_files)
        metadata_uploaded.set_index("SampleID", inplace=True)

        common = pd.concat([global_metadata, metadata_uploaded])

    # the metadata has to be aligned with the samples in the ordination
    # BEFORE it is handed to Emperor, otherwise no results are produced
    final_metadata, unused = common.align(samples, join="right", axis=0)

    emp = Emperor(ores, final_metadata, remote=True)

    # create an output directory
    os.makedirs(output_file, exist_ok=True)

    with open(os.path.join(output_file, 'index.html'), 'w') as f:
        f.write(emp.make_emperor(standalone=True))
    # index.html is closed before the support files are copied next to it
    emp.copy_support_files(output_file)
Example #26
0
def center(embedding: OrdinationResults) -> OrdinationResults:
    """Re-project an embedding's samples with a full-rank PCA.

    The samples table is passed through ``PCA.fit_transform`` with as many
    components as it has columns. The result keeps the original index and
    column labels and is returned in a new ``OrdinationResults`` carrying
    the original method names. ``eigvals`` and ``proportion_explained`` are
    both set to the same all-zero Series.
    """
    method_short = embedding.short_method_name
    method_long = embedding.long_method_name

    n_axes = embedding.samples.shape[1]
    projected = PCA(n_components=n_axes).fit_transform(embedding.samples)
    projected_df = pd.DataFrame(
        projected,
        index=embedding.samples.index,
        columns=embedding.samples.columns,
    )

    # placeholder values: one zero per dimension, shared by both fields
    zeros = pd.Series(np.zeros(n_axes))
    return OrdinationResults(
        method_short,
        method_long,
        eigvals=zeros,
        samples=projected_df,
        proportion_explained=zeros,
    )
Example #27
0
def emperor_output(sklearn_output,
                   full_file_list,
                   eigenvalues,
                   percent_variance,
                   output_file,
                   new_files=None):
    """Write a standalone Emperor plot for an ordination into ``output_file``.

    Parameters
    ----------
    sklearn_output
        Ordination coordinates; used as the data of the samples DataFrame.
    full_file_list
        Sample identifiers; used as the index of the samples DataFrame.
    eigenvalues
        Values wrapped into the ``eigvals`` Series.
    percent_variance
        Values wrapped into the ``proportion_explained`` Series.
    output_file
        Directory (created if missing) that receives ``index.html`` and the
        Emperor support files.
    new_files
        Optional collection of projected sample identifiers. ``None`` (the
        default) and an empty collection both mean "no projected samples".
    """
    # a shared mutable default is avoided; None stands in for "no new files"
    if new_files is None:
        new_files = []

    eigvals = pd.Series(data=eigenvalues)
    samples = pd.DataFrame(data=sklearn_output, index=full_file_list)
    samples.index.rename("SampleID", inplace=True)
    p_explained = pd.Series(data=percent_variance)
    ores = OrdinationResults(long_method_name="principal component analysis",
                             short_method_name="pcoa",
                             eigvals=eigvals,
                             samples=samples,
                             proportion_explained=p_explained)

    # read in all sample metadata
    df = pd.read_table(config.PATH_TO_ORIGINAL_MAPPING_FILE)
    df.rename(columns={"filename": "SampleID"}, inplace=True)
    df.set_index("SampleID", inplace=True)

    # handling the case in which the pca is a projection
    if len(new_files) != 0:
        df["Type"] = "Global"
        new_meta = pd.DataFrame({"SampleID": new_files, "Type": "Your Data"})
        new_meta.set_index("SampleID", inplace=True)
        df = pd.concat([df, new_meta], axis=0, join="outer")

    # keep exactly the rows that are present in the ordination samples
    final_metadata, unused = df.align(samples, join="right", axis=0)

    # build the emperor plot
    emp = Emperor(ores, final_metadata, remote=True)

    # create an output directory
    os.makedirs(output_file, exist_ok=True)

    with open(os.path.join(output_file, 'index.html'), 'w') as f:
        f.write(emp.make_emperor(standalone=True))
    # index.html is closed before the support files are copied next to it
    emp.copy_support_files(output_file)
Example #28
0
def _ordination_to_ordination_results(fh):
    """Parse an ordination file handle into an ``OrdinationResults``.

    Sections are consumed from ``fh`` in a fixed order: Eigvals, Proportion
    explained, Species, Site, Biplot, Site constraints. All but the last are
    followed by an empty-line check.

    Raises ``OrdinationFormatError`` when no eigenvalues are present or when
    both site ids and site-constraint ids exist but differ.
    """
    eigenvalues = _parse_vector_section(fh, 'Eigvals')
    if eigenvalues is None:
        raise OrdinationFormatError("At least one eigval must be present.")
    _check_empty_line(fh)

    proportions = _parse_vector_section(fh, 'Proportion explained')
    _check_length_against_eigvals(
        proportions, eigenvalues, 'proportion explained values')
    _check_empty_line(fh)

    species_coords, species_labels = _parse_array_section(fh, 'Species')
    _check_length_against_eigvals(
        species_coords, eigenvalues, 'coordinates per species')
    _check_empty_line(fh)

    site_coords, site_labels = _parse_array_section(fh, 'Site')
    _check_length_against_eigvals(
        site_coords, eigenvalues, 'coordinates per site')
    _check_empty_line(fh)

    # unlike the other arrays, the biplot section carries no ids
    biplot_scores, _ = _parse_array_section(fh, 'Biplot', has_ids=False)
    _check_empty_line(fh)

    constraints, constraint_labels = _parse_array_section(
        fh, 'Site constraints')

    # the two id lists are only comparable when both were parsed
    both_labelled = (constraint_labels is not None and
                     site_labels is not None)
    if both_labelled and constraint_labels != site_labels:
        raise OrdinationFormatError(
            "Site constraints ids and site ids must be equal: %s != %s" %
            (constraint_labels, site_labels))

    parsed = dict(eigvals=eigenvalues,
                  species=species_coords,
                  site=site_coords,
                  biplot=biplot_scores,
                  site_constraints=constraints,
                  proportion_explained=proportions,
                  species_ids=species_labels,
                  site_ids=site_labels)
    return OrdinationResults(**parsed)
Example #29
0
def _ordination_to_ordination_results(fh):
    """Parse an ordination file handle into an ``OrdinationResults``.

    Sections are consumed from ``fh`` in a fixed order: Eigvals, Proportion
    explained, Species, Site, Biplot, Site constraints. All but the last are
    followed by an empty-line check. The result is created with empty short
    and long method names.

    Raises ``OrdinationFormatError`` when no eigenvalues are present or when
    the Site and Site constraints sections both exist but their indexes
    differ.
    """
    eigenvalues = _parse_vector_section(fh, 'Eigvals')
    if eigenvalues is None:
        raise OrdinationFormatError("At least one eigval must be present.")
    _check_empty_line(fh)

    proportions = _parse_vector_section(fh, 'Proportion explained')
    _check_length_against_eigvals(
        proportions, eigenvalues, 'proportion explained values')
    _check_empty_line(fh)

    feature_scores = _parse_array_section(fh, 'Species')
    _check_length_against_eigvals(
        feature_scores, eigenvalues, 'coordinates per species')
    _check_empty_line(fh)

    sample_scores = _parse_array_section(fh, 'Site')
    _check_length_against_eigvals(
        sample_scores, eigenvalues, 'coordinates per site')
    _check_empty_line(fh)

    # unlike the other arrays, the biplot section carries no ids
    biplot_scores = _parse_array_section(fh, 'Biplot', has_ids=False)
    _check_empty_line(fh)

    constraints = _parse_array_section(fh, 'Site constraints')

    # the indexes are only comparable when both sections were parsed
    both_present = constraints is not None and sample_scores is not None
    if both_present and not np.array_equal(constraints.index,
                                           sample_scores.index):
        raise OrdinationFormatError(
            "Site constraints ids and site ids must be equal: %s != %s" %
            (constraints.index, sample_scores.index))

    parsed = dict(short_method_name='',
                  long_method_name='',
                  eigvals=eigenvalues,
                  features=feature_scores,
                  samples=sample_scores,
                  biplot_scores=biplot_scores,
                  sample_constraints=constraints,
                  proportion_explained=proportions)
    return OrdinationResults(**parsed)
if __name__ == '__main__':
    # Script entry point: read the command line options, load the ordination
    # results and the mapping file, then validate the requested categories.
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ord_fp = opts.input_fp
    mapping_fp = opts.map_fp
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.read(f)

    # Parse the mapping file
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')

    for category in categories:
        if category not in metamap.keys():
            # name the offending category, not the whole requested list
            option_parser.error("Category %s does not exist in the mapping "
                                "file" % category)

    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            sort_category = None
Example #31
0
def country(coords, mapping_file):
    """Generates as many figures as samples in the coordinates file

    For every sample in the mapping file (restricted to the ids present in
    the ordination) a PDF named ``<sample>.pdf`` is saved. Each figure shows
    a density contour of the first two axes, one scatter layer per country
    group, and the sample itself as a large dot.

    Parameters
    ----------
    coords
        Ordination results file, read with ``OrdinationResults.from_file``.
    mapping_file
        Tab-separated mapping file indexed by ``#SampleID`` with a
        ``COUNTRY`` column.
    """
    o = OrdinationResults.from_file(coords)
    x, y = o.site[:, 0], o.site[:, 1]

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file
    mf = pd.read_csv(mapping_file, '\t', converters=defaultdict(str),
                     index_col='#SampleID')
    mf = mf.loc[o.site_ids]

    paired = sns.color_palette('Paired', 12)
    color_Venezuela = paired[10]
    color_Malawi = paired[1]
    color_Western = paired[4]
    color_Highlight = paired[5]
    color_no_data = (0.5, 0.5, 0.5)

    western_countries = [
        'Australia', 'Belgium', 'Canada', 'China', 'Finland', 'France',
        'Germany', 'Great Britain', 'Ireland', 'Japan', 'Netherlands',
        'New Zealand', 'Norway', 'Scotland', 'Spain', 'Switzerland',
        'Thailand', 'United Arab Emirates', 'United Kingdom',
        'United States of America',
    ]

    # insertion order matters: it is the order the groups are drawn in
    grp_colors = OrderedDict()
    grp_colors['no_data'] = color_no_data
    for name in western_countries:
        grp_colors[name] = color_Western
    grp_colors['Malawi'] = color_Malawi
    grp_colors['Venezuela'] = color_Venezuela

    for sample in mf.index:
        sample_country = mf.loc[sample]['COUNTRY']
        # Malawi and Venezuela always keep their own colors
        recolor = (sample_country != 'Malawi' and
                   sample_country != 'Venezuela')

        # contour plot superimposed
        sns.kdeplot(x, y, cmap='bone')
        sns.set_context(rc={"lines.linewidth": 0.75})

        # change participant's country's color to color_Highlight unless
        # country is Venezuela or Malawi
        if recolor:
            grp_colors[sample_country] = color_Highlight

        # plot each country except participant's according to colors above;
        # .items() (instead of the Python 2 only .iteritems()) works on both
        # Python 2 and Python 3
        for grp, color in grp_colors.items():
            if grp == sample_country:
                continue
            sub_coords = c_df[mf.COUNTRY == grp]
            plt.scatter(sub_coords[0], sub_coords[1], color=color,
                        edgecolor=np.asarray(color)*0.6, lw=LINE_WIDTH,
                        alpha=ALPHA)

        # now plot participant's country, last so it is drawn after the rest
        color = grp_colors[sample_country]
        sub_coords = c_df[mf.COUNTRY == sample_country]
        plt.scatter(sub_coords[0], sub_coords[1], color=color,
                    edgecolor=np.asarray(color)*0.6, lw=LINE_WIDTH,
                    alpha=ALPHA)

        # plot participant's dot: a slightly larger white-edged dot under a
        # dot with a darkened edge
        sample_x, sample_y = c_df.loc[sample][0], c_df.loc[sample][1]
        plt.scatter(sample_x, sample_y, color=color,
                    s=270, edgecolor='w', zorder=1)
        plt.scatter(sample_x, sample_y, color=color,
                    s=250, edgecolor=np.asarray(color)*0.6, zorder=2)

        # reset participant's country's color to color_Western unless country
        # is Venezuela or Malawi
        if recolor:
            grp_colors[sample_country] = color_Western

        plt.axis('off')
        my_dpi = 72
        # NOTE(review): figsize is not a documented savefig parameter -
        # confirm it has the intended effect on the output size.
        plt.savefig(sample+'.pdf', figsize=(1000/my_dpi, 1000/my_dpi),
                    dpi=my_dpi)
        plt.close()
Example #32
0
class TestOrdinationResults(unittest.TestCase):
    """Tests for OrdinationResults: string output, plotting and its private
    helpers, and the PNG/SVG representations."""

    def setUp(self):
        # Define in-memory CA results to serialize and deserialize.
        eigvals = np.array([0.0961330159181, 0.0409418140138])
        species = np.array([[0.408869425742, 0.0695518116298],
                            [-0.1153860437, -0.299767683538],
                            [-0.309967102571, 0.187391917117]])
        site = np.array([[-0.848956053187, 0.882764759014],
                         [-0.220458650578, -1.34482000302],
                         [1.66697179591, 0.470324389808]])
        biplot = None
        site_constraints = None
        prop_explained = None
        species_ids = ['Species1', 'Species2', 'Species3']
        site_ids = ['Site1', 'Site2', 'Site3']

        self.ordination_results = OrdinationResults(
            eigvals=eigvals, species=species, site=site, biplot=biplot,
            site_constraints=site_constraints,
            proportion_explained=prop_explained, species_ids=species_ids,
            site_ids=site_ids)

        # DataFrame for testing plot method. Has a categorical column with a
        # mix of numbers and strings. Has a numeric column with a mix of ints,
        # floats, and strings that can be converted to floats. Has a numeric
        # column with missing data (np.nan).
        self.df = pd.DataFrame([['foo', '42', 10],
                                [22, 0, 8],
                                [22, -4.2, np.nan],
                                ['foo', '42.19', 11]],
                               index=['A', 'B', 'C', 'D'],
                               columns=['categorical', 'numeric', 'nancolumn'])

        # Minimal ordination results for easier testing of plotting method.
        # Paired with df above.
        eigvals = np.array([0.50, 0.25, 0.25])
        site = np.array([[0.1, 0.2, 0.3],
                         [0.2, 0.3, 0.4],
                         [0.3, 0.4, 0.5],
                         [0.4, 0.5, 0.6]])
        self.min_ord_results = OrdinationResults(eigvals=eigvals, site=site,
                                                 site_ids=['A', 'B', 'C', 'D'])

    def test_str(self):
        # eigvals, species and site present; the other sections are N/A
        exp = ("Ordination results:\n"
               "\tEigvals: 2\n"
               "\tProportion explained: N/A\n"
               "\tSpecies: 3x2\n"
               "\tSite: 3x2\n"
               "\tBiplot: N/A\n"
               "\tSite constraints: N/A\n"
               "\tSpecies IDs: 'Species1', 'Species2', 'Species3'\n"
               "\tSite IDs: 'Site1', 'Site2', 'Site3'")
        obs = str(self.ordination_results)
        self.assertEqual(obs, exp)

        # all optional attributes missing
        exp = ("Ordination results:\n"
               "\tEigvals: 1\n"
               "\tProportion explained: N/A\n"
               "\tSpecies: N/A\n"
               "\tSite: N/A\n"
               "\tBiplot: N/A\n"
               "\tSite constraints: N/A\n"
               "\tSpecies IDs: N/A\n"
               "\tSite IDs: N/A")
        obs = str(OrdinationResults(np.array([4.2])))
        self.assertEqual(obs, exp)

    def check_basic_figure_sanity(self, fig, exp_num_subplots, exp_title,
                                  exp_legend_exists, exp_xlabel, exp_ylabel,
                                  exp_zlabel):
        # Shared assertions for figures returned by plot(); not a test
        # itself. Title, tick labels, legend and axis labels are checked on
        # the first axes only.

        # check type
        assert_is_instance(fig, mpl.figure.Figure)

        # check number of subplots
        axes = fig.get_axes()
        npt.assert_equal(len(axes), exp_num_subplots)

        # check title
        ax = axes[0]
        npt.assert_equal(ax.get_title(), exp_title)

        # shouldn't have tick labels
        for tick_label in (ax.get_xticklabels() + ax.get_yticklabels() +
                           ax.get_zticklabels()):
            npt.assert_equal(tick_label.get_text(), '')

        # check if legend is present
        legend = ax.get_legend()
        if exp_legend_exists:
            assert_true(legend is not None)
        else:
            assert_true(legend is None)

        # check axis labels
        npt.assert_equal(ax.get_xlabel(), exp_xlabel)
        npt.assert_equal(ax.get_ylabel(), exp_ylabel)
        npt.assert_equal(ax.get_zlabel(), exp_zlabel)

    def test_plot_no_metadata(self):
        fig = self.min_ord_results.plot()
        self.check_basic_figure_sanity(fig, 1, '', False, '0', '1', '2')

    def test_plot_with_numeric_metadata_and_plot_options(self):
        fig = self.min_ord_results.plot(
            self.df, 'numeric', axes=(1, 0, 2),
            axis_labels=['PC 2', 'PC 1', 'PC 3'], title='a title', cmap='Reds')
        self.check_basic_figure_sanity(
            fig, 2, 'a title', False, 'PC 2', 'PC 1', 'PC 3')

    def test_plot_with_categorical_metadata_and_plot_options(self):
        fig = self.min_ord_results.plot(
            self.df, 'categorical', axes=[2, 0, 1], title='a title',
            cmap='Accent')
        self.check_basic_figure_sanity(fig, 1, 'a title', True, '2', '0', '1')

    def test_plot_with_invalid_axis_labels(self):
        # four labels are supplied for three axes
        with assert_raises_regexp(ValueError, 'axis_labels.*4'):
            self.min_ord_results.plot(axes=[2, 0, 1],
                                      axis_labels=('a', 'b', 'c', 'd'))

    def test_validate_plot_axes_valid_input(self):
        # shouldn't raise an error on valid input. nothing is returned, so
        # nothing to check here
        self.min_ord_results._validate_plot_axes(self.min_ord_results.site.T,
                                                 (1, 2, 0))

    def test_validate_plot_axes_invalid_input(self):
        # The regex patterns below are raw strings so that the backslash
        # escapes reach the regex engine without relying on Python's
        # handling of unrecognized string escapes.

        # not enough dimensions
        with assert_raises_regexp(ValueError, r'2 dimension\(s\)'):
            self.min_ord_results._validate_plot_axes(
                np.asarray([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]), (0, 1, 2))

        coord_matrix = self.min_ord_results.site.T

        # wrong number of axes
        with assert_raises_regexp(ValueError, 'exactly three.*found 0'):
            self.min_ord_results._validate_plot_axes(coord_matrix, [])
        with assert_raises_regexp(ValueError, 'exactly three.*found 4'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 1, 2, 3))

        # duplicate axes
        with assert_raises_regexp(ValueError, 'must be unique'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, 1, 0))

        # out of range axes
        with assert_raises_regexp(ValueError, r'axes\[1\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, -1, 2))
        with assert_raises_regexp(ValueError, r'axes\[2\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, 2, 3))

    def test_get_plot_point_colors_invalid_input(self):
        # column provided without df
        with npt.assert_raises(ValueError):
            self.min_ord_results._get_plot_point_colors(None, 'numeric',
                                                        ['B', 'C'], 'jet')

        # df provided without column
        with npt.assert_raises(ValueError):
            self.min_ord_results._get_plot_point_colors(self.df, None,
                                                        ['B', 'C'], 'jet')

        # column not in df
        with assert_raises_regexp(ValueError, 'missingcol'):
            self.min_ord_results._get_plot_point_colors(self.df, 'missingcol',
                                                        ['B', 'C'], 'jet')

        # id not in df
        with assert_raises_regexp(ValueError, 'numeric'):
            self.min_ord_results._get_plot_point_colors(
                self.df, 'numeric', ['B', 'C', 'missingid', 'A'], 'jet')

        # missing data in df
        with assert_raises_regexp(ValueError, 'nancolumn'):
            self.min_ord_results._get_plot_point_colors(self.df, 'nancolumn',
                                                        ['B', 'C', 'A'], 'jet')

    def test_get_plot_point_colors_no_df_or_column(self):
        obs = self.min_ord_results._get_plot_point_colors(None, None,
                                                          ['B', 'C'], 'jet')
        npt.assert_equal(obs, (None, None))

    def test_get_plot_point_colors_numeric_column(self):
        # subset of the ids in df
        exp = [0.0, -4.2, 42.0]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'numeric', ['B', 'C', 'A'], 'jet')
        npt.assert_almost_equal(obs[0], exp)
        assert_true(obs[1] is None)

        # all ids in df
        exp = [0.0, 42.0, 42.19, -4.2]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'numeric', ['B', 'A', 'D', 'C'], 'jet')
        npt.assert_almost_equal(obs[0], exp)
        assert_true(obs[1] is None)

    def test_get_plot_point_colors_categorical_column(self):
        # subset of the ids in df
        exp_colors = [[0., 0., 0.5, 1.], [0., 0., 0.5, 1.], [0.5, 0., 0., 1.]]
        exp_color_dict = {
            'foo': [0.5, 0., 0., 1.],
            22: [0., 0., 0.5, 1.]
        }
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'categorical', ['B', 'C', 'A'], 'jet')
        npt.assert_almost_equal(obs[0], exp_colors)
        npt.assert_equal(obs[1], exp_color_dict)

        # all ids in df
        exp_colors = [[0., 0., 0.5, 1.], [0.5, 0., 0., 1.], [0.5, 0., 0., 1.],
                      [0., 0., 0.5, 1.]]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'categorical', ['B', 'A', 'D', 'C'], 'jet')
        npt.assert_almost_equal(obs[0], exp_colors)
        # should get same color dict as before
        npt.assert_equal(obs[1], exp_color_dict)

    def test_plot_categorical_legend(self):
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        # we shouldn't have a legend yet
        assert_true(ax.get_legend() is None)

        self.min_ord_results._plot_categorical_legend(
            ax, {'foo': 'red', 'bar': 'green'})

        # make sure we have a legend now
        legend = ax.get_legend()
        assert_true(legend is not None)

        # do some light sanity checking to make sure our input labels and
        # colors are present. we're not using nose.tools.assert_items_equal
        # because it isn't available in Python 3.
        labels = [t.get_text() for t in legend.get_texts()]
        npt.assert_equal(sorted(labels), ['bar', 'foo'])

        colors = [line.get_color() for line in legend.get_lines()]
        npt.assert_equal(sorted(colors), ['green', 'red'])

    def test_repr_png(self):
        obs = self.min_ord_results._repr_png_()
        assert_is_instance(obs, binary_type)
        assert_true(len(obs) > 0)

    def test_repr_svg(self):
        obs = self.min_ord_results._repr_svg_()
        # print_figure(format='svg') can return text or bytes depending on the
        # version of IPython
        assert_true(isinstance(obs, (text_type, binary_type)))
        assert_true(len(obs) > 0)

    def test_png(self):
        assert_is_instance(self.min_ord_results.png, Image)

    def test_svg(self):
        assert_is_instance(self.min_ord_results.svg, SVG)
Example #33
0
class TestOrdinationResults(unittest.TestCase):
    def setUp(self):
        # Fixture 1: in-memory CA results to serialize and deserialize. Only
        # eigvals, species, site and both id lists are populated; the
        # remaining sections are explicitly None.
        ca_eigvals = np.array([0.0961330159181, 0.0409418140138])
        ca_species = np.array([[0.408869425742, 0.0695518116298],
                               [-0.1153860437, -0.299767683538],
                               [-0.309967102571, 0.187391917117]])
        ca_site = np.array([[-0.848956053187, 0.882764759014],
                            [-0.220458650578, -1.34482000302],
                            [1.66697179591, 0.470324389808]])
        self.ordination_results = OrdinationResults(
            eigvals=ca_eigvals,
            species=ca_species,
            site=ca_site,
            biplot=None,
            site_constraints=None,
            proportion_explained=None,
            species_ids=['Species1', 'Species2', 'Species3'],
            site_ids=['Site1', 'Site2', 'Site3'])

        # Fixture 2: DataFrame for testing the plot method.
        # - 'categorical': a mix of numbers and strings
        # - 'numeric': ints, floats, and strings convertible to floats
        # - 'nancolumn': numeric with missing data (np.nan)
        metadata_rows = [
            ['foo', '42', 10],
            [22, 0, 8],
            [22, -4.2, np.nan],
            ['foo', '42.19', 11],
        ]
        self.df = pd.DataFrame(
            metadata_rows,
            index=['A', 'B', 'C', 'D'],
            columns=['categorical', 'numeric', 'nancolumn'])

        # Fixture 3: minimal ordination results for easier testing of the
        # plotting method; its site ids match the index of self.df.
        min_eigvals = np.array([0.50, 0.25, 0.25])
        min_site = np.array([[0.1, 0.2, 0.3],
                             [0.2, 0.3, 0.4],
                             [0.3, 0.4, 0.5],
                             [0.4, 0.5, 0.6]])
        self.min_ord_results = OrdinationResults(
            eigvals=min_eigvals, site=min_site,
            site_ids=['A', 'B', 'C', 'D'])

    def test_deprecated_io(self):
        # Round trip through to_file / from_file; both calls are expected to
        # emit a UserWarning.
        buf = StringIO()
        npt.assert_warns(UserWarning, self.ordination_results.to_file, buf)

        # rewind so the reader starts at the beginning of what was written
        buf.seek(0)
        roundtripped = npt.assert_warns(
            UserWarning, OrdinationResults.from_file, buf)

        assert_ordination_results_equal(roundtripped,
                                        self.ordination_results)
        # exact type match, not merely an instance of a subclass
        self.assertTrue(type(roundtripped) == OrdinationResults)

    def check_basic_figure_sanity(self, fig, exp_num_subplots, exp_title,
                                  exp_legend_exists, exp_xlabel, exp_ylabel,
                                  exp_zlabel):
        # Shared assertions for figures returned by plot(); not a test
        # itself. Title, tick labels, legend and axis labels are all checked
        # on the first axes of the figure.

        # figure type
        assert_is_instance(fig, mpl.figure.Figure)

        # number of subplots
        all_axes = fig.get_axes()
        npt.assert_equal(len(all_axes), exp_num_subplots)

        # title
        first_ax = all_axes[0]
        npt.assert_equal(first_ax.get_title(), exp_title)

        # tick labels on all three axes must be empty
        tick_labels = (first_ax.get_xticklabels() +
                       first_ax.get_yticklabels() +
                       first_ax.get_zticklabels())
        for label in tick_labels:
            npt.assert_equal(label.get_text(), '')

        # legend presence must match the expectation
        legend = first_ax.get_legend()
        assert_true((legend is not None) if exp_legend_exists
                    else (legend is None))

        # axis labels, in x, y, z order
        label_checks = ((first_ax.get_xlabel, exp_xlabel),
                        (first_ax.get_ylabel, exp_ylabel),
                        (first_ax.get_zlabel, exp_zlabel))
        for get_label, expected in label_checks:
            npt.assert_equal(get_label(), expected)

    def test_plot_no_metadata(self):
        # Default plot: one subplot, empty title, no legend, and axis labels
        # '0', '1', '2'.
        self.check_basic_figure_sanity(
            self.min_ord_results.plot(), 1, '', False, '0', '1', '2')

    def test_plot_with_numeric_metadata_and_plot_options(self):
        # Color by the 'numeric' column with reordered axes, custom axis
        # labels, a title and a colormap.
        plot_options = dict(axes=(1, 0, 2),
                            axis_labels=['PC 2', 'PC 1', 'PC 3'],
                            title='a title',
                            cmap='Reds')
        figure = self.min_ord_results.plot(self.df, 'numeric',
                                           **plot_options)

        # two subplots and no legend are expected here
        self.check_basic_figure_sanity(
            figure, 2, 'a title', False, 'PC 2', 'PC 1', 'PC 3')

    def test_plot_with_categorical_metadata_and_plot_options(self):
        # Color by the 'categorical' column with reordered axes, a title and
        # a colormap; no custom axis labels are given.
        plot_options = dict(axes=[2, 0, 1], title='a title', cmap='Accent')
        figure = self.min_ord_results.plot(self.df, 'categorical',
                                           **plot_options)

        # one subplot with a legend; labels follow the axes order
        self.check_basic_figure_sanity(
            figure, 1, 'a title', True, '2', '0', '1')

    def test_plot_with_invalid_axis_labels(self):
        # Four labels are supplied for three axes; the error message is
        # expected to mention axis_labels and the number 4.
        too_many_labels = ('a', 'b', 'c', 'd')
        with assert_raises_regexp(ValueError, 'axis_labels.*4'):
            self.min_ord_results.plot(axes=[2, 0, 1],
                                      axis_labels=too_many_labels)

    def test_validate_plot_axes_valid_input(self):
        # Valid input must not raise. The call returns nothing, so there is
        # no value to assert on.
        ordination = self.min_ord_results
        ordination._validate_plot_axes(ordination.site.T, (1, 2, 0))

    def test_validate_plot_axes_invalid_input(self):
        # Each invalid input must raise ValueError with a matching message.
        # Patterns containing backslash escapes are raw strings so that the
        # escapes reach the regex engine without relying on Python's
        # handling of unrecognized string escapes.

        # not enough dimensions
        with assert_raises_regexp(ValueError, r'2 dimension\(s\)'):
            self.min_ord_results._validate_plot_axes(
                np.asarray([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]), (0, 1, 2))

        coord_matrix = self.min_ord_results.site.T

        # wrong number of axes
        with assert_raises_regexp(ValueError, 'exactly three.*found 0'):
            self.min_ord_results._validate_plot_axes(coord_matrix, [])
        with assert_raises_regexp(ValueError, 'exactly three.*found 4'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 1, 2, 3))

        # duplicate axes
        with assert_raises_regexp(ValueError, 'must be unique'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, 1, 0))

        # out of range axes
        with assert_raises_regexp(ValueError, r'axes\[1\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, -1, 2))
        with assert_raises_regexp(ValueError, r'axes\[2\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, 2, 3))

    def test_get_plot_point_colors_invalid_input(self):
        # Every invalid combination below must raise ValueError.
        get_colors = self.min_ord_results._get_plot_point_colors

        # column provided without df, then df provided without column
        for df, column in ((None, 'numeric'), (self.df, None)):
            with npt.assert_raises(ValueError):
                get_colors(df, column, ['B', 'C'], 'jet')

        # (expected message pattern, column, ids), in order:
        # column not in df, id not in df, missing data in df
        message_cases = (
            ('missingcol', 'missingcol', ['B', 'C']),
            ('numeric', 'numeric', ['B', 'C', 'missingid', 'A']),
            ('nancolumn', 'nancolumn', ['B', 'C', 'A']),
        )
        for pattern, column, ids in message_cases:
            with assert_raises_regexp(ValueError, pattern):
                get_colors(self.df, column, ids, 'jet')

    def test_get_plot_point_colors_no_df_or_column(self):
        # With neither a DataFrame nor a column, a pair of Nones is expected.
        point_colors = self.min_ord_results._get_plot_point_colors(
            None, None, ['B', 'C'], 'jet')
        npt.assert_equal(point_colors, (None, None))

    def test_get_plot_point_colors_numeric_column(self):
        # For a numeric column the first element holds the values in the
        # order of the requested ids, and the second element is None.
        cases = (
            # subset of the ids in df
            (['B', 'C', 'A'], [0.0, -4.2, 42.0]),
            # all ids in df
            (['B', 'A', 'D', 'C'], [0.0, 42.0, 42.19, -4.2]),
        )
        for ids, expected in cases:
            observed = self.min_ord_results._get_plot_point_colors(
                self.df, 'numeric', ids, 'jet')
            npt.assert_almost_equal(observed[0], expected)
            assert_true(observed[1] is None)

    def test_get_plot_point_colors_categorical_column(self):
        """A categorical column yields one RGBA color per id plus a
        category-to-color dict that is the same for every id selection."""
        color_for_22 = [0., 0., 0.5, 1.]
        color_for_foo = [0.5, 0., 0., 1.]
        exp_color_dict = {'foo': color_for_foo, 22: color_for_22}

        cases = [
            # subset of the ids in df
            (['B', 'C', 'A'],
             [color_for_22, color_for_22, color_for_foo]),
            # all ids in df
            (['B', 'A', 'D', 'C'],
             [color_for_22, color_for_foo, color_for_foo, color_for_22]),
        ]
        for ids, exp_colors in cases:
            obs = self.min_ord_results._get_plot_point_colors(
                self.df, 'categorical', ids, 'jet')
            npt.assert_almost_equal(obs[0], exp_colors)
            # the color dict does not depend on which ids were requested
            npt.assert_equal(obs[1], exp_color_dict)

    def test_plot_categorical_legend(self):
        """``_plot_categorical_legend`` adds a legend carrying the given
        labels and colors to the axes."""
        fig = plt.figure()
        try:
            ax = fig.add_subplot(111, projection='3d')

            # we shouldn't have a legend yet
            assert_true(ax.get_legend() is None)

            self.min_ord_results._plot_categorical_legend(ax, {
                'foo': 'red',
                'bar': 'green'
            })

            # make sure we have a legend now
            legend = ax.get_legend()
            assert_true(legend is not None)

            # do some light sanity checking to make sure our input labels
            # and colors are present. we're not using
            # nose.tools.assert_items_equal because it isn't available in
            # Python 3.
            labels = [text.get_text() for text in legend.get_texts()]
            npt.assert_equal(sorted(labels), ['bar', 'foo'])

            colors = [line.get_color() for line in legend.get_lines()]
            npt.assert_equal(sorted(colors), ['green', 'red'])
        finally:
            # pyplot keeps every figure alive until it is closed; close it
            # even when an assertion fails so figures do not pile up across
            # the test run.
            plt.close(fig)

    def test_repr_png(self):
        """``_repr_png_`` returns a non-empty binary string."""
        png_data = self.min_ord_results._repr_png_()
        assert_is_instance(png_data, binary_type)
        assert_true(len(png_data) > 0)

    def test_repr_svg(self):
        """``_repr_svg_`` returns a non-empty text string."""
        svg_text = self.min_ord_results._repr_svg_()
        assert_is_instance(svg_text, text_type)
        assert_true(len(svg_text) > 0)

    def test_png(self):
        """The ``png`` attribute is an ``Image`` instance."""
        png = self.min_ord_results.png
        assert_is_instance(png, Image)

    def test_svg(self):
        """The ``svg`` attribute is an ``SVG`` instance."""
        svg = self.min_ord_results.svg
        assert_is_instance(svg, SVG)
    def get_pair_cmds(self, omics_pairs):
        """Build the command string for every entry of ``self.mmvec_res``.

        For each entry, the biplot commands (highlighted, regular and
        "crowded"), the xmmvec commands and the paired-heatmaps command are
        concatenated into one string that is appended to
        ``self.cmds[pair]``.

        Parameters
        ----------
        omics_pairs
            Passed through to ``get_order_omics``.

        Returns
        -------
        list
            The first value returned by ``get_pc_sb_correlations`` for each
            entry of ``self.mmvec_res``, in iteration order.
        """
        crowdeds = [0, 1]
        pc_sb_correlations = []
        for keys, values in self.mmvec_res.items():
            # `sams` and `mmvec` are part of the key but are not used here.
            pair, case, omic1, omic2, filt1, filt2, sams, mmvec = keys
            ranks_fp, ordi_fp, meta_fp, omic1_common, omic2_common = values
            order_omics = get_order_omics(omic1, omic2, filt1, filt2, case,
                                          omics_pairs)
            # Only the first six items are needed; the remaining ones
            # (previously bound to unused locals) are ignored.
            omic1, omic2 = order_omics[0], order_omics[1]
            filt1, filt2 = order_omics[2], order_omics[3]
            omic_feature, omic_sample = order_omics[4], order_omics[5]

            # get differentials
            meta1, meta_pd1, diff_cols1 = self.metas[(pair, case, omic1, filt1,
                                                      omic2, filt2)]
            meta2, meta_pd2, diff_cols2 = self.metas[(pair, case, omic2, filt2,
                                                      omic1, filt1)]
            # features are biplot, samples are dots
            ordi = OrdinationResults.read(ordi_fp)
            cur_pc_sb_correlations, max_r = get_pc_sb_correlations(
                pair, case, ordi, omic1, omic2, filt1, filt2, diff_cols1,
                meta_pd1, diff_cols2, meta_pd2, meta_fp, omic1_common,
                omic2_common, ranks_fp)
            pc_sb_correlations.append(cur_pc_sb_correlations)

            # Collect the command fragments and join them once at the end
            # instead of growing a string with repeated `+=`.
            cmd_parts = []
            if pair in self.highlights:
                pair_highlights = self.highlights[pair]
                for highlight, regexes_list in pair_highlights.items():
                    n_edit, meta_edit, ordi_edit_fp = edit_ordi_qzv(
                        ordi, ordi_fp, highlight, regexes_list, meta1,
                        meta_pd1)
                    if n_edit:
                        qza, qzv = get_qzs(ordi_edit_fp)
                        cmd_parts.append(get_biplot_commands(
                            ordi_edit_fp, qza, qzv, omic_feature, omic_sample,
                            meta_edit, meta2, n_edit, max_r))

            # Regular biplots are built from the unedited ordination.
            ordi_edit_fp = ordi_fp
            qza, qzv = get_qzs(ordi_edit_fp)
            for crowded in crowdeds:
                if crowded:
                    # "crowded" biplot: show every feature of the ordination
                    n_ordi_feats = ordi.features.shape[0]
                    qzv = qzv.replace('.qzv', '_crowded.qzv')
                else:
                    n_ordi_feats = 15
                    # Heatmap commands are currently disabled:
                    # heat_qza, heat_qzv = get_heatmap_qzs(ranks_fp)
                    # cmd += get_heatmap_commands(
                    #     ranks_fp, heat_qza, heat_qzv, meta1,
                    #     meta2, meta_pd1, meta_pd2)
                cmd_parts.append(get_biplot_commands(
                    ordi_edit_fp, qza, qzv, omic_feature, omic_sample, meta1,
                    meta2, n_ordi_feats, max_r))
            cmd_parts.append(get_xmmvec_commands(
                ordi_edit_fp, omic1, omic2, meta1, meta2, self.xmmvecs, pair))

            topn = 5
            # NOTE: `features_names` is always empty here, so the "custom"
            # output name below is currently never selected.
            features_names = []
            ranks_base = splitext(ranks_fp)[0]
            if features_names:
                heat = '%s_paired_heatmaps_custom.qzv' % ranks_base
            else:
                heat = '%s_paired_heatmaps_top%s.qzv' % (ranks_base, topn)
            cmd_parts.append(get_paired_heatmaps_command(
                ranks_fp, omic1_common, omic2_common, meta1, features_names,
                topn, heat))
            self.cmds.setdefault(pair, []).append(''.join(cmd_parts))
        return pc_sb_correlations
def get_procrustes_results(coords_f1, coords_f2, sample_id_map=None,
                           randomize=None, max_dimensions=None,
                           get_eigenvalues=get_mean_eigenvalues,
                           get_percent_variation_explained=get_mean_percent_variation):
    """Fit two ordinations to each other with a Procrustes analysis.

    Parameters
    ----------
    coords_f1, coords_f2
        Inputs passed to ``OrdinationResults.read`` to load the two
        ordinations.
    sample_id_map : optional
        If truthy, the sample IDs of both ordinations are translated with
        ``map_sample_ids`` before being matched.
    randomize : callable, optional
        If given, it is called on the (reordered) coordinates of the second
        ordination and its return value is used in their place.
    max_dimensions : int, optional
        If truthy, coordinates are filtered with ``filter_coords_matrix``
        and eigenvalues / proportions explained are truncated to this many
        entries. Otherwise the shorter eigenvalue / proportion vectors are
        zero-padded to the length of the longer ones.
    get_eigenvalues, get_percent_variation_explained : callable
        Called with the two eigenvalue (respectively proportion explained)
        vectors; the results are stored on both transformed ordinations.

    Returns
    -------
    tuple
        ``(transformed_coords1, transformed_coords2, m_squared,
        randomized_coords2)``. The first two are ``OrdinationResults``
        holding the matrices returned by ``procrustes``, ``m_squared`` is
        the third value returned by ``procrustes`` and
        ``randomized_coords2`` is an ``OrdinationResults`` with the
        randomized coordinates, or ``None`` when ``randomize`` is not given.

    Raises
    ------
    ValueError
        If the two ordinations have no sample ID in common.
    """
    # Parse the PCoA files
    ord_res_1 = OrdinationResults.read(coords_f1)
    ord_res_2 = OrdinationResults.read(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # Only the samples present in both ordinations can be compared. Check
    # for an empty overlap *before* reordering so that no work is done on
    # (and no confusing error comes from) empty coordinate matrices.
    order = list(set(sample_ids1) & set(sample_ids2))
    if not order:
        raise ValueError('No overlapping samples in the two files')

    # rearrange both coordinate matrices to the same sample order
    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(eigvals=eigvals2,
                                               proportion_explained=pct_var2,
                                               site=coords2,
                                               site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        # Zero-pad the shorter vectors so both ordinations describe the
        # same number of axes.
        if len(pct_var1) > len(pct_var2):
            pct_var2 = append(pct_var2, zeros(len(pct_var1) - len(pct_var2)))
            eigvals2 = append(eigvals2, zeros(len(eigvals1) - len(eigvals2)))
        elif len(pct_var1) < len(pct_var2):
            pct_var1 = append(pct_var1, zeros(len(pct_var2) - len(pct_var1)))
            eigvals1 = append(eigvals1, zeros(len(eigvals2) - len(eigvals1)))

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared =\
        procrustes(coords1, coords2)

    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m1),
                                            site_ids=order)
    transformed_coords2 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m2),
                                            site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2,
            m_squared, randomized_coords2)
    def setUp(self):
        """Create two PCoA fixtures and save each one as a ``.qza`` file."""
        super().setUp()
        axis_labels = ['PC1', 'PC2', 'PC3']
        self.resources = ResourceManager()
        self.fh1 = self.create_tempfile(suffix='.qza')
        self.fh2 = self.create_tempfile(suffix='.qza')
        self.pcoa_path1 = self.fh1.name
        self.pcoa_path2 = self.fh2.name

        # Sample coordinates: two samples for the first ordination, four
        # for the second.
        coords1 = {
            's1': [0.1, 0.2, 7],
            's2': [0.9, 0.2, 7],
        }
        coords2 = {
            's1': [0.1, 0.2, 7],
            's2': [0.9, 0.2, 7],
            's3': [0.2, -0.3, 0],
            's4': [0.111, -4, 0.2],
        }
        self.test_df1 = pd.DataFrame.from_dict(
            coords1, orient='index', columns=axis_labels)
        self.test_df1.index.name = 'Sample ID'
        self.test_df2 = pd.DataFrame.from_dict(
            coords2, orient='index', columns=axis_labels)
        self.test_df2.index.name = 'Sample ID'

        self.pcoa1 = OrdinationResults(
            'pcoa1',
            'pcoa1',
            eigvals=pd.Series([7, 2, 1], index=axis_labels),
            samples=self.test_df1,
            proportion_explained=pd.Series([0.7, 0.2, 0.1],
                                           index=axis_labels),
        )
        self.pcoa2 = OrdinationResults(
            'pcoa2',
            'pcoa2',
            eigvals=pd.Series([6, 3, 1], index=axis_labels),
            samples=self.test_df2,
            proportion_explained=pd.Series([0.6, 0.3, 0.1],
                                           index=axis_labels),
        )

        # Import each ordination as an artifact and write it to its
        # temporary path.
        fixtures = (
            (self.pcoa1, self.pcoa_path1),
            (self.pcoa2, self.pcoa_path2),
        )
        for ordination, path in fixtures:
            Artifact.import_data("PCoAResults", ordination).save(path)
Example #37
0
def get_procrustes_results(coords_f1, coords_f2, sample_id_map=None,
                           randomize=None, max_dimensions=None,
                           get_eigenvalues=get_mean_eigenvalues,
                           get_percent_variation_explained=get_mean_percent_variation):
    """Fit two ordinations to each other with a Procrustes analysis.

    Parameters
    ----------
    coords_f1, coords_f2
        Inputs passed to ``OrdinationResults.read`` to load the two
        ordinations.
    sample_id_map : optional
        If truthy, the sample IDs of both ordinations are translated with
        ``map_sample_ids`` before being matched.
    randomize : callable, optional
        If given, it is called on the (reordered) coordinates of the second
        ordination and its return value is used in their place.
    max_dimensions : int, optional
        If truthy, coordinates are filtered with ``filter_coords_matrix``
        and eigenvalues / proportions explained are truncated to this many
        entries. Otherwise the shorter eigenvalue / proportion vectors are
        zero-padded to the length of the longer ones.
    get_eigenvalues, get_percent_variation_explained : callable
        Called with the two eigenvalue (respectively proportion explained)
        vectors; the results are stored on both transformed ordinations.

    Returns
    -------
    tuple
        ``(transformed_coords1, transformed_coords2, m_squared,
        randomized_coords2)``. The first two are ``OrdinationResults``
        holding the matrices returned by ``procrustes``, ``m_squared`` is
        the third value returned by ``procrustes`` and
        ``randomized_coords2`` is an ``OrdinationResults`` with the
        randomized coordinates, or ``None`` when ``randomize`` is not given.

    Raises
    ------
    ValueError
        If the two ordinations have no sample ID in common.
    """
    # Parse the PCoA files
    ord_res_1 = OrdinationResults.read(coords_f1)
    ord_res_2 = OrdinationResults.read(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # Only the samples present in both ordinations can be compared. Check
    # for an empty overlap *before* reordering so that no work is done on
    # (and no confusing error comes from) empty coordinate matrices.
    order = list(set(sample_ids1) & set(sample_ids2))
    if not order:
        raise ValueError('No overlapping samples in the two files')

    # rearrange both coordinate matrices to the same sample order
    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(eigvals=eigvals2,
                                               proportion_explained=pct_var2,
                                               site=coords2,
                                               site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        # Zero-pad the shorter vectors so both ordinations describe the
        # same number of axes.
        if len(pct_var1) > len(pct_var2):
            pct_var2 = append(pct_var2, zeros(len(pct_var1) - len(pct_var2)))
            eigvals2 = append(eigvals2, zeros(len(eigvals1) - len(eigvals2)))
        elif len(pct_var1) < len(pct_var2):
            pct_var1 = append(pct_var1, zeros(len(pct_var2) - len(pct_var1)))
            eigvals1 = append(eigvals1, zeros(len(eigvals2) - len(eigvals1)))

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared =\
        procrustes(coords1, coords2)

    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m1),
                                            site_ids=order)
    transformed_coords2 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m2),
                                            site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2,
            m_squared, randomized_coords2)
Example #38
0
def load_mp_data(use_artifact_api=True, is_empire=True):
    """Load the QIIME 2 moving pictures tutorial data used for visualization.

    The files are read from the directory named by the PREFIX_DIR global
    (docs/moving-pictures/), which is resolved relative to where this
    function is run from. If that directory or the files in it cannot be
    accessed, this function will (probably) break.

    Parameters
    ----------
    use_artifact_api: bool, optional (default True)
        If True, the artifacts are loaded through the QIIME 2 Artifact API
        and the returned objects have the first type listed for each item
        below (before the | character).
        If False, the artifacts are extracted and parsed without QIIME 2's
        APIs and the returned objects have the second listed type (after
        the | character).
    is_empire: bool, optional (default True)
        If True, an ordination is returned.
        If False, None is returned in place of the ordination.

    Returns
    -------
    (tree, table, md, fmd, pcoa)
        tree: qiime2.Artifact | skbio.tree.TreeNode
            Phylogenetic tree.
        table: qiime2.Artifact | biom.Table
            Feature table.
        md: qiime2.Metadata | pandas.DataFrame
            Sample metadata.
        fmd: qiime2.Metadata | pandas.DataFrame
            Feature metadata. (It is stored in the repository as a
            FeatureData[Taxonomy] artifact and is transformed to Metadata
            when use_artifact_api is True.)
        pcoa: qiime2.Artifact | skbio.OrdinationResults | None
            Ordination, or None when is_empire is False.
    """
    def in_prefix_dir(filename):
        # Location of a data file inside PREFIX_DIR.
        return os.path.join(PREFIX_DIR, filename)

    q2_tree_loc = in_prefix_dir("rooted-tree.qza")
    q2_table_loc = in_prefix_dir("table.qza")
    q2_pcoa_loc = in_prefix_dir("unweighted_unifrac_pcoa_results.qza")
    q2_tax_loc = in_prefix_dir("taxonomy.qza")
    md_loc = in_prefix_dir("sample_metadata.tsv")

    if use_artifact_api:
        from qiime2 import Artifact, Metadata

        tree = Artifact.load(q2_tree_loc)
        table = Artifact.load(q2_table_loc)
        pcoa = None
        if is_empire:
            pcoa = Artifact.load(q2_pcoa_loc)
        md = Metadata.load(md_loc)
        # The taxonomy QZA has to be viewed as Metadata explicitly.
        fmd = Artifact.load(q2_tax_loc).view(Metadata)
        return tree, table, md, fmd, pcoa

    import biom
    import pandas as pd
    from skbio.stats.ordination import OrdinationResults
    from skbio.tree import TreeNode

    # Extract the needed file of each artifact into a scratch directory and
    # parse it while that directory still exists.
    with tempfile.TemporaryDirectory() as _tmp:
        tree = TreeNode.read(
            extract_q2_artifact_to_path(_tmp, q2_tree_loc, "tree.nwk"))
        table = biom.load_table(
            extract_q2_artifact_to_path(_tmp, q2_table_loc,
                                        "feature-table.biom"))
        pcoa = None
        if is_empire:
            pcoa = OrdinationResults.read(
                extract_q2_artifact_to_path(_tmp, q2_pcoa_loc,
                                            "ordination.txt"))
        fmd = pd.read_csv(
            extract_q2_artifact_to_path(_tmp, q2_tax_loc, "taxonomy.tsv"),
            sep="\t", index_col=0)
        md = pd.read_csv(md_loc, sep="\t", index_col=0, skiprows=[1])
    return tree, table, md, fmd, pcoa
Example #39
0
if __name__ == '__main__':
    # Script entry point: read the command line options, load the
    # ordination results and the mapping file, then validate the requested
    # categories against the mapping file columns.
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ord_fp = opts.input_fp
    mapping_fp = opts.map_fp
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.read(f)

    # Parse the mapping file
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')

    for category in categories:
        if category not in metamap.keys():
            # Report the single offending category, not the whole list of
            # requested categories.
            option_parser.error("Category %s does not exist in the mapping "
                                "file" % category)

    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            sort_category = None
Example #40
0
    def setUp(self):
        """Build the expected in-memory ``OrdinationResults`` objects."""
        super(OrdinationResultsReaderWriterTests, self).setUp()

        # In-memory results, one for each of the valid files in
        # self.valid_fps, stored in the order CA, CCA, PCoA, RDA.

        # CA results
        ca_scores = OrdinationResults(
            eigvals=np.array([0.0961330159181, 0.0409418140138]),
            species=np.array([[0.408869425742, 0.0695518116298],
                              [-0.1153860437, -0.299767683538],
                              [-0.309967102571, 0.187391917117]]),
            site=np.array([[-0.848956053187, 0.882764759014],
                           [-0.220458650578, -1.34482000302],
                           [1.66697179591, 0.470324389808]]),
            biplot=None,
            site_constraints=None,
            proportion_explained=None,
            species_ids=['Species1', 'Species2', 'Species3'],
            site_ids=['Site1', 'Site2', 'Site3'])

        # CCA results
        cca_scores = OrdinationResults(
            eigvals=np.array([
                0.366135830393, 0.186887643052, 0.0788466514249,
                0.082287840501, 0.0351348475787, 0.0233265839374,
                0.0099048981912, 0.00122461669234, 0.000417454724117
            ]),
            species=np.loadtxt(
                get_data_path('ordres_exp_OrdRes_CCA_species')),
            site=np.loadtxt(get_data_path('ordres_exp_OrdRes_CCA_site')),
            biplot=np.array(
                [[-0.169746767979, 0.63069090084, 0.760769036049],
                 [-0.994016563505, 0.0609533148724, -0.0449369418179],
                 [0.184352565909, -0.974867543612, 0.0309865007541]]),
            site_constraints=np.loadtxt(
                get_data_path('ordres_exp_OrdRes_CCA_site_constraints')),
            proportion_explained=None,
            # 'Species0' .. 'Species8'
            species_ids=['Species%d' % i for i in range(9)],
            # 'Site0' .. 'Site9'
            site_ids=['Site%d' % i for i in range(10)])

        # PCoA results
        pcoa_scores = OrdinationResults(
            eigvals=np.array([
                0.512367260461, 0.300719094427, 0.267912066004,
                0.208988681078, 0.19169895326, 0.16054234528,
                0.15017695712, 0.122457748167, 0.0
            ]),
            species=None,
            site=np.loadtxt(get_data_path('ordres_exp_OrdRes_PCoA_site')),
            biplot=None,
            site_constraints=None,
            proportion_explained=np.array([
                0.267573832777, 0.15704469605, 0.139911863774,
                0.109140272454, 0.100111048503, 0.0838401161912,
                0.0784269939011, 0.0639511763509, 0.0
            ]),
            species_ids=None,
            site_ids=[
                'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
                'PC.355', 'PC.607', 'PC.634'
            ])

        # RDA results
        rda_scores = OrdinationResults(
            eigvals=np.array([
                25.8979540892, 14.9825779819, 8.93784077262, 6.13995623072,
                1.68070536498, 0.57735026919, 0.275983624351
            ]),
            species=np.loadtxt(
                get_data_path('ordres_exp_OrdRes_RDA_species')),
            site=np.loadtxt(get_data_path('ordres_exp_OrdRes_RDA_site')),
            biplot=np.array(
                [[0.422650019179, -0.559142585857, -0.713250678211],
                 [0.988495963777, 0.150787422017, -0.0117848614073],
                 [-0.556516618887, 0.817599992718, 0.147714267459],
                 [-0.404079676685, -0.9058434809, -0.127150316558]]),
            site_constraints=np.loadtxt(
                get_data_path('ordres_exp_OrdRes_RDA_site_constraints')),
            proportion_explained=None,
            # 'Species0' .. 'Species5'
            species_ids=['Species%d' % i for i in range(6)],
            # 'Site0' .. 'Site9'
            site_ids=['Site%d' % i for i in range(10)])

        self.ordination_results_objs = [
            ca_scores, cca_scores, pcoa_scores, rda_scores
        ]
Example #41
0
 def setUp(self):
     """Build the fixtures shared by the tests.

     Sets ``self.ord_res`` (an ``OrdinationResults`` built from literal
     eigenvalues, site coordinates, proportions explained and site ids),
     ``self.metadata_map`` (a ``DataFrame`` indexed by the same ids),
     ``self.categories`` and ``self.sort_by``.
     """
     # Nine eigenvalues; the site matrix below is 9 x 9.
     eigvals = np.array([0.512367260461, 0.300719094427, 0.267912066004,
                         0.208988681078, 0.19169895326, 0.16054234528,
                         0.15017695712, 0.122457748167, 0.0])
     # presumably one row of coordinates per entry of site_ids, in the
     # same order -- TODO confirm
     site = np.array([[-0.212230626531, 0.216034194368, 0.03532727349,
                       -0.254450494129, -0.0687468542543, 0.231895596562,
                       0.00496549154314, -0.0026246871695,
                       9.73837390723e-10],
                      [-0.277487312135, -0.0295483215975, -0.0744173437992,
                       0.0957182357964, 0.204714844022, -0.0055407341857,
                       -0.190287966833, 0.16307126638, 9.73837390723e-10],
                      [0.220886492631, 0.0874848360559, -0.351990132198,
                       -0.00316535032886, 0.114635191853, -0.00019194106125,
                       0.188557853937, 0.030002427212, 9.73837390723e-10],
                      [0.0308923744062, -0.0446295973489, 0.133996451689,
                       0.29318228566, -0.167812539312, 0.130996149793,
                       0.113551017379, 0.109987942454, 9.73837390723e-10],
                      [0.27616778138, -0.0341866951102, 0.0633000238256,
                       0.100446653327, 0.123802521199, 0.1285839664,
                       -0.132852841046, -0.217514322505, 9.73837390723e-10],
                      [0.202458130052, -0.115216120518, 0.301820871723,
                       -0.18300251046, 0.136208248567, -0.0989435556722,
                       0.0927738484879, 0.0909429797672, 9.73837390723e-10],
                      [0.236467470907, 0.21863434374, -0.0301637746424,
                       -0.0225473129718, -0.205287183891, -0.180224615141,
                       -0.165277751908, 0.0411933458557, 9.73837390723e-10],
                      [-0.105517545144, -0.41405687433, -0.150073017617,
                       -0.116066751485, -0.158763393475, -0.0223918378516,
                       -0.0263068046112, -0.0501209518091,
                       9.73837390723e-10],
                      [-0.371636765565, 0.115484234741, 0.0721996475289,
                       0.0898852445906, 0.0212491652909, -0.184183028843,
                       0.114877153051, -0.164938000185, 9.73837390723e-10]])
     # NOTE(review): these values look like percentages rather than
     # fractions -- confirm what the code under test expects.
     prop_expl = np.array([25.6216900347, 15.7715955926, 14.1215046787,
                           11.6913885817, 9.83044890697, 8.51253468595,
                           7.88775505332, 6.56308246609, 4.42499350906e-16])
     site_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
                 'PC.355', 'PC.607', 'PC.634']
     self.ord_res = OrdinationResults(eigvals=eigvals, site=site,
                                      proportion_explained=prop_expl,
                                      site_ids=site_ids)
     # Per-sample metadata keyed by the same ids as site_ids; every value
     # (including DOB and Weight) is stored as a string.
     metadata_map = {'PC.354': {'Treatment': 'Control',
                                'DOB': '20061218',
                                'Weight': '60',
                                'Description': 'Control_mouse_I.D._354'},
                     'PC.355': {'Treatment': 'Control',
                                'DOB': '20061218',
                                'Weight': '55',
                                'Description': 'Control_mouse_I.D._355'},
                     'PC.356': {'Treatment': 'Control',
                                'DOB': '20061126',
                                'Weight': '50',
                                'Description': 'Control_mouse_I.D._356'},
                     'PC.481': {'Treatment': 'Control',
                                'DOB': '20070314',
                                'Weight': '52',
                                'Description': 'Control_mouse_I.D._481'},
                     'PC.593': {'Treatment': 'Control',
                                'DOB': '20071210',
                                'Weight': '57',
                                'Description': 'Control_mouse_I.D._593'},
                     'PC.607': {'Treatment': 'Fast',
                                'DOB': '20071112',
                                'Weight': '65',
                                'Description': 'Fasting_mouse_I.D._607'},
                     'PC.634': {'Treatment': 'Fast',
                                'DOB': '20080116',
                                'Weight': '68',
                                'Description': 'Fasting_mouse_I.D._634'},
                     'PC.635': {'Treatment': 'Fast',
                                'DOB': '20080116',
                                'Weight': '70',
                                'Description': 'Fasting_mouse_I.D._635'},
                     'PC.636': {'Treatment': 'Fast',
                                'DOB': '20080116',
                                'Weight': '72',
                                'Description': 'Fasting_mouse_I.D._636'}}
     # orient='index' turns the outer keys (sample ids) into the row index
     self.metadata_map = pd.DataFrame.from_dict(metadata_map,
                                                orient='index')
     self.categories = ['Treatment']
     self.sort_by = 'Weight'
Example #42
0

def format_coords(coord_header, coords, eigvals, pct_var, headers=True):
    """Format a coordinates matrix as tab-delimited text.

    Parameters
    ----------
    coord_header : iterable of str
        Row names, one per row of ``coords``.
    coords : sequence of sequences
        Coordinates matrix; every value is converted with ``str``.
    eigvals : iterable
        Eigenvalues, written on the ``eigvals`` line.
    pct_var : iterable
        Values written on the ``% variation explained`` line.
    headers : bool, optional
        If True (default), write the ``pc vector number`` header line, the
        named rows, two blank lines, then the eigenvalue and percent
        variation lines. If False, write only the coordinate rows (without
        names) followed by a trailing newline.

    Returns
    -------
    str
        The formatted lines joined with newlines.
    """
    if not headers:
        rows = ['\t'.join(map(str, row)) for row in coords]
        rows.append('')
        return '\n'.join(rows)

    # An empty matrix has no first row to measure; emit a header without
    # axis numbers rather than failing on coords[0].
    n_axes = len(coords[0]) if len(coords) else 0
    result = ['pc vector number\t' +
              '\t'.join(map(str, range(1, n_axes + 1)))]
    for name, row in zip(coord_header, coords):
        # Build a real list: on Python 3 ``map`` returns an iterator, which
        # cannot be concatenated to a list with ``+``.
        result.append('\t'.join([name] + [str(value) for value in row]))
    result.append('')
    result.append('')
    result.append('eigvals\t' + '\t'.join(map(str, eigvals)))
    result.append('% variation explained\t' + '\t'.join(map(str, pct_var)))
    return '\n'.join(result)


if __name__ == "__main__":
    # Usage: <script> OLD_FILE NEW_FILE
    # Read the ordination results in OLD_FILE and write their site
    # coordinates, eigenvalues and proportions explained to NEW_FILE in the
    # layout produced by format_coords.
    old_file, new_file = argv[1], argv[2]
    with open(old_file, 'U') as infile, open(new_file, 'w') as outfile:
        res = OrdinationResults.from_file(infile)
        outfile.write(format_coords(res.site_ids, res.site, res.eigvals,
                                    res.proportion_explained))
Example #43
0
        distances[dataset_][(fold_, Nsamp_)]['Bray_Curtis'] = table_
        table_ = pd.read_table(os.path.join(subpath_, sub_set,
                                            'Robust_Aitchison_Distance.tsv'),
                               index_col=0,
                               low_memory=False)
        table_.index = table_.index.astype(str)
        table_.columns = table_.columns.astype(str)
        table_ = table_.reindex(index=index_me, columns=index_me)
        distances[dataset_][(fold_, Nsamp_)]['Robust_Aitchison'] = table_

        # ordination type file
        in_ord = os.path.join(subpath_, sub_set, 'RPCA_Ordination.txt')
        # get loadings from ordination files
        ordinations[dataset_][(
            fold_,
            Nsamp_)]['RPCA_Samples'] = OrdinationResults.read(in_ord).samples
        ordinations[dataset_][(
            fold_,
            Nsamp_)]['RPCA_Features'] = OrdinationResults.read(in_ord).features

# permanova analysis
from skbio import DistanceMatrix
from skbio.stats.distance import permanova

both_perm_res = {}
perm_res = {}
perm_res_tmp = {}
for dataset_, subs in distances.items():
    perm_res[dataset_] = {}
    perm_res_tmp[dataset_] = {}
    for (fold_, Nsamp_), methods_ in subs.items():