Exemple #1
0
 def test_errors(self):
     """Check that ``fit`` raises ``ValueError`` on several invalid inputs."""
     # test not array
     with self.assertRaises(ValueError):
         TensorFactorization().fit(list(range(10)))
     # test if none missing
     with self.assertRaises(ValueError):
         TensorFactorization().fit(np.ones((5, 5)))
     # test infinite values; operate on a copy so the ``self.TE``
     # fixture is not overwritten in place
     TE_errors = self.TE.copy()
     TE_errors[0, :, :] = np.inf
     with self.assertRaises(ValueError):
         TensorFactorization().fit(TE_errors)
     # test max rank: request more components than the largest dimension
     with self.assertRaises(ValueError):
         TensorFactorization(n_components=np.max(self.TE_noise.shape) +
                             10).fit(self.TE_noise)
 def test_TenAls_center(self):
     """Fit ``self.TE`` with ``center=True`` and bound the loading means."""
     # TensorFactorization no noise w/ centered
     fitted = TensorFactorization(center=True).fit(self.TE)
     # unpacking also requires exactly three loadings to be present
     mode1, mode2, _mode3 = fitted.loadings
     # each mean taken along axis 1 of the first two loadings must be < 1
     for loading in (mode1, mode2):
         for axis_mean in loading.mean(axis=1):
             self.assertLess(axis_mean, 1)
Exemple #3
0
 def test_TenAls_noiseless(self):
     """Fit ``self.TE`` with defaults and compare against ``U1``/``U2``/``U3``.

     For every index along the third mode, accumulates
     ``tr(A1'A1 A2'A2) + tr(B1'B1 B2'B2) - 2 tr(B1'A1 A2'B2)`` where the
     ``A`` factors come from the fixture and the ``B`` factors from the
     fitted loadings, then requires the absolute total to be below 1e2.
     """
     # TensorFactorization no noise
     fitted = TensorFactorization().fit(self.TE)
     est1, est2, est3 = fitted.loadings
     weights = fitted.eigvals
     # test accuracy
     total = 0
     for k in range(self.n3):
         ref_a = self.U1
         ref_b = np.matmul(self.U2, np.diag(self.U3[k, :]))
         est_a = est1
         est_b = np.matmul(est2,
                           np.diag(est3[k, :] * weights.T.flatten()))
         ref_term = np.trace(np.matmul(np.matmul(ref_a.T, ref_a),
                                       np.matmul(ref_b.T, ref_b)))
         est_term = np.trace(np.matmul(np.matmul(est_a.T, est_a),
                                       np.matmul(est_b.T, est_b)))
         cross_term = np.trace(np.matmul(np.matmul(est_a.T, ref_a),
                                         np.matmul(ref_b.T, est_b)))
         total += ref_term + est_term + -2 * cross_term
     self.assertTrue(abs(total) < 1e2)
Exemple #4
0
def ctf_helper(
    table: biom.Table,
    sample_metadata: DataFrame,
    individual_id_column: str,
    state_columns: list,
    n_components: int = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations_als: int = DEFAULT_MAXITER,
    max_iterations_rptm: int = DEFAULT_MAXITER,
    n_initializations: int = DEFAULT_MAXITER,
    feature_metadata: DataFrame = DEFFM
) -> (dict, OrdinationResults, dict, tuple):
    """Run Compositional Tensor Factorization (CTF).

    Matches the table against the metadata, filters it, builds a tensor,
    factorizes its ``rclr`` transform and packages the results.

    Parameters
    ----------
    table : biom.Table
        Feature table. NOTE: it is filtered with ``inplace=True``, so the
        caller's object is modified.
    sample_metadata : DataFrame
        Sample metadata; anything that is not a ``DataFrame`` is converted
        with ``.to_dataframe()``. Must contain ``individual_id_column`` and
        every entry of ``state_columns``.
    individual_id_column : str
        Metadata column passed to ``tensor.construct`` as the individual id.
    state_columns : list
        Metadata columns passed to ``tensor.construct`` as the states.
    n_components : int
        Number of components for ``TensorFactorization``. When it equals 2,
        a zero-filled ``PC3`` is appended to the outputs.
    min_sample_count : int
        Samples whose sum is below this value are removed.
    min_feature_count : int
        Features whose sum is below this value are removed.
    max_iterations_als : int
        Forwarded as ``max_als_iterations``.
    max_iterations_rptm : int
        Forwarded as ``max_rtpm_iterations``.
    n_initializations : int
        Forwarded as ``n_initializations``.
    feature_metadata : DataFrame, optional
        Feature metadata; converted with ``.to_dataframe()`` if needed and
        forwarded to ``TF.label`` as ``taxonomy``. May be ``None``.

    Returns
    -------
    tuple
        ``(state_ordn, subj_ordin, distances, subject_trajectories,
        feature_trajectories)``. ``subj_ordin`` is an ``OrdinationResults``;
        the other four are dicts keyed by the entries of
        ``tensor.conditions``. NOTE(review): the signature annotation lists
        only four items; it is left untouched here to keep the interface
        stable.

    Raises
    ------
    ValueError
        If the table has duplicate sample or feature IDs, or if the table
        shares no features with ``feature_metadata`` or no samples with
        ``sample_metadata``.
    """

    # validate the metadata using q2 as a wrapper
    if sample_metadata is not None and not isinstance(sample_metadata,
                                                      DataFrame):
        sample_metadata = sample_metadata.to_dataframe()
    keep_cols = state_columns + [individual_id_column]
    # everything except the id/state columns is merged back at the end
    all_sample_metadata = sample_metadata.drop(keep_cols, axis=1)
    sample_metadata = sample_metadata[keep_cols]
    # validate the metadata using q2 as a wrapper
    if feature_metadata is not None and not isinstance(feature_metadata,
                                                       DataFrame):
        feature_metadata = feature_metadata.to_dataframe()
    # match the data (borrowed in part from gneiss.util.match)
    subtablefids = table.ids('observation')
    subtablesids = table.ids('sample')
    if len(subtablesids) != len(set(subtablesids)):
        raise ValueError('Data-table contains duplicate sample IDs')
    if len(subtablefids) != len(set(subtablefids)):
        raise ValueError('Data-table contains duplicate feature IDs')
    submetadataids = set(sample_metadata.index)
    subtablesids = set(subtablesids)
    subtablefids = set(subtablefids)
    if feature_metadata is not None:
        submetadatafeat = set(feature_metadata.index)
        fidx = subtablefids & submetadatafeat
        if len(fidx) == 0:
            raise ValueError("No more features left.  Check to make "
                             "sure that the feature names between "
                             "`feature-metadata` and `table` are "
                             "consistent")
        feature_metadata = feature_metadata.reindex(fidx)
    sidx = subtablesids & submetadataids
    if len(sidx) == 0:
        raise ValueError("No more samples left.  Check to make sure that "
                         "the sample names between `sample-metadata` and"
                         " `table` are consistent")
    if feature_metadata is not None:
        table.filter(list(fidx), axis='observation', inplace=True)
    table.filter(list(sidx), axis='sample', inplace=True)
    sample_metadata = sample_metadata.reindex(sidx)

    # filter and import table
    for axis, min_sum in zip(['sample', 'observation'],
                             [min_sample_count, min_feature_count]):
        table = table.filter(table.ids(axis)[table.sum(axis) >= min_sum],
                             axis=axis,
                             inplace=True)

    # table to dataframe (rows are observations, columns are samples)
    table = DataFrame(table.matrix_data.toarray(), table.ids('observation'),
                      table.ids('sample'))

    # tensor building
    tensor = build()
    tensor.construct(table, sample_metadata, individual_id_column,
                     state_columns)

    # factorize
    TF = TensorFactorization(n_components=n_components,
                             max_als_iterations=max_iterations_als,
                             max_rtpm_iterations=max_iterations_rptm,
                             n_initializations=n_initializations).fit(
                                 rclr(tensor.counts))
    # label tensor loadings
    TF.label(tensor, taxonomy=feature_metadata)

    # if the n_components is two add PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    if n_components == 2:
        TF.subjects.loc[:, 'PC3'] = [0] * len(TF.subjects.index)
        TF.features.loc[:, 'PC3'] = [0] * len(TF.features.index)
        TF.proportion_explained['PC3'] = 0
        TF.eigvals['PC3'] = 0

    # save ordination results
    short_method_name = 'CTF_Biplot'
    long_method_name = 'Compositional Tensor Factorization Biplot'
    # only keep PC -- other tools merge metadata
    keep_PC = [col for col in TF.features.columns if 'PC' in col]
    subj_ordin = OrdinationResults(
        short_method_name,
        long_method_name,
        TF.eigvals,
        samples=TF.subjects[keep_PC].dropna(axis=0),
        features=TF.features[keep_PC].dropna(axis=0),
        proportion_explained=TF.proportion_explained)
    # save distance matrix for each condition
    distances = {}
    state_ordn = {}
    subject_trajectories = {}
    feature_trajectories = {}
    for condition, cond, dist, straj, ftraj in zip(tensor.conditions,
                                                   TF.conditions,
                                                   TF.subject_distances,
                                                   TF.subject_trajectory,
                                                   TF.feature_trajectory):
        # match distances to metadata: keep only the rows/columns whose
        # trajectory id is also present in the matched sample metadata
        ids = straj.index
        ind_dict = {ind: ind_i for ind_i, ind in enumerate(ids)}
        inter = set(ind_dict).intersection(sample_metadata.index)
        indices = sorted(ind_dict[ind] for ind in inter)
        dist = dist[indices, :][:, indices]
        distances[condition] = skbio.stats.distance.DistanceMatrix(
            dist, ids=ids[indices])
        # fix conditions
        if n_components == 2:
            cond['PC3'] = [0] * len(cond.index)
        cond = OrdinationResults(short_method_name,
                                 long_method_name,
                                 TF.eigvals,
                                 samples=cond[keep_PC].dropna(axis=0),
                                 features=TF.features[keep_PC].dropna(axis=0),
                                 proportion_explained=TF.proportion_explained)
        state_ordn[condition] = cond
        # add the sample metadata before returning output
        # additionally only keep metadata with trajectory
        # output available.
        pre_merge_cols = list(straj.columns)
        straj = concat(
            [straj.reindex(all_sample_metadata.index), all_sample_metadata],
            axis=1,
            sort=True)
        straj = straj.dropna(subset=pre_merge_cols)
        # ensure index name for q2
        straj.index.name = "#SampleID"
        # save traj.
        # Select the PC columns from the trajectory's own (pre-merge)
        # columns: a metadata column whose name happens to contain 'PC'
        # must not be mean-centered, nor looked up in ``ftraj``.
        keep_PC_traj = [col for col in pre_merge_cols if 'PC' in col]
        straj[keep_PC_traj] -= straj[keep_PC_traj].mean()
        ftraj[keep_PC_traj] -= ftraj[keep_PC_traj].mean()
        subject_trajectories[condition] = straj
        ftraj.index = ftraj.index.astype(str)
        feature_trajectories[condition] = ftraj
    return (state_ordn, subj_ordin, distances, subject_trajectories,
            feature_trajectories)
Exemple #5
0
 def test_TenAls_mode5_noiseless(self):
     """Fit ``self.TE5`` with default settings and unpack five loadings."""
     # TODO check values
     fitted = TensorFactorization().fit(self.TE5)
     # unpacking raises unless exactly five loadings are returned
     mode1, mode2, mode3, mode4, mode5 = fitted.loadings
Exemple #6
0
 def test_TenAls_mode4_noise(self):
     """Fit ``self.TE_noise4`` with default settings; unpack four loadings."""
     # TODO check values
     fitted = TensorFactorization().fit(self.TE_noise4)
     # unpacking raises unless exactly four loadings are returned
     mode1, mode2, mode3, mode4 = fitted.loadings
 def test_TenAls_mode5_noise(self):
     """Fit ``self.TE_noise5`` with ``center=False``; unpack five loadings."""
     # TODO check values
     fitted = TensorFactorization(center=False).fit(self.TE_noise5)
     # unpacking raises unless exactly five loadings are returned
     mode1, mode2, mode3, mode4, mode5 = fitted.loadings
 def test_TenAls_mode4_noiseless(self):
     """Fit ``self.TE4`` with ``center=False`` and unpack four loadings."""
     # TODO check values
     fitted = TensorFactorization(center=False).fit(self.TE4)
     # unpacking raises unless exactly four loadings are returned
     mode1, mode2, mode3, mode4 = fitted.loadings