Beispiel #1
0
    def test_props2(self):
        """Smoke test: plot cluster proportions after a Fisher's exact scan.

        Adds pairwise distances between random points to the peptide
        distances, tallies 'trait1' over the clustering (method='complete'),
        tests every cluster and writes the HTML returned by
        plot_hclust_props to hierdiff/tests/test_props2.html. The test only
        checks that the pipeline runs without raising.
        """
        dat, pw = generate_peptide_data()

        # Fixed seed so the random distances added below are reproducible.
        np.random.seed(110820)
        random_points = np.random.randn(dat.shape[0], 5)
        pw = distance.pdist(random_points) + pw

        # The sum is expanded with squareform before being passed as pwmat.
        square_pw = scipy.spatial.distance.squareform(pw)
        res, Z = hcluster_tally(dat,
                                pwmat=square_pw,
                                x_cols=['trait1'],
                                count_col='count',
                                method='complete')
        res = cluster_association_test(res, method='fishers')

        html = plot_hclust_props(Z,
                                 title='test_props2',
                                 res=res,
                                 alpha=0.05,
                                 alpha_col='pvalue')

        out_path = opj('hierdiff', 'tests', 'test_props2.html')
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write(html)

        self.assertTrue(True)
Beispiel #2
0
 def test_hier_chm(self):
     """Smoke test: tally two traits per cluster and run the 'chm' test.

     The distances from generate_peptide_data are expanded with squareform
     before being passed as pwmat. Nothing is asserted; the test passes
     when the pipeline runs without raising.
     """
     dat, pw = generate_peptide_data()
     square_pw = scipy.spatial.distance.squareform(pw)
     res, Z = hcluster_tally(dat,
                             pwmat=square_pw,
                             x_cols=['trait1', 'trait2'],
                             count_col='count',
                             method='complete')
     res = cluster_association_test(res, method='chm')
Beispiel #3
0
 def test_hier_chm(self):
     """Smoke test: tally two traits per cluster and run the 'chm' test.

     The distances returned by generate_peptide_data are passed to
     hcluster_tally unchanged as pwmat. Nothing is asserted; the test
     passes when the pipeline runs without raising.
     """
     dat, pw = generate_peptide_data()
     tally_kwargs = dict(pwmat=pw,
                         x_cols=['trait1', 'trait2'],
                         count_col='count',
                         method='complete')
     res, Z = hcluster_tally(dat, **tally_kwargs)
     res = cluster_association_test(res, method='chm')
Beispiel #4
0
 def test_nn_fishers(self):
     """Smoke test: radius-based neighborhood tally followed by 'fishers'.

     Neighborhoods are requested with knn_radius=3 and knn_neighbors=None.
     The tally is joined onto the input data before testing. Nothing is
     asserted; the test passes when the pipeline runs without raising.
     """
     dat, pw = generate_peptide_data()
     square_pw = scipy.spatial.distance.squareform(pw)
     tally = neighborhood_tally(dat,
                                pwmat=square_pw,
                                x_cols=['trait1'],
                                count_col='count',
                                knn_neighbors=None,
                                knn_radius=3)
     joined = dat.join(tally)
     res = cluster_association_test(joined, method='fishers')
Beispiel #5
0
    def test_props_motif(self):
        """Smoke test: cluster-proportion plot with motif logos as tooltips.

        Computes pairwise distances over dat['seq'] with
        pwsd.metrics.hamming_distance, tallies 'trait1' over the clustering,
        runs the 'fishers' test and attaches an SVG logo to every cluster
        with pvalue < 0.05 (an empty string otherwise). The HTML returned by
        plot_hclust_props is written to hierdiff/tests/test_props_motif.html.
        The test only checks that the pipeline runs without raising.
        """
        dat, pw = generate_peptide_data()
        np.random.seed(110820)

        # Replace the generated distances with ones computed from the sequences.
        pw = pwsd.apply_pairwise_rect(metric=pwsd.metrics.hamming_distance,
                                      seqs1=dat['seq'])

        res, Z = hcluster_tally(dat,
                                pwmat=pw,
                                x_cols=['trait1'],
                                count_col='count',
                                method='complete')
        res = cluster_association_test(res, method='fishers')

        seqs = dat['seq'].values
        logos = []
        for _, row in res.iterrows():
            if not row['pvalue'] < 0.05:
                # Keep one entry per row so the column lines up with res.
                logos.append('')
                continue
            motif = palmotif.compute_motif(seqs[row['neighbors_i']])
            logos.append(palmotif.svg_logo(motif,
                                           return_str=True,
                                           return_html=False,
                                           svg_height='500px',
                                           svg_width='500px'))
        res = res.assign(motif=logos)

        html = plot_hclust_props(Z,
                                 title='test_props_motif',
                                 tooltip_cols=['motif'],
                                 res=res,
                                 alpha=0.05,
                                 alpha_col='pvalue')

        out_path = opj('hierdiff', 'tests', 'test_props_motif.html')
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write(html)

        self.assertTrue(True)
Beispiel #6
0
def hcluster_diff(clone_df,
                  pwmat,
                  x_cols,
                  Z=None,
                  count_col='count',
                  subset_ind=None,
                  hclust_method='complete',
                  optimal_ordering=True,
                  test_method='fishers'):
    """Tests for association of categorical variables in x_cols with each cluster/node
    in a hierarchical clustering of TCR clones with distances in pwmat.

    Uses hierdiff package (available on PyPI) for tallying counts in each cluster
    and running tests.

    The statistical tests made available by this function are limited and meant only
    as a way to scan for signals. More sophisticated testing/modeling frameworks
    should be considered for real-world problems.

    Use test_method=None to return a table of counts for all clusters that can be saved as
    a CSV and used to run other, more sophisticated tests (e.g. edgeR or other regressions).

    Use Fisher's exact test (test_method='fishers') to detect enrichment/association of the
    neighborhood/cluster with one binary variable.

    Tests the 2 x 2 table for each clone:

    +----+----+-------+--------+
    |         |    Cluster     |
    |         +-------+--------+
    |         |  MEM+ |  MEM-  |
    +----+----+-------+--------+
    |VAR |  0 |   a   |    b   |
    |    +----+-------+--------+
    |    |  1 |   c   |    d   |
    +----+----+-------+--------+

    Use the chi-squared test (test_method='chi2') to detect association across multiple
    categorical variables. Note that with small clusters Chi-squared tests are unreliable.

    Use the Cochran-Mantel-Haenszel test (test_method='chm') to test stratified 2 x 2 tables:
    one VAR vs. neighborhood, over several strata defined in other variables.
    Use x_cols[0] as the primary (binary) variable and other x_cols for the categorical
    strata-defining variables. This tests the overall null that OR = 1 for x_cols[0].
    A test is also performed for homogeneity of the ORs among the strata (Breslow-Day test).

    Params
    ------
    clone_df : pd.DataFrame [nclones x metadata]
        Contains metadata for each clone.
    pwmat : np.ndarray [nclones x nclones]
        Square distance matrix for defining neighborhoods
    x_cols : list
        List of columns to be tested for association with the neighborhood
    Z : None or linkage matrix, optional
        Forwarded unchanged to hierdiff's hcluster_tally. Presumably a
        precomputed linkage that is reused instead of re-clustering;
        confirm against the hierdiff documentation.
    count_col : str
        Column in clone_df that specifies counts (default 'count').
        NOTE(review): earlier documentation stated that None assumes a count
        of 1 cell for each row; confirm against hierdiff.
    subset_ind : None or np.ndarray with partial index of df, optional
        Provides option to tally counts only within a subset of df, but to maintain the clustering
        of all individuals. Allows for one clustering of pooled TCRs,
        but tallying/testing within a subset (e.g. participants or conditions)
    hclust_method : str
        Method for hierarchical clustering, passed to the scipy.cluster.hierarchy
        linkage function.
    optimal_ordering : bool
        Flag passed to the scipy.cluster.hierarchy linkage function to improve
        visual tree layout. Can be slow for large trees.
    test_method : str or None
        Specifies Fisher's exact test ("fishers"), Chi-squared ("chi2") or
        Cochran-Mantel-Haenszel test ("chm") for testing. None skips testing
        and returns the tally table.

    Returns
    -------
    res : pd.DataFrame [nclusters x results]
        Results from testing each cluster (or the raw tally when
        test_method is None).
    Z : linkage matrix [clusters, 4]
        Clustering result returned from scipy.cluster.hierarchy.linkage"""
    # Forward the caller's linkage method; it was previously hard-coded to
    # 'complete', which silently ignored the hclust_method argument.
    res, Z = hd.hcluster_tally(df=clone_df,
                               pwmat=pwmat,
                               x_cols=x_cols,
                               Z=Z,
                               count_col=count_col,
                               subset_ind=subset_ind,
                               method=hclust_method,
                               optimal_ordering=optimal_ordering)
    if test_method is not None:
        res = hd.cluster_association_test(res,
                                          y_col='cmember',
                                          method=test_method)
    return res, Z
Beispiel #7
0
def neighborhood_diff(clone_df,
                      pwmat,
                      x_cols,
                      count_col='count',
                      knn_neighbors=50,
                      knn_radius=None,
                      subset_ind=None,
                      cluster_ind=None,
                      test_method='fishers'):
    """Scan the neighborhood around each TCR in clone_df for association with
    the categorical variables in x_cols.

    A neighborhood is either the K closest neighbors according to the pairwise
    distances in pwmat (knn_neighbors) or every clone within a distance radius
    (knn_radius).

    Counting and testing are delegated to the hierdiff package (available on
    PyPI). The tests offered here are limited and intended only as a way to
    scan for signals; more sophisticated testing/modeling frameworks should be
    considered for real-world problems.

    With test_method=None the function returns the table of counts for all
    neighborhoods, which can be saved as a CSV and used to run other, more
    sophisticated tests (e.g. edgeR or other regressions).

    Fisher's exact test (test_method='fishers') detects enrichment/association
    of the neighborhood with one binary variable. For example, the 2 x 2 table
    tested for each clone is:

    +----+----+-------+--------+
    |         |  Neighborhood  |
    |         +-------+--------+
    |         | MEM+  |   MEM- |
    +----+----+-------+--------+
    |VAR |  0 | a     |    b   |
    |    +----+-------+--------+
    |    |  1 | c     |    d   |
    +----+----+-------+--------+

    The chi-squared test (test_method='chi2') detects association across
    multiple variables. Note that Chi-squared tests are unreliable with sparse
    neighborhoods.

    The Cochran-Mantel-Haenszel test (test_method='chm') tests stratified
    2 x 2 tables: one VAR vs. neighborhood, over several strata defined in
    other variables. x_cols[0] is the primary (binary) variable and the
    remaining x_cols define the categorical strata. The overall null is that
    OR = 1 for x_cols[0]; homogeneity of the ORs among the strata is also
    tested (Breslow-Day test).

    Params
    ------
    clone_df : pd.DataFrame [nclones x metadata]
        Metadata for each clone.
    pwmat : np.ndarray [nclones x nclones]
        Square distance matrix used to define neighborhoods.
    x_cols : list
        Columns to be tested for association with the neighborhood.
    count_col : str
        Column in clone_df that specifies counts (default 'count').
        NOTE(review): earlier documentation stated that None assumes a count
        of 1 cell for each row; confirm against hierdiff.
    knn_neighbors : int
        Number of neighbors to include in the neighborhood.
    knn_radius : float
        Radius for inclusion of neighbors within the neighborhood.
        Specify K or R but not both.
    subset_ind : None or np.ndarray with partial index of df, optional
        Tally counts only within a subset of df while keeping the clustering
        of all individuals. Allows one clustering of pooled TCRs with
        tallying/testing inside a subset (e.g. participants or conditions).
    cluster_ind : None or np.ndarray
        Indices into df specifying the neighborhoods for testing.
    test_method : str or None
        Fisher's exact test ("fishers"), Chi-squared ("chi2") or
        Cochran-Mantel-Haenszel test ("chm"). None skips testing.

    Returns
    -------
    res : pd.DataFrame [nclones x results]
        Results from testing the neighborhood around each clone (or the raw
        tally when test_method is None)."""
    tally = hd.neighborhood_tally(df=clone_df,
                                  pwmat=pwmat,
                                  x_cols=x_cols,
                                  count_col=count_col,
                                  knn_neighbors=knn_neighbors,
                                  knn_radius=knn_radius,
                                  subset_ind=subset_ind,
                                  cluster_ind=cluster_ind)

    # No test requested: hand back the counts as-is.
    if test_method is None:
        return tally

    return hd.cluster_association_test(tally,
                                       y_col='cmember',
                                       method=test_method)