Beispiel #1
0
 def distance_matrix(self, squareform=True):
     """Return the Euclidean pairwise distance matrix in scaled space.

     Parameters
     ----------
     squareform : bool (default: True)
         If true, return the full square distance matrix; otherwise return
         the flattened upper triangle without the main diagonal.

     Returns
     -------
     numpy.ndarray
         The square matrix of pairwise distances, or the 1-D array of its
         strictly-upper-triangular entries in row-major order.
     """
     # The NumPy aliases that used to live in the top-level ``scipy``
     # namespace (``scipy.sqrt``, ``scipy.triu_indices``) are deprecated and
     # gone from current SciPy releases, so call NumPy directly.
     import numpy as np

     # Broadcasting yields every pairwise difference vector:
     # diff[i, j] = scaled[i] - scaled[j].
     diff = self.scaled[:, None] - self.scaled[None, :]
     # Sums of squares are never negative, so the plain real-valued sqrt
     # gives the same result the scimath-based scipy alias did.
     mat = np.sqrt((diff ** 2).sum(axis=2))
     if squareform:
         return mat
     # NOTE(review): assumes len(self) equals the number of rows of
     # self.scaled -- confirm against the class definition.
     return mat[np.triu_indices(len(self), 1)]
Beispiel #2
0
 def distance_matrix(self, squareform=True):
     """Return the Euclidean pairwise distance matrix in scaled space.

     Parameters
     ----------
     squareform : bool (default: True)
         If true, return the full square distance matrix; otherwise return
         the flattened upper triangle without the main diagonal.

     Returns
     -------
     numpy.ndarray
         The square matrix of pairwise distances, or the 1-D array of its
         strictly-upper-triangular entries in row-major order.
     """
     # ``scipy.sqrt`` and ``scipy.triu_indices`` were NumPy aliases that
     # SciPy deprecated and no longer ships; use NumPy itself.
     import numpy as np

     # deltas[i, j] is the difference vector between points i and j.
     deltas = self.scaled[:, None] - self.scaled[None, :]
     # The summed squares are non-negative, so the real sqrt is sufficient.
     distances = np.sqrt((deltas ** 2).sum(axis=2))
     if not squareform:
         # NOTE(review): assumes len(self) matches the number of rows of
         # self.scaled -- confirm against the class definition.
         return distances[np.triu_indices(len(self), 1)]
     return distances
Beispiel #3
0
    def adjacency(self, min_snp2gene_obs=2,fdr_cutoff=0.3,return_genes=False):
        '''
            Return a matrix showing the number of shared HPO genes by Term.
            The diagonal of the matrix is the number of genes discovered by
            that term. The upper triangle shows the overlap between the row
            and column terms and the lower triangle shows the hypergeometric
            p-value for the overlap between the two terms. The universe used
            is the number of unique genes in the overlap results.

            min_snp2gene_obs : int (default: 2)
                The min SNP2gene mapping observations needed to be HPO
            fdr_cutoff: float (default: 0.3)
                The FDR cutoff to be considered HPO
            return_genes : bool (default: False)
                Return the candidate gene list instead of the overlap table

            Returns
            -------
            pandas.DataFrame
                If return_genes is false, a square Term x Term table of
                floats laid out as described above. Otherwise one row per
                pair of distinct terms sharing at least one gene, with
                columns Term1, Term2, num_common, pval and common (the
                shared genes, sorted and comma separated). Both results are
                empty when there are no candidate terms.
        '''
        candidates = self.high_priority_candidates(
                fdr_cutoff=fdr_cutoff,
                min_snp2gene_obs=min_snp2gene_obs,
                original_COB_only=True)
        # Map every term to the set of its candidate genes. groupby yields
        # the terms sorted, the same order pivot_table uses for its labels,
        # which is what makes the triangular masking below line up.
        term_genes = {term: set(group.gene) for term, group in candidates.groupby('Term')}
        num_universe = len(self.results.gene.unique())
        terms = list(term_genes)
        columns = ['Term1','Term2','num_common','pval','common']
        rows = []
        for i, a in enumerate(terms):
            # Visit each unordered pair once, plus the term with itself
            for b in terms[i:]:
                common = term_genes[a] & term_genes[b]
                num_common = len(common)
                if a != b:
                    # P(overlap >= num_common) when drawing len(b) genes from
                    # a universe containing len(a) "successes"
                    pval = hypergeom.sf(num_common-1,num_universe,len(term_genes[a]),len(term_genes[b]))
                else:
                    # This will make the diagonal of the matrix be the number
                    # of HPO genes for the term
                    pval = len(term_genes[a])
                # Sort so the joined string does not depend on set ordering
                rows.append((a,b,num_common,pval,','.join(sorted(common))))
        # Passing the names up front keeps an empty result well formed;
        # assigning five names to a zero-column frame raises ValueError.
        adj = pd.DataFrame(rows,columns=columns)
        if adj.empty:
            return adj if return_genes else pd.DataFrame(dtype=float)
        # Stop early if we just want to return the lists
        if return_genes:
            shares_genes = adj.num_common > 0
            distinct_terms = adj.Term1 != adj.Term2
            return adj[shares_genes & distinct_terms].drop_duplicates()
        overlap = pd.pivot_table(adj,index='Term1',columns='Term2',values='num_common')
        pvals = pd.pivot_table(adj,index='Term2',columns='Term1',values='pval')
        # Mask on explicit copies: writing through DataFrame.values is not
        # guaranteed to reach the frame (and is read-only under pandas
        # copy-on-write).
        overlap_vals = np.array(overlap.values,dtype=float)
        # Mask out the lower triangle and diagonal of the overlap matrix
        overlap_vals[tril_indices(len(overlap))] = 0
        pval_vals = np.array(pvals.values,dtype=float)
        # Mask out the strict upper triangle of the pvals matrix
        pval_vals[triu_indices(len(pvals),1)] = 0
        overlap = pd.DataFrame(overlap_vals,index=overlap.index,columns=overlap.columns)
        pvals = pd.DataFrame(pval_vals,index=pvals.index,columns=pvals.columns)
        return (overlap+pvals).astype(float)
Beispiel #4
0
    def adjacency(
        self,
        min_snp2gene_obs=2,
        fdr_cutoff=0.3,
        return_genes=False,
        second_overlap=None,
    ):
        """
            Return a matrix showing the number of shared HPO genes by Term.

            When comparing this overlap with itself (the default), the
            diagonal of the matrix is the number of genes discovered by that
            term, the upper triangle shows the overlap between the row and
            column terms and the lower triangle shows the hypergeometric
            p-value for the overlap between the two terms. The universe used
            is the number of unique genes in the overlap results.

            min_snp2gene_obs : int (default: 2)
                The min SNP2gene mapping observations needed to be HPO
            fdr_cutoff: float (default: 0.3)
                The FDR cutoff to be considered HPO
            return_genes : bool (default: False)
                Return the candidate gene list instead of the overlap table
            second_overlap : Overlap Object (default: None)
                If specified, overlap between terms will be calculated
                between this overlap's HPO genes and the second overlap's
                HPO genes, resulting in an adjacency matrix whose rows are
                this overlap's terms, whose columns are the second
                overlap's terms and whose values are the number of shared
                genes per pair of terms. Every pair of terms is compared,
                including terms that carry the same name in both overlaps.

            Returns
            -------
            pandas.DataFrame
                If return_genes is false, the float matrix described above.
                Otherwise one row per pair of terms sharing at least one
                gene (pairs of a term with itself are left out when an
                overlap is compared with itself), with columns Term1,
                Term2, num_term1, num_term2, num_common, pval, common (the
                shared genes, sorted and comma separated) and bonferoni
                (pval <= 0.05 / (number of Term1 terms * number of Term2
                terms)). Both results are empty when there are no
                candidate terms.
        """
        # Only a comparison of an overlap with itself is symmetric; the
        # triangular layout and the skipping of mirrored pairs are valid in
        # that case alone.
        self_comparison = second_overlap is None or second_overlap is self
        if self_comparison:
            second_overlap = self
        hpo1 = self.high_priority_candidates(
            fdr_cutoff=fdr_cutoff,
            min_snp2gene_obs=min_snp2gene_obs,
            original_COB_only=True,
        )
        hpo2 = second_overlap.high_priority_candidates(
            fdr_cutoff=fdr_cutoff,
            min_snp2gene_obs=min_snp2gene_obs,
            original_COB_only=True,
        )
        # Map every term to the set of its candidate genes. groupby yields
        # the terms sorted, the same order pivot_table uses for its labels.
        x = {term: set(group.gene) for term, group in hpo1.groupby("Term")}
        y = {term: set(group.gene) for term, group in hpo2.groupby("Term")}
        num_universe = len(
            set(self.results.gene.unique()).union(
                second_overlap.results.gene.unique()
            )
        )
        columns = [
            "Term1",
            "Term2",
            "num_term1",
            "num_term2",
            "num_common",
            "pval",
            "common",
        ]
        rows = []
        for i, (a, genes_a) in enumerate(x.items()):
            num_a = len(genes_a)
            for j, (b, genes_b) in enumerate(y.items()):
                # Mirrored pairs are redundant only when both axes list the
                # same terms; across two overlaps every pair is distinct.
                if self_comparison and j < i:
                    continue
                num_b = len(genes_b)
                common = genes_a & genes_b
                num_common = len(common)
                if self_comparison and a == b:
                    # This will make the diagonal of the matrix be the
                    # number of HPO genes for the term
                    pval = num_a
                else:
                    # P(overlap >= num_common) when drawing num_b genes from
                    # a universe containing num_a "successes"
                    pval = hypergeom.sf(num_common - 1, num_universe, num_a, num_b)
                # Sort so the joined string does not depend on set ordering
                rows.append(
                    (a, b, num_a, num_b, num_common, pval, ",".join(sorted(common)))
                )
        # Passing the names up front keeps an empty result well formed;
        # assigning names to a zero-column frame raises ValueError.
        adj = pd.DataFrame(rows, columns=columns)
        if adj.empty:
            # Also avoids dividing by zero in the Bonferroni threshold
            if return_genes:
                return pd.DataFrame(columns=columns + ["bonferoni"])
            return pd.DataFrame(dtype=float)
        # Stop early if we just want to return the lists
        if return_genes:
            adj = adj[adj.num_common > 0]
            if self_comparison:
                adj = adj[adj.Term1 != adj.Term2]
            # Copy so the column assignment targets an independent frame
            adj = adj.drop_duplicates().copy()
            adj["bonferoni"] = adj.pval <= (0.05 / (len(x) * len(y)))
            return adj
        overlap = pd.pivot_table(
            adj, index="Term1", columns="Term2", values="num_common"
        )
        if not self_comparison:
            # Rectangular in general, so there is no triangle to fill in
            return overlap.astype(float)
        # Transposed relative to ``overlap`` so the p-values land in the
        # lower triangle
        pvals = pd.pivot_table(adj, index="Term2", columns="Term1", values="pval")
        # Mask on explicit copies: writing through DataFrame.values is not
        # guaranteed to reach the frame (and is read-only under pandas
        # copy-on-write).
        overlap_vals = np.array(overlap.values, dtype=float)
        # Mask out the lower triangle and diagonal of the overlap matrix
        overlap_vals[tril_indices(len(overlap))] = 0
        pval_vals = np.array(pvals.values, dtype=float)
        # Mask out the strict upper triangle of the pvals matrix
        pval_vals[triu_indices(len(pvals), 1)] = 0
        overlap = pd.DataFrame(
            overlap_vals, index=overlap.index, columns=overlap.columns
        )
        pvals = pd.DataFrame(pval_vals, index=pvals.index, columns=pvals.columns)
        return (overlap + pvals).astype(float)
Beispiel #5
0
    def adjacency(self, min_snp2gene_obs=2,fdr_cutoff=0.3,return_genes=False,
                 second_overlap=None):
        '''
            Return a matrix showing the number of shared HPO genes by Term.

            When comparing this overlap with itself (the default), the
            diagonal of the matrix is the number of genes discovered by that
            term, the upper triangle shows the overlap between the row and
            column terms and the lower triangle shows the hypergeometric
            p-value for the overlap between the two terms. The universe used
            is the number of unique genes in the overlap results.

            min_snp2gene_obs : int (default: 2)
                The min SNP2gene mapping observations needed to be HPO
            fdr_cutoff: float (default: 0.3)
                The FDR cutoff to be considered HPO
            return_genes : bool (default: False)
                Return the candidate gene list instead of the overlap table
            second_overlap : Overlap Object (default: None)
                If specified, overlap between terms will be calculated
                between this overlap's HPO genes and the second overlap's
                HPO genes, resulting in an adjacency matrix whose rows are
                this overlap's terms, whose columns are the second
                overlap's terms and whose values are the number of shared
                genes per pair of terms. Every pair of terms is compared,
                including terms that carry the same name in both overlaps.

            Returns
            -------
            pandas.DataFrame
                If return_genes is false, the float matrix described above.
                Otherwise one row per pair of terms sharing at least one
                gene (pairs of a term with itself are left out when an
                overlap is compared with itself), with columns Term1,
                Term2, num_term1, num_term2, num_common, pval, common (the
                shared genes, sorted and comma separated) and bonferoni
                (pval <= 0.05 / (number of Term1 terms * number of Term2
                terms)). Both results are empty when there are no
                candidate terms.
        '''
        # The triangular layout and the skipping of mirrored pairs only make
        # sense when an overlap is compared with itself.
        against_self = second_overlap is None or second_overlap is self
        if against_self:
            second_overlap = self
        hpo1 = self.high_priority_candidates(
                fdr_cutoff=fdr_cutoff,
                min_snp2gene_obs=min_snp2gene_obs,
                original_COB_only=True)
        hpo2 = second_overlap.high_priority_candidates(
            fdr_cutoff=fdr_cutoff,
            min_snp2gene_obs=min_snp2gene_obs,
            original_COB_only=True
        )
        # Term -> set of candidate genes. groupby yields the terms sorted,
        # the same order pivot_table uses for its labels.
        x = {term:set(group.gene) for term,group in hpo1.groupby('Term')}
        y = {term:set(group.gene) for term,group in hpo2.groupby('Term')}
        num_universe = len(set(self.results.gene.unique()).union(second_overlap.results.gene.unique()))
        columns = ['Term1','Term2','num_term1','num_term2','num_common','pval','common']
        records = []
        for i,(a,genes_a) in enumerate(x.items()):
            num_a = len(genes_a)
            for j,(b,genes_b) in enumerate(y.items()):
                # Mirrored pairs are redundant only when both axes list the
                # same terms; across two overlaps every pair is distinct.
                if against_self and j < i:
                    continue
                num_b = len(genes_b)
                common = genes_a & genes_b
                num_common = len(common)
                if against_self and a == b:
                    # This will make the diagonal of the matrix be the
                    # number of HPO genes for the term
                    pval = num_a
                else:
                    # P(overlap >= num_common) when drawing num_b genes from
                    # a universe containing num_a "successes"
                    pval = hypergeom.sf(num_common-1,num_universe,num_a,num_b)
                # Sort so the joined string does not depend on set ordering
                records.append((a,b,num_a,num_b,num_common,pval,','.join(sorted(common))))
        # Passing the names up front keeps an empty result well formed;
        # assigning names to a zero-column frame raises ValueError.
        adj = pd.DataFrame(records,columns=columns)
        if adj.empty:
            # Also avoids dividing by zero in the Bonferroni threshold
            if return_genes:
                return pd.DataFrame(columns=columns+['bonferoni'])
            return pd.DataFrame(dtype=float)
        # Stop early if we just want to return the lists
        if return_genes:
            adj = adj[adj.num_common>0]
            if against_self:
                adj = adj[adj.Term1!=adj.Term2]
            # Copy so the column assignment targets an independent frame
            adj = adj.drop_duplicates().copy()
            adj['bonferoni'] = adj.pval <= (0.05 / (len(x)*len(y)))
            return adj
        overlap = pd.pivot_table(adj,index='Term1',columns='Term2',values='num_common')
        if not against_self:
            # Rectangular in general, so there is no triangle to fill in
            return overlap.astype(float)
        # Transposed relative to ``overlap`` so the p-values land in the
        # lower triangle
        pvals = pd.pivot_table(adj,index='Term2',columns='Term1',values='pval')
        # Mask on explicit copies: writing through DataFrame.values is not
        # guaranteed to reach the frame (and is read-only under pandas
        # copy-on-write).
        overlap_vals = np.array(overlap.values,dtype=float)
        # Mask out the lower triangle and diagonal of the overlap matrix
        overlap_vals[tril_indices(len(overlap))] = 0
        pval_vals = np.array(pvals.values,dtype=float)
        # Mask out the strict upper triangle of the pvals matrix
        pval_vals[triu_indices(len(pvals),1)] = 0
        overlap = pd.DataFrame(overlap_vals,index=overlap.index,columns=overlap.columns)
        pvals = pd.DataFrame(pval_vals,index=pvals.index,columns=pvals.columns)
        return (overlap+pvals).astype(float)