Esempio n. 1
0
    def coexpression(self,gene_a,gene_b):
        '''
            Look up the stored coexpression entry for a pair of genes.

            The two genes are translated to their positions in the
            expression matrix (via self._expr_index), the pair is mapped
            to its position in the coexpression table with
            PCCUP.coex_index, and that entry of self.coex is returned.

            Parameters
            ----------
            gene_a : camoco.Locus
                The first gene
            gene_b : camoco.Locus
                The second gene

            Returns
            -------
            The entry of self.coex at the position computed for the pair.
            NOTE(review): presumably this entry holds the coexpression
            z-score for the two genes -- confirm against how self.coex
            is built.

        '''
        # Positions of both genes in the original expression matrix
        expr_positions = np.array(
            [self._expr_index[locus.id] for locus in (gene_a, gene_b)]
        )
        # coex_index needs the total number of genes to locate the pair;
        # it is handed a single pair here, so only the first result is used
        pair_position = PCCUP.coex_index(expr_positions, self.num_genes())[0]
        return self.coex.iloc[pair_position]
Esempio n. 2
0
 def _calculate_coexpression(self,significance_thresh=3):
     '''
         Generates pairwise PCCs for gene expression profiles in self._expr,
         Fisher-transforms and z-normalizes them, flags significant pairs,
         and also calculates pairwise gene distance. The resulting table
         is handed to self._build_tables.

         Parameters
         ----------
         significance_thresh : numeric (default: 3)
             Gene pairs whose z-score is greater than or equal to this
             value are flagged 1 in the 'significant' column; all other
             pairs (including pairs with a NaN score) are flagged 0.

         Returns
         -------
         self
     '''
     # One row per unordered gene pair, in itertools.combinations order
     tbl = pd.DataFrame(
         list(itertools.combinations(self._expr.index.values,2)),
         columns=['gene_a','gene_b']
     )
     # Reindex the table to match genes
     self.log('Indexing coex table')
     tbl.set_index(['gene_a','gene_b'],inplace=True)
     # Now add coexpression data
     self.log("Calculating Coexpression")
     # Calculate the PCCs.
     # NOTE(review): the `1 -` suggests pair_correlation returns a
     # correlation distance (1 - r) -- confirm against PCCUP.
     # `.values` is used instead of `as_matrix()`, which newer pandas
     # releases no longer provide; both yield the underlying ndarray.
     pccs = 1-PCCUP.pair_correlation(np.ascontiguousarray(self._expr.values))
     # The condensed result must line up row-for-row with the pair table
     assert len(pccs) == len(tbl)
     tbl['score'] = pccs
     # Correlations of +/-1 dont transform well (arctanh gives infinities),
     # and floating point rounding can push a value slightly beyond +/-1,
     # which arctanh turns into NaN. Pull both cases just inside the domain.
     tbl.loc[tbl['score'] >= 1,'score'] = 0.99999999
     tbl.loc[tbl['score'] <= -1,'score'] = -0.99999999
     # Perform fisher transform on PCCs
     tbl['score'] = np.arctanh(tbl['score'])
     # Sometimes, with certain datasets, the NaN masks overlap completely for
     # the two genes' expression data, making their PCC a NaN. Mask those
     # pairs so they do not affect the global mean and std.
     valid_scores = np.ma.masked_array(tbl['score'],np.isnan(tbl['score']))
     # Calculate Z Scores
     pcc_mean = valid_scores.mean()
     pcc_std = valid_scores.std()
     # Remember these so we can go back to PCCs
     self._global('pcc_mean',pcc_mean)
     self._global('pcc_std',pcc_std)
     tbl['score'] = (valid_scores-pcc_mean)/pcc_std
     # Assign significance. The flag is derived directly from the column so
     # it keeps tbl's (gene_a, gene_b) index; building a fresh Series from a
     # list would carry a 0..n-1 index and be aligned by label on assignment.
     self._global('significance_threshold',significance_thresh)
     tbl['significant'] = (tbl['score'] >= significance_thresh).astype(int)
     self.log("Calculating Gene Distance")
     # NOTE(review): assumes pairwise_distance returns distances in the same
     # pair order as the table rows -- only the length is checked below.
     distances = self.refgen.pairwise_distance(gene_list=self.refgen.from_ids(self._expr.index))
     assert len(distances) == len(tbl)
     tbl['distance'] = distances
     # put in the hdf5 store
     self._build_tables(tbl)
     self.log("Done")
     return self
Esempio n. 3
0
 def neighbors(self,gene,sig_only=True):
     '''
         Returns the rows of self.coex that involve the given gene.

         Parameters
         ----------
         gene : co.Locus
             The gene for which to extract neighbors
         sig_only : bool (default: True)
             When True, only rows whose 'significant' value equals 1
             are returned; otherwise every row for the gene is returned.

         Returns
         -------
         A DataFrame containing edges
     '''
     # Translate the gene into its position in the expression matrix, then
     # ask PCCUP for the coex table positions of every pair containing it
     row_positions = PCCUP.coex_neighbors(
         self._expr_index[gene.id],
         self.num_genes()
     )
     candidate_edges = self.coex.iloc[row_positions]
     if not sig_only:
         return candidate_edges
     return candidate_edges[candidate_edges.significant == 1]
Esempio n. 4
0
 def subnetwork(self,gene_list=None,sig_only=True,min_distance=100000,
     filter_missing_gene_ids=True):
     '''
         Extract the edges EXCLUSIVELY between the genes in a list.

         Parameters
         ----------
         gene_list : iterable of loci (default: None)
             Genes to build the subnetwork from; each item must expose
             an `id` attribute. Passing None gives you all genes.
         sig_only : bool (default: True)
             Keep only edges whose 'significant' value equals 1.
         min_distance : numeric (default: 100000)
             Keep only edges whose 'distance' is greater than or equal
             to this value. A falsy value (0, None) disables the filter.
         filter_missing_gene_ids : bool (default: True)
             Drop genes for which self._expr_index yields None before
             computing table positions.
             NOTE(review): assumes self._expr_index returns None for
             unknown gene ids rather than raising -- confirm.

         Returns
         -------
         A DataFrame (copy) containing the selected edges
     '''
     if gene_list is None:
         df = self.coex
     else:
         ids = [self._expr_index[x.id] for x in gene_list]
         if filter_missing_gene_ids:
             # Filter out the Nones only. An explicit identity test is
             # required here: filter(None, ids) drops every falsy value and
             # would also discard the gene sitting at expression index 0.
             ids = [i for i in ids if i is not None]
         ids = np.array(ids)
         num_genes = self.num_genes()
         # Grab the coexpression indices for the genes
         indices = PCCUP.coex_index(ids,num_genes)
         df = self.coex.iloc[indices]
     if min_distance:
         df = df.loc[df.distance >= min_distance,:]
     if sig_only:
         df = df.loc[df.significant == 1,:]
     # Hand back a copy so callers cannot mutate the stored table
     return df.copy()