def coexpression(self,gene_a,gene_b):
    '''
        Look up the stored coexpression entry for a pair of genes.

        Per the original documentation, the stored score is a z-score
        derived from the Pearson correlation coefficient of the two
        genes' expression profiles across the accessions (experiments).

        Parameters
        ----------
        gene_a : camoco.Locus
            The first gene
        gene_b : camoco.Locus
            The second gene

        Returns
        -------
        The entry of `self.coex` found (via `.iloc`) at the position
        computed for this gene pair.
        NOTE(review): the original docstring calls this the
        "Coexpression Z-Score"; the code returns whatever `self.coex`
        holds at that position -- confirm whether that is a scalar
        or a whole row.
    '''
    # Positions of both genes in the original expression matrix
    expr_positions = np.array(
        [self._expr_index[locus.id] for locus in (gene_a, gene_b)]
    )
    # The pair -> table position mapping needs the total gene count
    total_genes = self.num_genes()
    pair_positions = PCCUP.coex_index(expr_positions, total_genes)
    # Only one pair was requested, so take the first computed position
    return self.coex.iloc[pair_positions[0]]
def _calculate_coexpression(self,significance_thresh=3):
    '''
        Generates pairwise PCCs for gene expression profiles in
        self._expr and stores them as z-scores. Also calculates
        pairwise gene distance.

        Steps performed:
          1. Build a table with one row per unordered gene pair taken
             from `self._expr.index`, indexed by (gene_a, gene_b).
          2. Fill in the pairwise scores from `PCCUP.pair_correlation`,
             Fisher-transform them (arctanh) and standardise them to
             z-scores, ignoring NaN entries when computing mean / std.
          3. Flag each pair as significant (1) or not (0).
          4. Attach the pairwise distances reported by `self.refgen`.
          5. Hand the table to `self._build_tables`.

        The pre-standardisation mean and std are recorded through
        `self._global` ('pcc_mean', 'pcc_std') together with
        'significance_threshold'.

        Parameters
        ----------
        significance_thresh : number (default: 3)
            Pairs whose z-score is >= this value are marked significant.

        Returns
        -------
        self
    '''
    # Start off with a fresh set of gene pairs we can pass to functions
    tbl = pd.DataFrame(
        list(itertools.combinations(self._expr.index.values,2)),
        columns=['gene_a','gene_b']
    )
    # Reindex the table to match genes
    self.log('Indexing coex table')
    tbl.set_index(['gene_a','gene_b'],inplace=True)
    # Now add coexpression data
    self.log("Calculating Coexpression")
    # `.values` replaces DataFrame.as_matrix(), which newer pandas
    # releases no longer provide.
    # NOTE(review): presumably pair_correlation returns correlation
    # *distances*, so 1 - d gives the PCC -- confirm against PCCUP.
    pccs = 1-PCCUP.pair_correlation(np.ascontiguousarray(self._expr.values))
    assert len(pccs) == len(tbl)
    tbl['score'] = pccs
    # Correlations of +/-1 dont transform well, they cause infinities.
    # Compare with >= / <= so values nudged just past +/-1 by floating
    # point rounding are clamped too (arctanh would turn them into NaN).
    tbl.loc[tbl['score'] >= 1,'score'] = 0.99999999
    tbl.loc[tbl['score'] <= -1,'score'] = -0.99999999
    # Perform fisher transform on PCCs
    tbl['score'] = np.arctanh(tbl['score'])
    # Sometimes, with certain datasets, the NaN masks overlap completely
    # for the two genes' expression data, making their PCC a NaN. Mask
    # those out so they do not affect the mean and std.
    transformed = tbl['score'].values
    valid_scores = np.ma.masked_array(transformed,np.isnan(transformed))
    # Calculate Z Scores
    pcc_mean = valid_scores.mean()
    pcc_std = valid_scores.std()
    # Remember these so we can go back to PCCs
    self._global('pcc_mean',pcc_mean)
    self._global('pcc_std',pcc_std)
    # Masked entries become explicit NaNs in the stored column
    tbl['score'] = ((valid_scores-pcc_mean)/pcc_std).filled(np.nan)
    # Assign significance
    self._global('significance_threshold',significance_thresh)
    # Derive the flag from the column itself so it keeps the
    # (gene_a, gene_b) index; a Series built from a plain list gets a
    # fresh RangeIndex and would be label-aligned against the
    # MultiIndex on assignment instead of being placed row by row.
    tbl['significant'] = (tbl['score'] >= significance_thresh).astype(np.int_)
    self.log("Calculating Gene Distance")
    distances = self.refgen.pairwise_distance(
        gene_list=self.refgen.from_ids(self._expr.index)
    )
    assert len(distances) == len(tbl)
    tbl['distance'] = distances
    # Persist the table (per the original comment: into the hdf5 store)
    self._build_tables(tbl)
    self.log("Done")
    return self
def neighbors(self,gene,sig_only=True):
    '''
        Collect the coexpression edges that involve a single gene.

        Parameters
        ----------
        gene : co.Locus
            The gene for which to extract neighbors
        sig_only : bool (default: True)
            When True, keep only the edges whose `significant`
            column equals 1; otherwise return every edge of the gene.

        Returns
        -------
        A DataFrame containing edges
    '''
    # Position of the gene in the expression matrix
    expr_position = self._expr_index[gene.id]
    # Table positions of every edge touching that gene
    edge_positions = PCCUP.coex_neighbors(expr_position,self.num_genes())
    candidate_edges = self.coex.iloc[edge_positions]
    if not sig_only:
        return candidate_edges
    return candidate_edges[candidate_edges.significant == 1]
def subnetwork(self,gene_list=None,sig_only=True,min_distance=100000,
               filter_missing_gene_ids=True):
    '''
        Extract the edges of the coexpression table that lie
        EXCLUSIVELY between the genes of a list.

        Parameters
        ----------
        gene_list : iterable of loci, or None (default: None)
            Genes (objects with an `.id`) to restrict the network to.
            Passing None gives you the edges between all genes.
        sig_only : bool (default: True)
            Keep only edges whose `significant` column equals 1.
        min_distance : number (default: 100000)
            Keep only edges whose `distance` is >= this value. Any
            falsy value (None, 0) disables the distance filter.
        filter_missing_gene_ids : bool (default: True)
            Drop genes whose lookup in `self._expr_index` yields None.
            NOTE(review): this presumes `_expr_index` returns None for
            unknown gene ids (e.g. a defaultdict) -- confirm.

        Returns
        -------
        A new DataFrame (a copy) containing the selected edges.
    '''
    if gene_list is None:
        df = self.coex
    else:
        ids = [self._expr_index[x.id] for x in gene_list]
        if filter_missing_gene_ids:
            # Compare against None explicitly: position 0 is a valid
            # index but is falsy, so a truthiness filter such as
            # filter(None, ids) would silently drop the first gene.
            ids = [i for i in ids if i is not None]
        ids = np.array(ids)
        num_genes = self.num_genes()
        # Grab the coexpression indices for the genes
        indices = PCCUP.coex_index(ids,num_genes)
        df = self.coex.iloc[indices]
    if min_distance:
        df = df.loc[df.distance >= min_distance,:]
    if sig_only:
        df = df.loc[df.significant == 1,:]
    return df.copy()