def fit(self, X, targetGenes=None, predictorGenes=None, dists=None, labels=None, retrieve_training=False, **params):
    """Extract predictor/target features from ``X`` and train the model.

    Parameters
    ----------
    X : array-like or DataFrame
        Expression matrix; rows are cells, columns are genes.
    targetGenes : list, optional
        Genes to impute. When fewer than ``self.dims[1]``, the output
        dimension is shrunk to match.
    predictorGenes : list, optional
        Explicit predictor genes; when omitted they are derived via
        ``get_input_genes``.
    dists : array-like, optional
        Distance matrix forwarded to ``get_input_genes``.
    labels : (index, columns) pair, optional
        When given, ``X`` is reshaped into a labelled DataFrame.
    retrieve_training : bool
        When True, skip parameter (re)initialization and reuse the
        previously stored predictor/target gene sets.
    **params
        Extra hyperparameters forwarded to ``set_params``.

    Returns
    -------
    The object returned by ``self._fit``.
    """
    # Rebuild a labelled DataFrame when X arrives as a flat array plus labels.
    if labels is not None:
        data = pd.DataFrame(
            np.reshape(X, list(map(len, labels))),
            index=labels[0],
            columns=labels[1],
        )
    else:
        data = X

    if not retrieve_training:
        self.set_params(NNid="auto", **params)

        if targetGenes is not None:
            # Shrink the output layer when fewer targets than configured.
            if len(targetGenes) < self.dims[1]:
                self._dims[1] = len(targetGenes)

            if predictorGenes is None:
                self.predictorGenes, self.targetGenes = get_input_genes(
                    data, self.dims, distanceMatrix=dists, targets=targetGenes)[0]
            else:
                self.predictorGenes, self.targetGenes = predictorGenes, targetGenes

    # NOTE(review): a commented-out cell-filtering block (cell_thresh loop)
    # was removed here as dead code; the filtering variant of fit() applies
    # that logic for real.
    features, targets = (
        data.loc[:, self.predictorGenes].values,
        data.loc[:, self.targetGenes].values,
    )
    model = self._fit(features, targets, retrieve_training=retrieve_training)
    return model
def fit(
    self,
    X,
    targetGenes=None,
    predictorGenes=None,
    dists=None,
    cell_thresh=0.1,
    labels=None,
    retrieve_training=False,
    **params
):
    """Select predictor/target genes, filter low-signal cells, and train.

    Parameters
    ----------
    X : array-like or DataFrame
        Expression matrix; rows are cells, columns are genes.
    targetGenes, predictorGenes : list, optional
        Gene sets; derived via ``get_input_genes`` when predictors are
        omitted.
    dists : array-like, optional
        Distance matrix forwarded to ``get_input_genes``.
    cell_thresh : float
        A cell is kept for training only if it expresses (> 0) at least
        ``self.dims[1] * cell_thresh`` of the target genes.
    labels : (index, columns) pair, optional
        When given, ``X`` is reshaped into a labelled DataFrame.
    retrieve_training : bool
        When True, reuse stored parameters/gene sets instead of
        reinitializing them.
    **params
        Extra hyperparameters forwarded to ``set_params``.

    Returns
    -------
    The object returned by ``self._fit`` (previously this method
    discarded it and returned None; returning it matches the sibling
    implementations and is backward compatible for callers that ignore
    the return value).
    """
    # Rebuild a labelled DataFrame when X arrives as a flat array plus labels.
    if labels is not None:
        data = pd.DataFrame(
            np.reshape(X, list(map(len, labels))),
            index=labels[0],
            columns=labels[1],
        )
    else:
        data = X

    if not retrieve_training:
        self.set_params(NNid="auto", **params)

        if targetGenes is not None:
            # Shrink the output layer when fewer targets than configured.
            if len(targetGenes) < self.dims[1]:
                self._dims[1] = len(targetGenes)

            if predictorGenes is None:
                self.predictorGenes, self.targetGenes = get_input_genes(
                    data,
                    self.dims,
                    distanceMatrix=dists,
                    targets=targetGenes,
                    predictorLimit=None,
                )[0]
            else:
                self.predictorGenes, self.targetGenes = predictorGenes, targetGenes

    # Keep only cells expressing enough target genes.
    # NOTE(review): if no cell passes, the training set is empty — confirm
    # upstream guards exist (an earlier revision halved cell_thresh in a loop).
    filt = (data[self.targetGenes] > 0).sum(axis=1) >= self.dims[1] * cell_thresh
    features, targets = (
        data.loc[filt, self.predictorGenes].values,
        data.loc[filt, self.targetGenes].values,
    )
    return self._fit(features, targets, retrieve_training=retrieve_training)
def fit(
    self,
    X,
    targetGenes=None,
    predictorGenes=None,
    dists=None,
    labels=None,
    retrieve_training=False,
    **params
):
    """Derive predictor/target gene sets from the data and train the network.

    When ``labels`` is supplied, ``X`` is reshaped into a labelled
    DataFrame; otherwise it is used as-is. On a fresh run (i.e. not
    ``retrieve_training``) hyperparameters are reset and the gene sets
    are either taken from the arguments or computed by
    ``get_input_genes``. Returns whatever ``self._fit`` returns.
    """
    # Reshape the raw array into a labelled frame when labels are provided.
    if labels is None:
        data = X
    else:
        axis_lengths = [len(axis) for axis in labels]
        data = pd.DataFrame(
            np.reshape(X, axis_lengths),
            index=labels[0],
            columns=labels[1],
        )

    if not retrieve_training:
        self.set_params(NNid="auto", **params)

        if targetGenes is not None:
            # Fewer requested targets than the configured output width:
            # shrink the output dimension accordingly.
            if len(targetGenes) < self.dims[1]:
                self._dims[1] = len(targetGenes)

            if predictorGenes is not None:
                self.predictorGenes = predictorGenes
                self.targetGenes = targetGenes
            else:
                gene_pairs = get_input_genes(
                    data,
                    self.dims[1],
                    nbest=self.dims[0],
                    distanceMatrix=dists,
                    targets=targetGenes,
                )
                self.predictorGenes, self.targetGenes = gene_pairs[0]

    features = data.loc[:, self.predictorGenes].values
    targets = data.loc[:, self.targetGenes].values

    model = self._fit(features, targets, retrieve_training=retrieve_training)
    return model
def fit(self, data, NN_lim="auto", cell_subset=None, NN_genes=None, retrieve_training=False):
    """Build per-subnetwork gene lists from ``data`` and train one network per list.

    data : expression matrix; rows are cells, columns are genes (per the
        print statement below).
    NN_lim : cap on the number of target genes, forwarded to
        ``_get_target_genes`` as ``maxNumOfGenes``.
    cell_subset : float fraction or int count of cells to train on;
        None uses every cell.
    NN_genes : explicit target-gene list; when None, targets are chosen
        from the 99th-percentile expression levels.
    retrieve_training : forwarded to each child training job.
    Returns ``self`` (fitted; populates ``self.trainingParams`` and
    ``self.networks``).
    """
    # Seed NumPy's RNG so gene sampling and cell subsetting are reproducible.
    np.random.seed(seed=self.seed)
    targetGeneNames = NN_genes
    inputExpressionMatrixDF = pd.DataFrame(data)
    print("Input dataset is {} genes (columns) and {} cells (rows)".format(
        inputExpressionMatrixDF.shape[1], inputExpressionMatrixDF.shape[0]))
    print("First 3 rows and columns:")
    print(inputExpressionMatrixDF.iloc[0:3, 0:3])
    self._setIDandRundir(inputExpressionMatrixDF)
    # Change the output dimension if the data has too few genes
    if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
        self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]
    # Width of each subnetwork's output layer.
    subnetOutputColumns = self.NN_params["dims"][1]
    # Choose genes to impute
    # geneCounts = inputExpressionMatrixDF.sum().sort_values(ascending=False)
    # Rank genes by their 99th-percentile expression, highest first.
    geneQuantiles = inputExpressionMatrixDF.quantile(.99).sort_values(
        ascending=False)
    if targetGeneNames is None:
        targetGeneNames = _get_target_genes(
            geneQuantiles,
            minExpressionLevel=self._minExpressionLevel,
            maxNumOfGenes=NN_lim)
    df_to_impute = inputExpressionMatrixDF[targetGeneNames]
    numberOfTargetGenes = len(targetGeneNames)
    if (numberOfTargetGenes == 0):
        raise Exception(
            "Unable to compute any target genes. Is your data log transformed? Perhaps try with a lower minExpressionLevel."
        )
    n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)
    # ------------------------# Subnetworks #------------------------#
    # Partition the target genes into n_choose disjoint lists of equal width.
    n_choose = int(numberOfTargetGenes / subnetOutputColumns)
    subGenelists = np.random.choice(targetGeneNames,
                                    [n_choose, subnetOutputColumns],
                                    replace=False).tolist()
    if n_choose < n_runs:
        # Special case: for the last run, the output layer will have previous targets
        selectedGenes = np.reshape(subGenelists, -1)
        leftOutGenes = np.setdiff1d(targetGeneNames, selectedGenes)
        # Pad the final list back to full width with already-used genes.
        # NOTE(review): fill_genes is drawn from ALL target genes, so it can
        # overlap leftOutGenes, yielding duplicates in the last sublist —
        # confirm this is intended.
        fill_genes = np.random.choice(targetGeneNames,
                                      subnetOutputColumns - len(leftOutGenes),
                                      replace=False)
        subGenelists.append(
            np.concatenate([leftOutGenes, fill_genes]).tolist())
    # ------------------------# Extracting input genes #------------------------#
    # Distance = 1 - |correlation| between target genes.
    corrMatrix = 1 - np.abs(
        pd.DataFrame(np.corrcoef(df_to_impute.T),
                     index=targetGeneNames,
                     columns=targetGeneNames))
    if self.inOutGenes is None:
        self.inOutGenes = get_input_genes(
            df_to_impute,
            self.NN_params["dims"],
            distanceMatrix=corrMatrix,
            targets=subGenelists,
            #predictorDropoutLimit=self.predictorDropoutLimit
        )
    # ------------------------# Subsets for fitting #------------------------#
    # cell_subset: float = fraction of cells; int (other than 1) = absolute count.
    n_cells = df_to_impute.shape[0]
    if type(cell_subset) is float or cell_subset == 1:
        n_cells = int(cell_subset * n_cells)
    elif type(cell_subset) is int:
        n_cells = cell_subset
    self.trainCells = df_to_impute.sample(n_cells, replace=False).index
    print(
        "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
        .format(
            n_cells,
            1. * n_cells / df_to_impute.shape[0],
            n_cores,
            self.NN_params["n_cores"],
        ))
    if self.trainingParams is None:
        # One hyperparameter dict per subnetwork.
        self.trainingParams = [self.NN_params] * len(self.inOutGenes)
    # -------------------# Preprocessing (if any) #--------------------#
    normalizer = Normalizer.fromName(self.norm)
    df_to_impute = normalizer.fit(df_to_impute).transform(df_to_impute)
    # -------------------# Share matrix between subprocesses #--------------------#
    """ Create memory chunk and put the matrix in it """
    idx, cols = self.trainCells, df_to_impute.columns
    trainData = df_to_impute.loc[self.trainCells, :].values
    """ Parallelize process with shared array """
    # One child job per subnetwork: (gene sets, hyperparams, labels, mode, flag).
    childJobs = [(in_out, trainingParams, (idx, cols), "train",
                  retrieve_training)
                 for in_out, trainingParams in zip(
                     self.inOutGenes, self.trainingParams)]
    self.trainingParams = self._runOnMultipleCores(n_cores,
                                                   trainData.flatten(),
                                                   childJobs)
    # Rebuild Net objects from the parameter dicts returned by the workers.
    self.networks = []
    for dictionnary in self.trainingParams:
        self.networks.append(Net(**dictionnary))
    print('---- Hyperparameters summary ----')
    self.networks[0].display_params()
    return self
def fit(self, data, NN_lim="auto", cell_subset=None, NN_genes=None, retrieve_training=False):
    """Split target genes into subnetwork lists and train one network per list.

    data : expression matrix; rows are cells, columns are genes (per the
        print statement below).
    NN_lim : cap on target-gene count, forwarded to ``_get_target_genes``.
    cell_subset : float fraction or int count of training cells; None
        keeps all cells.
    NN_genes : explicit target-gene list; otherwise derived from the
        99th-percentile expression ranking.
    retrieve_training : forwarded to each child training job.
    Returns ``self`` (fitted; populates ``self.trainingParams`` and
    ``self.networks``).
    """
    # Seed the RNG so gene sampling and cell subsetting are reproducible.
    np.random.seed(seed=self.seed)
    targetGeneNames = NN_genes
    inputExpressionMatrixDF = pd.DataFrame(data)
    print("Input dataset is {} genes and {} cells".format(
        inputExpressionMatrixDF.shape[1], inputExpressionMatrixDF.shape[0]))
    print("First 3 rows and columns:")
    print(inputExpressionMatrixDF.iloc[0:3, 0:3])
    self._setIDandRundir(inputExpressionMatrixDF)
    # Change the output dimension if the data has too few genes
    if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
        self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]
    # Width of each subnetwork's output layer.
    outputColumns = self.NN_params["dims"][1]
    # Choose genes to impute
    imputeOverThisThreshold = .99
    # Rank genes by their 99th-percentile expression, highest first.
    geneQuantiles = inputExpressionMatrixDF.quantile(
        imputeOverThisThreshold).sort_values(ascending=False)
    if targetGeneNames is None:
        targetGeneNames = _get_target_genes(
            geneQuantiles,
            minExpressionLevel=self._minExpressionLevel,
            maxNumOfGenes=NN_lim)
    df_to_impute = inputExpressionMatrixDF[targetGeneNames]
    numberOfTargetGenes = len(targetGeneNames)
    n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)
    # ------------------------# Subnetworks #------------------------#
    # Candidate predictors: target genes expressed above predictorLimit.
    predictorGeneNames = np.intersect1d(
        geneQuantiles.index[geneQuantiles > self.predictorLimit],
        targetGeneNames)
    # Partition the target genes into n_choose disjoint lists of equal width.
    n_choose = int(numberOfTargetGenes / outputColumns)
    subGenelists = np.random.choice(targetGeneNames,
                                    [n_choose, outputColumns],
                                    replace=False).tolist()
    if n_choose < n_runs:
        # Special case: for the last run, the output layer will have less nodes
        selectedGenes = np.reshape(subGenelists, -1)
        subGenelists.append(
            np.setdiff1d(targetGeneNames, selectedGenes).tolist())
    # ------------------------# Extracting input genes #------------------------#
    # Distance = 1 - |correlation|, restricted to predictor columns.
    corrMatrix = 1 - np.abs(
        pd.DataFrame(np.corrcoef(df_to_impute.T),
                     index=targetGeneNames,
                     columns=targetGeneNames)[predictorGeneNames])
    if self.inOutGenes is None:
        self.inOutGenes = get_input_genes(
            df_to_impute,
            self.NN_params["dims"],
            distanceMatrix=corrMatrix,
            targets=subGenelists,
            predictorLimit=self.predictorLimit,
        )
    # ------------------------# Subsets for fitting #------------------------#
    # cell_subset: float = fraction of cells; int (other than 1) = absolute count.
    n_cells = df_to_impute.shape[0]
    if type(cell_subset) is float or cell_subset == 1:
        n_cells = int(cell_subset * n_cells)
    elif type(cell_subset) is int:
        n_cells = cell_subset
    self.trainCells = df_to_impute.sample(n_cells, replace=False).index
    print(
        "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
        .format(
            n_cells,
            1. * n_cells / df_to_impute.shape[0],
            n_cores,
            self.NN_params["n_cores"],
        ))
    if self.trainingParams is None:
        # One hyperparameter dict per subnetwork.
        self.trainingParams = [self.NN_params] * len(self.inOutGenes)
    # -------------------# Preprocessing (if any) #--------------------#
    df_to_impute = self.norm.fit(df_to_impute).transform(df_to_impute)
    # -------------------# Share matrix between subprocesses #--------------------#
    """ Create memory chunk and put the matrix in it """
    idx, cols = self.trainCells, df_to_impute.columns
    trainData = df_to_impute.loc[self.trainCells, :].values
    """ Parallelize process with shared array """
    # One child job per subnetwork: (gene sets, hyperparams, labels, mode, flag).
    childJobs = [(in_out, trainingParams, (idx, cols), "train",
                  retrieve_training)
                 for in_out, trainingParams in zip(
                     self.inOutGenes, self.trainingParams)]
    self.trainingParams = self._runOnMultipleCores(n_cores,
                                                   trainData.flatten(),
                                                   childJobs)
    # Rebuild Net objects from the parameter dicts returned by the workers.
    self.networks = []
    for dictionnary in self.trainingParams:
        self.networks.append(Net(**dictionnary))
    return self
def test_get_input_genes(self):
    """Smoke test: get_input_genes runs on a 10x10 identity matrix.

    Previously the result was discarded without any check; assert at
    least that the call returned a usable (non-None) value.
    """
    data = pd.DataFrame(np.identity(10))
    res = util.get_input_genes(data, [2, 5], targets=[[0, 1]])
    assert res is not None
def test_get_input_genes(self):
    """Smoke test: get_input_genes accepts a 10x10 identity matrix."""
    identity_frame = pd.DataFrame(np.identity(10))
    util.get_input_genes(identity_frame, 2, nbest=5, targets=[[0, 1]])
def fit(self, data, NN_lim='auto', cell_subset=None):
    """Partition target genes into subnetwork lists and train the networks.

    data : expression matrix; rows are cells, columns are genes (cells
        are sampled row-wise below).
    NN_lim : cap on target-gene count, forwarded to ``get_target_genes``.
    cell_subset : float fraction or int count of training cells; None
        keeps all cells.
    Returns ``self`` (fitted; populates ``self.trainCells`` and
    ``self.networks``).
    """
    # Seed the RNG so gene sampling and cell subsetting are reproducible.
    np.random.seed(seed=self.seed)
    df = pd.DataFrame(data)
    self.setIDandRundir(df)
    # Change the output dimension if the data has too few genes
    if df.shape[1] < self.NN_params['dims'][1]:
        self.NN_params['dims'][1] = df.shape[1]
    # Choose genes to impute
    # Rank genes by their 99th-percentile expression, highest first.
    genes_sort = df.quantile(.99).sort_values(ascending=False)
    NN_genes = get_target_genes(genes_sort, NN_lim=NN_lim)
    df_to_impute = df[NN_genes]
    n_runs, n_cores = self.getCores(NN_genes)
    # ------------------------# Subnetworks #------------------------#
    # Candidate predictors: target genes expressed above predictorLimit.
    predictors = np.intersect1d(
        genes_sort.index[genes_sort > self.predictorLimit], NN_genes)
    print('Using {} genes as potential predictors'.format(len(predictors)))
    # Partition the target genes into n_choose disjoint lists of equal width.
    n_choose = int(len(NN_genes) / self.NN_params['dims'][1])
    subGenelists = np.random.choice(NN_genes,
                                    [n_choose, self.NN_params['dims'][1]],
                                    replace=False).tolist()
    if n_choose < n_runs:
        # Special case: for the last run, the output layer will have less nodes
        selectedGenes = np.reshape(subGenelists, -1)
        subGenelists.append(np.setdiff1d(NN_genes, selectedGenes).tolist())
    # ------------------------# Extracting input genes #------------------------#
    # Distance = 1 - |correlation|, restricted to predictor columns.
    corrMatrix = 1 - np.abs(
        pd.DataFrame(np.corrcoef(df_to_impute.T),
                     index=NN_genes,
                     columns=NN_genes)[predictors])
    in_out_genes = get_input_genes(df_to_impute,
                                   self.NN_params['dims'],
                                   distanceMatrix=corrMatrix,
                                   targets=subGenelists,
                                   predictorLimit=self.predictorLimit)
    # ------------------------# Subsets for fitting #------------------------#
    # cell_subset: float = fraction of cells; int (other than 1) = absolute count.
    n_cells = df_to_impute.shape[0]
    if type(cell_subset) is float or cell_subset == 1:
        n_cells = int(cell_subset * n_cells)
    elif type(cell_subset) is int:
        n_cells = cell_subset
    self.trainCells = df_to_impute.sample(n_cells, replace=False).index
    print(
        'Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread).'
        .format(n_cells,
                1. * n_cells / df_to_impute.shape[0],
                n_cores,
                self.NN_params['n_cores']))
    # -------------------# Preprocessing (if any) #--------------------#
    df_to_impute = self.norm.fit(df_to_impute).transform(df_to_impute)
    # -------------------# Share matrix between subprocesses #--------------------#
    ''' Create memory chunk and put the matrix in it '''
    idx, cols = self.trainCells, df_to_impute.columns
    trainData = df_to_impute.loc[self.trainCells, :].values
    ''' Parallelize process with shared array '''
    # One child job per subnetwork: (gene sets, hyperparams, labels, mode).
    childJobs = [(in_out, self.NN_params, (idx, cols), 'train')
                 for in_out in in_out_genes]
    output_dicts = self.runOnMultipleCores(n_cores, trainData.flatten(),
                                           childJobs)
    # Rebuild Net objects from the parameter dicts returned by the workers.
    self.networks = []
    for dictionnary in output_dicts:
        self.networks.append(Net(**dictionnary))
    return self