def test_init_works(self):
    """Smoke test: a Net can be constructed and reconfigured via set_params."""
    hidden_layer = {'label': 'dense', 'activation': 'relu', 'nb_neurons': 50}
    net = Net()
    net.set_params(max_epochs=50,
                   learning_rate=1e-3,
                   batch_size=50,
                   layer2=hidden_layer,
                   ncores=4)
def trainNet(in_out, NN_param_i, data_i, labels):
    """Fit a single sub-network and return the attributes needed to rebuild it.

    Parameters
    ----------
    in_out : tuple
        ``(predictorGenes, targetGenes)`` pair for this sub-network.
    NN_param_i : dict
        Keyword arguments used to construct the ``Net``.
    data_i : array-like
        Training data passed through to ``Net.fit``.
    labels : object
        Row/column labels forwarded to ``Net.fit``.

    Returns
    -------
    dict
        Mapping of attribute name -> fitted value, with any single leading
        underscore stripped from attribute names so the dict can be splatted
        back into ``Net(**...)``.
    """
    features, targets = in_out
    net = Net(**NN_param_i)
    net.fit(data_i, targetGenes=targets, predictorGenes=features, labels=labels)
    # Retrieve every constructor parameter plus the fitted gene lists / id.
    params = list(NN_param_i.keys()) + ['targetGenes', 'NNid', 'predictorGenes']
    args2return = [(attr, getattr(net, attr)) for attr in params]
    # BUG FIX: the original comprehension
    #   {k: v if k[0] != '_' else (k[1:], v) for k, v in args2return}
    # applied the conditional to the VALUE, so a private attribute kept its
    # underscored key and got a (name, value) tuple as its value. The intent
    # was to strip the leading underscore from the KEY.
    return {(k[1:] if k.startswith('_') else k): v for k, v in args2return}
def _trainNet(in_out, NN_param_i, data_i, labels, retrieve_training=False):
    """Fit a single sub-network and return the attributes needed to rebuild it.

    Same contract as ``trainNet`` with an extra ``retrieve_training`` flag that
    is forwarded verbatim to ``Net.fit``.

    Returns
    -------
    dict
        Mapping of attribute name -> fitted value, with any single leading
        underscore stripped from attribute names so the dict can be splatted
        back into ``Net(**...)``.
    """
    features, targets = in_out
    net = Net(**NN_param_i)
    net.fit(data_i,
            targetGenes=targets,
            predictorGenes=features,
            labels=labels,
            retrieve_training=retrieve_training)
    # Retrieve every constructor parameter plus the fitted gene lists / id.
    params = list(NN_param_i.keys()) + ["targetGenes", "NNid", "predictorGenes"]
    args2return = [(attr, getattr(net, attr)) for attr in params]
    # BUG FIX: the original comprehension
    #   {k: v if k[0] != "_" else (k[1:], v) for k, v in args2return}
    # applied the conditional to the VALUE, producing (name, value) tuples for
    # private attributes instead of stripping the underscore from the key.
    return {(k[1:] if k.startswith("_") else k): v for k, v in args2return}
def test_preprocess(self):
    """End-to-end smoke test: select high-expression genes, log-transform,
    then fit, predict and score a Net on the transformed matrix."""
    raw = test_data.rawData
    # Keep the 2000 genes with the highest 99th-percentile expression.
    top_genes = raw.quantile(.99).sort_values(ascending=False).index[0:2000]
    log_data = np.log10(1 + raw[top_genes])
    layers = [
        {'label': 'dense', 'activation': 'relu', 'nb_neurons': 100},
        {'label': 'dropout', 'activation': 'dropout', 'rate': 0.15},
        {'label': 'dense', 'activation': 'relu'},
    ]
    model = Net(layers=layers, n_cores=6)
    model.fit(log_data)
    _ = model.predict(log_data)
    print(model.score(log_data))
def _predictNet(data_i, NN_param_i, labels):
    """Rebuild a Net from its saved parameters and predict on a flat buffer.

    ``labels`` holds the axis labels (rows first, columns second); the flat
    ``data_i`` array is reshaped to match before being handed to the network.
    """
    net = Net(**NN_param_i)
    shape = [len(axis) for axis in labels]
    frame = pd.DataFrame(np.reshape(data_i, shape),
                         index=labels[0],
                         columns=labels[1])
    return net.predict(frame)
def fit(self, data, NN_lim="auto", cell_subset=None, NN_genes=None, retrieve_training=False):
    """Train an ensemble of sub-networks to impute gene expression.

    Parameters
    ----------
    data : array-like or DataFrame
        Expression matrix, cells as rows and genes as columns.
    NN_lim : "auto" or int
        Cap on the number of target genes (forwarded to ``_get_target_genes``).
    cell_subset : None, float or int
        Fraction (float) or absolute number (int) of cells to train on;
        ``None`` uses all cells.
    NN_genes : list or None
        Explicit target gene names; when ``None`` they are derived from the
        99th-percentile expression level.
    retrieve_training : bool
        Forwarded to each child training job.

    Returns
    -------
    self
    """
    np.random.seed(seed=self.seed)
    targetGeneNames = NN_genes
    inputExpressionMatrixDF = pd.DataFrame(data)
    print("Input dataset is {} genes (columns) and {} cells (rows)".format(
        inputExpressionMatrixDF.shape[1], inputExpressionMatrixDF.shape[0]))
    print("First 3 rows and columns:")
    print(inputExpressionMatrixDF.iloc[0:3, 0:3])
    self._setIDandRundir(inputExpressionMatrixDF)
    # Change the output dimension if the data has too few genes
    if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
        self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]
    subnetOutputColumns = self.NN_params["dims"][1]
    # Choose genes to impute: rank genes by their 99th-percentile expression.
    # geneCounts = inputExpressionMatrixDF.sum().sort_values(ascending=False)
    geneQuantiles = inputExpressionMatrixDF.quantile(.99).sort_values(
        ascending=False)
    if targetGeneNames is None:
        targetGeneNames = _get_target_genes(
            geneQuantiles,
            minExpressionLevel=self._minExpressionLevel,
            maxNumOfGenes=NN_lim)
    df_to_impute = inputExpressionMatrixDF[targetGeneNames]
    numberOfTargetGenes = len(targetGeneNames)
    if (numberOfTargetGenes == 0):
        raise Exception(
            "Unable to compute any target genes. Is your data log transformed? Perhaps try with a lower minExpressionLevel."
        )
    n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)
    # ------------------------# Subnetworks #------------------------#
    # Randomly partition the target genes into groups of subnetOutputColumns.
    n_choose = int(numberOfTargetGenes / subnetOutputColumns)
    subGenelists = np.random.choice(targetGeneNames,
                                    [n_choose, subnetOutputColumns],
                                    replace=False).tolist()
    if n_choose < n_runs:
        # Special case: for the last run, the output layer will have previous targets
        # (the leftover genes are padded with already-assigned genes so every
        # sub-network has exactly subnetOutputColumns outputs).
        selectedGenes = np.reshape(subGenelists, -1)
        leftOutGenes = np.setdiff1d(targetGeneNames, selectedGenes)
        fill_genes = np.random.choice(targetGeneNames,
                                      subnetOutputColumns - len(leftOutGenes),
                                      replace=False)
        subGenelists.append(
            np.concatenate([leftOutGenes, fill_genes]).tolist())
    # ------------------------# Extracting input genes #------------------------#
    # Distance = 1 - |Pearson correlation| between target genes.
    corrMatrix = 1 - np.abs(
        pd.DataFrame(np.corrcoef(df_to_impute.T),
                     index=targetGeneNames,
                     columns=targetGeneNames))
    if self.inOutGenes is None:
        self.inOutGenes = get_input_genes(
            df_to_impute,
            self.NN_params["dims"],
            distanceMatrix=corrMatrix,
            targets=subGenelists,
            #predictorDropoutLimit=self.predictorDropoutLimit
        )
    # ------------------------# Subsets for fitting #------------------------#
    # cell_subset: float -> fraction of cells, int -> absolute cell count.
    n_cells = df_to_impute.shape[0]
    if type(cell_subset) is float or cell_subset == 1:
        n_cells = int(cell_subset * n_cells)
    elif type(cell_subset) is int:
        n_cells = cell_subset
    self.trainCells = df_to_impute.sample(n_cells, replace=False).index
    print(
        "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
        .format(
            n_cells,
            1. * n_cells / df_to_impute.shape[0],
            n_cores,
            self.NN_params["n_cores"],
        ))
    if self.trainingParams is None:
        self.trainingParams = [self.NN_params] * len(self.inOutGenes)
    # -------------------# Preprocessing (if any) #--------------------#
    normalizer = Normalizer.fromName(self.norm)
    df_to_impute = normalizer.fit(df_to_impute).transform(df_to_impute)
    # -------------------# Share matrix between subprocesses #--------------------#
    """ Create memory chunk and put the matrix in it """
    idx, cols = self.trainCells, df_to_impute.columns
    trainData = df_to_impute.loc[self.trainCells, :].values
    """ Parallelize process with shared array """
    # One child job per sub-network; each re-fits against the shared matrix.
    childJobs = [(in_out, trainingParams, (idx, cols), "train",
                  retrieve_training) for in_out, trainingParams in zip(
                      self.inOutGenes, self.trainingParams)]
    self.trainingParams = self._runOnMultipleCores(n_cores,
                                                   trainData.flatten(),
                                                   childJobs)
    # Rebuild Net objects from the parameter dicts returned by the workers.
    self.networks = []
    for dictionnary in self.trainingParams:
        self.networks.append(Net(**dictionnary))
    print('---- Hyperparameters summary ----')
    self.networks[0].display_params()
    return self
def fit(self, data, NN_lim="auto", cell_subset=None, NN_genes=None, retrieve_training=False):
    """Train an ensemble of sub-networks to impute gene expression.

    Variant that restricts predictor genes to those whose 99th-percentile
    expression exceeds ``self.predictorLimit``.

    Parameters
    ----------
    data : array-like or DataFrame
        Expression matrix, cells as rows and genes as columns.
    NN_lim : "auto" or int
        Cap on the number of target genes (forwarded to ``_get_target_genes``).
    cell_subset : None, float or int
        Fraction (float) or absolute number (int) of cells to train on;
        ``None`` uses all cells.
    NN_genes : list or None
        Explicit target gene names; ``None`` derives them from quantiles.
    retrieve_training : bool
        Forwarded to each child training job.

    Returns
    -------
    self
    """
    np.random.seed(seed=self.seed)
    targetGeneNames = NN_genes
    inputExpressionMatrixDF = pd.DataFrame(data)
    print("Input dataset is {} genes and {} cells".format(
        inputExpressionMatrixDF.shape[1], inputExpressionMatrixDF.shape[0]))
    print("First 3 rows and columns:")
    print(inputExpressionMatrixDF.iloc[0:3, 0:3])
    self._setIDandRundir(inputExpressionMatrixDF)
    # Change the output dimension if the data has too few genes
    if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
        self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]
    outputColumns = self.NN_params["dims"][1]
    # Choose genes to impute: rank genes by their 99th-percentile expression.
    imputeOverThisThreshold = .99
    geneQuantiles = inputExpressionMatrixDF.quantile(
        imputeOverThisThreshold).sort_values(ascending=False)
    if targetGeneNames is None:
        targetGeneNames = _get_target_genes(
            geneQuantiles,
            minExpressionLevel=self._minExpressionLevel,
            maxNumOfGenes=NN_lim)
    df_to_impute = inputExpressionMatrixDF[targetGeneNames]
    numberOfTargetGenes = len(targetGeneNames)
    n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)
    # ------------------------# Subnetworks #------------------------#
    # Only sufficiently-expressed target genes may act as predictors.
    predictorGeneNames = np.intersect1d(
        geneQuantiles.index[geneQuantiles > self.predictorLimit],
        targetGeneNames)
    # Randomly partition the target genes into groups of outputColumns.
    n_choose = int(numberOfTargetGenes / outputColumns)
    subGenelists = np.random.choice(targetGeneNames,
                                    [n_choose, outputColumns],
                                    replace=False).tolist()
    if n_choose < n_runs:
        # Special case: for the last run, the output layer will have less nodes
        selectedGenes = np.reshape(subGenelists, -1)
        subGenelists.append(
            np.setdiff1d(targetGeneNames, selectedGenes).tolist())
    # ------------------------# Extracting input genes #------------------------#
    # Distance = 1 - |Pearson correlation|, restricted to predictor columns.
    corrMatrix = 1 - np.abs(
        pd.DataFrame(np.corrcoef(df_to_impute.T),
                     index=targetGeneNames,
                     columns=targetGeneNames)[predictorGeneNames])
    if self.inOutGenes is None:
        self.inOutGenes = get_input_genes(
            df_to_impute,
            self.NN_params["dims"],
            distanceMatrix=corrMatrix,
            targets=subGenelists,
            predictorLimit=self.predictorLimit,
        )
    # ------------------------# Subsets for fitting #------------------------#
    # cell_subset: float -> fraction of cells, int -> absolute cell count.
    n_cells = df_to_impute.shape[0]
    if type(cell_subset) is float or cell_subset == 1:
        n_cells = int(cell_subset * n_cells)
    elif type(cell_subset) is int:
        n_cells = cell_subset
    self.trainCells = df_to_impute.sample(n_cells, replace=False).index
    print(
        "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
        .format(
            n_cells,
            1. * n_cells / df_to_impute.shape[0],
            n_cores,
            self.NN_params["n_cores"],
        ))
    if self.trainingParams is None:
        self.trainingParams = [self.NN_params] * len(self.inOutGenes)
    # -------------------# Preprocessing (if any) #--------------------#
    # NOTE(review): here self.norm is used directly as a normalizer object
    # (other variants resolve it by name) — confirm which contract applies.
    df_to_impute = self.norm.fit(df_to_impute).transform(df_to_impute)
    # -------------------# Share matrix between subprocesses #--------------------#
    """ Create memory chunk and put the matrix in it """
    idx, cols = self.trainCells, df_to_impute.columns
    trainData = df_to_impute.loc[self.trainCells, :].values
    """ Parallelize process with shared array """
    # One child job per sub-network; each re-fits against the shared matrix.
    childJobs = [(in_out, trainingParams, (idx, cols), "train",
                  retrieve_training) for in_out, trainingParams in zip(
                      self.inOutGenes, self.trainingParams)]
    self.trainingParams = self._runOnMultipleCores(n_cores,
                                                   trainData.flatten(),
                                                   childJobs)
    # Rebuild Net objects from the parameter dicts returned by the workers.
    self.networks = []
    for dictionnary in self.trainingParams:
        self.networks.append(Net(**dictionnary))
    return self
def fit(self, data, NN_lim='auto', cell_subset=None):
    """Train an ensemble of sub-networks to impute gene expression.

    Older variant: no explicit target-gene override and no
    ``retrieve_training`` flag; sub-network job lists are built from the
    shared ``self.NN_params`` for every run.

    Parameters
    ----------
    data : array-like or DataFrame
        Expression matrix, cells as rows and genes as columns.
    NN_lim : 'auto' or int
        Cap on the number of target genes (forwarded to ``get_target_genes``).
    cell_subset : None, float or int
        Fraction (float) or absolute number (int) of cells to train on;
        ``None`` uses all cells.

    Returns
    -------
    self
    """
    np.random.seed(seed=self.seed)
    df = pd.DataFrame(data)
    self.setIDandRundir(df)
    # Change the output dimension if the data has too few genes
    if df.shape[1] < self.NN_params['dims'][1]:
        self.NN_params['dims'][1] = df.shape[1]
    # Choose genes to impute: rank genes by their 99th-percentile expression.
    genes_sort = df.quantile(.99).sort_values(ascending=False)
    NN_genes = get_target_genes(genes_sort, NN_lim=NN_lim)
    df_to_impute = df[NN_genes]
    n_runs, n_cores = self.getCores(NN_genes)
    # ------------------------# Subnetworks #------------------------#
    # Only sufficiently-expressed target genes may act as predictors.
    predictors = np.intersect1d(
        genes_sort.index[genes_sort > self.predictorLimit], NN_genes)
    print('Using {} genes as potential predictors'.format(len(predictors)))
    # Randomly partition the target genes into output-layer-sized groups.
    n_choose = int(len(NN_genes) / self.NN_params['dims'][1])
    subGenelists = np.random.choice(NN_genes,
                                    [n_choose, self.NN_params['dims'][1]],
                                    replace=False).tolist()
    if n_choose < n_runs:
        # Special case: for the last run, the output layer will have less nodes
        selectedGenes = np.reshape(subGenelists, -1)
        subGenelists.append(np.setdiff1d(NN_genes, selectedGenes).tolist())
    # ------------------------# Extracting input genes #------------------------#
    # Distance = 1 - |Pearson correlation|, restricted to predictor columns.
    corrMatrix = 1 - np.abs(
        pd.DataFrame(np.corrcoef(df_to_impute.T),
                     index=NN_genes,
                     columns=NN_genes)[predictors])
    in_out_genes = get_input_genes(df_to_impute,
                                   self.NN_params['dims'],
                                   distanceMatrix=corrMatrix,
                                   targets=subGenelists,
                                   predictorLimit=self.predictorLimit)
    # ------------------------# Subsets for fitting #------------------------#
    # cell_subset: float -> fraction of cells, int -> absolute cell count.
    n_cells = df_to_impute.shape[0]
    if type(cell_subset) is float or cell_subset == 1:
        n_cells = int(cell_subset * n_cells)
    elif type(cell_subset) is int:
        n_cells = cell_subset
    self.trainCells = df_to_impute.sample(n_cells, replace=False).index
    print(
        'Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread).'
        .format(n_cells,
                1. * n_cells / df_to_impute.shape[0],
                n_cores,
                self.NN_params['n_cores']))
    # -------------------# Preprocessing (if any) #--------------------#
    # NOTE(review): self.norm is used directly as a normalizer object here.
    df_to_impute = self.norm.fit(df_to_impute).transform(df_to_impute)
    # -------------------# Share matrix between subprocesses #--------------------#
    ''' Create memory chunk and put the matrix in it '''
    idx, cols = self.trainCells, df_to_impute.columns
    trainData = df_to_impute.loc[self.trainCells, :].values
    ''' Parallelize process with shared array '''
    # One child job per sub-network, all sharing the same NN_params.
    childJobs = [(in_out, self.NN_params, (idx, cols), 'train')
                 for in_out in in_out_genes]
    output_dicts = self.runOnMultipleCores(n_cores, trainData.flatten(),
                                           childJobs)
    # Rebuild Net objects from the parameter dicts returned by the workers.
    self.networks = []
    for dictionnary in output_dicts:
        self.networks.append(Net(**dictionnary))
    return self