Esempio n. 1
0
 def test_init_works(self):
     """Smoke test: a fresh Net accepts hyperparameters through set_params."""
     network = Net()
     hyperparams = {
         'max_epochs': 50,
         'learning_rate': 1e-3,
         'batch_size': 50,
         'layer2': {'label': 'dense', 'activation': 'relu', 'nb_neurons': 50},
         'ncores': 4,
     }
     network.set_params(**hyperparams)
Esempio n. 2
0
def trainNet(in_out, NN_param_i, data_i, labels):
    """Build a Net from ``NN_param_i``, fit it on ``data_i``, and return
    the trained net's attributes as a dict.

    ``in_out`` is a ``(features, targets)`` pair of gene lists; ``labels``
    is forwarded to ``Net.fit`` unchanged.
    """
    features, targets = in_out

    net = Net(**NN_param_i)
    net.fit(data_i,
            targetGenes=targets,
            predictorGenes=features,
            labels=labels)

    # Gather the constructor params plus the fit-time attributes.
    attr_names = list(NN_param_i.keys())
    attr_names += ['targetGenes', 'NNid', 'predictorGenes']

    result = {}
    for name in attr_names:
        value = getattr(net, name)
        # NOTE(review): for names starting with '_' the value is replaced by a
        # (stripped-name, value) tuple while the key keeps its underscore —
        # looks odd, but preserved as-is; confirm intent with the callers.
        result[name] = value if name[0] != '_' else (name[1:], value)
    return result
Esempio n. 3
0
def _trainNet(in_out, NN_param_i, data_i, labels, retrieve_training=False):
    """Build a Net from ``NN_param_i``, fit it on ``data_i``, and return
    the trained net's attributes as a dict.

    ``in_out`` is a ``(features, targets)`` pair of gene lists;
    ``retrieve_training`` is passed straight through to ``Net.fit``.
    """
    features, targets = in_out

    net = Net(**NN_param_i)
    net.fit(data_i,
            targetGenes=targets,
            predictorGenes=features,
            labels=labels,
            retrieve_training=retrieve_training)

    # Gather the constructor params plus the fit-time attributes.
    attr_names = list(NN_param_i.keys())
    attr_names += ["targetGenes", "NNid", "predictorGenes"]

    result = {}
    for name in attr_names:
        value = getattr(net, name)
        # NOTE(review): for names starting with '_' the value is replaced by a
        # (stripped-name, value) tuple while the key keeps its underscore —
        # looks odd, but preserved as-is; confirm intent with the callers.
        result[name] = value if name[0] != "_" else (name[1:], value)
    return result
Esempio n. 4
0
    def test_preprocess(self):
        """End-to-end smoke test: select genes, log-transform, fit, predict, score."""
        rawData = test_data.rawData

        # Keep the 2000 genes with the highest 99th-percentile expression.
        topGenes = rawData.quantile(.99).sort_values(ascending=False).index[0:2000]
        rawData = np.log10(1 + rawData[topGenes])

        layer_spec = [
            {'label': 'dense', 'activation': 'relu', 'nb_neurons': 100},
            {'label': 'dropout', 'activation': 'dropout', 'rate': 0.15},
            {'label': 'dense', 'activation': 'relu'},
        ]

        model = Net(layers=layer_spec, n_cores=6)
        model.fit(rawData)
        _ = model.predict(rawData)
        print(model.score(rawData))
Esempio n. 5
0
def _predictNet(data_i, NN_param_i, labels):
    """Rebuild the flattened array as a labelled DataFrame and predict on it.

    ``labels`` supplies the axis labels: labels[0] becomes the index and
    labels[1] the columns; the array is reshaped to their lengths.
    """
    net = Net(**NN_param_i)
    shape = [len(axis) for axis in labels]
    frame = pd.DataFrame(np.reshape(data_i, shape),
                         index=labels[0],
                         columns=labels[1])
    return net.predict(frame)
Esempio n. 6
0
    def fit(self,
            data,
            NN_lim="auto",
            cell_subset=None,
            NN_genes=None,
            retrieve_training=False):
        """Train one sub-network per group of target genes, in parallel.

        Parameters
        ----------
        data : array-like / DataFrame
            Expression matrix with cells as rows and genes as columns
            (per the shape report printed below).
        NN_lim : "auto" or int
            Forwarded to ``_get_target_genes`` as ``maxNumOfGenes``.
        cell_subset : None, float or int
            Fraction (float) or absolute count (int) of cells to train on.
            None or 1 means all cells (see note at the check below).
        NN_genes : list or None
            Explicit target genes; when None they are chosen from
            99th-percentile expression levels.
        retrieve_training : bool
            Passed through to each child training job.

        Returns
        -------
        self
        """
        np.random.seed(seed=self.seed)
        targetGeneNames = NN_genes

        inputExpressionMatrixDF = pd.DataFrame(data)
        print("Input dataset is {} genes (columns) and {} cells (rows)".format(
            inputExpressionMatrixDF.shape[1],
            inputExpressionMatrixDF.shape[0]))
        print("First 3 rows and columns:")
        print(inputExpressionMatrixDF.iloc[0:3, 0:3])

        self._setIDandRundir(inputExpressionMatrixDF)

        # Change the output dimension if the data has too few genes
        if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
            self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]

        # Number of output neurons of each sub-network.
        subnetOutputColumns = self.NN_params["dims"][1]

        # Choose genes to impute
        # geneCounts = inputExpressionMatrixDF.sum().sort_values(ascending=False)
        # Per-gene 99th-percentile expression, highest first.
        geneQuantiles = inputExpressionMatrixDF.quantile(.99).sort_values(
            ascending=False)

        if targetGeneNames is None:
            targetGeneNames = _get_target_genes(
                geneQuantiles,
                minExpressionLevel=self._minExpressionLevel,
                maxNumOfGenes=NN_lim)

        df_to_impute = inputExpressionMatrixDF[targetGeneNames]

        numberOfTargetGenes = len(targetGeneNames)
        if (numberOfTargetGenes == 0):
            raise Exception(
                "Unable to compute any target genes. Is your data log transformed? Perhaps try with a lower minExpressionLevel."
            )

        n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)

        # ------------------------# Subnetworks #------------------------#

        # Number of full-size target groups that fit in the gene list.
        n_choose = int(numberOfTargetGenes / subnetOutputColumns)

        # Random, non-overlapping partition of targets into groups of
        # subnetOutputColumns genes each.
        subGenelists = np.random.choice(targetGeneNames,
                                        [n_choose, subnetOutputColumns],
                                        replace=False).tolist()

        if n_choose < n_runs:
            # Special case: for the last run, the output layer will have previous targets
            # Pad the leftover genes with already-selected ones so the last
            # group is also exactly subnetOutputColumns wide.
            selectedGenes = np.reshape(subGenelists, -1)
            leftOutGenes = np.setdiff1d(targetGeneNames, selectedGenes)

            fill_genes = np.random.choice(targetGeneNames,
                                          subnetOutputColumns -
                                          len(leftOutGenes),
                                          replace=False)

            subGenelists.append(
                np.concatenate([leftOutGenes, fill_genes]).tolist())

        # ------------------------# Extracting input genes #------------------------#

        # Distance matrix: 1 - |Pearson correlation| between target genes.
        corrMatrix = 1 - np.abs(
            pd.DataFrame(np.corrcoef(df_to_impute.T),
                         index=targetGeneNames,
                         columns=targetGeneNames))

        if self.inOutGenes is None:

            self.inOutGenes = get_input_genes(
                df_to_impute,
                self.NN_params["dims"],
                distanceMatrix=corrMatrix,
                targets=subGenelists,
                #predictorDropoutLimit=self.predictorDropoutLimit
            )

        # ------------------------# Subsets for fitting #------------------------#

        n_cells = df_to_impute.shape[0]

        # NOTE(review): an int cell_subset of exactly 1 is caught by the first
        # branch (1 * n_cells == all cells) rather than being treated as "one
        # cell" — presumably intentional; confirm.
        if type(cell_subset) is float or cell_subset == 1:
            n_cells = int(cell_subset * n_cells)

        elif type(cell_subset) is int:
            n_cells = cell_subset

        self.trainCells = df_to_impute.sample(n_cells, replace=False).index

        print(
            "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
            .format(
                n_cells,
                1. * n_cells / df_to_impute.shape[0],
                n_cores,
                self.NN_params["n_cores"],
            ))

        # NOTE(review): this replicates the SAME dict object across all
        # entries; any in-place mutation of one entry affects all — confirm
        # downstream code treats them as read-only.
        if self.trainingParams is None:
            self.trainingParams = [self.NN_params] * len(self.inOutGenes)

        # -------------------# Preprocessing (if any) #--------------------#

        normalizer = Normalizer.fromName(self.norm)

        df_to_impute = normalizer.fit(df_to_impute).transform(df_to_impute)

        # -------------------# Share matrix between subprocesses #--------------------#
        """ Create memory chunk and put the matrix in it """
        idx, cols = self.trainCells, df_to_impute.columns
        trainData = df_to_impute.loc[self.trainCells, :].values
        """ Parallelize process with shared array """
        # One child job per sub-network: (in/out genes, params, axis labels,
        # mode flag, retrieve_training).
        childJobs = [(in_out, trainingParams, (idx, cols), "train",
                      retrieve_training) for in_out, trainingParams in zip(
                          self.inOutGenes, self.trainingParams)]

        self.trainingParams = self._runOnMultipleCores(n_cores,
                                                       trainData.flatten(),
                                                       childJobs)

        # Rebuild one Net per returned parameter dict.
        self.networks = []
        for dictionnary in self.trainingParams:
            self.networks.append(Net(**dictionnary))

        print('---- Hyperparameters summary ----')
        self.networks[0].display_params()

        return self
Esempio n. 7
0
    def fit(self,
            data,
            NN_lim="auto",
            cell_subset=None,
            NN_genes=None,
            retrieve_training=False):
        """Train one sub-network per group of target genes, in parallel.

        Parameters
        ----------
        data : array-like / DataFrame
            Expression matrix; shape[1] is reported as genes and shape[0]
            as cells by the print below.
        NN_lim : "auto" or int
            Forwarded to ``_get_target_genes`` as ``maxNumOfGenes``.
        cell_subset : None, float or int
            Fraction (float) or absolute count (int) of cells to train on.
            None or 1 means all cells.
        NN_genes : list or None
            Explicit target genes; when None they are chosen from
            99th-percentile expression levels.
        retrieve_training : bool
            Passed through to each child training job.

        Returns
        -------
        self
        """
        np.random.seed(seed=self.seed)
        targetGeneNames = NN_genes

        inputExpressionMatrixDF = pd.DataFrame(data)
        print("Input dataset is {} genes and {} cells".format(
            inputExpressionMatrixDF.shape[1],
            inputExpressionMatrixDF.shape[0]))
        print("First 3 rows and columns:")
        print(inputExpressionMatrixDF.iloc[0:3, 0:3])

        self._setIDandRundir(inputExpressionMatrixDF)

        # Change the output dimension if the data has too few genes
        if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
            self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]

        # Number of output neurons of each sub-network.
        outputColumns = self.NN_params["dims"][1]

        # Choose genes to impute
        imputeOverThisThreshold = .99
        # Per-gene 99th-percentile expression, highest first.
        geneQuantiles = inputExpressionMatrixDF.quantile(
            imputeOverThisThreshold).sort_values(ascending=False)

        if targetGeneNames is None:
            targetGeneNames = _get_target_genes(
                geneQuantiles,
                minExpressionLevel=self._minExpressionLevel,
                maxNumOfGenes=NN_lim)

        df_to_impute = inputExpressionMatrixDF[targetGeneNames]

        numberOfTargetGenes = len(targetGeneNames)
        n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)

        # ------------------------# Subnetworks #------------------------#

        # Predictors: target genes whose quantile exceeds predictorLimit.
        predictorGeneNames = np.intersect1d(
            geneQuantiles.index[geneQuantiles > self.predictorLimit],
            targetGeneNames)

        # Number of full-size target groups that fit in the gene list.
        n_choose = int(numberOfTargetGenes / outputColumns)

        # Random, non-overlapping partition of targets into groups of
        # outputColumns genes each.
        subGenelists = np.random.choice(targetGeneNames,
                                        [n_choose, outputColumns],
                                        replace=False).tolist()
        if n_choose < n_runs:
            # Special case: for the last run, the output layer will have less nodes
            selectedGenes = np.reshape(subGenelists, -1)
            subGenelists.append(
                np.setdiff1d(targetGeneNames, selectedGenes).tolist())

        # ------------------------# Extracting input genes #------------------------#

        # Distance matrix: 1 - |Pearson correlation|, restricted to the
        # predictor columns.
        corrMatrix = 1 - np.abs(
            pd.DataFrame(np.corrcoef(df_to_impute.T),
                         index=targetGeneNames,
                         columns=targetGeneNames)[predictorGeneNames])

        if self.inOutGenes is None:

            self.inOutGenes = get_input_genes(
                df_to_impute,
                self.NN_params["dims"],
                distanceMatrix=corrMatrix,
                targets=subGenelists,
                predictorLimit=self.predictorLimit,
            )

        # ------------------------# Subsets for fitting #------------------------#

        n_cells = df_to_impute.shape[0]

        # NOTE(review): an int cell_subset of exactly 1 takes the first branch
        # (all cells), not "one cell" — presumably intentional; confirm.
        if type(cell_subset) is float or cell_subset == 1:
            n_cells = int(cell_subset * n_cells)

        elif type(cell_subset) is int:
            n_cells = cell_subset

        self.trainCells = df_to_impute.sample(n_cells, replace=False).index

        print(
            "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
            .format(
                n_cells,
                1. * n_cells / df_to_impute.shape[0],
                n_cores,
                self.NN_params["n_cores"],
            ))

        # NOTE(review): this replicates the SAME dict object across all
        # entries; in-place mutation of one entry affects all — confirm
        # downstream code treats them as read-only.
        if self.trainingParams is None:
            self.trainingParams = [self.NN_params] * len(self.inOutGenes)

        # -------------------# Preprocessing (if any) #--------------------#

        df_to_impute = self.norm.fit(df_to_impute).transform(df_to_impute)

        # -------------------# Share matrix between subprocesses #--------------------#
        """ Create memory chunk and put the matrix in it """
        idx, cols = self.trainCells, df_to_impute.columns
        trainData = df_to_impute.loc[self.trainCells, :].values
        """ Parallelize process with shared array """
        # One child job per sub-network: (in/out genes, params, axis labels,
        # mode flag, retrieve_training).
        childJobs = [(in_out, trainingParams, (idx, cols), "train",
                      retrieve_training) for in_out, trainingParams in zip(
                          self.inOutGenes, self.trainingParams)]

        self.trainingParams = self._runOnMultipleCores(n_cores,
                                                       trainData.flatten(),
                                                       childJobs)

        # Rebuild one Net per returned parameter dict.
        self.networks = []
        for dictionnary in self.trainingParams:
            self.networks.append(Net(**dictionnary))

        return self
Esempio n. 8
0
    def fit(self, data, NN_lim='auto', cell_subset=None):
        """Train one sub-network per group of target genes, in parallel.

        Parameters
        ----------
        data : array-like / DataFrame
            Expression matrix, cells as rows and genes as columns.
        NN_lim : 'auto' or int
            Forwarded to ``get_target_genes``.
        cell_subset : None, float or int
            Fraction (float) or absolute count (int) of cells to train on;
            None or 1 uses all cells.

        Returns
        -------
        self
        """
        np.random.seed(seed=self.seed)

        df = pd.DataFrame(data)

        self.setIDandRundir(df)

        # Change the output dimension if the data has too few genes
        if df.shape[1] < self.NN_params['dims'][1]:
            self.NN_params['dims'][1] = df.shape[1]

        # Choose genes to impute
        # Per-gene 99th-percentile expression, highest first.
        genes_sort = df.quantile(.99).sort_values(ascending=False)
        NN_genes = get_target_genes(genes_sort, NN_lim=NN_lim)

        df_to_impute = df[NN_genes]

        n_runs, n_cores = self.getCores(NN_genes)

        # ------------------------# Subnetworks #------------------------#

        # Predictors: target genes whose quantile exceeds predictorLimit.
        predictors = np.intersect1d(
            genes_sort.index[genes_sort > self.predictorLimit], NN_genes)
        print('Using {} genes as potential predictors'.format(len(predictors)))

        # Number of full-size target groups that fit in the gene list.
        n_choose = int(len(NN_genes) / self.NN_params['dims'][1])

        # Random, non-overlapping partition of targets into equal groups.
        subGenelists = np.random.choice(NN_genes,
                                        [n_choose, self.NN_params['dims'][1]],
                                        replace=False).tolist()
        if n_choose < n_runs:
            # Special case: for the last run, the output layer will have less nodes
            selectedGenes = np.reshape(subGenelists, -1)
            subGenelists.append(np.setdiff1d(NN_genes, selectedGenes).tolist())

        # ------------------------# Extracting input genes #------------------------#

        # Distance matrix: 1 - |Pearson correlation|, restricted to the
        # predictor columns.
        corrMatrix = 1 - np.abs(
            pd.DataFrame(np.corrcoef(df_to_impute.T),
                         index=NN_genes,
                         columns=NN_genes)[predictors])

        in_out_genes = get_input_genes(df_to_impute,
                                       self.NN_params['dims'],
                                       distanceMatrix=corrMatrix,
                                       targets=subGenelists,
                                       predictorLimit=self.predictorLimit)

        # ------------------------# Subsets for fitting #------------------------#

        n_cells = df_to_impute.shape[0]

        # NOTE(review): an int cell_subset of exactly 1 takes the first branch
        # (all cells), not "one cell" — presumably intentional; confirm.
        if type(cell_subset) is float or cell_subset == 1:
            n_cells = int(cell_subset * n_cells)

        elif type(cell_subset) is int:
            n_cells = cell_subset

        self.trainCells = df_to_impute.sample(n_cells, replace=False).index

        print(
            'Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread).'
            .format(n_cells, 1. * n_cells / df_to_impute.shape[0], n_cores,
                    self.NN_params['n_cores']))

        # -------------------# Preprocessing (if any) #--------------------#

        df_to_impute = self.norm.fit(df_to_impute).transform(df_to_impute)

        # -------------------# Share matrix between subprocesses #--------------------#
        ''' Create memory chunk and put the matrix in it '''
        idx, cols = self.trainCells, df_to_impute.columns
        trainData = df_to_impute.loc[self.trainCells, :].values
        ''' Parallelize process with shared array '''
        # One child job per sub-network: (in/out genes, params, axis labels,
        # mode flag).
        childJobs = [(in_out, self.NN_params, (idx, cols), 'train')
                     for in_out in in_out_genes]

        output_dicts = self.runOnMultipleCores(n_cores, trainData.flatten(),
                                               childJobs)

        # Rebuild one Net per returned parameter dict.
        self.networks = []
        for dictionnary in output_dicts:
            self.networks.append(Net(**dictionnary))

        return self