Example #1
    def fit(self,
            X,
            targetGenes=None,
            predictorGenes=None,
            dists=None,
            labels=None,
            retrieve_training=False,
            **params):  # Extract features and start training

        if labels is not None:
            data = pd.DataFrame(
                np.reshape(X, list(map(len, labels))),
                index=labels[0],
                columns=labels[1],
            )
        else:
            data = X

        if not retrieve_training:
            self.set_params(NNid="auto", **params)

        if targetGenes is not None:
            if len(targetGenes) < self.dims[1]:
                self._dims[1] = len(targetGenes)

        if predictorGenes is None:
            self.predictorGenes, self.targetGenes = get_input_genes(
                data, self.dims, distanceMatrix=dists, targets=targetGenes)[0]
        else:
            self.predictorGenes, self.targetGenes = predictorGenes, targetGenes

        # filt = (data[self.targetGenes] > 0).sum(axis=1) >= self.dims[1] * cell_thresh

        # n_iter = 0
        # while (filt.astype(int).sum() == 0) and (n_iter<10000):
        #     cell_thresh /= 2
        #     n_iter += 1
        #     filt = (data[self.targetGenes] > 0).sum(axis=1) >= self.dims[1] * cell_thresh

        features, targets = (
            data.loc[:, self.predictorGenes].values,
            data.loc[:, self.targetGenes].values,
        )

        model = self._fit(features,
                          targets,
                          retrieve_training=retrieve_training)

        return model
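A minimal, self-contained sketch of the label-handling step at the top of this fit variant: a flat expression array plus a (row labels, column labels) pair is rebuilt into a DataFrame with np.reshape(X, list(map(len, labels))). Only that reconstruction is reproduced here; the cell and gene names are made up for illustration.

    import numpy as np
    import pandas as pd

    # Hypothetical labels: 3 cells (rows) and 4 genes (columns).
    cell_names = ["cell_1", "cell_2", "cell_3"]
    gene_names = ["geneA", "geneB", "geneC", "geneD"]
    labels = (cell_names, gene_names)

    # Flat expression vector, e.g. as it might arrive through a shared array.
    X = np.arange(12, dtype=float)

    # Same reconstruction as in fit(): reshape to (len(index), len(columns)).
    data = pd.DataFrame(
        np.reshape(X, list(map(len, labels))),
        index=labels[0],
        columns=labels[1],
    )
    print(data.shape)  # (3, 4)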
Example #2
    def fit(
        self,
        X,
        targetGenes=None,
        predictorGenes=None,
        dists=None,
        cell_thresh=0.1,
        labels=None,
        retrieve_training=False,
        **params
    ):  # Extract features and start training

        if labels is not None:
            data = pd.DataFrame(
                np.reshape(X, list(map(len, labels))),
                index=labels[0],
                columns=labels[1],
            )
        else:
            data = X

        if not retrieve_training:
            self.set_params(NNid="auto", **params)

        if targetGenes is not None:
            if len(targetGenes) < self.dims[1]:
                self._dims[1] = len(targetGenes)

        if predictorGenes is None:
            self.predictorGenes, self.targetGenes = get_input_genes(
                data,
                self.dims,
                distanceMatrix=dists,
                targets=targetGenes,
                predictorLimit=None,
            )[0]
        else:
            self.predictorGenes, self.targetGenes = predictorGenes, targetGenes

        filt = (data[self.targetGenes] > 0).sum(axis=1) >= self.dims[1] * cell_thresh

        features, targets = (
            data.loc[filt, self.predictorGenes].values,
            data.loc[filt, self.targetGenes].values,
        )

        self._fit(features, targets, retrieve_training=retrieve_training)
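The cell_thresh filter in this variant keeps only cells in which a minimum fraction of the target genes is detected before training. A runnable sketch of that filter in isolation; the toy matrix and the stand-in for self.dims[1] are assumptions, not values from the library.

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)

    # Toy expression matrix: 6 cells x 5 target genes, with plenty of zeros.
    targetGenes = ["g1", "g2", "g3", "g4", "g5"]
    data = pd.DataFrame(rng.integers(0, 3, size=(6, 5)), columns=targetGenes)

    n_outputs = 5        # stand-in for self.dims[1]
    cell_thresh = 0.5    # keep cells with >= 50% of target genes detected

    # Same criterion as in fit(): count nonzero target genes per cell.
    filt = (data[targetGenes] > 0).sum(axis=1) >= n_outputs * cell_thresh

    features = data.loc[filt, targetGenes].values
    print("Kept {} of {} cells for training".format(filt.sum(), len(data)))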
Example #3
    def fit(
        self,
        X,
        targetGenes=None,
        predictorGenes=None,
        dists=None,
        labels=None,
        retrieve_training=False,
        **params
    ):  # Extract features and start training

        if labels is not None:
            data = pd.DataFrame(
                np.reshape(X, list(map(len, labels))),
                index=labels[0],
                columns=labels[1],
            )
        else:
            data = X

        if not retrieve_training:
            self.set_params(NNid="auto", **params)

        if targetGenes is not None:
            if len(targetGenes) < self.dims[1]:
                self._dims[1] = len(targetGenes)

        if predictorGenes is None:
            self.predictorGenes, self.targetGenes = get_input_genes(
                data,
                self.dims[1],
                nbest=self.dims[0],
                distanceMatrix=dists,
                targets=targetGenes
            )[0]
        else:
            self.predictorGenes, self.targetGenes = predictorGenes, targetGenes
        
        features, targets = (
            data.loc[:, self.predictorGenes].values,
            data.loc[:, self.targetGenes].values,
        )

        model = self._fit(features, targets, retrieve_training=retrieve_training)

        return model
Example #4
    def fit(self,
            data,
            NN_lim="auto",
            cell_subset=None,
            NN_genes=None,
            retrieve_training=False):
        np.random.seed(seed=self.seed)
        targetGeneNames = NN_genes

        inputExpressionMatrixDF = pd.DataFrame(data)
        print("Input dataset is {} genes (columns) and {} cells (rows)".format(
            inputExpressionMatrixDF.shape[1],
            inputExpressionMatrixDF.shape[0]))
        print("First 3 rows and columns:")
        print(inputExpressionMatrixDF.iloc[0:3, 0:3])

        self._setIDandRundir(inputExpressionMatrixDF)

        # Change the output dimension if the data has too few genes
        if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
            self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]

        subnetOutputColumns = self.NN_params["dims"][1]

        # Choose genes to impute
        # geneCounts = inputExpressionMatrixDF.sum().sort_values(ascending=False)
        geneQuantiles = inputExpressionMatrixDF.quantile(.99).sort_values(
            ascending=False)

        if targetGeneNames is None:
            targetGeneNames = _get_target_genes(
                geneQuantiles,
                minExpressionLevel=self._minExpressionLevel,
                maxNumOfGenes=NN_lim)

        df_to_impute = inputExpressionMatrixDF[targetGeneNames]

        numberOfTargetGenes = len(targetGeneNames)
        if (numberOfTargetGenes == 0):
            raise Exception(
                "Unable to compute any target genes. Is your data log transformed? Perhaps try with a lower minExpressionLevel."
            )

        n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)

        # ------------------------# Subnetworks #------------------------#

        n_choose = int(numberOfTargetGenes / subnetOutputColumns)

        subGenelists = np.random.choice(targetGeneNames,
                                        [n_choose, subnetOutputColumns],
                                        replace=False).tolist()

        if n_choose < n_runs:
            # Special case: for the last run, the output layer will have previous targets
            selectedGenes = np.reshape(subGenelists, -1)
            leftOutGenes = np.setdiff1d(targetGeneNames, selectedGenes)

            fill_genes = np.random.choice(targetGeneNames,
                                          subnetOutputColumns -
                                          len(leftOutGenes),
                                          replace=False)

            subGenelists.append(
                np.concatenate([leftOutGenes, fill_genes]).tolist())

        # ------------------------# Extracting input genes #------------------------#

        corrMatrix = 1 - np.abs(
            pd.DataFrame(np.corrcoef(df_to_impute.T),
                         index=targetGeneNames,
                         columns=targetGeneNames))

        if self.inOutGenes is None:

            self.inOutGenes = get_input_genes(
                df_to_impute,
                self.NN_params["dims"],
                distanceMatrix=corrMatrix,
                targets=subGenelists,
                #predictorDropoutLimit=self.predictorDropoutLimit
            )

        # ------------------------# Subsets for fitting #------------------------#

        n_cells = df_to_impute.shape[0]

        if type(cell_subset) is float or cell_subset == 1:
            n_cells = int(cell_subset * n_cells)

        elif type(cell_subset) is int:
            n_cells = cell_subset

        self.trainCells = df_to_impute.sample(n_cells, replace=False).index

        print(
            "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
            .format(
                n_cells,
                1. * n_cells / df_to_impute.shape[0],
                n_cores,
                self.NN_params["n_cores"],
            ))

        if self.trainingParams is None:
            self.trainingParams = [self.NN_params] * len(self.inOutGenes)

        # -------------------# Preprocessing (if any) #--------------------#

        normalizer = Normalizer.fromName(self.norm)

        df_to_impute = normalizer.fit(df_to_impute).transform(df_to_impute)

        # -------------------# Share matrix between subprocesses #--------------------#
        """ Create memory chunk and put the matrix in it """
        idx, cols = self.trainCells, df_to_impute.columns
        trainData = df_to_impute.loc[self.trainCells, :].values
        """ Parallelize process with shared array """
        childJobs = [(in_out, trainingParams, (idx, cols), "train",
                      retrieve_training) for in_out, trainingParams in zip(
                          self.inOutGenes, self.trainingParams)]

        self.trainingParams = self._runOnMultipleCores(n_cores,
                                                       trainData.flatten(),
                                                       childJobs)

        self.networks = []
        for dictionary in self.trainingParams:
            self.networks.append(Net(**dictionary))

        print('---- Hyperparameters summary ----')
        self.networks[0].display_params()

        return self
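The subnetwork block above partitions the target genes into groups of subnetOutputColumns, one group per sub-network, and pads a final group with already-used genes when the division is not exact. A standalone sketch of that partitioning; the gene names, group size, and n_runs value are illustrative assumptions.

    import numpy as np

    np.random.seed(0)

    targetGeneNames = ["gene_{}".format(i) for i in range(23)]
    subnetOutputColumns = 5                                  # output width of each sub-network
    n_choose = len(targetGeneNames) // subnetOutputColumns   # 4 full groups
    n_runs = n_choose + 1                                    # assume one extra run is needed

    # Draw n_choose disjoint groups of subnetOutputColumns genes.
    subGenelists = np.random.choice(
        targetGeneNames, [n_choose, subnetOutputColumns], replace=False
    ).tolist()

    if n_choose < n_runs:
        # Last group: the genes left out so far, padded with previously used
        # genes so the output layer keeps the same width.
        selectedGenes = np.reshape(subGenelists, -1)
        leftOutGenes = np.setdiff1d(targetGeneNames, selectedGenes)
        fill_genes = np.random.choice(
            targetGeneNames, subnetOutputColumns - len(leftOutGenes), replace=False
        )
        subGenelists.append(np.concatenate([leftOutGenes, fill_genes]).tolist())

    print([len(group) for group in subGenelists])  # every group has 5 genes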
Example #5
    def fit(self,
            data,
            NN_lim="auto",
            cell_subset=None,
            NN_genes=None,
            retrieve_training=False):
        np.random.seed(seed=self.seed)
        targetGeneNames = NN_genes

        inputExpressionMatrixDF = pd.DataFrame(data)
        print("Input dataset is {} genes and {} cells".format(
            inputExpressionMatrixDF.shape[1],
            inputExpressionMatrixDF.shape[0]))
        print("First 3 rows and columns:")
        print(inputExpressionMatrixDF.iloc[0:3, 0:3])

        self._setIDandRundir(inputExpressionMatrixDF)

        # Change the output dimension if the data has too few genes
        if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
            self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]

        outputColumns = self.NN_params["dims"][1]

        # Choose genes to impute
        imputeOverThisThreshold = .99
        geneQuantiles = inputExpressionMatrixDF.quantile(
            imputeOverThisThreshold).sort_values(ascending=False)

        if targetGeneNames is None:
            targetGeneNames = _get_target_genes(
                geneQuantiles,
                minExpressionLevel=self._minExpressionLevel,
                maxNumOfGenes=NN_lim)

        df_to_impute = inputExpressionMatrixDF[targetGeneNames]

        numberOfTargetGenes = len(targetGeneNames)
        n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)

        # ------------------------# Subnetworks #------------------------#

        predictorGeneNames = np.intersect1d(
            geneQuantiles.index[geneQuantiles > self.predictorLimit],
            targetGeneNames)

        n_choose = int(numberOfTargetGenes / outputColumns)

        subGenelists = np.random.choice(targetGeneNames,
                                        [n_choose, outputColumns],
                                        replace=False).tolist()
        if n_choose < n_runs:
            # Special case: for the last run, the output layer will have less nodes
            selectedGenes = np.reshape(subGenelists, -1)
            subGenelists.append(
                np.setdiff1d(targetGeneNames, selectedGenes).tolist())

        # ------------------------# Extracting input genes #------------------------#

        corrMatrix = 1 - np.abs(
            pd.DataFrame(np.corrcoef(df_to_impute.T),
                         index=targetGeneNames,
                         columns=targetGeneNames)[predictorGeneNames])

        if self.inOutGenes is None:

            self.inOutGenes = get_input_genes(
                df_to_impute,
                self.NN_params["dims"],
                distanceMatrix=corrMatrix,
                targets=subGenelists,
                predictorLimit=self.predictorLimit,
            )

        # ------------------------# Subsets for fitting #------------------------#

        n_cells = df_to_impute.shape[0]

        if type(cell_subset) is float or cell_subset == 1:
            n_cells = int(cell_subset * n_cells)

        elif type(cell_subset) is int:
            n_cells = cell_subset

        self.trainCells = df_to_impute.sample(n_cells, replace=False).index

        print(
            "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
            .format(
                n_cells,
                1. * n_cells / df_to_impute.shape[0],
                n_cores,
                self.NN_params["n_cores"],
            ))

        if self.trainingParams is None:
            self.trainingParams = [self.NN_params] * len(self.inOutGenes)

        # -------------------# Preprocessing (if any) #--------------------#

        df_to_impute = self.norm.fit(df_to_impute).transform(df_to_impute)

        # -------------------# Share matrix between subprocesses #--------------------#
        """ Create memory chunk and put the matrix in it """
        idx, cols = self.trainCells, df_to_impute.columns
        trainData = df_to_impute.loc[self.trainCells, :].values
        """ Parallelize process with shared array """
        childJobs = [(in_out, trainingParams, (idx, cols), "train",
                      retrieve_training) for in_out, trainingParams in zip(
                          self.inOutGenes, self.trainingParams)]

        self.trainingParams = self._runOnMultipleCores(n_cores,
                                                       trainData.flatten(),
                                                       childJobs)

        self.networks = []
        for dictionary in self.trainingParams:
            self.networks.append(Net(**dictionary))

        return self
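In this variant the predictor genes for each sub-network are selected with a correlation-based distance, 1 - |Pearson correlation|, so that strongly (positively or negatively) correlated genes end up close together. A minimal sketch of just that distance matrix on a toy expression frame; get_input_genes itself is not reproduced.

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    genes = ["g1", "g2", "g3", "g4"]

    # Toy expression matrix: 50 cells x 4 genes.
    df_to_impute = pd.DataFrame(rng.poisson(2, size=(50, 4)), columns=genes)

    # np.corrcoef expects variables in rows, hence the transpose.
    corrMatrix = 1 - np.abs(
        pd.DataFrame(np.corrcoef(df_to_impute.T), index=genes, columns=genes)
    )

    # Distance 0 on the diagonal (a gene is perfectly correlated with itself),
    # values near 1 for uncorrelated genes.
    print(corrMatrix.round(2))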
Example #6
    def test_get_input_genes(self):
        data = pd.DataFrame(np.identity(10))
        res = util.get_input_genes(data, [2, 5], targets=[[0, 1]])
Example #7
    def test_get_input_genes(self):
        data = pd.DataFrame(np.identity(10))
        _ = util.get_input_genes(data, 2, nbest=5, targets=[[0, 1]])
Example #8
    def fit(self, data, NN_lim='auto', cell_subset=None):
        np.random.seed(seed=self.seed)

        df = pd.DataFrame(data)

        self.setIDandRundir(df)

        # Change the output dimension if the data has too few genes
        if df.shape[1] < self.NN_params['dims'][1]:
            self.NN_params['dims'][1] = df.shape[1]

        # Choose genes to impute
        genes_sort = df.quantile(.99).sort_values(ascending=False)
        NN_genes = get_target_genes(genes_sort, NN_lim=NN_lim)

        df_to_impute = df[NN_genes]

        n_runs, n_cores = self.getCores(NN_genes)

        # ------------------------# Subnetworks #------------------------#

        predictors = np.intersect1d(
            genes_sort.index[genes_sort > self.predictorLimit], NN_genes)
        print('Using {} genes as potential predictors'.format(len(predictors)))

        n_choose = int(len(NN_genes) / self.NN_params['dims'][1])

        subGenelists = np.random.choice(NN_genes,
                                        [n_choose, self.NN_params['dims'][1]],
                                        replace=False).tolist()
        if n_choose < n_runs:
            # Special case: for the last run, the output layer will have less nodes
            selectedGenes = np.reshape(subGenelists, -1)
            subGenelists.append(np.setdiff1d(NN_genes, selectedGenes).tolist())

        # ------------------------# Extracting input genes #------------------------#

        corrMatrix = 1 - np.abs(
            pd.DataFrame(np.corrcoef(df_to_impute.T),
                         index=NN_genes,
                         columns=NN_genes)[predictors])

        in_out_genes = get_input_genes(df_to_impute,
                                       self.NN_params['dims'],
                                       distanceMatrix=corrMatrix,
                                       targets=subGenelists,
                                       predictorLimit=self.predictorLimit)

        # ------------------------# Subsets for fitting #------------------------#

        n_cells = df_to_impute.shape[0]

        if type(cell_subset) is float or cell_subset == 1:
            n_cells = int(cell_subset * n_cells)

        elif type(cell_subset) is int:
            n_cells = cell_subset

        self.trainCells = df_to_impute.sample(n_cells, replace=False).index

        print(
            'Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread).'
            .format(n_cells, 1. * n_cells / df_to_impute.shape[0], n_cores,
                    self.NN_params['n_cores']))

        # -------------------# Preprocessing (if any) #--------------------#

        df_to_impute = self.norm.fit(df_to_impute).transform(df_to_impute)

        # -------------------# Share matrix between subprocesses #--------------------#
        ''' Create memory chunk and put the matrix in it '''
        idx, cols = self.trainCells, df_to_impute.columns
        trainData = df_to_impute.loc[self.trainCells, :].values
        ''' Parallelize process with shared array '''
        childJobs = [(in_out, self.NN_params, (idx, cols), 'train')
                     for in_out in in_out_genes]

        output_dicts = self.runOnMultipleCores(n_cores, trainData.flatten(),
                                               childJobs)

        self.networks = []
        for dictionary in output_dicts:
            self.networks.append(Net(**dictionary))

        return self
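The cell_subset argument in these fit variants is read either as a fraction of cells (float) or as an absolute cell count (int), and defaults to using all cells. A small sketch of that dispatch plus the sampling of training cells; pick_training_cells is a hypothetical helper and the DataFrame is illustrative.

    import numpy as np
    import pandas as pd

    df_to_impute = pd.DataFrame(np.random.rand(100, 10))

    def pick_training_cells(df, cell_subset=None, seed=0):
        # Mirrors the cell_subset handling used in fit().
        n_cells = df.shape[0]
        if type(cell_subset) is float or cell_subset == 1:
            n_cells = int(cell_subset * n_cells)  # fraction of the cells
        elif type(cell_subset) is int:
            n_cells = cell_subset                 # absolute number of cells
        return df.sample(n_cells, replace=False, random_state=seed).index

    print(len(pick_training_cells(df_to_impute)))        # 100 (all cells)
    print(len(pick_training_cells(df_to_impute, 0.25)))  # 25
    print(len(pick_training_cells(df_to_impute, 30)))    # 30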