Ejemplo n.º 1
0
    def __init__(self,
                 source_data,
                 target_data,
                 n_factors,
                 n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=100,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Store the settings and build the scalers and helper objects.

        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data used as source, e.g. cell line or PDX transcriptome read outs.

        target_data: np.ndarray (n_samples, n_genes)
            Data used as target, e.g. tumor transcriptome read outs.

        n_factors: int
            Number of factors, forwarded to PVComputation.

        n_pv: int
            Number of principal vectors, forwarded to PVComputation.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data.

        n_representations: int, optional, default to 100
            Number of interpolated features, forwarded to IntermediateFactors.

        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in
            domain-adaptation.

        mean_center : bool, optional, default to False
            Passed as with_mean to each of the three StandardScaler objects.

        std_unit : bool, optional, default to False
            Passed as with_std to each of the three StandardScaler objects.
        """
        # Plain settings, kept exactly as received.
        self.n_factors, self.n_pv = n_factors, n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data

        # Domain data.
        self.source_data, self.target_data = source_data, target_data

        # One independent scaler per dataset, all configured the same way.
        scaler_options = {'with_mean': mean_center, 'with_std': std_unit}
        self.standard_scaler_input_ = StandardScaler(**scaler_options)
        self.standard_scaler_source_ = StandardScaler(**scaler_options)
        self.standard_scaler_target_ = StandardScaler(**scaler_options)

        # Helper computing the principal vectors.
        self.pv_computation = PVComputation(
            n_factors=n_factors,
            n_pv=n_pv,
            dim_reduction=dim_reduction,
            dim_reduction_target=dim_reduction_target,
        )

        # Helper interpolating between source and target principal vectors.
        self.intermediate_factors = IntermediateFactors(
            n_representations=n_representations)
Ejemplo n.º 2
0
    def __init__(self, n_representations=100, method='consensus',
                mean_center=True,
                std_unit=False,
                n_factors=70,
                n_pv=40,
                dim_reduction='pca',
                dim_reduction_target=None,
                l1_ratio=0,
                source_data=None,
                target_data=None,
                n_jobs=1):
        """
        Store the settings, build the helper objects and set the
        cross-validation defaults.

        Parameters
        -------
        n_representations : int, default to 100
            Number of representations between source and target principal
            vectors for interpolation. 0 means source only, -1 means target
            only.

        method : str, default to 'consensus'
            Scheme used for the domain adaptation step, i.e. 'consensus',
            'elasticnet', or 'gfk'.

        mean_center : bool, default to True
            Whether the different datasets used in the implementation should
            be mean centered.

        std_unit : bool, default to False
            Whether the different datasets used in the implementation should
            be standardized (feature-level variance to 1).

        n_factors : int, default to 70
            Number of domain-specific factors to compute, e.g. PCs.

        n_pv : int, default to 40
            Number of principal vectors to compute from the domain-specific
            factors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.
            If None, set to dim_reduction.

        l1_ratio : float, default to 0
            l1 ratio for elasticnet model (0 is Ridge, 1 is Lasso).

        source_data : np.ndarray, default to None
            Source data to use in domain adaptation phase.

        target_data : np.ndarray, default to None
            Target data to use in domain adaptation phase.

        n_jobs : int, default to 1
            Number of jobs used in parallelisation.
        """

        # Domain-adaptation scheme and preprocessing switches.
        self.method = method
        self.mean_center, self.std_unit = mean_center, std_unit
        self.n_representations = n_representations

        # Factor / principal-vector settings.
        self.n_factors, self.n_pv = n_factors, n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target

        # Regression and parallelisation settings.
        self.l1_ratio = l1_ratio
        self.n_jobs = n_jobs

        # Optional data used during the domain adaptation phase.
        self.source_data, self.target_data = source_data, target_data

        # Helper objects (arguments are passed positionally on purpose).
        self.pv_computation = PVComputation(
            n_factors, n_pv, dim_reduction, dim_reduction_target)
        self.intermediate_factors = IntermediateFactors(n_representations)

        # No predictor exists until one is trained.
        self.predictor = None

        # Default values for CV: 34 log-spaced alphas from 1e-6 to 1e10,
        # 10 folds, verbosity level 1.
        self.alpha_values = np.logspace(-6, 10, 34)
        self.cv_fold = 10
        self.verbose = 1
Ejemplo n.º 3
0
class ConsensusRepresentation(BaseEstimator):
    """Consensus representation computation.

    Principal vectors are computed between the source and target data and
    intermediate features are sampled between them. For each principal vector,
    the intermediate feature on which projected source and target data have
    the smallest two-sample Kolmogorov-Smirnov statistic is retained.

    Attributes
    -------
    flow: numpy.ndarray
        Intermediate features returned by IntermediateFactors.sample_flow
        (set in fit).

    consensus_representation: numpy.ndarray, shape (n_genes, n_pv)
        Retained features, one column per principal vector (set in fit).
    """
    def __init__(self,
                 source_data,
                 target_data,
                 n_factors,
                 n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=1000,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data used as source, e.g. cell line or PDX transcriptome read outs.

        target_data: np.ndarray (n_samples, n_genes)
            Data used as target, e.g. tumor transcriptome read outs.

        n_factors: int
            Number of factors, forwarded to PVComputation.

        n_pv: int
            Number of principal vectors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data.

        n_representations: int, optional, default to 1000
            Number of interpolated features.

        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in
            domain-adaptation.

        mean_center : bool, optional, default to False
            Whether features (i.e. genes) should be mean-centered; passed as
            with_mean to the input, source and target scalers.

        std_unit : bool, optional, default to False
            Whether features (i.e. genes) should be standardized; passed as
            with_std to the input, source and target scalers.
        """
        self.source_data = source_data
        self.target_data = target_data

        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data

        # Stored so that every constructor argument is available as an
        # attribute of the same name, which BaseEstimator.get_params()
        # (and therefore cloning) relies on.
        self.mean_center = mean_center
        self.std_unit = std_unit

        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)

    def _find_common_representation(self):
        """
        Select, for each principal vector, the intermediate feature that makes
        projected source and target data most similar (smallest KS statistic).

        Returned Values
        -------
        consensus_representation: numpy.ndarray, shape (n_genes, n_pv)
            Also stored in self.consensus_representation.
        """
        # Swap the two leading axes so the first axis indexes principal
        # vectors and the second one the interpolated features.
        flow_vectors = self.flow.transpose(1, 0, 2)
        selected_features = []

        for i in range(self.n_pv):
            # One row per interpolated feature, one column per sample.
            source_projected = flow_vectors[i].dot(
                self.source_data.transpose())
            target_projected = flow_vectors[i].dot(
                self.target_data.transpose())

            # KS statistic (first element of the ks_2samp result) between
            # source and target projections, for each interpolated feature.
            ks_stats = [
                ks_2samp(s, t)[0]
                for (s, t) in zip(source_projected, target_projected)
            ]

            selected_features.append(flow_vectors[i, np.argmin(ks_stats)])

        self.consensus_representation = np.array(
            selected_features).transpose()

        return self.consensus_representation

    def fit(self, X, y=None):
        """
        Computes the principal vectors, interpolates between them, projects source and target
        data, and finally computes, by comparing for each pair source and target projected data,
        the point where these two quantities are comparable (using KS statistics).

        Note that self.source_data and self.target_data are replaced by their
        standardized versions.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data to consider

        y: numpy.ndarray, shape(n_samples, 1), optional, default to None
            Response data, forwarded to the principal vector computation.

        Returned Values
        -------
        self: returns an instance of self.
        """

        # Add X to source data if use_data set to True
        if self.use_data:
            if self.source_data is None or self.source_data.shape[0] == 0:
                self.source_data = X
            else:
                self.source_data = np.concatenate([self.source_data, X])

        # Standardize data
        self.standard_scaler_input_.fit(X)
        self.source_data = self.standard_scaler_source_.fit_transform(
            self.source_data)
        self.target_data = self.standard_scaler_target_.fit_transform(
            self.target_data)

        # Compute principal vectors
        self.pv_computation.fit(self.source_data, self.target_data, y)

        # Compute intermediate features
        self.flow = self.intermediate_factors.sample_flow(
            self.pv_computation.source_components_,
            self.pv_computation.target_components_)

        # Compute the consensus representation between each PV
        self._find_common_representation()

        return self

    def transform(self, X, y=None):
        """
        Project data on the consensus representation.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data used for prediction.

        y: ignored, present for API compliance.

        Return values
        -------
        X_projected: numpy.ndarray, shape (n_samples, n_pv)
            Genomics data projected on the consensus representation.
        """
        # NOTE(review): the input scaler is re-fitted on X here
        # (fit_transform), so X is standardized with its own statistics and
        # not with those computed in fit -- confirm this is intended.
        return self.standard_scaler_input_.fit_transform(X).dot(
            self.consensus_representation)
Ejemplo n.º 4
0
class FlowProjector(BaseEstimator):
    """Project on the geodesic.

    Given source and target data, computes the domain-specific factors, aligns them
    to get the principal vectors and finally interpolates between source PVs and
    target PVs. Data can then be projected on all these intermediate features.

    Attributes
    -------
    flow: numpy.ndarray
        Intermediate features, concatenated and transposed so that data can be
        right-multiplied by it (set in fit).
    """
    def __init__(self,
                 source_data,
                 target_data,
                 n_factors,
                 n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=100,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data used as source, e.g. cell line or PDX transcriptome read outs.

        target_data: np.ndarray (n_samples, n_genes)
            Data used as target, e.g. tumor transcriptome read outs.

        n_factors: int
            Number of factors, forwarded to PVComputation.

        n_pv: int
            Number of principal vectors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data.

        n_representations: int, optional, default to 100
            Number of interpolated features.

        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in
            domain-adaptation.

        mean_center : bool, optional, default to False
            Whether features (i.e. genes) should be mean-centered; passed as
            with_mean to the input, source and target scalers.

        std_unit : bool, optional, default to False
            Whether features (i.e. genes) should be standardized; passed as
            with_std to the input, source and target scalers.
        """
        self.source_data = source_data
        self.target_data = target_data

        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data

        # Stored so that every constructor argument is available as an
        # attribute of the same name, which BaseEstimator.get_params()
        # (and therefore cloning) relies on.
        self.mean_center = mean_center
        self.std_unit = std_unit

        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)

    def fit(self, X, y=None):
        """
        Computes the intermediate features between the pairs of principal vectors.

        Note that self.source_data and self.target_data are replaced by their
        standardized versions.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data to consider

        y: numpy.ndarray, shape(n_samples, 1), optional, default to None
            Response data, forwarded to the principal vector computation.

        Returned Values
        -------
        self: returns an instance of self.
        """

        # Add X to source data if use_data set to True
        if self.use_data:
            if self.source_data is None or self.source_data.shape[0] == 0:
                self.source_data = X
            else:
                self.source_data = np.concatenate([self.source_data, X])

        # Standardize data
        self.standard_scaler_input_.fit(X)
        self.source_data = self.standard_scaler_source_.fit_transform(
            self.source_data)
        self.target_data = self.standard_scaler_target_.fit_transform(
            self.target_data)

        # Compute principal vectors
        self.pv_computation.fit(self.source_data, self.target_data, y)

        # Compute intermediate factors.
        self.flow = self.intermediate_factors.sample_flow(
            self.pv_computation.source_components_,
            self.pv_computation.target_components_)

        # Concatenate feature representations before projection: the blocks
        # along the first axis are stacked, then the result is transposed so
        # that features end up as columns.
        self.flow = np.concatenate(self.flow).transpose()

        return self

    def transform(self, X, y=None):
        """
        Project data along the geodesic path.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data used for prediction.

        y: ignored, present for API compliance.

        Returned values
        -------
        X_projected: numpy.ndarray, shape (n_samples, n_flow_features)
            Genomics data projected along the flow; n_flow_features is the
            number of columns of self.flow.
        """
        # NOTE(review): the input scaler is re-fitted on X here
        # (fit_transform), so X is standardized with its own statistics and
        # not with those computed in fit -- confirm this is intended.
        return self.standard_scaler_input_.fit_transform(X).dot(self.flow)
Ejemplo n.º 5
0
class GeodesicMatrixComputer(BaseEstimator):
    """ Geodesic Flow Kernel computation.

    Compute the geodesic flow kernel matrix. We use the equivalent definition
    derived in [1] to make it faster. Principal vectors are therefore first
    computed to project onto them.

    Attributes
    -------
    training_data: numpy.ndarray
        Data given to fit, after standardization (set in fit).

    G_: numpy.ndarray
        Matrix returned by IntermediateFactors.compute_geodesic_matrix
        (set in fit).

    projector_: numpy.ndarray
        Transposed source and target principal vectors placed side by side
        (set in fit).
    """
    def __init__(self,
                 source_data,
                 target_data,
                 n_factors,
                 n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=1000,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data used as source, e.g. cell line or PDX transcriptome read outs.

        target_data: np.ndarray (n_samples, n_genes)
            Data used as target, e.g. tumor transcriptome read outs.

        n_factors: int
            Number of factors, forwarded to PVComputation.

        n_pv: int
            Number of principal vectors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data.

        n_representations: int, optional, default to 1000
            Number of interpolated features.

        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in
            domain-adaptation.

        mean_center : bool, optional, default to False
            Whether features (i.e. genes) should be mean-centered; passed as
            with_mean to the input, source and target scalers.

        std_unit : bool, optional, default to False
            Whether features (i.e. genes) should be standardized; passed as
            with_std to the input, source and target scalers.
        """
        self.source_data = source_data
        self.target_data = target_data

        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data

        # Stored so that every constructor argument is available as an
        # attribute of the same name, which BaseEstimator.get_params()
        # (and therefore cloning) relies on.
        self.mean_center = mean_center
        self.std_unit = std_unit

        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation_ = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)

    def fit(self, X, y=None):
        """
        Computes the geodesic flow kernel matrix used in kernel ridge.

        Note that self.source_data and self.target_data are replaced by their
        standardized versions.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data to consider

        y: numpy.ndarray, shape(n_samples, 1), optional, default to None
            Response data, forwarded to the principal vector computation.

        Returned Values
        -------
        self: returns an instance of self.
        """

        # Add X to source data if use_data set to True
        if self.use_data:
            if self.source_data is None or self.source_data.shape[0] == 0:
                self.source_data = X
            else:
                self.source_data = np.concatenate([self.source_data, X])

        # Standardize data
        self.standard_scaler_input_.fit(X)
        self.source_data = self.standard_scaler_source_.fit_transform(
            self.source_data)
        self.target_data = self.standard_scaler_target_.fit_transform(
            self.target_data)
        # Keep the standardized training samples: transform computes the
        # kernel between new data and these.
        self.training_data = self.standard_scaler_input_.transform(X)

        # Compute principal vectors
        self.pv_computation_.fit(self.source_data, self.target_data, y)

        # Compute G, kernel matrix
        self.G_ = self.intermediate_factors.compute_geodesic_matrix(
            self.pv_computation_.source_components_,
            self.pv_computation_.target_components_)

        # Compute projector: transposed source and target principal vectors
        # placed side by side.
        self.projector_ = np.block([
            self.pv_computation_.source_components_.transpose(),
            self.pv_computation_.target_components_.transpose()
        ])

        return self

    def _compute_kernel_matrix(self, X1, X2=None):
        """
        Compute the kernel matrix (X1 P) G (X2 P)^T, with P the projector and
        G the geodesic matrix computed in fit.

        Parameters
        -------
        X1: numpy.ndarray, shape (n_samples_1, n_genes)
            First dataset.

        X2: numpy.ndarray, shape (n_samples_2, n_genes), optional
            Second dataset. If None, X1 is used on both sides.

        Returned values
        -------
        kernel: numpy.ndarray, shape (n_samples_1, n_samples_2)
        """
        X1_projected = X1.dot(self.projector_)
        if X2 is None:
            X2_projected = X1_projected
        else:
            X2_projected = X2.dot(self.projector_)

        return X1_projected.dot(self.G_).dot(X2_projected.transpose())

    def transform(self, X, y=None):
        """
        Compute the domain-invariant kernel matrix

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data used for prediction.

        y: ignored, present for API compliance.

        Returned values
        -------
        X_projected: numpy.ndarray, shape (n_samples, n_training_samples)
            Kernel matrix between X and the data fed in the fit method.
        """
        # NOTE(review): the input scaler is re-fitted on X here
        # (fit_transform), so X is standardized with its own statistics and
        # not with those used for self.training_data -- confirm this is
        # intended.
        return self._compute_kernel_matrix(
            self.standard_scaler_input_.fit_transform(X), self.training_data)
Ejemplo n.º 6
0
    def fit(self, source_data, target_data, y_source=None):
        """
        Compute the consensus representation between two set of data.

        IMPORTANT: Same genes have to be given for source and target, and in same order

        Parameters
        -------
        source_data : np.ndarray, shape (n_samples, n_genes)
            Source dataset

        target_data : np.ndarray, shape (n_samples, n_genes)
            Target dataset

        y_source : optional, default to None
            Passed as second argument to the fit method of both
            dimensionality reduction objects.

        Return values
        -------
        consensus_components_ : np.ndarray, shape (n_genes, n_pv)
            Consensus features, one column per principal vector. Also stored
            in self.consensus_components_.
        """
        # NOTE(review): self.dim_reduction_source and
        # self.dim_reduction_target must be objects exposing
        # fit(X, y).components_; where they are set is not visible here.

        # Low-rank representation (fitted on the data given to this method;
        # the basis of each domain is then orthonormalized).
        Ps = self.dim_reduction_source.fit(source_data, y_source).components_
        self.source_components_ = scipy.linalg.orth(Ps.transpose()).transpose()

        Pt = self.dim_reduction_target.fit(target_data, y_source).components_
        self.target_components_ = scipy.linalg.orth(Pt.transpose()).transpose()

        # Compute intermediate factors
        self.intermediate_factors_ = IntermediateFactors(self.n_representations)\
                                    .sample_flow(self.source_components_, self.target_components_)
        # Swap the two leading axes so the first axis is the one indexed by
        # principal vector in the loop below.
        self.intermediate_factors_ = self.intermediate_factors_.transpose(
            1, 0, 2)

        # Normalize for total variance: each dataset is rescaled so that the
        # square root of its summed per-gene variance equals
        # self.total_variance.
        target_total_variance = np.sqrt(np.sum(np.var(target_data, 0)))
        normalized_target_data = target_data / target_total_variance
        normalized_target_data *= self.total_variance

        source_total_variance = np.sqrt(np.sum(np.var(source_data, 0)))
        normalized_source_data = source_data / source_total_variance
        normalized_source_data *= self.total_variance

        # Compute consensus representation
        self.consensus_components_ = []

        for i in range(self.n_pv):
            candidates = self.intermediate_factors_[i]
            source_projected = candidates.dot(
                normalized_source_data.transpose())
            target_projected = candidates.dot(
                normalized_target_data.transpose())

            # KS statistic between source and target projections for each
            # candidate feature; the candidate with the smallest one is kept.
            ks_stats = [
                ks_2samp(s, t)[0]
                for (s, t) in zip(source_projected, target_projected)
            ]

            self.consensus_components_.append(
                candidates[np.argmin(ks_stats)])

        self.consensus_components_ = np.array(
            self.consensus_components_).transpose()

        return self.consensus_components_
Ejemplo n.º 7
0
class ConsensusRepresentation:
    """Consensus representation between a source and a target dataset.

    For each principal vector, the intermediate feature on which projected
    source and target data have the smallest two-sample Kolmogorov-Smirnov
    statistic is retained.
    """
    def __init__(self,
                 n_factors,
                 n_pv,
                 n_representations=100,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 total_variance=10**3,
                 n_jobs=1):
        """
        Parameters
        -------
        n_factors: int
            Number of domain-specific factors.

        n_pv: int
            Number of principal vectors.

        n_representations: int, optional, default to 100
            Number of interpolated features between source and target principal vectors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data.

        total_variance: float, default to 10^3
            Scale applied in fit: each dataset is divided by the square root
            of its summed per-gene variance and multiplied by this value.

        n_jobs: int (optional, default to 1)
            Number of jobs for computation.
        """
        self.n_factors = n_factors
        self.n_pv = n_pv
        self.n_representations = n_representations
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.total_variance = total_variance

        self.source_data = None
        self.target_data = None
        self.source_components_ = None
        self.target_components_ = None

        self.intermediate_factors_ = None

        self.consensus_components_ = None

        # Honour the constructor argument (it was previously forced to 1).
        self.n_jobs = n_jobs

    def fit(self, source_data, target_data, y_source=None):
        """
        Compute the consensus representation between two set of data.

        IMPORTANT: Same genes have to be given for source and target, and in same order

        Parameters
        -------
        source_data : np.ndarray, shape (n_samples, n_genes)
            Source dataset

        target_data : np.ndarray, shape (n_samples, n_genes)
            Target dataset

        y_source : optional, default to None
            Passed as second argument to the fit method of both
            dimensionality reduction objects.

        Return values
        -------
        consensus_components_ : np.ndarray, shape (n_genes, n_pv)
            Consensus features, one column per principal vector. Also stored
            in self.consensus_components_.
        """
        # NOTE(review): __init__ does not assign self.dim_reduction_source
        # and stores the raw dim_reduction_target argument. Both attributes
        # have to be objects exposing fit(X, y).components_ before fit is
        # called -- confirm how they are meant to be built from
        # dim_reduction / dim_reduction_target.

        # Low-rank representation (fitted on the data given to this method;
        # the basis of each domain is then orthonormalized).
        Ps = self.dim_reduction_source.fit(source_data, y_source).components_
        self.source_components_ = scipy.linalg.orth(Ps.transpose()).transpose()

        Pt = self.dim_reduction_target.fit(target_data, y_source).components_
        self.target_components_ = scipy.linalg.orth(Pt.transpose()).transpose()

        # Compute intermediate factors
        self.intermediate_factors_ = IntermediateFactors(self.n_representations)\
                                    .sample_flow(self.source_components_, self.target_components_)
        # Swap the two leading axes so the first axis is the one indexed by
        # principal vector in the loop below.
        self.intermediate_factors_ = self.intermediate_factors_.transpose(
            1, 0, 2)

        # Normalize for total variance: each dataset is rescaled so that the
        # square root of its summed per-gene variance equals
        # self.total_variance.
        target_total_variance = np.sqrt(np.sum(np.var(target_data, 0)))
        normalized_target_data = target_data / target_total_variance
        normalized_target_data *= self.total_variance

        source_total_variance = np.sqrt(np.sum(np.var(source_data, 0)))
        normalized_source_data = source_data / source_total_variance
        normalized_source_data *= self.total_variance

        # Compute consensus representation
        self.consensus_components_ = []

        for i in range(self.n_pv):
            candidates = self.intermediate_factors_[i]
            source_projected = candidates.dot(
                normalized_source_data.transpose())
            target_projected = candidates.dot(
                normalized_target_data.transpose())

            # KS statistic between source and target projections for each
            # candidate feature; the candidate with the smallest one is kept.
            ks_stats = [
                ks_2samp(s, t)[0]
                for (s, t) in zip(source_projected, target_projected)
            ]

            self.consensus_components_.append(
                candidates[np.argmin(ks_stats)])

        self.consensus_components_ = np.array(
            self.consensus_components_).transpose()

        return self.consensus_components_