def __init__(self, kernel, kernel_params=None, n_components=None, n_pv=None, n_jobs=1):
    """
    Parameters
    ----------
    kernel : str, default to 'linear'
        Name of the kernel to be used in the algorithm. Has to be compliant with
        <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics">
        scikit-learn kernel</a>, e.g., "rbf", "polynomial", "laplacian", "linear", ...
    kernel_params : dict, default to None
        Parameters of the kernel (degree for polynomial kernel, gamma for RBF).
        Naming has to be compliant with scikit-learn, e.g., {"gamma": 0.0005}.
    n_components : int or dict, default to None
        Number of components for kernel PCA. <br/>
        If int, then indicates the same number of components for source and target. <br/>
        If dict, then must be of the form {'source':int, 'target':int}.
    n_pv : int, default to None
        Number of principal vectors.
    n_jobs : int, default to 1
        Number of concurrent threads to use for tasks that can be parallelized.
    """
    # Results of fitting; stay None until fit() has been called.
    self.gamma_coef = None
    self.alpha_coef = None
    self.beta_coef = None
    self.canonical_angles = None

    self.kernel = kernel
    # BUG FIX: the default used to be a mutable `{}` shared by every instance
    # created with the default; use None and build a fresh dict instead.
    self.kernel_params_ = kernel_params or {}
    self.kernel_values_ = KernelComputer(self.kernel, self.kernel_params_, n_jobs)

    # Put n_components in dictionary format {'source': ..., 'target': ...}.
    self.n_components = n_components
    if isinstance(self.n_components, int):
        self.n_components = {s: self.n_components for s in ['source', 'target']}

    self.n_pv = n_pv
    self.n_jobs = n_jobs
def uncentered_rbf_kernel_computer(self, source_data, target_data):
    """Build an RBF KernelComputer and fit it on the given data without centering.

    Returns the result of KernelComputer.fit (conventionally the fitted
    computer itself).
    """
    computer = KernelComputer('rbf')
    return computer.fit(source_data, target_data, center=False)
def rbf_kernel_computer(self):
    """Return a fresh, unfitted KernelComputer configured with the RBF kernel."""
    computer = KernelComputer('rbf')
    return computer
def uncentered_linear_kernel_computer(self, source_data, target_data):
    """Build a linear KernelComputer and fit it on the given data without centering.

    Returns the result of KernelComputer.fit (conventionally the fitted
    computer itself).
    """
    computer = KernelComputer('linear')
    return computer.fit(source_data, target_data, center=False)
def linear_kernel_computer(self):
    """Return a fresh, unfitted KernelComputer configured with the linear kernel."""
    computer = KernelComputer('linear')
    return computer
def linear_kernel_matrix(source_data, target_data):
    """Fit a linear KernelComputer on source/target data (no centering) and return it."""
    computer = KernelComputer('linear')
    computer.fit(source_data, target_data, center=False)
    return computer
def rbf_kernel_matrix(source_data, target_data, rbf_params=None):
    """Fit an RBF KernelComputer on source/target data (no centering) and return it.

    Parameters
    ----------
    source_data, target_data : array-like
        Data passed through to KernelComputer.fit.
    rbf_params : dict, default to None
        RBF kernel parameters (e.g. {"gamma": 0.0005}), forwarded to KernelComputer.
        BUG FIX: the original body read a name `rbf_params` that is neither a
        parameter nor defined in the visible module, raising NameError at call
        time; it is now an explicit keyword parameter. (If a module-level
        `rbf_params` global existed elsewhere, confirm the default of "no
        parameters" matches its value.)
    """
    k = KernelComputer('rbf', rbf_params or {})
    k.fit(source_data, target_data, center=False)
    return k
class PVComputation:
    """
    PVComputation handles the dimensionality reduction and alignment of learned manifold.
    <br/><br/>
    This class contains all the following tasks and sub-routines:
    <ul>
        <li> Kernel PCA decomposition on source and target independently.
        <li> Kernel principal components comparison.
        <li> Computation of Principal Vectors (PVs).
    </ul>
    """

    def __init__(self, kernel, kernel_params=None, n_components=None, n_pv=None, n_jobs=1):
        """
        Parameters
        ----------
        kernel : str, default to 'linear'
            Name of the kernel to be used in the algorithm. Has to be compliant with
            <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics">
            scikit-learn kernel</a>, e.g., "rbf", "polynomial", "laplacian", "linear", ...
        kernel_params : dict, default to None
            Parameters of the kernel (degree for polynomial kernel, gamma for RBF).
            Naming has to be compliant with scikit-learn, e.g., {"gamma": 0.0005}.
        n_components : int or dict, default to None
            Number of components for kernel PCA. <br/>
            If int, then indicates the same number of components for source and target. <br/>
            If dict, then must be of the form {'source':int, 'target':int}.
        n_pv : int, default to None
            Number of principal vectors.
        n_jobs : int, default to 1
            Number of concurrent threads to use for tasks that can be parallelized.
        """
        # Results of fitting; stay None until fit() has been called.
        self.gamma_coef = None
        self.alpha_coef = None
        self.beta_coef = None
        self.canonical_angles = None

        self.kernel = kernel
        # BUG FIX: the default used to be a mutable `{}` shared across
        # instances; use None and build a fresh dict instead.
        self.kernel_params_ = kernel_params or {}
        self.kernel_values_ = KernelComputer(self.kernel, self.kernel_params_, n_jobs)

        # Callable form of the kernel, required by _project_PV_from_data for
        # out-of-sample projection.
        # BUG FIX: self.kernel_ was read in _project_PV_from_data but never
        # assigned anywhere in this class, producing an AttributeError.
        # Resolve it through scikit-learn's kernel registry; non-standard
        # kernels (e.g. 'mallow') have no registered callable and keep None.
        from sklearn.metrics.pairwise import kernel_metrics
        self.kernel_ = kernel_metrics().get(self.kernel)

        # Put n_components in dictionary format {'source': ..., 'target': ...}.
        self.n_components = n_components
        if isinstance(self.n_components, int):
            self.n_components = {s: self.n_components for s in ['source', 'target']}

        self.n_pv = n_pv
        self.n_jobs = n_jobs

    def fit(self, source_data, target_data, method='two-stage', n_components=None, n_pv=None):
        """
        Computes the kernel principal vectors between source and target data.

        Parameters
        ----------
        source_data: numpy.ndarray, shape (n_samples, n_genes)
            Source data
        target_data: numpy.ndarray, shape (n_samples, n_genes)
            Target data
        method: str, default to "two-stage"
            Method used for computing the kernel PVs, either "two-stage" (first kernel
            PCA, then alignment), or "direct" (direct minimization). <br/>
            <b>NOT IMPLEMENTED:</b> The one-shot ("direct") computation of the PVs has
            not been implemented.
        n_components: int, default to None
            Number of components taken into the decomposition.
        n_pv: int, default to None
            Number of Principal Vectors. If not set here or in __init__, then the
            maximum number of PVs will be computed.

        Returns
        -------
        self : PVComputation
            Fitted instance.
        """
        # Compute mean-centered kernel matrices within/between source and target.
        self.kernel_values_.fit(source_data, target_data, center=True)

        if method == 'two-stage':
            self._two_stage_computation(n_components, n_pv)
        elif method == 'direct':
            self._direct_computation(n_components)
        else:
            # Previously an unknown method was silently ignored, leaving the
            # instance unfitted; fail loudly instead.
            raise ValueError('Unknown method: %s' % method)

        return self

    def transform(self, X, right_center=False):
        """
        Project data X on source and target kernel principal vectors.

        Parameters
        ----------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Data to project.
        right_center: bool, default to False
            Whether data should be implicitly mean centered.

        Returns
        -------
        dict
            Dictionary with 'source' and 'target' as keys, and projected arrays as
            values.
        """
        X_projected = {}
        for t in ['source', 'target']:
            X_projected[t] = self._project_PV_from_data(X, t, right_center)
        return X_projected

    def fit_transform(self, source_data, target_data, method='two-stage', n_components=None, n_pv=None):
        """
        Computes the kernel principal vectors between source and target data, then
        projects both datasets on them.

        Parameters
        ----------
        source_data: numpy.ndarray, shape (n_samples, n_genes)
            Source data
        target_data: numpy.ndarray, shape (n_samples, n_genes)
            Target data
        method: str, default to "two-stage"
            Method used for computing the kernel PVs, either "two-stage" or "direct".
            <br/> <b>NOT IMPLEMENTED:</b> the "direct" computation of the PVs has not
            been implemented.
        n_components: int or dict, default to None
            Number of kernel PCA components; int, or {'source': int, 'target': int}.
        n_pv: int, default to None
            Number of Principal Vectors. If not set here or in __init__, then the
            maximum number of PVs will be computed.

        Returns
        -------
        source_projected: dict
            Source data projected on source ('source') and target ('target') PVs.
        target_projected: dict
            Target data projected on source ('source') and target ('target') PVs.
        """
        # BUG FIX: n_pv was accepted but not forwarded to fit(), so a value
        # passed to fit_transform was silently ignored.
        self.fit(source_data, target_data, method, n_components, n_pv)

        source_projected = {
            'source': self._project_PV_from_data(source_data, 'source'),
            'target': self._project_PV_from_data(source_data, 'target')
        }
        target_projected = {
            'source': self._project_PV_from_data(target_data, 'source'),
            'target': self._project_PV_from_data(target_data, 'target')
        }
        return source_projected, target_projected

    def _two_stage_computation(self, n_components=None, n_pv=None):
        # Resolve n_components into the {'source': ..., 'target': ...} format.
        self.n_components = n_components or self.n_components
        if self.n_components is None or isinstance(self.n_components, int):
            self.n_components = {s: self.n_components for s in ['source', 'target']}
        # NOTE(review): if n_components was never supplied, the dict values are
        # None and min() below raises TypeError — in practice n_components (or
        # n_pv) must be provided. TODO: confirm intended default behavior.
        self.n_pv = n_pv or (self.n_pv or min(self.n_components.values()))

        # First step: kernel PCA on source and target independently.
        self._dim_reduction()

        # Second step: align principal components based on cosine similarity.
        self._align_principal_components()

    def _dim_reduction(self):
        # One kernel PCA per domain; stores scaled dual coefficients alpha such
        # that projections are obtained as K . alpha.
        self.dim_reduc_clf_ = {}
        self.alpha_coef = {}

        # Independent processing of source and target.
        for t in ['source', 'target']:
            # 'mallow' kernels are precomputed by KernelComputer; standard
            # scikit-learn kernels are evaluated internally by KernelPCA.
            self.dim_reduc_clf_[t] = KernelPCA(
                self.n_components[t],
                kernel='precomputed' if self.kernel == 'mallow' else self.kernel,
                n_jobs=self.n_jobs,
                **self.kernel_params_)

            if self.kernel == 'mallow':
                self.dim_reduc_clf_[t].fit(self.kernel_values_.kernel_submatrices[t])
            else:
                self.dim_reduc_clf_[t].fit(self.kernel_values_.data[t])

            # Normalize dual coefficients by sqrt(eigenvalue) so that principal
            # components have unit norm in feature space.
            self.alpha_coef[t] = self.dim_reduc_clf_[t].alphas_ / np.sqrt(
                self.dim_reduc_clf_[t].lambdas_)

    def _align_principal_components(self):
        # Cosine similarity between source and target kernel principal components.
        self.cosine_similarity_ = self.alpha_coef['source'].T.dot(
            self.kernel_values_.k_st).dot(self.alpha_coef['target'])

        beta_s, theta, beta_t = np.linalg.svd(self.cosine_similarity_)
        self.beta_coef = {}
        self.beta_coef['source'] = beta_s
        # numpy's svd returns V^T, hence the transpose.
        # (FIX: the original comment wrongly credited "matplotlib".)
        self.beta_coef['target'] = beta_t.T

        # gamma combines kernel-PCA coefficients (alpha) with the alignment
        # rotation (beta), truncated to the requested number of PVs.
        self.gamma_coef = {}
        for t in ['source', 'target']:
            self.gamma_coef[t] = self.beta_coef[t].T.dot(self.alpha_coef[t].T)
            self.gamma_coef[t] = self.gamma_coef[t][:self.n_pv]

        # Canonical angles between the source and target subspaces.
        self.canonical_angles = np.arccos(theta[:self.n_pv])

    def _direct_computation(self, n_components=None):
        raise NotImplementedError(
            'Direct computation of PVs has not been implemented.')

    def _project_PV_from_data(self, X, t, right_center=False):
        """
        Project data X on the kernel principal vectors of type t.

        Parameters
        ----------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Data to project.
        t: str
            Type, either 'source' or 'target'.
        right_center: bool, default to False
            Whether data should be implicitly mean centered.

        Returns
        -------
        numpy.ndarray, shape (n_samples, n_pv)
            X projected on the PVs of type t.
        """
        if self.kernel_ is None:
            # Non-standard kernels have no registered callable (see __init__).
            raise NotImplementedError(
                'Out-of-sample projection is not supported for kernel %s' % self.kernel)
        K = self.kernel_(self.kernel_values_.data[t], X, **self.kernel_params_)
        K = _left_center_kernel(K)
        if right_center:
            K = _right_center_kernel(K)
        return self._project_PV_from_kernel(K, t)

    def _project_PV_from_kernel(self, K, t):
        """
        Project kernel matrix K on the kernel principal vectors of type t.

        Parameters
        ----------
        K: numpy.ndarray, shape (n_t_samples, n_samples)
            Kernel matrix between data of type t (rows, same order as given to the
            algorithm) and the new dataset (columns).
        t: str
            Type, either 'source' or 'target'.

        Returns
        -------
        numpy.ndarray, shape (n_samples, n_pv)
            Projection on the PVs of type t.
        """
        return self.gamma_coef[t].dot(K).T
def __init__(self, kernel='linear', kernel_params=None, n_components=None, n_pv=None, method='two-stage', step=100, n_jobs=1, verbose=False):
    """
    Set up a TRANSACT instance; no computation happens until fit() is called.

    Parameters
    ----------
    kernel : str, default to 'linear'
        Name of the kernel to be used in the algorithm. Has to be compliant with
        <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics">
        scikit-learn kernel</a>, e.g., "rbf", "polynomial", "laplacian", "linear", ...
    kernel_params : dict, default to None
        Parameters of the kernel (degree for polynomial kernel, gamma for RBF).
        Naming has to be compliant with scikit-learn, e.g., {"gamma": 0.0005}.
    n_components : int or dict, default to None
        Number of components for kernel PCA. <br/>
        If int, then indicates the same number of components for source and target. <br/>
        If dict, then must be of the form {'source':int, 'target':int}.
    n_pv : int, default to None
        Number of principal vectors.
    method : str, default to 'two-stage'
        Method used for computing the principal vectors. Only 'two-stage' has been
        implemented.
    step: int, default to 100
        Number of interpolation steps.
    n_jobs: int, default to 1
        Number of concurrent threads to use for tasks that can be parallelized.
    verbose: bool or int, default to False
        Degree of verbosity in joblib routines.
    """
    # Kernel configuration.
    self.kernel = kernel
    self.kernel_params_ = kernel_params if kernel_params else {}
    self.kernel_values_ = KernelComputer(self.kernel, self.kernel_params_, n_jobs)

    # Training data and model state, populated later by fit()/fit_predictor().
    self.source_data_ = None
    self.target_data_ = None
    self.is_fitted = False
    self.predictive_clf = None

    # Dimensionality-reduction / alignment settings.
    self.n_components = n_components
    self.n_pv = n_pv
    self.method = method
    self.step = step

    # Parallelism and logging.
    self.n_jobs = n_jobs
    self.verbose = verbose
class TRANSACT:
    """
    TRANSACT is a package designed to adapt predictors of drug response from pre-clinical models to the clinic.
    <br/><br/>
    This class contains all the tasks and sub-routines required for training the domain adaptation framework, i.e.:
    <ul>
        <li> Kernel PCA decomposition on source and target independently.
        <li> Kernel principal components comparison.
        <li> Computation of Principal Vectors (PVs).
        <li> Interpolation between source and target PVs and extraction of Consensus Features (CFs).
        <li> Out-of-sample extension: project new dataset onto the consensus features.
    </ul>
    """

    def __init__(self, kernel='linear', kernel_params=None, n_components=None, n_pv=None, method='two-stage', step=100, n_jobs=1, verbose=False):
        """
        Parameters
        ----------
        kernel : str, default to 'linear'
            Name of the kernel to be used in the algorithm. Has to be compliant with
            <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics">
            scikit-learn kernel</a>, e.g., "rbf", "polynomial", "laplacian", "linear", ...
        kernel_params : dict, default to None
            Parameters of the kernel (degree for polynomial kernel, gamma for RBF).
            Naming has to be compliant with scikit-learn, e.g., {"gamma": 0.0005}.
        n_components : int or dict, default to None
            Number of components for kernel PCA. <br/>
            If int, then indicates the same number of components for source and target. <br/>
            If dict, then must be of the form {'source':int, 'target':int}.
        n_pv : int, default to None
            Number of principal vectors.
        method : str, default to 'two-stage'
            Method used for computing the principal vectors. Only 'two-stage' has been implemented.
        step: int, default to 100
            Number of interpolation steps.
        n_jobs: int, default to 1
            Number of concurrent threads to use for tasks that can be parallelized.
        verbose: bool or int, default to False
            Degree of verbosity in joblib routines.
        """
        self.kernel = kernel
        # Fresh dict when None is given, so instances never share parameters.
        self.kernel_params_ = kernel_params or {}
        self.kernel_values_ = KernelComputer(self.kernel, self.kernel_params_, n_jobs)
        # Training data; populated by fit().
        self.source_data_ = None
        self.target_data_ = None
        # Only set to True by fit() when interpolation is also fitted.
        self.is_fitted = False
        self.n_components = n_components
        self.n_pv = n_pv
        self.method = method
        self.step = step
        # Drug-response model; populated by fit_predictor().
        self.predictive_clf = None
        self.n_jobs = n_jobs
        self.verbose = verbose

    def fit(self, source_data, target_data, n_components=None, n_pv=None, method='two-stage', step=100, with_interpolation=True, left_center=True):
        """
        Compute the Consensus Features (CFs) onto which predictive models can be trained. <br/>
        Specifically:
        <ul>
            <li> Compute the kernel matrices.
            <li> Compute the cosine similarity matrix.
            <li> Compute principal vectors.
            <li> Interpolate between the PVs.
            <li> Find optimal interpolation time.
        </ul>

        Parameters
        ----------
        source_data : np.ndarray, dtype=float
            Source data, matrix with samples in the rows, i.e. shape (n_source_samples, n_features). <br./>
            pandas.DataFrame are supported.
        target_data : np.ndarray, dtype=float
            Target data, matrix with samples in the rows, i.e. shape (n_target_samples, n_features). <br./>
            pandas.DataFrame are supported.
            <br/><b>WARNING</b>: features need to be ordered in the same way as in source_data.
        n_components: int, default to None
            Number of components. If not set here or in __init__, then use the maximum number
            of principal components possible for source and target.
        n_pv: int, default to None
            Number of Principal Vectors. If not set here or in __init__, then maximum number
            of PV will be computed.
        method : str, default to 'two-stage'
            Method used for computing the principal vectors. Only 'two-stage' has been implemented.
        step: int, default to 100
            Number of interpolation steps.
        with_interpolation: bool, default to True
            Bool indicating whether interpolation shall also be fitted. Useful for just
            computing PV prior to null distribution fitting (and choose of PV number).
        left_center: bool, default to True
            Bool indicating whether the output should be mean-centered, i.e. whether source
            and target consensus features values (or PVs if no interpolation) must have an
            independent mean-centering.

        Returns
        -------
        self : TRANSACT
            Fitted instance.
        """
        # Save parameters.
        # NOTE(review): `x or self.x` keeps the __init__ value only when the
        # argument is falsy; since `method` and `step` have truthy defaults
        # here, they always override the __init__ settings — confirm intended.
        self.source_data_ = source_data
        self.target_data_ = target_data
        self.method = method or self.method
        self.n_components = n_components or self.n_components
        self.n_pv = n_pv or self.n_pv
        self.step = step or self.step
        self.left_center = left_center

        # Compute (uncentered) kernel values, kept for the interpolation step.
        # PVComputation below computes its own centered kernels independently.
        self.kernel_values_.fit(source_data, target_data, center=False)

        # Compute principal vectors.
        self.principal_vectors_ = PVComputation(self.kernel, self.kernel_params_, n_jobs=self.n_jobs)
        self.principal_vectors_.fit(self.source_data_, self.target_data_, method=self.method, n_components=self.n_components, n_pv=self.n_pv)

        # Stop here if interpolation should not be computed.
        # NOTE(review): is_fitted stays False on this path — verify callers
        # of transform()/predict() never rely on it after a PV-only fit.
        if not with_interpolation:
            return self

        # Set up interpolation scheme between source and target PVs.
        self.interpolation_ = Interpolation(self.kernel, self.kernel_params_, self.n_jobs)
        self.interpolation_.fit(self.principal_vectors_, self.kernel_values_)

        # Compute optimal interpolation time (per PV) via KS statistics.
        self._compute_optimal_time(step=self.step, left_center=self.left_center)
        self.is_fitted = True

        return self

    def null_distribution_pv_similarity(self, method='gene_shuffling', n_iter=100):
        """
        Generate a null distribution for the PV similarity function:
        <ul>
            <li> Gene shuffling: genes get shuffled in source to destroy any structure
            existing at the gene-level while preserving the sample structure. PV get
            recomputed and similarity is saved.
        </ul>

        Parameters
        ----------
        method : string, default to gene_shuffling
            Method used for generating the null distribution. Only method developped:
            gene_shuffling
        n_iter: int, default to 100
            Number of iterations

        Returns
        -------
        np.ndarray, dtype=float, shape (n_iter, n_pv)
            Array containing the distribution of similarity after shuffling.
            Each row contains the values of one shuffling across PVs.
        """
        if method.lower() == 'gene_shuffling':
            null_method = self._gene_shuffling
        else:
            raise NotImplementedError('%s is not a proper method for generating null distribution' % (method))

        # One independent shuffling + PV recomputation per joblib task.
        null_distribution = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(null_method)() for _ in range(n_iter))

        return np.array(null_distribution)

    def _gene_shuffling(self):
        # Shuffle gene columns of the source data, recompute PVs, and return
        # the PV similarities (cosines of the canonical angles).
        # NOTE(review): column indexing `[:, perm]` assumes source_data_ is a
        # numpy array — a pandas.DataFrame would need .iloc; confirm callers.
        perm = np.random.permutation(self.source_data_.shape[1])
        pv = PVComputation(self.kernel, self.kernel_params_)
        pv.fit(self.source_data_[:, perm], self.target_data_, method=self.method, n_components=self.n_components, n_pv=self.n_pv)
        return np.cos(pv.canonical_angles)

    def fit_predictor(self, X, y, alpha_values=None, l1_ratio=0.5):
        """
        Project X on consensus features and train a predictor of drug response.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features), dtype=float
            Dataset to project. Features should be ordered in same way as in source_data
            and target_data.
        y : np.ndarray of shape (n_samples, 1), dtype=float
            Output to predict
        alpha_values : np.ndarray, default to None
            Grid of ElasticNet regularization strengths; when None, a log-spaced
            grid between 1e-10 and 1e5 is used.
        l1_ratio : float, default to 0.5
            NOTE(review): currently unused — the grid search sweeps a fixed list
            of l1_ratio values instead. TODO: confirm whether this parameter
            should constrain the grid.

        Returns
        -------
        self : TRANSACT
            Instance with fitted `predictive_clf` (a GridSearchCV over an
            ElasticNet pipeline).
        """
        self.alpha_values = alpha_values if alpha_values is not None else np.logspace(-10, 5, 34)
        self.l1_ratio_values = [0., .1, .2, .4, .5, .6, .8, .9, 1.]
        param_grid = {
            'regression__alpha': self.alpha_values,
            'regression__l1_ratio': self.l1_ratio_values
        }

        # Grid search setup: 10-fold CV over ElasticNet hyper-parameters,
        # scored by negative mean squared error.
        self.predictive_clf = GridSearchCV(Pipeline([
            ('regression', ElasticNet())
        ]),
            cv=10, n_jobs=self.n_jobs, param_grid=param_grid, verbose=self.verbose, scoring='neg_mean_squared_error')

        # Train on the consensus-feature projection of X (no extra centering).
        self.predictive_clf.fit(self.transform(X, center=False), y)

        return self

    def compute_pred_performance(self, X, y, cv=10):
        """
        Compute predictive performance of predictive model by cross-validation on X and y.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features), dtype=float
            Dataset to project. Features should be ordered in same way as in source_data
            and target_data.
        y : np.ndarray of shape (n_samples,), dtype=float
            Output to predict.
        cv : int, default to 10
            Number of cross-validation folds.

        Returns
        -------
        tuple
            Pearson correlation (coefficient, p-value) between the
            cross-validated predictions and y.
        """
        kf = KFold(n_splits=cv, shuffle=True)
        X_projected = self.transform(X)

        # If no predictor was trained yet, train one on the fly (best-effort).
        if self.predictive_clf is None:
            print('BEWARE: NOT FITTED INSTANCE')
            self.fit_predictor(X, y)
        clf = clone(self.predictive_clf)

        # Out-of-fold predictions: each sample is predicted by a model that
        # never saw it during training.
        y_predicted = np.zeros(X.shape[0])
        for train_index, test_index in kf.split(X_projected):
            clf.fit(X_projected[train_index], y[train_index])
            y_predicted[test_index] = clf.predict(X_projected[test_index])

        return scipy.stats.pearsonr(y_predicted, y)

    def predict(self, X):
        """
        Predict the drug response of a set of samples, i.e.:
        <ul>
            <li> Project data on consensus features.
            <li> Use the Elastic Net model to predict based on the consensus features.
        </ul>

        Parameters
        ----------
        X : np.ndarray, dtype=float
            Dataset to project, of shape (n_samples, n_features). Features should be
            ordered in same way as in source_data and target_data.

        Returns
        -------
        np.ndarray of shape (n_samples, 1), dtype=float
            Predicted drug response values.
        """
        return self.predictive_clf.predict(self.transform(X, center=False))

    def transform(self, X, center=False):
        """
        Project a dataset X onto the consensus features.

        Parameters
        ----------
        X : np.ndarray, dtype=float
            Dataset to project, of shape (n_samples, n_features). Features should be
            ordered in same way as in source_data and target_data.
        center : bool, default to False
            Whether the projection should be mean-centered (forwarded to the
            interpolation scheme).

        Returns
        -------
        np.ndarray of shape (n_samples, n_pv), dtype=float
            Dataset projected on consensus features.
        """
        return self.interpolation_.transform(X, self.optimal_time, center=center)

    def _compute_optimal_time(self, step=100, left_center=True):
        # Based on Kolmogorov-Smirnov statistics, find the interpolation time
        # (per PV) where source and target projections overlap the most.

        # Compute the interpolated values at step+1 times in [0, 1].
        interpolated_values = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(self.interpolation_.project_data)(s/step, center=left_center) for s in range(step+1))
        # Reorder axes to (PV, time, sample).
        interpolated_values = np.array(interpolated_values).transpose(2, 0, 1)
        # Samples are concatenated source-first, so split on the source size.
        source_interpolated_values = interpolated_values[:, :, :self.source_data_.shape[0]]
        target_interpolated_values = interpolated_values[:, :, self.source_data_.shape[0]:]

        self.optimal_time = []
        self.ks_statistics = []
        self.ks_p_values = []
        # For each PV, find the time when interpolation has the largest overlap.
        for source_pv, target_pv in zip(source_interpolated_values, target_interpolated_values):
            self.ks_statistics.append([])
            for s, t in zip(source_pv, target_pv):
                # ks_2samp returns a (statistic, pvalue) result per time step.
                self.ks_statistics[-1].append(scipy.stats.ks_2samp(s, t))
            # Transpose the list of (statistic, pvalue) pairs into
            # ([statistics...], [pvalues...]); keep statistics, save p-values.
            self.ks_statistics[-1] = list(zip(*self.ks_statistics[-1]))
            self.ks_p_values.append(self.ks_statistics[-1][-1])
            self.ks_statistics[-1] = self.ks_statistics[-1][0]
            # Smallest KS statistic = most overlapping distributions.
            self.optimal_time.append(np.argmin(self.ks_statistics[-1]) / step)

        # Save the different statistics.
        self.optimal_time = np.array(self.optimal_time)  # Optimal tau for each PV.
        self.ks_statistics = np.array(self.ks_statistics)  # Computed KS statistics between each PV.
        self.ks_p_values = np.array(self.ks_p_values)  # Corresponding p_values.