import numpy as np
import scipy.stats
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Package-local modules; the import paths below are assumed from the
# TRANSACT repository layout.
from transact.pv_computation import PVComputation
from transact.kernel_computer import KernelComputer
from transact.interpolation import Interpolation

# Module-level constants referenced by the helpers below. Their original
# values were not shown in this file; the ones here are illustrative
# placeholders.
n_pc = 20
n_pv = 10
rbf_params = {'gamma': 1e-3}
def kernel_pv_offset_centered(source_offset_data, target_offset_data):
    """
    Create the same classifier as kernel_pv_offset, but center the data first.
    The idea is to assess the influence of kernel centering in the linear
    setting, where both approaches must be completely equivalent.
    """
    clf_offset_centered = PVComputation('linear', n_components=n_pc, n_pv=n_pv)
    x_source = StandardScaler(with_mean=True, with_std=False).fit_transform(source_offset_data)
    x_target = StandardScaler(with_mean=True, with_std=False).fit_transform(target_offset_data)
    return clf_offset_centered.fit(x_source, x_target, 'two-stage')
def linear_principal_vectors(source_data, target_data):
    return PVComputation('linear', n_components=n_pc, n_pv=n_pv).fit(source_data, target_data)
def rbf_principal_vectors(source_data, target_data):
    return PVComputation('rbf', rbf_params, n_components=n_pc, n_pv=n_pv).fit(source_data, target_data)
def kernel_pv_offset(source_offset_data, target_offset_data):
    clf_offset = PVComputation('linear', n_components=n_pc, n_pv=n_pv)
    return clf_offset.fit(source_offset_data, target_offset_data, 'two-stage')
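
# Illustrative check (not part of the original module): for a linear kernel,
# mean-centering the data beforehand should be equivalent to internal kernel
# centering, so the cosines of the canonical angles from the two fits should
# coincide. PVComputation's `canonical_angles` attribute is assumed from its
# use in TRANSACT._gene_shuffling below; the n_pc and n_pv defaults here are
# arbitrary.
def check_linear_centering_equivalence(source_offset_data, target_offset_data,
                                       n_pc=20, n_pv=10):
    raw_pv = PVComputation('linear', n_components=n_pc, n_pv=n_pv)
    raw_pv.fit(source_offset_data, target_offset_data, 'two-stage')

    centered_pv = PVComputation('linear', n_components=n_pc, n_pv=n_pv)
    x_source = StandardScaler(with_mean=True, with_std=False).fit_transform(source_offset_data)
    x_target = StandardScaler(with_mean=True, with_std=False).fit_transform(target_offset_data)
    centered_pv.fit(x_source, x_target, 'two-stage')

    # Equivalence should hold up to numerical tolerance.
    return np.allclose(np.cos(raw_pv.canonical_angles),
                       np.cos(centered_pv.canonical_angles), atol=1e-6)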
class TRANSACT:
    """
    TRANSACT is a package designed to adapt predictors of drug response from
    pre-clinical models to the clinic. <br/><br/>
    This class contains all the tasks and sub-routines required for training
    the domain adaptation framework, i.e.:
    <ul>
        <li> Kernel PCA decomposition on source and target independently.
        <li> Kernel principal components comparison.
        <li> Computation of Principal Vectors (PVs).
        <li> Interpolation between source and target PVs and extraction of Consensus Features (CFs).
        <li> Out-of-sample extension: project new datasets onto the consensus features.
    </ul>
    """

    def __init__(self, kernel='linear', kernel_params=None, n_components=None,
                 n_pv=None, method='two-stage', step=100, n_jobs=1, verbose=False):
        """
        Parameters
        ----------
        kernel : str, defaults to 'linear'
            Name of the kernel to be used in the algorithm. Has to be compliant with
            <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics">
            scikit-learn kernels</a>, e.g., "rbf", "polynomial", "laplacian", "linear", ...

        kernel_params : dict, defaults to None
            Parameters of the kernel (degree for polynomial kernel, gamma for RBF).
            Naming has to be compliant with scikit-learn, e.g., {"gamma": 0.0005}.

        n_components : int or dict, defaults to None
            Number of components for kernel PCA. <br/>
            If int, indicates the same number of components for source and target. <br/>
            If dict, must be of the form {'source': int, 'target': int}.

        n_pv : int, defaults to None
            Number of principal vectors.

        method : str, defaults to 'two-stage'
            Method used for computing the principal vectors. Only 'two-stage'
            has been implemented.

        step : int, defaults to 100
            Number of interpolation steps.

        n_jobs : int, defaults to 1
            Number of concurrent threads to use for tasks that can be parallelized.

        verbose : bool or int, defaults to False
            Degree of verbosity in joblib routines.
        """
        self.kernel = kernel
        self.kernel_params_ = kernel_params or {}
        self.kernel_values_ = KernelComputer(self.kernel, self.kernel_params_, n_jobs)

        self.source_data_ = None
        self.target_data_ = None
        self.is_fitted = False

        self.n_components = n_components
        self.n_pv = n_pv
        self.method = method
        self.step = step
        self.predictive_clf = None
        self.n_jobs = n_jobs
        self.verbose = verbose
    def fit(self, source_data, target_data, n_components=None, n_pv=None,
            method='two-stage', step=100, with_interpolation=True, left_center=True):
        """
        Compute the Consensus Features (CFs) onto which predictive models can be trained. <br/>
        Specifically:
        <ul>
            <li> Compute the kernel matrices.
            <li> Compute the cosine similarity matrix.
            <li> Compute the principal vectors.
            <li> Interpolate between the PVs.
            <li> Find the optimal interpolation time.
        </ul>

        Parameters
        ----------
        source_data : np.ndarray, dtype=float
            Source data, matrix with samples in the rows, i.e. shape (n_source_samples, n_features). <br/>
            pandas.DataFrames are supported.

        target_data : np.ndarray, dtype=float
            Target data, matrix with samples in the rows, i.e. shape (n_target_samples, n_features). <br/>
            pandas.DataFrames are supported.
            <br/><b>WARNING</b>: features need to be ordered in the same way as in source_data.

        n_components : int, defaults to None
            Number of components. If not set here or in __init__, the maximum possible
            number of principal components is used for source and target.

        n_pv : int, defaults to None
            Number of Principal Vectors. If not set here or in __init__, the maximum
            number of PVs is computed.

        method : str, defaults to 'two-stage'
            Method used for computing the principal vectors. Only 'two-stage' has been implemented.

        step : int, defaults to 100
            Number of interpolation steps.

        with_interpolation : bool, defaults to True
            Whether the interpolation shall also be fitted. Useful for computing only
            the PVs, e.g., prior to fitting a null distribution and choosing the
            number of PVs.

        left_center : bool, defaults to True
            Whether the output should be mean-centered, i.e. whether source and target
            consensus feature values (or PVs if no interpolation) must be independently
            mean-centered.

        Returns
        -------
        self : TRANSACT
            Fitted instance.
        """
        # Save parameters
        self.source_data_ = source_data
        self.target_data_ = target_data
        self.method = method or self.method
        self.n_components = n_components or self.n_components
        self.n_pv = n_pv or self.n_pv
        self.step = step or self.step
        self.left_center = left_center

        # Compute kernel values
        self.kernel_values_.fit(source_data, target_data, center=False)

        # Compute principal vectors
        self.principal_vectors_ = PVComputation(self.kernel, self.kernel_params_, n_jobs=self.n_jobs)
        self.principal_vectors_.fit(self.source_data_,
                                    self.target_data_,
                                    method=self.method,
                                    n_components=self.n_components,
                                    n_pv=self.n_pv)

        # Stop here if the interpolation should not be computed.
        if not with_interpolation:
            return self

        # Set up the interpolation scheme
        self.interpolation_ = Interpolation(self.kernel, self.kernel_params_, self.n_jobs)
        self.interpolation_.fit(self.principal_vectors_, self.kernel_values_)

        # Compute the optimal interpolation time
        self._compute_optimal_time(step=self.step, left_center=self.left_center)

        self.is_fitted = True
        return self

    def null_distribution_pv_similarity(self, method='gene_shuffling', n_iter=100):
        """
        Generate a null distribution for the PV similarity function:
        <ul>
            <li> Gene shuffling: genes are shuffled in the source data to destroy any
            structure existing at the gene level while preserving the sample structure.
            The PVs are recomputed and their similarities saved.
        </ul>

        Parameters
        ----------
        method : str, defaults to 'gene_shuffling'
            Method used for generating the null distribution. The only method
            implemented is gene_shuffling.

        n_iter : int, defaults to 100
            Number of iterations.

        Returns
        -------
        np.ndarray, dtype=float, shape (n_iter, n_pv)
            Array containing the distribution of similarity after shuffling.
            Each row contains the values of one shuffling across PVs.
        """
        if method.lower() == 'gene_shuffling':
            null_method = self._gene_shuffling
        else:
            raise NotImplementedError(
                '%s is not a proper method for generating a null distribution' % method)

        null_distribution = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(null_method)() for _ in range(n_iter))

        return np.array(null_distribution)

    def _gene_shuffling(self):
        # Shuffle the source features, recompute the PVs and return their
        # similarities (cosines of the canonical angles).
        perm = np.random.permutation(self.source_data_.shape[1])
        pv = PVComputation(self.kernel, self.kernel_params_)
        pv.fit(self.source_data_[:, perm],
               self.target_data_,
               method=self.method,
               n_components=self.n_components,
               n_pv=self.n_pv)
        return np.cos(pv.canonical_angles)
    def fit_predictor(self, X, y, alpha_values=None, l1_ratio=0.5):
        """
        Project X onto the consensus features and train a predictor of drug response.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features), dtype=float
            Dataset to project. Features should be ordered in the same way as in
            source_data and target_data.

        y : np.ndarray of shape (n_samples, 1), dtype=float
            Output to predict.

        alpha_values : np.ndarray, defaults to None
            Values of the regularization parameter alpha explored in the grid
            search. If None, np.logspace(-10, 5, 34) is used.

        l1_ratio : float, defaults to 0.5
            Currently unused: the grid search sweeps a fixed grid of l1_ratio values.

        Returns
        -------
        self : TRANSACT
            Instance with a fitted predictive model.
        """
        self.alpha_values = alpha_values if alpha_values is not None else np.logspace(-10, 5, 34)
        self.l1_ratio_values = [0., .1, .2, .4, .5, .6, .8, .9, 1.]
        param_grid = {
            'regression__alpha': self.alpha_values,
            'regression__l1_ratio': self.l1_ratio_values
        }

        # Grid-search setup
        self.predictive_clf = GridSearchCV(Pipeline([('regression', ElasticNet())]),
                                           cv=10,
                                           n_jobs=self.n_jobs,
                                           param_grid=param_grid,
                                           verbose=self.verbose,
                                           scoring='neg_mean_squared_error')
        self.predictive_clf.fit(self.transform(X, center=False), y)
        return self

    def compute_pred_performance(self, X, y, cv=10):
        """
        Compute the predictive performance of the predictive model by
        cross-validation on X and y.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features), dtype=float
            Dataset to project. Features should be ordered in the same way as in
            source_data and target_data.

        y : np.ndarray of shape (n_samples, 1), dtype=float
            Output to predict.

        cv : int, defaults to 10
            Number of cross-validation folds.

        Returns
        -------
        tuple of float
            Pearson correlation between the cross-validated predictions and y,
            along with the associated p-value.
        """
        kf = KFold(n_splits=cv, shuffle=True)
        X_projected = self.transform(X)

        if self.predictive_clf is None:
            print('BEWARE: NOT FITTED INSTANCE')
            self.fit_predictor(X, y)
        clf = clone(self.predictive_clf)

        y_predicted = np.zeros(X.shape[0])
        for train_index, test_index in kf.split(X_projected):
            clf.fit(X_projected[train_index], y[train_index])
            y_predicted[test_index] = clf.predict(X_projected[test_index])

        return scipy.stats.pearsonr(y_predicted, y)

    def predict(self, X):
        """
        Predict the drug response of a set of samples, i.e.:
        <ul>
            <li> Project the data onto the consensus features.
            <li> Use the Elastic Net model to predict based on the consensus features.
        </ul>

        Parameters
        ----------
        X : np.ndarray, dtype=float
            Dataset to project, of shape (n_samples, n_features). Features should be
            ordered in the same way as in source_data and target_data.

        Returns
        -------
        np.ndarray of shape (n_samples, 1), dtype=float
            Predicted drug response values.
        """
        return self.predictive_clf.predict(self.transform(X, center=False))

    def transform(self, X, center=False):
        """
        Project a dataset X onto the consensus features.

        Parameters
        ----------
        X : np.ndarray, dtype=float
            Dataset to project, of shape (n_samples, n_features). Features should be
            ordered in the same way as in source_data and target_data.

        Returns
        -------
        np.ndarray of shape (n_samples, n_pv), dtype=float
            Dataset projected onto the consensus features.
        """
        return self.interpolation_.transform(X, self.optimal_time, center=center)
    def _compute_optimal_time(self, step=100, left_center=True):
        # Find the interpolation time based on the Kolmogorov-Smirnov statistic.

        # Compute the interpolated values at each interpolation time tau = s / step.
        interpolated_values = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(self.interpolation_.project_data)(s / step, center=left_center)
            for s in range(step + 1))
        interpolated_values = np.array(interpolated_values).transpose(2, 0, 1)
        source_interpolated_values = interpolated_values[:, :, :self.source_data_.shape[0]]
        target_interpolated_values = interpolated_values[:, :, self.source_data_.shape[0]:]

        self.optimal_time = []
        self.ks_statistics = []
        self.ks_p_values = []

        # For each PV, find the time when the interpolated source and target
        # distributions have the largest overlap (smallest KS statistic).
        for source_pv, target_pv in zip(source_interpolated_values, target_interpolated_values):
            self.ks_statistics.append([])
            for s, t in zip(source_pv, target_pv):
                self.ks_statistics[-1].append(scipy.stats.ks_2samp(s, t))
            self.ks_statistics[-1] = list(zip(*self.ks_statistics[-1]))
            self.ks_p_values.append(self.ks_statistics[-1][-1])
            self.ks_statistics[-1] = self.ks_statistics[-1][0]
            self.optimal_time.append(np.argmin(self.ks_statistics[-1]) / step)

        # Save the different statistics
        self.optimal_time = np.array(self.optimal_time)    # Optimal tau for each PV.
        self.ks_statistics = np.array(self.ks_statistics)  # KS statistics per PV and step.
        self.ks_p_values = np.array(self.ks_p_values)      # Corresponding p-values.
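
# Illustrative sketch (not part of the original module): the search in
# TRANSACT._compute_optimal_time boils down to, for one consensus feature,
# scanning interpolation times tau in {0, 1/step, ..., 1} and keeping the tau
# where source and target projections are closest in distribution, i.e. where
# the two-sample Kolmogorov-Smirnov statistic is smallest. The toy data below
# is synthetic and only the source distribution drifts with tau, which is a
# simplification of the actual interpolation; only scipy.stats.ks_2samp is
# assumed.
def _demo_optimal_time(step=100, seed=0):
    rng = np.random.default_rng(seed)
    source_scores = rng.normal(loc=0.0, scale=1.0, size=200)
    target_scores = rng.normal(loc=1.0, scale=1.0, size=150)
    statistics = []
    for s in range(step + 1):
        tau = s / step
        # Mimic interpolation: shift the source distribution toward the target one.
        interpolated_source = source_scores + tau * 1.0
        statistics.append(scipy.stats.ks_2samp(interpolated_source, target_scores).statistic)
    return np.argmin(statistics) / step  # Close to 1.0 for this toy example.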
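
# Illustrative end-to-end usage (not part of the original module), on synthetic
# data: fit the consensus features, compare the PV similarities to a
# gene-shuffling null distribution as a sanity check on n_pv, train the
# drug-response predictor, and predict on target samples. The kernel
# parameters and component counts are arbitrary choices for the toy data, and
# `principal_vectors_.canonical_angles` is assumed from its use in
# _gene_shuffling above; this sketch also assumes the package-local modules
# behave as they are used in the class.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    source = rng.normal(size=(100, 50))  # e.g., cell lines x genes
    target = rng.normal(size=(80, 50))   # e.g., tumors x genes (same gene order!)
    response = rng.normal(size=100)      # drug response of the source samples

    clf = TRANSACT(kernel='rbf', kernel_params={'gamma': 1e-3},
                   n_components={'source': 20, 'target': 15}, n_pv=10)

    # Compute only the PVs first and compare their similarities to a
    # gene-shuffling null distribution before committing to n_pv.
    clf.fit(source, target, with_interpolation=False)
    null = clf.null_distribution_pv_similarity(n_iter=20)
    observed = np.cos(clf.principal_vectors_.canonical_angles)
    print('PVs above the shuffled 95th percentile:',
          np.sum(observed > np.percentile(null, 95, axis=0)))

    # Full fit with interpolation, then train and apply the predictor.
    clf.fit(source, target)
    clf.fit_predictor(source, response)
    predictions = clf.predict(target)
    print('Predicted responses for %d target samples' % predictions.shape[0])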