def _replicates(self, records: List[TrainingRecord], cv: int = 5, comple_steps: int = 20,
                conta_steps: int = 20, n_replicates: int = 10):
    """
    Generator that yields test/training splits to be fed into subprocesses running
    _completeness_cv.

    :param records: the complete set of TrainingRecords
    :param cv: number of folds in the crossvalidation to be performed
    :param comple_steps: number of steps between 0 and 1 (relative completeness) to be simulated
    :param conta_steps: number of steps between 0 and 1 (relative contamination level) to be simulated
    :param n_replicates: number of replicates for the entire crossvalidation
    :return: yields parameter lists to submit to the worker process
    """
    for r in range(n_replicates):
        X, y, tn, ft = get_x_y_tn_ft(records)
        # shuffle=True is required for random_state to take effect
        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=self.random_state)
        fold = 0
        for train_index, test_index in skf.split(X, y):
            fold += 1
            # separate into training set lists:
            training_records = [records[i] for i in train_index]
            test_records = [records[i] for i in test_index]
            starting_message = (f"Starting comple/conta replicate "
                                f"{r + 1}/{n_replicates}: fold {fold}")
            yield [
                test_records, training_records, comple_steps, conta_steps,
                self.logger.level, starting_message
            ]
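# Sketch of how the generator above can be consumed. The docstring of _completeness_cv
# below states that the actual driver is run_cccv, which spawns subprocesses via a
# concurrent.futures.ProcessPoolExecutor; the standalone driver shown here is a
# hypothetical, simplified illustration of that pattern, not the original method.
def _run_cccv_sketch(self, records, cv=5, comple_steps=20, conta_steps=20,
                     n_replicates=10, n_jobs=4):
    from concurrent.futures import ProcessPoolExecutor

    # each yielded parameter list describes one fold of one replicate
    param_iter = self._replicates(records, cv=cv, comple_steps=comple_steps,
                                  conta_steps=conta_steps, n_replicates=n_replicates)
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        # one Dict[comple][conta] -> score per fold and replicate
        fold_results = list(executor.map(self._completeness_cv, param_iter))
    return fold_results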
def _validate_subset(self, records: List[TrainingRecord], estimator: Pipeline):
    """
    Use a fitted Pipeline to predict scores on resampled test data.
    Part of the compleconta crossvalidation where only validation is performed.

    :param records: test records as a List of TrainingRecord objects
    :param estimator: classifier previously trained as a sklearn.Pipeline object
    :return: score
    """
    X, y, tn, ft = get_x_y_tn_ft(records)
    preds = estimator.predict(X)
    score = self.scoring_method(y, preds)
    return score
def _completeness_cv(self, param, **kwargs) -> Dict[float, Dict[float, float]]:
    """
    Perform completeness/contamination simulation and testing for one fold.
    This is a separate function only called by run_cccv, which spawns subprocesses
    using a ProcessPoolExecutor from concurrent.futures.

    :param param: List [test_records, training_records, comple_steps, conta_steps,
                  verb, starting_message]; workaround to get multiple parameters into
                  this function (using processor.map)
    """
    # unpack parameters
    test_records, training_records, comple_steps, conta_steps, verb, starting_message = param

    # a new logger is needed; self.logger is not accessible from a different process
    logger = get_logger(__name__, verb=verb)
    logger.info(starting_message)

    classifier = copy.deepcopy(self.pipeline)
    if self.reduce_features:
        recursive_feature_elimination(training_records, classifier,
                                      n_features=self.n_features,
                                      random_state=self.random_state)

    X_train, y_train, tn, ft = get_x_y_tn_ft(training_records)
    classifier.fit(X=X_train, y=y_train, **kwargs)

    # initialize the resampler with the test_records only,
    # so the samples are unknown to the classifier
    resampler = TrainingRecordResampler(random_state=self.random_state, verb=False)
    resampler.fit(records=test_records)

    cv_scores = {}
    comple_increment = 1 / comple_steps
    conta_increment = 1 / conta_steps
    for comple in np.arange(0, 1.05, comple_increment):
        comple = np.round(comple, 2)
        cv_scores[comple] = {}
        for conta in np.arange(0, 1.05, conta_increment):
            conta = np.round(conta, 2)
            resampled_set = [
                resampler.get_resampled(x, comple, conta) for x in test_records
            ]
            cv_scores[comple][conta] = self._validate_subset(resampled_set, classifier)
    return cv_scores
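# Illustrative helper (not part of the original module): collapse the per-fold
# Dict[comple][conta] -> score results produced by _completeness_cv into a mean and
# standard deviation per completeness/contamination grid point.
def aggregate_cccv_scores(fold_results):
    from collections import defaultdict
    import numpy as np

    collected = defaultdict(lambda: defaultdict(list))
    for fold_scores in fold_results:
        for comple, conta_scores in fold_scores.items():
            for conta, score in conta_scores.items():
                collected[comple][conta].append(score)
    return {
        comple: {
            conta: {"score_mean": float(np.mean(scores)),
                    "score_sd": float(np.std(scores))}
            for conta, scores in conta_scores.items()
        }
        for comple, conta_scores in collected.items()
    }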
def train(self, records: List[TrainingRecord], reduce_features: bool = False,
          n_features: int = 10000, **kwargs):
    """
    Fit CountVectorizer and train LinearSVC on a list of TrainingRecord.

    :param records: a List[TrainingRecord] for fitting of CountVectorizer and training of LinearSVC.
    :param reduce_features: toggles feature reduction using recursive feature elimination
    :param n_features: minimum number of features to retain when reducing features
    :param kwargs: additional named arguments are passed to the fit() method of Pipeline.
    :returns: the trained classifier instance (self), or False if the Pipeline was already fitted.
    """
    self.logger.info("Begin training classifier.")
    X, y, tn, ft = get_x_y_tn_ft(records)
    if self.trait_name is not None or self.feature_type is not None:
        self.logger.warning("Pipeline is already fitted. Refusing to fit again.")
        return False

    if reduce_features:
        self.logger.info("Using recursive feature elimination as feature selection strategy.")
        # use the non-calibrated classifier
        recursive_feature_elimination(records, self.cv_pipeline, n_features=n_features)

    self.trait_name = tn
    self.feature_type = ft

    extra_explainer_arg = kwargs.pop('train_explainer', None)
    if extra_explainer_arg is not None:
        self.logger.warning(
            f'{self.__class__.__name__} provides SHAP explanations without '
            f'training an Explainer. Argument '
            f'"train_explainer"={extra_explainer_arg} ignored.')
    self.pipeline.fit(X=X, y=y, **kwargs)
    self.logger.info("Classifier training completed.")
    return self
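# Minimal usage sketch for train() (assumption: `records` is a List[TrainingRecord]
# obtained from the project's data loading utilities; the constructor arguments shown
# mirror those used in the test suite below and are illustrative only).
def _train_example(records):
    svm = TrexSVM(verb=True, random_state=42)
    fitted = svm.train(records, reduce_features=True, n_features=10000)
    if fitted is False:
        # train() refuses to fit an already-fitted Pipeline and returns False
        raise RuntimeError("Pipeline was already fitted.")
    return fitted  # the trained classifier itself, since train() returns self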
def test_recursive_feature_elimination(self, trait_name, n_features):
    """
    Perform feature compression tests; applicable only to the SVM, contraindicated for XGB.

    :param trait_name: name of the trait for which test data are loaded
    :param n_features: minimum number of features to retain during elimination
    :return:
    """
    training_records, genotype, phenotype, group = self.test_load_data(trait_name, False)
    svm = TrexSVM(verb=True, random_state=RANDOM_STATE)
    recursive_feature_elimination(
        records=training_records,
        pipeline=svm.cv_pipeline,
        step=0.01,
        n_features=n_features,
    )

    vec = svm.cv_pipeline.named_steps["vec"]
    vec._validate_vocabulary()

    # check if the vocabulary is set properly
    assert vec.fixed_vocabulary_

    # check if the length of the vocabulary is matching
    assert len(vec.vocabulary_) >= n_features

    X, y, tn, ft = get_x_y_tn_ft(training_records)
    X_trans = vec.transform(X)

    # check if the number of unique features is matching
    assert X_trans.shape[1] >= n_features

    # check that every sample still has at least one feature present
    rows_with_features = set(X_trans.nonzero()[0])
    assert rows_with_features == set(range(X_trans.shape[0]))
def crossvalidate(self, records: List[TrainingRecord], cv: int = 5,
                  scoring: Union[str, Callable] = DEFAULT_SCORING_FUNCTION,
                  n_jobs=-1, n_replicates: int = 10, groups: bool = False,
                  reduce_features: bool = False, n_features: int = 10000,
                  demote=False, **kwargs) -> Tuple[float, float, np.ndarray]:
    """
    Perform cv-fold crossvalidation, or leave-one-group-out validation if groups == True.

    :param records: training records to perform crossvalidation on.
    :param scoring: String identifying the scoring function of the crossvalidation, or a Callable.
                    If a callable is passed, it must take two parameters `y_true` and `y_pred`
                    (iterables of true and predicted class labels, respectively) and return a
                    (numeric) score.
    :param cv: Number of folds in crossvalidation. Default: 5
    :param n_jobs: Number of parallel jobs. Default: -1 (all processors used)
    :param n_replicates: Number of replicates of the crossvalidation
    :param groups: If True, use group information stored in records for splitting. Otherwise,
                   stratify the split according to the labels in records. This also resets
                   n_replicates to 1.
    :param reduce_features: toggles feature reduction using recursive feature elimination
    :param n_features: minimum number of features to retain when reducing features
    :param demote: If True, messages are logged at debug level instead of info.
    :param kwargs: Unused
    :return: A tuple of the mean score, the score standard deviation, and the fraction of
             misclassifications per sample.
    """
    if n_jobs != 1 and self.n_jobs > 1:
        self.logger.info('Will use selected classifier parallelism instead of multithreading.')
        n_jobs = self.n_jobs

    if callable(scoring):
        scoring_func = scoring
    else:
        scoring_func = self.scoring_function_mapping.get(scoring)
    assert scoring_func is not None, f'Invalid or missing scoring function: {scoring}.'
    log_function = self.logger.debug if demote else self.logger.info
    t1 = time()
    X, y, tn, ft = get_x_y_tn_ft(records)

    # unfortunately, RFECV does not work with pipelines (the vectorizer must be used separately)
    self.cv_pipeline.fit(X, y)
    vec = self.cv_pipeline.named_steps["vec"]
    clf = self.cv_pipeline.named_steps["clf"]

    if not vec.vocabulary:
        vec.fit(X)
    X_trans = vec.transform(X)

    misclassifications = np.zeros(len(y))
    scores = []

    if groups:
        log_function("Begin Leave-One-Group-Out validation on training data.")
        splitting_strategy = LeaveOneGroupOut()
        group_ids = get_groups(records)
        n_replicates = 1
    else:
        log_function("Begin cross-validation on training data.")
        splitting_strategy = StratifiedKFold(n_splits=cv, shuffle=True,
                                             random_state=self.random_state)
        group_ids = None

    for i in range(n_replicates):
        inner_cv = StratifiedKFold(n_splits=cv, shuffle=True, random_state=self.random_state)
        outer_cv = splitting_strategy
        for tr, ts in outer_cv.split(X_trans, y, groups=group_ids):
            if reduce_features:
                est = RFECV(estimator=clf, cv=inner_cv, n_jobs=n_jobs,
                            step=DEFAULT_STEP_SIZE, min_features_to_select=n_features,
                            scoring=DEFAULT_SCORING_FUNCTION)
            else:
                est = clf
            est.fit(X_trans[tr], y[tr])
            y_pred = est.predict(X_trans[ts])
            mismatch = np.logical_xor(y[ts], y_pred)
            mismatch_indices = ts[np.where(mismatch)]
            misclassifications[mismatch_indices] += 1
            score = scoring_func(y[ts], y_pred)
            scores.append(score)
        log_function(f"Finished replicate {i + 1} of {n_replicates}")

    misclassifications /= n_replicates
    score_mean, score_sd = float(np.mean(scores)), float(np.std(scores))
    t2 = time()
    log_function("Cross-validation completed.")
    log_function(f"Total duration of cross-validation: {np.round(t2 - t1, 2)} seconds.")
    return score_mean, score_sd, misclassifications
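# Illustrative use of the crossvalidate() return value (not part of the original
# module): the third element holds, per training record, the fraction of replicates
# in which that record was misclassified, which can be used to flag unstable samples.
def report_unstable_samples(clf, records, threshold=0.5):
    score_mean, score_sd, misclassifications = clf.crossvalidate(records, demote=True)
    unstable_indices = [i for i, frac in enumerate(misclassifications)
                        if frac >= threshold]
    return score_mean, score_sd, unstable_indices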
def parameter_search(self, records: List[TrainingRecord],
                     search_params: Dict[str, List] = None, cv: int = 5,
                     scoring: str = DEFAULT_SCORING_FUNCTION, n_jobs: int = -1,
                     n_iter: int = 10, return_optimized: bool = False):
    """
    Perform a stratified, randomized parameter search. If desired, return a new class
    instance with optimized training parameters.

    :param records: training records to perform the parameter search on.
    :param search_params: A dictionary of iterables of possible model training parameters.
                          If None, use the default search parameters for the given classifier.
    :param scoring: Scoring function of crossvalidation. Default: balanced accuracy.
    :param cv: Number of folds in crossvalidation. Default: 5
    :param n_jobs: Number of parallel jobs. Default: -1 (all processors used)
    :param n_iter: Number of grid points to evaluate. Default: 10
    :param return_optimized: Whether to return a ready-made classifier with the optimized
                             params instead of a dictionary of params.
    :return: A dictionary containing the best found parameters, or an optimized class instance.
    """
    if n_jobs != 1 and self.n_jobs > 1:
        self.logger.info('Will use selected classifier parallelism instead of multithreading.')
        n_jobs = self.n_jobs

    t1 = time()
    self.logger.info('Performing randomized parameter search.')
    X, y, tn, ft = get_x_y_tn_ft(records)
    if search_params is None:
        search_params = self.default_search_params

    vec = clone(self.cv_pipeline.named_steps['vec'])
    clf = clone(self.cv_pipeline.named_steps['clf'])

    X_trans = vec.fit_transform(X)
    cv = StratifiedKFold(n_splits=cv, shuffle=True, random_state=self.random_state)
    rcv = RandomizedSearchCV(estimator=clf,
                             scoring=scoring,
                             param_distributions=search_params,
                             n_jobs=n_jobs, n_iter=n_iter, cv=cv,
                             verbose=1 if self.verb else 0)
    rcv.fit(X_trans, y=y)
    best_params = rcv.best_params_
    t2 = time()
    gc.collect()  # essential due to imperfect memory management of the XGBoost sklearn interface

    self.logger.info(f'Optimized params:\n{pformat(best_params)}')
    self.logger.info(f'{np.round(t2 - t1)} sec elapsed during parameter search.')
    if return_optimized:
        self.logger.info(f'Returning optimized instance of {self.__class__.__name__}.')
        return self.get_instance(**best_params,
                                 random_state=self.random_state_init,
                                 verb=self.verb)
    return best_params
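# Usage sketch for parameter_search() (illustrative values; `clf` is a classifier
# instance exposing this method, `records` a List[TrainingRecord]).
def _parameter_search_example(clf, records):
    # dictionary of the best parameters found by the randomized search
    best_params = clf.parameter_search(records, n_iter=20)
    # alternatively, obtain a fresh classifier instance built with those parameters
    optimized_clf = clf.parameter_search(records, n_iter=20, return_optimized=True)
    return best_params, optimized_clf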
def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline,
                                  step: float = DEFAULT_STEP_SIZE, n_features: int = None,
                                  random_state: np.random.RandomState = None):
    """
    Apply RFE to limit the vocabulary used by the CustomVectorizer, with an optional step size.

    :param records: list of TrainingRecords, entire training set.
    :param pipeline: the pipeline whose vocabulary should be modified
    :param step: rate of features to eliminate at each step. The lower the number, the more steps.
    :param n_features: number of features to select (if None: half of the provided features)
    :param random_state: random state for deterministic results
    :return: number of features used
    """
    t1 = time()

    X, y, tn, ft = get_x_y_tn_ft(records)
    vec = pipeline.named_steps["vec"]
    estimator = pipeline.named_steps["clf"]

    if not vec.vocabulary:
        vec.fit(X)
    previous_vocabulary = vec.vocabulary_

    if not n_features:
        n_features = len(previous_vocabulary) // 2

    X_trans = vec.transform(X)

    logger = get_logger(__name__, verb=True)
    split = StratifiedKFold(shuffle=True, n_splits=5, random_state=random_state)
    selector = RFECV(estimator, step=step, min_features_to_select=n_features, cv=split,
                     n_jobs=5, scoring=DEFAULT_SCORING_FUNCTION)
    selector = selector.fit(X=X_trans, y=y)

    original_size = len(previous_vocabulary)
    support = selector.get_support()
    support = support.nonzero()[0]
    new_id = {support[x]: x for x in range(len(support))}
    vocabulary = {
        feature: new_id[i]
        for feature, i in previous_vocabulary.items() if new_id.get(i) is not None
    }
    size_after = selector.n_features_

    t2 = time()
    logger.info(f"{size_after}/{original_size} features selected using Recursive Feature"
                f" Elimination in {np.round(t2 - t1, 2)} seconds.")

    # set the reduced vocabulary on the vectorizer
    pipeline.named_steps["vec"].vocabulary = vocabulary
    pipeline.named_steps["vec"].vocabulary_ = vocabulary
    pipeline.named_steps["vec"].fixed_vocabulary_ = True

    return size_after
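# Illustrative check of the side effect of recursive_feature_elimination() (assumption:
# a classifier such as TrexSVM from the test above, exposing cv_pipeline): the
# vectorizer's vocabulary is pruned in place and fixed for all subsequent transforms.
def _rfe_example(records, n_features=10000):
    svm = TrexSVM(verb=True, random_state=42)
    n_selected = recursive_feature_elimination(records, svm.cv_pipeline,
                                               step=0.01, n_features=n_features)
    vec = svm.cv_pipeline.named_steps["vec"]
    # the vocabulary is now fixed and retains at least n_features entries
    assert vec.fixed_vocabulary_
    assert len(vec.vocabulary_) >= n_features
    return n_selected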