Esempio n. 1
0
    def _replicates(self,
                    records: List[TrainingRecord],
                    cv: int = 5,
                    comple_steps: int = 20,
                    conta_steps: int = 20,
                    n_replicates: int = 10):
        """
        Generator function to yield test/training sets which will be fed into subprocesses for
        _completeness_cv

        One parameter list is yielded per (replicate, fold) combination, i.e.
        ``n_replicates * cv`` lists in total.

        :param records: the complete set of TrainingRecords
        :param cv: number of folds in the crossvalidation to be performed
        :param comple_steps: number of steps between 0 and 1 (relative completeness) to be simulated
        :param conta_steps: number of steps between 0 and 1 (relative contamination level)
                            to be simulated
        :param n_replicates: number of replicates for the entire crossvalidation
        :return: parameter list to submit to worker process, laid out as
                 [test_records, training_records, comple_steps, conta_steps,
                 logger level, starting_message]
        """
        # The records do not change between replicates, so extract features and labels once.
        X, y, _, _ = get_x_y_tn_ft(records)
        for replicate in range(1, n_replicates + 1):
            # shuffle=True is required for random_state to have any effect; without it
            # every replicate gets identical folds, and scikit-learn >= 0.24 rejects the
            # combination outright with a ValueError.
            # NOTE(review): if self.random_state is a plain int seed, every replicate
            # still draws the same shuffled folds; a RandomState instance varies them.
            skf = StratifiedKFold(n_splits=cv,
                                  shuffle=True,
                                  random_state=self.random_state)
            for fold, (train_index, test_index) in enumerate(skf.split(X, y),
                                                             start=1):
                # separate in training set lists:
                training_records = [records[i] for i in train_index]
                test_records = [records[i] for i in test_index]
                starting_message = (
                    f"Starting comple/conta replicate {replicate}/{n_replicates}: fold {fold}"
                )
                yield [
                    test_records, training_records, comple_steps, conta_steps,
                    self.logger.level, starting_message
                ]
Esempio n. 2
0
    def _validate_subset(self, records: List[TrainingRecord],
                         estimator: Pipeline):
        """
        Score an already fitted Pipeline on a set of (resampled) test records.
        This is the validation-only part of the compleconta crossvalidation.

        :param records: test records as a List of TrainingRecord objects
        :param estimator: classifier previously trained as a sklearn.Pipeline object
        :return: score as computed by self.scoring_method(true labels, predicted labels)
        """
        # Only features and labels are needed; trait name and feature type are discarded.
        features, labels, _, _ = get_x_y_tn_ft(records)
        predicted = estimator.predict(features)
        return self.scoring_method(labels, predicted)
Esempio n. 3
0
    def _completeness_cv(self, param,
                         **kwargs) -> Dict[float, Dict[float, float]]:
        """
        Perform completeness/contamination simulation and testing for one fold.
        This is a separate function only called by run_cccv which spawns
        subprocesses using a ProcessPoolExecutor from concurrent.futures

        :param param: List [test_records, training_records, comple_steps, conta_steps,
                      verb, starting_message]
                      workaround to get multiple parameters into this function. (using processor.map)
        :param kwargs: additional named arguments passed to the fit() method of the classifier
        :return: nested dictionary of scores, indexed as
                 ``cv_scores[completeness][contamination]``; both levels are rounded
                 to two decimals.
        """
        # unpack parameters
        test_records, training_records, comple_steps, conta_steps, verb, starting_message = param

        # needed to create a new logger, self.logger not accessible from a different process
        logger = get_logger(__name__, verb=verb)
        logger.info(starting_message)

        # work on a private copy so that feature elimination and fitting do not
        # modify self.pipeline
        classifier = copy.deepcopy(self.pipeline)
        if self.reduce_features:
            recursive_feature_elimination(training_records,
                                          classifier,
                                          n_features=self.n_features,
                                          random_state=self.random_state)

        X_train, y_train, _, _ = get_x_y_tn_ft(training_records)
        classifier.fit(X=X_train, y=y_train, **kwargs)

        # initialize the resampler with the test_records only,
        # so the samples are unknown to the classifier
        resampler = TrainingRecordResampler(random_state=self.random_state,
                                            verb=False)
        resampler.fit(records=test_records)

        # Evenly spaced levels covering exactly [0, 1]. linspace is used instead of
        # arange(0, 1.05, 1 / steps), which overshoots 1.0 as soon as steps > 20 and
        # depends on floating point rounding at the upper boundary.
        comple_levels = np.round(np.linspace(0, 1, comple_steps + 1), 2)
        conta_levels = np.round(np.linspace(0, 1, conta_steps + 1), 2)

        cv_scores = {}
        for comple in comple_levels:
            cv_scores[comple] = {}
            for conta in conta_levels:
                resampled_set = [
                    resampler.get_resampled(x, comple, conta)
                    for x in test_records
                ]
                cv_scores[comple][conta] = self._validate_subset(
                    resampled_set, classifier)
        return cv_scores
Esempio n. 4
0
    def train(self,
              records: List[TrainingRecord],
              reduce_features: bool = False,
              n_features: int = 10000,
              **kwargs):
        """
        Fit the classifier pipeline on a list of TrainingRecord.

        :param records: a List[TrainingRecord] used to fit self.pipeline.
        :param reduce_features: toggles feature reduction using recursive feature elimination
        :param n_features: minimum number of features to retain when reducing features
        :param kwargs: additional named arguments are passed to the fit() method of Pipeline.
                       A ``train_explainer`` argument is accepted but ignored (with a warning).
        :returns: ``False`` if the pipeline was already fitted (nothing is done in that case),
                  otherwise this instance (``self``) after fitting.
        """
        self.logger.info("Begin training classifier.")
        X, y, tn, ft = get_x_y_tn_ft(records)
        # trait_name / feature_type are only set by a successful training run,
        # so either being set marks the pipeline as already fitted.
        if self.trait_name is not None or self.feature_type is not None:
            self.logger.warning(
                "Pipeline is already fitted. Refusing to fit again.")
            return False
        if reduce_features:
            self.logger.info(
                "using recursive feature elimination as feature selection strategy"
            )
            # use non-calibrated classifier
            # random_state is forwarded (as in _completeness_cv) so that the
            # selected feature set is reproducible for a seeded classifier.
            recursive_feature_elimination(records,
                                          self.cv_pipeline,
                                          n_features=n_features,
                                          random_state=self.random_state)

        self.trait_name = tn
        self.feature_type = ft

        # 'train_explainer' must not reach Pipeline.fit(); drop it and tell the user.
        extra_explainer_arg = kwargs.pop('train_explainer', None)
        if extra_explainer_arg is not None:
            self.logger.warning(
                f'{self.__class__.__name__} provides SHAP explanations without '
                f'training an Explainer. Argument '
                f'"train_explainer"={extra_explainer_arg} ignored.')
        self.pipeline.fit(X=X, y=y, **kwargs)
        self.logger.info("Classifier training completed.")
        return self
Esempio n. 5
0
    def test_recursive_feature_elimination(self, trait_name, n_features):
        """
        Perform feature compression tests only for SVM; counterindicated for XGB.

        Runs recursive feature elimination on the SVM's cv_pipeline and checks that
        the vectorizer ends up with a fixed vocabulary of at least ``n_features``
        entries and that every sample keeps at least one feature.

        :param trait_name: name of the trait whose test data is loaded
        :param n_features: minimum number of features that must be retained
        :return: None
        """
        # only the training records are needed here
        training_records, _, _, _ = self.test_load_data(trait_name, False)
        svm = TrexSVM(verb=True, random_state=RANDOM_STATE)
        recursive_feature_elimination(
            records=training_records,
            pipeline=svm.cv_pipeline,
            step=0.01,
            n_features=n_features,
        )
        vec = svm.cv_pipeline.named_steps["vec"]
        vec._validate_vocabulary()

        # check if vocabulary is set properly
        assert vec.fixed_vocabulary_

        # check if length of vocabulary is matching
        assert len(vec.vocabulary_) >= n_features

        X, y, tn, ft = get_x_y_tn_ft(training_records)
        X_trans = vec.transform(X)

        # check if number of unique features is matching
        assert X_trans.shape[1] >= n_features

        # check if all samples still have at least one feature present:
        # nonzero() returns (row_indices, column_indices), so every sample (row)
        # must occur at least once among the row indices.
        rows_with_features = set(X_trans.nonzero()[0].tolist())
        assert len(rows_with_features) == X_trans.shape[0]
Esempio n. 6
0
    def crossvalidate(self,
                      records: List[TrainingRecord],
                      cv: int = 5,
                      scoring: Union[str, Callable] = DEFAULT_SCORING_FUNCTION,
                      n_jobs=-1,
                      n_replicates: int = 10,
                      groups: bool = False,
                      reduce_features: bool = False,
                      n_features: int = 10000,
                      demote=False,
                      **kwargs) -> Tuple[float, float, np.ndarray]:
        """
        Perform cv-fold crossvalidation or leave-one(-group)-out validation if groups == True

        :param records: training records to perform crossvalidation on.
        :param scoring: String identifying scoring function of crossvalidation, or Callable.
                        If a callable is passed, it must take two parameters `y_true` and `y_pred`
                        (iterables of true and predicted class labels, respectively) and return a
                        (numeric) score.
        :param cv: Number of folds in crossvalidation. Default: 5
        :param n_jobs: Number of parallel jobs. Default: -1 (All processors used)
        :param n_replicates: Number of replicates of the crossvalidation
        :param groups: If True, use group information stored in records for splitting. Otherwise,
            stratify split according to labels in records. This also resets n_replicates to 1.
        :param reduce_features: toggles feature reduction using recursive feature elimination
        :param n_features: minimum number of features to retain when reducing features
        :param demote: toggles logger that is used. if true, msg is written to debug else info
        :param kwargs: Unused
        :return: A tuple of mean score, score SD, and, per sample, the number of
                 misclassifications divided by the number of replicates
        """
        if n_jobs != 1 and self.n_jobs > 1:
            self.logger.info(
                'Will use selected classifier parallelism instead of multithreading.'
            )
            n_jobs = self.n_jobs

        if callable(scoring):
            scoring_func = scoring
        else:
            scoring_func = self.scoring_function_mapping.get(scoring)
        assert scoring_func is not None, f'invalid or missing scoring function: {scoring}.'
        log_function = self.logger.debug if demote else self.logger.info
        t1 = time()
        X, y, tn, ft = get_x_y_tn_ft(records)

        # unfortunately RFECV does not work with pipelines (need to use the vectorizer separately)
        self.cv_pipeline.fit(X, y)
        vec = self.cv_pipeline.named_steps["vec"]
        clf = self.cv_pipeline.named_steps["clf"]

        if not vec.vocabulary:
            vec.fit(X)
        X_trans = vec.transform(X)

        # per-sample counter of how often the sample was predicted wrongly
        misclassifications = np.zeros(len(y))
        scores = []

        if groups:
            log_function(
                "Begin Leave-One-Group-Out validation on training data.")
            splitting_strategy = LeaveOneGroupOut()
            group_ids = get_groups(records)
            # leave-one-group-out is deterministic, replicates would be identical
            n_replicates = 1
        else:
            log_function("Begin cross-validation on training data.")
            splitting_strategy = StratifiedKFold(
                n_splits=cv, shuffle=True, random_state=self.random_state)
            group_ids = None

        for i in range(n_replicates):
            # inner splitter is only used by RFECV when reduce_features is set
            inner_cv = StratifiedKFold(n_splits=cv,
                                       shuffle=True,
                                       random_state=self.random_state)
            outer_cv = splitting_strategy
            for tr, ts in outer_cv.split(X_trans, y, groups=group_ids):
                if reduce_features:
                    est = RFECV(estimator=clf,
                                cv=inner_cv,
                                n_jobs=n_jobs,
                                step=DEFAULT_STEP_SIZE,
                                min_features_to_select=n_features,
                                scoring=DEFAULT_SCORING_FUNCTION)
                else:
                    est = clf
                est.fit(X_trans[tr], y[tr])
                y_pred = est.predict(X_trans[ts])
                # Compare labels directly: logical_xor only works for 0/1 encoded
                # labels (e.g. with -1/1 both values are truthy and no mismatch
                # would ever be detected).
                mismatch = y[ts] != y_pred
                mismatch_indices = ts[mismatch]
                misclassifications[mismatch_indices] += 1
                score = scoring_func(y[ts], y_pred)
                scores.append(score)
            log_function(f"Finished replicate {i + 1} of {n_replicates}")

        # every sample is tested once per replicate, so this yields the fraction
        # of replicates in which the sample was misclassified
        misclassifications /= n_replicates
        score_mean, score_sd = float(np.mean(scores)), float(np.std(scores))
        t2 = time()
        log_function("Cross-validation completed.")
        log_function(
            f"Total duration of cross-validation: {np.round(t2 - t1, 2)} seconds."
        )
        return score_mean, score_sd, misclassifications
Esempio n. 7
0
    def parameter_search(self,
                         records: List[TrainingRecord],
                         search_params: Dict[str, List] = None,
                         cv: int = 5,
                         scoring: str = DEFAULT_SCORING_FUNCTION,
                         n_jobs: int = -1,
                         n_iter: int = 10,
                         return_optimized: bool = False):
        """
        Perform stratified, randomized parameter search. If desired, return a new class instance
        with optimized training parameters.

        :param records: training records to perform crossvalidation on.
        :param search_params: A dictionary of iterables of possible model training parameters.
                              If None, use default search parameters for the given classifier.
        :param scoring: Scoring function of crossvalidation. Default: Balanced Accuracy.
        :param cv: Number of folds in crossvalidation. Default: 5
        :param n_jobs: Number of parallel jobs. Default: -1 (All processors used)
        :param n_iter: Number of grid points to evaluate. Default: 10
        :param return_optimized: Whether to return a ready-made classifier
                                 with the optimized params instead of a dictionary of params.
        :return: A dictionary containing best found parameters or an optimized class instance.
        """
        # local import: only needed for the scikit-learn compatibility check below
        import inspect

        if n_jobs != 1 and self.n_jobs > 1:
            self.logger.info(
                'Will use selected classifier parallelism instead of multithreading.'
            )
            n_jobs = self.n_jobs

        t1 = time()
        self.logger.info('Performing randomized parameter search.')
        X, y, tn, ft = get_x_y_tn_ft(records)
        if search_params is None:
            search_params = self.default_search_params

        # work on unfitted clones so the search does not touch self.cv_pipeline
        vec = clone(self.cv_pipeline.named_steps['vec'])
        clf = clone(self.cv_pipeline.named_steps['clf'])

        X_trans = vec.fit_transform(X)
        splitter = StratifiedKFold(n_splits=cv,
                                   shuffle=True,
                                   random_state=self.random_state)
        search_kwargs = dict(estimator=clf,
                             scoring=scoring,
                             param_distributions=search_params,
                             n_jobs=n_jobs,
                             n_iter=n_iter,
                             cv=splitter,
                             verbose=1 if self.verb else 0)
        # The `iid` parameter was removed in scikit-learn 0.24; passing it there
        # raises a TypeError. Keep iid=False wherever it is still accepted so the
        # results on older versions do not change.
        if 'iid' in inspect.signature(RandomizedSearchCV).parameters:
            search_kwargs['iid'] = False
        rcv = RandomizedSearchCV(**search_kwargs)

        rcv.fit(X_trans, y=y)
        best_params = rcv.best_params_
        t2 = time()
        gc.collect(
        )  # essential due to imperfect memory management of XGBoost sklearn interface

        self.logger.info(f'Optimized params:\n{pformat(best_params)}')
        self.logger.info(
            f'{np.round(t2 - t1)} sec elapsed during parameter search.')
        if return_optimized:
            self.logger.info(
                f'Returning optimized instance of {self.__class__.__name__}.')
            return self.get_instance(**best_params,
                                     random_state=self.random_state_init,
                                     verb=self.verb)
        return best_params
Esempio n. 8
0
def recursive_feature_elimination(records: List[TrainingRecord],
                                  pipeline: Pipeline,
                                  step: float = DEFAULT_STEP_SIZE,
                                  n_features: int = None,
                                  random_state: np.random.RandomState = None):
    """
    Function to apply RFE to limit the vocabulary used by the CustomVectorizer, optional step.

    The "vec" step of the pipeline is modified in place: its vocabulary is replaced
    by the features selected by RFECV, re-indexed to consecutive column ids, and
    marked as fixed.

    :param records: list of TrainingRecords, entire training set.
    :param pipeline: the pipeline which vocabulary should be modified; must provide
                     the named steps "vec" and "clf"
    :param step: rate of features to eliminate at each step. the lower the number, the more steps
    :param n_features: minimum number of features to retain
                       (if None or 0: half of the provided features)
    :param random_state: random state for deterministic results
    :return: number of features used
    """
    t1 = time()

    X, y, _, _ = get_x_y_tn_ft(records)
    vec = pipeline.named_steps["vec"]
    estimator = pipeline.named_steps["clf"]

    if not vec.vocabulary:
        vec.fit(X)

    # Transform before reading vocabulary_: with a user supplied (fixed) vocabulary
    # on a not yet fitted vectorizer, vocabulary_ does not exist at this point.
    # NOTE(review): assumes the vectorizer's transform() validates the fixed
    # vocabulary like sklearn's CountVectorizer does - confirm for CustomVectorizer.
    X_trans = vec.transform(X)
    previous_vocabulary = vec.vocabulary_
    original_size = len(previous_vocabulary)

    if not n_features:
        n_features = original_size // 2

    logger = get_logger(__name__, verb=True)
    split = StratifiedKFold(shuffle=True,
                            n_splits=5,
                            random_state=random_state)
    selector = RFECV(estimator,
                     step=step,
                     min_features_to_select=n_features,
                     cv=split,
                     n_jobs=5,
                     scoring=DEFAULT_SCORING_FUNCTION)
    selector = selector.fit(X=X_trans, y=y)

    # column indices (in the old vocabulary) of the features that were kept
    support = selector.get_support().nonzero()[0]
    # map old column index -> new, consecutive column index
    new_id = {old: new for new, old in enumerate(support)}
    vocabulary = {
        feature: new_id[i]
        for feature, i in previous_vocabulary.items()
        if i in new_id
    }
    size_after = selector.n_features_

    t2 = time()

    logger.info(
        f"{size_after}/{original_size} features selected using Recursive Feature Eliminiation."
        f" in {np.round(t2 - t1, 2)} seconds.")

    # set vocabulary to vectorizer
    vec.vocabulary = vocabulary
    vec.vocabulary_ = vocabulary
    vec.fixed_vocabulary_ = True

    return size_after