Esempio n. 1
0
    def _get_support_mask(self):
        """Build one boolean selection mask per entry of ``self.scores_``.

        ``self.scores_`` is iterated with ``.items()``, i.e. it is used as a
        mapping from an identifier (``roi_id``) to a sequence of scores. The
        same ``self.percentile`` is applied independently to every entry.

        Returns
        -------
        dict
            Maps each key of ``self.scores_`` to a boolean array that is True
            for the features kept for that key.
        """
        check_is_fitted(self, 'scores_')

        # The two extreme percentiles are answered from the lengths alone, so
        # the score values (NaNs included) never influence the result.
        # The builtin ``bool`` is used as dtype: the ``np.bool`` alias is
        # deprecated and missing from several NumPy releases.
        if self.percentile == 100:
            return {k: np.ones(len(score), dtype=bool) for k, score in self.scores_.items()}
        elif self.percentile == 0:
            return {k: np.zeros(len(score), dtype=bool) for k, score in self.scores_.items()}

        masks = dict()
        for roi_id, score in self.scores_.items():
            # NOTE(review): assumes _clean_nans returns a NumPy array that
            # supports element-wise comparison -- confirm against its definition.
            scores = _clean_nans(score)
            if len(scores) == 1:
                # A single feature is always kept, whatever the percentile.
                mask = np.array([True])
            else:
                threshold = stats.scoreatpercentile(scores,
                                                    100 - self.percentile)
                mask = scores > threshold
                ties = np.where(scores == threshold)[0]
                if len(ties):
                    # Features scoring exactly the threshold are admitted in
                    # index order until the percentile budget is used up.
                    # int() keeps the slice bound integral when percentile is
                    # a float; max() stops a negative budget from slicing
                    # relative to the end of ``ties``.
                    max_feats = int(len(scores) * self.percentile // 100)
                    n_extra = max(max_feats - int(mask.sum()), 0)
                    mask[ties[:n_extra]] = True
            masks[roi_id] = mask
        return masks
Esempio n. 2
0
    def _get_support_mask(self):
        """Build one boolean selection mask per entry of ``self.scores_``.

        ``self.scores_`` is iterated with ``.items()``, i.e. it is used as a
        mapping from an identifier (``roi_id``) to a sequence of scores. The
        same ``self.percentile`` is applied independently to every entry.

        Returns
        -------
        dict
            Maps each key of ``self.scores_`` to a boolean array that is True
            for the features kept for that key.
        """
        check_is_fitted(self, 'scores_')

        # The two extreme percentiles are answered from the lengths alone, so
        # the score values (NaNs included) never influence the result.
        # The builtin ``bool`` is used as dtype: the ``np.bool`` alias is
        # deprecated and missing from several NumPy releases.
        if self.percentile == 100:
            return {k: np.ones(len(score), dtype=bool) for k, score in self.scores_.items()}
        elif self.percentile == 0:
            return {k: np.zeros(len(score), dtype=bool) for k, score in self.scores_.items()}

        masks = dict()
        for roi_id, score in self.scores_.items():
            # NOTE(review): assumes _clean_nans returns a NumPy array that
            # supports element-wise comparison -- confirm against its definition.
            scores = _clean_nans(score)
            if len(scores) == 1:
                # A single feature is always kept, whatever the percentile.
                mask = np.array([True])
            else:
                threshold = stats.scoreatpercentile(scores,
                                                    100 - self.percentile)
                mask = scores > threshold
                ties = np.where(scores == threshold)[0]
                if len(ties):
                    # Features scoring exactly the threshold are admitted in
                    # index order until the percentile budget is used up.
                    # int() keeps the slice bound integral when percentile is
                    # a float; max() stops a negative budget from slicing
                    # relative to the end of ``ties``.
                    max_feats = int(len(scores) * self.percentile // 100)
                    n_extra = max(max_feats - int(mask.sum()), 0)
                    mask[ties[:n_extra]] = True
            masks[roi_id] = mask
        return masks
    def _get_support_mask(self):
        """Return a boolean mask of the features scoring above ``self.thresh``.

        * ``thresh == 'none'`` -> every feature is selected.
        * ``thresh == 0``      -> no feature is selected.
        * otherwise            -> features whose cleaned score is strictly
          greater than ``thresh`` are selected.
        """
        check_is_fitted(self, 'scores_')

        if self.thresh == 'none':
            return np.ones(self.scores_.shape, dtype=bool)
        elif self.thresh == 0:
            # NOTE(review): a threshold of 0 rejects everything instead of
            # keeping scores > 0 -- confirm this special case is intended.
            return np.zeros(self.scores_.shape, dtype=bool)

        # No ranking is needed for a fixed cut-off: a single element-wise
        # comparison on the cleaned scores yields the mask directly.
        scores = _clean_nans(self.scores_)
        return np.array(scores > self.thresh, dtype=bool)
Esempio n. 4
0
    def _get_support_mask(self):
        """Return the boolean feature mask produced by ``self.threshold``.

        ``self.threshold`` equal to 1 selects every feature and equal to 0
        selects none. Otherwise the mask comes from
        ``self.threshold.fit_transform`` applied to the cleaned scores, and
        features scoring exactly ``self.threshold.value`` are then admitted in
        index order while the budget ``len(scores) * threshold.value`` allows.
        """
        # The extreme settings are answered from the length alone, so score
        # values (NaNs included) never matter. The builtin ``bool`` is used as
        # dtype: the ``np.bool`` alias is deprecated and missing from several
        # NumPy releases.
        if self.threshold == 1:
            return np.ones(len(self.scores_), dtype=bool)
        elif self.threshold == 0:
            return np.zeros(len(self.scores_), dtype=bool)

        scores = _clean_nans(self.scores_)

        # NOTE(review): assumes fit_transform returns a writable boolean array
        # aligned with ``scores`` -- confirm against the threshold object.
        mask = self.threshold.fit_transform(scores)
        ties = np.where(scores == self.threshold.value)[0]
        if len(ties):
            # int() keeps the slice bound integral when ``value`` is a float
            # (a float bound makes the slice raise TypeError); max() stops a
            # negative budget from slicing relative to the end of ``ties``.
            max_feats = int(len(scores) * self.threshold.value)
            n_extra = max(max_feats - int(mask.sum()), 0)
            mask[ties[:n_extra]] = True
        return mask
Esempio n. 5
0
    def _get_support_mask(self):
        """Return a boolean mask flagging the ``self.k`` best-scoring features.

        ``k == 'all'`` keeps every feature and ``k == 0`` keeps none. A ``k``
        larger than the number of features is clamped to that number; when
        ``self.validate`` is truthy the clamping is reported via ``logger``.
        """
        check_is_fitted(self, 'scores_')

        # Sentinel settings are resolved before any score is examined.
        if self.k == 'all':
            return np.ones(self.scores_.shape, dtype=bool)
        if self.k == 0:
            return np.zeros(self.scores_.shape, dtype=bool)

        cleaned = _clean_nans(self.scores_)
        support = np.zeros(cleaned.shape, dtype=bool)
        n_features = len(support)

        n_keep = self.k
        if n_keep > n_features:
            if self.validate:
                logger.warning('Configured k ({}) is larger than number of features ({}) '
                               'so none will be eliminated'.format(n_keep, n_features))
            n_keep = n_features

        # Mergesort is requested because it is stable; it costs extra memory
        # (~40MB per megafeature on x86-64).
        ranking = np.argsort(cleaned, kind="mergesort")
        support[ranking[-n_keep:]] = 1
        return support
    def _get_support_mask(self):
        """Combine the chi2, vectors and log-odds masks and prune fitted state.

        The returned mask is the element-wise AND of a chi2-based mask built
        here, ``self.vectors_mask`` and ``self.log_odds_mask``.

        Side effects: ``self.scores_``, ``self.log_odds_mask`` and
        ``self.vectors_mask`` are re-indexed by the returned mask, and
        ``self.vocabulary_`` is replaced by the result of
        ``update_dict_according_to_mask``.

        NOTE(review): because the fitted state is shrunk on every call, a
        second call operates on the already-pruned arrays. Confirm callers
        invoke this only once per fit.
        """
        k = self.k
        chi2_scores = self.scores_
        # Start from "keep everything"; entries are only ever cleared below.
        chi2_mask = np.ones(chi2_scores.shape, dtype=bool)

        if k != 'all' and k < len(chi2_scores):
            # we don't want all features to be kept, and the number we want is less than the number available
            chi2_scores = _clean_nans(chi2_scores)
            # NOTE(review): argsort is ascending, so this clears the k
            # lowest-scoring features and leaves len(scores) - k set. The
            # comment above reads as if k is the number to keep -- confirm
            # which of the two is intended.
            selected_indices = np.argsort(chi2_scores)[:k]
            chi2_mask[selected_indices] = False

        mask = chi2_mask & self.vectors_mask & self.log_odds_mask
        logging.info('%d/%d features survived feature selection', np.count_nonzero(mask), len(mask))

        # Only keep the scores of the features that survived. This array is used to check the
        # input data shape at train and decode time matches. However, because the post-feature-selections
        # vocabulary is passed back into the vectorizer, at decode time the input will likely be smaller. This is
        # like doing feature selection in the vectorizer.
        self.scores_ = self.scores_[mask]
        self.log_odds_mask = self.log_odds_mask[mask]
        self.vectors_mask = self.vectors_mask[mask]

        self.vocabulary_ = update_dict_according_to_mask(self.vocabulary_, mask)
        return mask
Esempio n. 7
0
    def _get_support_mask(self):
        """Return a boolean mask selecting features by group-averaged score.

        ``k == 'all'`` keeps every feature and ``k == 0`` keeps none.
        Otherwise the cleaned scores are split into consecutive runs of
        ``self.group_size`` elements, each run is overwritten with its mean,
        and the ``k * group_size`` highest entries of the result are flagged.
        """
        check_is_fitted(self, 'scores_')

        # Sentinel settings are resolved before any score is examined.
        if self.k == 'all':
            return np.ones(self.scores_.shape, dtype=bool)
        if self.k == 0:
            return np.zeros(self.scores_.shape, dtype=bool)

        pooled = _clean_nans(self.scores_)
        size = self.group_size
        n_groups = math.ceil(len(pooled) / size)
        for group_idx in range(n_groups):
            # Every member of a group receives the group's mean score.
            window = slice(group_idx * size, (group_idx + 1) * size)
            pooled[window] = np.mean(pooled[window])

        support = np.zeros(pooled.shape, dtype=bool)

        # Mergesort is requested because it is stable; it costs extra memory
        # (~40MB per megafeature on x86-64).
        ranking = np.argsort(pooled, kind="mergesort")
        support[ranking[-self.k * size:]] = 1
        return support