def _get_support_mask(self):
    """Return a dict mapping each ROI id to a boolean mask of the features
    whose score falls within the top ``self.percentile`` percent.

    Ties at the percentile threshold are kept only up to the number of
    features the percentile budget allows.
    """
    check_is_fitted(self, 'scores_')
    # Cater for NaNs: the 0/100 shortcuts skip the percentile math.
    # NOTE: np.bool was deprecated (NumPy 1.20) and removed (1.24);
    # the builtin bool is the correct dtype argument.
    if self.percentile == 100:
        return {k: np.ones(len(score), dtype=bool)
                for k, score in self.scores_.items()}
    elif self.percentile == 0:
        return {k: np.zeros(len(score), dtype=bool)
                for k, score in self.scores_.items()}
    masks = dict()
    for roi_id, score in self.scores_.items():
        scores = _clean_nans(score)
        if len(scores) == 1:
            # A single feature is always kept for 0 < percentile < 100.
            mask = np.array([True])
        else:
            threshold = stats.scoreatpercentile(scores,
                                                100 - self.percentile)
            mask = scores > threshold
            # Keep only as many tied-at-threshold features as the
            # percentile budget allows.
            ties = np.where(scores == threshold)[0]
            if len(ties):
                max_feats = len(scores) * self.percentile // 100
                kept_ties = ties[:max_feats - mask.sum()]
                mask[kept_ties] = True
        masks[roi_id] = mask
    return masks
def _get_support_mask(self):
    """Per-ROI percentile-based support masks.

    For each entry of ``self.scores_`` (ROI id -> score array), build a
    boolean mask selecting the top ``self.percentile`` percent of scores,
    resolving ties at the threshold within the percentile budget.
    """
    check_is_fitted(self, 'scores_')
    # Cater for NaNs; also use builtin bool — np.bool is deprecated
    # (NumPy 1.20) and removed entirely in NumPy 1.24.
    if self.percentile == 100:
        return {k: np.ones(len(score), dtype=bool)
                for k, score in self.scores_.items()}
    elif self.percentile == 0:
        return {k: np.zeros(len(score), dtype=bool)
                for k, score in self.scores_.items()}
    masks = dict()
    for roi_id, score in self.scores_.items():
        scores = _clean_nans(score)
        if len(scores) == 1:
            # One feature: always kept for intermediate percentiles.
            mask = np.array([True])
        else:
            threshold = stats.scoreatpercentile(scores,
                                                100 - self.percentile)
            mask = scores > threshold
            # Admit tied scores only while the feature budget permits.
            ties = np.where(scores == threshold)[0]
            if len(ties):
                max_feats = len(scores) * self.percentile // 100
                kept_ties = ties[:max_feats - mask.sum()]
                mask[kept_ties] = True
        masks[roi_id] = mask
    return masks
def _get_support_mask(self):
    """Boolean mask of features whose score strictly exceeds ``self.thresh``.

    ``thresh == 'none'`` keeps every feature; ``thresh == 0`` drops every
    feature (a deliberate shortcut — a numeric comparison against 0 would
    otherwise keep strictly-positive scores).
    """
    check_is_fitted(self, 'scores_')
    if self.thresh == 'none':
        return np.ones(self.scores_.shape, dtype=bool)
    elif self.thresh == 0:
        return np.zeros(self.scores_.shape, dtype=bool)
    else:
        # Map NaN scores to comparable values before thresholding.
        scores = _clean_nans(self.scores_)
        # The comparison already yields a bool ndarray; the previous
        # unused zeros array and sorting comment were dead code.
        return scores > self.thresh
def _get_support_mask(self):
    """Boolean mask of the features selected by ``self.threshold``.

    Assumes ``self.threshold`` is a threshold object exposing
    ``fit_transform`` (boolean mask over the scores) and ``value``
    (fraction of features to keep) — TODO confirm against callers.
    """
    # Cater for NaNs: shortcut the degenerate thresholds.
    # Builtin bool replaces np.bool, removed in NumPy 1.24.
    if self.threshold == 1:
        return np.ones(len(self.scores_), dtype=bool)
    elif self.threshold == 0:
        return np.zeros(len(self.scores_), dtype=bool)
    scores = _clean_nans(self.scores_)
    mask = self.threshold.fit_transform(scores)
    # Admit scores tied at the threshold only within the feature budget.
    ties = np.where(scores == self.threshold.value)[0]
    if len(ties):
        # int() guards against a fractional budget: slicing with a float
        # bound raises TypeError. Identity when the product is already int.
        max_feats = int(len(scores) * self.threshold.value)
        kept_ties = ties[:max_feats - mask.sum()]
        mask[kept_ties] = True
    return mask
def _get_support_mask(self):
    """Boolean mask keeping the ``self.k`` highest-scoring features.

    ``k == 'all'`` keeps everything; ``k == 0`` keeps nothing. A ``k``
    larger than the feature count is clamped (optionally with a warning
    when ``self.validate`` is set).
    """
    check_is_fitted(self, 'scores_')
    if self.k == 'all':
        return np.ones(self.scores_.shape, dtype=bool)
    elif self.k == 0:
        return np.zeros(self.scores_.shape, dtype=bool)
    else:
        scores = _clean_nans(self.scores_)
        mask = np.zeros(scores.shape, dtype=bool)
        k = self.k
        if k > len(mask):
            if self.validate:
                # Lazy %-style args: the message is only formatted if the
                # warning is actually emitted.
                logger.warning(
                    'Configured k (%s) is larger than number of features '
                    '(%s) so none will be eliminated', k, len(mask))
            k = len(mask)
        # Request a stable sort. Mergesort takes more memory (~40MB per
        # megafeature on x86-64).
        mask[np.argsort(scores, kind="mergesort")[-k:]] = True
        return mask
def _get_support_mask(self):
    """Combine chi2-based selection with the vector and log-odds masks,
    then shrink the fitted state in place to the surviving features."""
    k = self.k
    chi2_scores = self.scores_
    chi2_mask = np.ones(chi2_scores.shape, dtype=bool)

    if k != 'all' and k < len(chi2_scores):
        # we don't want all features to be kept, and the number we want is less than the number available
        chi2_scores = _clean_nans(chi2_scores)
        # NOTE(review): this drops the k LOWEST-scoring features, so
        # len - k features survive the chi2 stage. If k is meant to be
        # the number of features to KEEP (as the comment above suggests),
        # the slice should arguably be [:-k] — confirm intended semantics
        # before changing.
        selected_indices = np.argsort(chi2_scores)[:k]
        chi2_mask[selected_indices] = False

    # A feature survives only if it passes all three independent masks.
    mask = chi2_mask & self.vectors_mask & self.log_odds_mask
    logging.info('%d/%d features survived feature selection', np.count_nonzero(mask), len(mask))

    # Only keep the scores of the features that survived. This array is used to check the
    # input data shape at train and decode time matches. However, because the post-feature-selections
    # vocabulary is passed back into the vectorizer, at decode time the input will likely be smaller. This is
    # like doing feature selection in the vectorizer.
    # NOTE: the following assignments mutate the fitted estimator state.
    self.scores_ = self.scores_[mask]
    self.log_odds_mask = self.log_odds_mask[mask]
    self.vectors_mask = self.vectors_mask[mask]
    self.vocabulary_ = update_dict_according_to_mask(self.vocabulary_, mask)
    return mask
def _get_support_mask(self):
    """Keep the top ``k`` groups of ``group_size`` consecutive features,
    ranking every group by the mean score of its members."""
    check_is_fitted(self, 'scores_')
    if self.k == 'all':
        return np.ones(self.scores_.shape, dtype=bool)
    if self.k == 0:
        return np.zeros(self.scores_.shape, dtype=bool)

    scores = _clean_nans(self.scores_)
    size = self.group_size
    n_groups = math.ceil(len(scores) / size)
    # Overwrite each score with its group mean so that all members of a
    # group rank identically when sorted.
    for g in range(n_groups):
        lo, hi = g * size, (g + 1) * size
        scores[lo:hi] = np.mean(scores[lo:hi])

    support = np.zeros(scores.shape, dtype=bool)
    # Request a stable sort. Mergesort takes more memory (~40MB per
    # megafeature on x86-64).
    top = np.argsort(scores, kind="mergesort")[-self.k * size:]
    support[top] = True
    return support