def test_is_sa(self):
    """is_sa must reject a plain ndarray and accept a structured array."""
    # A homogeneous 2-D ndarray -- carries no field names, so not structured.
    plain = np.array([[1, 2, 3], [4, 5, 6]], dtype=int)
    # A structured array with three named float fields (f0, f1, f2).
    struct_dtype = np.dtype({'names': map('f{}'.format, xrange(3)),
                             'formats': [float] * 3})
    structured = np.array([(-1.0, 2.0, -1.0), (0.0, -1.0, 2.0)],
                          dtype=struct_dtype)
    self.assertFalse(utils.is_sa(plain))
    self.assertTrue(utils.is_sa(structured))
def __init__(self, M, labels, clfs=None, subsets=None, cvs=None,
             trials=None):
    """Initialize the experiment container.

    Parameters
    ----------
    M : numpy.ndarray or convertible or None
        Feature matrix. A plain 2-D ndarray is used as-is; a structured
        array (or anything check_consistent can convert) is cast to a
        homogeneous ndarray and its field names become col_names.
    labels : array-like
        Target column; length-checked against M's rows.
    clfs : list of dict or None
        Classifier specs; each dict has a 'clf' key holding a
        sklearn BaseEstimator subclass. Defaults to RandomForestClassifier.
    subsets : list of dict or None
        Subset-iterator specs ('subset' key, s_i.BaseSubsetIter subclass).
        Defaults to s_i.SubsetNoSubset.
    cvs : list of dict or None
        Cross-validation specs ('cv' key, _PartitionIterator subclass).
        Defaults to KFold.
    trials : object or None
        If given, pre-built trials; argument checking of clfs/subsets/cvs
        is skipped.
    """
    # Mutable default arguments are shared across all calls; use None
    # sentinels and build fresh default lists per call instead.
    if clfs is None:
        clfs = [{'clf': RandomForestClassifier}]
    if subsets is None:
        subsets = [{'subset': s_i.SubsetNoSubset}]
    if cvs is None:
        cvs = [{'cv': KFold}]
    if M is not None:
        if utils.is_nd(M) and not utils.is_sa(M):
            # nd_array, short circuit the usual type checking and coercion
            if M.ndim != 2:
                raise ValueError('Expected 2-dimensional array for M')
            self.M = M
            # Synthesize column names f0, f1, ... for the plain ndarray.
            self.col_names = ['f{}'.format(i) for i in xrange(M.shape[1])]
            self.labels = utils.check_col(
                labels,
                n_rows=M.shape[0],
                argument_name='labels')
        else:
            # M is either a structured array or something that should
            # be converted
            (M, self.labels) = utils.check_consistent(
                M, labels, col_argument_name='labels')
            self.col_names = M.dtype.names
            self.M = utils.cast_np_sa_to_nd(M)
    else:
        self.col_names = None
    if trials is None:
        # Validate the spec lists only when trials were not supplied.
        clfs = utils.check_arguments(
            clfs,
            {'clf': lambda clf: issubclass(clf, BaseEstimator)},
            optional_keys_take_lists=True,
            argument_name='clfs')
        subsets = utils.check_arguments(
            subsets,
            {'subset': lambda subset: issubclass(subset,
                                                 s_i.BaseSubsetIter)},
            optional_keys_take_lists=True,
            argument_name='subsets')
        cvs = utils.check_arguments(
            cvs,
            {'cv': lambda cv: issubclass(cv, _PartitionIterator)},
            optional_keys_take_lists=True,
            argument_name='cvs')
    self.clfs = clfs
    self.subsets = subsets
    self.cvs = cvs
    self.trials = trials
def __init__(
        self, M, labels, clfs=None, subsets=None, cvs=None, trials=None):
    """Initialize the experiment container.

    Parameters
    ----------
    M : numpy.ndarray or convertible or None
        Feature matrix. A plain 2-D ndarray is used directly; a
        structured array (or anything check_consistent can convert) is
        cast to a homogeneous ndarray and its field names become
        col_names.
    labels : array-like
        Target column; length-checked against M's rows.
    clfs : list of dict or None
        Classifier specs ('clf' key, sklearn BaseEstimator subclass).
        Defaults to a single RandomForestClassifier spec.
    subsets : list of dict or None
        Subset-iterator specs ('subset' key, s_i.BaseSubsetIter subclass).
        Defaults to s_i.SubsetNoSubset.
    cvs : list of dict or None
        Cross-validation specs ('cv' key, _PartitionIterator subclass).
        Defaults to KFold.
    trials : object or None
        If given, pre-built trials; clfs/subsets/cvs validation is skipped.
    """
    # None sentinels instead of mutable default arguments, which would be
    # shared across every call of this constructor.
    if clfs is None:
        clfs = [{'clf': RandomForestClassifier}]
    if subsets is None:
        subsets = [{'subset': s_i.SubsetNoSubset}]
    if cvs is None:
        cvs = [{'cv': KFold}]
    if M is not None:
        if utils.is_nd(M) and not utils.is_sa(M):
            # nd_array, short circuit the usual type checking and coercion
            if M.ndim != 2:
                raise ValueError('Expected 2-dimensional array for M')
            self.M = M
            # Plain ndarrays have no field names; synthesize f0, f1, ...
            self.col_names = ['f{}'.format(i) for i in xrange(M.shape[1])]
            self.labels = utils.check_col(
                labels,
                n_rows=M.shape[0],
                argument_name='labels')
        else:
            # M is either a structured array or something that should
            # be converted
            (M, self.labels) = utils.check_consistent(
                M, labels, col_argument_name='labels')
            self.col_names = M.dtype.names
            self.M = utils.cast_np_sa_to_nd(M)
    else:
        self.col_names = None
    if trials is None:
        # Only validate the spec lists when trials were not supplied.
        clfs = utils.check_arguments(
            clfs,
            {'clf': lambda clf: issubclass(clf, BaseEstimator)},
            optional_keys_take_lists=True,
            argument_name='clfs')
        subsets = utils.check_arguments(
            subsets,
            {'subset': lambda subset: issubclass(subset,
                                                 s_i.BaseSubsetIter)},
            optional_keys_take_lists=True,
            argument_name='subsets')
        cvs = utils.check_arguments(
            cvs,
            {'cv': lambda cv: issubclass(cv, _PartitionIterator)},
            optional_keys_take_lists=True,
            argument_name='cvs')
    self.clfs = clfs
    self.subsets = subsets
    self.cvs = cvs
    self.trials = trials
def test_check_sa(self):
    """check_sa accepts convertible inputs and rejects bad shapes/types."""
    # Inputs check_sa should accept: structured array, homogeneous
    # ndarray, nested list, and DataFrame.
    struct = np.array([(1, 'a'), (2, 'b'), (3, 'c')],
                      dtype=[('int', int), ('s', 'S0')])
    homogeneous = np.array([[1, 2, 3], [4, 5, 6]])
    nested = [[1, 2, 3], [4, 5, 6]]
    frame = pd.DataFrame(struct)
    for convertible in (struct, homogeneous, nested, frame):
        # Every accepted input must come back as a structured array.
        self.assertTrue(utils.is_sa(utils.check_sa(convertible)))
    # Non-convertible inputs are rejected outright.
    self.assertRaises(ValueError, utils.check_sa, None)
    self.assertRaises(ValueError, utils.check_sa, "lalala")
    # Shape constraints: matching dims pass, mismatched dims raise.
    utils.check_sa(struct, n_rows=3, n_cols=2)
    self.assertRaises(ValueError, utils.check_sa, struct, n_rows=4)
    self.assertRaises(ValueError, utils.check_sa, struct, n_cols=3)
def get_top_features(clf, M=None, col_names=None, n=10, verbose=True):
    """Gets the top features for a fitted clf

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        Fitted classifier with a feature_importances_ attribute
    M : numpy.ndarray or None
        Structured array corresponding to fitted clf. Used here to
        determine column names when col_names is not given
    col_names : list of str or None
        List of column names corresponding to fitted clf.
    n : int
        Number of features to return
    verbose : boolean
        iff True, prints ranked features

    Returns
    -------
    numpy.ndarray
        structured array with top feature names and scores
    """
    if not isinstance(clf, BaseEstimator):
        # Fixed: the module is lowercase sklearn.base, not sklearn.Base.
        raise ValueError(
            'clf must be an instance of sklearn.base.BaseEstimator')
    scores = clf.feature_importances_
    if col_names is None:
        if is_sa(M):
            # Structured array supplies its own field names.
            col_names = M.dtype.names
        else:
            # No names available; synthesize f0, f1, ...
            col_names = ['f{}'.format(i) for i in xrange(len(scores))]
    else:
        col_names = utils.check_col_names(col_names,
                                          n_cols=scores.shape[0])
    # Rank (name, score) pairs from highest to lowest importance.
    ranked_name_and_score = [(col_names[i], scores[i])
                             for i in scores.argsort()[::-1]]
    ranked_name_and_score = convert_to_sa(
        ranked_name_and_score[:n], col_names=('feat_name', 'score'))
    if verbose:
        pprint_sa(ranked_name_and_score)
    return ranked_name_and_score
def get_top_features(clf, M=None, col_names=None, n=10, verbose=True):
    """Gets the top features for a fitted clf

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        Fitted classifier with a feature_importances_ attribute
    M : numpy.ndarray or None
        Structured array corresponding to fitted clf. Used here to
        determine column names when col_names is not given
    col_names : list of str or None
        List of column names corresponding to fitted clf.
    n : int
        Number of features to return
    verbose : boolean
        iff True, prints ranked features

    Returns
    -------
    numpy.ndarray
        structured array with top feature names and scores
    """
    if not isinstance(clf, BaseEstimator):
        # Fixed error message: the module path is lowercase sklearn.base.
        raise ValueError(
            'clf must be an instance of sklearn.base.BaseEstimator')
    scores = clf.feature_importances_
    if col_names is None:
        if is_sa(M):
            # A structured array carries its field names.
            col_names = M.dtype.names
        else:
            # Otherwise synthesize generic names f0, f1, ...
            col_names = ['f{}'.format(i) for i in xrange(len(scores))]
    else:
        col_names = utils.check_col_names(col_names, n_cols=scores.shape[0])
    # Pair each feature name with its score, best first.
    ranked_name_and_score = [(col_names[x], scores[x]) for x in
                             scores.argsort()[::-1]]
    ranked_name_and_score = convert_to_sa(ranked_name_and_score[:n],
                                          col_names=('feat_name', 'score'))
    if verbose:
        pprint_sa(ranked_name_and_score)
    return ranked_name_and_score