Beispiel #1
0
 def test_is_sa(self):
     """Check that ``utils.is_sa`` tells structured arrays from plain ones.

     A plain 2-D integer array must be rejected and an array built with a
     named-field (structured) dtype must be accepted.
     """
     nd = np.array([[1, 2, 3], [4, 5, 6]], dtype=int)
     # Build the field names eagerly with ``range``: ``xrange`` does not
     # exist on Python 3 and ``map`` returns a lazy iterator there, whereas
     # this comprehension yields the same list on Python 2 and Python 3.
     field_names = ['f{}'.format(i) for i in range(3)]
     dtype = np.dtype({'names': field_names,
                       'formats': [float] * 3})
     sa = np.array([(-1.0, 2.0, -1.0), (0.0, -1.0, 2.0)], dtype=dtype)
     self.assertFalse(utils.is_sa(nd))
     self.assertTrue(utils.is_sa(sa))
Beispiel #2
0
 def __init__(self,
              M,
              labels,
              clfs=[{
                  'clf': RandomForestClassifier
              }],
              subsets=[{
                  'subset': s_i.SubsetNoSubset
              }],
              cvs=[{
                  'cv': KFold
              }],
              trials=None):
     """Store the data, labels and trial configuration on the instance.

     Parameters
     ----------
     M : object or None
         Feature data. If ``utils.is_nd(M)`` holds and ``utils.is_sa(M)``
         does not, ``M`` must be 2-dimensional and is stored unchanged.
         Any other non-None value is passed through
         ``utils.check_consistent`` and then ``utils.cast_np_sa_to_nd``.
         If None, only ``col_names`` (set to None) is initialised from
         this argument.
     labels : object
         Labels for the rows of ``M``; validated together with ``M``.
         Not used when ``M`` is None.
     clfs : list of dict
         Classifier configurations; each ``'clf'`` must be a subclass of
         ``BaseEstimator``. Only validated when ``trials`` is None.
     subsets : list of dict
         Subset configurations; each ``'subset'`` must be a subclass of
         ``s_i.BaseSubsetIter``. Only validated when ``trials`` is None.
     cvs : list of dict
         Cross-validation configurations; each ``'cv'`` must be a subclass
         of ``_PartitionIterator``. Only validated when ``trials`` is None.
     trials : object or None
         Stored as given. When not None, ``clfs``, ``subsets`` and ``cvs``
         are stored without validation.

     Raises
     ------
     ValueError
         If ``M`` takes the plain-ndarray path and is not 2-dimensional.
     """
     # NOTE(review): the list defaults above are created once and shared
     # between calls. This method does not mutate them itself; confirm that
     # utils.check_arguments does not either.
     if M is not None:
         if utils.is_nd(M) and not utils.is_sa(M):
             # nd_array, short circuit the usual type checking and coercion
             if M.ndim != 2:
                 raise ValueError('Expected 2-dimensional array for M')
             self.M = M
             # ``range`` instead of the Python-2-only ``xrange`` so the
             # generated names 'f0', 'f1', ... also work on Python 3.
             self.col_names = ['f{}'.format(i) for i in range(M.shape[1])]
             self.labels = utils.check_col(labels,
                                           n_rows=M.shape[0],
                                           argument_name='labels')
         else:
             # M is either a structured array or something that should
             # be converted
             (M, self.labels) = utils.check_consistent(
                 M, labels, col_argument_name='labels')
             # Capture the field names before the cast below.
             self.col_names = M.dtype.names
             self.M = utils.cast_np_sa_to_nd(M)
     else:
         # NOTE(review): self.M and self.labels are left unset on this
         # path; verify that callers passing M=None never read them.
         self.col_names = None
     if trials is None:
         # Only validate the configuration when trials must be derived
         # from it; pre-built trials are taken as-is.
         clfs = utils.check_arguments(
             clfs, {'clf': lambda clf: issubclass(clf, BaseEstimator)},
             optional_keys_take_lists=True,
             argument_name='clfs')
         subsets = utils.check_arguments(
             subsets,
             {'subset':
              lambda subset: issubclass(subset, s_i.BaseSubsetIter)},
             optional_keys_take_lists=True,
             argument_name='subsets')
         cvs = utils.check_arguments(
             cvs, {'cv': lambda cv: issubclass(cv, _PartitionIterator)},
             optional_keys_take_lists=True,
             argument_name='cvs')
     self.clfs = clfs
     self.subsets = subsets
     self.cvs = cvs
     self.trials = trials
Beispiel #3
0
 def __init__(
         self,
         M,
         labels,
         clfs=[{'clf': RandomForestClassifier}],
         subsets=[{'subset': s_i.SubsetNoSubset}],
         cvs=[{'cv': KFold}],
         trials=None):
     """Initialise the instance from data, labels and configuration.

     Parameters
     ----------
     M : object or None
         Feature data. A value for which ``utils.is_nd`` is true and
         ``utils.is_sa`` is false must have ``ndim == 2`` and is kept
         as-is. Other non-None values go through
         ``utils.check_consistent`` followed by
         ``utils.cast_np_sa_to_nd``. With None, ``col_names`` is set to
         None and no data attributes are assigned.
     labels : object
         Labels accompanying ``M``; checked alongside it. Ignored when
         ``M`` is None.
     clfs : list of dict
         Entries whose ``'clf'`` is a ``BaseEstimator`` subclass.
     subsets : list of dict
         Entries whose ``'subset'`` is an ``s_i.BaseSubsetIter`` subclass.
     cvs : list of dict
         Entries whose ``'cv'`` is a ``_PartitionIterator`` subclass.
     trials : object or None
         Kept verbatim. ``clfs``, ``subsets`` and ``cvs`` are validated
         with ``utils.check_arguments`` only when this is None.

     Raises
     ------
     ValueError
         If a plain ndarray ``M`` is not 2-dimensional.
     """
     # NOTE(review): the default lists are shared between calls; nothing
     # in this method mutates them, but check utils.check_arguments.
     if M is not None:
         if utils.is_nd(M) and not utils.is_sa(M):
             # nd_array, short circuit the usual type checking and coercion
             if M.ndim != 2:
                 raise ValueError('Expected 2-dimensional array for M')
             self.M = M
             # Use ``range`` rather than ``xrange`` (removed in Python 3);
             # the resulting list of names is identical on Python 2.
             self.col_names = ['f{}'.format(i) for i in range(M.shape[1])]
             self.labels = utils.check_col(
                     labels,
                     n_rows=M.shape[0],
                     argument_name='labels')
         else:
             # M is either a structured array or something that should
             # be converted
             (M, self.labels) = utils.check_consistent(
                     M,
                     labels,
                     col_argument_name='labels')
             # Read the field names before casting M below.
             self.col_names = M.dtype.names
             self.M = utils.cast_np_sa_to_nd(M)
     else:
         # NOTE(review): self.M / self.labels stay undefined here; confirm
         # no later code reads them when M is None.
         self.col_names = None
     if trials is None:
         # Configuration is only checked when no ready-made trials exist.
         clfs = utils.check_arguments(
                 clfs,
                 {'clf': lambda clf: issubclass(clf, BaseEstimator)},
                 optional_keys_take_lists=True,
                 argument_name='clfs')
         subsets = utils.check_arguments(
                 subsets,
                 {'subset': lambda subset: issubclass(subset, s_i.BaseSubsetIter)},
                 optional_keys_take_lists=True,
                 argument_name='subsets')
         cvs = utils.check_arguments(
                 cvs,
                 {'cv': lambda cv: issubclass(cv, _PartitionIterator)},
                 optional_keys_take_lists=True,
                 argument_name='cvs')
     self.clfs = clfs
     self.subsets = subsets
     self.cvs = cvs
     self.trials = trials
Beispiel #4
0
    def test_check_sa(self):
        """Exercise ``utils.check_sa`` on accepted, rejected and sized input.

        Accepted inputs must come back as something ``utils.is_sa``
        recognises; unusable inputs and row/column count mismatches must
        raise ``ValueError``.
        """
        structured = np.array(
            [(1, 'a'), (2, 'b'), (3, 'c')],
            dtype=[('int', int), ('s', 'S0')])
        plain_array = np.array([[1, 2, 3], [4, 5, 6]])
        nested_list = [[1, 2, 3], [4, 5, 6]]
        frame = pd.DataFrame(structured)

        # Every supported input kind should convert to a structured array.
        for candidate in (structured, plain_array, nested_list, frame):
            converted = utils.check_sa(candidate)
            self.assertTrue(utils.is_sa(converted))

        # Inputs that cannot be interpreted as tabular data are rejected.
        for rejected in (None, "lalala"):
            self.assertRaises(ValueError, utils.check_sa, rejected)

        # Matching dimensions pass silently; mismatching ones raise.
        utils.check_sa(structured, n_rows=3, n_cols=2)
        for wrong_size in ({'n_rows': 4}, {'n_cols': 3}):
            self.assertRaises(
                ValueError, utils.check_sa, structured, **wrong_size)
Beispiel #5
0
def get_top_features(clf, M=None, col_names=None, n=10, verbose=True):
    """Gets the top features for a fitted clf

    Features are ranked by ``clf.feature_importances_`` in descending
    order and the first ``n`` are returned.

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        Fitted classifier with a feature_importances_ attribute
    M : numpy.ndarray or None
        Structured array corresponding to fitted clf. Used here to determine
        column names. Only consulted when col_names is None.
    col_names : list of str or None
        List of column names corresponding to fitted clf. Takes precedence
        over M. If both col_names and the field names of M are unavailable,
        names 'f0', 'f1', ... are generated, one per score.
    n : int
        Number of features to return
    verbose : boolean
        iff True, prints ranked features

    Returns
    -------
    numpy.ndarray
        structured array with top feature names and scores

    Raises
    ------
    ValueError
        If clf is not an instance of BaseEstimator.

    """
    if not isinstance(clf, BaseEstimator):
        raise ValueError('clf must be an instance of sklearn.Base.BaseEstimator')

    scores = clf.feature_importances_
    if col_names is None:
        if is_sa(M):
            col_names = M.dtype.names
        else:
            # ``range`` rather than the Python-2-only ``xrange`` so the
            # fallback names can also be generated on Python 3.
            col_names = ['f{}'.format(i) for i in range(len(scores))]
    else:
        col_names = utils.check_col_names(col_names, n_cols=scores.shape[0])
    # argsort is ascending, so reverse it to rank the largest score first.
    ranked_name_and_score = [(col_names[x], scores[x]) for x in
                             scores.argsort()[::-1]]
    ranked_name_and_score = convert_to_sa(
            ranked_name_and_score[:n],
            col_names=('feat_name', 'score'))
    if verbose:
        pprint_sa(ranked_name_and_score)
    return ranked_name_and_score
Beispiel #6
0
def get_top_features(clf, M=None, col_names=None, n=10, verbose=True):
    """Gets the top features for a fitted clf

    The scores in ``clf.feature_importances_`` are sorted from highest to
    lowest and the leading ``n`` (name, score) pairs are returned.

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        Fitted classifier with a feature_importances_ attribute
    M : numpy.ndarray or None
        Structured array corresponding to fitted clf. Used here to determine
        column names, and only when col_names is None.
    col_names : list of str or None
        List of column names corresponding to fitted clf. When given, it is
        used instead of M. When neither source yields names, 'f0', 'f1', ...
        are generated, one per score.
    n : int
        Number of features to return
    verbose : boolean
        iff True, prints ranked features

    Returns
    -------
    numpy.ndarray
        structured array with top feature names and scores

    Raises
    ------
    ValueError
        If clf is not an instance of BaseEstimator.

    """
    if not isinstance(clf, BaseEstimator):
        raise ValueError(
            'clf must be an instance of sklearn.Base.BaseEstimator')

    scores = clf.feature_importances_
    if col_names is None:
        if is_sa(M):
            col_names = M.dtype.names
        else:
            # Python 3 has no ``xrange``; ``range`` produces the same
            # generated names on both Python 2 and Python 3.
            col_names = ['f{}'.format(i) for i in range(len(scores))]
    else:
        col_names = utils.check_col_names(col_names, n_cols=scores.shape[0])
    # Reverse the ascending argsort so the highest score comes first.
    ranked_name_and_score = [(col_names[x], scores[x])
                             for x in scores.argsort()[::-1]]
    ranked_name_and_score = convert_to_sa(ranked_name_and_score[:n],
                                          col_names=('feat_name', 'score'))
    if verbose:
        pprint_sa(ranked_name_and_score)
    return ranked_name_and_score