Beispiel #1
0
def plot_correlation_matrix(M, verbose=True):
    """Draw a heat map of the pairwise correlations between the columns of M.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # Approach taken from
    # http://glowingpython.blogspot.com/2012/10/visualizing-correlation-matrices.html
    # TODO work on structured arrays or not
    # TODO ticks are col names
    sa = utils.check_sa(M)
    names = sa.dtype.names  # not used yet; needed once the col-name TODO is done
    nd = cast_np_sa_to_nd(sa)
    n_cols = nd.shape[1]

    # rowvar=0: rows are items, columns are the variables being correlated
    corr = np.corrcoef(nd, rowvar=0)

    fig = plt.figure()
    plt.pcolor(corr)
    plt.colorbar()
    # Ticks sit at i + 0.5 and are labelled with the column index i
    # (presumably the centre of each pcolor cell).
    tick_positions = np.arange(0.5, n_cols + 0.5)
    tick_labels = range(0, n_cols)
    plt.yticks(tick_positions, tick_labels)
    plt.xticks(tick_positions, tick_labels)
    if verbose:
        plt.show()
    return fig
Beispiel #2
0
def plot_correlation_matrix(M, verbose=True):
    """Render the column-by-column correlation matrix of M as a colour grid.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # Approach taken from
    # http://glowingpython.blogspot.com/2012/10/visualizing-correlation-matrices.html
    # TODO work on structured arrays or not
    # TODO ticks are col names
    checked = utils.check_sa(M)
    names = checked.dtype.names  # not used yet; needed once the col-name TODO is done
    values = cast_np_sa_to_nd(checked)
    n_features = values.shape[1]

    # rowvar=0: rows are items, columns are the variables being correlated
    cc = np.corrcoef(values, rowvar=0)

    fig = plt.figure()
    plt.pcolor(cc)
    plt.colorbar()
    # One tick per column at i + 0.5, labelled with the column index i.
    positions = np.arange(0.5, n_features + 0.5)
    labels = range(0, n_features)
    plt.yticks(positions, labels)
    plt.xticks(positions, labels)
    if verbose:
        plt.show()
    return fig
Beispiel #3
0
 def test_sa_to_nd(self):
     # cast_np_sa_to_nd should turn an all-float structured array into the
     # plain 2-D float array holding the same values.
     rows = [(-1.0, 2.0, -1.0), (0.0, -1.0, 2.0)]
     field_names = ['f{}'.format(i) for i in xrange(3)]
     sa_dtype = np.dtype({'names': field_names, 'formats': [float] * 3})
     sa = np.array(rows, dtype=sa_dtype)
     expected = np.array([list(row) for row in rows], dtype=float)
     actual = utils.cast_np_sa_to_nd(sa)
     self.assertTrue(np.array_equal(actual, expected))
Beispiel #4
0
 def __init__(self,
              M,
              labels,
              clfs=[{
                  'clf': RandomForestClassifier
              }],
              subsets=[{
                  'subset': s_i.SubsetNoSubset
              }],
              cvs=[{
                  'cv': KFold
              }],
              trials=None):
     """Validate the inputs and store the experiment configuration.

     Parameters
     ----------
     M : numpy.ndarray, structured array, convertible object, or None
         Feature data. A plain (non-structured) ndarray must be
         2-dimensional and is stored as-is with generated column names
         'f0', 'f1', ... Anything else goes through
         utils.check_consistent and is then cast with
         utils.cast_np_sa_to_nd. If None, no data is stored.
     labels : column-like or None
         Labels for the rows of M; validated against M. Ignored when M
         is None.
     clfs : list of dict
         Each dict needs a 'clf' entry that is a BaseEstimator subclass.
     subsets : list of dict
         Each dict needs a 'subset' entry that is a s_i.BaseSubsetIter
         subclass.
     cvs : list of dict
         Each dict needs a 'cv' entry that is a _PartitionIterator
         subclass.
     trials : object or None
         If not None, clfs, subsets and cvs are stored exactly as passed,
         without validation.

     Raises
     ------
     ValueError
         If M is a plain ndarray that is not 2-dimensional.

     Notes
     -----
     The list defaults are shared between calls; treat them as read-only.
     """
     if M is None:
         # Define every data attribute even when no data is supplied, so
         # that later reads see None instead of raising AttributeError.
         self.M = None
         self.labels = None
         self.col_names = None
     elif utils.is_nd(M) and not utils.is_sa(M):
         # nd_array, short circuit the usual type checking and coercion
         if M.ndim != 2:
             raise ValueError('Expected 2-dimensional array for M')
         self.M = M
         self.col_names = ['f{}'.format(i) for i in xrange(M.shape[1])]
         self.labels = utils.check_col(labels,
                                       n_rows=M.shape[0],
                                       argument_name='labels')
     else:
         # M is either a structured array or something that should
         # be converted
         (M, self.labels) = utils.check_consistent(
             M, labels, col_argument_name='labels')
         self.col_names = M.dtype.names
         self.M = utils.cast_np_sa_to_nd(M)
     if trials is None:
         # Only validate the search specification when it will be used to
         # build trials; with ready-made trials it is kept verbatim.
         clfs = utils.check_arguments(
             clfs, {'clf': lambda clf: issubclass(clf, BaseEstimator)},
             optional_keys_take_lists=True,
             argument_name='clfs')
         subsets = utils.check_arguments(
             subsets,
             {'subset': lambda subset: issubclass(subset,
                                                  s_i.BaseSubsetIter)},
             optional_keys_take_lists=True,
             argument_name='subsets')
         cvs = utils.check_arguments(
             cvs, {'cv': lambda cv: issubclass(cv, _PartitionIterator)},
             optional_keys_take_lists=True,
             argument_name='cvs')
     self.clfs = clfs
     self.subsets = subsets
     self.cvs = cvs
     self.trials = trials
Beispiel #5
0
 def __init__(
         self, 
         M, 
         labels, 
         clfs=[{'clf': RandomForestClassifier}], 
         subsets=[{'subset': s_i.SubsetNoSubset}], 
         cvs=[{'cv': KFold}],
         trials=None):
     """Validate the inputs and store the experiment configuration.

     Parameters
     ----------
     M : numpy.ndarray, structured array, convertible object, or None
         Feature data. A plain (non-structured) ndarray must be
         2-dimensional and is stored as-is with generated column names
         'f0', 'f1', ... Anything else goes through
         utils.check_consistent and is then cast with
         utils.cast_np_sa_to_nd. If None, no data is stored.
     labels : column-like or None
         Labels for the rows of M; validated against M. Ignored when M
         is None.
     clfs : list of dict
         Each dict needs a 'clf' entry that is a BaseEstimator subclass.
     subsets : list of dict
         Each dict needs a 'subset' entry that is a s_i.BaseSubsetIter
         subclass.
     cvs : list of dict
         Each dict needs a 'cv' entry that is a _PartitionIterator
         subclass.
     trials : object or None
         If not None, clfs, subsets and cvs are stored exactly as passed,
         without validation.

     Raises
     ------
     ValueError
         If M is a plain ndarray that is not 2-dimensional.

     Notes
     -----
     The list defaults are shared between calls; treat them as read-only.
     """
     if M is None:
         # Define every data attribute even when no data is supplied, so
         # that later reads see None instead of raising AttributeError.
         self.M = None
         self.labels = None
         self.col_names = None
     elif utils.is_nd(M) and not utils.is_sa(M):
         # nd_array, short circuit the usual type checking and coercion
         if M.ndim != 2:
             raise ValueError('Expected 2-dimensional array for M')
         self.M = M
         self.col_names = ['f{}'.format(i) for i in xrange(M.shape[1])]
         self.labels = utils.check_col(
                 labels,
                 n_rows=M.shape[0],
                 argument_name='labels')
     else:
         # M is either a structured array or something that should
         # be converted
         (M, self.labels) = utils.check_consistent(
                 M,
                 labels,
                 col_argument_name='labels')
         self.col_names = M.dtype.names
         self.M = utils.cast_np_sa_to_nd(M)
     if trials is None:
         # Only validate the search specification when it will be used to
         # build trials; with ready-made trials it is kept verbatim.
         clfs = utils.check_arguments(
                 clfs,
                 {'clf': lambda clf: issubclass(clf, BaseEstimator)},
                 optional_keys_take_lists=True,
                 argument_name='clfs')
         subsets = utils.check_arguments(
                 subsets,
                 {'subset': lambda subset: issubclass(subset, s_i.BaseSubsetIter)},
                 optional_keys_take_lists=True,
                 argument_name='subsets')
         cvs = utils.check_arguments(
                 cvs,
                 {'cv': lambda cv: issubclass(cv, _PartitionIterator)},
                 optional_keys_take_lists=True,
                 argument_name='cvs')
     self.clfs = clfs
     self.subsets = subsets
     self.cvs = cvs
     self.trials = trials
Beispiel #6
0
    def test_get_top_features(self):
        # get_top_features should report the ten highest feature
        # importances of a fitted classifier, highest first, whether the
        # column names come from M or from an explicit col_names list.
        n_features = 15
        n_top = 10
        M, labels = uft.generate_test_matrix(1000, n_features, random_state=0)
        M = utils.cast_np_sa_to_nd(M)
        M_train, M_test, labels_train, labels_test = train_test_split(M, labels)
        clf = RandomForestClassifier(random_state=0)
        clf.fit(M_train, labels_train)

        # Build the expected table straight from the fitted classifier.
        importances = clf.feature_importances_
        feat_names = ['f{}'.format(i) for i in xrange(n_features)]
        best_first = np.argsort(importances)[::-1][:n_top]
        all_scores = utils.convert_to_sa(
                zip(feat_names, importances),
                col_names=('feat_name', 'score'))
        ctrl = all_scores[best_first]

        # Column names inferred from M.
        res = dsp.get_top_features(clf, M, verbose=False)
        self.assertTrue(uft.array_equal(ctrl, res))

        # Column names passed explicitly (fresh list, as in the first call's
        # control, so the two inputs never alias).
        res = dsp.get_top_features(clf, col_names=list(feat_names), verbose=False)
        self.assertTrue(uft.array_equal(ctrl, res))
Beispiel #7
0
 def test_get_top_features(self):
     # get_top_features should report the ten highest feature importances
     # of a fitted classifier, highest first, whether the column names come
     # from M or from an explicit col_names list.
     import numpy as np

     n_features = 15
     M, labels = uft.generate_test_matrix(1000, n_features, random_state=0)
     M = utils.cast_np_sa_to_nd(M)
     M_train, M_test, labels_train, labels_test = train_test_split(
             M, 
             labels)
     clf = RandomForestClassifier(random_state=0)
     clf.fit(M_train, labels_train)

     # Derive the control from the fitted classifier instead of pinning
     # 12-digit importances: hard-coded scores only hold for one particular
     # train/test split and library version, which makes the test brittle.
     ctrl_feat_importances = clf.feature_importances_
     ctrl_col_names = ['f{}'.format(i) for i in xrange(n_features)]
     ctrl_feat_ranks = np.argsort(ctrl_feat_importances)[::-1][:10]
     ctrl = utils.convert_to_sa(
             zip(ctrl_col_names, ctrl_feat_importances),
             col_names=('feat_name', 'score'))[ctrl_feat_ranks]

     res = dsp.get_top_features(clf, M, verbose=False)
     self.assertTrue(uft.array_equal(ctrl, res))
     res = dsp.get_top_features(clf, col_names=['f{}'.format(i) for i in xrange(n_features)], verbose=False)
     self.assertTrue(uft.array_equal(ctrl, res))
    def subset_over(
        self,
        label_col,
        interval_train_window_start,
        interval_train_window_size,
        interval_test_window_start,
        interval_test_window_size,
        interval_inc_value,
        interval_expanding=False,
        row_M_col_name=None,
        row_M_train_window_start=None,
        row_M_train_window_size=None,
        row_M_test_window_start=None,
        row_M_test_window_size=None,
        row_M_inc_value=None,
        row_M_expanding=False,
        clfs=[{"clf": RandomForestClassifier}],
        feature_gen_lambda=None,
    ):
        """
        Fit classifiers over a sequence of moving train/test windows.

        The train and test sets of each step are determined in two ways:

        1. The start time/stop time interval. This is the interval used to
           create features in the M-formatted matrix. Setting the start
           time/stop time of this interval is equivalent to passing values
           to set_interval. Arguments pertaining to this interval have the
           interval* prefix.

        2. The rows of the M matrix to select, based on the value of some
           column in the M matrix. Setting the start and end of this window
           is equivalent to passing values to select_rows_in_M (both bounds
           inclusive). Arguments pertaining to this set of rows have the
           row_M* prefix. Taking subsets over rows of M is optional, and it
           only occurs if row_M_col_name is not None.

        After each step every window is shifted by its increment; stepping
        stops once either test window would end past the largest value
        found in the table.

        Parameters
        ----------
        label_col : str
            The name of the column containing labels
        interval_train_window_start : number or datetime
            start of training interval
        interval_train_window_size : number or datetime
            (Initial) size of training interval
        interval_test_window_start : number or datetime
            start of testing interval
        interval_test_window_size : number or datetime
            size of testing interval
        interval_inc_value : datetime, timedelta, or number
            amount by which the train and test intervals advance per step
        interval_expanding : boolean
            whether or not the training interval is expanding (if True its
            start stays fixed while its end advances)
        row_M_col_name : str or None
            If not None, the name of the feature which will be used to select
            different training and testing sets in addition to the interval

            If None, train and testing sets will use all rows given a
            particular time interval
        row_M_train_window_start : ? or None
            Start of train window for M rows. If None, uses
            interval_train_window_start
        row_M_train_window_size : ? or None
            (Initial) size of train window for M rows. If None, uses
            interval_train_window_size
        row_M_test_window_start : ? or None
            Start of test window for M rows. If None, uses
            interval_test_window_start
        row_M_test_window_size : ? or None
            size of test window for M rows. If None, uses
            interval_test_window_size
        row_M_inc_value : ? or None
            amount by which the train and test windows for M rows advance
            per step. If None, uses interval_inc_value
        row_M_expanding : bool
            whether or not the training window for M rows is expanding
        clfs : list of dict
            classifiers and parameters to run with each train/test set. See
            documentation for diogenes.grid_search.experiment.Experiment.
        feature_gen_lambda : (np.ndarray, str, ?, ?, ?, ?) -> np.ndarray or None
            If not None, function to be applied to generated arrays before
            they are fit to classifiers. Must be a function of signature:

            f(M, test_or_train, interval_start, interval_end, row_M_start,
              row_M_end)

            Where:
            * M is the generated array,
            * test_or_train is 'test' if this is a test set or 'train' if it's
              a train set
            * interval_start and interval_end define the interval
            * row_M_start and row_M_end define the rows of M that are included

        Returns
        -------
        diogenes.grid_search.experiment.Experiment
            Experiment collecting train/test sets that have been run
        """
        # Any row_M_* window argument left as None mirrors its interval_* twin.
        if row_M_train_window_start is None:
            row_M_train_window_start = interval_train_window_start
        if row_M_train_window_size is None:
            row_M_train_window_size = interval_train_window_size
        if row_M_test_window_start is None:
            row_M_test_window_start = interval_test_window_start
        if row_M_test_window_size is None:
            row_M_test_window_size = interval_test_window_size
        if row_M_inc_value is None:
            row_M_inc_value = interval_inc_value

        db = self.__conn
        specs = self.__col_specs
        rg_table = self.__rg_table_name

        # Upper bounds for stepping: the largest stop time in the table and,
        # when rows are being selected, the largest value of that feature.
        max_stop_query = "SELECT MAX({}) FROM {}".format(specs["stop_time"], rg_table)
        interval_end = db.execute(max_stop_query)[0][0]
        if row_M_col_name is None:
            row_M_end = interval_end
        else:
            max_val_query = "SELECT MAX({}) FROM {} WHERE {} = '{}'".format(
                specs["val"], rg_table, specs["feature"], row_M_col_name
            )
            row_M_end = db.execute(max_val_query)[0][0]

        # One (classifier class, parameter set, list of runs) triple per
        # parameter combination; the runs list is filled in as windows move.
        directives = []
        for clf_spec in clfs:
            param_grid = clf_spec.copy()
            clf_cls = param_grid.pop("clf")
            directives.extend(
                (clf_cls, param_set, [])
                for param_set in utils.transpose_dict_of_lists(param_grid)
            )

        itv_train_lo = interval_train_window_start
        itv_train_hi = interval_train_window_start + interval_train_window_size
        itv_test_lo = interval_test_window_start
        itv_test_hi = interval_test_window_start + interval_test_window_size
        row_train_lo = row_M_train_window_start
        row_train_hi = row_M_train_window_start + row_M_train_window_size
        row_test_lo = row_M_test_window_start
        row_test_hi = row_M_test_window_start + row_M_test_window_size

        row_clause = "{col} >= {start} AND {col} <= {stop}"

        while itv_test_hi <= interval_end and row_test_hi <= row_M_end:
            train_emitter = self.set_interval(itv_train_lo, itv_train_hi)
            test_emitter = self.set_interval(itv_test_lo, itv_test_hi)
            if row_M_col_name is not None:
                train_emitter = train_emitter.select_rows_in_M(
                    row_clause.format(col=row_M_col_name, start=row_train_lo, stop=row_train_hi)
                )
                test_emitter = test_emitter.select_rows_in_M(
                    row_clause.format(col=row_M_col_name, start=row_test_lo, stop=row_test_hi)
                )

            # Materialise both sets and split the label column off each.
            train_data = train_emitter.emit_M()
            M_train = utils.remove_cols(train_data, label_col)
            y_train = train_data[label_col]
            test_data = test_emitter.emit_M()
            M_test = utils.remove_cols(test_data, label_col)
            y_test = test_data[label_col]

            if feature_gen_lambda is not None:
                M_train = feature_gen_lambda(
                    M_train, "train", itv_train_lo, itv_train_hi, row_train_lo, row_train_hi
                )
                M_test = feature_gen_lambda(
                    M_test, "test", itv_test_lo, itv_test_hi, row_test_lo, row_test_hi
                )

            # Column names come from the (possibly transformed) train set.
            col_names = M_train.dtype.names
            M_train_nd = utils.cast_np_sa_to_nd(M_train)
            M_test_nd = utils.cast_np_sa_to_nd(M_test)

            for clf_cls, param_set, runs in directives:
                fitted = clf_cls(**param_set)
                fitted.fit(M_train_nd, y_train)
                col_indices = np.arange(len(col_names))
                interval_bounds = {
                    "train_interval_start": itv_train_lo,
                    "train_interval_end": itv_train_hi,
                    "test_interval_start": itv_test_lo,
                    "test_interval_end": itv_test_hi,
                }
                row_bounds = {
                    "train_start": row_train_lo,
                    "train_end": row_train_hi,
                    "test_start": row_test_lo,
                    "test_end": row_test_hi,
                }
                # NOTE(review): arguments are positional; the meaning of the
                # two None slots depends on exp.Run's signature -- confirm.
                run = exp.Run(
                    M_train_nd,
                    y_train,
                    col_names,
                    fitted,
                    None,
                    None,
                    col_names,
                    col_indices,
                    interval_bounds,
                    row_bounds,
                    M_test_nd,
                    y_test,
                )
                runs.append(run)

            # Advance every window; an expanding train window keeps its start.
            if not interval_expanding:
                itv_train_lo += interval_inc_value
            itv_train_hi += interval_inc_value
            itv_test_lo += interval_inc_value
            itv_test_hi += interval_inc_value
            if not row_M_expanding:
                row_train_lo += row_M_inc_value
            row_train_hi += row_M_inc_value
            row_test_lo += row_M_inc_value
            row_test_hi += row_M_inc_value

        trials = []
        for clf_cls, param_set, runs in directives:
            trials.append(
                exp.Trial(None, None, None, clf_cls, param_set, "Array Emitter", {}, "Array Emitter", {}, [runs])
            )
        return exp.Experiment(None, None, clfs, [{"subset": "Array Emitter"}], [{"cv": "Array Emitter"}], trials)