예제 #1
0
    def test_kfold(self):
        """Test k-fold splitting and round-tripping the splits to disk.

        Checks that `k_fold` returns the requested number of feature and
        target subsets with the expected subset shapes, both with the default
        subset size and with an explicit `fix_size`. The resulting split is
        then written and read back in every supported file format, verifying
        that the number of subsets survives the round trip.
        """
        features, targets, _, _ = get_data()

        # Default subset size.
        f, t = k_fold(features, nsplit=5, targets=targets)
        self.assertEqual(len(f), 5)
        self.assertEqual(len(t), 5)
        for s in f:
            self.assertEqual(np.shape(s), (9, 100))

        # Explicit subset size.
        f, t = k_fold(features, nsplit=4, targets=targets, fix_size=5)
        self.assertEqual(len(f), 4)
        self.assertEqual(len(t), 4)
        for s in f:
            self.assertEqual(np.shape(s), (5, 100))

        # Read each file back with the same format it was written in;
        # otherwise the JSON reader is never exercised and a stale file from
        # the other format is silently re-read instead.
        for fformat in ('pickle', 'json'):
            write_split(
                features=f, targets=t, fname='cvsave', fformat=fformat)
            f1, t1 = read_split(fname='cvsave', fformat=fformat)
            self.assertEqual(len(f1), len(f))
            self.assertEqual(len(t1), len(t))
예제 #2
0
    def _load_data(self, features, targets, nsplit):
        """Resume a saved search state or set up a fresh k-fold split.

        When `self.save_file` is set and the file exists, the search state
        (output, survivors, feature count and the already split data) is read
        from that JSON file. Otherwise the supplied data is split into folds
        with `k_fold` and an empty state is returned.

        Parameters
        ----------
        features : array
            The feature set for the training data.
        targets : array
            The targets for the training data.
        nsplit : int
            The number of k-folds for the CV.

        Returns
        -------
        features : list
            List of k-fold feature arrays.
        targets : list
            List of k-fold target arrays.
        output : list
            The current list of output data.
        survivors : list
            The current list of surviving features.
        total_features : int
            The current number of surviving features.
        """
        # State for a brand new search: nothing eliminated yet.
        total_features = np.shape(features)[1]
        survivors = list(range(total_features))
        output = []

        resumed = False
        if self.save_file is not None:
            try:
                with open(self.save_file) as handle:
                    saved = json.load(handle)
            except FileNotFoundError:
                print('Starting new greedy search.')
            else:
                # Restore everything from the saved state, including the
                # folds, which are stored as nested lists.
                output = saved['output']
                survivors = saved['survivors']
                total_features = saved['total_features']
                features = [np.array(fold) for fold in saved['features']]
                targets = [np.array(fold) for fold in saved['targets']]
                print('Resuming greedy search with {} features.'.format(
                    total_features))
                resumed = True

        if not resumed:
            # Nothing was loaded, so make some k-fold splits.
            features, targets = k_fold(
                features, targets=targets, nsplit=nsplit)

        return features, targets, output, survivors, total_features
예제 #3
0
    def importance_elimination(self,
                               train_predict,
                               test_predict,
                               features,
                               targets,
                               nsplit=2,
                               step=1):
        """Importance feature elimination.

        Function to iterate through feature set, eliminating the `step` least
        important features in each pass. This is the backwards elimination
        algorithm. The data is split into `nsplit` folds once, up front, and
        every pass re-trains on each fold combination using only the features
        that have survived so far.

        As a side effect, `self.result` is reset on every pass and
        `self.index` is set to the current fold number.

        Parameters
        ----------
        train_predict : object
            A function that will train a model. The function should accept
            the parameters:

                train_features : array
                train_targets : list

            predict should return a function that can be passed to
            test_predict.
        test_predict : object
            A function handed on, together with the trained model and the
            train/test data, to the serial or parallel iterator helper.
            Presumably it scores the trained model on the test data; confirm
            against the iterator helpers.
        features : array
            An n, d array of features.
        targets : list
            A list of the target values.
        nsplit : int
            Number of folds in k-fold cross-validation.
        step : int
            Number of features eliminated in each pass.

        Returns
        -------
        output : list
            One entry per eliminated feature, in the order of elimination.

            First element is the index of the eliminated feature.

            Second element is the corresponding cost function value, averaged
            over the k fold split.

            Following elements are any additional values collected from the
            iterator helpers, averaged over the k fold split.
        """
        # Make some k-fold splits.
        features, targets = k_fold(features, targets=targets, nsplit=nsplit)
        # The number of feature columns is read from the first fold.
        _, total_features = np.shape(features[0])

        output = []
        survivors = list(range(total_features))

        # (total_features - 1) // step elimination passes are performed.
        if self.verbose:
            # The tqdm package is used for tracking progress.
            iterator1 = trange((total_features - 1) // step,
                               desc='features eliminated ',
                               leave=False)
        else:
            iterator1 = range((total_features - 1) // step)

        for fnum in iterator1:
            # One row per fold, one column per surviving feature. This is
            # presumably filled in by the iterator helpers - TODO confirm.
            self.result = np.zeros((nsplit, total_features))
            meta = []

            if self.verbose:
                iterator2 = trange(nsplit,
                                   desc='k-folds             ',
                                   leave=False)
            else:
                iterator2 = range(nsplit)

            # NOTE(review): the loop variable is stored on the instance,
            # presumably so the iterator helpers can tell which fold is being
            # processed - confirm before changing.
            for self.index in iterator2:
                # Sort out training and testing data. The folds are copied so
                # that popping the test fold leaves the original lists intact.
                train_features = copy.deepcopy(features)
                train_targets = copy.deepcopy(targets)
                test_features = train_features.pop(self.index)[:, survivors]
                test_targets = train_targets.pop(self.index)

                # The remaining folds are merged into one training set.
                train_features = np.concatenate(train_features,
                                                axis=0)[:, survivors]
                train_targets = np.concatenate(train_targets, axis=0)

                pred = train_predict(train_features, train_targets)

                _, d = np.shape(train_features)
                meta_k = []

                # Iterate through features and find error for removing it.
                if self.nprocs != 1:
                    meta_k = self._parallel_iterator(d, train_features,
                                                     test_features,
                                                     train_targets,
                                                     test_targets, pred,
                                                     test_predict, meta_k)

                else:
                    meta_k = self._serial_iterator(d, train_features,
                                                   test_features,
                                                   train_targets, test_targets,
                                                   pred, test_predict, meta_k)

                # Only keep additional values when the helper returned some.
                if len(meta_k) > 0:
                    meta.append(meta_k)

            # Scores averaged over the k folds.
            scores = np.mean(self.result, axis=0)
            # Sort features according to score (ascending order).
            s = np.argsort(scores)
            for g in range(step):
                # Feature index and score of the g-th lowest scoring feature.
                eliminated = [
                    np.array(survivors)[s][g],
                    np.array(scores)[s][g]
                ]
                if len(meta) > 0:
                    # Additional values averaged over the k folds.
                    mean_meta = np.mean(meta, axis=0)
                    output.append(
                        np.concatenate([eliminated, mean_meta[g]], axis=0))
                else:
                    output.append(eliminated)
            # Delete features that, while missing gave the smallest error.
            survivors = [
                x for i, x in enumerate(survivors) if i not in s[:step]
            ]
            total_features -= step

        return output
예제 #4
0
    def __init__(self, population_size, fit_func, features, targets,
                 population=None, operators=None, fitness_parameters=1,
                 nsplit=2, accuracy=None):
        """Set up the genetic algorithm and split the data into k-folds.

        Parameters
        ----------
        population_size : int
            Population size, same as generation size.
        fit_func : object
            User defined function to calculate fitness.
        features : array
            The feature space upon which to optimize.
        targets : array
            The targets corresponding to the feature data.
        population : list
            The current population. Default is None, will generate a random
            initial population.
        operators : list
            A list of operation functions. These are used for mating and
            mutation operations. Default is None, in which case the built-in
            set of operators is used.
        fitness_parameters : int
            The number of variables to optimize. Default is a single variable.
        nsplit : int
            Number of data splits for k-fold cv.
        accuracy : int
            Number of decimal places to include when finding unique candidates
            for duplication removal. If None, duplication removal is not
            performed.

        Raises
        ------
        RuntimeError
            If `accuracy` is given for a search over more than one fitness
            parameter.
        """
        # Plain settings.
        self.step = -1
        self.population_size = population_size
        self.fit_func = fit_func
        self.dimension = features.shape[1]
        self.nsplit = nsplit
        self.accuracy = accuracy

        # Fall back to a generated starting population.
        if population is None:
            population = initialize_population(
                population_size, self.dimension)
        self.population = population

        # Fall back to the default set of operators.
        if operators is None:
            operators = [cut_and_splice, random_permutation,
                         probability_remove, probability_include]
        self.operators = operators

        # More than one fitness parameter means a multivariable search.
        self.fitness_parameters = fitness_parameters
        self.pareto = bool(fitness_parameters > 1)
        if self.pareto and accuracy is not None:
            raise RuntimeError(
                'Should not set an accuracy parameter for multivariable '
                'searches.')

        # Make some k-fold splits.
        self.features, self.targets = k_fold(
            features, targets=targets, nsplit=nsplit)
예제 #5
0
# Collect the raw score stored on each candidate as the target values.
targets = [
    atoms.info['key_value_pairs']['raw_score'] for atoms in all_cand
]
print('Generated {} target vector'.format(np.shape(targets)))

# It is important to note that the `all_cand` variable is simply a list of atoms objects. There are no constraints on how this should be set up; the above example is just a succinct method for generating the list.
#
# ## Subset Generation <a name="subset-generation"></a>
# [(Back to top)](#head)
#
# Once the data has been generated, it is necessary to split the training features and training targets into k-folds. This can be achieved with the `k_fold` function in CatLearn. Here it is possible to provide feature data, target data and the number of folds.

# In[3]:

# Split the features and targets into five folds.
fsplit, tsplit = k_fold(features=features, targets=targets, nsplit=5)

# Report the shape of every feature subset.
print('k_fold has generated {} subsets of features.'.format(len(fsplit)))
for index, subset in enumerate(fsplit):
    print('    subset {0} has shape {1}'.format(index, np.shape(subset)))

# Report the shape of every target subset.
print('\nk_fold has generated {} subsets of targets.'.format(len(tsplit)))
for index, subset in enumerate(tsplit):
    print('    subset {0} has shape {1}'.format(index, np.shape(subset)))

# If we are interested in saving this data, it is possible to write a JSON or pickle file. This is achieved using the following functions to write and read the data.

# In[4]: