def test_kfold(self):
    """Test k-fold cross-validation splitting plus save/load round-trips.

    Checks that ``k_fold`` produces the requested number of splits with
    the expected shapes, and that ``write_split``/``read_split`` round-trip
    the split data in both pickle and JSON formats.
    """
    features, targets, _, _ = get_data()

    # Plain 5-fold split: each fold should hold 9 of the 45 rows.
    f, t = k_fold(features, nsplit=5, targets=targets)
    self.assertTrue(len(f) == 5 and len(t) == 5)
    for s in f:
        self.assertEqual(np.shape(s), (9, 100))

    # Fixed-size folds: each of the 4 folds is truncated to 5 rows.
    f, t = k_fold(features, nsplit=4, targets=targets, fix_size=5)
    self.assertTrue(len(f) == 4 and len(t) == 4)
    for s in f:
        self.assertEqual(np.shape(s), (5, 100))

    # Round-trip through the pickle backend.
    write_split(features=f, targets=t, fname='cvsave', fformat='pickle')
    f1, t1 = read_split(fname='cvsave', fformat='pickle')
    self.assertEqual(len(f1), len(f))
    self.assertEqual(len(t1), len(t))

    # Round-trip through the JSON backend.
    write_split(features=f, targets=t, fname='cvsave', fformat='json')
    # Bug fix: previously read back with fformat='pickle', so the JSON
    # reader was never exercised.
    f1, t1 = read_split(fname='cvsave', fformat='json')
    self.assertEqual(len(f1), len(f))
    self.assertEqual(len(t1), len(t))
def _load_data(self, features, targets, nsplit):
    """Resume a saved greedy search or initialize a fresh k-fold split.

    If ``self.save_file`` names an existing JSON state file, the previous
    search state (splits, results and surviving features) is restored from
    it. Otherwise a new k-fold split of the provided data is generated.

    Parameters
    ----------
    features : array
        The feature set for the training data.
    targets : array
        The targets for the training data.
    nsplit : int
        The number of k-folds for the CV.

    Returns
    -------
    features : list
        List of k-fold feature arrays.
    targets : list
        List of k-fold target arrays.
    output : list
        The current list of output data.
    survivors : list
        The current list of surviving features.
    total_features : int
        The current number of surviving features.
    """
    # Defaults for a fresh search.
    total_features = np.shape(features)[1]
    output = []
    survivors = list(range(total_features))

    resumed = False
    if self.save_file is not None:
        try:
            with open(self.save_file) as save_data:
                data = json.load(save_data)
        except FileNotFoundError:
            # No saved state on disk; fall through to a new split below.
            print('Starting new greedy search.')
        else:
            # Restore the saved state, rebuilding numpy arrays from the
            # JSON-serialized nested lists.
            output = data['output']
            survivors = data['survivors']
            total_features = data['total_features']
            features = [np.array(f) for f in data['features']]
            targets = [np.array(t) for t in data['targets']]
            print('Resuming greedy search with {} features.'.format(
                total_features))
            resumed = True

    if not resumed:
        # Make some k-fold splits.
        features, targets = k_fold(
            features, targets=targets, nsplit=nsplit)

    return features, targets, output, survivors, total_features
def importance_elimination(self, train_predict, test_predict, features,
                           targets, nsplit=2, step=1):
    """Importance feature elimination.

    Function to iterate through feature set, eliminating least important
    feature in each pass. This is the backwards elimination algorithm.

    Parameters
    ----------
    train_predict : object
        A function that will train a model. The function should accept
        the parameters:

            train_features : array
            train_targets : list

        predict should return a function that can be passed to
        test_predict.
    test_predict : object
        A function to evaluate the trained model; passed through to the
        serial/parallel iterator helpers.
    features : array
        An n, d array of features.
    targets : list
        A list of the target values.
    nsplit : int
        Number of folds in k-fold cross-validation.
    step : int
        Number of features eliminated per pass. Default is 1.

    Returns
    -------
    output : array
        First column is the index of features in the order they were
        eliminated.

        Second column are corresponding cost function values, averaged over
        the k fold split.

        Following columns are any additional values returned by predict,
        averaged over the k fold split.
    """
    # Make some k-fold splits.
    features, targets = k_fold(features, targets=targets, nsplit=nsplit)
    _, total_features = np.shape(features[0])

    output = []
    survivors = list(range(total_features))

    if self.verbose:
        # The tqdm package is used for tracking progress.
        iterator1 = trange(
            (total_features - 1) // step, desc='features eliminated ',
            leave=False)
    else:
        iterator1 = range((total_features - 1) // step)

    # Outer loop: one elimination pass per iteration, removing `step`
    # features each time until (almost) none are left.
    for fnum in iterator1:
        # Per-fold scores are accumulated into self.result by the
        # serial/parallel iterator helpers (indexed by self.index).
        self.result = np.zeros((nsplit, total_features))
        meta = []

        if self.verbose:
            iterator2 = trange(nsplit, desc='k-folds ', leave=False)
        else:
            iterator2 = range(nsplit)

        # NOTE: the loop variable is deliberately an instance attribute so
        # that the iterator helpers can see which fold is being processed.
        for self.index in iterator2:
            # Sort out training and testing data.
            train_features = copy.deepcopy(features)
            train_targets = copy.deepcopy(targets)
            # The current fold becomes the test set; remaining folds are
            # concatenated into the training set. Only surviving feature
            # columns are kept.
            test_features = train_features.pop(self.index)[:, survivors]
            test_targets = train_targets.pop(self.index)
            train_features = np.concatenate(
                train_features, axis=0)[:, survivors]
            train_targets = np.concatenate(train_targets, axis=0)

            pred = train_predict(train_features, train_targets)

            _, d = np.shape(train_features)
            meta_k = []

            # Iterate through features and find error for removing it.
            if self.nprocs != 1:
                meta_k = self._parallel_iterator(
                    d, train_features, test_features, train_targets,
                    test_targets, pred, test_predict, meta_k)
            else:
                meta_k = self._serial_iterator(
                    d, train_features, test_features, train_targets,
                    test_targets, pred, test_predict, meta_k)

            if len(meta_k) > 0:
                meta.append(meta_k)

        # Scores summed over k.
        scores = np.mean(self.result, axis=0)
        # Sort features according to score.
        s = np.argsort(scores)
        for g in range(step):
            # Record the eliminated feature's original index and score.
            eliminated = [
                np.array(survivors)[s][g],
                np.array(scores)[s][g]
            ]
            if len(meta) > 0:
                mean_meta = np.mean(meta, axis=0)
                output.append(
                    np.concatenate([eliminated, mean_meta[g]], axis=0))
            else:
                output.append(eliminated)

        # Delete features that, while missing gave the smallest error.
        survivors = [
            x for i, x in enumerate(survivors) if i not in s[:step]
        ]
        total_features -= step

    return output
def __init__(self, population_size, fit_func, features, targets,
             population=None, operators=None, fitness_parameters=1,
             nsplit=2, accuracy=None):
    """Initialize the genetic algorithm.

    Parameters
    ----------
    population_size : int
        Population size, same as generation size.
    fit_func : object
        User defined function to calculate fitness.
    features : array
        The feature space upon which to optimize.
    targets : array
        The targets corresponding to the feature data.
    population : list
        The current population. Default is None, will generate a random
        initial population.
    operators : list
        A list of operation functions. These are used for mating and
        mutation operations.
    fitness_parameters : int
        The number of variables to optimize. Default is a single variable.
    nsplit : int
        Number of data splits for k-fold cv.
    accuracy : int
        Number of decimal places to include when finding unique candidates
        for duplication removal. If None, duplication removal is not
        performed.

    Raises
    ------
    RuntimeError
        If an accuracy is supplied together with a multivariable
        (pareto) fitness definition.
    """
    # Basic search parameters.
    self.step = -1
    self.population_size = population_size
    self.fit_func = fit_func
    self.dimension = features.shape[1]
    self.nsplit = nsplit
    self.accuracy = accuracy

    # Fall back to a random starting population when none is supplied.
    if population is None:
        population = initialize_population(
            population_size, self.dimension)
    self.population = population

    # Fall back to the default mating/mutation operators.
    if operators is None:
        operators = [cut_and_splice, random_permutation,
                     probability_remove, probability_include]
    self.operators = operators

    self.fitness_parameters = fitness_parameters
    # More than one fitness variable means a pareto-front search.
    self.pareto = self.fitness_parameters > 1
    if self.pareto and self.accuracy is not None:
        raise RuntimeError('Should not set an accuracy parameter for '
                           'multivariable searches.')

    # Make some k-fold splits.
    self.features, self.targets = k_fold(
        features, targets=targets, nsplit=self.nsplit)
# Get the target values.
targets = [atoms.info['key_value_pairs']['raw_score'] for atoms in all_cand]

print('Generated {} target vector'.format(np.shape(targets)))

# It is important to note that the `all_cand` variable is simply a list of
# atoms objects. There are no constraints on how this should be set up, the
# above example is just a succinct method for generating the list.
#
# ## Subset Generation <a name="subset-generation"></a>
# [(Back to top)](#head)
#
# Once the data has been generated, it is necessary to split the training
# features and training targets into k-folds. This can be achieved using a
# function in CatLearn with the `k_fold` function. Here it is possible to
# provide feature data, target data and the number of folds.

# In[3]:


fsplit, tsplit = k_fold(features=features, targets=targets, nsplit=5)

print('k_fold has generated {} subsets of features.'.format(len(fsplit)))
for index, subset in enumerate(fsplit):
    print(' subset {0} has shape {1}'.format(index, np.shape(subset)))

print('\nk_fold has generated {} subsets of targets.'.format(len(tsplit)))
for index, subset in enumerate(tsplit):
    print(' subset {0} has shape {1}'.format(index, np.shape(subset)))

# If we are interested in saving this data, it is possible to write a JSON or
# pickle file. This is achieved using the following functions to write and
# read the data.

# In[4]: