def _train_helper(self):
    """
    Helper function for self.train()
    """

    self.lnr.irl_selection = self.irl_selection
    self.lnr.train()
    self.loss = (logistic_loss(self.training_instances, self.lnr) /
                 sum(self.irl_selection))

    iteration = 0
    old_irl_selection = np.full(len(self.irl_selection), -1)
    while np.linalg.norm(self.irl_selection - old_irl_selection) != 0:
        old_irl_selection = deepcopy(self.irl_selection)

        # Keep only the instances whose loss falls below the threshold
        self.irl_selection = np.full(len(self.irl_selection), 0)
        for i, loss in enumerate(self.loss):
            if loss < self.loss_threshold:
                self.irl_selection[i] = 1

        # We need at least 50% of the instances to train with, as we
        # assume at least 50% of the data is clean
        if sum(self.irl_selection) < 0.5 * len(self.training_instances):
            raise ValueError('Fewer than 50% of the instances fall below '
                             'the loss threshold.')

        if self.verbose:
            print('IRL Iteration:', iteration, '- number of instances:',
                  sum(self.irl_selection))

        self.lnr.irl_selection = self.irl_selection
        self.lnr.train()
        self.loss = (logistic_loss(self.training_instances, self.lnr) /
                     sum(self.irl_selection))

        iteration += 1
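# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the per-instance logistic loss that the
# helper above relies on. This illustrates the assumed contract of
# logistic_loss (a vector with one loss value per instance), not the library's
# actual implementation; the direct use of lnr.w and lnr.b is an assumption.
#
#   import numpy as np
#
#   def logistic_loss_sketch(fvs: np.ndarray, lnr, labels: np.ndarray):
#       # Margin of each instance under the linear model (w, b)
#       margins = labels * (fvs.dot(lnr.w) + lnr.b)
#       # Element-wise logistic loss: log(1 + exp(-margin))
#       return np.log(1 + np.exp(-margins))
# ---------------------------------------------------------------------------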
def _calculate_constants(self, instances: List[Instance]):
    """
    Calculates constants needed for the alternating minimization loop.
    :param instances: the list of Instances
    :return: the constants
    """

    half_n = len(instances)
    n = half_n * 2  # size of new (doubled) input

    feature_vectors = []
    labels = []
    labels_flipped = []
    for inst in instances:
        feature_vectors.append(inst.get_feature_vector())
        labels.append(inst.get_label())
        labels_flipped.append(-1 * inst.get_label())

    # Double the input: each instance appears once with its original label
    # and once with its label flipped
    feature_vectors = np.array(feature_vectors + feature_vectors)
    labels = np.array(labels + labels_flipped)

    fvs, _ = get_fvs_and_labels(instances)
    orig_loss = logistic_loss(fvs, self.learner, labels[:half_n])
    orig_loss = np.concatenate([orig_loss, orig_loss])

    # Keeping a label is free; flipping instance i incurs its cost
    cost = np.concatenate([np.full(half_n, 0), np.array(self.cost)])

    return half_n, n, orig_loss, feature_vectors, labels, cost
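# ---------------------------------------------------------------------------
# A worked sketch of the doubling performed above, assuming two instances with
# labels [+1, -1] and flip costs [c0, c1] (values are illustrative only):
#
#   feature_vectors = [fv0, fv1, fv0, fv1]   # each instance appears twice
#   labels          = [+1, -1, -1, +1]       # original half, then flipped
#   cost            = [ 0,  0, c0, c1]       # keeping a label is free
#
# The alternating minimization can then "flip" instance i by selecting index
# half_n + i instead of index i, paying cost[half_n + i] to do so.
# ---------------------------------------------------------------------------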
def train(self):
    """
    Train on the set of training instances.
    """

    if len(self.training_instances) < 2:
        raise ValueError('Must have at least 2 instances to train.')

    fvs, labels = get_fvs_and_labels(self.training_instances)
    orig_fvs, orig_labels = deepcopy(fvs), deepcopy(labels)

    cutoff = 10 * math.log(len(self.training_instances)) / fvs.shape[1]
    cutoff /= 100
    base_cutoff = cutoff
    factor = 1
    max_cutoff = cutoff * 100

    if self.verbose:
        print('\nBase cutoff:', cutoff, '\nMax cutoff:', max_cutoff, '\n')

    best_loss = None
    best_w = None
    iteration = 0
    while cutoff < max_cutoff:
        result = self._remove_outliers(fvs, labels, cutoff)
        if not result:
            # Advance the cutoff before retrying; otherwise this loop
            # would never terminate
            factor += 1
            cutoff = base_cutoff * factor
            continue
        fvs, labels = result

        # Average the label-weighted feature vectors to get the weights
        self.w = np.full(fvs.shape[1], 0.0)
        for i, fv in enumerate(fvs):
            self.w += labels[i] * fv
        self.w /= fvs.shape[0]

        loss = logistic_loss(fvs, self, labels)
        loss = sum(loss) / fvs.shape[0]

        if self.verbose:
            print('\nORL Iteration:', iteration, '- factor:', factor,
                  '- cutoff:', cutoff, '- loss:', loss, '\n')

        if best_loss is None or loss < best_loss:
            best_loss = loss
            best_w = deepcopy(self.w)

        factor += 1
        cutoff = base_cutoff * factor
        fvs, labels = deepcopy(orig_fvs), deepcopy(orig_labels)
        iteration += 1

    self.w = best_w
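# ---------------------------------------------------------------------------
# A hedged sketch of the _remove_outliers contract assumed by train() above:
# given feature vectors, labels, and a cutoff, it returns the retained
# (fvs, labels) pair, or a falsy value when filtering fails. The distance
# criterion below is an assumption for illustration, not the actual rule.
#
#   import numpy as np
#
#   def _remove_outliers_sketch(fvs, labels, cutoff):
#       # Keep instances that lie close to the data's centroid
#       center = fvs.mean(axis=0)
#       dists = np.linalg.norm(fvs - center, axis=1)
#       mask = dists <= dists.mean() * (1 + cutoff)
#       if mask.sum() == 0:
#           return None  # nothing survived; caller advances the cutoff
#       return fvs[mask], labels[mask]
# ---------------------------------------------------------------------------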
def train(self):
    """
    Train on the set of training instances.
    """

    if len(self.training_instances) < 2:
        raise ValueError('Must have at least 2 instances to train.')

    step_size = 1 / len(self.training_instances)
    best_poison_percentage = 0.05
    best_lnr = None
    best_loss = None

    self.poison_percentage = 0.05
    self.n = int(
        (1 - self.poison_percentage) * len(self.training_instances))
    self.lnr.n = self.n
    while self.poison_percentage < 0.5:
        self.lnr.train()
        self.lnr.redo_problem_on_train = False

        # Average loss over the n lowest-loss instances
        loss = logistic_loss(self.training_instances, self.lnr) / self.n
        loss.sort()
        loss = sum(loss[:self.n])

        if self.verbose:
            print('\nPoison Percentage:', self.poison_percentage,
                  '- loss:', loss, '\n')

        if best_loss is None or loss < best_loss:
            best_poison_percentage = self.poison_percentage
            best_loss = loss
            best_lnr = deepcopy(
                (self.lnr.training_instances, self.lnr.n, self.lnr.lda,
                 self.lnr.verbose, self.lnr.w, self.lnr.b))

        self.poison_percentage += step_size
        self.n = int(
            (1 - self.poison_percentage) * len(self.training_instances))
        self.lnr.n = self.n

    # Rebuild the learner from the best state found during the scan
    self.poison_percentage = best_poison_percentage
    self.n = int(
        (1 - self.poison_percentage) * len(self.training_instances))
    self.lnr = TRIMLearner(best_lnr[0], best_lnr[1], best_lnr[2],
                           best_lnr[3])
    self.lnr.w, self.lnr.b = best_lnr[4], best_lnr[5]
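# ---------------------------------------------------------------------------
# A minimal usage sketch for the grid search above, assuming a wrapper class
# (PoisonPercentageSearchLearner is a hypothetical name) that owns a
# TRIMLearner in self.lnr, as the code above does:
#
#   lnr = PoisonPercentageSearchLearner(training_instances, verbose=True)
#   lnr.train()                    # scans poison_percentage from 5% to 50%
#   print(lnr.poison_percentage)   # best percentage found by the scan
# ---------------------------------------------------------------------------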
def train(self):
    """
    Train on the set of training instances.
    """

    if len(self.training_instances) < 2:
        raise ValueError('Must have at least 2 instances to train.')

    self.irl_selection = np.full(len(self.training_instances), 1)
    self.lnr.set_training_instances(self.training_instances)
    self.lnr.train()
    self.lnr.redo_problem_on_train = False
    self.loss = (logistic_loss(self.training_instances, self.lnr) /
                 sum(self.irl_selection))

    # Use np.sort to get a sorted copy; slicing a NumPy array returns a
    # view, so sorting that in place would also reorder self.loss
    sorted_loss = np.sort(self.loss)

    # Step by the mean positive gap between consecutive sorted losses
    step_size = np.mean(
        np.array(
            list(
                filter(lambda x: x > 0,
                       sorted_loss[1:] - sorted_loss[:-1]))))

    max_loss_threshold = np.max(self.loss)
    best_loss_threshold = np.median(self.loss)
    best_lnr = None
    best_loss = None
    loss_list = []

    if self.verbose:
        print('Minimum loss threshold:', best_loss_threshold,
              '\nMaximum loss threshold:', max_loss_threshold,
              '\nStep size:', step_size)

    self.loss_threshold = best_loss_threshold
    while self.loss_threshold < max_loss_threshold:
        self.irl_selection = np.full(len(self.training_instances), 1)

        try:
            self._train_helper()
        except ValueError:
            if self.verbose:
                print('\nLoss threshold:', self.loss_threshold,
                      '- FAILURE\n')
            self.loss_threshold += step_size
            continue

        self.lnr.n = sum(self.irl_selection)
        loss = sum(map(lambda x, y: x * y, self.loss, self.irl_selection))

        if self.verbose:
            print('\nLoss threshold:', self.loss_threshold, '- loss:',
                  loss, '\n')

        if len(loss_list) > 1 and loss_list[-2] == loss_list[-1] == loss:
            if self.verbose:
                print('\n---Exiting early as increasing threshold no '
                      'longer changes loss---\n')
            break
        else:
            loss_list.append(loss)

        if best_loss is None or loss < best_loss:
            best_loss_threshold = self.loss_threshold
            best_loss = loss
            best_lnr = deepcopy(
                (self.lnr.training_instances, self.lnr.n, self.lnr.lda,
                 self.lnr.verbose, self.lnr.w, self.lnr.b,
                 self.lnr.irl_selection))

        self.loss_threshold += step_size

    # Rebuild the learner from the best state found during the scan
    self.loss_threshold = best_loss_threshold
    self.lnr = TRIMLearner(best_lnr[0], best_lnr[1], best_lnr[2],
                           best_lnr[3])
    self.lnr.w, self.lnr.b = best_lnr[4], best_lnr[5]
    self.lnr.irl_selection = best_lnr[6]
    self.irl_selection = best_lnr[6]
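# ---------------------------------------------------------------------------
# A worked example of the step-size heuristic used above (values are
# illustrative only): with sorted per-instance losses [0.1, 0.1, 0.3, 0.6],
# the consecutive gaps are [0.0, 0.2, 0.3]; filtering out the zero gap and
# averaging the rest gives a step size of 0.25, so the threshold scan visits
# each distinct loss level roughly once between the median and maximum loss.
# ---------------------------------------------------------------------------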