def update(self, Y, X=None, verbose=False):
    # TODO: change this to use topkY feedback set
    '''Runs the entire updating procedure, updating internal tracking
    of wl_weights and expert_weights.

    Args:
        X (list): A list of the covariates of the current data point.
            Float for numerical, string for categorical. Categorical
            data must have been in the initial dataset. If not given,
            the last X used for prediction will be used.
        Y (list): The true labels, concatenated into a string and then
            converted into the set of relevant label indices.
    '''
    if X is None:
        X = self.X
    self.X = np.array(X)
    Ystr = reduce(lambda x, y: x + y, Y)
    Yset = utils.str_to_set(Ystr)
    self.Y = Yset
    self.num_data += 1
    self.cum_error += utils.rank_loss(self.Yhat, Yset)

    expert_votes = np.zeros(self.num_classes)
    cost_vec = self.compute_cost(expert_votes, 0)
    for i in xrange(self.num_wls):
        alpha = self.wl_weights[i]
        w = self.weight_consts[i]
        # if self.loss == 'zero_one':
        #     w *= 5
        data_indices = self.data_indices[i]
        _max = max(cost_vec)
        # feed each relevant label to the weak learner, weighted by
        # how much cheaper it is than the costliest label
        for l in Yset:
            full_inst = self.make_full_instance(self.X[data_indices], l)
            full_inst.set_weight(w * (_max - cost_vec[l]))
            self.weaklearners[i].update_classifier(full_inst)
        if verbose is True:
            print i, _max - min(cost_vec)

        # updating the quality weights and weighted vote vector
        expert_votes = self.expert_votes_mat[i, :]
        cost_vec = self.compute_cost(expert_votes, i + 1)
        self.wl_weights[i] = self.update_alpha(cost_vec, i, alpha)
        if self.loss == 'logistic':
            self.expert_weights[i] *= np.exp(
                -utils.rank_loss(expert_votes, Yset) * self.exp_step_size)
    self.expert_weights = self.expert_weights / sum(self.expert_weights)
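# A minimal sketch of the pairwise rank loss used throughout update();
# utils.rank_loss is the repo's actual implementation and may differ in
# details such as tie handling. Assumed form: the fraction of
# (relevant, irrelevant) label pairs whose scores are mis-ordered, with
# ties counted as half an error. _rank_loss_sketch is hypothetical.
def _rank_loss_sketch(s, Yset):
    relevant = [l for l in xrange(len(s)) if l in Yset]
    irrelevant = [l for l in xrange(len(s)) if l not in Yset]
    if not relevant or not irrelevant:
        return 0.0
    errors = 0.0
    for r in relevant:
        for j in irrelevant:
            if s[r] < s[j]:
                errors += 1.0
            elif s[r] == s[j]:
                errors += 0.5
    return errors / float(len(relevant) * len(irrelevant))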
def record_losses(self, cum_votes):
    '''Appends the average logistic and rank losses over all data
    seen so far to the running loss histories.'''
    exp_sum = 0.0
    rank_sum = 0.0
    for t in xrange(self.num_data):
        s = cum_votes[t]
        Y = self.class_sets[t]
        exp_sum += utils.univ_logistic_loss(s, Y)
        rank_sum += utils.rank_loss(s, Y)
    self.exp_losses.append(exp_sum / self.num_data)
    self.rank_losses.append(rank_sum / self.num_data)
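# Hedged sketch of the logistic surrogate referenced in record_losses();
# the exact form of utils.univ_logistic_loss is an assumption. A common
# choice is the pairwise logistic loss
# sum over relevant r and irrelevant j of log(1 + exp(s[j] - s[r])).
# _univ_logistic_loss_sketch is hypothetical.
def _univ_logistic_loss_sketch(s, Yset):
    total = 0.0
    for r in Yset:
        for j in xrange(len(s)):
            if j in Yset:
                continue
            total += np.log(1.0 + np.exp(s[j] - s[r]))
    return total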
def get_test_results(self, num_wls=0):
    '''Computes the average exponential and rank losses on the test
    set using the first num_wls weak learners (all of them if 0).'''
    if num_wls == 0:
        num_wls = self.num_wls
    cum_votes = np.zeros((self.test_num_data, self.num_classes))
    for i in xrange(num_wls):
        alpha = self.wl_weights[i]
        wl = self.weaklearners[i]
        preds = wl.predict_proba(self.test_data[:, self.data_indices[i]])
        for t in xrange(self.test_num_data):
            cum_votes[t] += alpha * preds[t]
    exp_sum = 0.0
    rank_sum = 0.0
    for t in xrange(self.test_num_data):
        s = cum_votes[t]
        Y = self.test_class_sets[t]
        exp_sum += utils.exp_loss(s, Y)
        rank_sum += utils.rank_loss(s, Y)
    exp_sum /= self.test_num_data
    rank_sum /= self.test_num_data
    return exp_sum, rank_sum
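# Example (hypothetical driver code, not part of the repo): since
# get_test_results() accepts a truncated ensemble size, it can be called
# repeatedly to trace how the test losses change as weak learners are
# added.
# for n in xrange(1, model.num_wls + 1):
#     exp_l, rank_l = model.get_test_results(num_wls=n)
#     print n, round(exp_l, 4), round(rank_l, 4)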
# Evaluation loop with top-k feedback: predict, reveal only the relevant
# labels among the top k predictions, and update the model on that set
cum_error = 0
cum_unnorm_error = 0
model.verbose = True
for row in test_rows:
    X = row[:class_index]
    Y = row[class_index:]
    pred = model.predict(X)
    topk = utils.topk(pred, k)
    Yset = utils.label_array_to_set(Y)
    topkRel = topk.intersection(Yset)
    model.update(topkRel)
    cum_error += utils.rank_loss(pred, Yset)
    # compute the unnormalized loss once and reuse it for both running sums
    unnorm_loss = utils.unnormalized_rank_loss(pred, Yset)
    cum_unnorm_error += unnorm_loss
    total_unnorm_error += unnorm_loss
    history_counter += 1.0
    error_history.append(total_unnorm_error / history_counter)
end = time.time()
print 'Training time', mid - start
print 'Test time', end - mid
if loss == 'zero_one':
    print 'cache hit percentage:', float(model.cache_hits) / float(
        model.potential_calls)
print 'Average rank loss', round(cum_error / float(len(test_rows)), 4)
print 'Average unnormalized rank loss', round(
    cum_unnorm_error / float(len(test_rows)), 4)
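# Hedged sketch of the top-k selection assumed above; utils.topk is
# presumed to return the set of the k label indices with the largest
# predicted scores (the repo's tie-breaking may differ).
# _topk_sketch is hypothetical.
def _topk_sketch(pred, k):
    return set(np.argsort(pred)[-k:].tolist())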
def main():
    seed = np.random.randint(1, 999)

    # Read params.csv file and parse the options
    params = utils.read_params()
    loss = params['loss']
    data_source = params['data_source']
    num_wls = int(params['num_wls'])
    num_covs = int(params['num_covs'])
    M = int(params['M'])
    gamma = params['gamma']

    # Load the train data
    fp = utils.get_filepath(data_source, 'train')
    data = arff.load(open(fp, 'rb'))
    class_index, _, _ = utils.parse_attributes(data)
    train_rows = data['data']

    # Load the test data
    fp = utils.get_filepath(data_source, 'test')
    data = arff.load(open(fp, 'rb'))
    test_rows = data['data']

    start = time.time()
    model = AdaOLMR(data_source, loss=loss, num_covs=num_covs, gamma=gamma)
    model.M = M
    model.gen_weaklearners(num_wls,
                           min_grace=5, max_grace=20,
                           min_tie=0.01, max_tie=0.9,
                           min_conf=0.01, max_conf=0.9,
                           min_weight=3, max_weight=10,
                           seed=seed)

    # Online training pass with full label feedback
    for i, row in enumerate(train_rows):
        X = row[:class_index]
        Y = row[class_index:]
        pred = model.predict(X)
        model.update(Y)

    # Test pass: keep updating online and accumulate the rank loss
    cum_error = 0
    for i, row in enumerate(test_rows):
        X = row[:class_index]
        Y = row[class_index:]
        pred = model.predict(X)
        model.update(Y)
        cum_error += utils.rank_loss(pred, model.Y)

    end = time.time()
    runtime = round(end - start, 2)
    avg_loss = round(cum_error / float(len(test_rows)), 4)

    print 'data_source', data_source
    print 'loss', loss
    print 'gamma', gamma
    print 'num_wls', num_wls
    print 'num_covs', num_covs
    print 'M', M
    print 'seed', seed
    print 'runtime', runtime
    print 'avg_loss', avg_loss
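# Hypothetical params.csv consumed by utils.read_params(); the exact
# layout is an assumption, but main() expects at least these keys:
#
#   loss,data_source,num_wls,num_covs,M,gamma
#   logistic,yeast,20,20,10,0.1
#
# Standard entry point, assuming this script is run directly.
if __name__ == '__main__':
    main()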