def fitData(self, data, batch_size=100, n_epochs=10, print_every=10, valdata=None):
    '''
    fit a model to x, y data by batch
    set print_every to 0 to disable progress printing
    '''
    time_start = time.time()
    losses = []
    best_valloss, best_valindex = np.inf, 0
    vallosses = []
    n = len(data.dataset)
    cost = 0
    for epoch in range(n_epochs):
        for k, (x_batch, y_batch) in enumerate(data):
            x_batch, y_batch = to_var(x_batch), to_var(y_batch)
            y_hat, regret = self.step(x_batch, y_batch)
            m = x_batch.size(0)
            # running mean of per-sample regret over the batches seen so far
            cost += 1 / (k + 1) * (regret / m - cost)

            if print_every != 0 and k % print_every == 0:
                losses.append(cost)
                # progress, time, avg loss, auc
                to_print = ('%.2f%% (%s) %.4f %.4f' %
                            ((epoch * n + (k + 1) * m) / (n_epochs * n) * 100,
                             timeSince(time_start),
                             cost,
                             model_auc(self.model, data)))
                if valdata is not None:
                    valloss = calc_loss(self.model, valdata, self.loss)
                    vallosses.append(valloss)
                    np.save('models/%s.valloss' % self.name, vallosses)
                    to_print += " %.4f" % model_auc(self.model, valdata)
                    # keep the checkpoint with the best validation loss so far
                    if valloss <= best_valloss:
                        best_valloss = valloss
                        best_valindex = len(vallosses) - 1
                        torch.save(self.model, 'models/%s.pt' % self.name)
                else:
                    torch.save(self.model, 'models/%s.pt' % self.name)
                print(to_print)
                np.save('models/%s.loss' % self.name, losses)
        # reset the running average at the start of each new epoch (k restarts from 0)
        cost = 0
    return losses, vallosses
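# The `cost` update in fitData above is an incremental (running) mean of the
# per-sample regret over the batches seen so far.  Minimal standalone sketch
# (hypothetical values, not project data) illustrating the recurrence:
def _running_mean_sketch(values):
    cost = 0.0
    for k, v in enumerate(values):
        cost += 1.0 / (k + 1) * (v - cost)
    return cost

# e.g. _running_mean_sketch([1.0, 2.0, 3.0]) == 2.0, i.e. the plain mean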
def run(self, n_bootstrap=100):
    # map_parallel(trainData, self.tasks, self.n_cpus)
    for task in self.tasks:
        trainData(*task)

    # select a model to rerun: compare on auc and sparsity
    aucs = []
    models = []
    sparsities = []
    for name, reg, alpha in self.hyperparams:
        # load the trained model
        model = torch.load('models/' + name + '.pt')
        sp = modelSparsity(model, self.valdata)
        models.append(model)
        sparsities.append(sp)

    for _ in range(n_bootstrap):
        test = bootstrap(self.valdata)
        local_aucs = []
        for model in models:
            # bootstrap for CI on auc
            local_aucs.append(model_auc(model, test))
        aucs.append(local_aucs)
    aucs = np.array(aucs)

    # only keep models whose auc is not significantly below the best
    b = np.argmax(aucs.mean(0))
    discardset = set()
    for a in range(len(models)):
        diffs = ((aucs[:, a] - aucs[:, b]) >= 0).astype(int)
        if diffs.sum() / diffs.shape[0] <= 0.05:
            discardset.add(a)

    # among the remaining models, choose the one with the largest sparsity
    chosen, sp = max(filter(lambda x: x[0] not in discardset, enumerate(sparsities)),
                     key=lambda x: x[1])

    # retrain the chosen model (train + val combined) and report test performance
    name, reg, alpha = self.hyperparams[chosen]
    print('name', name)
    trainData(name, self.data, reg, alpha, p.epochs, p.lr, p.batch_size,
              p.log_interval, test=True)
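# The selection step in run() discards model `a` when its bootstrap AUC is at
# least as high as the best model's in <= 5% of resamples, i.e. it is
# significantly worse.  Self-contained sketch of that test on a hypothetical
# (n_bootstrap, n_models) AUC array; the code above inlines the same logic.
import numpy as np

def _discard_significantly_worse_sketch(aucs, level=0.05):
    b = np.argmax(aucs.mean(0))  # index of the model with the best mean AUC
    discard = set()
    for a in range(aucs.shape[1]):
        frac_at_least_as_good = ((aucs[:, a] - aucs[:, b]) >= 0).mean()
        if frac_at_least_as_good <= level:
            discard.add(a)
    return discard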
def select_on_auc(self, *args, **kwargs):
    '''hyperparameter selection based on auc alone

    return the index within self.hyperparams of the model to retrain
    '''
    print('hyperparam select using auc')
    aucs = []
    for name, reg, alpha in self.hyperparams:
        # load the trained model
        model = torch.load('models/' + name + '.pt')
        aucs.append(model_auc(model, self.valdata))

    # choose the model with the largest auc
    chosen, auc = max(enumerate(aucs), key=lambda x: x[1])
    return chosen
def trainData(name, data, regularization=eye_loss, alpha=0.01, n_epochs=300,
              learning_rate=1e-3, batch_size=4000, r=None, test=False):
    '''
    return validation auc, average precision, score1, and sparsity
    if test is True, combine train and val and report test performance
    '''
    m = data
    if test:
        name = 'test' + name
        xtrain = np.vstack([m.xtrain, m.xval])
        xval = m.xte
        ytrain = np.hstack([m.ytrain, m.yval])
        yval = m.yte
    else:
        xtrain = m.xtrain
        xval = m.xval
        ytrain = m.ytrain
        yval = m.yval

    # note: for cross validation, just split the data into n folds and
    # choose the appropriate train_data and valdata from those folds;
    # not done here for simplicity (see the sketch after this function)
    d = m.r.size(0)
    train_data = TensorDataset(*map(lambda x: x.data, prepareData(xtrain, ytrain)))
    data = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    valdata = TensorDataset(*map(lambda x: x.data, prepareData(xval, yval)))
    valdata = DataLoader(valdata, batch_size=4000, shuffle=True)

    n_output = 2  # binary classification task
    model = LR(d, n_output)
    reg_parameters = model.i2o.weight
    t = Trainer(model, lr=learning_rate, risk_factors=m.r, alpha=alpha,
                regularization=regularization, reg_parameters=reg_parameters,
                name=name)
    losses, vallosses = t.fit(data, n_epochs=n_epochs, print_every=1,
                              valdata=valdata)

    # report statistics
    val_auc = model_auc(model, valdata)
    ap = calcAP(m.r.data.numpy(),
                (reg_parameters[1] - reg_parameters[0]).data.numpy())
    t, s1 = sweepS1(model, valdata)
    sp = sparsity((reg_parameters[1] - reg_parameters[0]).data.numpy())
    joblib.dump((val_auc, ap, s1, sp), 'models/' + name + '.pkl')
    return val_auc, ap, s1, sp
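# Sketch of the n-fold split mentioned in the comment inside trainData
# (assumes plain numpy arrays; the fold indices would replace the fixed
# xtrain/xval split above).  This helper is illustrative and not used elsewhere.
import numpy as np

def _kfold_indices_sketch(n_samples, n_folds=5, seed=0):
    rng = np.random.RandomState(seed)
    folds = np.array_split(rng.permutation(n_samples), n_folds)
    for k in range(n_folds):
        val_idx = folds[k]
        train_idx = np.hstack([folds[j] for j in range(n_folds) if j != k])
        yield train_idx, val_idx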
def select_on_auc_sp(self, n_bootstrap=100):
    '''return the index in self.hyperparams chosen

    hyperparameter selection based on auc and sparsity:
    choose the hyperparameter whose auc is not significantly different
    from the top model but that gives the sparsest model.
    This is the criterion used in the learning credible models paper.
    '''
    print('hyperparam select using auc and sparsity')
    aucs = []
    models = []
    sparsities = []
    for name, reg, alpha in self.hyperparams:
        # load the trained model
        model = torch.load('models/' + name + '.pt')
        reg_parameters = model.i2o.weight
        sp = sparsity((reg_parameters[1] - reg_parameters[0]).data.numpy())
        models.append(model)
        sparsities.append(sp)

    for _ in range(n_bootstrap):
        test = bootstrap(self.valdata)
        local_aucs = []
        for model in models:
            # bootstrap for CI on auc
            local_aucs.append(model_auc(model, test))
        aucs.append(local_aucs)
    aucs = np.array(aucs)

    # only keep models whose auc is not significantly below the best
    b = np.argmax(aucs.mean(0))
    discardset = set()
    for a in range(len(models)):
        diffs = ((aucs[:, a] - aucs[:, b]) >= 0).astype(int)
        if diffs.sum() / diffs.shape[0] <= 0.05:
            discardset.add(a)

    # among the remaining models, choose the one with the largest sparsity
    chosen, sp = max(filter(lambda x: x[0] not in discardset, enumerate(sparsities)),
                     key=lambda x: x[1])
    return chosen
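# select_on_auc_sp treats the `sparsity` helper as "larger value = sparser
# model".  Hypothetical sketch of one such measure (fraction of near-zero
# coefficients); the project's actual `sparsity` implementation may differ.
import numpy as np

def _sparsity_fraction_sketch(weights, tol=1e-6):
    w = np.asarray(weights)
    return float((np.abs(w) <= tol).mean())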
def select_on_auc_ap(self, n_bootstrap=100):
    '''return the index in self.hyperparams chosen

    hyperparameter selection based on auc and ap:
    choose the hyperparameter whose auc is not significantly different
    from the top model but that has the highest average precision
    (i.e. best alignment with the expert risk factors).
    '''
    print('hyperparam select using auc and ap')
    aucs = []
    models = []
    aps = []
    for name, reg, alpha in self.hyperparams:
        # load the trained model
        model = torch.load('models/' + name + '.pt')
        reg_parameters = model.i2o.weight
        ap = calcAP(self.data.r.data.numpy(),
                    (reg_parameters[1] - reg_parameters[0]).data.numpy())
        models.append(model)
        aps.append(ap)

    for _ in range(n_bootstrap):
        test = bootstrap(self.valdata)
        local_aucs = []
        for model in models:
            # bootstrap for CI on auc
            local_aucs.append(model_auc(model, test))
        aucs.append(local_aucs)
    aucs = np.array(aucs)

    # only keep models whose auc is not significantly below the best
    b = np.argmax(aucs.mean(0))
    discardset = set()
    for a in range(len(models)):
        diffs = ((aucs[:, a] - aucs[:, b]) >= 0).astype(int)
        if diffs.sum() / diffs.shape[0] <= 0.05:
            discardset.add(a)

    # among the remaining models, choose the one with the highest average precision
    chosen, ap = max(filter(lambda x: x[0] not in discardset, enumerate(aps)),
                     key=lambda x: x[1])
    return chosen
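# select_on_auc_ap scores alignment with the expert risk factors via calcAP:
# the expert indicator vector r acts as labels and the learned weight
# differences act as scores.  Hedged sketch using scikit-learn; the project's
# calcAP may differ in details.
import numpy as np
from sklearn.metrics import average_precision_score

def _calc_ap_sketch(expert_r, weight_diff):
    labels = (np.asarray(expert_r) > 0).astype(int)
    return average_precision_score(labels, np.asarray(weight_diff))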