Example #1
    def fitData(self, data, batch_size=100, n_epochs=10, print_every=10,
                valdata=None):
        '''
        fit the model to (x, y) data in batches;
        set print_every to 0 to disable progress printing
        '''
        time_start = time.time()
        losses = []
        best_valloss, best_valindex = np.inf, 0
        vallosses = []
        n = len(data.dataset)
        cost = 0.0   # running mean of per-sample regret since last report
        n_since = 0  # number of batches accumulated into `cost`
        
        for epoch in range(n_epochs):

            for k, (x_batch, y_batch) in enumerate(data):
                x_batch, y_batch = to_var(x_batch), to_var(y_batch)
                y_hat, regret = self.step(x_batch, y_batch)
                m = x_batch.size(0)
                n_since += 1
                cost += (regret / m - cost) / n_since  # incremental mean

                if print_every != 0 and k % print_every == 0:
                    
                    losses.append(cost)
                    # progress, time, avg loss, auc
                    to_print = ('%.2f%% (%s) %.4f %.4f' % ((epoch * n + (k+1) * m) /
                                                           (n_epochs * n) * 100,
                                                           timeSince(time_start),
                                                           cost,
                                                           model_auc(self.model,
                                                                     data)))
                    if valdata is not None:
                        valloss = calc_loss(self.model, valdata, self.loss)
                        vallosses.append(valloss)
                        np.save('models/%s.valloss' % self.name, vallosses)   
                        to_print += " %.4f" % model_auc(self.model, valdata)

                        if valloss <= best_valloss:
                            best_valloss = valloss
                            best_valindex = len(vallosses) - 1
                            torch.save(self.model, 'models/%s.pt' % self.name)   
                    else:
                        torch.save(self.model, 'models/%s.pt' % self.name)
                        
                    print(to_print)
                    np.save('models/%s.loss' % self.name, losses)

                    cost, n_since = 0.0, 0  # restart the running mean
                    
        return losses, vallosses
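The cost update above is the standard incremental-mean recurrence, so losses records the average per-sample regret between reports. A minimal standalone sketch of that recurrence (hypothetical, for illustration only):

def running_mean(values):
    # mean_k = mean_{k-1} + (v_k - mean_{k-1}) / k: averages a stream
    # without storing it
    mean = 0.0
    for k, v in enumerate(values, 1):
        mean += (v - mean) / k
    return mean

assert abs(running_mean([1.0, 2.0, 3.0]) - 2.0) < 1e-12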
Example #2
    def run(self, n_bootstrap=100):
        #map_parallel(trainData, self.tasks, self.n_cpus)
        for task in self.tasks:
            trainData(*task)

        # model selection: filter on AUC, then choose the sparsest survivor
        aucs = []
        models = []
        sparsities = []
        for name, reg, alpha in self.hyperparams:
            # load the model
            model = torch.load('models/' + name + '.pt')
            sp = modelSparsity(model, self.valdata)
            models.append(model)
            sparsities.append(sp)

        for _ in range(n_bootstrap):
            test = bootstrap(self.valdata)
            local_aucs = []
            for model in models:
                # bootstrap for CI on auc
                local_aucs.append(model_auc(model, test))
            aucs.append(local_aucs)
        aucs = np.array(aucs)

        # discard models whose AUC is significantly below the best model's
        # (one-sided bootstrap test at the 5% level)
        b = np.argmax(aucs.mean(0))
        discardset = set()
        for a in range(len(models)):
            diffs = ((aucs[:, a] - aucs[:, b]) >= 0).astype(int)
            if diffs.sum() / diffs.shape[0] <= 0.05:
                discardset.add(a)

        # choose the one with largest sparsity
        chosen, sp = max(filter(lambda x: x[0] not in discardset,
                                enumerate(sparsities)),
                         key=lambda x: x[1])

        # retrain the chosen model on train + val and report test performance
        name, reg, alpha = self.hyperparams[chosen]
        print('name', name)
        trainData(name,
                  self.data,
                  reg,
                  alpha,
                  p.epochs,
                  p.lr,
                  p.batch_size,
                  p.log_interval,
                  test=True)
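The bootstrap helper used above is not shown in these snippets. A plausible sketch, assuming the validation DataLoader wraps a TensorDataset (with the .tensors attribute of current PyTorch), resamples the examples with replacement and wraps them in a fresh loader:

import torch
from torch.utils.data import DataLoader, TensorDataset

def bootstrap(loader):
    # resample the validation set with replacement so that repeated
    # model_auc evaluations yield a bootstrap distribution over AUC
    x, y = loader.dataset.tensors
    idx = torch.randint(len(x), (len(x),))  # n indices, with replacement
    return DataLoader(TensorDataset(x[idx], y[idx]),
                      batch_size=loader.batch_size)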
Example #3
    def select_on_auc(self, *args, **kwargs):
        '''hyperparameter selection based on AUC alone
        return the index within self.hyperparams of the model to retrain
        '''
        print('hyperparam select using auc')
        aucs = []
        for name, reg, alpha in self.hyperparams:
            # load the model        
            model = torch.load('models/' + name + '.pt')
            aucs.append(model_auc(model, self.valdata))

        # choose the one with largest auc
        chosen, auc = max(enumerate(aucs),
                          key=lambda x: x[1])
        return chosen
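model_auc is another helper defined elsewhere. One plausible implementation, assuming the model outputs an (m, 2) matrix of class scores like the two-output LR used in these snippets, computes ROC AUC over the whole loader with scikit-learn:

import numpy as np
import torch
from sklearn.metrics import roc_auc_score

def model_auc(model, loader):
    scores, labels = [], []
    with torch.no_grad():
        for x, y in loader:
            out = model(x)                    # (m, 2) class scores
            scores.append(out[:, 1].numpy())  # positive-class score
            labels.append(y.numpy())
    return roc_auc_score(np.hstack(labels), np.hstack(scores))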
Example #4
def trainData(name, data, regularization=eye_loss, alpha=0.01, n_epochs=300,
              learning_rate=1e-3, batch_size=4000, r=None, test=False):
    '''
    return validation auc, average precision, score1, and sparsity
    if test is True, combine train and val and report on test performance
    '''
    m = data

    if test:
        name = 'test' + name
        xtrain = np.vstack([m.xtrain, m.xval])
        xval = m.xte
        ytrain = np.hstack([m.ytrain, m.yval])
        yval = m.yte
    else:
        xtrain = m.xtrain
        xval = m.xval
        ytrain = m.ytrain
        yval = m.yval

    # note: for cross validation, just split data into n fold and
    # choose appropriate train_data and valdata from those folds
    # not doing here for simplicity
    d = m.r.size(0)  # input dimension = number of risk factors
    train_data = TensorDataset(*map(lambda x: x.data, prepareData(xtrain, ytrain)))
    data = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    valdata = TensorDataset(*map(lambda x: x.data, prepareData(xval, yval)))
    valdata = DataLoader(valdata, batch_size=4000, shuffle=True)

    n_output = 2 # binary classification task 
    model = LR(d, n_output)
    reg_parameters = model.i2o.weight

    t = Trainer(model, lr=learning_rate, risk_factors=m.r, alpha=alpha,
                regularization=regularization, reg_parameters=reg_parameters,
                name=name)
    losses, vallosses = t.fit(data, n_epochs=n_epochs, print_every=1, valdata=valdata)

    # report statistics
    val_auc = model_auc(model, valdata)
    ap = calcAP(m.r.data.numpy(), (reg_parameters[1] - reg_parameters[0]).data.numpy())
    thres, s1 = sweepS1(model, valdata)  # decision threshold and score1
    sp = sparsity((reg_parameters[1] - reg_parameters[0]).data.numpy())
    joblib.dump((val_auc, ap, s1, sp), 'models/' + name + '.pkl')
    return val_auc, ap, s1, sp
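LR itself is not shown. Given model = LR(d, n_output) and the model.i2o.weight access in the selection code, a minimal sketch is a single linear layer named i2o (the log-softmax output is an assumption of this sketch):

import torch.nn as nn
import torch.nn.functional as F

class LR(nn.Module):
    '''logistic regression: a single input-to-output linear map'''
    def __init__(self, n_input, n_output):
        super(LR, self).__init__()
        self.i2o = nn.Linear(n_input, n_output)  # weight: (n_output, n_input)

    def forward(self, x):
        return F.log_softmax(self.i2o(x), dim=1)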
Example #5
    def select_on_auc_sp(self, n_bootstrap=100):
        '''
        return the index of the chosen entry in self.hyperparams

        hyperparameter selection based on AUC and sparsity:
        choose the setting that shows no significant AUC difference
        from the top model but yields the sparsest model

        This is the criterion used in the Learning Credible Models paper
        '''
        print('hyperparam select using auc and sparsity')        
        aucs = []
        models = []
        sparsities = []
        for name, reg, alpha in self.hyperparams:
            # load the model        
            model = torch.load('models/' + name + '.pt')
            reg_parameters = model.i2o.weight
            sp = sparsity((reg_parameters[1]-reg_parameters[0]).data.numpy())
            models.append(model)
            sparsities.append(sp)

        for _ in range(n_bootstrap):
            test = bootstrap(self.valdata)
            local_aucs = []
            for model in models:
                # bootstrap for CI on auc
                local_aucs.append(model_auc(model, test))
            aucs.append(local_aucs)
        aucs = np.array(aucs)

        # discard models whose AUC is significantly below the best model's
        # (one-sided bootstrap test at the 5% level)
        b = np.argmax(aucs.mean(0))
        discardset = set()
        for a in range(len(models)):
            diffs = ((aucs[:, a] - aucs[:, b]) >= 0).astype(int)
            if diffs.sum() / diffs.shape[0] <= 0.05:
                discardset.add(a)

        # choose the one with largest sparsity
        chosen, sp = max(filter(lambda x: x[0] not in discardset,
                                enumerate(sparsities)),
                         key=lambda x: x[1])
        return chosen
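The sparsity helper is likewise defined elsewhere. A plausible definition, consistent with "largest sparsity = sparsest model", is the fraction of effect-size weights at (or near) zero:

import numpy as np

def sparsity(w, tol=1e-4):
    # fraction of weights with magnitude below tol; the tolerance is an
    # assumption of this sketch, the repo's actual cutoff is not shown
    w = np.asarray(w)
    return float((np.abs(w) < tol).sum()) / w.size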
Example #6
    def select_on_auc_ap(self, n_bootstrap=100):
        '''
        return the index of the chosen entry in self.hyperparams

        hyperparameter selection based on AUC and average precision:
        choose the setting that shows no significant AUC difference
        from the top model but has the highest average precision
        (i.e. best alignment with expert knowledge)
        '''
        print('hyperparam select using auc and ap')
        aucs = []
        models = []
        aps = []
        for name, reg, alpha in self.hyperparams:
            # load the model        
            model = torch.load('models/' + name + '.pt')
            reg_parameters = model.i2o.weight
            ap = calcAP(self.data.r.data.numpy(),
                        (reg_parameters[1] - reg_parameters[0]).data.numpy())
            models.append(model)
            aps.append(ap)

        for _ in range(n_bootstrap):
            test = bootstrap(self.valdata)
            local_aucs = []
            for model in models:
                # bootstrap for CI on auc
                local_aucs.append(model_auc(model, test))
            aucs.append(local_aucs)
        aucs = np.array(aucs)

        # discard models whose AUC is significantly below the best model's
        # (one-sided bootstrap test at the 5% level)
        b = np.argmax(aucs.mean(0))
        discardset = set()
        for a in range(len(models)):
            diffs = ((aucs[:, a] - aucs[:, b]) >= 0).astype(int)
            if diffs.sum() / diffs.shape[0] <= 0.05:
                discardset.add(a)

        # choose the one with largest average precision
        chosen, ap = max(filter(lambda x: x[0] not in discardset,
                                enumerate(aps)),
                         key=lambda x: x[1])
        return chosen
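calcAP pairs the expert risk-factor vector r with the learned per-feature weight differences. A hypothetical sketch using scikit-learn, treating r as binary relevance labels and the magnitude of each weight difference as the ranking score (both choices are assumptions of this sketch):

import numpy as np
from sklearn.metrics import average_precision_score

def calcAP(r, w_diff):
    # r: expert risk-factor indicators; w_diff: per-feature weight
    # differences (class 1 minus class 0)
    return average_precision_score(np.asarray(r) > 0, np.abs(w_diff))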