def ens_john(X, y, x):
    """Fit an ensemble of five mean/variance networks and predict at ``x``.

    Every ensemble member is a ``GPNNModel`` trained for 6000 iterations:

    * while ``it <= 5000`` only the mean network is optimised on the full
      training set, with the variance held at the constant 0.05;
    * afterwards ``opt_switch`` selects, per iteration, either the mean
      optimiser or the variance optimiser, each of which runs one "pseudo
      epoch" of mini-batches drawn by ``locality_sampler2``.

    The function relies on names that are not imported here and must exist
    at module level: ``np``, ``torch``, ``nn``, ``D``, ``n_neurons``,
    ``gen_Qw``, ``get_pseupoch``, ``locality_sampler2`` and ``t_likelihood``.

    Args:
        X: Training inputs. They are passed to ``KMeans``/``gen_Qw`` and also
            fed directly to networks whose first layer is ``nn.Linear(1, ...)``,
            so the last dimension must be 1.
        y: Training targets; combined with ``m.flatten()`` in the losses
            (assumed 1-D with one entry per row of ``X`` - TODO confirm).
        x: Inputs at which the trained ensemble is evaluated.

    Returns:
        Tuple ``(mean, std)`` of NumPy arrays: the average of the member
        means and the square root of the mixture variance
        ``E[var + mean**2] - E[mean]**2`` taken over the ensemble.
    """
    from sklearn.cluster import KMeans
    from utils import dist
    from itertools import chain

    # Settings handed to gen_Qw / locality_sampler2 for the mean network.
    mean_psu = 1
    mean_ssu = 50
    mean_M = 60

    # Settings handed to gen_Qw / locality_sampler2 for the variance network.
    var_psu = 3
    var_ssu = 7
    var_M = 10

    # Cluster centres of the training inputs; the model measures the distance
    # from each query point to its nearest centre.
    kmeans = KMeans(n_clusters=10)
    kmeans.fit(np.concatenate([X], axis=0))
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)

    class translatedSigmoid(nn.Module):
        """Sigmoid with a learnable scale and a shift tied to that scale."""

        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            # softplus keeps the scale strictly positive.
            beta = torch.nn.functional.softplus(self.beta)
            # 6.9077... == ln(1000), so the output at x == 0 is 1 / 1001.
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(nn.Module):
        """Mean network plus Gamma-parameterised inverse-variance networks."""

        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = nn.Sequential(nn.Linear(1, n_neurons), nn.Sigmoid(),
                                      nn.Linear(n_neurons, 1))
            self.alph = nn.Sequential(nn.Linear(1, n_neurons), nn.Sigmoid(),
                                      nn.Linear(n_neurons, 1), nn.Softplus())
            self.bet = nn.Sequential(nn.Linear(1, n_neurons), nn.Sigmoid(),
                                     nn.Linear(n_neurons, 1), nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            # s is the output of the translated sigmoid applied to the
            # distance between each input and its closest cluster centre.
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                # 50 reparameterised draws while training, 2000 in eval mode;
                # the variance is the reciprocal of each draw.
                if self.training:
                    samples_var = gamma_dist.rsample(torch.Size([50]))
                    x_var = (1.0 / (samples_var + 1e-8))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([2000]))
                    x_var = (1.0 / (samples_var + 1e-8))
                # Blend the sampled variance with the constant 3.5**2
                # according to s.
                var = (1 - s) * x_var + s * torch.tensor([3.5**2
                                                          ])  # HYPERPARAMETER

            else:
                # Fixed variance used while only the mean is being trained.
                var = torch.tensor([0.05])
            return mean, var

    ens_mean, ens_var = [], []
    for i in range(5):
        model = GPNNModel()
        # Separate optimisers so that the mean and the variance parameters
        # can be updated in alternation.
        optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
        optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                            model.bet.parameters(),
                                            model.trans.parameters()),
                                      lr=1e-3)

        n_iter = 6000
        it = 0
        mean_Q, mean_w = gen_Qw(X, mean_psu, mean_ssu, mean_M)
        var_Q, var_w = gen_Qw(X, var_psu, var_ssu, var_M)
        mean_pseupoch = get_pseupoch(mean_w, 0.5)
        var_pseupoch = get_pseupoch(var_w, 0.5)
        opt_switch = 1
        mean_w = torch.Tensor(mean_w)
        var_w = torch.Tensor(var_w)
        model.train()

        while it < n_iter:
            # The monitoring step below puts the model in eval mode, so the
            # training mode is re-enabled at the top of every iteration.
            model.train()
            switch = 1.0 if it > 5000 else 0.0

            # NOTE(review): this increments on every iteration that is NOT a
            # multiple of 11; `it % 11 == 0` may have been intended - confirm
            # before changing, since it alters the optimisation schedule.
            if it % 11:
                opt_switch = opt_switch + 1  # change between var and mean optimizer

            if not switch:
                # Warm-up: full-batch update of the mean network only.
                optimizer.zero_grad()
                m, v = model(X, switch)
                loss = -(-v.log() - (m.flatten() - y)**2 / (2 * v)).sum()
                loss.backward()
                optimizer.step()
            else:
                if opt_switch % 2 == 0:
                    # One pseudo epoch of mean-network updates.
                    for _ in range(mean_pseupoch):
                        optimizer.zero_grad()
                        batch = locality_sampler2(mean_psu, mean_ssu, mean_Q,
                                                  mean_w)
                        m, v = model(X[batch], switch)
                        loss = -t_likelihood(y[batch], m, v, mean_w[batch])
                        loss = loss.sum()
                        loss.backward()
                        optimizer.step()
                else:
                    # One pseudo epoch of variance-network updates.
                    for _ in range(var_pseupoch):
                        optimizer2.zero_grad()
                        batch = locality_sampler2(var_psu, var_ssu, var_Q,
                                                  var_w)
                        m, v = model(X[batch], switch)
                        loss = -t_likelihood(y[batch], m, v, var_w[batch])
                        loss = loss.sum()
                        loss.backward()
                        optimizer2.step()

            if it % 500 == 0:
                model.eval()
                # Monitoring only: no gradients are needed, so skip building
                # an autograd graph over the 2000 eval-mode draws.
                with torch.no_grad():
                    m, v = model(X, switch)
                    # Shape the squared error as (1, N, 1) so that it lines up
                    # with the (draws, N, 1) variance produced once `switch`
                    # is on, instead of broadcasting to (draws, N, N).
                    sq_err = ((m.flatten() - y)**2).reshape(1, -1, 1)
                    loss = -(-v.log() - sq_err / (2 * v)).mean()
                print('Iter {0}/{1}, Loss {2}'.format(it, n_iter, loss.item()))
            it += 1

        model.eval()
        with torch.no_grad():
            mean, var = model(x, switch)
        ens_mean.append(mean)
        # Average over the Gamma draws (leading dimension).
        ens_var.append(var.mean(dim=0))

    ens_mean = torch.stack(ens_mean)
    ens_var = torch.stack(ens_var)

    # Moments of the equally weighted mixture of the ensemble members.
    mean = ens_mean.mean(dim=0)
    var = (ens_var + ens_mean**2).mean(dim=0) - mean**2

    return mean.numpy(), var.sqrt().numpy()
Example #2
0
def john(args, X, y, Xval, yval):
    """Train a mean/variance network and track validation performance.

    Training runs for ``args.iters`` iterations. During the first half only
    the mean network is updated on mini-batches from ``batchify`` with the
    variance fixed at 0.05. During the second half ``opt_switch`` selects,
    per iteration, either the mean optimiser or the variance optimiser, both
    fed by ``locality_sampler2``. Once per epoch the model is scored on the
    validation set; after 60% of the iterations these scores are recorded and
    training stops early when the best log-likelihood is more than 50
    validations old.

    Relies on module-level names not imported here: ``np``, ``torch``,
    ``normalize_y``, ``batchify`` and ``t_likelihood``.

    Args:
        args: Namespace providing ``dataset``, ``n_clusters`` (clamped in
            place to the number of training rows), ``cuda``, ``batch_size``,
            ``shuffel`` and ``iters``.
        X: Training inputs, indexable as a 2-D array (``X.shape[1]`` is the
            input width of the networks).
        y: Training targets; normalised with ``normalize_y`` and expected to
            be 2-D afterwards because ``y.shape[1]`` sizes the output layers.
        Xval: Validation inputs.
        yval: Validation targets, compared in the original (un-normalised)
            scale.

    Returns:
        Tuple ``(ll, rmse, m, v)``: the best recorded validation
        log-likelihood, the RMSE recorded at the same validation step, and
        the final predicted mean and variance on ``Xval`` as NumPy arrays,
        both rescaled to the original target scale.

    Raises:
        ValueError: From ``np.argmax`` if no validation score was recorded
            (no epoch boundary fell after 60% of ``args.iters``).
    """
    from sklearn.cluster import KMeans
    from utils import dist
    from itertools import chain
    from torch import distributions as D
    from locality_sampler import gen_Qw, locality_sampler2
    from sklearn.decomposition import PCA

    if args.dataset == 'protein' or args.dataset == 'year_prediction':
        n_neurons = 100
    else:
        n_neurons = 50
    args.n_clusters = min(args.n_clusters, X.shape[0])

    y, y_mean, y_std = normalize_y(y)

    # Settings handed to gen_Qw / locality_sampler2 for the mean network.
    mean_psu = 1
    mean_ssu = 40
    mean_M = 50

    # Settings handed to gen_Qw / locality_sampler2 for the variance network.
    var_psu = 2
    var_ssu = 10
    var_M = 10

    num_draws_train = 20

    use_cuda = torch.cuda.is_available() and args.cuda
    device = torch.device('cuda') if use_cuda else torch.device('cpu')

    kmeans = KMeans(n_clusters=args.n_clusters)
    if args.dataset != 'year_prediction':
        kmeans.fit(np.concatenate([X], axis=0))
    else:
        # Fit on a random subset of 10000 rows (drawn with replacement).
        kmeans.fit(X[np.random.randint(0, X.shape[0], size=(10000))])
    # Build the centre tensor once and move it; wrapping an existing tensor
    # in torch.tensor() again only triggers a copy-construct warning.
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(device)

    class translatedSigmoid(torch.nn.Module):
        """Sigmoid with a learnable scale and a shift tied to that scale."""

        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = torch.nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            # softplus keeps the scale strictly positive.
            beta = torch.nn.functional.softplus(self.beta)
            # 6.9077... == ln(1000), so the output at x == 0 is 1 / 1001.
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(torch.nn.Module):
        """Mean network plus Gamma-parameterised inverse-variance networks."""

        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons),
                torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, y.shape[1]))
            self.alph = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons),
                torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, y.shape[1]),
                torch.nn.Softplus())
            self.bet = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons),
                torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, y.shape[1]),
                torch.nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            # s is the output of the translated sigmoid applied to the
            # distance between each input and its closest cluster centre.
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                # Few draws while training, 2000 in eval mode; the variance
                # is the reciprocal of each draw.
                if self.training:
                    samples_var = gamma_dist.rsample(
                        torch.Size([num_draws_train]))
                    x_var = (1.0 / (samples_var + 1e-8))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([2000]))
                    x_var = (1.0 / (samples_var + 1e-8))
                # y_std is looked up at call time; by then it has been
                # rebound below to a tensor on `device`.
                var = (1 - s) * x_var + s * y_std ** 2

            else:
                # Fixed variance used while only the mean is being trained.
                var = 0.05 * torch.ones_like(mean)
            return mean, var

    model = GPNNModel()
    if use_cuda:
        model.cuda()

    # Separate optimisers so that the mean and the variance parameters can be
    # updated in alternation.
    optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
    optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                        model.bet.parameters(),
                                        model.trans.parameters()), lr=1e-4)
    mean_Q, mean_w = gen_Qw(X, mean_psu, mean_ssu, mean_M)

    if X.shape[0] > 100000 and X.shape[1] > 10:
        # For large, wide data sets run gen_Qw on a PCA projection that
        # retains 50% of the variance.
        pca = PCA(n_components=0.5)
        temp = pca.fit_transform(X)
        var_Q, var_w = gen_Qw(temp, var_psu, var_ssu, var_M)
    else:
        var_Q, var_w = gen_Qw(X, var_psu, var_ssu, var_M)

    opt_switch = 1
    mean_w = torch.tensor(mean_w).to(torch.float32).to(device)
    var_w = torch.tensor(var_w).to(torch.float32).to(device)
    model.train()

    X = torch.tensor(X).to(torch.float32).to(device)
    y = torch.tensor(y).to(torch.float32).to(device)
    batches = batchify(X, y, batch_size=args.batch_size, shuffel=args.shuffel)

    # validation data and performance measures
    ll_list = []
    mae_list = []
    rmse_list = []
    x_eval = torch.tensor(Xval).to(torch.float32).to(device)
    y_eval = torch.tensor(yval).to(torch.float32).to(device)
    y_mean = torch.tensor(y_mean).to(torch.float32).to(device)
    y_std = torch.tensor(y_std).to(torch.float32).to(device)

    it = 0
    its_per_epoch = int(np.ceil(X.shape[0] / args.batch_size))
    while it < args.iters:
        switch = 1.0 if it > args.iters / 2.0 else 0.0

        # NOTE(review): this increments on every iteration that is NOT a
        # multiple of 11; `it % 11 == 0` may have been intended - confirm
        # before changing, since it alters the optimisation schedule.
        if it % 11:
            opt_switch = opt_switch + 1  # change between var and mean optimizer
        with torch.autograd.detect_anomaly():
            data, label = next(batches)
            if not switch:
                # Warm-up: update the mean network with the fixed variance.
                optimizer.zero_grad()
                m, v = model(data, switch)
                loss = -t_likelihood(label, m, v.unsqueeze(0))
                loss.backward()
                optimizer.step()
            else:
                if opt_switch % 2 == 0:
                    optimizer.zero_grad()
                    batch = locality_sampler2(mean_psu, mean_ssu, mean_Q,
                                              mean_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch], m, v, mean_w[batch])
                    loss.backward()
                    optimizer.step()
                else:
                    optimizer2.zero_grad()
                    batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch], m, v, var_w[batch])
                    loss.backward()
                    optimizer2.step()

        # test on validation set once per epoch
        if it % its_per_epoch == 0:
            model.eval()
            with torch.no_grad():
                m, v = model(x_eval, switch)
            # Undo the target normalisation before scoring.
            m = m * y_std + y_mean
            v = v * y_std ** 2
            if switch == 0:
                # The constant variance has no leading draw dimension yet.
                ll = t_likelihood(y_eval, m, v.unsqueeze(0)).item()
            else:
                ll = t_likelihood(y_eval, m, v).item()

            # log validation performance after we are stable in the second optimization regime
            if it > args.iters * 0.60:
                ll_list.append(ll)
                error = torch.norm(y_eval - m, p=2, dim=1)
                mae_list.append(error.abs().mean().item())
                rmse_list.append((error ** 2).mean().sqrt().item())

                # early stop check
                if len(ll_list) - np.argmax(ll_list) > 50:
                    it = args.iters
                    print('Early Stop!')

            # Always return to training mode. Previously this only happened
            # inside the logging branch above, so every validation before the
            # 60% mark left the model in eval mode (2000 Gamma draws per
            # training step once `switch` turned on).
            model.train()

        it += 1

    # get best LL
    i_best = np.argmax(ll_list)

    # evaluate model moments
    model.eval()
    with torch.no_grad():
        m, v = model(x_eval, 1.0)
        m = m * y_std + y_mean
        v = v * y_std ** 2

    return ll_list[i_best], rmse_list[i_best], m.cpu().numpy(), v.cpu().numpy()
Example #3
0
    def fit(self,
            Xtrain,
            x_test,
            Xplot,
            n_iters=100,
            lr=1e-3,
            batch_size=250,
            n_clusters=50,
            beta=1.0,
            its_per_epoch=2500):
        """Train the model in two phases and keep the best test-set snapshot.

        For the first half of the iterations (``self.switch == 0``) the
        encoder and mean decoder are trained on mini-batches from
        ``batchify`` by maximising the ELBO returned by ``self.forward``.
        When the second half starts, K-means centres of the encoded training
        data are stored in ``self.C`` (once), batches come from
        ``local_batchify`` and, depending on the parity of
        ``self.opt_switch``, either the encoder/mean-decoder optimiser or the
        ``alpha``/``beta`` optimiser takes a step on a ``t_likelihood``-based
        objective.

        Every ``its_per_epoch`` iterations the model is evaluated on
        ``x_test``; after 60% of the iterations the best log-likelihood and
        the statistics derived at that point are remembered, and training
        stops early once 50 evaluations pass without improvement.

        Relies on module-level names not visible here: ``chain``,
        ``batchify``, ``local_batchify``, ``tqdm``, ``KMeans``, ``D``,
        ``t_likelihood``, ``np`` and ``torch``.

        Args:
            Xtrain: Training data (array-like accepted by ``torch.tensor``).
            x_test: Held-out data used for the periodic evaluation.
            Xplot: Data for which mean, variance and a sample are stored
                whenever a new best log-likelihood is reached.
            n_iters: Total number of training iterations.
            lr: Learning rate shared by all three Adam optimisers.
            batch_size: Mini-batch size for training and evaluation.
            n_clusters: Number of K-means clusters fitted in latent space.
            beta: Upper bound of the annealing weight passed to
                ``self.forward`` (presumably the KL weight - confirm against
                ``forward``).
            its_per_epoch: Number of iterations between two evaluations.

        Returns:
            Tuple ``(ll_best, rmse_best, h_best, mean_best, var_best,
            sample_best)``. If no evaluation ever qualified as a new best
            (for example with the default ``n_iters=100`` and
            ``its_per_epoch=2500``), ``ll_best`` is ``-inf`` and the
            remaining entries are ``None``.
        """
        self.train()
        if self.device == torch.device('cuda'):
            self.cuda()

        # optimizer1 and optimizer2 cover the same parameters but keep
        # independent Adam state for the two training phases.
        optimizer1 = torch.optim.Adam(
            chain(self.enc.parameters(),
                  self.dec_mu.parameters()),
            lr=lr)
        optimizer2 = torch.optim.Adam(
            chain(self.enc.parameters(),
                  self.dec_mu.parameters()),
            lr=lr)
        optimizer3 = torch.optim.Adam(chain(self.alpha.parameters(),
                                            self.beta.parameters()),
                                      lr=lr)

        it = 0
        batches = batchify(Xtrain, batch_size=batch_size, shuffel=True)
        local_batches = local_batchify(Xtrain)
        progressBar = tqdm(desc='training', total=n_iters, unit='iter')
        x_plot = torch.tensor(Xplot).to(torch.float32).to(self.device)
        ll_best = -np.inf
        epoch_best = np.inf
        # Bind the "best" results up front so the final return cannot raise
        # UnboundLocalError when no evaluation qualifies as a new best.
        rmse_best = h_best = mean_best = var_best = sample_best = None
        while it < n_iters:
            self.switch = 1.0 if it > n_iters / 2 else 0.0
            # Linear ramp from 0 to `beta` over the first half of training.
            anneling = np.minimum(1, it / (n_iters / 2)) * beta
            if self.switch and not hasattr(self, "C"):
                # First iteration of the second phase: cluster the encoded
                # training data and keep the centres on the model.
                kmeans = KMeans(n_clusters=n_clusters)
                kmeans.fit(
                    self.encoder(torch.tensor(Xtrain).to(
                        self.device))[0].detach().cpu().numpy())
                self.C = torch.tensor(kmeans.cluster_centers_,
                                      dtype=torch.float32).to(self.device)

            if not self.switch:
                x = next(batches)
                x = torch.tensor(x).to(torch.float32).to(self.device)

                optimizer1.zero_grad()
                elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(
                    x, anneling)
                (-elbo).backward()
                optimizer1.step()
            else:
                x, mean_w, var_w = next(local_batches)
                x = torch.tensor(x).to(torch.float32).to(self.device)
                mean_w = torch.tensor(mean_w).to(torch.float32).to(self.device)
                var_w = torch.tensor(var_w).to(torch.float32).to(self.device)

                elbo, logpx, kl, x_mu, x_var, z, z_mu, z_var = self.forward(
                    x, anneling)
                if self.opt_switch % 2 == 0:
                    # Mean step, weighted with the mean sampling weights.
                    optimizer2.zero_grad()
                    elbo = t_likelihood(x, x_mu, x_var,
                                        mean_w) / Xtrain.shape[0] - kl.mean()
                    (-elbo).backward()
                    optimizer2.step()
                else:
                    # Variance step. Uses the variance sampling weights; the
                    # previous code passed mean_w here and never used var_w.
                    optimizer3.zero_grad()
                    elbo = t_likelihood(x, x_mu, x_var,
                                        var_w) / Xtrain.shape[0] - kl.mean()
                    (-elbo).backward()
                    optimizer3.step()

                # Recompute the quantities after the update for reporting.
                elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(
                    x, anneling)

            progressBar.update()
            progressBar.set_postfix({
                'elbo': (-elbo).item(),
                'x_var': x_var.mean().item(),
                'anneling': anneling
            })

            if it % its_per_epoch == 0:
                self.eval()
                with torch.no_grad():
                    ll = []
                    mean_x = []
                    var_x = []
                    # Evaluate the test set in chunks of `batch_size` rows.
                    for i in range(int(np.ceil(x_test.shape[0] / batch_size))):
                        i_start = i * batch_size
                        i_end = min((i + 1) * batch_size, x_test.shape[0])
                        x = torch.tensor(x_test[i_start:i_end]).to(
                            torch.float32).to(self.device)
                        _, l, _, m, v, _, _, _ = self.forward(x, anneling)
                        ll.append(l.cpu().numpy())
                        mean_x.append(m.cpu().numpy())
                        var_x.append(v.cpu().numpy())
                    ll = np.mean(np.concatenate(ll))
                    mean_x = np.concatenate(mean_x, axis=0)
                    var_x = np.concatenate(var_x, axis=0)
                    print('\nEpoch {:d}/{:d}: LL = {:.4f}'.format(
                        it // its_per_epoch, n_iters // its_per_epoch, ll))
                    if ll > ll_best and it > n_iters * 0.6:
                        ll_best = ll
                        px = D.Independent(
                            D.Normal(
                                torch.tensor(mean_x).to(torch.float32).to(
                                    self.device),
                                torch.tensor(var_x**0.5).to(torch.float32).to(
                                    self.device)), 1)
                        # RMSE is measured against a sample from the
                        # predictive distribution, not against its mean.
                        rmse_best = np.sqrt(
                            np.mean((x_test - px.sample().cpu().numpy())**2))
                        h_best = px.entropy().mean().item()
                        epoch_best = it // its_per_epoch
                        _, _, _, mean_x, var_x, _, _, _ = self.forward(
                            x_plot, anneling)
                        px = D.Independent(D.Normal(mean_x, var_x**0.5), 1)
                        mean_best = mean_x.cpu().numpy()
                        var_best = var_x.cpu().numpy()
                        sample_best = px.sample().cpu().numpy()
                    elif self.switch and it // its_per_epoch > epoch_best + 50:
                        print('Early Stop!')
                        break
                self.train()
            it += 1

        progressBar.close()
        return ll_best, rmse_best, h_best, mean_best, var_best, sample_best
def john(args, X, y, Xval, yval):
    """Train a mean/variance network and score it on a validation set.

    Training runs for ``args.iters`` iterations. During the first half only
    the mean network is updated on mini-batches from ``batchify`` with the
    variance fixed at 0.05. During the second half ``opt_switch`` selects,
    per iteration, either the mean optimiser or the variance optimiser, both
    fed by ``locality_sampler2``. A monitoring loss on the current mini-batch
    is printed every 500 iterations.

    Relies on module-level names not imported here: ``np``, ``torch``,
    ``normalize_y``, ``batchify`` and ``t_likelihood``.

    Args:
        args: Namespace providing ``dataset``, ``n_clusters`` (clamped in
            place to the number of training rows), ``cuda``, ``batch_size``,
            ``shuffel`` and ``iters``.
        X: Training inputs, indexable as a 2-D array (``X.shape[1]`` is the
            input width of the networks).
        y: Training targets; normalised with ``normalize_y`` and reshaped to
            a single column where they enter the likelihood.
        Xval: Validation inputs.
        yval: Validation targets, compared in the original (un-normalised)
            scale.

    Returns:
        Tuple ``(log_px, rmse)`` of Python floats: the mean of
        ``t_likelihood(...) / Xval.shape[0]`` and the root mean squared error
        of the predicted mean, both on the validation set.
    """
    from sklearn.cluster import KMeans
    from utils import dist
    from itertools import chain
    from torch import distributions as D
    from locality_sampler import gen_Qw, locality_sampler2
    from sklearn.decomposition import PCA

    if args.dataset == 'protein' or args.dataset == 'year_prediction':
        n_neurons = 100
    else:
        n_neurons = 50
    args.n_clusters = min(args.n_clusters, X.shape[0])

    y, y_mean, y_std = normalize_y(y)

    # Settings handed to gen_Qw / locality_sampler2 for the mean network.
    mean_psu = 1
    mean_ssu = 40
    mean_M = 50

    # Settings handed to gen_Qw / locality_sampler2 for the variance network.
    var_psu = 2
    var_ssu = 10
    var_M = 10

    num_draws_train = 20

    use_cuda = torch.cuda.is_available() and args.cuda
    device = torch.device('cuda') if use_cuda else torch.device('cpu')

    kmeans = KMeans(n_clusters=args.n_clusters)
    if args.dataset != 'year_prediction':
        kmeans.fit(np.concatenate([X], axis=0))
    else:
        # Fit on a random subset of 10000 rows (drawn with replacement).
        kmeans.fit(X[np.random.randint(0, X.shape[0], size=(10000))])
    # Build the centre tensor once and move it; wrapping an existing tensor
    # in torch.tensor() again only triggers a copy-construct warning.
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(device)

    class translatedSigmoid(torch.nn.Module):
        """Sigmoid with a learnable scale and a shift tied to that scale."""

        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = torch.nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            # softplus keeps the scale strictly positive.
            beta = torch.nn.functional.softplus(self.beta)
            # 6.9077... == ln(1000), so the output at x == 0 is 1 / 1001.
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(torch.nn.Module):
        """Mean network plus Gamma-parameterised inverse-variance networks."""

        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1))
            self.alph = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1), torch.nn.Softplus())
            self.bet = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1), torch.nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            # s is the output of the translated sigmoid applied to the
            # distance between each input and its closest cluster centre.
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                # Few draws while training, 1000 in eval mode; the variance
                # is the reciprocal of each draw.
                if self.training:
                    samples_var = gamma_dist.rsample(
                        torch.Size([num_draws_train]))
                    x_var = (1.0 / (samples_var + 1e-8))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([1000]))
                    x_var = (1.0 / (samples_var + 1e-8))
                # Blend the sampled variance with y_std**2 according to s.
                var = (1 - s) * x_var + s * torch.tensor(
                    [y_std**2], device=x.device)  # HYPERPARAMETER

            else:
                # Fixed variance used while only the mean is being trained.
                var = 0.05 * torch.ones_like(mean)
            return mean, var

    model = GPNNModel()
    if use_cuda:
        model.cuda()

    # Separate optimisers so that the mean and the variance parameters can be
    # updated in alternation.
    optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
    optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                        model.bet.parameters(),
                                        model.trans.parameters()),
                                  lr=1e-4)
    mean_Q, mean_w = gen_Qw(X, mean_psu, mean_ssu, mean_M)

    if X.shape[0] > 100000 and X.shape[1] > 10:
        # For large, wide data sets run gen_Qw on a PCA projection that
        # retains 50% of the variance.
        pca = PCA(n_components=0.5)
        temp = pca.fit_transform(X)
        var_Q, var_w = gen_Qw(temp, var_psu, var_ssu, var_M)
    else:
        var_Q, var_w = gen_Qw(X, var_psu, var_ssu, var_M)

    opt_switch = 1
    mean_w = torch.tensor(mean_w).to(torch.float32).to(device)
    var_w = torch.tensor(var_w).to(torch.float32).to(device)
    model.train()

    X = torch.tensor(X).to(torch.float32).to(device)
    y = torch.tensor(y).to(torch.float32).to(device)
    batches = batchify(X, y, batch_size=args.batch_size, shuffel=args.shuffel)

    it = 0
    while it < args.iters:
        switch = 1.0 if it > args.iters / 2.0 else 0.0

        # NOTE(review): this increments on every iteration that is NOT a
        # multiple of 11; `it % 11 == 0` may have been intended - confirm
        # before changing, since it alters the optimisation schedule.
        if it % 11:
            opt_switch = opt_switch + 1  # change between var and mean optimizer
        with torch.autograd.detect_anomaly():
            data, label = next(batches)
            if not switch:
                # Warm-up: update the mean network with the fixed variance.
                optimizer.zero_grad()
                m, v = model(data, switch)
                loss = -t_likelihood(label.reshape(-1, 1), m,
                                     v.reshape(1, -1, 1)) / X.shape[0]
                loss.backward()
                optimizer.step()
            else:
                if opt_switch % 2 == 0:
                    optimizer.zero_grad()
                    batch = locality_sampler2(mean_psu, mean_ssu, mean_Q,
                                              mean_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch].reshape(-1, 1), m, v,
                                         mean_w[batch]) / X.shape[0]
                    loss.backward()
                    optimizer.step()
                else:
                    optimizer2.zero_grad()
                    batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch].reshape(-1, 1), m, v,
                                         var_w[batch]) / X.shape[0]
                    loss.backward()
                    optimizer2.step()

        if it % 500 == 0:
            # Monitoring only: the value is printed and never back-propagated,
            # so do not build an autograd graph for it.
            with torch.no_grad():
                m, v = model(data, switch)
                loss = -(-v.log() / 2 -
                         ((m.flatten() - label)**2).reshape(1, -1, 1) /
                         (2 * v)).mean()
            print('Iter {0}/{1}, Loss {2}'.format(it, args.iters, loss.item()))
        it += 1

    model.eval()

    data = torch.tensor(Xval).to(torch.float32).to(device)
    label = torch.tensor(yval).to(torch.float32).to(device)
    with torch.no_grad():
        # Uses the `switch` value of the last training iteration.
        m, v = model(data, switch)
    # Undo the target normalisation before scoring.
    m = m * y_std + y_mean
    v = v * y_std**2
    log_px = t_likelihood(label.reshape(-1, 1), m, v) / Xval.shape[0]  # check
    rmse = ((label - m.flatten())**2).mean().sqrt()
    return log_px.mean().item(), rmse.item()
Example #5
0
def jnlsmv(args, X, y, Xval, yval):
    """Train a mean/variance network on ``(X, y)`` and score it on ``(Xval, yval)``.

    The targets are normalized with ``normalize_y``. K-means centres fitted on
    ``X`` feed a distance-based gate that blends the learned inverse-gamma
    variance with a fixed fallback variance. Training alternates between an
    optimizer for the mean network and one for the variance networks.

    Args:
        args: Options object. Reads ``dataset``, ``n_clusters``, ``cuda``,
            ``iters``, ``batch_size`` and ``shuffel``. ``args.n_clusters`` is
            clamped in place to at most ``X.shape[0]``.
        X: Training inputs; ``X.shape[1]`` is used as the input width.
        y: Training targets.
        Xval: Validation inputs.
        yval: Validation targets (compared against de-normalized predictions).

    Returns:
        ``(log_px, rmse)`` as Python floats: the mean of ``t_likelihood`` on
        the de-normalized validation predictions and the validation RMSE.
    """
    # `chain` is imported locally so the function does not depend on a
    # module-level import being present.
    from itertools import chain
    from sklearn.cluster import KMeans
    from utils import dist
    from torch import distributions as D

    if args.dataset == 'protein' or args.dataset == 'year_prediction':
        n_neurons = 100
    else:
        n_neurons = 50
    args.n_clusters = min(args.n_clusters, X.shape[0])

    y, y_mean, y_std = normalize_y(y)

    # Resolve the device once; everything below is moved onto it.
    use_cuda = torch.cuda.is_available() and args.cuda
    device = torch.device('cuda' if use_cuda else 'cpu')

    kmeans = KMeans(n_clusters=args.n_clusters)
    kmeans.fit(np.concatenate([X], axis=0))
    # Build the centre tensor once and move it. Re-wrapping an existing tensor
    # in torch.tensor() makes a needless copy and emits a UserWarning.
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(device)

    class translatedSigmoid(torch.nn.Module):
        """Sigmoid shifted by a learnable, softplus-constrained scale."""

        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = torch.nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            beta = torch.nn.functional.softplus(self.beta)
            # With this shift the output at x == 0 is sigmoid(-6.9077...),
            # i.e. roughly 1e-3, independent of beta.
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(torch.nn.Module):
        """Mean network plus two positive-output networks for the variance."""

        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.Sigmoid(),
                torch.nn.Linear(n_neurons, 1))
            self.alph = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1), torch.nn.Softplus())
            self.bet = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1), torch.nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            # Gate value from the distance to the closest K-means centre.
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                if self.training:
                    # Training keeps the 20 individual variance samples.
                    samples_var = gamma_dist.rsample(torch.Size([20]))
                    x_var = (1.0 / (samples_var + 1e-8))
                else:
                    # Eval mode averages 1000 samples into one variance.
                    samples_var = gamma_dist.rsample(torch.Size([1000]))
                    x_var = (1.0 / (samples_var + 1e-8)).mean(dim=0)
                var = (1 - s) * x_var + s * torch.tensor(
                    [3.5**2], device=x.device)  # HYPERPARAMETER

            else:
                var = torch.tensor([0.05], device=x.device)
            return mean, var

    model = GPNNModel()
    model.to(device)

    optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
    optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                        model.bet.parameters(),
                                        model.trans.parameters()),
                                  lr=1e-4)

    it = 0
    opt_switch = 0
    # Bound before the loop so the validation pass below still works when
    # args.iters == 0.
    switch = 0.0
    progressBar = tqdm(desc='Training nn', total=args.iters, unit='iter')
    batches = local_batchify(X,
                             y,
                             batch_size=args.batch_size,
                             shuffel=args.shuffel)

    while it < args.iters:
        # First half: constant variance, mean optimizer only (opt_switch
        # stays 0). Second half: opt_switch is bumped whenever it % 11 == 0,
        # so the active optimizer flips every 11 iterations.
        switch = 1.0 if it > args.iters / 2 else 0.0
        if it % 11 == 0 and switch:
            opt_switch = opt_switch + 1  # change between var and mean optimizer

        data, label, mean_w, var_w = next(batches)
        data = torch.tensor(data).to(torch.float32).to(device)
        label = torch.tensor(label).to(torch.float32).to(device)
        mean_w = torch.tensor(mean_w).to(torch.float32).to(device)
        var_w = torch.tensor(var_w).to(torch.float32).to(device)

        # Each optimizer is paired with its own set of sample weights.
        if opt_switch % 2 == 0:
            active_optimizer, weights = optimizer, mean_w
        else:
            active_optimizer, weights = optimizer2, var_w

        active_optimizer.zero_grad()
        m, v = model(data, switch)
        loss = -t_likelihood(label.reshape(-1, 1), m, v,
                             weights) / X.shape[0]
        loss.backward()
        active_optimizer.step()

        it += 1
        progressBar.update()
        progressBar.set_postfix({'loss': loss.item()})
    progressBar.close()

    data = torch.tensor(Xval).to(torch.float32).to(device)
    label = torch.tensor(yval).to(torch.float32).to(device)
    # NOTE(review): the model is left in training mode here, as in the
    # original code, so the 20-sample (non-averaged) variance branch is used.
    # Confirm against t_likelihood before switching to model.eval().
    with torch.no_grad():  # scoring only; no autograd graph is needed
        m, v = model(data, switch)
        # Undo the target normalization before scoring.
        m = m * y_std + y_mean
        v = v * y_std**2
        log_px = t_likelihood(label.reshape(-1, 1), m, v)
        rmse = ((label - m.flatten())**2).mean().sqrt()
    return log_px.mean().item(), rmse.item()
Example #6
0
    def fit(self,
            Xtrain,
            n_iters=100,
            lr=1e-3,
            batch_size=250,
            n_clusters=50,
            beta=1.0):
        """Train the model in two phases and return the logged training curves.

        While ``it <= n_iters / 2`` (``self.switch == 0.0``) plain batches are
        used and the ELBO returned by ``self.forward`` is maximized. After
        that, locality-weighted batches are used and the ELBO is rebuilt from
        ``t_likelihood`` with the per-sample weights. K-means centres of the
        encoded training set are stored in ``self.C`` and refreshed whenever
        ``it % 1000 == 0`` in the second phase.

        Args:
            Xtrain: Training data, indexable and convertible by ``torch.tensor``.
            n_iters: Number of optimization iterations.
            lr: Learning rate shared by all three Adam optimizers.
            batch_size: Batch size for the first-phase batches.
            n_clusters: Number of K-means clusters for ``self.C``.
            beta: Final KL weight; it is annealed linearly from 0 over the
                first half of training.

        Returns:
            ``(loss, var)``. ``loss`` is a list of three per-iteration lists
            (negative ELBO, mean log-likelihood, mean KL). ``var`` is the
            per-iteration mean of ``x_var``.
        """
        self.train()
        if self.device == torch.device('cuda'):
            self.cuda()

        # optimizer1 and optimizer2 cover the same parameters but keep
        # separate Adam state for the two training phases.
        optimizer1 = torch.optim.Adam(chain(self.enc_mu.parameters(),
                                            self.enc_var.parameters(),
                                            self.dec_mu.parameters()),
                                      lr=lr)
        optimizer2 = torch.optim.Adam(chain(self.enc_mu.parameters(),
                                            self.enc_var.parameters(),
                                            self.dec_mu.parameters()),
                                      lr=lr)
        optimizer3 = torch.optim.Adam(chain(self.alpha.parameters(),
                                            self.beta.parameters()),
                                      lr=lr)

        it = 0
        batches = batchify(Xtrain, batch_size=batch_size, shuffel=True)
        local_batches = local_batchify(Xtrain)
        progressBar = tqdm(desc='training', total=n_iters, unit='iter')
        loss, var = [[], [], []], []
        while it < n_iters:
            self.switch = 1.0 if it > n_iters / 2 else 0.0
            anneling = np.minimum(1, it / (n_iters / 2)) * beta

            if self.switch and (it % 1000 == 0 or not hasattr(self, "C")):
                # Re-cluster the encoded training set. No gradients are needed
                # for this pass, and the input is cast to float32 like every
                # other tensor fed to the model in this method.
                with torch.no_grad():
                    encoded = self.encoder(
                        torch.tensor(Xtrain).to(torch.float32).to(
                            self.device))[0]
                kmeans = KMeans(n_clusters=n_clusters)
                kmeans.fit(encoded.detach().cpu().numpy())
                self.C = torch.tensor(kmeans.cluster_centers_,
                                      dtype=torch.float32).to(self.device)

            if not self.switch:
                x = next(batches)
                x = torch.tensor(x).to(torch.float32).to(self.device)

                optimizer1.zero_grad()
                elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(
                    x, anneling)
                (-elbo).backward()
                optimizer1.step()
            else:
                x, mean_w, var_w = next(local_batches)
                x = torch.tensor(x).to(torch.float32).to(self.device)
                mean_w = torch.tensor(mean_w).to(torch.float32).to(self.device)
                var_w = torch.tensor(var_w).to(torch.float32).to(self.device)

                _, _, kl, x_mu, x_var, z, z_mu, z_var = self.forward(
                    x, anneling)

                # self.opt_switch is not modified in this method, so whichever
                # value it holds on entry selects the branch for the whole
                # second phase. The mean optimizer uses the mean weights; the
                # variance optimizer uses the variance weights (previously it
                # reused mean_w and var_w was never read).
                if self.opt_switch % 2 == 0:
                    active_optimizer, weights = optimizer2, mean_w
                else:
                    active_optimizer, weights = optimizer3, var_w

                active_optimizer.zero_grad()
                elbo = t_likelihood(x, x_mu, x_var,
                                    weights) / Xtrain.shape[0] - kl.mean()
                (-elbo).backward()
                active_optimizer.step()

                # Second forward pass is for logging only, so skip autograd.
                with torch.no_grad():
                    (elbo, log_px, kl, x_mu, x_var, z, z_mu,
                     z_var) = self.forward(x, anneling)

            progressBar.update()
            progressBar.set_postfix({
                'elbo': (-elbo).item(),
                'x_var': x_var.mean().item(),
                'anneling': anneling
            })
            loss[0].append((-elbo).item())
            loss[1].append(log_px.mean().item())
            loss[2].append(kl.mean().item())
            var.append(x_var.mean().item())
            it += 1

            if it % 2500 == 0:
                self.save_something('it' + str(it), Xtrain[::20])

        progressBar.close()
        return loss, var
Example #7
0
    def fit(self,
            Xtrain,
            x_test,
            Xplot,
            n_iters=100,
            lr=1e-3,
            batch_size=250,
            n_clusters=50,
            beta=1.0,
            its_per_epoch=2500):
        """Train the model and keep the metrics of the best test evaluation.

        While ``it <= n_iters / 2`` (``self.switch == 0.0``) plain batches are
        used and the ELBO from ``self.forward`` is maximized. After that,
        locality-weighted batches are used with ``t_likelihood``. In the second
        phase, whenever ``it % its_per_epoch == 0`` the model is evaluated on
        ``x_test``. If the mean log-likelihood is a new best and
        ``it > 0.6 * n_iters``, metrics and a reconstruction of ``Xplot`` are
        recorded. Training stops early once more than 50 evaluation epochs
        pass after the best one without improvement.

        Args:
            Xtrain: Training data.
            x_test: Held-out data used for the periodic evaluation.
            Xplot: Held-out data reconstructed when a new best is found.
            n_iters: Maximum number of optimization iterations.
            lr: Learning rate shared by all three Adam optimizers.
            batch_size: Batch size for first-phase training and evaluation.
            n_clusters: Number of K-means clusters for ``self.C``.
            beta: Final KL weight; annealed linearly from 0 over the first
                half of training.
            its_per_epoch: Number of iterations between evaluations.

        Returns:
            ``(metrics, reconstruction)`` dictionaries from the best
            evaluation, or ``(None, None)`` if no evaluation qualified.
        """

        def _variance_mixture(mean, variance):
            # Uniform mixture of Normals: one component per slice of
            # `variance` along its leading axis, all sharing `mean`.
            components = []
            for v in tf.unstack(variance):
                normal = tfp.distributions.Normal(loc=mean, scale=v**0.5)
                components.append(
                    tfp.distributions.Independent(
                        normal, reinterpreted_batch_ndims=1))
            cat = tfp.distributions.Categorical(
                logits=tf.ones((variance.shape[1], variance.shape[0])))
            return tfp.distributions.Mixture(cat=cat, components=components)

        def _bias(parts):
            # Mean of the concatenated per-batch residuals.
            return tf.reduce_mean(tf.concat(parts, axis=0)).numpy()

        def _rmse(parts):
            # Root mean square of the concatenated per-batch residuals.
            return tf.sqrt(tf.reduce_mean(tf.concat(parts,
                                                    axis=0)**2)).numpy()

        self.train()
        if self.device == torch.device('cuda'):
            self.cuda()

        # optimizer1 and optimizer2 cover the same parameters but keep
        # separate Adam state for the two training phases.
        optimizer1 = torch.optim.Adam(
            chain(self.enc.parameters(), self.dec_mu.parameters()), lr=lr)
        optimizer2 = torch.optim.Adam(
            chain(self.enc.parameters(), self.dec_mu.parameters()), lr=lr)
        optimizer3 = torch.optim.Adam(chain(self.alpha.parameters(),
                                            self.beta.parameters()),
                                      lr=lr)

        it = 0
        batches = batchify(Xtrain, batch_size=batch_size, shuffel=True)
        local_batches = local_batchify(Xtrain)
        progressBar = tqdm(desc='training', total=n_iters, unit='iter')
        x_plot = torch.tensor(Xplot).to(torch.float32).to(self.device)
        ll_best = -np.inf
        epoch_best = np.inf
        # Defined up front so the final return is valid even when no
        # evaluation ever records a new best.
        metrics = None
        reconstruction = None
        while it < n_iters:
            self.switch = 1.0 if it > n_iters / 2 else 0.0
            anneling = np.minimum(1, it / (n_iters / 2)) * beta

            if self.switch and not hasattr(self, "C"):
                # Cluster the encoded training set once. No gradients are
                # needed, and the input is cast to float32 like every other
                # tensor fed to the model in this method.
                with torch.no_grad():
                    encoded = self.encoder(
                        torch.tensor(Xtrain).to(torch.float32).to(
                            self.device))[0]
                kmeans = KMeans(n_clusters=n_clusters)
                kmeans.fit(encoded.detach().cpu().numpy())
                self.C = torch.tensor(kmeans.cluster_centers_,
                                      dtype=torch.float32).to(self.device)

            if not self.switch:
                x = next(batches)
                x = torch.tensor(x).to(torch.float32).to(self.device)

                optimizer1.zero_grad()
                elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(
                    x, anneling)
                (-elbo).backward()
                optimizer1.step()
            else:
                x, mean_w, var_w = next(local_batches)
                x = torch.tensor(x).to(torch.float32).to(self.device)
                mean_w = torch.tensor(mean_w).to(torch.float32).to(self.device)
                var_w = torch.tensor(var_w).to(torch.float32).to(self.device)

                _, _, kl, x_mu, x_var, z, z_mu, z_var = self.forward(
                    x, anneling)

                # self.opt_switch is not modified in this method, so whichever
                # value it holds on entry selects the branch for the whole
                # second phase.
                if self.opt_switch % 2 == 0:
                    active_optimizer, weights = optimizer2, mean_w
                else:
                    active_optimizer, weights = optimizer3, var_w

                active_optimizer.zero_grad()
                elbo = t_likelihood(x, x_mu, x_var, weights) - kl.mean()
                (-elbo).backward()
                active_optimizer.step()

                # Second forward pass is for logging only, so skip autograd.
                with torch.no_grad():
                    (elbo, log_px, kl, x_mu, x_var, z, z_mu,
                     z_var) = self.forward(x, anneling)

            progressBar.update()
            progressBar.set_postfix({
                'elbo': (-elbo).item(),
                'x_var': x_var.mean().item(),
                'anneling': anneling
            })

            # epoch complete and in second phase of training (i.e. fitting variance)
            if it % its_per_epoch == 0 and self.switch:
                early_stop = False
                self.eval()
                with torch.no_grad():

                    # per-batch result containers
                    ll_parts = []
                    elbo_parts = []
                    mean_residuals = []
                    var_residuals = []
                    sample_residuals = []

                    # loop over test batches
                    n_test = x_test.shape[0]
                    for i in range(int(np.ceil(n_test / batch_size))):
                        i_start = i * batch_size
                        i_end = min((i + 1) * batch_size, n_test)
                        x_batch = torch.tensor(x_test[i_start:i_end]).to(
                            torch.float32).to(self.device)
                        # Use the KL of this test batch for the test ELBO,
                        # not the KL left over from the last training batch.
                        _, _, kl_test, mu_x, sigma2_x, _, _, _ = self.forward(
                            x_batch, anneling)
                        elbo_test = (t_likelihood(x_batch, mu_x, sigma2_x) -
                                     kl_test.mean())

                        # p(x|x): uniform mixture over the variance samples
                        px_x = _variance_mixture(mu_x.cpu().numpy(),
                                                 sigma2_x.cpu().numpy())

                        # append results
                        x_np = x_batch.cpu().numpy()
                        elbo_parts.append(elbo_test.cpu().numpy())
                        ll_parts.append(px_x.log_prob(x_np))
                        mean_residuals.append(px_x.mean() - x_np)
                        var_residuals.append(px_x.variance() -
                                             mean_residuals[-1]**2)
                        sample_residuals.append(px_x.sample() - x_np)

                    # if mean likelihood is new best
                    ll = tf.reduce_mean(tf.concat(ll_parts, axis=0)).numpy()
                    if ll > ll_best and it > n_iters * 0.6:

                        # Record the best score and the epoch it occurred in.
                        # The early-stop check below depends on epoch_best
                        # being updated here.
                        ll_best = ll
                        epoch_best = it // its_per_epoch

                        metrics = {
                            'LL': ll_best,
                            'ELBO': _bias(elbo_parts),
                            'Best Epoch': it // its_per_epoch,
                            'Mean Bias': _bias(mean_residuals),
                            'Mean RMSE': _rmse(mean_residuals),
                            'Var Bias': _bias(var_residuals),
                            'Var RMSE': _rmse(var_residuals),
                            'Sample Bias': _bias(sample_residuals),
                            'Sample RMSE': _rmse(sample_residuals)
                        }

                        # get p(x|x) for the held-out plotting data
                        _, _, _, mu_x, sigma2_x, _, _, _ = self.forward(
                            x_plot, anneling)
                        px_x = _variance_mixture(mu_x.cpu().numpy(),
                                                 sigma2_x.cpu().numpy())

                        # save first two moments and samples for the plotting data
                        reconstruction = {
                            'mean': px_x.mean().numpy(),
                            'std': px_x.stddev().numpy(),
                            'sample': px_x.sample().numpy()
                        }

                    # early stop check
                    elif self.switch and it // its_per_epoch > epoch_best + 50:
                        print('Early Stop!')
                        early_stop = True

                # Restore training mode on every exit path, including the
                # early stop, so the module state does not depend on how the
                # loop ended.
                self.train()
                if early_stop:
                    break
            it += 1

        progressBar.close()
        return metrics, reconstruction