def estimate_divergences(model_hyperparams, lr, criterion, device, batch_size, n_epochs, is_wasserstein):
    input_size, h1_size, h2_size, out_size, out_sigmoid = model_hyperparams
    p_dist = iter(samplers.distribution1(x=0, batch_size=batch_size))

    # train a model for each value of phi
    phi_list = np.linspace(-1, 1, 21)
    jsd_list = []
    for phi in phi_list:
        # create model and optimizer
        model = MLP(input_size, h1_size, h2_size, out_size, out_sigmoid).to(device)
        print(model)
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
        q_dist = iter(samplers.distribution1(x=phi, batch_size=batch_size))
        losses = train(model, p_dist, q_dist, optimizer, criterion, device, n_epochs, is_wasserstein)

        # visualise loss:
        # plt.figure()
        # plt.plot(losses)
        # plt.title('phi = {}'.format(phi))
        # plt.show()

        divergence_estimate = -1 * losses[-1]
        print('At phi = {}, divergence estimate = {}'.format(phi, divergence_estimate))
        jsd_list.append(divergence_estimate)

    plt.figure()
    plt.plot(phi_list, jsd_list, 'o')
    plt.xlabel(r'$\phi$', fontsize=14)
    plt.ylabel('Jensen-Shannon Divergence', fontsize=14)
    plt.show()
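# All of these snippets draw minibatches from samplers.distribution1(x, batch_size).
# For reference, a minimal sketch of such a generator is given below, assuming it
# yields (batch_size, 2) arrays whose first coordinate is the fixed value x and whose
# second coordinate is Uniform(0, 1); the sampler actually shipped with the assignment
# may differ in details, and distribution1_sketch is a hypothetical name.
import numpy as np

def distribution1_sketch(x, batch_size=512):
    # infinite generator: each next() call returns a fresh (batch_size, 2) minibatch
    while True:
        yield np.stack([np.full(batch_size, float(x)),
                        np.random.uniform(0, 1, batch_size)], axis=1)

# usage sketch: batch = next(distribution1_sketch(0.5))  ->  array of shape (512, 2)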
def train_JSD():
    losses = []
    thetas = np.array(range(-10, 11)) / 10
    D_real = next(samplers.distribution1(0, 512))

    for i in range(21):
        if cuda:
            Discriminator = Net().cuda()
        else:
            Discriminator = Net()
        # optimizer = optim.SGD(Discriminator.parameters(), lr=1e-3, momentum=0.9)
        optimizer = optim.Adam(Discriminator.parameters(), lr=1e-3)
        print(thetas[i])

        D_fake = next(samplers.distribution1(thetas[i], 512))
        X = torch.from_numpy(D_real).float()
        Y = torch.from_numpy(D_fake).float()
        if cuda:
            X = X.cuda()
            Y = Y.cuda()

        # training stage
        for e in range(1000):
            O_real = Discriminator(X)
            O_fake = Discriminator(Y)
            optimizer.zero_grad()
            loss = JSD(O_real, O_fake)
            if e % 100 == 0:
                print(-loss.data)
            loss.backward()
            optimizer.step()

        # evaluate the trained discriminator
        O_real = Discriminator(X)
        O_fake = Discriminator(Y)
        loss = JSD(O_real, O_fake)
        print(-loss.data)
        losses.append(-loss.item())
        print('Done...')

    losses = np.array(losses)
    plt.figure()
    plt.scatter(thetas, losses)
    plt.title('Jensen-Shannon Divergence')
    plt.xlabel('Theta')
    plt.ylabel('Divergence')
    plt.savefig('Jensen_Shannon_Divergence.png')
    plt.close()
def problem1_3():
    phis = np.arange(-1, 1.1, 0.1)
    wds = []
    jss = []
    for phi in phis:
        p = distribution1(0, 512)
        q = distribution1(phi, 512)
        func1 = MLP_WD(dim=2)
        func2 = MLP_Disc(dim=2)
        wds.append(func1._train(p, q, epochs=200))
        jss.append(func2._train(p, q, js_obj, epochs=100))

    plt.figure()
    plt.plot(phis, torch.FloatTensor(jss).detach().numpy(), "-sk")
    plt.xlabel(r"$\phi$")
    plt.ylabel("Jensen-Shannon estimate")

    plt.figure()
    plt.plot(phis, torch.FloatTensor(wds).detach().numpy(), "-sk")
    plt.xlabel(r"$\phi$")
    plt.ylabel("Wasserstein distance estimate")
    plt.show()
    return wds, jss
###############################################################################
############################## Optimizer definition ###########################
optimizer_D = torch.optim.Adam(D.parameters(), lr=setting.lr)

###############################################################################
################################ Running on GPU ###############################
cuda_available = torch.cuda.is_available()
# cuda_available = False
if cuda_available:
    D.cuda()
Tensor = torch.cuda.FloatTensor if cuda_available else torch.FloatTensor

###############################################################################
################################ Distribution p ###############################
dist_p = iter(distribution1(0, sample_num))
samples = next(dist_p)
samples_p = Tensor(samples)

D_LOSSES = []
for theta in np.arange(-1., 1.1, 0.1):
    dist_q = iter(distribution1(theta, sample_num))
    samples = next(dist_q)
    samples_q = Tensor(samples)
    batches_done = 0
    wsd = []
    for epoch in range(setting.n_epochs):
        fakes = []
        for i in range(0, sample_num, setting.batch_size):
            up_bnd = i + setting.batch_size
            if up_bnd > sample_num + 1:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--metric', type=str, default='JSD', help="options are 'WS' or 'JSD' or 'ce'")
    # note: argparse's type=bool treats any non-empty string as True, so expose a flag instead
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--setup', type=int, default=3)
    args = parser.parse_args()

    ### setup for question 3
    if args.setup == 3:
        data_points = ([], [])
        dist1 = samplers.distribution1(0, batch_size=256)
        for x in np.arange(-1.0, 1.01, 0.1):
            x = np.around(x, decimals=1)
            print("x = " + str(x))
            dist2 = samplers.distribution1(x, batch_size=256)
            D = train(args.metric, dist1, dist2, args.use_cuda)
            y = estimate(args.metric, dist1, dist2, D, args.use_cuda)
            data_points[0].append(x)
            data_points[1].append(y)
        plt.plot(data_points[0], data_points[1], '.')
        plt.xlim(-1, 1)
        plt.show()

    ### setup for question 4
    ### using the provided script density_estimation.py
        optimizer_T.step()

    Wd = wd_objective(Critic, x_p, y_q)
    penalty = gradient_penalty(Critic, x_p, y_q, lamda)
    Wd = Wd - penalty
    return Critic, Wd


########### Question 1.3 ############
Phi_values = [-1 + 0.1 * i for i in range(21)]
estimated_jsd, estimated_wd = [], []
for Phi in Phi_values:
    dist_p = distribution1(0, batch_size=512)
    dist_q = distribution1(Phi, batch_size=512)
    Discrim, jsd = js_divergence(dist_p, dist_q, m_minibatch=1000)
    estimated_jsd.append(jsd)
    Critic, wd = w_distance(dist_p, dist_q, m_minibatch=1000, lamda=10)
    estimated_wd.append(wd)
    # TO DO
    print(
        f"Phi: {Phi:.2f} estimated JSD: {jsd.item():.6f} estimated WD: {wd.item():.6f}"
    )

plt.figure(figsize=(8, 4))
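# The snippet above calls gradient_penalty(Critic, x_p, y_q, lamda) without defining
# it. A minimal WGAN-GP-style sketch is given here, assuming x_p and y_q are equally
# sized torch batches and Critic returns one score per sample; this is an illustrative
# helper under those assumptions, not necessarily the author's implementation.
import torch

def gradient_penalty(Critic, x_p, y_q, lamda):
    # interpolate between samples of the two distributions
    a = torch.rand(x_p.size(0), 1, device=x_p.device)
    z = (a * x_p + (1 - a) * y_q).requires_grad_(True)
    scores = Critic(z)
    grads = torch.autograd.grad(outputs=scores, inputs=z,
                                grad_outputs=torch.ones_like(scores),
                                create_graph=True, retain_graph=True)[0]
    # penalize deviation of the per-sample gradient norm from 1
    return lamda * ((grads.norm(2, dim=1) - 1) ** 2).mean()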
import WGAN_Final as problem2
import samplers
from matplotlib import pyplot as plt

theta_initial = -1
theta_incremental = 0.1
number_of_models = 21

p_distribution = samplers.distribution1(0, 512)
WD_Values = []
Theta_Values = []
for i in range(number_of_models):
    print("#####################", "Iteration", i + 1, "######################################")
    q_distribution = samplers.distribution1(theta_initial, 512)
    WD = problem2.WGAN(hidden_size=64, mini_batch=512, learning_rate=0.001,
                       num_epochs=1000, print_interval=100)
    WD_Values.append(WD.run_main_loop(p_distribution, q_distribution))
    Theta_Values.append(theta_initial)
    theta_initial = theta_initial + theta_incremental

plt.plot(Theta_Values, WD_Values, label='WD')
plt.legend()
plt.show()
print("Training complete")
num_epochs = 100  # number of training epochs
init_lr = 0.001   # initial learning rate

# the binary cross entropy l(y, D(x)) is: -[y * log(D(x)) + (1 - y) * log(1 - D(x))]
# if we take y to be 0 (distribution q), we optimize -log(1 - D(x))
# if we take y to be 1 (distribution p), we optimize -log(D(x))
# we want to minimize this (which is equivalent to maximizing our objective function)
criterion = nn.BCEWithLogitsLoss()  # binary cross entropy with built-in sigmoid
optimizer = optim.SGD(d.parameters(), lr=init_lr)

for epoch in range(num_epochs):
    # sample minibatches from the two distributions:
    distr1 = sp.distribution1(0, batch_size)
    dist1 = iter(distr1)
    samples1 = np.squeeze(next(dist1)[:, 0])
    t1 = torch.Tensor(samples1).to(device)

    distr3 = sp.distribution3(batch_size)
    dist3 = iter(distr3)
    samples3 = np.squeeze(next(dist3))
    t3 = torch.Tensor(samples3).to(device)

    d.zero_grad()  # gradients to zero

    # gradients on 'real' distribution (the target must match the output shape):
    out_r = d(t1)
    err_r = criterion(out_r, torch.ones_like(out_r))
    err_r.backward()
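# Once a discriminator has been trained with the BCE objective above, the
# Jensen-Shannon divergence can be read off its outputs as
#   log 2 + 0.5 * E_p[log D(x)] + 0.5 * E_q[log(1 - D(y))].
# A minimal sketch, reusing the names d, t1 and t3 from the loop above and assuming
# d outputs raw logits (hence the explicit sigmoid); jsd_estimate is a hypothetical
# helper added here for illustration only.
import numpy as np
import torch

def jsd_estimate(d, t1, t3):
    with torch.no_grad():
        d_p = torch.sigmoid(d(t1))  # discriminator outputs on samples from p
        d_q = torch.sigmoid(d(t3))  # discriminator outputs on samples from q
        return (np.log(2)
                + 0.5 * torch.log(d_p).mean()
                + 0.5 * torch.log(1 - d_q).mean()).item()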
if __name__ == '__main__':
    batch_size = 512
    phi_values = np.arange(-1., 1.1, 0.1)
    print(phi_values)
    best_list = []

    # Problem 1.3 - JSD
    for phi in phi_values:
        d = Discriminator(input_size=2)
        optimizer = torch.optim.SGD(d.parameters(), lr=1e-1)
        criterion = JSDLoss()
        real_dist = iter(samplers.distribution1(x=0, batch_size=batch_size))
        fake_dist = iter(samplers.distribution1(x=phi, batch_size=batch_size))
        best_loss = -float('Inf')
        for batch in range(2500):
            real_samples = torch.as_tensor(next(real_dist), dtype=torch.float32).view(-1, 2)
            fake_samples = torch.as_tensor(next(fake_dist), dtype=torch.float32).view(-1, 2)
            optimizer.zero_grad()
            real_outputs = d(real_samples)
            fake_outputs = d(fake_samples)
            loss = -criterion(real_outputs, fake_outputs)
            print(loss)
            loss.backward()
            optimizer.step()
def plot_functions_estimate(self, epochs, distance_fct):
    global graph
    data_points = []

    # Set the default graph
    graph = tf.get_default_graph()

    # Initialize a model to save default weights
    discriminator = sq()
    discriminator.add(Dense(units=64, activation='relu', input_dim=2))
    discriminator.add(Dense(units=64, activation='relu'))
    discriminator.save_weights('model.h5')

    # Get the appropriate loss function and output layer
    if distance_fct == 'JSD':
        loss_fct = self.JSD_Loss
        discriminator.add(Dense(units=1, activation='sigmoid'))
    elif distance_fct == 'Wasserstein':
        loss_fct = self.Wasserstein_Loss
        discriminator.add(Dense(units=1, activation='linear'))
    else:
        raise ValueError('Unknown loss function')
    discriminator.save_weights('model.h5')

    for i in range(21):
        # Reset the model
        K.clear_session()
        tf.reset_default_graph()
        with graph.as_default():
            # Initialize the current experiment's model
            discriminator = sq()

            # Get the appropriate loss function and architecture
            if distance_fct == 'JSD':
                loss_fct = self.JSD_Loss
                discriminator.add(Dense(units=64, activation='relu', input_dim=2))
                discriminator.add(Dense(units=64, activation='relu'))
                discriminator.add(Dense(units=1, activation='sigmoid'))
            elif distance_fct == 'Wasserstein':
                loss_fct = self.Wasserstein_Loss
                discriminator.add(Dense(units=64, activation='relu', kernel_constraint=max_norm(0.2), input_dim=2))
                discriminator.add(Dense(units=64, kernel_constraint=max_norm(0.2), activation='relu'))
                discriminator.add(Dense(units=1, kernel_constraint=max_norm(0.5), activation='linear'))
            else:
                raise ValueError('Unknown loss function')
            discriminator.load_weights('model.h5')

            if distance_fct == 'Wasserstein':
                discriminator.compile(loss=loss_fct, optimizer=SGD(lr=0.1))
            else:
                discriminator.compile(loss=loss_fct, optimizer=SGD(lr=0.5))

            phi = round(-1.0 + (0.1 * i), 2)
            for _ in range(epochs):
                # Create our distributions
                p_gen = samplers.distribution1(0)
                q_gen = samplers.distribution1(phi)
                p = next(p_gen)
                q = next(q_gen)
                # Make a dummy target for Keras
                y_dummy = np.zeros(512 * 2)
                # Train the model on the current distributions
                discriminator.train_on_batch(np.concatenate((p, q)), y_dummy)

            x = discriminator.get_weights()

            # Create the test distributions
            p_gen = samplers.distribution1(0)
            q_gen = samplers.distribution1(phi)
            p = next(p_gen)
            q = next(q_gen)
            D_x = discriminator.predict(p)
            D_y = discriminator.predict(q)
            if distance_fct == 'JSD':
                data_points.append(self.JSD(D_x, D_y))
            if distance_fct == 'Wasserstein':
                data_points.append(self.Wasserstein(D_x, D_y))

    plt.plot(data_points)
    plt.show()
    return 0
         d(torch.from_numpy(xx)).numpy()**(-1) * N(xx))
plt.plot(xx, N(xx))
plt.clf()

############### import the sampler ``samplers.distribution4''
############### train a discriminator on distribution4 and standard gaussian
############### estimate the density of distribution4
#######--- INSERT YOUR CODE BELOW ---#######

directory = "model/"
num_epochs = 1000

if args.question == 3:
    print("question 3")
    phi = np.linspace(-1, 1, 21)
    x = samplers.distribution1(0)
    values = []
    for i in phi:
        y = samplers.distribution1(i)
        model = Discriminator(2, 50, 512, 0)
        for epoch in range(num_epochs):
            x_batch = torch.from_numpy(next(x))
            y_batch = torch.from_numpy(next(y))
            model.train(x_batch.type(torch.FloatTensor), y_batch.type(torch.FloatTensor), args.loss_type)
        # torch.save(model.state_dict(), os.path.join(directory, 'best_params_' + str(i) + '.pt'))

        x_dist = samplers.distribution1(0, 10000)
        y_dist = samplers.distribution1(i, 10000)
        x_dist_batch = torch.from_numpy(next(x_dist))
        y_dist_batch = torch.from_numpy(next(y_dist))
        x_value = x_dist_batch.type(torch.FloatTensor)
def get_samples(phi: float):
    p = distribution1(0)
    q = distribution1(phi)
    jsd, _ = q1(p, q, maxsteps=1000, threshold=0.01)
    wd, _ = q2(p, q, maxsteps=1000, threshold=0.01)
    return jsd, wd
def training_loop(LossFct, x, distribution=1, learning_rate=0.0001, num_epochs=50000):
    # Device configuration
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Distribution properties
    if distribution == 1:
        p_gen = samplers.distribution1(0)
        q_gen = samplers.distribution1(x)
        nb_input = 2
    elif distribution == 4:
        q_gen = samplers.distribution3(2048)
        p_gen = samplers.distribution4(2048)
        nb_input = 1
    p_gen.send(None)
    q_gen.send(None)

    D = discriminators.Discriminator(n_input=nb_input).to(device)

    # Loss and optimizer
    optimizer = torch.optim.SGD(D.parameters(), lr=learning_rate)

    # Train the model
    trainLoss = []
    trainAcc = []
    meanLoss = 0
    correct = 0
    total = 0
    log_frequency = 100
    for epoch in range(num_epochs):
        # exp_lr_scheduler.step()
        p = torch.from_numpy(p_gen.send(0)).float().to(device)
        q = torch.from_numpy(q_gen.send(x)).float().to(device)
        labels_real = torch.ones(p.shape[0]).to(device)
        labels_fake = torch.zeros(q.shape[0]).to(device)

        # Forward pass
        outputs_real = torch.sigmoid(D(p))
        outputs_fake = torch.sigmoid(D(q))
        predicted_real = (outputs_real.data > 0.5).float().squeeze()
        predicted_fake = (outputs_fake.data > 0.5).float().squeeze()
        total += 2 * labels_real.size(0)
        correct_this_batch = (predicted_real == labels_real).sum().item() + \
                             (predicted_fake == labels_fake).sum().item()
        correct += correct_this_batch

        loss = LossFct.forward(outputs_real, outputs_fake, labels_real, labels_fake, p, q, D)
        # (torch.log(torch.tensor([2.0])).to(device) + 0.5*criterion(outputs_real, labels_real)
        #  + 0.5*criterion(outputs_fake, labels_fake))
        meanLoss += loss.item()

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % log_frequency == 0:
            print('Epoch [{}/{}]'.format(epoch, num_epochs))
            print('Loss: {:.4f}({:.4f}), Acc: {:.3f}({:.3f})'.format(
                loss.item(), meanLoss / (epoch + 1),
                correct_this_batch * 100 / (2 * labels_real.size(0)),
                correct * 100 / total))
        trainLoss.append(meanLoss / (epoch + 1))
        trainAcc.append(100 * correct / total)

    return loss, D
    D.eval()
    x = torch.from_numpy(next(p)).float().to(device)
    y = torch.from_numpy(next(q)).float().to(device)
    loss = D.loss_func(x, y, loss_metric)
    return -loss.item()


if __name__ == "__main__":
    print(device)

    # Q1.3 JSD
    jsd_list = []
    wd_list = []
    phis = np.around(np.arange(-1.0, 1.1, 0.1), 1)
    for phi in phis:
        print(phi)
        dist_p = distribution1(0, 512)
        dist_q = distribution1(phi, 512)
        D = Discriminator()
        train(D, dist_p, dist_q)
        y = predict(D, dist_p, dist_q)
        print("Estimate: ", y)
        jsd_list.append(y)

    plt.scatter(phis, jsd_list)
    plt.title('JSD')
    plt.ylabel('Estimated Jensen-Shannon Divergence')
    plt.xlabel(r'$\phi$')
    plt.savefig('JSD.png')
    plt.show()

    # Q1.3 WD