def train(env, mu, std, alpha):
    """Run a short evolution-strategies training loop.

    Samples a population of weight vectors from ``Normal(mu, std)``,
    evaluates them with ``simulate``, and takes a gradient-ascent step on
    the expected standardized fitness. ``mu`` is updated in place.

    Args:
        env: environment passed through to ``simulate``.
        mu: torch tensor (requires_grad=True) — population mean, mutated in place.
        std: sampling standard deviation.
        alpha: learning rate for the gradient-ascent step.
    """
    p = Normal(mu, std)
    num_train_runs = 5
    for t in range(num_train_runs):
        # NOTE(review): `pop_size` is a module-level global defined outside
        # this block — confirm it is in scope at the call site.
        sample_weights = p.sample(pop_size)
        fitnesses = simulate(env, sample_weights)
        # Standardize fitnesses to zero mean / unit variance.
        scaled_fitnesses = (fitnesses - fitnesses.mean()) / fitnesses.std()
        mean = expectation(scaled_fitnesses, sample_weights, p=p)
        mean.backward()
        with torch.no_grad():
            # BUG FIX: was `mu += alpha * mu`, which ignores the gradient
            # computed by backward() (and then zeroes it). Every sibling
            # script in this file uses `alpha * mu.grad`.
            mu += alpha * mu.grad
            mu.grad.zero_()
import torch
import numpy as np
from evograd import expectation
from evograd.distributions import Normal


def fun(x):
    """Toy multimodal behavior function: 5 * sin(0.2x) * sin(20x)."""
    return 5 * torch.sin(0.2 * x) * torch.sin(20 * x)


mu = torch.tensor([1.0], requires_grad=True)  # population mean
npop = 500    # population size
std = 0.5     # noise standard deviation
alpha = 0.03  # learning rate

p = Normal(mu, std)

for t in range(2000):
    sample = p.sample(npop)
    behaviors = fun(sample)
    # Standardize behaviors, then maximize the expected squared z-score,
    # i.e. the variance of the population's behaviors.
    zscores = (behaviors - behaviors.mean()) / behaviors.std()
    variance = expectation(zscores**2, sample, p=p)
    variance.backward()

    with torch.no_grad():
        # Gradient ascent on the estimated variance.
        mu += alpha * mu.grad
        mu.grad.zero_()

    # BUG FIX: the message says "estimated variance" but the original printed
    # float(mu) (the mean parameter); report the variance estimate instead.
    print("step: {}, estimated variance: {:0.5}".format(t, float(variance)))
def fun(x):
    # NOTE(review): the original `def fun(...)` header and the derivation of
    # `x_np` start before this chunk — header reconstructed; confirm upstream.
    x_np = x.detach().numpy()  # assumed tensor->numpy conversion — TODO confirm
    y = np.zeros(npop)
    for i in range(0, npop):
        # Fitness: negative squared distance of the lift/drag ratio from 50.
        y[i] = -(1 / calc_cd_over_cl(x_np[i]) - 50)**2
    return torch.from_numpy(y)


# Six-parameter airfoil design vector; optimized in place by the loop below.
mu = torch.tensor([0.1, 0.1, 1.0, 1.0, 1.0, 1.0], requires_grad=True)
# NOTE(review): `std`, `alpha`, `npop`, `max_iter` are defined outside this chunk.
p = Normal(mu, std)

for t in range(max_iter):
    print('Current iteration ' + str(t) + '/' + str(max_iter))
    sample = p.sample(npop)
    fitnesses = fun(sample)
    # Standardize fitnesses to zero mean / unit variance.
    fitnesses = (fitnesses - fitnesses.mean()) / fitnesses.std()
    mean = expectation(fitnesses, sample, p=p)
    mean.backward()

    with torch.no_grad():
        # Gradient ascent on the expected standardized fitness.
        mu += alpha * mu.grad
        mu.grad.zero_()

    print('Current fitness: ' + str(1 / calc_cd_over_cl(mu.detach().numpy())))
    print('')

print(mu)
print(1 / calc_cd_over_cl(mu.detach().numpy()))
# BUG FIX: removed trailing `print(bad)` — `bad` is undefined and raised a
# NameError (debugging leftover used to halt the script after the final print).
            break
    return total_reward / num_run
# NOTE(review): the two lines above are the tail of a rollout helper
# (presumably `simulate_single`) whose definition starts before this chunk.


def simulate(batch_weights):
    """Evaluate each weight vector in the batch; return a tensor of rewards."""
    rewards = []
    for weights in batch_weights:
        rewards.append(simulate_single(weights.numpy()))
    return torch.tensor(rewards)


mu = torch.randn(4, requires_grad=True)  # population mean
npop = 50     # population size
std = 0.5     # noise standard deviation
alpha = 0.03  # learning rate

p = Normal(mu, std)
env = gym.make("CartPole-v0")

for t in range(2000):
    sample = p.sample(npop)
    fitnesses = simulate(sample)
    # Standardize fitnesses to zero mean / unit variance before the update.
    scaled_fitnesses = (fitnesses - fitnesses.mean()) / fitnesses.std()
    mean = expectation(scaled_fitnesses, sample, p=p)
    mean.backward()

    with torch.no_grad():
        # Gradient ascent on the expected standardized fitness.
        mu += alpha * mu.grad
        mu.grad.zero_()

    print("step: {}, mean fitness: {:0.5}".format(t, float(fitnesses.mean())))
from evograd.distributions import Normal
# NOTE(review): `torch`, `scipy` and `expectation` are used below but imported
# outside this chunk — confirm they are in scope.


def fun(x):
    """Behavior characterization: 5 * sin(0.2x) * sin(20x)."""
    return 5 * torch.sin(0.2 * x) * torch.sin(20 * x)


mu = torch.tensor(1.0, requires_grad=True)  # population mean
npop = 500     # population size
std = 0.5      # noise standard deviation
k_sigma = 1.0  # kernel standard deviation
alpha = 0.10   # learning rate

p = Normal(mu, std)

for t in range(2000):
    sample = p.sample(npop)
    novelties = fun(sample).unsqueeze(1)
    # Standardize behaviors, then compute all pairwise squared distances.
    novelties = (novelties - novelties.mean()) / novelties.std()
    dists = scipy.spatial.distance.squareform(
        scipy.spatial.distance.pdist(novelties, "sqeuclidean"))
    # BUG FIX: `scipy.exp` was deprecated in SciPy 1.4 and later removed;
    # use torch.exp (already in scope) on the distance matrix instead.
    kernel = torch.exp(torch.tensor(-dists / k_sigma**2, dtype=torch.float32))

    # Kernel density estimate of each sample's probability, then the
    # population entropy estimate to be maximized.
    p_x = expectation(kernel, sample, p=p)
    entropy = expectation(-torch.log(p_x), sample, p=p)
    entropy.backward()

    with torch.no_grad():
        # Gradient ascent on the estimated entropy.
        mu += alpha * mu.grad
        mu.grad.zero_()

    # BUG FIX: the message says "estimated entropy" but the original printed
    # float(mu) (the mean parameter); report the entropy estimate instead.
    print('step: {}, estimated entropy: {:0.5}'.format(t, float(entropy)))