class BipedalWalkerWrapper:

    def __init__(self, hardcore=False, render=False, verbose=False):
        self.render = render
        self.verbose = verbose

    def create_env(self, hardcore=False, super_easy=False):
        # Create a BipedalWalker environment
        if hardcore:
            self.max_number_steps = 2000
            self.env = BipedalWalkerHardcore()
        elif super_easy:
            self.max_number_steps = 1600
            self.env = BipedalWalkerSuperEasy()
        else:
            self.max_number_steps = 1600
            self.env = BipedalWalker()

    def setup_simulation(self, render=False, verbose=False):
        # Set up variables for a simulation run
        self.n_step = 0
        self.render = render
        self.verbose = verbose

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        self.n_step += 1
        # Truncate the episode once the step budget is exhausted
        if self.n_step == self.max_number_steps:
            done = True
        if self.render:
            self.env.render()
        return state, reward, done, info

    def reset_environment(self, render=False):
        self.render = render
        self.env.reset()
        self.n_step = 0
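# Usage sketch (not from the original source): drives the wrapper through one
# episode with random actions. It assumes a Gym-style action space on
# self.env; everything else uses only the methods defined above.
if __name__ == "__main__":
    wrapper = BipedalWalkerWrapper()
    wrapper.create_env(hardcore=False)
    wrapper.setup_simulation(render=False, verbose=False)
    wrapper.reset_environment()
    done = False
    total_reward = 0.0
    while not done:
        action = wrapper.env.action_space.sample()  # random policy placeholder
        state, reward, done, info = wrapper.step(action)
        total_reward += reward
    print("Episode ended after", wrapper.n_step, "steps, reward:", total_reward)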
def _worker(self, rank, pipe, parent_pipe):
    parent_pipe.close()
    rng = np.random.RandomState(rank)
    env = BipedalWalker()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    rnn = WorldModel(obs_dim, act_dim)
    ctrls = [Controller(obs_dim + rnn.hid_dim, act_dim)
             for _ in range(self.agents_per_worker)]
    while True:
        command, data = pipe.recv()
        if command == 'upload_rnn':
            # data: rnn
            rnn.load_state_dict(data.state_dict())
            pipe.send((None, True))
        elif command == 'upload_ctrl':
            # data: ([inds], noisy)
            inds, noisy = data
            for ctrl, ind in zip(ctrls, inds):
                if noisy:
                    ind += rng.normal(scale=1e-3, size=ind.shape)
                ctrl.load_genotype(ind)
            pipe.send((None, True))
        elif command == 'rollout':
            # data: random_policy
            rollouts = []
            for ctrl in ctrls:
                env.seed(rng.randint(2**31 - 1))
                if data:  # roll out with a random policy
                    trajectory = random_rollout(env)
                else:
                    trajectory = rollout(env, rnn, ctrl)
                rollouts.append(trajectory)
            pipe.send((rollouts, True))
        elif command == 'evaluate':
            # data: None
            evaluations = []
            for ctrl in ctrls:
                env.seed(rng.randint(2**31 - 1))
                evaluations.append(evaluate(env, rnn, ctrl))
            pipe.send((evaluations, True))
        elif command == 'close':
            # data: None
            env.close()
            pipe.send((None, True))
            return True
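# Parent-side sketch (not from the original source) of the command protocol
# the worker above expects: each request is a (command, data) pair sent down
# the pipe, and every reply is a (result, ok) pair. Spawning of the worker
# process is elided; only the pipe traffic is shown.
import multiprocessing as mp

parent_pipe, child_pipe = mp.Pipe()
# ... spawn the worker process with child_pipe, then for example:
parent_pipe.send(('rollout', True))      # True -> use the random policy
rollouts, ok = parent_pipe.recv()
parent_pipe.send(('close', None))        # shut the worker down cleanly
_, ok = parent_pipe.recv()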
from bipedal_walker import BipedalWalker
from gp_gym_info import info
import gym
import numpy as np

np.random.seed(1)

info["env_name"] = "BipedalWalker-v3"
info["pop_size"] = 10
info["max_gens"] = 5
info["num_eps"] = 10
info["num_time_steps"] = 100
info["tournament_size"] = 3
info["mutation_rate"] = 0.1
info["term_fit"] = 300
info["max_depth"] = 5

agent = BipedalWalker(info)
best_program = agent.train()
print(best_program)

fitness = agent.fit(best_program, info["num_eps"], info["num_time_steps"], render=False)
print("Fitness: {}".format(fitness))

agent.fit(best_program, 5, 100, render=True)
class ising:
    # Initialize the network
    def __init__(self, netsize, Nsensors=1, Nmotors=1):
        # Create the Ising model
        self.size = netsize      # Network size
        self.Ssize = Nsensors    # Number of sensors
        self.Msize = Nmotors     # Number of motors
        self.h = np.zeros(netsize)
        self.J = np.zeros((netsize, netsize))
        self.max_weights = 2
        self.randomize_state()
        self.env = BipedalWalker()
        #self.env.min_position = -1.5 * np.pi / 3
        #self.env.max_position = 8.5 * np.pi / 3
        self.env.goal_position = 1.5 * np.pi / 3
        self.env.max_speed = .045
        self.observation = self.env.reset()
        #self.max_angle_j0 = -0.61  # -35º
        #self.min_angle_j0 = -1.39  # -80º
        self.max_angle_j0 = -1
        self.min_angle_j0 = -1
        self.Beta = 1.0
        self.defaultT = max(100, netsize * 20)
        self.Ssize1 = 0
        #self.maxspeed = self.env.max_speed
        self.Update(-1)

    def get_state(self, mode='all'):
        if mode == 'all':
            return self.s
        elif mode == 'motors':
            return self.s[-self.Msize:]
        elif mode == 'sensors':
            return self.s[0:self.Ssize]
        elif mode == 'input':
            return self.sensors
        elif mode == 'non-sensors':
            return self.s[self.Ssize:]
        elif mode == 'hidden':
            return self.s[self.Ssize:-self.Msize]

    def get_state_index(self, mode='all'):
        return bool2int(0.5 * (self.get_state(mode) + 1))

    # Randomize the state of the network
    def randomize_state(self):
        self.s = np.random.randint(0, 2, self.size) * 2 - 1
        self.sensors = np.random.randint(0, 2, self.Ssize) * 2 - 1

    # Randomize the position of the agent
    def randomize_position(self):
        self.observation = self.env.reset()

    # Set random biases on the non-sensor units of the system
    def random_fields(self, max_weights=None):
        if max_weights is None:
            max_weights = self.max_weights
        self.h[self.Ssize:] = max_weights * \
            (np.random.rand(self.size - self.Ssize) * 2 - 1)

    # Set random connections between units of the system
    def random_wiring(self, max_weights=None):
        # Set random values for h and J
        if max_weights is None:
            max_weights = self.max_weights
        for i in range(self.size):
            for j in np.arange(i + 1, self.size):
                if i < j and (i >= self.Ssize or j >= self.Ssize):
                    self.J[i, j] = (np.random.rand(1) * 2 - 1) * self.max_weights

    # Update the position of the agent
    def Move(self):
        action = self.s[-self.Msize:]
        observation, reward, done, info = self.env.step(action)

    # Transform the sensor input into an integer index
    def SensorIndex(self, x, xmax):
        return int(np.floor((x + xmax) / (2 * xmax + 10 * np.finfo(float).eps)
                            * 2**(self.Ssize / 4)))

    # Update the state of the sensors
    def UpdateSensors(self):
        # Each joint speed is discretized and encoded as Ssize/4 binary units
        self.speed_ind_leg1_j0 = self.SensorIndex(
            self.env.joints[0].speed / self.env.vmaxhip, self.env.vmaxhip * 2)
        self.sensors[0:4] = 2 * bitfield(self.speed_ind_leg1_j0, int(self.Ssize / 4)) - 1
        self.speed_ind_leg1_j1 = self.SensorIndex(
            self.env.joints[1].speed / self.env.vmaxknee, self.env.vmaxknee * 2)
        self.sensors[4:8] = 2 * bitfield(self.speed_ind_leg1_j1, int(self.Ssize / 4)) - 1
        self.speed_ind_leg2_j0 = self.SensorIndex(
            self.env.joints[2].speed / self.env.vmaxhip, self.env.vmaxhip * 2)
        self.sensors[8:12] = 2 * bitfield(self.speed_ind_leg2_j0, int(self.Ssize / 4)) - 1
        self.speed_ind_leg2_j1 = self.SensorIndex(
            self.env.joints[3].speed / self.env.vmaxknee, self.env.vmaxknee * 2)
        self.sensors[12:] = 2 * bitfield(self.speed_ind_leg2_j1, int(self.Ssize / 4)) - 1

    # Execute one step of the Glauber algorithm to update the state of one unit
    def GlauberStep(self, i=None):
        if i is None:
            i = np.random.randint(self.size)
        I = 0
        if i < self.Ssize:
            I = self.sensors[i]
        eDiff = 2 * self.s[i] * (self.h[i] + I +
                                 np.dot(self.J[i, :] + self.J[:, i], self.s))
        if eDiff * self.Beta < np.log(1 / np.random.rand() - 1):  # Glauber
            self.s[i] = -self.s[i]

    # Update a random unit of the agent (index -1 moves the agent instead)
    def Update(self, i=None):
        if i is None:
            i = np.random.randint(-1, self.size)
        if i == -1:
            self.Move()
            self.UpdateSensors()
        else:
            self.GlauberStep(i)

    # Sequentially update the state of all units of the agent in random order
    def SequentialUpdate(self):
        for i in np.random.permutation(range(-1, self.size)):
            self.Update(i)

    # Step of the learning algorithm to adjust correlations toward the critical regime
    def AdjustCorrelations(self, T=None):
        if T is None:
            T = self.defaultT
        self.m = np.zeros(self.size)
        self.c = np.zeros((self.size, self.size))
        self.C = np.zeros((self.size, self.size))
        # Main simulation loop:
        self.x = np.zeros(T)
        samples = []
        for t in range(T):
            self.SequentialUpdate()
            #self.x[t] = self.position
            self.m += self.s
            for i in range(self.size):
                self.c[i, i + 1:] += self.s[i] * self.s[i + 1:]
        self.m /= T
        self.c /= T
        for i in range(self.size):
            self.C[i, i + 1:] = self.c[i, i + 1:] - self.m[i] * self.m[i + 1:]
        # Build the target correlation matrix c1 by ranking the measured
        # correlations and mapping them onto self.Cint (the reference
        # correlations, assumed to be set externally along with the
        # reference magnetizations self.m1)
        c1 = np.zeros((self.size, self.size))
        for i in range(self.size):
            inds = np.array([], int)
            c = np.array([])
            for j in range(self.size):
                if not i == j:
                    inds = np.append(inds, [j])
                if i < j:
                    c = np.append(c, [self.c[i, j]])
                elif i > j:
                    c = np.append(c, [self.c[j, i]])
            order = np.argsort(c)[::-1]
            c1[i, inds[order]] = self.Cint[i, :]
        self.c1 = np.triu(c1 + c1.T, 1)
        self.c1 *= 0.5
        # Sensor and motor statistics are excluded from the learning signal
        self.m[0:self.Ssize] = 0
        self.m1[0:self.Ssize] = 0
        self.c[0:self.Ssize, 0:self.Ssize] = 0
        self.c[-self.Msize:, -self.Msize:] = 0
        self.c[0:self.Ssize, -self.Msize:] = 0
        self.c1[0:self.Ssize, 0:self.Ssize] = 0
        self.c1[-self.Msize:, -self.Msize:] = 0
        self.c1[0:self.Ssize, -self.Msize:] = 0
        dh = self.m1 - self.m
        dJ = self.c1 - self.c
        return dh, dJ

    # Algorithm for poising an agent in a critical regime
    def CriticalLearning(self, Iterations, T=None):
        u = 0.01
        count = 0
        dh, dJ = self.AdjustCorrelations(T)
        fit = max(np.max(np.abs(self.c1 - self.c)),
                  np.max(np.abs(self.m1 - self.m)))
        for it in range(Iterations):
            count += 1
            self.h += u * dh
            self.J += u * dJ
            if it % 4 == 0:
                self.randomize_state()
                self.randomize_position()
            # Clip fields and couplings to the maximum allowed magnitude
            Vmax = self.max_weights
            for i in range(self.size):
                if np.abs(self.h[i]) > Vmax:
                    self.h[i] = Vmax * np.sign(self.h[i])
                for j in np.arange(i + 1, self.size):
                    if np.abs(self.J[i, j]) > Vmax:
                        self.J[i, j] = Vmax * np.sign(self.J[i, j])
            dh, dJ = self.AdjustCorrelations(T)
            fit = np.max(np.abs(self.c1 - self.c))
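# Usage sketch (not from the original source): poises a small agent at
# criticality. The sizes assume the 16 sensor units (4 per joint) that
# UpdateSensors writes, and the m1/Cint values here are placeholder reference
# statistics; the original code computes them elsewhere.
netsize, Nsensors, Nmotors = 26, 16, 4
agent = ising(netsize, Nsensors, Nmotors)
agent.m1 = np.zeros(netsize)                         # reference magnetizations
agent.Cint = np.ones((netsize, netsize - 1)) * 0.1   # reference correlations
agent.random_fields()
agent.random_wiring()
agent.CriticalLearning(Iterations=100, T=1000)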
def main(args):
    print("IT'S DANGEROUS TO GO ALONE! TAKE THIS.")
    np.random.seed(0)
    pt.manual_seed(0)

    env = BipedalWalker()
    env.seed(0)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    print(f"Initializing agent (device={device})...")
    rnn = WorldModel(obs_dim, act_dim)
    ctrl = Controller(obs_dim + rnn.hid_dim, act_dim)

    # Adjust the population size based on the number of available CPUs.
    num_workers = mp.cpu_count() if args.nproc is None else args.nproc
    num_workers = min(num_workers, mp.cpu_count())
    agents_per_worker = args.popsize // num_workers
    popsize = num_workers * agents_per_worker

    print(f"Initializing population of {popsize} agents across {num_workers} workers...")
    pop = Population(num_workers, agents_per_worker)
    global_mu = np.zeros_like(ctrl.genotype)

    loss_logger = ValueLogger('ha_rnn_loss', bufsize=20)
    best_logger = ValueLogger('ha_ctrl_best', bufsize=100)

    # Train the RNN with random policies.
    print("Training M model with a random policy...")
    optimizer = optim.Adam(rnn.parameters(), lr=args.lr)
    train_rnn(rnn, optimizer, pop, random_policy=True,
              num_rollouts=args.num_rollouts, logger=loss_logger)
    loss_logger.plot('M model training loss', 'step', 'loss')

    # Upload the trained RNN.
    success = pop.upload_rnn(rnn.cpu())
    assert success

    # Iteratively update the controller and the RNN.
    for i in range(args.niter):
        # Evolve controllers with the trained RNN.
        print(f"Iter. {i}: Evolving C model...")
        es = EvolutionStrategy(global_mu, args.sigma0, popsize)
        evolve_ctrl(ctrl, es, pop, num_gen=args.num_gen, logger=best_logger)
        best_logger.plot('C model evolution', 'gen', 'fitness')

        # Update the global best individual and upload it.
        global_mu = np.copy(ctrl.genotype)
        success = pop.upload_ctrl(global_mu, noisy=True)
        assert success

        # Train the RNN with the current best controller.
        print(f"Iter. {i}: Training M model...")
        train_rnn(rnn, optimizer, pop, random_policy=False,
                  num_rollouts=args.num_rollouts, logger=loss_logger)
        loss_logger.plot('M model training loss', 'step', 'loss')

        # Upload the trained RNN.
        success = pop.upload_rnn(rnn.cpu())
        assert success

    # Test run!
    rollout(env, rnn, ctrl, render=True)
    success = pop.close()
    assert success
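# CLI sketch (not from the original source): the argument names mirror the
# attributes main() reads from `args`; the default values here are assumptions.
import argparse

parser = argparse.ArgumentParser(description="World-model training on BipedalWalker")
parser.add_argument('--nproc', type=int, default=None,
                    help="number of worker processes (defaults to CPU count)")
parser.add_argument('--popsize', type=int, default=64, help="population size")
parser.add_argument('--lr', type=float, default=1e-3, help="M model learning rate")
parser.add_argument('--num-rollouts', dest='num_rollouts', type=int, default=16)
parser.add_argument('--niter', type=int, default=10, help="C/M alternation rounds")
parser.add_argument('--sigma0', type=float, default=0.1, help="initial ES step size")
parser.add_argument('--num-gen', dest='num_gen', type=int, default=50)

if __name__ == '__main__':
    main(parser.parse_args())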
import cProfile

from bipedal_walker import BipedalWalker
from gp_gym_info import info

info["env_name"] = "BipedalWalker-v3"
info["pop_size"] = 100
info["max_gens"] = 10
info["num_eps"] = 1
info["num_time_steps"] = 1
info["tournament_size"] = 3
info["mutation_rate"] = 0.1
info["term_fit"] = 300
info["max_depth"] = 5

agent = BipedalWalker(info)
cProfile.run("agent.train()", "bipedal_walker.profile")
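# Inspecting the dump (not from the original source): the standard-library
# pstats module reads the file written by cProfile.run above.
import pstats

stats = pstats.Stats("bipedal_walker.profile")
stats.sort_stats("cumulative").print_stats(10)  # top 10 entries by cumulative time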
def create_env(args):
    # Build a BipedalWalker with scaled legs and wrap it with frame stacking
    env = BipedalWalker(args.scale_legs)
    env = frame_stack(env, args)
    return env
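# Usage sketch (not from the original source): a minimal `args` namespace
# carrying the one field create_env reads directly (scale_legs); frame_stack
# also receives `args`, so whatever fields it reads must be present as well.
# The values here, and the num_stack field, are guesses for illustration.
from types import SimpleNamespace

args = SimpleNamespace(scale_legs=1.0, num_stack=4)  # num_stack: assumed frame_stack field
env = create_env(args)
obs = env.reset()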