Example #1
    def __init__(self, netsize, Nsensors=1, Nmotors=1):  # Create ising model

        self.size = netsize  #Network size
        self.Ssize = Nsensors  # Number of sensors
        self.Msize = Nmotors  # Number of motors

        self.h = np.zeros(netsize)
        self.J = np.zeros((netsize, netsize))
        self.max_weights = 2

        self.randomize_state()

        self.env = BipedalWalker()
        #self.env.min_position = -1.5 * np.pi / 3
        #self.env.max_position = 8.5 * np.pi / 3
        self.env.goal_position = 1.5 * np.pi / 3
        self.env.max_speed = .045
        self.observation = self.env.reset()
        #self.max_angle_j0 = -0.61 #-35º
        #self.min_angle_j0 = -1.39 #-80º
        self.max_angle_j0 = -1
        self.min_angle_j0 = -1

        self.Beta = 1.0
        self.defaultT = max(100, netsize * 20)

        self.Ssize1 = 0
        #self.maxspeed = self.env.max_speed
        self.Update(-1)
Example #2
def create_env(self, hardcore=False, super_easy=False):
    # Create a BipedalWalker environment
    if hardcore:
        self.max_number_steps = 2000
        self.env = BipedalWalkerHardcore()
    elif super_easy:
        self.max_number_steps = 1600
        self.env = BipedalWalkerSuperEasy()
    else:
        self.max_number_steps = 1600
        self.env = BipedalWalker()
Example #3
class BipedalWalkerWrapper:

    def __init__(self, hardcore=False, render=False, verbose=False):
        self.render = render
        self.verbose = verbose

    def create_env(self, hardcore=False, super_easy=False):
        # Create a BipedalWalker environment
        if hardcore:
            self.max_number_steps = 2000
            self.env = BipedalWalkerHardcore()
        elif super_easy:
            self.max_number_steps = 1600
            self.env = BipedalWalkerSuperEasy()
        else:
            self.max_number_steps = 1600
            self.env = BipedalWalker()

    def setup_simulation(self, render=False, verbose=False):
        # Setup variable for simulation
        self.n_step = 0
        self.render = render
        self.verbose = verbose

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        self.n_step += 1
        # print(self.n_step)
        
        if self.n_step == self.max_number_steps:
            done = True
        
        if self.render:
            self.env.render()

        return state, reward, done, info


    def reset_environment(self, render=False):
        self.render = render
        self.env.reset()
        self.n_step = 0
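A minimal usage sketch for the wrapper above (assumed, not taken from the source): create the environment, reset it, and step it with random actions until the episode ends or the step cap is reached. BipedalWalker's action space is 4-dimensional, so a random 4-vector serves as a placeholder policy.

import numpy as np

# Sketch only: drive BipedalWalkerWrapper with random actions.
walker = BipedalWalkerWrapper()
walker.create_env()                                  # plain BipedalWalker, 1600-step cap
walker.setup_simulation(render=False, verbose=False)
walker.reset_environment()

done, total_reward = False, 0.0
while not done:
    action = np.random.uniform(-1.0, 1.0, size=4)    # 4 motor torques
    state, reward, done, info = walker.step(action)
    total_reward += reward

print("episode return:", total_reward)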
Example #4
  def _worker(self, rank, pipe, parent_pipe):
    parent_pipe.close()

    rng = np.random.RandomState(rank)

    env = BipedalWalker()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    rnn = WorldModel(obs_dim, act_dim)
    ctrls = [Controller(obs_dim+rnn.hid_dim, act_dim)
             for _ in range(self.agents_per_worker)]
  
    while True:
      command, data = pipe.recv()

      if command == 'upload_rnn': # data: rnn
        rnn.load_state_dict(data.state_dict())
        pipe.send((None, True))

      elif command == 'upload_ctrl': # data: ([inds], noisy)
        inds, noisy = data
        for ctrl, ind in zip(ctrls, inds):
          if noisy:
            ind += rng.normal(scale=1e-3, size=ind.shape)
          ctrl.load_genotype(ind)
        pipe.send((None, True))

      elif command == 'rollout': # data: random_policy
        rollouts = []
        for ctrl in ctrls:
          env.seed(rng.randint(2**31-1))
          if data: # if rollout with random policy
            trajectory = random_rollout(env)
          else:
            trajectory = rollout(env, rnn, ctrl)
          rollouts.append(trajectory)
        pipe.send((rollouts, True))

      elif command == 'evaluate': # data: None
        evaluations = []
        for ctrl in ctrls:
          env.seed(rng.randint(2**31-1))
          evaluations.append(evaluate(env, rnn, ctrl))
        pipe.send((evaluations, True))

      elif command == 'close': # data: None
        env.close()
        pipe.send((None, True))
        return True

    return False
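The worker above is driven entirely over a multiprocessing pipe. A hypothetical parent-side driver (a sketch, not from the source; `workers` stands for whatever object exposes `_worker` and `agents_per_worker`, such as the Population-style object constructed later in Example #7) would send the same command strings the loop handles:

import multiprocessing as mp

# Sketch only: spawn one worker process and talk to it over the pipe.
parent_end, worker_end = mp.Pipe()
proc = mp.Process(target=workers._worker, args=(0, worker_end, parent_end))
proc.start()
worker_end.close()                        # parent keeps only its own end

parent_end.send(('rollout', True))        # data=True -> rollouts with a random policy
rollouts, ok = parent_end.recv()

parent_end.send(('close', None))          # worker closes its env and returns
parent_end.recv()
proc.join()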
Example #5
from bipedal_walker import BipedalWalker
from gp_gym_info import info
import gym
import numpy as np

np.random.seed(1)

info["env_name"] = "BipedalWalker-v3"
info["pop_size"] = 10
info["max_gens"] = 5
info["num_eps"] = 10
info["num_time_steps"] = 100
info["tournament_size"] = 3
info["mutation_rate"] = 0.1
info["term_fit"] = 300
info["max_depth"] = 5

agent = BipedalWalker(info)
best_program = agent.train()
print(best_program)
fitness = agent.fit(best_program,
                    info["num_eps"],
                    info["num_time_steps"],
                    render=False)
print("Fitness: {}".format(fitness))

agent.fit(best_program, 5, 100, render=True)
Example #6
class ising:
    # Initialize the network
    def __init__(self, netsize, Nsensors=1, Nmotors=1):  # Create ising model

        self.size = netsize  #Network size
        self.Ssize = Nsensors  # Number of sensors
        self.Msize = Nmotors  # Number of motors

        self.h = np.zeros(netsize)
        self.J = np.zeros((netsize, netsize))
        self.max_weights = 2

        self.randomize_state()

        self.env = BipedalWalker()
        #self.env.min_position = -1.5 * np.pi / 3
        #self.env.max_position = 8.5 * np.pi / 3
        self.env.goal_position = 1.5 * np.pi / 3
        self.env.max_speed = .045
        self.observation = self.env.reset()
        #self.max_angle_j0 = -0.61 #-35º
        #self.min_angle_j0 = -1.39 #-80º
        self.max_angle_j0 = -1
        self.min_angle_j0 = -1

        self.Beta = 1.0
        self.defaultT = max(100, netsize * 20)

        self.Ssize1 = 0
        #self.maxspeed = self.env.max_speed
        self.Update(-1)

    def get_state(self, mode='all'):
        if mode == 'all':
            return self.s
        elif mode == 'motors':
            return self.s[-self.Msize:]
        elif mode == 'sensors':
            return self.s[0:self.Ssize]
        elif mode == 'input':
            return self.sensors
        elif mode == 'non-sensors':
            return self.s[self.Ssize:]
        elif mode == 'hidden':
            return self.s[self.Ssize:-self.Msize]

    def get_state_index(self, mode='all'):
        return bool2int(0.5 * (self.get_state(mode) + 1))

    # Randomize the state of the network
    def randomize_state(self):
        self.s = np.random.randint(0, 2, self.size) * 2 - 1
        self.sensors = np.random.randint(0, 2, self.Ssize) * 2 - 1

    # Randomize the position of the agent
    def randomize_position(self):
        self.observation = self.env.reset()

    # Set random bias to sets of units of the system
    def random_fields(self, max_weights=None):
        if max_weights is None:
            max_weights = self.max_weights
        self.h[self.Ssize:] = max_weights * \
         (np.random.rand(self.size - self.Ssize) * 2 - 1)

    # Set random connections to sets of units of the system
    def random_wiring(self, max_weights=None):  # Set random values for h and J
        if max_weights is None:
            max_weights = self.max_weights
        for i in range(self.size):
            for j in np.arange(i + 1, self.size):
                if i < j and (i >= self.Ssize or j >= self.Ssize):
                    self.J[i, j] = (np.random.rand(1) * 2 - 1) * max_weights

    # Update the position of the agent
    def Move(self):
        action = self.s[-self.Msize:]
        #print(action)
        observation, reward, done, info = self.env.step(action)

    # Transform the sensor input into an integer index
    def SensorIndex(self, x, xmax):
        return int(
            np.floor((x + xmax) / (2 * xmax + 10 * np.finfo(float).eps) *
                     2**(self.Ssize / 4)))

    # Update the state of the sensor
    def UpdateSensors(self):
        self.speed_ind_leg1_j0 = self.SensorIndex(
            self.env.joints[0].speed / self.env.vmaxhip, self.env.vmaxhip * 2)
        self.sensors[0:4] = 2 * bitfield(self.speed_ind_leg1_j0,
                                         int(self.Ssize / 4)) - 1
        self.speed_ind_leg1_j1 = self.SensorIndex(
            self.env.joints[1].speed / self.env.vmaxknee,
            self.env.vmaxknee * 2)
        self.sensors[4:8] = 2 * bitfield(self.speed_ind_leg1_j1,
                                         int(self.Ssize / 4)) - 1
        self.speed_ind_leg2_j0 = self.SensorIndex(
            self.env.joints[2].speed / self.env.vmaxhip, self.env.vmaxhip * 2)
        self.sensors[8:12] = 2 * bitfield(self.speed_ind_leg2_j0,
                                          int(self.Ssize / 4)) - 1
        self.speed_ind_leg2_j1 = self.SensorIndex(
            self.env.joints[3].speed / self.env.vmaxknee,
            self.env.vmaxknee * 2)
        self.sensors[12:] = 2 * bitfield(self.speed_ind_leg2_j1,
                                         int(self.Ssize / 4)) - 1

    # Execute step of the Glauber algorithm to update the state of one unit
    def GlauberStep(self, i=None):
        if i is None:
            i = np.random.randint(self.size)

        I = 0
        if i < self.Ssize:
            I = self.sensors[i]

        eDiff = 2 * self.s[i] * (self.h[i] + I +
                                 np.dot(self.J[i, :] + self.J[:, i], self.s))
        if eDiff * self.Beta < np.log(1 / np.random.rand() - 1):  # Glauber
            self.s[i] = -self.s[i]

    # Update random unit of the agent
    def Update(self, i=None):
        if i is None:
            i = np.random.randint(-1, self.size)
        if i == -1:
            self.Move()
            self.UpdateSensors()
        else:
            self.GlauberStep(i)

    # Sequentially update state of all units of the agent in random order
    def SequentialUpdate(self):
        for i in np.random.permutation(range(-1, self.size)):
            self.Update(i)

    # Step of the learning algorithm to adjust correlations toward the critical regime
    def AdjustCorrelations(self, T=None):
        if T is None:
            T = self.defaultT

        self.m = np.zeros(self.size)
        self.c = np.zeros((self.size, self.size))
        self.C = np.zeros((self.size, self.size))

        # Main simulation loop:
        self.x = np.zeros(T)
        samples = []
        for t in range(T):

            self.SequentialUpdate()
            #self.x[t] = self.position
            self.m += self.s
            for i in range(self.size):
                self.c[i, i + 1:] += self.s[i] * self.s[i + 1:]
        self.m /= T
        self.c /= T
        for i in range(self.size):
            self.C[i, i + 1:] = self.c[i, i + 1:] - self.m[i] * self.m[i + 1:]

        c1 = np.zeros((self.size, self.size))
        for i in range(self.size):
            inds = np.array([], int)
            c = np.array([])
            for j in range(self.size):
                if not i == j:
                    inds = np.append(inds, [j])
                if i < j:
                    c = np.append(c, [self.c[i, j]])
                elif i > j:
                    c = np.append(c, [self.c[j, i]])
            order = np.argsort(c)[::-1]
            c1[i, inds[order]] = self.Cint[i, :]
        self.c1 = np.triu(c1 + c1.T, 1)
        self.c1 *= 0.5

        self.m[0:self.Ssize] = 0
        self.m1[0:self.Ssize] = 0
        self.c[0:self.Ssize, 0:self.Ssize] = 0
        self.c[-self.Msize:, -self.Msize:] = 0
        self.c[0:self.Ssize, -self.Msize:] = 0
        self.c1[0:self.Ssize, 0:self.Ssize] = 0
        self.c1[-self.Msize:, -self.Msize:] = 0
        self.c1[0:self.Ssize, -self.Msize:] = 0
        dh = self.m1 - self.m
        dJ = self.c1 - self.c
        #print(self.m1,self.m)
        #print("dh, dJ:",dh,",",dJ)
        return dh, dJ

    # Algorithm for poising an agent in a critical regime
    def CriticalLearning(self, Iterations, T=None):
        u = 0.01
        count = 0
        dh, dJ = self.AdjustCorrelations(T)
        fit = max(np.max(np.abs(self.c1 - self.c)),
                  np.max(np.abs(self.m1 - self.m)))
        #x_min = np.min(self.x)
        #x_max = np.max(self.x)
        #maxmin_range = (self.env.max_position + self.env.min_position) / 2
        #maxmin = (np.array([x_min, x_max]) - maxmin_range) / maxmin_range
        #print(count, fit, np.max(np.abs(self.J)))
        for it in range(Iterations):
            count += 1
            self.h += u * dh
            self.J += u * dJ
            #print(self.h,self.J)
            if it % 4 == 0:
                self.randomize_state()
                self.randomize_position()
            Vmax = self.max_weights
            for i in range(self.size):
                if np.abs(self.h[i]) > Vmax:
                    self.h[i] = Vmax * np.sign(self.h[i])
                for j in np.arange(i + 1, self.size):
                    if np.abs(self.J[i, j]) > Vmax:
                        self.J[i, j] = Vmax * np.sign(self.J[i, j])

            dh, dJ = self.AdjustCorrelations(T)
            fit = np.max(np.abs(self.c1 - self.c))
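A construction sketch for the class above, with the assumptions marked: UpdateSensors packs each of the four joint speeds into Ssize/4 bits, so Nsensors = 16 is the natural choice, and the walker's 4-dimensional action is read from the last Nmotors spins. CriticalLearning also consumes reference statistics self.m1 and self.Cint that this snippet never initializes, so they must be supplied beforehand; the placeholder values below are purely illustrative, and the helper functions bitfield and bool2int as well as the custom env attributes vmaxhip / vmaxknee are assumed to come from the surrounding project.

import numpy as np

# Illustrative sketch only; sizes and reference statistics are assumptions.
netsize, Nsensors, Nmotors = 30, 16, 4     # 16 sensor bits, 4 motor spins, 10 hidden
agent = ising(netsize, Nsensors, Nmotors)
agent.random_fields()
agent.random_wiring()

# Reference mean/correlation targets read by AdjustCorrelations (placeholders):
agent.m1 = np.zeros(netsize)
agent.Cint = 0.1 * np.ones((netsize, netsize - 1))

agent.CriticalLearning(Iterations=10, T=200)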
Example #7
def main(args):
    print("IT'S DANGEROUS TO GO ALONE! TAKE THIS.")

    np.random.seed(0)
    pt.manual_seed(0)

    env = BipedalWalker()
    env.seed(0)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    print(f"Initializing agent (device={device})...")
    rnn = WorldModel(obs_dim, act_dim)
    ctrl = Controller(obs_dim + rnn.hid_dim, act_dim)

    # Adjust population size based on the number of available CPUs.
    num_workers = mp.cpu_count() if args.nproc is None else args.nproc
    num_workers = min(num_workers, mp.cpu_count())
    agents_per_worker = args.popsize // num_workers
    popsize = num_workers * agents_per_worker

    print(f"Initializing population with {popsize} workers...")
    pop = Population(num_workers, agents_per_worker)
    global_mu = np.zeros_like(ctrl.genotype)

    loss_logger = ValueLogger('ha_rnn_loss', bufsize=20)
    best_logger = ValueLogger('ha_ctrl_best', bufsize=100)

    # Train the RNN with random policies.
    print(f"Training M model with a random policy...")
    optimizer = optim.Adam(rnn.parameters(), lr=args.lr)
    train_rnn(rnn,
              optimizer,
              pop,
              random_policy=True,
              num_rollouts=args.num_rollouts,
              logger=loss_logger)
    loss_logger.plot('M model training loss', 'step', 'loss')

    # Upload the trained RNN.
    success = pop.upload_rnn(rnn.cpu())
    assert success

    # Iteratively update controller and RNN.
    for i in range(args.niter):
        # Evolve controllers with the trained RNN.
        print(f"Iter. {i}: Evolving C model...")
        es = EvolutionStrategy(global_mu, args.sigma0, popsize)
        evolve_ctrl(ctrl, es, pop, num_gen=args.num_gen, logger=best_logger)
        best_logger.plot('C model evolution', 'gen', 'fitness')

        # Update the global best individual and upload them.
        global_mu = np.copy(ctrl.genotype)
        success = pop.upload_ctrl(global_mu, noisy=True)
        assert success

        # Train the RNN with the current best controller.
        print(f"Iter. {i}: Training M model...")
        train_rnn(rnn,
                  optimizer,
                  pop,
                  random_policy=False,
                  num_rollouts=args.num_rollouts,
                  logger=loss_logger)
        loss_logger.plot('M model training loss', 'step', 'loss')

        # Upload the trained RNN.
        success = pop.upload_rnn(rnn.cpu())
        assert success

        # Test run!
        rollout(env, rnn, ctrl, render=True)

    success = pop.close()
    assert success
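main() reads nproc, popsize, lr, num_rollouts, niter, sigma0, and num_gen from args. A hypothetical argparse setup consistent with those accesses (attribute names taken from the function body, default values assumed) could look like this:

import argparse

# Hypothetical CLI wiring for main(); defaults are illustrative assumptions.
parser = argparse.ArgumentParser()
parser.add_argument("--nproc", type=int, default=None,
                    help="worker processes (None = use all CPUs)")
parser.add_argument("--popsize", type=int, default=64)
parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--num_rollouts", type=int, default=16)
parser.add_argument("--niter", type=int, default=10)
parser.add_argument("--sigma0", type=float, default=0.1)
parser.add_argument("--num_gen", type=int, default=30)

if __name__ == "__main__":
    main(parser.parse_args())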
Example #8
import cProfile
from bipedal_walker import BipedalWalker
from gp_gym_info import info

info["env_name"] = "BipedalWalker-v3"
info["pop_size"] = 100
info["max_gens"] = 10
info["num_eps"] = 1
info["num_time_steps"] = 1
info["tournament_size"] = 3
info["mutation_rate"] = 0.1
info["term_fit"] = 300
info["max_depth"] = 5

agent = BipedalWalker(info)
cProfile.run("agent.train()", "bipedal_walker.profile")
Example #9
def create_env(args):
    env = BipedalWalker(args.scale_legs)
    env = frame_stack(env, args)
    return env