def rollout(self, ntrials, render=False, timestep_limit=None, seed=None):
    rews = 0.0
    steps = 0
    # To ensure replicability (we always pass a valid seed, even if a fully-random evaluation is going to be run)
    if seed is not None:
        self.env.seed(seed)
        self.nn.seed(seed)
    # initialize the renderer for showing the behavior of the robot/s and the activation of the neurons
    if (self.test > 0):
        self.objs = np.arange(1000, dtype=np.float64)  # the environment can contain up to 100 objects to be displayed
        self.objs[0] = -1
        self.env.copyDobj(self.objs)
        import renderWorld
    # Loop over the number of trials
    for trial in range(ntrials):
        # if normalize=1, occasionally store data for input normalization
        if self.normalize:
            if np.random.uniform(low=0.0, high=1.0) < 0.01:
                normphase = 1
                self.nn.normphase(1)
            else:
                normphase = 0
        # Reset environment
        self.env.reset()
        # Reset network
        self.nn.resetNet()
        # Reset episode reward and step counter for the current trial
        rew = 0.0
        t = 0
        while t < self.maxsteps:
            # Activate the network
            self.nn.updateNet()
            # Perform a simulation step
            rew += self.env.step()
            t += 1
            # Render
            if (self.test > 0):
                self.env.render()
                info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew, rews)
                renderWorld.update(self.objs, info, self.ob, self.ac, self.nact)
            if self.done:
                break
        if (self.test > 0):
            print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
        # if we normalize, we might need to stop storing data for normalization
        if self.normalize and normphase > 0:
            self.nn.normphase(0)
        # Update total steps and reward
        steps += t
        rews += rew
    # Normalize the reward by the number of trials
    rews /= ntrials
    if (self.test > 0 and ntrials > 1):
        print("Average Fit %.2f Steps %.2f " % (rews, steps / float(ntrials)))
    return rews, steps

def rollout(self, ntrials, render=False, seed=None):
    rews = 0.0  # summed rewards
    steps = 0   # steps performed
    if (self.test == 2):
        # if the policy is used to test a trained agent and to visualize the neurons,
        # we need to initialize the graphic renderer
        import renderWorld
        self.objs = np.arange(10, dtype=np.float64)
        self.objs[0] = -1
    if seed is not None:
        self.env.seed(seed)  # set the seed of the environment, which affects the initialization of the robot/environment
        self.nn.seed(seed)   # set the seed of evonet, which affects the noise eventually added to the activation of the neurons
    for trial in range(ntrials):
        self.ob = self.env.reset()  # reset the environment at the beginning of a new episode
        self.nn.resetNet()          # reset the activation of the neurons (necessary for recurrent policies)
        rew = 0.0
        t = 0
        while t < self.maxsteps:
            self.nn.copyInput(np.float32(self.ob))  # pass the observation vector to evonet, converted from float64 to float32
            self.nn.updateNet()                     # update the activation of the policy network
            action = np.argmax(self.ac)             # select the action corresponding to the most activated output neuron
            self.ob, r, done, _ = self.env.step(action)  # perform a simulation step
            rew += r
            t += 1
            if render:
                if (self.test == 1):
                    self.env.render()
                    time.sleep(0.05)  # assumes `time` is imported at module level
                if (self.test == 2):
                    info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, r, rew)
                    renderWorld.update(self.objs, info, self.ob, self.ac, self.nact)
            if done:
                break
        if (self.test > 0):
            print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
        steps += t
        rews += rew
    rews /= ntrials  # normalize the reward by the number of trials
    if (self.test > 0 and ntrials > 1):
        print("Average Fit %.2f Steps %.2f " % (rews, steps / float(ntrials)))
    return rews, steps

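# Hypothetical usage of the rollout above (a sketch; `policy` stands for an
# object exposing this method and is not defined here): passing a fixed seed
# makes repeated evaluations reproduce the same trajectories, which is useful
# when comparing candidate solutions fairly.
#
#   avg_fit, tot_steps = policy.rollout(ntrials=3, seed=42)
#   print("average fitness %.2f over %d total steps" % (avg_fit, tot_steps))
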
def rollout(self, render=False, timestep_limit=None):
    """
    Run the policy for a number of trials and return the average
    reward across trials and the total number of steps performed.
    """
    rews = 0.0
    steps = 0
    # Set the number of trials depending on whether or not the generalization-test flag is set
    ntrials = self.ntrials
    if self.genTest:
        ntrials = self.nttrials
    # Loop over the number of trials
    for trial in range(ntrials):
        self.nn.normPhase(0)
        # Observations must be saved if and only if the normalization
        # flag is set to True and we are not in the test phase
        if self.normalize == 1 and not self.test:
            if np.random.uniform(low=0.0, high=1.0) < 0.01:
                # Save observations
                self.nn.normPhase(1)
        # Reset environment
        self.env.reset()
        # Reset network
        self.nn.resetNet()
        # Reward for the current trial
        crew = 0.0
        # Perform the steps
        t = 0
        while t < self.maxsteps:
            # Activate the network
            self.nn.updateNet()
            # Perform a step
            rew = self.env.step()
            # Accumulate the reward
            crew += rew
            t += 1
            if render:
                self.env.render()
                info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew, rews)
                # renderWorld is assumed to be imported at module level
                renderWorld.update(self.objs, info, self.ob, self.ac,
                                   self.nact[self.ninputs:len(self.nact) - self.noutputs])
            if self.done:
                break
        # Print the fitness of each trial during the test phase
        if self.test:
            print("Trial %d - fitness %.2f" % (trial, crew))
        # Update overall reward and steps
        rews += crew
        steps += t
    # Normalize the reward by the number of trials
    rews /= ntrials
    return rews, steps

def rollout(self, ntrials, render=False, seed=None):
    rews = 0.0  # summed reward
    steps = 0   # steps performed
    if seed is not None:
        self.env.seed(seed)  # set the seed of the environment, which affects the initialization of the robot/environment
        self.nn.seed(seed)   # set the seed of evonet, which affects the noise eventually added to the activation of the neurons
    if (self.test > 0):
        # if the policy is used to test a trained agent and to visualize the neurons,
        # we need to initialize the graphic renderer
        self.objs = np.arange(1000, dtype=np.float64)
        self.objs[0] = -1
        self.env.copyDobj(self.objs)
        import renderWorld
    for trial in range(ntrials):
        self.env.reset()    # reset the environment at the beginning of a new episode
        self.nn.resetNet()  # reset the activation of the neurons (necessary for recurrent policies)
        rew = 0.0
        t = 0
        while t < self.maxsteps:
            self.nn.updateNet()     # update the activation of the policy network
            rew += self.env.step()  # perform a simulation step
            t += 1
            if (self.test > 0):
                self.env.render()
                info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew, rews)
                renderWorld.update(self.objs, info, self.ob, self.ac, self.nact)
            if self.done:
                break
        if (self.test > 0):
            print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
        steps += t
        rews += rew
    rews /= ntrials  # normalize the reward by the number of trials
    if (self.test > 0 and ntrials > 1):
        print("Average Fit %.2f Steps %.2f " % (rews, steps / float(ntrials)))
    return rews, steps

def rollout(self, ntrials, render=False, timestep_limit=None, seed=None):
    rews = 0.0
    steps = 0
    # initialize the renderer for showing the activation of the neurons
    if (self.test == 2):
        import renderWorld
        self.objs = np.arange(10, dtype=np.float64)
        self.objs[0] = -1
    # To ensure replicability (we always pass a valid seed, even if a fully-random evaluation is going to be run)
    if seed is not None:
        self.env.seed(seed)
        self.nn.seed(seed)
    # Loop over the number of trials
    for trial in range(ntrials):
        # if normalize=1, occasionally store data for input normalization
        if self.normalize:
            if np.random.uniform(low=0.0, high=1.0) < 0.01:
                normphase = 1
                self.nn.normphase(1)
            else:
                normphase = 0
        # Reset environment
        self.ob = self.env.reset()
        # Reset network
        self.nn.resetNet()
        # Reset episode reward and step counter for the current trial
        rew = 0.0
        t = 0
        while t < self.maxsteps:
            # Copy the input into the network
            self.nn.copyInput(np.float32(self.ob))
            # Activate the network
            self.nn.updateNet()
            # Convert the action array into an integer (index of the most activated output neuron)
            action = np.argmax(self.ac)
            # Perform a step
            self.ob, r, done, _ = self.env.step(action)
            # Accumulate the reward
            rew += r
            t += 1
            if render:
                if (self.test == 1):
                    self.env.render()
                    time.sleep(0.05)
                if (self.test == 2):
                    info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, r, rew)
                    renderWorld.update(self.objs, info, self.ob, self.ac, self.nact)
            if done:
                break
        if (self.test > 0):
            print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
        # if we normalize, we might need to stop storing data for normalization
        if self.normalize and normphase > 0:
            self.nn.normphase(0)
        # Update total steps and reward
        steps += t
        rews += rew
    # Normalize the reward by the number of trials
    rews /= ntrials
    if (self.test > 0 and ntrials > 1):
        print("Average Fit %.2f Steps %.2f " % (rews, steps / float(ntrials)))
    return rews, steps

def rollout(self, render=False, timestep_limit=None):
    """
    Run the policy for a number of trials and return the average
    reward across trials and the total number of steps performed.
    """
    # Retrieve the episode-length limit from the environment (legacy gym API);
    # guard against a missing limit so min() is never called with None
    env_timestep_limit = self.env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    if timestep_limit is None:
        timestep_limit = env_timestep_limit
    elif env_timestep_limit is not None:
        timestep_limit = min(timestep_limit, env_timestep_limit)
    if timestep_limit is None:
        timestep_limit = self.maxsteps
    rews = 0.0
    steps = 0
    # Set the number of trials depending on whether or not the generalization-test flag is set
    ntrials = self.ntrials
    if self.genTest:
        ntrials = self.nttrials
    # Loop over the number of trials
    for trial in range(ntrials):
        self.nn.normPhase(0)
        # Observations must be saved if and only if the normalization
        # flag is set to True and we are not in the test phase
        if self.normalize == 1 and not self.test:
            if np.random.uniform(low=0.0, high=1.0) < 0.01:
                # Save observations
                self.nn.normPhase(1)
        # Reset environment
        self.ob = self.env.reset()
        # Reset network
        self.nn.resetNet()
        # Reward for the current trial
        crew = 0.0
        # Perform the steps
        t = 0
        while t < timestep_limit:
            # Copy the input pointer into the network
            self.nn.copyInput(self.ob)
            # Activate the network
            self.nn.updateNet()
            # Perform a step
            self.ob, rew, done, _ = self.env.step(self.ac)
            # Accumulate the reward
            crew += rew
            t += 1
            if render:
                if self.displayneurons == 0:
                    self.env.render(mode="human")
                    time.sleep(0.05)
                else:
                    info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew, rews)
                    renderWorld.update(self.objs, info, self.ob, self.ac,
                                       self.nact[self.ninputs:len(self.nact) - self.noutputs])
            if done:
                break
        # Print the fitness of each trial during the test phase
        if self.test:
            print("Trial %d - fitness %.2f" % (trial, crew))
        # Update overall reward and steps
        rews += crew
        steps += t
    # Normalize the reward by the number of trials
    rews /= ntrials
    return rews, steps

def rollout(self, net_1, net_2, ntrials, render=False, timestep_limit=None, seed=None):
    rews = 0.0
    steps = 0
    ## Osipov ##################################################
    # coefficient regulating the additional reward (curiosity bonus)
    curiosity_bonus = 0.0
    coef = 1.0
    MSE = nn.MSELoss()  # assumes `torch` and `torch.nn as nn` are imported at module level
    observations = []
    targets_tensor_vectors = []
    ## Osipov ##################################################
    # To ensure replicability (we always pass a valid seed, even if a fully-random evaluation is going to be run)
    if seed is not None:
        self.env.seed(seed)
        self.nn.seed(seed)
    # initialize the renderer for showing the behavior of the robot/s and the activation of the neurons
    if (self.test > 0):
        self.objs = np.arange(1000, dtype=np.float64)  # the environment can contain up to 100 objects to be displayed
        self.objs[0] = -1
        self.env.copyDobj(self.objs)
        import renderWorld
    # Loop over the number of trials
    for trial in range(ntrials):
        # if normalize=1, occasionally store data for input normalization
        if self.normalize:
            if np.random.uniform(low=0.0, high=1.0) < 0.01:
                normphase = 1
                self.nn.normphase(1)
            else:
                normphase = 0
        # Reset environment
        self.env.reset()
        # Reset network
        self.nn.resetNet()
        # Reset episode reward and step counter for the current trial
        rew = 0.0
        t = 0
        while t < self.maxsteps:
            # Activate the network
            self.nn.updateNet()
            # Perform a step
            rew += self.env.step()
            ## Osipov ##################################################
            # compare the outputs of the target (net_1) and predictor (net_2)
            # networks on the current observation; their disagreement is
            # accumulated as a curiosity bonus
            tensor_obs = torch.from_numpy(self.ob)
            out_net_1 = net_1(tensor_obs)
            out_net_2 = net_2(tensor_obs)
            observations += [tensor_obs]
            targets_tensor_vectors += [out_net_1]
            curiosity_bonus += MSE(out_net_2, out_net_1).item()
            ## Osipov ##################################################
            t += 1
            # Render
            if (self.test > 0):
                self.env.render()
                info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew, rews)
                renderWorld.update(self.objs, info, self.ob, self.ac, self.nact)
            if self.done:
                break
        if (self.test > 0):
            print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
        # if we normalize, we might need to stop storing data for normalization
        if self.normalize and normphase > 0:
            self.nn.normphase(0)
        # Update total steps
        steps += t
        # add the curiosity bonus accumulated so far to the trial reward
        rews += rew + coef * curiosity_bonus
    # Normalize the reward by the number of trials
    rews /= ntrials
    if (self.test > 0 and ntrials > 1):
        print("Average Fit %.2f Steps %.2f " % (rews, steps / float(ntrials)))
    ## Osipov ##
    return rews, steps, observations, targets_tensor_vectors
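
# A minimal sketch of the two networks assumed by the curiosity-bonus rollout
# above, in the style of Random Network Distillation (Burda et al., 2018):
# net_1 plays the role of a fixed, randomly initialized target network and
# net_2 the predictor trained to match it, so their disagreement (the MSE
# accumulated above) stays largest on rarely visited observations. The sizes,
# names, and training loop below are illustrative assumptions, not the
# author's code.
import torch
import torch.nn as nn

OBS_SIZE, FEAT_SIZE = 24, 16  # assumed observation / feature dimensions

def make_net():
    return nn.Sequential(nn.Linear(OBS_SIZE, 32), nn.Tanh(), nn.Linear(32, FEAT_SIZE))

net_1 = make_net()  # target network: randomly initialized, never trained
for p in net_1.parameters():
    p.requires_grad_(False)
net_2 = make_net()  # predictor network: regressed onto the target outputs
optimizer = torch.optim.Adam(net_2.parameters(), lr=1e-3)

def train_predictor(observations, targets):
    # Fit the predictor to the target outputs stored during a rollout,
    # which shrinks the bonus for observations that have been seen before
    loss = torch.stack([nn.functional.mse_loss(net_2(o.float()), t.float())
                        for o, t in zip(observations, targets)]).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()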