Example #1
 def rollout(self, ntrials, render=False, timestep_limit=None, seed=None):
     rews = 0.0
     steps = 0
     # To ensure replicability, we always pass a valid seed, even when a fully random evaluation is run
     if seed is not None:
         self.env.seed(seed)
         self.nn.seed(seed)
     # initialize the render for showing the behavior of the robot/s and the activation of the neurons
     if (self.test > 0):
         self.objs = np.arange(
             1000, dtype=np.float64
         )  # the environment can contain up to 100 objects to be displayed
         self.objs[0] = -1
         self.env.copyDobj(self.objs)
         import renderWorld
     # Loop over the number of trials
     for trial in range(ntrials):
         # if normalize=1, occasionally we store data for input normalization
         if self.normalize:
             if np.random.uniform(low=0.0, high=1.0) < 0.01:
                 normphase = 1
                 self.nn.normphase(1)
             else:
                 normphase = 0
         # Reset environment
         self.env.reset()
         # Reset network
         self.nn.resetNet()
         # Reset episode-reward and step counter for current trial
         rew = 0.0
         t = 0
         while t < self.maxsteps:
             # Activate network
             self.nn.updateNet()
             # Perform a step
             rew += self.env.step()
             t += 1
             # Render
             if (self.test > 0):
                 self.env.render()
                 info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew,
                                                            rews)
                 renderWorld.update(self.objs, info, self.ob, self.ac,
                                    self.nact)
             if self.done:
                 break
         if (self.test > 0):
             print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
         # if we normalize, we might need to stop storing data for normalization
         if self.normalize and normphase > 0:
             self.nn.normphase(0)
         # Update steps
         steps += t
         rews += rew
     # Normalize reward by the number of trials
     rews /= ntrials
     if (self.test > 0 and ntrials > 1):
         print("Average Fit %.2f Steps %.2f " %
               (rews, steps / float(ntrials)))
     return rews, steps
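In Example #1 (and in Examples #3, #5, and #6 below) the normphase/normPhase calls switch the network into a phase where, on roughly 1% of the trials, observations are stored for input normalization. As a rough illustration of what such a phase typically feeds, here is a minimal, self-contained sketch of a running mean/standard-deviation normalizer; the RunningNorm class and its method names are illustrative assumptions, not part of the evonet API used in the examples.

import numpy as np

class RunningNorm:
    # Illustrative running normalizer: accumulates sums on selected trials
    # and normalizes observations with the resulting mean and std.
    def __init__(self, size):
        self.count = 0
        self.sum = np.zeros(size)
        self.sumsq = np.zeros(size)
        self.collect = False              # plays the role of normphase(1)/normphase(0)

    def update(self, ob):
        if self.collect:
            self.count += 1
            self.sum += ob
            self.sumsq += ob * ob

    def normalize(self, ob):
        if self.count == 0:
            return ob
        mean = self.sum / self.count
        std = np.sqrt(np.maximum(self.sumsq / self.count - mean * mean, 1e-8))
        return (ob - mean) / std

norm = RunningNorm(4)
norm.collect = np.random.uniform() < 0.01    # store statistics on ~1% of trials
ob = np.random.randn(4)
norm.update(ob)
print(norm.normalize(ob))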
Example #2
 def rollout(self, ntrials, render=False, seed=None):
     rews = 0.0  # summed rewards
     steps = 0  # steps performed
     if (self.test == 2):
         # if the policy is used to test a trained agent and to visualize the neurons, we need to initialize the graphic renderer
         import renderWorld
         self.objs = np.arange(10, dtype=np.float64)
         self.objs[0] = -1
     if seed is not None:
         self.env.seed(seed)  # set the seed of the environment, which affects the initialization of the robot/environment
         self.nn.seed(seed)  # set the seed of evonet, which affects the noise that may be added to the activation of the neurons
     for trial in range(ntrials):
         self.ob = self.env.reset()  # reset the environment at the beginning of a new episode
         self.nn.resetNet()  # reset the activation of the neurons (necessary for recurrent policies)
         rew = 0.0
         t = 0
         while t < self.maxsteps:
             self.nn.copyInput(np.float32(self.ob))  # pass the observation vector to evonet, converting it from float64 to float32
             self.nn.updateNet()  # update the activation of the policy
             action = np.argmax(self.ac)  # select the action corresponding to the most activated output neuron
             self.ob, r, done, _ = self.env.step(action)  # perform a simulation step
             rew += r
             t += 1
             if render:
                 if (self.test == 1):
                     self.env.render()
                     time.sleep(0.05)
                 if (self.test == 2):
                     info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, r,
                                                                rew)
                     renderWorld.update(self.objs, info, self.ob, self.ac,
                                        self.nact)
             if done:
                 break
         if (self.test > 0):
             print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
         steps += t
         rews += rew
     rews /= ntrials  # Normalize reward by the number of trials
     if (self.test > 0 and ntrials > 1):
         print("Average Fit %.2f Steps %d " %
               (rews, steps / float(ntrials)))
     return rews, steps
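Example #2 (like Example #5 below) implements a discrete-action rollout: the observation is copied into the network, the network is activated, the most activated output is selected with argmax, and the environment is stepped. The snippet below reproduces that loop in a self-contained form with plain gym and a fixed random linear policy; the LinearPolicy class and the CartPole-v1 environment are illustrative choices, and the classic gym API (reset() returning the observation, step() returning a 4-tuple) is assumed, as in the example.

import gym
import numpy as np

class LinearPolicy:
    # Illustrative stand-in for evonet: one linear layer, argmax over the outputs
    def __init__(self, ninputs, noutputs, seed=0):
        rng = np.random.default_rng(seed)
        self.w = rng.standard_normal((ninputs, noutputs)).astype(np.float32)

    def act(self, ob):
        return int(np.argmax(np.float32(ob) @ self.w))

env = gym.make('CartPole-v1')
policy = LinearPolicy(env.observation_space.shape[0], env.action_space.n)

maxsteps = 500
ob = env.reset()                      # classic gym API assumed here
rew, t = 0.0, 0
while t < maxsteps:
    action = policy.act(ob)           # argmax over the output activations
    ob, r, done, _ = env.step(action)
    rew += r
    t += 1
    if done:
        break
print("Fit %.2f Steps %d" % (rew, t))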
Example #3
 def rollout(self, render=False, timestep_limit=None):
     """
     If random_stream is provided, the rollout will take noisy actions with noise drawn from that stream.
     Otherwise, no action noise will be added.
     """
     rews = 0.0
     steps = 0
     # Set the number of trials depending on whether the generalization-test flag is set
     ntrials = self.ntrials
     if self.genTest:
         ntrials = self.nttrials
     # Loop over the number of trials
     for trial in range(ntrials):
         self.nn.normPhase(0)
         # Observations must be saved if and only if normalization
         # flag is set to True and we are not in test phase
         if self.normalize == 1 and not self.test:
             if np.random.uniform(low=0.0, high=1.0) < 0.01:
                 # Save observations
                 self.nn.normPhase(1)
         # Reset environment
         self.env.reset()
         # Reset network
         self.nn.resetNet()
         # Reward for current trial
         crew = 0.0
         # Perform the steps
         t = 0
         while t < self.maxsteps:
             # Activate network
             self.nn.updateNet()
             # Perform a step
             rew = self.env.step()
             # Append the reward
             crew += rew
             t += 1
             if render:
                 self.env.render()
                 info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew, rews)
                 renderWorld.update(self.objs, info, self.ob, self.ac, self.nact[self.ninputs:len(self.nact)-self.noutputs])
             if self.done:
                 break
         # Print fitness for each trial during test phase
         if self.test:
             print("Trial %d - fitness %lf" % (trial, crew))
         # Update overall reward
         rews += crew
         # Update steps
         steps += t
     # Normalize reward by the number of trials
     rews /= ntrials
     return rews, steps
Example #4
 def rollout(self, ntrials, render=False, seed=None):
     rews = 0.0  # summed reward
     steps = 0  # steps performed
     if seed is not None:
         self.env.seed(seed)  # set the seed of the environment, which affects the initialization of the robot/environment
         self.nn.seed(seed)  # set the seed of evonet, which affects the noise that may be added to the activation of the neurons
     if (self.test > 0):
         # if the policy is used to test a trained agent and to visualize the neurons, we need to initialize the graphic renderer
         self.objs = np.arange(1000, dtype=np.float64)
         self.objs[0] = -1
         self.env.copyDobj(self.objs)
         import renderWorld
     for trial in range(ntrials):
         self.env.reset()  # reset the environment at the beginning of a new episode
         self.nn.resetNet()  # reset the activation of the neurons (necessary for recurrent policies)
         rew = 0.0
         t = 0
         while t < self.maxsteps:
             self.nn.updateNet()  # update the activation of the policy
             rew += self.env.step()  # perform a simulation step
             t += 1
             if (self.test > 0):
                 self.env.render()
                 info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew,
                                                            rews)
                 renderWorld.update(self.objs, info, self.ob, self.ac,
                                    self.nact)
             if self.done:
                 break
         if (self.test > 0):
             print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
         steps += t
         rews += rew
     rews /= ntrials  # Normalize reward by the number of trials
     if (self.test > 0 and ntrials > 1):
         print("Average Fit %.2f Steps %.2f " %
               (rews, steps / float(ntrials)))
     return rews, steps
Example #5
 def rollout(self, ntrials, render=False, timestep_limit=None, seed=None):
     rews = 0.0
     steps = 0
     # initialize the render for showing the activation of the neurons
     if (self.test == 2):
         import renderWorld
         self.objs = np.arange(10, dtype=np.float64)
         self.objs[0] = -1
     # To ensure replicability, we always pass a valid seed, even when a fully random evaluation is run
     if seed is not None:
         self.env.seed(seed)
         self.nn.seed(seed)
     # Loop over the number of trials
     for trial in range(ntrials):
         # if normalize=1, occasionally we store data for input normalization
         if self.normalize:
             if np.random.uniform(low=0.0, high=1.0) < 0.01:
                 normphase = 1
                 self.nn.normphase(1)
             else:
                 normphase = 0
         # Reset environment
         self.ob = self.env.reset()
         # Reset network
         self.nn.resetNet()
         # Reset episode-reward and step counter for current trial
         rew = 0.0
         t = 0
         while t < self.maxsteps:
             # Copy the input in the network
             self.nn.copyInput(np.float32(self.ob))
             # Activate network
             self.nn.updateNet()
             # Convert the action array into an integer
             action = np.argmax(self.ac)
             # Perform a step
             self.ob, r, done, _ = self.env.step(action)
             # Append the reward
             rew += r
             t += 1
             if render:
                 if (self.test == 1):
                     self.env.render()
                     time.sleep(0.05)
                 if (self.test == 2):
                     info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, r,
                                                                rew)
                     renderWorld.update(self.objs, info, self.ob, self.ac,
                                        self.nact)
             if done:
                 break
         if (self.test > 0):
             print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
         # if we normalize, we might need to stop storing data for normalization
         if self.normalize and normphase > 0:
             self.nn.normphase(0)
         # Update steps
         steps += t
         rews += rew
     # Normalize reward by the number of trials
     rews /= ntrials
     if (self.test > 0 and ntrials > 1):
         print("Average Fit %.2f Steps %d " %
               (rews, steps / float(ntrials)))
     return rews, steps
Example #6
    def rollout(self, render=False, timestep_limit=None):
        """
        If random_stream is provided, the rollout will take noisy actions with noise drawn from that stream.
        Otherwise, no action noise will be added.
        """
        env_timestep_limit = self.env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
        # resolve the timestep limit, guarding against a missing environment limit
        if timestep_limit is None:
            timestep_limit = env_timestep_limit
        elif env_timestep_limit is not None:
            timestep_limit = min(timestep_limit, env_timestep_limit)
        if timestep_limit is None:
            timestep_limit = self.maxsteps
        rews = 0.0
        steps = 0
        # Set the number of trials depending on whether the generalization-test flag is set
        ntrials = self.ntrials
        if self.genTest:
            ntrials = self.nttrials
        # Loop over the number of trials
        for trial in range(ntrials):
            self.nn.normPhase(0)
            # Observations must be saved if and only if normalization
            # flag is set to True and we are not in test phase
            if self.normalize == 1 and not self.test:
                if np.random.uniform(low=0.0, high=1.0) < 0.01:
                    # Save observations
                    self.nn.normPhase(1)
            # Reset environment
            self.ob = self.env.reset()
            # Reset network
            self.nn.resetNet()
            # Reward for current trial
            crew = 0.0
            # Perform the steps
            t = 0
            while t < timestep_limit:
                # Copy the input pointer to the network
                self.nn.copyInput(self.ob)
                # Activate network
                self.nn.updateNet()
                # Perform a step
                self.ob, rew, done, _ = self.env.step(self.ac)
                # Append the reward
                crew += rew
                t += 1
                if render:
                    if self.displayneurons == 0:
                        self.env.render(mode="human")
                        time.sleep(0.05)
                    else:
                        info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew, rews)
                        renderWorld.update(self.objs, info, self.ob, self.ac, self.nact[self.ninputs:len(self.nact)-self.noutputs])
                if done:
                    break
            # Print fitness for each trial during test phase
            if self.test:
                print("Trial %d - fitness %lf" % (trial, crew))
            # Update overall reward
            rews += crew
            # Update steps
            steps += t
        # Normalize reward by the number of trials
        rews /= ntrials
        return rews, steps
    def rollout(self,
                net_1,
                net_2,
                ntrials,
                render=False,
                timestep_limit=None,
                seed=None):
        rews = 0.0
        steps = 0

        ##Osipov##################################################
        # coef regulates the additional reward (curiosity_bonus)
        curiosity_bonus = 0
        coef = 1.0
        MSE = nn.MSELoss()  # torch.nn is assumed to be imported as nn at module level
        observations = []  # observations collected during the rollout
        targets_tensor_vectors = []  # outputs of net_1, stored as prediction targets
        ##Osipov#####################################################

        # To ensure replicability, we always pass a valid seed, even when a fully random evaluation is run
        if seed is not None:
            self.env.seed(seed)
            self.nn.seed(seed)
        # initialize the render for showing the behavior of the robot/s and the activation of the neurons
        if (self.test > 0):
            self.objs = np.arange(
                1000, dtype=np.float64
            )  # the environment can contain up to 100 objects to be displayed
            self.objs[0] = -1
            self.env.copyDobj(self.objs)
            import renderWorld
        # Loop over the number of trials
        for trial in range(ntrials):
            # if normalize=1, occasionally we store data for input normalization
            if self.normalize:
                if np.random.uniform(low=0.0, high=1.0) < 0.01:
                    normphase = 1
                    self.nn.normphase(1)
                else:
                    normphase = 0
            # Reset environment
            self.env.reset()
            # Reset network
            self.nn.resetNet()
            # Reset episode-reward and step counter for current trial
            rew = 0.0
            t = 0
            while t < self.maxsteps:
                # Activate network
                self.nn.updateNet()
                # Perform a step
                rew += self.env.step()
                ##Osipov############################################
                # accumulate a curiosity bonus: the prediction error of net_2 with respect to net_1
                tensor_obs = torch.from_numpy(self.ob)
                out_net_1 = net_1(tensor_obs)
                out_net_2 = net_2(tensor_obs)
                observations += [tensor_obs]
                targets_tensor_vectors += [out_net_1]
                curiosity_bonus += MSE(out_net_2, out_net_1).item()
                ##Osipov#############################################
                t += 1
                # Render
                if (self.test > 0):
                    self.env.render()
                    info = 'Trial %d Step %d Fit %.2f %.2f' % (trial, t, rew,
                                                               rews)
                    renderWorld.update(self.objs, info, self.ob, self.ac,
                                       self.nact)
                if self.done:
                    break
            if (self.test > 0):
                print("Trial %d Fit %.2f Steps %d " % (trial, rew, t))
            # if we normalize, we might need to stop storing data for normalization
            if self.normalize and normphase > 0:
                self.nn.normphase(0)
            # Update steps
            steps += t
            # Add the curiosity bonus, scaled by coef, to the summed reward
            rews += rew + coef * curiosity_bonus
        # Normalize reward by the number of trials
        rews /= ntrials
        if (self.test > 0 and ntrials > 1):
            print("Average Fit %.2f Steps %.2f " %
                  (rews, steps / float(ntrials)))
        ##Osipov##
        return rews, steps, observations, targets_tensor_vectors
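The ##Osipov## additions in the last example accumulate a curiosity bonus equal to the prediction error between two networks and collect the observations together with the net_1 outputs for later training, a scheme that resembles random network distillation. Below is a minimal, self-contained PyTorch sketch of that idea; the network sizes, the make_net helper, and the predictor training step are assumptions for illustration, not code taken from the example.

import torch
import torch.nn as nn

def make_net(ninputs, nfeatures):
    # small MLP used for both the fixed target (net_1) and the predictor (net_2)
    return nn.Sequential(nn.Linear(ninputs, 32), nn.Tanh(), nn.Linear(32, nfeatures))

ninputs, nfeatures = 4, 8
net_1 = make_net(ninputs, nfeatures)   # fixed random target network
net_2 = make_net(ninputs, nfeatures)   # predictor network
for p in net_1.parameters():
    p.requires_grad_(False)

MSE = nn.MSELoss()
optimizer = torch.optim.Adam(net_2.parameters(), lr=1e-3)

# during the rollout: accumulate the bonus and store observations/targets
observations, targets = [], []
curiosity_bonus = 0.0
for _ in range(10):                    # stands in for the environment steps
    ob = torch.randn(ninputs)
    out_1 = net_1(ob)
    out_2 = net_2(ob)
    observations.append(ob)
    targets.append(out_1)
    curiosity_bonus += MSE(out_2, out_1).item()

# after the rollout: train the predictor to match the stored targets,
# so frequently visited observations stop yielding a bonus
obs_batch = torch.stack(observations)
target_batch = torch.stack(targets)
loss = MSE(net_2(obs_batch), target_batch)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print("bonus %.4f  predictor loss %.4f" % (curiosity_bonus, loss.item()))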