Example 1
def run_episodes(sess, env, n_eps, n_steps, render, obs_in, pi_out, pi_logits_out, rnn_state_in, rnn_state_out, predict_ac_op, f, seed):
    logger = logging.getLogger(__name__)
    ep_length = []
    ep_return = []
    logger.info('---------------- Episode results -----------------------')
    for i in range(0, n_eps):  # TODO: parallelize the episodes. Problem: guaranteeing the same sequence of random numbers in each parallel process. Solution: index-based RNG instead of sequential seed-based RNG.
        obs = env.reset()
        obs = normalize_obs(obs)
        done = False
        if rnn_state_in is not None:
            if len(rnn_state_in) > 1:
                rnn_s_in = (np.zeros(rnn_state_in[0].shape), np.zeros(rnn_state_in[1].shape))  # init lstm cell vector
            else:
                rnn_s_in = np.zeros(len(rnn_state_in))  # init gru cell vector
        total_return = 0
        total_length = -1
        reward = 0
        i_sample = 0
        if f is not None:
            rew_traj = []

        while not done and (i_sample < n_steps):
            i_sample += 1
            total_length += 1
            total_return += reward  # add the reward of the previous step, so that the terminal reward is not counted in the return.

            if rnn_state_in is not None:
                pi, pi_log, act, rnn_s_out = sess.run([pi_out, pi_logits_out, predict_ac_op, rnn_state_out], feed_dict={obs_in[0]: [obs], rnn_state_in: rnn_s_in})
            else:
                pi, pi_log, act = sess.run([pi_out, pi_logits_out, predict_ac_op], feed_dict={obs_in[0]: [obs]})
            ac = np.argmax(pi_log)
            obs, reward, done, _ = env.step(ac)
            obs = normalize_obs(obs)

            if f is not None:
                rew_traj.append(reward)

            if render:
                env.render()

            if rnn_state_in is not None:
                rnn_s_in = rnn_s_out
        logger.info('Episode %s: %s, %s' % (i, total_return, total_length))
        ep_length.append(total_length)
        ep_return.append(total_return)

        if f is not None:
            with open(f, "a") as csvfile:
                writer = csv.writer(csvfile)
                rew_traj[0:0] = [seed, i, np.mean(rew_traj)]
                writer.writerow(rew_traj)

    return ep_return
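
When f is given, run_episodes appends one CSV row per episode with the layout [seed, episode index, mean step reward, r_0, r_1, ...]. A small sketch of reading such a file back; the file name is hypothetical and stands for whatever path was passed as f:

import csv
import numpy as np

# Read the per-episode reward trajectories written by run_episodes above.
with open("episode_rewards.csv") as csvfile:  # hypothetical path (the f argument)
    for row in csv.reader(csvfile):
        seed, episode, mean_rew = int(row[0]), int(row[1]), float(row[2])
        step_rewards = np.asarray(row[3:], dtype=np.float32)
        print("seed %s, episode %s: mean reward %.3f over %s steps"
              % (seed, episode, mean_rew, len(step_rewards)))
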
Example 2
        def test_run(env, n_eps, n_pipes):
            self.logger.info('Evaluating current agent')
            ep_return = []
            ep_length = []
            for i in range(0, n_eps):
                obs = env.reset()
                obs = normalize_obs(obs)
                done = False
                if eval_model.initial_state is not None:
                    if len(eval_model.initial_state) > 1:
                        rnn_s_in = (np.zeros(
                            eval_model.initial_state[0].shape),
                                    np.zeros(eval_model.initial_state[1].shape)
                                    )  # init lstm cell vector
                    else:
                        rnn_s_in = np.zeros(eval_model.initial_state.shape
                                            )  # init gru cell vector
                total_return = 0
                total_length = 0

                while not done and (total_return < n_pipes):
                    # self.logger.info(total_return)
                    if eval_model.initial_state is not None:
                        pi, pi_log, act, rnn_s_out = sess.run(
                            [
                                eval_model.pi, eval_model.pi_logit,
                                eval_model.ac, eval_model.rnn_state_out
                            ],
                            feed_dict={
                                eval_model.X: [obs],
                                eval_model.rnn_state_in: rnn_s_in
                            })
                    else:
                        pi, pi_log, act = sess.run([
                            eval_model.pi, eval_model.pi_logit, eval_model.ac
                        ],
                                                   feed_dict={
                                                       eval_model.X: [obs]
                                                   })
                    ac = np.argmax(pi_log)
                    obs, reward, done, _ = env.step(ac)
                    obs = normalize_obs(obs)
                    total_length += 1
                    total_return += reward
                    if eval_model.initial_state is not None:
                        rnn_s_in = rnn_s_out
                self.logger.info('Episode %s: %s, %s' %
                                 (i, total_return, total_length))
                ep_length.append(total_length)
                ep_return.append(total_return)
            return ep_return
Example 3
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_rawrewards = [], [], [], [], [], []
        mb_states = self.states
        for n in range(self.nsteps):
            actions, pi, values, states, _ = self.model.step(
                self.obs, self.states)  # , self.dones) ?
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            obs = normalize_obs(obs)
            self.logger.debug('Observations: %s' % obs)

            # render only every i-th episode
            if self.show_interval != 0:
                if (self.ep_idx[0] % self.show_interval) == 0:
                    self.env.render()

            self.eplength = [
                self.eplength[i] + 1 for i in range(self.nenv)
            ]  # TODO: use the helper functions already implemented in run_ple_utils
            self.epreturn = [
                self.epreturn[i] + rewards[i] for i in range(self.nenv)
            ]
            for i in range(self.nenv):
                self.reward_window[i].append(rewards[i])

            # Check for terminal states in every env
            for i, done in enumerate(dones):  # i -> environment ID
                if done:
                    self.ep_idx[i] += 1
                    self.obs[i] = self.obs[i] * 0

                    # update tensorboard summary
                    if self.summary_writer is not None:
                        summary = tf.Summary()
                        summary.value.add(
                            tag='envs/environment%s/episode_length' % i,
                            simple_value=self.eplength[i])
                        summary.value.add(
                            tag='envs/environment%s/episode_reward' % i,
                            simple_value=self.epreturn[i])
                        self.summary_writer.add_summary(
                            summary, self.ep_idx[i])  #self.global_step.eval())
                        self.summary_writer.flush()
                    # self.retbuffer.append(self.epreturn[i])
                    if self.epreturn[i] > self.return_threshold:
                        self.return_threshold = self.epreturn[i]
                        self.logger.info('Save model at max reward %s' %
                                         self.return_threshold)
                        self.model.save('inter_model')
                    self.eplength[i] = 0
                    self.epreturn[i] = 0

            # # Is not necessary, as the environment is continuous now!
            # # Reset RNN state vector to 0 if previous sample is a terminating one.
            # # As no history should be used in rnn training then.
            # if states:
            #     env_was_done = False
            #     for i, done in enumerate(self.dones):
            #         if done and not env_was_done:
            #             env_was_done = True
            #             c_new = states[0]
            #             h_new = states[1]
            #             c_new[i] = np.zeros_like(c_new[i])
            #             h_new[i] = np.zeros_like(h_new[i])
            #         elif done:
            #             c_new[i] = np.zeros_like(c_new[i])
            #             h_new[i] = np.zeros_like(h_new[i])
            #     if env_was_done:
            #         states = tf.contrib.rnn.LSTMStateTuple(c_new, h_new)
            #         # print(states)

            self.states = states
            self.dones = dones
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        # mb_masks = mb_dones[:, :-1] ?
        mb_rawrewards = np.copy(mb_rewards)
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states).tolist()

        # discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            self.logger.debug('Discounted rewards: %s' % rewards)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()

        self.logger.debug('Actions: %s' % mb_actions)
        self.logger.debug('Q values: %s' % mb_values)
        self.logger.debug('Observations: %s' % mb_obs)

        return mb_obs, mb_states, mb_rewards, mb_actions, mb_values, self.reward_window, mb_rawrewards  # self.avg_return_n_episodes
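
Example 3 relies on discount_with_dones to turn the per-step rewards into discounted returns, bootstrapping with the critic value when the rollout does not end in a terminal state. A minimal sketch of that helper, assuming the OpenAI-baselines-style signature used above:

def discount_with_dones(rewards, dones, gamma):
    # Accumulate discounted returns back-to-front; a done flag resets the
    # running return at episode boundaries.
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]

With rewards + [value] and dones + [0] appended (the non-terminal branch above), the bootstrap value is discounted into every step and then dropped again by the trailing [:-1].
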
Example 4
def run_episodes(sess, env, n_eps, n_steps, render, obs_in, rnn_state_in,
                 rnn_state_out, predQ_out, f, seed):
    logger = logging.getLogger(__name__)
    ep_length = []
    ep_return = []
    logger.info('---------------- Episode results -----------------------')
    for i in range(0, n_eps):  # TODO parallelize this here!
        obs = env.reset()
        obs = normalize_obs(obs)
        done = False
        if rnn_state_in is not None:
            if len(rnn_state_in) > 1:
                rnn_s_in = (np.zeros(rnn_state_in[0].shape),
                            np.zeros(rnn_state_in[1].shape)
                            )  # init lstm cell vector
            else:
                rnn_s_in = np.zeros(len(rnn_state_in))  # init gru cell vector
        total_return = 0
        total_length = -1
        reward = 0
        i_sample = 0
        if f is not None:
            rew_traj = []

        while not done and (i_sample < n_steps):
            i_sample += 1
            total_length += 1
            total_return += reward  # add the reward of the previous step, so that the terminal reward is not counted in the return.

            if rnn_state_in is not None:
                pQ, rnn_s_out = sess.run([predQ_out, rnn_state_out],
                                         feed_dict={
                                             obs_in[0]: [obs],
                                             rnn_state_in: rnn_s_in
                                         })
            else:
                pQ = sess.run([predQ_out], feed_dict={obs_in[0]: [obs]})
            best_ac = np.argmax(pQ)  # greedy policy (not epsilon-greedy)
            obs, reward, done, _ = env.step(best_ac)
            # obs, reward, done, _ = env.step(act[0][0])
            obs = normalize_obs(obs)

            if f is not None:
                rew_traj.append(reward)

            if render:
                env.render()

            if rnn_state_in is not None:
                rnn_s_in = rnn_s_out
        logger.info('Episode %s: %s, %s' % (i, total_return, total_length))
        ep_length.append(total_length)
        ep_return.append(total_return)

        if f is not None:
            with open(f, "a") as csvfile:
                writer = csv.writer(csvfile)
                rew_traj[0:0] = [seed, i, np.mean(rew_traj)]
                writer.writerow(rew_traj)

    return ep_return
Example 5
def q_learning(q_network,
               env,
               test_env,
               seed,
               total_timesteps,
               log_interval,
               test_interval,
               show_interval,
               logdir,
               lr,
               max_grad_norm,
               units_per_hlayer,
               activ_fcn,
               gamma=0.95,
               epsilon=0.4,
               epsilon_decay=.95,
               buffer_size=4000,
               batch_size=128,
               trace_length=32,
               tau=0.99,
               update_interval=30,
               early_stop=False,
               keep_model=2,
               save_model=True,
               restore_model=False,
               save_traj=False):
    # """
    # Q-Learning algorithm for off-policy TD control using Function Approximation.
    # Finds the optimal greedy policy while following an epsilon-greedy policy.
    # Depending on the flags, it implements either online learning or experience replay,
    # and it can compute the targets with a separate target network. You can reuse your
    # Q-learning implementation of the last exercise.
    #
    # Args:
    #     env: PLE game
    #     approx: Action-Value function estimator
    #     num_episodes: Number of episodes to run for.
    #     max_time_per_episode: maximum number of time steps before episode is terminated
    #     discount_factor: gamma, discount factor of future rewards.
    #     epsilon: Chance to sample a random action. Float between 0 and 1.
    #     epsilon_decay: decay rate of epsilon parameter
    #     use_experience_replay: Indicator if experience replay should be used.
    #     batch_size: Number of samples per batch.
    #     target: Slowly updated target network to calculate the targets. Ignored if None.
    #
    # Returns:
    #     An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    # """
    logger = logging.getLogger(__name__)
    # logger.info(datetime.time)

    tf.reset_default_graph()
    set_global_seeds(seed)

    # Params
    ob_space = env.observation_space
    ac_space = env.action_space
    nd, = ob_space.shape
    n_ac = ac_space.n

    # Create learning agent and the replay buffer
    agent = DQNAgent(q_network=q_network,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     lr=lr,
                     max_grad_norm=max_grad_norm,
                     units_per_hlayer=units_per_hlayer,
                     activ_fcn=activ_fcn,
                     log_interval=log_interval,
                     logdir=logdir,
                     batch_size=batch_size,
                     trace_length=trace_length,
                     update_interval=update_interval,
                     tau=tau,
                     keep_model=keep_model)
    summary_writer = agent.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(
            logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None
    replay_buffer = ReplayBuffer(buffer_size)

    # Keeps track of useful statistics
    stats = EpisodeStats

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load pre trained model and set network parameters
                logger.info('load %s' % os.path.join(logdir, el[:-5]))
                agent.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                agent.sess.run(agent.global_step.assign(0))

    # ------------------ TRAINING --------------------------------------------
    logger.info("Start Training")
    early_stopped = False
    i_episode, i_sample, i_train = 0, 0, 0
    ep_len, rew = 0, 0  # length and return of the current episode (avoid shadowing built-in len)
    horizon = 100
    reward_window = deque(maxlen=horizon)
    avg_rm = deque(maxlen=30)
    nbatch = batch_size * trace_length
    return_threshold = -0.05  # 40

    # Reset env
    obs = env.reset()
    obs = normalize_obs(obs)
    done = False
    rnn_state0 = agent.step_initial_state
    if rnn_state0 is None:  # If we use a normal feed forward architecture, we sample a batch of single samples, not a batch of sequences.
        trace_length = 1

    # Set the target network to be equal to the primary network
    agent.update_target(agent.target_ops)
    while i_sample < total_timesteps:
        if np.random.rand(1) < epsilon:
            _, next_rnn_state = agent.step(
                [obs], rnn_state0)  # step the agent only to advance the RNN state
            action = np.random.randint(0, n_ac)  # random exploratory action
        else:
            AP, next_rnn_state = agent.step(
                [obs], rnn_state0)  # greedy action proposed by the network
            action = AP[0]
        next_obs, reward, done, _ = env.step(action)
        next_obs = normalize_obs(next_obs)
        i_sample += 1
        # render only every i-th episode
        if show_interval != 0:
            if i_episode % show_interval == 0:
                env.render()

        ep_len += 1
        rew += reward
        reward_window.append(reward)

        # When episode is done, add episode information to tensorboard summary and stats
        if done:  # env.game_over():
            next_obs = list(np.zeros_like(next_obs, dtype=np.float32))

            stats['episode_lengths'].append(ep_len)
            stats['episode_rewards'].append(rew)

            if summary_writer is not None:
                summary = tf.Summary()
                summary.value.add(
                    tag='envs/ep_return',
                    simple_value=stats['episode_rewards'][i_episode])
                summary.value.add(
                    tag="envs/ep_length",
                    simple_value=stats['episode_lengths'][i_episode])
                summary_writer.add_summary(summary, i_episode)
                summary_writer.flush()

            if save_model and rew > return_threshold:
                return_threshold = rew
                logger.info('Save model at max reward %s' % return_threshold)
                agent.save('inter_model')

            i_episode += 1
            ep_len, rew = 0, 0

        # Update replay buffer
        replay_buffer.add_transition(obs, action, next_obs, reward, done)
        if save_traj:
            rew_traj.append(reward)

        # Update model parameters every #update_interval steps. Use real experience and replayed experience.
        if replay_buffer.size() > nbatch and (i_sample % update_interval == 0):
            if (env.spec._env_name == 'ContFlappyBird'):
                rm = sum(reward_window) / horizon
                if summary_writer is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(tag='envs/isample_return',
                                        simple_value=rm)
                    summary_writer.add_summary(s_summary, i_sample)
                    summary_writer.flush()
                if save_model and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' %
                                return_threshold)
                    agent.save('inter_model')
                avg_rm.append(rm)

            if early_stop:
                if (i_sample > 60000) and (i_sample <=
                                           (60000 + update_interval)):
                    if (sum(avg_rm) / 30) <= -0.88:
                        logger.info('Early stopping criterion met')
                        early_stopped = True
                        break

            agent.update_target(agent.target_ops)

            # reset rnn state (history knowledge) before every training step
            rnn_state_train = agent.train_initial_state

            # Sample training mini-batch from replay buffer
            if rnn_state_train is not None:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                                                replay_buffer.recent_and_next_batch_of_seq(batch_size, trace_length)
            else:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                                                replay_buffer.recent_and_next_batch(batch_size)

            # Compute the TD targets for the batch: use the "old" fixed parameters of the target
            # network if one is available, otherwise the "old" parameters of the value-function estimate.
            # mb_next_obs = np.reshape(mb_next_obs, (-1, nd))
            mb_next_q_values, _ = agent.target_model.predict(
                mb_next_obs, rnn_state_train)
            mb_best_next_action = np.argmax(mb_next_q_values, axis=1)
            mb_td_target = [
                mb_rewards[j] +
                gamma * mb_next_q_values[j][mb_best_next_action[j]]
                for j in range(nbatch)
            ]

            # Update Q value estimator parameters by optimizing between Q network and Q-learning targets
            loss = agent.train(mb_obs, mb_actions, mb_td_target,
                               rnn_state_train)
            i_train += 1

            # If test_interval > 0 the learned model is evaluated every "test_interval" gradient updates
            if test_interval > 0 and i_train > 0 and (i_train % test_interval
                                                      == 0):
                ep_return = agent.test_run(test_env, n_eps=10, n_pipes=2000)
                with open(result_path, "a") as csvfile:
                    writer = csv.writer(csvfile)
                    ep_return[0:0] = [i_sample, i_train]
                    writer.writerow(ep_return)

        if done:
            # Reset the model
            next_obs = env.reset()
            next_obs = normalize_obs(next_obs)

        epsilon *= epsilon_decay
        obs = next_obs
        rnn_state0 = next_rnn_state

    # Save final model when training is finished.
    if save_model:
        agent.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        logger.info('Save reward trajectory to %s' % rew_results_path)
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' %
                i_sample)
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')

    return early_stopped, i_sample
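
Example 5 calls agent.update_target(agent.target_ops) once before the loop and again before every training step. Given the tau parameter handed to DQNAgent, such an op list usually implements a soft (Polyak) update of the target-network variables. The sketch below illustrates that pattern under assumed variable lists online_vars and target_vars; it is not the agent's actual internals, and some implementations weight the online network with tau instead of the target network:

def make_target_update_ops(online_vars, target_vars, tau):
    # One common convention: target <- tau * target + (1 - tau) * online,
    # i.e. with tau close to 1 the target network trails the online network slowly.
    return [t_var.assign(tau * t_var + (1.0 - tau) * o_var)
            for o_var, t_var in zip(online_vars, target_vars)]

# usage sketch: sess.run(make_target_update_ops(online_vars, target_vars, tau=0.99))
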
Example 6
def evaluate(args,
             seed,
             policies_list,
             ob_rms=None,
             render=False,
             env=None,
             master=None,
             render_attn=True):
    """
    RL evaluation: supports eval through training code as well as independently
    policies_list should be a list of policies of all the agents;
    len(policies_list) = num agents
    """
    if env is None or master is None:  # if any one of them is None, generate both of them
        master, env = setup_master(args, return_env=True)

    if seed is None:  # ensure env eval seed is different from training seed
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)
    env.seed(seed)

    if ob_rms is not None:
        obs_mean, obs_std = ob_rms
    else:
        obs_mean = None
        obs_std = None
    master.load_models(policies_list)
    master.set_eval_mode()

    num_eval_episodes = args.num_eval_episodes
    all_episode_rewards = np.full((num_eval_episodes, env.n), 0.0)
    per_step_rewards = np.full((num_eval_episodes, env.n), 0.0)

    # TODO: provide support for recurrent policies and mask
    recurrent_hidden_states = None
    mask = None

    # world.dists at the end of episode for simple_spread
    final_min_dists = []
    num_success = 0
    episode_length = 0

    for t in range(num_eval_episodes):
        obs = env.reset()
        obs = normalize_obs(obs, obs_mean, obs_std)
        done = [False] * env.n
        episode_rewards = np.full(env.n, 0.0)
        episode_steps = 0
        if render:
            attn = None if not render_attn else master.team_attn
            if attn is not None and len(attn.shape) == 3:
                attn = attn.max(0)
            env.render(attn=attn)

        while not np.all(done):
            actions = []
            with torch.no_grad():
                actions = master.eval_act(obs, recurrent_hidden_states, mask)
            episode_steps += 1
            obs, reward, done, info = env.step(actions)
            obs = normalize_obs(obs, obs_mean, obs_std)
            episode_rewards += np.array(reward)
            if render:
                attn = None if not render_attn else master.team_attn
                if attn is not None and len(attn.shape) == 3:
                    attn = attn.max(0)
                env.render(attn=attn)
                if args.record_video:
                    time.sleep(0.08)

        per_step_rewards[t] = episode_rewards / episode_steps
        num_success += info['n'][0]['is_success']
        episode_length = (episode_length * t +
                          info['n'][0]['world_steps']) / (t + 1)

        # for simple spread env only
        if args.env_name == 'simple_spread':
            final_min_dists.append(env.world.min_dists)
        elif args.env_name == 'simple_formation' or args.env_name == 'simple_line':
            final_min_dists.append(env.world.dists)

        if render:
            print(
                "Ep {} | Success: {} \n Av per-step reward: {:.2f} | Ep Length {}"
                .format(t, info['n'][0]['is_success'], per_step_rewards[t][0],
                        info['n'][0]['world_steps']))
        # all_episode_rewards shape: num_eval_episodes x num_agents
        all_episode_rewards[t, :] = episode_rewards

        if args.record_video:
            # print(attn)
            input('Press enter to continue: ')

    return all_episode_rewards, per_step_rewards, final_min_dists, num_success, episode_length
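
normalize_obs(obs, obs_mean, obs_std) standardizes the observations with the running statistics carried in ob_rms. A minimal sketch of the kind of transform this usually is; the epsilon and clipping constants are assumptions, not values taken from the repository:

import numpy as np

def normalize_obs(obs, obs_mean=None, obs_std=None, clip=10.0):
    # Z-score normalization with the provided running mean/std;
    # falls back to a pass-through when no statistics are available.
    obs = np.asarray(obs, dtype=np.float32)
    if obs_mean is None or obs_std is None:
        return obs
    return np.clip((obs - obs_mean) / (obs_std + 1e-8), -clip, clip)
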
Example 7
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        mb_states = self.states
        for _ in range(self.nsteps):
            actions, pi, values, self.states, neglogpacs = self.model.step(
                self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            obs, rewards, self.dones, _ = self.env.step(actions)
            self.obs[:] = normalize_obs(obs)
            mb_rewards.append(rewards)

            self.logger.debug('Observations: %s' % self.obs)

            # render only every i-th episode
            if self.show_interval != 0:
                if (self.ep_idx[0] % self.show_interval) == 0:
                    self.env.render()

            self.eplength = [
                self.eplength[i] + 1 for i in range(self.nenv)
            ]  # TODO: use the helper functions already implemented in run_ple_utils
            self.epreturn = [
                self.epreturn[i] + rewards[i] for i in range(self.nenv)
            ]
            for i in range(self.nenv):
                self.reward_window[i].append(rewards[i])

            # Check for terminal states in every env - this is only used in terminating version of FlappyBird
            for i, done in enumerate(self.dones):  # i -> environment ID
                if done:
                    self.ep_idx[i] += 1
                    self.obs[i] = self.obs[i] * 0

                    # update tensorboard summary
                    if self.summary_writer is not None:
                        summary = tf.Summary()
                        summary.value.add(
                            tag='envs/environment%s/episode_length' % i,
                            simple_value=self.eplength[i])
                        summary.value.add(
                            tag='envs/environment%s/episode_reward' % i,
                            simple_value=self.epreturn[i])
                        self.summary_writer.add_summary(
                            summary,
                            self.ep_idx[i])  # self.global_step.eval())
                        self.summary_writer.flush()
                    # self.retbuffer.append(self.epreturn[i])
                    if self.epreturn[i] > self.return_threshold:
                        self.return_threshold = self.epreturn[i]
                        self.logger.info('Save model at max reward %s' %
                                         self.return_threshold)
                        self.model.save('inter_model')
                    self.eplength[i] = 0
                    self.epreturn[i] = 0

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(
            mb_neglogpacs, dtype=np.float32
        )  # TODO: this is an array of tensors; output numeric values instead
        mb_dones = np.asarray(mb_dones, dtype=bool)
        last_values = self.model.value(self.obs, self.states, self.dones)
        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            # 1-step TD error and GAE recursion
            delta = (mb_rewards[t] + self.gamma * nextvalues * nextnonterminal
                     - mb_values[t])
            mb_advs[t] = lastgaelam = (
                delta + self.gamma * self.lam * nextnonterminal * lastgaelam)
        mb_returns = mb_advs + mb_values

        self.logger.debug('Actions: %s' % mb_actions)
        self.logger.debug('Q values: %s' % mb_values)
        # self.logger.debug('Done mask: %s' % mb_masks)  # ?
        self.logger.debug('Observations: %s' % mb_obs)

        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions,
                            mb_values, mb_neglogpacs)), mb_states,
                self.reward_window, mb_rewards)
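
The rollout arrays returned by Example 7 are passed through sf01 before reaching the optimizer. A minimal sketch of that helper, assuming the baselines-ppo2-style behaviour of swapping the step and environment axes and flattening them into one batch axis:

import numpy as np

def sf01(arr):
    # Swap axes 0 and 1 (steps <-> envs) and flatten them into a single batch dimension.
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

# e.g. an array of shape (nsteps, nenv, ob_dim) becomes (nsteps * nenv, ob_dim)
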
Example 8
def evaluate(args,
             seed,
             policies_list,
             ob_rms=None,
             render=False,
             env=None,
             master=None,
             render_attn=True):
    """
    RL evaluation: 训练时或者单独使用均可
    policies_list 是所有agent策略的list;
    len(policies_list) = 智能体数量
    """
    import numpy as np
    import torch
    from arguments import get_args
    from utils import normalize_obs
    from learner import setup_master
    import time
    if env is None or master is None:  # if either one is None, create both of them
        master, env = setup_master(args, return_env=True)

    if seed is None:
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)
    env.seed(seed)

    if ob_rms is not None:
        obs_mean, obs_std = ob_rms
    else:
        obs_mean = None
        obs_std = None
    master.load_models(policies_list)
    master.set_eval_mode()

    num_eval_episodes = args.num_eval_episodes
    all_episode_rewards = np.full((num_eval_episodes, env.n), 0.0)
    per_step_rewards = np.full((num_eval_episodes, env.n), 0.0)

    recurrent_hidden_states = None
    mask = None

    # world.dists at the end of the episode, for simple_spread
    final_min_dists = []
    num_success = 0
    episode_length = 0

    for t in range(num_eval_episodes):
        obs = env.reset()
        obs = normalize_obs(obs, obs_mean, obs_std)
        done = [False] * env.n
        episode_rewards = np.full(env.n, 0.0)
        episode_steps = 0
        if render:
            attn = None if not render_attn else master.team_attn
            if attn is not None and len(attn.shape) == 3:
                attn = attn.max(0)
            env.render(attn=attn)

        while not np.all(done):
            actions = []
            with torch.no_grad():
                actions = master.eval_act(obs, recurrent_hidden_states, mask)
            episode_steps += 1
            obs, reward, done, info = env.step(actions)
            obs = normalize_obs(obs, obs_mean, obs_std)
            episode_rewards += np.array(reward)
            if render:
                attn = None if not render_attn else master.team_attn
                if attn is not None and len(attn.shape) == 3:
                    attn = attn.max(0)
                env.render(attn=attn)
                if args.record_video:
                    time.sleep(0.08)

        per_step_rewards[t] = episode_rewards / episode_steps
        num_success += info['n'][0]['is_success']
        episode_length = (episode_length * t +
                          info['n'][0]['world_steps']) / (t + 1)

        # simple spread
        if args.env_name == 'simple_spread':
            final_min_dists.append(env.world.min_dists)
        elif args.env_name == 'simple_formation' or args.env_name == 'simple_line':
            final_min_dists.append(env.world.dists)

        if render:
            print(
                "Ep {} | Success: {} \n Av per-step reward: {:.2f} | Ep Length {}"
                .format(t, info['n'][0]['is_success'], per_step_rewards[t][0],
                        info['n'][0]['world_steps']))
        # all_episode_rewards shape: num_eval_episodes x num_agents
        all_episode_rewards[t, :] = episode_rewards

        if args.record_video:
            # print(attn)
            input('Press enter to continue: ')

    return all_episode_rewards, per_step_rewards, final_min_dists, num_success, episode_length
Example 9
    pi_logits_out = tf.get_collection('pi_logit')
    predict_vf_op = tf.get_collection('val')
    predict_ac_op = tf.get_collection('step')
    rnn_state_in, rnn_state_out = None, None

    env = ple_env
    pi_out = probs_out

    logger = logging.getLogger(__name__)
    ep_length = []
    ep_return = []
    logger.info('---------------- Episode results -----------------------')
    for i in range(0, 2):  # TODO: parallelize the episodes. Problem: guaranteeing the same sequence of random numbers in each parallel process. Solution: index-based RNG instead of sequential seed-based RNG.
        obs = env.reset()
        obs = normalize_obs(obs)
        done = False

        i_sample = 0

        while not done and (i_sample < 5000):
            i_sample += 1
            pi, pi_log, act = sess.run([pi_out, pi_logits_out, predict_ac_op], feed_dict={obs_in[0]: [obs]})
            ac = np.argmax(pi_log)
            obs, reward, done, _ = env.step(ac)
            # obs, reward, done, _ = env.step(act[0][0])
            obs = normalize_obs(obs)

            env.render()
            time.sleep(0.01)
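
Example 9 assumes that sess, obs_in and probs_out already exist and that the graph's ops were stored in named collections when the model was saved. A hedged sketch of the restore step that typically precedes such code; the checkpoint path and the 'inputs'/'probs' collection names are assumptions, while 'pi_logit', 'val' and 'step' follow the snippet above:

import tensorflow as tf

# Rebuild the saved TF1 graph and fetch the ops registered in its collections.
sess = tf.Session()
saver = tf.train.import_meta_graph('final_model.meta')  # hypothetical checkpoint name
saver.restore(sess, 'final_model')

obs_in = tf.get_collection('inputs')     # assumed collection name
probs_out = tf.get_collection('probs')   # assumed collection name
pi_logits_out = tf.get_collection('pi_logit')
predict_vf_op = tf.get_collection('val')
predict_ac_op = tf.get_collection('step')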