Example #1
    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.
        
        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance
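
The call self.param_noise.adapt(mean_distance) above nudges the perturbation scale toward a target action-space distance. Below is a minimal sketch of such an adaptation rule, assuming a Baselines-style AdaptiveParamNoiseSpec object; the class and attribute names are assumptions and may differ from the noise object actually used here.

class AdaptiveParamNoiseSpec(object):
    """Sketch of an adaptive parameter-noise spec with the interface assumed above."""

    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1,
                 adaptation_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # Shrink the perturbation when the perturbed policy drifts too far
        # from the unperturbed one, grow it when the two stay too close.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient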
Example #2
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                eval_episode_reward = 0.
                for i in range(10):  # 10 evaluation rollouts
                    while True:
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.
                            break

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
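
A hypothetical way to invoke the train() function above, assuming the Baselines-style Actor, Critic, Memory and noise helpers that this kind of DDPG code usually relies on; the import paths and constructor arguments are assumptions and may not match this repository.

import gym
import numpy as np
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory
from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]
memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)

train(env=env, nb_epochs=100, nb_epoch_cycles=20, render_eval=False,
      reward_scale=1.0, render=False,
      param_noise=AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                         desired_action_stddev=0.2),
      actor=Actor(nb_actions, layer_norm=True), critic=Critic(layer_norm=True),
      normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2,
      actor_lr=1e-4, critic_lr=1e-3,
      action_noise=OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=0.2 * np.ones(nb_actions)),
      popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50,
      nb_rollout_steps=100, nb_eval_steps=100, batch_size=64, memory=memory)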
Example #3
def train_implicit(env,
                   nb_epochs,
                   nb_epoch_cycles,
                   render_eval,
                   reward_scale,
                   render,
                   actor,
                   critic,
                   classifier,
                   normalize_returns,
                   normalize_observations,
                   critic_l2_reg,
                   classifier_l2_reg,
                   actor_lr,
                   critic_lr,
                   classifier_lr,
                   action_noise,
                   popart,
                   gamma,
                   clip_norm,
                   nb_train_steps,
                   nb_rollout_steps,
                   nb_eval_steps,
                   batch_size,
                   memory,
                   fifomemory,
                   tau=0.01,
                   eval_env=None,
                   callback=None,
                   entropy_coeff=1.,
                   pretrained='none'):
    rank = MPI.COMM_WORLD.Get_rank()

    logger.info('noisynet implementation of DDPG')

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG_paramnoise(actor,
                            critic,
                            classifier,
                            memory,
                            fifomemory,
                            env.observation_space.shape,
                            env.action_space.shape,
                            gamma=gamma,
                            tau=tau,
                            normalize_returns=normalize_returns,
                            normalize_observations=normalize_observations,
                            batch_size=batch_size,
                            action_noise=action_noise,
                            critic_l2_reg=critic_l2_reg,
                            classifier_l2_reg=classifier_l2_reg,
                            actor_lr=actor_lr,
                            critic_lr=critic_lr,
                            classifier_lr=classifier_lr,
                            enable_popart=popart,
                            clip_norm=clip_norm,
                            reward_scale=reward_scale,
                            entropy_coeff=entropy_coeff)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Copy an env for evaluation
    env_eval = copy.deepcopy(env.env)

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # load pretrained agent if possible
        if pretrained == 'none':
            logger.info('Training from scratch...')
        else:
            logger.info('Loading pretrained model from {}'.format(pretrained))
            #assert os.path.exists(pretrained)
            saver.restore(sess, pretrained)

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        total_time = 0

        epoch = 0
        start_time = time.time()

        total_time_record = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_actor_losses_record = []
        epoch_critic_losses_record = []
        epoch_classifier_losses_record = []
        epoch_approx_entropy_record = []
        epoch_end_xpos = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1
                    total_time += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        total_time_record.append(total_time)
                        epoch_end_xpos.append(obs[0])
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_classifier_losses = []
                epoch_approx_entropy = []
                for t_train in range(nb_train_steps):

                    cl, al, cll, ae = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    epoch_classifier_losses.append(cll)
                    epoch_approx_entropy.append(ae)
                    agent.update_target_net()

                    #epoch_actor_losses_record.append(mpi_mean(epoch_actor_losses))
                    #epoch_critic_losses_record.append(mpi_mean(epoch_critic_losses))
                    #epoch_classifier_losses_record.append(mpi_mean(epoch_classifier_losses))
                    #epoch_approx_entropy_record.append(mpi_mean(epoch_approx_entropy))

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    # eval for one episode
                    eval_episode_reward = 0.0
                    eval_done = False
                    eval_obs = eval_env.reset()
                    while not eval_done:
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action)
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                """
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
                """

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/nb-epoch'] = epoch
            combined_stats['rollout/nb-cycle'] = cycle
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/loss_classifier'] = mpi_mean(
                epoch_classifier_losses)
            combined_stats['train/approx_entropy'] = mpi_mean(
                epoch_approx_entropy)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # Call the callback
            if callback is not None:
                if callback(locals(),
                            globals()):  # callback returns a boolean value
                    break

        # Evaluate the policy on env to record trajs
        eval_rewards, eval_steps, trajs_obs, trajs_actions = evaluate(
            env_eval, agent=agent)
        if callback is not None:
            callback.final_call(locals(), globals())
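
The final call above unpacks four values from an evaluate() helper that is not shown in this snippet. A minimal sketch of what such a helper could look like, reconstructed only from the return signature and the agent.pi interface used above; it is an assumption, not the repository's implementation.

def evaluate(env, agent, nb_episodes=10, max_episode_steps=1000):
    # Roll out the deterministic policy and record per-episode returns,
    # lengths and the visited observation/action trajectories.
    eval_rewards, eval_steps, trajs_obs, trajs_actions = [], [], [], []
    max_action = env.action_space.high
    for _ in range(nb_episodes):
        obs = env.reset()
        done, episode_reward, episode_step = False, 0., 0
        episode_obs, episode_actions = [], []
        while not done and episode_step < max_episode_steps:
            action, _ = agent.pi(obs, apply_noise=False, compute_Q=True)
            episode_obs.append(obs)
            episode_actions.append(action)
            obs, r, done, _ = env.step(max_action * action)
            episode_reward += r
            episode_step += 1
        eval_rewards.append(episode_reward)
        eval_steps.append(episode_step)
        trajs_obs.append(episode_obs)
        trajs_actions.append(episode_actions)
    return eval_rewards, eval_steps, trajs_obs, trajs_actions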
Example #4
def learn(env,
          v=0,
          graph=True,
          render=True,
          repeats=1,
          episodes=1000,
          max_episode_steps=200,
          train_steps=5,
          batch_normalize=True,
          learning_rate=0.001,
          gamma=0.99,
          tau=0.99,
          epsilon=0.1,
          hidden_size=100,
          hidden_n=2,
          hidden_activation=tf.nn.relu,
          batch_size=128,
          memory_capacity=10000,
          load_path=None,
          covariance="original"):
    if v > 0:
        print("Experiment " + str(args))

    experiments_rewards = []
    for i in range(repeats):
        agent = naf.Agent(v, env.observation_space, env.action_space,
                          learning_rate, batch_normalize, gamma, tau, epsilon,
                          hidden_size, hidden_n, hidden_activation, batch_size,
                          memory_capacity, load_path, covariance)
        experiment_rewards = []
        terminate = None
        solved = 0  #only relevant if solved_threshold is set

        for j in range(episodes):
            if terminate is not None:
                fill_value = 0
                if terminate == "solved":
                    fill_value = solve_threshold
                experiment_rewards = fill_episodes(experiment_rewards,
                                                   episodes - j, fill_value)
                break

            rewards = 0
            state = env.reset()

            for k in range(max_episode_steps):
                if render:
                    env.render()

                action = agent.get_action(state)
                if np.isnan(np.min(
                        action)):  #if NaN action (neural network exploded)
                    print("Warning: NaN action, terminating agent")
                    with open("error.txt", "a") as error_file:
                        error_file.write("repeat " + str(i) + " episode " +
                                         str(j) + " step " + str(k) + " NaN\n")
                    rewards = 0  #TODO ?
                    terminate = "nan"
                    break
                #print(action)
                state_next, reward, terminal, _ = env.step(
                    agent.scale(action, env.action_space.low,
                                env.action_space.high))

                if k + 1 >= max_episode_steps:  # force terminal on the final allowed step
                    terminal = True

                agent.observe(state, action, reward, state_next, terminal)

                for l in range(train_steps):
                    agent.learn()

                state = state_next
                rewards += reward
                if terminal:
                    agent.reset()
                    break
            experiment_rewards += [rewards]

            #   if solve_threshold is not None:
            #     if rewards >= solve_threshold:
            #       solved += 1
            #     else:
            #       solved = 0
            #     if solved >= 10: #number of repeated rewards above threshold to consider environment solved = 10
            #       print("[Solved]")
            #       terminate = "solved"

            #print("logger directory", logger.get_dir())
            #print("rewards", rewards)
            #print("np.std(experiment_rewards)", np.std(experiment_rewards))
            #print("EpRew",  mpi_mean(np.mean(experiment_rewards)))
            #print("EpRewStd",  np.std(experiment_rewards))
            logger.record_tabular("EpRew",
                                  mpi_mean(np.mean(experiment_rewards)))
            logger.record_tabular("EpRewStd", np.std(experiment_rewards))
            logger.dump_tabular()

            #tensorboard
            tensorboard_outdir = '/tmp/rosrl/GazeboModularScara3DOF-v3/deepq_naf/' + str(
                j)
            summary_writer = tf.summary.FileWriter(
                tensorboard_outdir, graph=tf.get_default_graph())
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="Experiment reward",
                                 simple_value=mpi_mean(
                                     np.mean(experiment_rewards)))
            ])
            summary_writer.add_summary(summary, j)

            #print("experiment_rewards", experiment_rewards)

        #   if solve_threshold is not None:
        #     if rewards >= solve_threshold:
        #       solved += 1
        #     else:
        #       solved = 0
        #     if solved >= 10: #number of repeated rewards above threshold to consider environment solved = 10
        #       print("[Solved]")
        #       terminate = "solved"

        # if args['v'] > 0:
        #     print("Reward(" + str(i) + "," + str(j) + "," + str(k) + ")=" + str(rewards))
        # if args['v'] > 1:
        #   print(np.mean(experiment_rewards[-10:]))
        experiments_rewards += [experiment_rewards]

    #print("experiments_rewards", mpi_mean(np.mean(experiment_rewards)))

    return experiments_rewards
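
All of these snippets average their statistics across workers with mpi_mean and related helpers. Below is a minimal mpi4py-based sketch of such helpers, assuming the simple scalar/array semantics used above; the real helpers in the codebase these examples come from may differ.

from mpi4py import MPI
import numpy as np

def mpi_mean(value, comm=MPI.COMM_WORLD):
    # Mean of a scalar (or of all elements of an array) across every MPI worker.
    local = np.asarray(value, dtype='float64').ravel()
    sums = np.array([local.sum(), local.size], dtype='float64')
    totals = np.zeros_like(sums)
    comm.Allreduce(sums, totals, op=MPI.SUM)
    return totals[0] / max(totals[1], 1.0)

def mpi_sum(value, comm=MPI.COMM_WORLD):
    # Sum of a scalar across every MPI worker.
    local = np.array([np.asarray(value, dtype='float64').sum()])
    total = np.zeros_like(local)
    comm.Allreduce(local, total, op=MPI.SUM)
    return float(total[0])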
Example #5
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          explorer,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_train_onpolicy_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          on_policy_mem,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()
    # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPGExp(actor,
                    critic,
                    explorer,
                    memory,
                    on_policy_mem,
                    env.observation_space.shape,
                    env.action_space.shape,
                    gamma=gamma,
                    tau=tau,
                    normalize_returns=normalize_returns,
                    normalize_observations=normalize_observations,
                    batch_size=batch_size,
                    action_noise=action_noise,
                    param_noise=param_noise,
                    critic_l2_reg=critic_l2_reg,
                    actor_lr=actor_lr,
                    critic_lr=critic_lr,
                    enable_popart=popart,
                    clip_norm=clip_norm,
                    reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        eval_obs = obs
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        eval_episode_rewards = []
        eval_qs = []
        for epoch in range(nb_epochs):
            # 1.inner loop cycle, learn to explore
            # clear exploration buffer D_0, on-policy
            agent.clear_on_policy_mem()
            for cycle in range(nb_epoch_cycles):
                # a) generate rollouts {D_0} with exploration policy $\pi_0$,
                for t_rollout in range(nb_rollout_steps):
                    # predict next action, first a = a + N
                    action, q = agent.pi_noisy_exp(obs,
                                                   apply_noise=True,
                                                   compute_Q=True)
                    assert action.shape == env.action_space.shape
                    # exec next action
                    assert max_action.shape == action.shape
                    # scale for execution in env (as far as DDPG is concerned,
                    # every action is in [-1, 1])
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    episode_reward += r
                    episode_step += 1
                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    # save to on policy mem
                    agent.store_on_policy_transition(obs, action, r, new_obs,
                                                     done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # b) update exploitation policy $\pi_1$ and Q with D_0, on-policy
                # todo, save old policy
                # todo, log actor, critic losses
                # todo, off policy train, but seems on policy is pretty good?
                for t_op_train in range(nb_train_onpolicy_steps):
                    cl, al = agent.train_on_policy()
                    agent.update_target_net()
                # c) generate rollouts {D_1} with $\pi_1$, evaluate the performance $R_t = R_{new_1}(D_1) - R_{old_1}(D_1)$
                # Evaluation for only one trajectory,
                # todo, more trajectories, but maybe we could use evaluation
                # Q instead of monte carlo R?
                # evaluation for old policy
                eval_episode_reward = 0.
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q = agent.pi(eval_obs,
                                                   apply_noise=False,
                                                   compute_Q=True)
                    # scale for execution in env
                    new_eval_obs, eval_r, eval_done, eval_info = env.step(
                        max_action * eval_action)
                    eval_episode_reward += eval_r
                    eval_qs.append(eval_q)
                    agent.store_transition(eval_obs, eval_action, eval_r,
                                           new_eval_obs, eval_done)
                    eval_obs = new_eval_obs
                    if eval_done:
                        eval_obs = env.reset()
                        # R_t = eval_reward
                        eval_episode_rewards.append(eval_episode_reward)
                        eval_episode_rewards_history.append(
                            eval_episode_reward)
                        eval_episode_reward = 0.
                    # todo, dR_t = R_t - old_R
                    # maxR = max(maxR, R_t), save max policy

            # 2.update exploration policy $\pi_0$ with {D_0} and R_t
            # \sum_t \partial{\log \pi_0(D_0t)}{\theta} R_t
            # first use the same exploration policy here, a = a + N

            # 3.update exploitation policy $\pi_1$ and Q (or \pi_1 = \argmax_t {R_t})
            # Policy update, todo, use max policy
            epoch_actor_losses = []
            epoch_critic_losses = []
            for t_train in range(nb_train_steps):
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)

            # Evaluation statistics.
            combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
            combined_stats['eval/return_history'] = mpi_mean(
                np.mean(eval_episode_rewards_history))
            combined_stats['eval/Q'] = mpi_mean(eval_qs)
            combined_stats['eval/episodes'] = mpi_mean(
                len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
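
The clear_on_policy_mem() and store_on_policy_transition() calls above suggest a second buffer that only ever holds transitions from the current exploration policy. A minimal sketch of such a buffer, assuming nothing beyond the append/clear behaviour implied by those calls (names are hypothetical):

class OnPolicyMemory(object):
    """Sketch of a buffer holding only transitions from the current policy."""

    def __init__(self):
        self.transitions = []

    def append(self, obs0, action, reward, obs1, terminal):
        self.transitions.append((obs0, action, reward, obs1, terminal))

    def reset(self):
        # Drop everything: data gathered under an older policy is no longer on-policy.
        self.transitions = []

    def __len__(self):
        return len(self.transitions)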
Example #6
def experiment(args):
    if args['v'] > 0:
        print("Experiment " + str(args))

    env = gym.make(args['environment'])

    experiments_rewards = []
    for i in range(args['repeats']):
        agent = naf.Agent(args['v'], env.observation_space, env.action_space,
                          args['learning_rate'], args['batch_normalize'],
                          args['gamma'], args['tau'], args['epsilon'],
                          args['hidden_size'], args['hidden_n'],
                          args['hidden_activation'], args['batch_size'],
                          args['memory_capacity'], args['load_path'],
                          args['covariance'])
        experiment_rewards = []
        terminate = None
        solved = 0  #only relevant if solved_threshold is set

        for j in range(args['episodes']):
            if terminate is not None:
                fill_value = 0
                if terminate == "solved":
                    fill_value = args['solve_threshold']
                experiment_rewards = fill_episodes(experiment_rewards,
                                                   args['episodes'] - j,
                                                   fill_value)
                break

            rewards = 0
            state = env.reset()

            for k in range(args['max_episode_steps']):
                #if args['render']:
                #env.render()
                env.render()
                action = agent.get_action(state)
                if np.isnan(np.min(
                        action)):  #if NaN action (neural network exploded)
                    print("Warning: NaN action, terminating agent")
                    with open("error.txt", "a") as error_file:
                        error_file.write(
                            str(args) + " repeat " + str(i) + " episode " +
                            str(j) + " step " + str(k) + " NaN\n")
                    rewards = 0  #TODO ?
                    terminate = "nan"
                    break
                #print(action)
                state_next, reward, terminal, _ = env.step(
                    agent.scale(action, env.action_space.low,
                                env.action_space.high))

                if k + 1 >= args['max_episode_steps']:  # force terminal on the final allowed step
                    terminal = True

                agent.observe(state, action, reward, state_next, terminal)

                for l in range(args['train_steps']):
                    agent.learn()

                state = state_next
                rewards += reward
                if terminal:
                    agent.reset()
                    break
            experiment_rewards += [rewards]

            if args['solve_threshold'] is not None:
                if rewards >= args['solve_threshold']:
                    solved += 1
                else:
                    solved = 0
                if solved >= 10:  #number of repeated rewards above threshold to consider environment solved = 10
                    print("[Solved]")
                    terminate = "solved"

            if args['v'] > 0:
                print("Reward(" + str(i) + "," + str(j) + "," + str(k) + ")=" +
                      str(rewards))
        if args['v'] > 1:
            print(np.mean(experiment_rewards[-10:]))
        experiments_rewards += [experiment_rewards]
        logger.record_tabular("EpRew", mpi_mean(np.mean(experiment_rewards)))
        logger.record_tabular("EpRewStd", mpi_std(np.std(experiment_rewards)))
    print("experiments_rewards", mpi_mean(np.mean(experiment_rewards)))
    #tensorboard
    tensorboard_outdir = '/tmp/rosrl/GazeboModularScara3DOF-v3/deepq_naf/'
    summary_writer = tf.summary.FileWriter(tensorboard_outdir,
                                           graph=tf.get_default_graph())
    summary = tf.Summary(value=[
        tf.Summary.Value(tag="Experiment reward",
                         simple_value=np.mean(experiment_rewards))
    ])
    summary_writer.add_summary(summary, i)

    return experiments_rewards
Example #7
def train_one_batch(env,
                    agent,
                    reward_giver,
                    timesteps_per_batch,
                    nb_train_steps,
                    g_step=3):
    """
    generate one batch of trajectories and update implicit policy parameters
    """
    # reset agent and clear memory buffer
    agent.reset()
    agent.memory.reset()
    agent.fifomemory.reset()

    max_action = env.action_space.high
    obs_record = []
    action_record = []

    obs = env.reset()
    done = False

    epoch_actor_losses_record = []
    epoch_critic_losses_record = []
    epoch_classifier_losses_record = []
    epoch_approx_entropy_record = []

    logger.info("Collect trajectories on env")
    logger.info("num of policy gradients {}".format(g_step))
    for _ in range(g_step):

        t = 0

        while t < timesteps_per_batch:
            # Predict next action.
            action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
            assert action.shape == env.action_space.shape

            # Execute next action.
            assert max_action.shape == action.shape
            r = reward_giver.get_reward(obs, max_action * action)
            new_obs, _, done, info = env.step(max_action * action)
            t += 1

            obs_record.append(obs)
            action_record.append(max_action * action)
            agent.store_transition(obs, action, r, new_obs, done)
            obs = new_obs

            if done:
                # Episode done.
                agent.reset()
                obs = env.reset()

        logger.info("Training Implicit Policy")
        epoch_actor_losses = []
        epoch_critic_losses = []
        epoch_classifier_losses = []
        epoch_approx_entropy = []
        for t_train in range(nb_train_steps):

            cl, al, cll, ae = agent.train()
            epoch_actor_losses.append(al)
            epoch_critic_losses.append(cl)
            epoch_classifier_losses.append(cll)
            epoch_approx_entropy.append(ae)
            agent.update_target_net()

        logger.info('actor loss {}'.format(mpi_mean(epoch_actor_losses)))
        logger.info('critic loss {}'.format(mpi_mean(epoch_critic_losses)))
        logger.info('classifier loss {}'.format(
            mpi_mean(epoch_classifier_losses)))
        logger.info('approx entropy {}'.format(mpi_mean(epoch_approx_entropy)))

        epoch_actor_losses_record += [mpi_mean(epoch_actor_losses)]
        epoch_critic_losses_record += [mpi_mean(epoch_critic_losses)]
        epoch_classifier_losses_record += [mpi_mean(epoch_classifier_losses)]
        epoch_approx_entropy_record += [mpi_mean(epoch_approx_entropy)]

    losses_record = {}
    losses_record['actor_loss'] = epoch_actor_losses_record
    losses_record['critic_loss'] = epoch_critic_losses_record
    losses_record['classifier_loss'] = epoch_classifier_losses_record
    losses_record['entropy'] = epoch_approx_entropy_record
    return obs_record, action_record, losses_record
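
Here the environment reward is replaced by reward_giver.get_reward(obs, max_action * action), in the style of adversarial imitation learning. A hypothetical reward giver with that interface is sketched below; the discriminator wiring and the -log(1 - D) surrogate are assumptions, not this repository's classifier.

import numpy as np

class DiscriminatorRewardGiver(object):
    """Sketch: reward is high when the discriminator mistakes (obs, action) for expert data."""

    def __init__(self, sess, obs_ph, action_ph, expert_prob_op):
        # expert_prob_op: sigmoid output in (0, 1), the probability that the
        # given observation/action pair comes from the expert distribution.
        self.sess = sess
        self.obs_ph = obs_ph
        self.action_ph = action_ph
        self.expert_prob_op = expert_prob_op

    def get_reward(self, obs, action):
        prob = self.sess.run(self.expert_prob_op, feed_dict={
            self.obs_ph: np.atleast_2d(obs),
            self.action_ph: np.atleast_2d(action),
        })
        prob = float(np.asarray(prob).ravel()[0])
        # Surrogate reward: -log(1 - D), large when the discriminator is fooled.
        return -np.log(1.0 - prob + 1e-8)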
Example #8
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, logdir,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None
    
    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)
    
            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t
            
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')

            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #9
def train(env,
          num_timesteps,
          nb_trials,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          test_interval,
          batch_size,
          memory,
          output,
          load_file,
          save=False,
          tau=0.01,
          evaluation=False,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    observation_range = [env.observation_space.low, env.observation_space.high]
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 observation_range=observation_range)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    trial_return_history = deque(maxlen=100)
    eval_trial_return_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        #dir_path = os.path.dirname(os.path.realpath(__file__))
        #tf.summary.FileWriter(dir_path, sess.graph)

        trial = 0
        ts = 0

        if load_file != '':
            saver.restore(sess, load_file)

        start_time = time.time()

        trial_returns = []
        trial_steps = []
        actions = []
        qs = []
        train_actor_losses = []
        train_critic_losses = []
        train_adaptive_distances = []

        while True:
            test = (test_interval >= 0
                    and trial % (test_interval + 1) == test_interval)

            if not test:
                # Perform rollout.
                env.set_test(test=False)
                obs = env.reset()
                agent.reset()
                done = 0
                trial_return = 0.
                trial_step = 0
                while done == 0:
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    ts += 1
                    if rank == 0 and render:
                        env.render()
                    trial_return += r
                    trial_step += 1

                    # Book-keeping.
                    actions.append(action)
                    qs.append(q)
                    agent.store_transition(
                        obs, action, r, new_obs,
                        done == 2)  # terminal indicator is 2
                    obs = new_obs

                    # Train.
                    if memory.nb_entries >= batch_size:
                        for t_train in range(nb_train_steps):
                            # Adapt param noise, if necessary.
                            if trial % param_noise_adaption_interval == 0:
                                distance = agent.adapt_param_noise()
                                train_adaptive_distances.append(distance)

                            cl, al = agent.train()
                            train_critic_losses.append(cl)
                            train_actor_losses.append(al)
                            agent.update_target_net()

                # Episode done.
                trial_steps.append(trial_step)
                trial_returns.append(trial_return)
                trial_return_history.append(trial_return)

            else:
                # Evaluate.
                eval_trial_return = 0.
                eval_trial_steps = 0
                if evaluation is not None:
                    env.set_test(test=True)
                    eval_obs = env.reset()
                    agent.reset()
                    eval_done = 0
                    while eval_done == 0:
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            env.render()
                        eval_trial_return += eval_r
                        eval_trial_steps += 1
                    # Episode done.
                    eval_trial_return_history.append(eval_trial_return)

                # Log stats.
                duration = time.time() - start_time
                combined_stats = {}
                if memory.nb_entries > 0:
                    # Print only if learning was happening
                    stats = agent.get_stats()
                    for key in sorted(stats.keys()):
                        combined_stats[key] = mpi_mean(stats[key])

                    # Rollout statistics.
                    combined_stats['rollout/Q_mean'] = mpi_mean(qs)
                    combined_stats['rollout/actions_mean'] = mpi_mean(actions)
                    combined_stats['rollout/actions_std'] = mpi_std(actions)
                    combined_stats['rollout/trial_steps'] = mpi_mean(
                        trial_steps)
                    combined_stats['rollout/return'] = mpi_mean(trial_returns)
                    combined_stats['rollout/return_history'] = mpi_mean(
                        trial_return_history)

                    # Train statistics.
                    combined_stats['train/loss_actor'] = mpi_mean(
                        train_actor_losses)
                    combined_stats['train/loss_critic'] = mpi_mean(
                        train_critic_losses)
                    combined_stats['train/param_noise_distance'] = mpi_mean(
                        train_adaptive_distances)

                # Evaluation statistics.
                if evaluation is not None:
                    combined_stats['eval/Q'] = mpi_mean(eval_q)
                    combined_stats['eval/return'] = eval_trial_return
                    combined_stats['eval/return_history'] = mpi_mean(
                        eval_trial_return_history)
                    combined_stats['eval/steps'] = eval_trial_steps

                # Total statistics.
                combined_stats['total/duration'] = mpi_mean(duration)
                combined_stats['total/steps_per_second'] = mpi_mean(
                    float(ts) / float(duration))
                combined_stats['total/trials'] = trial
                combined_stats['total/steps'] = ts

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()
                if rank == 0 and logdir:
                    if hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(env.get_state(), f)
                    if evaluation and hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'eval_env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(env.get_state(), f)

                # Reset statistics.
                trial_returns = []
                trial_steps = []
                actions = []
                qs = []
                train_actor_losses = []
                train_critic_losses = []
                train_adaptive_distances = []
                # End of evaluate and log statistics

            # Check if this is the last trial
            trial += 1
            if nb_trials and trial >= nb_trials:
                break
            if num_timesteps and ts >= num_timesteps:
                break

        # Saving policy and value function
        if save and saver and output != '':
            saver.save(sess, './%s' % output)